├── .clang-format
├── .github
│   └── ISSUE_TEMPLATE
│       └── bug_report.yml
├── .gitignore
├── .gitmodules
├── .pre-commit-config.yaml
├── LICENSE
├── README.md
├── build.sh
├── dockerfile
│   └── Dockerfile.triton.trt_llm_backend
├── docs
│   ├── baichuan.md
│   ├── build.md
│   ├── encoder_decoder.md
│   ├── gemma.md
│   ├── guided_decoding.md
│   ├── llama.md
│   ├── llama_multi_instance.md
│   ├── llmapi.md
│   ├── lora.md
│   ├── model_config.md
│   ├── multimodal.md
│   └── whisper.md
└── images
    ├── leader-mode.png
    └── orchestrator-mode.png
/.clang-format:
--------------------------------------------------------------------------------
1 | ---
2 | AccessModifierOffset: -4
3 | AlignAfterOpenBracket: DontAlign
4 | AlignConsecutiveAssignments: None
5 | AlignConsecutiveDeclarations: None
6 | AlignOperands: false
7 | AlignTrailingComments: true
8 | AllowAllParametersOfDeclarationOnNextLine: true
9 | AllowShortBlocksOnASingleLine: Empty
10 | AllowShortCaseLabelsOnASingleLine: true
11 | AllowShortFunctionsOnASingleLine: Empty
12 | AllowShortIfStatementsOnASingleLine: false
13 | AllowShortLoopsOnASingleLine: false
14 | AlwaysBreakAfterDefinitionReturnType: None
15 | AlwaysBreakAfterReturnType: None
16 | AlwaysBreakBeforeMultilineStrings: true
17 | AlwaysBreakTemplateDeclarations: Yes
18 | BasedOnStyle: None
19 | BinPackArguments: true
20 | BinPackParameters: true
21 | BreakBeforeBinaryOperators: All
22 | BreakBeforeBraces: Allman
23 | BreakBeforeTernaryOperators: true
24 | BreakConstructorInitializersBeforeComma: true
25 | ColumnLimit: 120
26 | CommentPragmas: '^ IWYU pragma:'
27 | ConstructorInitializerAllOnOneLineOrOnePerLine: false
28 | ConstructorInitializerIndentWidth: 4
29 | ContinuationIndentWidth: 4
30 | Cpp11BracedListStyle: true
31 | DerivePointerAlignment: false
32 | DisableFormat: false
33 | ExperimentalAutoDetectBinPacking: false
34 | ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ]
35 | IncludeBlocks: Preserve
36 | IncludeCategories:
37 | - Regex: '^"(llvm|llvm-c|clang|clang-c)/'
38 | Priority: 2
39 | - Regex: '^(<|"(gtest|isl|json)/)'
40 | Priority: 3
41 | - Regex: '.*'
42 | Priority: 1
43 | IndentCaseLabels: false
44 | IndentWidth: 4
45 | IndentWrappedFunctionNames: false
46 | KeepEmptyLinesAtTheStartOfBlocks: true
47 | Language: Cpp
48 | MacroBlockBegin: ''
49 | MacroBlockEnd: ''
50 | MaxEmptyLinesToKeep: 1
51 | NamespaceIndentation: None
52 | ObjCBlockIndentWidth: 4
53 | ObjCSpaceAfterProperty: true
54 | ObjCSpaceBeforeProtocolList: true
55 | PenaltyBreakBeforeFirstCallParameter: 19
56 | PenaltyBreakComment: 300
57 | PenaltyBreakFirstLessLess: 120
58 | PenaltyBreakString: 1000
59 | PenaltyExcessCharacter: 1000000
60 | PenaltyReturnTypeOnItsOwnLine: 60
61 | PointerAlignment: Left
62 | QualifierAlignment: Right
63 | ReflowComments: true
64 | SeparateDefinitionBlocks: Always
65 | SortIncludes: CaseSensitive
66 | SpaceAfterCStyleCast: true
67 | SpaceBeforeAssignmentOperators: true
68 | SpaceBeforeParens: ControlStatements
69 | SpaceInEmptyParentheses: false
70 | SpacesBeforeTrailingComments: 1
71 | SpacesInAngles: false
72 | SpacesInCStyleCastParentheses: false
73 | SpacesInContainerLiterals: true
74 | SpacesInParentheses: false
75 | SpacesInSquareBrackets: false
76 | Standard: c++14
77 | TabWidth: 4
78 | UseTab: Never
79 | ...
80 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.yml:
--------------------------------------------------------------------------------
1 | name: "Bug Report"
2 | description: Submit a bug report to help us improve TensorRT-LLM backend
3 | labels: [ "bug" ]
4 | body:
5 | - type: textarea
6 | id: system-info
7 | attributes:
8 | label: System Info
9 | description: Please share your system info with us.
10 | placeholder: |
11 | - CPU architecture (e.g., x86_64, aarch64)
12 | - CPU/Host memory size (if known)
13 | - GPU properties
14 | - GPU name (e.g., NVIDIA H100, NVIDIA A100, NVIDIA L40S)
15 | - GPU memory size (if known)
16 | - Clock frequencies used (if applicable)
17 | - Libraries
18 | - TensorRT-LLM branch or tag (e.g., main, v0.7.1)
19 | - TensorRT-LLM commit (if known)
20 | - Versions of TensorRT, AMMO, CUDA, cuBLAS, etc. used
21 | - Container used (if running TensorRT-LLM in a container)
22 | - NVIDIA driver version
23 | - OS (Ubuntu 22.04, CentOS 7, Windows 10)
24 | - Docker image version
25 | - Any other information that may be useful in reproducing the bug
26 | validations:
27 | required: true
28 |
29 | - type: textarea
30 | id: who-can-help
31 | attributes:
32 | label: Who can help?
33 | description: |
34 | To expedite the response to your issue, it would be helpful if you could identify the appropriate person
35 | to tag using the **@** symbol. Here is a general guideline on **whom to tag**.
36 |
37 | Rest assured that all issues are reviewed by the core maintainers. If you are unsure about whom to tag,
38 | you can leave it blank, and a core maintainer will make sure to involve the appropriate person.
39 |
40 | Please tag fewer than 3 people.
41 |
42 | Quantization: @Tracin
43 |
44 | Documentation: @juney-nvidia
45 |
46 | Feature request: @ncomly-nvidia
47 |
48 | Performance: @kaiyux
49 |
50 | Others: @byshiue @schetlur-nv
51 |
52 | placeholder: "@Username ..."
53 |
54 | - type: checkboxes
55 | id: information-scripts-examples
56 | attributes:
57 | label: Information
58 | description: 'The problem arises when using:'
59 | options:
60 | - label: "The official example scripts"
61 | - label: "My own modified scripts"
62 |
63 | - type: checkboxes
64 | id: information-tasks
65 | attributes:
66 | label: Tasks
67 | description: "The tasks I am working on are:"
68 | options:
69 | - label: "An officially supported task in the `examples` folder (such as GLUE/SQuAD, ...)"
70 | - label: "My own task or dataset (give details below)"
71 |
72 | - type: textarea
73 | id: reproduction
74 | validations:
75 | required: true
76 | attributes:
77 | label: Reproduction
78 | description: |
79 | Please share a code example that demonstrates the issue you encountered. It is recommended to provide a code snippet directly.
80 | Additionally, if you have any error messages or stack traces related to the problem, please include them here.
81 |
82 | Remember to use code tags to properly format your code. You can refer to the
83 | link https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting for guidance on code formatting.
84 |
85 | Please refrain from using screenshots, as they can be difficult to read and prevent others from copying and pasting your code.
86 | It would be most helpful if we could reproduce your issue by simply copying and pasting your scripts and codes.
87 |
88 | placeholder: |
89 | Steps to reproduce the behavior:
90 |
91 | 1.
92 | 2.
93 | 3.
94 |
95 | - type: textarea
96 | id: expected-behavior
97 | validations:
98 | required: true
99 | attributes:
100 | label: Expected behavior
101 | description: "Provide a brief summary of the expected behavior of the software. Provide output files or examples if possible."
102 |
103 | - type: textarea
104 | id: actual-behavior
105 | validations:
106 | required: true
107 | attributes:
108 | label: Actual behavior
109 | description: "Describe the actual behavior of the software and how it deviates from the expected behavior. Provide output files or examples if possible."
110 |
111 | - type: textarea
112 | id: additional-notes
113 | validations:
114 | required: true
115 | attributes:
116 | label: Additional notes
117 | description: "Provide any additional context here you think might be useful for the TensorRT-LLM team to help debug this issue (such as experiments done, potential things to investigate)."
118 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__/
2 | .vscode
3 | *.cache
4 | *.nsys-rep
5 | .VSCodeCounter
6 | build/
7 | *.so
8 | *.egg-info/
9 | .coverage
10 | *.onnx
11 | tmp/
12 | .idea
13 |
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "tensorrt_llm"]
2 | path = tensorrt_llm
3 | url = https://github.com/NVIDIA/TensorRT-LLM.git
4 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/pycqa/isort
3 | rev: 5.12.0
4 | hooks:
5 | - id: isort
6 | - repo: https://github.com/Lucas-C/pre-commit-hooks.git
7 | rev: v1.1.13
8 | hooks:
9 | - id: remove-crlf
10 | - repo: https://github.com/google/yapf
11 | rev: v0.43.0
12 | hooks:
13 | - id: yapf
14 | - repo: https://github.com/pre-commit/pre-commit-hooks
15 | rev: v4.1.0
16 | hooks:
17 | - id: check-added-large-files
18 | exclude: 'tensorrt_llm/'
19 | - id: check-merge-conflict
20 | - id: check-symlinks
21 | - id: detect-private-key
22 | - id: end-of-file-fixer
23 | - id: check-yaml
24 | - id: trailing-whitespace
25 | - repo: https://github.com/PyCQA/autoflake
26 | rev: v1.6.1
27 | hooks:
28 | - id: autoflake
29 | args: ['--in-place', '--remove-all-unused-imports', '--remove-unused-variables']
30 | - repo: https://github.com/pre-commit/mirrors-clang-format
31 | rev: v16.0.0
32 | hooks:
33 | - id: clang-format
34 | types_or: [c++, c, cuda]
35 | exclude: |
36 | (?x)^(
37 | tensorrt_llm/.*
38 | )$
39 | - repo: https://github.com/cheshirekow/cmake-format-precommit
40 | rev: v0.6.10
41 | hooks:
42 | - id: cmake-format
43 | - repo: https://github.com/codespell-project/codespell
44 | rev: v2.2.4
45 | hooks:
46 | - id: codespell
47 | exclude: tools/dataset/
48 | args:
49 | - --skip=".git,tensorrt_llm"
50 | - --exclude-file=all_models/whisper/whisper_bls/1/tokenizer.py
51 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 |
2 | Apache License
3 | Version 2.0, January 2004
4 | http://www.apache.org/licenses/
5 |
6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7 |
8 | 1. Definitions.
9 |
10 | "License" shall mean the terms and conditions for use, reproduction,
11 | and distribution as defined by Sections 1 through 9 of this document.
12 |
13 | "Licensor" shall mean the copyright owner or entity authorized by
14 | the copyright owner that is granting the License.
15 |
16 | "Legal Entity" shall mean the union of the acting entity and all
17 | other entities that control, are controlled by, or are under common
18 | control with that entity. For the purposes of this definition,
19 | "control" means (i) the power, direct or indirect, to cause the
20 | direction or management of such entity, whether by contract or
21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
22 | outstanding shares, or (iii) beneficial ownership of such entity.
23 |
24 | "You" (or "Your") shall mean an individual or Legal Entity
25 | exercising permissions granted by this License.
26 |
27 | "Source" form shall mean the preferred form for making modifications,
28 | including but not limited to software source code, documentation
29 | source, and configuration files.
30 |
31 | "Object" form shall mean any form resulting from mechanical
32 | transformation or translation of a Source form, including but
33 | not limited to compiled object code, generated documentation,
34 | and conversions to other media types.
35 |
36 | "Work" shall mean the work of authorship, whether in Source or
37 | Object form, made available under the License, as indicated by a
38 | copyright notice that is included in or attached to the work
39 | (an example is provided in the Appendix below).
40 |
41 | "Derivative Works" shall mean any work, whether in Source or Object
42 | form, that is based on (or derived from) the Work and for which the
43 | editorial revisions, annotations, elaborations, or other modifications
44 | represent, as a whole, an original work of authorship. For the purposes
45 | of this License, Derivative Works shall not include works that remain
46 | separable from, or merely link (or bind by name) to the interfaces of,
47 | the Work and Derivative Works thereof.
48 |
49 | "Contribution" shall mean any work of authorship, including
50 | the original version of the Work and any modifications or additions
51 | to that Work or Derivative Works thereof, that is intentionally
52 | submitted to Licensor for inclusion in the Work by the copyright owner
53 | or by an individual or Legal Entity authorized to submit on behalf of
54 | the copyright owner. For the purposes of this definition, "submitted"
55 | means any form of electronic, verbal, or written communication sent
56 | to the Licensor or its representatives, including but not limited to
57 | communication on electronic mailing lists, source code control systems,
58 | and issue tracking systems that are managed by, or on behalf of, the
59 | Licensor for the purpose of discussing and improving the Work, but
60 | excluding communication that is conspicuously marked or otherwise
61 | designated in writing by the copyright owner as "Not a Contribution."
62 |
63 | "Contributor" shall mean Licensor and any individual or Legal Entity
64 | on behalf of whom a Contribution has been received by Licensor and
65 | subsequently incorporated within the Work.
66 |
67 | 2. Grant of Copyright License. Subject to the terms and conditions of
68 | this License, each Contributor hereby grants to You a perpetual,
69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70 | copyright license to reproduce, prepare Derivative Works of,
71 | publicly display, publicly perform, sublicense, and distribute the
72 | Work and such Derivative Works in Source or Object form.
73 |
74 | 3. Grant of Patent License. Subject to the terms and conditions of
75 | this License, each Contributor hereby grants to You a perpetual,
76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77 | (except as stated in this section) patent license to make, have made,
78 | use, offer to sell, sell, import, and otherwise transfer the Work,
79 | where such license applies only to those patent claims licensable
80 | by such Contributor that are necessarily infringed by their
81 | Contribution(s) alone or by combination of their Contribution(s)
82 | with the Work to which such Contribution(s) was submitted. If You
83 | institute patent litigation against any entity (including a
84 | cross-claim or counterclaim in a lawsuit) alleging that the Work
85 | or a Contribution incorporated within the Work constitutes direct
86 | or contributory patent infringement, then any patent licenses
87 | granted to You under this License for that Work shall terminate
88 | as of the date such litigation is filed.
89 |
90 | 4. Redistribution. You may reproduce and distribute copies of the
91 | Work or Derivative Works thereof in any medium, with or without
92 | modifications, and in Source or Object form, provided that You
93 | meet the following conditions:
94 |
95 | (a) You must give any other recipients of the Work or
96 | Derivative Works a copy of this License; and
97 |
98 | (b) You must cause any modified files to carry prominent notices
99 | stating that You changed the files; and
100 |
101 | (c) You must retain, in the Source form of any Derivative Works
102 | that You distribute, all copyright, patent, trademark, and
103 | attribution notices from the Source form of the Work,
104 | excluding those notices that do not pertain to any part of
105 | the Derivative Works; and
106 |
107 | (d) If the Work includes a "NOTICE" text file as part of its
108 | distribution, then any Derivative Works that You distribute must
109 | include a readable copy of the attribution notices contained
110 | within such NOTICE file, excluding those notices that do not
111 | pertain to any part of the Derivative Works, in at least one
112 | of the following places: within a NOTICE text file distributed
113 | as part of the Derivative Works; within the Source form or
114 | documentation, if provided along with the Derivative Works; or,
115 | within a display generated by the Derivative Works, if and
116 | wherever such third-party notices normally appear. The contents
117 | of the NOTICE file are for informational purposes only and
118 | do not modify the License. You may add Your own attribution
119 | notices within Derivative Works that You distribute, alongside
120 | or as an addendum to the NOTICE text from the Work, provided
121 | that such additional attribution notices cannot be construed
122 | as modifying the License.
123 |
124 | You may add Your own copyright statement to Your modifications and
125 | may provide additional or different license terms and conditions
126 | for use, reproduction, or distribution of Your modifications, or
127 | for any such Derivative Works as a whole, provided Your use,
128 | reproduction, and distribution of the Work otherwise complies with
129 | the conditions stated in this License.
130 |
131 | 5. Submission of Contributions. Unless You explicitly state otherwise,
132 | any Contribution intentionally submitted for inclusion in the Work
133 | by You to the Licensor shall be under the terms and conditions of
134 | this License, without any additional terms or conditions.
135 | Notwithstanding the above, nothing herein shall supersede or modify
136 | the terms of any separate license agreement you may have executed
137 | with Licensor regarding such Contributions.
138 |
139 | 6. Trademarks. This License does not grant permission to use the trade
140 | names, trademarks, service marks, or product names of the Licensor,
141 | except as required for reasonable and customary use in describing the
142 | origin of the Work and reproducing the content of the NOTICE file.
143 |
144 | 7. Disclaimer of Warranty. Unless required by applicable law or
145 | agreed to in writing, Licensor provides the Work (and each
146 | Contributor provides its Contributions) on an "AS IS" BASIS,
147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 | implied, including, without limitation, any warranties or conditions
149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 | PARTICULAR PURPOSE. You are solely responsible for determining the
151 | appropriateness of using or redistributing the Work and assume any
152 | risks associated with Your exercise of permissions under this License.
153 |
154 | 8. Limitation of Liability. In no event and under no legal theory,
155 | whether in tort (including negligence), contract, or otherwise,
156 | unless required by applicable law (such as deliberate and grossly
157 | negligent acts) or agreed to in writing, shall any Contributor be
158 | liable to You for damages, including any direct, indirect, special,
159 | incidental, or consequential damages of any character arising as a
160 | result of this License or out of the use or inability to use the
161 | Work (including but not limited to damages for loss of goodwill,
162 | work stoppage, computer failure or malfunction, or any and all
163 | other commercial damages or losses), even if such Contributor
164 | has been advised of the possibility of such damages.
165 |
166 | 9. Accepting Warranty or Additional Liability. While redistributing
167 | the Work or Derivative Works thereof, You may choose to offer,
168 | and charge a fee for, acceptance of support, warranty, indemnity,
169 | or other liability obligations and/or rights consistent with this
170 | License. However, in accepting such obligations, You may act only
171 | on Your own behalf and on Your sole responsibility, not on behalf
172 | of any other Contributor, and only if You agree to indemnify,
173 | defend, and hold each Contributor harmless for any liability
174 | incurred by, or claims asserted against, such Contributor by reason
175 | of your accepting any such warranty or additional liability.
176 |
177 | END OF TERMS AND CONDITIONS
178 |
179 | APPENDIX: How to apply the Apache License to your work.
180 |
181 | To apply the Apache License to your work, attach the following
182 | boilerplate notice, with the fields enclosed by brackets "[]"
183 | replaced with your own identifying information. (Don't include
184 | the brackets!) The text should be enclosed in the appropriate
185 | comment syntax for the file format. We also recommend that a
186 | file or class name and description of purpose be included on the
187 | same "printed page" as the copyright notice for easier
188 | identification within third-party archives.
189 |
190 | Copyright [yyyy] [name of copyright owner]
191 |
192 | Licensed under the Apache License, Version 2.0 (the "License");
193 | you may not use this file except in compliance with the License.
194 | You may obtain a copy of the License at
195 |
196 | http://www.apache.org/licenses/LICENSE-2.0
197 |
198 | Unless required by applicable law or agreed to in writing, software
199 | distributed under the License is distributed on an "AS IS" BASIS,
200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 | See the License for the specific language governing permissions and
202 | limitations under the License.
203 |
--------------------------------------------------------------------------------
/build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | git lfs install
4 | git submodule update --init --recursive
5 |
6 | # Default values will be used if not set
7 | BASE_IMAGE=${BASE_IMAGE:-nvcr.io/nvidia/tritonserver:24.11-py3-min}
8 | PYTORCH_IMAGE=${PYTORCH_IMAGE:-nvcr.io/nvidia/pytorch:24.11-py3}
9 | TRT_VERSION=${TRT_VERSION:-10.7.0.23}
10 | TRT_URL_x86=${TRT_URL_x86:-https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.7.0/tars/TensorRT-${TRT_VERSION}.Linux.x86_64-gnu.cuda-12.6.tar.gz}
11 | TRT_URL_ARM=${TRT_URL_ARM:-https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.7.0/tars/TensorRT-${TRT_VERSION}.ubuntu-24.04.aarch64-gnu.cuda-12.6.tar.gz}
12 |
13 | # Build the TRT-LLM base image that has TRT-LLM installed and will be used as
14 | # the base image for building Triton server and TRT-LLM backend.
15 | docker build -t trtllm_base \
16 | --build-arg BASE_IMAGE="${BASE_IMAGE}" \
17 | --build-arg PYTORCH_IMAGE="${PYTORCH_IMAGE}" \
18 | --build-arg TRT_VER="${TRT_VERSION}" \
19 | --build-arg RELEASE_URL_TRT_x86="${TRT_URL_x86}" \
20 | --build-arg RELEASE_URL_TRT_ARM="${TRT_URL_ARM}" \
21 | -f dockerfile/Dockerfile.triton.trt_llm_backend .
22 |
23 | # Clone the Triton server repository on the same level as the TRT-LLM backend repository.
24 | cd ../
25 | # Need to use the aligned version of the Triton server repository.
26 | # Refer to the support matrix for the aligned version: https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html
27 | TRITON_SERVER_REPO_TAG=${TRITON_SERVER_REPO_TAG:-r24.11}
28 | git clone -b ${TRITON_SERVER_REPO_TAG} https://github.com/triton-inference-server/server.git
29 | cd server
30 |
31 | # The `TRTLLM_BASE_IMAGE` is the base image that will be used to build the
32 | # container. The `TENSORRTLLM_BACKEND_REPO_TAG` and `PYTHON_BACKEND_REPO_TAG` are
33 | # the tags of the TensorRT-LLM backend and Python backend repositories that will
34 | # be used to build the container.
35 | TRTLLM_BASE_IMAGE=${TRTLLM_BASE_IMAGE:-trtllm_base}
36 | TENSORRTLLM_BACKEND_REPO_TAG=${TENSORRTLLM_BACKEND_REPO_TAG:-v0.15.0}
37 | PYTHON_BACKEND_REPO_TAG=${PYTHON_BACKEND_REPO_TAG:-r24.11}
38 |
39 | TRITON_GITHUB_ORGANIZATION=${TRITON_GITHUB_ORGANIZATION:-}
40 | if [ "$TRITON_GITHUB_ORGANIZATION" != "" ]
41 | then
42 | GITHUB_ORGANIZATION="--github-organization=${TRITON_GITHUB_ORGANIZATION}"
43 | else
44 | GITHUB_ORGANIZATION=""
45 | fi
46 |
47 | TRITON_CONTAINER_PREBUILD_COMMAND=${TRITON_CONTAINER_PREBUILD_COMMAND:-}
48 | if [ "$TRITON_CONTAINER_PREBUILD_COMMAND" != "" ]
49 | then
50 | CONTAINER_PREBUILD_COMMAND="--container-prebuild-command=${TRITON_CONTAINER_PREBUILD_COMMAND}"
51 | else
52 | CONTAINER_PREBUILD_COMMAND=""
53 | fi
54 |
55 | # The flags for some features or endpoints can be removed if not needed.
56 | ./build.py -v --no-container-interactive --enable-logging --enable-stats --enable-tracing \
57 | --enable-metrics --enable-gpu-metrics --enable-cpu-metrics \
58 | --filesystem=gcs --filesystem=s3 --filesystem=azure_storage \
59 | --endpoint=http --endpoint=grpc --endpoint=sagemaker --endpoint=vertex-ai \
60 | --backend=ensemble --enable-gpu --no-container-pull \
61 | --repoagent=checksum --cache=local --cache=redis \
62 | --image=base,${TRTLLM_BASE_IMAGE} \
63 | --backend=tensorrtllm:${TENSORRTLLM_BACKEND_REPO_TAG} \
64 | --backend=python:${PYTHON_BACKEND_REPO_TAG} \
65 | "${GITHUB_ORGANIZATION}" "${CONTAINER_PREBUILD_COMMAND}"
66 |
--------------------------------------------------------------------------------
/dockerfile/Dockerfile.triton.trt_llm_backend:
--------------------------------------------------------------------------------
1 | ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:25.04-py3-min
2 | ARG PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:25.04-py3
3 | ARG NVRTC_VER=12.9.41-1
4 | ARG TRT_VER=10.10.0.31
5 | ARG RELEASE_URL_TRT_x86=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.10.0/tars/TensorRT-${TRT_VER}.Linux.x86_64-gnu.cuda-12.9.tar.gz
6 | ARG RELEASE_URL_TRT_ARM=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.10.0/tars/TensorRT-${TRT_VER}.Linux.aarch64-gnu.cuda-12.9.tar.gz
7 |
8 | # Versions of packages to copy from pytorch image
9 | ARG TORCH_VER=2.7.0a0+79aa17489c.nv25.4
10 | ARG TORCHVISION_VER=0.22.0a0
11 | ARG SETUPTOOLS_VER=78.1.0
12 | ARG PYTORCH_TRITON_VER=3.2.0+git4b3bb1f8b.nvinternal
13 | ARG JINJA2_VER=3.1.6
14 | ARG NETWORKX_VER=3.4.2
15 | ARG SYMPY_VER=1.13.3
16 | ARG PACKAGING_VER=23.2
17 | ARG FLASH_ATTN_VER=2.7.3
18 |
19 | FROM ${PYTORCH_IMAGE} AS pytorch_image
20 | FROM ${BASE_IMAGE} AS install_dependencies
21 |
22 | ARG TENSORRTLLM_REPO=https://github.com/NVIDIA/TensorRT-LLM.git
23 | ARG TENSORRTLLM_REPO_TAG=main
24 |
25 | WORKDIR /workspace
26 |
27 | ARG CCACHE_REMOTE_STORAGE
28 | ARG CCACHE_URL
29 | ENV CCACHE_DEBUG=1
30 |
31 | RUN if [ -n "${CCACHE_REMOTE_STORAGE}" ] ; then \
32 | curl -k -L ${CCACHE_URL} -o ccache.tar.gz ; \
33 | tar -xzf ccache.tar.gz -C /usr/local --strip-components=1 ; \
34 | rm ccache.tar.gz ; \
35 | ccache --set-config=remote_only=true ; \
36 | ccache --set-config=remote_storage=${CCACHE_REMOTE_STORAGE} ; \
37 | ccache --set-config=log_file=/tmp/ccache.log ; \
38 | ccache -p ; \
39 | fi
40 |
41 | ARG TORCH_VER
42 | ARG TORCHVISION_VER
43 | ARG SETUPTOOLS_VER
44 | ARG PYTORCH_TRITON_VER
45 | ARG JINJA2_VER
46 | ARG NETWORKX_VER
47 | ARG SYMPY_VER
48 | ARG PACKAGING_VER
49 | ARG FLASH_ATTN_VER
50 | # Copy PyTorch package from PyTorch image
51 | COPY --from=pytorch_image /usr/local/lib/lib* /usr/local/lib/
52 | COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/torch /usr/local/lib/python3.12/dist-packages/torch
53 | COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/torch-${TORCH_VER}.dist-info /usr/local/lib/python3.12/dist-packages/torch-${TORCH_VER}.dist-info
54 | COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/torchgen /usr/local/lib/python3.12/dist-packages/torchgen
55 | COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/torchvision /usr/local/lib/python3.12/dist-packages/torchvision
56 | COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/torchvision-${TORCHVISION_VER}.dist-info /usr/local/lib/python3.12/dist-packages/torchvision-${TORCHVISION_VER}.dist-info
57 | COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/torchvision.libs /usr/local/lib/python3.12/dist-packages/torchvision.libs
58 | COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/setuptools /usr/local/lib/python3.12/dist-packages/setuptools
59 | COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/setuptools-${SETUPTOOLS_VER}.dist-info /usr/local/lib/python3.12/dist-packages/setuptools-${SETUPTOOLS_VER}.dist-info
60 | COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/functorch /usr/local/lib/python3.12/dist-packages/functorch
61 | COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/pytorch_triton-${PYTORCH_TRITON_VER}.dist-info /usr/local/lib/python3.12/dist-packages/pytorch_triton-${PYTORCH_TRITON_VER}.dist-info
62 | COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/triton /usr/local/lib/python3.12/dist-packages/triton
63 | COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/jinja2 /usr/local/lib/python3.12/dist-packages/jinja2
64 | COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/jinja2-${JINJA2_VER}.dist-info /usr/local/lib/python3.12/dist-packages/jinja2-${JINJA2_VER}.dist-info
65 | COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/networkx /usr/local/lib/python3.12/dist-packages/networkx
66 | COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/networkx-${NETWORKX_VER}.dist-info /usr/local/lib/python3.12/dist-packages/networkx-${NETWORKX_VER}.dist-info
67 | COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/sympy /usr/local/lib/python3.12/dist-packages/sympy
68 | COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/sympy-${SYMPY_VER}.dist-info /usr/local/lib/python3.12/dist-packages/sympy-${SYMPY_VER}.dist-info
69 | COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/packaging /usr/local/lib/python3.12/dist-packages/packaging
70 | COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/packaging-${PACKAGING_VER}.dist-info /usr/local/lib/python3.12/dist-packages/packaging-${PACKAGING_VER}.dist-info
71 | COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/flash_attn /usr/local/lib/python3.12/dist-packages/flash_attn
72 | COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/flash_attn-${FLASH_ATTN_VER}.dist-info /usr/local/lib/python3.12/dist-packages/flash_attn-${FLASH_ATTN_VER}.dist-info
73 | COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/flash_attn_2_cuda.cpython-312-*-linux-gnu.so /usr/local/lib/python3.12/dist-packages/
74 |
75 | # Might not need to copy cusparseLt in the future once it's included in the DLFW CUDA container
76 | COPY --from=pytorch_image /usr/local/cuda/lib64/libcusparseLt* /usr/local/cuda/lib64/
77 |
78 | ENV PIP_BREAK_SYSTEM_PACKAGES=1
79 | RUN apt-get update -q=2 \
80 | && apt-get install -y --no-install-recommends \
81 | python3-dev \
82 | python3-pip \
83 | git-lfs \
84 | # Remove previous TRT installation
85 | && apt-get purge -y "libnvinfer*" \
86 | && pip3 uninstall -y tensorrt \
87 | && rm -rf /var/lib/apt/lists/*
88 |
89 | ARG TRT_VER
90 | ARG NVRTC_VER
91 |
92 | ENV TRT_VERSION=$TRT_VER \
93 | TRT_VER=$TRT_VER \
94 | CUDA_VER=$CUDA_VERSION \
95 | CUDNN_VER=$CUDNN_VERSION \
96 | NCCL_VER=$NCCL_VERSION \
97 | CUBLAS_VER=$CUBLAS_VERSION \
98 | NVRTC_VER="${NVRTC_VER}"
99 |
100 | LABEL TRT_VERSION $TRT_VER
101 |
102 | # Install NVRTC
103 | RUN [ "$(uname -m)" != "x86_64" ] && arch="sbsa" || arch="x86_64" \
104 | && curl -o /tmp/cuda-keyring.deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/$arch/cuda-keyring_1.0-1_all.deb \
105 | && apt install /tmp/cuda-keyring.deb \
106 | && rm /tmp/cuda-keyring.deb \
107 | && apt-get remove --purge -y --allow-change-held-packages cuda-nvrtc-dev* \
108 | && CUDA_VER_SHORT=${CUDA_VER: 0:4} \
109 | && NVRTC_CUDA_VERSION=${CUDA_VER_SHORT/./-} \
110 | && apt-get update -qq \
111 | && apt-get install -y --no-install-recommends cuda-nvrtc-dev-${NVRTC_CUDA_VERSION}=${NVRTC_VER} \
112 | && rm -rf /var/lib/apt/lists/*
113 |
114 | # Download & install TRT release
115 | ARG RELEASE_URL_TRT_x86
116 | ARG RELEASE_URL_TRT_ARM
117 |
118 | RUN [ "$(uname -m)" != "x86_64" ] && RELEASE_URL_TRT=${RELEASE_URL_TRT_ARM} || RELEASE_URL_TRT=${RELEASE_URL_TRT_x86} \
119 | && curl -fSL -o /tmp/tensorrt.tar.gz ${RELEASE_URL_TRT} \
120 | # Extract the tarball, excluding Windows libraries and static libraries as
121 | # they are not needed for Linux build
122 | && tar xzvf /tmp/tensorrt.tar.gz --exclude="lib*win.so*" --exclude="*.a" -C /usr/local \
123 | && rm /tmp/tensorrt.tar.gz \
124 | && find /usr/local -maxdepth 1 -name Tens* -type d -exec ln -s {} /usr/local/tensorrt \;
125 |
126 | RUN pip3 install /usr/local/tensorrt/python/tensorrt-*-cp$( python3 -c "import sys; print(str(sys.version_info.major) + str(sys.version_info.minor))" )*
127 |
128 | ENV LD_LIBRARY_PATH=/usr/local/tensorrt/lib:${LD_LIBRARY_PATH}
129 | ENV TRT_ROOT=/usr/local/tensorrt
130 |
131 | FROM install_dependencies AS tensorrt_llm_build
132 |
133 | WORKDIR /workspace
134 |
135 | ARG TENSORRTLLM_REPO
136 | ARG TENSORRTLLM_REPO_TAG
137 | RUN git-lfs install \
138 | && git clone --single-branch --recurse-submodules --depth=1 -b ${TENSORRTLLM_REPO_TAG} ${TENSORRTLLM_REPO} tensorrt_llm
139 |
140 | RUN pip3 install --no-cache-dir polygraphy==0.49.9 mpi4py==3.1.5 cmake==3.30.2 ninja
141 |
142 | RUN cd tensorrt_llm && \
143 | if [ -n "${CCACHE_REMOTE_STORAGE}" ] ; then \
144 | python3 scripts/build_wheel.py --trt_root="${TRT_ROOT}" --clean --use_ccache ; \
145 | else \
146 | python3 scripts/build_wheel.py --trt_root="${TRT_ROOT}" --clean ; \
147 | fi
148 |
149 | # Final stage to build the TRT-LLM container
150 | FROM ${BASE_IMAGE} AS final_stage
151 |
152 | ARG TORCH_VER
153 | ARG TORCHVISION_VER
154 | ARG SETUPTOOLS_VER
155 | ARG PYTORCH_TRITON_VER
156 | ARG JINJA2_VER
157 | ARG NETWORKX_VER
158 | ARG SYMPY_VER
159 | ARG PACKAGING_VER
160 | ARG FLASH_ATTN_VER
161 | # Copy necessary files from the base stage
162 | COPY --from=pytorch_image /usr/local/lib/lib* /usr/local/lib/
163 | COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/torch /usr/local/lib/python3.12/dist-packages/torch
164 | COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/torch-${TORCH_VER}.dist-info /usr/local/lib/python3.12/dist-packages/torch-${TORCH_VER}.dist-info
165 | COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/torchgen /usr/local/lib/python3.12/dist-packages/torchgen
166 | COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/torchvision /usr/local/lib/python3.12/dist-packages/torchvision
167 | COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/torchvision-${TORCHVISION_VER}.dist-info /usr/local/lib/python3.12/dist-packages/torchvision-${TORCHVISION_VER}.dist-info
168 | COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/torchvision.libs /usr/local/lib/python3.12/dist-packages/torchvision.libs
169 | COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/setuptools /usr/local/lib/python3.12/dist-packages/setuptools
170 | COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/setuptools-${SETUPTOOLS_VER}.dist-info /usr/local/lib/python3.12/dist-packages/setuptools-${SETUPTOOLS_VER}.dist-info
171 | COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/functorch /usr/local/lib/python3.12/dist-packages/functorch
172 | COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/pytorch_triton-${PYTORCH_TRITON_VER}.dist-info /usr/local/lib/python3.12/dist-packages/pytorch_triton-${PYTORCH_TRITON_VER}.dist-info
173 | COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/triton /usr/local/lib/python3.12/dist-packages/triton
174 | COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/jinja2 /usr/local/lib/python3.12/dist-packages/jinja2
175 | COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/jinja2-${JINJA2_VER}.dist-info /usr/local/lib/python3.12/dist-packages/jinja2-${JINJA2_VER}.dist-info
176 | COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/networkx /usr/local/lib/python3.12/dist-packages/networkx
177 | COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/networkx-${NETWORKX_VER}.dist-info /usr/local/lib/python3.12/dist-packages/networkx-${NETWORKX_VER}.dist-info
178 | COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/sympy /usr/local/lib/python3.12/dist-packages/sympy
179 | COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/sympy-${SYMPY_VER}.dist-info /usr/local/lib/python3.12/dist-packages/sympy-${SYMPY_VER}.dist-info
180 | COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/packaging /usr/local/lib/python3.12/dist-packages/packaging
181 | COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/packaging-${PACKAGING_VER}.dist-info /usr/local/lib/python3.12/dist-packages/packaging-${PACKAGING_VER}.dist-info
182 | COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/flash_attn /usr/local/lib/python3.12/dist-packages/flash_attn
183 | COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/flash_attn-${FLASH_ATTN_VER}.dist-info /usr/local/lib/python3.12/dist-packages/flash_attn-${FLASH_ATTN_VER}.dist-info
184 | COPY --from=pytorch_image /usr/local/lib/python3.12/dist-packages/flash_attn_2_cuda.cpython-312-*-linux-gnu.so /usr/local/lib/python3.12/dist-packages/
185 |
186 | # Might not need to copy cusparseLt in the future once it's included in the DLFW CUDA container
187 | COPY --from=pytorch_image /usr/local/cuda/lib64/libcusparseLt* /usr/local/cuda/lib64/
188 |
189 | ARG NVRTC_VER
190 | ENV CUDA_VER=$CUDA_VERSION \
191 | NVRTC_VER="${NVRTC_VER}"
192 |
193 | # Install the necessary dependencies and remove previous TRT installation in the
194 | # final image
195 | ENV PIP_BREAK_SYSTEM_PACKAGES=1
196 | RUN apt-get update -q=2 \
197 | && apt-get install -y --no-install-recommends \
198 | python3-dev \
199 | python3-pip \
200 | git-lfs \
201 | # Remove previous TRT installation
202 | && apt-get purge -y "libnvinfer*" \
203 | && pip3 uninstall -y tensorrt \
204 | && rm -rf /var/lib/apt/lists/* \
205 | && pip3 install --no-cache-dir polygraphy==0.49.9 mpi4py==3.1.5
206 |
207 | # Install NVRTC
208 | RUN [ "$(uname -m)" != "x86_64" ] && arch="sbsa" || arch="x86_64" \
209 | && curl -o /tmp/cuda-keyring.deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/$arch/cuda-keyring_1.0-1_all.deb \
210 | && apt install /tmp/cuda-keyring.deb \
211 | && rm /tmp/cuda-keyring.deb \
212 | && apt-get remove --purge -y --allow-change-held-packages cuda-nvrtc-dev* \
213 | && CUDA_VER_SHORT=${CUDA_VER: 0:4} \
214 | && NVRTC_CUDA_VERSION=${CUDA_VER_SHORT/./-} \
215 | && apt-get update -qq \
216 | && apt-get install -y --no-install-recommends cuda-nvrtc-dev-${NVRTC_CUDA_VERSION}=${NVRTC_VER} \
217 | && rm -rf /var/lib/apt/lists/*
218 |
219 | # Install TRT
220 | COPY --from=install_dependencies /usr/local/tensorrt /usr/local/tensorrt
221 | RUN pip3 install /usr/local/tensorrt/python/tensorrt-*-cp$( python3 -c "import sys; print(str(sys.version_info.major) + str(sys.version_info.minor))" )*
222 |
223 | # Set environment variables
224 | ARG TRT_VER
225 | ENV TRT_VERSION=$TRT_VER
226 | ENV LD_LIBRARY_PATH=/usr/local/tensorrt/lib:${LD_LIBRARY_PATH}
227 | ENV TRT_ROOT=/usr/local/tensorrt
228 |
229 | WORKDIR /tmp
230 |
231 | # Install TRT-LLM wheel after all the dependencies are installed
232 | COPY --from=tensorrt_llm_build /workspace/tensorrt_llm/build/tensorrt_llm*whl .
233 | RUN pip3 install --no-cache-dir tensorrt_llm*.whl \
234 | && rm -f tensorrt_llm*.whl
235 |
236 |
--------------------------------------------------------------------------------
/docs/baichuan.md:
--------------------------------------------------------------------------------
1 |
2 | ## End-to-end workflow to run Baichuan
3 |
4 | * Build engine
5 |
6 | ```bash
7 | export HF_BAICHUAN_MODEL=Baichuan-13B-Chat/
8 | python build.py --model_dir ${HF_BAICHUAN_MODEL} \
9 | --dtype float16 \
10 | --remove_input_padding \
11 | --use_gpt_attention_plugin float16 \
12 | --enable_context_fmha \
13 | --use_gemm_plugin float16 \
14 | --output_dir /tmp/baichuan/13B/trt_engines/fp16/1-gpu/ \
15 | --kv_cache_type paged \
16 | --max_batch_size 64
17 |
18 | [11/29/2023-08:20:34] [TRT] [I] Total Host Persistent Memory: 77008
19 | [11/29/2023-08:20:34] [TRT] [I] Total Device Persistent Memory: 0
20 | [11/29/2023-08:20:34] [TRT] [I] Total Scratch Memory: 1342439424
21 | [11/29/2023-08:20:34] [TRT] [I] [BlockAssignment] Started assigning block shifts. This will take 690 steps to complete.
22 | [11/29/2023-08:20:34] [TRT] [I] [BlockAssignment] Algorithm ShiftNTopDown took 25.5938ms to assign 11 blocks to 690 nodes requiring 6308236288 bytes.
23 | [11/29/2023-08:20:34] [TRT] [I] Total Activation Memory: 6308236288
24 | [11/29/2023-08:20:35] [TRT] [I] Total Weights Memory: 26529804072
25 | [11/29/2023-08:20:35] [TRT] [I] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +0, GPU +64, now: CPU 56027, GPU 28529 (MiB)
26 | [11/29/2023-08:20:35] [TRT] [I] [MemUsageChange] Init cuDNN: CPU +0, GPU +72, now: CPU 56027, GPU 28601 (MiB)
27 | [11/29/2023-08:20:35] [TRT] [I] [MemUsageStats] Peak memory usage of TRT CPU/GPU memory allocators: CPU 1250 MiB, GPU 41088 MiB
28 | [11/29/2023-08:20:35] [TRT] [I] [MemUsageChange] TensorRT-managed allocation in building engine: CPU +0, GPU +25301, now: CPU 0, GPU 25301 (MiB)
29 | [11/29/2023-08:20:44] [TRT] [I] [MemUsageStats] Peak memory usage during Engine building and serialization: CPU: 81260 MiB
30 | [11/29/2023-08:20:44] [TRT-LLM] [I] Total time of building baichuan_float16_tp1_rank0.engine: 00:00:37
31 | [11/29/2023-08:20:44] [TRT-LLM] [I] Config saved to /tmp/baichuan/13B/trt_engines/fp16/1-gpu/config.json.
32 | [11/29/2023-08:20:45] [TRT-LLM] [I] Serializing engine to /tmp/baichuan/13B/trt_engines/fp16/1-gpu/baichuan_float16_tp1_rank0.engine...
33 | [11/29/2023-08:21:35] [TRT-LLM] [I] Engine serialized. Total time: 00:00:49
34 | [11/29/2023-08:21:36] [TRT-LLM] [I] Timing cache serialized to /tmp/baichuan/13B/trt_engines/fp16/1-gpu/model.cache
35 | [11/29/2023-08:21:36] [TRT-LLM] [I] Total time of building all 1 engines: 00:05:00
36 | ```
37 |
38 | * Prepare configs
39 |
40 | ```bash
41 | cp tensorrt_llm/triton_backend/all_models/inflight_batcher_llm/ baichuan_ifb -r
42 |
43 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i baichuan_ifb/preprocessing/config.pbtxt tokenizer_dir:${HF_BAICHUAN_MODEL},triton_max_batch_size:64,preprocessing_instance_count:1
44 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i baichuan_ifb/postprocessing/config.pbtxt tokenizer_dir:${HF_BAICHUAN_MODEL},triton_max_batch_size:64,postprocessing_instance_count:1
45 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i baichuan_ifb/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:False,bls_instance_count:1,accumulate_tokens:False,logits_datatype:TYPE_FP32
46 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i baichuan_ifb/ensemble/config.pbtxt triton_max_batch_size:64,logits_datatype:TYPE_FP32
47 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i baichuan_ifb/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:64,decoupled_mode:False,max_beam_width:1,engine_dir:/tmp/baichuan/13B/trt_engines/fp16/1-gpu/,max_tokens_in_paged_kv_cache:2560,max_attention_window_size:2560,kv_cache_free_gpu_mem_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0,encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32
48 | ```
49 |
50 | * Launch server
51 |
52 | ```bash
53 | pip install SentencePiece
54 | python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 1 --model_repo=baichuan_ifb/
55 | ```
56 |
57 | This setting requires about 35 GB of GPU memory:
58 |
59 | ```bash
60 | nvidia-smi
61 |
62 | Wed Nov 29 08:33:50 2023
63 | +---------------------------------------------------------------------------------------+
64 | | NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.2 |
65 | |-----------------------------------------+----------------------+----------------------+
66 | | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
67 | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
68 | | | | MIG M. |
69 | |=========================================+======================+======================|
70 | | 0 NVIDIA H100 PCIe On | 00000000:41:00.0 Off | 0 |
71 | | N/A 43C P0 81W / 350W | 34743MiB / 81559MiB | 0% Default |
72 | | | | Disabled |
73 | +-----------------------------------------+----------------------+----------------------+
74 |
75 | +---------------------------------------------------------------------------------------+
76 | | Processes: |
77 | | GPU GI CI PID Type Process name GPU Memory |
78 | | ID ID Usage |
79 | |=======================================================================================|
80 | +---------------------------------------------------------------------------------------+
81 | ```
82 |
83 | If you encounter the following error
84 |
85 | ```bash
86 | I1129 08:28:33.267969 15088 model_lifecycle.cc:818] successfully loaded 'tensorrt_llm_bls'
87 | I1129 08:28:33.928915 15088 pb_stub.cc:325] Failed to initialize Python stub: ValueError: Tokenizer class BaichuanTokenizer does not exist or is not currently imported.
88 |
89 | At:
90 | /home/bhsueh/.local/lib/python3.10/site-packages/transformers/models/auto/tokenization_auto.py(748): from_pretrained
91 | /home/scratch.bhsueh_sw_1/workspace/TensorRT-LLM/tllm_backend_nvbug/baichuan_ifb/preprocessing/1/model.py(66): initialize
92 |
93 | I1129 08:28:33.928991 15088 pb_stub.cc:325] Failed to initialize Python stub: ValueError: Tokenizer class BaichuanTokenizer does not exist or is not currently imported.
94 |
95 | At:
96 | /home/bhsueh/.local/lib/python3.10/site-packages/transformers/models/auto/tokenization_auto.py(748): from_pretrained
97 | /home/scratch.bhsueh_sw_1/workspace/TensorRT-LLM/tllm_backend_nvbug/baichuan_ifb/postprocessing/1/model.py(65): initialize
98 |
99 | E1129 08:28:34.285773 15088 backend_model.cc:634] ERROR: Failed to create instance: ValueError: Tokenizer class BaichuanTokenizer does not exist or is not currently imported.
100 |
101 | At:
102 | /home/bhsueh/.local/lib/python3.10/site-packages/transformers/models/auto/tokenization_auto.py(748): from_pretrained
103 | /home/scratch.bhsueh_sw_1/workspace/TensorRT-LLM/tllm_backend_nvbug/baichuan_ifb/postprocessing/1/model.py(65): initialize
104 |
105 | E1129 08:28:34.285879 15088 model_lifecycle.cc:621] failed to load 'postprocessing' version 1: Internal: ValueError: Tokenizer class BaichuanTokenizer does not exist or is not currently imported.
106 |
107 | At:
108 | /home/bhsueh/.local/lib/python3.10/site-packages/transformers/models/auto/tokenization_auto.py(748): from_pretrained
109 | /home/scratch.bhsueh_sw_1/workspace/TensorRT-LLM/tllm_backend_nvbug/baichuan_ifb/postprocessing/1/model.py(65): initialize
110 |
111 | I1129 08:28:34.285894 15088 model_lifecycle.cc:756] failed to load 'postprocessing'
112 | E1129 08:28:34.304925 15088 backend_model.cc:634] ERROR: Failed to create instance: ValueError: Tokenizer class BaichuanTokenizer does not exist or is not currently imported.
113 |
114 | At:
115 | /home/bhsueh/.local/lib/python3.10/site-packages/transformers/models/auto/tokenization_auto.py(748): from_pretrained
116 | /home/scratch.bhsueh_sw_1/workspace/TensorRT-LLM/tllm_backend_nvbug/baichuan_ifb/preprocessing/1/model.py(66): initialize
117 |
118 | E1129 08:28:34.305028 15088 model_lifecycle.cc:621] failed to load 'preprocessing' version 1: Internal: ValueError: Tokenizer class BaichuanTokenizer does not exist or is not currently imported.
119 |
120 | At:
121 | /home/bhsueh/.local/lib/python3.10/site-packages/transformers/models/auto/tokenization_auto.py(748): from_pretrained
122 | /home/scratch.bhsueh_sw_1/workspace/TensorRT-LLM/tllm_backend_nvbug/baichuan_ifb/preprocessing/1/model.py(66): initialize
123 |
124 | I1129 08:28:34.305052 15088 model_lifecycle.cc:756] failed to load 'preprocessing'
125 | ```
126 |
127 | add `trust_remote_code=True` to the tokenizer initialization in the preprocessing and postprocessing models, as shown in the sketch below. For security reasons, it is not added by default.
128 |
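For reference, a minimal sketch of the change (the exact code in `baichuan_ifb/preprocessing/1/model.py` and `baichuan_ifb/postprocessing/1/model.py` may differ; the key point is the extra `trust_remote_code=True` argument when the tokenizer is created):

```python
# Sketch only: load the Baichuan tokenizer with remote code enabled.
# tokenizer_dir is the directory configured via the tokenizer_dir parameter
# in config.pbtxt (here, ${HF_BAICHUAN_MODEL}).
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir,
                                          trust_remote_code=True)
```

The same change applies to the tokenizer in `inflight_batcher_llm_client.py` when you use that client (see below).
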
129 | * Send request
130 |
131 | ```bash
132 | curl -X POST localhost:8000/v2/models/ensemble/generate -d '{"text_input": "What is machine learning?", "max_tokens": 20, "bad_words": "", "stop_words": "", "pad_id": 2, "end_id": 2}'
133 |
134 | {"cum_log_probs":0.0,"model_name":"ensemble","model_version":"1","output_log_probs":[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0],"sequence_end":false,"sequence_id":0,"sequence_start":false,"text_output":"\nMachine learning is a subset of artificial intelligence (AI) that focuses on the"}
135 | ```
136 |
137 | * Send request with bad_words and stop_words
138 |
139 | ```bash
140 | curl -X POST localhost:8000/v2/models/ensemble/generate -d '{"text_input": "What is machine learning?", "max_tokens": 20, "bad_words": ["intelligence","model"], "stop_words": ["focuses","learn"], "pad_id": 2, "end_id": 2}'
141 |
142 | {"cum_log_probs":0.0,"model_name":"ensemble","model_version":"1","output_log_probs":[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0],"sequence_end":false,"sequence_id":0,"sequence_start":false,"text_output":"\nMachine learning is a subset of artificial intelligent (AI) that focuses"}
143 | ```
144 |
145 | * Send request by `inflight_batcher_llm_client.py` (remember to also add `trust_remote_code=True` to the tokenizer in `inflight_batcher_llm_client.py`)
146 |
147 | ```bash
148 | python3 tensorrt_llm/triton_backend/inflight_batcher_llm/client/inflight_batcher_llm_client.py --request-output-len 200 --tokenizer-dir ${HF_BAICHUAN_MODEL}
149 |
150 | =========
151 | Input sequence: [16448, 677, 5611, 31136, 21309, 4746, 31125, 694, 1033, 653, 8808, 754, 650]
152 | Got completed request
153 | Input: Born in north-east France, Soyer trained as a
154 | Output beam 0: . He became the chef at the Reform Club, and later at the Vegetarian Restaurant, where he pioneered the use of vegetables in fine dining. He also wrote a number of books, including The London Art of Cookery (1858), The Modern Housekeeper (1861), and The Compleat Housekeeper (1862).
155 | Soyer was a strong supporter of the British National Rifle Association, and was a member of the organisation's council. He was also a member of the Reform Club, the Athenaeum, and the Rifle Club. He died in London in 1904.
156 | Soyer was born in the village of Montigny-lès-Cormeilles, in the department of Aisne, France. He was the son of a baker, and was educated in the
157 | Output sequence: [16814, 677, 5621, 1412, 4514, 678, 2835, 677, 31106, 53, 60, 57, 59, 79, 1057, 3142, 656, 16814, 772, 656, 15824, 4305, 31125, 680, 2384, 772, 656, 9592, 1161, 8480, 13550, 807, 31125, 1238, 742, 11135, 2521, 656, 1226, 679, 8431, 3392, 677, 4816, 8946, 79, 1057, 982, 4251, 650, 1697, 679, 3594, 31125, 1516, 776, 2835, 2409, 679, 7782, 1620, 762, 53, 60, 57, 60, 1098, 776, 8753, 2542, 17655, 762, 53, 60, 58, 53, 1098, 680, 776, 1127, 1596, 658, 2542, 17655, 762, 53, 60, 58, 54, 31145, 79, 5, 31131, 1033, 653, 796, 650, 2427, 23747, 679, 656, 3681, 2024, 751, 19422, 2790, 728, 31125, 680, 796, 650, 2736, 679, 656, 1625, 4859, 31155, 31114, 7284, 79, 1057, 796, 982, 650, 2736, 679, 656, 15824, 4305, 31125, 656, 1996, 1179, 4302, 784, 31125, 680, 656, 751, 19422, 4305, 79, 1057, 4357, 677, 2835, 677, 31106, 53, 61, 52, 56, 79, 5, 31131, 1033, 653, 796, 4204, 677, 656, 6730, 679, 5136, 942, 31124, 31136, 31115, 16987, 31136, 31133, 908, 31107, 22542, 31125, 677, 656, 1664, 2049, 679, 703, 667, 1024, 31125, 4746, 79, 1057, 796, 656, 3652, 679, 650, 675, 3034, 31125, 680, 796, 18735, 677, 656]
158 | ```
159 |
160 | * Run test on dataset
161 |
162 | ```bash
163 | python3 tensorrt_llm/triton_backend/inflight_batcher_llm/end_to_end_test.py --dataset ci/L0_backend_trtllm/simple_data.json --max-input-len 500
164 |
165 | [INFO] Start testing on 13 prompts.
166 | [INFO] Functionality test succeed.
167 | [INFO] Warm up for benchmarking.
168 | [INFO] Start benchmarking on 13 prompts.
169 | [INFO] Total Latency: 1598.328 ms
170 | ```
171 |
172 | * Run with decoupled mode (streaming)
173 |
174 | ```bash
175 | cp tensorrt_llm/triton_backend/all_models/inflight_batcher_llm/ baichuan_ifb -r
176 |
177 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i baichuan_ifb/preprocessing/config.pbtxt tokenizer_dir:${HF_BAICHUAN_MODEL},triton_max_batch_size:64,preprocessing_instance_count:1
178 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i baichuan_ifb/postprocessing/config.pbtxt tokenizer_dir:${HF_BAICHUAN_MODEL},triton_max_batch_size:64,postprocessing_instance_count:1
179 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i baichuan_ifb/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:True,bls_instance_count:1,accumulate_tokens:True,logits_datatype:TYPE_FP32
180 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i baichuan_ifb/ensemble/config.pbtxt triton_max_batch_size:64,logits_datatype:TYPE_FP32
181 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i baichuan_ifb/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:64,decoupled_mode:True,max_beam_width:1,engine_dir:/tmp/baichuan/13B/trt_engines/fp16/1-gpu/,max_tokens_in_paged_kv_cache:2560,max_attention_window_size:2560,kv_cache_free_gpu_mem_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0,encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32
182 |
183 | pip install SentencePiece
184 | # Add `trust_remote_code=True` to the tokenizer initialization in preprocessing and postprocessing (see above). For security reasons, it is not added by default.
185 | python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 1 --model_repo=baichuan_ifb/
186 |
187 | python3 tensorrt_llm/triton_backend/inflight_batcher_llm/client/inflight_batcher_llm_client.py --request-output-len 200 --tokenizer-dir ${HF_BAICHUAN_MODEL} --streaming
188 | ```
189 |
190 |
191 | The result should look like the following:
192 |
193 |
194 | ```bash
195 | =========
196 | Input sequence: [16448, 677, 5611, 31136, 21309, 4746, 31125, 694, 1033, 653, 8808, 754, 650]
197 | [16814]
198 | [677]
199 | [5621]
200 | [1412]
201 | [4514]
202 | [678]
203 | [2835]
204 | [677]
205 | [31106]
206 | [53]
207 | [60]
208 | [57]
209 | [59]
210 | [79]
211 | [1057]
212 | [3142]
213 | [656]
214 | [16814]
215 | [772]
216 | [656]
217 | [15824]
218 | [4305]
219 | [31125]
220 | [680]
221 | [2384]
222 | [772]
223 | [656]
224 | [9592]
225 | [1161]
226 | [8480]
227 | [13550]
228 | [807]
229 | [31125]
230 | [1238]
231 | [742]
232 | [11135]
233 | [2521]
234 | [656]
235 | [1226]
236 | [679]
237 | [8431]
238 | [3392]
239 | [677]
240 | [4816]
241 | [8946]
242 | [79]
243 | [1057]
244 | [982]
245 | [4251]
246 | [650]
247 | [1697]
248 | [679]
249 | [3594]
250 | [31125]
251 | [1516]
252 | [776]
253 | [2835]
254 | [2409]
255 | [679]
256 | [7782]
257 | [1620]
258 | [762]
259 | [53]
260 | [60]
261 | [57]
262 | [60]
263 | [1098]
264 | [776]
265 | [8753]
266 | [2542]
267 | [17655]
268 | [762]
269 | [53]
270 | [60]
271 | [58]
272 | [53]
273 | [1098]
274 | [680]
275 | [776]
276 | [1127]
277 | [1596]
278 | [658]
279 | [2542]
280 | [17655]
281 | [762]
282 | [53]
283 | [60]
284 | [58]
285 | [54]
286 | [31145]
287 | [79]
288 | [5]
289 | [31131]
290 | [1033]
291 | [653]
292 | [796]
293 | [650]
294 | [2427]
295 | [23747]
296 | [679]
297 | [656]
298 | [3681]
299 | [2024]
300 | [751]
301 | [19422]
302 | [2790]
303 | [728]
304 | [31125]
305 | [680]
306 | [796]
307 | [650]
308 | [2736]
309 | [679]
310 | [656]
311 | [1625]
312 | [4859]
313 | [31155]
314 | [31114]
315 | [7284]
316 | [79]
317 | [1057]
318 | [796]
319 | [982]
320 | [650]
321 | [2736]
322 | [679]
323 | [656]
324 | [15824]
325 | [4305]
326 | [31125]
327 | [656]
328 | [1996]
329 | [1179]
330 | [4302]
331 | [784]
332 | [31125]
333 | [680]
334 | [656]
335 | [751]
336 | [19422]
337 | [4305]
338 | [79]
339 | [1057]
340 | [4357]
341 | [677]
342 | [2835]
343 | [677]
344 | [31106]
345 | [53]
346 | [61]
347 | [52]
348 | [56]
349 | [79]
350 | [5]
351 | [31131]
352 | [1033]
353 | [653]
354 | [796]
355 | [4204]
356 | [677]
357 | [656]
358 | [6730]
359 | [679]
360 | [5136]
361 | [942]
362 | [31124]
363 | [31136]
364 | [31115]
365 | [16987]
366 | [31136]
367 | [31133]
368 | [908]
369 | [31107]
370 | [22542]
371 | [31125]
372 | [677]
373 | [656]
374 | [1664]
375 | [2049]
376 | [679]
377 | [703]
378 | [667]
379 | [1024]
380 | [31125]
381 | [4746]
382 | [79]
383 | [1057]
384 | [796]
385 | [656]
386 | [3652]
387 | [679]
388 | [650]
389 | [675]
390 | [3034]
391 | [31125]
392 | [680]
393 | [796]
394 | [18735]
395 | [677]
396 | [656]
397 | Input: Born in north-east France, Soyer trained as a
398 | Output beam 0: chef in Paris before moving to London in 1857. He became the chef at the Reform Club, and later at the Vegetarian Restaurant, where he pioneered the use of vegetables in fine dining. He also wrote a number of books, including The London Art of Cookery (1858), The Modern Housekeeper (1861), and The Compleat Housekeeper (1862).
399 | Soyer was a strong supporter of the British National Rifle Association, and was a member of the organisation's council. He was also a member of the Reform Club, the Athenaeum, and the Rifle Club. He died in London in 1904.
400 | Soyer was born in the village of Montigny-lès-Cormeilles, in the department of Aisne, France. He was the son of a baker, and was educated in the
401 | Output sequence: [16448, 677, 5611, 31136, 21309, 4746, 31125, 694, 1033, 653, 8808, 754, 650, 16814, 677, 5621, 1412, 4514, 678, 2835, 677, 31106, 53, 60, 57, 59, 79, 1057, 3142, 656, 16814, 772, 656, 15824, 4305, 31125, 680, 2384, 772, 656, 9592, 1161, 8480, 13550, 807, 31125, 1238, 742, 11135, 2521, 656, 1226, 679, 8431, 3392, 677, 4816, 8946, 79, 1057, 982, 4251, 650, 1697, 679, 3594, 31125, 1516, 776, 2835, 2409, 679, 7782, 1620, 762, 53, 60, 57, 60, 1098, 776, 8753, 2542, 17655, 762, 53, 60, 58, 53, 1098, 680, 776, 1127, 1596, 658, 2542, 17655, 762, 53, 60, 58, 54, 31145, 79, 5, 31131, 1033, 653, 796, 650, 2427, 23747, 679, 656, 3681, 2024, 751, 19422, 2790, 728, 31125, 680, 796, 650, 2736, 679, 656, 1625, 4859, 31155, 31114, 7284, 79, 1057, 796, 982, 650, 2736, 679, 656, 15824, 4305, 31125, 656, 1996, 1179, 4302, 784, 31125, 680, 656, 751, 19422, 4305, 79, 1057, 4357, 677, 2835, 677, 31106, 53, 61, 52, 56, 79, 5, 31131, 1033, 653, 796, 4204, 677, 656, 6730, 679, 5136, 942, 31124, 31136, 31115, 16987, 31136, 31133, 908, 31107, 22542, 31125, 677, 656, 1664, 2049, 679, 703, 667, 1024, 31125, 4746, 79, 1057, 796, 656, 3652, 679, 650, 675, 3034, 31125, 680, 796, 18735, 677, 656]
402 | ```
403 |
404 |
405 |
406 |
407 | * Run several requests at the same time
408 |
409 | ```bash
410 | echo '{"text_input": "What is machine learning?", "max_tokens": 20, "bad_words": "", "stop_words": "", "pad_id": 2, "end_id": 2}' > tmp.txt
411 | printf '%s\n' {1..20} | xargs -I % -P 20 curl -X POST localhost:8000/v2/models/ensemble/generate -d @tmp.txt
412 | ```
413 |
--------------------------------------------------------------------------------
/docs/build.md:
--------------------------------------------------------------------------------
1 | # Building from Source
2 |
3 | This document describes how to build the TensorRT-LLM backend and the Triton
4 | TRT-LLM container from source. The Triton container includes TensorRT-LLM,
5 | along with the TensorRT-LLM backend and the Python backend.
6 |
7 | ## Build the TensorRT-LLM Backend from source
8 |
9 | Make sure TensorRT-LLM is installed before building the backend. Since the
10 | versions of TensorRT-LLM and the TensorRT-LLM backend must be aligned, it is
11 | recommended to use the Triton TRT-LLM container from NGC directly, or to build the
12 | whole container from source as described below in the Build the Docker Container
13 | section.
14 |
15 | ```bash
16 | cd tensorrt_llm/triton_backend/inflight_batcher_llm
17 | bash scripts/build.sh
18 | ```
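To sanity-check the version alignment mentioned above, you can print the installed TensorRT-LLM version before building (a minimal check, assuming TensorRT-LLM is installed as the `tensorrt_llm` Python package):

```bash
# Print the installed TensorRT-LLM version and compare it against the backend version you are building.
python3 -c "import tensorrt_llm; print(tensorrt_llm.__version__)"
```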
19 |
20 | ## Build the Docker Container
21 |
22 | > [!CAUTION]
23 | > [build.sh](../build.sh) is currently not working and will be fixed in the next weekly update.
24 |
25 | #### Build via Docker
26 |
27 | You can build the container using the instructions in the [TensorRT-LLM Docker Build](../tensorrt_llm/docker/README.md)
28 | with `tritonrelease` stage.
29 |
30 | ```bash
31 | cd tensorrt_llm/
32 | make -C docker tritonrelease_build
33 | ```
34 |
--------------------------------------------------------------------------------
/docs/encoder_decoder.md:
--------------------------------------------------------------------------------
1 | # End to end workflow to run an Encoder-Decoder model
2 |
3 | ### Support Matrix
4 | For the specific models supported by the encoder-decoder family, please visit [TensorRT-LLM encoder-decoder examples](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/enc_dec#encoder-decoder-model-support). The following two model types are supported:
5 | * T5
6 | * BART
7 |
8 | ## Run Encoder-Decoder with Tritonserver
9 | ### Tritonserver setup steps
10 |
11 | #### 1. Make sure that you have initialized the TRT-LLM submodule:
12 |
13 | ```
14 | git clone https://github.com/triton-inference-server/tensorrtllm_backend.git && cd tensorrtllm_backend
15 | git lfs install
16 | git submodule update --init --recursive
17 | ```
18 |
19 | #### 2. Start the Triton Server Docker container within `tensorrtllm_backend` repo:
20 |
21 | If you're using [Triton TRT-LLM NGC container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/tritonserver/tags)
22 |
23 | ```
24 | # Replace the image tag with the version of Triton you want to use; here we use 24.08.
25 | # The commands below assume that the current directory is the
26 | # TRT-LLM backend root git repository.
27 |
28 | docker run -it --gpus all --ipc=host --ulimit memlock=-1 --shm-size=20g -v `pwd`:/workspace -w /workspace nvcr.io/nvidia/tritonserver:24.08-trtllm-python-py3 bash
29 | ```
30 |
31 | If [building your own TensorRT-LLM Backend container](https://github.com/triton-inference-server/tensorrtllm_backend#option-2-build-via-docker) then you can run the `tensorrtllm_backend` container:
32 |
33 | ```
34 | docker run -it --gpus all --ipc=host --ulimit memlock=-1 --shm-size=20g -v `pwd`:/workspace -w /workspace triton_trt_llm bash
35 | ```
36 |
37 | #### 3. Build the engines:
38 |
39 | Clone the target model repository from HuggingFace. Here we use the [T5-small model](https://huggingface.co/google-t5/t5-small) as an example, but you can also follow the same steps for a BART model.
40 |
41 |
42 | git lfs install
43 | git clone https://huggingface.co/google-t5/t5-small /workspace/hf_models/t5-small
44 |
45 |
46 | Build TensorRT-LLM engines.
47 |
48 | ```
49 | export MODEL_NAME=t5-small # or bart-base
50 | export MODEL_TYPE=t5 # or bart
51 | export HF_MODEL_PATH=/workspace/hf_models/${MODEL_NAME}
52 | export UNIFIED_CKPT_PATH=/workspace/ckpt/${MODEL_NAME}
53 | export ENGINE_PATH=/workspace/engines/${MODEL_NAME}
54 | export INFERENCE_PRECISION=float16
55 | export TP_SIZE=1
56 | export MAX_BEAM_WIDTH=1
57 | export MAX_BATCH_SIZE=8
58 | export INPUT_LEN=1024
59 | export OUTPUT_LEN=201
60 |
61 | python3 tensorrt_llm/examples/models/core/enc_dec/convert_checkpoint.py \
62 | --model_type ${MODEL_TYPE} \
63 | --model_dir ${HF_MODEL_PATH} \
64 | --output_dir ${UNIFIED_CKPT_PATH} \
65 | --dtype ${INFERENCE_PRECISION} \
66 | --tp_size ${TP_SIZE}
67 |
68 | trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH}/encoder \
69 | --output_dir ${ENGINE_PATH}/encoder \
70 | --kv_cache_type disabled \
71 | --moe_plugin disable \
72 | --max_beam_width ${MAX_BEAM_WIDTH} \
73 | --max_input_len ${INPUT_LEN} \
74 | --max_batch_size ${MAX_BATCH_SIZE} \
75 | --gemm_plugin ${INFERENCE_PRECISION} \
76 | --bert_attention_plugin ${INFERENCE_PRECISION} \
77 | --gpt_attention_plugin ${INFERENCE_PRECISION} \
78 | --context_fmha disable # remove for BART
79 |
80 | trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH}/decoder \
81 | --output_dir ${ENGINE_PATH}/decoder \
82 | --moe_plugin disable \
83 | --max_beam_width ${MAX_BEAM_WIDTH} \
84 | --max_batch_size ${MAX_BATCH_SIZE} \
85 | --gemm_plugin ${INFERENCE_PRECISION} \
86 | --bert_attention_plugin ${INFERENCE_PRECISION} \
87 | --gpt_attention_plugin ${INFERENCE_PRECISION} \
88 | --max_input_len 1 \
89 | --max_encoder_input_len ${INPUT_LEN} \
90 | --max_seq_len ${OUTPUT_LEN} \
91 | --context_fmha disable # remove for BART
92 | ```
93 |
94 | > **NOTE**
95 | >
96 | > If you want to build a multi-GPU engine using tensor parallelism, set `--tp_size` in convert_checkpoint.py. For example, for TP=2 on 2 GPUs, set `--tp_size=2`. If you want to use beam search, set `--max_beam_width` to a value higher than 1. The `--max_input_len` in the encoder trtllm-build controls the model input length and should be the same as `--max_encoder_input_len` in the decoder trtllm-build. Additionally, to control the model output length, set `--max_seq_len` in the decoder trtllm-build to `desired output length + 1`. It is also advisable to tune [`--max_num_tokens`](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/performance/perf-best-practices.md#max_num_tokens), as the default value of 8192 might be too large or too small depending on your input/output lengths and use case. For BART family models, make sure to remove `--context_fmha disable` from both the encoder and decoder trtllm-build commands. Please refer to the [TensorRT-LLM enc-dec example](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/enc_dec#build-tensorrt-engines) for more details.
97 |
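For example, to rebuild the engines above with beam search enabled and a 200-token output budget, only the exported values need to change before rerunning the two `trtllm-build` commands (illustrative values, not tuned):

```
export MAX_BEAM_WIDTH=4   # values > 1 enable beam search in both engines
export OUTPUT_LEN=201     # desired output length (200) + 1, used for --max_seq_len
```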
98 | #### 4. Prepare Tritonserver configs
99 |
100 | ```
101 | cp tensorrt_llm/triton_backend/all_models/inflight_batcher_llm/ enc_dec_ifb -r
102 |
103 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i enc_dec_ifb/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:${MAX_BATCH_SIZE},decoupled_mode:False,max_beam_width:${MAX_BEAM_WIDTH},engine_dir:${ENGINE_PATH}/decoder,encoder_engine_dir:${ENGINE_PATH}/encoder,kv_cache_free_gpu_mem_fraction:0.8,cross_kv_cache_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0,enable_chunked_context:False,max_queue_size:0,encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32
104 |
105 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i enc_dec_ifb/preprocessing/config.pbtxt tokenizer_dir:${HF_MODEL_PATH},triton_max_batch_size:${MAX_BATCH_SIZE},preprocessing_instance_count:1
106 |
107 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i enc_dec_ifb/postprocessing/config.pbtxt tokenizer_dir:${HF_MODEL_PATH},triton_max_batch_size:${MAX_BATCH_SIZE},postprocessing_instance_count:1
108 |
109 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i enc_dec_ifb/ensemble/config.pbtxt triton_max_batch_size:${MAX_BATCH_SIZE},logits_datatype:TYPE_FP32
110 |
111 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i enc_dec_ifb/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:${MAX_BATCH_SIZE},decoupled_mode:False,bls_instance_count:1,accumulate_tokens:False,logits_datatype:TYPE_FP32
112 |
113 | ```
114 |
115 | > **NOTE**
116 | >
117 | > Currently, encoder-decoder models don't support running with chunked context.
118 |
119 | #### 5. Launch Tritonserver
120 |
121 | ```
122 | python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 1 --model_repo=enc_dec_ifb/
123 | ```
124 |
125 | ### Send requests
126 | #### 1. Send request with CURL
127 |
128 | ```
129 | curl -X POST localhost:8000/v2/models/ensemble/generate -d "{\"text_input\": \"Summarize the following news article: (CNN)Following last year's successful U.K. tour, Prince and 3rdEyeGirl are bringing the Hit & Run Tour to the U.S. for the first time. The first -- and so far only -- scheduled show will take place in Louisville, Kentucky, the hometown of 3rdEyeGirl drummer Hannah Welton. Slated for March 14, tickets will go on sale Monday, March 9 at 10 a.m. local time. Prince crowns dual rock charts . A venue has yet to be announced. When the Hit & Run worked its way through the U.K. in 2014, concert venues were revealed via Twitter prior to each show. Portions of the ticket sales will be donated to various Louisville charities. See the original story at Billboard.com. ©2015 Billboard. All Rights Reserved.\", \"max_tokens\": 1024, \"bad_words\": \"\", \"stop_words\": \"\"}"
130 |
131 | {"context_logits":0.0,"cum_log_probs":0.0,"generation_logits":0.0,"model_name":"ensemble","model_version":"1","output_log_probs":0.0,"sequence_end":false,"sequence_id":0,"sequence_start":false,"text_output":":::: (CNN): (CNN): (CNN) the Hit & Run Tour to the U.S. for the first time. the Hit & Run Tour will take place in Louisville, Kentucky, the hometown of 3rdEyeGirl drummer Hannah Welton. Tickets will go on sale Monday, March 9 at 10 a.m. local time."}
132 | ```
133 |
134 | #### 2. Send request with `bad_words` and `stop_words`
135 |
136 | After applying the `stop_words` and `bad_words`, the output avoids the bad words and stops at the first generated stop word.
137 |
138 | ```
139 | curl -X POST localhost:8000/v2/models/ensemble/generate -d "{\"text_input\": \"Summarize the following news article: (CNN)Following last year's successful U.K. tour, Prince and 3rdEyeGirl are bringing the Hit & Run Tour to the U.S. for the first time. The first -- and so far only -- scheduled show will take place in Louisville, Kentucky, the hometown of 3rdEyeGirl drummer Hannah Welton. Slated for March 14, tickets will go on sale Monday, March 9 at 10 a.m. local time. Prince crowns dual rock charts . A venue has yet to be announced. When the Hit & Run worked its way through the U.K. in 2014, concert venues were revealed via Twitter prior to each show. Portions of the ticket sales will be donated to various Louisville charities. See the original story at Billboard.com. ©2015 Billboard. All Rights Reserved.\", \"max_tokens\": 1024, \"bad_words\": [\"drummer\", \"hometown\"], \"stop_words\": [\"Tickets\", \"sale\"]}"
140 |
141 | {"context_logits":0.0,"cum_log_probs":0.0,"generation_logits":0.0,"model_name":"ensemble","model_version":"1","output_log_probs":0.0,"sequence_end":false,"sequence_id":0,"sequence_start":false,"text_output":":::: (CNN): (CNN): (CNN) the Hit & Run Tour to the U.S. for the first time. the Hit & Run Tour will take place in Louisville, Kentucky, the home of 3rdEyeGirl's Hannah Welton."}
142 | ```
143 |
144 | #### 3. Send request with `inflight_batcher_llm_client.py`
145 | If not already installed, install `tritonclient`
146 |
147 | ```
148 | pip install tritonclient[all]
149 | python3 tensorrt_llm/triton_backend/inflight_batcher_llm/client/inflight_batcher_llm_client.py --text "translate English to German: This is good" --request-output-len 200 --exclude-input-in-output --tokenizer-dir ${HF_MODEL_PATH} --beam-width ${MAX_BEAM_WIDTH}
150 |
151 | ========
152 | Using pad_id: 0
153 | Using end_id: 1
154 | Input sequence: [13959, 1566, 12, 2968, 10, 100, 19, 207, 1]
155 | [TensorRT-LLM][WARNING] decoder_input_ids is not present in the request for encoder-decoder model. The decoder input tokens will be set to [padId]
156 | Got completed request
157 | Input: translate English to German: This is good
158 | Output beam 0: Das is gut.
159 | Output sequence: [644, 229, 1806, 5]
160 | ```
161 |
162 | > **NOTE**
163 | >
164 | > Please ignore any exception thrown with the output. It's a known issue to be fixed.
165 |
166 | #### 4. Run test on dataset
167 |
168 | ```
169 | python3 tensorrt_llm/triton_backend/inflight_batcher_llm/end_to_end_test.py --dataset tensorrt_llm/triton_backend/ci/L0_backend_trtllm/simple_data.json --max-input-len 500
170 |
171 | [INFO] Start testing on 13 prompts.
172 | [INFO] Functionality test succeed.
173 | [INFO] Warm up for benchmarking.
174 | [INFO] Start benchmarking on 13 prompts.
175 | [INFO] Total Latency: 155.756 ms
176 | ```
177 |
178 | #### 5. Run several requests at the same time
179 |
180 | ```
181 | echo "{\"text_input\": \"Summarize the following news article: (CNN)Following last year's successful U.K. tour, Prince and 3rdEyeGirl are bringing the Hit & Run Tour to the U.S. for the first time. The first -- and so far only -- scheduled show will take place in Louisville, Kentucky, the hometown of 3rdEyeGirl drummer Hannah Welton. Slated for March 14, tickets will go on sale Monday, March 9 at 10 a.m. local time. Prince crowns dual rock charts . A venue has yet to be announced. When the Hit & Run worked its way through the U.K. in 2014, concert venues were revealed via Twitter prior to each show. Portions of the ticket sales will be donated to various Louisville charities. See the original story at Billboard.com. ©2015 Billboard. All Rights Reserved.\", \"max_tokens\": 1024, \"bad_words\": [\"drummer\", \"hometown\"], \"stop_words\": [\"Tickets\", \"sale\"]}" > tmp.txt
182 |
183 | printf '%s\n' {1..20} | xargs -I % -P 20 curl -X POST localhost:8000/v2/models/ensemble/generate -d @tmp.txt
184 | ```
185 | #### 6. Evaluating performance with Gen-AI Perf
186 |
187 | Gen-AI Perf is a command line tool for measuring the throughput and latency of generative AI models as served through an inference server. You can read more about installing Gen-AI Perf [here](https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/perf_analyzer/genai-perf/README.html#installation).
188 |
189 | To use Gen-AI Perf, run the following command:
190 |
191 | ```
192 | genai-perf profile \
193 | -m ensemble \
194 | --service-kind triton \
195 | --backend tensorrtllm \
196 | --num-prompts 100 \
197 | --random-seed 123 \
198 | --synthetic-input-tokens-mean 200 \
199 | --synthetic-input-tokens-stddev 0 \
200 | --output-tokens-mean 100 \
201 | --output-tokens-stddev 0 \
202 | --tokenizer ${HF_MODEL_PATH} \
203 | --concurrency 1 \
204 | --measurement-interval 4000 \
205 | --profile-export-file my_profile_export.json \
206 | --url localhost:8001
207 | ```
208 |
209 | You should expect an output that looks like this (the output below was obtained on A100-80GB with TRT-LLM v0.12):
210 |
211 | ```
LLM Metrics
212 | ┏━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┓
213 | ┃ Statistic ┃ avg ┃ min ┃ max ┃ p99 ┃ p90 ┃ p75 ┃
214 | ┡━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━┩
215 | │ Request latency (ms) │ 80.92 │ 78.84 │ 323.55 │ 85.14 │ 79.90 │ 79.64 │
216 | │ Output sequence length │ 95.83 │ 65.00 │ 100.00 │ 100.00 │ 99.00 │ 98.00 │
217 | │ Input sequence length │ 200.01 │ 200.00 │ 201.00 │ 200.00 │ 200.00 │ 200.00 │
218 | └────────────────────────┴────────┴────────┴────────┴────────┴────────┴────────┘
219 | Output token throughput (per sec): 1182.70
220 | Request throughput (per sec): 12.34
221 | ```
222 |
223 | #### 7. Run with decoupled mode (streaming)
224 |
225 | To enable streaming, set `decoupled_mode:True` in the config.pbtxt of the `tensorrt_llm` model, and of the `tensorrt_llm_bls` model if you are using BLS instead of the ensemble.
226 |
227 | ```
228 | cp tensorrt_llm/triton_backend/all_models/inflight_batcher_llm/ enc_dec_ifb -r
229 |
230 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i enc_dec_ifb/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:${MAX_BATCH_SIZE},decoupled_mode:True,max_beam_width:${MAX_BEAM_WIDTH},engine_dir:${ENGINE_PATH}/decoder,encoder_engine_dir:${ENGINE_PATH}/encoder,kv_cache_free_gpu_mem_fraction:0.8,cross_kv_cache_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0,enable_chunked_context:False,max_queue_size:0,encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32
231 |
232 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i enc_dec_ifb/preprocessing/config.pbtxt tokenizer_dir:${HF_MODEL_PATH},triton_max_batch_size:${MAX_BATCH_SIZE},preprocessing_instance_count:1
233 |
234 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i enc_dec_ifb/postprocessing/config.pbtxt tokenizer_dir:${HF_MODEL_PATH},triton_max_batch_size:${MAX_BATCH_SIZE},postprocessing_instance_count:1
235 |
236 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i enc_dec_ifb/ensemble/config.pbtxt triton_max_batch_size:${MAX_BATCH_SIZE},logits_datatype:TYPE_FP32
237 |
238 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i enc_dec_ifb/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:${MAX_BATCH_SIZE},decoupled_mode:True,bls_instance_count:1,accumulate_tokens:False,logits_datatype:TYPE_FP32
239 |
240 | ```
241 |
242 | We launch Tritonserver
243 |
244 | ```
245 | python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 1 --model_repo=enc_dec_ifb/
246 | ```
247 |
248 | Then send a request with `inflight_batcher_llm_client.py`:
249 |
250 | ```
251 | pip install tritonclient[all]
252 | python3 tensorrt_llm/triton_backend/inflight_batcher_llm/client/inflight_batcher_llm_client.py --text "translate English to German: This is good" --request-output-len 200 --exclude-input-in-output --tokenizer-dir ${HF_MODEL_PATH} --beam-width ${MAX_BEAM_WIDTH} --streaming
253 | ```
254 |
255 | To use Gen-AI Perf to benchmark streaming/decoupled mode, run the following command:
256 |
257 | ```
258 | genai-perf profile \
259 | -m ensemble \
260 | --service-kind triton \
261 | --backend tensorrtllm \
262 | --num-prompts 100 \
263 | --random-seed 123 \
264 | --synthetic-input-tokens-mean 200 \
265 | --synthetic-input-tokens-stddev 0 \
266 | --streaming \
267 | --output-tokens-mean 100 \
268 | --output-tokens-stddev 0 \
269 | --tokenizer ${HF_MODEL_PATH} \
270 | --concurrency 1 \
271 | --measurement-interval 4000 \
272 | --profile-export-file my_profile_export.json \
273 | --url localhost:8001
274 | ```
275 |
276 | You should see output like this (the output below was obtained on A100-80GB with TRT-LLM v0.12):
277 |
278 | ```
279 | LLM Metrics
280 | ┏━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┳━━━━━━━━┓
281 | ┃ Statistic ┃ avg ┃ min ┃ max ┃ p99 ┃ p90 ┃ p75 ┃
282 | ┡━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━╇━━━━━━━━┩
283 | │ Time to first token (ms) │ 4.69 │ 3.99 │ 14.05 │ 5.70 │ 5.04 │ 4.76 │
284 | │ Inter token latency (ms) │ 0.63 │ 0.38 │ 1.04 │ 0.98 │ 0.70 │ 0.66 │
285 | │ Request latency (ms) │ 75.32 │ 46.34 │ 114.27 │ 90.35 │ 79.27 │ 79.11 │
286 | │ Output sequence length │ 116.50 │ 58.00 │ 197.00 │ 197.00 │ 132.00 │ 128.00 │
287 | │ Input sequence length │ 200.01 │ 200.00 │ 201.00 │ 200.10 │ 200.00 │ 200.00 │
288 | └──────────────────────────┴────────┴────────┴────────┴────────┴────────┴────────┘
289 | Output token throughput (per sec): 1542.81
290 | Request throughput (per sec): 13.24
291 | ```
292 |
293 | ## Running multiple instances of encoder-decoder model on multiple GPUs
294 |
295 | In this section, we demonstrate how you can use
296 | [Leader Mode](../README.md#leader-mode) for running multiple instances of an encoder-decoder model on different GPUs.
297 |
298 | For this section, let's assume that we have four GPUs and the CUDA device ids
299 | are 0, 1, 2, and 3. We will be launching two instances of the T5-small model
300 | with tensor parallelism 2 (TP=2). The first instance will run on GPUs 0 and 1
301 | and the second instance will run on GPUs 2 and 3. We will launch two separate `mpirun` commands to start two separate Triton servers, one per model instance (each `mpirun` spawns one rank per GPU, so four Triton Server processes in total). We also need to use a reverse proxy in front of them to load balance the requests between the servers.
302 |
303 | [Orchestrator Mode](../README.md#orchestrator-mode) is currently not supported.
304 |
305 |
306 | ### Triton setup steps
307 | 1. Build the engines, adding `--tp_size 2` when converting the checkpoint. The rest of the steps are the same as the
308 | [Tritonserver setup steps](#tritonserver-setup-steps).
309 |
310 | ```
311 | export MODEL_NAME=t5-small
312 | export MODEL_TYPE=t5 # or bart
313 | export HF_MODEL_PATH=/workspace/hf_models/${MODEL_NAME}
314 | export UNIFIED_CKPT_PATH=/workspace/ckpt/${MODEL_NAME}-2tp-2gpu
315 | export ENGINE_PATH=/workspace/engines/${MODEL_NAME}-2tp-2gpu
316 |
317 | python tensorrt_llm/examples/models/core/enc_dec/convert_checkpoint.py \
318 | --model_type ${MODEL_TYPE} \
319 | --model_dir ${HF_MODEL_PATH} \
320 | --output_dir ${UNIFIED_CKPT_PATH} \
321 | --dtype float16 \
322 | --tp_size 2
323 |
324 | trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH}/encoder \
325 | --output_dir ${ENGINE_PATH}/encoder \
326 | --kv_cache_type disabled \
327 | --moe_plugin disable \
328 | --max_batch_size 64 \
329 | --gemm_plugin float16 \
330 | --bert_attention_plugin float16 \
331 | --gpt_attention_plugin float16 \
332 | --max_input_len 2048 \
333 | --context_fmha disable
334 |
335 | trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH}/decoder \
336 | --output_dir ${ENGINE_PATH}/decoder \
337 | --moe_plugin disable \
338 | --max_batch_size 64 \
339 | --gemm_plugin float16 \
340 | --bert_attention_plugin float16 \
341 | --gpt_attention_plugin float16 \
342 | --context_fmha disable \
343 | --max_input_len 1 \
344 | --max_encoder_input_len 2048
345 | ```
346 |
347 | 2. Set up the Tritonserver configs with the same commands as in [step 4](#prepare-tritonserver-configs) above.
348 |
349 | 3. Launch the servers:
350 |
351 | ```
352 | CUDA_VISIBLE_DEVICES=0,1 python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 2 --model_repo=enc_dec_ifb/ --http_port 8000 --grpc_port 8001 --metrics_port 8004
353 | CUDA_VISIBLE_DEVICES=2,3 python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 2 --model_repo=enc_dec_ifb/ --http_port 8002 --grpc_port 8003 --metrics_port 8005
354 | ```
355 |
356 | 4. Install NGINX:
357 |
358 | ```
359 | apt update
360 | apt install nginx -y
361 | ```
362 |
363 | 5. Setup the NGINX configuration and store it in `/etc/nginx/sites-available/tritonserver`:
364 |
365 | ```
366 | upstream tritonserver {
367 | server localhost:8000;
368 | server localhost:8002;
369 | }
370 |
371 | server {
372 | listen 8080;
373 |
374 | location / {
375 | proxy_pass http://tritonserver;
376 | }
377 | }
378 | ```
379 |
380 | 6. Create a symlink and restart NGINX to enable the configuration:
381 |
382 | ```
383 | ln -s /etc/nginx/sites-available/tritonserver /etc/nginx/sites-enabled/tritonserver
384 | service nginx restart
385 | ```
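To quickly verify that NGINX is forwarding requests to the Triton servers, you can query Triton's readiness endpoint through the proxy (this assumes the default `/v2/health/ready` HTTP endpoint exposed by Triton):

```
# A 200 status code means the proxied Triton server is ready to accept requests.
curl -s -o /dev/null -w "%{http_code}\n" localhost:8080/v2/health/ready
```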
386 |
387 | ### Send the request
388 |
389 | 1. Run test on dataset
390 |
391 | ```
392 | # Test the load on all the servers
393 | python3 tensorrt_llm/triton_backend/inflight_batcher_llm/end_to_end_test.py --dataset tensorrt_llm/triton_backend/ci/L0_backend_trtllm/simple_data.json --max-input-len 500 -u localhost:8080
394 |
395 | # Test the load on one of the servers
396 | python3 tensorrt_llm/triton_backend/inflight_batcher_llm/end_to_end_test.py --dataset tensorrt_llm/triton_backend/ci/L0_backend_trtllm/simple_data.json --max-input-len 500 -u localhost:8000
397 | ```
398 |
399 | ### Kill the server
400 | ```
401 | pgrep mpirun | xargs kill
402 | ```
403 |
--------------------------------------------------------------------------------
/docs/gemma.md:
--------------------------------------------------------------------------------
1 | ## End to end workflow to run a Gemma (SentencePiece) model
2 |
3 | * Build engine
4 |
5 | Assume the tokenizer model and the engine are located at the paths assigned to `TOKENIZER_DIR` and `ENGINE_PATH` below.
6 |
7 | ```bash
8 | TOKENIZER_DIR=/tmp/models/gemma_nv/checkpoints/tmp_vocab.model
9 | ENGINE_PATH=/tmp/gemma/2B/bf16/1-gpu/
10 | ```
11 |
12 | * Prepare configs
13 |
14 | Note that we use the `tokenizer_type=sp` (SentencePiece) tokenizer.
15 |
16 | ```bash
17 | cp tensorrt_llm/triton_backend/all_models/inflight_batcher_llm/ gemma -r
18 |
19 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i gemma/preprocessing/config.pbtxt tokenizer_dir:${TOKENIZER_DIR},tokenizer_type:sp,triton_max_batch_size:64,preprocessing_instance_count:1,add_special_tokens:True
20 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i gemma/postprocessing/config.pbtxt tokenizer_dir:${TOKENIZER_DIR},tokenizer_type:sp,triton_max_batch_size:64,postprocessing_instance_count:1
21 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i gemma/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:False,bls_instance_count:1,accumulate_tokens:False,logits_datatype:TYPE_FP32
22 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i gemma/ensemble/config.pbtxt triton_max_batch_size:64,logits_datatype:TYPE_FP32
23 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i gemma/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:64,decoupled_mode:False,max_beam_width:1,engine_dir:${ENGINE_PATH},max_tokens_in_paged_kv_cache:2560,max_attention_window_size:2560,kv_cache_free_gpu_mem_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0,batch_scheduler_policy:guaranteed_no_evict,encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32
24 |
25 | ```
26 |
27 | * Launch server
28 |
29 | ```bash
30 | python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 1 --model_repo=gemma/
31 | ```
32 |
33 |
34 | * Send request
35 |
36 | ```bash
37 | curl -X POST localhost:8000/v2/models/ensemble/generate -d '{"text_input": "What is machine learning?", "max_tokens": 20, "bad_words": "", "stop_words": ""}'
38 |
39 | {"context_logits":0.0,"cum_log_probs":0.0,"generation_logits":0.0,"model_name":"ensemble","model_version":"1","output_log_probs":[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0],"sequence_end":false,"sequence_id":0,"sequence_start":false,"text_output":"\n\nMachine learning is a branch of artificial intelligence that allows computers to learn from data without being explicitly programmed"}
40 | ```
41 |
42 | * Send request with bad_words and stop_words
43 |
44 | ```bash
45 | curl -X POST localhost:8000/v2/models/ensemble/generate -d '{"text_input": "What is machine learning?", "max_tokens": 20, "bad_words": [" intelligence", " allows"], "stop_words": [" computers", "learn"]}'
46 |
47 | {"context_logits":0.0,"cum_log_probs":0.0,"generation_logits":0.0,"model_name":"ensemble","model_version":"1","output_log_probs":[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0],"sequence_end":false,"sequence_id":0,"sequence_start":false,"text_output":"\n\nMachine learning is a branch of artificial intelligent that enables computers"}
48 | ```
49 |
50 | The words ` intelligence` and ` allows` are replaced by ` intelligent` and ` enables`, and the generation stops when generating ` computers`.
51 |
--------------------------------------------------------------------------------
/docs/guided_decoding.md:
--------------------------------------------------------------------------------
1 | # End-to-End Workflow for Guided Decoding with TensorRT-LLM Backend
2 |
3 | This document outlines the process for running guided decoding using the TensorRT-LLM backend. Guided decoding ensures that generated outputs adhere to specified formats, such as JSON. Currently, this feature is supported through the [XGrammar](https://github.com/mlc-ai/xgrammar) backend.
4 |
5 | For more information, refer to the [guided decoding documentation](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/advanced/executor.md#structured-output-with-guided-decoding) from TensorRT-LLM. Additionally, you can explore another [guided decoding + LLM API example](https://nvidia.github.io/TensorRT-LLM/llm-api-examples/llm_guided_decoding.html).
6 |
7 | ## Overview of Guided Decoding
8 | Guided decoding ensures that generated outputs conform to specific constraints or formats. Supported guide types include:
9 | - **None**: No constraints.
10 | - **JSON**: Outputs in JSON format.
11 | - **JSON Schema**: JSON format with schema validation.
12 | - **Regex**: Outputs matching a regular expression.
13 | - **EBNF Grammar**: Outputs adhering to extended Backus-Naur form (EBNF) grammar rules.
14 |
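As a concrete illustration, the `JSON Schema` guide type takes a standard JSON Schema document as its guide value. The schema used in the `json_schema` request example later in this document looks like this when pretty-printed:

```
{
  "properties": {
    "answer": {"title": "Answer", "type": "integer"}
  },
  "required": ["answer"],
  "title": "Answer",
  "type": "object"
}
```
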
15 | # Build TensorRT-LLM engine and launch Tritonserver
16 |
17 | From this point, we assume you have installed all requirements for tensorrtllm_backend. Refer to [build.md](build.md) for installation and Docker launch instructions.
18 |
19 | ## Build TensorRT-LLM engine
20 | ```bash
21 | # Clone model from Hugging Face
22 | export MODEL_NAME=TinyLlama-1.1B-Chat-v1.0
23 | git clone https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0 hf_models/${MODEL_NAME}
24 |
25 | export HF_MODEL_PATH=hf_models/${MODEL_NAME}
26 | export UNIFIED_CKPT_PATH=trt_ckpts/tiny_llama_1b/1-gpu/fp16
27 | export ENGINE_PATH=trt_engines/tiny_llama_1b/1-gpu/fp16
28 |
29 | python tensorrt_llm/examples/models/core/llama/convert_checkpoint.py --model_dir ${HF_MODEL_PATH} \
30 | --output_dir ${UNIFIED_CKPT_PATH} \
31 | --dtype float16
32 |
33 | trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \
34 | --remove_input_padding enable \
35 | --gpt_attention_plugin float16 \
36 | --context_fmha enable \
37 | --gemm_plugin float16 \
38 | --output_dir ${ENGINE_PATH} \
39 | --kv_cache_type paged \
40 | --max_batch_size 64
41 | ```
42 | ## Launch Tritonserver
43 |
44 | ### Python Backend
45 | ```bash
46 | export GUIDED_DECODING_BACKEND=xgrammar
47 | export TRITON_BACKEND=python
48 |
49 | cp tensorrt_llm/triton_backend/all_models/inflight_batcher_llm/ llama_ifb -r
50 |
51 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/preprocessing/config.pbtxt tokenizer_dir:${HF_MODEL_PATH},triton_max_batch_size:64,preprocessing_instance_count:1
52 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/postprocessing/config.pbtxt tokenizer_dir:${HF_MODEL_PATH},triton_max_batch_size:64,postprocessing_instance_count:1
53 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:False,bls_instance_count:1,accumulate_tokens:False,logits_datatype:TYPE_FP32
54 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/ensemble/config.pbtxt triton_max_batch_size:64,logits_datatype:TYPE_FP32
55 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/tensorrt_llm/config.pbtxt triton_backend:${TRITON_BACKEND},triton_max_batch_size:64,decoupled_mode:True,max_beam_width:1,engine_dir:${ENGINE_PATH},kv_cache_free_gpu_mem_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0,encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32,tokenizer_dir:${HF_MODEL_PATH},guided_decoding_backend:${GUIDED_DECODING_BACKEND}
56 |
57 | python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 1 --model_repo=llama_ifb/
58 | ```
59 |
60 | ### C++ Backend
61 | To run with `TRITON_BACKEND=tensorrtllm`, i.e. on the C++ backend, you need an extra step that extracts the tokenizer's information into JSON format. `generate_xgrammar_tokenizer_info.py` creates `xgrammar_tokenizer_info.json` under the given `--output_dir` argument, and we then fill the `xgrammar_tokenizer_info_path` parameter in `tensorrt_llm/config.pbtxt`.
62 | ```bash
63 | export XGRAMMAR_TOKENIZER_INFO_DIR=tokenizer_info/${MODEL_NAME}
64 |
65 | python3 tensorrt_llm/examples/generate_xgrammar_tokenizer_info.py --model_dir ${HF_MODEL_PATH} --output_dir ${XGRAMMAR_TOKENIZER_INFO_DIR}
66 |
67 | export XGRAMMAR_TOKENIZER_INFO_PATH=tokenizer_info/${MODEL_NAME}/xgrammar_tokenizer_info.json
68 | export GUIDED_DECODING_BACKEND=xgrammar
69 | export TRITON_BACKEND=tensorrtllm
70 |
71 | cp tensorrt_llm/triton_backend/all_models/inflight_batcher_llm/ llama_ifb -r
72 |
73 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/preprocessing/config.pbtxt tokenizer_dir:${HF_MODEL_PATH},triton_max_batch_size:64,preprocessing_instance_count:1
74 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/postprocessing/config.pbtxt tokenizer_dir:${HF_MODEL_PATH},triton_max_batch_size:64,postprocessing_instance_count:1
75 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:False,bls_instance_count:1,accumulate_tokens:False,logits_datatype:TYPE_FP32
76 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/ensemble/config.pbtxt triton_max_batch_size:64,logits_datatype:TYPE_FP32
77 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/tensorrt_llm/config.pbtxt triton_backend:${TRITON_BACKEND},triton_max_batch_size:64,decoupled_mode:True,max_beam_width:1,engine_dir:${ENGINE_PATH},kv_cache_free_gpu_mem_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0,encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32,guided_decoding_backend:${GUIDED_DECODING_BACKEND},xgrammar_tokenizer_info_path:${XGRAMMAR_TOKENIZER_INFO_PATH}
78 |
79 | python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 1 --model_repo=llama_ifb/
80 | ```
81 | # Sending Guided Decoding Requests
82 |
83 | Use the provided gRPC client to send requests with different guide types.
84 | ```bash
85 | # Set the prompt
86 | PROMPT="What is the year after 2024? Answer:"
87 |
88 | # 0. Guide type: None
89 | python3 tensorrt_llm/triton_backend/inflight_batcher_llm/client/end_to_end_grpc_client.py -p "${PROMPT}" -o 30 --exclude-input-in-output --verbose --model-name ensemble
90 |
91 | # Output:
92 | # 0: 2025
93 | #
94 | # Question 3: What is the year after 2025? Answer: 2026
95 | #
96 |
97 | # 1. Guide type: json
98 | python3 tensorrt_llm/triton_backend/inflight_batcher_llm/client/end_to_end_grpc_client.py -p "${PROMPT}" -o 30 --exclude-input-in-output --verbose --model-name ensemble --guided-decoding-guide-type json
99 |
100 | # Output:
101 | # 0: [2025]
102 |
103 | # 2. Guide type: json_schema
104 | python3 tensorrt_llm/triton_backend/inflight_batcher_llm/client/end_to_end_grpc_client.py -p "${PROMPT}" -o 30 --exclude-input-in-output --verbose --model-name ensemble --guided-decoding-guide-type json_schema --guided-decoding-guide '{"properties": {"answer": {"title": "Answer", "type": "integer"}}, "required": ["answer"], "title": "Answer", "type": "object"}'
105 |
106 | # Output:
107 | # 0: {"answer": 2026}
108 |
109 | # 3. Guide type: regex
110 | python3 tensorrt_llm/triton_backend/inflight_batcher_llm/client/end_to_end_grpc_client.py -p "${PROMPT}" -o 30 --exclude-input-in-output --verbose --model-name ensemble --guided-decoding-guide-type regex --guided-decoding-guide '\d+'
111 |
112 | # Output:
113 | # 0: 2025
114 |
115 | # 4. Guide type: ebnf_grammar
116 | python3 tensorrt_llm/triton_backend/inflight_batcher_llm/client/end_to_end_grpc_client.py -p "${PROMPT}" -o 30 --exclude-input-in-output --verbose --model-name ensemble --guided-decoding-guide-type ebnf_grammar --guided-decoding-guide 'root ::= [0-9]+'
117 |
118 | # Output:
119 | # 0: 2025
120 | ```
121 |
122 | Use curl method to send requests
123 | ```bash
124 | curl -X POST localhost:8000/v2/models/ensemble/generate -d '{"text_input": "What is the year after 2024? Answer:", "max_tokens": 20, "bad_words": "", "stop_words": "", "pad_id": 2, "end_id": 2, "guided_decoding_guide_type":"json"}'
125 |
126 | # Output:
127 | # {"model_name":"ensemble","model_version":"1","sequence_end":false,"sequence_id":0,"sequence_start":false,"text_output":"[2025]"}
128 | ```
129 |
--------------------------------------------------------------------------------
/docs/llama.md:
--------------------------------------------------------------------------------
1 | ## End to end workflow to run llama 7b
2 |
3 | 0. Make sure that you have initialized the TRT-LLM submodule:
4 |
5 | ```bash
6 | git lfs install
7 | git submodule update --init --recursive
8 | ```
9 |
10 | 1. (Optional) Download the LLaMa model from HuggingFace:
11 |
12 | ```bash
13 | huggingface-cli login
14 |
15 | huggingface-cli download meta-llama/Llama-2-7b-hf
16 | ```
17 |
18 | > **NOTE**
19 | >
20 | > Make sure that you have access to https://huggingface.co/meta-llama/Llama-2-7b-hf.
21 |
22 | 2. Start the Triton Server Docker container:
23 |
24 | ```bash
25 | # Replace <xx.yy> in the image tag with the version of Triton you want to use.
26 | # The command below assumes that the current directory is the
27 | # TRT-LLM backend root git repository.
28 |
29 | docker run --rm -ti -v `pwd`:/mnt -w /mnt -v ~/.cache/huggingface:~/.cache/huggingface --gpus all nvcr.io/nvidia/tritonserver:<xx.yy>-trtllm-python-py3 bash
30 | ```
31 |
32 | 3. Build the engine:
33 | ```bash
34 | # Replace 'HF_LLAMA_MODEL' with another path if you didn't download the model in step 1
35 | # or you're not using HuggingFace.
36 | export HF_LLAMA_MODEL=`python3 -c "from pathlib import Path; from huggingface_hub import hf_hub_download; print(Path(hf_hub_download('meta-llama/Llama-2-7b-hf', filename='config.json')).parent)"`
37 | export UNIFIED_CKPT_PATH=/tmp/ckpt/llama/7b/
38 | export ENGINE_PATH=/tmp/engines/llama/7b/
39 | python tensorrt_llm/examples/models/core/llama/convert_checkpoint.py --model_dir ${HF_LLAMA_MODEL} \
40 | --output_dir ${UNIFIED_CKPT_PATH} \
41 | --dtype float16
42 |
43 | trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \
44 | --remove_input_padding enable \
45 | --gpt_attention_plugin float16 \
46 | --context_fmha enable \
47 | --gemm_plugin float16 \
48 | --output_dir ${ENGINE_PATH} \
49 | --kv_cache_type paged \
50 | --max_batch_size 64
51 | ```
52 |
53 | * Prepare configs
54 |
55 | ```bash
56 | cp tensorrt_llm/triton_backend/ci/all_models/inflight_batcher_llm/ llama_ifb -r
57 |
58 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/preprocessing/config.pbtxt tokenizer_dir:${HF_LLAMA_MODEL},triton_max_batch_size:64,preprocessing_instance_count:1
59 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/postprocessing/config.pbtxt tokenizer_dir:${HF_LLAMA_MODEL},triton_max_batch_size:64,postprocessing_instance_count:1
60 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:False,bls_instance_count:1,accumulate_tokens:False,logits_datatype:TYPE_FP32
61 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/ensemble/config.pbtxt triton_max_batch_size:64,logits_datatype:TYPE_FP32
62 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:64,decoupled_mode:False,max_beam_width:1,engine_dir:${ENGINE_PATH},max_tokens_in_paged_kv_cache:2560,max_attention_window_size:2560,kv_cache_free_gpu_mem_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0,encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32
63 | ```
64 |
65 | * Launch server
66 |
67 | ```bash
68 | pip install SentencePiece
69 | python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 1 --model_repo=llama_ifb/
70 | ```
71 |
72 | This setting requires about 25 GB of GPU memory:
73 |
74 | ```bash
75 | nvidia-smi
76 |
77 | Wed Nov 29 08:51:30 2023
78 | +---------------------------------------------------------------------------------------+
79 | | NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.2 |
80 | |-----------------------------------------+----------------------+----------------------+
81 | | GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
82 | | Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
83 | | | | MIG M. |
84 | |=========================================+======================+======================|
85 | | 0 NVIDIA H100 PCIe On | 00000000:41:00.0 Off | 0 |
86 | | N/A 40C P0 79W / 350W | 25169MiB / 81559MiB | 0% Default |
87 | | | | Disabled |
88 | +-----------------------------------------+----------------------+----------------------+
89 |
90 | +---------------------------------------------------------------------------------------+
91 | | Processes: |
92 | | GPU GI CI PID Type Process name GPU Memory |
93 | | ID ID Usage |
94 | |=======================================================================================|
95 | +---------------------------------------------------------------------------------------+
96 | ```
97 |
98 | * Send request
99 |
100 | ```bash
101 | curl -X POST localhost:8000/v2/models/ensemble/generate -d '{"text_input": "What is machine learning?", "max_tokens": 20, "bad_words": "", "stop_words": "", "pad_id": 2, "end_id": 2}'
102 |
103 | {"cum_log_probs":0.0,"model_name":"ensemble","model_version":"1","output_log_probs":[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0],"sequence_end":false,"sequence_id":0,"sequence_start":false,"text_output":"\nMachine learning is a subset of artificial intelligence (AI) that uses algorithms to learn from data and"}
104 | ```
105 |
106 | * Send request with bad_words and stop_words
107 |
108 | ```bash
109 | curl -X POST localhost:8000/v2/models/ensemble/generate -d '{"text_input": "What is machine learning?", "max_tokens": 20, "bad_words": ["intelligence", "model"], "stop_words": ["focuses", "learn"], "pad_id": 2, "end_id": 2}'
110 |
111 | {"cum_log_probs":0.0,"model_name":"ensemble","model_version":"1","output_log_probs":[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0],"sequence_end":false,"sequence_id":0,"sequence_start":false,"text_output":"\nMachine learning is a subset of artificial Intelligence (AI) that allows computers to learn"}
112 | ```
113 |
114 | * Send request by `inflight_batcher_llm_client.py`
115 |
116 | ```bash
117 | python3 tensorrt_llm/triton_backend/tools/inflight_batcher_llm/inflight_batcher_llm_client.py --request-output-len 200 --tokenizer-dir ${HF_LLAMA_MODEL}
118 |
119 | =========
120 | [[1, 19298, 297, 6641, 29899, 23027, 3444, 29892, 1105, 7598, 16370, 408, 263]]
121 | Got completed request
122 | Input: Born in north-east France, Soyer trained as a
123 | Output beam 0: 850. He was the first chef to be hired by the newly opened Delmonico’s restaurant, where he worked for 10 years. He then opened his own restaurant, which was a huge success.
124 | Soyer was a prolific writer and his books include The Gastronomic Regenerator (1854), The Gastronomic Regenerator and Cookery for the People (1855), The Cuisine of To-day (1859), The Cuisine of To-morrow (1864), The Cuisine of the Future (1867), The Cuisine of the Future (1873), The Cuisine of the Future (1874), The Cuisine of the Future (1875), The Cuisine of the Future (1876), The
125 | output_ids = [14547, 297, 3681, 322, 4517, 1434, 8401, 304, 1570, 3088, 297, 29871, 29896, 29947, 29945, 29900, 29889, 940, 471, 278, 937, 14547, 304, 367, 298, 2859, 491, 278, 15141, 6496, 5556, 3712, 1417, 30010, 29879, 27144, 29892, 988, 540, 3796, 363, 29871, 29896, 29900, 2440, 29889, 940, 769, 6496, 670, 1914, 27144, 29892, 607, 471, 263, 12176, 2551, 29889, 13, 6295, 7598, 471, 263, 410, 29880, 928, 9227, 322, 670, 8277, 3160, 450, 402, 7614, 4917, 293, 2169, 759, 1061, 313, 29896, 29947, 29945, 29946, 511, 450, 402, 7614, 4917, 293, 2169, 759, 1061, 322, 17278, 708, 363, 278, 11647, 313, 29896, 29947, 29945, 29945, 511, 450, 315, 4664, 457, 310, 1763, 29899, 3250, 313, 29896, 29947, 29945, 29929, 511, 450, 315, 4664, 457, 310, 1763, 29899, 26122, 313, 29896, 29947, 29953, 29946, 511, 450, 315, 4664, 457, 310, 278, 16367, 313, 29896, 29947, 29953, 29955, 511, 450, 315, 4664, 457, 310, 278, 16367, 313, 29896, 29947, 29955, 29941, 511, 450, 315, 4664, 457, 310, 278, 16367, 313, 29896, 29947, 29955, 29946, 511, 450, 315, 4664, 457, 310, 278, 16367, 313, 29896, 29947, 29955, 29945, 511, 450, 315, 4664, 457, 310, 278, 16367, 313, 29896, 29947, 29955, 29953, 511, 450]
126 | ```
127 |
128 | * Run test on dataset
129 |
130 | ```
131 | python3 tensorrt_llm/triton_backend/tools/inflight_batcher_llm/end_to_end_test.py --dataset tensorrt_llm/triton_backend/ci/L0_backend_trtllm/simple_data.json --max-input-len 500
132 |
133 | [INFO] Start testing on 13 prompts.
134 | [INFO] Functionality test succeed.
135 | [INFO] Warm up for benchmarking.
136 | [INFO] Start benchmarking on 13 prompts.
137 | [INFO] Total Latency: 962.179 ms
138 | ```
139 |
140 |
141 |
142 | * Run with decoupled mode (streaming)
143 |
144 | ```bash
145 | cp tensorrt_llm/triton_backend/ci/all_models/inflight_batcher_llm/ llama_ifb -r
146 |
147 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/preprocessing/config.pbtxt tokenizer_dir:${HF_LLAMA_MODEL},triton_max_batch_size:64,preprocessing_instance_count:1
148 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/postprocessing/config.pbtxt tokenizer_dir:${HF_LLAMA_MODEL},triton_max_batch_size:64,postprocessing_instance_count:1
149 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:True,bls_instance_count:1,accumulate_tokens:True,logits_datatype:TYPE_FP32
150 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/ensemble/config.pbtxt triton_max_batch_size:64,logits_datatype:TYPE_FP32
151 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:64,decoupled_mode:True,max_beam_width:1,engine_dir:${ENGINE_PATH},max_tokens_in_paged_kv_cache:2560,max_attention_window_size:2560,kv_cache_free_gpu_mem_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_batching,max_queue_delay_microseconds:0,encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32
152 |
153 | pip install SentencePiece
154 | python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 1 --model_repo=llama_ifb/
155 |
156 | python3 tensorrt_llm/triton_backend/tools/inflight_batcher_llm/inflight_batcher_llm_client.py --request-output-len 200 --tokenizer-dir ${HF_LLAMA_MODEL} --streaming
157 | ```
158 |
159 |
160 | The result should look like this:
161 |
162 |
163 | ```bash
164 | =========
165 | Input sequence: [1, 19298, 297, 6641, 29899, 23027, 3444, 29892, 1105, 7598, 16370, 408, 263]
166 | [14547]
167 | [297]
168 | [3681]
169 | [322]
170 | [4517]
171 | [1434]
172 | [8401]
173 | [304]
174 | [1570]
175 | [3088]
176 | [297]
177 | [29871]
178 | [29896]
179 | [29947]
180 | [29945]
181 | [29900]
182 | [29889]
183 | [940]
184 | [471]
185 | [278]
186 | [937]
187 | [14547]
188 | [304]
189 | [367]
190 | [298]
191 | [2859]
192 | [491]
193 | [278]
194 | [15141]
195 | [6496]
196 | [5556]
197 | [3712]
198 | [1417]
199 | [30010]
200 | [29879]
201 | [27144]
202 | [29892]
203 | [988]
204 | [540]
205 | [3796]
206 | [363]
207 | [29871]
208 | [29896]
209 | [29900]
210 | [2440]
211 | [29889]
212 | [940]
213 | [769]
214 | [6496]
215 | [670]
216 | [1914]
217 | [27144]
218 | [29892]
219 | [607]
220 | [471]
221 | [263]
222 | [12176]
223 | [2551]
224 | [29889]
225 | [13]
226 | [6295]
227 | [7598]
228 | [471]
229 | [263]
230 | [410]
231 | [29880]
232 | [928]
233 | [9227]
234 | [322]
235 | [670]
236 | [8277]
237 | [3160]
238 | [450]
239 | [402]
240 | [7614]
241 | [4917]
242 | [293]
243 | [2169]
244 | [759]
245 | [1061]
246 | [313]
247 | [29896]
248 | [29947]
249 | [29945]
250 | [29946]
251 | [511]
252 | [450]
253 | [402]
254 | [7614]
255 | [4917]
256 | [293]
257 | [2169]
258 | [759]
259 | [1061]
260 | [322]
261 | [17278]
262 | [708]
263 | [363]
264 | [278]
265 | [11647]
266 | [313]
267 | [29896]
268 | [29947]
269 | [29945]
270 | [29945]
271 | [511]
272 | [450]
273 | [315]
274 | [4664]
275 | [457]
276 | [310]
277 | [1763]
278 | [29899]
279 | [3250]
280 | [313]
281 | [29896]
282 | [29947]
283 | [29945]
284 | [29929]
285 | [511]
286 | [450]
287 | [315]
288 | [4664]
289 | [457]
290 | [310]
291 | [1763]
292 | [29899]
293 | [26122]
294 | [313]
295 | [29896]
296 | [29947]
297 | [29953]
298 | [29946]
299 | [511]
300 | [450]
301 | [315]
302 | [4664]
303 | [457]
304 | [310]
305 | [278]
306 | [16367]
307 | [313]
308 | [29896]
309 | [29947]
310 | [29953]
311 | [29955]
312 | [511]
313 | [450]
314 | [315]
315 | [4664]
316 | [457]
317 | [310]
318 | [278]
319 | [16367]
320 | [313]
321 | [29896]
322 | [29947]
323 | [29955]
324 | [29941]
325 | [511]
326 | [450]
327 | [315]
328 | [4664]
329 | [457]
330 | [310]
331 | [278]
332 | [16367]
333 | [313]
334 | [29896]
335 | [29947]
336 | [29955]
337 | [29946]
338 | [511]
339 | [450]
340 | [315]
341 | [4664]
342 | [457]
343 | [310]
344 | [278]
345 | [16367]
346 | [313]
347 | [29896]
348 | [29947]
349 | [29955]
350 | [29945]
351 | [511]
352 | [450]
353 | [315]
354 | [4664]
355 | [457]
356 | [310]
357 | [278]
358 | [16367]
359 | [313]
360 | [29896]
361 | [29947]
362 | [29955]
363 | [29953]
364 | [511]
365 | [450]
366 | Input: Born in north-east France, Soyer trained as a
367 | Output beam 0: chef in Paris and London before moving to New York in 1850. He was the first chef to be hired by the newly opened Delmonico’s restaurant, where he worked for 10 years. He then opened his own restaurant, which was a huge success.
368 | Soyer was a prolific writer and his books include The Gastronomic Regenerator (1854), The Gastronomic Regenerator and Cookery for the People (1855), The Cuisine of To-day (1859), The Cuisine of To-morrow (1864), The Cuisine of the Future (1867), The Cuisine of the Future (1873), The Cuisine of the Future (1874), The Cuisine of the Future (1875), The Cuisine of the Future (1876), The
369 | Output sequence: [1, 19298, 297, 6641, 29899, 23027, 3444, 29892, 1105, 7598, 16370, 408, 263, 14547, 297, 3681, 322, 4517, 1434, 8401, 304, 1570, 3088, 297, 29871, 29896, 29947, 29945, 29900, 29889, 940, 471, 278, 937, 14547, 304, 367, 298, 2859, 491, 278, 15141, 6496, 5556, 3712, 1417, 30010, 29879, 27144, 29892, 988, 540, 3796, 363, 29871, 29896, 29900, 2440, 29889, 940, 769, 6496, 670, 1914, 27144, 29892, 607, 471, 263, 12176, 2551, 29889, 13, 6295, 7598, 471, 263, 410, 29880, 928, 9227, 322, 670, 8277, 3160, 450, 402, 7614, 4917, 293, 2169, 759, 1061, 313, 29896, 29947, 29945, 29946, 511, 450, 402, 7614, 4917, 293, 2169, 759, 1061, 322, 17278, 708, 363, 278, 11647, 313, 29896, 29947, 29945, 29945, 511, 450, 315, 4664, 457, 310, 1763, 29899, 3250, 313, 29896, 29947, 29945, 29929, 511, 450, 315, 4664, 457, 310, 1763, 29899, 26122, 313, 29896, 29947, 29953, 29946, 511, 450, 315, 4664, 457, 310, 278, 16367, 313, 29896, 29947, 29953, 29955, 511, 450, 315, 4664, 457, 310, 278, 16367, 313, 29896, 29947, 29955, 29941, 511, 450, 315, 4664, 457, 310, 278, 16367, 313, 29896, 29947, 29955, 29946, 511, 450, 315, 4664, 457, 310, 278, 16367, 313, 29896, 29947, 29955, 29945, 511, 450, 315, 4664, 457, 310, 278, 16367, 313, 29896, 29947, 29955, 29953, 511, 450]
370 | ```
371 |
372 |
373 |
374 |
375 | * Run several requests at the same time
376 |
377 | ```bash
378 | echo '{"text_input": "What is machine learning?", "max_tokens": 20, "bad_words": "", "stop_words": "", "pad_id": 2, "end_id": 2}' > tmp.txt
379 | printf '%s\n' {1..20} | xargs -I % -P 20 curl -X POST localhost:8000/v2/models/ensemble/generate -d @tmp.txt
380 | ```
381 |
--------------------------------------------------------------------------------
/docs/llama_multi_instance.md:
--------------------------------------------------------------------------------
1 |
28 |
29 | # Running Multiple Instances of the LLaMa Model
30 |
31 | This document describes how you can run multiple instances of
32 | LLaMa model on single and multiple GPUs running on the
33 | same machine. The guide focuses on the following scenarios:
34 |
35 | * [Running multiple instances of LLaMa model on a single GPU](#running-multiple-instances-of-llama-model-on-a-single-gpu).
36 | * [Running multiple instances of LLaMa model on multiple GPUs](#running-multiple-instances-of-llama-model-on-multiple-gpus):
37 |
38 | a. Using [Orchestrator mode](#orchestrator-mode).
39 |
40 | b. Using [Leader mode](#leader-mode).
41 |
42 | ## Running multiple instances of LLaMa model on a single GPU
43 |
44 | 1. Setup the model repository as described in [LLaMa Guide](./llama.md).
45 |
46 | 2. Increase the instance count in the `instance_group` section of the `tensorrt_llm`
47 | model's `config.pbtxt`, as shown in the sketch below.
48 |
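A minimal sketch of the relevant `config.pbtxt` section (this assumes the default `KIND_CPU` instance kind used by the backend's model templates; only `count` needs to change):

```
instance_group [
  {
    count: 2
    kind: KIND_CPU
  }
]
```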
49 | 3. Start the triton server:
50 |
51 | ```bash
52 | # Replace <gpu_id> with the GPU you want to use for this model.
53 | CUDA_VISIBLE_DEVICES=<gpu_id> tritonserver --model-repository `pwd`/llama_ifb &
54 | ```
55 |
56 | This would create multiple instances of the `tensorrt_llm` model, running on the
57 | same GPU.
58 |
59 | > **Note**
60 | >
61 | > Running multiple instances of a single model is generally not
62 | > recommended. If you choose to do this, you need to ensure the GPU has enough
63 | > resources for multiple copies of a single model. The performance implications
64 | > of running multiple models on the same GPU are unpredictable.
65 |
66 | > **Note**
67 | >
68 | > For production deployments, please make sure to adjust the
69 | > `max_tokens_in_paged_kv_cache` parameter; otherwise you may run out of GPU
70 | > memory, since by default TensorRT-LLM may use up to 90% of the available GPU memory for the KV cache of each
71 | > model instance. Additionally, if you rely on `kv_cache_free_gpu_mem_fraction`,
72 | > the memory allocated to each instance will depend on the order in which instances are loaded (see the sketch below).
73 |
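As a hedged sketch of such an adjustment (the values and repository path are illustrative, not tuned recommendations), you could cap the per-instance KV cache when filling the `tensorrt_llm` config template:

```bash
# Illustrative only: cap the KV cache per instance instead of relying on the
# default fraction of free GPU memory.
python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/tensorrt_llm/config.pbtxt \
    max_tokens_in_paged_kv_cache:2560,kv_cache_free_gpu_mem_fraction:0.4
```
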
74 | 4. Run the test client to measure performance:
75 |
76 | ```bash
77 | python3 tensorrt_llm/triton_backend/tools/inflight_batcher_llm/end_to_end_test.py --dataset tensorrt_llm/triton_backend/ci/L0_backend_trtllm/simple_data.json --max-input-len 500
78 | ```
79 |
80 | If you plan to use the BLS version instead of the ensemble model, you might also
81 | need to adjust the number of model instances for the `tensorrt_llm_bls` model.
82 | The default value only allows a single request for the whole pipeline, which
83 | might increase the latency and reduce the throughput (see the sketch below).
84 |
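For example, a hedged sketch of raising the BLS instance count when filling the template (the value 64 simply mirrors the `triton_max_batch_size` used elsewhere in this guide):

```bash
# Illustrative: allow up to 64 concurrent requests through the BLS pipeline.
python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/tensorrt_llm_bls/config.pbtxt \
    triton_max_batch_size:64,decoupled_mode:False,bls_instance_count:64,accumulate_tokens:False,logits_datatype:TYPE_FP32
```
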
85 | 5. Kill the server:
86 |
87 | ```bash
88 | pgrep tritonserver | xargs kill
89 | ```
90 |
91 | ## Running multiple instances of LLaMa model on multiple GPUs
92 |
93 | Unlike other Triton backend models, the TensorRT-LLM backend does not support
94 | using the `instance_group` setting to determine the placement of model instances
95 | on different GPUs. In this section, we demonstrate how you can use
96 | [Leader Mode](../README.md#leader-mode) and [Orchestrator Mode](../README.md#orchestrator-mode)
97 | for running multiple instances of a LLaMa model on different GPUs.
98 |
99 | For this section, let's assume that we have four GPUs and the CUDA device ids
100 | are 0, 1, 2, and 3. We will be launching two instances of the LLaMa2-7b model
101 | with tensor parallelism equal to 2. The first instance will run on GPUs 0 and 1
102 | and the second instance will run on GPUs 2 and 3.
103 |
104 | 1. Create the engines:
105 |
106 | ```bash
107 | # Update if the model is not available in huggingface cache
108 | export HF_LLAMA_MODEL=`python3 -c "from pathlib import Path; from huggingface_hub import hf_hub_download; print(Path(hf_hub_download('meta-llama/Llama-2-7b-hf', filename='config.json')).parent)"`
109 |
110 | export UNIFIED_CKPT_PATH=/tmp/ckpt/llama/7b-2tp-2gpu/
111 | export ENGINE_PATH=/tmp/engines/llama/7b-2tp-2gpu/
112 |
113 | # Create the checkpoint
114 | python tensorrt_llm/examples/models/core/llama/convert_checkpoint.py --model_dir ${HF_LLAMA_MODEL} \
115 | --output_dir ${UNIFIED_CKPT_PATH} \
116 | --dtype float16 \
117 | --tp_size 2
118 |
119 | # Build the engines
120 | trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \
121 | --remove_input_padding enable \
122 | --gpt_attention_plugin float16 \
123 | --context_fmha enable \
124 | --gemm_plugin float16 \
125 | --output_dir ${ENGINE_PATH} \
126 | --kv_cache_type paged \
127 | --max_batch_size 64
128 | ```
129 |
130 | 2. Set up the model repository:
131 |
132 | ```bash
133 | # Setup the model repository for the first instance.
134 | cp tensorrt_llm/triton_backend/ci/all_models/inflight_batcher_llm/ llama_ifb -r
135 |
136 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/preprocessing/config.pbtxt tokenizer_dir:${HF_LLAMA_MODEL},triton_max_batch_size:64,preprocessing_instance_count:1
137 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/postprocessing/config.pbtxt tokenizer_dir:${HF_LLAMA_MODEL},triton_max_batch_size:64,postprocessing_instance_count:1
138 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:False,bls_instance_count:1,accumulate_tokens:False,logits_datatype:TYPE_FP32
139 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/ensemble/config.pbtxt triton_max_batch_size:64,logits_datatype:TYPE_FP32
140 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i llama_ifb/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:64,decoupled_mode:False,max_beam_width:1,engine_dir:${ENGINE_PATH},max_tokens_in_paged_kv_cache:2560,max_attention_window_size:2560,kv_cache_free_gpu_mem_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0,encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32
141 | ```
142 |
143 | ### Leader Mode
144 |
145 | For leader mode, we will run two separate `mpirun` commands to launch two
146 | separate Triton servers, one per model instance, with each server spanning two GPUs (4 Triton Server processes in total, one per GPU).
147 | We also need to use a reverse proxy in front of them to load balance the requests
148 | between the servers.
149 |
150 | 3a. Launch the servers:
151 |
152 | ```bash
153 | CUDA_VISIBLE_DEVICES=0,1 python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 2 --model_repo=llama_ifb/ --http_port 8000 --grpc_port 8001 --metrics_port 8004
154 | CUDA_VISIBLE_DEVICES=2,3 python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 2 --model_repo=llama_ifb/ --http_port 8002 --grpc_port 8003 --metrics_port 8005
155 | ```
156 |
157 | 4a. Install NGINX:
158 |
159 | ```bash
160 | apt update
161 | apt install nginx -y
162 | ```
163 |
164 | 5a. Set up the NGINX configuration and store it in `/etc/nginx/sites-available/tritonserver`:
165 |
166 | ```conf
167 | upstream tritonserver {
168 | server localhost:8000;
169 | server localhost:8002;
170 | }
171 |
172 | server {
173 | listen 8080;
174 |
175 | location / {
176 | proxy_pass http://tritonserver;
177 | }
178 | }
179 | ```
180 |
181 | 6a. Create a symlink and restart NGINX to enable the configuration:
182 |
183 | ```bash
184 | ln -s /etc/nginx/sites-available/tritonserver /etc/nginx/sites-enabled/tritonserver
185 | service nginx restart
186 | ```
187 |
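Optionally, verify that the proxy reaches a ready server before running clients; `/v2/health/ready` is part of Triton's standard HTTP API, and the port matches the NGINX configuration above:

```bash
# Should print 200 once at least one of the backing Triton servers is ready.
curl -s -o /dev/null -w "%{http_code}\n" localhost:8080/v2/health/ready
```
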
188 | 7a. Run the test client to measure performance:
189 |
190 | ```bash
191 | pip3 install tritonclient[all]
192 |
193 | # Test the load on all the servers
194 | python3 tensorrt_llm/triton_backend/tools/inflight_batcher_llm/end_to_end_test.py --dataset tensorrt_llm/triton_backend/ci/L0_backend_trtllm/simple_data.json --max-input-len 500 -u localhost:8080
195 |
196 | # Test the load on one of the servers
197 | python3 tensorrt_llm/triton_backend/tools/inflight_batcher_llm/end_to_end_test.py --dataset tensorrt_llm/triton_backend/ci/L0_backend_trtllm/simple_data.json --max-input-len 500 -u localhost:8000
198 | ```
199 |
200 | 8a. Kill the server:
201 |
202 | ```bash
203 | pgrep mpirun | xargs kill
204 | ```
205 |
206 | ### Orchestrator Mode
207 |
208 | With orchestrator mode, there are two options for running multiple instances
209 | of a single model:
210 |
211 | 1. Creating separate Triton models
212 |
213 | 2. Starting from the 24.08 release, you can use the Triton `instance_group` field to specify the number of TRT-LLM model instances. With that option, the load-balancing decision is made in Triton core.
214 |
215 | #### 1. Creating Separate Triton Models
216 |
217 | 3b. Create a copy of the `tensorrt_llm` model:
218 |
219 | ```bash
220 | cp llama_ifb/tensorrt_llm llama_ifb/tensorrt_llm_2 -r
221 | ```
222 |
223 | 4b. Modify the `gpu_device_ids` field in the config file to specify which GPUs
224 | should be used by each model:
225 |
226 | ```bash
227 | sed -i 's/\${gpu_device_ids}/0,1/g' llama_ifb/tensorrt_llm/config.pbtxt
228 | sed -i 's/\${gpu_device_ids}/2,3/g' llama_ifb/tensorrt_llm_2/config.pbtxt
229 | sed -i 's/name: "tensorrt_llm"/name: "tensorrt_llm_2"/g' llama_ifb/tensorrt_llm_2/config.pbtxt
230 | ```
231 |
232 | > **Note**
233 | >
234 | > If you want to use the ensemble or BLS models, you must also create a
235 | > copy of the ensemble and BLS models and change the "tensorrt_llm"
236 | > model name to "tensorrt_llm_2" in their config files, as sketched below.
237 |
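A minimal sketch of that copy, following the same `cp`/`sed` pattern used above (the exact strings to substitute depend on the contents of your config files, so treat these commands as illustrative):

```bash
# Illustrative sketch: duplicate the ensemble and BLS models and point the
# copies at the second tensorrt_llm model.
cp llama_ifb/ensemble llama_ifb/ensemble_2 -r
cp llama_ifb/tensorrt_llm_bls llama_ifb/tensorrt_llm_bls_2 -r
sed -i 's/name: "ensemble"/name: "ensemble_2"/g' llama_ifb/ensemble_2/config.pbtxt
sed -i 's/"tensorrt_llm"/"tensorrt_llm_2"/g' llama_ifb/ensemble_2/config.pbtxt
sed -i 's/name: "tensorrt_llm_bls"/name: "tensorrt_llm_bls_2"/g' llama_ifb/tensorrt_llm_bls_2/config.pbtxt
sed -i 's/"tensorrt_llm"/"tensorrt_llm_2"/g' llama_ifb/tensorrt_llm_bls_2/config.pbtxt
```
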
238 | 5b. Launch the server:
239 |
240 | ```bash
241 | python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --multi-model --model_repo=llama_ifb/
242 | ```
243 |
244 | Alternatively, you can start all MPI ranks at once and avoid dynamic process spawning
245 | by using the `--disable-spawn-processes` flag. The config file must specify which ranks each
246 | model should use:
247 |
248 | ```bash
249 | sed -i 's/\${participant_ids}/1,2/g' llama_ifb/tensorrt_llm/config.pbtxt
250 | sed -i 's/\${participant_ids}/3,4/g' llama_ifb/tensorrt_llm_2/config.pbtxt
251 | ```
252 |
253 | Note that rank 0 is reserved for the orchestrator rank, so with two TP=2 instances the total world size is 1 + 2 + 2 = 5.
254 |
255 | ```bash
256 | python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --multi-model --model_repo=llama_ifb/ --disable-spawn-processes --world_size=5
257 | ```
258 |
259 | 6b. Run the test client to measure performance:
260 |
261 | ```bash
262 | pip3 install tritonclient[all]
263 |
264 | # We will only benchmark the core tensorrtllm models.
265 | python3 tensorrt_llm/triton_backend/tools/inflight_batcher_llm/benchmark_core_model.py --max-input-len 500 \
266 | --tensorrt-llm-model-name tensorrt_llm \
267 | --tensorrt-llm-model-name tensorrt_llm_2 \
268 | dataset --dataset tensorrt_llm/triton_backend/ci/L0_backend_trtllm/simple_data.json \
269 | --tokenizer-dir $HF_LLAMA_MODEL
270 | ```
271 |
272 | 7b. Kill the server:
273 |
274 | ```bash
275 | pgrep mpirun | xargs kill
276 | ```
277 |
278 | #### 2. Using Triton Core's Load Balancing
279 |
280 | In order to use Triton core's load balancing for multiple instances, you can
281 | increase the number of instances in the `instance_group` field and use the
282 | `gpu_device_ids` parameter to specify which GPUs will be used by each model
283 | instance.
284 |
285 | For example, if you're running a TP=2 model on a 4-GPU system and you want
286 | to run one instance on GPUs 0 and 1 and the other instance on GPUs 2 and 3,
287 | you can use the following model configuration:
288 |
289 | ```
290 | instance_group [
291 | {kind: KIND_CPU, count: 2}
292 | ]
293 |
294 | parameters: {
295 | key: "gpu_device_ids"
296 | value: {
297 | string_value: "0,1;2,3"
298 | }
299 | }
300 | ```
301 |
302 | Please note that the number of sets of GPU device IDs must equal the number of instances.
303 |
304 | ### Orchestrator Mode vs Leader Mode Summary
305 |
306 | The table below summarizes the differences between the orchestrator mode and
307 | leader mode:
308 |
309 | | | Orchestrator Mode (Separate Models) | Orchestrator Mode (Triton Load Balancing) |Leader Mode |
310 | | ----------------------------------| :----------------: | :----------------: |:----------:|
311 | | Requires Reverse Proxy | ❌ | ❌ | ✅ |
312 | | Requires Client Changes | ✅ | ❌ | ❌ |
313 |
314 | Orchestrator mode by default uses `MPI_Comm_spawn` to create the child
315 | processes. If `MPI_Comm_spawn` is used, it is not possible to distribute
316 | the model across multiple nodes.
317 |
318 | It is also possible to use orchestrator mode with MPI processes that have been
319 | pre-spawned. In order to do that, you need to set `--disable-spawn-processes`
320 | when using the [launch_triton_server.py](../scripts/launch_triton_server.py)
321 | script or `export TRTLLM_ORCHESTRATOR_SPAWN_PROCESSES=0`. In this mode,
322 | it is possible to run the server across different nodes in orchestrator mode.
323 |
324 | In order to use the orchestrator mode itself, you need to set the `--multi-model`
325 | flag when using the [launch_triton_server.py](../scripts/launch_triton_server.py)
326 | script or `export TRTLLM_ORCHESTRATOR=1`.
327 |
--------------------------------------------------------------------------------
/docs/llmapi.md:
--------------------------------------------------------------------------------
1 | ## End-to-end workflow to use the PyTorch LLMAPI
2 |
3 | * Start the Triton Server Docker container:
4 |
5 | ```bash
6 | # Replace <xx.yy> with the version of Triton you want to use.
7 | # The command below assumes the current directory is the
8 | # TRT-LLM backend root git repository.
9 |
10 | docker run --rm -ti -v `pwd`:/mnt -w /mnt -v ~/.cache/huggingface:~/.cache/huggingface --gpus all nvcr.io/nvidia/tritonserver:<xx.yy>-trtllm-python-py3 bash
11 | ```
12 |
13 | * Prepare config
14 |
15 | ```bash
16 | cp -R tensorrt_llm/triton_backend/all_models/llmapi/ llmapi_repo/
17 | ```
18 |
19 | Edit `llmapi_repo/tensorrt_llm/1/model.yaml` to change the model. You can either use a HuggingFace path or a local path. The following is based on `meta-llama/Llama-3.1-8B`.
20 |
21 | This configuration file also allows you to enable CUDA graphs support and set pipeline parallelism and tensor parallelism sizes.
22 |
23 | * Launch server
24 |
25 | ```bash
26 | python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --model_repo=llmapi_repo/
27 | ```
28 |
29 | * Send request
30 |
31 | ```bash
32 | curl -X POST localhost:8000/v2/models/tensorrt_llm/generate -d '{"text_input": "The future of AI is", "max_tokens":10}' | jq
33 | ```
34 |
35 | `inflight_batcher_llm_client.py` is not supported yet.
36 |
37 | * Run test on dataset
38 |
39 | ```bash
40 | python3 tensorrt_llm/triton_backend/tools/inflight_batcher_llm/end_to_end_test.py --dataset tensorrt_llm/triton_backend/ci/L0_backend_trtllm/simple_data.json --max-input-len 500 --test-llmapi --model-name tensorrt_llm
41 |
42 | [INFO] Start testing on 13 prompts.
43 | [INFO] Functionality test succeeded.
44 | [INFO] Warm up for benchmarking.
45 | FLAGS.model_name: tensorrt_llm
46 | [INFO] Start benchmarking on 13 prompts.
47 | [INFO] Total Latency: 377.254 ms
48 | ```
49 |
50 | * Run benchmark
51 |
52 | ```bash
53 | python3 tensorrt_llm/triton_backend/tools/inflight_batcher_llm/benchmark_core_model.py --max-input-len 500 \
54 | --tensorrt-llm-model-name tensorrt_llm \
55 | --test-llmapi \
56 | dataset --dataset ./tensorrt_llm/triton_backend/tools/dataset/mini_cnn_eval.json \
57 | --tokenizer-dir meta-llama/Llama-3.1-8B
58 |
59 | dataset
60 | Tokenizer: Tokens per word = 1.308
61 | [INFO] Warm up for benchmarking.
62 | [INFO] Start benchmarking on 39 prompts.
63 | [INFO] Total Latency: 1446.623 ms
64 | ```
65 |
--------------------------------------------------------------------------------
/docs/lora.md:
--------------------------------------------------------------------------------
1 | # Running LoRA inference with inflight batching
2 |
3 | Below is an example of how to run LoRA inference with inflight batching. See the
4 | [LoRA documentation](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/advanced/lora.md)
5 | in the TensorRT-LLM repository for more information about running gpt-2b with
6 | LoRA using inflight batching.
7 |
8 | ## Launch Triton TensorRT-LLM container
9 |
10 | ```bash
11 | docker run --rm -it --net host --shm-size=2g \
12 | --ulimit memlock=-1 --ulimit stack=67108864 --gpus all \
13 |     -v <path/to/tensorrtllm_backend>:/tensorrtllm_backend \
14 |     -v ${HOME}/.cache/huggingface:/root/.cache/huggingface \
15 |     -v <path/to/engines>:/engines \
16 |     nvcr.io/nvidia/tritonserver:<xx.yy>-trtllm-python-py3
17 | ```
18 |
19 | ## Prepare TensorRT-LLM engines with LoRA enabled
20 |
21 | (Optional) Download the LLaMa model from HuggingFace if you haven't already.
22 |
23 | ```bash
24 | huggingface-cli login
25 | huggingface-cli download meta-llama/Llama-2-7b-hf
26 | ```
27 |
28 | > **NOTE**
29 | >
30 | > Make sure that you have access to https://huggingface.co/meta-llama/Llama-2-7b-hf.
31 |
32 | ```bash
33 | cd /tensorrtllm_backend/tensorrt_llm/examples/models/core/llama
34 | BASE_LLAMA_MODEL=/path/to/llama-7b-hf
35 |
36 | python3 convert_checkpoint.py --model_dir ${BASE_LLAMA_MODEL} \
37 | --output_dir ./c-model/llama/fp16/1-gpu \
38 | --dtype float16
39 |
40 | trtllm-build --checkpoint_dir ./c-model/llama/fp16/1-gpu \
41 | --output_dir /engines/llama_7b_with_lora_qkv/fp16/1-gpu \
42 | --gemm_plugin float16 \
43 | --max_batch_size 8 \
44 | --max_seq_len 562 \
45 | --gpt_attention_plugin float16 \
46 | --kv_cache_type paged \
47 | --remove_input_padding enable \
48 | --use_paged_context_fmha enable \
49 | --lora_plugin float16 \
50 | --lora_target_modules attn_q attn_k attn_v \
51 | --max_lora_rank 8
52 | ```
53 |
54 | Note that if you use the prefetch feature described below, you still need to use `hf_lora_convert.py` to convert the LoRA weights and store them in `/tmp/lora_prefetch`, but you no longer need to pass `--lora-path` the first time you run inference.
55 |
56 | ## Generate LoRA tensors
57 |
58 | Now generate LoRA tensors that will be passed in with each request to triton.
59 |
60 | ```bash
61 | git-lfs clone https://huggingface.co/qychen/luotuo-lora-7b-0.1
62 | git-lfs clone https://huggingface.co/kunishou/Japanese-Alpaca-LoRA-7b-v0
63 |
64 | python3 ../hf_lora_convert.py -i luotuo-lora-7b-0.1 -o luotuo-lora-7b-0.1-weights --storage-type float16
65 | python3 ../hf_lora_convert.py -i Japanese-Alpaca-LoRA-7b-v0 -o Japanese-Alpaca-LoRA-7b-v0-weights --storage-type float16
66 | ```
67 |
68 | ## Create a Triton model repository and launch the Triton server
69 |
70 | Create a Triton model repository following the instructions
71 | [here](../README.md#prepare-the-model-repository), and modify the model
72 | configuration following the steps
73 | [here](../README.md#modify-the-model-configuration).
74 |
75 | ## LoRA Cache
76 |
77 | As LoRA weights are passed to the backend, they are cached in a host cache.
78 | As requests are scheduled, those weights will be prefetched into a GPU cache.
79 | After a LoRA is loaded into the cache, only `lora_task_id` is needed for inference.
80 |
81 | ### lora_cache_optimal_adapter_size
82 |
83 | Optimal adapter size used to size cache pages. Typically, optimally sized
84 | adapters will fit exactly into 1 cache page. (default: 8)
85 |
86 | ```
87 | parameters: {
88 | key: "lora_cache_optimal_adapter_size"
89 | value: {
90 | string_value: "${lora_cache_optimal_adapter_size}"
91 | }
92 | }
93 | ```
94 |
95 | ### lora_cache_max_adapter_size
96 |
97 | Used to set the minimum size of a cache page. Pages must be at least large enough to fit the weights of a single module for a single layer at the maximum adapter size (`maxAdapterSize`). (default: 64)
98 |
99 | ```
100 | parameters: {
101 | key: "lora_cache_max_adapter_size"
102 | value: {
103 | string_value: "${lora_cache_max_adapter_size}"
104 | }
105 | }
106 | ```
107 |
108 | ### lora_cache_gpu_memory_fraction
109 |
110 | Fraction of GPU memory used for the LoRA cache. Computed as a fraction of the memory left over after the engine is loaded and after the KV cache is allocated. (default: 0.05)
111 |
112 | ```
113 | parameters: {
114 | key: "lora_cache_gpu_memory_fraction"
115 | value: {
116 | string_value: "${lora_cache_gpu_memory_fraction}"
117 | }
118 | }
119 | ```
120 |
121 | ### lora_cache_host_memory_bytes
122 |
123 | Size of host LoRA cache in bytes (default: 1G)
124 |
125 | ```
126 | parameters: {
127 | key: "lora_cache_host_memory_bytes"
128 | value: {
129 | string_value: "${lora_cache_host_memory_bytes}"
130 | }
131 | }
132 | ```
133 |
134 | ### Prefetch the LoRA cache when initializing the model instance
135 |
136 | If you want to load LoRA models while initializing the model instance,
137 | instead of passing the LoRA weights as request inputs, you can store the LoRA weights in a folder
138 | and pass that folder path through the `lora_prefetch_dir` parameter when initializing the model instance.
139 | The model instance will then try to load the LoRA weights from that folder.
140 | Inside the folder, you can create one sub-folder per LoRA task.
141 | For example, assume we want to store LoRA weights in `/tmp/lora_prefetch` and
142 | there are three LoRA tasks `0`, `1` and `3`; the folder layout would then look like
143 |
144 | ```bash
145 | /tmp/lora_prefetch
146 | ├── 0
147 | │ ├── model.lora_config.npy
148 | │ └── model.lora_weights.npy
149 | ├── 1
150 | │ ├── model.lora_config.npy
151 | │ └── model.lora_weights.npy
152 | └── 3
153 | ├── model.lora_config.npy
154 | └── model.lora_weights.npy
155 | ```
156 |
157 | Note that you must name the sub-folders with digits, because the LoRA cache manager treats these names as LoRA task IDs.
158 |
159 | ```pbtxt
160 | parameters: {
161 | key: "lora_prefetch_dir"
162 | value: {
163 | string_value: "${lora_prefetch_dir}"
164 | }
165 | }
166 | ```
167 |
168 | ## Launch tritonserver
169 |
170 | ```bash
171 | MODEL_FOLDER=/path/to/triton_model_repo
172 | # 'world_size' is the number of GPUs you want to use for serving. This should
173 | # be aligned with the number of GPUs used to build the TensorRT-LLM engine.
174 | python3 /tensorrtllm_backend/tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size=1 --model_repo=${MODEL_FOLDER}
175 | ```
176 |
177 | Run the multi-LoRA example by issuing multiple concurrent requests.
178 | The inflight batcher will execute mixed batches with multiple LoRAs in the same batch.
179 |
180 | First we cache the LoRAs by sending a dummy request for each adapter. The `TASK_IDS` values are unique to each adapter.
181 |
182 | ```bash
183 | pip3 install tritonclient[all]
184 |
185 | TASK_IDS=("1" "2")
186 | LORA_PATHS=("luotuo-lora-7b-0.1-weights" "Japanese-Alpaca-LoRA-7b-v0-weights")
187 | INFLIGHT_BATCHER_LLM_CLIENT=/tensorrtllm_backend/tensorrt_llm/triton_backend/tools/inflight_batcher_llm/inflight_batcher_llm_client.py
188 |
189 | for index in ${!TASK_IDS[@]}; do
190 | text="dummy"
191 | lora_path=${LORA_PATHS[$index]}
192 | task_id=${TASK_IDS[$index]}
193 | lora_arg="--lora-path ${lora_path} --lora-task-id ${task_id}"
194 |
195 | python3 ${INFLIGHT_BATCHER_LLM_CLIENT} \
196 | --top-k 0 \
197 | --top-p 0.5 \
198 | --request-output-len 10 \
199 | --text "${text}" \
200 | --tokenizer-dir /path/to/llama-7b-hf \
201 | ${lora_arg} &
202 | done
203 | ```
204 |
205 | Now perform inference with just `--lora-task-id`
206 |
207 | ```bash
208 | INPUT_TEXT=("美国的首都在哪里? \n答案:" "美国的首都在哪里? \n答案:" "美国的首都在哪里? \n答案:" "アメリカ合衆国の首都はどこですか? \n答え:" "アメリカ合衆国の首都はどこですか? \n答え:" "アメリカ合衆国の首都はどこですか? \n答え:")
209 | TASK_IDS=("" "1" "2" "" "1" "2")
210 |
211 | for index in ${!INPUT_TEXT[@]}; do
212 | text=${INPUT_TEXT[$index]}
213 | task_id=${TASK_IDS[$index]}
214 | lora_arg=""
215 | if [ "${task_id}" != "" ]; then
216 | lora_arg="--lora-task-id ${task_id}"
217 | fi
218 |
219 |     python3 ${INFLIGHT_BATCHER_LLM_CLIENT} \
220 | --top-k 0 \
221 | --top-p 0.5 \
222 | --request-output-len 10 \
223 | --text "${text}" \
224 |         --tokenizer-dir /path/to/llama-7b-hf \
225 | ${lora_arg} &
226 | done
227 |
228 | wait
229 | ```
230 |
231 | Example Output:
232 |
233 | ```
234 | Input sequence: [1, 29871, 30310, 30604, 30303, 30439, 30733, 235, 164, 137, 30356, 30199, 31688, 30769, 30449, 31250, 30589, 30499, 30427, 30412, 29973, 320, 29876, 234, 176, 151, 30914, 29901]
235 | Input sequence: [1, 29871, 30630, 30356, 30210, 31688, 30769, 30505, 232, 150, 173, 30755, 29973, 320, 29876, 234, 176, 151, 233, 164, 139, 29901]
236 | Input sequence: [1, 29871, 30630, 30356, 30210, 31688, 30769, 30505, 232, 150, 173, 30755, 29973, 320, 29876, 234, 176, 151, 233, 164, 139, 29901]
237 | Input sequence: [1, 29871, 30310, 30604, 30303, 30439, 30733, 235, 164, 137, 30356, 30199, 31688, 30769, 30449, 31250, 30589, 30499, 30427, 30412, 29973, 320, 29876, 234, 176, 151, 30914, 29901]
238 | Input sequence: [1, 29871, 30310, 30604, 30303, 30439, 30733, 235, 164, 137, 30356, 30199, 31688, 30769, 30449, 31250, 30589, 30499, 30427, 30412, 29973, 320, 29876, 234, 176, 151, 30914, 29901]
239 | Input sequence: [1, 29871, 30630, 30356, 30210, 31688, 30769, 30505, 232, 150, 173, 30755, 29973, 320, 29876, 234, 176, 151, 233, 164, 139, 29901]
240 | Got completed request
241 | Input: アメリカ合衆国の首都はどこですか? \n答え:
242 | Output beam 0: ワシントン D.C.
243 | Output sequence: [1, 29871, 30310, 30604, 30303, 30439, 30733, 235, 164, 137, 30356, 30199, 31688, 30769, 30449, 31250, 30589, 30499, 30427, 30412, 29973, 320, 29876, 234, 176, 151, 30914, 29901, 29871, 31028, 30373, 30203, 30279, 30203, 360, 29889, 29907, 29889]
244 | Got completed request
245 | Input: 美国的首都在哪里? \n答案:
246 | Output beam 0: Washington, D.C.
247 | What is the
248 | Output sequence: [1, 29871, 30630, 30356, 30210, 31688, 30769, 30505, 232, 150, 173, 30755, 29973, 320, 29876, 234, 176, 151, 233, 164, 139, 29901, 7660, 29892, 360, 29889, 29907, 29889, 13, 5618, 338, 278]
249 | Got completed request
250 | Input: 美国的首都在哪里? \n答案:
251 | Output beam 0: Washington D.C.
252 | Washington D.
253 | Output sequence: [1, 29871, 30630, 30356, 30210, 31688, 30769, 30505, 232, 150, 173, 30755, 29973, 320, 29876, 234, 176, 151, 233, 164, 139, 29901, 7660, 360, 29889, 29907, 29889, 13, 29956, 7321, 360, 29889]
254 | Got completed request
255 | Input: アメリカ合衆国の首都はどこですか? \n答え:
256 | Output beam 0: Washington, D.C.
257 | Which of
258 | Output sequence: [1, 29871, 30310, 30604, 30303, 30439, 30733, 235, 164, 137, 30356, 30199, 31688, 30769, 30449, 31250, 30589, 30499, 30427, 30412, 29973, 320, 29876, 234, 176, 151, 30914, 29901, 7660, 29892, 360, 29889, 29907, 29889, 13, 8809, 436, 310]
259 | Got completed request
260 | Input: アメリカ合衆国の首都はどこですか? \n答え:
261 | Output beam 0: Washington D.C.
262 | 1. ア
263 | Output sequence: [1, 29871, 30310, 30604, 30303, 30439, 30733, 235, 164, 137, 30356, 30199, 31688, 30769, 30449, 31250, 30589, 30499, 30427, 30412, 29973, 320, 29876, 234, 176, 151, 30914, 29901, 7660, 360, 29889, 29907, 29889, 13, 29896, 29889, 29871, 30310]
264 | Got completed request
265 | Input: 美国的首都在哪里? \n答案:
266 | Output beam 0: 华盛顿
267 | W
268 | Output sequence: [1, 29871, 30630, 30356, 30210, 31688, 30769, 30505, 232, 150, 173, 30755, 29973, 320, 29876, 234, 176, 151, 233, 164, 139, 29901, 29871, 31266, 234, 158, 158, 236, 164, 194, 13, 29956]
269 | ```
270 |
--------------------------------------------------------------------------------
/docs/model_config.md:
--------------------------------------------------------------------------------
1 | # Model Configuration
2 |
3 | ## Model Parameters
4 |
5 | The following tables show the parameters in the `config.pbtxt` of the models in
6 | [all_models/inflight_batcher_llm](../tensorrt_llm/triton_backend/all_models/inflight_batcher_llm)
7 | that can be modified before deployment. For optimal performance or custom
8 | parameters, please refer to
9 | [perf_best_practices](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/performance/perf-best-practices.md).
10 |
11 | The names of the parameters listed below are the values in the `config.pbtxt`
12 | that can be modified using the
13 | [`fill_template.py`](../tensorrt_llm/triton_backend/tools/fill_template.py) script.
14 |
15 | **NOTE** For fields that contain a comma in their value (e.g. `gpu_device_ids`,
16 | `participant_ids`), you need to escape the comma with
17 | a backslash. For example, if you want to set `gpu_device_ids` to `0,1`, you need
18 | to run `python3 fill_template.py -i config.pbtxt "gpu_device_ids:0\,1"`.
19 |
20 | The mandatory parameters must be set for the model to run. The optional
21 | parameters are not required but can be set to customize the model.
22 |
23 | ### ensemble model
24 |
25 | See
26 | [here](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/architecture.md#ensemble-models)
27 | to learn more about ensemble models.
28 |
29 | *Mandatory parameters*
30 |
31 | | Name | Description |
32 | | :----------------------: | :-----------------------------: |
33 | | `triton_max_batch_size` | The maximum batch size that the Triton model instance will run with. Note that for the `tensorrt_llm` model, the actual runtime batch size can be larger than `triton_max_batch_size`. The runtime batch size will be determined by the TRT-LLM scheduler based on a number of parameters such as the number of available requests in the queue, and the engine build `trtllm-build` parameters (such as `max_num_tokens` and `max_batch_size`). |
34 | | `logits_datatype` | The data type for context and generation logits. |
35 |
36 | ### preprocessing model
37 |
38 | *Mandatory parameters*
39 |
40 | | Name | Description |
41 | | :----------------------: | :-----------------------------: |
42 | | `triton_max_batch_size` | The maximum batch size that Triton should use with the model. |
43 | | `tokenizer_dir` | The path to the tokenizer for the model. |
44 | | `preprocessing_instance_count` | The number of instances of the model to run. |
45 | | `max_queue_delay_microseconds` | The maximum queue delay in microseconds. Setting this parameter to a value greater than 0 can improve the chances that two requests arriving within `max_queue_delay_microseconds` will be scheduled in the same TRT-LLM iteration. |
46 | | `max_queue_size` | The maximum number of requests allowed in the TRT-LLM queue before rejecting new requests. |
47 |
48 | *Optional parameters*
49 |
50 | | Name | Description |
51 | | :----------------------: | :-----------------------------: |
52 | | `add_special_tokens` | The `add_special_tokens` flag used by [HF tokenizers](https://huggingface.co/transformers/v2.11.0/main_classes/tokenizer.html#transformers.PreTrainedTokenizer.add_special_tokens). |
53 | | `multimodal_model_path` | The vision engine path used in multimodal workflow. |
54 | | `engine_dir` | The path to the engine for the model. This parameter is only needed for *multimodal processing* to extract the `vocab_size` from the engine_dir's config.json for `fake_prompt_id` mappings. |
55 |
56 |
57 | ### multimodal_encoders model
58 |
59 | *Mandatory parameters*
60 |
61 | | Name | Description |
62 | | :----------------------: | :-----------------------------: |
63 | | `triton_max_batch_size` | The maximum batch size that Triton should use with the model. |
64 | | `max_queue_delay_microseconds` | The maximum queue delay in microseconds. Setting this parameter to a value greater than 0 can improve the chances that two requests arriving within `max_queue_delay_microseconds` will be scheduled in the same TRT-LLM iteration. |
65 | | `max_queue_size` | The maximum number of requests allowed in the TRT-LLM queue before rejecting new requests. |
66 | | `multimodal_model_path` | The vision engine path used in multimodal workflow. |
67 | | `hf_model_path` | The Huggingface model path used for `llava_onevision` and `mllama` models. |
68 |
69 |
70 | ### postprocessing model
71 |
72 | *Mandatory parameters*
73 |
74 | | Name | Description |
75 | | :----------------------: | :-----------------------------: |
76 | | `triton_max_batch_size` | The maximum batch size that Triton should use with the model. |
77 | | `tokenizer_dir` | The path to the tokenizer for the model. |
78 | | `postprocessing_instance_count` | The number of instances of the model to run. |
79 |
80 | *Optional parameters*
81 |
82 | | Name | Description |
83 | | :----------------------: | :-----------------------------: |
84 | | `skip_special_tokens` | The `skip_special_tokens` flag used by [HF detokenizers](https://huggingface.co/transformers/v2.11.0/main_classes/tokenizer.html#transformers.PreTrainedTokenizer.decode). |
85 |
86 | ### tensorrt_llm model
87 |
88 | The majority of the `tensorrt_llm` model parameters and input/output tensors
89 | can be mapped to parameters in the TRT-LLM C++ runtime API defined in
90 | [`executor.h`](https://github.com/NVIDIA/TensorRT-LLM/blob/main/cpp/include/tensorrt_llm/executor/executor.h).
91 | Please refer to the Doxygen comments in `executor.h` for a more detailed
92 | description of the parameters below.
93 |
94 | *Mandatory parameters*
95 |
96 | | Name | Description |
97 | | :----------------------: | :-----------------------------: |
98 | | `triton_backend` | The backend to use for the model. Set to `tensorrtllm` to utilize the C++ TRT-LLM backend implementation. Set to `python` to utilize the TRT-LLM Python runtime. |
99 | | `triton_max_batch_size` | The maximum batch size that the Triton model instance will run with. Note that for the `tensorrt_llm` model, the actual runtime batch size can be larger than `triton_max_batch_size`. The runtime batch size will be determined by the TRT-LLM scheduler based on a number of parameters such as the number of available requests in the queue, and the engine build `trtllm-build` parameters (such as `max_num_tokens` and `max_batch_size`). |
100 | | `decoupled_mode` | Whether to use decoupled mode. Must be set to `true` for requests setting the `stream` tensor to `true`. |
101 | | `max_queue_delay_microseconds` | The maximum queue delay in microseconds. Setting this parameter to a value greater than 0 can improve the chances that two requests arriving within `max_queue_delay_microseconds` will be scheduled in the same TRT-LLM iteration. |
102 | | `max_queue_size` | The maximum number of requests allowed in the TRT-LLM queue before rejecting new requests. |
103 | | `engine_dir` | The path to the engine for the model. |
104 | | `batching_strategy` | The batching strategy to use. Set to `inflight_fused_batching` when enabling in-flight batching support. To disable in-flight batching, set to `V1` |
105 | | `encoder_input_features_data_type` | The dtype for the input tensor `encoder_input_features`. For the mllama model, this must be `TYPE_BF16`. For other models like whisper, this is `TYPE_FP16`. |
106 | | `logits_datatype` | The data type for context and generation logits. |
107 |
108 | *Optional parameters*
109 |
110 | - General
111 |
112 | | Name | Description |
113 | | :----------------------: | :-----------------------------: |
114 | | `encoder_engine_dir` | When running encoder-decoder models, this is the path to the folder that contains the model configuration and engine for the encoder model. |
115 | | `max_attention_window_size` | When using techniques like sliding window attention, the maximum number of tokens that are attended to in order to generate one token. By default, all tokens in the sequence are attended to. (default=max_sequence_length) |
116 | | `sink_token_length` | Number of sink tokens to always keep in attention window. |
117 | | `exclude_input_in_output` | Set to `true` to only return completion tokens in a response. Set to `false` to return the prompt tokens concatenated with the generated tokens. (default=`false`) |
118 | | `cancellation_check_period_ms` | The time for the cancellation check thread to sleep before doing the next check. It checks whether any of the currently active requests have been cancelled through Triton and prevents further execution of them. (default=100) |
119 | | `stats_check_period_ms` | The time for the statistics reporting thread to sleep before doing the next check. (default=100) |
120 | | `recv_poll_period_ms` | The time for the receiving thread in orchestrator mode to sleep before doing the next check. (default=0) |
121 | | `iter_stats_max_iterations` | The maximum number of iterations for which to keep statistics. (default=ExecutorConfig::kDefaultIterStatsMaxIterations) |
122 | | `request_stats_max_iterations` | The maximum number of iterations for which to keep per-request statistics. (default=executor::kDefaultRequestStatsMaxIterations) |
123 | | `normalize_log_probs` | Controls if log probabilities should be normalized or not. Set to `false` to skip normalization of `output_log_probs`. (default=`true`) |
124 | | `gpu_device_ids` | Comma-separated list of GPU IDs to use for this model. Use semicolons to separate multiple instances of the model. If not provided, the model will use all visible GPUs. (default=unspecified) |
125 | | `participant_ids` | Comma-separated list of MPI ranks to use for this model. Mandatory when using orchestrator mode with `--disable-spawn-processes`. (default=unspecified) |
126 | | `num_nodes` | Number of MPI nodes to use for this model. (default=1) |
127 | | `gpu_weights_percent` | Set to a number between 0.0 and 1.0 to specify the percentage of weights that reside on GPU instead of CPU and streaming load during runtime. Values less than 1.0 are only supported for an engine built with `weight_streaming` on. (default=1.0) |
128 |
129 | - KV cache
130 |
131 | Note that the parameter `enable_trt_overlap` has been removed from the
132 | config.pbtxt. This option allowed to overlap execution of two micro-batches to
133 | hide CPU overhead. Optimization work has been done to reduce the CPU overhead
134 | and it was found that the overlapping of micro-batches did not provide
135 | additional benefits.
136 |
137 | | Name | Description |
138 | | :----------------------: | :-----------------------------: |
139 | | `max_tokens_in_paged_kv_cache` | The maximum size of the KV cache in number of tokens. If unspecified, value is interpreted as 'infinite'. KV cache allocation is the min of max_tokens_in_paged_kv_cache and value derived from kv_cache_free_gpu_mem_fraction below. (default=unspecified) |
140 | | `kv_cache_free_gpu_mem_fraction` | Set to a number between 0 and 1 to indicate the maximum fraction of GPU memory (after loading the model) that may be used for KV cache. (default=0.9) |
141 | | `cross_kv_cache_fraction` | Set to a number between 0 and 1 to indicate the maximum fraction of KV cache that may be used for cross attention, and the rest will be used for self attention. Optional param and should be set for encoder-decoder models ONLY. (default=0.5) |
142 | | `kv_cache_host_memory_bytes` | Enable offloading to host memory for the given byte size. |
143 | | `enable_kv_cache_reuse` | Set to `true` to reuse previously computed KV cache values (e.g. for system prompt) |
144 |
145 | - LoRA cache
146 |
147 | | Name | Description |
148 | | :----------------------: | :-----------------------------: |
149 | | `lora_cache_optimal_adapter_size` | Optimal adapter size used to size cache pages. Typically, optimally sized adapters will fit exactly into 1 cache page. (default=8) |
150 | | `lora_cache_max_adapter_size` | Used to set the minimum size of a cache page. Pages must be at least large enough to fit the weights of a single module for a single layer at the maximum adapter size (`maxAdapterSize`). (default=64) |
151 | | `lora_cache_gpu_memory_fraction` | Fraction of GPU memory used for the LoRA cache. Computed as a fraction of the memory left over after the engine is loaded and after the KV cache is allocated. (default=0.05) |
152 | | `lora_cache_host_memory_bytes` | Size of host LoRA cache in bytes. (default=1G) |
153 | | `lora_prefetch_dir` | Folder to store the LoRA weights we hope to load during engine initialization. |
154 |
155 | - Decoding mode
156 |
157 | | Name | Description |
158 | | :----------------------: | :-----------------------------: |
159 | | `max_beam_width` | The beam width value of requests that will be sent to the executor. (default=1) |
160 | | `decoding_mode` | Set to one of the following: `{top_k, top_p, top_k_top_p, beam_search, medusa, redrafter, lookahead, eagle}` to select the decoding mode. The `top_k` mode uses only the Top-K algorithm for sampling, and the `top_p` mode uses only the Top-P algorithm. The `top_k_top_p` mode employs both Top-K and Top-P algorithms, depending on the runtime sampling params of the request. Note that the `top_k_top_p` option requires more memory and has a longer runtime than using `top_k` or `top_p` individually; therefore, it should be used only when necessary. `beam_search` uses the beam search algorithm. If not specified, the default is `top_k_top_p` if `max_beam_width == 1`; otherwise, `beam_search` is used. When a Medusa model is used, the `medusa` decoding mode should be set. However, TensorRT-LLM detects a loaded Medusa model and overrides the decoding mode to `medusa` with a warning. The same applies to ReDrafter, Lookahead and Eagle. |
161 |
162 | - Optimization
163 |
164 | | Name | Description |
165 | | :----------------------: | :-----------------------------: |
166 | | `enable_chunked_context` | Set to `true` to enable context chunking. (default=`false`) |
167 | | `multi_block_mode` | Set to `false` to disable multi block mode. (default=`true`) |
168 | | `enable_context_fmha_fp32_acc` | Set to `true` to enable FMHA runner FP32 accumulation. (default=`false`) |
169 | | `cuda_graph_mode` | Set to `true` to enable cuda graph. (default=`false`) |
170 | | `cuda_graph_cache_size` | Sets the size of the CUDA graph cache, in numbers of CUDA graphs. (default=0) |
171 |
172 | - Scheduling
173 |
174 | | Name | Description |
175 | | :----------------------: | :-----------------------------: |
176 | | `batch_scheduler_policy` | Set to `max_utilization` to greedily pack as many requests as possible in each current in-flight batching iteration. This maximizes the throughput but may result in overheads due to request pause/resume if KV cache limits are reached during execution. Set to `guaranteed_no_evict` to guarantee that a started request is never paused. (default=`guaranteed_no_evict`) |
177 |
178 | - Medusa
179 |
180 | | Name | Description |
181 | | :----------------------: | :-----------------------------: |
182 | | `medusa_choices` | To specify Medusa choices tree in the format of e.g. "{0, 0, 0}, {0, 1}". By default, `mc_sim_7b_63` choices are used. |
183 |
184 | - Eagle
185 |
186 | | Name | Description |
187 | | :----------------------: | :-----------------------------: |
188 | | `eagle_choices` | To specify default per-server Eagle choices tree in the format of e.g. "{0, 0, 0}, {0, 1}". By default, `mc_sim_7b_63` choices are used. |
189 |
190 | - Guided decoding
191 |
192 | | Name | Description |
193 | | :----------------------: | :-----------------------------: |
194 | | `guided_decoding_backend` | Set to `xgrammar` to activate guided decoder. |
195 | | `tokenizer_dir` | Guided decoding with the tensorrt_llm Python backend requires the tokenizer's information. |
196 | | `xgrammar_tokenizer_info_path` | Guided decoding with the tensorrt_llm C++ backend requires xgrammar's tokenizer info in JSON format. |
197 |
198 | ### tensorrt_llm_bls model
199 |
200 | See
201 | [here](https://github.com/triton-inference-server/python_backend#business-logic-scripting)
202 | to learn more about BLS models.
203 |
204 | *Mandatory parameters*
205 |
206 | | Name | Description |
207 | | :----------------------: | :-----------------------------: |
208 | | `triton_max_batch_size` | The maximum batch size that the model can handle. |
209 | | `decoupled_mode` | Whether to use decoupled mode. |
210 | | `bls_instance_count` | The number of instances of the model to run. When using the BLS model instead of the ensemble, you should set the number of model instances to the maximum batch size supported by the TRT engine to allow concurrent request execution. |
211 | | `logits_datatype` | The data type for context and generation logits. |
212 |
213 | *Optional parameters*
214 |
215 | - General
216 |
217 | | Name | Description |
218 | | :----------------------: | :-----------------------------: |
219 | | `accumulate_tokens` | Used in the streaming mode to call the postprocessing model with all accumulated tokens, instead of only one token. This might be necessary for certain tokenizers. |
220 |
221 | - Speculative decoding
222 |
223 | The BLS model supports speculative decoding. The target and draft Triton models are set with the parameters `tensorrt_llm_model_name` and `tensorrt_llm_draft_model_name`. Speculative decoding is performed by setting `num_draft_tokens` in the request. `use_draft_logits` may be set to use logits-comparison speculative decoding. Note that `return_generation_logits` and `return_context_logits` are not supported when using speculative decoding. Also note that requests with a batch size greater than 1 are currently not supported with speculative decoding. A hedged request sketch follows the table below.
224 |
225 | | Name | Description |
226 | | :----------------------: | :-----------------------------: |
227 | | `tensorrt_llm_model_name` | The name of the TensorRT-LLM model to use. |
228 | | `tensorrt_llm_draft_model_name` | The name of the TensorRT-LLM draft model to use. |
229 |
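As a hedged sketch only (assuming a draft model has been configured through the parameters above and both models are loaded; the endpoint mirrors the `generate` examples elsewhere in this repository, and the token counts are illustrative):

```bash
# Sketch: request speculative decoding through the BLS model.
curl -X POST localhost:8000/v2/models/tensorrt_llm_bls/generate -d \
  '{"text_input": "What is machine learning?", "max_tokens": 64, "num_draft_tokens": 4, "use_draft_logits": false}'
```
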
230 | ### Model Input and Output
231 |
232 | Below is the lists of input and output tensors for the `tensorrt_llm` and
233 | `tensorrt_llm_bls` models.
234 |
235 | #### Common Inputs
236 |
237 | | Name | Shape | Type | Description |
238 | | :------------: | :---------------: | :-----------: | :--------: |
239 | | `end_id` | [1] | `int32` | End token ID. If not specified, defaults to -1 |
240 | | `pad_id` | [1] | `int32` | Padding token ID |
241 | | `temperature` | [1] | `float32` | Sampling Config param: `temperature` |
242 | | `repetition_penalty` | [1] | `float` | Sampling Config param: `repetitionPenalty` |
243 | | `min_tokens` | [1] | `int32_t` | Sampling Config param: `minTokens` |
244 | | `presence_penalty` | [1] | `float` | Sampling Config param: `presencePenalty` |
245 | | `frequency_penalty` | [1] | `float` | Sampling Config param: `frequencyPenalty` |
246 | | `seed` | [1] | `uint64_t` | Sampling Config param: `seed` |
247 | | `return_log_probs` | [1] | `bool` | When `true`, include log probs in the output |
248 | | `return_context_logits` | [1] | `bool` | When `true`, include context logits in the output |
249 | | `return_generation_logits` | [1] | `bool` | When `true`, include generation logits in the output |
250 | | `num_return_sequences` | [1] | `int32_t` | Number of generated sequences per request. (Default=1) |
251 | | `beam_width` | [1] | `int32_t` | Beam width for this request; set to 1 for greedy sampling (Default=1) |
252 | | `prompt_embedding_table` | [1] | `float16` (model data type) | P-tuning prompt embedding table |
253 | | `prompt_vocab_size` | [1] | `int32` | P-tuning prompt vocab size |
254 | | `return_perf_metrics` | [1] | `bool` | When `true`, include perf metrics in the output, such as kv cache reuse stats |
255 | | `guided_decoding_guide_type` | [1] | `string` | Guided decoding param: `guide_type` |
256 | | `guided_decoding_guide` | [1] | `string` | Guided decoding param: `guide` |
257 |
258 | The following inputs for lora are for both `tensorrt_llm` and `tensorrt_llm_bls`
259 | models. The inputs are passed through the `tensorrt_llm` model and the
260 | `tensorrt_llm_bls` model will refer to the inputs from the `tensorrt_llm` model.
261 |
262 | | Name | Shape | Type | Description |
263 | | :------------: | :---------------: | :-----------: | :--------: |
264 | | `lora_task_id` | [1] | `uint64` | The unique task ID for the given LoRA. To perform inference with a specific LoRA for the first time, `lora_task_id`, `lora_weights`, and `lora_config` must all be given. The LoRA will be cached, so that subsequent requests for the same task only require `lora_task_id`. If the cache is full, the oldest LoRA will be evicted to make space for new ones. An error is returned if `lora_task_id` is not cached |
265 | | `lora_weights` | [ num_lora_modules_layers, D x Hi + Ho x D ] | `float` (model data type) | Weights for a LoRA adapter. See the config file for more details. |
266 | | `lora_config` | [ num_lora_modules_layers, 3] | `int32_t` | Module identifier. See the config file for more details. |
267 |
268 | #### Common Outputs
269 |
270 | Note: the timing metric outputs are represented as the number of nanoseconds since epoch.
271 |
272 | | Name | Shape | Type | Description |
273 | | :------------: | :---------------: | :-----------: | :--------: |
274 | | `cum_log_probs` | [-1] | `float` | Cumulative probabilities for each output |
275 | | `output_log_probs` | [beam_width, -1] | `float` | Log probabilities for each output |
276 | | `context_logits` | [-1, vocab_size] | `float` | Context logits for input |
277 | | `generation_logits` | [beam_width, seq_len, vocab_size] | `float` | Generation logits for each output |
278 | | `batch_index` | [1] | `int32` | Batch index |
279 | | `kv_cache_alloc_new_blocks` | [1] | `int32` | KV cache reuse metrics. Number of newly allocated blocks per request. Set the optional input `return_perf_metrics` to `true` to include `kv_cache_alloc_new_blocks` in the outputs. |
280 | | `kv_cache_reused_blocks` | [1] | `int32` | KV cache reuse metrics. Number of reused blocks per request. Set the optional input `return_perf_metrics` to `true` to include `kv_cache_reused_blocks` in the outputs. |
281 | | `kv_cache_alloc_total_blocks` | [1] | `int32` | KV cache reuse metrics. Number of total allocated blocks per request. Set the optional input `return_perf_metrics` to `true` to include `kv_cache_alloc_total_blocks` in the outputs. |
282 | | `arrival_time_ns` | [1] | `float` | Time when the request was received by TRT-LLM. Set the optional input `return_perf_metrics` to `true` to include `arrival_time_ns` in the outputs. |
283 | | `first_scheduled_time_ns` | [1] | `float` | Time when the request was first scheduled. Set the optional input `return_perf_metrics` to `true` to include `first_scheduled_time_ns` in the outputs. |
284 | | `first_token_time_ns` | [1] | `float` | Time when the first token was generated. Set the optional input `return_perf_metrics` to `true` to include `first_token_time_ns` in the outputs. |
285 | | `last_token_time_ns` | [1] | `float` | Time when the last token was generated. Set the optional input `return_perf_metrics` to `true` to include `last_token_time_ns` in the outputs. |
286 | | `acceptance_rate` | [1] | `float` | Acceptance rate of the speculative decoding model. Set the optional input `return_perf_metrics` to `true` to include `acceptance_rate` in the outputs. |
287 | | `total_accepted_draft_tokens` | [1] | `int32` | Number of tokens accepted by the target model in speculative decoding. Set the optional input `return_perf_metrics` to `true` to include `total_accepted_draft_tokens` in the outputs. |
288 | | `total_draft_tokens` | [1] | `int32` | Maximum number of draft tokens acceptable by the target model in speculative decoding. Set the optional input `return_perf_metrics` to `true` to include `total_draft_tokens` in the outputs. |
289 |
290 | #### Unique Inputs for tensorrt_llm model
291 |
292 | | Name | Shape | Type | Description |
293 | | :------------: | :---------------: | :-----------: | :--------: |
294 | | `input_ids` | [-1] | `int32` | Input token IDs |
295 | | `input_lengths` | [1] | `int32` | Input lengths |
296 | | `request_output_len` | [1] | `int32` | Requested output length |
297 | | `draft_input_ids` | [-1] | `int32` | Draft input IDs |
298 | | `decoder_input_ids` | [-1] | `int32` | Decoder input IDs |
299 | | `decoder_input_lengths` | [1] | `int32` | Decoder input lengths |
300 | | `draft_logits` | [-1, -1] | `float32` | Draft logits |
301 | | `draft_acceptance_threshold` | [1] | `float32` | Draft acceptance threshold |
302 | | `stop_words_list` | [2, -1] | `int32` | List of stop words |
303 | | `bad_words_list` | [2, -1] | `int32` | List of bad words |
304 | | `embedding_bias` | [-1] | `string` | Embedding bias words |
305 | | `runtime_top_k` | [1] | `int32` | Top-k value for runtime top-k sampling |
306 | | `runtime_top_p` | [1] | `float32` | Top-p value for runtime top-p sampling |
307 | | `runtime_top_p_min` | [1] | `float32` | Minimum value for runtime top-p sampling |
308 | | `runtime_top_p_decay` | [1] | `float32` | Decay value for runtime top-p sampling |
309 | | `runtime_top_p_reset_ids` | [1] | `int32` | Reset IDs for runtime top-p sampling |
310 | | `len_penalty` | [1] | `float32` | Controls how to penalize longer sequences in beam search (Default=0.f) |
311 | | `early_stopping` | [1] | `bool` | Enable early stopping |
312 | | `beam_search_diversity_rate` | [1] | `float32` | Beam search diversity rate |
313 | | `stop` | [1] | `bool` | Stop flag |
314 | | `streaming` | [1] | `bool` | Enable streaming |
315 |
316 | #### Unique Outputs for tensorrt_llm model
317 |
318 | | Name | Shape | Type | Description |
319 | | :------------: | :---------------: | :-----------: | :--------: |
320 | | `output_ids` | [-1, -1] | `int32` | Output token IDs |
321 | | `sequence_length` | [-1] | `int32` | Sequence length |
322 |
323 | #### Unique Inputs for tensorrt_llm_bls model
324 |
325 | | Name | Shape | Type | Description |
326 | | :------------: | :---------------: | :-----------: | :--------: |
327 | | `text_input` | [-1] | `string` | Prompt text |
328 | | `decoder_text_input` | [1] | `string` | Decoder input text |
329 | | `image_input` | [3, 224, 224] | `float16` | Input image |
330 | | `max_tokens` | [-1] | `int32` | Number of tokens to generate |
331 | | `bad_words` | [2, num_bad_words] | `int32` | Bad words list |
332 | | `stop_words` | [2, num_stop_words] | `int32` | Stop words list |
333 | | `top_k` | [1] | `int32` | Sampling Config param: `topK` |
334 | | `top_p` | [1] | `float32` | Sampling Config param: `topP` |
335 | | `length_penalty` | [1] | `float32` | Sampling Config param: `lengthPenalty` |
336 | | `stream` | [1] | `bool` | When `true`, stream out tokens as they are generated. When `false` return only when the full generation has completed (Default=`false`) |
337 | | `embedding_bias_words` | [-1] | `string` | Embedding bias words |
338 | | `embedding_bias_weights` | [-1] | `float32` | Embedding bias weights |
339 | | `num_draft_tokens` | [1] | `int32` | Number of tokens to get from draft model during speculative decoding |
340 | | `use_draft_logits` | [1] | `bool` | Use logit comparison during speculative decoding |
341 |
342 | #### Unique Outputs for tensorrt_llm_bls model
343 |
344 | | Name | Shape | Type | Description |
345 | | :------------: | :---------------: | :-----------: | :--------: |
346 | | `text_output` | [-1] | `string` | Text output |
347 |
348 | ## Some tips for model configuration
349 |
350 | Below are some tips for configuring models for optimal performance. These
351 | recommendations are based on our experiments and may not apply to all use cases.
352 | For guidance on other parameters, please refer to the
353 | [perf_best_practices](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/performance/perf-best-practices.md).
354 |
355 | - **Setting the `instance_count` for models to better utilize inflight batching**
356 |
357 | The `instance_count` parameter in the config.pbtxt file specifies the number
358 | of instances of the model to run. Ideally, this should be set to match the
359 | maximum batch size supported by the TRT engine, as this allows for concurrent
360 | request execution and reduces performance bottlenecks. However, it will also
361 | consume more CPU memory resources. While the optimal value isn't something we
362 | can determine in advance, it generally shouldn't be set to a very small
363 | value, such as 1.
364 | For most use cases, we have found that setting `instance_count` to 5 works
365 | well across a variety of workloads in our experiments.
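
  As a hedged sketch (the model repository path and the value 5 are illustrative), the instance counts can be set when filling the config templates:

  ```bash
  # Illustrative only: raise the instance counts of the tokenization and BLS models.
  python3 tensorrt_llm/triton_backend/tools/fill_template.py -i triton_model_repo/preprocessing/config.pbtxt preprocessing_instance_count:5
  python3 tensorrt_llm/triton_backend/tools/fill_template.py -i triton_model_repo/postprocessing/config.pbtxt postprocessing_instance_count:5
  python3 tensorrt_llm/triton_backend/tools/fill_template.py -i triton_model_repo/tensorrt_llm_bls/config.pbtxt bls_instance_count:5
  ```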
366 |
367 | - **Adjusting `max_batch_size` and `max_num_tokens` to optimize inflight batching**
368 |
369 | `max_batch_size` and `max_num_tokens` are important parameters for optimizing
370 | inflight batching. You can modify `max_batch_size` in the model configuration
371 | file, while `max_num_tokens` is set during the conversion to a TRT-LLM engine
372 | using the `trtllm-build` command. Tuning these parameters is necessary for
373 | different scenarios, and experimentation is currently the best approach to
374 | finding optimal values. Generally, the total number of requests should be
375 | lower than `max_batch_size`, and the total tokens should be less than
376 | `max_num_tokens`.
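
  As a hedged sketch of where each knob lives (the values and paths are illustrative, not recommendations): `max_num_tokens` is fixed when the engine is built with `trtllm-build`, while the Triton-side maximum batch size is filled into the model configuration afterwards.

  ```bash
  # Illustrative only: set max_num_tokens at engine build time...
  trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \
               --output_dir ${ENGINE_PATH} \
               --gemm_plugin float16 \
               --max_batch_size 64 \
               --max_num_tokens 8192
  # ...and fill the Triton-side batch size into the model configuration.
  python3 tensorrt_llm/triton_backend/tools/fill_template.py -i triton_model_repo/tensorrt_llm/config.pbtxt triton_max_batch_size:64
  ```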
377 |
--------------------------------------------------------------------------------
/docs/multimodal.md:
--------------------------------------------------------------------------------
1 | # End to end workflow to run a Multimodal model
2 |
3 | ### Support Matrix
4 | The following multimodal models are supported in tensorrtllm_backend:
5 | * BLIP2-OPT
6 | * LLAVA
7 | * VILA
8 | * LLaVA OneVision
9 | * MLLAMA
10 | * Qwen2-VL
11 |
12 | For more multimodal models supported in TensorRT-LLM, please visit [TensorRT-LLM multimodal examples](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/multimodal).
13 |
14 | ## Run Multimodal with single-GPU Tritonserver
15 | ### Tritonserver setup steps
16 | 0. Make sure that you have initialized the TRT-LLM submodule:
17 |
18 | ```bash
19 | git clone https://github.com/triton-inference-server/tensorrtllm_backend.git && cd tensorrtllm_backend
20 | git lfs install
21 | git submodule update --init --recursive
22 | ```
23 |
24 | 1. Start the Triton Server Docker container:
25 |
26 | 1-1. If you're using Tritonserver from nvcr.io
27 | ```bash
28 | # Replace <xx.yy> with the version of Triton you want to use.
29 | # The command below assumes the current directory is the
30 | # TRT-LLM backend root git repository.
31 |
32 | docker run --rm -ti --net=host -v `pwd`:/mnt -w /mnt --gpus all nvcr.io/nvidia/tritonserver:<xx.yy>-trtllm-python-py3 bash
33 | ```
34 | 1-2. If you are using `tensorrtllm_backend` container:
35 | ```bash
36 | docker run --rm -ti --net=host -v `pwd`:/mnt -w /mnt --gpus all triton_trt_llm
37 | ```
38 |
39 | 2. Build the engine:
40 |
41 | 2-1. Clone the target model repository
42 | ```bash
43 | # For BLIP2-OPT
44 | export MODEL_NAME="blip2-opt-2.7b"
45 | git clone https://huggingface.co/Salesforce/${MODEL_NAME} tmp/hf_models/${MODEL_NAME}
46 |
47 | # For LLAVA
48 | export MODEL_NAME="llava-1.5-7b-hf"
49 | git clone https://huggingface.co/llava-hf/${MODEL_NAME} tmp/hf_models/${MODEL_NAME}
50 |
51 | # For VILA
52 | pip install -r all_models/multimodal/requirements-vila.txt
53 |
54 | export MODEL_NAME="vila1.5-3b"
55 | git clone https://huggingface.co/Efficient-Large-Model/${MODEL_NAME} tmp/hf_models/${MODEL_NAME}
56 |
57 | export VILA_PATH="tmp/hf_models/VILA"
58 | git clone https://github.com/Efficient-Large-Model/VILA.git ${VILA_PATH}
59 |
60 | # For LLaVA OneVision
61 | pip install -r all_models/multimodal/requirements-llava-onevision.txt
62 |
63 | export MODEL_NAME="llava-onevision-qwen2-7b-ov-hf"
64 | git clone https://huggingface.co/llava-hf/${MODEL_NAME} tmp/hf_models/${MODEL_NAME}
65 |
66 | # For MLLAMA
67 | pip install -r all_models/multimodal/requirements-mllama.txt
68 |
69 | export MODEL_NAME="Llama-3.2-11B-Vision"
70 | git clone https://huggingface.co/meta-llama/${MODEL_NAME} tmp/hf_models/${MODEL_NAME}
71 |
72 | # For Qwen2-VL
73 | pip install -r all_models/multimodal/requirements-qwen2vl.txt
74 |
75 | export MODEL_NAME="Qwen2-VL-7B-Instruct"
76 | git clone https://huggingface.co/Qwen/${MODEL_NAME} tmp/hf_models/${MODEL_NAME}
77 |
79 | ```
80 | 2-2. Build TensorRT-LLM engines
81 | ```bash
82 | export HF_MODEL_PATH=tmp/hf_models/${MODEL_NAME}
83 | export UNIFIED_CKPT_PATH=tmp/trt_models/${MODEL_NAME}/fp16/1-gpu
84 | export ENGINE_PATH=tmp/trt_engines/${MODEL_NAME}/fp16/1-gpu
85 | export MULTIMODAL_ENGINE_PATH=tmp/trt_engines/${MODEL_NAME}/multimodal_encoder
86 |
87 | # For BLIP2-OPT
88 | python tensorrt_llm/examples/models/contrib/opt/convert_checkpoint.py --model_type blip2 \
89 | --model_dir ${HF_MODEL_PATH} \
90 | --output_dir ${UNIFIED_CKPT_PATH} \
91 | --dtype float16
92 |
93 | trtllm-build \
94 | --checkpoint_dir ${UNIFIED_CKPT_PATH} \
95 | --output_dir ${ENGINE_PATH} \
96 | --gemm_plugin float16 \
97 | --max_beam_width 1 \
98 | --max_batch_size 8 \
99 | --max_seq_len 1024 \
100 | --max_input_len 924 \
101 | --max_multimodal_len 256 # 8 (max_batch_size) * 32 (num_multimodal_features) for BLIP2
102 |
103 | python tensorrt_llm/examples/models/core/multimodal/build_multimodal_engine.py --model_type blip2 --model_path ${HF_MODEL_PATH} --max_batch_size 8
104 |
105 | # For LLAVA
106 | python tensorrt_llm/examples/models/core/llama/convert_checkpoint.py \
107 | --model_dir ${HF_MODEL_PATH} \
108 | --output_dir ${UNIFIED_CKPT_PATH} \
109 | --dtype float16
110 |
111 | trtllm-build \
112 | --checkpoint_dir ${UNIFIED_CKPT_PATH} \
113 | --output_dir ${ENGINE_PATH} \
114 | --gemm_plugin float16 \
115 | --max_batch_size 8 \
116 | --max_input_len 2048 \
117 | --max_seq_len 2560 \
118 | --max_multimodal_len 4608 # 8 (max_batch_size) * 576 (num_multimodal_features) for LLaVA
119 |
120 | python tensorrt_llm/examples/models/core/multimodal/build_multimodal_engine.py --model_path ${HF_MODEL_PATH} --model_type llava --max_batch_size 8
121 |
122 | # For VILA
123 | python tensorrt_llm/examples/models/core/llama/convert_checkpoint.py \
124 | --model_dir ${HF_MODEL_PATH} \
125 | --output_dir ${UNIFIED_CKPT_PATH} \
126 | --dtype float16
127 |
128 | trtllm-build \
129 | --checkpoint_dir ${UNIFIED_CKPT_PATH} \
130 | --output_dir ${ENGINE_PATH} \
131 | --gemm_plugin float16 \
132 | --max_batch_size 8 \
133 | --max_input_len 2048 \
134 | --max_seq_len 2560 \
135 | --max_multimodal_len 6272 # 8 (max_batch_size) * 196 (num_multimodal_features) * 4 (max_num_images_per_request)
136 |
137 | python tensorrt_llm/examples/models/core/multimodal/build_multimodal_engine.py --model_path ${HF_MODEL_PATH} --model_type vila --vila_path ${VILA_PATH} --max_batch_size 32 # max_batch_size * max_num_images_per_request, since VILA supports multi-image inference
138 |
139 | # For LLaVA OneVision
140 | python tensorrt_llm/examples/models/contrib/qwen/convert_checkpoint.py \
141 | --model_dir ${HF_MODEL_PATH} \
142 | --output_dir ${UNIFIED_CKPT_PATH} \
143 | --dtype float16
144 |
145 | trtllm-build \
146 | --checkpoint_dir ${UNIFIED_CKPT_PATH} \
147 | --output_dir ${ENGINE_PATH} \
148 | --gemm_plugin float16 \
149 | --max_batch_size 1 \
150 | --max_input_len 7500 \
151 | --max_seq_len 7600 \
152 | --max_multimodal_len 7300 # max_batch_size * num_multimodal_features (depends on the image size or the specified number of video frames)
153 |
154 | python tensorrt_llm/examples/models/core/multimodal/build_multimodal_engine.py --model_path ${HF_MODEL_PATH} --model_type llava_onevision --max_batch_size 16 # max_batch_size * patches per image (or frames per video)
155 |
156 | # For MLLAMA
157 | python tensorrt_llm/examples/models/core/mllama/convert_checkpoint.py \
158 | --model_dir ${HF_MODEL_PATH} \
159 | --output_dir ${UNIFIED_CKPT_PATH} \
160 | --dtype bfloat16
161 |
162 | trtllm-build \
163 | --checkpoint_dir ${UNIFIED_CKPT_PATH} \
164 | --output_dir ${ENGINE_PATH} \
165 | --gemm_plugin auto \
166 | --max_batch_size 8 \
167 | --max_seq_len 2048 \
168 | --max_num_tokens 4096 \
169 | --max_encoder_input_len 6404
170 |
171 | python tensorrt_llm/examples/models/core/multimodal/build_multimodal_engine.py --model_path ${HF_MODEL_PATH} --model_type mllama --output_dir ${MULTIMODAL_ENGINE_PATH} --max_batch_size 8 #max_batch_size * max_num_images_per_request
172 |
173 | # For Qwen2-VL
174 | python3 ../qwen/convert_checkpoint.py \
175 | --model_dir ${HF_MODEL_PATH} \
176 | --output_dir ${UNIFIED_CKPT_PATH} \
177 | --dtype float16
178 |
179 | trtllm-build --checkpoint_dir ${UNIFIED_CKPT_PATH} \
180 | --output_dir ${ENGINE_PATH} \
181 | --gemm_plugin=float16 \
182 | --gpt_attention_plugin=float16 \
183 | --max_batch_size 4 \
184 | --max_input_len 2048 \
185 | --max_seq_len 3072 \
186 | --max_multimodal_len 1296 # 4 (max_batch_size) * 324 (num_multimodal_features); this is for image_shape=[504,504]
187 |
188 | python tensorrt_llm/examples/models/core/multimodal/build_multimodal_engine.py --model_type qwen2_vl --model_path ${HF_MODEL_PATH} --output_dir ${MULTIMODAL_ENGINE_PATH}
189 | ```
190 |
191 | > **NOTE**:
192 | >
193 | > `max_multimodal_len = max_batch_size * num_multimodal_features`, so if you change `max_batch_size`, `max_multimodal_len` **MUST** be changed accordingly.
194 | > For multi-image inference, where a single request can contain multiple images, `max_multimodal_len = max_batch_size * num_multimodal_features * max_num_images_per_request`; see the helper sketch below.
195 | >
196 | > The built visual engines are located in `tmp/trt_engines/${MODEL_NAME}/multimodal_encoder`.
197 |
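To make the formula above concrete, here is a small helper sketch; the feature counts come from the comments in the build commands above (for example 576 for LLaVA, 32 for BLIP2, 196 per image for VILA), and the variable names are purely illustrative:

```bash
# Recompute --max_multimodal_len after changing max_batch_size.
MAX_BATCH_SIZE=8
NUM_MM_FEATURES=576            # model dependent, e.g. 576 for LLaVA
MAX_NUM_IMAGES_PER_REQUEST=1   # > 1 only for multi-image models such as VILA
echo $(( MAX_BATCH_SIZE * NUM_MM_FEATURES * MAX_NUM_IMAGES_PER_REQUEST ))   # 4608 -> pass to trtllm-build --max_multimodal_len
```
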
198 | 3. Prepare Tritonserver configs
199 |
200 | ```bash
201 | cp tensorrt_llm/triton_backend/all_models/inflight_batcher_llm/ multimodal_ifb -r
202 | # Override the ensemble and create the new multimodal_encoders directory for multimodal
203 | cp tensorrt_llm/triton_backend/all_models/multimodal/ensemble multimodal_ifb -r
204 | cp tensorrt_llm/triton_backend/all_models/multimodal/multimodal_encoders multimodal_ifb -r
205 |
206 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i multimodal_ifb/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:8,decoupled_mode:False,max_beam_width:1,engine_dir:${ENGINE_PATH},enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0,enable_chunked_context:False,encoder_input_features_data_type:${ENCODER_INPUT_FEATURES_DTYPE},logits_datatype:TYPE_FP32,cross_kv_cache_fraction:0.5
207 |
208 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i multimodal_ifb/preprocessing/config.pbtxt tokenizer_dir:${HF_MODEL_PATH},triton_max_batch_size:8,preprocessing_instance_count:1,multimodal_model_path:${MULTIMODAL_ENGINE_PATH},engine_dir:${ENGINE_PATH},max_num_images:1,max_queue_delay_microseconds:20000
209 |
210 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i multimodal_ifb/postprocessing/config.pbtxt tokenizer_dir:${HF_MODEL_PATH},triton_max_batch_size:8,postprocessing_instance_count:1
211 |
212 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i multimodal_ifb/ensemble/config.pbtxt triton_max_batch_size:8,logits_datatype:TYPE_FP32
213 |
214 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i multimodal_ifb/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:8,decoupled_mode:False,bls_instance_count:1,accumulate_tokens:False,tensorrt_llm_model_name:tensorrt_llm,multimodal_encoders_name:multimodal_encoders,logits_datatype:TYPE_FP32
215 |
216 | # Newly added for multimodal
217 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i multimodal_ifb/multimodal_encoders/config.pbtxt triton_max_batch_size:8,multimodal_model_path:${MULTIMODAL_ENGINE_PATH},encoder_input_features_data_type:${ENCODER_INPUT_FEATURES_DTYPE},hf_model_path:${HF_MODEL_PATH},max_queue_delay_microseconds:20000
218 | ```
219 | > **NOTE**:
220 | >
221 | > You can set the `decoupled_mode` option to True to use streaming mode.
222 | >
223 | > You can set the `accumulate_tokens` option to True in streaming mode to call the postprocessing model with all accumulated tokens.
224 | >
225 | > You can set the `enable_kv_cache_reuse` option to True to enable KV cache reuse. Requests with the same image/prompt table/input tokens will reuse the KV cache, which helps reduce latency. The performance improvement depends on the length of the reused prefix.
226 | >
227 | > You can set `max_num_images` to the maximum number of images per request. The value should match the `max_num_images_per_request` value used when building the engine above.
228 | >
229 | > Set `${ENCODER_INPUT_FEATURES_DTYPE}` to `TYPE_BF16` for mllama, and `TYPE_FP16` for other models.
230 | > `cross_kv_cache_fraction` is used to split the paged KV cache memory pool for encoder-decoder models: `free_fraction * (1 - cross_kv_cache_fraction)` goes to the self-attention KV caches and `free_fraction * cross_kv_cache_fraction` goes to the cross-attention KV caches.
231 |
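As a rough worked example of that split, assuming `free_fraction` refers to the `kv_cache_free_gpu_mem_fraction` setting and using 0.9 and 0.5 purely as illustrative values:

```bash
# With free_fraction = 0.9 and cross_kv_cache_fraction = 0.5, roughly 45% of the free GPU
# memory is reserved for the self-attention KV cache and 45% for the cross-attention KV cache.
python3 -c "free, cross = 0.9, 0.5; print('self-attention share:', free * (1 - cross), 'cross-attention share:', free * cross)"
```
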
232 | 4. Launch Tritonserver
233 |
234 | ```bash
235 | python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 1 --model_repo=multimodal_ifb/ --tensorrt_llm_model_name tensorrt_llm,multimodal_encoders --multimodal_gpu0_cuda_mem_pool_bytes 300000000
236 | ```
237 |
238 | > **NOTE**:
239 | > If there is an error associated with 'MPI_Init_thread', please run `export PMIX_MCA_gds=hash`.
240 | >
241 | > When launching the server, since the prompt_embedding_table lives in GPU memory, we need to set the CUDA pool memory used for inter-step communication. For example, a prompt_embedding_table of shape (1, 576, 4096) in fp16 with a max_batch_size of 8 takes about 2 bytes * 576 * 4096 * 8 ≈ 38 MB, so we set the pool to 300 MB (300000000 bytes) to leave room for additional GPU buffers.
242 | >
243 | > Also, the tensorrt_llm initialization assumes another GPU; it must be initialized even though it is not used.
244 |
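A small sizing sketch for `--multimodal_gpu0_cuda_mem_pool_bytes`, using the same illustrative shape and batch size as above (the variable names are not part of any script):

```bash
BYTES_PER_ELEM=2        # fp16
NUM_MM_FEATURES=576     # prompt table rows contributed per image
HIDDEN_SIZE=4096
MAX_BATCH_SIZE=8
# ~38 MB of prompt-table data; 300000000 (300 MB) leaves headroom for other GPU buffers.
echo $(( BYTES_PER_ELEM * NUM_MM_FEATURES * HIDDEN_SIZE * MAX_BATCH_SIZE ))
```
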
245 | ### Send requests
246 | 1. Send request with `decoupled_mode` set to False
247 | ```bash
248 | python tensorrt_llm/triton_backend/tools/multimodal/client.py --text 'Question: which city is this? Answer:' --image 'https://storage.googleapis.com/sfr-vision-language-research/LAVIS/assets/merlion.png' --request-output-len 16 --model_type blip2
249 |
250 | [beam 0 ]:
251 | Question: which city is this? Answer: singapore
252 | [INFO] Latency: 41.942 ms
253 | ```
254 | 2. Send request with `decoupled_mode` set to True
255 | ```bash
256 | python tensorrt_llm/triton_backend/tools/multimodal/client.py --text 'Question: which city is this? Answer:' --image 'https://storage.googleapis.com/sfr-vision-language-research/LAVIS/assets/merlion.png' --request-output-len 16 --model_type blip2 --streaming
257 |
258 | [beam 0 ]: sing
259 | [beam 0 ]: apore
260 | [beam 0 ]:
261 | [INFO] Latency: 43.441 ms
262 | ```
263 | 3. Send request to the `tensorrt_llm_bls` model
264 | ```bash
265 | python tensorrt_llm/triton_backend/tools/multimodal/client.py --text 'Question: which city is this? Answer:' --image 'https://storage.googleapis.com/sfr-vision-language-research/LAVIS/assets/merlion.png' --request-output-len 16 --model_type blip2 --use_bls
266 |
267 | [beam 0 ]:
268 | Question: which city is this? Answer: singapore
269 | [INFO] Latency: 44.152 ms
270 | ```
271 |
272 | 4. Send request to the `tensorrt_llm_bls` model with `accumulate_tokens` set to True
273 | ```bash
274 | python tensorrt_llm/triton_backend/tools/multimodal/client.py --text 'Question: which city is this? Answer:' --image 'https://storage.googleapis.com/sfr-vision-language-research/LAVIS/assets/merlion.png' --request-output-len 16 --model_type blip2 --use_bls --streaming
275 |
276 | [beam 0 ]: sing
277 | [beam 0 ]: singapore
278 | [beam 0 ]: singapore
279 | [INFO] Latency: 45.48 ms
280 | ```
281 |
282 | 5. Send request with `enable_kv_cache_reuse` set to True
283 | ```bash
284 | python tensorrt_llm/triton_backend/tools/multimodal/client.py --text 'Question: which city is this? Answer:' --image 'https://storage.googleapis.com/sfr-vision-language-research/LAVIS/assets/merlion.png' --request-output-len 16 --model_type blip2 --prompt_table_extra_id ${id}
285 |
286 | [beam 0 ]:
287 | Question: which city is this? Answer: singapore
288 | [INFO] Latency: 42.514 ms
289 | ```
290 | 6. Send request with multiple images per request
291 | ```bash
292 | wget -O av.png https://raw.githubusercontent.com/Efficient-Large-Model/VILA/main/demo_images/av.png
293 |
294 | python tensorrt_llm/triton_backend/tools/multimodal/client.py --text '\n\n Please elaborate what you see in the images?' --image av.png,'https://storage.googleapis.com/sfr-vision-language-research/LAVIS/assets/merlion.png' --request-output-len 68 --model_type vila --hf_model_dir ${HF_MODEL_PATH}
295 |
296 | [beam 0 ]:
297 | A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: \n \n Please elaborate what you see in the images? ASSISTANT: The first image shows a busy street scene with a car driving through a crosswalk, surrounded by pedestrians and traffic lights. The second image captures a beautiful sunset with the iconic Merlion statue spouting water into the bay, with the Singapore Flyer and the city skyline in the background.
298 |
299 | [INFO] Latency: 403.879 ms
300 | ```
301 |
302 | 7. Send request with curl
303 | The Triton server supports curl requests with an image URL in the payload. For example, here is a request sent to a Llama-3.2-11B-Vision (MLLAMA) model:
304 | ``` bash
305 | curl -X POST localhost:8000/v2/models/ensemble/generate_stream \
306 | -d '{"id": "42", "text_input": "<|image|>If I had to write a haiku for this one", "image_url_input": "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/assets/merlion.png", "parameters": {"max_tokens": 16, "beam_width": 1, "end_id": 128001, "pad_id": 128004, "top_k": 1, "top_p": 0, "stream": false, "temperature": 0}}'
307 |
308 | # response
309 | data: {"batch_index":0,"context_logits":0.0,"cum_log_probs":0.0,"generation_logits":0.0,"id":"42","model_name":"ensemble","model_version":"1","output_log_probs":[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0],"sequence_end":false,"sequence_id":0,"sequence_index":0,"sequence_start":false,"text_output":"If I had to write a haiku for this one, it would be:.\\nMerlion spouts water.\\nMarina"}
310 | ```
311 | You can also send requests with base64-encoded images: replace the URL above with `data:image/jpeg;base64,<base64-encoded image>`, as sketched below.
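
A hedged sketch of that substitution (the file name is a placeholder; `base64 -w0` is the GNU coreutils form, use plain `base64` on macOS):

```bash
# Inline a local JPEG as a data URI in place of the image URL.
IMAGE_B64=$(base64 -w0 my_image.jpg)
curl -X POST localhost:8000/v2/models/ensemble/generate_stream \
-d '{"id": "42", "text_input": "<|image|>If I had to write a haiku for this one", "image_url_input": "data:image/jpeg;base64,'"${IMAGE_B64}"'", "parameters": {"max_tokens": 16, "beam_width": 1, "end_id": 128001, "pad_id": 128004, "top_k": 1, "top_p": 0, "stream": false, "temperature": 0}}'
```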
312 |
313 | 8. Send request with video input
314 | ```bash
315 | python tensorrt_llm/triton_backend/tools/multimodal/client.py --text "Why is this video funny?" --video sample_demo_1.mp4 --video_num_frames 8 --request-output-len 30 --model_type llava_onevision --end-id 151645
316 |
317 | [beam 0 ]:
318 | user
319 | Why is this video funny?assistant
320 | The video is funny because the child's actions are playful and exaggerated, as if they are reading the book with great enthusiasm.
321 | [INFO] Latency: 507.537 ms
322 | ```
323 |
324 | > **NOTE**:
325 | > Please ignore any exception thrown with the output. It's a known issue to be fixed.
326 | >
327 | > When `enable_kv_cache_reuse` is set to true, the `prompt_table_extra_id` must be specified in the requests. The `prompt_table_extra_id` is a unique identifier representing the image (or prompt table), the same image uses the same id. The data type is `uint64`, and the minimum value is 1.
328 |
329 | ### Kill the server
330 | ```bash
331 | pkill tritonserver
332 | ```
333 |
334 | ### Supported image input types
335 | When programmatically preparing your own request for the server, note that the `ensemble` model accepts the following image inputs:
336 | - `image_input`: a float16 5D tensor of shape `[batch_size, num_images, num_channels, height, width]` or `[batch_size, num_images, height, width, num_channels]` representing a batch of images already processed (via transformers AutoProcessor) for the vision encoder.
337 | - `image_bytes_input`: a uint8 5D tensor of shape `[batch_size, num_images, num_channels, height, width]` or `[batch_size, num_images, height, width, num_channels]` representing a batch of raw images.
338 | - `image_url_input`: a list of strings of shape `[batch_size, num_images]` representing a batch of image urls.
339 |
340 | You may populate only one of these image inputs in a request. We suggest using `image_bytes_input` for gRPC requests and `image_url_input` for HTTP requests. For gRPC requests where the client can preprocess images to reduce load on the server, use `image_input`. Note that `tensorrt_llm_bls` only supports `image_input`.
341 |
342 | ### Long multimodal context, FP8 KV cache and tensor parallelism
343 |
344 | Follow these steps to enable chunked-context inference (using LLaVA as an example) with FP8 KV cache and 2-way tensor parallelism. Ensure you convert the checkpoint with `--tp_size 2` and build the model with `--use_paged_context_fmha enable` and `--use_fp8_context_fmha enable`. Set `enable_chunked_context` to True in the Tritonserver configuration file. The chunk size is determined by the `max_num_tokens` flag when building the engine, which defaults to 8192. When launching the server, change `--world_size` to match your tensor-parallelism size.
345 | 1. Build the engine
346 | ```bash
347 | export MODEL_NAME="llava-1.5-7b-hf"
348 | export HF_MODEL_PATH=tmp/hf_models/${MODEL_NAME}
349 |
350 | # Convert checkpoint
351 | # For fp16 KV cache
352 | export UNIFIED_CKPT_PATH=tmp/trt_models/${MODEL_NAME}/fp16/2-gpu
353 | export ENGINE_PATH=tmp/trt_engines/${MODEL_NAME}/fp16/2-gpu
354 | export MULTIMODAL_ENGINE_PATH=tmp/trt_engines/${MODEL_NAME}/multimodal_encoder
355 | python tensorrt_llm/examples/models/core/llama/convert_checkpoint.py \
356 | --model_dir ${HF_MODEL_PATH} \
357 | --output_dir ${UNIFIED_CKPT_PATH} \
358 | --dtype float16 \
359 | --tp_size 2
360 |
361 | # For fp8 KV cache
362 | export UNIFIED_CKPT_PATH=tmp/trt_models/${MODEL_NAME}/fp8/2-gpu
363 | export ENGINE_PATH=tmp/trt_engines/${MODEL_NAME}/fp8/2-gpu
364 | export MULTIMODAL_ENGINE_PATH=tmp/trt_engines/${MODEL_NAME}/multimodal_encoder
365 | python ./tensorrt_llm/examples/quantization/quantize.py \
366 | --model_dir ${HF_MODEL_PATH} \
367 | --dtype float16 \
368 | --qformat fp8 \
369 | --kv_cache_dtype fp8 \
370 | --output_dir ${UNIFIED_CKPT_PATH} \
371 | --calib_size 512 \
372 | --tp_size 2
373 |
374 | # Build the llm engine
375 | # --use_paged_context_fmha and --use_fp8_context_fmha are enabled by default
376 | # include --max_num_tokens to set the chunk size
377 | trtllm-build \
378 | --checkpoint_dir ${UNIFIED_CKPT_PATH} \
379 | --output_dir ${ENGINE_PATH} \
380 | --gemm_plugin auto \
381 | --max_batch_size 8 \
382 | --max_input_len 2048 \
383 | --max_seq_len 2560 \
384 | --max_multimodal_len 4608 # 8 (max_batch_size) * 576 (num_multimodal_features) for LLaVA
385 |
386 | # Build the multimodal engine
387 | python tensorrt_llm/examples/models/core/multimodal/build_multimodal_engine.py --model_path ${HF_MODEL_PATH} --model_type llava --max_batch_size 8 --output_dir ${MULTIMODAL_ENGINE_PATH}
388 | ```
389 | 2. Prepare the Tritonserver config file
390 | Prepare the Tritonserver config file with `enable_chunked_context` set to True. Also, to make better use of free GPU memory, we can set `kv_cache_free_gpu_mem_fraction` to 0.9.
391 | ```bash
392 | cp tensorrt_llm/triton_backend/all_models/inflight_batcher_llm/ multimodal_ifb -r
394 | # Override the ensemble and create the new multimodal_encoders directory for multimodal
394 | cp tensorrt_llm/triton_backend/all_models/multimodal/ensemble multimodal_ifb -r
395 | cp tensorrt_llm/triton_backend/all_models/multimodal/multimodal_encoders multimodal_ifb -r
396 |
397 | # Change enable_chunked_context to True and set kv_cache_free_gpu_mem_fraction to 0.9
398 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i multimodal_ifb/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:8,decoupled_mode:False,max_beam_width:1,engine_dir:${ENGINE_PATH},enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0,enable_chunked_context:True,encoder_input_features_data_type:${ENCODER_INPUT_FEATURES_DTYPE},logits_datatype:TYPE_FP32,kv_cache_free_gpu_mem_fraction:0.9
399 |
400 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i multimodal_ifb/preprocessing/config.pbtxt tokenizer_dir:${HF_MODEL_PATH},triton_max_batch_size:8,preprocessing_instance_count:1,multimodal_model_path:${MULTIMODAL_ENGINE_PATH},engine_dir:${ENGINE_PATH},max_num_images:1,max_queue_delay_microseconds:20000
401 |
402 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i multimodal_ifb/postprocessing/config.pbtxt tokenizer_dir:${HF_MODEL_PATH},triton_max_batch_size:8,postprocessing_instance_count:1
403 |
404 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i multimodal_ifb/ensemble/config.pbtxt triton_max_batch_size:8,logits_datatype:TYPE_FP32
405 |
406 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i multimodal_ifb/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:8,decoupled_mode:False,bls_instance_count:1,accumulate_tokens:False,tensorrt_llm_model_name:tensorrt_llm,multimodal_encoders_name:multimodal_encoders,logits_datatype:TYPE_FP32
407 |
408 | # Newly added for multimodal
409 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i multimodal_ifb/multimodal_encoders/config.pbtxt triton_max_batch_size:8,multimodal_model_path:${MULTIMODAL_ENGINE_PATH},encoder_input_features_data_type:${ENCODER_INPUT_FEATURES_DTYPE},hf_model_path:${HF_MODEL_PATH},max_queue_delay_microseconds:20000
410 | ```
411 | 3. Launch the server
412 | ```bash
413 | # Change --world_size to your tp size
414 | python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 2 --model_repo=multimodal_ifb/ --tensorrt_llm_model_name tensorrt_llm,multimodal_encoders --multimodal_gpu0_cuda_mem_pool_bytes 300000000
415 | ```
416 |
417 | When you launch the server, you will see logs similar to the following. In theory, you can now process a long multimodal context up to the "max tokens in paged KV cache" value, with the context prefill phase executed in chunks.
418 | ```bash
419 | [TensorRT-LLM][INFO] Memory usage when calculating max tokens in paged kv cache: total: 93.10 GiB, available: 85.57 GiB
420 | ...
421 | [TensorRT-LLM][INFO] [MemUsageChange] Allocated 77.02 GiB for max tokens in paged KV cache (315488).
422 | ```
423 |
--------------------------------------------------------------------------------
/docs/whisper.md:
--------------------------------------------------------------------------------
1 | # End to end workflow to run a Whisper model
2 |
3 | ### Support Matrix
4 | The following speech recognition models are supported in tensorrtllm_backend:
5 | * Whisper
6 | * Distil-Whisper
7 |
8 | ## Run Whisper with single-GPU Tritonserver
9 | ### Tritonserver setup steps
10 | 0. Make sure that you have initialized the TRT-LLM submodule:
11 |
12 | ```bash
13 | git clone https://github.com/triton-inference-server/tensorrtllm_backend.git && cd tensorrtllm_backend
14 | git lfs install
15 | git submodule update --init --recursive
16 | ```
17 |
18 | 1. Start the Triton Server Docker container:
19 |
20 | 1-1. If you're using Tritonserver from nvcr.io
21 | ```bash
22 | # Replace <xx.yy> with the version of Triton you want to use.
23 | # The command below assumes that the current directory is the
24 | # TRT-LLM backend root git repository.
25 |
26 | docker run --rm -ti --net=host -v `pwd`:/mnt -w /mnt --gpus all nvcr.io/nvidia/tritonserver:<xx.yy>-trtllm-python-py3 bash
27 | ```
28 | 1-2. If you are using `tensorrtllm_backend` container:
29 | ```bash
30 | docker run --rm -ti --net=host -v `pwd`:/mnt -w /mnt --gpus all triton_trt_llm
31 | ```
32 |
33 | 2. Build the engine:
34 |
35 | 2-1. Download the whisper models
36 | ```bash
37 | wget --directory-prefix=assets https://raw.githubusercontent.com/openai/whisper/main/whisper/assets/multilingual.tiktoken
38 | wget --directory-prefix=assets https://raw.githubusercontent.com/openai/whisper/main/whisper/assets/mel_filters.npz
39 | wget --directory-prefix=assets https://raw.githubusercontent.com/yuekaizhang/Triton-ASR-Client/main/datasets/mini_en/wav/1221-135766-0002.wav
40 | # take large-v3 model as an example
41 | wget --directory-prefix=assets https://openaipublic.azureedge.net/main/whisper/models/e5b1a55b89c1367dacf97e3e19bfd829a01529dbfdeefa8caeb59b3f1b81dadb/large-v3.pt
42 | ```
43 | 2-2. Build TensorRT-LLM engines
44 | ```bash
45 | INFERENCE_PRECISION=float16
46 | MAX_BEAM_WIDTH=4
47 | MAX_BATCH_SIZE=64
48 | checkpoint_dir=tllm_checkpoint
49 | output_dir=whisper_large_v3_max_batch_${MAX_BATCH_SIZE}
50 |
51 | python3 convert_checkpoint.py --model_dir ${MODEL_DIR} --output_dir ${checkpoint_dir}
52 |
53 | trtllm-build --checkpoint_dir ${checkpoint_dir}/encoder \
54 | --output_dir ${output_dir}/encoder \
55 | --moe_plugin disable \
56 | --max_batch_size ${MAX_BATCH_SIZE} \
57 | --gemm_plugin disable \
58 | --bert_attention_plugin ${INFERENCE_PRECISION} \
59 | --max_input_len 3000 --max_seq_len=3000
60 |
61 | trtllm-build --checkpoint_dir ${checkpoint_dir}/decoder \
62 | --output_dir ${output_dir}/decoder \
63 | --moe_plugin disable \
64 | --max_beam_width ${MAX_BEAM_WIDTH} \
65 | --max_batch_size ${MAX_BATCH_SIZE} \
66 | --max_seq_len 114 \
67 | --max_input_len 14 \
68 | --max_encoder_input_len 3000 \
69 | --gemm_plugin ${INFERENCE_PRECISION} \
70 | --bert_attention_plugin ${INFERENCE_PRECISION} \
71 | --gpt_attention_plugin ${INFERENCE_PRECISION}
72 |
73 | ```
74 |
75 | > **NOTE**:
76 | >
77 | > TensorRT-LLM also supports the different [distil-whisper](https://github.com/huggingface/distil-whisper) models; their parameters and weights must first be converted from the Hugging Face naming format to the [openai whisper](https://github.com/openai/whisper) naming format, which you can do by running the script [distil_whisper/convert_from_distil_whisper.py](./convert_from_distil_whisper.py).
78 |
79 | 3. Prepare Tritonserver configs
80 |
81 | ```bash
82 | cp tensorrt_llm/triton_backend/all_models/whisper/ model_repo_whisper -r
83 | cp tensorrt_llm/triton_backend/all_models/inflight_batcher_llm/tensorrt_llm model_repo_whisper -r
84 | wget --directory-prefix=model_repo_whisper/whisper_bls/1 https://raw.githubusercontent.com/openai/whisper/main/whisper/assets/multilingual.tiktoken
85 | wget --directory-prefix=model_repo_whisper/whisper_bls/1 https://raw.githubusercontent.com/openai/whisper/main/whisper/assets/mel_filters.npz
86 |
87 | BACKEND=tensorrtllm
88 | DECOUPLED_MODE=false
89 | DECODER_ENGINE_PATH=${output_dir}/decoder
90 | ENCODER_ENGINE_PATH=${output_dir}/encoder
91 | MAX_TOKENS_IN_KV_CACHE=24000
92 | BATCHING_STRATEGY=inflight_fused_batching
93 | KV_CACHE_FREE_GPU_MEM_FRACTION=0.5
94 | EXCLUDE_INPUT_IN_OUTPUT=True
95 | TRITON_MAX_BATCH_SIZE=8
96 | MAX_QUEUE_DELAY_MICROSECONDS=0
97 | MAX_BEAM_WIDTH=1
98 | MAX_QUEUE_SIZE="0"
99 | ENABLE_KV_CACHE_REUSE=false
100 | ENABLE_CHUNKED_CONTEXT=false
101 | CROSS_KV_CACHE_FRACTION="0.5"
102 | n_mels=128
103 | zero_pad=false
104 |
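# The fill_template.py command below also references the following variables, which are
# not defined elsewhere in this guide. The values here are illustrative placeholders only;
# adjust them (or fill in the corresponding config.pbtxt fields directly) for your deployment.
MAX_ATTENTION_WINDOW_SIZE=""
BATCH_SCHEDULER_POLICY=guaranteed_no_evict
NORMALIZE_LOG_PROBS=true
GPU_DEVICE_IDS=""
DECODING_MODE=""
ENABLE_CONTEXT_FMHA_FP32_ACC=false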
105 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i model_repo_whisper/tensorrt_llm/config.pbtxt triton_backend:${BACKEND},engine_dir:${DECODER_ENGINE_PATH},encoder_engine_dir:${ENCODER_ENGINE_PATH},decoupled_mode:${DECOUPLED_MODE},max_tokens_in_paged_kv_cache:${MAX_TOKENS_IN_KV_CACHE},max_attention_window_size:${MAX_ATTENTION_WINDOW_SIZE},batch_scheduler_policy:${BATCH_SCHEDULER_POLICY},batching_strategy:${BATCHING_STRATEGY},kv_cache_free_gpu_mem_fraction:${KV_CACHE_FREE_GPU_MEM_FRACTION},exclude_input_in_output:${EXCLUDE_INPUT_IN_OUTPUT},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},max_queue_delay_microseconds:${MAX_QUEUE_DELAY_MICROSECONDS},max_beam_width:${MAX_BEAM_WIDTH},enable_kv_cache_reuse:${ENABLE_KV_CACHE_REUSE},normalize_log_probs:${NORMALIZE_LOG_PROBS},enable_chunked_context:${ENABLE_CHUNKED_CONTEXT},gpu_device_ids:${GPU_DEVICE_IDS},decoding_mode:${DECODING_MODE},max_queue_size:${MAX_QUEUE_SIZE},enable_context_fmha_fp32_acc:${ENABLE_CONTEXT_FMHA_FP32_ACC},cross_kv_cache_fraction:${CROSS_KV_CACHE_FRACTION},encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32
106 |
107 | python3 tensorrt_llm/triton_backend/tools/fill_template.py -i model_repo_whisper/whisper_bls/config.pbtxt engine_dir:${ENCODER_ENGINE_PATH},n_mels:$n_mels,zero_pad:$zero_pad,triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},decoupled_mode:${DECOUPLED_MODE}
108 | ```
109 | > **NOTE**:
110 | >
111 | > TODO: You can set the `decoupled_mode` option to True to use streaming mode.
112 |
113 | 4. Launch Tritonserver
114 |
115 | ```bash
116 | python3 tensorrt_llm/triton_backend/scripts/launch_triton_server.py --world_size 1 --model_repo=model_repo_whisper/ --tensorrt_llm_model_name tensorrt_llm,whisper_bls --multimodal_gpu0_cuda_mem_pool_bytes 300000000
117 | ```
118 |
119 | ### Send requests
120 | 1. Send request with a single audio file
121 | ```bash
122 | wget -nc https://raw.githubusercontent.com/yuekaizhang/Triton-ASR-Client/main/datasets/mini_en/wav/1221-135766-0002.wav
123 | # Test non-streaming
124 | python3 tensorrt_llm/triton_backend/whisper/client.py --audio-path 1221-135766-0002.wav
125 | ```
126 | 2. Send requests with a whole audio dataset
127 | ```bash
128 | git clone https://github.com/yuekaizhang/Triton-ASR-Client.git
129 | cd Triton-ASR-Client
130 | num_task=16
131 | python3 tensorrt_llm/triton_backend/whisper/client.py \
132 | --server-addr localhost \
133 | --model-name whisper_bls \
134 | --num-tasks $num_task \
135 | --text-prompt "<|startoftranscript|><|zh|><|transcribe|><|notimestamps|>" \
136 | --manifest-dir ./datasets/aishell1_test \
137 | --compute-cer
138 | ```
139 | ### Kill the server
140 | ```bash
141 | pkill tritonserver
142 | ```
143 |
--------------------------------------------------------------------------------
/images/leader-mode.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-inference-server/tensorrtllm_backend/402e36e96195cecfdefff119f6df756548fd4312/images/leader-mode.png
--------------------------------------------------------------------------------
/images/orchestrator-mode.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-inference-server/tensorrtllm_backend/402e36e96195cecfdefff119f6df756548fd4312/images/orchestrator-mode.png
--------------------------------------------------------------------------------