├── .github └── workflows │ ├── codeql.yml │ └── pre-commit.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── Conceptual_Guide ├── Part_1-model_deployment │ ├── README.md │ ├── client.py │ ├── img │ │ └── multiple_models.PNG │ ├── img1.jpg │ ├── model_repository │ │ ├── text_detection │ │ │ └── config.pbtxt │ │ └── text_recognition │ │ │ └── config.pbtxt │ └── utils │ │ └── model.py ├── Part_2-improving_resource_utilization │ ├── README.md │ ├── img │ │ ├── dynamic_batching.PNG │ │ └── multi_instance.PNG │ └── model_repository │ │ └── text_recognition │ │ └── config.pbtxt ├── Part_3-optimizing_triton_configuration │ ├── README.md │ ├── img │ │ ├── arch.jpg │ │ ├── report_1.PNG │ │ └── report_2.PNG │ └── reports │ │ ├── detailed │ │ ├── text_recognition_config_4 │ │ │ └── detailed_report.pdf │ │ ├── text_recognition_config_5 │ │ │ └── detailed_report.pdf │ │ └── text_recognition_config_default │ │ │ └── detailed_report.pdf │ │ └── summaries │ │ └── text_recognition │ │ └── result_summary.pdf ├── Part_4-inference_acceleration │ ├── README.md │ ├── img │ │ ├── fw-trt-workflow.PNG │ │ ├── query_flow.PNG │ │ └── selecting_accelerator.PNG │ └── sample_configs │ │ ├── ORT_TRT_config.pbtxt │ │ ├── ORT_cuda_ep_config.pbtxt │ │ └── ORT_openvino_config.pbtxt ├── Part_5-Model_Ensembles │ ├── README.md │ ├── client.py │ ├── img1.jpg │ ├── model_repository │ │ ├── detection_postprocessing │ │ │ ├── 1 │ │ │ │ └── model.py │ │ │ └── config.pbtxt │ │ ├── detection_preprocessing │ │ │ ├── 1 │ │ │ │ └── model.py │ │ │ └── config.pbtxt │ │ ├── ensemble_model │ │ │ ├── 1 │ │ │ │ └── .gitkeep │ │ │ └── config.pbtxt │ │ ├── recognition_postprocessing │ │ │ ├── 1 │ │ │ │ └── model.py │ │ │ └── config.pbtxt │ │ ├── text_detection │ │ │ └── config.pbtxt │ │ └── text_recognition │ │ │ └── config.pbtxt │ └── utils │ │ ├── export_text_detection.sh │ │ ├── export_text_recognition.py │ │ ├── export_text_recognition.sh │ │ └── model.py ├── Part_6-building_complex_pipelines │ ├── README.md │ ├── 
client.py │ ├── export.py │ ├── gui │ │ ├── README.md │ │ ├── client.py │ │ └── requirements.txt │ ├── img │ │ └── multiple_backends.PNG │ └── model_repository │ │ ├── pipeline │ │ ├── 1 │ │ │ └── model.py │ │ └── config.pbtxt │ │ ├── text_encoder │ │ └── config.pbtxt │ │ └── vae │ │ └── config.pbtxt ├── Part_7-iterative_scheduling │ ├── README.md │ ├── client │ │ ├── client.py │ │ └── print_utils.py │ ├── input_data.json │ └── model_repository │ │ ├── iterative-gpt2 │ │ ├── 1 │ │ │ └── model.py │ │ └── config.pbtxt │ │ └── simple-gpt2 │ │ ├── 1 │ │ └── model.py │ │ └── config.pbtxt ├── Part_8-semantic_caching │ ├── README.md │ └── artifacts │ │ ├── semantic_cache.patch │ │ └── semantic_caching.py └── README.md ├── Deployment └── Kubernetes │ ├── EKS_Multinode_Triton_TRTLLM │ ├── 1. Create_EKS_Cluster.md │ ├── 2. Configure_EKS_Cluster.md │ ├── 3. Deploy_Triton.md │ ├── README.md │ ├── eks_cluster_config.yaml │ ├── multinode_helm_chart │ │ ├── aws-efa-k8s-device-plugin │ │ │ ├── .helmignore │ │ │ ├── Chart.yaml │ │ │ ├── README.md │ │ │ ├── templates │ │ │ │ ├── NOTES.txt │ │ │ │ ├── _helpers.tpl │ │ │ │ └── daemonset.yaml │ │ │ └── values.yaml │ │ ├── chart │ │ │ ├── Chart.yaml │ │ │ ├── example_values.yaml │ │ │ ├── templates │ │ │ │ ├── NOTES.txt │ │ │ │ ├── deployment.yaml │ │ │ │ ├── hpa.yaml │ │ │ │ ├── pod-monitor.yaml │ │ │ │ ├── rbac.yaml │ │ │ │ └── service.yaml │ │ │ ├── values.schema.json │ │ │ └── values.yaml │ │ ├── containers │ │ │ ├── README.md │ │ │ ├── kubessh │ │ │ ├── server.py │ │ │ └── triton_trt_llm.containerfile │ │ ├── gen_ai_perf.yaml │ │ ├── nccl_test.yaml │ │ ├── nvidia_dcgm-exporter_values.yaml │ │ ├── nvidia_gpu-feature-discovery_daemonset.yaml │ │ ├── setup_ssh_efs.yaml │ │ └── triton-metrics_prometheus-rule.yaml │ ├── p5-trtllm-cluster-config.yaml │ └── pvc │ │ ├── claim.yaml │ │ ├── pv.yaml │ │ └── storageclass.yaml │ ├── README.md │ ├── TensorRT-LLM_Autoscaling_and_Load_Balancing │ ├── .gitignore │ ├── README.md │ ├── chart │ │ ├── 
.gitignore │ │ ├── Chart.yaml │ │ ├── gpt2_values.yaml │ │ ├── llama-2-7b-chat_values.yaml │ │ ├── llama-2-7b_values.yaml │ │ ├── llama-3-70b-instruct_values.yaml │ │ ├── llama-3-8b-instruct_values.yaml │ │ ├── llama-3-8b_values.yaml │ │ ├── opt125m_values.yaml │ │ ├── pvc_aws │ │ │ ├── claim_aws.yaml │ │ │ ├── pv_aws.yaml │ │ │ └── storageclass_aws.yaml │ │ ├── templates │ │ │ ├── NOTES.txt │ │ │ ├── deployment.yaml │ │ │ ├── horizontal-pod-autoscaler.yaml │ │ │ ├── pod-monitor.yaml │ │ │ └── service.yaml │ │ ├── values.schema.json │ │ └── values.yaml │ ├── clients │ │ ├── README.md │ │ ├── gpt2.yaml │ │ ├── llama-2-70b-instruct.yaml │ │ ├── llama-2-7b.yaml │ │ ├── llama-3-8b-instruct.yaml │ │ ├── llama-3-8b.yaml │ │ └── opt125m.yaml │ ├── containers │ │ ├── README.md │ │ ├── client.containerfile │ │ ├── client.py │ │ ├── server.py │ │ └── triton_trt-llm.containerfile │ ├── grafana_inference-metrics_dashboard.json │ ├── images │ │ ├── grafana-dashboard.png │ │ ├── grafana_import-dashboard.png │ │ ├── grafana_new-dashboard.png │ │ ├── graph_gpu-utilization.png │ │ └── graph_queue-compute-ratio.png │ ├── nvidia_dcgm-exporter_values.yaml │ ├── nvidia_gpu-feature-discovery_daemonset.yaml │ ├── prometheus-adapter_values.yaml │ ├── setup_ssh-nfs.yaml │ └── triton-metrics_prometheus-rule.yaml │ └── TensorRT-LLM_Multi-Node_Distributed_Models │ ├── .gitignore │ ├── README.md │ ├── chart │ ├── .gitignore │ ├── Chart.yaml │ ├── gpt2_values.yaml │ ├── llama-2-70b_values.yaml │ ├── llama-2-7b-chat_values.yaml │ ├── llama-2-7b_values.yaml │ ├── llama-3-70b-instruct_values.yaml │ ├── llama-3-8b-instruct_values.yaml │ ├── llama-3-8b_values.yaml │ ├── opt125m_values.yaml │ ├── templates │ │ ├── NOTES.txt │ │ ├── deployment.yaml │ │ ├── job.yaml │ │ ├── pod-monitor.yaml │ │ ├── rbac.yaml │ │ └── service.yaml │ ├── values.schema.json │ └── values.yaml │ ├── containers │ ├── README.md │ ├── kubessh │ ├── server.py │ └── triton_trt-llm.containerfile │ ├── 
nvidia_dcgm-exporter_values.yaml │ ├── nvidia_gpu-feature-discovery_daemonset.yaml │ └── pvc.yaml ├── Feature_Guide ├── Constrained_Decoding │ ├── README.md │ └── artifacts │ │ ├── client.py │ │ ├── client_utils.py │ │ └── utils.py ├── Data_Pipelines │ ├── README.md │ ├── client.py │ ├── img │ │ └── Flow.PNG │ └── model_repository │ │ ├── ensemble_model │ │ └── 1 │ │ │ └── config.pbtxt │ │ ├── model1 │ │ ├── 1 │ │ │ └── model.py │ │ └── config.pbtxt │ │ └── model2 │ │ ├── 1 │ │ └── model.py │ │ └── config.pbtxt ├── Function_Calling │ ├── README.md │ └── artifacts │ │ ├── client.py │ │ ├── client_utils.py │ │ └── system_prompt_schema.yml └── Speculative_Decoding │ ├── README.md │ ├── TRT-LLM │ └── README.md │ ├── dataset-converter.py │ └── vLLM │ ├── README.md │ └── model_repository │ ├── base_model │ ├── 1 │ │ └── model.json │ └── config.pbtxt │ ├── eagle_model │ ├── 1 │ │ └── model.json │ └── config.pbtxt │ └── opt_model │ ├── 1 │ └── model.json │ └── config.pbtxt ├── HuggingFace ├── README.md ├── client.py ├── ensemble_model_repository │ ├── ensemble_model │ │ └── config.pbtxt │ └── preprocessing │ │ ├── 1 │ │ └── model.py │ │ └── config.pbtxt ├── img │ ├── Approach.PNG │ └── netron.PNG └── python_model_repository │ └── python_vit │ ├── 1 │ └── model.py │ └── config.pbtxt ├── LICENSE ├── Migration_Guide ├── img │ └── arch.PNG └── migration_guide.md ├── Popular_Models_Guide ├── DeepSeek │ └── README.md ├── Hermes-2-Pro-Llama-3-8B │ └── README.md ├── Llama2 │ ├── README.md │ ├── deploy_trtllm_llama.sh │ ├── llama2vllm │ │ ├── 1 │ │ │ └── model.json │ │ └── config.pbtxt │ ├── trtllm_guide.md │ └── vllm_guide.md ├── Llava1.5 │ ├── llava_trtllm_guide.md │ ├── model_repository │ │ ├── llava-1.5 │ │ │ ├── 1 │ │ │ │ └── model.py │ │ │ └── config.pbtxt │ │ ├── tensorrt_llm │ │ │ ├── 1 │ │ │ │ └── .gitkeep │ │ │ └── config.pbtxt │ │ └── vision_encoder │ │ │ ├── 1 │ │ │ └── model.py │ │ │ └── config.pbtxt │ └── multi_modal_client.py └── StableDiffusion │ ├── README.md │ ├── 
backend │ └── diffusion │ │ └── model.py │ ├── build.sh │ ├── client.py │ ├── diffusion-models │ ├── stable_diffusion_1_5 │ │ ├── 1 │ │ │ └── .gitkeep │ │ └── config.pbtxt │ └── stable_diffusion_xl │ │ ├── 1 │ │ └── .gitkeep │ │ └── config.pbtxt │ ├── docker │ ├── Dockerfile │ └── Dockerfile.dockerignore │ ├── docs │ ├── client_0_generated_image_0_1_5.jpg │ ├── client_0_generated_image_0_xl.jpg │ └── model_configuration.md │ ├── run.sh │ └── scripts │ ├── build_models.py │ └── build_models.sh ├── Quick_Deploy ├── HuggingFaceTransformers │ ├── Dockerfile │ ├── README.md │ ├── falcon7b │ │ ├── 1 │ │ │ └── model.py │ │ └── config.pbtxt │ ├── llama7b │ │ ├── 1 │ │ │ └── model.py │ │ └── config.pbtxt │ └── persimmon8b │ │ ├── 1 │ │ └── model.py │ │ └── config.pbtxt ├── ONNX │ ├── README.md │ └── client.py ├── OpenVINO │ └── README.md ├── PyTorch │ ├── README.md │ ├── client.py │ ├── config.pbtxt │ └── export.py ├── TensorFlow │ ├── README.md │ ├── client.py │ ├── config.pbtxt │ └── export.py └── vLLM │ ├── .gitignore │ └── README.md ├── README.md ├── Triton_Inference_Server_Python_API ├── README.md ├── build.sh ├── deps │ └── requirements.txt ├── docker │ ├── Dockerfile │ └── Dockerfile.dockerignore ├── docs │ ├── car_sample.jpg │ └── sample_generated_image.jpg ├── examples │ ├── kafka-io │ │ ├── README.md │ │ ├── models │ │ │ └── tokenizer │ │ │ │ ├── 1 │ │ │ │ └── model.py │ │ │ │ └── config.pbtxt │ │ ├── requirements.txt │ │ ├── start-kafka.sh │ │ ├── start-server.sh │ │ ├── tritonserver_deployment.py │ │ └── utils │ │ │ ├── kafka_consumer.py │ │ │ └── kafka_producer.py │ └── rayserve │ │ ├── README.md │ │ ├── client.py │ │ ├── start_ray.sh │ │ ├── stop_ray.sh │ │ └── tritonserver_deployment.py ├── identity-models │ └── identity │ │ ├── 1 │ │ └── model.py │ │ └── config.pbtxt └── run.sh └── pyproject.toml /.github/workflows/pre-commit.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2023, NVIDIA CORPORATION & 
AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | 27 | name: pre-commit 28 | 29 | on: 30 | pull_request: 31 | 32 | jobs: 33 | pre-commit: 34 | runs-on: ubuntu-22.04 35 | steps: 36 | - uses: actions/checkout@v3 37 | - uses: actions/setup-python@v3 38 | - uses: pre-commit/action@v3.0.0 39 | 40 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Pretrained Models 2 | **/*.onnx 3 | **/onnx/*.opt 4 | **/*.bin 5 | **/*.plan 6 | **/pytorch_model 7 | 8 | # Python Stuff 9 | **/__pycache__ 10 | 11 | # Downloaded Assets 12 | **/downloads 13 | -------------------------------------------------------------------------------- /Conceptual_Guide/Part_1-model_deployment/img/multiple_models.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/Conceptual_Guide/Part_1-model_deployment/img/multiple_models.PNG -------------------------------------------------------------------------------- /Conceptual_Guide/Part_1-model_deployment/img1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/Conceptual_Guide/Part_1-model_deployment/img1.jpg -------------------------------------------------------------------------------- /Conceptual_Guide/Part_1-model_deployment/model_repository/text_detection/config.pbtxt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | 27 | name: "text_detection" 28 | backend: "onnxruntime" 29 | max_batch_size : 0 30 | input [ 31 | { 32 | name: "input_images:0" 33 | data_type: TYPE_FP32 34 | dims: [ -1, -1, -1, 3 ] 35 | } 36 | ] 37 | output [ 38 | { 39 | name: "feature_fusion/Conv_7/Sigmoid:0" 40 | data_type: TYPE_FP32 41 | dims: [ -1, -1, -1, 1 ] 42 | } 43 | ] 44 | output [ 45 | { 46 | name: "feature_fusion/concat_3:0" 47 | data_type: TYPE_FP32 48 | dims: [ -1, -1, -1, 5 ] 49 | } 50 | ] 51 | -------------------------------------------------------------------------------- /Conceptual_Guide/Part_1-model_deployment/model_repository/text_recognition/config.pbtxt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | name: "text_recognition" 28 | backend: "onnxruntime" 29 | max_batch_size : 0 30 | input [ 31 | { 32 | name: "input.1" 33 | data_type: TYPE_FP32 34 | dims: [ 1, 1, 32, 100 ] 35 | } 36 | ] 37 | output [ 38 | { 39 | name: "308" 40 | data_type: TYPE_FP32 41 | dims: [ 1, 26, 37 ] 42 | } 43 | ] 44 | -------------------------------------------------------------------------------- /Conceptual_Guide/Part_2-improving_resource_utilization/img/dynamic_batching.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/Conceptual_Guide/Part_2-improving_resource_utilization/img/dynamic_batching.PNG -------------------------------------------------------------------------------- /Conceptual_Guide/Part_2-improving_resource_utilization/img/multi_instance.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/Conceptual_Guide/Part_2-improving_resource_utilization/img/multi_instance.PNG -------------------------------------------------------------------------------- /Conceptual_Guide/Part_2-improving_resource_utilization/model_repository/text_recognition/config.pbtxt: -------------------------------------------------------------------------------- 1 | 
# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | 27 | name: "text_recognition" 28 | backend: "onnxruntime" 29 | max_batch_size : 8 30 | input [ 31 | { 32 | name: "input.1" 33 | data_type: TYPE_FP32 34 | dims: [ 1, 32, 100 ] 35 | } 36 | ] 37 | output [ 38 | { 39 | name: "308" 40 | data_type: TYPE_FP32 41 | dims: [ 26, 37 ] 42 | } 43 | ] 44 | 45 | dynamic_batching { } 46 | 47 | instance_group [ 48 | { 49 | count: 2 50 | kind: KIND_GPU 51 | } 52 | ] 53 | -------------------------------------------------------------------------------- /Conceptual_Guide/Part_3-optimizing_triton_configuration/img/arch.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/Conceptual_Guide/Part_3-optimizing_triton_configuration/img/arch.jpg -------------------------------------------------------------------------------- /Conceptual_Guide/Part_3-optimizing_triton_configuration/img/report_1.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/Conceptual_Guide/Part_3-optimizing_triton_configuration/img/report_1.PNG -------------------------------------------------------------------------------- /Conceptual_Guide/Part_3-optimizing_triton_configuration/img/report_2.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/Conceptual_Guide/Part_3-optimizing_triton_configuration/img/report_2.PNG -------------------------------------------------------------------------------- /Conceptual_Guide/Part_3-optimizing_triton_configuration/reports/detailed/text_recognition_config_4/detailed_report.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/Conceptual_Guide/Part_3-optimizing_triton_configuration/reports/detailed/text_recognition_config_4/detailed_report.pdf -------------------------------------------------------------------------------- /Conceptual_Guide/Part_3-optimizing_triton_configuration/reports/detailed/text_recognition_config_5/detailed_report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/Conceptual_Guide/Part_3-optimizing_triton_configuration/reports/detailed/text_recognition_config_5/detailed_report.pdf -------------------------------------------------------------------------------- /Conceptual_Guide/Part_3-optimizing_triton_configuration/reports/detailed/text_recognition_config_default/detailed_report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/Conceptual_Guide/Part_3-optimizing_triton_configuration/reports/detailed/text_recognition_config_default/detailed_report.pdf -------------------------------------------------------------------------------- /Conceptual_Guide/Part_3-optimizing_triton_configuration/reports/summaries/text_recognition/result_summary.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/Conceptual_Guide/Part_3-optimizing_triton_configuration/reports/summaries/text_recognition/result_summary.pdf -------------------------------------------------------------------------------- /Conceptual_Guide/Part_4-inference_acceleration/img/fw-trt-workflow.PNG: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/Conceptual_Guide/Part_4-inference_acceleration/img/fw-trt-workflow.PNG -------------------------------------------------------------------------------- /Conceptual_Guide/Part_4-inference_acceleration/img/query_flow.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/Conceptual_Guide/Part_4-inference_acceleration/img/query_flow.PNG -------------------------------------------------------------------------------- /Conceptual_Guide/Part_4-inference_acceleration/img/selecting_accelerator.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/Conceptual_Guide/Part_4-inference_acceleration/img/selecting_accelerator.PNG -------------------------------------------------------------------------------- /Conceptual_Guide/Part_4-inference_acceleration/sample_configs/ORT_TRT_config.pbtxt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 
11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | 27 | name: "text_recognition" 28 | backend: "onnxruntime" 29 | max_batch_size : 16 30 | input [ 31 | { 32 | name: "input.1" 33 | data_type: TYPE_FP32 34 | dims: [ 1, 32, 100 ] 35 | } 36 | ] 37 | output [ 38 | { 39 | name: "308" 40 | data_type: TYPE_FP32 41 | dims: [ 26, 37 ] 42 | } 43 | ] 44 | 45 | model_warmup { 46 | name: "text_recognition" 47 | batch_size: 16 48 | inputs: { 49 | key: "input.1" 50 | value: { 51 | data_type: TYPE_FP32 52 | dims: 1 53 | dims: 32 54 | dims: 100 55 | zero_data: true 56 | } 57 | } 58 | } 59 | 60 | optimization { 61 | graph : { 62 | level : 1 63 | } 64 | execution_accelerators { 65 | gpu_execution_accelerator : [ { 66 | name : "tensorrt", 67 | parameters { key: "precision_mode" value: "FP16" }, 68 | parameters { key: "max_workspace_size_bytes" value: "1073741824" } 69 | }] 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /Conceptual_Guide/Part_4-inference_acceleration/sample_configs/ORT_cuda_ep_config.pbtxt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 
14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | name: "text_recognition" 28 | backend: "onnxruntime" 29 | max_batch_size : 16 30 | input [ 31 | { 32 | name: "input.1" 33 | data_type: TYPE_FP32 34 | dims: [ 1, 32, 100 ] 35 | } 36 | ] 37 | output [ 38 | { 39 | name: "308" 40 | data_type: TYPE_FP32 41 | dims: [ 26, 37 ] 42 | } 43 | ] 44 | 45 | parameters { key: "cudnn_conv_algo_search" value: { string_value: "0" } } 46 | parameters { key: "gpu_mem_limit" value: { string_value: "4294967200" } } 47 | -------------------------------------------------------------------------------- /Conceptual_Guide/Part_4-inference_acceleration/sample_configs/ORT_openvino_config.pbtxt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 
8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | name: "text_recognition" 28 | backend: "onnxruntime" 29 | max_batch_size : 16 30 | input [ 31 | { 32 | name: "input.1" 33 | data_type: TYPE_FP32 34 | dims: [ 1, 32, 100 ] 35 | } 36 | ] 37 | output [ 38 | { 39 | name: "308" 40 | data_type: TYPE_FP32 41 | dims: [ 26, 37 ] 42 | } 43 | ] 44 | 45 | optimization { execution_accelerators { 46 | cpu_execution_accelerator : [ { 47 | name : "openvino" 48 | } ] 49 | }} 50 | -------------------------------------------------------------------------------- /Conceptual_Guide/Part_5-Model_Ensembles/client.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | 27 | import numpy as np 28 | import tritonclient.grpc as grpcclient 29 | 30 | client = grpcclient.InferenceServerClient(url="localhost:8001") 31 | 32 | image_data = np.fromfile("img1.jpg", dtype="uint8") 33 | image_data = np.expand_dims(image_data, axis=0) 34 | 35 | input_tensors = [grpcclient.InferInput("input_image", image_data.shape, "UINT8")] 36 | input_tensors[0].set_data_from_numpy(image_data) 37 | results = client.infer(model_name="ensemble_model", inputs=input_tensors) 38 | output_data = results.as_numpy("recognized_text").astype(str) 39 | print(output_data) 40 | -------------------------------------------------------------------------------- /Conceptual_Guide/Part_5-Model_Ensembles/img1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/Conceptual_Guide/Part_5-Model_Ensembles/img1.jpg -------------------------------------------------------------------------------- /Conceptual_Guide/Part_5-Model_Ensembles/model_repository/detection_postprocessing/config.pbtxt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 
11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | name: "detection_postprocessing" 28 | backend: "python" 29 | max_batch_size: 256 30 | input [ 31 | { 32 | name: "detection_postprocessing_input_1" 33 | data_type: TYPE_FP32 34 | dims: [ -1, -1, 1 ] 35 | }, 36 | { 37 | name: "detection_postprocessing_input_2" 38 | data_type: TYPE_FP32 39 | dims: [ -1, -1, 5 ] 40 | }, 41 | { 42 | name: "detection_postprocessing_input_3" 43 | data_type: TYPE_FP32 44 | dims: [ -1, -1, 3 ] 45 | } 46 | ] 47 | 48 | output [ 49 | { 50 | name: "detection_postprocessing_output" 51 | data_type: TYPE_FP32 52 | dims: [ -1, -1, -1 ] 53 | } 54 | ] 55 | 56 | instance_group [{ kind: KIND_CPU }] 57 | 58 | -------------------------------------------------------------------------------- /Conceptual_Guide/Part_5-Model_Ensembles/model_repository/detection_preprocessing/config.pbtxt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | 27 | name: "detection_preprocessing" 28 | backend: "python" 29 | max_batch_size: 256 30 | input [ 31 | { 32 | name: "detection_preprocessing_input" 33 | data_type: TYPE_UINT8 34 | dims: [ -1 ] 35 | } 36 | ] 37 | 38 | output [ 39 | { 40 | name: "detection_preprocessing_output" 41 | data_type: TYPE_FP32 42 | dims: [ -1, -1, 3 ] 43 | } 44 | ] 45 | 46 | instance_group [{ kind: KIND_CPU }] 47 | -------------------------------------------------------------------------------- /Conceptual_Guide/Part_5-Model_Ensembles/model_repository/ensemble_model/1/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/Conceptual_Guide/Part_5-Model_Ensembles/model_repository/ensemble_model/1/.gitkeep -------------------------------------------------------------------------------- /Conceptual_Guide/Part_5-Model_Ensembles/model_repository/recognition_postprocessing/config.pbtxt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 
14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | name: "recognition_postprocessing" 28 | backend: "python" 29 | max_batch_size: 256 30 | input [ 31 | { 32 | name: "recognition_postprocessing_input" 33 | data_type: TYPE_FP32 34 | dims: [ 26, 37] 35 | } 36 | ] 37 | output [ 38 | { 39 | name: "recognition_postprocessing_output" 40 | data_type: TYPE_STRING 41 | dims: [ -1 ] 42 | } 43 | ] 44 | 45 | instance_group [{ kind: KIND_CPU }] 46 | 47 | -------------------------------------------------------------------------------- /Conceptual_Guide/Part_5-Model_Ensembles/model_repository/text_detection/config.pbtxt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 
8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | 27 | name: "text_detection" 28 | platform: "onnxruntime_onnx" 29 | max_batch_size : 256 30 | input [ 31 | { 32 | name: "input_images:0" 33 | data_type: TYPE_FP32 34 | dims: [ -1, -1, 3 ] 35 | } 36 | ] 37 | output [ 38 | { 39 | name: "feature_fusion/Conv_7/Sigmoid:0" 40 | data_type: TYPE_FP32 41 | dims: [ -1, -1, 1 ] 42 | } 43 | ] 44 | output [ 45 | { 46 | name: "feature_fusion/concat_3:0" 47 | data_type: TYPE_FP32 48 | dims: [ -1, -1, 5 ] 49 | } 50 | ] 51 | -------------------------------------------------------------------------------- /Conceptual_Guide/Part_5-Model_Ensembles/model_repository/text_recognition/config.pbtxt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | name: "text_recognition" 28 | platform: "onnxruntime_onnx" 29 | max_batch_size : 256 30 | input [ 31 | { 32 | name: "input.1" 33 | data_type: TYPE_FP32 34 | dims: [ 1, 32, 100 ] 35 | } 36 | ] 37 | output [ 38 | { 39 | name: "308" 40 | data_type: TYPE_FP32 41 | dims: [ 26, 37 ] 42 | } 43 | ] 44 | -------------------------------------------------------------------------------- /Conceptual_Guide/Part_5-Model_Ensembles/utils/export_text_detection.sh: -------------------------------------------------------------------------------- 1 | ## Execute from Part_5-Model_Ensembles Directory 2 | if [ ! 
-d "./model_repository" ]; then 3 | echo "Execute from the 'Part_5-Model_Ensembles' directory" 4 | exit 1 5 | fi 6 | 7 | ## Download Text Detection Model 8 | mkdir -p downloads 9 | wget -P downloads https://www.dropbox.com/s/r2ingd0l3zt8hxs/frozen_east_text_detection.tar.gz 10 | tar -xvf downloads/frozen_east_text_detection.tar.gz -C downloads 11 | 12 | ## Convert to ONNX 13 | pip install -U tf2onnx 14 | mkdir -p model_repository/text_detection/1 15 | python -m tf2onnx.convert \ 16 | --input downloads/frozen_east_text_detection.pb \ 17 | --inputs "input_images:0" \ 18 | --outputs "feature_fusion/Conv_7/Sigmoid:0","feature_fusion/concat_3:0" \ 19 | --output model_repository/text_detection/1/model.onnx 20 | -------------------------------------------------------------------------------- /Conceptual_Guide/Part_5-Model_Ensembles/utils/export_text_recognition.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 
14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | from pathlib import Path 28 | 29 | import torch 30 | from model import STRModel 31 | 32 | # Create PyTorch Model Object 33 | model = STRModel(input_channels=1, output_channels=512, num_classes=37) 34 | 35 | # Load model weights from external file 36 | state = torch.load("downloads/None-ResNet-None-CTC.pth") 37 | state = {key.replace("module.", ""): value for key, value in state.items()} 38 | model.load_state_dict(state) 39 | 40 | # Create ONNX file by tracing model 41 | model_directory = Path("model_repository/text_recognition/1/") 42 | model_directory.mkdir(parents=True, exist_ok=True) 43 | trace_input = torch.randn(1, 1, 32, 100) 44 | torch.onnx.export( 45 | model, 46 | trace_input, 47 | model_directory / "model.onnx", 48 | verbose=True, 49 | dynamic_axes={"input.1": [0], "308": [0]}, 50 | ) 51 | -------------------------------------------------------------------------------- /Conceptual_Guide/Part_5-Model_Ensembles/utils/export_text_recognition.sh: -------------------------------------------------------------------------------- 1 | ## Execute from Part_5-Model_Ensembles Directory 2 | if [ ! 
-d "./model_repository" ]; then 3 | echo "Execute from the 'Part_5-Model_Ensembles' directory" 4 | exit 1 5 | fi 6 | 7 | ## Download Text Recognition Model 8 | mkdir -p downloads 9 | wget -P downloads https://www.dropbox.com/sh/j3xmli4di1zuv3s/AABzCC1KGbIRe2wRwa3diWKwa/None-ResNet-None-CTC.pth 10 | 11 | ## Convert to ONNX 12 | python utils/export_text_recognition.py 13 | -------------------------------------------------------------------------------- /Conceptual_Guide/Part_6-building_complex_pipelines/client.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | import time 28 | 29 | import numpy as np 30 | import tritonclient.http as httpclient 31 | from PIL import Image 32 | from tritonclient.utils import * 33 | 34 | 35 | def main(): 36 | client = httpclient.InferenceServerClient(url="localhost:8000") 37 | 38 | prompt = "Pikachu with a hat, 4k, 3d render" 39 | text_obj = np.array([prompt], dtype="object").reshape((-1, 1)) 40 | 41 | input_text = httpclient.InferInput( 42 | "prompt", text_obj.shape, np_to_triton_dtype(text_obj.dtype) 43 | ) 44 | input_text.set_data_from_numpy(text_obj) 45 | 46 | output_img = httpclient.InferRequestedOutput("generated_image") 47 | 48 | query_response = client.infer( 49 | model_name="pipeline", inputs=[input_text], outputs=[output_img] 50 | ) 51 | 52 | image = query_response.as_numpy("generated_image") 53 | im = Image.fromarray(np.squeeze(image.astype(np.uint8))) 54 | im.save("generated_image2.jpg") 55 | 56 | 57 | if __name__ == "__main__": 58 | start = time.time() 59 | main() 60 | end = time.time() 61 | 62 | print("Time taken:", end - start) 63 | -------------------------------------------------------------------------------- /Conceptual_Guide/Part_6-building_complex_pipelines/gui/README.md: -------------------------------------------------------------------------------- 1 | 28 | 29 | # Stable Diffusion UI 30 | A simple Gradio UI for communicating with Stable Diffusion on Triton 31 | 32 | ## To deploy 33 | ``` 34 | pip install 
-r requirements.txt 35 | python client.py --triton_url <triton_url> 36 | ``` -------------------------------------------------------------------------------- /Conceptual_Guide/Part_6-building_complex_pipelines/gui/requirements.txt: -------------------------------------------------------------------------------- 1 | # Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 27 | gradio 28 | tritonclient[grpc] 29 | -------------------------------------------------------------------------------- /Conceptual_Guide/Part_6-building_complex_pipelines/img/multiple_backends.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/Conceptual_Guide/Part_6-building_complex_pipelines/img/multiple_backends.PNG -------------------------------------------------------------------------------- /Conceptual_Guide/Part_6-building_complex_pipelines/model_repository/pipeline/config.pbtxt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | backend: "python" 28 | max_batch_size: 8 29 | 30 | input [ 31 | { 32 | name: "prompt" 33 | data_type: TYPE_STRING 34 | dims: [1] 35 | } 36 | ] 37 | output [ 38 | { 39 | name: "generated_image" 40 | data_type: TYPE_FP32 41 | dims: [ -1, -1, -1] 42 | } 43 | ] 44 | 45 | instance_group [ 46 | { 47 | kind: KIND_GPU 48 | } 49 | ] 50 | 51 | -------------------------------------------------------------------------------- /Conceptual_Guide/Part_6-building_complex_pipelines/model_repository/text_encoder/config.pbtxt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 
14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | name: "text_encoder" 28 | platform: "onnxruntime_onnx" 29 | max_batch_size: 8 30 | 31 | input [ 32 | { 33 | name: "input_ids" 34 | data_type: TYPE_INT32 35 | dims: [ -1] 36 | } 37 | ] 38 | output [ 39 | { 40 | name: "pooler_output" 41 | data_type: TYPE_FP32 42 | dims: [ 768] 43 | }, 44 | { 45 | name: "last_hidden_state" 46 | data_type: TYPE_FP32 47 | dims: [ -1, 768] 48 | } 49 | ] 50 | 51 | instance_group [ 52 | { 53 | kind: KIND_GPU 54 | } 55 | ] 56 | optimization { 57 | graph : { 58 | level : 1 59 | } 60 | } 61 | 62 | parameters { key: "execution_mode" value: { string_value: "1" } } 63 | parameters { key: "cudnn_conv_algo_search" value: { string_value: "0" } } 64 | -------------------------------------------------------------------------------- /Conceptual_Guide/Part_6-building_complex_pipelines/model_repository/vae/config.pbtxt: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | 27 | name: "vae" 28 | platform: "tensorrt_plan" 29 | max_batch_size: 8 30 | 31 | input [ 32 | { 33 | name: "latent_sample" 34 | data_type: TYPE_FP32 35 | dims: [ -1, -1, -1] 36 | } 37 | ] 38 | output [ 39 | { 40 | name: "sample" 41 | data_type: TYPE_FP32 42 | dims: [ 3, -1, -1] 43 | } 44 | ] 45 | 46 | instance_group [ 47 | { 48 | kind: KIND_GPU 49 | } 50 | ] 51 | -------------------------------------------------------------------------------- /Conceptual_Guide/Part_7-iterative_scheduling/client/print_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. 
class Display:
    """Pair of tqdm progress bars that are stepped one unit at a time.

    The "top" bar is created at tqdm position 0 and the "bottom" bar at
    position 1; both receive ``max_tokens`` as their ``total``.
    """

    def __init__(self, max_tokens) -> None:
        """Create the two bars and remember ``max_tokens``.

        Args:
            max_tokens: Value passed as ``total`` to each tqdm bar.
        """

        def make_bar(row):
            # Both bars share every setting except their position.
            return tqdm(position=row, total=max_tokens, miniters=1)

        self._top = make_bar(0)
        self._bottom = make_bar(1)
        self._max_tokens = max_tokens

    @staticmethod
    def _advance(bar):
        """Call ``update(1)`` and then ``refresh()`` on ``bar``."""
        bar.update(1)
        bar.refresh()

    def update_top(self):
        """Advance the top bar by one step."""
        self._advance(self._top)

    def update_bottom(self):
        """Advance the bottom bar by one step."""
        self._advance(self._bottom)

    def clear(self):
        """Call ``reset()`` on both bars, top first."""
        for bar in (self._top, self._bottom):
            bar.reset()
2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | 27 | backend: "python" 28 | sequence_batching { 29 | iterative_sequence: true 30 | control_input: [{ 31 | name: "correlation_id" 32 | control [ 33 | { 34 | kind: CONTROL_SEQUENCE_CORRID 35 | data_type: TYPE_UINT64 36 | } 37 | ] 38 | }, 39 | { 40 | name: "start" 41 | control [ 42 | { 43 | kind: CONTROL_SEQUENCE_START 44 | fp32_false_true: [ 0, 1 ] 45 | } 46 | ] 47 | }, 48 | { 49 | name: "end" 50 | control [ 51 | { 52 | kind: CONTROL_SEQUENCE_END 53 | fp32_false_true: [ 0, 1 ] 54 | } 55 | ]} 56 | ] 57 | oldest {} 58 | max_sequence_idle_microseconds: 400000000 59 | } 60 | 61 | instance_group [ 62 | { 63 | count: 1 64 | kind: KIND_GPU 65 | } 66 | ] -------------------------------------------------------------------------------- /Conceptual_Guide/Part_7-iterative_scheduling/model_repository/simple-gpt2/config.pbtxt: -------------------------------------------------------------------------------- 1 | # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | backend: "python" 28 | 29 | instance_group [ 30 | { 31 | count: 1 32 | kind: KIND_GPU 33 | } 34 | ] -------------------------------------------------------------------------------- /Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/eks_cluster_config.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: eksctl.io/v1alpha5 2 | kind: ClusterConfig 3 | 4 | metadata: 5 | name: wenhant-eks-cluster-east2 6 | version: "1.30" 7 | region: us-east-2 8 | 9 | availabilityZones: 10 | - us-east-2a 11 | - us-east-2b 12 | - us-east-2c 13 | 14 | iam: 15 | withOIDC: true 16 | 17 | managedNodeGroups: 18 | - name: sys-nodes 19 | instanceType: c5.2xlarge 20 | minSize: 1 21 | desiredCapacity: 1 22 | maxSize: 1 23 | volumeSize: 80 24 | availabilityZones: ["us-east-2a"] 25 | iam: 26 | withAddonPolicies: 27 | imageBuilder: true 28 | autoScaler: true 29 | ebs: true 30 | efs: true 31 | awsLoadBalancerController: true 32 | cloudWatch: true 33 | albIngress: true 34 | 35 | - name: efa-compute-ng 36 | instanceType: g5.12xlarge 37 | minSize: 1 38 | desiredCapacity: 1 39 | maxSize: 1 40 | volumeSize: 300 41 | efaEnabled: true 42 | privateNetworking: true 43 | availabilityZones: ["us-east-2a"] 44 | iam: 45 | withAddonPolicies: 46 | imageBuilder: true 47 | autoScaler: true 48 | ebs: true 49 | efs: true 50 | awsLoadBalancerController: true 51 | cloudWatch: true 52 | albIngress: true 53 | 
-------------------------------------------------------------------------------- /Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/aws-efa-k8s-device-plugin/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *.orig 18 | *~ 19 | # Various IDEs 20 | .project 21 | .idea/ 22 | *.tmproj 23 | .vscode/ 24 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/aws-efa-k8s-device-plugin/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | appVersion: v0.5.3 3 | description: A Helm chart for EFA device plugin. 
4 | home: https://github.com/aws/eks-charts 5 | icon: https://raw.githubusercontent.com/aws/eks-charts/master/docs/logo/aws.png 6 | name: aws-efa-k8s-device-plugin 7 | sources: 8 | - https://github.com/aws/eks-charts 9 | version: v0.5.3 10 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/aws-efa-k8s-device-plugin/README.md: -------------------------------------------------------------------------------- 1 | # AWS EFA Kubernetes Device Plugin 2 | This chart installs the AWS EFA Kubernetes Device Plugin daemonset 3 | 4 | ## Prerequisites 5 | - Helm v3 6 | 7 | ## Installing the Chart 8 | First add the EKS repository to Helm: 9 | 10 | ```shell 11 | helm repo add eks https://aws.github.io/eks-charts 12 | ``` 13 | 14 | To install the chart with the release name `efa` in the `kube-system` namespace and default configuration: 15 | 16 | ```shell 17 | helm install efa ./aws-efa-k8s-device-plugin -n kube-system 18 | ``` 19 | 20 | # Configuration 21 | 22 | Parameter | Description | Default 23 | --- | --- | --- 24 | `image.repository` | EFA image repository | `602401143452.dkr.ecr.us-west-2.amazonaws.com/eks/aws-efa-k8s-device-plugin` 25 | `image.tag` | EFA image tag | `v0.5.3` 26 | `securityContext.allowPrivilegeEscalation` | Controls whether a process can gain more privilege than its parent process | `false` 27 | `securityContext` | EFA plugin security context | `capabilities: drop: ["ALL"] runAsNonRoot: false` 28 | `supportedInstanceLabels.keys` | Kubernetes key to interpret as instance type | `nodes.kubernetes.io/instance-type` 29 | `supportedInstanceLabels.values` | List of instances which currently support EFA devices | `see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa.html#efa-instance-types` 30 | `resources` | Resources for containers in pod | `requests.cpu: 10m requests.memory: 20Mi` 31 | `nodeSelector` | Node labels for pod assignment | `{}` 32 | 
`tolerations` | Optional deployment tolerations | `[]` 33 | `additionalPodAnnotations` | Pod annotations to apply in addition to the default ones | `{}` 34 | `additionalPodLabels` | Pod labels to apply in addition to the default ones | `{}` 35 | `nameOverride` | Override the name of the chart | `""` 36 | `fullnameOverride` | Override the full name of the chart | `""` 37 | `imagePullSecrets` | Docker registry pull secret | `[]` 38 | 39 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/aws-efa-k8s-device-plugin/templates/NOTES.txt: -------------------------------------------------------------------------------- 1 | EFA device plugin is installed, it can be requested as `vpc.amazonaws.com/efa` resource. -------------------------------------------------------------------------------- /Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/aws-efa-k8s-device-plugin/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | {{/* 2 | Expand the name of the chart. 3 | */}} 4 | {{- define "aws-efa-k8s-device-plugin.name" -}} 5 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} 6 | {{- end }} 7 | 8 | {{/* 9 | Create a default fully qualified app name. 10 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). 11 | If release name contains chart name it will be used as a full name. 
12 | */}} 13 | {{- define "aws-efa-k8s-device-plugin.fullname" -}} 14 | {{- if .Values.fullnameOverride }} 15 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} 16 | {{- else }} 17 | {{- $name := default .Chart.Name .Values.nameOverride }} 18 | {{- if contains $name .Release.Name }} 19 | {{- .Release.Name | trunc 63 | trimSuffix "-" }} 20 | {{- else }} 21 | {{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} 22 | {{- end }} 23 | {{- end }} 24 | {{- end }} 25 | 26 | {{/* 27 | Create chart name and version as used by the chart label. 28 | */}} 29 | {{- define "aws-efa-k8s-device-plugin.chart" -}} 30 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} 31 | {{- end }} 32 | 33 | {{/* 34 | Common labels 35 | */}} 36 | {{- define "aws-efa-k8s-device-plugin.labels" -}} 37 | helm.sh/chart: {{ include "aws-efa-k8s-device-plugin.chart" . }} 38 | {{ include "aws-efa-k8s-device-plugin.selectorLabels" . }} 39 | {{- if .Chart.AppVersion }} 40 | app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} 41 | {{- end }} 42 | app.kubernetes.io/managed-by: {{ .Release.Service }} 43 | {{- end }} 44 | 45 | {{/* 46 | Selector labels 47 | */}} 48 | {{- define "aws-efa-k8s-device-plugin.selectorLabels" -}} 49 | app.kubernetes.io/name: {{ include "aws-efa-k8s-device-plugin.name" . }} 50 | app.kubernetes.io/instance: {{ .Release.Name }} 51 | {{- end }} 52 | 53 | {{/* 54 | Create the name of the service account to use 55 | */}} 56 | {{- define "aws-efa-k8s-device-plugin.serviceAccountName" -}} 57 | {{- if .Values.serviceAccount.create }} 58 | {{- default (include "aws-efa-k8s-device-plugin.fullname" .) 
.Values.serviceAccount.name }} 59 | {{- else }} 60 | {{- default "default" .Values.serviceAccount.name }} 61 | {{- end }} 62 | {{- end }} 63 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/chart/Chart.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v2 16 | appVersion: 0.1.0 17 | description: Generative AI Multi-Node w/ Triton and TensorRT-LLM Guide/Tutorial 18 | icon: https://www.nvidia.com/content/dam/en-zz/Solutions/about-nvidia/logo-and-brand/01-nvidia-logo-vert-500x200-2c50-d@2x.png 19 | name: triton_trt-llm_multi-node_example 20 | version: 0.1.0 21 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/chart/example_values.yaml: -------------------------------------------------------------------------------- 1 | 2 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | # See values.yaml for reference values. 17 | 18 | gpu: NVIDIA-A10G 19 | # gpu: NVIDIA-A100-SXM4-40GB 20 | gpuPerNode: 4 21 | persistentVolumeClaim: efs-claim-2 22 | 23 | tensorrtLLM: 24 | parallelism: 25 | tensor: 4 26 | pipeline: 2 27 | 28 | triton: 29 | image: 30 | name: 210086341041.dkr.ecr.us-west-2.amazonaws.com/triton_trtllm_multinode:24.08 31 | # name: 354625738399.dkr.ecr.us-east-1.amazonaws.com/wenhant_triton_trtllm_multinode:24.07.3 32 | resources: 33 | cpu: 8 34 | memory: 32Gi 35 | efa: 1 # If you don't want to enable EFA, set this to 0. 
36 | # triton_model_repo_path: /var/run/models/mixtral_8x7b_tp8_ep2_moetp4/triton_model_repo 37 | # triton_model_repo_path: /var/run/models/llama3_8b_tp2_pp4/triton_model_repo 38 | # triton_model_repo_path: /var/run/models/triton_repo_mixtral_8x7b_tp4_moeep2_moetp2_pp2_v11_a10g/triton_model_repo 39 | # triton_model_repo_path: /var/run/models/triton_repo_mixtral_8x7b_tp4_moetp4_pp2_v11_a10g/triton_model_repo 40 | # triton_model_repo_path: /var/run/models/triton_repo_mixtral_8x7b_tp4_moeep4_pp2_v11_a10g/triton_model_repo 41 | # triton_model_repo_path: /var/run/models/triton_repo_mixtral_8x7b_pp8_v11_a10g/triton_model_repo 42 | # triton_model_repo_path: /var/run/models/triton_repo_mixtral_8x7b_tp8_v11_a10g/triton_model_repo 43 | # triton_model_repo_path: /var/run/models/triton_repo_mixtral_8x7b_tp4_pp2_v11_a10g/triton_model_repo 44 | triton_model_repo_path: /var/run/models/tensorrtllm_backend/triton_model_repo 45 | # triton_model_repo_path: /var/run/models/triton_repo_mixtral_8x22b_tp16_v11_a100/triton_model_repo 46 | enable_nsys: false # Note if you send lots of requests, nsys report can be very large. 47 | 48 | logging: 49 | tritonServer: 50 | verbose: true 51 | 52 | autoscaling: 53 | enable: true 54 | replicas: 55 | maximum: 2 56 | minimum: 1 57 | metric: 58 | name: triton:queue_compute:ratio 59 | value: 1 60 | 61 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/chart/templates/NOTES.txt: -------------------------------------------------------------------------------- 1 | {{ $.Chart.Name }} ({{ $.Chart.Version }}) installation complete. 
2 | 3 | Release Name: {{ $.Release.Name }} 4 | Namespace: {{ $.Release.Namespace }} 5 | Deployment Name: {{ $.Release.Name }} 6 | {{- if not $.Values.kubernetes.noService }} 7 | Service Name: {{ $.Release.Name }} 8 | {{- end }} 9 | {{- if $.Values.kubernetes.serviceAccount }} 10 | ServiceAccount Name: {{ $.Release.Name }} 11 | {{- end }} 12 | 13 | Helpful commands: 14 | 15 | $ helm status --namespace={{ $.Release.Namespace }} {{ $.Release.Name }} 16 | $ helm get --namespace={{ $.Release.Namespace }} all {{ $.Release.Name }} 17 | $ kubectl get --namespace={{ $.Release.Namespace }} --selector='app={{ $.Release.Name }}' deployments 18 | ,pods 19 | {{- if not $.Values.kubernetes.noService -}} 20 | ,services 21 | {{- end -}} 22 | ,podmonitors 23 | {{- if $.Values.kubernetes.serviceAccount -}} 24 | ,serviceAccounts 25 | {{- end -}} 26 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/chart/templates/hpa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | apiVersion: autoscaling/v2 16 | kind: HorizontalPodAutoscaler 17 | metadata: 18 | name: {{ $.Release.Name }} 19 | labels: 20 | app: {{ $.Release.Name }} 21 | app.kubernetes.io/component: autoscaler 22 | release: prometheus 23 | {{- with $.Values.kubernetes }} 24 | {{- with .labels }} 25 | {{ toYaml . | indent 4 }} 26 | {{- end }} 27 | {{- end }} 28 | spec: 29 | maxReplicas: {{ $.Values.autoscaling.replicas.maximum }} 30 | minReplicas: {{ $.Values.autoscaling.replicas.minimum }} 31 | metrics: 32 | - type: Pods 33 | pods: 34 | metric: 35 | name: {{ $.Values.autoscaling.metric.name }} 36 | target: 37 | type: AverageValue 38 | averageValue: {{ $.Values.autoscaling.metric.value }} 39 | scaleTargetRef: 40 | apiVersion: leaderworkerset.x-k8s.io/v1 41 | kind: LeaderWorkerSet 42 | name: leaderworkerset-sample 43 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/chart/templates/pod-monitor.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | apiVersion: monitoring.coreos.com/v1 16 | kind: PodMonitor 17 | metadata: 18 | name: {{ $.Release.Name }} 19 | labels: 20 | app: {{ $.Release.Name }} 21 | app.kubernetes.io/component: monitor 22 | release: prometheus 23 | {{- with $.Values.kubernetes }} 24 | {{- with .labels }} 25 | {{ toYaml . | indent 4 }} 26 | {{- end }} 27 | {{- end }} 28 | spec: 29 | selector: 30 | matchLabels: 31 | role: leader 32 | podMetricsEndpoints: 33 | - port: metrics 34 | path: /metrics 35 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/chart/templates/rbac.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | {{- if not $.Values.kubernetes.service_account }} 16 | apiVersion: rbac.authorization.k8s.io/v1 17 | kind: Role 18 | metadata: 19 | labels: 20 | {{- with $.Values.kubernetes }} 21 | {{- with .labels }} 22 | {{ toYaml . 
| indent 4 }} 23 | {{- end }} 24 | {{- end }} 25 | name: {{ $.Release.Name }} 26 | rules: 27 | - apiGroups: 28 | - '' 29 | - apps 30 | - batch 31 | resources: 32 | - deployments 33 | - jobs 34 | - pods 35 | - pods/status 36 | - services 37 | verbs: 38 | - get 39 | - list 40 | - apiGroups: [''] 41 | resources: 42 | - pods/exec 43 | verbs: 44 | - create 45 | 46 | --- 47 | 48 | apiVersion: v1 49 | kind: ServiceAccount 50 | metadata: 51 | labels: 52 | {{- with $.Values.kubernetes }} 53 | {{- with .labels }} 54 | {{ toYaml . | indent 4 }} 55 | {{- end }} 56 | {{- end }} 57 | name: {{ $.Release.Name }} 58 | 59 | --- 60 | 61 | apiVersion: rbac.authorization.k8s.io/v1 62 | kind: RoleBinding 63 | metadata: 64 | labels: 65 | {{- with $.Values.kubernetes }} 66 | {{- with .labels }} 67 | {{ toYaml . | indent 4 }} 68 | {{- end }} 69 | {{- end }} 70 | name: {{ $.Release.Name }} 71 | subjects: 72 | - kind: ServiceAccount 73 | name: {{ $.Release.Name }} 74 | roleRef: 75 | apiGroup: rbac.authorization.k8s.io 76 | kind: Role 77 | name: {{ $.Release.Name }} 78 | {{- end }} 79 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/chart/templates/service.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | {{- if $.Values.kubernetes.noService }} 16 | # Chart values optioned to not create a service. Service not created. 17 | {{- else }} 18 | apiVersion: v1 19 | kind: Service 20 | metadata: 21 | name: {{ $.Release.Name }} 22 | labels: 23 | app: {{ $.Release.Name }} 24 | app.kubernetes.io/component: service 25 | {{- with $.Values.kubernetes }} 26 | {{- with .labels }} 27 | {{ toYaml . | indent 4 }} 28 | {{- end }} 29 | {{- end }} 30 | spec: 31 | type: LoadBalancer 32 | ports: 33 | - name: http 34 | port: 8000 35 | targetPort: http 36 | - name: grpc 37 | port: 8001 38 | targetPort: grpc 39 | - name: metrics 40 | port: 8002 41 | targetPort: metrics 42 | selector: 43 | role: leader 44 | {{- end }} 45 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/containers/README.md: -------------------------------------------------------------------------------- 1 | 16 | 17 | 18 | # Container Generation 19 | 20 | The files in this folder are intended to be used to create the custom container image for multi-node Triton + TRT-LLM EKS deployment including installation of EFA components. 21 | 22 | Run the following command to create the container image, replacing `<image-name>:<tag>` with the name and tag you want to give the image. 23 | 24 | ```bash 25 | docker build --file ./triton_trt_llm.containerfile --tag <image-name>:<tag> . 26 | ``` 27 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/containers/kubessh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
# Usage: kubessh <pod> <command> [args...]
#
# Runs a command inside <pod> through `kubectl exec`. The first argument is
# the pod name; every remaining argument is joined into one string and handed
# to `/bin/sh -c` inside the pod.
pod=$1
shift
# Quote the pod name so it is passed to kubectl as exactly one argument
# (no word splitting or glob expansion). "$*" deliberately collapses the
# remaining arguments into the single command string that `sh -c` expects.
kubectl exec "$pod" -- /bin/sh -c "$*"
claimName: efs-claim-2 26 | - name: dshm 27 | emptyDir: 28 | medium: Memory 29 | sizeLimit: 32Gi 30 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/triton-metrics_prometheus-rule.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: monitoring.coreos.com/v1 16 | kind: PrometheusRule 17 | metadata: 18 | name: triton-metrics 19 | labels: 20 | app.kubernetes.io/component: autoscaler 21 | release: prometheus 22 | spec: 23 | groups: 24 | - name: autoscaling 25 | interval: 6s 26 | rules: 27 | # Average number of microseconds inference requests take to compute after unqueueing (not including cache hits). 28 | - expr: rate(nv_inference_compute_infer_duration_us[1m])/clamp_min(rate(nv_inference_request_success[1m]),1) 29 | record: triton:compute_duration:average 30 | # Average number of microseconds inference requests spend in queue before being processed (not including cache hits). 31 | - expr: rate(nv_inference_queue_duration_us[1m])/clamp_min(rate(nv_inference_request_success[1m]),1) 32 | record: triton:queue_duration:average 33 | # Average number of microseconds inference requests take in total (not including cache hits). 
34 | - expr: rate(nv_inference_request_duration_us[1m])/clamp_min(rate(nv_inference_request_success[1m]),1) 35 | record: triton:request_duration:average 36 | # Ratio of the time inference requests spend in queue to the time they spend in compute (not including cache hits). 37 | - expr: rate(nv_inference_queue_duration_us[1m])/clamp_min(rate(nv_inference_compute_infer_duration_us[1m]),1) 38 | record: triton:queue_compute:ratio 39 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/p5-trtllm-cluster-config.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: eksctl.io/v1alpha5 2 | kind: ClusterConfig 3 | 4 | metadata: 5 | name: trtllm-inference-cluster 6 | region: us-east-1 7 | version: "1.30" 8 | 9 | vpc: 10 | id: $PLACEHOLDER_VPC_ID 11 | subnets: 12 | private: 13 | us-east-1a: 14 | id: $PLACEHOLDER_SUBNET_PRIVATE_1 15 | public: 16 | us-east-1a: 17 | id: $PLACEHOLDER_SUBNET_PUBLIC_1 18 | 19 | clusterEndpoints: 20 | privateAccess: true 21 | publicAccess: true 22 | 23 | cloudwatch: 24 | clusterLogging: 25 | enableTypes: ["*"] 26 | 27 | iam: 28 | withOIDC: true 29 | 30 | 31 | managedNodeGroups: 32 | - name: cpu-node-group 33 | instanceType: c5.2xlarge 34 | minSize: 0 35 | desiredCapacity: 0 36 | maxSize: 1 37 | iam: 38 | withAddonPolicies: 39 | imageBuilder: true 40 | autoScaler: true 41 | ebs: true 42 | efs: true 43 | awsLoadBalancerController: true 44 | cloudWatch: true 45 | albIngress: true 46 | - name: gpu-compute-node-group 47 | instanceType: p5.48xlarge 48 | instancePrefix: trtllm-compute-node 49 | privateNetworking: true 50 | efaEnabled: true 51 | minSize: 0 52 | desiredCapacity: 0 53 | maxSize: 2 54 | volumeSize: 500 55 | # comment out capacityReservation if you do not need ODCR 56 | capacityReservation: 57 | capacityReservationTarget: 58 | capacityReservationID: "cr-xxxxxxxxxxxxxx" 59 | iam: 60 | withAddonPolicies: 61 | imageBuilder: true 62 | autoScaler: 
true 63 | ebs: true 64 | efs: true 65 | awsLoadBalancerController: true 66 | cloudWatch: true 67 | albIngress: true 68 | externalDNS: true 69 | certManager: true 70 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/pvc/claim.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: efs-claim-2 5 | spec: 6 | accessModes: 7 | - ReadWriteMany 8 | storageClassName: efs-sc-1 9 | resources: 10 | requests: 11 | storage: 200Gi 12 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/pvc/pv.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolume 3 | metadata: 4 | name: efs-pv-2 5 | spec: 6 | capacity: 7 | storage: 200Gi 8 | volumeMode: Filesystem 9 | accessModes: 10 | - ReadWriteMany 11 | persistentVolumeReclaimPolicy: Retain 12 | storageClassName: efs-sc-1 13 | csi: 14 | driver: efs.csi.aws.com 15 | volumeHandle: fs-0d5ec63b9f8ebb2db 16 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/pvc/storageclass.yaml: -------------------------------------------------------------------------------- 1 | kind: StorageClass 2 | apiVersion: storage.k8s.io/v1 3 | metadata: 4 | name: efs-sc-1 5 | provisioner: efs.csi.aws.com 6 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/README.md: -------------------------------------------------------------------------------- 1 | # Kubernetes Deployment of Triton Server Guides 2 | 3 | * [TensorRT-LLM Gen.
AI Autoscaling & Load Balancing](./TensorRT-LLM_Autoscaling_and_Load_Balancing/README.md) 4 | * [Multi-Node Generative AI w/ Triton Server and TensorRT-LLM](./TensorRT-LLM_Multi-Node_Distributed_Models/README.md) 5 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/ 2 | **/.vscode/ -------------------------------------------------------------------------------- /Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/.gitignore: -------------------------------------------------------------------------------- 1 | dev_values.yaml 2 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/Chart.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v2 16 | appVersion: 0.1.0 17 | description: Triton + TensorRT-LLM autoscaling and load balancing example. 
18 | icon: https://www.nvidia.com/content/dam/en-zz/Solutions/about-nvidia/logo-and-brand/01-nvidia-logo-vert-500x200-2c50-d@2x.png 19 | name: triton_trt-llm_aslb-example 20 | version: 0.1.0 21 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/gpt2_values.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # See values.yaml for reference values. 16 | 17 | gpu: 18 | - Tesla-T4 19 | - Tesla-V100-SXM2-16GB 20 | 21 | model: 22 | name: gpt2 23 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/llama-2-7b-chat_values.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # See values.yaml for reference values. 16 | 17 | gpu: 18 | - Tesla-T4 19 | - Tesla-V100-SXM2-16GB 20 | 21 | model: 22 | name: llama-2-7b-chat 23 | tensorrtLlm: 24 | parallelism: 25 | tensor: 2 26 | 27 | autoscaling: 28 | metric: 29 | value: 1500m 30 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/llama-2-7b_values.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # See values.yaml for reference values. 
16 | 17 | gpu: 18 | - Tesla-T4 19 | - Tesla-V100-SXM2-16GB 20 | 21 | model: 22 | name: llama-2-7b 23 | tensorrtLlm: 24 | parallelism: 25 | tensor: 2 26 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/llama-3-70b-instruct_values.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # See values.yaml for reference values. 16 | 17 | gpu: 18 | - NVIDIA-A100-SXM4-40GB 19 | 20 | model: 21 | name: llama-3-70b-instruct 22 | tensorrtLlm: 23 | parallelism: 24 | tensor: 4 25 | 26 | autoscaling: 27 | metric: 28 | value: 3500m 29 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/llama-3-8b-instruct_values.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # See values.yaml for reference values. 16 | 17 | gpu: 18 | - Tesla-T4 19 | - Tesla-V100-SXM2-16GB 20 | 21 | model: 22 | name: llama-3-8b-instruct 23 | tensorrtLlm: 24 | parallelism: 25 | tensor: 2 26 | 27 | autoscaling: 28 | metric: 29 | value: 1500m 30 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/llama-3-8b_values.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # See values.yaml for reference values. 
16 | 17 | gpu: 18 | - NVIDIA-A10G 19 | - NVIDIA-A100-SXM4-40GB 20 | 21 | model: 22 | name: llama-3-8b 23 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/opt125m_values.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # See values.yaml for reference values. 
16 | 17 | gpu: 18 | - Tesla-T4 19 | - Tesla-V100-SXM2-16GB 20 | 21 | model: 22 | name: opt125m 23 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/pvc_aws/claim_aws.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolumeClaim 3 | metadata: 4 | name: nfs-claim-autoscaling-2 5 | spec: 6 | accessModes: 7 | - ReadWriteMany 8 | storageClassName: efs-autoscaling-sc 9 | resources: 10 | requests: 11 | storage: 200Gi 12 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/pvc_aws/pv_aws.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolume 3 | metadata: 4 | name: efs-autoscaling-pv-2 5 | spec: 6 | capacity: 7 | storage: 200Gi 8 | volumeMode: Filesystem 9 | accessModes: 10 | - ReadWriteMany 11 | persistentVolumeReclaimPolicy: Retain 12 | storageClassName: efs-autoscaling-sc 13 | csi: 14 | driver: efs.csi.aws.com 15 | volumeHandle: fs-0c6ba87870e4be751 16 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/pvc_aws/storageclass_aws.yaml: -------------------------------------------------------------------------------- 1 | kind: StorageClass 2 | apiVersion: storage.k8s.io/v1 3 | metadata: 4 | name: efs-autoscaling-sc 5 | provisioner: efs.csi.aws.com -------------------------------------------------------------------------------- /Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/templates/NOTES.txt: -------------------------------------------------------------------------------- 1 | {{ $.Chart.Name }} ({{ $.Chart.Version }}) installation complete. 
2 | 3 | Release Name: {{ $.Release.Name }} 4 | Namespace: {{ $.Release.Namespace }} 5 | Deployment Name: {{ $.Release.Name }} 6 | Service Name: {{ $.Release.Name }} 7 | 8 | Helpful commands: 9 | 10 | $ helm status --namespace={{ $.Release.Namespace }} {{ $.Release.Name }} 11 | $ helm get --namespace={{ $.Release.Namespace }} all {{ $.Release.Name }} 12 | $ kubectl get --namespace={{ $.Release.Namespace }} --selector='app={{ $.Release.Name }}' deployments,pods,hpa,services,podmonitors 13 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/templates/horizontal-pod-autoscaler.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | {{- $metric_name := "triton:queue_compute:ratio" }} 16 | {{- $metric_value := "1000m" }} 17 | {{- $replicasMax := 4 }} 18 | {{- $replicasMin := 1 }} 19 | {{- with $.Values.autoscaling }} 20 | {{- if .enable }} 21 | {{- with .replicas }} 22 | {{- with .maximum }} 23 | {{- $replicasMax = . }} 24 | {{- end }} 25 | {{- with .minimum }} 26 | {{- $replicasMin = . }} 27 | {{- end }} 28 | {{- end }} 29 | {{- with .metric }} 30 | {{- with .name }} 31 | {{- $metric_name = . }} 32 | {{- end }} 33 | {{- with .value }} 34 | {{- $metric_value = . 
}} 35 | {{- end }} 36 | {{- end }} 37 | apiVersion: autoscaling/v2 38 | kind: HorizontalPodAutoscaler 39 | metadata: 40 | name: {{ $.Release.Name }} 41 | labels: 42 | app: {{ $.Release.Name }} 43 | app.kubernetes.io/component: autoscaler 44 | release: prometheus 45 | {{- with $.Values.kubernetes }} 46 | {{- with .labels }} 47 | {{ toYaml . | indent 4 }} 48 | {{- end }} 49 | {{- end }} 50 | spec: 51 | maxReplicas: {{ $replicasMax }} 52 | minReplicas: {{ $replicasMin }} 53 | metrics: 54 | - type: Pods 55 | pods: 56 | metric: 57 | name: {{ $metric_name }} 58 | target: 59 | type: AverageValue 60 | averageValue: {{ $metric_value }} 61 | scaleTargetRef: 62 | apiVersion: apps/v1 63 | kind: Deployment 64 | name: {{ $.Release.Name }} 65 | {{- end }} 66 | {{- end }} 67 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/templates/pod-monitor.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | apiVersion: monitoring.coreos.com/v1 16 | kind: PodMonitor 17 | metadata: 18 | name: {{ $.Release.Name }} 19 | labels: 20 | app: {{ $.Release.Name }} 21 | app.kubernetes.io/component: autoscaler 22 | release: prometheus 23 | {{- with $.Values.kubernetes }} 24 | {{- with .labels }} 25 | {{ toYaml . | indent 4 }} 26 | {{- end }} 27 | {{- end }} 28 | spec: 29 | selector: 30 | matchLabels: 31 | app: {{ $.Release.Name }} 32 | app.kubernetes.io/component: server 33 | podMetricsEndpoints: 34 | - port: metrics 35 | path: /metrics 36 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/templates/service.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | {{- $noService := false }} 16 | {{- with $.Values.kubernetes }} 17 | {{- with .noService }} 18 | {{- $noService = . }} 19 | {{- end }} 20 | {{- end }} 21 | {{- if $noService }} 22 | # Chart values optioned to not create a service. Service not created. 23 | {{- else }} 24 | apiVersion: v1 25 | kind: Service 26 | metadata: 27 | name: {{ $.Release.Name }} 28 | labels: 29 | app: {{ $.Release.Name }} 30 | app.kubernetes.io/component: service 31 | {{- with $.Values.kubernetes }} 32 | {{- with .labels }} 33 | {{ toYaml . 
| indent 4 }} 34 | {{- end }} 35 | {{- end }} 36 | spec: 37 | ports: 38 | - name: http 39 | port: 8000 40 | targetPort: http 41 | - name: grpc 42 | port: 8001 43 | targetPort: grpc 44 | - name: metrics 45 | port: 8002 46 | targetPort: metrics 47 | selector: 48 | app: {{ $.Release.Name }} 49 | type: ClusterIP 50 | {{- end }} 51 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/clients/README.md: -------------------------------------------------------------------------------- 1 | # Client Inference Generators 2 | 3 | The files in this folder are for the deployment of client pods in the same cluster as a model hosted by Triton + TRT-LLM using 4 | the provided sample Helm chart. 5 | Each file creates a single deployment of a client container which can be used to generate inference requests for the deployed 6 | model. 7 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/clients/gpt2.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # NVIDIA CORPORATION and its licensors retain all intellectual property 4 | # and proprietary rights in and to this software, related documentation 5 | # and any modifications thereto. Any use, reproduction, disclosure or 6 | # distribution of this software and related documentation without an express 7 | # license agreement from NVIDIA CORPORATION is strictly prohibited. 
8 | 9 | apiVersion: apps/v1 10 | kind: Deployment 11 | metadata: 12 | name: client-gpt2 13 | spec: 14 | selector: 15 | matchLabels: 16 | app: client-gpt2 17 | replicas: 1 18 | template: 19 | metadata: 20 | labels: 21 | app: client-gpt2 22 | app.kubernetes.io/component: client 23 | spec: 24 | containers: 25 | - name: client 26 | command: 27 | - python3 28 | - ./client.py 29 | env: 30 | - name: TRTLLM_MODEL_NAME 31 | value: gpt2 32 | - name: TRTLLM_TRITON_URL 33 | value: gpt2 34 | - name: TRTLLM_MAX_TOKENS 35 | value: "256" 36 | # - name: TRTLLM_DEBUG 37 | # value: debug 38 | image: nvcr.io/nvstaging/nvaie/jwyman:trtllm-client-0520.1 39 | imagePullPolicy: IfNotPresent 40 | resources: 41 | limits: 42 | cpu: 1000m 43 | ephemeral-storage: 1Gi 44 | memory: 1Gi 45 | requests: 46 | cpu: 500m 47 | ephemeral-storage: 1Gi 48 | memory: 1Gi 49 | imagePullSecrets: 50 | - name: ngc-container-pull 51 | restartPolicy: Always 52 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/clients/llama-2-70b-instruct.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # NVIDIA CORPORATION and its licensors retain all intellectual property 4 | # and proprietary rights in and to this software, related documentation 5 | # and any modifications thereto. Any use, reproduction, disclosure or 6 | # distribution of this software and related documentation without an express 7 | # license agreement from NVIDIA CORPORATION is strictly prohibited. 
8 | 9 | apiVersion: apps/v1 10 | kind: Deployment 11 | metadata: 12 | name: client-llama-2-70b-instruct 13 | spec: 14 | selector: 15 | matchLabels: 16 | app: client-llama-2-70b-instruct 17 | replicas: 1 18 | template: 19 | metadata: 20 | labels: 21 | app: client-llama-2-70b-instruct 22 | app.kubernetes.io/component: client 23 | spec: 24 | containers: 25 | - name: client 26 | command: 27 | - python3 28 | - ./client.py 29 | env: 30 | - name: TRTLLM_MODEL_NAME 31 | value: llama-2-70b-instruct 32 | - name: TRTLLM_TRITON_URL 33 | value: llama-2-70b-instruct 34 | # - name: TRTLLM_MAX_TOKENS 35 | # value: "512" 36 | # - name: TRTLLM_DEBUG 37 | # value: debug 38 | image: nvcr.io/nvstaging/nvaie/jwyman:trtllm-client-0520.1 39 | imagePullPolicy: IfNotPresent 40 | resources: 41 | limits: 42 | cpu: 1000m 43 | ephemeral-storage: 1Gi 44 | memory: 2Gi 45 | requests: 46 | cpu: 750m 47 | ephemeral-storage: 1Gi 48 | memory: 1536Mi 49 | imagePullSecrets: 50 | - name: ngc-container-pull 51 | restartPolicy: Always 52 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/clients/llama-2-7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # NVIDIA CORPORATION and its licensors retain all intellectual property 4 | # and proprietary rights in and to this software, related documentation 5 | # and any modifications thereto. Any use, reproduction, disclosure or 6 | # distribution of this software and related documentation without an express 7 | # license agreement from NVIDIA CORPORATION is strictly prohibited. 
8 | 9 | apiVersion: apps/v1 10 | kind: Deployment 11 | metadata: 12 | name: client-llama-2-7b 13 | spec: 14 | selector: 15 | matchLabels: 16 | app: client-llama-2-7b 17 | replicas: 1 18 | template: 19 | metadata: 20 | labels: 21 | app: client-llama-2-7b 22 | app.kubernetes.io/component: client 23 | spec: 24 | containers: 25 | - name: client 26 | command: 27 | - python3 28 | - ./client.py 29 | env: 30 | - name: TRTLLM_MODEL_NAME 31 | value: llama-2-7b 32 | - name: TRTLLM_TRITON_URL 33 | value: llama-2-7b 34 | # - name: TRTLLM_MAX_TOKENS 35 | # value: "512" 36 | # - name: TRTLLM_DEBUG 37 | # value: debug 38 | image: nvcr.io/nvstaging/nvaie/jwyman:trtllm-client-0520.1 39 | imagePullPolicy: IfNotPresent 40 | resources: 41 | limits: 42 | cpu: 1000m 43 | ephemeral-storage: 1Gi 44 | memory: 2Gi 45 | requests: 46 | cpu: 750m 47 | ephemeral-storage: 1Gi 48 | memory: 1536Mi 49 | imagePullSecrets: 50 | - name: ngc-container-pull 51 | restartPolicy: Always 52 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/clients/llama-3-8b-instruct.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # NVIDIA CORPORATION and its licensors retain all intellectual property 4 | # and proprietary rights in and to this software, related documentation 5 | # and any modifications thereto. Any use, reproduction, disclosure or 6 | # distribution of this software and related documentation without an express 7 | # license agreement from NVIDIA CORPORATION is strictly prohibited. 
8 | 9 | apiVersion: apps/v1 10 | kind: Deployment 11 | metadata: 12 | name: client-llama-3-8b-instruct 13 | spec: 14 | selector: 15 | matchLabels: 16 | app: client-llama-3-8b-instruct 17 | replicas: 1 18 | template: 19 | metadata: 20 | labels: 21 | app: client-llama-3-8b-instruct 22 | app.kubernetes.io/component: client 23 | spec: 24 | containers: 25 | - name: client 26 | command: 27 | - python3 28 | - ./client.py 29 | env: 30 | - name: TRTLLM_MODEL_NAME 31 | value: llama-3-8b-instruct 32 | - name: TRTLLM_TRITON_URL 33 | value: llama-3-8b-instruct 34 | # - name: TRTLLM_MAX_TOKENS 35 | # value: "512" 36 | # - name: TRTLLM_DEBUG 37 | # value: debug 38 | image: nvcr.io/nvstaging/nvaie/jwyman:trtllm-client-0520.1 39 | imagePullPolicy: IfNotPresent 40 | resources: 41 | limits: 42 | cpu: 1000m 43 | ephemeral-storage: 1Gi 44 | memory: 2Gi 45 | requests: 46 | cpu: 750m 47 | ephemeral-storage: 1Gi 48 | memory: 1536Mi 49 | imagePullSecrets: 50 | - name: ngc-container-pull 51 | restartPolicy: Always 52 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/clients/llama-3-8b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # NVIDIA CORPORATION and its licensors retain all intellectual property 4 | # and proprietary rights in and to this software, related documentation 5 | # and any modifications thereto. Any use, reproduction, disclosure or 6 | # distribution of this software and related documentation without an express 7 | # license agreement from NVIDIA CORPORATION is strictly prohibited. 
8 | 9 | apiVersion: apps/v1 10 | kind: Deployment 11 | metadata: 12 | name: client-llama-3-8b 13 | spec: 14 | selector: 15 | matchLabels: 16 | app: client-llama-3-8b 17 | replicas: 1 18 | template: 19 | metadata: 20 | labels: 21 | app: client-llama-3-8b 22 | app.kubernetes.io/component: client 23 | spec: 24 | containers: 25 | - name: client 26 | command: 27 | - python3 28 | - ./client.py 29 | env: 30 | - name: TRTLLM_MODEL_NAME 31 | value: llama-3-8b 32 | - name: TRTLLM_TRITON_URL 33 | value: llama-3-8b 34 | image: nvcr.io/nvstaging/nvaie/jwyman:trtllm-client-0520.1 35 | imagePullPolicy: IfNotPresent 36 | resources: 37 | limits: 38 | cpu: 1000m 39 | ephemeral-storage: 1Gi 40 | memory: 2Gi 41 | requests: 42 | cpu: 750m 43 | ephemeral-storage: 1Gi 44 | memory: 1536Mi 45 | imagePullSecrets: 46 | - name: ngc-container-pull 47 | restartPolicy: Always 48 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/clients/opt125m.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # NVIDIA CORPORATION and its licensors retain all intellectual property 4 | # and proprietary rights in and to this software, related documentation 5 | # and any modifications thereto. Any use, reproduction, disclosure or 6 | # distribution of this software and related documentation without an express 7 | # license agreement from NVIDIA CORPORATION is strictly prohibited. 
8 | 9 | apiVersion: apps/v1 10 | kind: Deployment 11 | metadata: 12 | name: client-opt125m 13 | spec: 14 | selector: 15 | matchLabels: 16 | app: client-opt125m 17 | replicas: 1 18 | template: 19 | metadata: 20 | labels: 21 | app: client-opt125m 22 | app.kubernetes.io/component: client 23 | spec: 24 | containers: 25 | - name: client 26 | command: 27 | - python3 28 | - ./client.py 29 | env: 30 | - name: TRTLLM_MODEL_NAME 31 | value: opt125m 32 | - name: TRTLLM_TRITON_URL 33 | value: opt125m 34 | # - name: TRTLLM_MAX_TOKENS 35 | # value: "512" 36 | # - name: TRTLLM_DEBUG 37 | # value: debug 38 | image: nvcr.io/nvstaging/nvaie/jwyman:trtllm-client-0520.1 39 | imagePullPolicy: IfNotPresent 40 | resources: 41 | limits: 42 | cpu: 1000m 43 | ephemeral-storage: 1Gi 44 | memory: 2Gi 45 | requests: 46 | cpu: 750m 47 | ephemeral-storage: 1Gi 48 | memory: 1536Mi 49 | imagePullSecrets: 50 | - name: ngc-container-pull 51 | restartPolicy: Always 52 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/containers/README.md: -------------------------------------------------------------------------------- 1 | # Container Generation 2 | 3 | The files in this folder are intended to be used to create the Triton Server and client load generation container images. 4 | 5 | Run the following command to create a Triton Server container image, replacing `IMAGE_NAME:TAG` with the name and tag to assign to the image. 6 | 7 | ```bash 8 | docker build --file ./triton_trt-llm.containerfile --tag IMAGE_NAME:TAG . 9 | ``` 10 | 11 | Run the following command to create a client load generation container image, replacing `IMAGE_NAME:TAG` with the name and tag to assign to the image. 12 | 13 | ```bash 14 | docker build --file ./client.containerfile --tag IMAGE_NAME:TAG . 15 | ``` 16 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/containers/client.containerfile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
2 | # 3 | # NVIDIA CORPORATION and its licensors retain all intellectual property 4 | # and proprietary rights in and to this software, related documentation 5 | # and any modifications thereto. Any use, reproduction, disclosure or 6 | # distribution of this software and related documentation without an express 7 | # license agreement from NVIDIA CORPORATION is strictly prohibited. 8 | 9 | ARG BASE_CONTAINER_IMAGE=ubuntu:jammy 10 | 11 | FROM ${BASE_CONTAINER_IMAGE} 12 | 13 | # Set useful labels. BASE_CONTAINER_IMAGE is re-declared because ARGs declared before FROM are not visible inside the build stage. 14 | ARG BASE_CONTAINER_IMAGE 15 | LABEL "base"="${BASE_CONTAINER_IMAGE}" "role"="client" 16 | 17 | # Stop APT (Debian package manager) from complaining about interactivity. 18 | ENV DEBIAN_FRONTEND=noninteractive 19 | # Set additional environment values that make usage more pleasant. 20 | ENV TERM=xterm-256color 21 | 22 | RUN apt update \ 23 | && apt install --fix-missing --no-install-recommends --yes \ 24 | ca-certificates \ 25 | wget \ 26 | apt-transport-https \ 27 | software-properties-common \ 28 | python3 \ 29 | python3-pip \ 30 | icu-devtools \ 31 | curl \ 32 | git \ 33 | && apt autoremove --yes \ 34 | && apt purge --yes \ 35 | && rm -rf /var/lib/apt/lists/* 36 | 37 | COPY client.py . 38 | 39 | ENTRYPOINT [ "/bin/bash" ] 40 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/containers/triton_trt-llm.containerfile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | ARG BASE_CONTAINER_IMAGE=nvcr.io/nvidia/tritonserver:24.08-trtllm-python-py3 16 | 17 | FROM ${BASE_CONTAINER_IMAGE} 18 | ARG BASE_CONTAINER_IMAGE 19 | ARG ENGINE_DEST_PATH=/var/run/engines 20 | ARG HF_HOME=/var/run/cache 21 | # Build ARGs are declared after FROM so they are visible inside the build stage. Set a set of useful labels. 22 | LABEL "base"="${BASE_CONTAINER_IMAGE}" 23 | LABEL "role"="server" 24 | 25 | # Stop APT (Debian package manager) from complaining about interactivity. 26 | ENV DEBIAN_FRONTEND=noninteractive 27 | # Set additional environment values that make usage more pleasant. 28 | ENV TERM=xterm-256color 29 | 30 | # Set Triton CLI environment variables which control where 31 | # TRTLLM engine and model files are downloaded to, and the 32 | # path to the Hugging Face cache. 33 | ENV ENGINE_DEST_PATH=${ENGINE_DEST_PATH} 34 | ENV HF_HOME=${HF_HOME} 35 | 36 | # Set the active working directory. 37 | WORKDIR /workspace 38 | 39 | # Copy the server script. 40 | COPY server.py . 
41 | 42 | RUN apt list --installed \ 43 | && pip list --version 44 | 45 | ENTRYPOINT [ "/bin/bash" ] 46 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/images/grafana-dashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/images/grafana-dashboard.png -------------------------------------------------------------------------------- /Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/images/grafana_import-dashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/images/grafana_import-dashboard.png -------------------------------------------------------------------------------- /Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/images/grafana_new-dashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/images/grafana_new-dashboard.png -------------------------------------------------------------------------------- /Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/images/graph_gpu-utilization.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/images/graph_gpu-utilization.png 
-------------------------------------------------------------------------------- /Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/images/graph_queue-compute-ratio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/images/graph_queue-compute-ratio.png -------------------------------------------------------------------------------- /Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/setup_ssh-nfs.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: setup-ssh-nfs 5 | labels: 6 | app: setup-ssh-nfs 7 | spec: 8 | containers: 9 | - name: triton 10 | image: nvcr.io/nvidia/tritonserver:24.08-trtllm-python-py3 11 | command: ["sleep", "infinity"] 12 | resources: 13 | limits: 14 | nvidia.com/gpu: 4 15 | requests: 16 | nvidia.com/gpu: 4 17 | volumeMounts: 18 | - mountPath: /var/run/models 19 | name: model-repository 20 | - mountPath: /dev/shm 21 | name: dshm 22 | volumes: 23 | - name: model-repository 24 | persistentVolumeClaim: 25 | claimName: nfs-claim-autoscaling-2 26 | - name: dshm 27 | emptyDir: 28 | medium: Memory 29 | sizeLimit: 512Gi 30 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/triton-metrics_prometheus-rule.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: monitoring.coreos.com/v1 16 | kind: PrometheusRule 17 | metadata: 18 | name: triton-metrics 19 | labels: 20 | app.kubernetes.io/component: autoscaler 21 | release: prometheus 22 | spec: 23 | groups: 24 | - name: autoscaling 25 | interval: 6s 26 | rules: 27 | # Average number of microseconds inference requests take to compute after unqueueing (not including cache hits). 28 | - expr: rate(nv_inference_compute_infer_duration_us[1m])/clamp_min(rate(nv_inference_request_success[1m]),1) 29 | record: triton:compute_duration:average 30 | # Average number of microseconds inference requests spend in the queue before being processed (not including cache hits). 31 | - expr: rate(nv_inference_queue_duration_us[1m])/clamp_min(rate(nv_inference_request_success[1m]),1) 32 | record: triton:queue_duration:average 33 | # Average number of microseconds inference requests take in total (not including cache hits). 34 | - expr: rate(nv_inference_request_duration_us[1m])/clamp_min(rate(nv_inference_request_success[1m]),1) 35 | record: triton:request_duration:average 36 | # Ratio of the time inference requests spend in the queue to the time they spend computing (not including cache hits). 
37 | - expr: rate(nv_inference_queue_duration_us[1m])/clamp_min(rate(nv_inference_compute_infer_duration_us[1m]),1) 38 | record: triton:queue_compute:ratio 39 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/ 2 | **/.vscode/ 3 | 4 | dev_* 5 | **/dev_* 6 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/.gitignore: -------------------------------------------------------------------------------- 1 | dev_values.yaml 2 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/Chart.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | apiVersion: v2 16 | appVersion: 0.1.0 17 | description: Generative AI Multi-Node w/ Triton and TensorRT-LLM Guide/Tutorial 18 | icon: https://www.nvidia.com/content/dam/en-zz/Solutions/about-nvidia/logo-and-brand/01-nvidia-logo-vert-500x200-2c50-d@2x.png 19 | name: triton_trt-llm_multi-node_example 20 | version: 0.1.0 21 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/gpt2_values.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | gpu: Tesla-V100-SXM2-16GB 16 | 17 | model: 18 | name: gpt2 19 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/llama-2-70b_values.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # See values.yaml for reference values. 16 | 17 | gpu: NVIDIA-A10G 18 | 19 | model: 20 | name: llama-2-70b 21 | tensorrtLlm: 22 | conversion: 23 | gpu: 8 24 | memory: 256Gi 25 | parallelism: 26 | tensor: 8 27 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/llama-2-7b-chat_values.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # See values.yaml for reference values. 
16 | 17 | gpu: Tesla-V100-SXM2-16GB 18 | 19 | model: 20 | name: llama-2-7b-chat 21 | tensorrtLlm: 22 | conversion: 23 | gpu: 2 24 | memory: 64Gi 25 | parallelism: 26 | tensor: 2 27 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/llama-2-7b_values.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # See values.yaml for reference values. 16 | 17 | gpu: Tesla-V100-SXM2-16GB 18 | 19 | model: 20 | name: llama-2-7b 21 | tensorrtLlm: 22 | conversion: 23 | gpu: 2 24 | memory: 64Gi 25 | parallelism: 26 | tensor: 2 27 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/llama-3-70b-instruct_values.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # See values.yaml for reference values. 16 | 17 | gpu: NVIDIA-A10G 18 | 19 | model: 20 | name: llama-3-70b-instruct 21 | tensorrtLlm: 22 | conversion: 23 | gpu: 8 24 | memory: 256Gi 25 | parallelism: 26 | tensor: 8 27 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/llama-3-8b-instruct_values.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # See values.yaml for reference values. 
16 | 17 | gpu: Tesla-V100-SXM2-16GB 18 | 19 | model: 20 | name: llama-3-8b-instruct 21 | tensorrtLlm: 22 | conversion: 23 | gpu: 4 24 | memory: 128Gi 25 | parallelism: 26 | tensor: 4 27 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/llama-3-8b_values.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # See values.yaml for reference values. 16 | 17 | gpu: Tesla-V100-SXM2-16GB 18 | 19 | model: 20 | name: llama-3-8b 21 | tensorrtLlm: 22 | conversion: 23 | gpu: 2 24 | memory: 64Gi 25 | parallelism: 26 | tensor: 2 27 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/opt125m_values.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # See values.yaml for reference values. 16 | 17 | gpu: Tesla-V100-SXM2-16GB 18 | 19 | model: 20 | name: opt125m 21 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/templates/NOTES.txt: -------------------------------------------------------------------------------- 1 | {{- $create_account := true }} 2 | {{- $create_job := true }} 3 | {{- $create_service := true }} 4 | {{- with $.Values.model }} 5 | {{- if .skipConversion }} 6 | {{- $create_job = false }} 7 | {{- end }} 8 | {{- end }} 9 | {{- with $.Values.kubernetes }} 10 | {{- if .noService }} 11 | {{- $create_service = false }} 12 | {{- end }} 13 | {{- if .serviceAccount}} 14 | {{- $create_account = false }} 15 | {{- end }} 16 | {{- end }} 17 | 18 | {{ $.Chart.Name }} ({{ $.Chart.Version }}) installation complete. 
19 | 20 | Release Name: {{ $.Release.Name }} 21 | Namespace: {{ $.Release.Namespace }} 22 | Deployment Name: {{ $.Release.Name }} 23 | {{- if $create_job }} 24 | Conversion Job: {{ $.Release.Name }} 25 | {{- end }} 26 | {{- if $create_service }} 27 | Service Name: {{ $.Release.Name }} 28 | {{- end }} 29 | {{- if $create_account }} 30 | ServiceAccount Name: {{ $.Release.Name }} 31 | {{- end }} 32 | 33 | Helpful commands: 34 | 35 | $ helm status --namespace={{ $.Release.Namespace }} {{ $.Release.Name }} 36 | $ helm get --namespace={{ $.Release.Namespace }} all {{ $.Release.Name }} 37 | $ kubectl get --namespace={{ $.Release.Namespace }} --selector='app={{ $.Release.Name }}' deployments 38 | {{- if $create_job -}} 39 | ,jobs 40 | {{- end -}} 41 | ,pods 42 | {{- if $create_service -}} 43 | ,services 44 | {{- end -}} 45 | ,podmonitors 46 | {{- if $create_account -}} 47 | ,serviceAccounts 48 | {{- end -}} 49 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/templates/pod-monitor.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | apiVersion: monitoring.coreos.com/v1 16 | kind: PodMonitor 17 | metadata: 18 | name: {{ $.Release.Name }} 19 | labels: 20 | app: {{ $.Release.Name }} 21 | app.kubernetes.io/component: monitor 22 | release: prometheus 23 | {{- with $.Values.kubernetes }} 24 | {{- with .labels }} 25 | {{ toYaml . | indent 4 }} 26 | {{- end }} 27 | {{- end }} 28 | spec: 29 | selector: 30 | matchLabels: 31 | app: {{ $.Release.Name }} 32 | app.kubernetes.io/component: server 33 | podMetricsEndpoints: 34 | - port: metrics 35 | path: /metrics 36 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/templates/rbac.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | {{- $service_account := 0 }} 16 | {{- with $.Values.kubernetes }} 17 | {{- with .serviceAccount }} 18 | {{- $service_account = . }} 19 | {{- end }} 20 | {{- end }} 21 | {{- if not $service_account }} 22 | apiVersion: rbac.authorization.k8s.io/v1 23 | kind: Role 24 | metadata: 25 | labels: 26 | {{- with $.Values.kubernetes }} 27 | {{- with .labels }} 28 | {{ toYaml . 
| indent 4 }} 29 | {{- end }} 30 | {{- end }} 31 | name: {{ $.Release.Name }} 32 | rules: 33 | - apiGroups: 34 | - '' 35 | - apps 36 | - batch 37 | resources: 38 | - deployments 39 | - jobs 40 | - pods 41 | - pods/status 42 | - services 43 | verbs: 44 | - get 45 | - list 46 | - apiGroups: [''] 47 | resources: 48 | - pods/exec 49 | verbs: 50 | - create 51 | 52 | --- 53 | 54 | apiVersion: v1 55 | kind: ServiceAccount 56 | metadata: 57 | labels: 58 | {{- with $.Values.kubernetes }} 59 | {{- with .labels }} 60 | {{ toYaml . | indent 4 }} 61 | {{- end }} 62 | {{- end }} 63 | name: {{ $.Release.Name }} 64 | 65 | --- 66 | 67 | apiVersion: rbac.authorization.k8s.io/v1 68 | kind: RoleBinding 69 | metadata: 70 | labels: 71 | {{- with $.Values.kubernetes }} 72 | {{- with .labels }} 73 | {{ toYaml . | indent 4 }} 74 | {{- end }} 75 | {{- end }} 76 | name: {{ $.Release.Name }} 77 | subjects: 78 | - kind: ServiceAccount 79 | name: {{ $.Release.Name }} 80 | roleRef: 81 | apiGroup: rbac.authorization.k8s.io 82 | kind: Role 83 | name: {{ $.Release.Name }} 84 | {{- end }} 85 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/templates/service.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | {{- $noService := false }} 16 | {{- with $.Values.kubernetes }} 17 | {{- with .noService }} 18 | {{- $noService = . }} 19 | {{- end }} 20 | {{- end }} 21 | {{- if $noService }} 22 | # Chart values optioned to not create a service. Service not created. 23 | {{- else }} 24 | apiVersion: v1 25 | kind: Service 26 | metadata: 27 | name: {{ $.Release.Name }} 28 | labels: 29 | app: {{ $.Release.Name }} 30 | app.kubernetes.io/component: service 31 | {{- with $.Values.kubernetes }} 32 | {{- with .labels }} 33 | {{ toYaml . | indent 4 }} 34 | {{- end }} 35 | {{- end }} 36 | spec: 37 | ports: 38 | - name: http 39 | port: 8000 40 | targetPort: http 41 | - name: grpc 42 | port: 8001 43 | targetPort: grpc 44 | - name: metrics 45 | port: 8002 46 | targetPort: metrics 47 | selector: 48 | app: {{ $.Release.Name }} 49 | app.kubernetes.io/component: server 50 | pod-rank: {{ 0 | quote}} 51 | type: ClusterIP 52 | {{- end }} 53 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/containers/README.md: -------------------------------------------------------------------------------- 1 | 16 | 17 | 18 | # Container Generation 19 | 20 | The files in this folder are intended to be used to create the Triton Server container image. 21 | 22 | Run the following command to create a Triton Server container image, replacing `<image-tag>` with the name and tag to give the image. 23 | 24 | ```bash 25 | docker build --file ./triton_trt-llm.containerfile --tag <image-tag> . 26 | ``` 27 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/containers/kubessh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # Usage: kubessh POD COMMAND [ARGS...] -- runs COMMAND inside POD via `kubectl exec`; the pod name is quoted to prevent word splitting and globbing. 17 | pod="$1" 18 | shift 19 | kubectl exec "$pod" -- /bin/sh -c "$*" 20 | -------------------------------------------------------------------------------- /Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/pvc.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v1 16 | kind: PersistentVolumeClaim 17 | metadata: 18 | name: model-volume 19 | spec: 20 | accessModes: 21 | # The PVC must support multiple, concurrent readers and writers. 22 | # This is because multiple pods will be mapped to the PVC as each worker pod needs access to the model's data. 23 | # Additionally, multiple models could be converted in parallel by concurrent conversion jobs. 
24 | - ReadWriteMany 25 | resources: 26 | requests: 27 | # This size does not need to match the PV's `spec.capacity.storage` value, but not doing so will prevent utilization of the entire PV. 28 | storage: 512Gi 29 | # Depending on your storage class provider, this value should be empty or the value specified by the provider. 30 | # Please read your provider's documentation when determining this value. 31 | storageClassName: "" 32 | # This value must be an exact match for the PV's `metadata.name` property. 33 | volumeName: model-volume 34 | -------------------------------------------------------------------------------- /Feature_Guide/Data_Pipelines/img/Flow.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/Feature_Guide/Data_Pipelines/img/Flow.PNG -------------------------------------------------------------------------------- /Feature_Guide/Data_Pipelines/model_repository/model1/config.pbtxt: -------------------------------------------------------------------------------- 1 | # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 
14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | name: "model1" 28 | backend: "python" 29 | max_batch_size: 8 30 | 31 | input [ 32 | { 33 | name: "model_1_input_string" 34 | data_type: TYPE_STRING 35 | dims: [-1] 36 | }, 37 | { 38 | name: "model_1_input_UINT8_array" 39 | data_type: TYPE_UINT8 40 | dims: [-1] 41 | }, 42 | { 43 | name: "model_1_input_INT8_array" 44 | data_type: TYPE_INT8 45 | dims: [-1] 46 | }, 47 | { 48 | name: "model_1_input_FP32_image" 49 | data_type: TYPE_FP32 50 | dims: [-1, -1, -1] 51 | }, 52 | { 53 | name: "model_1_input_bool" 54 | data_type: TYPE_BOOL 55 | dims: [-1] 56 | } 57 | ] 58 | output [ 59 | { 60 | name: "model_1_output_string" 61 | data_type: TYPE_STRING 62 | dims: [-1] 63 | }, 64 | { 65 | name: "model_1_output_UINT8_array" 66 | data_type: TYPE_UINT8 67 | dims: [-1] 68 | }, 69 | { 70 | name: "model_1_output_INT8_array" 71 | data_type: TYPE_INT8 72 | dims: [-1] 73 | }, 74 | { 75 | name: "model_1_output_FP32_image" 76 | data_type: TYPE_FP32 77 | dims: [-1, -1, -1] 78 | }, 79 | { 80 | name: "model_1_output_bool" 81 | data_type: TYPE_BOOL 82 | dims: [-1] 83 | } 84 | ] 85 | -------------------------------------------------------------------------------- 
/Feature_Guide/Data_Pipelines/model_repository/model2/config.pbtxt: -------------------------------------------------------------------------------- 1 | # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | 27 | name: "model2" 28 | backend: "python" 29 | max_batch_size: 8 30 | 31 | input [ 32 | { 33 | name: "model_2_input_string" 34 | data_type: TYPE_STRING 35 | dims: [-1] 36 | }, 37 | { 38 | name: "model_2_input_UINT8_array" 39 | data_type: TYPE_UINT8 40 | dims: [-1] 41 | }, 42 | { 43 | name: "model_2_input_INT8_array" 44 | data_type: TYPE_INT8 45 | dims: [-1] 46 | }, 47 | { 48 | name: "model_2_input_FP32_image" 49 | data_type: TYPE_FP32 50 | dims: [-1, -1, -1] 51 | }, 52 | { 53 | name: "model_2_input_bool" 54 | data_type: TYPE_BOOL 55 | dims: [-1] 56 | } 57 | ] 58 | output [ 59 | { 60 | name: "model_2_output_string" 61 | data_type: TYPE_STRING 62 | dims: [-1] 63 | }, 64 | { 65 | name: "model_2_output_UINT8_array" 66 | data_type: TYPE_UINT8 67 | dims: [-1] 68 | }, 69 | { 70 | name: "model_2_output_INT8_array" 71 | data_type: TYPE_INT8 72 | dims: [-1] 73 | }, 74 | { 75 | name: "model_2_output_FP32_image" 76 | data_type: TYPE_FP32 77 | dims: [-1, -1, -1] 78 | }, 79 | { 80 | name: "model_2_output_bool" 81 | data_type: TYPE_BOOL 82 | dims: [-1] 83 | } 84 | ] 85 | -------------------------------------------------------------------------------- /Feature_Guide/Speculative_Decoding/vLLM/model_repository/base_model/1/model.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": "/hf-models/Meta-Llama-3-8B-Instruct" 3 | } 4 | -------------------------------------------------------------------------------- /Feature_Guide/Speculative_Decoding/vLLM/model_repository/base_model/config.pbtxt: -------------------------------------------------------------------------------- 1 | # Copyright 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | # Note: You do not need to change any fields in this configuration. 
28 | 29 | backend: "vllm" 30 | 31 | # The usage of device is deferred to the vLLM engine 32 | instance_group [ 33 | { 34 | count: 1 35 | kind: KIND_MODEL 36 | } 37 | ] 38 | -------------------------------------------------------------------------------- /Feature_Guide/Speculative_Decoding/vLLM/model_repository/eagle_model/1/model.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": "/hf-models/Meta-Llama-3-8B-Instruct", 3 | "speculative_model": "/hf-models/EAGLE-LLaMA3-Instruct-8B", 4 | "speculative_draft_tensor_parallel_size": 1, 5 | "num_speculative_tokens": 5 6 | } 7 | -------------------------------------------------------------------------------- /Feature_Guide/Speculative_Decoding/vLLM/model_repository/eagle_model/config.pbtxt: -------------------------------------------------------------------------------- 1 | # Copyright 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | # Note: You do not need to change any fields in this configuration. 28 | 29 | backend: "vllm" 30 | 31 | # The usage of device is deferred to the vLLM engine 32 | instance_group [ 33 | { 34 | count: 1 35 | kind: KIND_MODEL 36 | } 37 | ] 38 | -------------------------------------------------------------------------------- /Feature_Guide/Speculative_Decoding/vLLM/model_repository/opt_model/1/model.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": "facebook/opt-6.7b", 3 | "speculative_model": "facebook/opt-125m", 4 | "tensor_parallel_size": 1, 5 | "num_speculative_tokens": 5 6 | } 7 | -------------------------------------------------------------------------------- /Feature_Guide/Speculative_Decoding/vLLM/model_repository/opt_model/config.pbtxt: -------------------------------------------------------------------------------- 1 | # Copyright 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 
8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | # Note: You do not need to change any fields in this configuration. 28 | 29 | backend: "vllm" 30 | 31 | # The usage of device is deferred to the vLLM engine 32 | instance_group [ 33 | { 34 | count: 1 35 | kind: KIND_MODEL 36 | } 37 | ] 38 | -------------------------------------------------------------------------------- /HuggingFace/ensemble_model_repository/preprocessing/1/model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | 27 | import numpy as np 28 | import triton_python_backend_utils as pb_utils 29 | from transformers import ViTFeatureExtractor 30 | 31 | 32 | class TritonPythonModel: # Preprocessing model: turns the "image" input tensor into ViT "pixel_values". 33 | def initialize(self, args): # Load the pretrained ViT feature extractor once and keep it on the instance. 34 | self.feature_extractor = ViTFeatureExtractor.from_pretrained( 35 | "google/vit-base-patch16-224-in21k" 36 | ) 37 | 38 | def execute(self, requests): # Build and return one InferenceResponse per incoming request, in order. 39 | responses = [] 40 | for request in requests: 41 | inp = pb_utils.get_input_tensor_by_name(request, "image") 42 | input_image = np.squeeze(inp.as_numpy()).transpose((2, 0, 1)) # Drop size-1 axes, then move the last axis first; presumably HWC -> CHW for a single image per request (TODO confirm). 43 | 44 | inputs = self.feature_extractor(images=input_image, return_tensors="pt") 45 | pixel_values = inputs["pixel_values"].numpy() # Convert the returned tensor to a NumPy array before wrapping it in pb_utils.Tensor. 46 | 47 | inference_response = pb_utils.InferenceResponse( 48 | output_tensors=[ 49 | pb_utils.Tensor( 50 | "pixel_values", 51 | pixel_values, 52 | ) 53 | ] 54 | ) 55 | responses.append(inference_response) 56 | return responses 57 | -------------------------------------------------------------------------------- /HuggingFace/ensemble_model_repository/preprocessing/config.pbtxt: -------------------------------------------------------------------------------- 1 | # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission.
14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | name: "preprocessing" 28 | backend: "python" 29 | max_batch_size: 8 30 | 31 | input [ 32 | { 33 | name: "image" 34 | data_type: TYPE_FP32 35 | dims: [-1, -1, -1] 36 | } 37 | ] 38 | output [ 39 | { 40 | name: "pixel_values" 41 | data_type: TYPE_FP32 42 | dims: [-1, -1, -1] 43 | } 44 | ] 45 | 46 | instance_group [ 47 | { 48 | kind: KIND_GPU 49 | } 50 | ] 51 | -------------------------------------------------------------------------------- /HuggingFace/img/Approach.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/HuggingFace/img/Approach.PNG -------------------------------------------------------------------------------- /HuggingFace/img/netron.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/HuggingFace/img/netron.PNG -------------------------------------------------------------------------------- /HuggingFace/python_model_repository/python_vit/config.pbtxt: 
-------------------------------------------------------------------------------- 1 | # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | 27 | name: "python_vit" 28 | backend: "python" 29 | max_batch_size: 8 30 | 31 | input [ 32 | { 33 | name: "image" 34 | data_type: TYPE_FP32 35 | dims: [-1, -1, -1] 36 | } 37 | ] 38 | output [ 39 | { 40 | name: "last_hidden_state" 41 | data_type: TYPE_FP32 42 | dims: [-1, -1] 43 | } 44 | ] 45 | 46 | instance_group [ 47 | { 48 | kind: KIND_GPU 49 | } 50 | ] 51 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions 5 | are met: 6 | * Redistributions of source code must retain the above copyright 7 | notice, this list of conditions and the following disclaimer. 8 | * Redistributions in binary form must reproduce the above copyright 9 | notice, this list of conditions and the following disclaimer in the 10 | documentation and/or other materials provided with the distribution. 11 | * Neither the name of NVIDIA CORPORATION nor the names of its 12 | contributors may be used to endorse or promote products derived 13 | from this software without specific prior written permission. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /Migration_Guide/img/arch.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/Migration_Guide/img/arch.PNG -------------------------------------------------------------------------------- /Popular_Models_Guide/Llama2/README.md: -------------------------------------------------------------------------------- 1 | 28 | 29 | # Deploying Hugging Face Transformer Models in Triton 30 | 31 | There are multiple ways to run Llama2 with Tritonserver. 32 | 1. Infer with [TensorRT-LLM Backend](trtllm_guide.md#infer-with-tensorrt-llm-backend) 33 | 2. Infer with [vLLM Backend](vllm_guide.md#infer-with-vllm-backend) 34 | 3. Infer with [Python-based Backends as a HuggingFace model](../../Quick_Deploy/HuggingFaceTransformers/README.md#deploying-hugging-face-transformer-models-in-triton) 35 | 36 | ## Pre-build instructions 37 | 38 | For the tutorials we are assuming that the Llama2 models, weights, and tokens are cloned from the Huggingface Llama2 repo [here](https://huggingface.co/meta-llama/Llama-2-7b-hf/tree/main). 39 | To run the tutorials, you will need to get permissions for the Llama2 repository as well as access to the huggingface cli.
40 | The cli uses [User access tokens](https://huggingface.co/docs/hub/security-tokens). The tokens can be found here: [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens). 41 | -------------------------------------------------------------------------------- /Popular_Models_Guide/Llama2/llama2vllm/1/model.json: -------------------------------------------------------------------------------- 1 | { 2 | "model":"meta-llama/Llama-2-7b-hf", 3 | "trust_remote_code":true, 4 | "download_dir":"/opt/tritonserver/model_repository/llama2vllm/hf-cache", 5 | "disable_log_requests": true, 6 | "gpu_memory_utilization": 0.5 7 | } 8 | -------------------------------------------------------------------------------- /Popular_Models_Guide/Llama2/llama2vllm/config.pbtxt: -------------------------------------------------------------------------------- 1 | # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | # Note: You do not need to change any fields in this configuration. 28 | 29 | backend: "vllm" 30 | 31 | # The usage of device is deferred to the vLLM engine 32 | instance_group [ 33 | { 34 | count: 1 35 | kind: KIND_MODEL 36 | } 37 | ] 38 | -------------------------------------------------------------------------------- /Popular_Models_Guide/Llava1.5/model_repository/llava-1.5/config.pbtxt: -------------------------------------------------------------------------------- 1 | model_transaction_policy { 2 | decoupled: True 3 | } 4 | 5 | input [ 6 | { 7 | name: "prompt" 8 | data_type: TYPE_STRING 9 | dims: [ 1 ] 10 | }, 11 | { 12 | name: "image" 13 | data_type: TYPE_STRING 14 | dims: [ -1 ] 15 | }, 16 | { 17 | name: "max_tokens" 18 | data_type: TYPE_INT32 19 | dims: [ 1 ] 20 | optional: true 21 | }, 22 | { 23 | name: "temperature" 24 | data_type: TYPE_FP32 25 | dims: [ 1 ] 26 | optional: true 27 | }, 28 | { 29 | name: "top_k" 30 | data_type: TYPE_INT32 31 | dims: [ 1 ] 32 | optional: true 33 | }, 34 | { 35 | name: "frequency_penalty" 36 | data_type: TYPE_FP32 37 | dims: [ 1 ] 38 | optional: true 39 | }, 40 | { 41 | name: "seed" 42 | data_type: TYPE_UINT64 43 | dims: [ 1 ] 44 | optional: true 45 | } 46 | ] 47 | 48 | output [ 49 | { 50 | name: "text" 51 | data_type: TYPE_STRING 52 | dims: [ 1 ] 53 | }, 54 | { 55 | name: "finish_reason" 56 | data_type: TYPE_STRING 57 | dims: [ 1 ] 58 | }, 59 | { 60 | name: 
"prompt_tokens" 61 | data_type: TYPE_INT32 62 | dims: [ 1 ] 63 | }, 64 | { 65 | name: "completion_tokens" 66 | data_type: TYPE_INT32 67 | dims: [ 1 ] 68 | }, 69 | { 70 | name: "total_tokens" 71 | data_type: TYPE_INT32 72 | dims: [ 1 ] 73 | } 74 | ] 75 | 76 | 77 | instance_group [ 78 | { 79 | count: 1 80 | kind: KIND_GPU 81 | gpus: [ 0 ] 82 | } 83 | ] -------------------------------------------------------------------------------- /Popular_Models_Guide/Llava1.5/model_repository/tensorrt_llm/1/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/Popular_Models_Guide/Llava1.5/model_repository/tensorrt_llm/1/.gitkeep -------------------------------------------------------------------------------- /Popular_Models_Guide/Llava1.5/model_repository/vision_encoder/config.pbtxt: -------------------------------------------------------------------------------- 1 | input [ 2 | { 3 | name: "image" 4 | data_type: TYPE_FP16 5 | dims: [ -1, 3, 336, 336 ] 6 | } 7 | ] 8 | output [ 9 | { 10 | name: "features" 11 | data_type: TYPE_FP16 12 | dims: [ 576 , -1] 13 | } 14 | ] 15 | 16 | instance_group [ 17 | { 18 | count: 1 19 | kind: KIND_GPU 20 | gpus: [ 0 ] 21 | } 22 | ] 23 | -------------------------------------------------------------------------------- /Popular_Models_Guide/StableDiffusion/diffusion-models/stable_diffusion_1_5/1/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/Popular_Models_Guide/StableDiffusion/diffusion-models/stable_diffusion_1_5/1/.gitkeep -------------------------------------------------------------------------------- /Popular_Models_Guide/StableDiffusion/diffusion-models/stable_diffusion_xl/1/.gitkeep: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/Popular_Models_Guide/StableDiffusion/diffusion-models/stable_diffusion_xl/1/.gitkeep -------------------------------------------------------------------------------- /Popular_Models_Guide/StableDiffusion/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver 28 | ARG BASE_IMAGE_TAG=24.01-py3 29 | 30 | FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS tritonserver-stable-diffusion 31 | 32 | RUN pip install --pre --upgrade --extra-index-url https://pypi.nvidia.com tensorrt-cu12==10.4.0 33 | 34 | RUN git clone https://github.com/NVIDIA/TensorRT.git -b release/10.4 --single-branch /tmp/TensorRT 35 | 36 | RUN pip3 install -r /tmp/TensorRT/demo/Diffusion/requirements.txt 37 | 38 | RUN pip3 install tritonclient[all] 39 | 40 | RUN mkdir -p /opt/tritonserver/backends/diffusion 41 | 42 | RUN cp -rf /tmp/TensorRT/demo/Diffusion /opt/tritonserver/backends/diffusion/ 43 | 44 | COPY ./backend/diffusion/model.py /opt/tritonserver/backends/diffusion/model.py 45 | 46 | COPY ./diffusion-models /workspace/diffusion-models 47 | -------------------------------------------------------------------------------- /Popular_Models_Guide/StableDiffusion/docker/Dockerfile.dockerignore: -------------------------------------------------------------------------------- 1 | # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer.
8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | 27 | **/*.onnx 28 | **/*.plan 29 | **/*.cache/* 30 | **/*onnx* 31 | **/*engine* 32 | **/*pytorch_model* 33 | **/*.pth* 34 | -------------------------------------------------------------------------------- /Popular_Models_Guide/StableDiffusion/docs/client_0_generated_image_0_1_5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/Popular_Models_Guide/StableDiffusion/docs/client_0_generated_image_0_1_5.jpg -------------------------------------------------------------------------------- /Popular_Models_Guide/StableDiffusion/docs/client_0_generated_image_0_xl.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/Popular_Models_Guide/StableDiffusion/docs/client_0_generated_image_0_xl.jpg -------------------------------------------------------------------------------- /Popular_Models_Guide/StableDiffusion/scripts/build_models.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions 6 | # are met: 7 | # * Redistributions of source code must retain the above copyright 8 | # notice, this list of conditions and the following disclaimer. 9 | # * Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 
# Resolve the directory that contains this script so build_models.py is
# found regardless of the caller's current working directory.
SOURCE_DIR=$(dirname "$(readlink -f "$0")")

# Install the tritonserver in-process Python API wheel found under
# /opt/tritonserver/python. The extras spec is quoted so the shell never
# treats "[all]" as a glob character class.
find /opt/tritonserver/python -maxdepth 1 -type f -name \
    "tritonserver-*.whl" | xargs -I {} pip3 install --upgrade "{}[all]"

# Run the Python build script, forwarding every command-line argument.
# SOURCE_DIR is quoted so install paths containing spaces or glob
# characters are passed to python3 as a single argument.
python3 "$SOURCE_DIR/build_models.py" "$@"
8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | FROM nvcr.io/nvidia/tritonserver:23.10-py3 27 | RUN pip install transformers==4.34.0 protobuf==3.20.3 sentencepiece==0.1.99 accelerate==0.23.0 einops==0.6.1 28 | -------------------------------------------------------------------------------- /Quick_Deploy/HuggingFaceTransformers/falcon7b/config.pbtxt: -------------------------------------------------------------------------------- 1 | # Triton backend to use 2 | backend: "python" 3 | 4 | # Hugging face model path. 
Parameters must follow this 5 | # key/value structure 6 | parameters: { 7 | key: "huggingface_model", 8 | value: {string_value: "tiiuae/falcon-7b"} 9 | } 10 | 11 | # The maximum number of tokens to generate in response 12 | # to our input 13 | parameters: { 14 | key: "max_output_length", 15 | value: {string_value: "15"} 16 | } 17 | 18 | # Triton should expect as input a single string of set 19 | # length named 'text_input' 20 | input [ 21 | { 22 | name: "text_input" 23 | data_type: TYPE_STRING 24 | dims: [ 1 ] 25 | } 26 | ] 27 | 28 | # Triton should expect to respond with a single string 29 | # output of variable length named 'text_output' 30 | output [ 31 | { 32 | name: "text_output" 33 | data_type: TYPE_STRING 34 | dims: [ -1 ] 35 | } 36 | ] 37 | -------------------------------------------------------------------------------- /Quick_Deploy/HuggingFaceTransformers/llama7b/config.pbtxt: -------------------------------------------------------------------------------- 1 | # Triton backend to use 2 | backend: "python" 3 | 4 | # Hugging face model path. 
Parameters must follow this 5 | # key/value structure 6 | parameters: { 7 | key: "huggingface_model", 8 | value: {string_value: "meta-llama/Llama-2-7b-hf"} 9 | } 10 | # The maximum number of tokens to generate in response 11 | # to our input 12 | parameters: { 13 | key: "max_output_length", 14 | value: {string_value: "15"} 15 | } 16 | 17 | # Triton should expect as input a single string of set 18 | # length named 'text_input' 19 | input [ 20 | { 21 | name: "text_input" 22 | data_type: TYPE_STRING 23 | dims: [ 1 ] 24 | } 25 | ] 26 | 27 | # Triton should expect to respond with a single string 28 | # output of variable length named 'text_output' 29 | output [ 30 | { 31 | name: "text_output" 32 | data_type: TYPE_STRING 33 | dims: [ -1 ] 34 | } 35 | ] 36 | -------------------------------------------------------------------------------- /Quick_Deploy/HuggingFaceTransformers/persimmon8b/config.pbtxt: -------------------------------------------------------------------------------- 1 | # Triton backend to use 2 | backend: "python" 3 | 4 | # Hugging face model path. 
Parameters must follow this 5 | # key/value structure 6 | parameters: { 7 | key: "huggingface_model", 8 | value: {string_value: "adept/persimmon-8b-base"} 9 | } 10 | 11 | # The maximum number of tokens to generate in response 12 | # to our input 13 | parameters: { 14 | key: "max_output_length", 15 | value: {string_value: "15"} 16 | } 17 | 18 | # Triton should expect as input a single string of set 19 | # length named 'text_input' 20 | input [ 21 | { 22 | name: "text_input" 23 | data_type: TYPE_STRING 24 | dims: [ 1 ] 25 | } 26 | ] 27 | 28 | # Triton should expect to respond with a single string 29 | # output of variable length named 'text_output' 30 | output [ 31 | { 32 | name: "text_output" 33 | data_type: TYPE_STRING 34 | dims: [ -1 ] 35 | } 36 | ] 37 | -------------------------------------------------------------------------------- /Quick_Deploy/PyTorch/config.pbtxt: -------------------------------------------------------------------------------- 1 | # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 
14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | name: "resnet50" 28 | platform: "pytorch_libtorch" 29 | max_batch_size : 0 30 | input [ 31 | { 32 | name: "input__0" 33 | data_type: TYPE_FP32 34 | dims: [ 3, 224, 224 ] 35 | reshape { shape: [ 1, 3, 224, 224 ] } 36 | } 37 | ] 38 | output [ 39 | { 40 | name: "output__0" 41 | data_type: TYPE_FP32 42 | dims: [ 1, 1000 ,1, 1] 43 | reshape { shape: [ 1, 1000 ] } 44 | } 45 | ] 46 | -------------------------------------------------------------------------------- /Quick_Deploy/PyTorch/export.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 
import torch

# Replace torch.hub's fork-validation hook with a stub that always returns
# True, so hub loading does not perform that check.
torch.hub._validate_not_a_forked_repo = lambda a, b, c: True

# Fetch the pretrained ResNet50 from the pinned torchvision tag, switch it to
# evaluation mode and move it to the GPU.
hub_model = torch.hub.load("pytorch/vision:v0.10.0", "resnet50", pretrained=True)
model = hub_model.eval().to("cuda")

# Trace the model with a random example input and save the resulting
# TorchScript module to disk.
example_input = torch.randn(1, 3, 224, 224).to("cuda")
traced_model = torch.jit.trace(model, example_input)
torch.jit.save(traced_model, "model.pt")
2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import numpy as np
import tritonclient.http as httpclient
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.preprocessing import image
from tritonclient.utils import triton_to_np_dtype


def process_image(image_path="img1.jpg"):
    """Load an image file and prepare it as a single-item ResNet50 batch.

    The image is loaded at 224x224, converted to an array, given a leading
    axis via ``np.expand_dims(..., axis=0)`` and passed through the Keras
    ResNet50 ``preprocess_input`` function.

    :param image_path: path of the image file to load
    :return: the value returned by ``preprocess_input``
    """
    loaded = image.load_img(image_path, target_size=(224, 224))
    batch = np.expand_dims(image.img_to_array(loaded), axis=0)
    return preprocess_input(batch)


transformed_img = process_image()

# Connect to the Triton HTTP endpoint
triton_client = httpclient.InferenceServerClient(url="localhost:8000")

# Describe the request input and attach the preprocessed image data
inputs = httpclient.InferInput("input_1", transformed_img.shape, datatype="FP32")
inputs.set_data_from_numpy(transformed_img, binary_data=True)

# Request the "predictions" output with class_count=1000
output = httpclient.InferRequestedOutput(
    "predictions", binary_data=True, class_count=1000
)

# Send the inference request and print the returned predictions
results = triton_client.infer(model_name="resnet50", inputs=[inputs], outputs=[output])
predictions = results.as_numpy("predictions")
print(predictions)
11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | name: "resnet50" 28 | platform: "tensorflow_savedmodel" 29 | max_batch_size : 0 30 | input [ 31 | { 32 | name: "input_1" 33 | data_type: TYPE_FP32 34 | dims: [-1, 224, 224, 3 ] 35 | } 36 | ] 37 | output [ 38 | { 39 | name: "predictions" 40 | data_type: TYPE_FP32 41 | dims: [-1, 1000] 42 | } 43 | ] 44 | -------------------------------------------------------------------------------- /Quick_Deploy/TensorFlow/export.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 
8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | 27 | import tensorflow as tf 28 | from tensorflow.keras.applications.resnet50 import ResNet50 29 | 30 | # Load model0 31 | model = ResNet50(weights="imagenet") 32 | model.save("resnet50_saved_model") 33 | -------------------------------------------------------------------------------- /Quick_Deploy/vLLM/.gitignore: -------------------------------------------------------------------------------- 1 | Miniconda* 2 | miniconda 3 | model_repository/vllm/vllm_env.tar.gz 4 | model_repository/vllm/triton_python_backend_stub 5 | python_backend 6 | results.txt 7 | -------------------------------------------------------------------------------- /Triton_Inference_Server_Python_API/deps/requirements.txt: -------------------------------------------------------------------------------- 1 | # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | pyright 28 | pytest 29 | ray[all]==2.36.0 30 | -------------------------------------------------------------------------------- /Triton_Inference_Server_Python_API/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | # PURPOSE ARE DISCLAIMED. 
ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver
ARG BASE_IMAGE_TAG=24.08-py3

# Keyword casing of AS matches FROM.
FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS triton-python-api

# Chain with && so a failed "apt-get update" fails the build instead of
# silently installing from a stale or missing package index.
RUN apt-get update && apt-get install -y gdb

RUN --mount=type=bind,source=./deps/requirements.txt,target=/tmp/requirements.txt \
    pip install --timeout=2000 --requirement /tmp/requirements.txt

# Finish pyright install

RUN pyright --help

# Install the tritonserver in-process Python API wheel shipped in the image.
# The extras spec is quoted so the shell never treats "[all]" as a glob.
RUN find /opt/tritonserver/python -maxdepth 1 -type f -name \
    "tritonserver-*.whl" | xargs -I {} pip3 install --upgrade "{}[all]"

# grafana
# "apt-get update" runs in the same layer as the install so this step does
# not depend on a package index cached by an earlier layer.
RUN apt-get update && \
    apt-get install -y adduser libfontconfig1 musl && \
    wget https://dl.grafana.com/enterprise/release/grafana-enterprise_11.2.0_amd64.deb && \
    dpkg -i grafana-enterprise_11.2.0_amd64.deb && \
    rm -rf grafana-enterprise_11.2.0_amd64.deb

# Point /bin/sh at bash.
RUN ln -sf /bin/bash /bin/sh
class TritonPythonModel:
    """Triton Python-backend model that tokenizes text with a BERT tokenizer."""

    # Tokenizer instance; assigned in `initialize`.
    tokenizer: BertTokenizerFast

    def initialize(self, args: Dict[str, str]) -> None:
        """
        Load the pretrained "bert-base-cased" tokenizer
        :param args: arguments from Triton config file (not read here)
        """
        self.tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")

    def execute(self, requests) -> "List[List[pb_utils.Tensor]]":
        """
        Tokenize the "TEXT" input of every request
        :param requests: 1 or more requests received by Triton server.
        :return: one inference response per request, holding one tensor per
            tokenizer model input name
        """
        # Every request is encoded with the same fixed-length settings.
        encode_options = dict(
            return_tensors=TensorType.NUMPY,
            padding="max_length",
            max_length=256,
            truncation=True,
        )

        def _respond(request):
            # The input arrives as bytes; decode each entry back to a string.
            raw_text = pb_utils.get_input_tensor_by_name(request, "TEXT").as_numpy()
            query = [item.decode("UTF-8") for item in raw_text.tolist()]

            encoded: Dict[str, np.ndarray] = self.tokenizer(
                text=query, **encode_options
            )
            # tensorrt uses int32 as input type, ort uses int64
            as_int64 = {
                name: values.astype(np.int64) for name, values in encoded.items()
            }

            # Hand the tokenization results back to Triton, one tensor per
            # tokenizer model input name.
            output_tensors = [
                pb_utils.Tensor(name, as_int64[name])
                for name in self.tokenizer.model_input_names
            ]
            return pb_utils.InferenceResponse(output_tensors=output_tensors)

        # One response per request, in request order.
        return [_respond(request) for request in requests]

    def finalize(self):
        """Called once when the model is being unloaded.

        Implementing `finalize` is OPTIONAL; it lets the model perform any
        necessary clean-up before exit. Here it only prints a message.
        """
        print("Cleaning up...")
#!/bin/sh
# Configure the Kafka <-> Triton example through environment variables and
# start the Ray Serve deployment in the background, then follow its log.

export KAFKA_CONSUMER_MAX_WORKER_THREADS=1
export CONSUMER_CONFIGS='{"bootstrap.servers": "localhost:9092", "security.protocol": "PLAINTEXT", "group.id": "triton-server-kafka-consumer"}'
export PRODUCER_CONFIGS='{"bootstrap.servers": "localhost:9092", "security.protocol": "PLAINTEXT"}'
export CONSUMER_TOPICS='inference-input'
export PRODUCER_TOPIC='inference-output'
export MODEL_INPUT_NAME='TEXT'
export MODEL_NAME='tokenizer'
export MODEL_REPOSITORY='./models'

# Redirect explicitly instead of relying on nohup's implicit nohup.out:
# nohup only creates that file when stdout is a terminal, and the shell
# opens the redirect target before the command starts, so the log file is
# guaranteed to exist by the time tail runs. ">>" keeps nohup's append
# semantics.
nohup serve run tritonserver_deployment:entrypoint >> nohup.out 2>&1 &
tail -f nohup.out
class KafkaConsumer:
    """Polls Kafka topics and forwards each message to a Triton handle.

    Every consumed message value is submitted to
    ``triton_server_handle.infer.remote`` on a worker thread; the result of
    the returned response is appended to ``output_queue``.
    """

    def __init__(
        self,
        config: dict,
        topics: List[str],
        # Quoted so the annotation is not evaluated at definition time.
        triton_server_handle: "DeploymentHandle",
        output_queue: deque,
    ):
        """
        :param config: configuration passed to the Kafka ``Consumer``
        :param topics: topics to subscribe to
        :param triton_server_handle: object exposing ``infer.remote(...)``
        :param output_queue: container whose ``append`` receives results
        """
        self.config = config
        self.topics = topics
        self.triton_handle = triton_server_handle
        self.output_queue = output_queue

    def read(self):
        """Create a consumer, subscribe to the topics and consume until
        interrupted."""
        consumer = Consumer(self.config)
        consumer.subscribe(self.topics)
        self._consume_data(consumer)

    def _infer(self, future):
        """Done-callback: store the inference result in the output queue.

        :param future: completed future whose result exposes ``.result()``
        """
        print("The custom callback was called.")
        try:
            response = future.result()
            self.output_queue.append(response.result())
        except Exception as e:
            # Exceptions raised inside a done-callback are otherwise only
            # logged by concurrent.futures; report the failure explicitly.
            print(f"Inference failed: {e}")
            return
        print(f"Got: {response}")

    def _consume_data(self, consumer):
        """Poll ``consumer`` in a loop and submit each message for inference.

        The loop ends on ``KeyboardInterrupt``. Any other exception is printed
        and polling continues. On exit, outstanding inference work is awaited
        and the consumer is closed.

        :param consumer: object exposing ``poll(timeout)`` and ``close()``
        """
        max_workers = int(os.environ.get("KAFKA_CONSUMER_MAX_WORKER_THREADS", 1))
        try:
            # One pool for the whole loop: creating (and joining) a pool per
            # message would block on every message and make the configured
            # worker count meaningless.
            with ThreadPoolExecutor(max_workers=max_workers) as executor:
                while True:
                    try:
                        msg = consumer.poll(0.1)
                        if not msg:
                            continue
                        if msg.error():
                            print(msg.error())
                            if msg.error().code() == KafkaError._PARTITION_EOF:
                                print(
                                    f"End of partition has been reached {msg.topic()}/{msg.partition()}"
                                )
                                # An EOF event carries no payload; do not
                                # submit it for inference.
                                continue
                            raise KafkaException(msg.error())
                        print(f"Key: {msg.key()}, Value: {msg.value()}")
                        future = executor.submit(
                            self.triton_handle.infer.remote, [msg.value()]
                        )
                        future.add_done_callback(self._infer)
                    except KeyboardInterrupt as e:
                        print(f"Keyboard Interrupt Received: {e}")
                        break
                    except Exception as e:
                        print(f"Exception {e}")
        finally:
            # Always release the consumer, even on an unexpected exit.
            consumer.close()
-------------------------------------------------------------------------------- 1 | import json 2 | from collections import deque 3 | from datetime import datetime 4 | 5 | import numpy as np 6 | from confluent_kafka.serialization import StringSerializer 7 | from gcn_kafka import Producer 8 | 9 | 10 | class NumpyEncoder(json.JSONEncoder): 11 | def default(self, obj): 12 | if isinstance(obj, np.ndarray): 13 | return obj.tolist() 14 | return json.JSONEncoder.default(self, obj) 15 | 16 | 17 | class KafkaProducer: 18 | def __init__(self, config: dict, topic: str, message_queue: deque): 19 | self.config = config 20 | self.topics = topic 21 | self.message_queue = message_queue 22 | self.serializer = StringSerializer("utf_8") 23 | 24 | def send_data(self): 25 | producer = Producer(self.config) 26 | self._produce(producer) 27 | 28 | def _produce(self, producer): 29 | def delivery_report(err, msg): 30 | """ 31 | Reports the failure or success of a message delivery. 32 | Args: 33 | err (KafkaError): The error that occurred on None on success. 34 | msg (Message): The message that was produced or failed. 
35 | """ 36 | if err is not None: 37 | print(f"Delivery failed for User record {msg.key()}: {err}") 38 | return 39 | print( 40 | f"User record successfully produced to {msg.topic()} [{msg.partition()}] at offset {msg.offset()}" 41 | ) 42 | 43 | while True: 44 | producer.poll(0.0) 45 | try: 46 | if self.message_queue.__len__() > 0: 47 | producer.produce( 48 | topic=self.topics, 49 | key=datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.%f"), 50 | value=self.serializer( 51 | json.dumps(self.message_queue.pop(), cls=NumpyEncoder) 52 | ), 53 | on_delivery=delivery_report, 54 | ) 55 | producer.flush() 56 | except KeyboardInterrupt as e: 57 | print(f"Keyboard Interrupt received {e}") 58 | break 59 | except Exception as e: 60 | print(f"Error while producing the message {e}") 61 | finally: 62 | producer.flush() 63 | producer.close() 64 | -------------------------------------------------------------------------------- /Triton_Inference_Server_Python_API/examples/rayserve/start_ray.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions 6 | # are met: 7 | # * Redistributions of source code must retain the above copyright 8 | # notice, this list of conditions and the following disclaimer. 9 | # * Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # * Neither the name of NVIDIA CORPORATION nor the names of its 13 | # contributors may be used to endorse or promote products derived 14 | # from this software without specific prior written permission. 
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

# Starts Prometheus, a Ray head node and Grafana for the demo.
# The shebang's -e is ignored when the script is run as `bash start_ray.sh`,
# so enable exit-on-error explicitly as well.
set -e

# First address reported by `hostname -I`; used to build the Grafana URL.
ip_address=$(hostname -I | awk '{print $1}')
if [ -z "$ip_address" ]; then
    echo "Could not determine the host IP address from 'hostname -I'" >&2
    exit 1
fi

echo "$ip_address"

# Work from a scratch directory.
mkdir -p /tmp/rayserve-demo
cd /tmp/rayserve-demo

ray metrics launch-prometheus

export RAY_GRAFANA_HOST="http://${ip_address}:3000"

ray start --head --dashboard-host 0.0.0.0 --metrics-export-port 8080 --disable-usage-stats

# Grafana runs in the background with the config found under the latest Ray
# session directory; its output goes to grafana.stdout.log in the scratch dir.
/usr/share/grafana/bin/grafana-server --homepath /usr/share/grafana --config /tmp/ray/session_latest/metrics/grafana/grafana.ini web >grafana.stdout.log 2>&1 &

--------------------------------------------------------------------------------
/Triton_Inference_Server_Python_API/examples/rayserve/stop_ray.sh:
--------------------------------------------------------------------------------
#!/bin/bash -e
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
9 | # * Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # * Neither the name of NVIDIA CORPORATION nor the names of its 13 | # contributors may be used to endorse or promote products derived 14 | # from this software without specific prior written permission. 15 | # 16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 17 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 | # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 20 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 21 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 22 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 23 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 24 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
27 | 28 | ray stop 29 | pkill prometheus.* 30 | pkill grafana.* 31 | -------------------------------------------------------------------------------- /Triton_Inference_Server_Python_API/identity-models/identity/config.pbtxt: -------------------------------------------------------------------------------- 1 | backend: "python" 2 | input [ 3 | { 4 | name: "string_input" 5 | data_type: TYPE_STRING 6 | dims: [ -1, -1 ] 7 | optional: true 8 | }, 9 | { 10 | name: "fp16_input", 11 | data_type: TYPE_FP16 12 | dims: [-1,-1], 13 | optional: true 14 | } 15 | ] 16 | output [ 17 | { 18 | name: "string_output" 19 | data_type: TYPE_STRING 20 | dims: [ -1, -1 ] 21 | }, 22 | { 23 | name: "fp16_output", 24 | data_type: TYPE_FP16 25 | dims: [-1, -1] 26 | } 27 | ] -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Redistribution and use in source and binary forms, with or without 4 | # modification, are permitted provided that the following conditions 5 | # are met: 6 | # * Redistributions of source code must retain the above copyright 7 | # notice, this list of conditions and the following disclaimer. 8 | # * Redistributions in binary form must reproduce the above copyright 9 | # notice, this list of conditions and the following disclaimer in the 10 | # documentation and/or other materials provided with the distribution. 11 | # * Neither the name of NVIDIA CORPORATION nor the names of its 12 | # contributors may be used to endorse or promote products derived 13 | # from this software without specific prior written permission. 
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

# Spell-checker settings.
[tool.codespell]
# note: pre-commit passes explicit lists of files here, which this skip file list doesn't override -
# this is only to allow you to run codespell interactively
skip = "./.git,./.github"
# ignore short words, and typename parameters like OffsetT
ignore-regex = "\\b(.{1,4}|[A-Z]\\w*T)\\b"
# use the 'clear' dictionary for unambiguous spelling mistakes
builtin = "clear"
# disable warnings about binary files and wrong encoding
quiet-level = 3

# Import-sorting settings, based on isort's "black" profile with the wrapping
# options spelled out explicitly.
[tool.isort]
profile = "black"
use_parentheses = true
multi_line_output = 3
include_trailing_comma = true
force_grid_wrap = 0
ensure_newline_before_comments = true
line_length = 88
balanced_wrapping = true
# NOTE(review): `indent` is the literal indentation string. The single space
# shown here may be an artifact of collapsed whitespace - confirm against the
# repository.
indent = " "
skip = ["build"]

--------------------------------------------------------------------------------