├── .github
    └── workflows
    │   ├── codeql.yml
    │   └── pre-commit.yaml
├── .gitignore
├── .pre-commit-config.yaml
├── Conceptual_Guide
    ├── Part_1-model_deployment
    │   ├── README.md
    │   ├── client.py
    │   ├── img
    │   │   └── multiple_models.PNG
    │   ├── img1.jpg
    │   ├── model_repository
    │   │   ├── text_detection
    │   │   │   └── config.pbtxt
    │   │   └── text_recognition
    │   │   │   └── config.pbtxt
    │   └── utils
    │   │   └── model.py
    ├── Part_2-improving_resource_utilization
    │   ├── README.md
    │   ├── img
    │   │   ├── dynamic_batching.PNG
    │   │   └── multi_instance.PNG
    │   └── model_repository
    │   │   └── text_recognition
    │   │       └── config.pbtxt
    ├── Part_3-optimizing_triton_configuration
    │   ├── README.md
    │   ├── img
    │   │   ├── arch.jpg
    │   │   ├── report_1.PNG
    │   │   └── report_2.PNG
    │   └── reports
    │   │   ├── detailed
    │   │       ├── text_recognition_config_4
    │   │       │   └── detailed_report.pdf
    │   │       ├── text_recognition_config_5
    │   │       │   └── detailed_report.pdf
    │   │       └── text_recognition_config_default
    │   │       │   └── detailed_report.pdf
    │   │   └── summaries
    │   │       └── text_recognition
    │   │           └── result_summary.pdf
    ├── Part_4-inference_acceleration
    │   ├── README.md
    │   ├── img
    │   │   ├── fw-trt-workflow.PNG
    │   │   ├── query_flow.PNG
    │   │   └── selecting_accelerator.PNG
    │   └── sample_configs
    │   │   ├── ORT_TRT_config.pbtxt
    │   │   ├── ORT_cuda_ep_config.pbtxt
    │   │   └── ORT_openvino_config.pbtxt
    ├── Part_5-Model_Ensembles
    │   ├── README.md
    │   ├── client.py
    │   ├── img1.jpg
    │   ├── model_repository
    │   │   ├── detection_postprocessing
    │   │   │   ├── 1
    │   │   │   │   └── model.py
    │   │   │   └── config.pbtxt
    │   │   ├── detection_preprocessing
    │   │   │   ├── 1
    │   │   │   │   └── model.py
    │   │   │   └── config.pbtxt
    │   │   ├── ensemble_model
    │   │   │   ├── 1
    │   │   │   │   └── .gitkeep
    │   │   │   └── config.pbtxt
    │   │   ├── recognition_postprocessing
    │   │   │   ├── 1
    │   │   │   │   └── model.py
    │   │   │   └── config.pbtxt
    │   │   ├── text_detection
    │   │   │   └── config.pbtxt
    │   │   └── text_recognition
    │   │   │   └── config.pbtxt
    │   └── utils
    │   │   ├── export_text_detection.sh
    │   │   ├── export_text_recognition.py
    │   │   ├── export_text_recognition.sh
    │   │   └── model.py
    ├── Part_6-building_complex_pipelines
    │   ├── README.md
    │   ├── client.py
    │   ├── export.py
    │   ├── gui
    │   │   ├── README.md
    │   │   ├── client.py
    │   │   └── requirements.txt
    │   ├── img
    │   │   └── multiple_backends.PNG
    │   └── model_repository
    │   │   ├── pipeline
    │   │       ├── 1
    │   │       │   └── model.py
    │   │       └── config.pbtxt
    │   │   ├── text_encoder
    │   │       └── config.pbtxt
    │   │   └── vae
    │   │       └── config.pbtxt
    ├── Part_7-iterative_scheduling
    │   ├── README.md
    │   ├── client
    │   │   ├── client.py
    │   │   └── print_utils.py
    │   ├── input_data.json
    │   └── model_repository
    │   │   ├── iterative-gpt2
    │   │       ├── 1
    │   │       │   └── model.py
    │   │       └── config.pbtxt
    │   │   └── simple-gpt2
    │   │       ├── 1
    │   │           └── model.py
    │   │       └── config.pbtxt
    ├── Part_8-semantic_caching
    │   ├── README.md
    │   └── artifacts
    │   │   ├── semantic_cache.patch
    │   │   └── semantic_caching.py
    └── README.md
├── Deployment
    └── Kubernetes
    │   ├── EKS_Multinode_Triton_TRTLLM
    │       ├── 1. Create_EKS_Cluster.md
    │       ├── 2. Configure_EKS_Cluster.md
    │       ├── 3. Deploy_Triton.md
    │       ├── README.md
    │       ├── eks_cluster_config.yaml
    │       ├── multinode_helm_chart
    │       │   ├── aws-efa-k8s-device-plugin
    │       │   │   ├── .helmignore
    │       │   │   ├── Chart.yaml
    │       │   │   ├── README.md
    │       │   │   ├── templates
    │       │   │   │   ├── NOTES.txt
    │       │   │   │   ├── _helpers.tpl
    │       │   │   │   └── daemonset.yaml
    │       │   │   └── values.yaml
    │       │   ├── chart
    │       │   │   ├── Chart.yaml
    │       │   │   ├── example_values.yaml
    │       │   │   ├── templates
    │       │   │   │   ├── NOTES.txt
    │       │   │   │   ├── deployment.yaml
    │       │   │   │   ├── hpa.yaml
    │       │   │   │   ├── pod-monitor.yaml
    │       │   │   │   ├── rbac.yaml
    │       │   │   │   └── service.yaml
    │       │   │   ├── values.schema.json
    │       │   │   └── values.yaml
    │       │   ├── containers
    │       │   │   ├── README.md
    │       │   │   ├── kubessh
    │       │   │   ├── server.py
    │       │   │   └── triton_trt_llm.containerfile
    │       │   ├── gen_ai_perf.yaml
    │       │   ├── nccl_test.yaml
    │       │   ├── nvidia_dcgm-exporter_values.yaml
    │       │   ├── nvidia_gpu-feature-discovery_daemonset.yaml
    │       │   ├── setup_ssh_efs.yaml
    │       │   └── triton-metrics_prometheus-rule.yaml
    │       ├── p5-trtllm-cluster-config.yaml
    │       └── pvc
    │       │   ├── claim.yaml
    │       │   ├── pv.yaml
    │       │   └── storageclass.yaml
    │   ├── README.md
    │   ├── TensorRT-LLM_Autoscaling_and_Load_Balancing
    │       ├── .gitignore
    │       ├── README.md
    │       ├── chart
    │       │   ├── .gitignore
    │       │   ├── Chart.yaml
    │       │   ├── gpt2_values.yaml
    │       │   ├── llama-2-7b-chat_values.yaml
    │       │   ├── llama-2-7b_values.yaml
    │       │   ├── llama-3-70b-instruct_values.yaml
    │       │   ├── llama-3-8b-instruct_values.yaml
    │       │   ├── llama-3-8b_values.yaml
    │       │   ├── opt125m_values.yaml
    │       │   ├── pvc_aws
    │       │   │   ├── claim_aws.yaml
    │       │   │   ├── pv_aws.yaml
    │       │   │   └── storageclass_aws.yaml
    │       │   ├── templates
    │       │   │   ├── NOTES.txt
    │       │   │   ├── deployment.yaml
    │       │   │   ├── horizontal-pod-autoscaler.yaml
    │       │   │   ├── pod-monitor.yaml
    │       │   │   └── service.yaml
    │       │   ├── values.schema.json
    │       │   └── values.yaml
    │       ├── clients
    │       │   ├── README.md
    │       │   ├── gpt2.yaml
    │       │   ├── llama-2-70b-instruct.yaml
    │       │   ├── llama-2-7b.yaml
    │       │   ├── llama-3-8b-instruct.yaml
    │       │   ├── llama-3-8b.yaml
    │       │   └── opt125m.yaml
    │       ├── containers
    │       │   ├── README.md
    │       │   ├── client.containerfile
    │       │   ├── client.py
    │       │   ├── server.py
    │       │   └── triton_trt-llm.containerfile
    │       ├── grafana_inference-metrics_dashboard.json
    │       ├── images
    │       │   ├── grafana-dashboard.png
    │       │   ├── grafana_import-dashboard.png
    │       │   ├── grafana_new-dashboard.png
    │       │   ├── graph_gpu-utilization.png
    │       │   └── graph_queue-compute-ratio.png
    │       ├── nvidia_dcgm-exporter_values.yaml
    │       ├── nvidia_gpu-feature-discovery_daemonset.yaml
    │       ├── prometheus-adapter_values.yaml
    │       ├── setup_ssh-nfs.yaml
    │       └── triton-metrics_prometheus-rule.yaml
    │   └── TensorRT-LLM_Multi-Node_Distributed_Models
    │       ├── .gitignore
    │       ├── README.md
    │       ├── chart
    │           ├── .gitignore
    │           ├── Chart.yaml
    │           ├── gpt2_values.yaml
    │           ├── llama-2-70b_values.yaml
    │           ├── llama-2-7b-chat_values.yaml
    │           ├── llama-2-7b_values.yaml
    │           ├── llama-3-70b-instruct_values.yaml
    │           ├── llama-3-8b-instruct_values.yaml
    │           ├── llama-3-8b_values.yaml
    │           ├── opt125m_values.yaml
    │           ├── templates
    │           │   ├── NOTES.txt
    │           │   ├── deployment.yaml
    │           │   ├── job.yaml
    │           │   ├── pod-monitor.yaml
    │           │   ├── rbac.yaml
    │           │   └── service.yaml
    │           ├── values.schema.json
    │           └── values.yaml
    │       ├── containers
    │           ├── README.md
    │           ├── kubessh
    │           ├── server.py
    │           └── triton_trt-llm.containerfile
    │       ├── nvidia_dcgm-exporter_values.yaml
    │       ├── nvidia_gpu-feature-discovery_daemonset.yaml
    │       └── pvc.yaml
├── Feature_Guide
    ├── Constrained_Decoding
    │   ├── README.md
    │   └── artifacts
    │   │   ├── client.py
    │   │   ├── client_utils.py
    │   │   └── utils.py
    ├── Data_Pipelines
    │   ├── README.md
    │   ├── client.py
    │   ├── img
    │   │   └── Flow.PNG
    │   └── model_repository
    │   │   ├── ensemble_model
    │   │       └── 1
    │   │       │   └── config.pbtxt
    │   │   ├── model1
    │   │       ├── 1
    │   │       │   └── model.py
    │   │       └── config.pbtxt
    │   │   └── model2
    │   │       ├── 1
    │   │           └── model.py
    │   │       └── config.pbtxt
    ├── Function_Calling
    │   ├── README.md
    │   └── artifacts
    │   │   ├── client.py
    │   │   ├── client_utils.py
    │   │   └── system_prompt_schema.yml
    └── Speculative_Decoding
    │   ├── README.md
    │   ├── TRT-LLM
    │       └── README.md
    │   ├── dataset-converter.py
    │   └── vLLM
    │       ├── README.md
    │       └── model_repository
    │           ├── base_model
    │               ├── 1
    │               │   └── model.json
    │               └── config.pbtxt
    │           ├── eagle_model
    │               ├── 1
    │               │   └── model.json
    │               └── config.pbtxt
    │           └── opt_model
    │               ├── 1
    │                   └── model.json
    │               └── config.pbtxt
├── HuggingFace
    ├── README.md
    ├── client.py
    ├── ensemble_model_repository
    │   ├── ensemble_model
    │   │   └── config.pbtxt
    │   └── preprocessing
    │   │   ├── 1
    │   │       └── model.py
    │   │   └── config.pbtxt
    ├── img
    │   ├── Approach.PNG
    │   └── netron.PNG
    └── python_model_repository
    │   └── python_vit
    │       ├── 1
    │           └── model.py
    │       └── config.pbtxt
├── LICENSE
├── Migration_Guide
    ├── img
    │   └── arch.PNG
    └── migration_guide.md
├── Popular_Models_Guide
    ├── DeepSeek
    │   └── README.md
    ├── Hermes-2-Pro-Llama-3-8B
    │   └── README.md
    ├── Llama2
    │   ├── README.md
    │   ├── deploy_trtllm_llama.sh
    │   ├── llama2vllm
    │   │   ├── 1
    │   │   │   └── model.json
    │   │   └── config.pbtxt
    │   ├── trtllm_guide.md
    │   └── vllm_guide.md
    ├── Llava1.5
    │   ├── llava_trtllm_guide.md
    │   ├── model_repository
    │   │   ├── llava-1.5
    │   │   │   ├── 1
    │   │   │   │   └── model.py
    │   │   │   └── config.pbtxt
    │   │   ├── tensorrt_llm
    │   │   │   ├── 1
    │   │   │   │   └── .gitkeep
    │   │   │   └── config.pbtxt
    │   │   └── vision_encoder
    │   │   │   ├── 1
    │   │   │       └── model.py
    │   │   │   └── config.pbtxt
    │   └── multi_modal_client.py
    └── StableDiffusion
    │   ├── README.md
    │   ├── backend
    │       └── diffusion
    │       │   └── model.py
    │   ├── build.sh
    │   ├── client.py
    │   ├── diffusion-models
    │       ├── stable_diffusion_1_5
    │       │   ├── 1
    │       │   │   └── .gitkeep
    │       │   └── config.pbtxt
    │       └── stable_diffusion_xl
    │       │   ├── 1
    │       │       └── .gitkeep
    │       │   └── config.pbtxt
    │   ├── docker
    │       ├── Dockerfile
    │       └── Dockerfile.dockerignore
    │   ├── docs
    │       ├── client_0_generated_image_0_1_5.jpg
    │       ├── client_0_generated_image_0_xl.jpg
    │       └── model_configuration.md
    │   ├── run.sh
    │   └── scripts
    │       ├── build_models.py
    │       └── build_models.sh
├── Quick_Deploy
    ├── HuggingFaceTransformers
    │   ├── Dockerfile
    │   ├── README.md
    │   ├── falcon7b
    │   │   ├── 1
    │   │   │   └── model.py
    │   │   └── config.pbtxt
    │   ├── llama7b
    │   │   ├── 1
    │   │   │   └── model.py
    │   │   └── config.pbtxt
    │   └── persimmon8b
    │   │   ├── 1
    │   │       └── model.py
    │   │   └── config.pbtxt
    ├── ONNX
    │   ├── README.md
    │   └── client.py
    ├── OpenVINO
    │   └── README.md
    ├── PyTorch
    │   ├── README.md
    │   ├── client.py
    │   ├── config.pbtxt
    │   └── export.py
    ├── TensorFlow
    │   ├── README.md
    │   ├── client.py
    │   ├── config.pbtxt
    │   └── export.py
    └── vLLM
    │   ├── .gitignore
    │   └── README.md
├── README.md
├── Triton_Inference_Server_Python_API
    ├── README.md
    ├── build.sh
    ├── deps
    │   └── requirements.txt
    ├── docker
    │   ├── Dockerfile
    │   └── Dockerfile.dockerignore
    ├── docs
    │   ├── car_sample.jpg
    │   └── sample_generated_image.jpg
    ├── examples
    │   ├── kafka-io
    │   │   ├── README.md
    │   │   ├── models
    │   │   │   └── tokenizer
    │   │   │   │   ├── 1
    │   │   │   │       └── model.py
    │   │   │   │   └── config.pbtxt
    │   │   ├── requirements.txt
    │   │   ├── start-kafka.sh
    │   │   ├── start-server.sh
    │   │   ├── tritonserver_deployment.py
    │   │   └── utils
    │   │   │   ├── kafka_consumer.py
    │   │   │   └── kafka_producer.py
    │   └── rayserve
    │   │   ├── README.md
    │   │   ├── client.py
    │   │   ├── start_ray.sh
    │   │   ├── stop_ray.sh
    │   │   └── tritonserver_deployment.py
    ├── identity-models
    │   └── identity
    │   │   ├── 1
    │   │       └── model.py
    │   │   └── config.pbtxt
    └── run.sh
└── pyproject.toml


/.github/workflows/pre-commit.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | #
 3 | # Redistribution and use in source and binary forms, with or without
 4 | # modification, are permitted provided that the following conditions
 5 | # are met:
 6 | #  * Redistributions of source code must retain the above copyright
 7 | #    notice, this list of conditions and the following disclaimer.
 8 | #  * Redistributions in binary form must reproduce the above copyright
 9 | #    notice, this list of conditions and the following disclaimer in the
10 | #    documentation and/or other materials provided with the distribution.
11 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
12 | #    contributors may be used to endorse or promote products derived
13 | #    from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 
27 | name: pre-commit  # runs the repository's pre-commit hooks in CI
28 | 
29 | on:
30 |   pull_request:  # no filters: triggered for every pull request
31 | 
32 | jobs:
33 |   pre-commit:
34 |     runs-on: ubuntu-22.04
35 |     steps:
36 |     - uses: actions/checkout@v4  # v3 depended on the retired Node 16 runtime
37 |     - uses: actions/setup-python@v5  # first major release on the Node 20 runtime
38 |     - uses: pre-commit/action@v3.0.1  # 3.0.1 moves its internal cache step off Node 16
39 | 
40 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Pretrained Models
 2 | **/*.onnx
 3 | **/onnx/*.opt
 4 | **/*.bin
 5 | **/*.plan
 6 | **/pytorch_model
 7 | 
 8 | # Python bytecode caches
 9 | **/__pycache__
10 | 
11 | # Downloaded Assets
12 | **/downloads
13 | 


--------------------------------------------------------------------------------
/Conceptual_Guide/Part_1-model_deployment/img/multiple_models.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/Conceptual_Guide/Part_1-model_deployment/img/multiple_models.PNG


--------------------------------------------------------------------------------
/Conceptual_Guide/Part_1-model_deployment/img1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/Conceptual_Guide/Part_1-model_deployment/img1.jpg


--------------------------------------------------------------------------------
/Conceptual_Guide/Part_1-model_deployment/model_repository/text_detection/config.pbtxt:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | #
 3 | # Redistribution and use in source and binary forms, with or without
 4 | # modification, are permitted provided that the following conditions
 5 | # are met:
 6 | #  * Redistributions of source code must retain the above copyright
 7 | #    notice, this list of conditions and the following disclaimer.
 8 | #  * Redistributions in binary form must reproduce the above copyright
 9 | #    notice, this list of conditions and the following disclaimer in the
10 | #    documentation and/or other materials provided with the distribution.
11 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
12 | #    contributors may be used to endorse or promote products derived
13 | #    from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 
27 | name: "text_detection"
28 | backend: "onnxruntime"
29 | max_batch_size: 0
30 | # Single FP32 input; -1 marks a variable-size dimension, the last dimension is fixed at 3.
31 | input [
32 |   {
33 |     name: "input_images:0"
34 |     data_type: TYPE_FP32
35 |     dims: [ -1, -1, -1, 3 ]
36 |   }
37 | ]
38 | # Both FP32 outputs are declared together in one list, in their original order.
39 | output [
40 |   {
41 |     name: "feature_fusion/Conv_7/Sigmoid:0"
42 |     data_type: TYPE_FP32
43 |     dims: [ -1, -1, -1, 1 ]
44 |   },
45 |   {
46 |     name: "feature_fusion/concat_3:0"
47 |     data_type: TYPE_FP32
48 |     dims: [ -1, -1, -1, 5 ]
49 |   }
50 | ]
51 | 


--------------------------------------------------------------------------------
/Conceptual_Guide/Part_1-model_deployment/model_repository/text_recognition/config.pbtxt:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | #
 3 | # Redistribution and use in source and binary forms, with or without
 4 | # modification, are permitted provided that the following conditions
 5 | # are met:
 6 | #  * Redistributions of source code must retain the above copyright
 7 | #    notice, this list of conditions and the following disclaimer.
 8 | #  * Redistributions in binary form must reproduce the above copyright
 9 | #    notice, this list of conditions and the following disclaimer in the
10 | #    documentation and/or other materials provided with the distribution.
11 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
12 | #    contributors may be used to endorse or promote products derived
13 | #    from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 
27 | name: "text_recognition"
28 | backend: "onnxruntime"
29 | max_batch_size : 0  # 0 = no Triton-managed batching; dims below are the complete tensor shapes
30 | input [
31 |   {
32 |     name: "input.1"
33 |     data_type: TYPE_FP32
34 |     dims: [ 1, 1, 32, 100 ]  # full shape, leading dimension included
35 |   }
36 | ]
37 | output [
38 |   {
39 |     name: "308"
40 |     data_type: TYPE_FP32
41 |     dims: [ 1, 26, 37 ]  # full shape, leading dimension included
42 |   }
43 | ]
44 | 


--------------------------------------------------------------------------------
/Conceptual_Guide/Part_2-improving_resource_utilization/img/dynamic_batching.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/Conceptual_Guide/Part_2-improving_resource_utilization/img/dynamic_batching.PNG


--------------------------------------------------------------------------------
/Conceptual_Guide/Part_2-improving_resource_utilization/img/multi_instance.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/Conceptual_Guide/Part_2-improving_resource_utilization/img/multi_instance.PNG


--------------------------------------------------------------------------------
/Conceptual_Guide/Part_2-improving_resource_utilization/model_repository/text_recognition/config.pbtxt:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | #
 3 | # Redistribution and use in source and binary forms, with or without
 4 | # modification, are permitted provided that the following conditions
 5 | # are met:
 6 | #  * Redistributions of source code must retain the above copyright
 7 | #    notice, this list of conditions and the following disclaimer.
 8 | #  * Redistributions in binary form must reproduce the above copyright
 9 | #    notice, this list of conditions and the following disclaimer in the
10 | #    documentation and/or other materials provided with the distribution.
11 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
12 | #    contributors may be used to endorse or promote products derived
13 | #    from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 
27 | name: "text_recognition"
28 | backend: "onnxruntime"
29 | max_batch_size: 8
30 | input [
31 |   {
32 |     name: "input.1"
33 |     data_type: TYPE_FP32
34 |     dims: [ 1, 32, 100 ]
35 |   }
36 | ]
37 | output [
38 |   {
39 |     name: "308"
40 |     data_type: TYPE_FP32
41 |     dims: [ 26, 37 ]
42 |   }
43 | ]
44 | 
45 | # Empty block: dynamic batching is enabled with every setting left at its default.
46 | dynamic_batching { }
47 | 
48 | # One instance group requesting two execution instances of kind GPU,
49 | # so more than one request can be processed at the same time.
50 | instance_group [
51 |   { count: 2 kind: KIND_GPU }
52 | ]
53 | 


--------------------------------------------------------------------------------
/Conceptual_Guide/Part_3-optimizing_triton_configuration/img/arch.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/Conceptual_Guide/Part_3-optimizing_triton_configuration/img/arch.jpg


--------------------------------------------------------------------------------
/Conceptual_Guide/Part_3-optimizing_triton_configuration/img/report_1.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/Conceptual_Guide/Part_3-optimizing_triton_configuration/img/report_1.PNG


--------------------------------------------------------------------------------
/Conceptual_Guide/Part_3-optimizing_triton_configuration/img/report_2.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/Conceptual_Guide/Part_3-optimizing_triton_configuration/img/report_2.PNG


--------------------------------------------------------------------------------
/Conceptual_Guide/Part_3-optimizing_triton_configuration/reports/detailed/text_recognition_config_4/detailed_report.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/Conceptual_Guide/Part_3-optimizing_triton_configuration/reports/detailed/text_recognition_config_4/detailed_report.pdf


--------------------------------------------------------------------------------
/Conceptual_Guide/Part_3-optimizing_triton_configuration/reports/detailed/text_recognition_config_5/detailed_report.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/Conceptual_Guide/Part_3-optimizing_triton_configuration/reports/detailed/text_recognition_config_5/detailed_report.pdf


--------------------------------------------------------------------------------
/Conceptual_Guide/Part_3-optimizing_triton_configuration/reports/detailed/text_recognition_config_default/detailed_report.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/Conceptual_Guide/Part_3-optimizing_triton_configuration/reports/detailed/text_recognition_config_default/detailed_report.pdf


--------------------------------------------------------------------------------
/Conceptual_Guide/Part_3-optimizing_triton_configuration/reports/summaries/text_recognition/result_summary.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/Conceptual_Guide/Part_3-optimizing_triton_configuration/reports/summaries/text_recognition/result_summary.pdf


--------------------------------------------------------------------------------
/Conceptual_Guide/Part_4-inference_acceleration/img/fw-trt-workflow.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/Conceptual_Guide/Part_4-inference_acceleration/img/fw-trt-workflow.PNG


--------------------------------------------------------------------------------
/Conceptual_Guide/Part_4-inference_acceleration/img/query_flow.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/Conceptual_Guide/Part_4-inference_acceleration/img/query_flow.PNG


--------------------------------------------------------------------------------
/Conceptual_Guide/Part_4-inference_acceleration/img/selecting_accelerator.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/Conceptual_Guide/Part_4-inference_acceleration/img/selecting_accelerator.PNG


--------------------------------------------------------------------------------
/Conceptual_Guide/Part_4-inference_acceleration/sample_configs/ORT_TRT_config.pbtxt:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | #
 3 | # Redistribution and use in source and binary forms, with or without
 4 | # modification, are permitted provided that the following conditions
 5 | # are met:
 6 | #  * Redistributions of source code must retain the above copyright
 7 | #    notice, this list of conditions and the following disclaimer.
 8 | #  * Redistributions in binary form must reproduce the above copyright
 9 | #    notice, this list of conditions and the following disclaimer in the
10 | #    documentation and/or other materials provided with the distribution.
11 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
12 | #    contributors may be used to endorse or promote products derived
13 | #    from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 
27 | name: "text_recognition"
28 | backend: "onnxruntime"
29 | max_batch_size: 16
30 | input [
31 |   {
32 |     name: "input.1"
33 |     data_type: TYPE_FP32
34 |     dims: [ 1, 32, 100 ]
35 |   }
36 | ]
37 | output [
38 |   {
39 |     name: "308"
40 |     data_type: TYPE_FP32
41 |     dims: [ 26, 37 ]
42 |   }
43 | ]
44 | 
45 | # Warm-up sample: one zero-filled request at batch size 16 (the max_batch_size above).
46 | model_warmup {
47 |   name: "text_recognition"
48 |   batch_size: 16
49 |   inputs {
50 |     key: "input.1"
51 |     value {
52 |       data_type: TYPE_FP32
53 |       dims: [ 1, 32, 100 ]
54 |       zero_data: true
55 |     }
56 |   }
57 | }
58 | 
59 | # Graph optimization level 1 plus the TensorRT GPU accelerator (FP16, 1 GiB workspace limit).
60 | optimization {
61 |   graph { level: 1 }
62 |   execution_accelerators {
63 |     gpu_execution_accelerator: [
64 |       {
65 |         name: "tensorrt"
66 |         parameters { key: "precision_mode" value: "FP16" }
67 |         parameters { key: "max_workspace_size_bytes" value: "1073741824" }
68 |       }
69 |     ]
70 |   }
71 | }
72 | 


--------------------------------------------------------------------------------
/Conceptual_Guide/Part_4-inference_acceleration/sample_configs/ORT_cuda_ep_config.pbtxt:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | #
 3 | # Redistribution and use in source and binary forms, with or without
 4 | # modification, are permitted provided that the following conditions
 5 | # are met:
 6 | #  * Redistributions of source code must retain the above copyright
 7 | #    notice, this list of conditions and the following disclaimer.
 8 | #  * Redistributions in binary form must reproduce the above copyright
 9 | #    notice, this list of conditions and the following disclaimer in the
10 | #    documentation and/or other materials provided with the distribution.
11 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
12 | #    contributors may be used to endorse or promote products derived
13 | #    from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 
27 | # Text recognition model served by the ONNX Runtime backend, with two
28 | # string-valued parameters: cudnn_conv_algo_search and gpu_mem_limit.
29 | name: "text_recognition"
30 | backend: "onnxruntime"
31 | max_batch_size: 16
32 | 
33 | input {
34 |   name: "input.1"
35 |   data_type: TYPE_FP32
36 |   dims: [ 1, 32, 100 ]
37 | }
38 | output {
39 |   name: "308"
40 |   data_type: TYPE_FP32
41 |   dims: [ 26, 37 ]
42 | }
43 | 
44 | parameters {
45 |   key: "cudnn_conv_algo_search"
46 |   value { string_value: "0" }
47 | }
48 | parameters {
49 |   key: "gpu_mem_limit"
50 |   value { string_value: "4294967200" }
51 | }
52 | 


--------------------------------------------------------------------------------
/Conceptual_Guide/Part_4-inference_acceleration/sample_configs/ORT_openvino_config.pbtxt:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | #
 3 | # Redistribution and use in source and binary forms, with or without
 4 | # modification, are permitted provided that the following conditions
 5 | # are met:
 6 | #  * Redistributions of source code must retain the above copyright
 7 | #    notice, this list of conditions and the following disclaimer.
 8 | #  * Redistributions in binary form must reproduce the above copyright
 9 | #    notice, this list of conditions and the following disclaimer in the
10 | #    documentation and/or other materials provided with the distribution.
11 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
12 | #    contributors may be used to endorse or promote products derived
13 | #    from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 
27 | # Text recognition model served by the ONNX Runtime backend, with the
28 | # OpenVINO execution accelerator requested for CPU execution.
29 | name: "text_recognition"
30 | backend: "onnxruntime"
31 | max_batch_size: 16
32 | 
33 | input {
34 |   name: "input.1"
35 |   data_type: TYPE_FP32
36 |   dims: [ 1, 32, 100 ]
37 | }
38 | output {
39 |   name: "308"
40 |   data_type: TYPE_FP32
41 |   dims: [ 26, 37 ]
42 | }
43 | 
44 | optimization {
45 |   execution_accelerators {
46 |     cpu_execution_accelerator { name: "openvino" }
47 |   }
48 | }
49 | 


--------------------------------------------------------------------------------
/Conceptual_Guide/Part_5-Model_Ensembles/client.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | #
 3 | # Redistribution and use in source and binary forms, with or without
 4 | # modification, are permitted provided that the following conditions
 5 | # are met:
 6 | #  * Redistributions of source code must retain the above copyright
 7 | #    notice, this list of conditions and the following disclaimer.
 8 | #  * Redistributions in binary form must reproduce the above copyright
 9 | #    notice, this list of conditions and the following disclaimer in the
10 | #    documentation and/or other materials provided with the distribution.
11 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
12 | #    contributors may be used to endorse or promote products derived
13 | #    from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 
27 | import numpy as np
28 | import tritonclient.grpc as grpcclient
29 | 
30 | client = grpcclient.InferenceServerClient(url="localhost:8001")
31 | 
32 | image_data = np.fromfile("img1.jpg", dtype="uint8")
33 | image_data = np.expand_dims(image_data, axis=0)
34 | 
35 | input_tensors = [grpcclient.InferInput("input_image", image_data.shape, "UINT8")]
36 | input_tensors[0].set_data_from_numpy(image_data)
37 | results = client.infer(model_name="ensemble_model", inputs=input_tensors)
38 | output_data = results.as_numpy("recognized_text").astype(str)
39 | print(output_data)
40 | 


--------------------------------------------------------------------------------
/Conceptual_Guide/Part_5-Model_Ensembles/img1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/Conceptual_Guide/Part_5-Model_Ensembles/img1.jpg


--------------------------------------------------------------------------------
/Conceptual_Guide/Part_5-Model_Ensembles/model_repository/detection_postprocessing/config.pbtxt:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | #
 3 | # Redistribution and use in source and binary forms, with or without
 4 | # modification, are permitted provided that the following conditions
 5 | # are met:
 6 | #  * Redistributions of source code must retain the above copyright
 7 | #    notice, this list of conditions and the following disclaimer.
 8 | #  * Redistributions in binary form must reproduce the above copyright
 9 | #    notice, this list of conditions and the following disclaimer in the
10 | #    documentation and/or other materials provided with the distribution.
11 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
12 | #    contributors may be used to endorse or promote products derived
13 | #    from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 
27 | # Python-backend model: three FP32 inputs, one FP32 output, instances on CPU.
28 | name: "detection_postprocessing"
29 | backend: "python"
30 | max_batch_size: 256
31 | 
32 | input {
33 |   name: "detection_postprocessing_input_1"
34 |   data_type: TYPE_FP32
35 |   dims: [ -1, -1, 1 ]
36 | }
37 | input {
38 |   name: "detection_postprocessing_input_2"
39 |   data_type: TYPE_FP32
40 |   dims: [ -1, -1, 5 ]
41 | }
42 | input {
43 |   name: "detection_postprocessing_input_3"
44 |   data_type: TYPE_FP32
45 |   dims: [ -1, -1, 3 ]
46 | }
47 | 
48 | output {
49 |   name: "detection_postprocessing_output"
50 |   data_type: TYPE_FP32
51 |   dims: [ -1, -1, -1 ]
52 | }
53 | 
54 | instance_group { kind: KIND_CPU }
55 | 
56 | 


--------------------------------------------------------------------------------
/Conceptual_Guide/Part_5-Model_Ensembles/model_repository/detection_preprocessing/config.pbtxt:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | #
 3 | # Redistribution and use in source and binary forms, with or without
 4 | # modification, are permitted provided that the following conditions
 5 | # are met:
 6 | #  * Redistributions of source code must retain the above copyright
 7 | #    notice, this list of conditions and the following disclaimer.
 8 | #  * Redistributions in binary form must reproduce the above copyright
 9 | #    notice, this list of conditions and the following disclaimer in the
10 | #    documentation and/or other materials provided with the distribution.
11 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
12 | #    contributors may be used to endorse or promote products derived
13 | #    from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 
27 | # Python-backend model: one variable-length UINT8 input and one FP32 output
28 | # whose last dimension is 3; instances are placed on CPU.
29 | name: "detection_preprocessing"
30 | backend: "python"
31 | max_batch_size: 256
32 | 
33 | input {
34 |   name: "detection_preprocessing_input"
35 |   data_type: TYPE_UINT8
36 |   dims: [ -1 ]
37 | }
38 | output {
39 |   name: "detection_preprocessing_output"
40 |   data_type: TYPE_FP32
41 |   dims: [ -1, -1, 3 ]
42 | }
43 | 
44 | instance_group { kind: KIND_CPU }
45 | 


--------------------------------------------------------------------------------
/Conceptual_Guide/Part_5-Model_Ensembles/model_repository/ensemble_model/1/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/Conceptual_Guide/Part_5-Model_Ensembles/model_repository/ensemble_model/1/.gitkeep


--------------------------------------------------------------------------------
/Conceptual_Guide/Part_5-Model_Ensembles/model_repository/recognition_postprocessing/config.pbtxt:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | #
 3 | # Redistribution and use in source and binary forms, with or without
 4 | # modification, are permitted provided that the following conditions
 5 | # are met:
 6 | #  * Redistributions of source code must retain the above copyright
 7 | #    notice, this list of conditions and the following disclaimer.
 8 | #  * Redistributions in binary form must reproduce the above copyright
 9 | #    notice, this list of conditions and the following disclaimer in the
10 | #    documentation and/or other materials provided with the distribution.
11 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
12 | #    contributors may be used to endorse or promote products derived
13 | #    from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 
27 | # Python-backend model: a [26, 37] FP32 input per batch item and a
28 | # variable-length string output; instances are placed on CPU.
29 | name: "recognition_postprocessing"
30 | backend: "python"
31 | max_batch_size: 256
32 | 
33 | input {
34 |   name: "recognition_postprocessing_input"
35 |   data_type: TYPE_FP32
36 |   dims: [ 26, 37 ]
37 | }
38 | output {
39 |   name: "recognition_postprocessing_output"
40 |   data_type: TYPE_STRING
41 |   dims: [ -1 ]
42 | }
43 | 
44 | instance_group { kind: KIND_CPU }
45 | 
46 | 


--------------------------------------------------------------------------------
/Conceptual_Guide/Part_5-Model_Ensembles/model_repository/text_detection/config.pbtxt:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | #
 3 | # Redistribution and use in source and binary forms, with or without
 4 | # modification, are permitted provided that the following conditions
 5 | # are met:
 6 | #  * Redistributions of source code must retain the above copyright
 7 | #    notice, this list of conditions and the following disclaimer.
 8 | #  * Redistributions in binary form must reproduce the above copyright
 9 | #    notice, this list of conditions and the following disclaimer in the
10 | #    documentation and/or other materials provided with the distribution.
11 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
12 | #    contributors may be used to endorse or promote products derived
13 | #    from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 
27 | # ONNX text detection model with one FP32 input and two FP32 outputs.
28 | name: "text_detection"
29 | platform: "onnxruntime_onnx"
30 | max_batch_size: 256
31 | 
32 | input {
33 |   name: "input_images:0"
34 |   data_type: TYPE_FP32
35 |   dims: [ -1, -1, 3 ]
36 | }
37 | output [
38 |   {
39 |     name: "feature_fusion/Conv_7/Sigmoid:0"
40 |     data_type: TYPE_FP32
41 |     dims: [ -1, -1, 1 ]
42 |   },
43 |   {
44 |     name: "feature_fusion/concat_3:0"
45 |     data_type: TYPE_FP32
46 |     dims: [ -1, -1, 5 ]
47 |   }
48 | ]
49 | 


--------------------------------------------------------------------------------
/Conceptual_Guide/Part_5-Model_Ensembles/model_repository/text_recognition/config.pbtxt:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | #
 3 | # Redistribution and use in source and binary forms, with or without
 4 | # modification, are permitted provided that the following conditions
 5 | # are met:
 6 | #  * Redistributions of source code must retain the above copyright
 7 | #    notice, this list of conditions and the following disclaimer.
 8 | #  * Redistributions in binary form must reproduce the above copyright
 9 | #    notice, this list of conditions and the following disclaimer in the
10 | #    documentation and/or other materials provided with the distribution.
11 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
12 | #    contributors may be used to endorse or promote products derived
13 | #    from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 
27 | # ONNX text recognition model: [1, 32, 100] FP32 input, [26, 37] FP32 output
28 | # (shapes exclude the batch dimension implied by max_batch_size).
29 | name: "text_recognition"
30 | platform: "onnxruntime_onnx"
31 | max_batch_size: 256
32 | 
33 | input {
34 |   name: "input.1"
35 |   data_type: TYPE_FP32
36 |   dims: [ 1, 32, 100 ]
37 | }
38 | output {
39 |   name: "308"
40 |   data_type: TYPE_FP32
41 |   dims: [ 26, 37 ]
42 | }
43 | 


--------------------------------------------------------------------------------
/Conceptual_Guide/Part_5-Model_Ensembles/utils/export_text_detection.sh:
--------------------------------------------------------------------------------
 1 | ## Execute from Part_5-Model_Ensembles Directory
 2 | # Abort on the first failing command so a failed download or extraction
 3 | # does not cascade into a confusing conversion error.
 4 | set -e
 5 | 
 6 | if [ ! -d "./model_repository" ]; then
 7 |     echo "Execute from the 'Part_5-Model_Ensembles' directory"
 8 |     exit 1
 9 | fi
10 | 
11 | ## Download Text Detection Model
12 | mkdir -p downloads
13 | wget -P downloads https://www.dropbox.com/s/r2ingd0l3zt8hxs/frozen_east_text_detection.tar.gz
14 | tar -xvf downloads/frozen_east_text_detection.tar.gz -C downloads
15 | 
16 | ## Convert to ONNX
17 | pip install -U tf2onnx
18 | mkdir -p model_repository/text_detection/1
19 | # The input/output tensor names must match those in text_detection/config.pbtxt.
20 | python -m tf2onnx.convert \
21 |     --input downloads/frozen_east_text_detection.pb \
22 |     --inputs "input_images:0" \
23 |     --outputs "feature_fusion/Conv_7/Sigmoid:0","feature_fusion/concat_3:0" \
24 |     --output model_repository/text_detection/1/model.onnx
25 | 


--------------------------------------------------------------------------------
/Conceptual_Guide/Part_5-Model_Ensembles/utils/export_text_recognition.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | #
 3 | # Redistribution and use in source and binary forms, with or without
 4 | # modification, are permitted provided that the following conditions
 5 | # are met:
 6 | #  * Redistributions of source code must retain the above copyright
 7 | #    notice, this list of conditions and the following disclaimer.
 8 | #  * Redistributions in binary form must reproduce the above copyright
 9 | #    notice, this list of conditions and the following disclaimer in the
10 | #    documentation and/or other materials provided with the distribution.
11 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
12 | #    contributors may be used to endorse or promote products derived
13 | #    from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 
27 | from pathlib import Path
28 | 
29 | import torch
30 | from model import STRModel
31 | 
32 | # Create PyTorch Model Object
33 | model = STRModel(input_channels=1, output_channels=512, num_classes=37)
34 | 
35 | # Load model weights from external file
36 | state = torch.load("downloads/None-ResNet-None-CTC.pth")
37 | state = {key.replace("module.", ""): value for key, value in state.items()}
38 | model.load_state_dict(state)
39 | 
40 | # Create ONNX file by tracing model
41 | model_directory = Path("model_repository/text_recognition/1/")
42 | model_directory.mkdir(parents=True, exist_ok=True)
43 | trace_input = torch.randn(1, 1, 32, 100)
44 | torch.onnx.export(
45 |     model,
46 |     trace_input,
47 |     model_directory / "model.onnx",
48 |     verbose=True,
49 |     dynamic_axes={"input.1": [0], "308": [0]},
50 | )
51 | 


--------------------------------------------------------------------------------
/Conceptual_Guide/Part_5-Model_Ensembles/utils/export_text_recognition.sh:
--------------------------------------------------------------------------------
 1 | ## Execute from Part_5-Model_Ensembles Directory
 2 | # Abort on the first failing command so a failed download does not
 3 | # cascade into a confusing export error.
 4 | set -e
 5 | 
 6 | if [ ! -d "./model_repository" ]; then
 7 |     echo "Execute from the 'Part_5-Model_Ensembles' directory"
 8 |     exit 1
 9 | fi
10 | 
11 | ## Download Text Recognition Model
12 | mkdir -p downloads
13 | wget -P downloads https://www.dropbox.com/sh/j3xmli4di1zuv3s/AABzCC1KGbIRe2wRwa3diWKwa/None-ResNet-None-CTC.pth
14 | 
15 | ## Convert to ONNX
16 | python utils/export_text_recognition.py
17 | 


--------------------------------------------------------------------------------
/Conceptual_Guide/Part_6-building_complex_pipelines/client.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | #
 3 | # Redistribution and use in source and binary forms, with or without
 4 | # modification, are permitted provided that the following conditions
 5 | # are met:
 6 | #  * Redistributions of source code must retain the above copyright
 7 | #    notice, this list of conditions and the following disclaimer.
 8 | #  * Redistributions in binary form must reproduce the above copyright
 9 | #    notice, this list of conditions and the following disclaimer in the
10 | #    documentation and/or other materials provided with the distribution.
11 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
12 | #    contributors may be used to endorse or promote products derived
13 | #    from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 
27 | import time
28 | 
29 | import numpy as np
30 | import tritonclient.http as httpclient
31 | from PIL import Image
32 | from tritonclient.utils import *
33 | 
34 | 
35 | def main():
36 |     client = httpclient.InferenceServerClient(url="localhost:8000")
37 | 
38 |     prompt = "Pikachu with a hat, 4k, 3d render"
39 |     text_obj = np.array([prompt], dtype="object").reshape((-1, 1))
40 | 
41 |     input_text = httpclient.InferInput(
42 |         "prompt", text_obj.shape, np_to_triton_dtype(text_obj.dtype)
43 |     )
44 |     input_text.set_data_from_numpy(text_obj)
45 | 
46 |     output_img = httpclient.InferRequestedOutput("generated_image")
47 | 
48 |     query_response = client.infer(
49 |         model_name="pipeline", inputs=[input_text], outputs=[output_img]
50 |     )
51 | 
52 |     image = query_response.as_numpy("generated_image")
53 |     im = Image.fromarray(np.squeeze(image.astype(np.uint8)))
54 |     im.save("generated_image2.jpg")
55 | 
56 | 
57 | if __name__ == "__main__":
58 |     start = time.time()
59 |     main()
60 |     end = time.time()
61 | 
62 |     print("Time taken:", end - start)
63 | 


--------------------------------------------------------------------------------
/Conceptual_Guide/Part_6-building_complex_pipelines/gui/README.md:
--------------------------------------------------------------------------------
 1 | <!--
 2 | # Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 3 | #
 4 | # Redistribution and use in source and binary forms, with or without
 5 | # modification, are permitted provided that the following conditions
 6 | # are met:
 7 | #  * Redistributions of source code must retain the above copyright
 8 | #    notice, this list of conditions and the following disclaimer.
 9 | #  * Redistributions in binary form must reproduce the above copyright
10 | #    notice, this list of conditions and the following disclaimer in the
11 | #    documentation and/or other materials provided with the distribution.
12 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
13 | #    contributors may be used to endorse or promote products derived
14 | #    from this software without specific prior written permission.
15 | #
16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
17 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
20 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
24 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 | -->
28 | 
29 | # Stable Diffusion UI
30 | A simple Gradio UI for communicating with Stable Diffusion on Triton
31 | 
32 | ## To deploy
33 | ```
34 | pip install -r requirements.txt
35 | python client.py --triton_url <YOUR_TRITON_SERVER_URL>
36 | ```


--------------------------------------------------------------------------------
/Conceptual_Guide/Part_6-building_complex_pipelines/gui/requirements.txt:
--------------------------------------------------------------------------------
 1 | # Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | #
 3 | # Redistribution and use in source and binary forms, with or without
 4 | # modification, are permitted provided that the following conditions
 5 | # are met:
 6 | #  * Redistributions of source code must retain the above copyright
 7 | #    notice, this list of conditions and the following disclaimer.
 8 | #  * Redistributions in binary form must reproduce the above copyright
 9 | #    notice, this list of conditions and the following disclaimer in the
10 | #    documentation and/or other materials provided with the distribution.
11 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
12 | #    contributors may be used to endorse or promote products derived
13 | #    from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 
27 | gradio
28 | tritonclient[grpc]
29 | 


--------------------------------------------------------------------------------
/Conceptual_Guide/Part_6-building_complex_pipelines/img/multiple_backends.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/Conceptual_Guide/Part_6-building_complex_pipelines/img/multiple_backends.PNG


--------------------------------------------------------------------------------
/Conceptual_Guide/Part_6-building_complex_pipelines/model_repository/pipeline/config.pbtxt:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | #
 3 | # Redistribution and use in source and binary forms, with or without
 4 | # modification, are permitted provided that the following conditions
 5 | # are met:
 6 | #  * Redistributions of source code must retain the above copyright
 7 | #    notice, this list of conditions and the following disclaimer.
 8 | #  * Redistributions in binary form must reproduce the above copyright
 9 | #    notice, this list of conditions and the following disclaimer in the
10 | #    documentation and/or other materials provided with the distribution.
11 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
12 | #    contributors may be used to endorse or promote products derived
13 | #    from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 
27 | backend: "python"
28 | max_batch_size: 8
29 | 
30 | input [
31 |   {
32 |     name: "prompt"
33 |     data_type: TYPE_STRING
34 |     dims: [1]
35 |   }
36 | ]
37 | output [
38 |   {
39 |     name: "generated_image"
40 |     data_type: TYPE_FP32
41 |     dims: [ -1, -1, -1]
42 |   }
43 | ]
44 | 
45 | instance_group [
46 |   {
47 |     kind: KIND_GPU
48 |   }
49 | ]
50 | 
51 | 


--------------------------------------------------------------------------------
/Conceptual_Guide/Part_6-building_complex_pipelines/model_repository/text_encoder/config.pbtxt:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | #
 3 | # Redistribution and use in source and binary forms, with or without
 4 | # modification, are permitted provided that the following conditions
 5 | # are met:
 6 | #  * Redistributions of source code must retain the above copyright
 7 | #    notice, this list of conditions and the following disclaimer.
 8 | #  * Redistributions in binary form must reproduce the above copyright
 9 | #    notice, this list of conditions and the following disclaimer in the
10 | #    documentation and/or other materials provided with the distribution.
11 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
12 | #    contributors may be used to endorse or promote products derived
13 | #    from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 
27 | name: "text_encoder"
28 | platform: "onnxruntime_onnx"
29 | max_batch_size: 8
30 | 
31 | input [
32 |   {
33 |     name: "input_ids"
34 |     data_type: TYPE_INT32
35 |     dims: [ -1]
36 |   }
37 | ]
38 | output [
39 |   {
40 |     name: "pooler_output"
41 |     data_type: TYPE_FP32
42 |     dims: [ 768]
43 |   },
44 |   {
45 |     name: "last_hidden_state"
46 |     data_type: TYPE_FP32
47 |     dims: [ -1, 768]
48 |   }
49 | ]
50 | 
51 | instance_group [
52 |   {
53 |     kind: KIND_GPU
54 |   }
55 | ]
56 | optimization {
57 |   graph : {
58 |     level : 1
59 |   }
60 | }
61 | 
62 | parameters { key: "execution_mode" value: { string_value: "1" } }
63 | parameters { key: "cudnn_conv_algo_search" value: { string_value: "0" } }
64 | 


--------------------------------------------------------------------------------
/Conceptual_Guide/Part_6-building_complex_pipelines/model_repository/vae/config.pbtxt:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | #
 3 | # Redistribution and use in source and binary forms, with or without
 4 | # modification, are permitted provided that the following conditions
 5 | # are met:
 6 | #  * Redistributions of source code must retain the above copyright
 7 | #    notice, this list of conditions and the following disclaimer.
 8 | #  * Redistributions in binary form must reproduce the above copyright
 9 | #    notice, this list of conditions and the following disclaimer in the
10 | #    documentation and/or other materials provided with the distribution.
11 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
12 | #    contributors may be used to endorse or promote products derived
13 | #    from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 
27 | name: "vae"
28 | platform: "tensorrt_plan"
29 | max_batch_size: 8
30 | 
31 | input [
32 |   {
33 |     name: "latent_sample"
34 |     data_type: TYPE_FP32
35 |     dims: [ -1, -1, -1]
36 |   }
37 | ]
38 | output [
39 |   {
40 |     name: "sample"
41 |     data_type: TYPE_FP32
42 |     dims: [ 3, -1, -1]
43 |   }
44 | ]
45 | 
46 | instance_group [
47 |   {
48 |     kind: KIND_GPU
49 |   }
50 | ]
51 | 


--------------------------------------------------------------------------------
/Conceptual_Guide/Part_7-iterative_scheduling/client/print_utils.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | #
 3 | # Redistribution and use in source and binary forms, with or without
 4 | # modification, are permitted provided that the following conditions
 5 | # are met:
 6 | #  * Redistributions of source code must retain the above copyright
 7 | #    notice, this list of conditions and the following disclaimer.
 8 | #  * Redistributions in binary form must reproduce the above copyright
 9 | #    notice, this list of conditions and the following disclaimer in the
10 | #    documentation and/or other materials provided with the distribution.
11 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
12 | #    contributors may be used to endorse or promote products derived
13 | #    from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 
27 | from tqdm import tqdm
28 | 
29 | 
30 | class Display:
31 |     def __init__(self, max_tokens) -> None:
32 |         self._top = tqdm(position=0, total=max_tokens, miniters=1)
33 |         self._bottom = tqdm(position=1, total=max_tokens, miniters=1)
34 |         self._max_tokens = max_tokens
35 | 
36 |     def update_top(self):
37 |         self._top.update(1)
38 |         self._top.refresh()
39 | 
40 |     def update_bottom(self):
41 |         self._bottom.update(1)
42 |         self._bottom.refresh()
43 | 
44 |     def clear(self):
45 |         self._top.reset()
46 |         self._bottom.reset()
47 | 


--------------------------------------------------------------------------------
/Conceptual_Guide/Part_7-iterative_scheduling/input_data.json:
--------------------------------------------------------------------------------
1 | {
2 |     "data":
3 |       [
4 |         {
5 |           "input": ["machine learning is"]
6 |         }
7 |       ]
8 |   }


--------------------------------------------------------------------------------
/Conceptual_Guide/Part_7-iterative_scheduling/model_repository/iterative-gpt2/config.pbtxt:
--------------------------------------------------------------------------------
 1 | # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | #
 3 | # Redistribution and use in source and binary forms, with or without
 4 | # modification, are permitted provided that the following conditions
 5 | # are met:
 6 | #  * Redistributions of source code must retain the above copyright
 7 | #    notice, this list of conditions and the following disclaimer.
 8 | #  * Redistributions in binary form must reproduce the above copyright
 9 | #    notice, this list of conditions and the following disclaimer in the
10 | #    documentation and/or other materials provided with the distribution.
11 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
12 | #    contributors may be used to endorse or promote products derived
13 | #    from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 
27 | backend: "python"  # this model is served through the Python backend
28 | sequence_batching {  # sequence-scheduler settings; three control tensors are declared below
29 |     iterative_sequence: true  # NOTE(review): presumably lets the model reschedule a request across iterations - confirm in Triton's model_config.proto
30 |     control_input: [{
31 |       name: "correlation_id"  # control tensor that carries the sequence correlation ID
32 |       control [
33 |         {
34 |           kind: CONTROL_SEQUENCE_CORRID
35 |           data_type: TYPE_UINT64  # the correlation ID is delivered as an unsigned 64-bit value
36 |         }
37 |       ]
38 |     },
39 |     {
40 |       name: "start"  # control tensor that flags the start of a sequence
41 |       control [
42 |         {
43 |           kind: CONTROL_SEQUENCE_START
44 |           fp32_false_true: [ 0, 1 ]  # FP32 flag encoding: 0 = false, 1 = true
45 |         }
46 |       ]
47 |     },
48 |    {
49 |       name: "end"  # control tensor that flags the end of a sequence
50 |       control [
51 |         {
52 |           kind: CONTROL_SEQUENCE_END
53 |           fp32_false_true: [ 0, 1 ]  # FP32 flag encoding: 0 = false, 1 = true
54 |         }
55 |       ]}
56 |     ]
57 |     oldest {}  # "oldest" strategy block left empty, so none of its fields are overridden
58 |     max_sequence_idle_microseconds: 400000000  # 400,000,000 us = 400 s
59 | }
60 | 
61 | instance_group [
62 |   {
63 |     count: 1  # a single model instance
64 |     kind: KIND_GPU  # placed on GPU
65 |   }
66 | ]


--------------------------------------------------------------------------------
/Conceptual_Guide/Part_7-iterative_scheduling/model_repository/simple-gpt2/config.pbtxt:
--------------------------------------------------------------------------------
 1 | # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | #
 3 | # Redistribution and use in source and binary forms, with or without
 4 | # modification, are permitted provided that the following conditions
 5 | # are met:
 6 | #  * Redistributions of source code must retain the above copyright
 7 | #    notice, this list of conditions and the following disclaimer.
 8 | #  * Redistributions in binary form must reproduce the above copyright
 9 | #    notice, this list of conditions and the following disclaimer in the
10 | #    documentation and/or other materials provided with the distribution.
11 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
12 | #    contributors may be used to endorse or promote products derived
13 | #    from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 
27 | backend: "python"
28 | 
29 | instance_group [
30 |   {
31 |     count: 1
32 |     kind: KIND_GPU
33 |   }
34 | ]


--------------------------------------------------------------------------------
/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/eks_cluster_config.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: eksctl.io/v1alpha5
 2 | kind: ClusterConfig
 3 | 
 4 | metadata:
 5 |   name: wenhant-eks-cluster-east2
 6 |   version: "1.30"
 7 |   region: us-east-2
 8 | 
 9 | availabilityZones:
10 |   - us-east-2a
11 |   - us-east-2b
12 |   - us-east-2c
13 | 
14 | iam:
15 |   withOIDC: true
16 | 
17 | managedNodeGroups:
18 |   - name: sys-nodes
19 |     instanceType: c5.2xlarge
20 |     minSize: 1
21 |     desiredCapacity: 1
22 |     maxSize: 1
23 |     volumeSize: 80
24 |     availabilityZones: ["us-east-2a"]
25 |     iam:
26 |       withAddonPolicies:
27 |         imageBuilder: true
28 |         autoScaler: true
29 |         ebs: true
30 |         efs: true
31 |         awsLoadBalancerController: true
32 |         cloudWatch: true
33 |         albIngress: true
34 | 
35 |   - name: efa-compute-ng
36 |     instanceType: g5.12xlarge
37 |     minSize: 1
38 |     desiredCapacity: 1
39 |     maxSize: 1
40 |     volumeSize: 300
41 |     efaEnabled: true
42 |     privateNetworking: true
43 |     availabilityZones: ["us-east-2a"]
44 |     iam:
45 |       withAddonPolicies:
46 |         imageBuilder: true
47 |         autoScaler: true
48 |         ebs: true
49 |         efs: true
50 |         awsLoadBalancerController: true
51 |         cloudWatch: true
52 |         albIngress: true
53 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/aws-efa-k8s-device-plugin/.helmignore:
--------------------------------------------------------------------------------
 1 | # Patterns to ignore when building packages.
 2 | # This supports shell glob matching, relative path matching, and
 3 | # negation (prefixed with !). Only one pattern per line.
 4 | .DS_Store
 5 | # Common VCS dirs
 6 | .git/
 7 | .gitignore
 8 | .bzr/
 9 | .bzrignore
10 | .hg/
11 | .hgignore
12 | .svn/
13 | # Common backup files
14 | *.swp
15 | *.bak
16 | *.tmp
17 | *.orig
18 | *~
19 | # Various IDEs
20 | .project
21 | .idea/
22 | *.tmproj
23 | .vscode/
24 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/aws-efa-k8s-device-plugin/Chart.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | appVersion: v0.5.3
 3 | description: A Helm chart for EFA device plugin.
 4 | home: https://github.com/aws/eks-charts
 5 | icon: https://raw.githubusercontent.com/aws/eks-charts/master/docs/logo/aws.png
 6 | name: aws-efa-k8s-device-plugin
 7 | sources:
 8 | - https://github.com/aws/eks-charts
 9 | version: v0.5.3
10 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/aws-efa-k8s-device-plugin/README.md:
--------------------------------------------------------------------------------
 1 | # AWS EFA Kubernetes Device Plugin
 2 | This chart installs the AWS EFA Kubernetes Device Plugin daemonset
 3 | 
 4 | ## Prerequisites
 5 | - Helm v3
 6 | 
 7 | ## Installing the Chart
 8 | First add the EKS repository to Helm:
 9 | 
10 | ```shell
11 | helm repo add eks https://aws.github.io/eks-charts
12 | ```
13 | 
14 | To install the chart with the release name `efa` in the `kube-system` namespace and default configuration:
15 | 
16 | ```shell
17 | helm install efa ./aws-efa-k8s-device-plugin -n kube-system
18 | ```
19 | 
20 | ## Configuration
21 | 
22 | Parameter | Description | Default
23 | --- | --- | ---
24 | `image.repository` | EFA image repository | `602401143452.dkr.ecr.us-west-2.amazonaws.com/eks/aws-efa-k8s-device-plugin`
25 | `image.tag` | EFA image tag | `v0.5.3`
26 | `securityContext.allowPrivilegeEscalation` | Controls whether a process can gain more privilege than its parent process | `false`
27 | `securityContext` | EFA plugin security context | `capabilities: drop: ["ALL"] runAsNonRoot: false`
28 | `supportedInstanceLabels.keys` | Kubernetes key to interpret as instance type | `node.kubernetes.io/instance-type`
29 | `supportedInstanceLabels.values` | List of instances which currently support EFA devices | `see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa.html#efa-instance-types`
30 | `resources` | Resources for containers in pod | `requests.cpu: 10m requests.memory: 20Mi`
31 | `nodeSelector` | Node labels for pod assignment | `{}`
32 | `tolerations` | Optional deployment tolerations | `[]`
33 | `additionalPodAnnotations` | Pod annotations to apply in addition to the default ones | `{}`
34 | `additionalPodLabels` | Pod labels to apply in addition to the default ones | `{}`
35 | `nameOverride` | Override the name of the chart | `""`
36 | `fullnameOverride` | Override the full name of the chart | `""`
37 | `imagePullSecrets` | Docker registry pull secret | `[]`
38 | 
39 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/aws-efa-k8s-device-plugin/templates/NOTES.txt:
--------------------------------------------------------------------------------
1 | EFA device plugin is installed, it can be requested as `vpc.amazonaws.com/efa` resource.


--------------------------------------------------------------------------------
/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/aws-efa-k8s-device-plugin/templates/_helpers.tpl:
--------------------------------------------------------------------------------
 1 | {{/*
 2 | Chart name: .Values.nameOverride when set, otherwise .Chart.Name; cut to 63 characters with a trailing "-" stripped.
 3 | */}}
 4 | {{- define "aws-efa-k8s-device-plugin.name" -}}
 5 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
 6 | {{- end }}
 7 | 
 8 | {{/*
 9 | Fully qualified app name: .Values.fullnameOverride wins; otherwise the release name alone when it already contains the name (nameOverride or chart name), else "<release>-<name>".
10 | Every result is truncated to 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec),
11 | and a trailing "-" left over after the truncation is removed.
12 | */}}
13 | {{- define "aws-efa-k8s-device-plugin.fullname" -}}
14 | {{- if .Values.fullnameOverride }}
15 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
16 | {{- else }}
17 | {{- $name := default .Chart.Name .Values.nameOverride }}
18 | {{- if contains $name .Release.Name }}
19 | {{- .Release.Name | trunc 63 | trimSuffix "-" }}
20 | {{- else }}
21 | {{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
22 | {{- end }}
23 | {{- end }}
24 | {{- end }}
25 | 
26 | {{/*
27 | Chart label value "<chart name>-<chart version>": every "+" is replaced by "_", then the result is cut to 63 chars and a trailing "-" is stripped.
28 | */}}
29 | {{- define "aws-efa-k8s-device-plugin.chart" -}}
30 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
31 | {{- end }}
32 | 
33 | {{/*
34 | Common labels: the chart label, the selector labels, the app version (only when .Chart.AppVersion is set) and the managing service (.Release.Service).
35 | */}}
36 | {{- define "aws-efa-k8s-device-plugin.labels" -}}
37 | helm.sh/chart: {{ include "aws-efa-k8s-device-plugin.chart" . }}
38 | {{ include "aws-efa-k8s-device-plugin.selectorLabels" . }}
39 | {{- if .Chart.AppVersion }}
40 | app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
41 | {{- end }}
42 | app.kubernetes.io/managed-by: {{ .Release.Service }}
43 | {{- end }}
44 | 
45 | {{/*
46 | Selector labels: app.kubernetes.io/name (from the chart-name helper) and app.kubernetes.io/instance (the release name).
47 | */}}
48 | {{- define "aws-efa-k8s-device-plugin.selectorLabels" -}}
49 | app.kubernetes.io/name: {{ include "aws-efa-k8s-device-plugin.name" . }}
50 | app.kubernetes.io/instance: {{ .Release.Name }}
51 | {{- end }}
52 | 
53 | {{/*
54 | Service account name: with .Values.serviceAccount.create set, .Values.serviceAccount.name or else the fullname; otherwise .Values.serviceAccount.name or else "default".
55 | */}}
56 | {{- define "aws-efa-k8s-device-plugin.serviceAccountName" -}}
57 | {{- if .Values.serviceAccount.create }}
58 | {{- default (include "aws-efa-k8s-device-plugin.fullname" .) .Values.serviceAccount.name }}
59 | {{- else }}
60 | {{- default "default" .Values.serviceAccount.name }}
61 | {{- end }}
62 | {{- end }}
63 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/chart/Chart.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | apiVersion: v2
16 | appVersion: 0.1.0
17 | description: Generative AI Multi-Node w/ Triton and TensorRT-LLM Guide/Tutorial
18 | icon: https://www.nvidia.com/content/dam/en-zz/Solutions/about-nvidia/logo-and-brand/01-nvidia-logo-vert-500x200-2c50-d@2x.png
19 | name: triton_trt-llm_multi-node_example
20 | version: 0.1.0
21 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/chart/example_values.yaml:
--------------------------------------------------------------------------------
 1 | 
 2 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | # See values.yaml for reference values.
17 | 
18 | gpu: NVIDIA-A10G
19 | # gpu: NVIDIA-A100-SXM4-40GB
20 | gpuPerNode: 4
21 | persistentVolumeClaim: efs-claim-2
22 | 
23 | tensorrtLLM:
24 |   parallelism:
25 |     tensor: 4
26 |     pipeline: 2
27 | 
28 | triton:
29 |   image:
30 |     name: 210086341041.dkr.ecr.us-west-2.amazonaws.com/triton_trtllm_multinode:24.08
31 |     # name: 354625738399.dkr.ecr.us-east-1.amazonaws.com/wenhant_triton_trtllm_multinode:24.07.3
32 |   resources:
33 |     cpu: 8
34 |     memory: 32Gi
35 |     efa: 1 # Set this to 0 if you don't want to enable EFA.
36 |   # triton_model_repo_path: /var/run/models/mixtral_8x7b_tp8_ep2_moetp4/triton_model_repo
37 |   # triton_model_repo_path: /var/run/models/llama3_8b_tp2_pp4/triton_model_repo
38 |   # triton_model_repo_path: /var/run/models/triton_repo_mixtral_8x7b_tp4_moeep2_moetp2_pp2_v11_a10g/triton_model_repo
39 |   # triton_model_repo_path: /var/run/models/triton_repo_mixtral_8x7b_tp4_moetp4_pp2_v11_a10g/triton_model_repo
40 |   # triton_model_repo_path: /var/run/models/triton_repo_mixtral_8x7b_tp4_moeep4_pp2_v11_a10g/triton_model_repo
41 |   # triton_model_repo_path: /var/run/models/triton_repo_mixtral_8x7b_pp8_v11_a10g/triton_model_repo
42 |   # triton_model_repo_path: /var/run/models/triton_repo_mixtral_8x7b_tp8_v11_a10g/triton_model_repo
43 |   # triton_model_repo_path: /var/run/models/triton_repo_mixtral_8x7b_tp4_pp2_v11_a10g/triton_model_repo
44 |   triton_model_repo_path: /var/run/models/tensorrtllm_backend/triton_model_repo
45 |   # triton_model_repo_path: /var/run/models/triton_repo_mixtral_8x22b_tp16_v11_a100/triton_model_repo
46 |   enable_nsys: false # Note: if you send lots of requests, the nsys report can be very large.
47 | 
48 | logging:
49 |   tritonServer:
50 |     verbose: true
51 | 
52 | autoscaling:
53 |   enable: true
54 |   replicas:
55 |     maximum: 2
56 |     minimum: 1
57 |   metric:
58 |     name: triton:queue_compute:ratio
59 |     value: 1
60 | 
61 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/chart/templates/NOTES.txt:
--------------------------------------------------------------------------------
 1 | {{ $.Chart.Name }} ({{ $.Chart.Version }}) installation complete.
 2 | 
 3 | Release Name: {{ $.Release.Name }}
 4 | Namespace: {{ $.Release.Namespace }}
 5 | Deployment Name: {{ $.Release.Name }}
 6 | {{- if not $.Values.kubernetes.noService }}
 7 | Service Name: {{ $.Release.Name }}
 8 | {{- end }}
 9 | {{- if $.Values.kubernetes.serviceAccount }}
10 | ServiceAccount Name: {{ $.Release.Name }}
11 | {{- end }}
12 | 
13 | Helpful commands:
14 | 
15 |   $ helm status --namespace={{ $.Release.Namespace }} {{ $.Release.Name }}
16 |   $ helm get --namespace={{ $.Release.Namespace }} all {{ $.Release.Name }}
17 | {{- /* The resource list must render as ONE comma-separated argument: it starts on the command line itself and the "-" trim markers on the conditionals below remove the line breaks between fragments. */}}
18 |   $ kubectl get --namespace={{ $.Release.Namespace }} --selector='app={{ $.Release.Name }}' deployments,pods
19 | {{- if not $.Values.kubernetes.noService -}}
20 | ,services
21 | {{- end -}}
22 | ,podmonitors
23 | {{- if $.Values.kubernetes.serviceAccount -}}
24 | ,serviceAccounts
25 | {{- end -}}
26 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/chart/templates/hpa.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | apiVersion: autoscaling/v2
16 | kind: HorizontalPodAutoscaler  # NOTE(review): rendered unconditionally; `autoscaling.enable` from the values is not checked in this template - confirm that is intended
17 | metadata:
18 |   name: {{ $.Release.Name }}
19 |   labels:
20 |     app: {{ $.Release.Name }}
21 |     app.kubernetes.io/component: autoscaler
22 |     release: prometheus
23 | {{-     with $.Values.kubernetes }}
24 | {{-       with .labels }}
25 | {{          toYaml . | indent 4 }}
26 | {{-       end }}
27 | {{-     end }}
28 | spec:
29 |   maxReplicas: {{ $.Values.autoscaling.replicas.maximum }}
30 |   minReplicas: {{ $.Values.autoscaling.replicas.minimum }}
31 |   metrics:
32 |   - type: Pods  # scale on a per-pod metric whose name and target come from autoscaling.metric
33 |     pods:
34 |       metric:
35 |         name: {{ $.Values.autoscaling.metric.name }}
36 |       target:
37 |         type: AverageValue
38 |         averageValue: {{ $.Values.autoscaling.metric.value }}
39 |   scaleTargetRef:
40 |     apiVersion: leaderworkerset.x-k8s.io/v1
41 |     kind: LeaderWorkerSet
42 |     name: leaderworkerset-sample  # hard-coded target name (not derived from the release name)
43 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/chart/templates/pod-monitor.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # Scrapes `/metrics` on the `metrics` port of every pod labelled `role: leader`.
16 | apiVersion: monitoring.coreos.com/v1
17 | kind: PodMonitor
18 | metadata:
19 |   name: {{ $.Release.Name }}
20 |   labels:
21 |     app: {{ $.Release.Name }}
22 |     app.kubernetes.io/component: monitor
23 |     release: prometheus
24 | {{- /* Append any user-supplied labels from `kubernetes.labels`. */}}
25 | {{- with $.Values.kubernetes }}
26 | {{- with .labels }}
27 | {{- toYaml . | nindent 4 }}
28 | {{- end }}
29 | {{- end }}
30 | spec:
31 |   podMetricsEndpoints:
32 |   - path: /metrics
33 |     port: metrics
34 |   selector:
35 |     matchLabels:
36 |       role: leader
37 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/chart/templates/rbac.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | {{- /* Render the Role, ServiceAccount and RoleBinding only when `kubernetes.service_account` is unset. */}}
16 | {{- /* The lookup is guarded with `with` so a missing `kubernetes` values section cannot abort rendering. */}}
17 | {{- $serviceAccount := "" }}
18 | {{- with $.Values.kubernetes }}
19 | {{-   with .service_account }}
20 | {{-     $serviceAccount = . }}
21 | {{-   end }}
22 | {{- end }}
23 | {{- if not $serviceAccount }}
24 | apiVersion: rbac.authorization.k8s.io/v1
25 | kind: Role
26 | metadata:
27 |   labels:
28 | {{-   with $.Values.kubernetes }}
29 | {{-     with .labels }}
30 | {{        toYaml . | indent 4 }}
31 | {{-     end }}
32 | {{-   end }}
33 |   name: {{ $.Release.Name }}
34 | rules:
35 | # Read-only (get/list) access to workloads, pods and services.
36 | - apiGroups:
37 |   - ''
38 |   - apps
39 |   - batch
40 |   resources:
41 |   - deployments
42 |   - jobs
43 |   - pods
44 |   - pods/status
45 |   - services
46 |   verbs:
47 |   - get
48 |   - list
49 | # Permission to open exec sessions into pods.
50 | - apiGroups: ['']
51 |   resources:
52 |   - pods/exec
53 |   verbs:
54 |   - create
55 | 
56 | ---
57 | 
58 | apiVersion: v1
59 | kind: ServiceAccount
60 | metadata:
61 |   labels:
62 | {{-   with $.Values.kubernetes }}
63 | {{-     with .labels }}
64 | {{        toYaml . | indent 4 }}
65 | {{-     end }}
66 | {{-   end }}
67 |   name: {{ $.Release.Name }}
68 | 
69 | ---
70 | 
71 | # Grants the Role above to the ServiceAccount above (all three share the release name).
72 | apiVersion: rbac.authorization.k8s.io/v1
73 | kind: RoleBinding
74 | metadata:
75 |   labels:
76 | {{-   with $.Values.kubernetes }}
77 | {{-     with .labels }}
78 | {{        toYaml . | indent 4 }}
79 | {{-     end }}
80 | {{-   end }}
81 |   name: {{ $.Release.Name }}
82 | subjects:
83 | - kind: ServiceAccount
84 |   name: {{ $.Release.Name }}
85 | roleRef:
86 |   apiGroup: rbac.authorization.k8s.io
87 |   kind: Role
88 |   name: {{ $.Release.Name }}
89 | {{- end }}
90 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/chart/templates/service.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | {{- /* `kubernetes.noService: true` suppresses the Service. The lookup is guarded so a missing `kubernetes` section cannot abort rendering. */}}
16 | {{- $noService := false }}
17 | {{- with $.Values.kubernetes }}
18 | {{-   with .noService }}
19 | {{-     $noService = . }}
20 | {{-   end }}
21 | {{- end }}
22 | {{- if $noService }}
23 | # Chart values optioned to not create a service. Service not created.
24 | {{- else }}
25 | apiVersion: v1
26 | kind: Service
27 | metadata:
28 |   name: {{ $.Release.Name }}
29 |   labels:
30 |     app: {{ $.Release.Name }}
31 |     app.kubernetes.io/component: service
32 | {{- with $.Values.kubernetes }}
33 | {{-   with .labels }}
34 | {{      toYaml . | indent 4 }}
35 | {{-   end }}
36 | {{- end }}
37 | spec:
38 |   type: LoadBalancer
39 |   # Each port forwards to the pod port of the same name.
40 |   ports:
41 |   - name: http
42 |     port: 8000
43 |     targetPort: http
44 |   - name: grpc
45 |     port: 8001
46 |     targetPort: grpc
47 |   - name: metrics
48 |     port: 8002
49 |     targetPort: metrics
50 |   # Traffic is routed only to pods labelled `role: leader`.
51 |   selector:
52 |     role: leader
53 | {{- end }}
54 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/containers/README.md:
--------------------------------------------------------------------------------
 1 | <!---
 2 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | --->
16 | 
17 | 
18 | # Container Generation
19 | 
20 | The files in this folder are used to build the custom container image for the multi-node Triton + TRT-LLM EKS deployment, including installation of the EFA components.
21 | 
22 | Run the following command to create the container image.
23 | 
24 | ```bash
25 | docker build --file ./triton_trt_llm.containerfile --tag <image_name_here> .
26 | ```
27 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/containers/kubessh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License");
 6 | # you may not use this file except in compliance with the License.
 7 | # You may obtain a copy of the License at
 8 | #
 9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 
17 | # Usage: kubessh <pod> <command...>
18 | # Runs <command...> through /bin/sh inside <pod> using `kubectl exec`.
19 | pod=$1
20 | shift
21 | # Quote the pod name so it reaches kubectl as exactly one argument
22 | # (no word splitting or glob expansion).
23 | kubectl exec "$pod" -- /bin/sh -c "$*"
24 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/gen_ai_perf.yaml:
--------------------------------------------------------------------------------
 1 | # Idle pod (`sleep infinity`) running the Triton SDK image with the shared model-repository claim mounted.
 2 | apiVersion: v1
 3 | kind: Pod
 4 | metadata:
 5 |   name: gen-ai-perf
 6 |   labels:
 7 |     app: gen-ai-perf
 8 | spec:
 9 |   volumes:
10 |   - name: model-repository
11 |     persistentVolumeClaim:
12 |       claimName: efs-claim-2
13 |   containers:
14 |   - name: triton
15 |     image: nvcr.io/nvidia/tritonserver:24.07-py3-sdk
16 |     command:
17 |     - sleep
18 |     - infinity
19 |     volumeMounts:
20 |     - name: model-repository
21 |       mountPath: /var/run/models
22 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/setup_ssh_efs.yaml:
--------------------------------------------------------------------------------
 1 | # Idle pod (`sleep infinity`) with 4 GPUs, the shared model-repository claim and an in-memory /dev/shm.
 2 | apiVersion: v1
 3 | kind: Pod
 4 | metadata:
 5 |   name: setup-ssh-efs
 6 |   labels:
 7 |     app: setup-ssh-efs
 8 | spec:
 9 |   volumes:
10 |   - name: model-repository
11 |     persistentVolumeClaim:
12 |       claimName: efs-claim-2
13 |   # Memory-backed scratch volume, capped at 32Gi.
14 |   - name: dshm
15 |     emptyDir:
16 |       medium: Memory
17 |       sizeLimit: 32Gi
18 |   containers:
19 |   - name: triton
20 |     image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
21 |     command:
22 |     - sleep
23 |     - infinity
24 |     resources:
25 |       requests:
26 |         nvidia.com/gpu: 4
27 |       limits:
28 |         nvidia.com/gpu: 4
29 |     volumeMounts:
30 |     - name: model-repository
31 |       mountPath: /var/run/models
32 |     - name: dshm
33 |       mountPath: /dev/shm
34 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/triton-metrics_prometheus-rule.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # Recording rules derived from Triton's nv_inference_* metrics over a 1-minute rate window.
16 | # Every denominator is wrapped in clamp_min(..., 1) so it never drops below 1.
17 | apiVersion: monitoring.coreos.com/v1
18 | kind: PrometheusRule
19 | metadata:
20 |   name: triton-metrics
21 |   labels:
22 |     release: prometheus
23 |     app.kubernetes.io/component: autoscaler
24 | spec:
25 |   groups:
26 |   - name: autoscaling
27 |     interval: 6s
28 |     rules:
29 |     # Average number of microseconds an inference request spends computing after it leaves the queue (cache hits excluded).
30 |     - record: triton:compute_duration:average
31 |       expr: rate(nv_inference_compute_infer_duration_us[1m])/clamp_min(rate(nv_inference_request_success[1m]),1)
32 |     # Average number of microseconds an inference request spends in the queue before being processed (cache hits excluded).
33 |     - record: triton:queue_duration:average
34 |       expr: rate(nv_inference_queue_duration_us[1m])/clamp_min(rate(nv_inference_request_success[1m]),1)
35 |     # Average number of microseconds an inference request takes in total (cache hits excluded).
36 |     - record: triton:request_duration:average
37 |       expr: rate(nv_inference_request_duration_us[1m])/clamp_min(rate(nv_inference_request_success[1m]),1)
38 |     # Ratio of time spent in the queue to time spent computing (cache hits excluded).
39 |     - record: triton:queue_compute:ratio
40 |       expr: rate(nv_inference_queue_duration_us[1m])/clamp_min(rate(nv_inference_compute_infer_duration_us[1m]),1)
41 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/p5-trtllm-cluster-config.yaml:
--------------------------------------------------------------------------------
 1 | # eksctl cluster definition: one CPU node group and one EFA-enabled p5.48xlarge GPU node group.
 2 | # Replace the $PLACEHOLDER_* values and the capacity reservation ID before use.
 3 | apiVersion: eksctl.io/v1alpha5
 4 | kind: ClusterConfig
 5 | 
 6 | metadata:
 7 |   name: trtllm-inference-cluster
 8 |   region: us-east-1
 9 |   version: "1.30"
10 | 
11 | vpc:
12 |   id: $PLACEHOLDER_VPC_ID
13 |   subnets:
14 |     private:
15 |       us-east-1a:
16 |         id: $PLACEHOLDER_SUBNET_PRIVATE_1
17 |     public:
18 |       us-east-1a:
19 |         id: $PLACEHOLDER_SUBNET_PUBLIC_1
20 | 
21 |   clusterEndpoints:
22 |     privateAccess: true
23 |     publicAccess: true
24 | 
25 | cloudwatch:
26 |   clusterLogging:
27 |     enableTypes: ["*"]
28 | 
29 | iam:
30 |   withOIDC: true
31 | 
32 | 
33 | managedNodeGroups:
34 |   - name: cpu-node-group
35 |     instanceType: c5.2xlarge
36 |     minSize: 0
37 |     desiredCapacity: 0
38 |     maxSize: 1
39 |     iam:
40 |       withAddonPolicies:
41 |         imageBuilder: true
42 |         autoScaler: true
43 |         ebs: true
44 |         efs: true
45 |         awsLoadBalancerController: true
46 |         cloudWatch: true
47 |         albIngress: true
48 |   - name: gpu-compute-node-group
49 |     instanceType: p5.48xlarge
50 |     instancePrefix: trtllm-compute-node
51 |     privateNetworking: true
52 |     efaEnabled: true
53 |     minSize: 0
54 |     desiredCapacity: 0
55 |     maxSize: 2
56 |     volumeSize: 500
57 |     # comment out capacityReservation if you do not need ODCR
58 |     capacityReservation:
59 |       capacityReservationTarget:
60 |         capacityReservationID: "cr-xxxxxxxxxxxxxx"
61 |     iam:
62 |       # Each add-on policy is listed exactly once; repeating a key in a mapping is invalid YAML.
63 |       withAddonPolicies:
64 |         imageBuilder: true
65 |         autoScaler: true
66 |         ebs: true
67 |         efs: true
68 |         awsLoadBalancerController: true
69 |         cloudWatch: true
70 |         albIngress: true
71 |         externalDNS: true
72 |         certManager: true
73 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/pvc/claim.yaml:
--------------------------------------------------------------------------------
 1 | # Claim for 200Gi of shared (ReadWriteMany) storage from the `efs-sc-1` storage class.
 2 | apiVersion: v1
 3 | kind: PersistentVolumeClaim
 4 | metadata:
 5 |   name: efs-claim-2
 6 | spec:
 7 |   storageClassName: efs-sc-1
 8 |   accessModes: [ReadWriteMany]
 9 |   resources:
10 |     requests:
11 |       storage: 200Gi
12 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/pvc/pv.yaml:
--------------------------------------------------------------------------------
 1 | # Statically defined 200Gi volume served by the EFS CSI driver; retained after its claim is released.
 2 | apiVersion: v1
 3 | kind: PersistentVolume
 4 | metadata:
 5 |   name: efs-pv-2
 6 | spec:
 7 |   storageClassName: efs-sc-1
 8 |   volumeMode: Filesystem
 9 |   accessModes: [ReadWriteMany]
10 |   capacity:
11 |     storage: 200Gi
12 |   persistentVolumeReclaimPolicy: Retain
13 |   csi:
14 |     driver: efs.csi.aws.com
15 |     # NOTE(review): looks like a specific EFS file system ID; presumably must be replaced with your own.
16 |     volumeHandle: fs-0d5ec63b9f8ebb2db
17 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/pvc/storageclass.yaml:
--------------------------------------------------------------------------------
1 | # Storage class backed by the `efs.csi.aws.com` provisioner.
2 | apiVersion: storage.k8s.io/v1
3 | kind: StorageClass
4 | metadata:
5 |   name: efs-sc-1
6 | provisioner: efs.csi.aws.com
7 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/README.md:
--------------------------------------------------------------------------------
1 | # Kubernetes Deployment of Triton Server Guides
2 | 
3 | * [TensorRT-LLM Gen. AI Autoscaling & Load Balancing](./TensorRT-LLM_Autoscaling_and_Load_Balancing/README.md)
4 | * [Multi-Node Generative AI w/ Triton Server and TensorRT-LLM](./TensorRT-LLM_Multi-Node_Distributed_Models/README.md)
5 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/.gitignore:
--------------------------------------------------------------------------------
1 | # A directory pattern with no leading or middle slash matches at every depth, so one entry covers all `.vscode/` folders.
2 | .vscode/


--------------------------------------------------------------------------------
/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/.gitignore:
--------------------------------------------------------------------------------
1 | # Keep the local `dev_values.yaml` overrides file out of version control.
2 | dev_values.yaml
3 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/Chart.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # Helm chart metadata (chart API v2).
16 | apiVersion: v2
17 | name: triton_trt-llm_aslb-example
18 | description: Triton + TensorRT-LLM autoscaling and load balancing example.
19 | version: "0.1.0"
20 | appVersion: "0.1.0"
21 | icon: https://www.nvidia.com/content/dam/en-zz/Solutions/about-nvidia/logo-and-brand/01-nvidia-logo-vert-500x200-2c50-d@2x.png
22 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/gpt2_values.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # See values.yaml for reference values.
16 | 
17 | # Model selected by this values file.
18 | model:
19 |   name: gpt2
20 | 
21 | # GPU types listed for this model (presumably used for node selection; confirm against values.yaml).
22 | gpu: [Tesla-T4, Tesla-V100-SXM2-16GB]
23 | 

--------------------------------------------------------------------------------
/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/llama-2-7b-chat_values.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # See values.yaml for reference values.
16 | 
17 | # Model selected by this values file, with a tensor parallelism setting of 2.
18 | model:
19 |   name: llama-2-7b-chat
20 |   tensorrtLlm:
21 |     parallelism: {tensor: 2}
22 | 
23 | # GPU types listed for this model (presumably used for node selection; confirm against values.yaml).
24 | gpu: [Tesla-T4, Tesla-V100-SXM2-16GB]
25 | 
26 | # Replaces the autoscaler's default target average value of 1000m.
27 | autoscaling:
28 |   metric:
29 |     value: 1500m
30 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/llama-2-7b_values.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # See values.yaml for reference values.
16 | 
17 | # Model selected by this values file, with a tensor parallelism setting of 2.
18 | model:
19 |   name: llama-2-7b
20 |   tensorrtLlm:
21 |     parallelism: {tensor: 2}
22 | 
23 | # GPU types listed for this model (presumably used for node selection; confirm against values.yaml).
24 | gpu: [Tesla-T4, Tesla-V100-SXM2-16GB]
25 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/llama-3-70b-instruct_values.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # See values.yaml for reference values.
16 | 
17 | # Model selected by this values file, with a tensor parallelism setting of 4.
18 | model:
19 |   name: llama-3-70b-instruct
20 |   tensorrtLlm:
21 |     parallelism: {tensor: 4}
22 | 
23 | # GPU types listed for this model (presumably used for node selection; confirm against values.yaml).
24 | gpu: [NVIDIA-A100-SXM4-40GB]
25 | 
26 | # Replaces the autoscaler's default target average value of 1000m.
27 | autoscaling:
28 |   metric:
29 |     value: 3500m
30 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/llama-3-8b-instruct_values.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # See values.yaml for reference values.
16 | 
17 | # Model selected by this values file, with a tensor parallelism setting of 2.
18 | model:
19 |   name: llama-3-8b-instruct
20 |   tensorrtLlm:
21 |     parallelism: {tensor: 2}
22 | 
23 | # GPU types listed for this model (presumably used for node selection; confirm against values.yaml).
24 | gpu: [Tesla-T4, Tesla-V100-SXM2-16GB]
25 | 
26 | # Replaces the autoscaler's default target average value of 1000m.
27 | autoscaling:
28 |   metric:
29 |     value: 1500m
30 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/llama-3-8b_values.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # See values.yaml for reference values.
16 | 
17 | # Model selected by this values file.
18 | model:
19 |   name: llama-3-8b
20 | 
21 | # GPU types listed for this model (presumably used for node selection; confirm against values.yaml).
22 | gpu: [NVIDIA-A10G, NVIDIA-A100-SXM4-40GB]
23 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/opt125m_values.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # See values.yaml for reference values.
16 | 
17 | # Model selected by this values file.
18 | model:
19 |   name: opt125m
20 | 
21 | # GPU types listed for this model (presumably used for node selection; confirm against values.yaml).
22 | gpu: [Tesla-T4, Tesla-V100-SXM2-16GB]
23 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/pvc_aws/claim_aws.yaml:
--------------------------------------------------------------------------------
 1 | # Claim for 200Gi of shared (ReadWriteMany) storage from the `efs-autoscaling-sc` storage class.
 2 | apiVersion: v1
 3 | kind: PersistentVolumeClaim
 4 | metadata:
 5 |   name: nfs-claim-autoscaling-2
 6 | spec:
 7 |   storageClassName: efs-autoscaling-sc
 8 |   accessModes: [ReadWriteMany]
 9 |   resources:
10 |     requests:
11 |       storage: 200Gi
12 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/pvc_aws/pv_aws.yaml:
--------------------------------------------------------------------------------
 1 | # Statically defined 200Gi volume served by the EFS CSI driver; retained after its claim is released.
 2 | apiVersion: v1
 3 | kind: PersistentVolume
 4 | metadata:
 5 |   name: efs-autoscaling-pv-2
 6 | spec:
 7 |   storageClassName: efs-autoscaling-sc
 8 |   volumeMode: Filesystem
 9 |   accessModes: [ReadWriteMany]
10 |   capacity:
11 |     storage: 200Gi
12 |   persistentVolumeReclaimPolicy: Retain
13 |   csi:
14 |     driver: efs.csi.aws.com
15 |     # NOTE(review): looks like a specific EFS file system ID; presumably must be replaced with your own.
16 |     volumeHandle: fs-0c6ba87870e4be751
17 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/pvc_aws/storageclass_aws.yaml:
--------------------------------------------------------------------------------
1 | # Storage class backed by the `efs.csi.aws.com` provisioner.
2 | apiVersion: storage.k8s.io/v1
3 | kind: StorageClass
4 | metadata:
5 |   name: efs-autoscaling-sc
6 | provisioner: efs.csi.aws.com


--------------------------------------------------------------------------------
/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/templates/NOTES.txt:
--------------------------------------------------------------------------------
 1 | {{- /* Post-install notes: reports the chart, release and namespace, then lists follow-up helm/kubectl commands. */ -}}
 2 | {{ $.Chart.Name }} ({{ $.Chart.Version }}) installation complete.
 3 | 
 4 | Release Name: {{ $.Release.Name }}
 5 | Namespace: {{ $.Release.Namespace }}
 6 | Deployment Name: {{ $.Release.Name }}
 7 | Service Name: {{ $.Release.Name }}
 8 | 
 9 | Helpful commands:
10 | 
11 |   $ helm status --namespace={{ $.Release.Namespace }} {{ $.Release.Name }}
12 |   $ helm get --namespace={{ $.Release.Namespace }} all {{ $.Release.Name }}
13 |   $ kubectl get --namespace={{ $.Release.Namespace }} --selector='app={{ $.Release.Name }}' deployments,pods,hpa,services,podmonitors
14 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/templates/horizontal-pod-autoscaler.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | {{- /* Defaults; each is replaced only by a non-empty value under `autoscaling`. */}}
16 | {{- $replicasMin := 1 }}
17 | {{- $replicasMax := 4 }}
18 | {{- $metric_value := "1000m" }}
19 | {{- $metric_name := "triton:queue_compute:ratio" }}
20 | {{- /* Nothing is rendered unless `autoscaling.enable` is truthy. */}}
21 | {{- with $.Values.autoscaling }}
22 | {{- if .enable }}
23 | {{- with .metric }}
24 | {{- if .value }}
25 | {{- $metric_value = .value }}
26 | {{- end }}
27 | {{- if .name }}
28 | {{- $metric_name = .name }}
29 | {{- end }}
30 | {{- end }}
31 | {{- with .replicas }}
32 | {{- if .minimum }}
33 | {{- $replicasMin = .minimum }}
34 | {{- end }}
35 | {{- if .maximum }}
36 | {{- $replicasMax = .maximum }}
37 | {{- end }}
38 | {{- end }}
39 | apiVersion: autoscaling/v2
40 | kind: HorizontalPodAutoscaler
41 | metadata:
42 |   name: {{ $.Release.Name }}
43 |   labels:
44 |     app: {{ $.Release.Name }}
45 |     app.kubernetes.io/component: autoscaler
46 |     release: prometheus
47 | {{- with $.Values.kubernetes }}
48 | {{- with .labels }}
49 | {{- toYaml . | nindent 4 }}
50 | {{- end }}
51 | {{- end }}
52 | spec:
53 |   # Scales the Deployment that shares the release name.
54 |   scaleTargetRef:
55 |     apiVersion: apps/v1
56 |     kind: Deployment
57 |     name: {{ $.Release.Name }}
58 |   minReplicas: {{ $replicasMin }}
59 |   maxReplicas: {{ $replicasMax }}
60 |   metrics:
61 |   - type: Pods
62 |     pods:
63 |       metric:
64 |         name: {{ $metric_name }}
65 |       target:
66 |         type: AverageValue
67 |         averageValue: {{ $metric_value }}
68 | {{- end }}
69 | {{- end }}
70 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/templates/pod-monitor.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # Scrapes `/metrics` on the `metrics` port of this release's pods labelled as the `server` component.
16 | apiVersion: monitoring.coreos.com/v1
17 | kind: PodMonitor
18 | metadata:
19 |   name: {{ $.Release.Name }}
20 |   labels:
21 |     app: {{ $.Release.Name }}
22 |     app.kubernetes.io/component: autoscaler
23 |     release: prometheus
24 | {{- /* Append any user-supplied labels from `kubernetes.labels`. */}}
25 | {{- with $.Values.kubernetes }}
26 | {{- with .labels }}
27 | {{- toYaml . | nindent 4 }}
28 | {{- end }}
29 | {{- end }}
30 | spec:
31 |   podMetricsEndpoints:
32 |   - path: /metrics
33 |     port: metrics
34 |   selector:
35 |     matchLabels:
36 |       app: {{ $.Release.Name }}
37 |       app.kubernetes.io/component: server
38 | 

--------------------------------------------------------------------------------
/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/templates/service.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | {{- $noService := false }}
16 | {{- with $.Values.kubernetes }}
17 | {{-   with .noService }}
18 | {{-     $noService = . }}
19 | {{-   end }}
20 | {{- end }}
21 | {{- if $noService }}
22 | # The chart values disable Service creation (kubernetes.noService); no Service is rendered.
23 | {{- else }}
24 | apiVersion: v1
25 | kind: Service
26 | metadata:
27 |   name: {{ $.Release.Name }}
28 |   labels:
29 |     app: {{ $.Release.Name }}
30 |     app.kubernetes.io/component: service
31 | {{-   with $.Values.kubernetes }}
32 | {{-     with .labels }}
33 | {{        toYaml . | indent 4 }}
34 | {{-     end }}
35 | {{-   end }}
36 | spec:
37 |   ports:
38 |   - name: http
39 |     port: 8000
40 |     targetPort: http
41 |   - name: grpc
42 |     port: 8001
43 |     targetPort: grpc
44 |   - name: metrics
45 |     port: 8002
46 |     targetPort: metrics
47 |   selector:
48 |     app: {{ $.Release.Name }}
49 |   type: ClusterIP
50 | {{- end }}
51 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/clients/README.md:
--------------------------------------------------------------------------------
1 | # Client Inference Generators
2 | 
3 | The files in this folder are for the deployment of client pods in the same cluster as a model hosted by Triton + TRT-LLM using
4 | the provided sample Helm chart.
5 | Each file creates a single deployment of a client container which can be used to generate inference requests for the deployed
6 | model.
7 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/clients/gpt2.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
 2 | #
 3 | # NVIDIA CORPORATION and its licensors retain all intellectual property
 4 | # and proprietary rights in and to this software, related documentation
 5 | # and any modifications thereto.  Any use, reproduction, disclosure or
 6 | # distribution of this software and related documentation without an express
 7 | # license agreement from NVIDIA CORPORATION is strictly prohibited.
 8 | 
 9 | apiVersion: apps/v1
10 | kind: Deployment
11 | metadata:
12 |   name: client-gpt2
13 | spec:
14 |   selector:
15 |     matchLabels:
16 |       app: client-gpt2
17 |   replicas: 1
18 |   template:
19 |     metadata:
20 |       labels:
21 |         app: client-gpt2
22 |         app.kubernetes.io/component: client
23 |     spec:
24 |       containers:
25 |       - name: client
26 |         command:
27 |         - python3
28 |         - ./client.py
29 |         env:
30 |         - name: TRTLLM_MODEL_NAME
31 |           value: gpt2
32 |         - name: TRTLLM_TRITON_URL
33 |           value: gpt2
34 |         - name: TRTLLM_MAX_TOKENS
35 |           value: "256"
36 |         # - name: TRTLLM_DEBUG
37 |         #   value: debug
38 |         image: nvcr.io/nvstaging/nvaie/jwyman:trtllm-client-0520.1
39 |         imagePullPolicy: IfNotPresent
40 |         resources:
41 |           limits:
42 |             cpu: 1000m
43 |             ephemeral-storage: 1Gi
44 |             memory: 1Gi
45 |           requests:
46 |             cpu: 500m
47 |             ephemeral-storage: 1Gi
48 |             memory: 1Gi
49 |       imagePullSecrets:
50 |       - name: ngc-container-pull
51 |       restartPolicy: Always
52 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/clients/llama-2-70b-instruct.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
 2 | #
 3 | # NVIDIA CORPORATION and its licensors retain all intellectual property
 4 | # and proprietary rights in and to this software, related documentation
 5 | # and any modifications thereto.  Any use, reproduction, disclosure or
 6 | # distribution of this software and related documentation without an express
 7 | # license agreement from NVIDIA CORPORATION is strictly prohibited.
 8 | 
 9 | apiVersion: apps/v1
10 | kind: Deployment
11 | metadata:
12 |   name: client-llama-2-70b-instruct
13 | spec:
14 |   selector:
15 |     matchLabels:
16 |       app: client-llama-2-70b-instruct
17 |   replicas: 1
18 |   template:
19 |     metadata:
20 |       labels:
21 |         app: client-llama-2-70b-instruct
22 |         app.kubernetes.io/component: client
23 |     spec:
24 |       containers:
25 |       - name: client
26 |         command:
27 |         - python3
28 |         - ./client.py
29 |         env:
30 |         - name: TRTLLM_MODEL_NAME
31 |           value: llama-2-70b-instruct
32 |         - name: TRTLLM_TRITON_URL
33 |           value: llama-2-70b-instruct
34 |         # - name: TRTLLM_MAX_TOKENS
35 |         #   value: "512"
36 |         # - name: TRTLLM_DEBUG
37 |         #   value: debug
38 |         image: nvcr.io/nvstaging/nvaie/jwyman:trtllm-client-0520.1
39 |         imagePullPolicy: IfNotPresent
40 |         resources:
41 |           limits:
42 |             cpu: 1000m
43 |             ephemeral-storage: 1Gi
44 |             memory: 2Gi
45 |           requests:
46 |             cpu: 750m
47 |             ephemeral-storage: 1Gi
48 |             memory: 1536Mi
49 |       imagePullSecrets:
50 |       - name: ngc-container-pull
51 |       restartPolicy: Always
52 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/clients/llama-2-7b.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
 2 | #
 3 | # NVIDIA CORPORATION and its licensors retain all intellectual property
 4 | # and proprietary rights in and to this software, related documentation
 5 | # and any modifications thereto.  Any use, reproduction, disclosure or
 6 | # distribution of this software and related documentation without an express
 7 | # license agreement from NVIDIA CORPORATION is strictly prohibited.
 8 | 
 9 | apiVersion: apps/v1
10 | kind: Deployment
11 | metadata:
12 |   name: client-llama-2-7b
13 | spec:
14 |   selector:
15 |     matchLabels:
16 |       app: client-llama-2-7b
17 |   replicas: 1
18 |   template:
19 |     metadata:
20 |       labels:
21 |         app: client-llama-2-7b
22 |         app.kubernetes.io/component: client
23 |     spec:
24 |       containers:
25 |       - name: client
26 |         command:
27 |         - python3
28 |         - ./client.py
29 |         env:
30 |         - name: TRTLLM_MODEL_NAME
31 |           value: llama-2-7b
32 |         - name: TRTLLM_TRITON_URL
33 |           value: llama-2-7b
34 |         # - name: TRTLLM_MAX_TOKENS
35 |         #   value: "512"
36 |         # - name: TRTLLM_DEBUG
37 |         #   value: debug
38 |         image: nvcr.io/nvstaging/nvaie/jwyman:trtllm-client-0520.1
39 |         imagePullPolicy: IfNotPresent
40 |         resources:
41 |           limits:
42 |             cpu: 1000m
43 |             ephemeral-storage: 1Gi
44 |             memory: 2Gi
45 |           requests:
46 |             cpu: 750m
47 |             ephemeral-storage: 1Gi
48 |             memory: 1536Mi
49 |       imagePullSecrets:
50 |       - name: ngc-container-pull
51 |       restartPolicy: Always
52 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/clients/llama-3-8b-instruct.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
 2 | #
 3 | # NVIDIA CORPORATION and its licensors retain all intellectual property
 4 | # and proprietary rights in and to this software, related documentation
 5 | # and any modifications thereto.  Any use, reproduction, disclosure or
 6 | # distribution of this software and related documentation without an express
 7 | # license agreement from NVIDIA CORPORATION is strictly prohibited.
 8 | 
 9 | apiVersion: apps/v1
10 | kind: Deployment
11 | metadata:
12 |   name: client-llama-3-8b-instruct
13 | spec:
14 |   selector:
15 |     matchLabels:
16 |       app: client-llama-3-8b-instruct
17 |   replicas: 1
18 |   template:
19 |     metadata:
20 |       labels:
21 |         app: client-llama-3-8b-instruct
22 |         app.kubernetes.io/component: client
23 |     spec:
24 |       containers:
25 |       - name: client
26 |         command:
27 |         - python3
28 |         - ./client.py
29 |         env:
30 |         - name: TRTLLM_MODEL_NAME
31 |           value: llama-3-8b-instruct
32 |         - name: TRTLLM_TRITON_URL
33 |           value: llama-3-8b-instruct
34 |         # - name: TRTLLM_MAX_TOKENS
35 |         #   value: "512"
36 |         # - name: TRTLLM_DEBUG
37 |         #   value: debug
38 |         image: nvcr.io/nvstaging/nvaie/jwyman:trtllm-client-0520.1
39 |         imagePullPolicy: IfNotPresent
40 |         resources:
41 |           limits:
42 |             cpu: 1000m
43 |             ephemeral-storage: 1Gi
44 |             memory: 2Gi
45 |           requests:
46 |             cpu: 750m
47 |             ephemeral-storage: 1Gi
48 |             memory: 1536Mi
49 |       imagePullSecrets:
50 |       - name: ngc-container-pull
51 |       restartPolicy: Always
52 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/clients/llama-3-8b.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
 2 | #
 3 | # NVIDIA CORPORATION and its licensors retain all intellectual property
 4 | # and proprietary rights in and to this software, related documentation
 5 | # and any modifications thereto.  Any use, reproduction, disclosure or
 6 | # distribution of this software and related documentation without an express
 7 | # license agreement from NVIDIA CORPORATION is strictly prohibited.
 8 | 
 9 | apiVersion: apps/v1
10 | kind: Deployment
11 | metadata:
12 |   name: client-llama-3-8b
13 | spec:
14 |   selector:
15 |     matchLabels:
16 |       app: client-llama-3-8b
17 |   replicas: 1
18 |   template:
19 |     metadata:
20 |       labels:
21 |         app: client-llama-3-8b
22 |         app.kubernetes.io/component: client
23 |     spec:
24 |       containers:
25 |       - name: client
26 |         command:
27 |         - python3
28 |         - ./client.py
29 |         env:
30 |         - name: TRTLLM_MODEL_NAME
31 |           value: llama-3-8b
32 |         - name: TRTLLM_TRITON_URL
33 |           value: llama-3-8b
34 |         image: nvcr.io/nvstaging/nvaie/jwyman:trtllm-client-0520.1
35 |         imagePullPolicy: IfNotPresent
36 |         resources:
37 |           limits:
38 |             cpu: 1000m
39 |             ephemeral-storage: 1Gi
40 |             memory: 2Gi
41 |           requests:
42 |             cpu: 750m
43 |             ephemeral-storage: 1Gi
44 |             memory: 1536Mi
45 |       imagePullSecrets:
46 |       - name: ngc-container-pull
47 |       restartPolicy: Always
48 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/clients/opt125m.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
 2 | #
 3 | # NVIDIA CORPORATION and its licensors retain all intellectual property
 4 | # and proprietary rights in and to this software, related documentation
 5 | # and any modifications thereto.  Any use, reproduction, disclosure or
 6 | # distribution of this software and related documentation without an express
 7 | # license agreement from NVIDIA CORPORATION is strictly prohibited.
 8 | 
 9 | apiVersion: apps/v1
10 | kind: Deployment
11 | metadata:
12 |   name: client-opt125m
13 | spec:
14 |   selector:
15 |     matchLabels:
16 |       app: client-opt125m
17 |   replicas: 1
18 |   template:
19 |     metadata:
20 |       labels:
21 |         app: client-opt125m
22 |         app.kubernetes.io/component: client
23 |     spec:
24 |       containers:
25 |       - name: client
26 |         command:
27 |         - python3
28 |         - ./client.py
29 |         env:
30 |         - name: TRTLLM_MODEL_NAME
31 |           value: opt125m
32 |         - name: TRTLLM_TRITON_URL
33 |           value: opt125m
34 |         # - name: TRTLLM_MAX_TOKENS
35 |         #   value: "512"
36 |         # - name: TRTLLM_DEBUG
37 |         #   value: debug
38 |         image: nvcr.io/nvstaging/nvaie/jwyman:trtllm-client-0520.1
39 |         imagePullPolicy: IfNotPresent
40 |         resources:
41 |           limits:
42 |             cpu: 1000m
43 |             ephemeral-storage: 1Gi
44 |             memory: 2Gi
45 |           requests:
46 |             cpu: 750m
47 |             ephemeral-storage: 1Gi
48 |             memory: 1536Mi
49 |       imagePullSecrets:
50 |       - name: ngc-container-pull
51 |       restartPolicy: Always
52 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/containers/README.md:
--------------------------------------------------------------------------------
 1 | # Container Generation
 2 | 
 3 | The files in this folder are intended to be used to create the Triton Server container image.
 4 | 
 5 | Run the following command to create a Triton Server container image.
 6 | 
 7 | ```bash
 8 | docker build --file ./triton_trt-llm.containerfile --tag <image_name_here> .
 9 | ```
10 | 
11 | Run the following command to create a client load generation container image.
12 | 
13 | ```bash
14 | docker build --file ./client.containerfile --tag <image_name_here> .
15 | ```
16 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/containers/client.containerfile:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
 2 | #
 3 | # NVIDIA CORPORATION and its licensors retain all intellectual property
 4 | # and proprietary rights in and to this software, related documentation
 5 | # and any modifications thereto.  Any use, reproduction, disclosure or
 6 | # distribution of this software and related documentation without an express
 7 | # license agreement from NVIDIA CORPORATION is strictly prohibited.
 8 | 
 9 | # Base image for the client container; override with --build-arg.
10 | ARG BASE_CONTAINER_IMAGE=ubuntu:jammy
11 | 
12 | FROM ${BASE_CONTAINER_IMAGE}
13 | 
14 | # An ARG declared before FROM is out of scope inside the build stage.
15 | # Redeclare it (without a value) so the label below can see the image name.
16 | ARG BASE_CONTAINER_IMAGE
17 | 
18 | # Record the base image and the role of this image as labels.
19 | LABEL "base"="${BASE_CONTAINER_IMAGE}"
20 | LABEL "role"="client"
21 | 
22 | # Stop APT (Debian package manager) from complaining about interactivity.
23 | ENV DEBIAN_FRONTEND=noninteractive
24 | # Set additional environment values that make usage more pleasant.
25 | ENV TERM=xterm-256color
26 | 
27 | # Install the system packages in a single layer using apt-get (the APT front
28 | # end intended for scripts), then remove the package lists from the layer.
29 | RUN apt-get update \
30 |  && apt-get install --fix-missing --no-install-recommends --yes \
31 |     ca-certificates \
32 |     wget \
33 |     apt-transport-https \
34 |     software-properties-common \
35 |     python3 \
36 |     python3-pip \
37 |     icu-devtools \
38 |     curl \
39 |     git \
40 |  && apt-get autoremove --yes \
41 |  && apt-get purge --yes \
42 |  && rm -rf /var/lib/apt/lists/*
43 | 
44 | # Copy the client script into the image's current working directory.
45 | COPY client.py .
46 | 
47 | ENTRYPOINT [ "/bin/bash" ]
48 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/containers/triton_trt-llm.containerfile:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # Build-time defaults; override any of these with --build-arg.
16 | ARG BASE_CONTAINER_IMAGE=nvcr.io/nvidia/tritonserver:24.08-trtllm-python-py3
17 | ARG ENGINE_DEST_PATH=/var/run/engines
18 | ARG HF_HOME=/var/run/cache
19 | 
20 | FROM ${BASE_CONTAINER_IMAGE}
21 | 
22 | # ARGs declared before FROM are out of scope inside the build stage.
23 | # Redeclare them (without values) so the LABEL and ENV instructions below
24 | # can see the values declared above instead of expanding to empty strings.
25 | ARG BASE_CONTAINER_IMAGE
26 | ARG ENGINE_DEST_PATH
27 | ARG HF_HOME
28 | 
29 | # Record the base image and the role of this image as labels.
30 | LABEL "base"="${BASE_CONTAINER_IMAGE}"
31 | LABEL "role"="server"
32 | 
33 | # Stop APT (Debian package manager) from complaining about interactivity.
34 | ENV DEBIAN_FRONTEND=noninteractive
35 | # Set additional environment values that make usage more pleasant.
36 | ENV TERM=xterm-256color
37 | 
38 | # Set the Triton CLI environment variables which control where TRT-LLM
39 | # engine and model files are downloaded to, and where the Hugging Face
40 | # cache is located.
41 | ENV ENGINE_DEST_PATH=${ENGINE_DEST_PATH}
42 | ENV HF_HOME=${HF_HOME}
43 | 
44 | # Set the active working directory.
45 | WORKDIR /workspace
46 | 
47 | # Copy the server script.
48 | COPY server.py .
49 | 
50 | # Print the installed APT packages into the build log.
51 | # NOTE(review): `pip list --version` appears to print pip's own version rather
52 | # than the installed Python packages; confirm whether `pip list` was intended.
53 | RUN apt list --installed \
54 |  && pip list --version
55 | 
56 | ENTRYPOINT [ "/bin/bash" ]
57 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/images/grafana-dashboard.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/images/grafana-dashboard.png


--------------------------------------------------------------------------------
/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/images/grafana_import-dashboard.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/images/grafana_import-dashboard.png


--------------------------------------------------------------------------------
/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/images/grafana_new-dashboard.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/images/grafana_new-dashboard.png


--------------------------------------------------------------------------------
/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/images/graph_gpu-utilization.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/images/graph_gpu-utilization.png


--------------------------------------------------------------------------------
/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/images/graph_queue-compute-ratio.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/images/graph_queue-compute-ratio.png


--------------------------------------------------------------------------------
/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/setup_ssh-nfs.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Pod  # Standalone pod; presumably a helper for preparing the shared model volume (TODO confirm).
 3 | metadata:
 4 |   name: setup-ssh-nfs
 5 |   labels:
 6 |     app: setup-ssh-nfs
 7 | spec:
 8 |   containers:
 9 |   - name: triton
10 |     image: nvcr.io/nvidia/tritonserver:24.08-trtllm-python-py3
11 |     command: ["sleep", "infinity"]  # Keeps the container idle instead of starting a server.
12 |     resources:
13 |       limits:
14 |         nvidia.com/gpu: 4
15 |       requests:
16 |         nvidia.com/gpu: 4
17 |     volumeMounts:
18 |       - mountPath: /var/run/models  # Backed by the persistent volume claim declared below.
19 |         name: model-repository
20 |       - mountPath: /dev/shm  # Backed by the memory-medium emptyDir declared below.
21 |         name: dshm
22 |   volumes:
23 |     - name: model-repository
24 |       persistentVolumeClaim:
25 |         claimName: nfs-claim-autoscaling-2  # This claim must already exist in the pod's namespace.
26 |     - name: dshm
27 |       emptyDir:
28 |         medium: Memory
29 |         sizeLimit: 512Gi
30 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/triton-metrics_prometheus-rule.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | apiVersion: monitoring.coreos.com/v1
16 | kind: PrometheusRule
17 | metadata:
18 |   name: triton-metrics
19 |   labels:
20 |     app.kubernetes.io/component: autoscaler
21 |     release: prometheus
22 | spec:
23 |   groups:
24 |   - name: autoscaling
25 |     interval: 6s
26 |     rules:
27 |       # Average number of microseconds inference requests take to compute after unqueueing (not including cache hits).
28 |     - expr: rate(nv_inference_compute_infer_duration_us[1m])/clamp_min(rate(nv_inference_request_success[1m]),1)
29 |       record: triton:compute_duration:average
30 |       # Average number of microseconds inference requests spend in the queue before being processed (not including cache hits).
31 |     - expr: rate(nv_inference_queue_duration_us[1m])/clamp_min(rate(nv_inference_request_success[1m]),1)
32 |       record: triton:queue_duration:average
33 |       # Average number of microseconds inference requests take in total (not including cache hits).
34 |     - expr: rate(nv_inference_request_duration_us[1m])/clamp_min(rate(nv_inference_request_success[1m]),1)
35 |       record: triton:request_duration:average
36 |       # Ratio of the time inference requests spend in the queue to the time they spend computing (not including cache hits).
37 |     - expr: rate(nv_inference_queue_duration_us[1m])/clamp_min(rate(nv_inference_compute_infer_duration_us[1m]),1)
38 |       record: triton:queue_compute:ratio
39 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/.gitignore:
--------------------------------------------------------------------------------
1 | .vscode/
2 | **/.vscode/
3 | 
4 | dev_*
5 | **/dev_*
6 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/.gitignore:
--------------------------------------------------------------------------------
1 | dev_values.yaml
2 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/Chart.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | apiVersion: v2
16 | appVersion: 0.1.0
17 | description: Generative AI Multi-Node w/ Triton and TensorRT-LLM Guide/Tutorial
18 | icon: https://www.nvidia.com/content/dam/en-zz/Solutions/about-nvidia/logo-and-brand/01-nvidia-logo-vert-500x200-2c50-d@2x.png
19 | name: triton_trt-llm_multi-node_example
20 | version: 0.1.0
21 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/gpt2_values.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | gpu: Tesla-V100-SXM2-16GB
16 | 
17 | model:
18 |   name: gpt2
19 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/llama-2-70b_values.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # See values.yaml for reference values.
16 | 
17 | gpu: NVIDIA-A10G
18 | 
19 | model:
20 |   name: llama-2-70b
21 |   tensorrtLlm:
22 |     conversion:
23 |       gpu: 8
24 |       memory: 256Gi
25 |     parallelism:
26 |       tensor: 8
27 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/llama-2-7b-chat_values.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # See values.yaml for reference values.
16 | 
17 | gpu: Tesla-V100-SXM2-16GB
18 | 
19 | model:
20 |   name: llama-2-7b-chat
21 |   tensorrtLlm:
22 |     conversion:
23 |       gpu: 2
24 |       memory: 64Gi
25 |     parallelism:
26 |       tensor: 2
27 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/llama-2-7b_values.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # See values.yaml for reference values.
16 | 
17 | gpu: Tesla-V100-SXM2-16GB
18 | 
19 | model:
20 |   name: llama-2-7b
21 |   tensorrtLlm:
22 |     conversion:
23 |       gpu: 2
24 |       memory: 64Gi
25 |     parallelism:
26 |       tensor: 2
27 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/llama-3-70b-instruct_values.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # See values.yaml for reference values.
16 | 
17 | gpu: NVIDIA-A10G
18 | 
19 | model:
20 |   name: llama-3-70b-instruct
21 |   tensorrtLlm:
22 |     conversion:
23 |       gpu: 8
24 |       memory: 256Gi
25 |     parallelism:
26 |       tensor: 8
27 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/llama-3-8b-instruct_values.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # See values.yaml for reference values.
16 | 
17 | gpu: Tesla-V100-SXM2-16GB  # overrides the chart's `gpu` value; its exact meaning is defined in values.yaml
18 | 
19 | model:
20 |   name: llama-3-8b-instruct  # model this preset targets
21 |   tensorrtLlm:
22 |     conversion:  # presumably the resources given to the conversion job - confirm in values.yaml
23 |       gpu: 4  # NOTE(review): equals parallelism.tensor below - presumably they must match; confirm
24 |       memory: 128Gi  # Kubernetes-style quantity (Gi = gibibytes)
25 |     parallelism:
26 |       tensor: 4  # tensor-parallel size
27 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/llama-3-8b_values.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # See values.yaml for reference values.
16 | 
17 | gpu: Tesla-V100-SXM2-16GB  # overrides the chart's `gpu` value; its exact meaning is defined in values.yaml
18 | 
19 | model:
20 |   name: llama-3-8b  # model this preset targets
21 |   tensorrtLlm:
22 |     conversion:  # presumably the resources given to the conversion job - confirm in values.yaml
23 |       gpu: 2  # NOTE(review): equals parallelism.tensor below - presumably they must match; confirm
24 |       memory: 64Gi  # Kubernetes-style quantity (Gi = gibibytes)
25 |     parallelism:
26 |       tensor: 2  # tensor-parallel size
27 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/opt125m_values.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # See values.yaml for reference values.
16 | 
17 | gpu: Tesla-V100-SXM2-16GB  # overrides the chart's `gpu` value; its exact meaning is defined in values.yaml
18 | 
19 | model:
20 |   name: opt125m  # no tensorrtLlm overrides in this preset; presumably the defaults in values.yaml apply - confirm
21 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/templates/NOTES.txt:
--------------------------------------------------------------------------------
 1 | {{- $create_account := true }}
 2 | {{- $create_job := true }}
 3 | {{- $create_service := true }}{{- /* All three flags start as true; the chart values inspected below can switch each one off. */ -}}
 4 | {{- with $.Values.model }}
 5 | {{-   if .skipConversion }}
 6 | {{-     $create_job = false }}
 7 | {{-   end }}
 8 | {{- end }}
 9 | {{- with $.Values.kubernetes }}
10 | {{-   if .noService }}
11 | {{-     $create_service = false }}
12 | {{-   end }}
13 | {{-   if .serviceAccount}}{{- /* A service account named in the values means no release-named account is listed in these notes. */ -}}
14 | {{-     $create_account = false }}
15 | {{-   end }}
16 | {{- end }}
17 | 
18 | {{ $.Chart.Name }} ({{ $.Chart.Version }}) installation complete.
19 | 
20 | Release Name: {{ $.Release.Name }}
21 | Namespace: {{ $.Release.Namespace }}
22 | Deployment Name: {{ $.Release.Name }}
23 | {{- if $create_job }}
24 | Conversion Job: {{ $.Release.Name }}
25 | {{- end }}
26 | {{- if $create_service }}
27 | Service Name: {{ $.Release.Name }}
28 | {{- end }}
29 | {{- if $create_account }}
30 | ServiceAccount Name: {{ $.Release.Name }}
31 | {{- end }}
32 | 
33 | Helpful commands:
34 | 
35 |   $ helm status --namespace={{ $.Release.Namespace }} {{ $.Release.Name }}
36 |   $ helm get --namespace={{ $.Release.Namespace }} all {{ $.Release.Name }}
37 |   $ kubectl get --namespace={{ $.Release.Namespace }} --selector='app={{ $.Release.Name }}' deployments
38 | {{- if $create_job -}}{{- /* The trim markers on both sides glue each resource kind onto the kubectl line above, forming one comma-separated list. */ -}}
39 | ,jobs
40 | {{- end -}}
41 | ,pods
42 | {{- if $create_service -}}
43 | ,services
44 | {{- end -}}
45 | ,podmonitors
46 | {{- if $create_account -}}
47 | ,serviceAccounts
48 | {{- end -}}
49 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/templates/pod-monitor.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | apiVersion: monitoring.coreos.com/v1
16 | kind: PodMonitor  # scrape configuration for the pods matched by spec.selector in this manifest
17 | metadata:
18 |   name: {{ $.Release.Name }}
19 |   labels:
20 |     app: {{ $.Release.Name }}
21 |     app.kubernetes.io/component: monitor
22 |     release: prometheus  # NOTE(review): presumably the label the Prometheus installation uses to discover monitors - confirm
23 | {{- with $.Values.kubernetes }}
24 | {{-   with .labels }}
25 | {{      toYaml . | indent 4 }}
26 | {{-   end }}
27 | {{- end }}
28 | spec:
29 |   selector:
30 |     matchLabels:  # a pod must carry both labels to be selected
31 |       app: {{ $.Release.Name }}
32 |       app.kubernetes.io/component: server
33 |   podMetricsEndpoints:
34 |   - port: metrics  # referenced by name; service.yaml in this chart also names its 8002 port "metrics"
35 |     path: /metrics
36 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/templates/rbac.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | {{- $service_account := 0 }}{{- /* The Role, ServiceAccount and RoleBinding in this file are rendered only when kubernetes.serviceAccount is unset. */ -}}
16 | {{- with $.Values.kubernetes }}
17 | {{-   with .serviceAccount }}
18 | {{-     $service_account = . }}
19 | {{-   end }}
20 | {{- end }}
21 | {{- if not $service_account }}
22 | apiVersion: rbac.authorization.k8s.io/v1
23 | kind: Role  # namespaced permissions: read-only access to the listed resources, plus pods/exec
24 | metadata:
25 |   labels:
26 | {{-   with $.Values.kubernetes }}
27 | {{-     with .labels }}
28 | {{        toYaml . | indent 4 }}
29 | {{-     end }}
30 | {{-   end }}
31 |   name: {{ $.Release.Name }}
32 | rules:
33 | - apiGroups:
34 |   - ''  # core API group (pods, services)
35 |   - apps  # deployments
36 |   - batch  # jobs
37 |   resources:
38 |   - deployments
39 |   - jobs
40 |   - pods
41 |   - pods/status
42 |   - services
43 |   verbs:  # read-only
44 |   - get
45 |   - list
46 | - apiGroups: ['']
47 |   resources:
48 |   - pods/exec  # the permission `kubectl exec` needs; containers/kubessh in this repository invokes `kubectl exec`
49 |   verbs:
50 |   - create
51 | 
52 | ---
53 | 
54 | apiVersion: v1
55 | kind: ServiceAccount  # release-named account that the RoleBinding in this file binds to the Role
56 | metadata:
57 |   labels:
58 | {{-   with $.Values.kubernetes }}
59 | {{-     with .labels }}
60 | {{        toYaml . | indent 4 }}
61 | {{-     end }}
62 | {{-   end }}
63 |   name: {{ $.Release.Name }}
64 | 
65 | ---
66 | 
67 | apiVersion: rbac.authorization.k8s.io/v1
68 | kind: RoleBinding  # grants the Role to the ServiceAccount; all three objects share the release name
69 | metadata:
70 |   labels:
71 | {{-   with $.Values.kubernetes }}
72 | {{-     with .labels }}
73 | {{        toYaml . | indent 4 }}
74 | {{-     end }}
75 | {{-   end }}
76 |   name: {{ $.Release.Name }}
77 | subjects:
78 | - kind: ServiceAccount
79 |   name: {{ $.Release.Name }}
80 | roleRef:
81 |   apiGroup: rbac.authorization.k8s.io
82 |   kind: Role
83 |   name: {{ $.Release.Name }}
84 | {{- end }}
85 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/templates/service.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | {{- $noService := false }}{{- /* Replaced by kubernetes.noService when that value is set and truthy. */ -}}
16 | {{- with $.Values.kubernetes }}
17 | {{-   with .noService }}
18 | {{-     $noService = . }}
19 | {{-   end }}
20 | {{- end }}
21 | {{- if $noService }}
22 | # Chart values optioned to not create a service. Service not created.
23 | {{- else }}
24 | apiVersion: v1
25 | kind: Service  # exposes the http, grpc and metrics ports of the selected pod
26 | metadata:
27 |   name: {{ $.Release.Name }}
28 |   labels:
29 |     app: {{ $.Release.Name }}
30 |     app.kubernetes.io/component: service
31 | {{- with $.Values.kubernetes }}
32 | {{-   with .labels }}
33 | {{      toYaml . | indent 4 }}
34 | {{-   end }}
35 | {{- end }}
36 | spec:
37 |   ports:
38 |   - name: http
39 |     port: 8000
40 |     targetPort: http  # target ports are given by name, not number
41 |   - name: grpc
42 |     port: 8001
43 |     targetPort: grpc
44 |   - name: metrics
45 |     port: 8002
46 |     targetPort: metrics
47 |   selector:
48 |     app: {{ $.Release.Name }}
49 |     app.kubernetes.io/component: server
50 |     pod-rank: {{ 0 | quote}}  # renders as the string "0": only pods labeled pod-rank "0" back this service
51 |   type: ClusterIP  # reachable only from inside the cluster
52 | {{- end }}
53 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/containers/README.md:
--------------------------------------------------------------------------------
 1 | <!---
 2 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | --->
16 | 
17 | 
18 | # Container Generation
19 | 
20 | The files in this folder are intended to be used to create the Triton Server container image.
21 | 
22 | Run the following command to create a Triton Server container image.
23 | 
24 | ```bash
25 | docker build --file ./triton_trt-llm.containerfile --tag <image_name_here> .
26 | ```
27 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/containers/kubessh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License");
 6 | # you may not use this file except in compliance with the License.
 7 | # You may obtain a copy of the License at
 8 | #
 9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 
17 | pod=$1
18 | shift
19 | kubectl exec $pod  -- /bin/sh -c "$*"
20 | 


--------------------------------------------------------------------------------
/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/pvc.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | apiVersion: v1
16 | kind: PersistentVolumeClaim  # claim bound to the PV named by `volumeName` at the bottom of this manifest
17 | metadata:
18 |   name: model-volume
19 | spec:
20 |   accessModes:
21 |   # The PVC must support multiple, concurrent readers and writers.
22 |   # This is because multiple pods will be mapped to the PVC as each worker pod needs access to the model's data.
23 |   # Additionally, multiple models could be converted in parallel by concurrent conversion jobs.
24 |   - ReadWriteMany
25 |   resources:
26 |     requests:
27 |       # This size does not need to match the PV's `spec.capacity.storage` value, but not doing so will prevent utilization of the entire PV.
28 |       storage: 512Gi
29 |   # Depending on your storage class provider, this value should be empty or the value specified by the provider.
30 |   # Please read your provider's documentation when determining this value.
31 |   storageClassName: ""
32 |   # This value must be an exact match for the PV's `metadata.name` property.
33 |   volumeName: model-volume
34 | 


--------------------------------------------------------------------------------
/Feature_Guide/Data_Pipelines/img/Flow.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/Feature_Guide/Data_Pipelines/img/Flow.PNG


--------------------------------------------------------------------------------
/Feature_Guide/Data_Pipelines/model_repository/model1/config.pbtxt:
--------------------------------------------------------------------------------
 1 | # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | #
 3 | # Redistribution and use in source and binary forms, with or without
 4 | # modification, are permitted provided that the following conditions
 5 | # are met:
 6 | #  * Redistributions of source code must retain the above copyright
 7 | #    notice, this list of conditions and the following disclaimer.
 8 | #  * Redistributions in binary form must reproduce the above copyright
 9 | #    notice, this list of conditions and the following disclaimer in the
10 | #    documentation and/or other materials provided with the distribution.
11 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
12 | #    contributors may be used to endorse or promote products derived
13 | #    from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 
27 | name: "model1"
28 | backend: "python"  # served by Triton's Python backend
29 | max_batch_size: 8  # batching enabled, so the dims below describe one item without the batch dimension
30 | 
31 | input [  # five inputs, one per data type; each has a same-typed, same-shaped output in the output list
32 |   {
33 |     name: "model_1_input_string"
34 |     data_type: TYPE_STRING
35 |     dims: [-1]  # -1 marks a variable-size dimension
36 |   },
37 |   {
38 |     name: "model_1_input_UINT8_array"
39 |     data_type: TYPE_UINT8
40 |     dims: [-1]
41 |   },
42 |   {
43 |     name: "model_1_input_INT8_array"
44 |     data_type: TYPE_INT8
45 |     dims: [-1]
46 |   },
47 |   {
48 |     name: "model_1_input_FP32_image"
49 |     data_type: TYPE_FP32
50 |     dims: [-1, -1, -1]  # three variable-size dimensions; axis order is not stated here
51 |   },
52 |   {
53 |     name: "model_1_input_bool"
54 |     data_type: TYPE_BOOL
55 |     dims: [-1]
56 |   }
57 | ]
58 | output [  # mirrors the input list entry for entry (same data types and dims)
59 |   {
60 |     name: "model_1_output_string"
61 |     data_type: TYPE_STRING
62 |     dims: [-1]
63 |   },
64 |   {
65 |     name: "model_1_output_UINT8_array"
66 |     data_type: TYPE_UINT8
67 |     dims: [-1]
68 |   },
69 |   {
70 |     name: "model_1_output_INT8_array"
71 |     data_type: TYPE_INT8
72 |     dims: [-1]
73 |   },
74 |   {
75 |     name: "model_1_output_FP32_image"
76 |     data_type: TYPE_FP32
77 |     dims: [-1, -1, -1]
78 |   },
79 |   {
80 |     name: "model_1_output_bool"
81 |     data_type: TYPE_BOOL
82 |     dims: [-1]
83 |   }
84 | ]
85 | 


--------------------------------------------------------------------------------
/Feature_Guide/Data_Pipelines/model_repository/model2/config.pbtxt:
--------------------------------------------------------------------------------
 1 | # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | #
 3 | # Redistribution and use in source and binary forms, with or without
 4 | # modification, are permitted provided that the following conditions
 5 | # are met:
 6 | #  * Redistributions of source code must retain the above copyright
 7 | #    notice, this list of conditions and the following disclaimer.
 8 | #  * Redistributions in binary form must reproduce the above copyright
 9 | #    notice, this list of conditions and the following disclaimer in the
10 | #    documentation and/or other materials provided with the distribution.
11 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
12 | #    contributors may be used to endorse or promote products derived
13 | #    from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 
27 | name: "model2"
28 | backend: "python"  # served by Triton's Python backend
29 | max_batch_size: 8  # batching enabled, so the dims below describe one item without the batch dimension
30 | 
31 | input [  # five inputs, one per data type; each has a same-typed, same-shaped output in the output list
32 |   {
33 |     name: "model_2_input_string"
34 |     data_type: TYPE_STRING
35 |     dims: [-1]  # -1 marks a variable-size dimension
36 |   },
37 |   {
38 |     name: "model_2_input_UINT8_array"
39 |     data_type: TYPE_UINT8
40 |     dims: [-1]
41 |   },
42 |   {
43 |     name: "model_2_input_INT8_array"
44 |     data_type: TYPE_INT8
45 |     dims: [-1]
46 |   },
47 |   {
48 |     name: "model_2_input_FP32_image"
49 |     data_type: TYPE_FP32
50 |     dims: [-1, -1, -1]  # three variable-size dimensions; axis order is not stated here
51 |   },
52 |   {
53 |     name: "model_2_input_bool"
54 |     data_type: TYPE_BOOL
55 |     dims: [-1]
56 |   }
57 | ]
58 | output [  # mirrors the input list entry for entry (same data types and dims)
59 |   {
60 |     name: "model_2_output_string"
61 |     data_type: TYPE_STRING
62 |     dims: [-1]
63 |   },
64 |   {
65 |     name: "model_2_output_UINT8_array"
66 |     data_type: TYPE_UINT8
67 |     dims: [-1]
68 |   },
69 |   {
70 |     name: "model_2_output_INT8_array"
71 |     data_type: TYPE_INT8
72 |     dims: [-1]
73 |   },
74 |   {
75 |     name: "model_2_output_FP32_image"
76 |     data_type: TYPE_FP32
77 |     dims: [-1, -1, -1]
78 |   },
79 |   {
80 |     name: "model_2_output_bool"
81 |     data_type: TYPE_BOOL
82 |     dims: [-1]
83 |   }
84 | ]
85 | 


--------------------------------------------------------------------------------
/Feature_Guide/Speculative_Decoding/vLLM/model_repository/base_model/1/model.json:
--------------------------------------------------------------------------------
1 | {
2 |     "model": "/hf-models/Meta-Llama-3-8B-Instruct"
3 | }
4 | 


--------------------------------------------------------------------------------
/Feature_Guide/Speculative_Decoding/vLLM/model_repository/base_model/config.pbtxt:
--------------------------------------------------------------------------------
 1 | # Copyright 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | #
 3 | # Redistribution and use in source and binary forms, with or without
 4 | # modification, are permitted provided that the following conditions
 5 | # are met:
 6 | #  * Redistributions of source code must retain the above copyright
 7 | #    notice, this list of conditions and the following disclaimer.
 8 | #  * Redistributions in binary form must reproduce the above copyright
 9 | #    notice, this list of conditions and the following disclaimer in the
10 | #    documentation and/or other materials provided with the distribution.
11 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
12 | #    contributors may be used to endorse or promote products derived
13 | #    from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 
27 | # Note: You do not need to change any fields in this configuration.
28 | 
29 | backend: "vllm"  # NOTE(review): engine options presumably come from the sibling 1/model.json - confirm
30 | 
31 | # The usage of device is deferred to the vLLM engine
32 | instance_group [
33 |   {
34 |     count: 1  # a single instance of this model
35 |     kind: KIND_MODEL  # no device is pinned here; see the comment above instance_group
36 |   }
37 | ]
38 | 


--------------------------------------------------------------------------------
/Feature_Guide/Speculative_Decoding/vLLM/model_repository/eagle_model/1/model.json:
--------------------------------------------------------------------------------
1 | {
2 |     "model": "/hf-models/Meta-Llama-3-8B-Instruct",
3 |     "speculative_model": "/hf-models/EAGLE-LLaMA3-Instruct-8B",
4 |     "speculative_draft_tensor_parallel_size": 1,
5 |     "num_speculative_tokens": 5
6 | }
7 | 


--------------------------------------------------------------------------------
/Feature_Guide/Speculative_Decoding/vLLM/model_repository/eagle_model/config.pbtxt:
--------------------------------------------------------------------------------
 1 | # Copyright 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | #
 3 | # Redistribution and use in source and binary forms, with or without
 4 | # modification, are permitted provided that the following conditions
 5 | # are met:
 6 | #  * Redistributions of source code must retain the above copyright
 7 | #    notice, this list of conditions and the following disclaimer.
 8 | #  * Redistributions in binary form must reproduce the above copyright
 9 | #    notice, this list of conditions and the following disclaimer in the
10 | #    documentation and/or other materials provided with the distribution.
11 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
12 | #    contributors may be used to endorse or promote products derived
13 | #    from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 
27 | # Note: You do not need to change any fields in this configuration.
28 | 
29 | backend: "vllm"  # NOTE(review): engine and speculative-decoding options presumably come from the sibling 1/model.json - confirm
30 | 
31 | # The usage of device is deferred to the vLLM engine
32 | instance_group [
33 |   {
34 |     count: 1  # a single instance of this model
35 |     kind: KIND_MODEL  # no device is pinned here; see the comment above instance_group
36 |   }
37 | ]
38 | 


--------------------------------------------------------------------------------
/Feature_Guide/Speculative_Decoding/vLLM/model_repository/opt_model/1/model.json:
--------------------------------------------------------------------------------
1 | {
2 |     "model": "facebook/opt-6.7b",
3 |     "speculative_model": "facebook/opt-125m",
4 |     "tensor_parallel_size": 1,
5 |     "num_speculative_tokens": 5
6 | }
7 | 


--------------------------------------------------------------------------------
/Feature_Guide/Speculative_Decoding/vLLM/model_repository/opt_model/config.pbtxt:
--------------------------------------------------------------------------------
 1 | # Copyright 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | #
 3 | # Redistribution and use in source and binary forms, with or without
 4 | # modification, are permitted provided that the following conditions
 5 | # are met:
 6 | #  * Redistributions of source code must retain the above copyright
 7 | #    notice, this list of conditions and the following disclaimer.
 8 | #  * Redistributions in binary form must reproduce the above copyright
 9 | #    notice, this list of conditions and the following disclaimer in the
10 | #    documentation and/or other materials provided with the distribution.
11 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
12 | #    contributors may be used to endorse or promote products derived
13 | #    from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 
27 | # Note: You do not need to change any fields in this configuration.
28 | 
29 | backend: "vllm"  # NOTE(review): engine and speculative-decoding options presumably come from the sibling 1/model.json - confirm
30 | 
31 | # The usage of device is deferred to the vLLM engine
32 | instance_group [
33 |   {
34 |     count: 1  # a single instance of this model
35 |     kind: KIND_MODEL  # no device is pinned here; see the comment above instance_group
36 |   }
37 | ]
38 | 


--------------------------------------------------------------------------------
/HuggingFace/ensemble_model_repository/preprocessing/1/model.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | #
 3 | # Redistribution and use in source and binary forms, with or without
 4 | # modification, are permitted provided that the following conditions
 5 | # are met:
 6 | #  * Redistributions of source code must retain the above copyright
 7 | #    notice, this list of conditions and the following disclaimer.
 8 | #  * Redistributions in binary form must reproduce the above copyright
 9 | #    notice, this list of conditions and the following disclaimer in the
10 | #    documentation and/or other materials provided with the distribution.
11 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
12 | #    contributors may be used to endorse or promote products derived
13 | #    from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 
27 | import numpy as np
28 | import triton_python_backend_utils as pb_utils
29 | from transformers import ViTFeatureExtractor
30 | 
31 | 
32 | class TritonPythonModel:
33 |     def initialize(self, args):
34 |         self.feature_extractor = ViTFeatureExtractor.from_pretrained(
35 |             "google/vit-base-patch16-224-in21k"
36 |         )
37 | 
38 |     def execute(self, requests):
39 |         responses = []
40 |         for request in requests:
41 |             inp = pb_utils.get_input_tensor_by_name(request, "image")
42 |             input_image = np.squeeze(inp.as_numpy()).transpose((2, 0, 1))
43 | 
44 |             inputs = self.feature_extractor(images=input_image, return_tensors="pt")
45 |             pixel_values = inputs["pixel_values"].numpy()
46 | 
47 |             inference_response = pb_utils.InferenceResponse(
48 |                 output_tensors=[
49 |                     pb_utils.Tensor(
50 |                         "pixel_values",
51 |                         pixel_values,
52 |                     )
53 |                 ]
54 |             )
55 |             responses.append(inference_response)
56 |         return responses
57 | 


--------------------------------------------------------------------------------
/HuggingFace/ensemble_model_repository/preprocessing/config.pbtxt:
--------------------------------------------------------------------------------
 1 | # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | #
 3 | # Redistribution and use in source and binary forms, with or without
 4 | # modification, are permitted provided that the following conditions
 5 | # are met:
 6 | #  * Redistributions of source code must retain the above copyright
 7 | #    notice, this list of conditions and the following disclaimer.
 8 | #  * Redistributions in binary form must reproduce the above copyright
 9 | #    notice, this list of conditions and the following disclaimer in the
10 | #    documentation and/or other materials provided with the distribution.
11 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
12 | #    contributors may be used to endorse or promote products derived
13 | #    from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 
27 | name: "preprocessing"
28 | backend: "python"
29 | max_batch_size: 8
30 | 
31 | input [
32 |   {
33 |     name: "image"
34 |     data_type: TYPE_FP32
35 |     dims: [-1, -1, -1]
36 |   }
37 | ]
38 | output [
39 |   {
40 |     name: "pixel_values"
41 |     data_type: TYPE_FP32
42 |     dims: [-1, -1, -1]
43 |   }
44 | ]
45 | 
46 | instance_group [
47 |   {
48 |     kind: KIND_GPU
49 |   }
50 | ]
51 | 


--------------------------------------------------------------------------------
/HuggingFace/img/Approach.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/HuggingFace/img/Approach.PNG


--------------------------------------------------------------------------------
/HuggingFace/img/netron.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/HuggingFace/img/netron.PNG


--------------------------------------------------------------------------------
/HuggingFace/python_model_repository/python_vit/config.pbtxt:
--------------------------------------------------------------------------------
 1 | # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | #
 3 | # Redistribution and use in source and binary forms, with or without
 4 | # modification, are permitted provided that the following conditions
 5 | # are met:
 6 | #  * Redistributions of source code must retain the above copyright
 7 | #    notice, this list of conditions and the following disclaimer.
 8 | #  * Redistributions in binary form must reproduce the above copyright
 9 | #    notice, this list of conditions and the following disclaimer in the
10 | #    documentation and/or other materials provided with the distribution.
11 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
12 | #    contributors may be used to endorse or promote products derived
13 | #    from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 
27 | name: "python_vit"
28 | backend: "python"
29 | max_batch_size: 8
30 | 
31 | input [
32 |   {
33 |     name: "image"
34 |     data_type: TYPE_FP32
35 |     dims: [-1, -1, -1]
36 |   }
37 | ]
38 | output [
39 |   {
40 |     name: "last_hidden_state"
41 |     data_type: TYPE_FP32
42 |     dims: [-1, -1]
43 |   }
44 | ]
45 | 
46 | instance_group [
47 |   {
48 |     kind: KIND_GPU
49 |   }
50 | ]
51 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | Redistribution and use in source and binary forms, with or without
 4 | modification, are permitted provided that the following conditions
 5 | are met:
 6 |  * Redistributions of source code must retain the above copyright
 7 |    notice, this list of conditions and the following disclaimer.
 8 |  * Redistributions in binary form must reproduce the above copyright
 9 |    notice, this list of conditions and the following disclaimer in the
10 |    documentation and/or other materials provided with the distribution.
11 |  * Neither the name of NVIDIA CORPORATION nor the names of its
12 |    contributors may be used to endorse or promote products derived
13 |    from this software without specific prior written permission.
14 | 
15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


--------------------------------------------------------------------------------
/Migration_Guide/img/arch.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/Migration_Guide/img/arch.PNG


--------------------------------------------------------------------------------
/Popular_Models_Guide/Llama2/README.md:
--------------------------------------------------------------------------------
 1 | <!--
 2 | # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 3 | #
 4 | # Redistribution and use in source and binary forms, with or without
 5 | # modification, are permitted provided that the following conditions
 6 | # are met:
 7 | #  * Redistributions of source code must retain the above copyright
 8 | #    notice, this list of conditions and the following disclaimer.
 9 | #  * Redistributions in binary form must reproduce the above copyright
10 | #    notice, this list of conditions and the following disclaimer in the
11 | #    documentation and/or other materials provided with the distribution.
12 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
13 | #    contributors may be used to endorse or promote products derived
14 | #    from this software without specific prior written permission.
15 | #
16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
17 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
20 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
24 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 | -->
28 | 
29 | # Deploying Llama2 Models in Triton
30 | 
31 | There are multiple ways to run Llama2 with Tritonserver.
32 | 1. Infer with [TensorRT-LLM Backend](trtllm_guide.md#infer-with-tensorrt-llm-backend)
33 | 2. Infer with [vLLM Backend](vllm_guide.md#infer-with-vllm-backend)
34 | 3. Infer with [Python-based Backends as a HuggingFace model](../../Quick_Deploy/HuggingFaceTransformers/README.md#deploying-hugging-face-transformer-models-in-triton)
35 | 
36 | ## Pre-build instructions
37 | 
38 | For the tutorials we are assuming that the Llama2 models, weights, and tokens are cloned from the Hugging Face Llama2 repo [here](https://huggingface.co/meta-llama/Llama-2-7b-hf/tree/main).
39 | To run the tutorials, you will need to get permissions for the Llama2 repository as well as access to the Hugging Face CLI.
40 | The CLI uses [User access tokens](https://huggingface.co/docs/hub/security-tokens). The tokens can be found here: [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens).
41 | 


--------------------------------------------------------------------------------
/Popular_Models_Guide/Llama2/llama2vllm/1/model.json:
--------------------------------------------------------------------------------
1 | {
2 |     "model":"meta-llama/Llama-2-7b-hf",
3 |     "trust_remote_code":true,
4 |     "download_dir":"/opt/tritonserver/model_repository/llama2vllm/hf-cache",
5 |     "disable_log_requests": true,
6 |     "gpu_memory_utilization": 0.5
7 | }
8 | 


--------------------------------------------------------------------------------
/Popular_Models_Guide/Llama2/llama2vllm/config.pbtxt:
--------------------------------------------------------------------------------
 1 | # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | #
 3 | # Redistribution and use in source and binary forms, with or without
 4 | # modification, are permitted provided that the following conditions
 5 | # are met:
 6 | #  * Redistributions of source code must retain the above copyright
 7 | #    notice, this list of conditions and the following disclaimer.
 8 | #  * Redistributions in binary form must reproduce the above copyright
 9 | #    notice, this list of conditions and the following disclaimer in the
10 | #    documentation and/or other materials provided with the distribution.
11 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
12 | #    contributors may be used to endorse or promote products derived
13 | #    from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 
27 | # Note: You do not need to change any fields in this configuration.
28 | 
29 | backend: "vllm"
30 | 
31 | # Device selection is left to the vLLM engine, hence KIND_MODEL below
32 | instance_group [
33 |   {
34 |     count: 1
35 |     kind: KIND_MODEL
36 |   }
37 | ]
38 | 


--------------------------------------------------------------------------------
/Popular_Models_Guide/Llava1.5/model_repository/llava-1.5/config.pbtxt:
--------------------------------------------------------------------------------
 1 | model_transaction_policy {
 2 |     decoupled: True
 3 | }
 4 | 
 5 | input [
 6 |     {
 7 |         name: "prompt"
 8 |         data_type: TYPE_STRING
 9 |         dims: [ 1 ]
10 |     },
11 |     {
12 |         name: "image"
13 |         data_type: TYPE_STRING
14 |         dims: [ -1 ]
15 |     },
16 |     {
17 |         name: "max_tokens"
18 |         data_type: TYPE_INT32
19 |         dims: [ 1 ]
20 |         optional: true
21 |     },
22 |     {
23 |         name: "temperature"
24 |         data_type: TYPE_FP32
25 |         dims: [ 1 ]
26 |         optional: true
27 |     },
28 |     {
29 |         name: "top_k"
30 |         data_type: TYPE_INT32
31 |         dims: [ 1 ]
32 |         optional: true
33 |     },
34 |     {
35 |         name: "frequency_penalty"
36 |         data_type: TYPE_FP32
37 |         dims: [ 1 ]
38 |         optional: true
39 |     },
40 |     {
41 |         name: "seed"
42 |         data_type: TYPE_UINT64
43 |         dims: [ 1 ]
44 |         optional: true
45 |     }
46 | ]
47 | 
48 | output [
49 |     {
50 |         name: "text"
51 |         data_type: TYPE_STRING
52 |         dims: [ 1 ]
53 |     },
54 |     {
55 |         name: "finish_reason"
56 |         data_type: TYPE_STRING
57 |         dims: [ 1 ]
58 |     },
59 |     {
60 |         name: "prompt_tokens"
61 |         data_type: TYPE_INT32
62 |         dims: [ 1 ]
63 |     },
64 |     {
65 |         name: "completion_tokens"
66 |         data_type: TYPE_INT32
67 |         dims: [ 1 ]
68 |     },
69 |     {
70 |         name: "total_tokens"
71 |         data_type: TYPE_INT32
72 |         dims: [ 1 ]
73 |     }
74 | ]
75 | 
76 | 
77 | instance_group [
78 |   {
79 |     count: 1
80 |     kind: KIND_GPU
81 |     gpus: [ 0 ]
82 |   }
83 | ]


--------------------------------------------------------------------------------
/Popular_Models_Guide/Llava1.5/model_repository/tensorrt_llm/1/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/Popular_Models_Guide/Llava1.5/model_repository/tensorrt_llm/1/.gitkeep


--------------------------------------------------------------------------------
/Popular_Models_Guide/Llava1.5/model_repository/vision_encoder/config.pbtxt:
--------------------------------------------------------------------------------
 1 | input [
 2 |   {
 3 |     name: "image"
 4 |     data_type: TYPE_FP16
 5 |     dims: [ -1, 3, 336, 336 ]
 6 |   }
 7 | ]
 8 | output [
 9 |   {
10 |     name: "features"
11 |     data_type: TYPE_FP16
12 |     dims: [ 576 , -1]
13 |   }
14 | ]
15 | 
16 | instance_group [
17 |   {
18 |     count: 1
19 |     kind: KIND_GPU
20 |     gpus: [ 0 ]
21 |   }
22 | ]
23 | 


--------------------------------------------------------------------------------
/Popular_Models_Guide/StableDiffusion/diffusion-models/stable_diffusion_1_5/1/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/Popular_Models_Guide/StableDiffusion/diffusion-models/stable_diffusion_1_5/1/.gitkeep


--------------------------------------------------------------------------------
/Popular_Models_Guide/StableDiffusion/diffusion-models/stable_diffusion_xl/1/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/Popular_Models_Guide/StableDiffusion/diffusion-models/stable_diffusion_xl/1/.gitkeep


--------------------------------------------------------------------------------
/Popular_Models_Guide/StableDiffusion/docker/Dockerfile:
--------------------------------------------------------------------------------
 1 | # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | #
 3 | # Redistribution and use in source and binary forms, with or without
 4 | # modification, are permitted provided that the following conditions
 5 | # are met:
 6 | #  * Redistributions of source code must retain the above copyright
 7 | #    notice, this list of conditions and the following disclaimer.
 8 | #  * Redistributions in binary form must reproduce the above copyright
 9 | #    notice, this list of conditions and the following disclaimer in the
10 | #    documentation and/or other materials provided with the distribution.
11 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
12 | #    contributors may be used to endorse or promote products derived
13 | #    from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 
27 | # Base Triton image; override the build args to use another registry or release.
28 | ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver
29 | ARG BASE_IMAGE_TAG=24.01-py3
30 | 
31 | FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS tritonserver-stable-diffusion
32 | 
33 | # TensorRT Python package; keep the version in step with the branch cloned below.
34 | RUN pip install --pre --upgrade --extra-index-url https://pypi.nvidia.com tensorrt-cu12==10.4.0
35 | 
36 | RUN git clone https://github.com/NVIDIA/TensorRT.git -b release/10.4 --single-branch /tmp/TensorRT
37 | 
38 | RUN pip3 install -r /tmp/TensorRT/demo/Diffusion/requirements.txt
39 | 
40 | # Quoted so the shell cannot treat "[all]" as a glob pattern.
41 | RUN pip3 install "tritonclient[all]"
42 | 
43 | # Assemble the diffusion backend: the TensorRT demo code plus this repo's model.py.
44 | RUN mkdir -p /opt/tritonserver/backends/diffusion
45 | 
46 | RUN cp -rf /tmp/TensorRT/demo/Diffusion /opt/tritonserver/backends/diffusion/
47 | 
48 | COPY ./backend/diffusion/model.py /opt/tritonserver/backends/diffusion/model.py
49 | 
50 | COPY ./diffusion-models /workspace/diffusion-models
51 | 


--------------------------------------------------------------------------------
/Popular_Models_Guide/StableDiffusion/docker/Dockerfile.dockerignore:
--------------------------------------------------------------------------------
 1 | # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | #
 3 | # Redistribution and use in source and binary forms, with or without
 4 | # modification, are permitted provided that the following conditions
 5 | # are met:
 6 | #  * Redistributions of source code must retain the above copyright
 7 | #    notice, this list of conditions and the following disclaimer.
 8 | #  * Redistributions in binary form must reproduce the above copyright
 9 | #    notice, this list of conditions and the following disclaimer in the
10 | #    documentation and/or other materials provided with the distribution.
11 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
12 | #    contributors may be used to endorse or promote products derived
13 | #    from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 
27 | **/*.onnx
28 | **/*.plan
29 | **/*.cache/*
30 | **/*onnx*
31 | **/*engine*
32 | **/*pytorch_model*
33 | **/*.pth*
34 | 


--------------------------------------------------------------------------------
/Popular_Models_Guide/StableDiffusion/docs/client_0_generated_image_0_1_5.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/Popular_Models_Guide/StableDiffusion/docs/client_0_generated_image_0_1_5.jpg


--------------------------------------------------------------------------------
/Popular_Models_Guide/StableDiffusion/docs/client_0_generated_image_0_xl.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/Popular_Models_Guide/StableDiffusion/docs/client_0_generated_image_0_xl.jpg


--------------------------------------------------------------------------------
/Popular_Models_Guide/StableDiffusion/scripts/build_models.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -e
 2 | # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 3 | #
 4 | # Redistribution and use in source and binary forms, with or without
 5 | # modification, are permitted provided that the following conditions
 6 | # are met:
 7 | #  * Redistributions of source code must retain the above copyright
 8 | #    notice, this list of conditions and the following disclaimer.
 9 | #  * Redistributions in binary form must reproduce the above copyright
10 | #    notice, this list of conditions and the following disclaimer in the
11 | #    documentation and/or other materials provided with the distribution.
12 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
13 | #    contributors may be used to endorse or promote products derived
14 | #    from this software without specific prior written permission.
15 | #
16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
17 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
20 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
24 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 | 
28 | SOURCE_DIR=$(dirname "$(readlink -f "$0")")
29 | 
30 | # Install the tritonserver in-process API wheel found under /opt/tritonserver/python
31 | find /opt/tritonserver/python -maxdepth 1 -type f -name \
32 |      "tritonserver-*.whl" | xargs -I {} pip3 install --upgrade "{}[all]"
33 | 
34 | 
35 | # Run the Python build script next to this file, forwarding all arguments
36 | 
37 | python3 "$SOURCE_DIR/build_models.py" "$@"
38 | 


--------------------------------------------------------------------------------
/Quick_Deploy/HuggingFaceTransformers/Dockerfile:
--------------------------------------------------------------------------------
 1 | # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | #
 3 | # Redistribution and use in source and binary forms, with or without
 4 | # modification, are permitted provided that the following conditions
 5 | # are met:
 6 | #  * Redistributions of source code must retain the above copyright
 7 | #    notice, this list of conditions and the following disclaimer.
 8 | #  * Redistributions in binary form must reproduce the above copyright
 9 | #    notice, this list of conditions and the following disclaimer in the
10 | #    documentation and/or other materials provided with the distribution.
11 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
12 | #    contributors may be used to endorse or promote products derived
13 | #    from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | FROM nvcr.io/nvidia/tritonserver:23.10-py3
27 | RUN pip install transformers==4.34.0 protobuf==3.20.3 sentencepiece==0.1.99 accelerate==0.23.0 einops==0.6.1
28 | 


--------------------------------------------------------------------------------
/Quick_Deploy/HuggingFaceTransformers/falcon7b/config.pbtxt:
--------------------------------------------------------------------------------
 1 | # Triton backend to use
 2 | backend: "python"
 3 | 
 4 | # Hugging Face model path. Parameters must follow this
 5 | # key/value structure
 6 | parameters: {
 7 |   key: "huggingface_model",
 8 |   value: {string_value: "tiiuae/falcon-7b"}
 9 | }
10 | 
11 | # The maximum number of tokens to generate in response
12 | # to our input
13 | parameters: {
14 |   key: "max_output_length",
15 |   value: {string_value: "15"}
16 | }
17 | 
18 | # Triton should expect as input a single string of set
19 | # length named 'text_input'
20 | input [
21 |   {
22 |     name: "text_input"
23 |     data_type: TYPE_STRING
24 |     dims: [ 1 ]
25 |   }
26 | ]
27 | 
28 | # Triton should expect to respond with a single string
29 | # output of variable length named 'text_output'
30 | output [
31 |   {
32 |     name: "text_output"
33 |     data_type: TYPE_STRING
34 |     dims: [ -1 ]
35 |   }
36 | ]
37 | 


--------------------------------------------------------------------------------
/Quick_Deploy/HuggingFaceTransformers/llama7b/config.pbtxt:
--------------------------------------------------------------------------------
 1 | # Triton backend to use
 2 | backend: "python"
 3 | 
 4 | # Hugging Face model path. Parameters must follow this
 5 | # key/value structure
 6 | parameters: {
 7 |   key: "huggingface_model",
 8 |   value: {string_value: "meta-llama/Llama-2-7b-hf"}
 9 | }
10 | # The maximum number of tokens to generate in response
11 | # to our input
12 | parameters: {
13 |   key: "max_output_length",
14 |   value: {string_value: "15"}
15 | }
16 | 
17 | # Triton should expect as input a single string of set
18 | # length named 'text_input'
19 | input [
20 |   {
21 |     name: "text_input"
22 |     data_type: TYPE_STRING
23 |     dims: [ 1 ]
24 |   }
25 | ]
26 | 
27 | # Triton should expect to respond with a single string
28 | # output of variable length named 'text_output'
29 | output [
30 |   {
31 |     name: "text_output"
32 |     data_type: TYPE_STRING
33 |     dims: [ -1 ]
34 |   }
35 | ]
36 | 


--------------------------------------------------------------------------------
/Quick_Deploy/HuggingFaceTransformers/persimmon8b/config.pbtxt:
--------------------------------------------------------------------------------
 1 | # Triton backend to use
 2 | backend: "python"
 3 | 
 4 | # Hugging Face model path. Parameters must follow this
 5 | # key/value structure
 6 | parameters: {
 7 |   key: "huggingface_model",
 8 |   value: {string_value: "adept/persimmon-8b-base"}
 9 | }
10 | 
11 | # The maximum number of tokens to generate in response
12 | # to our input
13 | parameters: {
14 |   key: "max_output_length",
15 |   value: {string_value: "15"}
16 | }
17 | 
18 | # Triton should expect as input a single string of set
19 | # length named 'text_input'
20 | input [
21 |   {
22 |     name: "text_input"
23 |     data_type: TYPE_STRING
24 |     dims: [ 1 ]
25 |   }
26 | ]
27 | 
28 | # Triton should expect to respond with a single string
29 | # output of variable length named 'text_output'
30 | output [
31 |   {
32 |     name: "text_output"
33 |     data_type: TYPE_STRING
34 |     dims: [ -1 ]
35 |   }
36 | ]
37 | 


--------------------------------------------------------------------------------
/Quick_Deploy/PyTorch/config.pbtxt:
--------------------------------------------------------------------------------
 1 | # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | #
 3 | # Redistribution and use in source and binary forms, with or without
 4 | # modification, are permitted provided that the following conditions
 5 | # are met:
 6 | #  * Redistributions of source code must retain the above copyright
 7 | #    notice, this list of conditions and the following disclaimer.
 8 | #  * Redistributions in binary form must reproduce the above copyright
 9 | #    notice, this list of conditions and the following disclaimer in the
10 | #    documentation and/or other materials provided with the distribution.
11 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
12 | #    contributors may be used to endorse or promote products derived
13 | #    from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 
27 | name: "resnet50"
28 | platform: "pytorch_libtorch"
29 | max_batch_size : 0
30 | input [
31 |   {
32 |     name: "input__0"
33 |     data_type: TYPE_FP32
34 |     dims: [ 3, 224, 224 ]
35 |     reshape { shape: [ 1, 3, 224, 224 ] }
36 |   }
37 | ]
38 | output [
39 |   {
40 |     name: "output__0"
41 |     data_type: TYPE_FP32
42 |     dims: [ 1, 1000 ,1, 1]
43 |     reshape { shape: [ 1, 1000 ] }
44 |   }
45 | ]
46 | 


--------------------------------------------------------------------------------
/Quick_Deploy/PyTorch/export.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | #
 3 | # Redistribution and use in source and binary forms, with or without
 4 | # modification, are permitted provided that the following conditions
 5 | # are met:
 6 | #  * Redistributions of source code must retain the above copyright
 7 | #    notice, this list of conditions and the following disclaimer.
 8 | #  * Redistributions in binary form must reproduce the above copyright
 9 | #    notice, this list of conditions and the following disclaimer in the
10 | #    documentation and/or other materials provided with the distribution.
11 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
12 | #    contributors may be used to endorse or promote products derived
13 | #    from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 
27 | import torch
28 | 
29 | torch.hub._validate_not_a_forked_repo = lambda a, b, c: True
30 | 
31 | model = (
32 |     torch.hub.load("pytorch/vision:v0.10.0", "resnet50", pretrained=True)
33 |     .eval()
34 |     .to("cuda")
35 | )
36 | traced_model = torch.jit.trace(model, torch.randn(1, 3, 224, 224).to("cuda"))
37 | torch.jit.save(traced_model, "model.pt")
38 | 


--------------------------------------------------------------------------------
/Quick_Deploy/TensorFlow/client.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | #
 3 | # Redistribution and use in source and binary forms, with or without
 4 | # modification, are permitted provided that the following conditions
 5 | # are met:
 6 | #  * Redistributions of source code must retain the above copyright
 7 | #    notice, this list of conditions and the following disclaimer.
 8 | #  * Redistributions in binary form must reproduce the above copyright
 9 | #    notice, this list of conditions and the following disclaimer in the
10 | #    documentation and/or other materials provided with the distribution.
11 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
12 | #    contributors may be used to endorse or promote products derived
13 | #    from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 
27 | import numpy as np
28 | import tritonclient.http as httpclient
29 | from tensorflow.keras.applications.resnet50 import preprocess_input
30 | from tensorflow.keras.preprocessing import image
31 | from tritonclient.utils import triton_to_np_dtype
32 | 
33 | 
34 | def process_image(image_path="img1.jpg"):
35 |     img = image.load_img(image_path, target_size=(224, 224))
36 |     x = image.img_to_array(img)
37 |     x = np.expand_dims(x, axis=0)
38 |     return preprocess_input(x)
39 | 
40 | 
41 | transformed_img = process_image()
42 | 
43 | # Setting up client
44 | triton_client = httpclient.InferenceServerClient(url="localhost:8000")
45 | 
46 | inputs = httpclient.InferInput("input_1", transformed_img.shape, datatype="FP32")
47 | inputs.set_data_from_numpy(transformed_img, binary_data=True)
48 | 
49 | output = httpclient.InferRequestedOutput(
50 |     "predictions", binary_data=True, class_count=1000
51 | )
52 | 
53 | # Querying the server
54 | results = triton_client.infer(model_name="resnet50", inputs=[inputs], outputs=[output])
55 | 
56 | predictions = results.as_numpy("predictions")
57 | print(predictions)
58 | 


--------------------------------------------------------------------------------
/Quick_Deploy/TensorFlow/config.pbtxt:
--------------------------------------------------------------------------------
 1 | # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | #
 3 | # Redistribution and use in source and binary forms, with or without
 4 | # modification, are permitted provided that the following conditions
 5 | # are met:
 6 | #  * Redistributions of source code must retain the above copyright
 7 | #    notice, this list of conditions and the following disclaimer.
 8 | #  * Redistributions in binary form must reproduce the above copyright
 9 | #    notice, this list of conditions and the following disclaimer in the
10 | #    documentation and/or other materials provided with the distribution.
11 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
12 | #    contributors may be used to endorse or promote products derived
13 | #    from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 
27 | name: "resnet50"
28 | platform: "tensorflow_savedmodel"
29 | max_batch_size : 0
30 | input [
31 |   {
32 |     name: "input_1"
33 |     data_type: TYPE_FP32
34 |     dims: [-1, 224, 224, 3 ]
35 |   }
36 | ]
37 | output [
38 |   {
39 |     name: "predictions"
40 |     data_type: TYPE_FP32
41 |     dims: [-1, 1000]
42 |   }
43 | ]
44 | 


--------------------------------------------------------------------------------
/Quick_Deploy/TensorFlow/export.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | #
 3 | # Redistribution and use in source and binary forms, with or without
 4 | # modification, are permitted provided that the following conditions
 5 | # are met:
 6 | #  * Redistributions of source code must retain the above copyright
 7 | #    notice, this list of conditions and the following disclaimer.
 8 | #  * Redistributions in binary form must reproduce the above copyright
 9 | #    notice, this list of conditions and the following disclaimer in the
10 | #    documentation and/or other materials provided with the distribution.
11 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
12 | #    contributors may be used to endorse or promote products derived
13 | #    from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 
27 | import tensorflow as tf
28 | from tensorflow.keras.applications.resnet50 import ResNet50
29 | 
30 | # Load model
31 | model = ResNet50(weights="imagenet")
32 | model.save("resnet50_saved_model")
33 | 


--------------------------------------------------------------------------------
/Quick_Deploy/vLLM/.gitignore:
--------------------------------------------------------------------------------
1 | Miniconda*
2 | miniconda
3 | model_repository/vllm/vllm_env.tar.gz
4 | model_repository/vllm/triton_python_backend_stub
5 | python_backend
6 | results.txt
7 | 


--------------------------------------------------------------------------------
/Triton_Inference_Server_Python_API/deps/requirements.txt:
--------------------------------------------------------------------------------
 1 | # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | #
 3 | # Redistribution and use in source and binary forms, with or without
 4 | # modification, are permitted provided that the following conditions
 5 | # are met:
 6 | #  * Redistributions of source code must retain the above copyright
 7 | #    notice, this list of conditions and the following disclaimer.
 8 | #  * Redistributions in binary form must reproduce the above copyright
 9 | #    notice, this list of conditions and the following disclaimer in the
10 | #    documentation and/or other materials provided with the distribution.
11 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
12 | #    contributors may be used to endorse or promote products derived
13 | #    from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 
27 | pyright
28 | pytest
29 | ray[all]==2.36.0
30 | 


--------------------------------------------------------------------------------
/Triton_Inference_Server_Python_API/docker/Dockerfile:
--------------------------------------------------------------------------------
 1 | # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | #
 3 | # Redistribution and use in source and binary forms, with or without
 4 | # modification, are permitted provided that the following conditions
 5 | # are met:
 6 | #  * Redistributions of source code must retain the above copyright
 7 | #    notice, this list of conditions and the following disclaimer.
 8 | #  * Redistributions in binary form must reproduce the above copyright
 9 | #    notice, this list of conditions and the following disclaimer in the
10 | #    documentation and/or other materials provided with the distribution.
11 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
12 | #    contributors may be used to endorse or promote products derived
13 | #    from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 
27 | # Base Triton image; repository and tag can each be overridden with --build-arg.
28 | ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver
29 | ARG BASE_IMAGE_TAG=24.08-py3
30 | 
31 | FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS triton-python-api
32 | 
33 | # Chain with && (not ;) so a failed index refresh fails the build instead of
34 | # letting the install run against a missing or outdated package index.
35 | RUN apt-get update && apt-get install -y gdb
36 | 
37 | # The requirements file is bind-mounted from the build context for this step
38 | # only, so it does not end up in the image.
39 | RUN --mount=type=bind,source=./deps/requirements.txt,target=/tmp/requirements.txt \
40 |     pip install --timeout=2000 --requirement /tmp/requirements.txt
41 | 
42 | # Finish pyright install
43 | 
44 | RUN pyright --help
45 | 
46 | # Install every tritonserver wheel found directly under
47 | # /opt/tritonserver/python, including its [all] extras.
48 | RUN find /opt/tritonserver/python -maxdepth 1 -type f -name \
49 |     "tritonserver-*.whl" | xargs -I {} pip3 install --upgrade {}[all]
50 | 
51 | # grafana
52 | # Refresh the package index inside this layer: the index left behind by the
53 | # gdb layer can be stale when that layer is restored from the build cache.
54 | RUN apt-get update && \
55 |     apt-get install -y adduser libfontconfig1 musl && \
56 |     wget https://dl.grafana.com/enterprise/release/grafana-enterprise_11.2.0_amd64.deb && \
57 |     dpkg -i grafana-enterprise_11.2.0_amd64.deb && \
58 |     rm -rf grafana-enterprise_11.2.0_amd64.deb
59 | 
60 | # Point /bin/sh at bash.
61 | RUN ln -sf /bin/bash /bin/sh
62 | 
63 | 


--------------------------------------------------------------------------------
/Triton_Inference_Server_Python_API/docker/Dockerfile.dockerignore:
--------------------------------------------------------------------------------
 1 | **/*.onnx
 2 | **/*.plan
 3 | **/diffuser-models/*
 4 | **/identity-models/*
 5 | **/scripts/stable_diffusion/models/*/*/*.onnx
 6 | **/scripts/stable_diffusion/models/*/*/*.plan
 7 | **/*.onnx
 8 | **/*.plan
 9 | **/.cache/*
10 | **/*onnx*
11 | **/*engine*
12 | **/*pytorch_model*
13 | **/*.pth*


--------------------------------------------------------------------------------
/Triton_Inference_Server_Python_API/docs/car_sample.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/Triton_Inference_Server_Python_API/docs/car_sample.jpg


--------------------------------------------------------------------------------
/Triton_Inference_Server_Python_API/docs/sample_generated_image.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/triton-inference-server/tutorials/17331012af74eab68ad7c86d8a4ae494272ca4f7/Triton_Inference_Server_Python_API/docs/sample_generated_image.jpg


--------------------------------------------------------------------------------
/Triton_Inference_Server_Python_API/examples/kafka-io/models/tokenizer/1/model.py:
--------------------------------------------------------------------------------
 1 | from typing import Dict, List
 2 | 
 3 | import numpy as np
 4 | import triton_python_backend_utils as pb_utils
 5 | from transformers import BertTokenizerFast, TensorType
 6 | 
 7 | 
 8 | class TritonPythonModel:
 9 |     tokenizer: BertTokenizerFast
10 | 
11 |     def initialize(self, args: Dict[str, str]) -> None:
12 |         """
13 |         Initialize the tokenization process
14 |         :param args: arguments from Triton config file
15 |         """
16 |         self.tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")
17 | 
18 |     def execute(self, requests) -> "List[List[pb_utils.Tensor]]":
19 |         """
20 |         Parse and tokenize each request
21 |         :param requests: 1 or more requests received by Triton server.
22 |         :return: text as input tensors
23 |         """
24 |         responses = []
25 |         # for loop for batch requests (disabled in our case)
26 |         for request in requests:
27 |             # binary data typed back to string
28 |             query = [
29 |                 t.decode("UTF-8")
30 |                 for t in pb_utils.get_input_tensor_by_name(request, "TEXT")
31 |                 .as_numpy()
32 |                 .tolist()
33 |             ]
34 |             tokens: Dict[str, np.ndarray] = self.tokenizer(
35 |                 text=query,
36 |                 return_tensors=TensorType.NUMPY,
37 |                 padding="max_length",
38 |                 max_length=256,
39 |                 truncation=True,
40 |             )
41 |             # tensorrt uses int32 as input type, ort uses int64
42 |             tokens = {k: v.astype(np.int64) for k, v in tokens.items()}
43 |             # communicate the tokenization results to Triton server
44 |             outputs = list()
45 |             for input_name in self.tokenizer.model_input_names:
46 |                 tensor_input = pb_utils.Tensor(input_name, tokens[input_name])
47 |                 outputs.append(tensor_input)
48 | 
49 |             inference_response = pb_utils.InferenceResponse(output_tensors=outputs)
50 |             responses.append(inference_response)
51 | 
52 |         return responses
53 | 
54 |     def finalize(self):
55 |         """`finalize` is called only once when the model is being unloaded.
56 |         Implementing `finalize` function is OPTIONAL. This function allows
57 |         the model to perform any necessary clean ups before exit.
58 |         """
59 |         print("Cleaning up...")
60 | 


--------------------------------------------------------------------------------
/Triton_Inference_Server_Python_API/examples/kafka-io/models/tokenizer/config.pbtxt:
--------------------------------------------------------------------------------
 1 | # Python-backend tokenizer model; Triton-side batching is disabled.
 2 | name: "tokenizer"
 3 | max_batch_size: 0
 4 | backend: "python"
 5 | 
 6 | # One variable-length vector of strings.
 7 | input [
 8 |   {
 9 |     name: "TEXT"
10 |     data_type: TYPE_STRING
11 |     dims: [ -1 ]
12 |   }
13 | ]
14 | 
15 | # Three INT64 tensors, each with a variable first dimension and 256 columns.
16 | output [
17 |   {
18 |     name: "input_ids"
19 |     data_type: TYPE_INT64
20 |     dims: [ -1, 256 ]
21 |   },
22 |   {
23 |     name: "attention_mask"
24 |     data_type: TYPE_INT64
25 |     dims: [ -1, 256 ]
26 |   },
27 |   {
28 |     name: "token_type_ids"
29 |     data_type: TYPE_INT64
30 |     dims: [ -1, 256 ]
31 |   }
32 | ]
33 | 


--------------------------------------------------------------------------------
/Triton_Inference_Server_Python_API/examples/kafka-io/requirements.txt:
--------------------------------------------------------------------------------
 1 | click==8.1.7
 2 | confluent_kafka==2.5.0
 3 | gcn-kafka==0.3.3
 4 | jsonschema==4.23.0
 5 | pandas==2.2.2
 6 | ray==2.32.0
 7 | ray[serve]==2.32.0
 8 | torch==2.3.1
 9 | transformers==4.42.4
10 | tritonclient==2.47.0
11 | 


--------------------------------------------------------------------------------
/Triton_Inference_Server_Python_API/examples/kafka-io/start-kafka.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | # Downloads Kafka 3.7.0, starts a local ZooKeeper and broker, and creates the
 3 | # inference-input / inference-output topics.
 4 | 
 5 | # Stop at the first failing step so a failed download or install is not
 6 | # followed by misleading errors from the commands after it.
 7 | set -e
 8 | 
 9 | export DEBIAN_FRONTEND=noninteractive
10 | 
11 | # NOTE(review): the pinned 3.7.0 release is fetched from the Apache archive,
12 | # which keeps old versions; the rolling download mirror may drop them - confirm.
13 | wget https://archive.apache.org/dist/kafka/3.7.0/kafka_2.13-3.7.0.tgz
14 | tar -xzf kafka_2.13-3.7.0.tgz
15 | cd kafka_2.13-3.7.0
16 | 
17 | echo "Setting up JAVA 17"
18 | apt-get update -q -y
19 | # apt-get rather than apt: apt's command-line interface is not meant for scripts.
20 | apt-get install -q -y openjdk-17-jdk openjdk-17-jre
21 | 
22 | echo "Configuring brokers to localhost for kafka server"
23 | sed -i -e 's/#listeners=PLAINTEXT:\/\/:9092/listeners=PLAINTEXT:\/\/localhost:9092/g' config/server.properties
24 | 
25 | echo "Starting zookeeper"
26 | nohup bin/zookeeper-server-start.sh -daemon config/zookeeper.properties > /dev/null 2>&1 &
27 | sleep 5
28 | echo "Successfully started zookeeper, starting kafka brokers"
29 | nohup bin/kafka-server-start.sh -daemon config/server.properties > /dev/null 2>&1 &
30 | sleep 5
31 | echo "Successfully started kafka brokers, creating input and output topics..."
32 | 
33 | # --if-not-exists keeps a second run from failing on topics that already exist.
34 | bin/kafka-topics.sh --create --if-not-exists --topic inference-input --bootstrap-server localhost:9092
35 | bin/kafka-topics.sh --create --if-not-exists --topic inference-output --bootstrap-server localhost:9092
36 | 
37 | # printf, because whether echo expands "\n" depends on which shell provides /bin/sh.
38 | printf 'Successfully created topics.\nInput topic: inference-input\nOutput topic: inference-output\n'
39 | 
40 | echo "Topic description:"
41 | bin/kafka-topics.sh --describe --topic inference-input --bootstrap-server localhost:9092
42 | bin/kafka-topics.sh --describe --topic inference-output --bootstrap-server localhost:9092
43 | 


--------------------------------------------------------------------------------
/Triton_Inference_Server_Python_API/examples/kafka-io/start-server.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | # Starts the Ray Serve deployment in the background and follows its log.
 3 | 
 4 | export KAFKA_CONSUMER_MAX_WORKER_THREADS=1
 5 | export CONSUMER_CONFIGS='{"bootstrap.servers": "localhost:9092", "security.protocol": "PLAINTEXT", "group.id": "triton-server-kafka-consumer"}'
 6 | export PRODUCER_CONFIGS='{"bootstrap.servers": "localhost:9092", "security.protocol": "PLAINTEXT"}'
 7 | export CONSUMER_TOPICS='inference-input'
 8 | export PRODUCER_TOPIC='inference-output'
 9 | export MODEL_INPUT_NAME='TEXT'
10 | export MODEL_NAME='tokenizer'
11 | export MODEL_REPOSITORY='./models'
12 | 
13 | # Redirect explicitly so the shell creates nohup.out before tail opens it;
14 | # otherwise tail can start first and fail because the file does not exist yet.
15 | # Appending (>>) keeps earlier log content, as nohup's own default does.
16 | nohup serve run tritonserver_deployment:entrypoint >> nohup.out 2>&1 &
17 | tail -f nohup.out
18 | 


--------------------------------------------------------------------------------
/Triton_Inference_Server_Python_API/examples/kafka-io/utils/kafka_consumer.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from collections import deque
 3 | from concurrent.futures import ThreadPoolExecutor
 4 | from multiprocessing import Queue
 5 | from typing import List
 6 | 
 7 | from confluent_kafka import KafkaError, KafkaException
 8 | from gcn_kafka import Consumer
 9 | from ray.serve.handle import DeploymentHandle
10 | 
11 | 
12 | class KafkaConsumer:
13 |     def __init__(
14 |         self,
15 |         config: dict,
16 |         topics: List[str],
17 |         triton_server_handle: DeploymentHandle,
18 |         output_queue: deque,
19 |     ):
20 |         self.config = config
21 |         self.topics = topics
22 |         self.triton_handle = triton_server_handle
23 |         self.output_queue = output_queue
24 | 
25 |     def read(self):
26 |         consumer = Consumer(self.config)
27 |         consumer.subscribe(self.topics)
28 |         self._consume_data(consumer)
29 | 
30 |     def _infer(self, future):
31 |         print("The custom callback was called.")
32 |         result = future.result()
33 |         self.output_queue.append(result.result())
34 |         print(f"Got: {future.result()}")
35 | 
36 |     def _consume_data(self, consumer):
37 |         while True:
38 |             try:
39 |                 msg = consumer.poll(0.1)
40 |                 if not msg:
41 |                     continue
42 |                 if msg.error():
43 |                     print(msg.error())
44 |                     if msg.error().code() == KafkaError._PARTITION_EOF:
45 |                         print(
46 |                             f"End of partition has been reached {msg.topic()}/{msg.partition()}"
47 |                         )
48 |                     else:
49 |                         raise KafkaException(msg.error())
50 |                 print(f"Key: {msg.key()}, Value: {msg.value()}")
51 |                 with ThreadPoolExecutor(
52 |                     max_workers=int(
53 |                         os.environ.get("KAFKA_CONSUMER_MAX_WORKER_THREADS", 1)
54 |                     )
55 |                 ) as executor:
56 |                     future = executor.submit(
57 |                         self.triton_handle.infer.remote, [msg.value()]
58 |                     )
59 |                     future.add_done_callback(self._infer)
60 |             except KeyboardInterrupt as e:
61 |                 print(f"Keyboard Interrupt Received: {e}")
62 |                 break
63 |             except Exception as e:
64 |                 print(f"Exception {e}")
65 |         consumer.close()
66 | 


--------------------------------------------------------------------------------
/Triton_Inference_Server_Python_API/examples/kafka-io/utils/kafka_producer.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | from collections import deque
 3 | from datetime import datetime
 4 | 
 5 | import numpy as np
 6 | from confluent_kafka.serialization import StringSerializer
 7 | from gcn_kafka import Producer
 8 | 
 9 | 
10 | class NumpyEncoder(json.JSONEncoder):
11 |     def default(self, obj):
12 |         if isinstance(obj, np.ndarray):
13 |             return obj.tolist()
14 |         return json.JSONEncoder.default(self, obj)
15 | 
16 | 
17 | class KafkaProducer:
18 |     def __init__(self, config: dict, topic: str, message_queue: deque):
19 |         self.config = config
20 |         self.topics = topic
21 |         self.message_queue = message_queue
22 |         self.serializer = StringSerializer("utf_8")
23 | 
24 |     def send_data(self):
25 |         producer = Producer(self.config)
26 |         self._produce(producer)
27 | 
28 |     def _produce(self, producer):
29 |         def delivery_report(err, msg):
30 |             """
31 |             Reports the failure or success of a message delivery.
32 |             Args:
33 |                  err (KafkaError): The error that occurred on None on success.
34 |                 msg (Message): The message that was produced or failed.
35 |             """
36 |             if err is not None:
37 |                 print(f"Delivery failed for User record {msg.key()}: {err}")
38 |                 return
39 |             print(
40 |                 f"User record successfully produced to {msg.topic()} [{msg.partition()}] at offset {msg.offset()}"
41 |             )
42 | 
43 |         while True:
44 |             producer.poll(0.0)
45 |             try:
46 |                 if self.message_queue.__len__() > 0:
47 |                     producer.produce(
48 |                         topic=self.topics,
49 |                         key=datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.%f"),
50 |                         value=self.serializer(
51 |                             json.dumps(self.message_queue.pop(), cls=NumpyEncoder)
52 |                         ),
53 |                         on_delivery=delivery_report,
54 |                     )
55 |                     producer.flush()
56 |             except KeyboardInterrupt as e:
57 |                 print(f"Keyboard Interrupt received {e}")
58 |                 break
59 |             except Exception as e:
60 |                 print(f"Error while producing the message {e}")
61 |             finally:
62 |                 producer.flush()
63 |         producer.close()
64 | 


--------------------------------------------------------------------------------
/Triton_Inference_Server_Python_API/examples/rayserve/start_ray.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -e
 2 | # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 3 | #
 4 | # Redistribution and use in source and binary forms, with or without
 5 | # modification, are permitted provided that the following conditions
 6 | # are met:
 7 | #  * Redistributions of source code must retain the above copyright
 8 | #    notice, this list of conditions and the following disclaimer.
 9 | #  * Redistributions in binary form must reproduce the above copyright
10 | #    notice, this list of conditions and the following disclaimer in the
11 | #    documentation and/or other materials provided with the distribution.
12 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
13 | #    contributors may be used to endorse or promote products derived
14 | #    from this software without specific prior written permission.
15 | #
16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
17 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
20 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
24 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 | 
28 | ip_address=$(hostname -I | awk '{print $1}')
29 | 
30 | echo $ip_address
31 | 
32 | mkdir -p /tmp/rayserve-demo; cd /tmp/rayserve-demo
33 | 
34 | ray metrics launch-prometheus
35 | 
36 | export RAY_GRAFANA_HOST=http://${ip_address}:3000
37 | 
38 | ray start --head --dashboard-host 0.0.0.0 --metrics-export-port 8080 --disable-usage-stats
39 | 
40 | /usr/share/grafana/bin/grafana-server --homepath /usr/share/grafana --config /tmp/ray/session_latest/metrics/grafana/grafana.ini web >grafana.stdout.log 2>&1 &
41 | 


--------------------------------------------------------------------------------
/Triton_Inference_Server_Python_API/examples/rayserve/stop_ray.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -e
 2 | # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 3 | #
 4 | # Redistribution and use in source and binary forms, with or without
 5 | # modification, are permitted provided that the following conditions
 6 | # are met:
 7 | #  * Redistributions of source code must retain the above copyright
 8 | #    notice, this list of conditions and the following disclaimer.
 9 | #  * Redistributions in binary form must reproduce the above copyright
10 | #    notice, this list of conditions and the following disclaimer in the
11 | #    documentation and/or other materials provided with the distribution.
12 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
13 | #    contributors may be used to endorse or promote products derived
14 | #    from this software without specific prior written permission.
15 | #
16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
17 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
20 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
24 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 | 
28 | # Tear down the Ray cluster and the monitoring processes.
29 | ray stop
30 | # The script runs under `bash -e`, and pkill exits non-zero when no process
31 | # matches; tolerate that so a missing Prometheus does not leave Grafana running.
32 | # The patterns are quoted so the shell cannot expand them as filename globs.
33 | pkill 'prometheus.*' || true
34 | pkill 'grafana.*' || true
35 | 


--------------------------------------------------------------------------------
/Triton_Inference_Server_Python_API/identity-models/identity/config.pbtxt:
--------------------------------------------------------------------------------
 1 | # Model configuration for the Python backend: two optional inputs and two
 2 | # outputs, one STRING and one FP16 each, all with two variable-size dimensions.
 3 | backend: "python"
 4 | 
 5 | input [
 6 |   {
 7 |     name: "string_input"
 8 |     data_type: TYPE_STRING
 9 |     dims: [ -1, -1 ]
10 |     optional: true
11 |   },
12 |   {
13 |     name: "fp16_input"
14 |     data_type: TYPE_FP16
15 |     dims: [ -1, -1 ]
16 |     optional: true
17 |   }
18 | ]
19 | 
20 | output [
21 |   {
22 |     name: "string_output"
23 |     data_type: TYPE_STRING
24 |     dims: [ -1, -1 ]
25 |   },
26 |   {
27 |     name: "fp16_output"
28 |     data_type: TYPE_FP16
29 |     dims: [ -1, -1 ]
30 |   }
31 | ]


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | # Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | #
 3 | # Redistribution and use in source and binary forms, with or without
 4 | # modification, are permitted provided that the following conditions
 5 | # are met:
 6 | #  * Redistributions of source code must retain the above copyright
 7 | #    notice, this list of conditions and the following disclaimer.
 8 | #  * Redistributions in binary form must reproduce the above copyright
 9 | #    notice, this list of conditions and the following disclaimer in the
10 | #    documentation and/or other materials provided with the distribution.
11 | #  * Neither the name of NVIDIA CORPORATION nor the names of its
12 | #    contributors may be used to endorse or promote products derived
13 | #    from this software without specific prior written permission.
14 | #
15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 
27 | [tool.codespell]
28 | # Check against the 'clear' dictionary only (unambiguous spelling mistakes).
29 | builtin = "clear"
30 | # Do not flag short words or typename parameters such as OffsetT.
31 | ignore-regex = '\b(.{1,4}|[A-Z]\w*T)\b'
32 | # Silence warnings about binary files and wrong encoding.
33 | quiet-level = 3
34 | # Only relevant when running codespell interactively: pre-commit passes
35 | # explicit file lists, which this skip list does not override.
36 | skip = "./.git,./.github"
37 | 
38 | [tool.isort]
39 | # Black-compatible import sorting; line length, indentation and wrapping
40 | # options are also set explicitly below.
41 | profile = "black"
42 | line_length = 88
43 | indent = "    "
44 | multi_line_output = 3
45 | force_grid_wrap = 0
46 | use_parentheses = true
47 | include_trailing_comma = true
48 | balanced_wrapping = true
49 | ensure_newline_before_comments = true
50 | skip = ["build"]


--------------------------------------------------------------------------------