├── CMakeLists.txt
├── LICENSE
├── README.md
├── asset
│   ├── Bench_YOLO_V11.JPG
│   ├── Yolo_v11_cpp_tenosrrt.PNG
│   ├── output.gif
│   └── output.mp4
├── include
│   ├── YOLOv11.h
│   ├── common.h
│   ├── cuda_utils.h
│   ├── logging.h
│   ├── macros.h
│   └── preprocess.h
├── main.cpp
└── src
    ├── YOLOv11.cpp
    └── preprocess.cu
/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | cmake_minimum_required(VERSION 3.18)
2 |
3 | # Project declaration with C++ and CUDA support
4 | project(YOLOv11TRT LANGUAGES CXX CUDA)
5 |
6 | # Set C++ standard to C++17
7 | set(CMAKE_CXX_STANDARD 17)
8 | set(CMAKE_CXX_STANDARD_REQUIRED ON)
9 | set(CMAKE_CXX_EXTENSIONS OFF)
10 |
11 | # Define the path to TensorRT installation
12 | set(TENSORRT_PATH "F:/Program Files/TensorRT-8.6.1.6" CACHE PATH "Path to TensorRT installation") # Update this to the actual path for TensorRT
13 |
14 | # Define the path to OpenCV installation (optional; uncomment and point at your build)
15 | # set(OpenCV_DIR "path/to/OpenCV/build")
16 | # Allow overriding TensorRT and OpenCV paths via command line
17 | # e.g., cmake -DTENSORRT_PATH="path/to/TensorRT" -DOpenCV_DIR="path/to/OpenCV" ..
18 | # Note: option() only handles booleans, so TENSORRT_PATH is declared as a CACHE PATH
19 | # above, which -DTENSORRT_PATH=... on the command line overrides.
20 |
21 | # Find OpenCV
22 | find_package(OpenCV REQUIRED)
23 | if(NOT OpenCV_FOUND)
24 | message(FATAL_ERROR "OpenCV not found. Please install OpenCV or set OpenCV_DIR.")
25 | endif()
26 |
27 | # Find CUDA
28 | find_package(CUDA REQUIRED)
29 | if(NOT CUDA_FOUND)
30 | message(FATAL_ERROR "CUDA not found. Please install the CUDA Toolkit.")
31 | endif()
32 |
33 | # Include directories for TensorRT
34 | include_directories(${TENSORRT_PATH}/include)
35 |
36 | # Include directory for your project
37 | include_directories(${CMAKE_SOURCE_DIR}/include)
38 |
39 | # Define source files (including CUDA sources)
40 | set(SOURCES
41 | main.cpp
42 | src/YOLOv11.cpp
43 | src/preprocess.cu
44 | )
45 |
46 | # Create executable (CMake handles CUDA sources automatically)
47 | add_executable(${PROJECT_NAME} ${SOURCES})
48 |
49 | # Define API_EXPORTS macro
50 | target_compile_definitions(${PROJECT_NAME} PRIVATE API_EXPORTS)
51 |
52 | # Specify include directories (modern CMake approach)
53 | target_include_directories(${PROJECT_NAME} PRIVATE
54 | src/
55 | ${OpenCV_INCLUDE_DIRS}
56 | ${CUDA_INCLUDE_DIRS}
57 | ${TENSORRT_PATH}/include
58 | )
59 |
60 | # Link TensorRT libraries
61 | # Specify full paths to TensorRT libraries to avoid relying on link_directories
62 | set(TENSORRT_LIBS
63 | "${TENSORRT_PATH}/lib/nvinfer.lib"
64 | "${TENSORRT_PATH}/lib/nvonnxparser.lib"
65 | "${TENSORRT_PATH}/lib/nvparsers.lib"
66 | "${TENSORRT_PATH}/lib/nvinfer_plugin.lib"
67 | )
68 |
69 | # Link libraries to the target
70 | target_link_libraries(${PROJECT_NAME} PRIVATE
71 | ${OpenCV_LIBS}
72 | ${CUDA_LIBRARIES}
73 | ${TENSORRT_LIBS}
74 | )
75 |
76 | # Enable separable compilation for CUDA (optional but recommended)
77 | set_target_properties(${PROJECT_NAME} PROPERTIES
78 | CUDA_SEPARABLE_COMPILATION ON
79 | )
80 |
81 | # (Optional) Specify CUDA architectures based on your GPU hardware
82 | # set(CMAKE_CUDA_ARCHITECTURES 75) # Example for Turing architecture
83 |
84 | # (Optional) Set output directories for binaries
85 | # set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
86 |
--------------------------------------------------------------------------------
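
A note on overriding paths at configure time: as the comment inside CMakeLists.txt suggests, the TensorRT and OpenCV locations can be passed on the command line instead of editing the file. A typical out-of-source configure might look like this (the paths are illustrative, not prescriptive):

```bash
# Hypothetical install locations; substitute your own.
cmake -DTENSORRT_PATH="F:/Program Files/TensorRT-8.6.1.6" \
      -DOpenCV_DIR="C:/opencv/build" \
      ..
```
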
/LICENSE:
--------------------------------------------------------------------------------
1 | Custom License Agreement
2 |
3 | 1. License Grant: You are hereby granted a non-exclusive, non-transferable license to use, reproduce, and distribute the code (hereinafter referred to as "the Software") under the following conditions:
4 |
5 | 2. Conditions of Use
6 |
7 | Non-Commercial Use: You may use the Software for personal, educational, or non-commercial purposes without any additional permissions.
8 | Commercial Use: Any commercial use of the Software, including but not limited to selling, licensing, or using it in a commercial product, requires prior written permission from the original developer.
9 | 3. Contact Requirement
10 |
11 | If you wish to use the Software for commercial purposes, you must contact the original developer at [https://www.linkedin.com/in/hamdi-boukamcha/] to obtain a commercial license.
12 | The terms of any commercial license will be mutually agreed upon and may involve a licensing fee.
13 | 4. Attribution
14 |
15 | Regardless of whether you are using the Software for commercial or non-commercial purposes, you must provide appropriate credit to the original developer in any distributions or products that use the Software.
16 | 5. Disclaimer of Warranty
17 |
18 | The Software is provided "as is," without warranty of any kind, express or implied, including but not limited to the warranties of merchantability, fitness for a particular purpose, and non-infringement. In no event shall the original developer be liable for any claim, damages, or other liability, whether in an action of contract, tort, or otherwise, arising from, out of, or in connection with the Software or the use or other dealings in the Software.
19 | 6. Governing Law
20 |
21 | This License Agreement shall be governed by and construed in accordance with the laws of France.
22 | By using the Software, you agree to abide by the terms outlined in this License Agreement.
23 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # YOLOv11 C++ TensorRT
2 | 
3 | 
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 | ## 🔍 Overview
14 |
15 | The **YOLOv11 C++ TensorRT Project** is a high-performance object detection solution implemented in **C++** and optimized using **NVIDIA TensorRT**. This project leverages the YOLOv11 model to deliver fast and accurate object detection, utilizing TensorRT to maximize inference efficiency and performance.
16 |
17 | ---
18 |
19 | ## 📢 Updates
20 |
21 | ### Key Features:
22 | - **Model Conversion**: Convert ONNX models to TensorRT engine files to accelerate inference.
23 | - **Inference on Videos**: Efficiently perform object detection on video files.
24 | - **Inference on Images**: Execute object detection on individual images.
25 | - **High Efficiency**: Optimized for real-time object detection using NVIDIA GPUs.
26 | - **Preprocessing with CUDA**: CUDA-enabled preprocessing for faster input handling.
27 |
28 | 
29 |
30 | ---
31 | ## 📂 Project Structure
32 |
33 | YOLOv11-TensorRT/
34 | ├── CMakeLists.txt        # Build configuration for the project
35 | ├── include/              # Header files
36 | ├── src/
37 | │   ├── YOLOv11.cpp       # YOLOv11 implementation
38 | │   └── preprocess.cu     # CUDA preprocessing code
39 | ├── main.cpp              # Main entry point for the application
40 | ├── asset/                # Images and benchmarks for README
41 | └── build/                # Compiled binaries
42 |
43 | ## 🛠️ Setup
44 |
45 | ### Prerequisites
46 |
47 | - **CMake** (version 3.18 or higher)
48 | - **TensorRT** (v8.6.1.6, for optimized YOLOv11 inference)
49 | - **CUDA Toolkit** (v11.7, for GPU acceleration)
50 | - **OpenCV** (v4.10.0, for image and video processing)
51 | - **NVIDIA GPU** (with compute capability 7.5 or higher)
52 |
53 | ### Installation
54 |
55 | 1. Clone the repository:
56 | ```bash
57 | git clone https://github.com/hamdiboukamcha/Yolo-V11-cpp-TensorRT.git
58 | cd Yolo-V11-cpp-TensorRT
59 | ```
60 | 2. Update the TensorRT and OpenCV paths in CMakeLists.txt (or pass them to cmake):
61 | ```cmake
62 | set(TENSORRT_PATH "F:/Program Files/TensorRT-8.6.1.6") # Adjust this to your path
63 | ```
64 | 3. Build the project:
65 | ```bash
66 | mkdir build && cd build
67 | cmake .. && make -j$(nproc)
68 | ```
69 | ## 🚀 Usage
70 |
71 | ### Convert YOLOv11 to ONNX
72 | ```python
73 | from ultralytics import YOLO
74 |
75 | model = YOLO("yolo11s.pt")                 # Load the YOLO model
76 | export_path = model.export(format="onnx")  # Export the model to ONNX format
77 | ```
78 | ### Convert ONNX Model to TensorRT Engine
79 |
80 | To convert an ONNX model to a TensorRT engine file, use the following command:
81 |
82 | ```bash
83 | ./YOLOv11TRT convert path_to_your_model.onnx path_to_your_engine.engine
84 | ```
85 | - `path_to_your_model.onnx`: Path to the ONNX model file.
86 | - `path_to_your_engine.engine`: Path where the TensorRT engine file will be saved.
87 |
88 | ### Run Inference on Video
89 | To run inference on a video, use the following command:
90 |
91 | ```bash
92 | ./YOLOv11TRT infer_video path_to_your_video.mp4 path_to_your_engine.engine
93 | ```
94 | - `path_to_your_video.mp4`: Path to the input video file.
95 | - `path_to_your_engine.engine`: Path to the TensorRT engine file.
96 |
97 | ### Run Inference on Image
98 |
99 | To run inference on an image, use the following command:
100 |
101 | ```bash
102 | ./YOLOv11TRT infer_image path_to_your_image.jpg path_to_your_engine.engine
103 | ```
104 | - `path_to_your_image.jpg`: Path to the input image file.
105 | - `path_to_your_engine.engine`: Path to the TensorRT engine file.
106 |
107 | ## ⚙️ Configuration
108 |
109 | ### CMake Configuration
110 | In CMakeLists.txt, update the paths for TensorRT and OpenCV if they are installed in non-default locations:
111 |
112 | ```cmake
113 | # Define the path to the TensorRT installation
114 | set(TENSORRT_PATH "F:/Program Files/TensorRT-8.6.1.6") # Update this to the actual path
115 | ```
116 |
117 | Ensure that the path points to the directory where TensorRT is installed.
118 |
119 | ### Troubleshooting
120 | - **Cannot find nvinfer.lib**: Ensure that TensorRT is correctly installed and that nvinfer.lib is in the specified path. Update CMakeLists.txt to include the correct path to the TensorRT libraries.
121 |
122 | - **Linker errors**: Verify that all dependencies (OpenCV, CUDA, TensorRT) are correctly installed and that their paths are correctly set in CMakeLists.txt.
123 |
124 | - **Runtime errors**: Ensure that your system has the correct CUDA drivers and that the TensorRT runtime libraries are accessible. Add TensorRT's bin directory to your system PATH.
125 |
126 | ## 📞 Contact
127 |
128 | For advanced inquiries, feel free to contact me on LinkedIn: [Hamdi Boukamcha](https://www.linkedin.com/in/hamdi-boukamcha/)
129 |
130 | ## 📜 Citation
131 |
132 | If you use this code in your research, please cite the repository as follows:
133 |
134 | ```bibtex
135 | @misc{boukamcha2024yolov11,
136 |   author       = {Hamdi Boukamcha},
137 |   title        = {Yolo-V11-cpp-TensorRT},
138 |   year         = {2024},
139 |   publisher    = {GitHub},
140 |   howpublished = {\url{https://github.com/hamdiboukamcha/Yolo-V11-cpp-TensorRT/}},
141 | }
142 | ```
143 |
144 |
145 |
146 |
147 |
--------------------------------------------------------------------------------
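
As a supplement to the README's conversion step: the `convert` mode builds the engine through the bundled ONNX parser, but TensorRT also ships with the `trtexec` command-line tool, which can produce an engine from the same ONNX file. A sketch, with `--fp16` matching the `isFP16` default in src/YOLOv11.cpp (file names are placeholders):

```bash
# Alternative engine build using trtexec, bundled with TensorRT.
trtexec --onnx=yolo11s.onnx --saveEngine=yolo11s.engine --fp16
```
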
/asset/Bench_YOLO_V11.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hamdiboukamcha/Yolo-V11-cpp-TensorRT/988adf14f25d120fae4d971a0f2186cfe69b4e72/asset/Bench_YOLO_V11.JPG
--------------------------------------------------------------------------------
/asset/Yolo_v11_cpp_tenosrrt.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hamdiboukamcha/Yolo-V11-cpp-TensorRT/988adf14f25d120fae4d971a0f2186cfe69b4e72/asset/Yolo_v11_cpp_tenosrrt.PNG
--------------------------------------------------------------------------------
/asset/output.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hamdiboukamcha/Yolo-V11-cpp-TensorRT/988adf14f25d120fae4d971a0f2186cfe69b4e72/asset/output.gif
--------------------------------------------------------------------------------
/asset/output.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hamdiboukamcha/Yolo-V11-cpp-TensorRT/988adf14f25d120fae4d971a0f2186cfe69b4e72/asset/output.mp4
--------------------------------------------------------------------------------
/include/YOLOv11.h:
--------------------------------------------------------------------------------
1 | /**
2 | * @file YOLOv11.h
3 | * @brief Header file for the YOLOv11 object detection model using TensorRT and OpenCV.
4 | *
5 | * This class encapsulates the preprocessing, inference, and postprocessing steps required to
6 | * perform object detection using a YOLOv11 model with TensorRT.
7 | */
8 |
9 | #pragma once
10 |
11 | #include "NvInfer.h"
12 | #include <opencv2/opencv.hpp>
13 |
14 | using namespace nvinfer1;
15 | using namespace std;
16 | using namespace cv;
17 |
18 | /**
19 | * @struct Detection
20 | * @brief A structure representing a detected object.
21 | *
22 | * Contains the confidence score, class ID, and bounding box for a detected object.
23 | */
24 | struct Detection
25 | {
26 | float conf; //!< Confidence score of the detection.
27 | int class_id; //!< Class ID of the detected object.
28 | Rect bbox; //!< Bounding box of the detected object.
29 | };
30 |
31 | /**
32 | * @class YOLOv11
33 | * @brief A class for running YOLOv11 object detection using TensorRT and OpenCV.
34 | *
35 | * This class handles model initialization, inference, and postprocessing to detect objects
36 | * in images.
37 | */
38 | class YOLOv11
39 | {
40 | public:
41 |
42 | /**
43 | * @brief Constructor to initialize the YOLOv11 object.
44 | *
45 | * Loads the model and initializes TensorRT objects.
46 | *
47 | * @param model_path Path to the model engine or ONNX file.
48 | * @param logger Reference to a TensorRT logger for error reporting.
49 | */
50 | YOLOv11(string model_path, nvinfer1::ILogger& logger);
51 |
52 | /**
53 | * @brief Destructor to clean up resources.
54 | *
55 | * Frees the allocated memory and TensorRT resources.
56 | */
57 | ~YOLOv11();
58 |
59 | /**
60 | * @brief Preprocess the input image.
61 | *
62 | * Prepares the image for inference by resizing and normalizing it.
63 | *
64 | * @param image The input image to be preprocessed.
65 | */
66 | void preprocess(Mat& image);
67 |
68 | /**
69 | * @brief Run inference on the preprocessed image.
70 | *
71 | * Executes the TensorRT engine for object detection.
72 | */
73 | void infer();
74 |
75 | /**
76 | * @brief Postprocess the output from the model.
77 | *
78 | * Filters and decodes the raw output from the TensorRT engine into detection results.
79 | *
80 | * @param output A vector to store the detected objects.
81 | */
82 | void postprocess(vector<Detection>& output);
83 |
84 | /**
85 | * @brief Draw the detected objects on the image.
86 | *
87 | * Overlays bounding boxes and class labels on the image for visualization.
88 | *
89 | * @param image The input image where the detections will be drawn.
90 | * @param output A vector of detections to be visualized.
91 | */
92 | void draw(Mat& image, const vector<Detection>& output);
93 |
94 | private:
95 | /**
96 | * @brief Initialize TensorRT components from the given engine file.
97 | *
98 | * @param engine_path Path to the serialized TensorRT engine file.
99 | * @param logger Reference to a TensorRT logger for error reporting.
100 | */
101 | void init(std::string engine_path, nvinfer1::ILogger& logger);
102 |
103 | float* gpu_buffers[2]; //!< The vector of device buffers needed for engine execution.
104 | float* cpu_output_buffer; //!< Pointer to the output buffer on the host.
105 |
106 | cudaStream_t stream; //!< CUDA stream for asynchronous execution.
107 | IRuntime* runtime; //!< The TensorRT runtime used to deserialize the engine.
108 | ICudaEngine* engine; //!< The TensorRT engine used to run the network.
109 | IExecutionContext* context; //!< The context for executing inference using an ICudaEngine.
110 |
111 | // Model parameters
112 | int input_w; //!< Width of the input image.
113 | int input_h; //!< Height of the input image.
114 | int num_detections; //!< Number of detections output by the model.
115 | int detection_attribute_size; //!< Size of each detection attribute.
116 | int num_classes = 80; //!< Number of object classes that can be detected.
117 | const int MAX_IMAGE_SIZE = 4096 * 4096; //!< Maximum allowed input image size.
118 | float conf_threshold = 0.3f; //!< Confidence threshold for filtering detections.
119 | float nms_threshold = 0.4f; //!< Non-Maximum Suppression (NMS) threshold for filtering overlapping boxes.
120 |
121 | vector<Scalar> colors; //!< A vector of colors for drawing bounding boxes.
122 |
123 | /**
124 | * @brief Build the TensorRT engine from the ONNX model.
125 | *
126 | * @param onnxPath Path to the ONNX file.
127 | * @param logger Reference to a TensorRT logger for error reporting.
128 | */
129 | void build(std::string onnxPath, nvinfer1::ILogger& logger);
130 |
131 | /**
132 | * @brief Save the TensorRT engine to a file.
133 | *
134 | * @param filename Path to save the serialized engine.
135 | * @return True if the engine was saved successfully, false otherwise.
136 | */
137 | bool saveEngine(const std::string& filename);
138 | };
139 |
--------------------------------------------------------------------------------
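
For orientation, here is a minimal sketch of how this class is driven, mirroring the call sequence in main.cpp (the engine and image paths are placeholders):

```cpp
#include "YOLOv11.h"
#include "logging.h" // Logger from this repo

int run_example() {
    Logger logger;                               // TensorRT logger (logging.h)
    YOLOv11 detector("yolo11s.engine", logger);  // deserializes the engine
    cv::Mat image = cv::imread("input.jpg");     // placeholder input image

    detector.preprocess(image);                  // CUDA resize + normalize into the GPU buffer
    detector.infer();                            // execute the TensorRT engine
    std::vector<Detection> detections;
    detector.postprocess(detections);            // decode outputs + NMS
    detector.draw(image, detections);            // overlay boxes and labels

    cv::imwrite("output.jpg", image);
    return 0;
}
```
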
/include/common.h:
--------------------------------------------------------------------------------
1 | const std::vector<std::string> CLASS_NAMES = {
2 | "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train",
3 | "truck", "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench",
4 | "bird", "cat", "dog", "horse", "sheep", "cow", "elephant",
5 | "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie",
6 | "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat",
7 | "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup",
8 | "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich",
9 | "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake",
10 | "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv",
11 | "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven",
12 | "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors",
13 | "teddy bear", "hair drier", "toothbrush" };
14 |
15 | const std::vector<std::vector<unsigned int>> COLORS = {
16 | {0, 114, 189}, {217, 83, 25}, {237, 177, 32}, {126, 47, 142}, {119, 172, 48}, {77, 190, 238},
17 | {162, 20, 47}, {76, 76, 76}, {153, 153, 153}, {255, 0, 0}, {255, 128, 0}, {191, 191, 0},
18 | {0, 255, 0}, {0, 0, 255}, {170, 0, 255}, {85, 85, 0}, {85, 170, 0}, {85, 255, 0},
19 | {170, 85, 0}, {170, 170, 0}, {170, 255, 0}, {255, 85, 0}, {255, 170, 0}, {255, 255, 0},
20 | {0, 85, 128}, {0, 170, 128}, {0, 255, 128}, {85, 0, 128}, {85, 85, 128}, {85, 170, 128},
21 | {85, 255, 128}, {170, 0, 128}, {170, 85, 128}, {170, 170, 128}, {170, 255, 128}, {255, 0, 128},
22 | {255, 85, 128}, {255, 170, 128}, {255, 255, 128}, {0, 85, 255}, {0, 170, 255}, {0, 255, 255},
23 | {85, 0, 255}, {85, 85, 255}, {85, 170, 255}, {85, 255, 255}, {170, 0, 255}, {170, 85, 255},
24 | {170, 170, 255}, {170, 255, 255}, {255, 0, 255}, {255, 85, 255}, {255, 170, 255}, {85, 0, 0},
25 | {128, 0, 0}, {170, 0, 0}, {212, 0, 0}, {255, 0, 0}, {0, 43, 0}, {0, 85, 0},
26 | {0, 128, 0}, {0, 170, 0}, {0, 212, 0}, {0, 255, 0}, {0, 0, 43}, {0, 0, 85},
27 | {0, 0, 128}, {0, 0, 170}, {0, 0, 212}, {0, 0, 255}, {0, 0, 0}, {36, 36, 36},
28 | {73, 73, 73}, {109, 109, 109}, {146, 146, 146}, {182, 182, 182}, {219, 219, 219}, {0, 114, 189},
29 | {80, 183, 189}, {128, 128, 0} };
--------------------------------------------------------------------------------
/include/cuda_utils.h:
--------------------------------------------------------------------------------
1 | #ifndef TRTX_CUDA_UTILS_H_
2 | #define TRTX_CUDA_UTILS_H_
3 |
4 | #include <cuda_runtime_api.h>
5 |
6 | #ifndef CUDA_CHECK
7 | #define CUDA_CHECK(callstr)\
8 | {\
9 | cudaError_t error_code = callstr;\
10 | if (error_code != cudaSuccess) {\
11 | std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__;\
12 | assert(0);\
13 | }\
14 | }
15 | #endif // CUDA_CHECK
16 |
17 | #endif // TRTX_CUDA_UTILS_H_
18 |
--------------------------------------------------------------------------------
/include/logging.h:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | #ifndef TENSORRT_LOGGING_H
18 | #define TENSORRT_LOGGING_H
19 |
20 | #include "NvInferRuntimeCommon.h"
21 | #include <cassert>
22 | #include <ctime>
23 | #include <iomanip>
24 | #include <iostream>
25 | #include <ostream>
26 | #include <sstream>
27 | #include <string>
28 | #include "macros.h"
29 |
30 | using Severity = nvinfer1::ILogger::Severity;
31 |
32 | class LogStreamConsumerBuffer : public std::stringbuf
33 | {
34 | public:
35 | LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
36 | : mOutput(stream)
37 | , mPrefix(prefix)
38 | , mShouldLog(shouldLog)
39 | {
40 | }
41 |
42 | LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
43 | : mOutput(other.mOutput)
44 | {
45 | }
46 |
47 | ~LogStreamConsumerBuffer()
48 | {
49 | // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
50 | // std::streambuf::pptr() gives a pointer to the current position of the output sequence
51 | // if the pointer to the beginning is not equal to the pointer to the current position,
52 | // call putOutput() to log the output to the stream
53 | if (pbase() != pptr())
54 | {
55 | putOutput();
56 | }
57 | }
58 |
59 | // synchronizes the stream buffer and returns 0 on success
60 | // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
61 | // resetting the buffer and flushing the stream
62 | virtual int sync()
63 | {
64 | putOutput();
65 | return 0;
66 | }
67 |
68 | void putOutput()
69 | {
70 | if (mShouldLog)
71 | {
72 | // prepend timestamp
73 | std::time_t timestamp = std::time(nullptr);
74 | tm* tm_local = std::localtime(&timestamp);
75 | std::cout << "[";
76 | std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/";
77 | std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/";
78 | std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-";
79 | std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":";
80 | std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":";
81 | std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] ";
82 | // std::stringbuf::str() gets the string contents of the buffer
83 | // insert the buffer contents pre-appended by the appropriate prefix into the stream
84 | mOutput << mPrefix << str();
85 | // set the buffer to empty
86 | str("");
87 | // flush the stream
88 | mOutput.flush();
89 | }
90 | }
91 |
92 | void setShouldLog(bool shouldLog)
93 | {
94 | mShouldLog = shouldLog;
95 | }
96 |
97 | private:
98 | std::ostream& mOutput;
99 | std::string mPrefix;
100 | bool mShouldLog;
101 | };
102 |
103 | //!
104 | //! \class LogStreamConsumerBase
105 | //! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer
106 | //!
107 | class LogStreamConsumerBase
108 | {
109 | public:
110 | LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog)
111 | : mBuffer(stream, prefix, shouldLog)
112 | {
113 | }
114 |
115 | protected:
116 | LogStreamConsumerBuffer mBuffer;
117 | };
118 |
119 | //!
120 | //! \class LogStreamConsumer
121 | //! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages.
122 | //! Order of base classes is LogStreamConsumerBase and then std::ostream.
123 | //! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field
124 | //! in LogStreamConsumer and then the address of the buffer is passed to std::ostream.
125 | //! This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream.
126 | //! Please do not change the order of the parent classes.
127 | //!
128 | class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream
129 | {
130 | public:
131 | //! \brief Creates a LogStreamConsumer which logs messages with level severity.
132 | //! Reportable severity determines if the messages are severe enough to be logged.
133 | LogStreamConsumer(Severity reportableSeverity, Severity severity)
134 | : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity)
135 | , std::ostream(&mBuffer) // links the stream buffer with the stream
136 | , mShouldLog(severity <= reportableSeverity)
137 | , mSeverity(severity)
138 | {
139 | }
140 |
141 | LogStreamConsumer(LogStreamConsumer&& other)
142 | : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog)
143 | , std::ostream(&mBuffer) // links the stream buffer with the stream
144 | , mShouldLog(other.mShouldLog)
145 | , mSeverity(other.mSeverity)
146 | {
147 | }
148 |
149 | void setReportableSeverity(Severity reportableSeverity)
150 | {
151 | mShouldLog = mSeverity <= reportableSeverity;
152 | mBuffer.setShouldLog(mShouldLog);
153 | }
154 |
155 | private:
156 | static std::ostream& severityOstream(Severity severity)
157 | {
158 | return severity >= Severity::kINFO ? std::cout : std::cerr;
159 | }
160 |
161 | static std::string severityPrefix(Severity severity)
162 | {
163 | switch (severity)
164 | {
165 | case Severity::kINTERNAL_ERROR: return "[F] ";
166 | case Severity::kERROR: return "[E] ";
167 | case Severity::kWARNING: return "[W] ";
168 | case Severity::kINFO: return "[I] ";
169 | case Severity::kVERBOSE: return "[V] ";
170 | default: assert(0); return "";
171 | }
172 | }
173 |
174 | bool mShouldLog;
175 | Severity mSeverity;
176 | };
177 |
178 | //! \class Logger
179 | //!
180 | //! \brief Class which manages logging of TensorRT tools and samples
181 | //!
182 | //! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
183 | //! and supports logging two types of messages:
184 | //!
185 | //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
186 | //! - Test pass/fail messages
187 | //!
188 | //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
189 | //! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
190 | //!
191 | //! In the future, this class could be extended to support dumping test results to a file in some standard format
192 | //! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
193 | //!
194 | //! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
195 | //! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
196 | //! library and messages coming from the sample.
197 | //!
198 | //! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
199 | //! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
200 | //! object.
201 |
202 | class Logger : public nvinfer1::ILogger
203 | {
204 | public:
205 | Logger(Severity severity = Severity::kWARNING)
206 | : mReportableSeverity(severity)
207 | {
208 | }
209 |
210 | //!
211 | //! \enum TestResult
212 | //! \brief Represents the state of a given test
213 | //!
214 | enum class TestResult
215 | {
216 | kRUNNING, //!< The test is running
217 | kPASSED, //!< The test passed
218 | kFAILED, //!< The test failed
219 | kWAIVED //!< The test was waived
220 | };
221 |
222 | //!
223 | //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger
224 | //! \return The nvinfer1::ILogger associated with this Logger
225 | //!
226 | //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
227 | //! we can eliminate the inheritance of Logger from ILogger
228 | //!
229 | nvinfer1::ILogger& getTRTLogger()
230 | {
231 | return *this;
232 | }
233 |
234 | //!
235 | //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
236 | //!
237 | //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
238 | //! inheritance from nvinfer1::ILogger
239 | //!
240 | void log(Severity severity, const char* msg) TRT_NOEXCEPT override
241 | {
242 | LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl;
243 | }
244 |
245 | //!
246 | //! \brief Method for controlling the verbosity of logging output
247 | //!
248 | //! \param severity The logger will only emit messages that have severity of this level or higher.
249 | //!
250 | void setReportableSeverity(Severity severity)
251 | {
252 | mReportableSeverity = severity;
253 | }
254 |
255 | //!
256 | //! \brief Opaque handle that holds logging information for a particular test
257 | //!
258 | //! This object is an opaque handle to information used by the Logger to print test results.
259 | //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
260 | //! with Logger::reportTest{Start,End}().
261 | //!
262 | class TestAtom
263 | {
264 | public:
265 | TestAtom(TestAtom&&) = default;
266 |
267 | private:
268 | friend class Logger;
269 |
270 | TestAtom(bool started, const std::string& name, const std::string& cmdline)
271 | : mStarted(started)
272 | , mName(name)
273 | , mCmdline(cmdline)
274 | {
275 | }
276 |
277 | bool mStarted;
278 | std::string mName;
279 | std::string mCmdline;
280 | };
281 |
282 | //!
283 | //! \brief Define a test for logging
284 | //!
285 | //! \param[in] name The name of the test. This should be a string starting with
286 | //! "TensorRT" and containing dot-separated strings containing
287 | //! the characters [A-Za-z0-9_].
288 | //! For example, "TensorRT.sample_googlenet"
289 | //! \param[in] cmdline The command line used to reproduce the test
290 | //
291 | //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
292 | //!
293 | static TestAtom defineTest(const std::string& name, const std::string& cmdline)
294 | {
295 | return TestAtom(false, name, cmdline);
296 | }
297 |
298 | //!
299 | //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
300 | //! as input
301 | //!
302 | //! \param[in] name The name of the test
303 | //! \param[in] argc The number of command-line arguments
304 | //! \param[in] argv The array of command-line arguments (given as C strings)
305 | //!
306 | //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
307 | static TestAtom defineTest(const std::string& name, int argc, char const* const* argv)
308 | {
309 | auto cmdline = genCmdlineString(argc, argv);
310 | return defineTest(name, cmdline);
311 | }
312 |
313 | //!
314 | //! \brief Report that a test has started.
315 | //!
316 | //! \pre reportTestStart() has not been called yet for the given testAtom
317 | //!
318 | //! \param[in] testAtom The handle to the test that has started
319 | //!
320 | static void reportTestStart(TestAtom& testAtom)
321 | {
322 | reportTestResult(testAtom, TestResult::kRUNNING);
323 | assert(!testAtom.mStarted);
324 | testAtom.mStarted = true;
325 | }
326 |
327 | //!
328 | //! \brief Report that a test has ended.
329 | //!
330 | //! \pre reportTestStart() has been called for the given testAtom
331 | //!
332 | //! \param[in] testAtom The handle to the test that has ended
333 | //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
334 | //! TestResult::kFAILED, TestResult::kWAIVED
335 | //!
336 | static void reportTestEnd(const TestAtom& testAtom, TestResult result)
337 | {
338 | assert(result != TestResult::kRUNNING);
339 | assert(testAtom.mStarted);
340 | reportTestResult(testAtom, result);
341 | }
342 |
343 | static int reportPass(const TestAtom& testAtom)
344 | {
345 | reportTestEnd(testAtom, TestResult::kPASSED);
346 | return EXIT_SUCCESS;
347 | }
348 |
349 | static int reportFail(const TestAtom& testAtom)
350 | {
351 | reportTestEnd(testAtom, TestResult::kFAILED);
352 | return EXIT_FAILURE;
353 | }
354 |
355 | static int reportWaive(const TestAtom& testAtom)
356 | {
357 | reportTestEnd(testAtom, TestResult::kWAIVED);
358 | return EXIT_SUCCESS;
359 | }
360 |
361 | static int reportTest(const TestAtom& testAtom, bool pass)
362 | {
363 | return pass ? reportPass(testAtom) : reportFail(testAtom);
364 | }
365 |
366 | Severity getReportableSeverity() const
367 | {
368 | return mReportableSeverity;
369 | }
370 |
371 | private:
372 | //!
373 | //! \brief returns an appropriate string for prefixing a log message with the given severity
374 | //!
375 | static const char* severityPrefix(Severity severity)
376 | {
377 | switch (severity)
378 | {
379 | case Severity::kINTERNAL_ERROR: return "[F] ";
380 | case Severity::kERROR: return "[E] ";
381 | case Severity::kWARNING: return "[W] ";
382 | case Severity::kINFO: return "[I] ";
383 | case Severity::kVERBOSE: return "[V] ";
384 | default: assert(0); return "";
385 | }
386 | }
387 |
388 | //!
389 | //! \brief returns an appropriate string for prefixing a test result message with the given result
390 | //!
391 | static const char* testResultString(TestResult result)
392 | {
393 | switch (result)
394 | {
395 | case TestResult::kRUNNING: return "RUNNING";
396 | case TestResult::kPASSED: return "PASSED";
397 | case TestResult::kFAILED: return "FAILED";
398 | case TestResult::kWAIVED: return "WAIVED";
399 | default: assert(0); return "";
400 | }
401 | }
402 |
403 | //!
404 | //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
405 | //!
406 | static std::ostream& severityOstream(Severity severity)
407 | {
408 | return severity >= Severity::kINFO ? std::cout : std::cerr;
409 | }
410 |
411 | //!
412 | //! \brief method that implements logging test results
413 | //!
414 | static void reportTestResult(const TestAtom& testAtom, TestResult result)
415 | {
416 | severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
417 | << testAtom.mCmdline << std::endl;
418 | }
419 |
420 | //!
421 | //! \brief generate a command line string from the given (argc, argv) values
422 | //!
423 | static std::string genCmdlineString(int argc, char const* const* argv)
424 | {
425 | std::stringstream ss;
426 | for (int i = 0; i < argc; i++)
427 | {
428 | if (i > 0)
429 | ss << " ";
430 | ss << argv[i];
431 | }
432 | return ss.str();
433 | }
434 |
435 | Severity mReportableSeverity;
436 | };
437 |
438 | namespace
439 | {
440 |
441 | //!
442 | //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE
443 | //!
444 | //! Example usage:
445 | //!
446 | //! LOG_VERBOSE(logger) << "hello world" << std::endl;
447 | //!
448 | inline LogStreamConsumer LOG_VERBOSE(const Logger& logger)
449 | {
450 | return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE);
451 | }
452 |
453 | //!
454 | //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO
455 | //!
456 | //! Example usage:
457 | //!
458 | //! LOG_INFO(logger) << "hello world" << std::endl;
459 | //!
460 | inline LogStreamConsumer LOG_INFO(const Logger& logger)
461 | {
462 | return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO);
463 | }
464 |
465 | //!
466 | //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING
467 | //!
468 | //! Example usage:
469 | //!
470 | //! LOG_WARN(logger) << "hello world" << std::endl;
471 | //!
472 | inline LogStreamConsumer LOG_WARN(const Logger& logger)
473 | {
474 | return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING);
475 | }
476 |
477 | //!
478 | //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR
479 | //!
480 | //! Example usage:
481 | //!
482 | //! LOG_ERROR(logger) << "hello world" << std::endl;
483 | //!
484 | inline LogStreamConsumer LOG_ERROR(const Logger& logger)
485 | {
486 | return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR);
487 | }
488 |
489 | //!
490 | //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR
491 | // ("fatal" severity)
492 | //!
493 | //! Example usage:
494 | //!
495 | //! LOG_FATAL(logger) << "hello world" << std::endl;
496 | //!
497 | inline LogStreamConsumer LOG_FATAL(const Logger& logger)
498 | {
499 | return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR);
500 | }
501 |
502 | } // anonymous namespace
503 |
504 | #endif // TENSORRT_LOGGING_H
--------------------------------------------------------------------------------
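
Usage follows the stream helpers declared in the anonymous namespace at the bottom of the header; a small sketch:

```cpp
#include "logging.h"

void logging_example() {
    // Report kINFO and above; getTRTLogger() hands the same object to
    // TensorRT (e.g., to createInferRuntime or createInferBuilder).
    Logger gLogger(Severity::kINFO);
    nvinfer1::ILogger& trtLogger = gLogger.getTRTLogger();

    LOG_INFO(gLogger) << "building engine" << std::endl;
    LOG_WARN(gLogger) << "FP16 not supported on this GPU" << std::endl;
}
```
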
/include/macros.h:
--------------------------------------------------------------------------------
1 | #ifndef __MACROS_H
2 | #define __MACROS_H
3 |
4 | #ifdef API_EXPORTS
5 | #if defined(_MSC_VER)
6 | #define API __declspec(dllexport)
7 | #else
8 | #define API __attribute__((visibility("default")))
9 | #endif
10 | #else
11 |
12 | #if defined(_MSC_VER)
13 | #define API __declspec(dllimport)
14 | #else
15 | #define API
16 | #endif
17 | #endif // API_EXPORTS
18 |
19 | #if NV_TENSORRT_MAJOR >= 8
20 | #define TRT_NOEXCEPT noexcept
21 | #define TRT_CONST_ENQUEUE const
22 | #else
23 | #define TRT_NOEXCEPT
24 | #define TRT_CONST_ENQUEUE
25 | #endif
26 |
27 | #endif // __MACROS_H
--------------------------------------------------------------------------------
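
The API macro is driven by the API_EXPORTS definition that CMakeLists.txt adds via target_compile_definitions. A hypothetical annotated declaration shows the intent (this function does not exist in the project; it is purely illustrative):

```cpp
#include "macros.h"

// Expands to __declspec(dllexport) under MSVC when API_EXPORTS is defined,
// __declspec(dllimport) for consumers built without it, and to a
// default-visibility attribute (or nothing) on GCC/Clang.
API void detect_objects(const char* engine_path, const char* image_path);
```
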
/include/preprocess.h:
--------------------------------------------------------------------------------
1 | /**
2 | * @file cuda_preprocess.h
3 | * @brief Header file for CUDA-based image preprocessing functions.
4 | *
5 | * This file contains functions for initializing, destroying, and running image preprocessing
6 | * using CUDA for accelerating operations like resizing and data format conversion.
7 | */
8 |
9 | #pragma once
10 |
11 | #include <opencv2/opencv.hpp>
12 | #include <cuda_runtime.h>
13 | #include <cstdint>
14 |
15 | /**
16 | * @brief Initialize CUDA resources for image preprocessing.
17 | *
18 | * Allocates resources and sets up the necessary environment for performing image preprocessing
19 | * on the GPU. This function should be called once before using any preprocessing functions.
20 | *
21 | * @param max_image_size The maximum image size (in pixels) that will be processed.
22 | */
23 | void cuda_preprocess_init(int max_image_size);
24 |
25 | /**
26 | * @brief Clean up and release CUDA resources.
27 | *
28 | * Frees any memory and resources allocated during initialization. This function should be
29 | * called when the preprocessing operations are no longer needed.
30 | */
31 | void cuda_preprocess_destroy();
32 |
33 | /**
34 | * @brief Preprocess an image using CUDA.
35 | *
36 | * This function resizes and converts the input image data (from uint8 to float) using CUDA
37 | * for faster processing. The result is stored in a destination buffer, ready for inference.
38 | *
39 | * @param src Pointer to the source image data in uint8 format.
40 | * @param src_width The width of the source image.
41 | * @param src_height The height of the source image.
42 | * @param dst Pointer to the destination buffer to store the preprocessed image in float format.
43 | * @param dst_width The desired width of the output image.
44 | * @param dst_height The desired height of the output image.
45 | * @param stream The CUDA stream to execute the preprocessing operation asynchronously.
46 | */
47 | void cuda_preprocess(uint8_t* src, int src_width, int src_height,
48 | float* dst, int dst_width, int dst_height,
49 | cudaStream_t stream);
50 |
--------------------------------------------------------------------------------
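
A lifecycle sketch, mirroring how src/YOLOv11.cpp drives these three functions (the stream handling and image source are placeholders):

```cpp
#include "preprocess.h"

void preprocess_example(float* gpu_input, int input_w, int input_h) {
    cuda_preprocess_init(4096 * 4096);  // once, matching MAX_IMAGE_SIZE in YOLOv11.h

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    cv::Mat frame = cv::imread("frame.jpg");               // placeholder input
    cuda_preprocess(frame.ptr(), frame.cols, frame.rows,   // host-to-device copy + GPU warp
                    gpu_input, input_w, input_h, stream);
    cudaStreamSynchronize(stream);                         // wait for the kernel to finish

    cudaStreamDestroy(stream);
    cuda_preprocess_destroy();          // once, at shutdown
}
```
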
/main.cpp:
--------------------------------------------------------------------------------
1 | #ifdef _WIN32
2 | #include <windows.h>
3 | #else
4 | #include <unistd.h>
5 | #include <limits.h>
6 | #endif
7 |
8 | #include <iostream>
9 | #include <string>
10 | #include "YOLOv11.h"
11 |
12 |
13 | /**
14 | * @brief Setting up Tensorrt logger
15 | */
16 | class Logger : public nvinfer1::ILogger {
17 | void log(Severity severity, const char* msg) noexcept override {
18 | // Only output logs with severity greater than warning
19 | if (severity <= Severity::kWARNING)
20 | std::cout << msg << std::endl;
21 | }
22 | }logger;
23 |
24 |
25 | int main(int argc, char* argv[]) {
26 |
27 | // Define color codes for terminal output
28 | const std::string RED_COLOR = "\033[31m";
29 | const std::string GREEN_COLOR = "\033[32m";
30 | const std::string YELLOW_COLOR = "\033[33m";
31 | const std::string RESET_COLOR = "\033[0m";
32 |
33 | // Check for valid number of arguments
34 | if (argc < 4 || argc > 5) {
35 | std::cerr << RED_COLOR << "Usage: " << RESET_COLOR << argv[0]
36 | << " [onnx_path]" << std::endl;
37 | std::cerr << YELLOW_COLOR << " - Mode of operation: 'convert', 'infer_video', or 'infer_image'" << RESET_COLOR << std::endl;
38 | std::cerr << YELLOW_COLOR << " - Path to the input video/image or ONNX model" << RESET_COLOR << std::endl;
39 | std::cerr << YELLOW_COLOR << " - Path to the TensorRT engine file" << RESET_COLOR << std::endl;
40 | std::cerr << YELLOW_COLOR << " [onnx_path] - Path to the ONNX model (only for 'convert' mode)" << RESET_COLOR << std::endl;
41 | return 1;
42 | }
43 |
44 | // Parse command-line arguments
45 | std::string mode = argv[1];
46 | std::string inputPath = argv[2];
47 | std::string enginePath = argv[3];
48 | std::string onnxPath;
49 |
50 | // Validate mode and arguments
51 | if (mode == "convert") {
52 | if (argc != 5) { // 'convert' requires onnx_path
53 | std::cerr << RED_COLOR << "Usage for conversion: " << RESET_COLOR << argv[0]
54 | << " convert " << std::endl;
55 | return 1;
56 | }
57 | onnxPath = inputPath; // In 'convert' mode, inputPath is actually onnx_path
58 | }
59 | else if (mode == "infer_video" || mode == "infer_image") {
60 | if (argc != 4) {
61 | std::cerr << RED_COLOR << "Usage for " << mode << ": " << RESET_COLOR << argv[0]
62 | << " " << mode << " " << std::endl;
63 | return 1;
64 | }
65 | }
66 | else {
67 | std::cerr << RED_COLOR << "Invalid mode. Use 'convert', 'infer_video', or 'infer_image'." << RESET_COLOR << std::endl;
68 | return 1;
69 | }
70 |
71 | // Initialize the Logger
72 | Logger logger;
73 |
74 | // Handle 'convert' mode
75 | if (mode == "convert") {
76 | try {
77 | // Initialize YOLOv11 with the ONNX model path
78 | YOLOv11 yolov11(onnxPath, logger);
79 | std::cout << GREEN_COLOR << "Model conversion successful. Engine saved." << RESET_COLOR << std::endl;
80 | }
81 | catch (const std::exception& e) {
82 | std::cerr << RED_COLOR << "Error during model conversion: " << e.what() << RESET_COLOR << std::endl;
83 | return 1;
84 | }
85 | }
86 | // Handle inference modes
87 | else if (mode == "infer_video" || mode == "infer_image") {
88 | try {
89 | // Initialize YOLOv11 with the TensorRT engine path
90 | YOLOv11 yolov11(enginePath, logger);
91 |
92 | if (mode == "infer_video") {
93 | // Open the video file
94 | cv::VideoCapture cap(inputPath);
95 | if (!cap.isOpened()) {
96 | std::cerr << RED_COLOR << "Failed to open video file: " << inputPath << RESET_COLOR << std::endl;
97 | return 1;
98 | }
99 |
100 | // Prepare video writer to save the output (optional)
101 | std::string outputVideoPath = "output_video.avi";
102 | int frame_width = static_cast<int>(cap.get(cv::CAP_PROP_FRAME_WIDTH));
103 | int frame_height = static_cast<int>(cap.get(cv::CAP_PROP_FRAME_HEIGHT));
104 | cv::VideoWriter video(outputVideoPath, cv::VideoWriter::fourcc('M', 'J', 'P', 'G'), 30,
105 | cv::Size(frame_width, frame_height));
106 |
107 | cv::Mat frame;
108 | while (cap.read(frame)) {
109 | // Preprocess the frame
110 | yolov11.preprocess(frame);
111 |
112 | // Perform inference
113 | yolov11.infer();
114 |
115 | // Postprocess to get detections
116 | std::vector<Detection> detections;
117 | yolov11.postprocess(detections);
118 |
119 | // Draw detections on the frame
120 | yolov11.draw(frame, detections);
121 |
122 | // Display the frame (optional)
123 | cv::imshow("Inference", frame);
124 | if (cv::waitKey(1) == 27) { // Exit on 'ESC' key
125 | break;
126 | }
127 |
128 | // Write the frame to the output video
129 | video.write(frame);
130 | }
131 |
132 | cap.release();
133 | video.release();
134 | cv::destroyAllWindows();
135 | std::cout << GREEN_COLOR << "Video inference completed. Output saved to "
136 | << outputVideoPath << RESET_COLOR << std::endl;
137 | }
138 | else if (mode == "infer_image") {
139 | // Read the image
140 | cv::Mat image = cv::imread(inputPath);
141 | if (image.empty()) {
142 | std::cerr << RED_COLOR << "Failed to read image: " << inputPath << RESET_COLOR << std::endl;
143 | return 1;
144 | }
145 |
146 | // Preprocess the image
147 | yolov11.preprocess(image);
148 |
149 | // Perform inference
150 | yolov11.infer();
151 |
152 | // Postprocess to get detections
153 | std::vector<Detection> detections;
154 | yolov11.postprocess(detections);
155 |
156 | // Draw detections on the image
157 | yolov11.draw(image, detections);
158 |
159 | // Display the image (optional)
160 | cv::imshow("Inference", image);
161 | cv::waitKey(0); // Wait indefinitely until a key is pressed
162 |
163 | // Save the output image
164 | std::string outputImagePath = "output_image.jpg";
165 | cv::imwrite(outputImagePath, image);
166 | std::cout << GREEN_COLOR << "Image inference completed. Output saved to "
167 | << outputImagePath << RESET_COLOR << std::endl;
168 | }
169 | }
170 | catch (const std::exception& e) {
171 | std::cerr << RED_COLOR << "Error during inference: " << e.what() << RESET_COLOR << std::endl;
172 | return 1;
173 | }
174 | }
175 |
176 | return 0;
177 | }
--------------------------------------------------------------------------------
/src/YOLOv11.cpp:
--------------------------------------------------------------------------------
1 | #include "YOLOv11.h" // Header file for YOLOv11 class
2 | #include "logging.h" // Logging utilities
3 | #include "cuda_utils.h" // CUDA utility functions
4 | #include "macros.h" // Common macros
5 | #include "preprocess.h" // Preprocessing functions
6 | #include <NvOnnxParser.h> // NVIDIA ONNX parser for TensorRT
7 | #include "common.h" // Common definitions and utilities
8 | #include <fstream> // File stream operations
9 | #include <iostream> // Input/output stream operations
10 |
11 | // Initialize a static logger instance
12 | static Logger logger;
13 |
14 | // Define whether to use FP16 precision
15 | #define isFP16 true
16 |
17 | // Define whether to perform model warmup
18 | #define warmup true
19 |
20 | // Constructor for the YOLOv11 class
21 | YOLOv11::YOLOv11(string model_path, nvinfer1::ILogger& logger)
22 | {
23 | // Check if the model path does not contain ".onnx"
24 | if (model_path.find(".onnx") == std::string::npos)
25 | {
26 | // Initialize the engine from a serialized engine file
27 | init(model_path, logger);
28 | }
29 | else
30 | {
31 | // Build the engine from an ONNX model
32 | build(model_path, logger);
33 | // Save the built engine to a file
34 | saveEngine(model_path);
35 | }
36 |
37 | // Handle input dimensions based on TensorRT version
38 | #if NV_TENSORRT_MAJOR < 10
39 | // For TensorRT versions less than 10, get binding dimensions directly
40 | auto input_dims = engine->getBindingDimensions(0);
41 | input_h = input_dims.d[2];
42 | input_w = input_dims.d[3];
43 | #else
44 | // For TensorRT versions 10 and above, use getTensorShape
45 | auto input_dims = engine->getTensorShape(engine->getIOTensorName(0));
46 | input_h = input_dims.d[2];
47 | input_w = input_dims.d[3];
48 | #endif
49 | }
50 |
51 | // Initialize the engine from a serialized engine file
52 | void YOLOv11::init(std::string engine_path, nvinfer1::ILogger& logger)
53 | {
54 | // Open the engine file in binary mode
55 | ifstream engineStream(engine_path, ios::binary);
56 | // Move to the end to determine file size
57 | engineStream.seekg(0, ios::end);
58 | const size_t modelSize = engineStream.tellg();
59 | // Move back to the beginning of the file
60 | engineStream.seekg(0, ios::beg);
61 | // Allocate memory to read the engine data
62 | unique_ptr<char[]> engineData(new char[modelSize]);
63 | // Read the engine data into memory
64 | engineStream.read(engineData.get(), modelSize);
65 | engineStream.close();
66 |
67 | // Create a TensorRT runtime instance
68 | runtime = createInferRuntime(logger);
69 | // Deserialize the CUDA engine from the engine data
70 | engine = runtime->deserializeCudaEngine(engineData.get(), modelSize);
71 | // Create an execution context for the engine
72 | context = engine->createExecutionContext();
73 |
74 | // Retrieve input dimensions from the engine
75 | input_h = engine->getBindingDimensions(0).d[2];
76 | input_w = engine->getBindingDimensions(0).d[3];
77 | // Retrieve detection attributes and number of detections
78 | detection_attribute_size = engine->getBindingDimensions(1).d[1];
79 | num_detections = engine->getBindingDimensions(1).d[2];
80 | // Calculate the number of classes based on detection attributes
81 | num_classes = detection_attribute_size - 4;
82 |
83 | // Allocate CPU memory for output buffer
84 | cpu_output_buffer = new float[detection_attribute_size * num_detections];
85 | // Allocate GPU memory for input buffer (assuming 3 channels: RGB)
86 | CUDA_CHECK(cudaMalloc(&gpu_buffers[0], 3 * input_w * input_h * sizeof(float)));
87 | // Allocate GPU memory for output buffer
88 | CUDA_CHECK(cudaMalloc(&gpu_buffers[1], detection_attribute_size * num_detections * sizeof(float)));
89 |
90 | // Initialize CUDA preprocessing with maximum image size
91 | cuda_preprocess_init(MAX_IMAGE_SIZE);
92 |
93 | // Create a CUDA stream for asynchronous operations
94 | CUDA_CHECK(cudaStreamCreate(&stream));
95 |
96 | // Perform model warmup if enabled
97 | if (warmup) {
98 | for (int i = 0; i < 10; i++) {
99 | this->infer(); // Run inference to warm up the model
100 | }
101 | printf("model warmup 10 times\n");
102 | }
103 | }
104 |
105 | // Destructor for the YOLOv11 class
106 | YOLOv11::~YOLOv11()
107 | {
108 | // Synchronize and destroy the CUDA stream
109 | CUDA_CHECK(cudaStreamSynchronize(stream));
110 | CUDA_CHECK(cudaStreamDestroy(stream));
111 | // Free allocated GPU buffers
112 | for (int i = 0; i < 2; i++)
113 | CUDA_CHECK(cudaFree(gpu_buffers[i]));
114 | // Free CPU output buffer
115 | delete[] cpu_output_buffer;
116 |
117 | // Destroy CUDA preprocessing resources
118 | cuda_preprocess_destroy();
119 | // Delete TensorRT context, engine, and runtime
120 | delete context;
121 | delete engine;
122 | delete runtime;
123 | }
124 |
125 | // Preprocess the input image and transfer it to the GPU buffer
126 | void YOLOv11::preprocess(Mat& image) {
127 | // Perform CUDA-based preprocessing
128 | cuda_preprocess(image.ptr(), image.cols, image.rows, gpu_buffers[0], input_w, input_h, stream);
129 | // Synchronize the CUDA stream to ensure preprocessing is complete
130 | CUDA_CHECK(cudaStreamSynchronize(stream));
131 | }
132 |
133 | // Perform inference using the TensorRT execution context
134 | void YOLOv11::infer()
135 | {
136 | #if NV_TENSORRT_MAJOR < 10
137 | // For TensorRT versions less than 10, use enqueueV2 with GPU buffers
138 | context->enqueueV2((void**)gpu_buffers, stream, nullptr);
139 | #else
140 | // For TensorRT versions 10 and above, use enqueueV3 with the CUDA stream
141 | this->context->enqueueV3(this->stream);
142 | #endif
143 | }
144 |
145 | // Postprocess the inference output to extract detections
146 | void YOLOv11::postprocess(vector<Detection>& output)
147 | {
148 | // Asynchronously copy output from GPU to CPU
149 | CUDA_CHECK(cudaMemcpyAsync(cpu_output_buffer, gpu_buffers[1], num_detections * detection_attribute_size * sizeof(float), cudaMemcpyDeviceToHost, stream));
150 | // Synchronize the CUDA stream to ensure copy is complete
151 | CUDA_CHECK(cudaStreamSynchronize(stream));
152 |
153 | vector<Rect> boxes; // Bounding boxes
154 | vector<int> class_ids; // Class IDs
155 | vector<float> confidences; // Confidence scores
156 |
157 | // Create a matrix view of the detection output
158 | const Mat det_output(detection_attribute_size, num_detections, CV_32F, cpu_output_buffer);
159 |
160 | // Iterate over each detection
161 | for (int i = 0; i < det_output.cols; ++i) {
162 | // Extract class scores for the current detection
163 | const Mat classes_scores = det_output.col(i).rowRange(4, 4 + num_classes);
164 | Point class_id_point;
165 | double score;
166 | // Find the class with the maximum score
167 | minMaxLoc(classes_scores, nullptr, &score, nullptr, &class_id_point);
168 |
169 | // Check if the confidence score exceeds the threshold
170 | if (score > conf_threshold) {
171 | // Extract bounding box coordinates
172 | const float cx = det_output.at<float>(0, i);
173 | const float cy = det_output.at<float>(1, i);
174 | const float ow = det_output.at<float>(2, i);
175 | const float oh = det_output.at<float>(3, i);
176 | Rect box;
177 | // Calculate top-left corner of the bounding box
178 | box.x = static_cast<int>(cx - 0.5 * ow);
179 | box.y = static_cast<int>(cy - 0.5 * oh);
180 | // Set width and height of the bounding box
181 | box.width = static_cast<int>(ow);
182 | box.height = static_cast<int>(oh);
183 |
184 | // Store the bounding box, class ID, and confidence
185 | boxes.push_back(box);
186 | class_ids.push_back(class_id_point.y);
187 | confidences.push_back(score);
188 | }
189 | }
190 |
191 | vector<int> nms_result; // Indices after Non-Maximum Suppression (NMS)
192 | // Apply NMS to remove overlapping boxes
193 | dnn::NMSBoxes(boxes, confidences, conf_threshold, nms_threshold, nms_result);
194 |
195 | // Iterate over NMS results and populate the output detections
196 | for (int i = 0; i < nms_result.size(); i++)
197 | {
198 | Detection result;
199 | int idx = nms_result[i];
200 | result.class_id = class_ids[idx];
201 | result.conf = confidences[idx];
202 | result.bbox = boxes[idx];
203 | output.push_back(result);
204 | }
205 | }
206 |
207 | // Build the TensorRT engine from an ONNX model
208 | void YOLOv11::build(std::string onnxPath, nvinfer1::ILogger& logger)
209 | {
210 | // Create a TensorRT builder
211 | auto builder = createInferBuilder(logger);
212 | // Define network flags for explicit batch dimensions
213 | const auto explicitBatch = 1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
214 | // Create a network definition with explicit batch
215 | INetworkDefinition* network = builder->createNetworkV2(explicitBatch);
216 | // Create builder configuration
217 | IBuilderConfig* config = builder->createBuilderConfig();
218 | // Enable FP16 precision if specified
219 | if (isFP16)
220 | {
221 | config->setFlag(BuilderFlag::kFP16);
222 | }
223 | // Create an ONNX parser
224 | nvonnxparser::IParser* parser = nvonnxparser::createParser(*network, logger);
225 | // Parse the ONNX model file
226 | bool parsed = parser->parseFromFile(onnxPath.c_str(), static_cast<int>(nvinfer1::ILogger::Severity::kINFO));
227 | // Build the serialized network plan
228 | IHostMemory* plan{ builder->buildSerializedNetwork(*network, *config) };
229 |
230 | // Create a TensorRT runtime
231 | runtime = createInferRuntime(logger);
232 |
233 | // Deserialize the CUDA engine from the serialized plan
234 | engine = runtime->deserializeCudaEngine(plan->data(), plan->size());
235 |
236 | // Create an execution context for the engine
237 | context = engine->createExecutionContext();
238 |
239 | // Clean up allocated resources
240 | delete network;
241 | delete config;
242 | delete parser;
243 | delete plan;
244 | }
245 |
246 | // Save the serialized TensorRT engine to a file
247 | bool YOLOv11::saveEngine(const std::string& onnxpath)
248 | {
249 | // Generate the engine file path by replacing the extension with ".engine"
250 | std::string engine_path;
251 | size_t dotIndex = onnxpath.find_last_of(".");
252 | if (dotIndex != std::string::npos) {
253 | engine_path = onnxpath.substr(0, dotIndex) + ".engine";
254 | }
255 | else
256 | {
257 | return false; // Return false if no extension is found
258 | }
259 |
260 | // Check if the engine is valid
261 | if (engine)
262 | {
263 | // Serialize the engine
264 | nvinfer1::IHostMemory* data = engine->serialize();
265 | std::ofstream file;
266 | // Open the engine file in binary write mode
267 | file.open(engine_path, std::ios::binary | std::ios::out);
268 | if (!file.is_open())
269 | {
270 | std::cout << "Failed to create engine file " << engine_path << std::endl;
271 | return false;
272 | }
273 | // Write the serialized engine data to the file
274 | file.write((const char*)data->data(), data->size());
275 | file.close();
276 |
277 | // Free the serialized data memory
278 | delete data;
279 | }
280 | return true;
281 | }
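// --- Editor's note (illustrative, not part of the original source) ---
// The matching load path (deserializing a saved ".engine" file) is not shown
// in this excerpt; for reference, a minimal sketch against the same
// TensorRT 8.x API (variable names are illustrative):
//
//     std::ifstream file(engine_path, std::ios::binary | std::ios::ate);
//     size_t size = file.tellg();
//     file.seekg(0, std::ios::beg);
//     std::vector<char> data(size);
//     file.read(data.data(), size);
//     IRuntime* runtime = createInferRuntime(logger);
//     ICudaEngine* engine = runtime->deserializeCudaEngine(data.data(), size);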
282 |
283 | // Draw bounding boxes and labels on the image based on detections
284 | void YOLOv11::draw(Mat& image, const vector<Detection>& output)
285 | {
286 | // Calculate the scaling ratios between input and original image dimensions
287 | const float ratio_h = input_h / (float)image.rows;
288 | const float ratio_w = input_w / (float)image.cols;
289 |
290 | // Iterate over each detection
291 | for (int i = 0; i < output.size(); i++)
292 | {
293 | auto detection = output[i];
294 | auto box = detection.bbox;
295 | auto class_id = detection.class_id;
296 | auto conf = detection.conf;
297 | // Assign a color based on the class ID
298 | cv::Scalar color = cv::Scalar(COLORS[class_id][0], COLORS[class_id][1], COLORS[class_id][2]);
299 |
300 | // Adjust bounding box coordinates based on aspect ratio
301 | if (ratio_h > ratio_w)
302 | {
303 | box.x = box.x / ratio_w;
304 | box.y = (box.y - (input_h - ratio_w * image.rows) / 2) / ratio_w;
305 | box.width = box.width / ratio_w;
306 | box.height = box.height / ratio_w;
307 | }
308 | else
309 | {
310 | box.x = (box.x - (input_w - ratio_h * image.cols) / 2) / ratio_h;
311 | box.y = box.y / ratio_h;
312 | box.width = box.width / ratio_h;
313 | box.height = box.height / ratio_h;
314 | }
315 |
316 | // Draw the bounding box on the image
317 | rectangle(image, Point(box.x, box.y), Point(box.x + box.width, box.y + box.height), color, 3);
318 |
319 | // Prepare the label text with class name and confidence
320 | string class_string = CLASS_NAMES[class_id] + ' ' + to_string(conf).substr(0, 4);
321 | // Calculate the size of the text for background rectangle
322 | Size text_size = getTextSize(class_string, FONT_HERSHEY_DUPLEX, 1, 2, 0);
323 | // Define the background rectangle for the text
324 | Rect text_rect(box.x, box.y - 40, text_size.width + 10, text_size.height + 20);
325 | // Draw the background rectangle
326 | rectangle(image, text_rect, color, FILLED);
327 | // Put the text label on the image
328 | putText(image, class_string, Point(box.x + 5, box.y - 10), FONT_HERSHEY_DUPLEX, 1, Scalar(0, 0, 0), 2, 0);
329 | }
330 | }
331 |
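// --- Editor's note (illustrative, not part of the original source) ---
// Worked example of the un-letterboxing in draw(), assuming a 640x640 network
// input and a 1920x1080 frame: ratio_w = 640/1920 = 1/3 and
// ratio_h = 640/1080 (about 0.593), so ratio_h > ratio_w and preprocessing
// scaled by ratio_w. The frame occupies 640x360 network pixels, leaving
// (640 - 360) / 2 = 140 pixels of vertical padding, and a network-space box
// maps back as x_orig = 3 * x, y_orig = 3 * (y - 140).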
--------------------------------------------------------------------------------
/src/preprocess.cu:
--------------------------------------------------------------------------------
1 | #include "preprocess.h"
2 | #include "cuda_utils.h"
3 | #include "device_launch_parameters.h"
4 |
5 | // Host and device pointers for image buffers
6 | static uint8_t* img_buffer_host = nullptr; // Pinned memory on the host for faster transfers
7 | static uint8_t* img_buffer_device = nullptr; // Memory on the device (GPU)
8 |
9 | // Structure to represent a 2x3 affine transformation matrix
10 | struct AffineMatrix {
11 | float value[6]; // [m00, m01, m02, m10, m11, m12]
12 | };
13 |
14 | // CUDA kernel to perform affine warp on the image
15 | __global__ void warpaffine_kernel(
16 | uint8_t* src, // Source image on device
17 | int src_line_size, // Number of bytes per source image row
18 | int src_width, // Source image width
19 | int src_height, // Source image height
20 | float* dst, // Destination image on device (output)
21 | int dst_width, // Destination image width
22 | int dst_height, // Destination image height
23 | uint8_t const_value_st, // Constant value for out-of-bound pixels
24 | AffineMatrix d2s, // Affine transformation matrix (destination to source)
25 | int edge // Total number of pixels to process
26 | ) {
27 | // Calculate the global position of the thread
28 | int position = blockDim.x * blockIdx.x + threadIdx.x;
29 | if (position >= edge) return; // Exit if position exceeds total pixels
30 |
31 | // Extract affine matrix elements
32 | float m_x1 = d2s.value[0];
33 | float m_y1 = d2s.value[1];
34 | float m_z1 = d2s.value[2];
35 | float m_x2 = d2s.value[3];
36 | float m_y2 = d2s.value[4];
37 | float m_z2 = d2s.value[5];
38 |
39 | // Calculate destination pixel coordinates
40 | int dx = position % dst_width;
41 | int dy = position / dst_width;
42 |
43 | // Apply affine transformation to get source coordinates
44 | float src_x = m_x1 * dx + m_y1 * dy + m_z1 + 0.5f;
45 | float src_y = m_x2 * dx + m_y2 * dy + m_z2 + 0.5f;
46 |
47 | float c0, c1, c2; // Color channels (B, G, R)
48 |
49 | // Check if the source coordinates are out of bounds
50 | if (src_x <= -1 || src_x >= src_width || src_y <= -1 || src_y >= src_height) {
51 | // Assign constant value if out of range
52 | c0 = const_value_st;
53 | c1 = const_value_st;
54 | c2 = const_value_st;
55 | }
56 | else {
57 | // Perform bilinear interpolation
58 |
59 | // Get the integer parts of the source coordinates
60 | int y_low = floorf(src_y);
61 | int x_low = floorf(src_x);
62 | int y_high = y_low + 1;
63 | int x_high = x_low + 1;
64 |
65 | // Initialize constant values for out-of-bound pixels
66 | uint8_t const_value[] = { const_value_st, const_value_st, const_value_st };
67 |
68 | // Calculate the fractional parts
69 | float ly = src_y - y_low;
70 | float lx = src_x - x_low;
71 | float hy = 1 - ly;
72 | float hx = 1 - lx;
73 |
74 | // Compute the weights for the four surrounding pixels
75 | float w1 = hy * hx; // Top-left
76 | float w2 = hy * lx; // Top-right
77 | float w3 = ly * hx; // Bottom-left
78 | float w4 = ly * lx; // Bottom-right
79 |
80 | // Initialize pointers to the four surrounding pixels
81 | uint8_t* v1 = const_value;
82 | uint8_t* v2 = const_value;
83 | uint8_t* v3 = const_value;
84 | uint8_t* v4 = const_value;
85 |
86 | // Top-left pixel
87 | if (y_low >= 0) {
88 | if (x_low >= 0)
89 | v1 = src + y_low * src_line_size + x_low * 3;
90 | // Top-right pixel
91 | if (x_high < src_width)
92 | v2 = src + y_low * src_line_size + x_high * 3;
93 | }
94 |
95 | // Bottom-left and Bottom-right pixels
96 | if (y_high < src_height) {
97 | if (x_low >= 0)
98 | v3 = src + y_high * src_line_size + x_low * 3;
99 | if (x_high < src_width)
100 | v4 = src + y_high * src_line_size + x_high * 3;
101 | }
102 |
103 | // Perform bilinear interpolation for each color channel
104 | c0 = w1 * v1[0] + w2 * v2[0] + w3 * v3[0] + w4 * v4[0]; // Blue
105 | c1 = w1 * v1[1] + w2 * v2[1] + w3 * v3[1] + w4 * v4[1]; // Green
106 | c2 = w1 * v1[2] + w2 * v2[2] + w3 * v3[2] + w4 * v4[2]; // Red
107 | }
108 |
109 | // Convert from BGR to RGB by swapping channels
110 | float t = c2;
111 | c2 = c0;
112 | c0 = t;
113 |
114 | // Normalize the color values to [0, 1]
115 | c0 = c0 / 255.0f;
116 | c1 = c1 / 255.0f;
117 | c2 = c2 / 255.0f;
118 |
119 | // Write the output in planar (CHW) order: one contiguous plane per channel
120 | int area = dst_width * dst_height;
121 | float* pdst_c0 = dst + dy * dst_width + dx; // Red channel
122 | float* pdst_c1 = pdst_c0 + area; // Green channel
123 | float* pdst_c2 = pdst_c1 + area; // Blue channel
124 |
125 | // Assign the normalized color values to the destination buffers
126 | *pdst_c0 = c0;
127 | *pdst_c1 = c1;
128 | *pdst_c2 = c2;
129 | }
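// --- Editor's note (illustrative, not part of the original source) ---
// Worked example of the interpolation weights above: for src_x = 10.25 and
// src_y = 20.75, x_low = 10, y_low = 20, lx = 0.25, ly = 0.75, hx = 0.75,
// hy = 0.25, giving w1 = 0.1875 (top-left), w2 = 0.0625 (top-right),
// w3 = 0.5625 (bottom-left), w4 = 0.1875 (bottom-right); the weights sum
// to 1, so the output is a proper weighted average of the four neighbours.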
130 |
131 | // Host function to perform CUDA-based preprocessing
132 | void cuda_preprocess(
133 | uint8_t* src, // Source image data on host
134 | int src_width, // Source image width
135 | int src_height, // Source image height
136 | float* dst, // Destination buffer on device
137 | int dst_width, // Destination image width
138 | int dst_height, // Destination image height
139 | cudaStream_t stream // CUDA stream for asynchronous execution
140 | ) {
141 | // Calculate the size of the image in bytes (3 channels: BGR)
142 | int img_size = src_width * src_height * 3;
143 |
144 | // Copy source image data to pinned host memory for faster transfer
145 | memcpy(img_buffer_host, src, img_size);
146 |
147 | // Asynchronously copy image data from host to device memory
148 | CUDA_CHECK(cudaMemcpyAsync(
149 | img_buffer_device,
150 | img_buffer_host,
151 | img_size,
152 | cudaMemcpyHostToDevice,
153 | stream
154 | ));
155 |
156 | // Define affine transformation matrices
157 | AffineMatrix s2d, d2s; // Source to destination and vice versa
158 |
159 | // Calculate the scaling factor to maintain aspect ratio
160 | float scale = std::min(
161 | dst_height / (float)src_height,
162 | dst_width / (float)src_width
163 | );
164 |
165 | // Initialize source-to-destination affine matrix (s2d)
166 | s2d.value[0] = scale; // m00
167 | s2d.value[1] = 0; // m01
168 | s2d.value[2] = -scale * src_width * 0.5f + dst_width * 0.5f; // m02
169 | s2d.value[3] = 0; // m10
170 | s2d.value[4] = scale; // m11
171 | s2d.value[5] = -scale * src_height * 0.5f + dst_height * 0.5f; // m12
172 |
173 | // Create OpenCV matrices for affine transformation
174 | cv::Mat m2x3_s2d(2, 3, CV_32F, s2d.value);
175 | cv::Mat m2x3_d2s(2, 3, CV_32F, d2s.value);
176 |
177 | // Invert the source-to-destination matrix to get destination-to-source
178 | cv::invertAffineTransform(m2x3_s2d, m2x3_d2s);
179 |
180 | // Copy the inverted matrix back to d2s
181 | memcpy(d2s.value, m2x3_d2s.ptr(0), sizeof(d2s.value));
182 |
183 | // Calculate the total number of pixels to process
184 | int jobs = dst_height * dst_width;
185 |
186 | // Define the number of threads per block
187 | int threads = 256;
188 |
189 | // Calculate the number of blocks needed
190 | int blocks = ceil(jobs / (float)threads);
191 |
192 | // Launch the warp affine kernel
193 | warpaffine_kernel<<<blocks, threads, 0, stream>>>(
194 | img_buffer_device, // Source image on device
195 | src_width * 3, // Source line size (bytes per row)
196 | src_width, // Source width
197 | src_height, // Source height
198 | dst, // Destination buffer on device
199 | dst_width, // Destination width
200 | dst_height, // Destination height
201 | 128, // Constant value for out-of-bounds (gray)
202 | d2s, // Destination to source affine matrix
203 | jobs // Total number of pixels
204 | );
205 |
206 | // Check for kernel launch errors
207 | CUDA_CHECK(cudaGetLastError());
208 | }
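// --- Editor's note (illustrative, not part of the original source) ---
// Worked example of the s2d matrix for a 1920x1080 frame and a 640x640
// destination: scale = min(640/1080, 640/1920) = 1/3, so
// m02 = -(1/3) * 1920 * 0.5 + 640 * 0.5 = 0 and
// m12 = -(1/3) * 1080 * 0.5 + 640 * 0.5 = 140.
// A source pixel thus lands at (x/3, y/3 + 140), i.e. the same 140-pixel
// vertical letterbox that draw() in YOLOv11.cpp removes on the way back.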
209 |
210 | // Initialize CUDA preprocessing by allocating memory
211 | void cuda_preprocess_init(int max_image_size) {
212 | // Allocate pinned (page-locked) memory on the host for faster transfers
213 | CUDA_CHECK(cudaMallocHost((void**)&img_buffer_host, max_image_size * 3));
214 |
215 | // Allocate memory on the device (GPU) for the image
216 | CUDA_CHECK(cudaMalloc((void**)&img_buffer_device, max_image_size * 3));
217 | }
218 |
219 | // Clean up and free allocated memory
220 | void cuda_preprocess_destroy() {
221 | // Free device memory
222 | CUDA_CHECK(cudaFree(img_buffer_device));
223 |
224 | // Free pinned host memory
225 | CUDA_CHECK(cudaFreeHost(img_buffer_host));
226 | }
227 |
--------------------------------------------------------------------------------
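Editor's note: for orientation, here is a hedged sketch of how the three
preprocessing entry points above fit together at runtime. The 640x640
resolution, buffer sizes, and variable names are illustrative assumptions,
not code taken from main.cpp.

    #include <opencv2/opencv.hpp>
    #include "preprocess.h"
    #include "cuda_utils.h"

    int main() {
        cv::Mat frame = cv::imread("input.jpg");          // 8-bit BGR frame
        cuda_preprocess_init(4096 * 4096);                // once: max expected width * height
        float* gpu_input = nullptr;                       // planar RGB float destination
        CUDA_CHECK(cudaMalloc((void**)&gpu_input, 3 * 640 * 640 * sizeof(float)));
        cudaStream_t stream;
        CUDA_CHECK(cudaStreamCreate(&stream));
        cuda_preprocess(frame.data, frame.cols, frame.rows, gpu_input, 640, 640, stream);
        CUDA_CHECK(cudaStreamSynchronize(stream));        // finish the warp before inference
        // ... run TensorRT inference on gpu_input ...
        CUDA_CHECK(cudaFree(gpu_input));
        CUDA_CHECK(cudaStreamDestroy(stream));
        cuda_preprocess_destroy();                        // once, at shutdown
        return 0;
    }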