├── models └── .gitignore ├── images └── logo.png ├── inputs └── team.jpg ├── .clang-format ├── .pre-commit-config.yaml ├── cmake ├── ccache.cmake └── FindTensorRT.cmake ├── .gitignore ├── include ├── util │ ├── Util.h │ ├── Stopwatch.h │ └── Util.inl ├── macros.h ├── interfaces │ └── IEngine.h ├── Int8Calibrator.h ├── logger.h └── engine │ ├── EngineRunInference.inl │ ├── EngineUtilities.inl │ └── EngineBuildLoadNetwork.inl ├── LICENSE ├── .all-contributorsrc ├── scripts └── build_opencv.sh ├── CMakeLists.txt ├── src ├── cmd_line_parser.h ├── engine.cpp ├── main.cpp └── engine.h └── README.md /models/.gitignore: -------------------------------------------------------------------------------- 1 | *.onnx 2 | *.trt -------------------------------------------------------------------------------- /images/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cyrusbehr/tensorrt-cpp-api/HEAD/images/logo.png -------------------------------------------------------------------------------- /inputs/team.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cyrusbehr/tensorrt-cpp-api/HEAD/inputs/team.jpg -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | Language: Cpp 3 | # BasedOnStyle: LLVM 4 | AccessModifierOffset: -4 5 | ColumnLimit: 140 6 | IndentWidth: 4 7 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/mirrors-clang-format 3 | rev: 'v17.0.3' # Use the sha / tag you want to point at 4 | hooks: 5 | - id: clang-format 6 | types_or: [c++, c, cuda] 7 | -------------------------------------------------------------------------------- /cmake/ccache.cmake: -------------------------------------------------------------------------------- 1 | find_program(CCACHE_PROGRAM ccache) 2 | if(CCACHE_PROGRAM) 3 | set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${CCACHE_PROGRAM}") 4 | message(STATUS "ccache: found") 5 | else() 6 | message(STATUS "ccache: not found") 7 | endif() 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | # Prerequisites 3 | *.d 4 | 5 | # Compiled Object files 6 | *.slo 7 | *.lo 8 | *.o 9 | *.obj 10 | 11 | # Precompiled Headers 12 | *.gch 13 | *.pch 14 | 15 | # Compiled Dynamic libraries 16 | *.so 17 | *.dylib 18 | *.dll 19 | 20 | # Fortran module files 21 | *.mod 22 | *.smod 23 | 24 | # Compiled Static libraries 25 | *.lai 26 | *.la 27 | *.a 28 | *.lib 29 | 30 | # Executables 31 | *.exe 32 | *.out 33 | *.app 34 | 35 | build*/ 36 | cmake-build-* 37 | .idea/ 38 | .vscode/ 39 | 40 | *.onnx 41 | 42 | -------------------------------------------------------------------------------- /include/util/Util.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | namespace Util { 10 | // Checks if a file exists at the given file path 11 | bool doesFileExist(const std::string &filepath); 12 | 13 | // Checks and logs CUDA error codes 14 | void checkCudaErrorCode(cudaError_t 
code); 15 | 16 | // Retrieves a list of file names in the specified directory 17 | std::vector getFilesInDirectory(const std::string &dirPath); 18 | } 19 | 20 | #include "Util.inl" 21 | -------------------------------------------------------------------------------- /include/util/Stopwatch.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | // Utility Timer 5 | template class Stopwatch { 6 | typename Clock::time_point start_point; 7 | 8 | public: 9 | Stopwatch() : start_point(Clock::now()) {} 10 | 11 | // Returns elapsed time 12 | template Rep elapsedTime() const { 13 | std::atomic_thread_fence(std::memory_order_relaxed); 14 | auto counted_time = std::chrono::duration_cast(Clock::now() - start_point).count(); 15 | std::atomic_thread_fence(std::memory_order_relaxed); 16 | return static_cast(counted_time); 17 | } 18 | }; 19 | 20 | using preciseStopwatch = Stopwatch<>; -------------------------------------------------------------------------------- /include/macros.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #define CHECK(condition) \ 6 | do { \ 7 | if (!(condition)) { \ 8 | spdlog::error("Assertion failed: ({}), function {}, file {}, line {}.", #condition, __FUNCTION__, __FILE__, __LINE__); \ 9 | abort(); \ 10 | } \ 11 | } while (false); 12 | -------------------------------------------------------------------------------- /include/util/Util.inl: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace Util { 6 | 7 | inline bool doesFileExist(const std::string &filepath) { 8 | std::ifstream f(filepath.c_str()); 9 | return f.good(); 10 | } 11 | 12 | inline void checkCudaErrorCode(cudaError_t code) { 13 | if (code != cudaSuccess) { 14 | std::string errMsg = "CUDA operation failed with code: " + std::to_string(code) + " (" + cudaGetErrorName(code) + 15 | "), with message: " + cudaGetErrorString(code); 16 | spdlog::error(errMsg); 17 | throw std::runtime_error(errMsg); 18 | } 19 | } 20 | 21 | inline std::vector getFilesInDirectory(const std::string &dirPath) { 22 | std::vector fileNames; 23 | for (const auto &entry : std::filesystem::directory_iterator(dirPath)) { 24 | if (entry.is_regular_file()) { 25 | fileNames.push_back(entry.path().string()); 26 | } 27 | } 28 | return fileNames; 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /include/interfaces/IEngine.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include "NvInfer.h" // Include for nvinfer1::Dims and nvinfer1::Dims3 7 | 8 | template 9 | class IEngine { 10 | public: 11 | virtual ~IEngine() = default; 12 | virtual bool buildLoadNetwork(std::string onnxModelPath, const std::array &subVals = {0.f, 0.f, 0.f}, 13 | const std::array &divVals = {1.f, 1.f, 1.f}, bool normalize = true) = 0; 14 | virtual bool loadNetwork(std::string trtModelPath, const std::array &subVals = {0.f, 0.f, 0.f}, 15 | const std::array &divVals = {1.f, 1.f, 1.f}, bool normalize = true) = 0; 16 | virtual bool runInference(const std::vector> &inputs, 17 | std::vector>> &featureVectors) = 0; 18 | virtual const std::vector &getInputDims() const = 0; 19 | virtual const std::vector &getOutputDims() const = 0; 20 | }; 21 | -------------------------------------------------------------------------------- 
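The interface above is implemented by the templated `Engine` class declared in `src/engine.h`. As a quick orientation before the remaining files, here is a minimal sketch (not part of the repository; the ONNX path and function name are illustrative) of driving that implementation through the `IEngine` interface:

```cpp
// Sketch only: use Engine<float> through the IEngine interface from include/interfaces/IEngine.h.
#include <memory>
#include <vector>
#include <opencv2/core/cuda.hpp>
#include "engine.h"

int runThroughInterface() {
    Options options; // defaults: FP16 precision, optBatchSize = 1, maxBatchSize = 16
    std::unique_ptr<IEngine<float>> engine = std::make_unique<Engine<float>>(options);

    // Build a TensorRT engine from the ONNX model (cached to disk), then load it into memory.
    if (!engine->buildLoadNetwork("model.onnx")) {
        return -1;
    }

    // Input format is [input][batch][cv::cuda::GpuMat]; output is [batch][output][feature_vector].
    std::vector<std::vector<cv::cuda::GpuMat>> inputs;
    std::vector<std::vector<std::vector<float>>> featureVectors;
    // ... populate `inputs` with preprocessed GPU images, then run inference ...
    return engine->runInference(inputs, featureVectors) ? 0 : -1;
}
```

`src/main.cpp` below does the same end to end against `Engine<float>` directly, including preprocessing and benchmarking.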
/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 cyrusbehr 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.all-contributorsrc: -------------------------------------------------------------------------------- 1 | { 2 | "files": [ 3 | "README.md" 4 | ], 5 | "imageSize": 100, 6 | "commit": false, 7 | "commitType": "docs", 8 | "commitConvention": "angular", 9 | "contributors": [ 10 | { 11 | "login": "ltetrel", 12 | "name": "Loic Tetrel", 13 | "avatar_url": "https://avatars.githubusercontent.com/u/37963074?v=4", 14 | "profile": "https://ltetrel.github.io/", 15 | "contributions": [ 16 | "code" 17 | ] 18 | }, 19 | { 20 | "login": "thomaskleiven", 21 | "name": "thomaskleiven", 22 | "avatar_url": "https://avatars.githubusercontent.com/u/17145074?v=4", 23 | "profile": "https://github.com/thomaskleiven", 24 | "contributions": [ 25 | "code" 26 | ] 27 | }, 28 | { 29 | "login": "qq978358810", 30 | "name": "WiCyn", 31 | "avatar_url": "https://avatars.githubusercontent.com/u/45676681?v=4", 32 | "profile": "https://github.com/qq978358810", 33 | "contributions": [ 34 | "code" 35 | ] 36 | } 37 | ], 38 | "contributorsPerLine": 7, 39 | "skipCi": true, 40 | "repoType": "github", 41 | "repoHost": "https://github.com", 42 | "projectName": "tensorrt-cpp-api", 43 | "projectOwner": "cyrusbehr" 44 | } 45 | -------------------------------------------------------------------------------- /scripts/build_opencv.sh: -------------------------------------------------------------------------------- 1 | VERSION=4.8.0 2 | 3 | test -e ${VERSION}.zip || wget https://github.com/opencv/opencv/archive/refs/tags/${VERSION}.zip 4 | test -e opencv-${VERSION} || unzip ${VERSION}.zip 5 | 6 | test -e opencv_extra_${VERSION}.zip || wget -O opencv_extra_${VERSION}.zip https://github.com/opencv/opencv_contrib/archive/refs/tags/${VERSION}.zip 7 | test -e opencv_contrib-${VERSION} || unzip opencv_extra_${VERSION}.zip 8 | 9 | 10 | cd opencv-${VERSION} 11 | mkdir build 12 | cd build 13 | 14 | cmake -D CMAKE_BUILD_TYPE=RELEASE \ 15 | -D CMAKE_INSTALL_PREFIX=/usr/local \ 16 | -D WITH_TBB=ON \ 17 | -D ENABLE_FAST_MATH=1 \ 18 | -D CUDA_FAST_MATH=1 \ 19 | -D WITH_CUBLAS=1 \ 20 | -D WITH_CUDA=ON \ 21 | -D BUILD_opencv_cudacodec=ON \ 22 | -D WITH_CUDNN=ON \ 23 | -D OPENCV_DNN_CUDA=ON \ 24 | -D WITH_QT=OFF \ 25 | -D WITH_OPENGL=ON \ 26 | -D 
BUILD_opencv_apps=OFF \ 27 | -D BUILD_opencv_python2=OFF \ 28 | -D OPENCV_GENERATE_PKGCONFIG=ON \ 29 | -D OPENCV_PC_FILE_NAME=opencv.pc \ 30 | -D OPENCV_ENABLE_NONFREE=ON \ 31 | -D OPENCV_EXTRA_MODULES_PATH=../../opencv_contrib-${VERSION}/modules \ 32 | -D INSTALL_PYTHON_EXAMPLES=OFF \ 33 | -D INSTALL_C_EXAMPLES=OFF \ 34 | -D BUILD_EXAMPLES=OFF \ 35 | -D WITH_FFMPEG=ON \ 36 | -D CUDNN_INCLUDE_DIR=/usr/local/cuda/include \ 37 | -D CUDNN_LIBRARY=/usr/local/cuda/lib64/libcudnn.so \ 38 | .. 39 | 40 | make -j 8 41 | sudo make -j 8 install 42 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.18) 2 | project(tensorrt_cpp_api) 3 | 4 | # Use ccache to speed up rebuilds 5 | include(cmake/ccache.cmake) 6 | 7 | # Set C++ version and optimization level 8 | set(CMAKE_CXX_STANDARD 17) 9 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Ofast -DNDEBUG -Wno-deprecated-declarations") 10 | 11 | # For finding FindTensorRT.cmake 12 | set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake" ${CMAKE_MODULE_PATH}) 13 | 14 | # TODO: Specify the path to TensorRT root dir 15 | if (NOT TensorRT_DIR) 16 | set(TensorRT_DIR /home/cyrus/work/libs/TensorRT-10.0.0.6/) 17 | endif() 18 | # Use the correct version of CUDA 19 | set(CUDA_TOOLKIT_ROOT_DIR /usr/local/cuda) 20 | 21 | # We require CUDA, OpenCV, and TensorRT 22 | find_package(TensorRT REQUIRED) 23 | find_package(CUDA REQUIRED) 24 | find_package(OpenCV REQUIRED) 25 | find_package(fmt REQUIRED) 26 | 27 | add_library(tensorrt_cpp_api SHARED 28 | src/engine.cpp) 29 | 30 | target_include_directories(tensorrt_cpp_api PUBLIC ${OpenCV_INCLUDE_DIRS} ${CUDA_INCLUDE_DIRS} ${TensorRT_INCLUDE_DIRS} include include/interfaces) 31 | target_link_libraries(tensorrt_cpp_api PUBLIC ${OpenCV_LIBS} ${CUDA_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT} ${TensorRT_LIBRARIES} fmt::fmt) 32 | 33 | add_executable(run_inference_benchmark src/main.cpp) 34 | target_link_libraries(run_inference_benchmark tensorrt_cpp_api fmt::fmt) -------------------------------------------------------------------------------- /include/Int8Calibrator.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "NvInfer.h" 3 | 4 | // Class used for int8 calibration 5 | class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 { 6 | public: 7 | Int8EntropyCalibrator2(int32_t batchSize, int32_t inputW, int32_t inputH, const std::string &calibDataDirPath, 8 | const std::string &calibTableName, const std::string &inputBlobName, 9 | const std::array &subVals = {0.f, 0.f, 0.f}, const std::array &divVals = {1.f, 1.f, 1.f}, 10 | bool normalize = true, bool readCache = true); 11 | virtual ~Int8EntropyCalibrator2(); 12 | // Abstract base class methods which must be implemented 13 | int32_t getBatchSize() const noexcept override; 14 | bool getBatch(void *bindings[], char const *names[], int32_t nbBindings) noexcept override; 15 | void const *readCalibrationCache(std::size_t &length) noexcept override; 16 | void writeCalibrationCache(void const *ptr, std::size_t length) noexcept override; 17 | 18 | private: 19 | const int32_t m_batchSize; 20 | const int32_t m_inputW; 21 | const int32_t m_inputH; 22 | int32_t m_imgIdx; 23 | std::vector m_imgPaths; 24 | size_t m_inputCount; 25 | const std::string m_calibTableName; 26 | const std::string m_inputBlobName; 27 | const std::array m_subVals; 28 | const std::array 
m_divVals; 29 | const bool m_normalize; 30 | const bool m_readCache; 31 | void *m_deviceInput; 32 | std::vector m_calibCache; 33 | }; -------------------------------------------------------------------------------- /include/logger.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | enum class LogLevel { 8 | Trace, 9 | Debug, 10 | Info, 11 | Warn, 12 | Error, 13 | Critical, 14 | Off, 15 | Unknown 16 | }; 17 | 18 | 19 | // Get the log level string from the environment variable 20 | inline std::string getLogLevelFromEnvironment() { 21 | const char* envValue = std::getenv("LOG_LEVEL"); 22 | if (envValue) { 23 | return std::string(envValue); 24 | } else { 25 | spdlog::warn("LOG_LEVEL environment variable not set. Using default log level (info)."); 26 | return "info"; 27 | } 28 | } 29 | 30 | // Convert log level string to LogLevel enum 31 | inline LogLevel parseLogLevel(const std::string& logLevelStr) { 32 | if (logLevelStr == "trace") { 33 | return LogLevel::Trace; 34 | } else if (logLevelStr == "debug") { 35 | return LogLevel::Debug; 36 | } else if (logLevelStr == "info") { 37 | return LogLevel::Info; 38 | } else if (logLevelStr == "warn" || logLevelStr == "warning") { 39 | return LogLevel::Warn; 40 | } else if (logLevelStr == "err" || logLevelStr == "error") { 41 | return LogLevel::Error; 42 | } else if (logLevelStr == "critical") { 43 | return LogLevel::Critical; 44 | } else if (logLevelStr == "off") { 45 | return LogLevel::Off; 46 | } else { 47 | spdlog::warn("Unknown log level string: {}. Defaulting to 'info' level.", logLevelStr); 48 | return LogLevel::Unknown; 49 | } 50 | } 51 | 52 | // Convert LogLevel enum to spdlog::level::level_enum 53 | inline spdlog::level::level_enum toSpdlogLevel(const std::string& logLevelStr) { 54 | LogLevel logLevel = parseLogLevel(logLevelStr); 55 | 56 | switch (logLevel) { 57 | case LogLevel::Trace: 58 | return spdlog::level::trace; 59 | case LogLevel::Debug: 60 | return spdlog::level::debug; 61 | case LogLevel::Info: 62 | return spdlog::level::info; 63 | case LogLevel::Warn: 64 | return spdlog::level::warn; 65 | case LogLevel::Error: 66 | return spdlog::level::err; 67 | case LogLevel::Critical: 68 | return spdlog::level::critical; 69 | case LogLevel::Off: 70 | return spdlog::level::off; 71 | default: 72 | spdlog::warn("Unknown log level. Using default log level (info)."); 73 | return spdlog::level::info; 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /cmake/FindTensorRT.cmake: -------------------------------------------------------------------------------- 1 | # source: 2 | # https://github.com/NVIDIA/tensorrt-laboratory/blob/master/cmake/FindTensorRT.cmake 3 | 4 | # This module defines the following variables: 5 | # 6 | # :: 7 | # 8 | # TensorRT_INCLUDE_DIRS 9 | # TensorRT_LIBRARIES 10 | # TensorRT_FOUND 11 | # 12 | # :: 13 | # 14 | # TensorRT_VERSION_STRING - version (x.y.z) 15 | # TensorRT_VERSION_MAJOR - major version (x) 16 | # TensorRT_VERSION_MINOR - minor version (y) 17 | # TensorRT_VERSION_PATCH - patch version (z) 18 | # 19 | # Hints 20 | # ^^^^^ 21 | # A user may set ``TensorRT_DIR`` to an installation root to tell this module where to look. 
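# Illustrative usage from a consuming project (paths are examples, not part of this module):
#   cmake -DTensorRT_DIR=/path/to/TensorRT-10.x ..
# or, before the find_package call in CMakeLists.txt:
#   set(TensorRT_DIR /path/to/TensorRT-10.x)
#   find_package(TensorRT REQUIRED)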
22 | # 23 | set(_TensorRT_SEARCHES) 24 | 25 | if(TensorRT_DIR) 26 | set(_TensorRT_SEARCH_ROOT PATHS ${TensorRT_DIR} NO_DEFAULT_PATH) 27 | list(APPEND _TensorRT_SEARCHES _TensorRT_SEARCH_ROOT) 28 | endif() 29 | 30 | # appends some common paths 31 | set(_TensorRT_SEARCH_NORMAL 32 | PATHS "/usr" 33 | ) 34 | list(APPEND _TensorRT_SEARCHES _TensorRT_SEARCH_NORMAL) 35 | 36 | # Include dir 37 | foreach(search ${_TensorRT_SEARCHES}) 38 | find_path(TensorRT_INCLUDE_DIR NAMES NvInfer.h ${${search}} PATH_SUFFIXES include) 39 | endforeach() 40 | 41 | if(NOT TensorRT_LIBRARY) 42 | foreach(search ${_TensorRT_SEARCHES}) 43 | find_library(TensorRT_LIBRARY NAMES nvinfer ${${search}} PATH_SUFFIXES lib) 44 | endforeach() 45 | endif() 46 | 47 | if(NOT TensorRT_NVONNXPARSER_LIBRARY) 48 | foreach(search ${_TensorRT_SEARCHES}) 49 | find_library(TensorRT_NVONNXPARSER_LIBRARY NAMES nvonnxparser ${${search}} PATH_SUFFIXES lib) 50 | endforeach() 51 | endif() 52 | 53 | mark_as_advanced(TensorRT_INCLUDE_DIR) 54 | 55 | if(TensorRT_INCLUDE_DIR AND EXISTS "${TensorRT_INCLUDE_DIR}/NvInfer.h") 56 | file(STRINGS "${TensorRT_INCLUDE_DIR}/NvInfer.h" TensorRT_MAJOR REGEX "^#define NV_TENSORRT_MAJOR [0-9]+.*$") 57 | file(STRINGS "${TensorRT_INCLUDE_DIR}/NvInfer.h" TensorRT_MINOR REGEX "^#define NV_TENSORRT_MINOR [0-9]+.*$") 58 | file(STRINGS "${TensorRT_INCLUDE_DIR}/NvInfer.h" TensorRT_PATCH REGEX "^#define NV_TENSORRT_PATCH [0-9]+.*$") 59 | 60 | string(REGEX REPLACE "^#define NV_TENSORRT_MAJOR ([0-9]+).*$" "\\1" TensorRT_VERSION_MAJOR "${TensorRT_MAJOR}") 61 | string(REGEX REPLACE "^#define NV_TENSORRT_MINOR ([0-9]+).*$" "\\1" TensorRT_VERSION_MINOR "${TensorRT_MINOR}") 62 | string(REGEX REPLACE "^#define NV_TENSORRT_PATCH ([0-9]+).*$" "\\1" TensorRT_VERSION_PATCH "${TensorRT_PATCH}") 63 | set(TensorRT_VERSION_STRING "${TensorRT_VERSION_MAJOR}.${TensorRT_VERSION_MINOR}.${TensorRT_VERSION_PATCH}") 64 | endif() 65 | 66 | include(FindPackageHandleStandardArgs) 67 | FIND_PACKAGE_HANDLE_STANDARD_ARGS(TensorRT REQUIRED_VARS TensorRT_LIBRARY TensorRT_INCLUDE_DIR VERSION_VAR TensorRT_VERSION_STRING) 68 | 69 | if(TensorRT_FOUND) 70 | set(TensorRT_INCLUDE_DIRS ${TensorRT_INCLUDE_DIR}) 71 | 72 | if(NOT TensorRT_LIBRARIES) 73 | set(TensorRT_LIBRARIES ${TensorRT_LIBRARY} ${TensorRT_NVONNXPARSER_LIBRARY} ${TensorRT_NVPARSERS_LIBRARY}) 74 | endif() 75 | 76 | if(NOT TARGET TensorRT::TensorRT) 77 | add_library(TensorRT::TensorRT UNKNOWN IMPORTED) 78 | set_target_properties(TensorRT::TensorRT PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${TensorRT_INCLUDE_DIRS}") 79 | set_property(TARGET TensorRT::TensorRT APPEND PROPERTY IMPORTED_LOCATION "${TensorRT_LIBRARY}") 80 | endif() 81 | endif() -------------------------------------------------------------------------------- /src/cmd_line_parser.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "engine.h" 3 | #include 4 | 5 | struct CommandLineArguments { 6 | std::string onnxModelPath = ""; 7 | std::string trtModelPath = ""; 8 | }; 9 | 10 | inline void showHelp(char *argv[]) { 11 | std::cout << "Usage: " << argv[0] << " [OPTIONS]" << std::endl << std::endl; 12 | 13 | std::cout << "Options:" << std::endl; 14 | std::cout << "--onnx_model Path to the ONNX model. " 15 | "(Either onnx_model or trt_model must be provided)" 16 | << std::endl; 17 | std::cout << "--trt_model Path to the TensorRT model. 
" 18 | "(Either onnx_model or trt_model must be provided)" 19 | << std::endl; 20 | 21 | std::cout << "Example usage:" << std::endl; 22 | std::cout << argv[0] << " --onnx_model model.onnx" << std::endl; 23 | }; 24 | 25 | inline bool tryGetNextArgument(int argc, char *argv[], int ¤tIndex, std::string &value, std::string flag, bool printErrors = true) { 26 | if (currentIndex + 1 >= argc) { 27 | if (printErrors) 28 | std::cout << "Error: No arguments provided for flag '" << flag << "'" << std::endl; 29 | return false; 30 | } 31 | 32 | std::string nextArgument = argv[currentIndex + 1]; 33 | if (nextArgument.substr(0, 2) == "--") { 34 | if (printErrors) 35 | std::cout << "Error: No arguments provided for flag '" << flag << "'" << std::endl; 36 | return false; 37 | } 38 | 39 | value = argv[++currentIndex]; 40 | return true; 41 | }; 42 | 43 | inline bool parseArguments(int argc, char *argv[], CommandLineArguments &arguments) { 44 | if (argc == 1) { 45 | showHelp(argv); 46 | return false; 47 | } 48 | 49 | for (int i = 1; i < argc; i++) { 50 | std::string argument = argv[i]; 51 | 52 | if (argument.substr(0, 2) == "--") { 53 | std::string flag = argument.substr(2); 54 | std::string nextArgument; 55 | 56 | if (flag == "onnx_model") { 57 | if (!tryGetNextArgument(argc, argv, i, nextArgument, flag)) 58 | return false; 59 | 60 | if (!Util::doesFileExist(nextArgument)) { 61 | std::cout << "Error: Unable to find model at path '" << nextArgument << "' for flag '" << flag << "'" << std::endl; 62 | return false; 63 | } 64 | 65 | arguments.onnxModelPath = nextArgument; 66 | } 67 | 68 | else if (flag == "trt_model") { 69 | if (!tryGetNextArgument(argc, argv, i, nextArgument, flag)) 70 | return false; 71 | 72 | if (!Util::doesFileExist(nextArgument)) { 73 | std::cout << "Error: Unable to find model at path '" << nextArgument << "' for flag '" << flag << "'" << std::endl; 74 | return false; 75 | } 76 | 77 | arguments.trtModelPath = nextArgument; 78 | } 79 | 80 | else { 81 | std::cout << "Error: Unknown flag '" << flag << "'" << std::endl; 82 | showHelp(argv); 83 | return false; 84 | } 85 | } else { 86 | std::cout << "Error: Unknown argument '" << argument << "'" << std::endl; 87 | showHelp(argv); 88 | return false; 89 | } 90 | } 91 | 92 | if (arguments.onnxModelPath.empty() && arguments.trtModelPath.empty()) { 93 | std::cout << "Error: Must specify either 'onnx_model' or 'trt_model'" << std::endl; 94 | return false; 95 | } 96 | 97 | return true; 98 | } 99 | -------------------------------------------------------------------------------- /src/engine.cpp: -------------------------------------------------------------------------------- 1 | #include "engine.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | using namespace nvinfer1; 10 | using namespace Util; 11 | 12 | void Logger::log(Severity severity, const char *msg) noexcept { 13 | switch (severity) { 14 | case Severity::kVERBOSE: 15 | spdlog::debug(msg); 16 | break; 17 | case Severity::kINFO: 18 | spdlog::info(msg); 19 | break; 20 | case Severity::kWARNING: 21 | spdlog::warn(msg); 22 | break; 23 | case Severity::kERROR: 24 | spdlog::error(msg); 25 | break; 26 | case Severity::kINTERNAL_ERROR: 27 | spdlog::critical(msg); 28 | break; 29 | default: 30 | spdlog::info("Unexpected severity level"); 31 | } 32 | } 33 | 34 | Int8EntropyCalibrator2::Int8EntropyCalibrator2(int32_t batchSize, int32_t inputW, int32_t inputH, const std::string &calibDataDirPath, 35 | const std::string &calibTableName, const std::string &inputBlobName, 36 
| const std::array &subVals, const std::array &divVals, bool normalize, 37 | bool readCache) 38 | : m_batchSize(batchSize), m_inputW(inputW), m_inputH(inputH), m_imgIdx(0), m_calibTableName(calibTableName), 39 | m_inputBlobName(inputBlobName), m_subVals(subVals), m_divVals(divVals), m_normalize(normalize), m_readCache(readCache) { 40 | 41 | // Allocate GPU memory to hold the entire batch 42 | m_inputCount = 3 * inputW * inputH * batchSize; 43 | checkCudaErrorCode(cudaMalloc(&m_deviceInput, m_inputCount * sizeof(float))); 44 | 45 | // Read the name of all the files in the specified directory. 46 | if (!doesFileExist(calibDataDirPath)) { 47 | auto msg = "Error, directory at provided path does not exist: " + calibDataDirPath; 48 | spdlog::error(msg); 49 | throw std::runtime_error(msg); 50 | } 51 | 52 | m_imgPaths = getFilesInDirectory(calibDataDirPath); 53 | if (m_imgPaths.size() < static_cast(batchSize)) { 54 | auto msg = "Error, there are fewer calibration images than the specified batch size!"; 55 | spdlog::error(msg); 56 | throw std::runtime_error(msg); 57 | } 58 | 59 | // Randomize the calibration data 60 | auto rd = std::random_device{}; 61 | auto rng = std::default_random_engine{rd()}; 62 | std::shuffle(std::begin(m_imgPaths), std::end(m_imgPaths), rng); 63 | } 64 | 65 | int32_t Int8EntropyCalibrator2::getBatchSize() const noexcept { 66 | // Return the batch size 67 | return m_batchSize; 68 | } 69 | 70 | bool Int8EntropyCalibrator2::getBatch(void **bindings, const char **names, int32_t nbBindings) noexcept { 71 | // This method will read a batch of images into GPU memory, and place the 72 | // pointer to the GPU memory in the bindings variable. 73 | 74 | if (m_imgIdx + m_batchSize > static_cast(m_imgPaths.size())) { 75 | // There are not enough images left to satisfy an entire batch 76 | return false; 77 | } 78 | 79 | // Read the calibration images into memory for the current batch 80 | std::vector inputImgs; 81 | for (int i = m_imgIdx; i < m_imgIdx + m_batchSize; i++) { 82 | spdlog::info("Reading image {}: {}", i, m_imgPaths[i]); 83 | auto cpuImg = cv::imread(m_imgPaths[i]); 84 | if (cpuImg.empty()) { 85 | spdlog::error("Fatal error: Unable to read image at path: " + m_imgPaths[i]); 86 | return false; 87 | } 88 | 89 | cv::cuda::GpuMat gpuImg; 90 | gpuImg.upload(cpuImg); 91 | //cv::cuda::cvtColor(gpuImg, gpuImg, cv::COLOR_BGR2RGB); 92 | 93 | // TODO: Define any preprocessing code here, such as resizing 94 | auto resized = Engine::resizeKeepAspectRatioPadRightBottom(gpuImg, m_inputH, m_inputW); 95 | 96 | inputImgs.emplace_back(std::move(resized)); 97 | } 98 | 99 | // Convert the batch from NHWC to NCHW 100 | // ALso apply normalization, scaling, and mean subtraction 101 | auto mfloat = Engine::blobFromGpuMats(inputImgs, m_subVals, m_divVals, m_normalize, true); 102 | auto *dataPointer = mfloat.ptr(); 103 | 104 | // Copy the GPU buffer to member variable so that it persists 105 | checkCudaErrorCode(cudaMemcpyAsync(m_deviceInput, dataPointer, m_inputCount * sizeof(float), cudaMemcpyDeviceToDevice)); 106 | 107 | m_imgIdx += m_batchSize; 108 | if (std::string(names[0]) != m_inputBlobName) { 109 | spdlog::error("Error: Incorrect input name provided!"); 110 | return false; 111 | } 112 | bindings[0] = m_deviceInput; 113 | return true; 114 | } 115 | 116 | void const *Int8EntropyCalibrator2::readCalibrationCache(size_t &length) noexcept { 117 | spdlog::info("Searching for calibration cache: {}", m_calibTableName); 118 | m_calibCache.clear(); 119 | std::ifstream input(m_calibTableName, 
std::ios::binary); 120 | input >> std::noskipws; 121 | if (m_readCache && input.good()) { 122 | spdlog::info("Reading calibration cache: {}", m_calibTableName); 123 | std::copy(std::istream_iterator(input), std::istream_iterator(), std::back_inserter(m_calibCache)); 124 | } 125 | length = m_calibCache.size(); 126 | return length ? m_calibCache.data() : nullptr; 127 | } 128 | 129 | void Int8EntropyCalibrator2::writeCalibrationCache(const void *ptr, std::size_t length) noexcept { 130 | spdlog::info("Writing calibration cache: {}", m_calibTableName); 131 | spdlog::info("Calibration cache size: {} bytes", length); 132 | std::ofstream output(m_calibTableName, std::ios::binary); 133 | output.write(reinterpret_cast(ptr), length); 134 | } 135 | 136 | Int8EntropyCalibrator2::~Int8EntropyCalibrator2() { checkCudaErrorCode(cudaFree(m_deviceInput)); }; 137 | -------------------------------------------------------------------------------- /include/engine/EngineRunInference.inl: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include "util/Util.h" 5 | 6 | template 7 | bool Engine::runInference(const std::vector> &inputs, 8 | std::vector>> &featureVectors) { 9 | // First we do some error checking 10 | if (inputs.empty() || inputs[0].empty()) { 11 | spdlog::error("Provided input vector is empty!"); 12 | return false; 13 | } 14 | 15 | const auto numInputs = m_inputDims.size(); 16 | if (inputs.size() != numInputs) { 17 | spdlog::error("Incorrect number of inputs provided!"); 18 | return false; 19 | } 20 | 21 | // Ensure the batch size does not exceed the max 22 | if (inputs[0].size() > static_cast(m_options.maxBatchSize)) { 23 | spdlog::error("===== Error ====="); 24 | spdlog::error("The batch size is larger than the model expects!"); 25 | spdlog::error("Model max batch size: {}", m_options.maxBatchSize); 26 | spdlog::error("Batch size provided to call to runInference: {}", inputs[0].size()); 27 | return false; 28 | } 29 | 30 | // Ensure that if the model has a fixed batch size that is greater than 1, the 31 | // input has the correct length 32 | if (m_inputBatchSize != -1 && inputs[0].size() != static_cast(m_inputBatchSize)) { 33 | spdlog::error("===== Error ====="); 34 | spdlog::error("The batch size is different from what the model expects!"); 35 | spdlog::error("Model batch size: {}", m_inputBatchSize); 36 | spdlog::error("Batch size provided to call to runInference: {}", inputs[0].size()); 37 | return false; 38 | } 39 | 40 | const auto batchSize = static_cast(inputs[0].size()); 41 | // Make sure the same batch size was provided for all inputs 42 | for (size_t i = 1; i < inputs.size(); ++i) { 43 | if (inputs[i].size() != static_cast(batchSize)) { 44 | spdlog::error("===== Error ====="); 45 | spdlog::error("The batch size is different for each input!"); 46 | return false; 47 | } 48 | } 49 | 50 | // Create the cuda stream that will be used for inference 51 | cudaStream_t inferenceCudaStream; 52 | Util::checkCudaErrorCode(cudaStreamCreate(&inferenceCudaStream)); 53 | 54 | std::vector preprocessedInputs; 55 | 56 | // Preprocess all the inputs 57 | for (size_t i = 0; i < numInputs; ++i) { 58 | const auto &batchInput = inputs[i]; 59 | const auto &dims = m_inputDims[i]; 60 | 61 | auto &input = batchInput[0]; 62 | if (input.channels() != dims.d[0] || input.rows != dims.d[1] || input.cols != dims.d[2]) { 63 | spdlog::error("===== Error ====="); 64 | spdlog::error("Input does not have correct size!"); 65 | spdlog::error("Expected: 
({}, {}, {})", dims.d[0], dims.d[1], dims.d[2]); 66 | spdlog::error("Got: ({}, {}, {})", input.channels(), input.rows, input.cols); 67 | spdlog::error("Ensure you resize your input image to the correct size"); 68 | return false; 69 | } 70 | 71 | nvinfer1::Dims4 inputDims = {batchSize, dims.d[0], dims.d[1], dims.d[2]}; 72 | m_context->setInputShape(m_IOTensorNames[i].c_str(), 73 | inputDims); // Define the batch size 74 | 75 | // OpenCV reads images into memory in NHWC format, while TensorRT expects 76 | // images in NCHW format. The following method converts NHWC to NCHW. Even 77 | // though TensorRT expects NCHW at IO, during optimization, it can 78 | // internally use NHWC to optimize cuda kernels See: 79 | // https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#data-layout 80 | // Copy over the input data and perform the preprocessing 81 | auto mfloat = blobFromGpuMats(batchInput, m_subVals, m_divVals, m_normalize); 82 | preprocessedInputs.push_back(mfloat); 83 | m_buffers[i] = mfloat.ptr(); 84 | } 85 | 86 | // Ensure all dynamic bindings have been defined. 87 | if (!m_context->allInputDimensionsSpecified()) { 88 | auto msg = "Error, not all required dimensions specified."; 89 | spdlog::error(msg); 90 | throw std::runtime_error(msg); 91 | } 92 | 93 | // Set the address of the input and output buffers 94 | for (size_t i = 0; i < m_buffers.size(); ++i) { 95 | bool status = m_context->setTensorAddress(m_IOTensorNames[i].c_str(), m_buffers[i]); 96 | if (!status) { 97 | return false; 98 | } 99 | } 100 | 101 | // Run inference. 102 | bool status = m_context->enqueueV3(inferenceCudaStream); 103 | if (!status) { 104 | return false; 105 | } 106 | 107 | // Copy the outputs back to CPU 108 | featureVectors.clear(); 109 | 110 | for (int batch = 0; batch < batchSize; ++batch) { 111 | // Batch 112 | std::vector> batchOutputs{}; 113 | for (int32_t outputBinding = numInputs; outputBinding < m_engine->getNbIOTensors(); ++outputBinding) { 114 | // We start at index m_inputDims.size() to account for the inputs in our 115 | // m_buffers 116 | std::vector output; 117 | auto outputLength = m_outputLengths[outputBinding - numInputs]; 118 | output.resize(outputLength); 119 | // Copy the output 120 | Util::checkCudaErrorCode(cudaMemcpyAsync(output.data(), 121 | static_cast(m_buffers[outputBinding]) + (batch * sizeof(T) * outputLength), 122 | outputLength * sizeof(T), cudaMemcpyDeviceToHost, inferenceCudaStream)); 123 | batchOutputs.emplace_back(std::move(output)); 124 | } 125 | featureVectors.emplace_back(std::move(batchOutputs)); 126 | } 127 | 128 | // Synchronize the cuda stream 129 | Util::checkCudaErrorCode(cudaStreamSynchronize(inferenceCudaStream)); 130 | Util::checkCudaErrorCode(cudaStreamDestroy(inferenceCudaStream)); 131 | return true; 132 | } 133 | -------------------------------------------------------------------------------- /include/engine/EngineUtilities.inl: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | 5 | template 6 | void Engine::transformOutput(std::vector>> &input, std::vector> &output) { 7 | if (input.size() != 1) { 8 | auto msg = "The feature vector has incorrect dimensions!"; 9 | spdlog::error(msg); 10 | throw std::logic_error(msg); 11 | } 12 | 13 | output = std::move(input[0]); 14 | } 15 | 16 | template void Engine::transformOutput(std::vector>> &input, std::vector &output) { 17 | if (input.size() != 1 || input[0].size() != 1) { 18 | auto msg = "The feature vector has incorrect 
dimensions!"; 19 | spdlog::error(msg); 20 | throw std::logic_error(msg); 21 | } 22 | 23 | output = std::move(input[0][0]); 24 | } 25 | 26 | template 27 | cv::cuda::GpuMat Engine::resizeKeepAspectRatioPadRightBottom(const cv::cuda::GpuMat &input, size_t height, size_t width, 28 | const cv::Scalar &bgcolor) { 29 | float r = std::min(width / (input.cols * 1.0), height / (input.rows * 1.0)); 30 | int unpad_w = r * input.cols; 31 | int unpad_h = r * input.rows; 32 | cv::cuda::GpuMat re(unpad_h, unpad_w, CV_8UC3); 33 | cv::cuda::resize(input, re, re.size()); 34 | cv::cuda::GpuMat out(height, width, CV_8UC3, bgcolor); 35 | re.copyTo(out(cv::Rect(0, 0, re.cols, re.rows))); 36 | return out; 37 | } 38 | 39 | template void Engine::getDeviceNames(std::vector &deviceNames) { 40 | int numGPUs; 41 | cudaGetDeviceCount(&numGPUs); 42 | 43 | for (int device = 0; device < numGPUs; device++) { 44 | cudaDeviceProp prop; 45 | cudaGetDeviceProperties(&prop, device); 46 | 47 | deviceNames.push_back(std::string(prop.name)); 48 | } 49 | } 50 | 51 | template std::string Engine::serializeEngineOptions(const Options &options, const std::string &onnxModelPath) { 52 | const auto filenamePos = onnxModelPath.find_last_of('/') + 1; 53 | std::string engineName = onnxModelPath.substr(filenamePos, onnxModelPath.find_last_of('.') - filenamePos) + ".engine"; 54 | 55 | // Add the GPU device name to the file to ensure that the model is only used 56 | // on devices with the exact same GPU 57 | std::vector deviceNames; 58 | getDeviceNames(deviceNames); 59 | 60 | if (static_cast(options.deviceIndex) >= deviceNames.size()) { 61 | auto msg = "Error, provided device index is out of range!"; 62 | spdlog::error(msg); 63 | throw std::runtime_error(msg); 64 | } 65 | 66 | auto deviceName = deviceNames[options.deviceIndex]; 67 | // Remove spaces from the device name 68 | deviceName.erase(std::remove_if(deviceName.begin(), deviceName.end(), ::isspace), deviceName.end()); 69 | 70 | engineName += "." + deviceName; 71 | 72 | // Serialize the specified options into the filename 73 | if (options.precision == Precision::FP16) { 74 | engineName += ".fp16"; 75 | } else if (options.precision == Precision::FP32) { 76 | engineName += ".fp32"; 77 | } else { 78 | engineName += ".int8"; 79 | } 80 | 81 | engineName += "." + std::to_string(options.maxBatchSize); 82 | engineName += "." + std::to_string(options.optBatchSize); 83 | engineName += "." + std::to_string(options.minInputWidth); 84 | engineName += "." + std::to_string(options.optInputWidth); 85 | engineName += "." 
+ std::to_string(options.maxInputWidth); 86 | 87 | spdlog::info("Engine name: {}", engineName); 88 | return engineName; 89 | } 90 | 91 | template 92 | cv::cuda::GpuMat Engine::blobFromGpuMats(const std::vector &batchInput, const std::array &subVals, 93 | const std::array &divVals, bool normalize, bool swapRB) { 94 | 95 | CHECK(!batchInput.empty()) 96 | CHECK(batchInput[0].channels() == 3) 97 | 98 | cv::cuda::GpuMat gpu_dst(1, batchInput[0].rows * batchInput[0].cols * batchInput.size(), CV_8UC3); 99 | 100 | size_t width = batchInput[0].cols * batchInput[0].rows; 101 | if (swapRB) { 102 | for (size_t img = 0; img < batchInput.size(); ++img) { 103 | std::vector input_channels{ 104 | cv::cuda::GpuMat(batchInput[0].rows, batchInput[0].cols, CV_8U, &(gpu_dst.ptr()[width * 2 + width * 3 * img])), 105 | cv::cuda::GpuMat(batchInput[0].rows, batchInput[0].cols, CV_8U, &(gpu_dst.ptr()[width + width * 3 * img])), 106 | cv::cuda::GpuMat(batchInput[0].rows, batchInput[0].cols, CV_8U, &(gpu_dst.ptr()[0 + width * 3 * img]))}; 107 | cv::cuda::split(batchInput[img], input_channels); // HWC -> CHW 108 | } 109 | } else { 110 | for (size_t img = 0; img < batchInput.size(); ++img) { 111 | std::vector input_channels{ 112 | cv::cuda::GpuMat(batchInput[0].rows, batchInput[0].cols, CV_8U, &(gpu_dst.ptr()[0 + width * 3 * img])), 113 | cv::cuda::GpuMat(batchInput[0].rows, batchInput[0].cols, CV_8U, &(gpu_dst.ptr()[width + width * 3 * img])), 114 | cv::cuda::GpuMat(batchInput[0].rows, batchInput[0].cols, CV_8U, &(gpu_dst.ptr()[width * 2 + width * 3 * img]))}; 115 | cv::cuda::split(batchInput[img], input_channels); // HWC -> CHW 116 | } 117 | } 118 | cv::cuda::GpuMat mfloat; 119 | if (normalize) { 120 | // [0.f, 1.f] 121 | gpu_dst.convertTo(mfloat, CV_32FC3, 1.f / 255.f); 122 | } else { 123 | // [0.f, 255.f] 124 | gpu_dst.convertTo(mfloat, CV_32FC3); 125 | } 126 | 127 | // Apply scaling and mean subtraction 128 | cv::cuda::subtract(mfloat, cv::Scalar(subVals[0], subVals[1], subVals[2]), mfloat, cv::noArray(), -1); 129 | cv::cuda::divide(mfloat, cv::Scalar(divVals[0], divVals[1], divVals[2]), mfloat, 1, -1); 130 | 131 | return mfloat; 132 | } 133 | 134 | template void Engine::clearGpuBuffers() { 135 | if (!m_buffers.empty()) { 136 | // Free GPU memory of outputs 137 | const auto numInputs = m_inputDims.size(); 138 | for (int32_t outputBinding = numInputs; outputBinding < m_engine->getNbIOTensors(); ++outputBinding) { 139 | Util::checkCudaErrorCode(cudaFree(m_buffers[outputBinding])); 140 | } 141 | m_buffers.clear(); 142 | } 143 | } 144 | -------------------------------------------------------------------------------- /src/main.cpp: -------------------------------------------------------------------------------- 1 | #include "cmd_line_parser.h" 2 | #include "logger.h" 3 | #include "engine.h" 4 | #include 5 | #include 6 | #include 7 | 8 | int main(int argc, char *argv[]) { 9 | CommandLineArguments arguments; 10 | 11 | std::string logLevelStr = getLogLevelFromEnvironment(); 12 | spdlog::level::level_enum logLevel = toSpdlogLevel(logLevelStr); 13 | spdlog::set_level(logLevel); 14 | 15 | // Parse the command line arguments 16 | if (!parseArguments(argc, argv, arguments)) { 17 | return -1; 18 | } 19 | 20 | // Specify our GPU inference configuration options 21 | Options options; 22 | // Specify what precision to use for inference 23 | // FP16 is approximately twice as fast as FP32. 24 | options.precision = Precision::FP16; 25 | // If using INT8 precision, must specify path to directory containing 26 | // calibration data. 
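    // Illustrative INT8 setup (hypothetical calibration path; not enabled by default):
    //   options.precision = Precision::INT8;
    //   options.calibrationDataDirectoryPath = "/path/to/coco/val2017/";
    //   options.calibrationBatchSize = 32; // lower this if calibration runs out of GPU memory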
27 | options.calibrationDataDirectoryPath = ""; 28 | // Specify the batch size to optimize for. 29 | options.optBatchSize = 1; 30 | // Specify the maximum batch size we plan on running. 31 | options.maxBatchSize = 1; 32 | // Specify the directory where you want the model engine model file saved. 33 | options.engineFileDir = "."; 34 | 35 | Engine engine(options); 36 | 37 | // Define our preprocessing code 38 | // The default Engine::build method will normalize values between [0.f, 1.f] 39 | // Setting the normalize flag to false will leave values between [0.f, 255.f] 40 | // (some converted models may require this). 41 | 42 | // For our YoloV8 model, we need the values to be normalized between 43 | // [0.f, 1.f] so we use the following params 44 | std::array subVals{0.f, 0.f, 0.f}; 45 | std::array divVals{1.f, 1.f, 1.f}; 46 | bool normalize = true; 47 | // Note, we could have also used the default values. 48 | 49 | // If the model requires values to be normalized between [-1.f, 1.f], use the 50 | // following params: 51 | // subVals = {0.5f, 0.5f, 0.5f}; 52 | // divVals = {0.5f, 0.5f, 0.5f}; 53 | // normalize = true; 54 | 55 | if (!arguments.onnxModelPath.empty()) { 56 | // Build the onnx model into a TensorRT engine file, and load the TensorRT 57 | // engine file into memory. 58 | bool succ = engine.buildLoadNetwork(arguments.onnxModelPath, subVals, divVals, normalize); 59 | if (!succ) { 60 | throw std::runtime_error("Unable to build or load TensorRT engine."); 61 | } 62 | } else { 63 | // Load the TensorRT engine file directly 64 | bool succ = engine.loadNetwork(arguments.trtModelPath, subVals, divVals, normalize); 65 | if (!succ) { 66 | const std::string msg = "Unable to load TensorRT engine."; 67 | spdlog::error(msg); 68 | throw std::runtime_error(msg); 69 | } 70 | } 71 | 72 | // Read the input image 73 | // TODO: You will need to read the input image required for your model 74 | const std::string inputImage = "../inputs/team.jpg"; 75 | auto cpuImg = cv::imread(inputImage); 76 | if (cpuImg.empty()) { 77 | const std::string msg = "Unable to read image at path: " + inputImage; 78 | spdlog::error(msg); 79 | throw std::runtime_error(msg); 80 | } 81 | 82 | // Upload the image GPU memory 83 | cv::cuda::GpuMat img; 84 | img.upload(cpuImg); 85 | 86 | // The model expects RGB input 87 | cv::cuda::cvtColor(img, img, cv::COLOR_BGR2RGB); 88 | 89 | // In the following section we populate the input vectors to later pass for 90 | // inference 91 | const auto &inputDims = engine.getInputDims(); 92 | std::vector> inputs; 93 | 94 | // Let's use a batch size which matches that which we set the 95 | // Options.optBatchSize option 96 | size_t batchSize = options.optBatchSize; 97 | 98 | // TODO: 99 | // For the sake of the demo, we will be feeding the same image to all the 100 | // inputs You should populate your inputs appropriately. 101 | for (const auto &inputDim : inputDims) { // For each of the model inputs... 102 | std::vector input; 103 | for (size_t j = 0; j < batchSize; ++j) { // For each element we want to add to the batch... 104 | // TODO: 105 | // You can choose to resize by scaling, adding padding, or a combination 106 | // of the two in order to maintain the aspect ratio You can use the 107 | // Engine::resizeKeepAspectRatioPadRightBottom to resize to a square while 108 | // maintain the aspect ratio (adds padding where necessary to achieve 109 | // this). 
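            // Note: inputDim here is (channels, height, width) with the batch dimension stripped,
            // so d[1] is the height and d[2] the width passed to the resize helper below.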
110 | auto resized = Engine::resizeKeepAspectRatioPadRightBottom(img, inputDim.d[1], inputDim.d[2]); 111 | // You could also perform a resize operation without maintaining aspect 112 | // ratio with the use of padding by using the following instead: 113 | // cv::cuda::resize(img, resized, cv::Size(inputDim.d[2], 114 | // inputDim.d[1])); // TRT dims are (height, width) whereas 115 | // OpenCV is (width, height) 116 | input.emplace_back(std::move(resized)); 117 | } 118 | inputs.emplace_back(std::move(input)); 119 | } 120 | 121 | // Warm up the network before we begin the benchmark 122 | spdlog::info("Warming up the network..."); 123 | std::vector>> featureVectors; 124 | for (int i = 0; i < 100; ++i) { 125 | bool succ = engine.runInference(inputs, featureVectors); 126 | if (!succ) { 127 | const std::string msg = "Unable to run inference."; 128 | spdlog::error(msg); 129 | throw std::runtime_error(msg); 130 | } 131 | } 132 | 133 | // Benchmark the inference time 134 | size_t numIterations = 1000; 135 | spdlog::info("Running benchmarks ({} iterations)...", numIterations); 136 | preciseStopwatch stopwatch; 137 | for (size_t i = 0; i < numIterations; ++i) { 138 | featureVectors.clear(); 139 | engine.runInference(inputs, featureVectors); 140 | } 141 | auto totalElapsedTimeMs = stopwatch.elapsedTime(); 142 | auto avgElapsedTimeMs = totalElapsedTimeMs / numIterations / static_cast(inputs[0].size()); 143 | 144 | spdlog::info("Benchmarking complete!"); 145 | spdlog::info("======================"); 146 | spdlog::info("Avg time per sample: "); 147 | spdlog::info("Avg time per sample: {} ms", avgElapsedTimeMs); 148 | spdlog::info("Batch size: {}", inputs[0].size()); 149 | spdlog::info("Avg FPS: {} fps", static_cast(1000 / avgElapsedTimeMs)); 150 | spdlog::info("======================\n"); 151 | 152 | // Print the feature vectors 153 | for (size_t batch = 0; batch < featureVectors.size(); ++batch) { 154 | for (size_t outputNum = 0; outputNum < featureVectors[batch].size(); ++outputNum) { 155 | spdlog::info("Batch {}, output {}", batch, outputNum); 156 | std::string output; 157 | int i = 0; 158 | for (const auto &e : featureVectors[batch][outputNum]) { 159 | output += std::to_string(e) + " "; 160 | if (++i == 10) { 161 | output += "..."; 162 | break; 163 | } 164 | } 165 | spdlog::info("{}", output); 166 | } 167 | } 168 | 169 | // TODO: If your model requires post processing (ex. convert feature vector 170 | // into bounding boxes) then you would do so here. 171 | 172 | return 0; 173 | } 174 | -------------------------------------------------------------------------------- /src/engine.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "NvOnnxParser.h" 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #include "IEngine.h" 15 | #include "logger.h" 16 | #include "Int8Calibrator.h" 17 | #include "util/Util.h" 18 | #include "util/Stopwatch.h" 19 | #include "macros.h" 20 | 21 | // Precision used for GPU inference 22 | enum class Precision { 23 | // Full precision floating point value 24 | FP32, 25 | // Half prevision floating point value 26 | FP16, 27 | // Int8 quantization. 28 | // Has reduced dynamic range, may result in slight loss in accuracy. 29 | // If INT8 is selected, must provide path to calibration dataset directory. 30 | INT8, 31 | }; 32 | 33 | // Options for the network 34 | struct Options { 35 | // Precision to use for GPU inference. 
36 | Precision precision = Precision::FP16; 37 | // If INT8 precision is selected, must provide path to calibration dataset 38 | // directory. 39 | std::string calibrationDataDirectoryPath; 40 | // The batch size to be used when computing calibration data for INT8 41 | // inference. Should be set to as large a batch number as your GPU will 42 | // support. 43 | int32_t calibrationBatchSize = 128; 44 | // The batch size which should be optimized for. 45 | int32_t optBatchSize = 1; 46 | // Maximum allowable batch size 47 | int32_t maxBatchSize = 16; 48 | // GPU device index 49 | int deviceIndex = 0; 50 | // Directory where the engine file should be saved 51 | std::string engineFileDir = "."; 52 | // Maximum allowed input width 53 | int32_t maxInputWidth = -1; // Default to -1 --> expecting fixed input size 54 | // Minimum allowed input width 55 | int32_t minInputWidth = -1; // Default to -1 --> expecting fixed input size 56 | // Optimal input width 57 | int32_t optInputWidth = -1; // Default to -1 --> expecting fixed input size 58 | }; 59 | 60 | // Class to extend TensorRT logger 61 | class Logger : public nvinfer1::ILogger { 62 | void log(Severity severity, const char *msg) noexcept override; 63 | }; 64 | 65 | template 66 | class Engine : public IEngine { 67 | public: 68 | Engine(const Options &options); 69 | ~Engine(); 70 | 71 | // Build the onnx model into a TensorRT engine file, cache the model to disk 72 | // (to avoid rebuilding in future), and then load the model into memory The 73 | // default implementation will normalize values between [0.f, 1.f] Setting the 74 | // normalize flag to false will leave values between [0.f, 255.f] (some 75 | // converted models may require this). If the model requires values to be 76 | // normalized between [-1.f, 1.f], use the following params: 77 | // subVals = {0.5f, 0.5f, 0.5f}; 78 | // divVals = {0.5f, 0.5f, 0.5f}; 79 | // normalize = true; 80 | bool buildLoadNetwork(std::string onnxModelPath, const std::array &subVals = {0.f, 0.f, 0.f}, 81 | const std::array &divVals = {1.f, 1.f, 1.f}, bool normalize = true) override; 82 | 83 | // Load a TensorRT engine file from disk into memory 84 | // The default implementation will normalize values between [0.f, 1.f] 85 | // Setting the normalize flag to false will leave values between [0.f, 255.f] 86 | // (some converted models may require this). If the model requires values to 87 | // be normalized between [-1.f, 1.f], use the following params: 88 | // subVals = {0.5f, 0.5f, 0.5f}; 89 | // divVals = {0.5f, 0.5f, 0.5f}; 90 | // normalize = true; 91 | bool loadNetwork(std::string trtModelPath, const std::array &subVals = {0.f, 0.f, 0.f}, 92 | const std::array &divVals = {1.f, 1.f, 1.f}, bool normalize = true) override; 93 | 94 | // Run inference. 95 | // Input format [input][batch][cv::cuda::GpuMat] 96 | // Output format [batch][output][feature_vector] 97 | bool runInference(const std::vector> &inputs, std::vector>> &featureVectors) override; 98 | 99 | // Utility method for resizing an image while maintaining the aspect ratio by 100 | // adding padding to smaller dimension after scaling While letterbox padding 101 | // normally adds padding to top & bottom, or left & right sides, this 102 | // implementation only adds padding to the right or bottom side This is done 103 | // so that it's easier to convert detected coordinates (ex. YOLO model) back 104 | // to the original reference frame. 
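    // Illustrative call (640x640 is just an example target size, not a requirement):
    //   cv::cuda::GpuMat letterboxed = Engine<float>::resizeKeepAspectRatioPadRightBottom(gpuImg, 640, 640);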
105 | static cv::cuda::GpuMat resizeKeepAspectRatioPadRightBottom(const cv::cuda::GpuMat &input, size_t height, size_t width, 106 | const cv::Scalar &bgcolor = cv::Scalar(0, 0, 0)); 107 | 108 | [[nodiscard]] const std::vector &getInputDims() const override { return m_inputDims; }; 109 | [[nodiscard]] const std::vector &getOutputDims() const override { return m_outputDims; }; 110 | 111 | // Utility method for transforming triple nested output array into 2D array 112 | // Should be used when the output batch size is 1, but there are multiple 113 | // output feature vectors 114 | static void transformOutput(std::vector>> &input, std::vector> &output); 115 | 116 | // Utility method for transforming triple nested output array into single 117 | // array Should be used when the output batch size is 1, and there is only a 118 | // single output feature vector 119 | static void transformOutput(std::vector>> &input, std::vector &output); 120 | // Convert NHWC to NCHW and apply scaling and mean subtraction 121 | static cv::cuda::GpuMat blobFromGpuMats(const std::vector &batchInput, const std::array &subVals, 122 | const std::array &divVals, bool normalize, bool swapRB = false); 123 | 124 | private: 125 | // Build the network 126 | bool build(std::string onnxModelPath, const std::array &subVals, const std::array &divVals, bool normalize); 127 | 128 | // Converts the engine options into a string 129 | std::string serializeEngineOptions(const Options &options, const std::string &onnxModelPath); 130 | 131 | void getDeviceNames(std::vector &deviceNames); 132 | 133 | void clearGpuBuffers(); 134 | 135 | // Normalization, scaling, and mean subtraction of inputs 136 | std::array m_subVals{}; 137 | std::array m_divVals{}; 138 | bool m_normalize; 139 | 140 | // Holds pointers to the input and output GPU buffers 141 | std::vector m_buffers; 142 | std::vector m_outputLengths{}; 143 | std::vector m_inputDims; 144 | std::vector m_outputDims; 145 | std::vector m_IOTensorNames; 146 | int32_t m_inputBatchSize; 147 | 148 | // Must keep IRuntime around for inference, see: 149 | // https://forums.developer.nvidia.com/t/is-it-safe-to-deallocate-nvinfer1-iruntime-after-creating-an-nvinfer1-icudaengine-but-before-running-inference-with-said-icudaengine/255381/2?u=cyruspk4w6 150 | std::unique_ptr m_runtime = nullptr; 151 | std::unique_ptr m_calibrator = nullptr; 152 | std::unique_ptr m_engine = nullptr; 153 | std::unique_ptr m_context = nullptr; 154 | const Options m_options; 155 | Logger m_logger; 156 | }; 157 | 158 | template Engine::Engine(const Options &options) : m_options(options) {} 159 | 160 | template Engine::~Engine() { clearGpuBuffers(); } 161 | 162 | // Include inline implementations 163 | #include "engine/EngineRunInference.inl" 164 | #include "engine/EngineUtilities.inl" 165 | #include "engine/EngineBuildLoadNetwork.inl" 166 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Stargazers][stars-shield]][stars-url] 2 | 3 | [![All Contributors](https://img.shields.io/badge/all_contributors-3-orange.svg?style=flat-square)](#contributors-) 4 | 5 | [![Issues][issues-shield]][issues-url] 6 | [![LinkedIn][linkedin-shield]][linkedin-url] 7 | 8 | 9 |
10 | 11 | 12 | logo 13 | 14 | 15 | TensorRT C++ API Tutorial 16 | 17 | 18 | 19 | How to use TensorRT C++ API for high performance GPU machine-learning inference. 20 | 21 | 22 | Supports models with single / multiple inputs and single / multiple outputs with batching. 23 | 24 | 25 | Project Overview Video 26 | . 27 | Code Deep-Dive Video 28 | 29 |
30 | 31 | ## Looking for Maintainers 🚀 32 | 33 | This project is actively seeking maintainers to help guide its growth and improvement. If you're passionate about this project and interested in contributing, I’d love to hear from you! 34 | 35 | Please feel free to reach out via [LinkedIn](https://www.linkedin.com/in/cyrus-behroozi/) to discuss how you can get involved. 36 | 37 | 38 | # TensorRT C++ Tutorial 39 | *I read all the NVIDIA TensorRT docs so that you don't have to!* 40 | 41 | This project demonstrates how to use the TensorRT C++ API for high performance GPU inference on image data. It covers how to do the following: 42 | - How to install TensorRT 10 on Ubuntu 20.04 / 22.04. 43 | - How to generate a TensorRT engine file optimized for your GPU. 44 | - How to specify a simple optimization profile. 45 | - How to run FP32, FP16, or INT8 precision inference. 46 | - How to read / write data from / into GPU memory and work with GPU images. 47 | - How to use cuda stream to run async inference and later synchronize. 48 | - How to work with models with static and dynamic batch sizes. 49 | - How to work with models with single or multiple output tensors. 50 | - How to work with models with multiple inputs. 51 | - Includes a [Video walkthrough](https://youtu.be/Z0n5aLmcRHQ) where I explain every line of code. 52 | - The code can be used as a base for any model which takes a fixed size image / images as input, including [Insightface](https://github.com/deepinsight/insightface) [ArcFace](https://github.com/onnx/models/tree/main/vision/body_analysis/arcface), [YoloV8](https://github.com/ultralytics/ultralytics), [SCRFD](https://insightface.ai/scrfd) face detection. 53 | - You will just need to implement the appropriate post-processing code. 54 | - TODO: Add support for models with dynamic input shapes. 55 | - TODO: Add support for Windows 56 | 57 | ## Getting Started 58 | The following instructions assume you are using Ubuntu 20.04 or 22.04. 59 | You will need to supply your own onnx model for this sample code or you can download the sample model (see Sanity Check section below). 60 | 61 | ### Prerequisites 62 | - Tested and working on Ubuntu 20.04 and 22.04 (Windows is **not** supported at this time) 63 | - Install CUDA 11 or 12, instructions [here](https://developer.nvidia.com/cuda-downloads). 64 | - Recommended >= 12.0 65 | - Required >= 11.0 66 | - Install cudnn, instructions [here](https://docs.nvidia.com/deeplearning/cudnn/install-guide/index.html#download). 67 | - Required >= 8 68 | - Required < 9 (OpenCV GPU does not yet support) 69 | - `sudo apt install build-essential` 70 | - `sudo snap install cmake --classic` 71 | - `sudo apt install libspdlog-dev libfmt-dev` (for logging) 72 | - Install OpenCV with cuda support. To compile OpenCV from source, run the `build_opencv.sh` script provided in `./scripts/`. 73 | - If you use the provided script and you have installed cuDNN to a non-standard location, you must modify the `CUDNN_INCLUDE_DIR` and `CUDNN_LIBRARY` variables in the script. 74 | - Recommended >= 4.8 75 | - Download TensorRT 10 from [here](https://developer.nvidia.com/tensorrt/download/10x). 76 | - Required >= 10.0 77 | - Navigate to the `CMakeLists.txt` file and replace the `TODO` with the path to your TensorRT installation. 78 | 79 | ### Building the Library 80 | - `mkdir build` 81 | - `cd build` 82 | - `cmake ..` 83 | - `make -j$(nproc)` 84 | 85 | ### Running the Executable 86 | - Navigate to the build directory 87 | - Run the executable and provide the path to your onnx model. 
## Getting Started
The following instructions assume you are using Ubuntu 20.04 or 22.04.
You will need to supply your own onnx model for this sample code, or you can download the sample model (see the Sanity Check section below).

### Prerequisites
- Tested and working on Ubuntu 20.04 and 22.04 (Windows is **not** supported at this time).
- Install CUDA 11 or 12, instructions [here](https://developer.nvidia.com/cuda-downloads).
  - Recommended >= 12.0
  - Required >= 11.0
- Install cuDNN, instructions [here](https://docs.nvidia.com/deeplearning/cudnn/install-guide/index.html#download).
  - Required >= 8
  - Required < 9 (the OpenCV CUDA modules do not yet support cuDNN 9)
- `sudo apt install build-essential`
- `sudo snap install cmake --classic`
- `sudo apt install libspdlog-dev libfmt-dev` (for logging)
- Install OpenCV with CUDA support. To compile OpenCV from source, run the `build_opencv.sh` script provided in `./scripts/`.
  - If you use the provided script and have installed cuDNN to a non-standard location, you must modify the `CUDNN_INCLUDE_DIR` and `CUDNN_LIBRARY` variables in the script.
  - Recommended >= 4.8
- Download TensorRT 10 from [here](https://developer.nvidia.com/tensorrt/download/10x).
  - Required >= 10.0
- Navigate to the `CMakeLists.txt` file and replace the `TODO` with the path to your TensorRT installation.

### Building the Library
- `mkdir build`
- `cd build`
- `cmake ..`
- `make -j$(nproc)`

### Running the Executable
- Navigate to the build directory.
- Run the executable and provide the path to your onnx model.
  - ex. `./run_inference_benchmark --onnx_model ../models/yolov8n.onnx`
  - Note: See the Sanity Check section below for instructions on how to obtain the yolov8n model.
- The first time you run the executable for a given model and set of options, a TensorRT engine file will be built from your onnx model. This process is fairly slow and can take 5+ minutes for some models (ex. yolo models).
- Alternatively, you can supply your own TensorRT engine file directly:
  - ex. `./run_inference_benchmark --trt_model ../models/yolov8n.engine.NVIDIAGeForceRTX3080LaptopGPU.fp16.1.1`
  - Note: See the V5.0 changelog below for warnings when supplying your own TensorRT engine file.

### Sanity Check
- To perform a sanity check, download the `YOLOv8n` model from [here](https://github.com/ultralytics/ultralytics#models).
- Next, convert it from pytorch to onnx using the following script (you will need to run `pip3 install ultralytics` first):

```python
from ultralytics import YOLO
model = YOLO("./yolov8n.pt")
model.fuse()
model.info(verbose=False)  # Print model information
model.export(format="onnx", opset=12)  # Export the model to onnx using opset 12
```

- Place the resulting onnx model, `yolov8n.onnx`, in the `./models/` directory.
- Running inference using said model and the image located in `./inputs/team.jpg` should produce the following feature vector:
  - Note: The feature vector will not be identical (though it will be very similar), as [TensorRT is not deterministic](https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#determinism).
```text
3.41113 16.5312 20.8828 29.8984 43.7266 54.9609 62.0625 65.8594 70.0312 72.9531 ...
```

### INT8 Inference
Enabling INT8 precision can further speed up inference, at the cost of some accuracy due to the reduced dynamic range.
For INT8 precision, you must supply calibration data which is representative of the real data the model will see.
It is advised to use 1K+ calibration images. To enable INT8 inference with the YoloV8 sanity check model, take the following steps (a sketch of the relevant `Options` changes follows this list):
- Change `options.precision = Precision::FP16;` to `options.precision = Precision::INT8;` in `main.cpp`.
- Change `options.calibrationDataDirectoryPath = "";` in `main.cpp` to specify the path to the directory containing your calibration data.
  - If using the YoloV8 model, it is advised to use the COCO validation dataset, which can be downloaded with `wget http://images.cocodataset.org/zips/val2017.zip`.
- Make sure the resizing code in the `Int8EntropyCalibrator2::getBatch` method in `engine.cpp` (see `TODO`) is correct for your model.
  - If using the YoloV8 model, the preprocessing code is correct and does not need to be changed.
- Recompile and run the executable.
- The calibration cache will be written to disk (`.calibration` extension) so that it can be reused on subsequent model optimizations. If you'd like to regenerate the calibration data, you must delete this cache file.
- If you get an "out of memory in function allocate" error, you must reduce `Options.calibrationBatchSize` so that the entire batch fits in your GPU memory.
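As a concrete illustration of the two `main.cpp` edits above, an INT8 configuration might look roughly like the sketch below. The field names (`precision`, `calibrationDataDirectoryPath`, `calibrationBatchSize`, `optBatchSize`, `maxBatchSize`) come from this README; the directory path and batch size are placeholders you should adapt to your setup.

```cpp
#include "engine.h"

// Illustrative INT8 configuration (paths and batch sizes are placeholders).
Options makeInt8Options() {
    Options options;
    options.precision = Precision::INT8;
    // Directory of representative calibration images, e.g. the unzipped COCO val2017 set.
    options.calibrationDataDirectoryPath = "/path/to/val2017";
    // Reduce this if you hit the "out of memory in function allocate" error.
    options.calibrationBatchSize = 32;
    options.optBatchSize = 1;
    options.maxBatchSize = 1;
    return options;
}
```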
### Benchmarks
Benchmarks were run on an RTX 3050 Ti Laptop GPU with an 11th Gen Intel(R) Core(TM) i9-11900H @ 2.50GHz.

| Model   | Precision | Batch Size | Avg Inference Time |
|---------|-----------|------------|--------------------|
| yolov8n | FP32      | 1          | 4.732 ms           |
| yolov8n | FP16      | 1          | 2.493 ms           |
| yolov8n | INT8      | 1          | 2.009 ms           |
| yolov8x | FP32      | 1          | 76.63 ms           |
| yolov8x | FP16      | 1          | 25.08 ms           |
| yolov8x | INT8      | 1          | 11.62 ms           |

### Sample Integration
Wondering how to integrate this library into your project? Or perhaps how to read the outputs of the YoloV8 model to extract meaningful information? If so, check out my two latest projects, [YOLOv8-TensorRT-CPP](https://github.com/cyrusbehr/YOLOv8-TensorRT-CPP) and [YOLOv9-TensorRT-CPP](https://github.com/cyrusbehr/YOLOv9-TensorRT-CPP), which demonstrate how to use the TensorRT C++ API to run YoloV8/9 inference (supporting object detection, semantic segmentation, and body pose estimation). They use this project in the backend!

### Project Structure
```sh
project-root/
├── include/
│   ├── engine/
│   │   ├── EngineRunInference.inl
│   │   ├── EngineUtilities.inl
│   │   └── EngineBuildLoadNetwork.inl
│   ├── util/...
│   ├── ...
├── src/
│   ├── ...
│   ├── engine.cpp
│   ├── engine.h
│   └── main.cpp
├── CMakeLists.txt
└── README.md
```

### Understanding the Code
- The bulk of the implementation is located in `include/engine`. I have written plenty of comments throughout the code which should make it easy to follow what is going on.
- The inference code is located in `include/engine/EngineRunInference.inl`.
- The building and loading of the TensorRT engine file is located in `include/engine/EngineBuildLoadNetwork.inl`.
- You can also check out my [deep-dive video](https://youtu.be/Z0n5aLmcRHQ) in which I explain every line of code.

### How to Debug
- The implementation uses the `spdlog` library for logging. You can change the log level by setting the environment variable `LOG_LEVEL` to one of the following values: `trace`, `debug`, `info`, `warn`, `error`, `critical`, `off` (see the sketch below for how this maps onto spdlog).
- If you have issues creating the TensorRT engine file from the onnx model, consider setting `LOG_LEVEL` to `trace` and re-running the application. This should give you more information on where exactly the build process is failing.
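For reference, the mechanism behind `LOG_LEVEL` is roughly the following. This is an illustrative sketch rather than the project's exact code: read the environment variable and hand it to spdlog's global level setter.

```cpp
#include <cstdlib>
#include <string>
#include <spdlog/spdlog.h>

// Illustrative sketch (not the project's exact code): map the LOG_LEVEL
// environment variable onto spdlog's global log level.
void setLogLevelFromEnvironment() {
    const char *env = std::getenv("LOG_LEVEL");
    if (env == nullptr) {
        return; // keep spdlog's default level (info)
    }
    // spdlog::level::from_str accepts the level names listed above.
    spdlog::set_level(spdlog::level::from_str(std::string(env)));
}
```

A function like this would typically be called at the start of `main()`, before any log output is produced.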
### Show your Appreciation
If this project was helpful to you, I would appreciate it if you could give it a star. That will encourage me to keep it up to date and solve issues quickly. I also do consulting work if you require more specific help. Connect with me on [LinkedIn](https://www.linkedin.com/in/cyrus-behroozi/).

### Contributors

- Loic Tetrel (💻 code)
- thomaskleiven (💻 code)
- WiCyn (💻 code)

### Changelog

**V6.0**

- Implementation now requires TensorRT >= 10.0.

**V5.0**

- The `Engine` class has been modified to take a template parameter which specifies the model's output data type. The implementation now supports outputs of type `float`, `__half`, `int8_t`, `int32_t`, `bool`, and `uint8_t`.
- Added support for loading a TensorRT engine file directly, without needing to compile it from an onnx model. However, it is highly recommended that you use the provided API to build the engine file from the onnx model rather than loading a TensorRT model directly. If you choose to load a TensorRT model file directly, you must hand-check that the `Options` have been set correctly for your model (for example, if your model has been compiled for FP32 but you try running FP16 inference, it will fail, potentially without a verbose error).
- Added a command line parser.

**V4.1**

- Added support for fixed batch size > 1.

**V4.0**

- Added support for INT8 precision.

**V3.0**

- Implementation has been updated to use the TensorRT 8.6 API (ex. `IExecutionContext::enqueueV3()`).
- The executable has been renamed from `driver` to `run_inference_benchmark` and must now be passed the path to the onnx model as a command line argument.
- Removed `Options.doesSupportDynamicBatchSize`. The implementation now auto-detects supported batch sizes.
- Removed `Options.maxWorkspaceSize`. The implementation no longer limits GPU memory during model construction, allowing it to use as much of the memory pool as is available for intermediate layers.

**V2.2**

- Serialize the model name as part of the engine file.

**V2.1**

- Added support for models with multiple inputs. The implementation now supports models with single inputs, multiple inputs, single outputs, multiple outputs, and batching.

**V2.0**

- Requires OpenCV CUDA to be installed. To install, follow the instructions [here](https://gist.github.com/raulqf/f42c718a658cddc16f9df07ecc627be7).
- `Options.optBatchSizes` has been removed, replaced by `Options.optBatchSize`.
- Support for models with more than a single output (ex. SCRFD).
- Added support for models which do not support batch inference (first input dimension is fixed).
- More error checking.
- Fixed a bunch of common issues people were running into with the original V1.0 version.
- Remove whitespace from the GPU device name.

[stars-shield]: https://img.shields.io/github/stars/cyrusbehr/tensorrt-cpp-api.svg?style=flat-square
[stars-url]: https://github.com/cyrusbehr/tensorrt-cpp-api/stargazers
[issues-shield]: https://img.shields.io/github/issues/cyrusbehr/tensorrt-cpp-api.svg?style=flat-square
[issues-url]: https://github.com/cyrusbehr/tensorrt-cpp-api/issues
[linkedin-shield]: https://img.shields.io/badge/-LinkedIn-black.svg?style=flat-square&logo=linkedin&colorB=555
[linkedin-url]: https://linkedin.com/in/cyrus-behroozi/

## Contributors ✨

Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/docs/en/emoji-key)):

This project follows the [all-contributors](https://github.com/all-contributors/all-contributors) specification. Contributions of any kind welcome!
264 | -------------------------------------------------------------------------------- /include/engine/EngineBuildLoadNetwork.inl: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include "util/Util.h" 5 | 6 | template 7 | bool Engine::buildLoadNetwork(std::string onnxModelPath, const std::array &subVals, const std::array &divVals, 8 | bool normalize) { 9 | const auto engineName = serializeEngineOptions(m_options, onnxModelPath); 10 | const auto engineDir = std::filesystem::path(m_options.engineFileDir); 11 | std::filesystem::path enginePath = engineDir / engineName; 12 | spdlog::info("Searching for engine file with name: {}", enginePath.string()); 13 | 14 | if (Util::doesFileExist(enginePath)) { 15 | spdlog::info("Engine found, not regenerating..."); 16 | } else { 17 | if (!Util::doesFileExist(onnxModelPath)) { 18 | auto msg = "Could not find ONNX model at path: " + onnxModelPath; 19 | spdlog::error(msg); 20 | throw std::runtime_error(msg); 21 | } 22 | 23 | spdlog::info("Engine not found, generating. This could take a while..."); 24 | if (!std::filesystem::exists(engineDir)) { 25 | std::filesystem::create_directories(engineDir); 26 | spdlog::info("Created directory: {}", engineDir.string()); 27 | } 28 | 29 | auto ret = build(onnxModelPath, subVals, divVals, normalize); 30 | if (!ret) { 31 | return false; 32 | } 33 | } 34 | 35 | return loadNetwork(enginePath, subVals, divVals, normalize); 36 | } 37 | 38 | template 39 | bool Engine::loadNetwork(std::string trtModelPath, const std::array &subVals, const std::array &divVals, 40 | bool normalize) { 41 | m_subVals = subVals; 42 | m_divVals = divVals; 43 | m_normalize = normalize; 44 | 45 | // Read the serialized model from disk 46 | if (!Util::doesFileExist(trtModelPath)) { 47 | auto msg = "Error, unable to read TensorRT model at path: " + trtModelPath; 48 | spdlog::error(msg); 49 | return false; 50 | } else { 51 | auto msg = "Loading TensorRT engine file at path: " + trtModelPath; 52 | spdlog::info(msg); 53 | } 54 | 55 | std::ifstream file(trtModelPath, std::ios::binary | std::ios::ate); 56 | std::streamsize size = file.tellg(); 57 | file.seekg(0, std::ios::beg); 58 | 59 | std::vector buffer(size); 60 | if (!file.read(buffer.data(), size)) { 61 | auto msg = "Error, unable to read engine file"; 62 | spdlog::error(msg); 63 | throw std::runtime_error(msg); 64 | } 65 | 66 | // Create a runtime to deserialize the engine file. 67 | m_runtime = std::unique_ptr{nvinfer1::createInferRuntime(m_logger)}; 68 | if (!m_runtime) { 69 | return false; 70 | } 71 | 72 | // Set the device index 73 | auto ret = cudaSetDevice(m_options.deviceIndex); 74 | if (ret != 0) { 75 | int numGPUs; 76 | cudaGetDeviceCount(&numGPUs); 77 | auto errMsg = "Unable to set GPU device index to: " + std::to_string(m_options.deviceIndex) + ". Note, your device has " + 78 | std::to_string(numGPUs) + " CUDA-capable GPU(s)."; 79 | spdlog::error(errMsg); 80 | throw std::runtime_error(errMsg); 81 | } 82 | 83 | // Create an engine, a representation of the optimized model. 
84 | m_engine = std::unique_ptr(m_runtime->deserializeCudaEngine(buffer.data(), buffer.size())); 85 | if (!m_engine) { 86 | return false; 87 | } 88 | 89 | // The execution context contains all of the state associated with a 90 | // particular invocation 91 | m_context = std::unique_ptr(m_engine->createExecutionContext()); 92 | if (!m_context) { 93 | return false; 94 | } 95 | 96 | // Storage for holding the input and output buffers 97 | // This will be passed to TensorRT for inference 98 | clearGpuBuffers(); 99 | m_buffers.resize(m_engine->getNbIOTensors()); 100 | 101 | m_outputLengths.clear(); 102 | m_inputDims.clear(); 103 | m_outputDims.clear(); 104 | m_IOTensorNames.clear(); 105 | 106 | // Create a cuda stream 107 | cudaStream_t stream; 108 | Util::checkCudaErrorCode(cudaStreamCreate(&stream)); 109 | 110 | // Allocate GPU memory for input and output buffers 111 | m_outputLengths.clear(); 112 | for (int i = 0; i < m_engine->getNbIOTensors(); ++i) { 113 | const auto tensorName = m_engine->getIOTensorName(i); 114 | m_IOTensorNames.emplace_back(tensorName); 115 | const auto tensorType = m_engine->getTensorIOMode(tensorName); 116 | const auto tensorShape = m_engine->getTensorShape(tensorName); 117 | const auto tensorDataType = m_engine->getTensorDataType(tensorName); 118 | 119 | if (tensorType == nvinfer1::TensorIOMode::kINPUT) { 120 | // The implementation currently only supports inputs of type float 121 | if (m_engine->getTensorDataType(tensorName) != nvinfer1::DataType::kFLOAT) { 122 | auto msg = "Error, the implementation currently only supports float inputs"; 123 | spdlog::error(msg); 124 | throw std::runtime_error(msg); 125 | } 126 | 127 | // Don't need to allocate memory for inputs as we will be using the OpenCV 128 | // GpuMat buffer directly. 129 | 130 | // Store the input dims for later use 131 | m_inputDims.emplace_back(tensorShape.d[1], tensorShape.d[2], tensorShape.d[3]); 132 | m_inputBatchSize = tensorShape.d[0]; 133 | } else if (tensorType == nvinfer1::TensorIOMode::kOUTPUT) { 134 | // Ensure the model output data type matches the template argument 135 | // specified by the user 136 | if (tensorDataType == nvinfer1::DataType::kFLOAT && !std::is_same::value) { 137 | auto msg = "Error, the model has expected output of type float. Engine class template parameter must be adjusted."; 138 | spdlog::error(msg); 139 | throw std::runtime_error(msg); 140 | } else if (tensorDataType == nvinfer1::DataType::kHALF && !std::is_same<__half, T>::value) { 141 | auto msg = "Error, the model has expected output of type __half. Engine class template parameter must be adjusted."; 142 | spdlog::error(msg); 143 | throw std::runtime_error(msg); 144 | } else if (tensorDataType == nvinfer1::DataType::kINT8 && !std::is_same::value) { 145 | auto msg = "Error, the model has expected output of type int8_t. Engine class template parameter must be adjusted."; 146 | spdlog::error(msg); 147 | throw std::runtime_error(msg); 148 | } else if (tensorDataType == nvinfer1::DataType::kINT32 && !std::is_same::value) { 149 | auto msg = "Error, the model has expected output of type int32_t. Engine class template parameter must be adjusted."; 150 | spdlog::error(msg); 151 | throw std::runtime_error(msg); 152 | } else if (tensorDataType == nvinfer1::DataType::kBOOL && !std::is_same::value) { 153 | auto msg = "Error, the model has expected output of type bool. 
Engine class template parameter must be adjusted."; 154 | spdlog::error(msg); 155 | throw std::runtime_error(msg); 156 | } else if (tensorDataType == nvinfer1::DataType::kUINT8 && !std::is_same::value) { 157 | auto msg = "Error, the model has expected output of type uint8_t. Engine class template parameter must be adjusted."; 158 | spdlog::error(msg); 159 | throw std::runtime_error(msg); 160 | } else if (tensorDataType == nvinfer1::DataType::kFP8) { 161 | auto msg = "Error, the model has expected output of type kFP8. This is not supported by the Engine class."; 162 | spdlog::error(msg); 163 | throw std::runtime_error(msg); 164 | } 165 | 166 | // The binding is an output 167 | uint32_t outputLength = 1; 168 | m_outputDims.push_back(tensorShape); 169 | 170 | for (int j = 1; j < tensorShape.nbDims; ++j) { 171 | // We ignore j = 0 because that is the batch size, and we will take that 172 | // into account when sizing the buffer 173 | outputLength *= tensorShape.d[j]; 174 | } 175 | 176 | m_outputLengths.push_back(outputLength); 177 | // Now size the output buffer appropriately, taking into account the max 178 | // possible batch size (although we could actually end up using less 179 | // memory) 180 | Util::checkCudaErrorCode(cudaMallocAsync(&m_buffers[i], outputLength * m_options.maxBatchSize * sizeof(T), stream)); 181 | } else { 182 | auto msg = "Error, IO Tensor is neither an input or output!"; 183 | spdlog::error(msg); 184 | throw std::runtime_error(msg); 185 | } 186 | } 187 | 188 | // Synchronize and destroy the cuda stream 189 | Util::checkCudaErrorCode(cudaStreamSynchronize(stream)); 190 | Util::checkCudaErrorCode(cudaStreamDestroy(stream)); 191 | 192 | return true; 193 | } 194 | 195 | 196 | template 197 | bool Engine::build(std::string onnxModelPath, const std::array &subVals, const std::array &divVals, bool normalize) { 198 | // Create our engine builder. 199 | auto builder = std::unique_ptr(nvinfer1::createInferBuilder(m_logger)); 200 | if (!builder) { 201 | return false; 202 | } 203 | 204 | // Define an explicit batch size and then create the network (implicit batch 205 | // size is deprecated). More info here: 206 | // https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#explicit-implicit-batch 207 | auto explicitBatch = 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); 208 | auto network = std::unique_ptr(builder->createNetworkV2(explicitBatch)); 209 | if (!network) { 210 | return false; 211 | } 212 | 213 | // Create a parser for reading the onnx file. 214 | auto parser = std::unique_ptr(nvonnxparser::createParser(*network, m_logger)); 215 | if (!parser) { 216 | return false; 217 | } 218 | 219 | // We are going to first read the onnx file into memory, then pass that buffer 220 | // to the parser. Had our onnx model file been encrypted, this approach would 221 | // allow us to first decrypt the buffer. 222 | std::ifstream file(onnxModelPath, std::ios::binary | std::ios::ate); 223 | std::streamsize size = file.tellg(); 224 | file.seekg(0, std::ios::beg); 225 | 226 | std::vector buffer(size); 227 | if (!file.read(buffer.data(), size)) { 228 | auto msg = "Error, unable to read engine file"; 229 | spdlog::error(msg); 230 | throw std::runtime_error(msg); 231 | } 232 | 233 | // Parse the buffer we read into memory. 
234 | auto parsed = parser->parse(buffer.data(), buffer.size()); 235 | if (!parsed) { 236 | return false; 237 | } 238 | 239 | // Ensure that all the inputs have the same batch size 240 | const auto numInputs = network->getNbInputs(); 241 | if (numInputs < 1) { 242 | auto msg = "Error, model needs at least 1 input!"; 243 | spdlog::error(msg); 244 | throw std::runtime_error(msg); 245 | } 246 | const auto input0Batch = network->getInput(0)->getDimensions().d[0]; 247 | for (int32_t i = 1; i < numInputs; ++i) { 248 | if (network->getInput(i)->getDimensions().d[0] != input0Batch) { 249 | auto msg = "Error, the model has multiple inputs, each with differing batch sizes!"; 250 | spdlog::error(msg); 251 | throw std::runtime_error(msg); 252 | } 253 | } 254 | 255 | // Check to see if the model supports dynamic batch size or not 256 | bool doesSupportDynamicBatch = false; 257 | if (input0Batch == -1) { 258 | doesSupportDynamicBatch = true; 259 | spdlog::info("Model supports dynamic batch size"); 260 | } else { 261 | spdlog::info("Model only supports fixed batch size of {}", input0Batch); 262 | // If the model supports a fixed batch size, ensure that the maxBatchSize 263 | // and optBatchSize were set correctly. 264 | if (m_options.optBatchSize != input0Batch || m_options.maxBatchSize != input0Batch) { 265 | auto msg = "Error, model only supports a fixed batch size of " + std::to_string(input0Batch) + 266 | ". Must set Options.optBatchSize and Options.maxBatchSize to 1"; 267 | spdlog::error(msg); 268 | throw std::runtime_error(msg); 269 | } 270 | } 271 | 272 | const auto input3Batch = network->getInput(0)->getDimensions().d[3]; 273 | bool doesSupportDynamicWidth = false; 274 | if (input3Batch == -1) { 275 | doesSupportDynamicWidth = true; 276 | spdlog::info("Model supports dynamic width. Using Options.maxInputWidth, Options.minInputWidth, and Options.optInputWidth to set the input width."); 277 | 278 | // Check that the values of maxInputWidth, minInputWidth, and optInputWidth are valid 279 | if (m_options.maxInputWidth < m_options.minInputWidth || m_options.maxInputWidth < m_options.optInputWidth || 280 | m_options.minInputWidth > m_options.optInputWidth 281 | || m_options.maxInputWidth < 1 || m_options.minInputWidth < 1 || m_options.optInputWidth < 1) { 282 | auto msg = "Error, invalid values for Options.maxInputWidth, Options.minInputWidth, and Options.optInputWidth"; 283 | spdlog::error(msg); 284 | throw std::runtime_error(msg); 285 | } 286 | } 287 | 288 | 289 | auto config = std::unique_ptr(builder->createBuilderConfig()); 290 | if (!config) { 291 | return false; 292 | } 293 | 294 | // Register a single optimization profile 295 | nvinfer1::IOptimizationProfile *optProfile = builder->createOptimizationProfile(); 296 | for (int32_t i = 0; i < numInputs; ++i) { 297 | // Must specify dimensions for all the inputs the model expects. 
298 | const auto input = network->getInput(i); 299 | const auto inputName = input->getName(); 300 | const auto inputDims = input->getDimensions(); 301 | int32_t inputC = inputDims.d[1]; 302 | int32_t inputH = inputDims.d[2]; 303 | int32_t inputW = inputDims.d[3]; 304 | 305 | int32_t minInputWidth = std::max(m_options.minInputWidth, inputW); 306 | 307 | // Specify the optimization profile` 308 | if (doesSupportDynamicBatch) { 309 | optProfile->setDimensions(inputName, nvinfer1::OptProfileSelector::kMIN, nvinfer1::Dims4(1, inputC, inputH, minInputWidth)); 310 | } else { 311 | optProfile->setDimensions(inputName, nvinfer1::OptProfileSelector::kMIN, 312 | nvinfer1::Dims4(m_options.optBatchSize, inputC, inputH, minInputWidth)); 313 | } 314 | 315 | if (doesSupportDynamicWidth) { 316 | optProfile->setDimensions(inputName, nvinfer1::OptProfileSelector::kOPT, 317 | nvinfer1::Dims4(m_options.optBatchSize, inputC, inputH, m_options.optInputWidth)); 318 | optProfile->setDimensions(inputName, nvinfer1::OptProfileSelector::kMAX, 319 | nvinfer1::Dims4(m_options.maxBatchSize, inputC, inputH, m_options.maxInputWidth)); 320 | } else { 321 | optProfile->setDimensions(inputName, nvinfer1::OptProfileSelector::kOPT, 322 | nvinfer1::Dims4(m_options.optBatchSize, inputC, inputH, inputW)); 323 | optProfile->setDimensions(inputName, nvinfer1::OptProfileSelector::kMAX, 324 | nvinfer1::Dims4(m_options.maxBatchSize, inputC, inputH, inputW)); 325 | } 326 | } 327 | config->addOptimizationProfile(optProfile); 328 | 329 | // Set the precision level 330 | const auto engineName = serializeEngineOptions(m_options, onnxModelPath); 331 | if (m_options.precision == Precision::FP16) { 332 | // Ensure the GPU supports FP16 inference 333 | if (!builder->platformHasFastFp16()) { 334 | auto msg = "Error: GPU does not support FP16 precision"; 335 | spdlog::error(msg); 336 | throw std::runtime_error(msg); 337 | } 338 | config->setFlag(nvinfer1::BuilderFlag::kFP16); 339 | } else if (m_options.precision == Precision::INT8) { 340 | if (numInputs > 1) { 341 | auto msg = "Error, this implementation currently only supports INT8 " 342 | "quantization for single input models"; 343 | spdlog::error(msg); 344 | throw std::runtime_error(msg); 345 | } 346 | 347 | // Ensure the GPU supports INT8 Quantization 348 | if (!builder->platformHasFastInt8()) { 349 | auto msg = "Error: GPU does not support INT8 precision"; 350 | spdlog::error(msg); 351 | throw std::runtime_error(msg); 352 | } 353 | 354 | // Ensure the user has provided path to calibration data directory 355 | if (m_options.calibrationDataDirectoryPath.empty()) { 356 | auto msg = "Error: If INT8 precision is selected, must provide path to " 357 | "calibration data directory to Engine::build method"; 358 | throw std::runtime_error(msg); 359 | } 360 | 361 | config->setFlag((nvinfer1::BuilderFlag::kINT8)); 362 | 363 | const auto input = network->getInput(0); 364 | const auto inputName = input->getName(); 365 | const auto inputDims = input->getDimensions(); 366 | const auto calibrationFileName = engineName + ".calibration"; 367 | 368 | m_calibrator = std::make_unique(m_options.calibrationBatchSize, inputDims.d[3], inputDims.d[2], 369 | m_options.calibrationDataDirectoryPath, calibrationFileName, inputName, 370 | subVals, divVals, normalize); 371 | config->setInt8Calibrator(m_calibrator.get()); 372 | } 373 | 374 | // CUDA stream used for profiling by the builder. 
375 | cudaStream_t profileStream; 376 | Util::checkCudaErrorCode(cudaStreamCreate(&profileStream)); 377 | config->setProfileStream(profileStream); 378 | 379 | // Build the engine 380 | // If this call fails, it is suggested to increase the logger verbosity to 381 | // kVERBOSE and try rebuilding the engine. Doing so will provide you with more 382 | // information on why exactly it is failing. 383 | std::unique_ptr plan{builder->buildSerializedNetwork(*network, *config)}; 384 | if (!plan) { 385 | return false; 386 | } 387 | 388 | // Write the engine to disk 389 | const auto enginePath = std::filesystem::path(m_options.engineFileDir) / engineName; 390 | std::ofstream outfile(enginePath, std::ofstream::binary); 391 | outfile.write(reinterpret_cast(plan->data()), plan->size()); 392 | spdlog::info("Success, saved engine to {}", enginePath.string()); 393 | 394 | Util::checkCudaErrorCode(cudaStreamDestroy(profileStream)); 395 | return true; 396 | } --------------------------------------------------------------------------------