├── models
│   └── .gitignore
├── images
│   └── logo.png
├── inputs
│   └── team.jpg
├── .clang-format
├── .pre-commit-config.yaml
├── cmake
│   ├── ccache.cmake
│   └── FindTensorRT.cmake
├── .gitignore
├── include
│   ├── util
│   │   ├── Util.h
│   │   ├── Stopwatch.h
│   │   └── Util.inl
│   ├── macros.h
│   ├── interfaces
│   │   └── IEngine.h
│   ├── Int8Calibrator.h
│   ├── logger.h
│   └── engine
│       ├── EngineRunInference.inl
│       ├── EngineUtilities.inl
│       └── EngineBuildLoadNetwork.inl
├── LICENSE
├── .all-contributorsrc
├── scripts
│   └── build_opencv.sh
├── CMakeLists.txt
├── src
│   ├── cmd_line_parser.h
│   ├── engine.cpp
│   ├── main.cpp
│   └── engine.h
└── README.md
/models/.gitignore:
--------------------------------------------------------------------------------
1 | *.onnx
2 | *.trt
--------------------------------------------------------------------------------
/images/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cyrusbehr/tensorrt-cpp-api/HEAD/images/logo.png
--------------------------------------------------------------------------------
/inputs/team.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cyrusbehr/tensorrt-cpp-api/HEAD/inputs/team.jpg
--------------------------------------------------------------------------------
/.clang-format:
--------------------------------------------------------------------------------
1 | ---
2 | Language: Cpp
3 | # BasedOnStyle: LLVM
4 | AccessModifierOffset: -4
5 | ColumnLimit: 140
6 | IndentWidth: 4
7 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/pre-commit/mirrors-clang-format
3 | rev: 'v17.0.3' # Use the sha / tag you want to point at
4 | hooks:
5 | - id: clang-format
6 | types_or: [c++, c, cuda]
7 |
--------------------------------------------------------------------------------
/cmake/ccache.cmake:
--------------------------------------------------------------------------------
1 | find_program(CCACHE_PROGRAM ccache)
2 | if(CCACHE_PROGRAM)
3 | set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${CCACHE_PROGRAM}")
4 | message(STATUS "ccache: found")
5 | else()
6 | message(STATUS "ccache: not found")
7 | endif()
8 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Prerequisites
2 |
3 | *.d
4 |
5 | # Compiled Object files
6 | *.slo
7 | *.lo
8 | *.o
9 | *.obj
10 |
11 | # Precompiled Headers
12 | *.gch
13 | *.pch
14 |
15 | # Compiled Dynamic libraries
16 | *.so
17 | *.dylib
18 | *.dll
19 |
20 | # Fortran module files
21 | *.mod
22 | *.smod
23 |
24 | # Compiled Static libraries
25 | *.lai
26 | *.la
27 | *.a
28 | *.lib
29 |
30 | # Executables
31 | *.exe
32 | *.out
33 | *.app
34 |
35 | build*/
36 | cmake-build-*
37 | .idea/
38 | .vscode/
39 |
40 | *.onnx
41 |
42 |
--------------------------------------------------------------------------------
/include/util/Util.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include <cuda_runtime.h>
4 | #include <filesystem>
5 | #include <fstream>
6 | #include <string>
7 | #include <vector>
8 |
9 | namespace Util {
10 | // Checks if a file exists at the given file path
11 | bool doesFileExist(const std::string &filepath);
12 |
13 | // Checks and logs CUDA error codes
14 | void checkCudaErrorCode(cudaError_t code);
15 |
16 | // Retrieves a list of file names in the specified directory
17 | std::vector<std::string> getFilesInDirectory(const std::string &dirPath);
18 | }
19 |
20 | #include "Util.inl"
21 |
--------------------------------------------------------------------------------
/include/util/Stopwatch.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include <atomic>
3 | #include <chrono>
4 | // Utility Timer
5 | template <typename Clock = std::chrono::high_resolution_clock> class Stopwatch {
6 | typename Clock::time_point start_point;
7 |
8 | public:
9 | Stopwatch() : start_point(Clock::now()) {}
10 |
11 | // Returns elapsed time
12 | template <typename Rep = float, typename Units = std::chrono::milliseconds> Rep elapsedTime() const {
13 | std::atomic_thread_fence(std::memory_order_relaxed);
14 | auto counted_time = std::chrono::duration_cast<Units>(Clock::now() - start_point).count();
15 | std::atomic_thread_fence(std::memory_order_relaxed);
16 | return static_cast<Rep>(counted_time);
17 | }
18 | };
19 |
20 | using preciseStopwatch = Stopwatch<>;
--------------------------------------------------------------------------------
/include/macros.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include <spdlog/spdlog.h>
4 |
5 | #define CHECK(condition) \
6 | do { \
7 | if (!(condition)) { \
8 | spdlog::error("Assertion failed: ({}), function {}, file {}, line {}.", #condition, __FUNCTION__, __FILE__, __LINE__); \
9 | abort(); \
10 | } \
11 | } while (false);
12 |
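13 | // Example usage (see EngineUtilities.inl): CHECK(!batchInput.empty())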
--------------------------------------------------------------------------------
/include/util/Util.inl:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include <spdlog/spdlog.h>
4 |
5 | namespace Util {
6 |
7 | inline bool doesFileExist(const std::string &filepath) {
8 | std::ifstream f(filepath.c_str());
9 | return f.good();
10 | }
11 |
12 | inline void checkCudaErrorCode(cudaError_t code) {
13 | if (code != cudaSuccess) {
14 | std::string errMsg = "CUDA operation failed with code: " + std::to_string(code) + " (" + cudaGetErrorName(code) +
15 | "), with message: " + cudaGetErrorString(code);
16 | spdlog::error(errMsg);
17 | throw std::runtime_error(errMsg);
18 | }
19 | }
20 |
21 | inline std::vector<std::string> getFilesInDirectory(const std::string &dirPath) {
22 | std::vector<std::string> fileNames;
23 | for (const auto &entry : std::filesystem::directory_iterator(dirPath)) {
24 | if (entry.is_regular_file()) {
25 | fileNames.push_back(entry.path().string());
26 | }
27 | }
28 | return fileNames;
29 | }
30 | }
31 |
--------------------------------------------------------------------------------
/include/interfaces/IEngine.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include <array>
4 | #include <opencv2/core/cuda.hpp>
5 | #include <string>
6 | #include <vector>
7 | #include "NvInfer.h" // Include for nvinfer1::Dims and nvinfer1::Dims3
8 | template <typename T>
9 | class IEngine {
10 | public:
11 | virtual ~IEngine() = default;
12 | virtual bool buildLoadNetwork(std::string onnxModelPath, const std::array<float, 3> &subVals = {0.f, 0.f, 0.f},
13 | const std::array<float, 3> &divVals = {1.f, 1.f, 1.f}, bool normalize = true) = 0;
14 | virtual bool loadNetwork(std::string trtModelPath, const std::array<float, 3> &subVals = {0.f, 0.f, 0.f},
15 | const std::array<float, 3> &divVals = {1.f, 1.f, 1.f}, bool normalize = true) = 0;
16 | virtual bool runInference(const std::vector<std::vector<cv::cuda::GpuMat>> &inputs,
17 | std::vector<std::vector<std::vector<T>>> &featureVectors) = 0;
18 | virtual const std::vector<nvinfer1::Dims3> &getInputDims() const = 0;
19 | virtual const std::vector<nvinfer1::Dims> &getOutputDims() const = 0;
20 | };
21 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 cyrusbehr
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/.all-contributorsrc:
--------------------------------------------------------------------------------
1 | {
2 | "files": [
3 | "README.md"
4 | ],
5 | "imageSize": 100,
6 | "commit": false,
7 | "commitType": "docs",
8 | "commitConvention": "angular",
9 | "contributors": [
10 | {
11 | "login": "ltetrel",
12 | "name": "Loic Tetrel",
13 | "avatar_url": "https://avatars.githubusercontent.com/u/37963074?v=4",
14 | "profile": "https://ltetrel.github.io/",
15 | "contributions": [
16 | "code"
17 | ]
18 | },
19 | {
20 | "login": "thomaskleiven",
21 | "name": "thomaskleiven",
22 | "avatar_url": "https://avatars.githubusercontent.com/u/17145074?v=4",
23 | "profile": "https://github.com/thomaskleiven",
24 | "contributions": [
25 | "code"
26 | ]
27 | },
28 | {
29 | "login": "qq978358810",
30 | "name": "WiCyn",
31 | "avatar_url": "https://avatars.githubusercontent.com/u/45676681?v=4",
32 | "profile": "https://github.com/qq978358810",
33 | "contributions": [
34 | "code"
35 | ]
36 | }
37 | ],
38 | "contributorsPerLine": 7,
39 | "skipCi": true,
40 | "repoType": "github",
41 | "repoHost": "https://github.com",
42 | "projectName": "tensorrt-cpp-api",
43 | "projectOwner": "cyrusbehr"
44 | }
45 |
--------------------------------------------------------------------------------
/scripts/build_opencv.sh:
--------------------------------------------------------------------------------
1 | VERSION=4.8.0
2 |
3 | test -e ${VERSION}.zip || wget https://github.com/opencv/opencv/archive/refs/tags/${VERSION}.zip
4 | test -e opencv-${VERSION} || unzip ${VERSION}.zip
5 |
6 | test -e opencv_extra_${VERSION}.zip || wget -O opencv_extra_${VERSION}.zip https://github.com/opencv/opencv_contrib/archive/refs/tags/${VERSION}.zip
7 | test -e opencv_contrib-${VERSION} || unzip opencv_extra_${VERSION}.zip
8 |
9 |
10 | cd opencv-${VERSION}
11 | mkdir build
12 | cd build
13 |
14 | cmake -D CMAKE_BUILD_TYPE=RELEASE \
15 | -D CMAKE_INSTALL_PREFIX=/usr/local \
16 | -D WITH_TBB=ON \
17 | -D ENABLE_FAST_MATH=1 \
18 | -D CUDA_FAST_MATH=1 \
19 | -D WITH_CUBLAS=1 \
20 | -D WITH_CUDA=ON \
21 | -D BUILD_opencv_cudacodec=ON \
22 | -D WITH_CUDNN=ON \
23 | -D OPENCV_DNN_CUDA=ON \
24 | -D WITH_QT=OFF \
25 | -D WITH_OPENGL=ON \
26 | -D BUILD_opencv_apps=OFF \
27 | -D BUILD_opencv_python2=OFF \
28 | -D OPENCV_GENERATE_PKGCONFIG=ON \
29 | -D OPENCV_PC_FILE_NAME=opencv.pc \
30 | -D OPENCV_ENABLE_NONFREE=ON \
31 | -D OPENCV_EXTRA_MODULES_PATH=../../opencv_contrib-${VERSION}/modules \
32 | -D INSTALL_PYTHON_EXAMPLES=OFF \
33 | -D INSTALL_C_EXAMPLES=OFF \
34 | -D BUILD_EXAMPLES=OFF \
35 | -D WITH_FFMPEG=ON \
36 | -D CUDNN_INCLUDE_DIR=/usr/local/cuda/include \
37 | -D CUDNN_LIBRARY=/usr/local/cuda/lib64/libcudnn.so \
38 | ..
39 |
40 | make -j 8
41 | sudo make -j 8 install
42 |
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | cmake_minimum_required(VERSION 3.18)
2 | project(tensorrt_cpp_api)
3 |
4 | # Use ccache to speed up rebuilds
5 | include(cmake/ccache.cmake)
6 |
7 | # Set C++ version and optimization level
8 | set(CMAKE_CXX_STANDARD 17)
9 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Ofast -DNDEBUG -Wno-deprecated-declarations")
10 |
11 | # For finding FindTensorRT.cmake
12 | set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake" ${CMAKE_MODULE_PATH})
13 |
14 | # TODO: Specify the path to TensorRT root dir
15 | if (NOT TensorRT_DIR)
16 | set(TensorRT_DIR /home/cyrus/work/libs/TensorRT-10.0.0.6/)
17 | endif()
18 | # Use the correct version of CUDA
19 | set(CUDA_TOOLKIT_ROOT_DIR /usr/local/cuda)
20 |
21 | # We require CUDA, OpenCV, and TensorRT
22 | find_package(TensorRT REQUIRED)
23 | find_package(CUDA REQUIRED)
24 | find_package(OpenCV REQUIRED)
25 | find_package(fmt REQUIRED)
26 |
27 | add_library(tensorrt_cpp_api SHARED
28 | src/engine.cpp)
29 |
30 | target_include_directories(tensorrt_cpp_api PUBLIC ${OpenCV_INCLUDE_DIRS} ${CUDA_INCLUDE_DIRS} ${TensorRT_INCLUDE_DIRS} include include/interfaces)
31 | target_link_libraries(tensorrt_cpp_api PUBLIC ${OpenCV_LIBS} ${CUDA_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT} ${TensorRT_LIBRARIES} fmt::fmt)
32 |
33 | add_executable(run_inference_benchmark src/main.cpp)
34 | target_link_libraries(run_inference_benchmark tensorrt_cpp_api fmt::fmt)
--------------------------------------------------------------------------------
/include/Int8Calibrator.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include "NvInfer.h"
3 |
4 | // Class used for int8 calibration
5 | class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 {
6 | public:
7 | Int8EntropyCalibrator2(int32_t batchSize, int32_t inputW, int32_t inputH, const std::string &calibDataDirPath,
8 | const std::string &calibTableName, const std::string &inputBlobName,
9 | const std::array<float, 3> &subVals = {0.f, 0.f, 0.f}, const std::array<float, 3> &divVals = {1.f, 1.f, 1.f},
10 | bool normalize = true, bool readCache = true);
11 | virtual ~Int8EntropyCalibrator2();
12 | // Abstract base class methods which must be implemented
13 | int32_t getBatchSize() const noexcept override;
14 | bool getBatch(void *bindings[], char const *names[], int32_t nbBindings) noexcept override;
15 | void const *readCalibrationCache(std::size_t &length) noexcept override;
16 | void writeCalibrationCache(void const *ptr, std::size_t length) noexcept override;
17 |
18 | private:
19 | const int32_t m_batchSize;
20 | const int32_t m_inputW;
21 | const int32_t m_inputH;
22 | int32_t m_imgIdx;
23 | std::vector<std::string> m_imgPaths;
24 | size_t m_inputCount;
25 | const std::string m_calibTableName;
26 | const std::string m_inputBlobName;
27 | const std::array<float, 3> m_subVals;
28 | const std::array<float, 3> m_divVals;
29 | const bool m_normalize;
30 | const bool m_readCache;
31 | void *m_deviceInput;
32 | std::vector<char> m_calibCache;
33 | };
--------------------------------------------------------------------------------
/include/logger.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include <cstdlib>
4 | #include <spdlog/spdlog.h>
5 | #include <string>
6 |
7 | enum class LogLevel {
8 | Trace,
9 | Debug,
10 | Info,
11 | Warn,
12 | Error,
13 | Critical,
14 | Off,
15 | Unknown
16 | };
17 |
18 |
19 | // Get the log level string from the environment variable
20 | inline std::string getLogLevelFromEnvironment() {
21 | const char* envValue = std::getenv("LOG_LEVEL");
22 | if (envValue) {
23 | return std::string(envValue);
24 | } else {
25 | spdlog::warn("LOG_LEVEL environment variable not set. Using default log level (info).");
26 | return "info";
27 | }
28 | }
29 |
30 | // Convert log level string to LogLevel enum
31 | inline LogLevel parseLogLevel(const std::string& logLevelStr) {
32 | if (logLevelStr == "trace") {
33 | return LogLevel::Trace;
34 | } else if (logLevelStr == "debug") {
35 | return LogLevel::Debug;
36 | } else if (logLevelStr == "info") {
37 | return LogLevel::Info;
38 | } else if (logLevelStr == "warn" || logLevelStr == "warning") {
39 | return LogLevel::Warn;
40 | } else if (logLevelStr == "err" || logLevelStr == "error") {
41 | return LogLevel::Error;
42 | } else if (logLevelStr == "critical") {
43 | return LogLevel::Critical;
44 | } else if (logLevelStr == "off") {
45 | return LogLevel::Off;
46 | } else {
47 | spdlog::warn("Unknown log level string: {}. Defaulting to 'info' level.", logLevelStr);
48 | return LogLevel::Unknown;
49 | }
50 | }
51 |
52 | // Convert LogLevel enum to spdlog::level::level_enum
53 | inline spdlog::level::level_enum toSpdlogLevel(const std::string& logLevelStr) {
54 | LogLevel logLevel = parseLogLevel(logLevelStr);
55 |
56 | switch (logLevel) {
57 | case LogLevel::Trace:
58 | return spdlog::level::trace;
59 | case LogLevel::Debug:
60 | return spdlog::level::debug;
61 | case LogLevel::Info:
62 | return spdlog::level::info;
63 | case LogLevel::Warn:
64 | return spdlog::level::warn;
65 | case LogLevel::Error:
66 | return spdlog::level::err;
67 | case LogLevel::Critical:
68 | return spdlog::level::critical;
69 | case LogLevel::Off:
70 | return spdlog::level::off;
71 | default:
72 | spdlog::warn("Unknown log level. Using default log level (info).");
73 | return spdlog::level::info;
74 | }
75 | }
76 |
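77 | // Example usage (see src/main.cpp): spdlog::set_level(toSpdlogLevel(getLogLevelFromEnvironment()));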
--------------------------------------------------------------------------------
/cmake/FindTensorRT.cmake:
--------------------------------------------------------------------------------
1 | # source:
2 | # https://github.com/NVIDIA/tensorrt-laboratory/blob/master/cmake/FindTensorRT.cmake
3 |
4 | # This module defines the following variables:
5 | #
6 | # ::
7 | #
8 | # TensorRT_INCLUDE_DIRS
9 | # TensorRT_LIBRARIES
10 | # TensorRT_FOUND
11 | #
12 | # ::
13 | #
14 | # TensorRT_VERSION_STRING - version (x.y.z)
15 | # TensorRT_VERSION_MAJOR - major version (x)
16 | # TensorRT_VERSION_MINOR - minor version (y)
17 | # TensorRT_VERSION_PATCH - patch version (z)
18 | #
19 | # Hints
20 | # ^^^^^
21 | # A user may set ``TensorRT_DIR`` to an installation root to tell this module where to look.
22 | #
23 | set(_TensorRT_SEARCHES)
24 |
25 | if(TensorRT_DIR)
26 | set(_TensorRT_SEARCH_ROOT PATHS ${TensorRT_DIR} NO_DEFAULT_PATH)
27 | list(APPEND _TensorRT_SEARCHES _TensorRT_SEARCH_ROOT)
28 | endif()
29 |
30 | # appends some common paths
31 | set(_TensorRT_SEARCH_NORMAL
32 | PATHS "/usr"
33 | )
34 | list(APPEND _TensorRT_SEARCHES _TensorRT_SEARCH_NORMAL)
35 |
36 | # Include dir
37 | foreach(search ${_TensorRT_SEARCHES})
38 | find_path(TensorRT_INCLUDE_DIR NAMES NvInfer.h ${${search}} PATH_SUFFIXES include)
39 | endforeach()
40 |
41 | if(NOT TensorRT_LIBRARY)
42 | foreach(search ${_TensorRT_SEARCHES})
43 | find_library(TensorRT_LIBRARY NAMES nvinfer ${${search}} PATH_SUFFIXES lib)
44 | endforeach()
45 | endif()
46 |
47 | if(NOT TensorRT_NVONNXPARSER_LIBRARY)
48 | foreach(search ${_TensorRT_SEARCHES})
49 | find_library(TensorRT_NVONNXPARSER_LIBRARY NAMES nvonnxparser ${${search}} PATH_SUFFIXES lib)
50 | endforeach()
51 | endif()
52 |
53 | mark_as_advanced(TensorRT_INCLUDE_DIR)
54 |
55 | if(TensorRT_INCLUDE_DIR AND EXISTS "${TensorRT_INCLUDE_DIR}/NvInfer.h")
56 | file(STRINGS "${TensorRT_INCLUDE_DIR}/NvInfer.h" TensorRT_MAJOR REGEX "^#define NV_TENSORRT_MAJOR [0-9]+.*$")
57 | file(STRINGS "${TensorRT_INCLUDE_DIR}/NvInfer.h" TensorRT_MINOR REGEX "^#define NV_TENSORRT_MINOR [0-9]+.*$")
58 | file(STRINGS "${TensorRT_INCLUDE_DIR}/NvInfer.h" TensorRT_PATCH REGEX "^#define NV_TENSORRT_PATCH [0-9]+.*$")
59 |
60 | string(REGEX REPLACE "^#define NV_TENSORRT_MAJOR ([0-9]+).*$" "\\1" TensorRT_VERSION_MAJOR "${TensorRT_MAJOR}")
61 | string(REGEX REPLACE "^#define NV_TENSORRT_MINOR ([0-9]+).*$" "\\1" TensorRT_VERSION_MINOR "${TensorRT_MINOR}")
62 | string(REGEX REPLACE "^#define NV_TENSORRT_PATCH ([0-9]+).*$" "\\1" TensorRT_VERSION_PATCH "${TensorRT_PATCH}")
63 | set(TensorRT_VERSION_STRING "${TensorRT_VERSION_MAJOR}.${TensorRT_VERSION_MINOR}.${TensorRT_VERSION_PATCH}")
64 | endif()
65 |
66 | include(FindPackageHandleStandardArgs)
67 | FIND_PACKAGE_HANDLE_STANDARD_ARGS(TensorRT REQUIRED_VARS TensorRT_LIBRARY TensorRT_INCLUDE_DIR VERSION_VAR TensorRT_VERSION_STRING)
68 |
69 | if(TensorRT_FOUND)
70 | set(TensorRT_INCLUDE_DIRS ${TensorRT_INCLUDE_DIR})
71 |
72 | if(NOT TensorRT_LIBRARIES)
73 | set(TensorRT_LIBRARIES ${TensorRT_LIBRARY} ${TensorRT_NVONNXPARSER_LIBRARY} ${TensorRT_NVPARSERS_LIBRARY})
74 | endif()
75 |
76 | if(NOT TARGET TensorRT::TensorRT)
77 | add_library(TensorRT::TensorRT UNKNOWN IMPORTED)
78 | set_target_properties(TensorRT::TensorRT PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${TensorRT_INCLUDE_DIRS}")
79 | set_property(TARGET TensorRT::TensorRT APPEND PROPERTY IMPORTED_LOCATION "${TensorRT_LIBRARY}")
80 | endif()
81 | endif()
--------------------------------------------------------------------------------
/src/cmd_line_parser.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include "engine.h"
3 | #include <iostream>
4 |
5 | struct CommandLineArguments {
6 | std::string onnxModelPath = "";
7 | std::string trtModelPath = "";
8 | };
9 |
10 | inline void showHelp(char *argv[]) {
11 | std::cout << "Usage: " << argv[0] << " [OPTIONS]" << std::endl << std::endl;
12 |
13 | std::cout << "Options:" << std::endl;
14 | std::cout << "--onnx_model Path to the ONNX model. "
15 | "(Either onnx_model or trt_model must be provided)"
16 | << std::endl;
17 | std::cout << "--trt_model Path to the TensorRT model. "
18 | "(Either onnx_model or trt_model must be provided)"
19 | << std::endl;
20 |
21 | std::cout << "Example usage:" << std::endl;
22 | std::cout << argv[0] << " --onnx_model model.onnx" << std::endl;
23 | };
24 |
25 | inline bool tryGetNextArgument(int argc, char *argv[], int ¤tIndex, std::string &value, std::string flag, bool printErrors = true) {
26 | if (currentIndex + 1 >= argc) {
27 | if (printErrors)
28 | std::cout << "Error: No arguments provided for flag '" << flag << "'" << std::endl;
29 | return false;
30 | }
31 |
32 | std::string nextArgument = argv[currentIndex + 1];
33 | if (nextArgument.substr(0, 2) == "--") {
34 | if (printErrors)
35 | std::cout << "Error: No arguments provided for flag '" << flag << "'" << std::endl;
36 | return false;
37 | }
38 |
39 | value = argv[++currentIndex];
40 | return true;
41 | };
42 |
43 | inline bool parseArguments(int argc, char *argv[], CommandLineArguments &arguments) {
44 | if (argc == 1) {
45 | showHelp(argv);
46 | return false;
47 | }
48 |
49 | for (int i = 1; i < argc; i++) {
50 | std::string argument = argv[i];
51 |
52 | if (argument.substr(0, 2) == "--") {
53 | std::string flag = argument.substr(2);
54 | std::string nextArgument;
55 |
56 | if (flag == "onnx_model") {
57 | if (!tryGetNextArgument(argc, argv, i, nextArgument, flag))
58 | return false;
59 |
60 | if (!Util::doesFileExist(nextArgument)) {
61 | std::cout << "Error: Unable to find model at path '" << nextArgument << "' for flag '" << flag << "'" << std::endl;
62 | return false;
63 | }
64 |
65 | arguments.onnxModelPath = nextArgument;
66 | }
67 |
68 | else if (flag == "trt_model") {
69 | if (!tryGetNextArgument(argc, argv, i, nextArgument, flag))
70 | return false;
71 |
72 | if (!Util::doesFileExist(nextArgument)) {
73 | std::cout << "Error: Unable to find model at path '" << nextArgument << "' for flag '" << flag << "'" << std::endl;
74 | return false;
75 | }
76 |
77 | arguments.trtModelPath = nextArgument;
78 | }
79 |
80 | else {
81 | std::cout << "Error: Unknown flag '" << flag << "'" << std::endl;
82 | showHelp(argv);
83 | return false;
84 | }
85 | } else {
86 | std::cout << "Error: Unknown argument '" << argument << "'" << std::endl;
87 | showHelp(argv);
88 | return false;
89 | }
90 | }
91 |
92 | if (arguments.onnxModelPath.empty() && arguments.trtModelPath.empty()) {
93 | std::cout << "Error: Must specify either 'onnx_model' or 'trt_model'" << std::endl;
94 | return false;
95 | }
96 |
97 | return true;
98 | }
99 |
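100 | // Example usage (see src/main.cpp):
101 | //   CommandLineArguments arguments;
102 | //   if (!parseArguments(argc, argv, arguments)) { return -1; }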
--------------------------------------------------------------------------------
/src/engine.cpp:
--------------------------------------------------------------------------------
1 | #include "engine.h"
2 | #include <algorithm>
3 | #include <fstream>
4 | #include <iterator>
5 | #include <opencv2/cudaimgproc.hpp>
6 | #include <opencv2/opencv.hpp>
7 | #include <random>
8 |
9 | using namespace nvinfer1;
10 | using namespace Util;
11 |
12 | void Logger::log(Severity severity, const char *msg) noexcept {
13 | switch (severity) {
14 | case Severity::kVERBOSE:
15 | spdlog::debug(msg);
16 | break;
17 | case Severity::kINFO:
18 | spdlog::info(msg);
19 | break;
20 | case Severity::kWARNING:
21 | spdlog::warn(msg);
22 | break;
23 | case Severity::kERROR:
24 | spdlog::error(msg);
25 | break;
26 | case Severity::kINTERNAL_ERROR:
27 | spdlog::critical(msg);
28 | break;
29 | default:
30 | spdlog::info("Unexpected severity level");
31 | }
32 | }
33 |
34 | Int8EntropyCalibrator2::Int8EntropyCalibrator2(int32_t batchSize, int32_t inputW, int32_t inputH, const std::string &calibDataDirPath,
35 | const std::string &calibTableName, const std::string &inputBlobName,
36 | const std::array &subVals, const std::array &divVals, bool normalize,
37 | bool readCache)
38 | : m_batchSize(batchSize), m_inputW(inputW), m_inputH(inputH), m_imgIdx(0), m_calibTableName(calibTableName),
39 | m_inputBlobName(inputBlobName), m_subVals(subVals), m_divVals(divVals), m_normalize(normalize), m_readCache(readCache) {
40 |
41 | // Allocate GPU memory to hold the entire batch
42 | m_inputCount = 3 * inputW * inputH * batchSize;
43 | checkCudaErrorCode(cudaMalloc(&m_deviceInput, m_inputCount * sizeof(float)));
44 |
45 | // Read the name of all the files in the specified directory.
46 | if (!doesFileExist(calibDataDirPath)) {
47 | auto msg = "Error, directory at provided path does not exist: " + calibDataDirPath;
48 | spdlog::error(msg);
49 | throw std::runtime_error(msg);
50 | }
51 |
52 | m_imgPaths = getFilesInDirectory(calibDataDirPath);
53 | if (m_imgPaths.size() < static_cast<size_t>(batchSize)) {
54 | auto msg = "Error, there are fewer calibration images than the specified batch size!";
55 | spdlog::error(msg);
56 | throw std::runtime_error(msg);
57 | }
58 |
59 | // Randomize the calibration data
60 | auto rd = std::random_device{};
61 | auto rng = std::default_random_engine{rd()};
62 | std::shuffle(std::begin(m_imgPaths), std::end(m_imgPaths), rng);
63 | }
64 |
65 | int32_t Int8EntropyCalibrator2::getBatchSize() const noexcept {
66 | // Return the batch size
67 | return m_batchSize;
68 | }
69 |
70 | bool Int8EntropyCalibrator2::getBatch(void **bindings, const char **names, int32_t nbBindings) noexcept {
71 | // This method will read a batch of images into GPU memory, and place the
72 | // pointer to the GPU memory in the bindings variable.
73 |
74 | if (m_imgIdx + m_batchSize > static_cast<int>(m_imgPaths.size())) {
75 | // There are not enough images left to satisfy an entire batch
76 | return false;
77 | }
78 |
79 | // Read the calibration images into memory for the current batch
80 | std::vector<cv::cuda::GpuMat> inputImgs;
81 | for (int i = m_imgIdx; i < m_imgIdx + m_batchSize; i++) {
82 | spdlog::info("Reading image {}: {}", i, m_imgPaths[i]);
83 | auto cpuImg = cv::imread(m_imgPaths[i]);
84 | if (cpuImg.empty()) {
85 | spdlog::error("Fatal error: Unable to read image at path: " + m_imgPaths[i]);
86 | return false;
87 | }
88 |
89 | cv::cuda::GpuMat gpuImg;
90 | gpuImg.upload(cpuImg);
91 | //cv::cuda::cvtColor(gpuImg, gpuImg, cv::COLOR_BGR2RGB);
92 |
93 | // TODO: Define any preprocessing code here, such as resizing
94 | auto resized = Engine<float>::resizeKeepAspectRatioPadRightBottom(gpuImg, m_inputH, m_inputW);
95 |
96 | inputImgs.emplace_back(std::move(resized));
97 | }
98 |
99 | // Convert the batch from NHWC to NCHW
100 | // Also apply normalization, scaling, and mean subtraction
101 | auto mfloat = Engine<float>::blobFromGpuMats(inputImgs, m_subVals, m_divVals, m_normalize, true);
102 | auto *dataPointer = mfloat.ptr();
103 |
104 | // Copy the GPU buffer to member variable so that it persists
105 | checkCudaErrorCode(cudaMemcpyAsync(m_deviceInput, dataPointer, m_inputCount * sizeof(float), cudaMemcpyDeviceToDevice));
106 |
107 | m_imgIdx += m_batchSize;
108 | if (std::string(names[0]) != m_inputBlobName) {
109 | spdlog::error("Error: Incorrect input name provided!");
110 | return false;
111 | }
112 | bindings[0] = m_deviceInput;
113 | return true;
114 | }
115 |
116 | void const *Int8EntropyCalibrator2::readCalibrationCache(size_t &length) noexcept {
117 | spdlog::info("Searching for calibration cache: {}", m_calibTableName);
118 | m_calibCache.clear();
119 | std::ifstream input(m_calibTableName, std::ios::binary);
120 | input >> std::noskipws;
121 | if (m_readCache && input.good()) {
122 | spdlog::info("Reading calibration cache: {}", m_calibTableName);
123 | std::copy(std::istream_iterator<char>(input), std::istream_iterator<char>(), std::back_inserter(m_calibCache));
124 | }
125 | length = m_calibCache.size();
126 | return length ? m_calibCache.data() : nullptr;
127 | }
128 |
129 | void Int8EntropyCalibrator2::writeCalibrationCache(const void *ptr, std::size_t length) noexcept {
130 | spdlog::info("Writing calibration cache: {}", m_calibTableName);
131 | spdlog::info("Calibration cache size: {} bytes", length);
132 | std::ofstream output(m_calibTableName, std::ios::binary);
133 | output.write(reinterpret_cast<const char *>(ptr), length);
134 | }
135 |
136 | Int8EntropyCalibrator2::~Int8EntropyCalibrator2() { checkCudaErrorCode(cudaFree(m_deviceInput)); };
137 |
--------------------------------------------------------------------------------
/include/engine/EngineRunInference.inl:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include <cuda_runtime.h>
3 | #include <opencv2/core/cuda.hpp>
4 | #include "util/Util.h"
5 |
6 | template <typename T>
7 | bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>> &inputs,
8 | std::vector<std::vector<std::vector<T>>> &featureVectors) {
9 | // First we do some error checking
10 | if (inputs.empty() || inputs[0].empty()) {
11 | spdlog::error("Provided input vector is empty!");
12 | return false;
13 | }
14 |
15 | const auto numInputs = m_inputDims.size();
16 | if (inputs.size() != numInputs) {
17 | spdlog::error("Incorrect number of inputs provided!");
18 | return false;
19 | }
20 |
21 | // Ensure the batch size does not exceed the max
22 | if (inputs[0].size() > static_cast<size_t>(m_options.maxBatchSize)) {
23 | spdlog::error("===== Error =====");
24 | spdlog::error("The batch size is larger than the model expects!");
25 | spdlog::error("Model max batch size: {}", m_options.maxBatchSize);
26 | spdlog::error("Batch size provided to call to runInference: {}", inputs[0].size());
27 | return false;
28 | }
29 |
30 | // Ensure that if the model has a fixed batch size that is greater than 1, the
31 | // input has the correct length
32 | if (m_inputBatchSize != -1 && inputs[0].size() != static_cast<size_t>(m_inputBatchSize)) {
33 | spdlog::error("===== Error =====");
34 | spdlog::error("The batch size is different from what the model expects!");
35 | spdlog::error("Model batch size: {}", m_inputBatchSize);
36 | spdlog::error("Batch size provided to call to runInference: {}", inputs[0].size());
37 | return false;
38 | }
39 |
40 | const auto batchSize = static_cast<int32_t>(inputs[0].size());
41 | // Make sure the same batch size was provided for all inputs
42 | for (size_t i = 1; i < inputs.size(); ++i) {
43 | if (inputs[i].size() != static_cast<size_t>(batchSize)) {
44 | spdlog::error("===== Error =====");
45 | spdlog::error("The batch size is different for each input!");
46 | return false;
47 | }
48 | }
49 |
50 | // Create the cuda stream that will be used for inference
51 | cudaStream_t inferenceCudaStream;
52 | Util::checkCudaErrorCode(cudaStreamCreate(&inferenceCudaStream));
53 |
54 | std::vector<cv::cuda::GpuMat> preprocessedInputs;
55 |
56 | // Preprocess all the inputs
57 | for (size_t i = 0; i < numInputs; ++i) {
58 | const auto &batchInput = inputs[i];
59 | const auto &dims = m_inputDims[i];
60 |
61 | auto &input = batchInput[0];
62 | if (input.channels() != dims.d[0] || input.rows != dims.d[1] || input.cols != dims.d[2]) {
63 | spdlog::error("===== Error =====");
64 | spdlog::error("Input does not have correct size!");
65 | spdlog::error("Expected: ({}, {}, {})", dims.d[0], dims.d[1], dims.d[2]);
66 | spdlog::error("Got: ({}, {}, {})", input.channels(), input.rows, input.cols);
67 | spdlog::error("Ensure you resize your input image to the correct size");
68 | return false;
69 | }
70 |
71 | nvinfer1::Dims4 inputDims = {batchSize, dims.d[0], dims.d[1], dims.d[2]};
72 | m_context->setInputShape(m_IOTensorNames[i].c_str(),
73 | inputDims); // Define the batch size
74 |
75 | // OpenCV reads images into memory in NHWC format, while TensorRT expects
76 | // images in NCHW format. The following method converts NHWC to NCHW. Even
77 | // though TensorRT expects NCHW at IO, during optimization, it can
78 | // internally use NHWC to optimize cuda kernels See:
79 | // https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#data-layout
80 | // Copy over the input data and perform the preprocessing
81 | auto mfloat = blobFromGpuMats(batchInput, m_subVals, m_divVals, m_normalize);
82 | preprocessedInputs.push_back(mfloat);
83 | m_buffers[i] = mfloat.ptr();
84 | }
85 |
86 | // Ensure all dynamic bindings have been defined.
87 | if (!m_context->allInputDimensionsSpecified()) {
88 | auto msg = "Error, not all required dimensions specified.";
89 | spdlog::error(msg);
90 | throw std::runtime_error(msg);
91 | }
92 |
93 | // Set the address of the input and output buffers
94 | for (size_t i = 0; i < m_buffers.size(); ++i) {
95 | bool status = m_context->setTensorAddress(m_IOTensorNames[i].c_str(), m_buffers[i]);
96 | if (!status) {
97 | return false;
98 | }
99 | }
100 |
101 | // Run inference.
102 | bool status = m_context->enqueueV3(inferenceCudaStream);
103 | if (!status) {
104 | return false;
105 | }
106 |
107 | // Copy the outputs back to CPU
108 | featureVectors.clear();
109 |
110 | for (int batch = 0; batch < batchSize; ++batch) {
111 | // Batch
112 | std::vector<std::vector<T>> batchOutputs{};
113 | for (int32_t outputBinding = numInputs; outputBinding < m_engine->getNbIOTensors(); ++outputBinding) {
114 | // We start at index m_inputDims.size() to account for the inputs in our
115 | // m_buffers
116 | std::vector<T> output;
117 | auto outputLength = m_outputLengths[outputBinding - numInputs];
118 | output.resize(outputLength);
119 | // Copy the output
120 | Util::checkCudaErrorCode(cudaMemcpyAsync(output.data(),
121 | static_cast<char *>(m_buffers[outputBinding]) + (batch * sizeof(T) * outputLength),
122 | outputLength * sizeof(T), cudaMemcpyDeviceToHost, inferenceCudaStream));
123 | batchOutputs.emplace_back(std::move(output));
124 | }
125 | featureVectors.emplace_back(std::move(batchOutputs));
126 | }
127 |
128 | // Synchronize the cuda stream
129 | Util::checkCudaErrorCode(cudaStreamSynchronize(inferenceCudaStream));
130 | Util::checkCudaErrorCode(cudaStreamDestroy(inferenceCudaStream));
131 | return true;
132 | }
133 |
--------------------------------------------------------------------------------
/include/engine/EngineUtilities.inl:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include <opencv2/cudaarithm.hpp>
3 | #include <opencv2/cudawarping.hpp>
4 |
5 | template <typename T>
6 | void Engine<T>::transformOutput(std::vector<std::vector<std::vector<T>>> &input, std::vector<std::vector<T>> &output) {
7 | if (input.size() != 1) {
8 | auto msg = "The feature vector has incorrect dimensions!";
9 | spdlog::error(msg);
10 | throw std::logic_error(msg);
11 | }
12 |
13 | output = std::move(input[0]);
14 | }
15 |
16 | template <typename T> void Engine<T>::transformOutput(std::vector<std::vector<std::vector<T>>> &input, std::vector<T> &output) {
17 | if (input.size() != 1 || input[0].size() != 1) {
18 | auto msg = "The feature vector has incorrect dimensions!";
19 | spdlog::error(msg);
20 | throw std::logic_error(msg);
21 | }
22 |
23 | output = std::move(input[0][0]);
24 | }
25 |
26 | template <typename T>
27 | cv::cuda::GpuMat Engine<T>::resizeKeepAspectRatioPadRightBottom(const cv::cuda::GpuMat &input, size_t height, size_t width,
28 | const cv::Scalar &bgcolor) {
29 | float r = std::min(width / (input.cols * 1.0), height / (input.rows * 1.0));
30 | int unpad_w = r * input.cols;
31 | int unpad_h = r * input.rows;
32 | cv::cuda::GpuMat re(unpad_h, unpad_w, CV_8UC3);
33 | cv::cuda::resize(input, re, re.size());
34 | cv::cuda::GpuMat out(height, width, CV_8UC3, bgcolor);
35 | re.copyTo(out(cv::Rect(0, 0, re.cols, re.rows)));
36 | return out;
37 | }
38 |
39 | template <typename T> void Engine<T>::getDeviceNames(std::vector<std::string> &deviceNames) {
40 | int numGPUs;
41 | cudaGetDeviceCount(&numGPUs);
42 |
43 | for (int device = 0; device < numGPUs; device++) {
44 | cudaDeviceProp prop;
45 | cudaGetDeviceProperties(&prop, device);
46 |
47 | deviceNames.push_back(std::string(prop.name));
48 | }
49 | }
50 |
51 | template <typename T> std::string Engine<T>::serializeEngineOptions(const Options &options, const std::string &onnxModelPath) {
52 | const auto filenamePos = onnxModelPath.find_last_of('/') + 1;
53 | std::string engineName = onnxModelPath.substr(filenamePos, onnxModelPath.find_last_of('.') - filenamePos) + ".engine";
54 |
55 | // Add the GPU device name to the file to ensure that the model is only used
56 | // on devices with the exact same GPU
57 | std::vector deviceNames;
58 | getDeviceNames(deviceNames);
59 |
60 | if (static_cast<size_t>(options.deviceIndex) >= deviceNames.size()) {
61 | auto msg = "Error, provided device index is out of range!";
62 | spdlog::error(msg);
63 | throw std::runtime_error(msg);
64 | }
65 |
66 | auto deviceName = deviceNames[options.deviceIndex];
67 | // Remove spaces from the device name
68 | deviceName.erase(std::remove_if(deviceName.begin(), deviceName.end(), ::isspace), deviceName.end());
69 |
70 | engineName += "." + deviceName;
71 |
72 | // Serialize the specified options into the filename
73 | if (options.precision == Precision::FP16) {
74 | engineName += ".fp16";
75 | } else if (options.precision == Precision::FP32) {
76 | engineName += ".fp32";
77 | } else {
78 | engineName += ".int8";
79 | }
80 |
81 | engineName += "." + std::to_string(options.maxBatchSize);
82 | engineName += "." + std::to_string(options.optBatchSize);
83 | engineName += "." + std::to_string(options.minInputWidth);
84 | engineName += "." + std::to_string(options.optInputWidth);
85 | engineName += "." + std::to_string(options.maxInputWidth);
86 |
87 | spdlog::info("Engine name: {}", engineName);
88 | return engineName;
89 | }
90 |
91 | template <typename T>
92 | cv::cuda::GpuMat Engine<T>::blobFromGpuMats(const std::vector<cv::cuda::GpuMat> &batchInput, const std::array<float, 3> &subVals,
93 | const std::array<float, 3> &divVals, bool normalize, bool swapRB) {
94 |
95 | CHECK(!batchInput.empty())
96 | CHECK(batchInput[0].channels() == 3)
97 |
98 | cv::cuda::GpuMat gpu_dst(1, batchInput[0].rows * batchInput[0].cols * batchInput.size(), CV_8UC3);
99 |
100 | size_t width = batchInput[0].cols * batchInput[0].rows;
101 | if (swapRB) {
102 | for (size_t img = 0; img < batchInput.size(); ++img) {
103 | std::vector<cv::cuda::GpuMat> input_channels{
104 | cv::cuda::GpuMat(batchInput[0].rows, batchInput[0].cols, CV_8U, &(gpu_dst.ptr()[width * 2 + width * 3 * img])),
105 | cv::cuda::GpuMat(batchInput[0].rows, batchInput[0].cols, CV_8U, &(gpu_dst.ptr()[width + width * 3 * img])),
106 | cv::cuda::GpuMat(batchInput[0].rows, batchInput[0].cols, CV_8U, &(gpu_dst.ptr()[0 + width * 3 * img]))};
107 | cv::cuda::split(batchInput[img], input_channels); // HWC -> CHW
108 | }
109 | } else {
110 | for (size_t img = 0; img < batchInput.size(); ++img) {
111 | std::vector<cv::cuda::GpuMat> input_channels{
112 | cv::cuda::GpuMat(batchInput[0].rows, batchInput[0].cols, CV_8U, &(gpu_dst.ptr()[0 + width * 3 * img])),
113 | cv::cuda::GpuMat(batchInput[0].rows, batchInput[0].cols, CV_8U, &(gpu_dst.ptr()[width + width * 3 * img])),
114 | cv::cuda::GpuMat(batchInput[0].rows, batchInput[0].cols, CV_8U, &(gpu_dst.ptr()[width * 2 + width * 3 * img]))};
115 | cv::cuda::split(batchInput[img], input_channels); // HWC -> CHW
116 | }
117 | }
118 | cv::cuda::GpuMat mfloat;
119 | if (normalize) {
120 | // [0.f, 1.f]
121 | gpu_dst.convertTo(mfloat, CV_32FC3, 1.f / 255.f);
122 | } else {
123 | // [0.f, 255.f]
124 | gpu_dst.convertTo(mfloat, CV_32FC3);
125 | }
126 |
127 | // Apply scaling and mean subtraction
128 | cv::cuda::subtract(mfloat, cv::Scalar(subVals[0], subVals[1], subVals[2]), mfloat, cv::noArray(), -1);
129 | cv::cuda::divide(mfloat, cv::Scalar(divVals[0], divVals[1], divVals[2]), mfloat, 1, -1);
130 |
131 | return mfloat;
132 | }
133 |
134 | template <typename T> void Engine<T>::clearGpuBuffers() {
135 | if (!m_buffers.empty()) {
136 | // Free GPU memory of outputs
137 | const auto numInputs = m_inputDims.size();
138 | for (int32_t outputBinding = numInputs; outputBinding < m_engine->getNbIOTensors(); ++outputBinding) {
139 | Util::checkCudaErrorCode(cudaFree(m_buffers[outputBinding]));
140 | }
141 | m_buffers.clear();
142 | }
143 | }
144 |
--------------------------------------------------------------------------------
/src/main.cpp:
--------------------------------------------------------------------------------
1 | #include "cmd_line_parser.h"
2 | #include "logger.h"
3 | #include "engine.h"
4 | #include <chrono>
5 | #include <opencv2/cudaimgproc.hpp>
6 | #include <opencv2/opencv.hpp>
7 |
8 | int main(int argc, char *argv[]) {
9 | CommandLineArguments arguments;
10 |
11 | std::string logLevelStr = getLogLevelFromEnvironment();
12 | spdlog::level::level_enum logLevel = toSpdlogLevel(logLevelStr);
13 | spdlog::set_level(logLevel);
14 |
15 | // Parse the command line arguments
16 | if (!parseArguments(argc, argv, arguments)) {
17 | return -1;
18 | }
19 |
20 | // Specify our GPU inference configuration options
21 | Options options;
22 | // Specify what precision to use for inference
23 | // FP16 is approximately twice as fast as FP32.
24 | options.precision = Precision::FP16;
25 | // If using INT8 precision, must specify path to directory containing
26 | // calibration data.
27 | options.calibrationDataDirectoryPath = "";
28 | // Specify the batch size to optimize for.
29 | options.optBatchSize = 1;
30 | // Specify the maximum batch size we plan on running.
31 | options.maxBatchSize = 1;
32 | // Specify the directory where you want the model engine model file saved.
33 | options.engineFileDir = ".";
34 |
35 | Engine<float> engine(options);
36 |
37 | // Define our preprocessing code
38 | // The default Engine::build method will normalize values between [0.f, 1.f]
39 | // Setting the normalize flag to false will leave values between [0.f, 255.f]
40 | // (some converted models may require this).
41 |
42 | // For our YoloV8 model, we need the values to be normalized between
43 | // [0.f, 1.f] so we use the following params
44 | std::array subVals{0.f, 0.f, 0.f};
45 | std::array divVals{1.f, 1.f, 1.f};
46 | bool normalize = true;
47 | // Note, we could have also used the default values.
48 |
49 | // If the model requires values to be normalized between [-1.f, 1.f], use the
50 | // following params:
51 | // subVals = {0.5f, 0.5f, 0.5f};
52 | // divVals = {0.5f, 0.5f, 0.5f};
53 | // normalize = true;
54 |
55 | if (!arguments.onnxModelPath.empty()) {
56 | // Build the onnx model into a TensorRT engine file, and load the TensorRT
57 | // engine file into memory.
58 | bool succ = engine.buildLoadNetwork(arguments.onnxModelPath, subVals, divVals, normalize);
59 | if (!succ) {
60 | throw std::runtime_error("Unable to build or load TensorRT engine.");
61 | }
62 | } else {
63 | // Load the TensorRT engine file directly
64 | bool succ = engine.loadNetwork(arguments.trtModelPath, subVals, divVals, normalize);
65 | if (!succ) {
66 | const std::string msg = "Unable to load TensorRT engine.";
67 | spdlog::error(msg);
68 | throw std::runtime_error(msg);
69 | }
70 | }
71 |
72 | // Read the input image
73 | // TODO: You will need to read the input image required for your model
74 | const std::string inputImage = "../inputs/team.jpg";
75 | auto cpuImg = cv::imread(inputImage);
76 | if (cpuImg.empty()) {
77 | const std::string msg = "Unable to read image at path: " + inputImage;
78 | spdlog::error(msg);
79 | throw std::runtime_error(msg);
80 | }
81 |
82 | // Upload the image to GPU memory
83 | cv::cuda::GpuMat img;
84 | img.upload(cpuImg);
85 |
86 | // The model expects RGB input
87 | cv::cuda::cvtColor(img, img, cv::COLOR_BGR2RGB);
88 |
89 | // In the following section we populate the input vectors to later pass for
90 | // inference
91 | const auto &inputDims = engine.getInputDims();
92 | std::vector<std::vector<cv::cuda::GpuMat>> inputs;
93 |
94 | // Let's use a batch size which matches that which we set the
95 | // Options.optBatchSize option
96 | size_t batchSize = options.optBatchSize;
97 |
98 | // TODO:
99 | // For the sake of the demo, we will be feeding the same image to all the
100 | // inputs You should populate your inputs appropriately.
101 | for (const auto &inputDim : inputDims) { // For each of the model inputs...
102 | std::vector<cv::cuda::GpuMat> input;
103 | for (size_t j = 0; j < batchSize; ++j) { // For each element we want to add to the batch...
104 | // TODO:
105 | // You can choose to resize by scaling, adding padding, or a combination
106 | // of the two in order to maintain the aspect ratio You can use the
107 | // Engine::resizeKeepAspectRatioPadRightBottom to resize to a square while
108 | // maintain the aspect ratio (adds padding where necessary to achieve
109 | // this).
110 | auto resized = Engine<float>::resizeKeepAspectRatioPadRightBottom(img, inputDim.d[1], inputDim.d[2]);
111 | // You could also perform a resize operation without maintaining aspect
112 | // ratio with the use of padding by using the following instead:
113 | // cv::cuda::resize(img, resized, cv::Size(inputDim.d[2],
114 | // inputDim.d[1])); // TRT dims are (height, width) whereas
115 | // OpenCV is (width, height)
116 | input.emplace_back(std::move(resized));
117 | }
118 | inputs.emplace_back(std::move(input));
119 | }
120 |
121 | // Warm up the network before we begin the benchmark
122 | spdlog::info("Warming up the network...");
123 | std::vector<std::vector<std::vector<float>>> featureVectors;
124 | for (int i = 0; i < 100; ++i) {
125 | bool succ = engine.runInference(inputs, featureVectors);
126 | if (!succ) {
127 | const std::string msg = "Unable to run inference.";
128 | spdlog::error(msg);
129 | throw std::runtime_error(msg);
130 | }
131 | }
132 |
133 | // Benchmark the inference time
134 | size_t numIterations = 1000;
135 | spdlog::info("Running benchmarks ({} iterations)...", numIterations);
136 | preciseStopwatch stopwatch;
137 | for (size_t i = 0; i < numIterations; ++i) {
138 | featureVectors.clear();
139 | engine.runInference(inputs, featureVectors);
140 | }
141 | auto totalElapsedTimeMs = stopwatch.elapsedTime();
142 | auto avgElapsedTimeMs = totalElapsedTimeMs / numIterations / static_cast<float>(inputs[0].size());
143 |
144 | spdlog::info("Benchmarking complete!");
145 | spdlog::info("======================");
146 | spdlog::info("Avg time per sample: ");
147 | spdlog::info("Avg time per sample: {} ms", avgElapsedTimeMs);
148 | spdlog::info("Batch size: {}", inputs[0].size());
149 | spdlog::info("Avg FPS: {} fps", static_cast(1000 / avgElapsedTimeMs));
150 | spdlog::info("======================\n");
151 |
152 | // Print the feature vectors
153 | for (size_t batch = 0; batch < featureVectors.size(); ++batch) {
154 | for (size_t outputNum = 0; outputNum < featureVectors[batch].size(); ++outputNum) {
155 | spdlog::info("Batch {}, output {}", batch, outputNum);
156 | std::string output;
157 | int i = 0;
158 | for (const auto &e : featureVectors[batch][outputNum]) {
159 | output += std::to_string(e) + " ";
160 | if (++i == 10) {
161 | output += "...";
162 | break;
163 | }
164 | }
165 | spdlog::info("{}", output);
166 | }
167 | }
168 |
169 | // TODO: If your model requires post processing (ex. convert feature vector
170 | // into bounding boxes) then you would do so here.
171 |
172 | return 0;
173 | }
174 |
--------------------------------------------------------------------------------
/src/engine.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include "NvOnnxParser.h"
4 | #include <algorithm>
5 | #include <array>
6 | #include <cuda_runtime.h>
7 | #include <filesystem>
8 | #include <fstream>
9 | #include <memory>
10 | #include <opencv2/core/cuda.hpp>
11 | #include <string>
12 | #include <vector>
13 |
14 | #include "IEngine.h"
15 | #include "logger.h"
16 | #include "Int8Calibrator.h"
17 | #include "util/Util.h"
18 | #include "util/Stopwatch.h"
19 | #include "macros.h"
20 |
21 | // Precision used for GPU inference
22 | enum class Precision {
23 | // Full precision floating point value
24 | FP32,
25 | // Half precision floating point value
26 | FP16,
27 | // Int8 quantization.
28 | // Has reduced dynamic range, may result in slight loss in accuracy.
29 | // If INT8 is selected, must provide path to calibration dataset directory.
30 | INT8,
31 | };
32 |
33 | // Options for the network
34 | struct Options {
35 | // Precision to use for GPU inference.
36 | Precision precision = Precision::FP16;
37 | // If INT8 precision is selected, must provide path to calibration dataset
38 | // directory.
39 | std::string calibrationDataDirectoryPath;
40 | // The batch size to be used when computing calibration data for INT8
41 | // inference. Should be set to as large a batch number as your GPU will
42 | // support.
43 | int32_t calibrationBatchSize = 128;
44 | // The batch size which should be optimized for.
45 | int32_t optBatchSize = 1;
46 | // Maximum allowable batch size
47 | int32_t maxBatchSize = 16;
48 | // GPU device index
49 | int deviceIndex = 0;
50 | // Directory where the engine file should be saved
51 | std::string engineFileDir = ".";
52 | // Maximum allowed input width
53 | int32_t maxInputWidth = -1; // Default to -1 --> expecting fixed input size
54 | // Minimum allowed input width
55 | int32_t minInputWidth = -1; // Default to -1 --> expecting fixed input size
56 | // Optimal input width
57 | int32_t optInputWidth = -1; // Default to -1 --> expecting fixed input size
58 | };
59 |
60 | // Class to extend TensorRT logger
61 | class Logger : public nvinfer1::ILogger {
62 | void log(Severity severity, const char *msg) noexcept override;
63 | };
64 |
65 | template <typename T>
66 | class Engine : public IEngine<T> {
67 | public:
68 | Engine(const Options &options);
69 | ~Engine();
70 |
71 | // Build the onnx model into a TensorRT engine file, cache the model to disk
72 | // (to avoid rebuilding in future), and then load the model into memory The
73 | // default implementation will normalize values between [0.f, 1.f] Setting the
74 | // normalize flag to false will leave values between [0.f, 255.f] (some
75 | // converted models may require this). If the model requires values to be
76 | // normalized between [-1.f, 1.f], use the following params:
77 | // subVals = {0.5f, 0.5f, 0.5f};
78 | // divVals = {0.5f, 0.5f, 0.5f};
79 | // normalize = true;
80 | bool buildLoadNetwork(std::string onnxModelPath, const std::array<float, 3> &subVals = {0.f, 0.f, 0.f},
81 | const std::array<float, 3> &divVals = {1.f, 1.f, 1.f}, bool normalize = true) override;
82 |
83 | // Load a TensorRT engine file from disk into memory
84 | // The default implementation will normalize values between [0.f, 1.f]
85 | // Setting the normalize flag to false will leave values between [0.f, 255.f]
86 | // (some converted models may require this). If the model requires values to
87 | // be normalized between [-1.f, 1.f], use the following params:
88 | // subVals = {0.5f, 0.5f, 0.5f};
89 | // divVals = {0.5f, 0.5f, 0.5f};
90 | // normalize = true;
91 | bool loadNetwork(std::string trtModelPath, const std::array<float, 3> &subVals = {0.f, 0.f, 0.f},
92 | const std::array<float, 3> &divVals = {1.f, 1.f, 1.f}, bool normalize = true) override;
93 |
94 | // Run inference.
95 | // Input format [input][batch][cv::cuda::GpuMat]
96 | // Output format [batch][output][feature_vector]
97 | bool runInference(const std::vector<std::vector<cv::cuda::GpuMat>> &inputs, std::vector<std::vector<std::vector<T>>> &featureVectors) override;
98 |
99 | // Utility method for resizing an image while maintaining the aspect ratio by
100 | // adding padding to smaller dimension after scaling While letterbox padding
101 | // normally adds padding to top & bottom, or left & right sides, this
102 | // implementation only adds padding to the right or bottom side This is done
103 | // so that it's easier to convert detected coordinates (ex. YOLO model) back
104 | // to the original reference frame.
105 | static cv::cuda::GpuMat resizeKeepAspectRatioPadRightBottom(const cv::cuda::GpuMat &input, size_t height, size_t width,
106 | const cv::Scalar &bgcolor = cv::Scalar(0, 0, 0));
107 |
108 | [[nodiscard]] const std::vector<nvinfer1::Dims3> &getInputDims() const override { return m_inputDims; };
109 | [[nodiscard]] const std::vector<nvinfer1::Dims> &getOutputDims() const override { return m_outputDims; };
110 |
111 | // Utility method for transforming triple nested output array into 2D array
112 | // Should be used when the output batch size is 1, but there are multiple
113 | // output feature vectors
114 | static void transformOutput(std::vector<std::vector<std::vector<T>>> &input, std::vector<std::vector<T>> &output);
115 |
116 | // Utility method for transforming triple nested output array into single
117 | // array Should be used when the output batch size is 1, and there is only a
118 | // single output feature vector
119 | static void transformOutput(std::vector<std::vector<std::vector<T>>> &input, std::vector<T> &output);
120 | // Convert NHWC to NCHW and apply scaling and mean subtraction
121 | static cv::cuda::GpuMat blobFromGpuMats(const std::vector<cv::cuda::GpuMat> &batchInput, const std::array<float, 3> &subVals,
122 | const std::array<float, 3> &divVals, bool normalize, bool swapRB = false);
123 |
124 | private:
125 | // Build the network
126 | bool build(std::string onnxModelPath, const std::array<float, 3> &subVals, const std::array<float, 3> &divVals, bool normalize);
127 |
128 | // Converts the engine options into a string
129 | std::string serializeEngineOptions(const Options &options, const std::string &onnxModelPath);
130 |
131 | void getDeviceNames(std::vector<std::string> &deviceNames);
132 |
133 | void clearGpuBuffers();
134 |
135 | // Normalization, scaling, and mean subtraction of inputs
136 | std::array<float, 3> m_subVals{};
137 | std::array<float, 3> m_divVals{};
138 | bool m_normalize;
139 |
140 | // Holds pointers to the input and output GPU buffers
141 | std::vector<void *> m_buffers;
142 | std::vector<uint32_t> m_outputLengths{};
143 | std::vector<nvinfer1::Dims3> m_inputDims;
144 | std::vector<nvinfer1::Dims> m_outputDims;
145 | std::vector<std::string> m_IOTensorNames;
146 | int32_t m_inputBatchSize;
147 |
148 | // Must keep IRuntime around for inference, see:
149 | // https://forums.developer.nvidia.com/t/is-it-safe-to-deallocate-nvinfer1-iruntime-after-creating-an-nvinfer1-icudaengine-but-before-running-inference-with-said-icudaengine/255381/2?u=cyruspk4w6
150 | std::unique_ptr<nvinfer1::IRuntime> m_runtime = nullptr;
151 | std::unique_ptr<Int8EntropyCalibrator2> m_calibrator = nullptr;
152 | std::unique_ptr<nvinfer1::ICudaEngine> m_engine = nullptr;
153 | std::unique_ptr<nvinfer1::IExecutionContext> m_context = nullptr;
154 | const Options m_options;
155 | Logger m_logger;
156 | };
157 |
158 | template <typename T> Engine<T>::Engine(const Options &options) : m_options(options) {}
159 |
160 | template <typename T> Engine<T>::~Engine() { clearGpuBuffers(); }
161 |
162 | // Include inline implementations
163 | #include "engine/EngineRunInference.inl"
164 | #include "engine/EngineUtilities.inl"
165 | #include "engine/EngineBuildLoadNetwork.inl"
166 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [![Stargazers][stars-shield]][stars-url]
2 |
3 | [](#contributors-)
4 |
5 | [![Issues][issues-shield]][issues-url]
6 | [![LinkedIn][linkedin-shield]][linkedin-url]
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
15 | TensorRT C++ API Tutorial
16 |
17 |
18 |
19 | How to use TensorRT C++ API for high performance GPU machine-learning inference.
20 |
21 |
22 | Supports models with single / multiple inputs and single / multiple outputs with batching.
23 |
24 |
25 | Project Overview Video
26 | .
27 | Code Deep-Dive Video
28 |
29 |
30 |
31 | ## Looking for Maintainers 🚀
32 |
33 | This project is actively seeking maintainers to help guide its growth and improvement. If you're passionate about this project and interested in contributing, I’d love to hear from you!
34 |
35 | Please feel free to reach out via [LinkedIn](https://www.linkedin.com/in/cyrus-behroozi/) to discuss how you can get involved.
36 |
37 |
38 | # TensorRT C++ Tutorial
39 | *I read all the NVIDIA TensorRT docs so that you don't have to!*
40 |
41 | This project demonstrates how to use the TensorRT C++ API for high performance GPU inference on image data. It covers how to do the following:
42 | - How to install TensorRT 10 on Ubuntu 20.04 / 22.04.
43 | - How to generate a TensorRT engine file optimized for your GPU.
44 | - How to specify a simple optimization profile.
45 | - How to run FP32, FP16, or INT8 precision inference.
46 | - How to read / write data from / into GPU memory and work with GPU images.
47 | - How to use CUDA streams to run asynchronous inference and later synchronize.
48 | - How to work with models with static and dynamic batch sizes.
49 | - How to work with models with single or multiple output tensors.
50 | - How to work with models with multiple inputs.
51 | - Includes a [Video walkthrough](https://youtu.be/Z0n5aLmcRHQ) where I explain every line of code.
52 | - The code can be used as a base for any model which takes a fixed size image / images as input, including [Insightface](https://github.com/deepinsight/insightface) [ArcFace](https://github.com/onnx/models/tree/main/vision/body_analysis/arcface), [YoloV8](https://github.com/ultralytics/ultralytics), [SCRFD](https://insightface.ai/scrfd) face detection.
53 | - You will just need to implement the appropriate post-processing code.
54 | - TODO: Add support for models with dynamic input shapes.
55 | - TODO: Add support for Windows
56 |
57 | ## Getting Started
58 | The following instructions assume you are using Ubuntu 20.04 or 22.04.
59 | You will need to supply your own onnx model for this sample code, or you can download the sample model (see the Sanity Check section below).
60 |
61 | ### Prerequisites
62 | - Tested and working on Ubuntu 20.04 and 22.04 (Windows is **not** supported at this time)
63 | - Install CUDA 11 or 12, instructions [here](https://developer.nvidia.com/cuda-downloads).
64 | - Recommended >= 12.0
65 | - Required >= 11.0
66 | - Install cuDNN, instructions [here](https://docs.nvidia.com/deeplearning/cudnn/install-guide/index.html#download).
67 | - Required >= 8
68 | - Required < 9 (OpenCV with CUDA support does not yet work with cuDNN 9)
69 | - `sudo apt install build-essential`
70 | - `sudo snap install cmake --classic`
71 | - `sudo apt install libspdlog-dev libfmt-dev` (for logging)
72 | - Install OpenCV with cuda support. To compile OpenCV from source, run the `build_opencv.sh` script provided in `./scripts/`.
73 | - If you use the provided script and you have installed cuDNN to a non-standard location, you must modify the `CUDNN_INCLUDE_DIR` and `CUDNN_LIBRARY` variables in the script.
74 | - Recommended >= 4.8
75 | - Download TensorRT 10 from [here](https://developer.nvidia.com/tensorrt/download/10x).
76 | - Required >= 10.0
77 | - Navigate to the `CMakeLists.txt` file and replace the `TODO` with the path to your TensorRT installation.
78 |
79 | ### Building the Library
80 | - `mkdir build`
81 | - `cd build`
82 | - `cmake ..`
83 | - `make -j$(nproc)`
84 |
85 | ### Running the Executable
86 | - Navigate to the build directory
87 | - Run the executable and provide the path to your onnx model.
88 | - ex. `./run_inference_benchmark --onnx_model ../models/yolov8n.onnx`
89 | - Note: See sanity check section below for instructions on how to obtain the yolov8n model.
90 | - The first time you run the executable for a given model and options, a TensorRT engine file will be built from your onnx model. This process is fairly slow and can take 5+ minutes for some models (ex. yolo models).
91 | - Alternatively, you can choose to supply your own TensorRT engine file directly:
92 | - ex. `./run_inference_benchmark --trt_model ../models/yolov8n.engine.NVIDIAGeForceRTX3080LaptopGPU.fp16.1.1`
93 | - Note: See the V5.0 changelog below for warnings when supplying your own TensorRT engine file.
94 |
95 | ### Sanity Check
96 | - To perform a sanity check, download the `YOLOv8n` model from [here](https://github.com/ultralytics/ultralytics#models).
97 | - Next, convert it from pytorch to onnx using the following script:
98 | - You will need to run `pip3 install ultralytics` first.
99 |
100 | ```python
101 | from ultralytics import YOLO
102 | model = YOLO("./yolov8n.pt")
103 | model.fuse()
104 | model.info(verbose=False) # Print model information
105 | model.export(format="onnx", opset=12) # Export the model to onnx using opset 12
106 | ```
107 |
108 | - Place the resulting onnx model, `yolov8n.onnx`, in the `./models/` directory.
109 | - Running inference using said model and the image located in `./inputs/team.jpg` should produce the following feature vector:
110 | - Note: The feature vector will not be identical (but very similar) as [TensorRT is not deterministic](https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#determinism).
111 | ```text
112 | 3.41113 16.5312 20.8828 29.8984 43.7266 54.9609 62.0625 65.8594 70.0312 72.9531 ...
113 | ```
114 |
115 | ### INT8 Inference
116 | Enabling INT8 precision can further speed up inference at the cost of accuracy reduction due to reduced dynamic range.
117 | For INT8 precision, the user must supply calibration data which is representative of the real data the model will see.
118 | It is advised to use 1000+ calibration images. To enable INT8 inference with the YoloV8 sanity-check model, take the following steps (a short sketch of the relevant `Options` settings follows this list):
119 | - Change `options.precision = Precision::FP16;` to `options.precision = Precision::INT8;` in `main.cpp`
120 | - Set `options.calibrationDataDirectoryPath` in `main.cpp` to the path of the directory containing the calibration data.
121 | - If using the YoloV8 model, it is advised to use the COCO validation dataset, which can be downloaded with `wget http://images.cocodataset.org/zips/val2017.zip`
122 | - Make sure the resizing code in the `Int8EntropyCalibrator2::getBatch` method in `engine.cpp` (see `TODO`) is correct for your model.
123 | - If using the YoloV8 model, the preprocessing code is correct and does not need to be changed.
124 | - Recompile and run the executable.
125 | - The calibration cache will be written to disk (`.calibration` extension) so that on subsequent model optimizations it can be reused. If you'd like to regenerate the calibration data, you must delete this cache file.
126 | - If you get an "out of memory in function allocate" error, then you must reduce `Options.calibrationBatchSize` so that the entire batch can fit in your GPU memory.
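As a rough, hedged sketch (the field names are the ones referenced above; the path and batch size are illustrative placeholders, not recommendations), the relevant part of `main.cpp` would look something like this:

```cpp
// Sketch only: values are illustrative placeholders.
Options options;
options.precision = Precision::INT8;
options.calibrationDataDirectoryPath = "/path/to/val2017"; // directory of representative calibration images
options.calibrationBatchSize = 128;                        // reduce this if calibration runs out of GPU memory

Engine<float> engine(options);
```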
127 |
128 | ### Benchmarks
129 | Benchmarks run on RTX 3050 Ti Laptop GPU, 11th Gen Intel(R) Core(TM) i9-11900H @ 2.50GHz.
130 |
131 | | Model | Precision | Batch Size | Avg Inference Time |
132 | |---------|-----------|------------|--------------------|
133 | | yolov8n | FP32 | 1 | 4.732 ms |
134 | | yolov8n | FP16 | 1 | 2.493 ms |
135 | | yolov8n | INT8 | 1 | 2.009 ms |
136 | | yolov8x | FP32 | 1 | 76.63 ms |
137 | | yolov8x | FP16 | 1 | 25.08 ms |
138 | | yolov8x | INT8 | 1 | 11.62 ms |
139 |
140 | ### Sample Integration
141 | Wondering how to integrate this library into your project? Or perhaps how to read the outputs of the YoloV8 model to extract meaningful information?
142 | If so, check out my two latest projects, [YOLOv8-TensorRT-CPP](https://github.com/cyrusbehr/YOLOv8-TensorRT-CPP) and [YOLOv9-TensorRT-CPP](https://github.com/cyrusbehr/YOLOv9-TensorRT-CPP), which demonstrate how to use the TensorRT C++ API to run YoloV8/9 inference (supports object detection, semantic segmentation, and body pose estimation). They make use of this project in the backend!
143 |
144 | ### Project Structure
145 | ```sh
146 | project-root/
147 | ├── include/
148 | │ ├── engine/
149 | │ │ ├── EngineRunInference.inl
150 | │ │ ├── EngineUtilities.inl
151 | │ │ └── EngineBuildLoadNetwork.inl
152 | │ ├── util/...
153 | │ ├── ...
154 | ├── src/
155 | │ ├── ...
156 | │ ├── engine.cpp
157 | │ ├── engine.h
158 | │ └── main.cpp
159 | ├── CMakeLists.txt
160 | └── README.md
161 | ```
162 |
163 | ### Understanding the Code
164 | - The bulk of the implementation is located in `include/engine`. I have written lots of comments all throughout the code which should make it easy to understand what is going on.
165 | - The inference code is located in `include/engine/EngineRunInference.inl`.
166 | - The building and loading of the TensorRT engine file is located in `include/engine/EngineBuildLoadNetwork.inl`. A minimal usage sketch follows this list.
167 | - You can also check out my [deep-dive video](https://youtu.be/Z0n5aLmcRHQ) in which I explain every line of code.
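To tie these pieces together, here is a minimal, non-authoritative sketch of how the engine is typically constructed and loaded. The `Options` fields and `Engine` API shown are the ones referenced elsewhere in this README; the normalization constants and include path are placeholders, so refer to `main.cpp` for the values actually used.

```cpp
#include <array>
#include "engine.h"

int main() {
    // Sketch only: constants below are illustrative placeholders.
    Options options;
    options.precision = Precision::FP16;
    options.optBatchSize = 1;
    options.maxBatchSize = 1;

    Engine<float> engine(options);

    // subVals / divVals feed the input preprocessing; see the engine code for
    // the exact order of operations applied to each pixel.
    const std::array<float, 3> subVals{0.f, 0.f, 0.f};
    const std::array<float, 3> divVals{1.f, 1.f, 1.f};
    const bool normalize = true;

    // Builds a TensorRT engine from the onnx model if no cached engine file
    // exists for these Options, then deserializes and loads it.
    if (!engine.buildLoadNetwork("../models/yolov8n.onnx", subVals, divVals, normalize)) {
        return -1;
    }
    return 0;
}
```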
168 |
169 | ### How to Debug
170 | - The implementation uses the `spdlog` library for logging. You can change the log level by setting the environment variable `LOG_LEVEL` to one of the following values: `trace`, `debug`, `info`, `warn`, `error`, `critical`, `off`.
171 |
172 | - If you have issues creating the TensorRT engine file from the onnx model, consider setting the environment variable `LOG_LEVEL` to `trace` and re-running the application. This should give you more information on where exactly the build process is failing.
173 |
174 | ### Show your Appreciation
175 | If this project was helpful to you, I would appreciate it if you could give it a star. That will encourage me to keep it up to date and solve issues quickly. I also do consulting work if you require more specific help. Connect with me on [LinkedIn](https://www.linkedin.com/in/cyrus-behroozi/).
176 |
177 | ### Contributors
178 |
179 |
180 |
181 |
182 |
191 |
192 |
193 |
194 |
195 |
196 |
197 | ### Changelog
198 |
199 | **V6.0**
200 |
201 | - Implementation now requires TensorRT >= 10.0.
202 |
203 | **V5.0**
204 |
205 | - `Engine` class has been modified to take a template parameter which specifies the model's output data type (see the sketch below). The implementation now supports outputs of type `float`, `__half`, `int8_t`, `int32_t`, `bool`, and `uint8_t`.
206 | - Added support for loading a TensorRT engine file directly without needing to compile from the onnx model. However, it is highly recommended that you use the API provided to build the engine file from the onnx model, instead of loading a TensorRT model directly. If you choose to load a TensorRT model file directly, you must hand-check that the `Options` have been set correctly for your model (for example, if your model has been compiled for FP32 but you try running FP16 inference, it will fail, potentially without a verbose error).
207 | - Added command line parser.
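For illustration only (not taken verbatim from the project), the template argument is chosen to match the model's output tensor type:

```cpp
// The template argument must match the data type of the model's output tensors;
// a mismatch is detected when the engine is loaded and an error is thrown.
Engine<float> engineForFloatOutputs(options);  // model outputs float tensors
Engine<__half> engineForHalfOutputs(options);  // model outputs half-precision tensors
```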
208 |
209 | **V4.1**
210 |
211 | - Added support for fixed batch size > 1.
212 |
213 | **V4.0**
214 |
215 | - Added support for INT8 precision.
216 |
217 |
218 | **V3.0**
219 |
220 | - Implementation has been updated to use TensorRT 8.6 API (ex. `IExecutionContext::enqueueV3()`).
221 | - The executable has been renamed from `driver` to `run_inference_benchmark` and must now be passed the path to the onnx model as a command-line argument.
222 | - Removed `Options.doesSupportDynamicBatchSize`. Implementation now auto-detects supported batch sizes.
223 | - Removed `Options.maxWorkspaceSize`. The implementation no longer limits GPU memory during engine construction, allowing it to use as much of the memory pool as is available for intermediate layers.
224 |
225 | **v2.2**
226 |
227 | - Serialize model name as part of engine file.
228 |
229 | **V2.1**
230 |
231 | - Added support for models with multiple inputs. Implementation now supports models with single inputs, multiple inputs, single outputs, multiple outputs, and batching.
232 |
233 | **V2.0**
234 |
235 | - Requires OpenCV cuda to be installed. To install, follow instructions [here](https://gist.github.com/raulqf/f42c718a658cddc16f9df07ecc627be7).
236 | - `Options.optBatchSizes` has been removed, replaced by `Options.optBatchSize`.
237 | - Support models with more than a single output (ex. SCRFD).
238 | - Added support for models which do not support batch inference (first input dimension is fixed).
239 | - More error checking.
240 | - Fixed a bunch of common issues people were running into with the original V1.0 version.
241 | - Remove whitespace from GPU device name
242 |
243 |
244 |
245 | [stars-shield]: https://img.shields.io/github/stars/cyrusbehr/tensorrt-cpp-api.svg?style=flat-square
246 | [stars-url]: https://github.com/cyrusbehr/tensorrt-cpp-api/stargazers
247 | [issues-shield]: https://img.shields.io/github/issues/cyrusbehr/tensorrt-cpp-api.svg?style=flat-square
248 | [issues-url]: https://github.com/cyrusbehr/tensorrt-cpp-api/issues
249 | [linkedin-shield]: https://img.shields.io/badge/-LinkedIn-black.svg?style=flat-square&logo=linkedin&colorB=555
250 | [linkedin-url]: https://linkedin.com/in/cyrus-behroozi/
251 |
252 | ## Contributors ✨
253 |
254 | Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/docs/en/emoji-key)):
255 |
256 |
257 |
258 |
259 |
260 |
261 |
262 |
263 | This project follows the [all-contributors](https://github.com/all-contributors/all-contributors) specification. Contributions of any kind welcome!
264 |
--------------------------------------------------------------------------------
/include/engine/EngineBuildLoadNetwork.inl:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include <filesystem>
3 | #include <fstream>
4 | #include "util/Util.h"
5 |
6 | template <typename T>
7 | bool Engine<T>::buildLoadNetwork(std::string onnxModelPath, const std::array<float, 3> &subVals, const std::array<float, 3> &divVals,
8 | bool normalize) {
9 | const auto engineName = serializeEngineOptions(m_options, onnxModelPath);
10 | const auto engineDir = std::filesystem::path(m_options.engineFileDir);
11 | std::filesystem::path enginePath = engineDir / engineName;
12 | spdlog::info("Searching for engine file with name: {}", enginePath.string());
13 |
14 | if (Util::doesFileExist(enginePath)) {
15 | spdlog::info("Engine found, not regenerating...");
16 | } else {
17 | if (!Util::doesFileExist(onnxModelPath)) {
18 | auto msg = "Could not find ONNX model at path: " + onnxModelPath;
19 | spdlog::error(msg);
20 | throw std::runtime_error(msg);
21 | }
22 |
23 | spdlog::info("Engine not found, generating. This could take a while...");
24 | if (!std::filesystem::exists(engineDir)) {
25 | std::filesystem::create_directories(engineDir);
26 | spdlog::info("Created directory: {}", engineDir.string());
27 | }
28 |
29 | auto ret = build(onnxModelPath, subVals, divVals, normalize);
30 | if (!ret) {
31 | return false;
32 | }
33 | }
34 |
35 | return loadNetwork(enginePath, subVals, divVals, normalize);
36 | }
37 |
38 | template <typename T>
39 | bool Engine<T>::loadNetwork(std::string trtModelPath, const std::array<float, 3> &subVals, const std::array<float, 3> &divVals,
40 | bool normalize) {
41 | m_subVals = subVals;
42 | m_divVals = divVals;
43 | m_normalize = normalize;
44 |
45 | // Read the serialized model from disk
46 | if (!Util::doesFileExist(trtModelPath)) {
47 | auto msg = "Error, unable to read TensorRT model at path: " + trtModelPath;
48 | spdlog::error(msg);
49 | return false;
50 | } else {
51 | auto msg = "Loading TensorRT engine file at path: " + trtModelPath;
52 | spdlog::info(msg);
53 | }
54 |
55 | std::ifstream file(trtModelPath, std::ios::binary | std::ios::ate);
56 | std::streamsize size = file.tellg();
57 | file.seekg(0, std::ios::beg);
58 |
59 | std::vector<char> buffer(size);
60 | if (!file.read(buffer.data(), size)) {
61 | auto msg = "Error, unable to read engine file";
62 | spdlog::error(msg);
63 | throw std::runtime_error(msg);
64 | }
65 |
66 | // Create a runtime to deserialize the engine file.
67 | m_runtime = std::unique_ptr<nvinfer1::IRuntime>{nvinfer1::createInferRuntime(m_logger)};
68 | if (!m_runtime) {
69 | return false;
70 | }
71 |
72 | // Set the device index
73 | auto ret = cudaSetDevice(m_options.deviceIndex);
74 | if (ret != 0) {
75 | int numGPUs;
76 | cudaGetDeviceCount(&numGPUs);
77 | auto errMsg = "Unable to set GPU device index to: " + std::to_string(m_options.deviceIndex) + ". Note, your device has " +
78 | std::to_string(numGPUs) + " CUDA-capable GPU(s).";
79 | spdlog::error(errMsg);
80 | throw std::runtime_error(errMsg);
81 | }
82 |
83 | // Create an engine, a representation of the optimized model.
84 | m_engine = std::unique_ptr<nvinfer1::ICudaEngine>(m_runtime->deserializeCudaEngine(buffer.data(), buffer.size()));
85 | if (!m_engine) {
86 | return false;
87 | }
88 |
89 | // The execution context contains all of the state associated with a
90 | // particular invocation
91 | m_context = std::unique_ptr<nvinfer1::IExecutionContext>(m_engine->createExecutionContext());
92 | if (!m_context) {
93 | return false;
94 | }
95 |
96 | // Storage for holding the input and output buffers
97 | // This will be passed to TensorRT for inference
98 | clearGpuBuffers();
99 | m_buffers.resize(m_engine->getNbIOTensors());
100 |
101 | m_outputLengths.clear();
102 | m_inputDims.clear();
103 | m_outputDims.clear();
104 | m_IOTensorNames.clear();
105 |
106 | // Create a cuda stream
107 | cudaStream_t stream;
108 | Util::checkCudaErrorCode(cudaStreamCreate(&stream));
109 |
110 | // Allocate GPU memory for input and output buffers
111 | m_outputLengths.clear();
112 | for (int i = 0; i < m_engine->getNbIOTensors(); ++i) {
113 | const auto tensorName = m_engine->getIOTensorName(i);
114 | m_IOTensorNames.emplace_back(tensorName);
115 | const auto tensorType = m_engine->getTensorIOMode(tensorName);
116 | const auto tensorShape = m_engine->getTensorShape(tensorName);
117 | const auto tensorDataType = m_engine->getTensorDataType(tensorName);
118 |
119 | if (tensorType == nvinfer1::TensorIOMode::kINPUT) {
120 | // The implementation currently only supports inputs of type float
121 | if (m_engine->getTensorDataType(tensorName) != nvinfer1::DataType::kFLOAT) {
122 | auto msg = "Error, the implementation currently only supports float inputs";
123 | spdlog::error(msg);
124 | throw std::runtime_error(msg);
125 | }
126 |
127 | // Don't need to allocate memory for inputs as we will be using the OpenCV
128 | // GpuMat buffer directly.
129 |
130 | // Store the input dims for later use
131 | m_inputDims.emplace_back(tensorShape.d[1], tensorShape.d[2], tensorShape.d[3]);
132 | m_inputBatchSize = tensorShape.d[0];
133 | } else if (tensorType == nvinfer1::TensorIOMode::kOUTPUT) {
134 | // Ensure the model output data type matches the template argument
135 | // specified by the user
136 | if (tensorDataType == nvinfer1::DataType::kFLOAT && !std::is_same<float, T>::value) {
137 | auto msg = "Error, the model has expected output of type float. Engine class template parameter must be adjusted.";
138 | spdlog::error(msg);
139 | throw std::runtime_error(msg);
140 | } else if (tensorDataType == nvinfer1::DataType::kHALF && !std::is_same<__half, T>::value) {
141 | auto msg = "Error, the model has expected output of type __half. Engine class template parameter must be adjusted.";
142 | spdlog::error(msg);
143 | throw std::runtime_error(msg);
144 | } else if (tensorDataType == nvinfer1::DataType::kINT8 && !std::is_same<int8_t, T>::value) {
145 | auto msg = "Error, the model has expected output of type int8_t. Engine class template parameter must be adjusted.";
146 | spdlog::error(msg);
147 | throw std::runtime_error(msg);
148 | } else if (tensorDataType == nvinfer1::DataType::kINT32 && !std::is_same<int32_t, T>::value) {
149 | auto msg = "Error, the model has expected output of type int32_t. Engine class template parameter must be adjusted.";
150 | spdlog::error(msg);
151 | throw std::runtime_error(msg);
152 | } else if (tensorDataType == nvinfer1::DataType::kBOOL && !std::is_same<bool, T>::value) {
153 | auto msg = "Error, the model has expected output of type bool. Engine class template parameter must be adjusted.";
154 | spdlog::error(msg);
155 | throw std::runtime_error(msg);
156 | } else if (tensorDataType == nvinfer1::DataType::kUINT8 && !std::is_same<uint8_t, T>::value) {
157 | auto msg = "Error, the model has expected output of type uint8_t. Engine class template parameter must be adjusted.";
158 | spdlog::error(msg);
159 | throw std::runtime_error(msg);
160 | } else if (tensorDataType == nvinfer1::DataType::kFP8) {
161 | auto msg = "Error, the model has expected output of type kFP8. This is not supported by the Engine class.";
162 | spdlog::error(msg);
163 | throw std::runtime_error(msg);
164 | }
165 |
166 | // The binding is an output
167 | uint32_t outputLength = 1;
168 | m_outputDims.push_back(tensorShape);
169 |
170 | for (int j = 1; j < tensorShape.nbDims; ++j) {
171 | // We ignore j = 0 because that is the batch size, and we will take that
172 | // into account when sizing the buffer
173 | outputLength *= tensorShape.d[j];
174 | }
175 |
176 | m_outputLengths.push_back(outputLength);
177 | // Now size the output buffer appropriately, taking into account the max
178 | // possible batch size (although we could actually end up using less
179 | // memory)
180 | Util::checkCudaErrorCode(cudaMallocAsync(&m_buffers[i], outputLength * m_options.maxBatchSize * sizeof(T), stream));
181 | } else {
182 | auto msg = "Error, IO Tensor is neither an input or output!";
183 | spdlog::error(msg);
184 | throw std::runtime_error(msg);
185 | }
186 | }
187 |
188 | // Synchronize and destroy the cuda stream
189 | Util::checkCudaErrorCode(cudaStreamSynchronize(stream));
190 | Util::checkCudaErrorCode(cudaStreamDestroy(stream));
191 |
192 | return true;
193 | }
194 |
195 |
196 | template <typename T>
197 | bool Engine<T>::build(std::string onnxModelPath, const std::array<float, 3> &subVals, const std::array<float, 3> &divVals, bool normalize) {
198 | // Create our engine builder.
199 | auto builder = std::unique_ptr<nvinfer1::IBuilder>(nvinfer1::createInferBuilder(m_logger));
200 | if (!builder) {
201 | return false;
202 | }
203 |
204 | // Define an explicit batch size and then create the network (implicit batch
205 | // size is deprecated). More info here:
206 | // https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#explicit-implicit-batch
207 | auto explicitBatch = 1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
208 | auto network = std::unique_ptr<nvinfer1::INetworkDefinition>(builder->createNetworkV2(explicitBatch));
209 | if (!network) {
210 | return false;
211 | }
212 |
213 | // Create a parser for reading the onnx file.
214 | auto parser = std::unique_ptr<nvonnxparser::IParser>(nvonnxparser::createParser(*network, m_logger));
215 | if (!parser) {
216 | return false;
217 | }
218 |
219 | // We are going to first read the onnx file into memory, then pass that buffer
220 | // to the parser. Had our onnx model file been encrypted, this approach would
221 | // allow us to first decrypt the buffer.
222 | std::ifstream file(onnxModelPath, std::ios::binary | std::ios::ate);
223 | std::streamsize size = file.tellg();
224 | file.seekg(0, std::ios::beg);
225 |
226 | std::vector<char> buffer(size);
227 | if (!file.read(buffer.data(), size)) {
228 | auto msg = "Error, unable to read onnx model file";
229 | spdlog::error(msg);
230 | throw std::runtime_error(msg);
231 | }
232 |
233 | // Parse the buffer we read into memory.
234 | auto parsed = parser->parse(buffer.data(), buffer.size());
235 | if (!parsed) {
236 | return false;
237 | }
238 |
239 | // Ensure that all the inputs have the same batch size
240 | const auto numInputs = network->getNbInputs();
241 | if (numInputs < 1) {
242 | auto msg = "Error, model needs at least 1 input!";
243 | spdlog::error(msg);
244 | throw std::runtime_error(msg);
245 | }
246 | const auto input0Batch = network->getInput(0)->getDimensions().d[0];
247 | for (int32_t i = 1; i < numInputs; ++i) {
248 | if (network->getInput(i)->getDimensions().d[0] != input0Batch) {
249 | auto msg = "Error, the model has multiple inputs, each with differing batch sizes!";
250 | spdlog::error(msg);
251 | throw std::runtime_error(msg);
252 | }
253 | }
254 |
255 | // Check to see if the model supports dynamic batch size or not
256 | bool doesSupportDynamicBatch = false;
257 | if (input0Batch == -1) {
258 | doesSupportDynamicBatch = true;
259 | spdlog::info("Model supports dynamic batch size");
260 | } else {
261 | spdlog::info("Model only supports fixed batch size of {}", input0Batch);
262 | // If the model supports a fixed batch size, ensure that the maxBatchSize
263 | // and optBatchSize were set correctly.
264 | if (m_options.optBatchSize != input0Batch || m_options.maxBatchSize != input0Batch) {
265 | auto msg = "Error, model only supports a fixed batch size of " + std::to_string(input0Batch) +
266 | ". Must set Options.optBatchSize and Options.maxBatchSize to 1";
267 | spdlog::error(msg);
268 | throw std::runtime_error(msg);
269 | }
270 | }
271 |
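// Dimension d[3] is the input width of an NCHW-shaped image input; a value of -1 indicates the width is dynamic.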
272 | const auto input0Width = network->getInput(0)->getDimensions().d[3];
273 | bool doesSupportDynamicWidth = false;
274 | if (input0Width == -1) {
275 | doesSupportDynamicWidth = true;
276 | spdlog::info("Model supports dynamic width. Using Options.maxInputWidth, Options.minInputWidth, and Options.optInputWidth to set the input width.");
277 |
278 | // Check that the values of maxInputWidth, minInputWidth, and optInputWidth are valid
279 | if (m_options.maxInputWidth < m_options.minInputWidth || m_options.maxInputWidth < m_options.optInputWidth ||
280 | m_options.minInputWidth > m_options.optInputWidth
281 | || m_options.maxInputWidth < 1 || m_options.minInputWidth < 1 || m_options.optInputWidth < 1) {
282 | auto msg = "Error, invalid values for Options.maxInputWidth, Options.minInputWidth, and Options.optInputWidth";
283 | spdlog::error(msg);
284 | throw std::runtime_error(msg);
285 | }
286 | }
287 |
288 |
289 | auto config = std::unique_ptr<nvinfer1::IBuilderConfig>(builder->createBuilderConfig());
290 | if (!config) {
291 | return false;
292 | }
293 |
294 | // Register a single optimization profile
295 | nvinfer1::IOptimizationProfile *optProfile = builder->createOptimizationProfile();
296 | for (int32_t i = 0; i < numInputs; ++i) {
297 | // Must specify dimensions for all the inputs the model expects.
298 | const auto input = network->getInput(i);
299 | const auto inputName = input->getName();
300 | const auto inputDims = input->getDimensions();
301 | int32_t inputC = inputDims.d[1];
302 | int32_t inputH = inputDims.d[2];
303 | int32_t inputW = inputDims.d[3];
304 |
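// Use the larger of the user-specified minimum width and the model's reported width (the model reports -1 here when its width is dynamic).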
305 | int32_t minInputWidth = std::max(m_options.minInputWidth, inputW);
306 |
307 | // Specify the optimization profile
308 | if (doesSupportDynamicBatch) {
309 | optProfile->setDimensions(inputName, nvinfer1::OptProfileSelector::kMIN, nvinfer1::Dims4(1, inputC, inputH, minInputWidth));
310 | } else {
311 | optProfile->setDimensions(inputName, nvinfer1::OptProfileSelector::kMIN,
312 | nvinfer1::Dims4(m_options.optBatchSize, inputC, inputH, minInputWidth));
313 | }
314 |
315 | if (doesSupportDynamicWidth) {
316 | optProfile->setDimensions(inputName, nvinfer1::OptProfileSelector::kOPT,
317 | nvinfer1::Dims4(m_options.optBatchSize, inputC, inputH, m_options.optInputWidth));
318 | optProfile->setDimensions(inputName, nvinfer1::OptProfileSelector::kMAX,
319 | nvinfer1::Dims4(m_options.maxBatchSize, inputC, inputH, m_options.maxInputWidth));
320 | } else {
321 | optProfile->setDimensions(inputName, nvinfer1::OptProfileSelector::kOPT,
322 | nvinfer1::Dims4(m_options.optBatchSize, inputC, inputH, inputW));
323 | optProfile->setDimensions(inputName, nvinfer1::OptProfileSelector::kMAX,
324 | nvinfer1::Dims4(m_options.maxBatchSize, inputC, inputH, inputW));
325 | }
326 | }
327 | config->addOptimizationProfile(optProfile);
328 |
329 | // Set the precision level
330 | const auto engineName = serializeEngineOptions(m_options, onnxModelPath);
331 | if (m_options.precision == Precision::FP16) {
332 | // Ensure the GPU supports FP16 inference
333 | if (!builder->platformHasFastFp16()) {
334 | auto msg = "Error: GPU does not support FP16 precision";
335 | spdlog::error(msg);
336 | throw std::runtime_error(msg);
337 | }
338 | config->setFlag(nvinfer1::BuilderFlag::kFP16);
339 | } else if (m_options.precision == Precision::INT8) {
340 | if (numInputs > 1) {
341 | auto msg = "Error, this implementation currently only supports INT8 "
342 | "quantization for single input models";
343 | spdlog::error(msg);
344 | throw std::runtime_error(msg);
345 | }
346 |
347 | // Ensure the GPU supports INT8 Quantization
348 | if (!builder->platformHasFastInt8()) {
349 | auto msg = "Error: GPU does not support INT8 precision";
350 | spdlog::error(msg);
351 | throw std::runtime_error(msg);
352 | }
353 |
354 | // Ensure the user has provided path to calibration data directory
355 | if (m_options.calibrationDataDirectoryPath.empty()) {
356 | auto msg = "Error: If INT8 precision is selected, must provide path to "
357 | "calibration data directory to Engine::build method";
358 | throw std::runtime_error(msg);
359 | }
360 |
361 | config->setFlag(nvinfer1::BuilderFlag::kINT8);
362 |
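// The calibrator streams batches of images from calibrationDataDirectoryPath through the network so TensorRT can compute INT8 scales.
// Results are cached in a file with a ".calibration" extension so subsequent builds can reuse them (see the INT8 section of the README).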
363 | const auto input = network->getInput(0);
364 | const auto inputName = input->getName();
365 | const auto inputDims = input->getDimensions();
366 | const auto calibrationFileName = engineName + ".calibration";
367 |
368 | m_calibrator = std::make_unique<Int8EntropyCalibrator2>(m_options.calibrationBatchSize, inputDims.d[3], inputDims.d[2],
369 | m_options.calibrationDataDirectoryPath, calibrationFileName, inputName,
370 | subVals, divVals, normalize);
371 | config->setInt8Calibrator(m_calibrator.get());
372 | }
373 |
374 | // CUDA stream used for profiling by the builder.
375 | cudaStream_t profileStream;
376 | Util::checkCudaErrorCode(cudaStreamCreate(&profileStream));
377 | config->setProfileStream(profileStream);
378 |
379 | // Build the engine
380 | // If this call fails, it is suggested to increase the logger verbosity to
381 | // kVERBOSE and try rebuilding the engine. Doing so will provide you with more
382 | // information on why exactly it is failing.
383 | std::unique_ptr<nvinfer1::IHostMemory> plan{builder->buildSerializedNetwork(*network, *config)};
384 | if (!plan) {
385 | return false;
386 | }
387 |
388 | // Write the engine to disk
389 | const auto enginePath = std::filesystem::path(m_options.engineFileDir) / engineName;
390 | std::ofstream outfile(enginePath, std::ofstream::binary);
391 | outfile.write(reinterpret_cast<const char *>(plan->data()), plan->size());
392 | spdlog::info("Success, saved engine to {}", enginePath.string());
393 |
394 | Util::checkCudaErrorCode(cudaStreamDestroy(profileStream));
395 | return true;
396 | }
--------------------------------------------------------------------------------