├── .gitignore ├── CMakeLists.txt ├── README.md ├── configs ├── yolov9.yaml └── yolov9py.yaml ├── data ├── 000000000036.jpg ├── 000000000144.jpg ├── 000000000194.jpg └── 000000000368.jpg ├── demo.cpp ├── include ├── LoggingRT.h ├── TimerCounter.h ├── Yolov9.h └── macros.h ├── python ├── AIResult.py ├── decorators.py ├── draw_AI_results.py ├── logging_system.py └── tensorrt_base.py ├── result ├── 000000000036.jpg ├── 000000000144.jpg └── performance.png ├── src ├── Decode.cu ├── Preprocess.cu └── Yolov9.cpp └── yolov9_trt.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | build/ 3 | cmake-build-debug/ 4 | configs/*.engine 5 | configs/*.onnx 6 | python/__pycache__/ 7 | logs/ 8 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.10) 2 | project(yolov9) 3 | 4 | option(RELEASE "build Yolov9 lib release" OFF) 5 | 6 | if(RELEASE) 7 | set(CMAKE_BUILD_TYPE Release) 8 | set(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} -DNODEBUG -O3 -Wall") 9 | else() 10 | set(CMAKE_BUILD_TYPE Debug) 11 | set(CMAKE_CXX_FLAGS_DEBUG "-DDEBUG -g") ### open compiler debug flag 12 | endif() 13 | 14 | set(CMAKE_CXX_COMPILIER "/usr/bin/g++") 15 | set(CMAKE_CXX_FLAGS "-std=c++14 -O3") 16 | set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-rpath -Wl,$ORIGIN") 17 | 18 | #opencv 19 | find_package(OpenCV REQUIRED) 20 | 21 | # cuda 22 | find_package(CUDA) 23 | include_directories(/usr/local/cuda/include) 24 | link_directories(/usr/local/cuda/lib64) 25 | 26 | # tensorrt 27 | link_directories(/usr/include/${CMAKE_SYSTEM_PROCESSOR}-linux-gnu) 28 | include_directories(/usr/lib/${CMAKE_SYSTEM_PROCESSOR}-linux-gnu) 29 | 30 | 31 | #yolov9 source 32 | include_directories(${PROJECT_SOURCE_DIR}/include) 33 | 34 | file(GLOB LIB_SOURCES src/*.cu 35 | src/*.cpp 36 | include/*.h) 37 | 38 | #add cuda lib 39 | CUDA_ADD_LIBRARY(yolov9 SHARED ${LIB_SOURCES}) 40 | 41 | target_link_libraries(yolov9 42 | nvinfer 43 | nvonnxparser 44 | pthread 45 | ${CUDA_LIBRARIES} 46 | ${OpenCV_LIBRARIES}) 47 | 48 | message("OpenCV: ${OpenCV_LIBRARIES}") 49 | target_include_directories(yolov9 PRIVATE include/ 50 | ${OpenCV_INCLUDE_DIRS} 51 | ${CUDA_TOOLKIT_ROOT_DIR}/include) 52 | 53 | add_executable(demo ${PROJECT_SOURCE_DIR}/demo.cpp) 54 | target_link_libraries(demo yolov9) 55 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 |

YOLOv9 tensorrt deployment

3 | 4 | 5 |

6 | This repository provides an API for accelerating inference deployment, with two open interface implementation: C++ and Python. C++also provides the use of CUDA programming to accelerate YOLOv9 model preprocessing and post-processing 7 | to pursue faster model inference speed🔥🔥🔥 8 | 9 |
10 | 11 |
12 | 13 | 14 |

15 | 16 | ## ⭐ Build 17 | 18 |

1. Export onnx

19 | 20 | Clone [YOLOv9](https://github.com/WongKinYiu/yolov9) code repository, download the original model provided by the repository, or train your own model, such as [yolov9-c.pt](https://objects.githubusercontent.com/github-production-release-asset-2e65be/759338070/c8ca43f2-0d2d-4aa3-a074-426505bfbfb1?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAVCODYLSA53PQK4ZA%2F20240223%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240223T073054Z&X-Amz-Expires=300&X-Amz-Signature=db76944695e398168b222b502bb019a301336e5b5dc74db31604699b8f837a9b&X-Amz-SignedHeaders=host&actor_id=45328395&key_id=0&repo_id=759338070&response-content-disposition=attachment%3B%20filename%3Dyolov9-c.pt&response-content-type=application%2Foctet-stream) 21 | 22 | ``` shell 23 | # export onnx 24 | python export.py --weights yolov9-c.pt --simplify --include "onnx" 25 | ``` 26 | 27 |

2. Setup

3. Build project

4. python API

81 | 82 | ![图片](result/000000000036.jpg) 83 |

84 | 85 | ## 👏 Acknowledgement 86 | 87 | This project is based on the following awesome projects: 88 | - [Yolov9](https://github.com/WongKinYiu/yolov9) - YOLOv9: Learning What You Want to Learn Using Programmable Gradient Information. 89 | - [TensorRT](https://github.com/NVIDIA/TensorRT/tree/release/8.6/samples) - TensorRT samples and api documentation. 90 | 91 | ## 🤗 Citation 92 | 93 | ```bibtex 94 | @article{wang2024yolov9, 95 | title={{YOLOv9}: Learning What You Want to Learn Using Programmable Gradient Information}, 96 | author={Wang, Chien-Yao and Liao, Hong-Yuan Mark}, 97 | booktitle={arXiv preprint arXiv:2402.13616}, 98 | year={2024} 99 | } 100 | ``` -------------------------------------------------------------------------------- /configs/yolov9.yaml: -------------------------------------------------------------------------------- 1 | %YAML:1.0 2 | --- 3 | confTreshold: 0.25 #Detection confidence threshold 4 | nmsTreshold : 0.45 #nms threshold 5 | maxSupportBatchSize: 1 #support max input batch size 6 | quantizationInfer: "FP16" #support FP32 or FP16 quantization 7 | onnxFile: "yolov9-c.onnx" # The currently used onnx model file 8 | engineFile: "yolov9-c.engine" # Automatically generate file names for the Tensorrt inference engine -------------------------------------------------------------------------------- /configs/yolov9py.yaml: -------------------------------------------------------------------------------- 1 | %YAML:1.0 2 | --- 3 | confTreshold: 0.3 # detect treshold 4 | nmsThreshold: 0.45 #nms treshold 5 | quantizationInfer: "FP16" #FP32 or FP16 6 | onnxFile: "yolov9-c.onnx" 7 | engineFile: "yolov9-c.engine" -------------------------------------------------------------------------------- /data/000000000036.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LinhanDai/yolov9-tensorrt/7729c915733543c76a6ae9afb77d5a8825f924c0/data/000000000036.jpg -------------------------------------------------------------------------------- /data/000000000144.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LinhanDai/yolov9-tensorrt/7729c915733543c76a6ae9afb77d5a8825f924c0/data/000000000144.jpg -------------------------------------------------------------------------------- /data/000000000194.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LinhanDai/yolov9-tensorrt/7729c915733543c76a6ae9afb77d5a8825f924c0/data/000000000194.jpg -------------------------------------------------------------------------------- /data/000000000368.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LinhanDai/yolov9-tensorrt/7729c915733543c76a6ae9afb77d5a8825f924c0/data/000000000368.jpg -------------------------------------------------------------------------------- /demo.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Website: https://github.com/LinhanDai 3 | * @author dailinhan 4 | * @date 24-02-23 9:19 5 | _ooOoo_ 6 | o8888888o 7 | 88" . "88 8 | (| -_- |) 9 | O\ = /O 10 | ____/`---'\____ 11 | .' \\| |// `. 12 | / \\||| : |||// \ 13 | / _||||| -:- |||||- \ 14 | | | \\\ - /// | | 15 | | \_| ''\---/'' | | 16 | \ .-\__ `-` ___/-. / 17 | ___`. .' /--.--\ `. . __ 18 | ."" '< `.___\_<|>_/___.' >'"". 19 | | | : `- \`.;`\ _ /`;.`/ - ` : | | 20 | \ \ `-. \_ __\ /__ _/ .-` / / 21 | ======`-.____`-.___\_____/___.-`____.-'====== 22 | `=---=' 23 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 24 | no error no bug 25 | */ 26 | 27 | #include 28 | #include "TimerCounter.h" 29 | 30 | std::vector generateRandomColors(int classNum) 31 | { 32 | std::vector colors; 33 | for (int i = 0; i < classNum; i++) 34 | { 35 | colors.push_back(cv::Scalar(rand() % 256, rand() % 256, rand() % 256)); 36 | } 37 | return colors; 38 | } 39 | 40 | void showResult(const std::vector& result, std::vector &imgCloneBatch, std::vector colors) 41 | { 42 | for (int i = 0; i < result.size(); i++) 43 | { 44 | detectResult batchResult = result[i]; 45 | for (const auto& r: batchResult) 46 | { 47 | std::stringstream stream; 48 | stream << std::fixed << std::setprecision(2) << "id:" << r.label << " score:" << r.confidence; 49 | cv::rectangle(imgCloneBatch[i], cv::Point(r.left, r.top), cv::Point(r.right, r.bottom), colors[r.label], 2); 50 | cv::putText(imgCloneBatch[i], stream.str(), cv::Point(r.left, r.top - 5), 0, 0.8, colors[r.label], 2); 51 | } 52 | cv::imwrite("1.jpg", imgCloneBatch[i]); 53 | cv::namedWindow("Windows", cv::WINDOW_AUTOSIZE); 54 | cv::resizeWindow("Windows", imgCloneBatch[i].cols / 2, imgCloneBatch[i].rows / 2); 55 | cv::imshow("Windows", imgCloneBatch[i]); 56 | cv::waitKey(0); 57 | } 58 | } 59 | 60 | int main(int argc, char* argv[]) 61 | { 62 | std::string configPath = "../configs"; 63 | std::string configFile = "yolov9.yaml"; 64 | std::vector images; 65 | if (argc != 2) 66 | { 67 | std::cout << "Need input test img folder path!!!" << std::endl; 68 | return 0; 69 | } 70 | std::string folderPath = argv[1]; 71 | cv::String path(folderPath + "/*.jpg"); //small picture 72 | cv::glob(path, images); 73 | std::shared_ptr timer = std::make_shared(); 74 | std::shared_ptr yoloObj = std::make_shared(configPath, configFile); 75 | std::vector colors = generateRandomColors(80); 76 | for (const auto& image: images) 77 | { 78 | std::vector imgMatVec; 79 | std::vector imgSrcVec; 80 | std::vector imgInfoVec; 81 | std::vector detectResult {}; 82 | ImgInfo imgInfo{}; 83 | cv::Mat img = cv::imread(image, cv::IMREAD_COLOR); 84 | imgInfo.width = img.cols; 85 | imgInfo.height = img.rows; 86 | imgInfo.channels = img.channels(); 87 | unsigned char *deviceImgSrc; 88 | CHECK(cudaMalloc(&deviceImgSrc, img.cols * img.rows * img.channels() * sizeof(unsigned char))); 89 | CHECK(cudaMemcpy(deviceImgSrc, img.data, img.cols * img.rows * img.channels() * sizeof(unsigned char), cudaMemcpyHostToDevice)); 90 | imgSrcVec.push_back(deviceImgSrc); 91 | imgInfoVec.push_back(imgInfo); 92 | imgMatVec.push_back(img); 93 | timer->start(); 94 | yoloObj->doInfer(imgSrcVec, imgInfoVec, detectResult); 95 | timer->stop(); 96 | float time = timer->elapsed_ms(); 97 | std::cout << "cost time:" << time <<" ms, " <<"fps:" << 1000 / time << std::endl; 98 | showResult(detectResult, imgMatVec, colors); 99 | cudaFree(deviceImgSrc); 100 | } 101 | return 0; 102 | } -------------------------------------------------------------------------------- /include/LoggingRT.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #ifndef TENSORRT_LOGGING_H 18 | #define TENSORRT_LOGGING_H 19 | 20 | #include "NvInferRuntimeCommon.h" 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include "macros.h" 29 | 30 | using Severity = nvinfer1::ILogger::Severity; 31 | 32 | class LogStreamConsumerBuffer : public std::stringbuf 33 | { 34 | public: 35 | LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog) 36 | : mOutput(stream) 37 | , mPrefix(prefix) 38 | , mShouldLog(shouldLog) 39 | { 40 | } 41 | 42 | LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) 43 | : mOutput(other.mOutput) 44 | { 45 | } 46 | 47 | ~LogStreamConsumerBuffer() 48 | { 49 | // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence 50 | // std::streambuf::pptr() gives a pointer to the current position of the output sequence 51 | // if the pointer to the beginning is not equal to the pointer to the current position, 52 | // call putOutput() to log the output to the stream 53 | if (pbase() != pptr()) 54 | { 55 | putOutput(); 56 | } 57 | } 58 | 59 | // synchronizes the stream buffer and returns 0 on success 60 | // synchronizing the stream buffer consists of inserting the buffer contents into the stream, 61 | // resetting the buffer and flushing the stream 62 | virtual int sync() 63 | { 64 | putOutput(); 65 | return 0; 66 | } 67 | 68 | void putOutput() 69 | { 70 | if (mShouldLog) 71 | { 72 | // prepend timestamp 73 | std::time_t timestamp = std::time(nullptr); 74 | tm* tm_local = std::localtime(×tamp); 75 | std::cout << "["; 76 | std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/"; 77 | std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/"; 78 | std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-"; 79 | std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":"; 80 | std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":"; 81 | std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] "; 82 | // std::stringbuf::str() gets the string contents of the buffer 83 | // insert the buffer contents pre-appended by the appropriate prefix into the stream 84 | mOutput << mPrefix << str(); 85 | // set the buffer to empty 86 | str(""); 87 | // flush the stream 88 | mOutput.flush(); 89 | } 90 | } 91 | 92 | void setShouldLog(bool shouldLog) 93 | { 94 | mShouldLog = shouldLog; 95 | } 96 | 97 | private: 98 | std::ostream& mOutput; 99 | std::string mPrefix; 100 | bool mShouldLog; 101 | }; 102 | 103 | //! 104 | //! \class LogStreamConsumerBase 105 | //! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer 106 | //! 107 | class LogStreamConsumerBase 108 | { 109 | public: 110 | LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog) 111 | : mBuffer(stream, prefix, shouldLog) 112 | { 113 | } 114 | 115 | protected: 116 | LogStreamConsumerBuffer mBuffer; 117 | }; 118 | 119 | //! 120 | //! \class LogStreamConsumer 121 | //! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages. 122 | //! Order of base classes is LogStreamConsumerBase and then std::ostream. 123 | //! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field 124 | //! in LogStreamConsumer and then the address of the buffer is passed to std::ostream. 125 | //! This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream. 126 | //! Please do not change the order of the parent classes. 127 | //! 128 | class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream 129 | { 130 | public: 131 | //! \brief Creates a LogStreamConsumer which logs messages with level severity. 132 | //! Reportable severity determines if the messages are severe enough to be logged. 133 | LogStreamConsumer(Severity reportableSeverity, Severity severity) 134 | : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity) 135 | , std::ostream(&mBuffer) // links the stream buffer with the stream 136 | , mShouldLog(severity <= reportableSeverity) 137 | , mSeverity(severity) 138 | { 139 | } 140 | 141 | LogStreamConsumer(LogStreamConsumer&& other) 142 | : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog) 143 | , std::ostream(&mBuffer) // links the stream buffer with the stream 144 | , mShouldLog(other.mShouldLog) 145 | , mSeverity(other.mSeverity) 146 | { 147 | } 148 | 149 | void setReportableSeverity(Severity reportableSeverity) 150 | { 151 | mShouldLog = mSeverity <= reportableSeverity; 152 | mBuffer.setShouldLog(mShouldLog); 153 | } 154 | 155 | private: 156 | static std::ostream& severityOstream(Severity severity) 157 | { 158 | return severity >= Severity::kINFO ? std::cout : std::cerr; 159 | } 160 | 161 | static std::string severityPrefix(Severity severity) 162 | { 163 | switch (severity) 164 | { 165 | case Severity::kINTERNAL_ERROR: return "[F] "; 166 | case Severity::kERROR: return "[E] "; 167 | case Severity::kWARNING: return "[W] "; 168 | case Severity::kINFO: return "[I] "; 169 | case Severity::kVERBOSE: return "[V] "; 170 | default: assert(0); return ""; 171 | } 172 | } 173 | 174 | bool mShouldLog; 175 | Severity mSeverity; 176 | }; 177 | 178 | //! \class Logger 179 | //! 180 | //! \brief Class which manages logging of TensorRT tools and samples 181 | //! 182 | //! \details This class provides a common interface for TensorRT tools and samples to log information to the console, 183 | //! and supports logging two types of messages: 184 | //! 185 | //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) 186 | //! - Test pass/fail messages 187 | //! 188 | //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is 189 | //! that the logic for controlling the verbosity and formatting of sample output is centralized in one location. 190 | //! 191 | //! In the future, this class could be extended to support dumping test results to a file in some standard format 192 | //! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run). 193 | //! 194 | //! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger 195 | //! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT 196 | //! library and messages coming from the sample. 197 | //! 198 | //! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the 199 | //! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger 200 | //! object. 201 | 202 | class Logger : public nvinfer1::ILogger 203 | { 204 | public: 205 | Logger(Severity severity = Severity::kWARNING) 206 | : mReportableSeverity(severity) 207 | { 208 | } 209 | 210 | //! 211 | //! \enum TestResult 212 | //! \brief Represents the state of a given test 213 | //! 214 | enum class TestResult 215 | { 216 | kRUNNING, //!< The test is running 217 | kPASSED, //!< The test passed 218 | kFAILED, //!< The test failed 219 | kWAIVED //!< The test was waived 220 | }; 221 | 222 | //! 223 | //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger 224 | //! \return The nvinfer1::ILogger associated with this Logger 225 | //! 226 | //! TODO Once all samples are updated to use this method to register the logger with TensorRT, 227 | //! we can eliminate the inheritance of Logger from ILogger 228 | //! 229 | nvinfer1::ILogger& getTRTLogger() 230 | { 231 | return *this; 232 | } 233 | 234 | //! 235 | //! \brief Encapsulate the log function under nvinfer1 to print out the function name and the corresponding number of lines 236 | //! 237 | //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the 238 | //! inheritance from nvinfer1::ILogger 239 | //! 240 | void logPrint(Severity severity, const char *funcName, int line, const char* msg) 241 | { 242 | std::string transMsg = "functionName: " + std::string(funcName) + " line: " + std::to_string(line) + " " + std::string(msg); 243 | log(severity, transMsg.c_str()); 244 | } 245 | 246 | //! 247 | //! \brief Implementation of the nvinfer1::ILogger::log() virtual method 248 | //! 249 | //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the 250 | //! inheritance from nvinfer1::ILogger 251 | //! 252 | void log(Severity severity, const char* msg) TRT_NOEXCEPT override 253 | { 254 | LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl; 255 | } 256 | 257 | //! 258 | //! \brief Method for controlling the verbosity of logging output 259 | //! 260 | //! \param severity The logger will only emit messages that have severity of this level or higher. 261 | //! 262 | void setReportableSeverity(Severity severity) 263 | { 264 | mReportableSeverity = severity; 265 | } 266 | 267 | //! 268 | //! \brief Opaque handle that holds logging information for a particular test 269 | //! 270 | //! This object is an opaque handle to information used by the Logger to print test results. 271 | //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used 272 | //! with Logger::reportTest{Start,End}(). 273 | //! 274 | class TestAtom 275 | { 276 | public: 277 | TestAtom(TestAtom&&) = default; 278 | 279 | private: 280 | friend class Logger; 281 | 282 | TestAtom(bool started, const std::string& name, const std::string& cmdline) 283 | : mStarted(started) 284 | , mName(name) 285 | , mCmdline(cmdline) 286 | { 287 | } 288 | 289 | bool mStarted; 290 | std::string mName; 291 | std::string mCmdline; 292 | }; 293 | 294 | //! 295 | //! \brief Define a test for logging 296 | //! 297 | //! \param[in] name The name of the test. This should be a string starting with 298 | //! "TensorRT" and containing dot-separated strings containing 299 | //! the characters [A-Za-z0-9_]. 300 | //! For example, "TensorRT.sample_googlenet" 301 | //! \param[in] cmdline The command line used to reproduce the test 302 | // 303 | //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). 304 | //! 305 | static TestAtom defineTest(const std::string& name, const std::string& cmdline) 306 | { 307 | return TestAtom(false, name, cmdline); 308 | } 309 | 310 | //! 311 | //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments 312 | //! as input 313 | //! 314 | //! \param[in] name The name of the test 315 | //! \param[in] argc The number of command-line arguments 316 | //! \param[in] argv The array of command-line arguments (given as C strings) 317 | //! 318 | //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). 319 | static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) 320 | { 321 | auto cmdline = genCmdlineString(argc, argv); 322 | return defineTest(name, cmdline); 323 | } 324 | 325 | //! 326 | //! \brief Report that a test has started. 327 | //! 328 | //! \pre reportTestStart() has not been called yet for the given testAtom 329 | //! 330 | //! \param[in] testAtom The handle to the test that has started 331 | //! 332 | static void reportTestStart(TestAtom& testAtom) 333 | { 334 | reportTestResult(testAtom, TestResult::kRUNNING); 335 | assert(!testAtom.mStarted); 336 | testAtom.mStarted = true; 337 | } 338 | 339 | //! 340 | //! \brief Report that a test has ended. 341 | //! 342 | //! \pre reportTestStart() has been called for the given testAtom 343 | //! 344 | //! \param[in] testAtom The handle to the test that has ended 345 | //! \param[in] result The result of the test. Should be one of TestResult::kPASSED, 346 | //! TestResult::kFAILED, TestResult::kWAIVED 347 | //! 348 | static void reportTestEnd(const TestAtom& testAtom, TestResult result) 349 | { 350 | assert(result != TestResult::kRUNNING); 351 | assert(testAtom.mStarted); 352 | reportTestResult(testAtom, result); 353 | } 354 | 355 | static int reportPass(const TestAtom& testAtom) 356 | { 357 | reportTestEnd(testAtom, TestResult::kPASSED); 358 | return EXIT_SUCCESS; 359 | } 360 | 361 | static int reportFail(const TestAtom& testAtom) 362 | { 363 | reportTestEnd(testAtom, TestResult::kFAILED); 364 | return EXIT_FAILURE; 365 | } 366 | 367 | static int reportWaive(const TestAtom& testAtom) 368 | { 369 | reportTestEnd(testAtom, TestResult::kWAIVED); 370 | return EXIT_SUCCESS; 371 | } 372 | 373 | static int reportTest(const TestAtom& testAtom, bool pass) 374 | { 375 | return pass ? reportPass(testAtom) : reportFail(testAtom); 376 | } 377 | 378 | Severity getReportableSeverity() const 379 | { 380 | return mReportableSeverity; 381 | } 382 | 383 | private: 384 | //! 385 | //! \brief returns an appropriate string for prefixing a log message with the given severity 386 | //! 387 | static const char* severityPrefix(Severity severity) 388 | { 389 | switch (severity) 390 | { 391 | case Severity::kINTERNAL_ERROR: return "[F] "; 392 | case Severity::kERROR: return "[E] "; 393 | case Severity::kWARNING: return "[W] "; 394 | case Severity::kINFO: return "[I] "; 395 | case Severity::kVERBOSE: return "[V] "; 396 | default: assert(0); return ""; 397 | } 398 | } 399 | 400 | //! 401 | //! \brief returns an appropriate string for prefixing a test result message with the given result 402 | //! 403 | static const char* testResultString(TestResult result) 404 | { 405 | switch (result) 406 | { 407 | case TestResult::kRUNNING: return "RUNNING"; 408 | case TestResult::kPASSED: return "PASSED"; 409 | case TestResult::kFAILED: return "FAILED"; 410 | case TestResult::kWAIVED: return "WAIVED"; 411 | default: assert(0); return ""; 412 | } 413 | } 414 | 415 | //! 416 | //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity 417 | //! 418 | static std::ostream& severityOstream(Severity severity) 419 | { 420 | return severity >= Severity::kINFO ? std::cout : std::cerr; 421 | } 422 | 423 | //! 424 | //! \brief method that implements logging test results 425 | //! 426 | static void reportTestResult(const TestAtom& testAtom, TestResult result) 427 | { 428 | severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " 429 | << testAtom.mCmdline << std::endl; 430 | } 431 | 432 | //! 433 | //! \brief generate a command line string from the given (argc, argv) values 434 | //! 435 | static std::string genCmdlineString(int argc, char const* const* argv) 436 | { 437 | std::stringstream ss; 438 | for (int i = 0; i < argc; i++) 439 | { 440 | if (i > 0) 441 | ss << " "; 442 | ss << argv[i]; 443 | } 444 | return ss.str(); 445 | } 446 | 447 | Severity mReportableSeverity; 448 | }; 449 | 450 | namespace 451 | { 452 | 453 | //! 454 | //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE 455 | //! 456 | //! Example usage: 457 | //! 458 | //! LOG_VERBOSE(logger) << "hello world" << std::endl; 459 | //! 460 | inline LogStreamConsumer LOG_VERBOSE_RT(const Logger& logger) 461 | { 462 | return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE); 463 | } 464 | 465 | //! 466 | //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO 467 | //! 468 | //! Example usage: 469 | //! 470 | //! LOG_INFO(logger) << "hello world" << std::endl; 471 | //! 472 | inline LogStreamConsumer LOG_INFO_RT(const Logger& logger) 473 | { 474 | return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO); 475 | } 476 | 477 | //! 478 | //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING 479 | //! 480 | //! Example usage: 481 | //! 482 | //! LOG_WARN(logger) << "hello world" << std::endl; 483 | //! 484 | inline LogStreamConsumer LOG_WARN_RT(const Logger& logger) 485 | { 486 | return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING); 487 | } 488 | 489 | //! 490 | //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR 491 | //! 492 | //! Example usage: 493 | //! 494 | //! LOG_ERROR(logger) << "hello world" << std::endl; 495 | //! 496 | inline LogStreamConsumer LOG_ERROR_RT(const Logger& logger) 497 | { 498 | return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR); 499 | } 500 | 501 | //! 502 | //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR 503 | // ("fatal" severity) 504 | //! 505 | //! Example usage: 506 | //! 507 | //! LOG_FATAL(logger) << "hello world" << std::endl; 508 | //! 509 | inline LogStreamConsumer LOG_FATAL_RT(const Logger& logger) 510 | { 511 | return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR); 512 | } 513 | 514 | } // anonymous namespace 515 | 516 | #endif // TENSORRT_LOGGING_H 517 | -------------------------------------------------------------------------------- /include/TimerCounter.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Website: https://github.com/LinhanDai 3 | * @author dailinhan 4 | * @date 24-02-23 9:30 5 | _ooOoo_ 6 | o8888888o 7 | 88" . "88 8 | (| -_- |) 9 | O\ = /O 10 | ____/`---'\____ 11 | .' \\| |// `. 12 | / \\||| : |||// \ 13 | / _||||| -:- |||||- \ 14 | | | \\\ - /// | | 15 | | \_| ''\---/'' | | 16 | \ .-\__ `-` ___/-. / 17 | ___`. .' /--.--\ `. . __ 18 | ."" '< `.___\_<|>_/___.' >'"". 19 | | | : `- \`.;`\ _ /`;.`/ - ` : | | 20 | \ \ `-. \_ __\ /__ _/ .-` / / 21 | ======`-.____`-.___\_____/___.-`____.-'====== 22 | `=---=' 23 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 24 | no error no bug 25 | */ 26 | 27 | #pragma once 28 | #ifndef YOLOV9_TIMERCOUNTER_H 29 | #define YOLOV9_TIMERCOUNTER_H 30 | 31 | #include 32 | #include 33 | #include 34 | 35 | class CPUTimer 36 | { 37 | public: 38 | CPUTimer() 39 | { 40 | mStart = std::chrono::high_resolution_clock::now(); 41 | } 42 | 43 | void start() 44 | { 45 | mStart = std::chrono::high_resolution_clock::now(); 46 | } 47 | 48 | void stop() 49 | { 50 | mEnd = std::chrono::high_resolution_clock::now(); 51 | } 52 | 53 | float elapsed_ms() 54 | { 55 | int64_t dur = 0; 56 | dur = std::chrono::duration_cast(mEnd - mStart).count(); // us 57 | return (float)(dur) / 1000; 58 | } 59 | 60 | private: 61 | std::chrono::time_point mStart; 62 | std::chrono::time_point mEnd; 63 | }; 64 | 65 | class GPUTimer 66 | { 67 | public: 68 | GPUTimer() 69 | { 70 | cudaEventCreate(&mStart); 71 | cudaEventCreate(&mEnd); 72 | } 73 | 74 | float elapsed_ms() 75 | { 76 | float ms = 0; 77 | cudaEventElapsedTime(&ms, mStart, mEnd); 78 | return ms; 79 | } 80 | 81 | void start() 82 | { 83 | cudaEventRecord(mStart); 84 | } 85 | 86 | void stop() 87 | { 88 | cudaEventRecord(mEnd); 89 | cudaEventSynchronize(mEnd); 90 | } 91 | 92 | private: 93 | cudaEvent_t mStart; 94 | cudaEvent_t mEnd; 95 | }; 96 | 97 | #endif //YOLOV9_TIMERCOUNTER_H 98 | -------------------------------------------------------------------------------- /include/Yolov9.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Website: https://github.com/LinhanDai 3 | * @author dailinhan 4 | * @date 24-02-23 9:19 5 | _ooOoo_ 6 | o8888888o 7 | 88" . "88 8 | (| -_- |) 9 | O\ = /O 10 | ____/`---'\____ 11 | .' \\| |// `. 12 | / \\||| : |||// \ 13 | / _||||| -:- |||||- \ 14 | | | \\\ - /// | | 15 | | \_| ''\---/'' | | 16 | \ .-\__ `-` ___/-. / 17 | ___`. .' /--.--\ `. . __ 18 | ."" '< `.___\_<|>_/___.' >'"". 19 | | | : `- \`.;`\ _ /`;.`/ - ` : | | 20 | \ \ `-. \_ __\ /__ _/ .-` / / 21 | ======`-.____`-.___\_____/___.-`____.-'====== 22 | `=---=' 23 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 24 | no error no bug 25 | */ 26 | 27 | #ifndef YOLOV9_TENSORRT_YOLOV9_H 28 | #define YOLOV9_TENSORRT_YOLOV9_H 29 | 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include "LoggingRT.h" 41 | 42 | 43 | #define CHECK(op) __check_cuda_runtime((op), #op, __FILE__, __LINE__) 44 | 45 | constexpr long long int operator"" _GiB(long long unsigned int val) 46 | { 47 | return val * (1 << 30); 48 | } 49 | 50 | #define MAX_OBJECTS 1000 51 | #define NUM_BOX_ELEMENT 7 // left, top, right, bottom, confidence, class, keepflag 52 | #define GPU_MAX_LIMIT_WIDTH 4096 53 | #define GPU_MAX_LIMIT_HEIGHT 4096 54 | #define GPU_MAX_LIMIT_CHANNEL 3 55 | 56 | inline bool __check_cuda_runtime(cudaError_t code, const char* op, const char* file, int line) 57 | { 58 | if(code != cudaSuccess) 59 | { 60 | const char* err_name = cudaGetErrorName(code); 61 | const char* err_message = cudaGetErrorString(code); 62 | std::cout << "runtime error " << file << ":" << line << " :" << " " << op << " failed, code:" << err_name << " massage:" << err_message << std::endl; 63 | return false; 64 | } 65 | return true; 66 | } 67 | 68 | struct ImgInfo 69 | { 70 | int width; 71 | int height; 72 | int channels; 73 | }; 74 | 75 | struct Box{ 76 | int left, top, right, bottom; 77 | float confidence; 78 | int label; 79 | int trackerID; 80 | 81 | Box() = default; 82 | Box(int left, int top, int right, int bottom, float confidence, int label, int trackerID): 83 | left(left), top(top), right(right), bottom(bottom), confidence(confidence), label(label),trackerID(trackerID){} 84 | }; 85 | typedef std::vector detectResult; 86 | 87 | 88 | extern "C" void transpose_kernel_invoker(float *src, int num_bboxes, int num_elements,float *dst,cudaStream_t stream); 89 | 90 | extern "C" void decode_kernel_invoker( 91 | float* predict, int num_bboxes, int num_classes, float confidence_threshold, 92 | float nms_threshold, float* invert_affine_matrix, float* parray, int max_objects, 93 | int num_box_element, cudaStream_t stream); 94 | 95 | extern "C" void preprocess_kernel_img( 96 | uint8_t* src, int src_width, int src_height, 97 | float* dst, int dst_width, int dst_height, 98 | float *d2i, cudaStream_t stream); 99 | 100 | class YoloV9 101 | { 102 | public: 103 | struct AffineMatrix //Preprocessing affine transformation matrix and inverse matrix 104 | { 105 | float i2d[6]; //transformation matrix 106 | float d2i[6]; //inverse matrix 107 | }; 108 | 109 | public: 110 | explicit YoloV9(const std::string& configPath, const std::string &configFile); 111 | void doInfer(std::vector batchImg, 112 | std::vector imgInfoVec, 113 | std::vector &detResult); 114 | 115 | private: 116 | std::vector getDetResultToCPU(int batch); 117 | void getAffineMartrix(AffineMatrix &afmt,cv::Size &to,cv::Size &from); 118 | void gpuDecode(float* anchorsProb, int batch, float confidence_threshold, float nms_threshold); 119 | void imgPreProcess(std::vector &batchImg); 120 | void getTrtmodelStream(); 121 | void getBindingDimsInfo(); 122 | void createInferenceEngine(nvinfer1::IHostMemory **modelStream);; 123 | void modelInfer(nvinfer1::IExecutionContext& context, int batchSize); 124 | bool readParameters(const std::string& configPath, const std::string& configFile); 125 | bool createEngineIfNotExit(); 126 | nvinfer1::IHostMemory *createEngine(nvinfer1::IBuilder *builder, nvinfer1::IBuilderConfig *config); 127 | 128 | private: 129 | int mMaxSupportBatchSize{}; 130 | int mInputH{}; 131 | int mInputW{}; 132 | int mInputC{}; 133 | int mOutputAnchorsNum; 134 | int mOutputAnchorsDim; 135 | int mOutputAnchorsSize; 136 | std::string mOnnxFile; 137 | std::string mEngineFile; 138 | std::string mQuantizationInfer; 139 | unsigned char *mDeviceWarpAffine; 140 | char *mTrtModelStream{}; 141 | nvinfer1::IRuntime *mRuntime{}; 142 | nvinfer1::ICudaEngine *mEngine{}; 143 | nvinfer1::IExecutionContext *mContext{}; 144 | cudaStream_t mStream{}; 145 | float *mAffineMatrixD2iHost; 146 | float *mAffineMatrixD2iDevice; 147 | float mConfTreshold; 148 | float mNMSTreshold; 149 | float *mBuff[9]; 150 | float* mOutputDevice; 151 | float* mTransposeDevice; 152 | float* mOutputHost; 153 | std::vector mImageSizeBatch; 154 | Logger mLogger; 155 | }; 156 | 157 | #endif //YOLOV9_TENSORRT_YOLOV9_H 158 | -------------------------------------------------------------------------------- /include/macros.h: -------------------------------------------------------------------------------- 1 | #ifndef __MACROS_H 2 | #define __MACROS_H 3 | 4 | #include "NvInfer.h" 5 | 6 | #ifdef API_EXPORTS 7 | #if defined(_MSC_VER) 8 | #define API __declspec(dllexport) 9 | #else 10 | #define API __attribute__((visibility("default"))) 11 | #endif 12 | #else 13 | 14 | #if defined(_MSC_VER) 15 | #define API __declspec(dllimport) 16 | #else 17 | #define API 18 | #endif 19 | #endif // API_EXPORTS 20 | 21 | #if NV_TENSORRT_MAJOR >= 8 22 | #define TRT_NOEXCEPT noexcept 23 | #define TRT_CONST_ENQUEUE const 24 | #else 25 | #define TRT_NOEXCEPT 26 | #define TRT_CONST_ENQUEUE 27 | #endif 28 | 29 | #endif // __MACROS_H 30 | -------------------------------------------------------------------------------- /python/AIResult.py: -------------------------------------------------------------------------------- 1 | class DetResult(object): 2 | def __init__(self, score, box, class_id): 3 | self.score = score # confidence 4 | self.box = box # x1,y1,w,h 5 | self.class_id = class_id # class_id -------------------------------------------------------------------------------- /python/decorators.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | 4 | def time_cost(func): 5 | def wrapper(*args, **kwargs): 6 | start_time = time.time() 7 | result = func(*args, **kwargs) 8 | end_time = time.time() 9 | print(f"{func.__name__} took {(end_time - start_time) * 1000 :.4f} ms to execute.") 10 | return result 11 | return wrapper 12 | 13 | 14 | def suppress_errors(func): 15 | def wrapper(*args, **kwargs): 16 | try: 17 | return func(*args, **kwargs) 18 | except Exception as e: 19 | print(f"Error in {func.__name__}: {e}") 20 | return None 21 | return wrapper -------------------------------------------------------------------------------- /python/draw_AI_results.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import random 3 | from python.AIResult import * 4 | 5 | #Generic paint color 6 | colors = list() 7 | while len(colors) < 100: 8 | # Randomly generate RGB color values 9 | b = random.randint(0, 255) 10 | g = random.randint(0, 255) 11 | r = random.randint(0, 255) 12 | color = (b, g, r) 13 | # Check if the same color already exists 14 | if color not in colors: 15 | colors.append(color) 16 | 17 | 18 | def draw_detect_results(img, results): 19 | ''' 20 | Draw detection results 21 | :param img: src img 22 | :param results: detect results 23 | ''' 24 | for r in results: 25 | cv2.rectangle(img, (r.box[0], r.box[1]), (r.box[0] + r.box[2], r.box[1] + r.box[3]), colors[r.class_id], 3) 26 | label_str = "id:" + str(r.class_id) + " " + str(round(r.score, 2)) 27 | cv2.putText(img, label_str, (r.box[0], r.box[1] - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.3, colors[r.class_id], 2) 28 | cv2.namedWindow("detect", cv2.WINDOW_NORMAL) 29 | cv2.imshow("detect", img) 30 | cv2.waitKey(0) -------------------------------------------------------------------------------- /python/logging_system.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import datetime 4 | 5 | 6 | class Logger(object): 7 | def __init__(self, level="DEBUG"): 8 | # Create a logger object 9 | current_date = datetime.date.today() 10 | self.logger = logging.getLogger(__name__) 11 | self.logger.setLevel(level) 12 | self.project_name = "yolov9-tensorrt" 13 | self.project_path = self.get_project_path() 14 | self.log_dir = os.path.join(self.project_path, "logs") 15 | log_dir_name = "{}-{}-{}".format(current_date.year, current_date.month, current_date.day) 16 | self.log_dir = os.path.join(self.log_dir, log_dir_name) 17 | os.makedirs(self.log_dir, exist_ok=True) 18 | 19 | def get_project_path(self): 20 | script_path = os.path.abspath(__file__) 21 | path = os.path.dirname(script_path) 22 | pos = path.rfind(self.project_name) 23 | return os.path.join(path[:pos], self.project_name) 24 | 25 | def console_handler(self, level="DEBUG"): 26 | # Create a log processor for the console 27 | console_handler = logging.StreamHandler() 28 | console_handler.setLevel(level) 29 | 30 | # Add output format to processor 31 | console_handler.setFormatter(self.get_formatter()[0]) 32 | 33 | # Return to controller 34 | return console_handler 35 | 36 | def file_handler(self, log_file, level="DEBUG"): 37 | log_file = os.path.join(self.log_dir, log_file) 38 | # Log processor for creating files 39 | file_handler = logging.FileHandler(log_file, mode="w", encoding="utf-8") 40 | file_handler.setLevel(level) 41 | 42 | # Add output format to processor 43 | file_handler.setFormatter(self.get_formatter()[1]) 44 | 45 | # Return to controller 46 | return file_handler 47 | 48 | def get_formatter(self): 49 | """Formatter""" 50 | console_fmt = logging.Formatter(fmt='%(asctime)s - %(levelname)s - %(module)s,%(funcName)s: %(message)s') 51 | file_fmt = logging.Formatter(fmt='%(asctime)s - %(levelname)s - %(module)s,%(funcName)s: %(message)s') 52 | # Returns a tuple 53 | return console_fmt, file_fmt 54 | 55 | def get_log(self, log_file, level="DEBUG"): 56 | # Adding a console processor to the logger 57 | self.logger.addHandler(self.console_handler(level)) 58 | # Adding a file processor to the logger 59 | self.logger.addHandler(self.file_handler(log_file, level)) 60 | 61 | # Return Log Instance Object 62 | return self.logger 63 | 64 | 65 | if __name__ == "__main__": 66 | log = Logger() 67 | logger = log.get_log("log.txt") 68 | logger.info("hello world") -------------------------------------------------------------------------------- /python/tensorrt_base.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tensorrt as trt 3 | 4 | 5 | class TensorrtBase(object): 6 | def __init__(self, logger): 7 | ''' 8 | Initialize the base class for building tensorrt 9 | :param logger: Logging system 10 | ''' 11 | self.logger = logger 12 | self.trt_logger = trt.Logger(trt.Logger.WARNING) 13 | self.quantization_infer = None 14 | self.engine_file = None 15 | self.onnx_file = None 16 | 17 | def create_engine_if_not_exit(self): 18 | ''' 19 | If the inference engine does not exist, create it 20 | :return: Whether the inference engine was successfully created 21 | ''' 22 | serialized_model = None 23 | if os.path.exists(self.engine_file): 24 | return True 25 | else: 26 | builder = trt.Builder(self.trt_logger) 27 | config = builder.create_builder_config() 28 | engine = self.create_engine(builder, config) 29 | assert serialized_model is None, self.logger.error("engine create failure!") 30 | with open(self.engine_file, "wb") as f: 31 | f.write(engine.serialize()) 32 | return True 33 | 34 | def create_engine(self, builder, config): 35 | ''' 36 | Create inference engine 37 | :param builder: TRT construction 38 | :param config: TRT configuration 39 | :return: Inference engine 40 | ''' 41 | explicitBatch = (1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)) 42 | network = builder.create_network(explicitBatch) 43 | parser = trt.OnnxParser(network, self.trt_logger) 44 | parsed = parser.parse_from_file(self.onnx_file) 45 | config.max_workspace_size = 1 << 30 46 | if self.quantization_infer == "FP16": 47 | self.logger.info("create engine with FP16") 48 | config.set_flag(trt.BuilderFlag.FP16) 49 | else: 50 | self.logger.info("create engine with TF32") 51 | config.set_flag(trt.BuilderFlag.TF32) 52 | 53 | input_Dims = network.get_input(0).shape 54 | if input_Dims[0] == -1: 55 | profile_calib = builder.create_optimization_profile() 56 | input_name = network.get_input(0).get_name() 57 | batch_dim = input_Dims 58 | batch_dim.d[0] = 1 59 | profile_calib.set_shape(input_name, batch_dim) 60 | config.add_optimization_profile(profile_calib) 61 | 62 | self.logger.info("Creating an inference engine, please wait a few minutes!!!") 63 | engine = builder.build_engine(network, config) 64 | self.logger.info("Creating an inference engine successful!") 65 | return engine 66 | -------------------------------------------------------------------------------- /result/000000000036.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LinhanDai/yolov9-tensorrt/7729c915733543c76a6ae9afb77d5a8825f924c0/result/000000000036.jpg -------------------------------------------------------------------------------- /result/000000000144.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LinhanDai/yolov9-tensorrt/7729c915733543c76a6ae9afb77d5a8825f924c0/result/000000000144.jpg -------------------------------------------------------------------------------- /result/performance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LinhanDai/yolov9-tensorrt/7729c915733543c76a6ae9afb77d5a8825f924c0/result/performance.png -------------------------------------------------------------------------------- /src/Decode.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Website: https://github.com/LinhanDai 3 | * @author dailinhan 4 | * @date 24-02-23 10:24 5 | _ooOoo_ 6 | o8888888o 7 | 88" . "88 8 | (| -_- |) 9 | O\ = /O 10 | ____/`---'\____ 11 | .' \\| |// `. 12 | / \\||| : |||// \ 13 | / _||||| -:- |||||- \ 14 | | | \\\ - /// | | 15 | | \_| ''\---/'' | | 16 | \ .-\__ `-` ___/-. / 17 | ___`. .' /--.--\ `. . __ 18 | ."" '< `.___\_<|>_/___.' >'"". 19 | | | : `- \`.;`\ _ /`;.`/ - ` : | | 20 | \ \ `-. \_ __\ /__ _/ .-` / / 21 | ======`-.____`-.___\_____/___.-`____.-'====== 22 | `=---=' 23 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 24 | no error no bug 25 | */ 26 | 27 | #include 28 | #include 29 | 30 | 31 | static __global__ void transpose_kernel(float *src, int num_bboxes, int num_elements,float *dst, int edge) 32 | { 33 | int position = blockDim.x * blockIdx.x + threadIdx.x; 34 | if (position>=edge) 35 | return; 36 | dst[position]=src[position / num_elements + (position % num_elements) * num_bboxes]; 37 | } 38 | 39 | extern "C" void transpose_kernel_invoker(float *src, int num_bboxes, int num_elements,float *dst,cudaStream_t stream) 40 | { 41 | int edge = num_bboxes * num_elements; 42 | int block = 256; 43 | int gird = ceil(edge / (float)block); 44 | transpose_kernel<<>>(src,num_bboxes,num_elements, dst, edge); 45 | } 46 | 47 | static __device__ void affine_project(float* matrix, float x, float y, float* ox, float* oy) 48 | { 49 | *ox = matrix[0] * x + matrix[1] * y + matrix[2]; 50 | *oy = matrix[3] * x + matrix[4] * y + matrix[5]; 51 | } 52 | 53 | static __global__ void decode_kernel( 54 | float* predict, int num_bboxes, int num_classes, 55 | float confidence_threshold, float* invert_affine_matrix, 56 | float* parray, int max_objects, int NUM_BOX_ELEMENT) 57 | { 58 | 59 | int position = blockDim.x * blockIdx.x + threadIdx.x; 60 | if (position >= num_bboxes) return; 61 | 62 | float* pitem = predict + (4 + num_classes) * position; 63 | 64 | float* class_confidence = pitem + 4; 65 | float confidence = *class_confidence++; 66 | int label = 0; 67 | for(int i = 1; i < num_classes; ++i, ++class_confidence) 68 | { 69 | if(*class_confidence > confidence) 70 | { 71 | confidence = *class_confidence; 72 | label = i; 73 | } 74 | } 75 | 76 | // confidence *= objectness; 77 | if(confidence < confidence_threshold) 78 | return; 79 | 80 | int index = atomicAdd(parray, 1); 81 | if(index >= max_objects) 82 | return; 83 | // printf("index %d max_objects %d\n", index,max_objects); 84 | float cx = pitem[0]; 85 | float cy = pitem[1]; 86 | float width = pitem[2]; 87 | float height = pitem[3]; 88 | 89 | float left = cx - width * 0.5f; 90 | float top = cy - height * 0.5f; 91 | float right = cx + width * 0.5f; 92 | float bottom = cy + height * 0.5f; 93 | 94 | affine_project(invert_affine_matrix, left, top, &left, &top); 95 | affine_project(invert_affine_matrix, right, bottom, &right, &bottom); 96 | 97 | 98 | float* pout_item = parray + 1 + index * NUM_BOX_ELEMENT; 99 | *pout_item++ = left; 100 | *pout_item++ = top; 101 | *pout_item++ = right; 102 | *pout_item++ = bottom; 103 | *pout_item++ = confidence; 104 | *pout_item++ = label; 105 | *pout_item++ = 1; // 1 = keep, 0 = ignore 106 | } 107 | 108 | static __device__ float box_iou( 109 | float aleft, float atop, float aright, float abottom, 110 | float bleft, float btop, float bright, float bbottom) 111 | { 112 | float cleft = max(aleft, bleft); 113 | float ctop = max(atop, btop); 114 | float cright = min(aright, bright); 115 | float cbottom = min(abottom, bbottom); 116 | 117 | float c_area = max(cright - cleft, 0.0f) * max(cbottom - ctop, 0.0f); 118 | if(c_area == 0.0f) 119 | return 0.0f; 120 | 121 | float a_area = max(0.0f, aright - aleft) * max(0.0f, abottom - atop); 122 | float b_area = max(0.0f, bright - bleft) * max(0.0f, bbottom - btop); 123 | return c_area / (a_area + b_area - c_area); 124 | } 125 | 126 | static __global__ void fast_nms_kernel(float* bboxes, 127 | int max_objects, 128 | float threshold, 129 | int NUM_BOX_ELEMENT) 130 | { 131 | int position = (blockDim.x * blockIdx.x + threadIdx.x); 132 | int count = min((int)*bboxes, max_objects); 133 | if (position >= count) 134 | return; 135 | 136 | // left, top, right, bottom, confidence, class, keepflag 137 | float* pcurrent = bboxes + 1 + position * NUM_BOX_ELEMENT; 138 | for(int i = 0; i < count; ++i) 139 | { 140 | float* pitem = bboxes + 1 + i * NUM_BOX_ELEMENT; 141 | if(i == position || pcurrent[5] != pitem[5]) continue; 142 | 143 | if(pitem[4] >= pcurrent[4]) 144 | { 145 | if(pitem[4] == pcurrent[4] && i < position) 146 | continue; 147 | 148 | float iou = box_iou( 149 | pcurrent[0], pcurrent[1], pcurrent[2], pcurrent[3], 150 | pitem[0], pitem[1], pitem[2], pitem[3] 151 | ); 152 | 153 | if(iou > threshold) 154 | { 155 | pcurrent[6] = 0; // 1=keep, 0=ignore 156 | return; 157 | } 158 | } 159 | } 160 | } 161 | 162 | extern "C" void decode_kernel_invoker( 163 | float* predict, int num_bboxes, int num_classes, float confidence_threshold, 164 | float nms_threshold, float* invert_affine_matrix, float* parray, int max_objects, 165 | int num_box_element, cudaStream_t stream) 166 | { 167 | auto block = num_bboxes > 512 ? 512 : num_bboxes; 168 | auto grid = (num_bboxes + block - 1) / block; 169 | decode_kernel<<>>( 170 | predict, num_bboxes, num_classes, 171 | confidence_threshold, invert_affine_matrix, 172 | parray, max_objects, num_box_element); 173 | 174 | block = max_objects > 512 ? 512 : max_objects; 175 | grid = (max_objects + block - 1) / block; 176 | fast_nms_kernel<<>>(parray, max_objects, nms_threshold, num_box_element); 177 | } -------------------------------------------------------------------------------- /src/Preprocess.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Website: https://github.com/LinhanDai 3 | * @author dailinhan 4 | * @date 24-02-23 11:40 5 | _ooOoo_ 6 | o8888888o 7 | 88" . "88 8 | (| -_- |) 9 | O\ = /O 10 | ____/`---'\____ 11 | .' \\| |// `. 12 | / \\||| : |||// \ 13 | / _||||| -:- |||||- \ 14 | | | \\\ - /// | | 15 | | \_| ''\---/'' | | 16 | \ .-\__ `-` ___/-. / 17 | ___`. .' /--.--\ `. . __ 18 | ."" '< `.___\_<|>_/___.' >'"". 19 | | | : `- \`.;`\ _ /`;.`/ - ` : | | 20 | \ \ `-. \_ __\ /__ _/ .-` / / 21 | ======`-.____`-.___\_____/___.-`____.-'====== 22 | `=---=' 23 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 24 | no error no bug 25 | */ 26 | 27 | #include 28 | #include 29 | 30 | __global__ void warpaffine_kernel( 31 | uint8_t* src, int src_line_size, int src_width, 32 | int src_height, float* dst, int dst_width, 33 | int dst_height, uint8_t const_value_st, 34 | float * d2i, int edge) 35 | { 36 | int position = blockDim.x * blockIdx.x + threadIdx.x; 37 | if (position >= edge) return; 38 | 39 | float m_x1 = d2i[0]; 40 | float m_y1 = d2i[1]; 41 | float m_z1 = d2i[2]; 42 | float m_x2 = d2i[3]; 43 | float m_y2 = d2i[4]; 44 | float m_z2 = d2i[5]; 45 | 46 | int dx = position % dst_width; 47 | int dy = position / dst_width; 48 | float src_x = m_x1 * dx + m_y1 * dy + m_z1 + 0.5f; 49 | float src_y = m_x2 * dx + m_y2 * dy + m_z2 + 0.5f; 50 | float c0, c1, c2; 51 | 52 | if (src_x <= -1 || src_x >= src_width || src_y <= -1 || src_y >= src_height) 53 | { 54 | // out of range 55 | c0 = const_value_st; 56 | c1 = const_value_st; 57 | c2 = const_value_st; 58 | } 59 | else 60 | { 61 | int y_low = floorf(src_y); 62 | int x_low = floorf(src_x); 63 | int y_high = y_low + 1; 64 | int x_high = x_low + 1; 65 | 66 | uint8_t const_value[] = {const_value_st, const_value_st, const_value_st}; 67 | float ly = src_y - y_low; 68 | float lx = src_x - x_low; 69 | float hy = 1 - ly; 70 | float hx = 1 - lx; 71 | float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; 72 | uint8_t* v1 = const_value; 73 | uint8_t* v2 = const_value; 74 | uint8_t* v3 = const_value; 75 | uint8_t* v4 = const_value; 76 | 77 | if (y_low >= 0) 78 | { 79 | if (x_low >= 0) 80 | v1 = src + y_low * src_line_size + x_low * 3; 81 | 82 | if (x_high < src_width) 83 | v2 = src + y_low * src_line_size + x_high * 3; 84 | } 85 | 86 | if (y_high < src_height) 87 | { 88 | if (x_low >= 0) 89 | v3 = src + y_high * src_line_size + x_low * 3; 90 | 91 | if (x_high < src_width) 92 | v4 = src + y_high * src_line_size + x_high * 3; 93 | } 94 | 95 | c0 = w1 * v1[0] + w2 * v2[0] + w3 * v3[0] + w4 * v4[0]; 96 | c1 = w1 * v1[1] + w2 * v2[1] + w3 * v3[1] + w4 * v4[1]; 97 | c2 = w1 * v1[2] + w2 * v2[2] + w3 * v3[2] + w4 * v4[2]; 98 | } 99 | 100 | //bgr to rgb 101 | float t = c2; 102 | c2 = c0; 103 | c0 = t; 104 | 105 | //normalization 106 | c0 = c0 / 255.0f; 107 | c1 = c1 / 255.0f; 108 | c2 = c2 / 255.0f; 109 | 110 | //rgbrgbrgb to rrrgggbbb 111 | int area = dst_width * dst_height; 112 | float* pdst_c0 = dst + dy * dst_width + dx; 113 | float* pdst_c1 = pdst_c0 + area; 114 | float* pdst_c2 = pdst_c1 + area; 115 | *pdst_c0 = c0; 116 | *pdst_c1 = c1; 117 | *pdst_c2 = c2; 118 | } 119 | 120 | extern "C" void preprocess_kernel_img( 121 | uint8_t* src, int src_width, int src_height, 122 | float* dst, int dst_width, int dst_height, 123 | float *d2i, cudaStream_t stream) 124 | { 125 | int jobs = dst_height * dst_width; 126 | int threads = 256; 127 | int blocks = ceil(jobs / (float)threads); 128 | warpaffine_kernel<<< blocks, threads, 0, stream >>>( 129 | src, src_width * 3, src_width, 130 | src_height, dst, dst_width, 131 | dst_height, 128, d2i, jobs); 132 | 133 | } -------------------------------------------------------------------------------- /src/Yolov9.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Website: https://github.com/LinhanDai 3 | * @author dailinhan 4 | * @date 24-02-23 9:19 5 | _ooOoo_ 6 | o8888888o 7 | 88" . "88 8 | (| -_- |) 9 | O\ = /O 10 | ____/`---'\____ 11 | .' \\| |// `. 12 | / \\||| : |||// \ 13 | / _||||| -:- |||||- \ 14 | | | \\\ - /// | | 15 | | \_| ''\---/'' | | 16 | \ .-\__ `-` ___/-. / 17 | ___`. .' /--.--\ `. . __ 18 | ."" '< `.___\_<|>_/___.' >'"". 19 | | | : `- \`.;`\ _ /`;.`/ - ` : | | 20 | \ \ `-. \_ __\ /__ _/ .-` / / 21 | ======`-.____`-.___\_____/___.-`____.-'====== 22 | `=---=' 23 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 24 | no error no bug 25 | */ 26 | 27 | #include "Yolov9.h" 28 | #include "TimerCounter.h" 29 | 30 | 31 | YoloV9::YoloV9(const std::string& configPath, const std::string &configFile) 32 | { 33 | std::cout << "Yolov9 init..." << std::endl; 34 | assert(readParameters(configPath, configFile)); 35 | cudaSetDevice(0); 36 | assert(createEngineIfNotExit() == true && "engine create failure!"); 37 | getTrtmodelStream(); 38 | } 39 | 40 | nvinfer1::IHostMemory *YoloV9::createEngine(nvinfer1::IBuilder *builder, nvinfer1::IBuilderConfig *config) 41 | { 42 | std::cout << "Creating an inference engine, please wait a few minutes!!!" << std::endl; 43 | mLogger.setReportableSeverity(Severity::kERROR); 44 | const auto explicitBatch = 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); 45 | nvinfer1::INetworkDefinition *network = builder->createNetworkV2(explicitBatch); 46 | assert(network); 47 | nvonnxparser::IParser *parser = nvonnxparser::createParser(*network, mLogger); 48 | assert(parser); 49 | bool parsed = parser->parseFromFile(mOnnxFile.c_str(), (int) nvinfer1::ILogger::Severity::kWARNING); 50 | if (!parsed) { 51 | mLogger.logPrint(Severity::kERROR, __FUNCTION__ , __LINE__, "onnx file parse error, please check onnx file!"); 52 | std::abort(); 53 | } 54 | config->setMaxWorkspaceSize(2_GiB); 55 | if (strcmp(mQuantizationInfer.c_str(), "FP16") == 0) 56 | { 57 | config->setFlag(nvinfer1::BuilderFlag::kFP16); 58 | } 59 | else if(strcmp(mQuantizationInfer.c_str(), "FP32") == 0) 60 | { 61 | config->setFlag(nvinfer1::BuilderFlag::kTF32); 62 | } 63 | nvinfer1::Dims inputDims = network->getInput(0)->getDimensions(); 64 | if (inputDims.d[0] == -1) 65 | { 66 | nvinfer1::IOptimizationProfile *profileCalib = builder->createOptimizationProfile(); 67 | const auto inputName = network->getInput(0)->getName(); 68 | nvinfer1::Dims batchDim = inputDims; 69 | batchDim.d[0] = 1; 70 | // We do not need to check the return of setDimension and setCalibrationProfile here as all dims are explicitly set 71 | profileCalib->setDimensions(inputName, nvinfer1::OptProfileSelector::kMIN, batchDim); 72 | profileCalib->setDimensions(inputName, nvinfer1::OptProfileSelector::kOPT, batchDim); 73 | batchDim.d[0] = mMaxSupportBatchSize; 74 | profileCalib->setDimensions(inputName, nvinfer1::OptProfileSelector::kMAX, batchDim); 75 | config->addOptimizationProfile(profileCalib); 76 | } 77 | nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config); 78 | assert(serialized_model); 79 | mLogger.logPrint(Severity::kINFO,__FUNCTION__ ,__LINE__ ,"success create serialized_model!"); 80 | return serialized_model; 81 | } 82 | 83 | void YoloV9::createInferenceEngine(nvinfer1::IHostMemory **modelStream) 84 | { 85 | nvinfer1::IBuilder *builder = nvinfer1::createInferBuilder(mLogger); 86 | assert(builder); 87 | nvinfer1::IBuilderConfig *config = builder->createBuilderConfig(); 88 | assert(config); 89 | (*modelStream) = createEngine(builder, config); 90 | assert(modelStream != nullptr && "engine create failure!"); 91 | } 92 | 93 | bool YoloV9::createEngineIfNotExit() 94 | { 95 | std::ifstream cache(mEngineFile.c_str(), std::ios::binary); 96 | if (cache) 97 | return true; 98 | else { 99 | nvinfer1::IHostMemory *modelStream{nullptr}; 100 | createInferenceEngine(&modelStream); 101 | assert(modelStream != nullptr); 102 | std::ofstream p(mEngineFile.c_str(), std::ios::binary); 103 | if (!p) { 104 | std::cout << "could not open plan output file" << std::endl; 105 | return false; 106 | } 107 | p.write(reinterpret_cast(modelStream->data()), modelStream->size()); 108 | } 109 | } 110 | 111 | bool YoloV9::readParameters(const std::string& configPath, const std::string& configFile) 112 | { 113 | std::string yamlFile = configPath + "/" + configFile; 114 | if (access(yamlFile.c_str(), F_OK) != -1) 115 | { 116 | cv::FileStorage fs(yamlFile, cv::FileStorage::READ); 117 | mConfTreshold = fs["confTreshold"]; 118 | mNMSTreshold = fs["nmsTreshold"]; 119 | mMaxSupportBatchSize = fs["maxSupportBatchSize"]; 120 | mQuantizationInfer = (std::string) fs["quantizationInfer"]; 121 | mOnnxFile = configPath + "/" + (std::string) fs["onnxFile"]; 122 | mEngineFile = configPath + "/" + (std::string) fs["engineFile"]; 123 | } 124 | else 125 | { 126 | return false; 127 | } 128 | return true; 129 | } 130 | 131 | void YoloV9::getBindingDimsInfo() 132 | { 133 | nvinfer1::Dims inputDims = mEngine->getBindingDimensions(0); 134 | nvinfer1::Dims dInput = inputDims; 135 | mInputC = dInput.d[1]; 136 | mInputH = dInput.d[2]; 137 | mInputW = dInput.d[3]; 138 | nvinfer1::Dims outPutBoxesDims = mEngine->getBindingDimensions(7); 139 | nvinfer1::Dims dOutPutBoxes = outPutBoxesDims; 140 | mOutputAnchorsDim= dOutPutBoxes.d[1]; 141 | mOutputAnchorsNum = dOutPutBoxes.d[2]; 142 | mOutputAnchorsSize = mOutputAnchorsNum * mOutputAnchorsDim; 143 | } 144 | 145 | void YoloV9::getTrtmodelStream() 146 | { 147 | int engineFileSize = 0; 148 | cudaSetDevice(0); 149 | std::ifstream file(mEngineFile, std::ios::binary); 150 | if (file.good()) 151 | { 152 | file.seekg(0, file.end); 153 | engineFileSize = file.tellg(); 154 | file.seekg(0, file.beg); 155 | mTrtModelStream = new char[engineFileSize]; 156 | assert(mTrtModelStream); 157 | file.read(mTrtModelStream, engineFileSize); 158 | file.close(); 159 | } 160 | mRuntime = nvinfer1::createInferRuntime(mLogger); 161 | assert(mRuntime); 162 | mEngine = mRuntime->deserializeCudaEngine(mTrtModelStream, engineFileSize); 163 | assert(mEngine); 164 | mContext = mEngine->createExecutionContext(); 165 | assert(mContext); 166 | getBindingDimsInfo(); 167 | //create fixed maximum input buffer 168 | int inputSingleByteNum = mInputW * mInputH * mInputC; 169 | int outputSingleAnchorByteNum = mOutputAnchorsNum * mOutputAnchorsDim; 170 | //input layer 171 | CHECK(cudaMalloc(&(mBuff[0]), mMaxSupportBatchSize * inputSingleByteNum * sizeof(float))); 172 | //output feature map layer 173 | nvinfer1::Dims outputDims1 = mEngine->getBindingDimensions(1); 174 | CHECK(cudaMalloc(&(mBuff[1]), mMaxSupportBatchSize * outputDims1.d[1] * outputDims1.d[2] * outputDims1.d[3] * sizeof(float))); 175 | nvinfer1::Dims outputDims2 = mEngine->getBindingDimensions(2); 176 | CHECK(cudaMalloc(&(mBuff[2]), mMaxSupportBatchSize * outputDims2.d[1] * outputDims2.d[2] * outputDims2.d[3] * sizeof(float))); 177 | nvinfer1::Dims outputDims3 = mEngine->getBindingDimensions(3); 178 | CHECK(cudaMalloc(&(mBuff[3]), mMaxSupportBatchSize * outputDims3.d[1] * outputDims3.d[2] * outputDims3.d[3] * sizeof(float))); 179 | nvinfer1::Dims outputDims4 = mEngine->getBindingDimensions(4); 180 | CHECK(cudaMalloc(&(mBuff[4]), mMaxSupportBatchSize * outputDims4.d[1] * outputDims4.d[2] * outputDims4.d[3] * sizeof(float))); 181 | nvinfer1::Dims outputDims5 = mEngine->getBindingDimensions(5); 182 | CHECK(cudaMalloc(&(mBuff[5]), mMaxSupportBatchSize * outputDims5.d[1] * outputDims5.d[2] * outputDims5.d[3] * sizeof(float))); 183 | nvinfer1::Dims outputDims6 = mEngine->getBindingDimensions(6); 184 | CHECK(cudaMalloc(&(mBuff[6]), mMaxSupportBatchSize * outputDims6.d[1] * outputDims6.d[2] * outputDims6.d[3] * sizeof(float))); 185 | //output layer 186 | CHECK(cudaMalloc(&(mBuff[7]), mMaxSupportBatchSize * outputSingleAnchorByteNum * sizeof(float))); 187 | CHECK(cudaMalloc(&(mBuff[8]), mMaxSupportBatchSize * outputSingleAnchorByteNum * sizeof(float))); 188 | 189 | //malloc resize warpAffine space 190 | mDeviceWarpAffine = nullptr; 191 | CHECK(cudaMalloc(&mDeviceWarpAffine, GPU_MAX_LIMIT_WIDTH * GPU_MAX_LIMIT_HEIGHT * GPU_MAX_LIMIT_CHANNEL * sizeof(unsigned char))); 192 | CHECK(cudaMemset(mDeviceWarpAffine, 0, GPU_MAX_LIMIT_WIDTH * GPU_MAX_LIMIT_HEIGHT * GPU_MAX_LIMIT_CHANNEL * sizeof(unsigned char))); 193 | 194 | //malloc yolo gpuDecode space 195 | mOutputDevice = nullptr; 196 | mTransposeDevice = nullptr; 197 | mOutputHost = nullptr; 198 | CHECK(cudaMalloc(&mOutputDevice, sizeof(float) + MAX_OBJECTS * NUM_BOX_ELEMENT * sizeof(float))); 199 | CHECK(cudaMalloc(&mTransposeDevice, mOutputAnchorsSize * sizeof(float))); 200 | CHECK(cudaMallocHost(&mOutputHost, sizeof(float) + MAX_OBJECTS * NUM_BOX_ELEMENT * sizeof(float))); 201 | CHECK(cudaMemset(mOutputHost, 0, sizeof(float) + MAX_OBJECTS * NUM_BOX_ELEMENT * sizeof(float))); 202 | CHECK(cudaMemset(mTransposeDevice, 0, mOutputAnchorsSize * sizeof(float))); 203 | CHECK(cudaMemset(mOutputDevice, 0, sizeof(float) + MAX_OBJECTS * NUM_BOX_ELEMENT * sizeof(float))); 204 | 205 | mAffineMatrixD2iHost = nullptr; 206 | mAffineMatrixD2iDevice = nullptr; 207 | CHECK(cudaMallocHost(&mAffineMatrixD2iHost,sizeof(float) * 6)); 208 | CHECK(cudaMalloc(&mAffineMatrixD2iDevice,sizeof(float) * 6)); 209 | delete []mTrtModelStream; 210 | mTrtModelStream = nullptr; 211 | } 212 | 213 | void YoloV9::getAffineMartrix(AffineMatrix &afmt,cv::Size &to,cv::Size &from) 214 | { 215 | float scale = std::min(to.width/(float)from.width,to.height/(float)from.height); 216 | afmt.i2d[0] = scale; 217 | afmt.i2d[1] = 0; 218 | afmt.i2d[2] = (-scale * from.width+to.width) * 0.5; 219 | afmt.i2d[3] = 0; 220 | afmt.i2d[4] = scale; 221 | afmt.i2d[5] = (-scale * from.height + to.height) * 0.5; 222 | cv::Mat cv_i2d(2,3,CV_32F,afmt.i2d); 223 | cv::Mat cv_d2i(2,3,CV_32F,afmt.d2i); 224 | cv::invertAffineTransform(cv_i2d,cv_d2i); 225 | memcpy(afmt.d2i,cv_d2i.ptr(0),sizeof(afmt.d2i)); 226 | } 227 | 228 | void YoloV9::imgPreProcess(std::vector &batchImg) 229 | { 230 | for (size_t i = 0; i < batchImg.size(); i++) 231 | { 232 | AffineMatrix afmt{}; 233 | cv::Size to(mInputW, mInputH); 234 | cv::Size from(mImageSizeBatch[i].width, mImageSizeBatch[i].height); 235 | getAffineMartrix(afmt, to, from); 236 | memcpy(mAffineMatrixD2iHost,afmt.d2i,sizeof(afmt.d2i)); 237 | CHECK(cudaMemcpyAsync(mAffineMatrixD2iDevice, mAffineMatrixD2iHost, sizeof(afmt.d2i),cudaMemcpyHostToDevice, mStream)); 238 | preprocess_kernel_img(batchImg[i], mImageSizeBatch[i].width, mImageSizeBatch[i].height, 239 | mBuff[0], mInputW, mInputH, mAffineMatrixD2iDevice, mStream); 240 | } 241 | } 242 | 243 | void YoloV9::gpuDecode(float* anchorsProb, int batch, float confidence_threshold, float nms_threshold) 244 | { 245 | for (int i = 0; i < batch; i++) 246 | { 247 | float *predictDevice = anchorsProb + i * mOutputAnchorsSize; 248 | transpose_kernel_invoker(predictDevice, mOutputAnchorsNum, mOutputAnchorsDim, mTransposeDevice, mStream); 249 | CHECK(cudaMemset(mOutputDevice, 0, sizeof(float) + MAX_OBJECTS * NUM_BOX_ELEMENT * sizeof(float))); 250 | decode_kernel_invoker( 251 | mTransposeDevice, mOutputAnchorsNum, 252 | mOutputAnchorsDim - 4, confidence_threshold, 253 | nms_threshold, mAffineMatrixD2iDevice, 254 | mOutputDevice, MAX_OBJECTS, 255 | NUM_BOX_ELEMENT,mStream); 256 | } 257 | } 258 | 259 | std::vector YoloV9::getDetResultToCPU(int batch) 260 | { 261 | std::vector result; 262 | for (int b = 0; b < batch; b++) 263 | { 264 | std::vector boxResult; 265 | CHECK(cudaMemset(mOutputHost, 0, sizeof(float) + MAX_OBJECTS * NUM_BOX_ELEMENT * sizeof(float))); 266 | CHECK(cudaMemcpyAsync(mOutputHost, mOutputDevice, 267 | sizeof(float) + MAX_OBJECTS * NUM_BOX_ELEMENT * sizeof(float), 268 | cudaMemcpyDeviceToHost, mStream)); 269 | CHECK(cudaStreamSynchronize(mStream)); 270 | int num_boxes = std::min((int)mOutputHost[0], MAX_OBJECTS); 271 | for(int i = 0; i < num_boxes; ++i) 272 | { 273 | float* ptr = mOutputHost + 1 + NUM_BOX_ELEMENT * i; 274 | int keep_flag = ptr[6]; 275 | if(keep_flag) 276 | { 277 | boxResult.emplace_back(ptr[0], ptr[1], ptr[2], ptr[3], ptr[4], (int)ptr[5], 0); 278 | } 279 | } 280 | result.push_back(boxResult); 281 | } 282 | return result; 283 | } 284 | 285 | void YoloV9::modelInfer(nvinfer1::IExecutionContext& context, int batchSize) 286 | { 287 | const nvinfer1::ICudaEngine &engine = context.getEngine(); 288 | nvinfer1::Dims inputDims = engine.getBindingDimensions(0); 289 | nvinfer1::Dims d = inputDims; 290 | d.d[0] = batchSize; 291 | if (!mContext->setBindingDimensions(0, d)) 292 | { 293 | mLogger.logPrint(Severity::kERROR, __FUNCTION__ , __LINE__, "The input dimension of the model is incorrect"); 294 | std::abort(); 295 | } 296 | context.enqueueV2((void **)mBuff, mStream, nullptr); 297 | } 298 | 299 | void YoloV9::doInfer(std::vector batchImg, 300 | std::vector imgInfoVec, 301 | std::vector &detResult) 302 | { 303 | int batch = imgInfoVec.size(); 304 | mImageSizeBatch = imgInfoVec; 305 | imgPreProcess(batchImg); 306 | modelInfer(*mContext, batch); 307 | gpuDecode(mBuff[7], batch,mConfTreshold, mNMSTreshold); 308 | detResult = getDetResultToCPU(batch); 309 | } -------------------------------------------------------------------------------- /yolov9_trt.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import argparse 4 | import numpy as np 5 | import tensorrt as trt 6 | import pycuda.autoinit 7 | import pycuda.driver as cuda 8 | from python.AIResult import * 9 | from python.logging_system import Logger 10 | from python.tensorrt_base import TensorrtBase 11 | from python.draw_AI_results import draw_detect_results 12 | from python.decorators import time_cost, suppress_errors 13 | 14 | parser = argparse.ArgumentParser("yolov9_demo") 15 | parser.add_argument('--configs', type=str, default="configs", help="configs path") 16 | parser.add_argument('--yaml_file', type=str, default="yolov9py.yaml", help="yaml file name") 17 | parser.add_argument('--data', type=str, default="data", help="images data path") 18 | args = parser.parse_args() 19 | 20 | 21 | class Yolov9(TensorrtBase): 22 | def __init__(self, logger, config_path, config_file): 23 | super().__init__(logger) 24 | self.logger = logger 25 | assert self.read_parameters(config_path, config_file), self.logger.info("Read parameters failure!") 26 | assert self.create_engine_if_not_exit(), self.logger.error("create engine failure!") 27 | self.get_trt_model_stream() 28 | @time_cost 29 | @suppress_errors 30 | def preprocess(self, raw_bgr_image): 31 | """ 32 | description: Convert BGR image to RGB, 33 | resize and pad it to target size, normalize to [0,1], 34 | transform to NCHW format. 35 | param: 36 | input_image_path: str, image path 37 | return: 38 | image: the processed image 39 | image_raw: the original image 40 | h: original height 41 | w: original width 42 | """ 43 | image_raw = raw_bgr_image 44 | h, w, c = image_raw.shape 45 | image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB) 46 | # Calculate widht and height and paddings 47 | r_w = self.input_w / w 48 | r_h = self.input_h / h 49 | if r_h > r_w: 50 | tw = self.input_w 51 | th = int(r_w * h) 52 | tx1 = tx2 = 0 53 | ty1 = int((self.input_h - th) / 2) 54 | ty2 = self.input_h - th - ty1 55 | else: 56 | tw = int(r_h * w) 57 | th = self.input_h 58 | tx1 = int((self.input_w - tw) / 2) 59 | tx2 = self.input_w - tw - tx1 60 | ty1 = ty2 = 0 61 | # Resize the image with long side while maintaining ratio 62 | image = cv2.resize(image, (tw, th)) 63 | # Pad the short side with (128,128,128) 64 | image = cv2.copyMakeBorder(image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, None, (128, 128, 128)) 65 | image = image.astype(np.float32) 66 | # Normalize to [0,1] 67 | image /= 255.0 68 | # HWC to CHW format: 69 | image = np.transpose(image, [2, 0, 1]) 70 | # CHW to NCHW format 71 | image = np.expand_dims(image, axis=0) 72 | # Convert the image to row-major order, also known as "C order": 73 | image = np.ascontiguousarray(image) 74 | return image, image_raw, h, w 75 | 76 | @time_cost 77 | def post_process(self, output, origin_h, origin_w): 78 | ''' 79 | Post-process the output of YOLO model 80 | :param output: Output of model inference 81 | :param origin_h: Image original height 82 | :param origin_w: Image original width 83 | :return: Algorithm detection results 84 | ''' 85 | predict = np.transpose(np.reshape(output, (self.output_dim, self.output_anchor_num))) 86 | detect_results = list() 87 | boxes_list = list() 88 | class_ids = list() 89 | scores = list() 90 | scores_array = np.max(predict[:, 4:], axis=1) 91 | filter_predict = predict[scores_array > self.conf_treshold, :] 92 | for predict_box in filter_predict: 93 | cx = predict_box[0] 94 | cy = predict_box[1] 95 | width = predict_box[2] 96 | height = predict_box[3] 97 | score = np.max(predict_box[4:]) 98 | class_id = np.argmax(predict_box[4:]) 99 | ratio_w = self.input_w / origin_w 100 | ratio_h = self.input_h / origin_h 101 | if ratio_h > ratio_w: 102 | left = (cx - width / 2) / ratio_w 103 | top = (cy - height / 2 - (self.input_h - ratio_w * origin_h) / 2) / ratio_w 104 | right = (cx + width / 2) / ratio_w 105 | bottom = (cy + height / 2 - (self.input_h - ratio_w * origin_h) / 2) / ratio_w 106 | else: 107 | left = (cx - width / 2 - (self.input_w - ratio_h * origin_w) / 2) / ratio_h 108 | top = (cy - height / 2) / ratio_h 109 | right = (cx + width / 2 - (self.input_w - ratio_h * origin_w) / 2) / ratio_h 110 | bottom = (cy + height / 2) / ratio_h 111 | box_xywh = list(map(lambda x: int(x), [max(0, left), max(0, top), min(right - left, origin_w), min(bottom - top, origin_h)])) 112 | boxes_list.append(box_xywh) 113 | class_ids.append(class_id) 114 | scores.append(score) 115 | nms_result = cv2.dnn.NMSBoxes(boxes_list, scores, self.conf_treshold, self.nms_threshold) 116 | for i in range(len(nms_result)): 117 | idx = nms_result[i] 118 | class_id = class_ids[idx] 119 | score = scores[idx] 120 | box = boxes_list[idx] 121 | result = DetResult(score, box, class_id) 122 | detect_results.append(result) 123 | return detect_results 124 | 125 | @time_cost 126 | @suppress_errors 127 | def do_infer(self, img): 128 | start_time = cv2.getTickCount() 129 | # Do image preprocess 130 | self.ctx.push() 131 | input_image, image_raw, h, w = self.preprocess(img) 132 | # Copy input image to host buffer 133 | np.copyto(self.host_inputs[0], input_image.ravel()) 134 | # Transfer input data to the GPU. 135 | cuda.memcpy_htod_async(self.cuda_inputs[0], self.host_inputs[0], self.stream) 136 | # Run inference. 137 | self.context.execute_async_v2(bindings=self.bindings, stream_handle=self.stream.handle) 138 | # Transfer predictions back from the GPU. 139 | cuda.memcpy_dtoh_async(self.host_outputs[6], self.cuda_outputs[6], self.stream) 140 | # Synchronize the stream 141 | self.stream.synchronize() 142 | # Here we use the first row of output in that batch_size = 1 143 | output = self.host_outputs[6] 144 | # Do postprocess 145 | detect_results = self.post_process(output, image_raw.shape[0], image_raw.shape[1]) 146 | self.ctx.pop() 147 | # print cost time 148 | end_time = cv2.getTickCount() 149 | fps = 1 / ((end_time - start_time) / cv2.getTickFrequency()) 150 | self.logger.info("detect fps:{}".format(fps)) 151 | return detect_results 152 | 153 | @suppress_errors 154 | def get_trt_model_stream(self): 155 | ''' 156 | Obtain the data flow for Tensorrt model inference and initialize the model 157 | ''' 158 | self.ctx = cuda.Device(0).make_context() 159 | stream = cuda.Stream() 160 | TRT_LOGGER = self.trt_logger 161 | runtime = trt.Runtime(TRT_LOGGER) 162 | 163 | # Deserialize the engine from file 164 | with open(self.engine_file, "rb") as f: 165 | engine = runtime.deserialize_cuda_engine(f.read()) 166 | context = engine.create_execution_context() 167 | 168 | host_inputs = [] 169 | cuda_inputs = [] 170 | host_outputs = [] 171 | cuda_outputs = [] 172 | bindings = [] 173 | 174 | for binding_index, binding in enumerate(engine): 175 | self.logger.info("bingding shape:{}".format(engine.get_binding_shape(binding))) 176 | size = trt.volume(engine.get_binding_shape(binding)) 177 | dtype = trt.nptype(engine.get_binding_dtype(binding)) 178 | # Allocate host and device buffers 179 | host_mem = cuda.pagelocked_empty(size, dtype) 180 | cuda_mem = cuda.mem_alloc(host_mem.nbytes) 181 | # Append the device buffer to device bindings. 182 | bindings.append(int(cuda_mem)) 183 | # Append to the appropriate list. 184 | if binding_index == 0: 185 | self.input_w = engine.get_binding_shape(binding)[-1] 186 | self.input_h = engine.get_binding_shape(binding)[-2] 187 | host_inputs.append(host_mem) 188 | cuda_inputs.append(cuda_mem) 189 | elif binding_index == 7: 190 | self.output_anchor_num = engine.get_binding_shape(binding)[-1] 191 | self.output_dim = engine.get_binding_shape(binding)[-2] 192 | host_outputs.append(host_mem) 193 | cuda_outputs.append(cuda_mem) 194 | else: 195 | host_outputs.append(host_mem) 196 | cuda_outputs.append(cuda_mem) 197 | 198 | # Store 199 | self.stream = stream 200 | self.context = context 201 | self.engine = engine 202 | self.host_inputs = host_inputs 203 | self.cuda_inputs = cuda_inputs 204 | self.host_outputs = host_outputs 205 | self.cuda_outputs = cuda_outputs 206 | self.bindings = bindings 207 | 208 | def read_parameters(self, config_path, config_file): 209 | ''' 210 | Read parameters from config file 211 | :param config_path: Profile Path 212 | :param config_file: profile name 213 | :return: Did it read successfully 214 | ''' 215 | yaml_file = os.path.join(config_path, config_file) 216 | if os.path.exists(yaml_file): 217 | fs = cv2.FileStorage(yaml_file, cv2.FILE_STORAGE_READ) 218 | self.conf_treshold = fs.getNode('confTreshold').real() 219 | self.nms_threshold = fs.getNode('nmsThreshold').real() 220 | self.quantization_infer = fs.getNode("quantizationInfer").string() 221 | self.onnx_file = os.path.join(config_path, fs.getNode('onnxFile').string()) 222 | self.engine_file = os.path.join(config_path, fs.getNode('engineFile').string()) 223 | else: 224 | return False 225 | return True 226 | 227 | def destroy(self): 228 | self.ctx.pop() 229 | del self.ctx 230 | self.logger.info("yolov9 destroy") 231 | 232 | 233 | if __name__ == "__main__": 234 | log = Logger() 235 | logger = log.get_log("yolov9.txt") 236 | yolov9 = Yolov9(logger, args.configs, args.yaml_file) 237 | image_root = args.data 238 | file_list = os.listdir(image_root) 239 | for image_file in file_list: 240 | image_path = os.path.join(image_root, image_file) 241 | img = cv2.imdecode(np.fromfile(image_path, dtype=np.uint8), cv2.IMREAD_COLOR) 242 | detect_results = yolov9.do_infer(img) 243 | draw_detect_results(img, detect_results) 244 | yolov9.destroy() 245 | --------------------------------------------------------------------------------