├── .gitignore
├── CMakeLists.txt
├── README.md
├── configs
├── yolov9.yaml
└── yolov9py.yaml
├── data
├── 000000000036.jpg
├── 000000000144.jpg
├── 000000000194.jpg
└── 000000000368.jpg
├── demo.cpp
├── include
├── LoggingRT.h
├── TimerCounter.h
├── Yolov9.h
└── macros.h
├── python
├── AIResult.py
├── decorators.py
├── draw_AI_results.py
├── logging_system.py
└── tensorrt_base.py
├── result
├── 000000000036.jpg
├── 000000000144.jpg
└── performance.png
├── src
├── Decode.cu
├── Preprocess.cu
└── Yolov9.cpp
└── yolov9_trt.py
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/
2 | build/
3 | cmake-build-debug/
4 | configs/*.engine
5 | configs/*.onnx
6 | python/__pycache__/
7 | logs/
8 |
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
cmake_minimum_required(VERSION 3.10)
project(yolov9 LANGUAGES CXX)

# Build the Yolov9 library in Release mode (default: Debug).
option(RELEASE "build Yolov9 lib release" OFF)

if(RELEASE)
    set(CMAKE_BUILD_TYPE Release)
    # Fixed: the standard macro that disables assert() is NDEBUG, not "NODEBUG".
    set(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} -DNDEBUG -O3 -Wall")
else()
    set(CMAKE_BUILD_TYPE Debug)
    set(CMAKE_CXX_FLAGS_DEBUG "-DDEBUG -g") # enable compiler debug info
endif()

# Kept as a raw flag (rather than CMAKE_CXX_STANDARD) because FindCUDA's
# CUDA_PROPAGATE_HOST_FLAGS forwards CMAKE_CXX_FLAGS to nvcc's host compiler.
# (Removed the dead "set(CMAKE_CXX_COMPILIER ...)" line: the variable name was
# a typo, so CMake never read it.)
set(CMAKE_CXX_FLAGS "-std=c++14 -O3")
# Let the demo binary find libyolov9.so next to itself at runtime.
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-rpath -Wl,$ORIGIN")

# OpenCV
find_package(OpenCV REQUIRED)

# CUDA (FindCUDA module; REQUIRED so configuration fails early when absent)
find_package(CUDA REQUIRED)
include_directories(/usr/local/cuda/include)
link_directories(/usr/local/cuda/lib64)

# TensorRT. Fixed: the original had these two swapped -- headers live under
# /usr/include/<arch>-linux-gnu, libraries under /usr/lib/<arch>-linux-gnu.
include_directories(/usr/include/${CMAKE_SYSTEM_PROCESSOR}-linux-gnu)
link_directories(/usr/lib/${CMAKE_SYSTEM_PROCESSOR}-linux-gnu)

# yolov9 source
include_directories(${PROJECT_SOURCE_DIR}/include)

# Explicit source list instead of file(GLOB): new files show up in diffs and
# incremental builds notice additions without a manual re-configure. Headers
# are found through the include directories and need not be compiled.
set(LIB_SOURCES
    src/Decode.cu
    src/Preprocess.cu
    src/Yolov9.cpp)

# add cuda lib
CUDA_ADD_LIBRARY(yolov9 SHARED ${LIB_SOURCES})

# NOTE: CUDA_ADD_LIBRARY links the CUDA runtime with the plain (keyword-less)
# target_link_libraries signature internally, so this call must also stay
# keyword-less -- mixing plain and keyword signatures on one target is an error.
target_link_libraries(yolov9
        nvinfer
        nvonnxparser
        pthread
        ${CUDA_LIBRARIES}
        ${OpenCV_LIBRARIES})

message(STATUS "OpenCV: ${OpenCV_LIBRARIES}")
target_include_directories(yolov9 PRIVATE include/
        ${OpenCV_INCLUDE_DIRS}
        ${CUDA_TOOLKIT_ROOT_DIR}/include)

add_executable(demo ${PROJECT_SOURCE_DIR}/demo.cpp)
target_link_libraries(demo PRIVATE yolov9)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | YOLOv9 tensorrt deployment
3 |
4 |
5 |
6 | This repository provides an API for accelerating inference deployment, with two open interface implementations: C++ and Python. C++ also provides the use of CUDA programming to accelerate YOLOv9 model preprocessing and post-processing
7 | to pursue faster model inference speed🔥🔥🔥
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 | ## ⭐ Build
17 |
18 | 1. Export onnx
19 |
20 | Clone [YOLOv9](https://github.com/WongKinYiu/yolov9) code repository, download the original model provided by the repository, or train your own model, such as [yolov9-c.pt](https://objects.githubusercontent.com/github-production-release-asset-2e65be/759338070/c8ca43f2-0d2d-4aa3-a074-426505bfbfb1?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAVCODYLSA53PQK4ZA%2F20240223%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240223T073054Z&X-Amz-Expires=300&X-Amz-Signature=db76944695e398168b222b502bb019a301336e5b5dc74db31604699b8f837a9b&X-Amz-SignedHeaders=host&actor_id=45328395&key_id=0&repo_id=759338070&response-content-disposition=attachment%3B%20filename%3Dyolov9-c.pt&response-content-type=application%2Foctet-stream)
21 |
22 | ``` shell
23 | # export onnx
24 | python export.py --weights yolov9-c.pt --simplify --include "onnx"
25 | ```
26 |
27 | 2. Setup
28 |
29 | Place the exported onnx file in the "yolov9-tensorrt/configs" folder and configure the relevant parameters through the "yolov9-tensorrt/configs/yolov9.yaml" file
30 | ``` shell
31 | # move onnx
32 | cd yolov9-tensorrt
33 | mv yolov9-c.onnx ./configs
34 | ```
35 |
36 | Modify parameter configuration in configs/yolov9.yaml
37 | ``` shell
38 | # modify configuration in configs/yolov9.yaml
39 | confTreshold: 0.25 #Detection confidence threshold
40 | nmsTreshold : 0.45 #nms threshold
41 | maxSupportBatchSize: 1 #support max input batch size
42 | quantizationInfer: "FP16" #support FP32 or FP16 quantization
43 | onnxFile: "yolov9-c.onnx" # The currently used onnx model file
44 | engineFile: "yolov9-c.engine" # Automatically generate file names for the Tensorrt inference engine
45 | ```
46 |
47 | 3. Build project
48 |
49 | ``` shell
50 | mkdir build
51 | cd build
52 | cmake ..
53 | make -j4
54 | ```
55 | 4. python API
56 | Modify parameter configuration in configs/yolov9py.yaml
57 |
58 | ``` shell
59 | # modify configuration in configs/yolov9py.yaml
60 | confTreshold: 0.3 # detection threshold
61 | nmsThreshold: 0.45 # nms threshold
62 | quantizationInfer: "FP16" #FP32 or FP16
63 | onnxFile: "yolov9-c.onnx" # The currently used onnx model file
64 | engineFile: "yolov9-c.engine" # Automatically generate file names for the Tensorrt inference engine
65 | ```
66 |
67 | ## 🌠 Run demo
68 | The first run will generate the inference engine ".engine" file in the configs folder. If the inference engine has already been generated, it will not be generated again
69 |
70 | run c++ demo API
71 | ``` shell
72 | # run images folder
73 | ./demo ../data
74 | ```
75 | run python demo API
76 | ``` shell
77 | # run images folder
78 | python yolov9_trt.py --configs configs --yaml_file yolov9py.yaml --data data
79 | ```
80 |
81 |
82 | 
83 |
84 |
85 | ## 👏 Acknowledgement
86 |
87 | This project is based on the following awesome projects:
88 | - [Yolov9](https://github.com/WongKinYiu/yolov9) - YOLOv9: Learning What You Want to Learn Using Programmable Gradient Information.
89 | - [TensorRT](https://github.com/NVIDIA/TensorRT/tree/release/8.6/samples) - TensorRT samples and api documentation.
90 |
91 | ## 🤗 Citation
92 |
93 | ```bibtex
94 | @article{wang2024yolov9,
95 | title={{YOLOv9}: Learning What You Want to Learn Using Programmable Gradient Information},
96 | author={Wang, Chien-Yao and Liao, Hong-Yuan Mark},
97 | booktitle={arXiv preprint arXiv:2402.13616},
98 | year={2024}
99 | }
100 | ```
--------------------------------------------------------------------------------
/configs/yolov9.yaml:
--------------------------------------------------------------------------------
1 | %YAML:1.0
2 | ---
3 | confTreshold: 0.25 #Detection confidence threshold
4 | nmsTreshold : 0.45 #nms threshold
5 | maxSupportBatchSize: 1 #support max input batch size
6 | quantizationInfer: "FP16" #support FP32 or FP16 quantization
7 | onnxFile: "yolov9-c.onnx" # The currently used onnx model file
8 | engineFile: "yolov9-c.engine" # Automatically generate file names for the Tensorrt inference engine
--------------------------------------------------------------------------------
/configs/yolov9py.yaml:
--------------------------------------------------------------------------------
1 | %YAML:1.0
2 | ---
3 | confTreshold: 0.3 # detection threshold
4 | nmsThreshold: 0.45 # nms threshold
5 | quantizationInfer: "FP16" #FP32 or FP16
6 | onnxFile: "yolov9-c.onnx"
7 | engineFile: "yolov9-c.engine"
--------------------------------------------------------------------------------
/data/000000000036.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LinhanDai/yolov9-tensorrt/7729c915733543c76a6ae9afb77d5a8825f924c0/data/000000000036.jpg
--------------------------------------------------------------------------------
/data/000000000144.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LinhanDai/yolov9-tensorrt/7729c915733543c76a6ae9afb77d5a8825f924c0/data/000000000144.jpg
--------------------------------------------------------------------------------
/data/000000000194.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LinhanDai/yolov9-tensorrt/7729c915733543c76a6ae9afb77d5a8825f924c0/data/000000000194.jpg
--------------------------------------------------------------------------------
/data/000000000368.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LinhanDai/yolov9-tensorrt/7729c915733543c76a6ae9afb77d5a8825f924c0/data/000000000368.jpg
--------------------------------------------------------------------------------
/demo.cpp:
--------------------------------------------------------------------------------
1 | /**
2 | * Website: https://github.com/LinhanDai
3 | * @author dailinhan
4 | * @date 24-02-23 9:19
5 | _ooOoo_
6 | o8888888o
7 | 88" . "88
8 | (| -_- |)
9 | O\ = /O
10 | ____/`---'\____
11 | .' \\| |// `.
12 | / \\||| : |||// \
13 | / _||||| -:- |||||- \
14 | | | \\\ - /// | |
15 | | \_| ''\---/'' | |
16 | \ .-\__ `-` ___/-. /
17 | ___`. .' /--.--\ `. . __
18 | ."" '< `.___\_<|>_/___.' >'"".
19 | | | : `- \`.;`\ _ /`;.`/ - ` : | |
20 | \ \ `-. \_ __\ /__ _/ .-` / /
21 | ======`-.____`-.___\_____/___.-`____.-'======
22 | `=---='
23 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
24 | no error no bug
25 | */
26 |
27 | #include
28 | #include "TimerCounter.h"
29 |
30 | std::vector generateRandomColors(int classNum)
31 | {
32 | std::vector colors;
33 | for (int i = 0; i < classNum; i++)
34 | {
35 | colors.push_back(cv::Scalar(rand() % 256, rand() % 256, rand() % 256));
36 | }
37 | return colors;
38 | }
39 |
40 | void showResult(const std::vector& result, std::vector &imgCloneBatch, std::vector colors)
41 | {
42 | for (int i = 0; i < result.size(); i++)
43 | {
44 | detectResult batchResult = result[i];
45 | for (const auto& r: batchResult)
46 | {
47 | std::stringstream stream;
48 | stream << std::fixed << std::setprecision(2) << "id:" << r.label << " score:" << r.confidence;
49 | cv::rectangle(imgCloneBatch[i], cv::Point(r.left, r.top), cv::Point(r.right, r.bottom), colors[r.label], 2);
50 | cv::putText(imgCloneBatch[i], stream.str(), cv::Point(r.left, r.top - 5), 0, 0.8, colors[r.label], 2);
51 | }
52 | cv::imwrite("1.jpg", imgCloneBatch[i]);
53 | cv::namedWindow("Windows", cv::WINDOW_AUTOSIZE);
54 | cv::resizeWindow("Windows", imgCloneBatch[i].cols / 2, imgCloneBatch[i].rows / 2);
55 | cv::imshow("Windows", imgCloneBatch[i]);
56 | cv::waitKey(0);
57 | }
58 | }
59 |
// Demo entry point: run YOLOv9 TensorRT inference on every *.jpg in the folder
// given as argv[1], timing each inference and displaying the detections.
// NOTE(review): template arguments throughout this file appear to have been
// lost during extraction (e.g. "std::vector images", "std::make_shared()");
// the hedged notes below record the presumed types -- confirm against the
// upstream repository before compiling.
int main(int argc, char* argv[])
{
    // Config directory is resolved relative to the build directory (demo is
    // expected to run from yolov9-tensorrt/build).
    std::string configPath = "../configs";
    std::string configFile = "yolov9.yaml";
    // presumably std::vector<cv::String>, filled by cv::glob below -- TODO confirm
    std::vector images;
    if (argc != 2)
    {
        std::cout << "Need input test img folder path!!!" << std::endl;
        return 0;
    }
    std::string folderPath = argv[1];
    cv::String path(folderPath + "/*.jpg"); //small picture
    cv::glob(path, images);
    // presumably make_shared<CPUTimer>() and make_shared<Yolov9>(configPath,
    // configFile) -- the type arguments were lost in extraction; TODO confirm
    std::shared_ptr timer = std::make_shared();
    std::shared_ptr yoloObj = std::make_shared(configPath, configFile);
    // One random color per class; 80 matches the COCO class count used by YOLOv9.
    std::vector colors = generateRandomColors(80);
    for (const auto& image: images)
    {
        std::vector imgMatVec;
        std::vector imgSrcVec;
        std::vector imgInfoVec;
        std::vector detectResult {};
        ImgInfo imgInfo{};
        cv::Mat img = cv::imread(image, cv::IMREAD_COLOR);
        imgInfo.width = img.cols;
        imgInfo.height = img.rows;
        imgInfo.channels = img.channels();
        // Upload the raw BGR image to device memory; the detector consumes
        // device pointers directly (see Yolov9::doInfer).
        unsigned char *deviceImgSrc;
        CHECK(cudaMalloc(&deviceImgSrc, img.cols * img.rows * img.channels() * sizeof(unsigned char)));
        CHECK(cudaMemcpy(deviceImgSrc, img.data, img.cols * img.rows * img.channels() * sizeof(unsigned char), cudaMemcpyHostToDevice));
        imgSrcVec.push_back(deviceImgSrc);
        imgInfoVec.push_back(imgInfo);
        imgMatVec.push_back(img);
        // Time only the inference call, not the host<->device copies above.
        timer->start();
        yoloObj->doInfer(imgSrcVec, imgInfoVec, detectResult);
        timer->stop();
        float time = timer->elapsed_ms();
        std::cout << "cost time:" << time <<" ms, " <<"fps:" << 1000 / time << std::endl;
        showResult(detectResult, imgMatVec, colors);
        cudaFree(deviceImgSrc);
    }
    return 0;
}
--------------------------------------------------------------------------------
/include/LoggingRT.h:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | #ifndef TENSORRT_LOGGING_H
18 | #define TENSORRT_LOGGING_H
19 |
20 | #include "NvInferRuntimeCommon.h"
21 | #include
22 | #include
23 | #include
24 | #include
25 | #include
26 | #include
27 | #include
28 | #include "macros.h"
29 |
30 | using Severity = nvinfer1::ILogger::Severity;
31 |
//! Stream buffer that accumulates log text and, on flush/destruction, writes a
//! timestamp to stdout followed by prefix + message to the configured stream.
class LogStreamConsumerBuffer : public std::stringbuf
{
public:
    //! \param stream    destination for the prefixed message
    //! \param prefix    tag prepended to every message (e.g. "[W] ")
    //! \param shouldLog when false, putOutput() silently drops the buffered text
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream)
        , mPrefix(prefix)
        , mShouldLog(shouldLog)
    {
    }

    //! Move constructor. Fixed: the original initialized only mOutput, leaving
    //! mPrefix empty and mShouldLog uninitialized -- reading an uninitialized
    //! bool in putOutput() is undefined behavior.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput)
        , mPrefix(other.mPrefix)
        , mShouldLog(other.mShouldLog)
    {
    }

    ~LogStreamConsumerBuffer()
    {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the pointer to the beginning is not equal to the pointer to the current position,
        // call putOutput() to log the output to the stream
        if (pbase() != pptr())
        {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    virtual int sync()
    {
        putOutput();
        return 0;
    }

    //! Emit the buffered content: timestamp to std::cout, prefix + message to mOutput.
    void putOutput()
    {
        if (mShouldLog)
        {
            // prepend timestamp
            // Fixed: the extracted copy read "std::localtime(×tamp)" -- an
            // HTML-entity garbling of "&timestamp".
            std::time_t timestamp = std::time(nullptr);
            tm* tm_local = std::localtime(&timestamp);
            std::cout << "[";
            std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/";
            std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] ";
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the buffer contents pre-appended by the appropriate prefix into the stream
            mOutput << mPrefix << str();
            // set the buffer to empty
            str("");
            // flush the stream
            mOutput.flush();
        }
    }

    void setShouldLog(bool shouldLog)
    {
        mShouldLog = shouldLog;
    }

private:
    std::ostream& mOutput;   // destination stream (held by reference; must outlive the buffer)
    std::string mPrefix;     // severity tag, e.g. "[E] "
    bool mShouldLog;         // gate evaluated on every flush
};
102 |
103 | //!
104 | //! \class LogStreamConsumerBase
105 | //! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer
106 | //!
class LogStreamConsumerBase
{
public:
    // Constructs the buffer member; as a base class listed before std::ostream
    // in LogStreamConsumer, this guarantees mBuffer exists before its address
    // is handed to the std::ostream constructor.
    LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mBuffer(stream, prefix, shouldLog)
    {
    }

protected:
    LogStreamConsumerBuffer mBuffer;  // shared with the derived std::ostream
};
118 |
119 | //!
120 | //! \class LogStreamConsumer
121 | //! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages.
122 | //! Order of base classes is LogStreamConsumerBase and then std::ostream.
123 | //! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field
124 | //! in LogStreamConsumer and then the address of the buffer is passed to std::ostream.
125 | //! This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream.
126 | //! Please do not change the order of the parent classes.
127 | //!
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream
{
public:
    //! \brief Creates a LogStreamConsumer which logs messages with level severity.
    //! Reportable severity determines if the messages are severe enough to be logged.
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity)
        , std::ostream(&mBuffer) // links the stream buffer with the stream
        , mShouldLog(severity <= reportableSeverity)
        , mSeverity(severity)
    {
    }

    // Move constructor: rebuilds the buffer/stream pair from `other`'s
    // severity settings rather than moving the buffer itself.
    LogStreamConsumer(LogStreamConsumer&& other)
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog)
        , std::ostream(&mBuffer) // links the stream buffer with the stream
        , mShouldLog(other.mShouldLog)
        , mSeverity(other.mSeverity)
    {
    }

    // Re-evaluate whether this consumer's fixed severity passes the new
    // threshold, and propagate the decision into the buffer.
    void setReportableSeverity(Severity reportableSeverity)
    {
        mShouldLog = mSeverity <= reportableSeverity;
        mBuffer.setShouldLog(mShouldLog);
    }

private:
    // Info/verbose messages go to stdout; warnings and errors to stderr.
    static std::ostream& severityOstream(Severity severity)
    {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    // Short tag prepended to every message of the given severity.
    static std::string severityPrefix(Severity severity)
    {
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: return "[F] ";
        case Severity::kERROR: return "[E] ";
        case Severity::kWARNING: return "[W] ";
        case Severity::kINFO: return "[I] ";
        case Severity::kVERBOSE: return "[V] ";
        default: assert(0); return "";
        }
    }

    bool mShouldLog;     // whether mSeverity passes the reportable threshold
    Severity mSeverity;  // fixed severity of messages written through this consumer
};
177 |
178 | //! \class Logger
179 | //!
180 | //! \brief Class which manages logging of TensorRT tools and samples
181 | //!
182 | //! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
183 | //! and supports logging two types of messages:
184 | //!
185 | //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
186 | //! - Test pass/fail messages
187 | //!
188 | //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
189 | //! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
190 | //!
191 | //! In the future, this class could be extended to support dumping test results to a file in some standard format
192 | //! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
193 | //!
194 | //! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
195 | //! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
196 | //! library and messages coming from the sample.
197 | //!
198 | //! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
199 | //! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
200 | //! object.
201 |
// Concrete nvinfer1::ILogger passed to TensorRT builders/runtimes, plus simple
// test-result reporting helpers (see the design notes in the comment block above).
class Logger : public nvinfer1::ILogger
{
public:
    // Only messages at or more severe than `severity` are emitted
    // (default: warnings and worse).
    Logger(Severity severity = Severity::kWARNING)
        : mReportableSeverity(severity)
    {
    }

    //!
    //! \enum TestResult
    //! \brief Represents the state of a given test
    //!
    enum class TestResult
    {
        kRUNNING, //!< The test is running
        kPASSED,  //!< The test passed
        kFAILED,  //!< The test failed
        kWAIVED   //!< The test was waived
    };

    //!
    //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger
    //! \return The nvinfer1::ILogger associated with this Logger
    //!
    //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
    //! we can eliminate the inheritance of Logger from ILogger
    //!
    nvinfer1::ILogger& getTRTLogger()
    {
        return *this;
    }

    //!
    //! \brief Encapsulate the log function under nvinfer1 to print out the function name and the corresponding number of lines
    //!
    //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
    //! inheritance from nvinfer1::ILogger
    //!
    void logPrint(Severity severity, const char *funcName, int line, const char* msg)
    {
        // Prepend the call site ("functionName: ... line: ...") to the message.
        std::string transMsg = "functionName: " + std::string(funcName) + " line: " + std::to_string(line) + " " + std::string(msg);
        log(severity, transMsg.c_str());
    }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
    //! inheritance from nvinfer1::ILogger
    //!
    void log(Severity severity, const char* msg) TRT_NOEXCEPT override
    {
        // A temporary consumer is created per call; its destructor flushes the message.
        LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl;
    }

    //!
    //! \brief Method for controlling the verbosity of logging output
    //!
    //! \param severity The logger will only emit messages that have severity of this level or higher.
    //!
    void setReportableSeverity(Severity severity)
    {
        mReportableSeverity = severity;
    }

    //!
    //! \brief Opaque handle that holds logging information for a particular test
    //!
    //! This object is an opaque handle to information used by the Logger to print test results.
    //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
    //! with Logger::reportTest{Start,End}().
    //!
    class TestAtom
    {
    public:
        TestAtom(TestAtom&&) = default;

    private:
        friend class Logger;

        TestAtom(bool started, const std::string& name, const std::string& cmdline)
            : mStarted(started)
            , mName(name)
            , mCmdline(cmdline)
        {
        }

        bool mStarted;        // set once reportTestStart() has run
        std::string mName;    // dot-separated test name, e.g. "TensorRT.sample_x"
        std::string mCmdline; // command line used to reproduce the test
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test. This should be a string starting with
    //!                  "TensorRT" and containing dot-separated strings containing
    //!                  the characters [A-Za-z0-9_].
    //!                  For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline)
    {
        return TestAtom(false, name, cmdline);
    }

    //!
    //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
    //!        as input
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv)
    {
        auto cmdline = genCmdlineString(argc, argv);
        return defineTest(name, cmdline);
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom)
    {
        // NOTE(review): the RUNNING result is reported before the "not already
        // started" assert fires -- order inherited from the NVIDIA sample.
        reportTestResult(testAtom, TestResult::kRUNNING);
        assert(!testAtom.mStarted);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result)
    {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    // Convenience wrappers: report the result and return the matching exit code.
    static int reportPass(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    static int reportFail(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    static int reportWaive(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    static int reportTest(const TestAtom& testAtom, bool pass)
    {
        return pass ? reportPass(testAtom) : reportFail(testAtom);
    }

    Severity getReportableSeverity() const
    {
        return mReportableSeverity;
    }

private:
    //!
    //! \brief returns an appropriate string for prefixing a log message with the given severity
    //!
    static const char* severityPrefix(Severity severity)
    {
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: return "[F] ";
        case Severity::kERROR: return "[E] ";
        case Severity::kWARNING: return "[W] ";
        case Severity::kINFO: return "[I] ";
        case Severity::kVERBOSE: return "[V] ";
        default: assert(0); return "";
        }
    }

    //!
    //! \brief returns an appropriate string for prefixing a test result message with the given result
    //!
    static const char* testResultString(TestResult result)
    {
        switch (result)
        {
        case TestResult::kRUNNING: return "RUNNING";
        case TestResult::kPASSED: return "PASSED";
        case TestResult::kFAILED: return "FAILED";
        case TestResult::kWAIVED: return "WAIVED";
        default: assert(0); return "";
        }
    }

    //!
    //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
    //!
    static std::ostream& severityOstream(Severity severity)
    {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //!
    //! \brief method that implements logging test results
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result)
    {
        // "&&&&" is the marker NVIDIA's test harness greps for.
        severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
                                         << testAtom.mCmdline << std::endl;
    }

    //!
    //! \brief generate a command line string from the given (argc, argv) values
    //!
    static std::string genCmdlineString(int argc, char const* const* argv)
    {
        std::stringstream ss;
        for (int i = 0; i < argc; i++)
        {
            if (i > 0)
                ss << " ";
            ss << argv[i];
        }
        return ss.str();
    }

    Severity mReportableSeverity;  // current emission threshold
};
449 |
// Anonymous namespace: each translation unit that includes this header gets
// its own copy of these factory helpers, avoiding ODR conflicts.
namespace
{

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE
//!
//! Example usage:
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE_RT(const Logger& logger)
{
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO
//!
//! Example usage:
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO_RT(const Logger& logger)
{
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING
//!
//! Example usage:
//!
//!     LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN_RT(const Logger& logger)
{
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR
//!
//! Example usage:
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR_RT(const Logger& logger)
{
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR
//!        ("fatal" severity)
//!
//! Example usage:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL_RT(const Logger& logger)
{
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR);
}

} // anonymous namespace
515 |
516 | #endif // TENSORRT_LOGGING_H
517 |
--------------------------------------------------------------------------------
/include/TimerCounter.h:
--------------------------------------------------------------------------------
1 | /**
2 | * Website: https://github.com/LinhanDai
3 | * @author dailinhan
4 | * @date 24-02-23 9:30
5 | _ooOoo_
6 | o8888888o
7 | 88" . "88
8 | (| -_- |)
9 | O\ = /O
10 | ____/`---'\____
11 | .' \\| |// `.
12 | / \\||| : |||// \
13 | / _||||| -:- |||||- \
14 | | | \\\ - /// | |
15 | | \_| ''\---/'' | |
16 | \ .-\__ `-` ___/-. /
17 | ___`. .' /--.--\ `. . __
18 | ."" '< `.___\_<|>_/___.' >'"".
19 | | | : `- \`.;`\ _ /`;.`/ - ` : | |
20 | \ \ `-. \_ __\ /__ _/ .-` / /
21 | ======`-.____`-.___\_____/___.-`____.-'======
22 | `=---='
23 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
24 | no error no bug
25 | */
26 |
27 | #pragma once
28 | #ifndef YOLOV9_TIMERCOUNTER_H
29 | #define YOLOV9_TIMERCOUNTER_H
30 |
31 | #include
32 | #include
33 | #include
34 |
// Wall-clock timer based on std::chrono, reporting fractional milliseconds.
// Restored the template arguments lost in this extracted copy: the original
// "// us" comment and the "/ 1000" conversion imply microsecond resolution,
// and both members must be time_points of the clock queried in start()/stop().
class CPUTimer
{
public:
    CPUTimer()
    {
        // Initialize mStart so elapsed_ms() after only stop() is not garbage.
        mStart = std::chrono::high_resolution_clock::now();
    }

    void start()
    {
        mStart = std::chrono::high_resolution_clock::now();
    }

    void stop()
    {
        mEnd = std::chrono::high_resolution_clock::now();
    }

    // Elapsed time between the last start() and stop(), in milliseconds.
    float elapsed_ms()
    {
        int64_t dur = 0;
        dur = std::chrono::duration_cast<std::chrono::microseconds>(mEnd - mStart).count(); // us
        return (float)(dur) / 1000;
    }

private:
    std::chrono::time_point<std::chrono::high_resolution_clock> mStart;
    std::chrono::time_point<std::chrono::high_resolution_clock> mEnd;
};
64 |
// Measures elapsed GPU time between start() and stop() using CUDA events.
class GPUTimer
{
public:
    GPUTimer()
    {
        cudaEventCreate(&mStart);
        cudaEventCreate(&mEnd);
    }

    // Elapsed time between the recorded start and end events, in milliseconds.
    // Only meaningful after stop() has completed.
    float elapsed_ms()
    {
        float ms = 0;
        cudaEventElapsedTime(&ms, mStart, mEnd);
        return ms;
    }

    void start()
    {
        cudaEventRecord(mStart);
    }

    void stop()
    {
        cudaEventRecord(mEnd);
        // Block the host until the end event completes so elapsed_ms() is valid.
        cudaEventSynchronize(mEnd);
    }

private:
    // NOTE(review): the events are never released (no cudaEventDestroy in a
    // destructor) -- leaks a small resource per timer instance; confirm intended.
    cudaEvent_t mStart;
    cudaEvent_t mEnd;
};
96 |
97 | #endif //YOLOV9_TIMERCOUNTER_H
98 |
--------------------------------------------------------------------------------
/include/Yolov9.h:
--------------------------------------------------------------------------------
1 | /**
2 | * Website: https://github.com/LinhanDai
3 | * @author dailinhan
4 | * @date 24-02-23 9:19
5 | _ooOoo_
6 | o8888888o
7 | 88" . "88
8 | (| -_- |)
9 | O\ = /O
10 | ____/`---'\____
11 | .' \\| |// `.
12 | / \\||| : |||// \
13 | / _||||| -:- |||||- \
14 | | | \\\ - /// | |
15 | | \_| ''\---/'' | |
16 | \ .-\__ `-` ___/-. /
17 | ___`. .' /--.--\ `. . __
18 | ."" '< `.___\_<|>_/___.' >'"".
19 | | | : `- \`.;`\ _ /`;.`/ - ` : | |
20 | \ \ `-. \_ __\ /__ _/ .-` / /
21 | ======`-.____`-.___\_____/___.-`____.-'======
22 | `=---='
23 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
24 | no error no bug
25 | */
26 |
27 | #ifndef YOLOV9_TENSORRT_YOLOV9_H
28 | #define YOLOV9_TENSORRT_YOLOV9_H
29 |
// NOTE(review): the include targets were lost during text extraction;
// restored to the headers required by the declarations in this file.
#include <iostream>           // std::cout in __check_cuda_runtime
#include <fstream>            // engine (de)serialization streams
#include <string>             // std::string members
#include <vector>             // detectResult, batch containers
#include <cassert>            // assert() in the implementation
#include <unistd.h>           // access()/F_OK in readParameters
#include <cuda_runtime_api.h> // cudaError_t, cudaStream_t
#include <opencv2/opencv.hpp> // cv::Mat, cv::Size, cv::FileStorage
#include <NvInfer.h>          // TensorRT core types
#include <NvOnnxParser.h>     // ONNX parser used by createEngine
#include "LoggingRT.h"
41 |
42 |
// Wraps a CUDA runtime call: stringifies the expression and reports the
// call site (file/line) on failure via __check_cuda_runtime().
#define CHECK(op) __check_cuda_runtime((op), #op, __FILE__, __LINE__)
44 |
// User-defined literal for gibibytes, e.g. 2_GiB == 2 * 2^30 bytes.
constexpr long long int operator"" _GiB(long long unsigned int val)
{
    constexpr long long int bytes_per_gib = 1LL << 30;
    return val * bytes_per_gib;
}
49 |
#define MAX_OBJECTS 1000 // capacity of the decoded-box output array per image
#define NUM_BOX_ELEMENT 7 // left, top, right, bottom, confidence, class, keepflag
#define GPU_MAX_LIMIT_WIDTH 4096 // max input width covered by the warp-affine scratch buffer
#define GPU_MAX_LIMIT_HEIGHT 4096 // max input height covered by the warp-affine scratch buffer
#define GPU_MAX_LIMIT_CHANNEL 3 // channels of the scratch buffer (3-channel images)
55 |
56 | inline bool __check_cuda_runtime(cudaError_t code, const char* op, const char* file, int line)
57 | {
58 | if(code != cudaSuccess)
59 | {
60 | const char* err_name = cudaGetErrorName(code);
61 | const char* err_message = cudaGetErrorString(code);
62 | std::cout << "runtime error " << file << ":" << line << " :" << " " << op << " failed, code:" << err_name << " massage:" << err_message << std::endl;
63 | return false;
64 | }
65 | return true;
66 | }
67 |
// Dimensions of one input image as passed alongside its device buffer.
struct ImgInfo
{
    int width;    // image width in pixels
    int height;   // image height in pixels
    int channels; // number of color channels (scratch buffers assume <= 3)
};
74 |
// One detection: pixel-space corners, class confidence, class label and an
// (optional) tracker id filled in by downstream tracking.
struct Box{
    int left, top, right, bottom;
    float confidence;
    int label;
    int trackerID;

    Box() = default;
    Box(int left, int top, int right, int bottom, float confidence, int label, int trackerID):
        left(left), top(top), right(right), bottom(bottom), confidence(confidence), label(label),trackerID(trackerID){}
};
// BUG FIX: the element type of the vector was lost in the original text;
// a detectResult is the list of boxes for one image.
typedef std::vector<Box> detectResult;
86 |
87 |
// CUDA kernel launchers implemented in src/Decode.cu and src/Preprocess.cu.

// Transposes raw network output from [num_elements x num_bboxes] to
// [num_bboxes x num_elements] layout.
extern "C" void transpose_kernel_invoker(float *src, int num_bboxes, int num_elements,float *dst,cudaStream_t stream);

// Decodes predictions into candidate boxes (confidence filter + inverse
// affine mapping) and runs GPU NMS, writing into `parray`.
extern "C" void decode_kernel_invoker(
    float* predict, int num_bboxes, int num_classes, float confidence_threshold,
    float nms_threshold, float* invert_affine_matrix, float* parray, int max_objects,
    int num_box_element, cudaStream_t stream);

// Warp-affine letterbox resize + BGR->RGB swap + /255 normalization +
// HWC->CHW packing, one launch per image.
extern "C" void preprocess_kernel_img(
    uint8_t* src, int src_width, int src_height,
    float* dst, int dst_width, int dst_height,
    float *d2i, cudaStream_t stream);
99 |
// TensorRT-based YOLOv9 detector: builds (or loads) an engine from ONNX,
// then runs preprocess -> inference -> GPU decode/NMS per batch.
// NOTE(review): several template argument lists in this declaration were
// lost during text extraction (std::vector<...> element types); they must be
// restored against the definitions in src/Yolov9.cpp before this compiles.
class YoloV9
{
public:
    struct AffineMatrix //Preprocessing affine transformation matrix and inverse matrix
    {
        float i2d[6]; //transformation matrix
        float d2i[6]; //inverse matrix
    };

public:
    // Reads <configPath>/<configFile> (YAML), builds the engine if missing,
    // then deserializes it and allocates all device buffers.
    explicit YoloV9(const std::string& configPath, const std::string &configFile);
    // Runs one batched inference; results are appended per image.
    void doInfer(std::vector batchImg,
                 std::vector imgInfoVec,
                 std::vector &detResult);

private:
    // Copies the decoded/NMS'd boxes for `batch` images back to the host.
    std::vector getDetResultToCPU(int batch);
    // Computes the letterbox transform `to`<-`from` and its inverse.
    void getAffineMartrix(AffineMatrix &afmt,cv::Size &to,cv::Size &from);
    // Launches transpose + decode + NMS kernels on the raw output.
    void gpuDecode(float* anchorsProb, int batch, float confidence_threshold, float nms_threshold);
    void imgPreProcess(std::vector &batchImg);
    // Loads the serialized engine file and allocates runtime buffers.
    void getTrtmodelStream();
    void getBindingDimsInfo();
    // NOTE(review): stray second ';' below — harmless but should be removed.
    void createInferenceEngine(nvinfer1::IHostMemory **modelStream);;
    void modelInfer(nvinfer1::IExecutionContext& context, int batchSize);
    bool readParameters(const std::string& configPath, const std::string& configFile);
    bool createEngineIfNotExit();
    nvinfer1::IHostMemory *createEngine(nvinfer1::IBuilder *builder, nvinfer1::IBuilderConfig *config);

private:
    int mMaxSupportBatchSize{};       // MAX batch in the optimization profile
    int mInputH{};                    // engine input height
    int mInputW{};                    // engine input width
    int mInputC{};                    // engine input channels
    int mOutputAnchorsNum;            // anchors per image in the output tensor
    int mOutputAnchorsDim;            // elements per anchor
    int mOutputAnchorsSize;           // mOutputAnchorsNum * mOutputAnchorsDim
    std::string mOnnxFile;
    std::string mEngineFile;
    std::string mQuantizationInfer;   // "FP16" or "FP32"
    unsigned char *mDeviceWarpAffine; // device scratch for warp-affine input
    char *mTrtModelStream{};          // serialized engine bytes (freed after deserialize)
    nvinfer1::IRuntime *mRuntime{};
    nvinfer1::ICudaEngine *mEngine{};
    nvinfer1::IExecutionContext *mContext{};
    cudaStream_t mStream{};
    float *mAffineMatrixD2iHost;      // pinned host copy of the inverse matrix
    float *mAffineMatrixD2iDevice;    // device copy of the inverse matrix
    float mConfTreshold;              // confidence threshold from config
    float mNMSTreshold;               // NMS IoU threshold from config
    float *mBuff[9];                  // one device buffer per engine binding
    float* mOutputDevice;             // decode/NMS output (device)
    float* mTransposeDevice;          // transposed raw output (device)
    float* mOutputHost;               // pinned host copy of decode output
    std::vector mImageSizeBatch;
    Logger mLogger;
};
156 |
157 | #endif //YOLOV9_TENSORRT_YOLOV9_H
158 |
--------------------------------------------------------------------------------
/include/macros.h:
--------------------------------------------------------------------------------
#ifndef __MACROS_H
#define __MACROS_H

#include "NvInfer.h"

// API: symbol import/export macro so the same headers work when building
// this code as a shared library and when consuming it.
#ifdef API_EXPORTS
#if defined(_MSC_VER)
#define API __declspec(dllexport)
#else
#define API __attribute__((visibility("default")))
#endif
#else

#if defined(_MSC_VER)
#define API __declspec(dllimport)
#else
#define API
#endif
#endif // API_EXPORTS

// TensorRT 8 added noexcept to ILogger/plugin overrides and made some
// enqueue parameters const; these shims keep one code base across versions.
#if NV_TENSORRT_MAJOR >= 8
#define TRT_NOEXCEPT noexcept
#define TRT_CONST_ENQUEUE const
#else
#define TRT_NOEXCEPT
#define TRT_CONST_ENQUEUE
#endif

#endif // __MACROS_H
30 |
--------------------------------------------------------------------------------
/python/AIResult.py:
--------------------------------------------------------------------------------
class DetResult(object):
    """Plain container for a single detection result."""

    def __init__(self, score, box, class_id):
        # score: detection confidence
        # box: [x1, y1, w, h]
        # class_id: integer category index
        self.score = score
        self.box = box
        self.class_id = class_id
--------------------------------------------------------------------------------
/python/decorators.py:
--------------------------------------------------------------------------------
import functools
import time
2 |
3 |
def time_cost(func):
    """Decorator that prints the wall-clock runtime of each call in milliseconds.

    :param func: callable to instrument
    :return: wrapper that forwards all args/kwargs and returns func's result
    """
    # BUG FIX: without functools.wraps the decorated function lost its
    # __name__/__doc__, breaking introspection and stacked decorators.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start_time = time.time()
        result = func(*args, **kwargs)
        end_time = time.time()
        print(f"{func.__name__} took {(end_time - start_time) * 1000 :.4f} ms to execute.")
        return result
    return wrapper
12 |
13 |
def suppress_errors(func):
    """Decorator that swallows any exception from func.

    Prints the error and returns None instead of raising, so best-effort
    calls cannot crash the caller.

    :param func: callable to protect
    :return: wrapper returning func's result, or None on exception
    """
    # BUG FIX: preserve the wrapped function's metadata (see time_cost).
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            print(f"Error in {func.__name__}: {e}")
            return None
    return wrapper
--------------------------------------------------------------------------------
/python/draw_AI_results.py:
--------------------------------------------------------------------------------
1 | import cv2
2 | import random
3 | from python.AIResult import *
4 |
#Generic paint color: 100 distinct random BGR tuples, indexed by class id
colors = []
while len(colors) < 100:
    # Draw one random BGR triple; keep it only if not seen before.
    candidate = (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255))
    if candidate not in colors:
        colors.append(candidate)
16 |
17 |
def draw_detect_results(img, results):
    '''
    Draw detection results onto img and show them in a resizable window
    (blocks until a key is pressed).
    :param img: src img (BGR, modified in place)
    :param results: detect results with .box [x, y, w, h], .score, .class_id
    '''
    for res in results:
        x, y = res.box[0], res.box[1]
        w, h = res.box[2], res.box[3]
        color = colors[res.class_id]
        cv2.rectangle(img, (x, y), (x + w, y + h), color, 3)
        label_str = "id:" + str(res.class_id) + " " + str(round(res.score, 2))
        cv2.putText(img, label_str, (x, y - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.3, color, 2)
    cv2.namedWindow("detect", cv2.WINDOW_NORMAL)
    cv2.imshow("detect", img)
    cv2.waitKey(0)
--------------------------------------------------------------------------------
/python/logging_system.py:
--------------------------------------------------------------------------------
1 | import os
2 | import logging
3 | import datetime
4 |
5 |
class Logger(object):
    """Wires console + dated-file handlers onto a module-level logger."""

    def __init__(self, level="DEBUG"):
        # Create a logger object and a per-day log directory:
        # <project>/logs/<YYYY-M-D>/
        current_date = datetime.date.today()
        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(level)
        self.project_name = "yolov9-tensorrt"
        self.project_path = self.get_project_path()
        self.log_dir = os.path.join(self.project_path, "logs")
        log_dir_name = "{}-{}-{}".format(current_date.year, current_date.month, current_date.day)
        self.log_dir = os.path.join(self.log_dir, log_dir_name)
        os.makedirs(self.log_dir, exist_ok=True)

    def get_project_path(self):
        """Locate the project root by searching this file's path for project_name."""
        script_path = os.path.abspath(__file__)
        path = os.path.dirname(script_path)
        pos = path.rfind(self.project_name)
        # BUG FIX: rfind() returns -1 when the checkout directory was renamed,
        # which silently truncated the path; fall back to this file's directory.
        if pos == -1:
            return path
        return os.path.join(path[:pos], self.project_name)

    def console_handler(self, level="DEBUG"):
        """Create a console (stderr) log handler at the given level."""
        console_handler = logging.StreamHandler()
        console_handler.setLevel(level)

        # Add output format to processor
        console_handler.setFormatter(self.get_formatter()[0])

        return console_handler

    def file_handler(self, log_file, level="DEBUG"):
        """Create a file log handler writing to <log_dir>/<log_file>."""
        log_file = os.path.join(self.log_dir, log_file)
        file_handler = logging.FileHandler(log_file, mode="w", encoding="utf-8")
        file_handler.setLevel(level)

        # Add output format to processor
        file_handler.setFormatter(self.get_formatter()[1])

        return file_handler

    def get_formatter(self):
        """Return (console_formatter, file_formatter) as a tuple."""
        console_fmt = logging.Formatter(fmt='%(asctime)s - %(levelname)s - %(module)s,%(funcName)s: %(message)s')
        file_fmt = logging.Formatter(fmt='%(asctime)s - %(levelname)s - %(module)s,%(funcName)s: %(message)s')
        return console_fmt, file_fmt

    def get_log(self, log_file, level="DEBUG"):
        """Attach console + file handlers (once) and return the logger.

        BUG FIX: the original added new handlers on every call, so each
        get_log() made every record print one more time.
        """
        if not self.logger.handlers:
            self.logger.addHandler(self.console_handler(level))
            self.logger.addHandler(self.file_handler(log_file, level))
        return self.logger
63 |
64 |
if __name__ == "__main__":
    # Smoke test: emit one INFO record to both console and logs/<date>/log.txt.
    demo_logger = Logger().get_log("log.txt")
    demo_logger.info("hello world")
--------------------------------------------------------------------------------
/python/tensorrt_base.py:
--------------------------------------------------------------------------------
1 | import os
2 | import tensorrt as trt
3 |
4 |
class TensorrtBase(object):
    def __init__(self, logger):
        '''
        Initialize the base class for building tensorrt
        :param logger: Logging system
        '''
        self.logger = logger
        self.trt_logger = trt.Logger(trt.Logger.WARNING)
        # Subclasses are expected to fill these in before building an engine.
        self.quantization_infer = None
        self.engine_file = None
        self.onnx_file = None

    def create_engine_if_not_exit(self):
        '''
        If the inference engine does not exist, create it
        :return: Whether the inference engine was successfully created
        '''
        if os.path.exists(self.engine_file):
            return True
        builder = trt.Builder(self.trt_logger)
        config = builder.create_builder_config()
        engine = self.create_engine(builder, config)
        # BUG FIX: the original asserted `serialized_model is None` on an
        # unused local that was always None, so a failed build was never
        # detected; check the engine that was actually built.
        assert engine is not None, self.logger.error("engine create failure!")
        with open(self.engine_file, "wb") as f:
            f.write(engine.serialize())
        return True

    def create_engine(self, builder, config):
        '''
        Create inference engine
        :param builder: TRT construction
        :param config: TRT configuration
        :return: Inference engine
        '''
        explicit_batch = (1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
        network = builder.create_network(explicit_batch)
        parser = trt.OnnxParser(network, self.trt_logger)
        parsed = parser.parse_from_file(self.onnx_file)
        # BUG FIX: the parse result was silently ignored.
        assert parsed, self.logger.error("onnx file parse failure!")
        config.max_workspace_size = 1 << 30
        if self.quantization_infer == "FP16":
            self.logger.info("create engine with FP16")
            config.set_flag(trt.BuilderFlag.FP16)
        else:
            self.logger.info("create engine with TF32")
            config.set_flag(trt.BuilderFlag.TF32)

        input_dims = network.get_input(0).shape
        if input_dims[0] == -1:
            # Dynamic batch dimension: register an optimization profile.
            profile_calib = builder.create_optimization_profile()
            # BUG FIX: ITensor exposes `.name` (there is no get_name() in the
            # Python API), trt.Dims is indexed (no `.d`), and set_shape()
            # requires min/opt/max shapes — the original passed only one.
            input_name = network.get_input(0).name
            batch_dim = input_dims
            batch_dim[0] = 1
            profile_calib.set_shape(input_name, batch_dim, batch_dim, batch_dim)
            config.add_optimization_profile(profile_calib)

        self.logger.info("Creating an inference engine, please wait a few minutes!!!")
        engine = builder.build_engine(network, config)
        self.logger.info("Creating an inference engine successful!")
        return engine
66 |
--------------------------------------------------------------------------------
/result/000000000036.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LinhanDai/yolov9-tensorrt/7729c915733543c76a6ae9afb77d5a8825f924c0/result/000000000036.jpg
--------------------------------------------------------------------------------
/result/000000000144.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LinhanDai/yolov9-tensorrt/7729c915733543c76a6ae9afb77d5a8825f924c0/result/000000000144.jpg
--------------------------------------------------------------------------------
/result/performance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LinhanDai/yolov9-tensorrt/7729c915733543c76a6ae9afb77d5a8825f924c0/result/performance.png
--------------------------------------------------------------------------------
/src/Decode.cu:
--------------------------------------------------------------------------------
1 | /**
2 | * Website: https://github.com/LinhanDai
3 | * @author dailinhan
4 | * @date 24-02-23 10:24
5 | _ooOoo_
6 | o8888888o
7 | 88" . "88
8 | (| -_- |)
9 | O\ = /O
10 | ____/`---'\____
11 | .' \\| |// `.
12 | / \\||| : |||// \
13 | / _||||| -:- |||||- \
14 | | | \\\ - /// | |
15 | | \_| ''\---/'' | |
16 | \ .-\__ `-` ___/-. /
17 | ___`. .' /--.--\ `. . __
18 | ."" '< `.___\_<|>_/___.' >'"".
19 | | | : `- \`.;`\ _ /`;.`/ - ` : | |
20 | \ \ `-. \_ __\ /__ _/ .-` / /
21 | ======`-.____`-.___\_____/___.-`____.-'======
22 | `=---='
23 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
24 | no error no bug
25 | */
26 |
// NOTE(review): include targets were lost during text extraction; restored.
#include <cuda_runtime.h> // kernel launch / stream types
#include <cmath>          // ceil() in the launcher
29 |
30 |
// One thread per output element: converts the raw output from
// [num_elements x num_bboxes] (element-major) to [num_bboxes x num_elements]
// (anchor-major) layout.
static __global__ void transpose_kernel(float *src, int num_bboxes, int num_elements,float *dst, int edge)
{
    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
    if (tid < edge)
    {
        const int anchor  = tid / num_elements;
        const int element = tid % num_elements;
        dst[tid] = src[anchor + element * num_bboxes];
    }
}
38 |
// Launches transpose_kernel with one thread per element of the
// [num_bboxes x num_elements] output on the given stream.
extern "C" void transpose_kernel_invoker(float *src, int num_bboxes, int num_elements,float *dst,cudaStream_t stream)
{
    int edge = num_bboxes * num_elements;
    int block = 256;
    // Integer ceil-division; avoids the float round-trip of ceil().
    int grid = (edge + block - 1) / block;
    // BUG FIX: the <<<grid, block, 0, stream>>> launch configuration was
    // lost in the original text; restored.
    transpose_kernel<<<grid, block, 0, stream>>>(src, num_bboxes, num_elements, dst, edge);
}
46 |
// Applies the 2x3 affine transform [m0 m1 m2; m3 m4 m5] to point (x, y).
// Safe to call with ox/oy aliasing x/y's storage since x and y are by value.
static __device__ void affine_project(float* matrix, float x, float y, float* ox, float* oy)
{
    const float tx = matrix[0] * x + matrix[1] * y + matrix[2];
    const float ty = matrix[3] * x + matrix[4] * y + matrix[5];
    *ox = tx;
    *oy = ty;
}
52 |
// Per-anchor decode: each thread converts one raw prediction row
// [cx, cy, w, h, class_scores...] into a candidate box in `parray`.
// parray layout: [count, then (left, top, right, bottom, confidence,
// class, keepflag) x max_objects]; parray[0] is an atomic slot counter.
static __global__ void decode_kernel(
    float* predict, int num_bboxes, int num_classes,
    float confidence_threshold, float* invert_affine_matrix,
    float* parray, int max_objects, int NUM_BOX_ELEMENT)
{

    int position = blockDim.x * blockIdx.x + threadIdx.x;
    if (position >= num_bboxes) return;

    // Row for this anchor: 4 box values followed by num_classes scores.
    float* pitem = predict + (4 + num_classes) * position;

    // Argmax over the class scores.
    float* class_confidence = pitem + 4;
    float confidence = *class_confidence++;
    int label = 0;
    for(int i = 1; i < num_classes; ++i, ++class_confidence)
    {
        if(*class_confidence > confidence)
        {
            confidence = *class_confidence;
            label = i;
        }
    }

    // confidence *= objectness;
    if(confidence < confidence_threshold)
        return;

    // Reserve an output slot; boxes past max_objects are dropped (the
    // counter still increments, so readers must clamp it).
    int index = atomicAdd(parray, 1);
    if(index >= max_objects)
        return;
    // printf("index %d max_objects %d\n", index,max_objects);
    float cx = pitem[0];
    float cy = pitem[1];
    float width = pitem[2];
    float height = pitem[3];

    // Center/size -> corner form, still in network input coordinates.
    float left = cx - width * 0.5f;
    float top = cy - height * 0.5f;
    float right = cx + width * 0.5f;
    float bottom = cy + height * 0.5f;

    // Map back to original-image coordinates via the inverse letterbox matrix.
    affine_project(invert_affine_matrix, left, top, &left, &top);
    affine_project(invert_affine_matrix, right, bottom, &right, &bottom);


    float* pout_item = parray + 1 + index * NUM_BOX_ELEMENT;
    *pout_item++ = left;
    *pout_item++ = top;
    *pout_item++ = right;
    *pout_item++ = bottom;
    *pout_item++ = confidence;
    *pout_item++ = label;
    *pout_item++ = 1; // 1 = keep, 0 = ignore
}
107 |
// Intersection-over-union of two axis-aligned boxes given as corner
// coordinates. Returns 0 when the boxes do not overlap.
static __device__ float box_iou(
    float aleft, float atop, float aright, float abottom,
    float bleft, float btop, float bright, float bbottom)
{
    const float ix0 = max(aleft, bleft);
    const float iy0 = max(atop, btop);
    const float ix1 = min(aright, bright);
    const float iy1 = min(abottom, bbottom);

    const float inter = max(ix1 - ix0, 0.0f) * max(iy1 - iy0, 0.0f);
    if(inter == 0.0f)
        return 0.0f;

    const float area_a = max(0.0f, aright - aleft) * max(0.0f, abottom - atop);
    const float area_b = max(0.0f, bright - bleft) * max(0.0f, bbottom - btop);
    return inter / (area_a + area_b - inter);
}
125 |
// Parallel NMS over the decoded boxes in `bboxes` (same layout as
// decode_kernel's parray). Each thread owns one box and clears its own
// keepflag if a better same-class box overlaps it past `threshold`.
static __global__ void fast_nms_kernel(float* bboxes,
                                       int max_objects,
                                       float threshold,
                                       int NUM_BOX_ELEMENT)
{
    int position = (blockDim.x * blockIdx.x + threadIdx.x);
    // bboxes[0] may exceed max_objects (the decode counter keeps counting);
    // clamp to the number of slots actually written.
    int count = min((int)*bboxes, max_objects);
    if (position >= count)
        return;

    // left, top, right, bottom, confidence, class, keepflag
    float* pcurrent = bboxes + 1 + position * NUM_BOX_ELEMENT;
    for(int i = 0; i < count; ++i)
    {
        float* pitem = bboxes + 1 + i * NUM_BOX_ELEMENT;
        // Only boxes of the same class suppress each other.
        if(i == position || pcurrent[5] != pitem[5]) continue;

        if(pitem[4] >= pcurrent[4])
        {
            // Equal-confidence tie-break: the lower index wins, so the
            // higher-index box keeps and the lower one may be suppressed.
            if(pitem[4] == pcurrent[4] && i < position)
                continue;

            float iou = box_iou(
                pcurrent[0], pcurrent[1], pcurrent[2], pcurrent[3],
                pitem[0], pitem[1], pitem[2], pitem[3]
            );

            if(iou > threshold)
            {
                pcurrent[6] = 0; // 1=keep, 0=ignore
                return;
            }
        }
    }
}
161 |
// Two-pass GPU postprocess on `stream`: decode raw predictions into
// candidate boxes (one thread per anchor), then suppress same-class
// overlaps (one thread per output slot).
extern "C" void decode_kernel_invoker(
    float* predict, int num_bboxes, int num_classes, float confidence_threshold,
    float nms_threshold, float* invert_affine_matrix, float* parray, int max_objects,
    int num_box_element, cudaStream_t stream)
{
    auto block = num_bboxes > 512 ? 512 : num_bboxes;
    auto grid = (num_bboxes + block - 1) / block;
    // BUG FIX: the <<<grid, block, 0, stream>>> launch configurations were
    // lost in the original text; restored for both launches.
    decode_kernel<<<grid, block, 0, stream>>>(
        predict, num_bboxes, num_classes,
        confidence_threshold, invert_affine_matrix,
        parray, max_objects, num_box_element);

    block = max_objects > 512 ? 512 : max_objects;
    grid = (max_objects + block - 1) / block;
    fast_nms_kernel<<<grid, block, 0, stream>>>(parray, max_objects, nms_threshold, num_box_element);
}
--------------------------------------------------------------------------------
/src/Preprocess.cu:
--------------------------------------------------------------------------------
1 | /**
2 | * Website: https://github.com/LinhanDai
3 | * @author dailinhan
4 | * @date 24-02-23 11:40
5 | _ooOoo_
6 | o8888888o
7 | 88" . "88
8 | (| -_- |)
9 | O\ = /O
10 | ____/`---'\____
11 | .' \\| |// `.
12 | / \\||| : |||// \
13 | / _||||| -:- |||||- \
14 | | | \\\ - /// | |
15 | | \_| ''\---/'' | |
16 | \ .-\__ `-` ___/-. /
17 | ___`. .' /--.--\ `. . __
18 | ."" '< `.___\_<|>_/___.' >'"".
19 | | | : `- \`.;`\ _ /`;.`/ - ` : | |
20 | \ \ `-. \_ __\ /__ _/ .-` / /
21 | ======`-.____`-.___\_____/___.-`____.-'======
22 | `=---='
23 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
24 | no error no bug
25 | */
26 |
// NOTE(review): include targets were lost during text extraction; restored.
#include <cuda_runtime.h> // kernel launch / stream types
#include <cstdint>        // uint8_t
#include <cmath>          // ceil() in the launcher
29 |
// One thread per destination pixel: applies the inverse letterbox affine
// transform d2i to sample the source image with bilinear interpolation,
// pads out-of-range pixels with const_value_st, swaps BGR->RGB, divides by
// 255, and writes planar CHW output.
__global__ void warpaffine_kernel(
    uint8_t* src, int src_line_size, int src_width,
    int src_height, float* dst, int dst_width,
    int dst_height, uint8_t const_value_st,
    float * d2i, int edge)
{
    int position = blockDim.x * blockIdx.x + threadIdx.x;
    if (position >= edge) return;

    // 2x3 inverse affine matrix (dst -> src mapping).
    float m_x1 = d2i[0];
    float m_y1 = d2i[1];
    float m_z1 = d2i[2];
    float m_x2 = d2i[3];
    float m_y2 = d2i[4];
    float m_z2 = d2i[5];

    // Destination pixel coordinates for this thread.
    int dx = position % dst_width;
    int dy = position / dst_width;
    // Source coordinates; the +0.5f offsets match the convention used when
    // the forward matrix was built in getAffineMartrix.
    float src_x = m_x1 * dx + m_y1 * dy + m_z1 + 0.5f;
    float src_y = m_x2 * dx + m_y2 * dy + m_z2 + 0.5f;
    float c0, c1, c2;

    if (src_x <= -1 || src_x >= src_width || src_y <= -1 || src_y >= src_height)
    {
        // out of range
        c0 = const_value_st;
        c1 = const_value_st;
        c2 = const_value_st;
    }
    else
    {
        // Bilinear interpolation between the four neighbouring pixels.
        int y_low = floorf(src_y);
        int x_low = floorf(src_x);
        int y_high = y_low + 1;
        int x_high = x_low + 1;

        uint8_t const_value[] = {const_value_st, const_value_st, const_value_st};
        float ly = src_y - y_low;
        float lx = src_x - x_low;
        float hy = 1 - ly;
        float hx = 1 - lx;
        float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
        // Neighbours default to the border constant; replaced below when
        // they fall inside the source image.
        uint8_t* v1 = const_value;
        uint8_t* v2 = const_value;
        uint8_t* v3 = const_value;
        uint8_t* v4 = const_value;

        if (y_low >= 0)
        {
            if (x_low >= 0)
                v1 = src + y_low * src_line_size + x_low * 3;

            if (x_high < src_width)
                v2 = src + y_low * src_line_size + x_high * 3;
        }

        if (y_high < src_height)
        {
            if (x_low >= 0)
                v3 = src + y_high * src_line_size + x_low * 3;

            if (x_high < src_width)
                v4 = src + y_high * src_line_size + x_high * 3;
        }

        c0 = w1 * v1[0] + w2 * v2[0] + w3 * v3[0] + w4 * v4[0];
        c1 = w1 * v1[1] + w2 * v2[1] + w3 * v3[1] + w4 * v4[1];
        c2 = w1 * v1[2] + w2 * v2[2] + w3 * v3[2] + w4 * v4[2];
    }

    //bgr to rgb
    float t = c2;
    c2 = c0;
    c0 = t;

    //normalization
    c0 = c0 / 255.0f;
    c1 = c1 / 255.0f;
    c2 = c2 / 255.0f;

    //rgbrgbrgb to rrrgggbbb
    int area = dst_width * dst_height;
    float* pdst_c0 = dst + dy * dst_width + dx;
    float* pdst_c1 = pdst_c0 + area;
    float* pdst_c2 = pdst_c1 + area;
    *pdst_c0 = c0;
    *pdst_c1 = c1;
    *pdst_c2 = c2;
}
119 |
// Launches warpaffine_kernel with one thread per destination pixel.
// The kernel performs letterbox resize, border padding (value 128),
// BGR->RGB swap, /255 normalization and HWC->CHW packing.
extern "C" void preprocess_kernel_img(
    uint8_t* src, int src_width, int src_height,
    float* dst, int dst_width, int dst_height,
    float *d2i, cudaStream_t stream)
{
    int jobs = dst_height * dst_width;
    int threads = 256;
    // Integer ceil-division instead of float ceil(): exact for any size.
    int blocks = (jobs + threads - 1) / threads;
    warpaffine_kernel<<< blocks, threads, 0, stream >>>(
        src, src_width * 3, src_width,
        src_height, dst, dst_width,
        dst_height, 128, d2i, jobs);
}
--------------------------------------------------------------------------------
/src/Yolov9.cpp:
--------------------------------------------------------------------------------
1 | /**
2 | * Website: https://github.com/LinhanDai
3 | * @author dailinhan
4 | * @date 24-02-23 9:19
5 | _ooOoo_
6 | o8888888o
7 | 88" . "88
8 | (| -_- |)
9 | O\ = /O
10 | ____/`---'\____
11 | .' \\| |// `.
12 | / \\||| : |||// \
13 | / _||||| -:- |||||- \
14 | | | \\\ - /// | |
15 | | \_| ''\---/'' | |
16 | \ .-\__ `-` ___/-. /
17 | ___`. .' /--.--\ `. . __
18 | ."" '< `.___\_<|>_/___.' >'"".
19 | | | : `- \`.;`\ _ /`;.`/ - ` : | |
20 | \ \ `-. \_ __\ /__ _/ .-` / /
21 | ======`-.____`-.___\_____/___.-`____.-'======
22 | `=---='
23 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
24 | no error no bug
25 | */
26 |
27 | #include "Yolov9.h"
28 | #include "TimerCounter.h"
29 |
30 |
31 | YoloV9::YoloV9(const std::string& configPath, const std::string &configFile)
32 | {
33 | std::cout << "Yolov9 init..." << std::endl;
34 | assert(readParameters(configPath, configFile));
35 | cudaSetDevice(0);
36 | assert(createEngineIfNotExit() == true && "engine create failure!");
37 | getTrtmodelStream();
38 | }
39 |
40 | nvinfer1::IHostMemory *YoloV9::createEngine(nvinfer1::IBuilder *builder, nvinfer1::IBuilderConfig *config)
41 | {
42 | std::cout << "Creating an inference engine, please wait a few minutes!!!" << std::endl;
43 | mLogger.setReportableSeverity(Severity::kERROR);
44 | const auto explicitBatch = 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
45 | nvinfer1::INetworkDefinition *network = builder->createNetworkV2(explicitBatch);
46 | assert(network);
47 | nvonnxparser::IParser *parser = nvonnxparser::createParser(*network, mLogger);
48 | assert(parser);
49 | bool parsed = parser->parseFromFile(mOnnxFile.c_str(), (int) nvinfer1::ILogger::Severity::kWARNING);
50 | if (!parsed) {
51 | mLogger.logPrint(Severity::kERROR, __FUNCTION__ , __LINE__, "onnx file parse error, please check onnx file!");
52 | std::abort();
53 | }
54 | config->setMaxWorkspaceSize(2_GiB);
55 | if (strcmp(mQuantizationInfer.c_str(), "FP16") == 0)
56 | {
57 | config->setFlag(nvinfer1::BuilderFlag::kFP16);
58 | }
59 | else if(strcmp(mQuantizationInfer.c_str(), "FP32") == 0)
60 | {
61 | config->setFlag(nvinfer1::BuilderFlag::kTF32);
62 | }
63 | nvinfer1::Dims inputDims = network->getInput(0)->getDimensions();
64 | if (inputDims.d[0] == -1)
65 | {
66 | nvinfer1::IOptimizationProfile *profileCalib = builder->createOptimizationProfile();
67 | const auto inputName = network->getInput(0)->getName();
68 | nvinfer1::Dims batchDim = inputDims;
69 | batchDim.d[0] = 1;
70 | // We do not need to check the return of setDimension and setCalibrationProfile here as all dims are explicitly set
71 | profileCalib->setDimensions(inputName, nvinfer1::OptProfileSelector::kMIN, batchDim);
72 | profileCalib->setDimensions(inputName, nvinfer1::OptProfileSelector::kOPT, batchDim);
73 | batchDim.d[0] = mMaxSupportBatchSize;
74 | profileCalib->setDimensions(inputName, nvinfer1::OptProfileSelector::kMAX, batchDim);
75 | config->addOptimizationProfile(profileCalib);
76 | }
77 | nvinfer1::IHostMemory* serialized_model = builder->buildSerializedNetwork(*network, *config);
78 | assert(serialized_model);
79 | mLogger.logPrint(Severity::kINFO,__FUNCTION__ ,__LINE__ ,"success create serialized_model!");
80 | return serialized_model;
81 | }
82 |
83 | void YoloV9::createInferenceEngine(nvinfer1::IHostMemory **modelStream)
84 | {
85 | nvinfer1::IBuilder *builder = nvinfer1::createInferBuilder(mLogger);
86 | assert(builder);
87 | nvinfer1::IBuilderConfig *config = builder->createBuilderConfig();
88 | assert(config);
89 | (*modelStream) = createEngine(builder, config);
90 | assert(modelStream != nullptr && "engine create failure!");
91 | }
92 |
93 | bool YoloV9::createEngineIfNotExit()
94 | {
95 | std::ifstream cache(mEngineFile.c_str(), std::ios::binary);
96 | if (cache)
97 | return true;
98 | else {
99 | nvinfer1::IHostMemory *modelStream{nullptr};
100 | createInferenceEngine(&modelStream);
101 | assert(modelStream != nullptr);
102 | std::ofstream p(mEngineFile.c_str(), std::ios::binary);
103 | if (!p) {
104 | std::cout << "could not open plan output file" << std::endl;
105 | return false;
106 | }
107 | p.write(reinterpret_cast(modelStream->data()), modelStream->size());
108 | }
109 | }
110 |
111 | bool YoloV9::readParameters(const std::string& configPath, const std::string& configFile)
112 | {
113 | std::string yamlFile = configPath + "/" + configFile;
114 | if (access(yamlFile.c_str(), F_OK) != -1)
115 | {
116 | cv::FileStorage fs(yamlFile, cv::FileStorage::READ);
117 | mConfTreshold = fs["confTreshold"];
118 | mNMSTreshold = fs["nmsTreshold"];
119 | mMaxSupportBatchSize = fs["maxSupportBatchSize"];
120 | mQuantizationInfer = (std::string) fs["quantizationInfer"];
121 | mOnnxFile = configPath + "/" + (std::string) fs["onnxFile"];
122 | mEngineFile = configPath + "/" + (std::string) fs["engineFile"];
123 | }
124 | else
125 | {
126 | return false;
127 | }
128 | return true;
129 | }
130 |
// Reads input/output tensor shapes from the engine bindings.
// Binding 0 is the NCHW image input.
void YoloV9::getBindingDimsInfo()
{
    nvinfer1::Dims inputDims = mEngine->getBindingDimensions(0);
    nvinfer1::Dims dInput = inputDims;
    mInputC = dInput.d[1];
    mInputH = dInput.d[2];
    mInputW = dInput.d[3];
    // NOTE(review): binding index 7 is assumed to be the final prediction
    // tensor [batch, dims, anchors] of this specific YOLOv9 export (bindings
    // 1-6 are intermediate outputs) — confirm if the ONNX graph changes.
    nvinfer1::Dims outPutBoxesDims = mEngine->getBindingDimensions(7);
    nvinfer1::Dims dOutPutBoxes = outPutBoxesDims;
    mOutputAnchorsDim= dOutPutBoxes.d[1];
    mOutputAnchorsNum = dOutPutBoxes.d[2];
    mOutputAnchorsSize = mOutputAnchorsNum * mOutputAnchorsDim;
}
144 |
// Loads the serialized engine from mEngineFile, deserializes it, and
// allocates every fixed-size device/host buffer used at inference time.
void YoloV9::getTrtmodelStream()
{
    int engineFileSize = 0;
    cudaSetDevice(0);
    std::ifstream file(mEngineFile, std::ios::binary);
    // NOTE(review): if the engine file cannot be opened, mTrtModelStream
    // stays null and size 0, and deserializeCudaEngine below is handed a
    // null buffer (caught only by the assert) — worth an explicit error.
    if (file.good())
    {
        file.seekg(0, file.end);
        engineFileSize = file.tellg();
        file.seekg(0, file.beg);
        mTrtModelStream = new char[engineFileSize];
        assert(mTrtModelStream);
        file.read(mTrtModelStream, engineFileSize);
        file.close();
    }
    mRuntime = nvinfer1::createInferRuntime(mLogger);
    assert(mRuntime);
    mEngine = mRuntime->deserializeCudaEngine(mTrtModelStream, engineFileSize);
    assert(mEngine);
    mContext = mEngine->createExecutionContext();
    assert(mContext);
    getBindingDimsInfo();
    //create fixed maximum input buffer
    int inputSingleByteNum = mInputW * mInputH * mInputC;
    int outputSingleAnchorByteNum = mOutputAnchorsNum * mOutputAnchorsDim;
    //input layer
    CHECK(cudaMalloc(&(mBuff[0]), mMaxSupportBatchSize * inputSingleByteNum * sizeof(float)));
    //output feature map layer
    // Bindings 1-6 are intermediate outputs; size each buffer from its
    // engine-reported CHW dimensions times the max batch.
    nvinfer1::Dims outputDims1 = mEngine->getBindingDimensions(1);
    CHECK(cudaMalloc(&(mBuff[1]), mMaxSupportBatchSize * outputDims1.d[1] * outputDims1.d[2] * outputDims1.d[3] * sizeof(float)));
    nvinfer1::Dims outputDims2 = mEngine->getBindingDimensions(2);
    CHECK(cudaMalloc(&(mBuff[2]), mMaxSupportBatchSize * outputDims2.d[1] * outputDims2.d[2] * outputDims2.d[3] * sizeof(float)));
    nvinfer1::Dims outputDims3 = mEngine->getBindingDimensions(3);
    CHECK(cudaMalloc(&(mBuff[3]), mMaxSupportBatchSize * outputDims3.d[1] * outputDims3.d[2] * outputDims3.d[3] * sizeof(float)));
    nvinfer1::Dims outputDims4 = mEngine->getBindingDimensions(4);
    CHECK(cudaMalloc(&(mBuff[4]), mMaxSupportBatchSize * outputDims4.d[1] * outputDims4.d[2] * outputDims4.d[3] * sizeof(float)));
    nvinfer1::Dims outputDims5 = mEngine->getBindingDimensions(5);
    CHECK(cudaMalloc(&(mBuff[5]), mMaxSupportBatchSize * outputDims5.d[1] * outputDims5.d[2] * outputDims5.d[3] * sizeof(float)));
    nvinfer1::Dims outputDims6 = mEngine->getBindingDimensions(6);
    CHECK(cudaMalloc(&(mBuff[6]), mMaxSupportBatchSize * outputDims6.d[1] * outputDims6.d[2] * outputDims6.d[3] * sizeof(float)));
    //output layer
    CHECK(cudaMalloc(&(mBuff[7]), mMaxSupportBatchSize * outputSingleAnchorByteNum * sizeof(float)));
    CHECK(cudaMalloc(&(mBuff[8]), mMaxSupportBatchSize * outputSingleAnchorByteNum * sizeof(float)));

    //malloc resize warpAffine space
    // Scratch buffer sized for the largest supported input image.
    mDeviceWarpAffine = nullptr;
    CHECK(cudaMalloc(&mDeviceWarpAffine, GPU_MAX_LIMIT_WIDTH * GPU_MAX_LIMIT_HEIGHT * GPU_MAX_LIMIT_CHANNEL * sizeof(unsigned char)));
    CHECK(cudaMemset(mDeviceWarpAffine, 0, GPU_MAX_LIMIT_WIDTH * GPU_MAX_LIMIT_HEIGHT * GPU_MAX_LIMIT_CHANNEL * sizeof(unsigned char)));

    //malloc yolo gpuDecode space
    // Decode output layout: one float counter + MAX_OBJECTS boxes of
    // NUM_BOX_ELEMENT floats each (host copy is pinned for async DMA).
    mOutputDevice = nullptr;
    mTransposeDevice = nullptr;
    mOutputHost = nullptr;
    CHECK(cudaMalloc(&mOutputDevice, sizeof(float) + MAX_OBJECTS * NUM_BOX_ELEMENT * sizeof(float)));
    CHECK(cudaMalloc(&mTransposeDevice, mOutputAnchorsSize * sizeof(float)));
    CHECK(cudaMallocHost(&mOutputHost, sizeof(float) + MAX_OBJECTS * NUM_BOX_ELEMENT * sizeof(float)));
    CHECK(cudaMemset(mOutputHost, 0, sizeof(float) + MAX_OBJECTS * NUM_BOX_ELEMENT * sizeof(float)));
    CHECK(cudaMemset(mTransposeDevice, 0, mOutputAnchorsSize * sizeof(float)));
    CHECK(cudaMemset(mOutputDevice, 0, sizeof(float) + MAX_OBJECTS * NUM_BOX_ELEMENT * sizeof(float)));

    // Pinned host + device storage for the 2x3 inverse affine matrix.
    mAffineMatrixD2iHost = nullptr;
    mAffineMatrixD2iDevice = nullptr;
    CHECK(cudaMallocHost(&mAffineMatrixD2iHost,sizeof(float) * 6));
    CHECK(cudaMalloc(&mAffineMatrixD2iDevice,sizeof(float) * 6));
    // Serialized engine bytes are no longer needed after deserialization.
    delete []mTrtModelStream;
    mTrtModelStream = nullptr;
}
212 |
213 | void YoloV9::getAffineMartrix(AffineMatrix &afmt,cv::Size &to,cv::Size &from)
214 | {
215 | float scale = std::min(to.width/(float)from.width,to.height/(float)from.height);
216 | afmt.i2d[0] = scale;
217 | afmt.i2d[1] = 0;
218 | afmt.i2d[2] = (-scale * from.width+to.width) * 0.5;
219 | afmt.i2d[3] = 0;
220 | afmt.i2d[4] = scale;
221 | afmt.i2d[5] = (-scale * from.height + to.height) * 0.5;
222 | cv::Mat cv_i2d(2,3,CV_32F,afmt.i2d);
223 | cv::Mat cv_d2i(2,3,CV_32F,afmt.d2i);
224 | cv::invertAffineTransform(cv_i2d,cv_d2i);
225 | memcpy(afmt.d2i,cv_d2i.ptr(0),sizeof(afmt.d2i));
226 | }
227 |
// Letterbox-preprocess every image of the batch and upload the per-image
// inverse affine matrix to the device.
// NOTE(review): the element type of batchImg was lost in this dump (template
// arguments stripped by extraction) -- confirm the real signature in Yolov9.h.
void YoloV9::imgPreProcess(std::vector &batchImg)
{
    for (size_t i = 0; i < batchImg.size(); i++)
    {
        AffineMatrix afmt{};
        // Affine transform from this image's original size to the network
        // input size (i2d), plus its inverse (d2i) for mapping boxes back.
        cv::Size to(mInputW, mInputH);
        cv::Size from(mImageSizeBatch[i].width, mImageSizeBatch[i].height);
        getAffineMartrix(afmt, to, from);
        // Stage d2i in pinned host memory, then copy it to the device
        // asynchronously on the inference stream.
        memcpy(mAffineMatrixD2iHost,afmt.d2i,sizeof(afmt.d2i));
        CHECK(cudaMemcpyAsync(mAffineMatrixD2iDevice, mAffineMatrixD2iHost, sizeof(afmt.d2i),cudaMemcpyHostToDevice, mStream));
        // CUDA kernel (src/Preprocess.cu): presumably performs the warp-affine
        // resize/normalize into the input buffer -- see the kernel for details.
        // NOTE(review): every iteration writes to mBuff[0] with no per-image
        // offset, and mAffineMatrixD2iDevice holds only one matrix at a time;
        // verify whether batch > 1 is actually supported by this path.
        preprocess_kernel_img(batchImg[i], mImageSizeBatch[i].width, mImageSizeBatch[i].height,
                              mBuff[0], mInputW, mInputH, mAffineMatrixD2iDevice, mStream);
    }
}
242 |
243 | void YoloV9::gpuDecode(float* anchorsProb, int batch, float confidence_threshold, float nms_threshold)
244 | {
245 | for (int i = 0; i < batch; i++)
246 | {
247 | float *predictDevice = anchorsProb + i * mOutputAnchorsSize;
248 | transpose_kernel_invoker(predictDevice, mOutputAnchorsNum, mOutputAnchorsDim, mTransposeDevice, mStream);
249 | CHECK(cudaMemset(mOutputDevice, 0, sizeof(float) + MAX_OBJECTS * NUM_BOX_ELEMENT * sizeof(float)));
250 | decode_kernel_invoker(
251 | mTransposeDevice, mOutputAnchorsNum,
252 | mOutputAnchorsDim - 4, confidence_threshold,
253 | nms_threshold, mAffineMatrixD2iDevice,
254 | mOutputDevice, MAX_OBJECTS,
255 | NUM_BOX_ELEMENT,mStream);
256 | }
257 | }
258 |
// Download decoded detections from the GPU and convert them into per-image
// host-side box lists.
// NOTE(review): the vector element types were lost in this dump (template
// arguments stripped by extraction) -- confirm the real types in Yolov9.h.
// NOTE(review): mOutputDevice holds a single image's results, yet it is copied
// once per batch index, so every image receives the same (last decoded) boxes;
// verify whether multi-batch output is intended to work here.
std::vector YoloV9::getDetResultToCPU(int batch)
{
    std::vector result;
    for (int b = 0; b < batch; b++)
    {
        std::vector boxResult;
        // Buffer layout: [0] = box count, then MAX_OBJECTS records of
        // NUM_BOX_ELEMENT floats each.
        // NOTE(review): cudaMemset on a cudaMallocHost pointer relies on
        // unified addressing; plain memset would be the conventional choice.
        CHECK(cudaMemset(mOutputHost, 0, sizeof(float) + MAX_OBJECTS * NUM_BOX_ELEMENT * sizeof(float)));
        CHECK(cudaMemcpyAsync(mOutputHost, mOutputDevice,
                              sizeof(float) + MAX_OBJECTS * NUM_BOX_ELEMENT * sizeof(float),
                              cudaMemcpyDeviceToHost, mStream));
        CHECK(cudaStreamSynchronize(mStream));
        // Clamp to capacity in case the kernel counted more boxes than it stored.
        int num_boxes = std::min((int)mOutputHost[0], MAX_OBJECTS);
        for(int i = 0; i < num_boxes; ++i)
        {
            float* ptr = mOutputHost + 1 + NUM_BOX_ELEMENT * i;
            // ptr[6] is the keep flag written by the NMS kernel; 0 means suppressed.
            int keep_flag = ptr[6];
            if(keep_flag)
            {
                // ptr[0..3]: box coordinates, ptr[4]: confidence, ptr[5]: class
                // id (presumed from usage -- confirm against src/Decode.cu).
                boxResult.emplace_back(ptr[0], ptr[1], ptr[2], ptr[3], ptr[4], (int)ptr[5], 0);
            }
        }
        result.push_back(boxResult);
    }
    return result;
}
284 |
285 | void YoloV9::modelInfer(nvinfer1::IExecutionContext& context, int batchSize)
286 | {
287 | const nvinfer1::ICudaEngine &engine = context.getEngine();
288 | nvinfer1::Dims inputDims = engine.getBindingDimensions(0);
289 | nvinfer1::Dims d = inputDims;
290 | d.d[0] = batchSize;
291 | if (!mContext->setBindingDimensions(0, d))
292 | {
293 | mLogger.logPrint(Severity::kERROR, __FUNCTION__ , __LINE__, "The input dimension of the model is incorrect");
294 | std::abort();
295 | }
296 | context.enqueueV2((void **)mBuff, mStream, nullptr);
297 | }
298 |
// End-to-end detection for one batch: preprocess -> inference -> GPU decode ->
// copy results to the host via the detResult out-parameter.
// NOTE(review): the vector element types were lost in this dump (template
// arguments stripped by extraction) -- confirm the real signature in Yolov9.h.
void YoloV9::doInfer(std::vector batchImg,
                     std::vector imgInfoVec,
                     std::vector &detResult)
{
    // One entry of imgInfoVec per input image; its size defines the batch.
    int batch = imgInfoVec.size();
    mImageSizeBatch = imgInfoVec;
    imgPreProcess(batchImg);
    modelInfer(*mContext, batch);
    // mBuff[7] is one of the two final output buffers allocated at init
    // (sized mOutputAnchorsNum * mOutputAnchorsDim floats per image).
    gpuDecode(mBuff[7], batch,mConfTreshold, mNMSTreshold);
    detResult = getDetResultToCPU(batch);
}
--------------------------------------------------------------------------------
/yolov9_trt.py:
--------------------------------------------------------------------------------
1 | import os
2 | import cv2
3 | import argparse
4 | import numpy as np
5 | import tensorrt as trt
6 | import pycuda.autoinit
7 | import pycuda.driver as cuda
8 | from python.AIResult import *
9 | from python.logging_system import Logger
10 | from python.tensorrt_base import TensorrtBase
11 | from python.draw_AI_results import draw_detect_results
12 | from python.decorators import time_cost, suppress_errors
13 |
# Command-line interface for the demo: where the configs live, which yaml file
# to read, and the directory of images to run detection on.
# Note: args are parsed at import time of this module.
parser = argparse.ArgumentParser("yolov9_demo")
parser.add_argument('--configs', type=str, default="configs", help="configs path")
parser.add_argument('--yaml_file', type=str, default="yolov9py.yaml", help="yaml file name")
parser.add_argument('--data', type=str, default="data", help="images data path")
args = parser.parse_args()
19 |
20 |
class Yolov9(TensorrtBase):
    """YOLOv9 TensorRT inference wrapper.

    Reads thresholds and engine/onnx paths from a yaml config, builds the
    engine if missing (via TensorrtBase), allocates host/device buffers, and
    exposes preprocess / do_infer / post_process for single-image detection.
    """
    def __init__(self, logger, config_path, config_file):
        super().__init__(logger)
        self.logger = logger
        # NOTE(review): the logger calls here are assert *messages* -- they only
        # run when the assert fails, and asserts vanish under `python -O`.
        assert self.read_parameters(config_path, config_file), self.logger.info("Read parameters failure!")
        assert self.create_engine_if_not_exit(), self.logger.error("create engine failure!")
        self.get_trt_model_stream()
    @time_cost
    @suppress_errors
    def preprocess(self, raw_bgr_image):
        """
        description: Convert BGR image to RGB,
        resize and pad it to target size, normalize to [0,1],
        transform to NCHW format.
        param:
            raw_bgr_image: np.ndarray, BGR image of shape (H, W, C)
        return:
            image: the processed image (NCHW float32, contiguous)
            image_raw: the original image (unmodified)
            h: original height
            w: original width
        """
        image_raw = raw_bgr_image
        h, w, c = image_raw.shape
        image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB)
        # Calculate width and height and paddings
        r_w = self.input_w / w
        r_h = self.input_h / h
        if r_h > r_w:
            # Width is the limiting side: scale by r_w, pad top/bottom.
            tw = self.input_w
            th = int(r_w * h)
            tx1 = tx2 = 0
            ty1 = int((self.input_h - th) / 2)
            ty2 = self.input_h - th - ty1
        else:
            # Height is the limiting side: scale by r_h, pad left/right.
            tw = int(r_h * w)
            th = self.input_h
            tx1 = int((self.input_w - tw) / 2)
            tx2 = self.input_w - tw - tx1
            ty1 = ty2 = 0
        # Resize the image with long side while maintaining ratio
        image = cv2.resize(image, (tw, th))
        # Pad the short side with (128,128,128)
        image = cv2.copyMakeBorder(image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, None, (128, 128, 128))
        image = image.astype(np.float32)
        # Normalize to [0,1]
        image /= 255.0
        # HWC to CHW format:
        image = np.transpose(image, [2, 0, 1])
        # CHW to NCHW format
        image = np.expand_dims(image, axis=0)
        # Convert the image to row-major order, also known as "C order":
        image = np.ascontiguousarray(image)
        return image, image_raw, h, w

    @time_cost
    def post_process(self, output, origin_h, origin_w):
        '''
        Post-process the output of YOLO model
        :param output: Output of model inference (flat array; reshaped to
            (output_dim, output_anchor_num) then transposed to anchor-major)
        :param origin_h: Image original height
        :param origin_w: Image original width
        :return: Algorithm detection results (list of DetResult)
        '''
        predict = np.transpose(np.reshape(output, (self.output_dim, self.output_anchor_num)))
        detect_results = list()
        boxes_list = list()
        class_ids = list()
        scores = list()
        # Row layout: [cx, cy, w, h, class scores...]; the best class score
        # decides whether an anchor survives the confidence filter.
        scores_array = np.max(predict[:, 4:], axis=1)
        filter_predict = predict[scores_array > self.conf_treshold, :]
        for predict_box in filter_predict:
            cx = predict_box[0]
            cy = predict_box[1]
            width = predict_box[2]
            height = predict_box[3]
            score = np.max(predict_box[4:])
            class_id = np.argmax(predict_box[4:])
            ratio_w = self.input_w / origin_w
            ratio_h = self.input_h / origin_h
            # Undo the letterbox transform applied in preprocess: subtract the
            # centering padding on the padded axis, then divide by the scale.
            if ratio_h > ratio_w:
                left = (cx - width / 2) / ratio_w
                top = (cy - height / 2 - (self.input_h - ratio_w * origin_h) / 2) / ratio_w
                right = (cx + width / 2) / ratio_w
                bottom = (cy + height / 2 - (self.input_h - ratio_w * origin_h) / 2) / ratio_w
            else:
                left = (cx - width / 2 - (self.input_w - ratio_h * origin_w) / 2) / ratio_h
                top = (cy - height / 2) / ratio_h
                right = (cx + width / 2 - (self.input_w - ratio_h * origin_w) / 2) / ratio_h
                bottom = (cy + height / 2) / ratio_h
            # Clamp to the image and convert to integer [x, y, w, h] for NMSBoxes.
            box_xywh = list(map(lambda x: int(x), [max(0, left), max(0, top), min(right - left, origin_w), min(bottom - top, origin_h)]))
            boxes_list.append(box_xywh)
            class_ids.append(class_id)
            scores.append(score)
        # Class-agnostic NMS over all surviving boxes.
        nms_result = cv2.dnn.NMSBoxes(boxes_list, scores, self.conf_treshold, self.nms_threshold)
        for i in range(len(nms_result)):
            idx = nms_result[i]
            class_id = class_ids[idx]
            score = scores[idx]
            box = boxes_list[idx]
            result = DetResult(score, box, class_id)
            detect_results.append(result)
        return detect_results

    @time_cost
    @suppress_errors
    def do_infer(self, img):
        # Full single-image pipeline: preprocess -> H2D copy -> TensorRT
        # inference -> D2H copy -> post-process. Logs the achieved FPS.
        start_time = cv2.getTickCount()
        # Do image preprocess
        self.ctx.push()
        input_image, image_raw, h, w = self.preprocess(img)
        # Copy input image to host buffer
        np.copyto(self.host_inputs[0], input_image.ravel())
        # Transfer input data to the GPU.
        cuda.memcpy_htod_async(self.cuda_inputs[0], self.host_inputs[0], self.stream)
        # Run inference.
        self.context.execute_async_v2(bindings=self.bindings, stream_handle=self.stream.handle)
        # Transfer predictions back from the GPU.
        # host_outputs[6] corresponds to engine binding 7 (the detection head
        # identified in get_trt_model_stream); the intermediate outputs stay
        # on the device.
        cuda.memcpy_dtoh_async(self.host_outputs[6], self.cuda_outputs[6], self.stream)
        # Synchronize the stream
        self.stream.synchronize()
        # Here we use the first row of output in that batch_size = 1
        output = self.host_outputs[6]
        # Do postprocess
        detect_results = self.post_process(output, image_raw.shape[0], image_raw.shape[1])
        self.ctx.pop()
        # print cost time
        end_time = cv2.getTickCount()
        fps = 1 / ((end_time - start_time) / cv2.getTickFrequency())
        self.logger.info("detect fps:{}".format(fps))
        return detect_results

    @suppress_errors
    def get_trt_model_stream(self):
        '''
        Obtain the data flow for Tensorrt model inference and initialize the model
        '''
        # A dedicated CUDA context is created here; do_infer pushes/pops it
        # around every call and destroy() pops it one final time.
        self.ctx = cuda.Device(0).make_context()
        stream = cuda.Stream()
        TRT_LOGGER = self.trt_logger
        runtime = trt.Runtime(TRT_LOGGER)

        # Deserialize the engine from file
        with open(self.engine_file, "rb") as f:
            engine = runtime.deserialize_cuda_engine(f.read())
        context = engine.create_execution_context()

        host_inputs = []
        cuda_inputs = []
        host_outputs = []
        cuda_outputs = []
        bindings = []

        for binding_index, binding in enumerate(engine):
            self.logger.info("bingding shape:{}".format(engine.get_binding_shape(binding)))
            size = trt.volume(engine.get_binding_shape(binding))
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            cuda_mem = cuda.mem_alloc(host_mem.nbytes)
            # Append the device buffer to device bindings.
            bindings.append(int(cuda_mem))
            # Append to the appropriate list.
            if binding_index == 0:
                # Binding 0 is the image input; remember its spatial size.
                self.input_w = engine.get_binding_shape(binding)[-1]
                self.input_h = engine.get_binding_shape(binding)[-2]
                host_inputs.append(host_mem)
                cuda_inputs.append(cuda_mem)
            elif binding_index == 7:
                # Binding 7 is the detection head read back in do_infer
                # (host_outputs index 6, since binding 0 is the input).
                self.output_anchor_num = engine.get_binding_shape(binding)[-1]
                self.output_dim = engine.get_binding_shape(binding)[-2]
                host_outputs.append(host_mem)
                cuda_outputs.append(cuda_mem)
            else:
                host_outputs.append(host_mem)
                cuda_outputs.append(cuda_mem)

        # Store
        self.stream = stream
        self.context = context
        self.engine = engine
        self.host_inputs = host_inputs
        self.cuda_inputs = cuda_inputs
        self.host_outputs = host_outputs
        self.cuda_outputs = cuda_outputs
        self.bindings = bindings

    def read_parameters(self, config_path, config_file):
        '''
        Read parameters from config file
        :param config_path: Profile Path
        :param config_file: profile name
        :return: Did it read successfully
        '''
        yaml_file = os.path.join(config_path, config_file)
        if os.path.exists(yaml_file):
            # OpenCV FileStorage parses the yaml; node names must match the
            # keys in configs/yolov9py.yaml.
            fs = cv2.FileStorage(yaml_file, cv2.FILE_STORAGE_READ)
            self.conf_treshold = fs.getNode('confTreshold').real()
            self.nms_threshold = fs.getNode('nmsThreshold').real()
            self.quantization_infer = fs.getNode("quantizationInfer").string()
            self.onnx_file = os.path.join(config_path, fs.getNode('onnxFile').string())
            self.engine_file = os.path.join(config_path, fs.getNode('engineFile').string())
        else:
            return False
        return True

    def destroy(self):
        # Pop and release the CUDA context created in get_trt_model_stream.
        self.ctx.pop()
        del self.ctx
        self.logger.info("yolov9 destroy")
231 |
232 |
if __name__ == "__main__":
    # Demo entry point: run YOLOv9 detection over every file in --data and
    # draw the results on each image.
    log = Logger()
    logger = log.get_log("yolov9.txt")
    yolov9 = Yolov9(logger, args.configs, args.yaml_file)
    try:
        image_root = args.data
        file_list = os.listdir(image_root)
        for image_file in file_list:
            image_path = os.path.join(image_root, image_file)
            # imdecode + np.fromfile (rather than cv2.imread) -- presumably so
            # non-ASCII paths load correctly; keep as-is.
            img = cv2.imdecode(np.fromfile(image_path, dtype=np.uint8), cv2.IMREAD_COLOR)
            if img is None:
                # Fix: skip unreadable / non-image files instead of passing
                # None into the pipeline.
                logger.error("skip unreadable image: {}".format(image_path))
                continue
            detect_results = yolov9.do_infer(img)
            draw_detect_results(img, detect_results)
    finally:
        # Fix: always release the CUDA context, even if inference raised.
        yolov9.destroy()
245 |
--------------------------------------------------------------------------------