├── CMakeLists.txt
├── README.md
├── common.hpp
├── gen_wts.py
├── logging.h
├── test.png
├── utils.h
├── yololayer.cu
├── yololayer.h
├── yolov5.cpp
├── yolov5_trt.py
├── yolov5s.engine
└── yolov5s.wts

/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | cmake_minimum_required(VERSION 2.6)
2 | 
3 | project(yolov5)
4 | 
5 | add_definitions(-std=c++11)
6 | 
7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
8 | set(CMAKE_CXX_STANDARD 11)
9 | set(CMAKE_BUILD_TYPE Debug)
10 | 
11 | find_package(CUDA REQUIRED)
12 | 
13 | set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-std=c++11;-g;-G;-gencode;arch=compute_53;code=sm_53)
14 | 
15 | include_directories(${PROJECT_SOURCE_DIR}/include)
16 | # include and link dirs of cuda and tensorrt, you need to adapt them if yours are different
17 | # cuda
18 | include_directories(/usr/local/cuda/include)
19 | link_directories(/usr/local/cuda/lib64)
20 | # tensorrt
21 | include_directories(/usr/include/aarch64-linux-gnu/)
22 | link_directories(/usr/lib/aarch64-linux-gnu/)
23 | 
24 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")
25 | 
26 | cuda_add_library(myplugins SHARED ${PROJECT_SOURCE_DIR}/yololayer.cu)
27 | target_link_libraries(myplugins nvinfer cudart)
28 | 
29 | find_package(OpenCV REQUIRED)
30 | include_directories(${OpenCV_INCLUDE_DIRS})
31 | 
32 | add_executable(yolov5 ${PROJECT_SOURCE_DIR}/yolov5.cpp)
33 | target_link_libraries(yolov5 nvinfer)
34 | target_link_libraries(yolov5 cudart)
35 | target_link_libraries(yolov5 myplugins)
36 | target_link_libraries(yolov5 ${OpenCV_LIBS})
37 | 
38 | add_definitions(-O2 -pthread)
39 | 

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # OpenJetson
2 | 
3 | http://openjetson.com/
4 | 
5 | # yolov5
6 | 
7 | The PyTorch implementation is [ultralytics/yolov5](https://github.com/ultralytics/yolov5).
8 | 
9 | Currently, we support yolov5 v1.0 (yolov5s only), v2.0, v3.0 and v3.1.
10 | 
11 | - For yolov5 v3.1, please visit [yolov5 release v3.1](https://github.com/ultralytics/yolov5/releases/tag/v3.1), and use the latest commit of this repo.
12 | - For yolov5 v3.0, please visit [yolov5 release v3.0](https://github.com/ultralytics/yolov5/releases/tag/v3.0), and use the latest commit of this repo.
13 | - For yolov5 v2.0, please visit [yolov5 release v2.0](https://github.com/ultralytics/yolov5/releases/tag/v2.0), and checkout commit ['5cfa444'](https://github.com/wang-xinyu/tensorrtx/commit/5cfa4445170eabaa54acd5ad7f469ef65a8763f1) of this repo.
14 | - For yolov5 v1.0, please visit [yolov5 release v1.0](https://github.com/ultralytics/yolov5/releases/tag/v1.0), and checkout commit ['f09aa3b'](https://github.com/wang-xinyu/tensorrtx/commit/f09aa3bbebf4d4d37b6d3b32a1d39e1f2678a07b) of this repo.
15 | 
16 | ## Config
17 | 
18 | - Choose the model s/m/l/x via the `NET` macro in yolov5.cpp
19 | - Input shape is defined in yololayer.h
20 | - Number of classes is defined in yololayer.h, **DO NOT FORGET TO ADAPT THIS if you are using your own model**
21 | - FP16/FP32 can be selected via the `USE_FP16` macro in yolov5.cpp
22 | - GPU id can be selected via the `DEVICE` macro in yolov5.cpp
23 | - NMS threshold in yolov5.cpp
24 | - BBox confidence threshold in yolov5.cpp
25 | - Batch size in yolov5.cpp
26 | 
27 | ## How to Run, yolov5s as example
28 | 
29 | ```
30 | 1. generate yolov5s.wts from PyTorch with yolov5s.pt, or download the .wts from the model zoo
31 | 
32 | git clone https://github.com/wang-xinyu/tensorrtx.git
33 | git clone https://github.com/ultralytics/yolov5.git
34 | // download its weights 'yolov5s.pt'
35 | // copy tensorrtx/yolov5/gen_wts.py into ultralytics/yolov5
36 | // ensure the input and output file names in gen_wts.py are yolov5s.pt and yolov5s.wts
37 | // go to ultralytics/yolov5
38 | python gen_wts.py
39 | // a file 'yolov5s.wts' will be generated.
40 | 
41 | 2. build tensorrtx/yolov5 and run
42 | 
43 | // put yolov5s.wts into tensorrtx/yolov5
44 | // go to tensorrtx/yolov5
45 | // ensure the macro NET in yolov5.cpp is s
46 | mkdir build
47 | cd build
48 | cmake ..
49 | make
50 | sudo ./yolov5 -s // serialize model to plan file, i.e. 'yolov5s.engine'
51 | sudo ./yolov5 -v // deserialize plan file and run inference with camera or video
52 | 
53 | ```
54 | ![demo](https://raw.githubusercontent.com/OpenJetson/tensorrt-yolov5/main/test.png)
55 | 

--------------------------------------------------------------------------------
/common.hpp:
--------------------------------------------------------------------------------
1 | #ifndef YOLOV5_COMMON_H_
2 | #define YOLOV5_COMMON_H_
3 | 
4 | #include <fstream>
5 | #include <map>
6 | #include <sstream>
7 | #include <vector>
8 | #include <opencv2/opencv.hpp>
9 | #include <dirent.h>
10 | #include "NvInfer.h"
11 | #include "yololayer.h"
12 | 
13 | #define CHECK(status) \
14 |     do\
15 |     {\
16 |         auto ret = (status);\
17 |         if (ret != 0)\
18 |         {\
19 |             std::cerr << "Cuda failure: " << ret << std::endl;\
20 |             abort();\
21 |         }\
22 |     } while (0)
23 | 
24 | using namespace nvinfer1;
25 | 
26 | cv::Mat preprocess_img(cv::Mat& img) {
27 |     int w, h, x, y;
28 |     float r_w = Yolo::INPUT_W / (img.cols*1.0);
29 |     float r_h = Yolo::INPUT_H / (img.rows*1.0);
30 |     if (r_h > r_w) {
31 |         w = Yolo::INPUT_W;
32 |         h = r_w * img.rows;
33 |         x = 0;
34 |         y = (Yolo::INPUT_H - h) / 2;
35 |     } else {
36 |         w = r_h * img.cols;
37 |         h = Yolo::INPUT_H;
38 |         x = (Yolo::INPUT_W - w) / 2;
39 |         y = 0;
40 |     }
41 |     cv::Mat re(h, w, CV_8UC3);
42 |     cv::resize(img, re, re.size(), 0, 0, cv::INTER_LINEAR);
43 |     cv::Mat out(Yolo::INPUT_H, Yolo::INPUT_W, CV_8UC3, cv::Scalar(128, 128, 128));
44 |     re.copyTo(out(cv::Rect(x, y, re.cols, re.rows)));
45 |     return out;
46 | }
47 | 
48 | cv::Rect get_rect(cv::Mat& img, float bbox[4]) {
49 |     int l, r, t, b;
50 |     float r_w = Yolo::INPUT_W / (img.cols * 1.0);
51 |     float r_h = Yolo::INPUT_H / (img.rows * 1.0);
52 |     if (r_h > r_w) {
53 |         l = bbox[0] - bbox[2] / 2.f;
54 |         r = bbox[0] + bbox[2] / 2.f;
55 |         t = bbox[1] - bbox[3] / 2.f - (Yolo::INPUT_H - r_w * img.rows) / 2;
56 |         b = bbox[1] + bbox[3] / 2.f - (Yolo::INPUT_H - r_w * img.rows) / 2;
57 |         l = l / r_w;
58 |         r = r / r_w;
59 |         t = t / r_w;
60 |         b = b / r_w;
61 |     } else {
62 |         l = bbox[0] - bbox[2] / 2.f - (Yolo::INPUT_W - r_h * img.cols) / 2;
63 |         r = bbox[0] + bbox[2] / 2.f - (Yolo::INPUT_W - r_h * img.cols) / 2;
64 |         t = bbox[1] - bbox[3] / 2.f;
65 |         b = bbox[1] + bbox[3] / 2.f;
66 |         l = l / r_h;
67 |         r = r / r_h;
68 |         t = t / r_h;
69 |         b = b / r_h;
70 |     }
71 |     return cv::Rect(l, t, r - l, b - t);
72 | }
73 | 
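// Note: get_rect above is the inverse of the letterboxing done in preprocess_img: it shifts the
// network-space center/width/height box by the padding offset and divides by the resize ratio
// (r_w or r_h, whichever side was scaled), yielding pixel coordinates in the original image.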
74 | float iou(float lbox[4], float rbox[4]) {
75 |     float interBox[] = {
76 |         (std::max)(lbox[0] - lbox[2] / 2.f , rbox[0] - rbox[2] / 2.f), //left
77 |         (std::min)(lbox[0] + lbox[2] / 2.f , rbox[0] + rbox[2] / 2.f), //right
78 |         (std::max)(lbox[1] - lbox[3] / 2.f , rbox[1] - rbox[3] / 2.f), //top
79 |         (std::min)(lbox[1] + lbox[3] / 2.f , rbox[1] + rbox[3] / 2.f), //bottom
80 |     };
81 | 
82 |     if (interBox[2] > interBox[3] || interBox[0] > interBox[1])
83 |         return 0.0f;
84 | 
85 |     float interBoxS = (interBox[1] - interBox[0])*(interBox[3] - interBox[2]);
86 |     return interBoxS / (lbox[2] * lbox[3] + rbox[2] * rbox[3] - interBoxS);
87 | }
88 | 
89 | bool cmp(const Yolo::Detection& a, const Yolo::Detection& b) {
90 |     return a.conf > b.conf;
91 | }
92 | 
93 | void nms(std::vector<Yolo::Detection>& res, float *output, float conf_thresh, float nms_thresh = 0.5) {
94 |     int det_size = sizeof(Yolo::Detection) / sizeof(float);
95 |     std::map<float, std::vector<Yolo::Detection>> m;
96 |     for (int i = 0; i < output[0] && i < Yolo::MAX_OUTPUT_BBOX_COUNT; i++) {
97 |         if (output[1 + det_size * i + 4] <= conf_thresh) continue;
98 |         Yolo::Detection det;
99 |         memcpy(&det, &output[1 + det_size * i], det_size * sizeof(float));
100 |         if (m.count(det.class_id) == 0) m.emplace(det.class_id, std::vector<Yolo::Detection>());
101 |         m[det.class_id].push_back(det);
102 |     }
103 |     for (auto it = m.begin(); it != m.end(); it++) {
104 |         //std::cout << it->second[0].class_id << " --- " << std::endl;
105 |         auto& dets = it->second;
106 |         std::sort(dets.begin(), dets.end(), cmp);
107 |         for (size_t m = 0; m < dets.size(); ++m) {
108 |             auto& item = dets[m];
109 |             res.push_back(item);
110 |             for (size_t n = m + 1; n < dets.size(); ++n) {
111 |                 if (iou(item.bbox, dets[n].bbox) > nms_thresh) {
112 |                     dets.erase(dets.begin() + n);
113 |                     --n;
114 |                 }
115 |             }
116 |         }
117 |     }
118 | }
119 | 
120 | // TensorRT weight files have a simple space delimited format:
121 | // [type] [size] <data x size in hex>
122 | std::map<std::string, Weights> loadWeights(const std::string file) {
123 |     std::cout << "Loading weights: " << file << std::endl;
124 |     std::map<std::string, Weights> weightMap;
125 | 
126 |     // Open weights file
127 |     std::ifstream input(file);
128 |     assert(input.is_open() && "Unable to load weight file. please check if the .wts file path is right!!!!!!");
129 | 
130 |     // Read number of weight blobs
131 |     int32_t count;
132 |     input >> count;
133 |     assert(count > 0 && "Invalid weight map file.");
134 | 
135 |     while (count--)
136 |     {
137 |         Weights wt{ DataType::kFLOAT, nullptr, 0 };
138 |         uint32_t size;
139 | 
140 |         // Read name and type of blob
141 |         std::string name;
142 |         input >> name >> std::dec >> size;
143 |         wt.type = DataType::kFLOAT;
144 | 
145 |         // Load blob
146 |         uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));
147 |         for (uint32_t x = 0, y = size; x < y; ++x)
148 |         {
149 |             input >> std::hex >> val[x];
150 |         }
151 |         wt.values = val;
152 | 
153 |         wt.count = size;
154 |         weightMap[name] = wt;
155 |     }
156 | 
157 |     return weightMap;
158 | }
159 | 
160 | IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, float eps) {
161 |     float *gamma = (float*)weightMap[lname + ".weight"].values;
162 |     float *beta = (float*)weightMap[lname + ".bias"].values;
163 |     float *mean = (float*)weightMap[lname + ".running_mean"].values;
164 |     float *var = (float*)weightMap[lname + ".running_var"].values;
165 |     int len = weightMap[lname + ".running_var"].count;
166 | 
167 |     float *scval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
168 |     for (int i = 0; i < len; i++) {
169 |         scval[i] = gamma[i] / sqrt(var[i] + eps);
170 |     }
171 |     Weights scale{ DataType::kFLOAT, scval, len };
172 | 
173 |     float *shval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
174 |     for (int i = 0; i < len; i++) {
175 |         shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps);
176 |     }
177 |     Weights shift{ DataType::kFLOAT, shval, len };
178 | 
179 |     float *pval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
180 |     for (int i = 0; i < len; i++) {
181 |         pval[i] = 1.0;
182 |     }
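    // The batchnorm is folded into one IScaleLayer computing (x * scale + shift) ^ power per
    // channel, with scale = gamma / sqrt(var + eps), shift = beta - mean * gamma / sqrt(var + eps)
    // and power = 1, which is exactly gamma * (x - mean) / sqrt(var + eps) + beta.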
183 |     Weights power{ DataType::kFLOAT, pval, len };
184 | 
185 |     weightMap[lname + ".scale"] = scale;
186 |     weightMap[lname + ".shift"] = shift;
187 |     weightMap[lname + ".power"] = power;
188 |     IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
189 |     assert(scale_1);
190 |     return scale_1;
191 | }
192 | 
193 | ILayer* convBlock(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int outch, int ksize, int s, int g, std::string lname) {
194 |     Weights emptywts{ DataType::kFLOAT, nullptr, 0 };
195 |     int p = ksize / 2;
196 |     IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, DimsHW{ ksize, ksize }, weightMap[lname + ".conv.weight"], emptywts);
197 |     assert(conv1);
198 |     conv1->setStrideNd(DimsHW{ s, s });
199 |     conv1->setPaddingNd(DimsHW{ p, p });
200 |     conv1->setNbGroups(g);
201 |     IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + ".bn", 1e-3);
202 | 
203 |     // hard_swish = x * hard_sigmoid
204 |     auto hsig = network->addActivation(*bn1->getOutput(0), ActivationType::kHARD_SIGMOID);
205 |     assert(hsig);
206 |     hsig->setAlpha(1.0 / 6.0);
207 |     hsig->setBeta(0.5);
208 |     auto ew = network->addElementWise(*bn1->getOutput(0), *hsig->getOutput(0), ElementWiseOperation::kPROD);
209 |     assert(ew);
210 |     return ew;
211 | }
212 | 
213 | ILayer* focus(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int inch, int outch, int ksize, std::string lname) {
214 |     ISliceLayer *s1 = network->addSlice(input, Dims3{ 0, 0, 0 }, Dims3{ inch, Yolo::INPUT_H / 2, Yolo::INPUT_W / 2 }, Dims3{ 1, 2, 2 });
215 |     ISliceLayer *s2 = network->addSlice(input, Dims3{ 0, 1, 0 }, Dims3{ inch, Yolo::INPUT_H / 2, Yolo::INPUT_W / 2 }, Dims3{ 1, 2, 2 });
216 |     ISliceLayer *s3 = network->addSlice(input, Dims3{ 0, 0, 1 }, Dims3{ inch, Yolo::INPUT_H / 2, Yolo::INPUT_W / 2 }, Dims3{ 1, 2, 2 });
217 |     ISliceLayer *s4 = network->addSlice(input, Dims3{ 0, 1, 1 }, Dims3{ inch, Yolo::INPUT_H / 2, Yolo::INPUT_W / 2 }, Dims3{ 1, 2, 2 });
218 |     ITensor* inputTensors[] = { s1->getOutput(0), s2->getOutput(0), s3->getOutput(0), s4->getOutput(0) };
219 |     auto cat = network->addConcatenation(inputTensors, 4);
220 |     auto conv = convBlock(network, weightMap, *cat->getOutput(0), outch, ksize, 1, 1, lname + ".conv");
221 |     return conv;
222 | }
223 | 
224 | ILayer* bottleneck(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int c1, int c2, bool shortcut, int g, float e, std::string lname) {
225 |     auto cv1 = convBlock(network, weightMap, input, (int)((float)c2 * e), 1, 1, 1, lname + ".cv1");
226 |     auto cv2 = convBlock(network, weightMap, *cv1->getOutput(0), c2, 3, 1, g, lname + ".cv2");
227 |     if (shortcut && c1 == c2) {
228 |         auto ew = network->addElementWise(input, *cv2->getOutput(0), ElementWiseOperation::kSUM);
229 |         return ew;
230 |     }
231 |     return cv2;
232 | }
233 | 
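// bottleneckCSP below builds two parallel paths: cv1 followed by n bottleneck blocks and a 1x1
// conv (cv3), and a bare 1x1 conv (cv2) straight from the input; the two are concatenated,
// batch-normalized, passed through LeakyReLU(0.1) and fused by the final 1x1 convBlock cv4.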
234 | ILayer* bottleneckCSP(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int c1, int c2, int n, bool shortcut, int g, float e, std::string lname) {
235 |     Weights emptywts{ DataType::kFLOAT, nullptr, 0 };
236 |     int c_ = (int)((float)c2 * e);
237 |     auto cv1 = convBlock(network, weightMap, input, c_, 1, 1, 1, lname + ".cv1");
238 |     auto cv2 = network->addConvolutionNd(input, c_, DimsHW{ 1, 1 }, weightMap[lname + ".cv2.weight"], emptywts);
239 |     ITensor *y1 = cv1->getOutput(0);
240 |     for (int i = 0; i < n; i++) {
241 |         auto b = bottleneck(network, weightMap, *y1, c_, c_, shortcut, g, 1.0, lname + ".m." + std::to_string(i));
242 |         y1 = b->getOutput(0);
243 |     }
244 |     auto cv3 = network->addConvolutionNd(*y1, c_, DimsHW{ 1, 1 }, weightMap[lname + ".cv3.weight"], emptywts);
245 | 
246 |     ITensor* inputTensors[] = { cv3->getOutput(0), cv2->getOutput(0) };
247 |     auto cat = network->addConcatenation(inputTensors, 2);
248 | 
249 |     IScaleLayer* bn = addBatchNorm2d(network, weightMap, *cat->getOutput(0), lname + ".bn", 1e-4);
250 |     auto lr = network->addActivation(*bn->getOutput(0), ActivationType::kLEAKY_RELU);
251 |     lr->setAlpha(0.1);
252 | 
253 |     auto cv4 = convBlock(network, weightMap, *lr->getOutput(0), c2, 1, 1, 1, lname + ".cv4");
254 |     return cv4;
255 | }
256 | 
257 | ILayer* SPP(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int c1, int c2, int k1, int k2, int k3, std::string lname) {
258 |     int c_ = c1 / 2;
259 |     auto cv1 = convBlock(network, weightMap, input, c_, 1, 1, 1, lname + ".cv1");
260 | 
261 |     auto pool1 = network->addPoolingNd(*cv1->getOutput(0), PoolingType::kMAX, DimsHW{ k1, k1 });
262 |     pool1->setPaddingNd(DimsHW{ k1 / 2, k1 / 2 });
263 |     pool1->setStrideNd(DimsHW{ 1, 1 });
264 |     auto pool2 = network->addPoolingNd(*cv1->getOutput(0), PoolingType::kMAX, DimsHW{ k2, k2 });
265 |     pool2->setPaddingNd(DimsHW{ k2 / 2, k2 / 2 });
266 |     pool2->setStrideNd(DimsHW{ 1, 1 });
267 |     auto pool3 = network->addPoolingNd(*cv1->getOutput(0), PoolingType::kMAX, DimsHW{ k3, k3 });
268 |     pool3->setPaddingNd(DimsHW{ k3 / 2, k3 / 2 });
269 |     pool3->setStrideNd(DimsHW{ 1, 1 });
270 | 
271 |     ITensor* inputTensors[] = { cv1->getOutput(0), pool1->getOutput(0), pool2->getOutput(0), pool3->getOutput(0) };
272 |     auto cat = network->addConcatenation(inputTensors, 4);
273 | 
274 |     auto cv2 = convBlock(network, weightMap, *cat->getOutput(0), c2, 1, 1, 1, lname + ".cv2");
275 |     return cv2;
276 | }
277 | 
278 | int read_files_in_dir(const char *p_dir_name, std::vector<std::string> &file_names) {
279 |     DIR *p_dir = opendir(p_dir_name);
280 |     if (p_dir == nullptr) {
281 |         return -1;
282 |     }
283 | 
284 |     struct dirent* p_file = nullptr;
285 |     while ((p_file = readdir(p_dir)) != nullptr) {
286 |         if (strcmp(p_file->d_name, ".") != 0 &&
287 |             strcmp(p_file->d_name, "..") != 0) {
288 |             //std::string cur_file_name(p_dir_name);
289 |             //cur_file_name += "/";
290 |             //cur_file_name += p_file->d_name;
291 |             std::string cur_file_name(p_file->d_name);
292 |             file_names.push_back(cur_file_name);
293 |         }
294 |     }
295 | 
296 |     closedir(p_dir);
297 |     return 0;
298 | }
299 | 
300 | std::vector<float> getAnchors(std::map<std::string, Weights>& weightMap)
301 | {
302 |     std::vector<float> anchors_yolo;
303 |     Weights Yolo_Anchors = weightMap["model.24.anchor_grid"];
304 |     assert(Yolo_Anchors.count == 18);
305 |     int each_yololayer_anchorsnum = Yolo_Anchors.count / 3;
306 |     const float* tempAnchors = (const float*)(Yolo_Anchors.values);
307 |     for (int i = 0; i < Yolo_Anchors.count; i++)
308 |     {
309 |         if (i < each_yololayer_anchorsnum)
310 |         {
311 |             anchors_yolo.push_back(const_cast<float*>(tempAnchors)[i]);
312 |         }
313 |         if ((i >= each_yololayer_anchorsnum) && (i < (2 * each_yololayer_anchorsnum)))
314 |         {
315 |             anchors_yolo.push_back(const_cast<float*>(tempAnchors)[i]);
316 |         }
317 |         if (i >= (2 * each_yololayer_anchorsnum))
318 |         {
319 |             anchors_yolo.push_back(const_cast<float*>(tempAnchors)[i]);
320 |         }
321 |     }
322 |     return anchors_yolo;
323 | }
324 | 
325 | IPluginV2Layer* addYoLoLayer(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, IConvolutionLayer* det0, IConvolutionLayer* det1, IConvolutionLayer* det2)
326 | {
327 |     auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1");
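    // The plugin is configured via 4 PluginFields, matching what YoloPluginCreator::createPlugin
    // in yololayer.cu parses: "netdata" = {CLASS_NUM, INPUT_W, INPUT_H, MAX_OUTPUT_BBOX_COUNT},
    // and "yolodata1".."yolodata3" = {grid_w, grid_h, 6 anchor values} for strides 8/16/32.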
328 |     std::vector<float> anchors_yolo = getAnchors(weightMap);
329 |     PluginField pluginMultidata[4];
330 |     int NetData[4];
331 |     NetData[0] = Yolo::CLASS_NUM;
332 |     NetData[1] = Yolo::INPUT_W;
333 |     NetData[2] = Yolo::INPUT_H;
334 |     NetData[3] = Yolo::MAX_OUTPUT_BBOX_COUNT;
335 |     pluginMultidata[0].data = NetData;
336 |     pluginMultidata[0].length = 4;
337 |     pluginMultidata[0].name = "netdata";
338 |     pluginMultidata[0].type = PluginFieldType::kFLOAT32;
339 |     int scale[3] = { 8, 16, 32 };
340 |     int plugindata[3][8];
341 |     std::string names[3];
342 |     for (int k = 1; k < 4; k++)
343 |     {
344 |         plugindata[k - 1][0] = Yolo::INPUT_W / scale[k - 1];
345 |         plugindata[k - 1][1] = Yolo::INPUT_H / scale[k - 1];
346 |         for (int i = 2; i < 8; i++)
347 |         {
348 |             plugindata[k - 1][i] = int(anchors_yolo[(k - 1) * 6 + i - 2]);
349 |         }
350 |         pluginMultidata[k].data = plugindata[k - 1];
351 |         pluginMultidata[k].length = 8;
352 |         names[k - 1] = "yolodata" + std::to_string(k);
353 |         pluginMultidata[k].name = names[k - 1].c_str();
354 |         pluginMultidata[k].type = PluginFieldType::kFLOAT32;
355 |     }
356 |     PluginFieldCollection pluginData;
357 |     pluginData.nbFields = 4;
358 |     pluginData.fields = pluginMultidata;
359 |     IPluginV2 *pluginObj = creator->createPlugin("yololayer", &pluginData);
360 |     ITensor* inputTensors_yolo[] = { det2->getOutput(0), det1->getOutput(0), det0->getOutput(0) };
361 |     auto yolo = network->addPluginV2(inputTensors_yolo, 3, *pluginObj);
362 |     return yolo;
363 | }
364 | #endif
365 | 

--------------------------------------------------------------------------------
/gen_wts.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import struct
3 | from utils.torch_utils import select_device
4 | 
5 | # Initialize
6 | device = select_device('cpu')
7 | # Load model
8 | model = torch.load('weights/yolov5s.pt', map_location=device)['model'].float()  # load to FP32
9 | model.to(device).eval()
10 | 
11 | f = open('yolov5s.wts', 'w')
12 | f.write('{}\n'.format(len(model.state_dict().keys())))
13 | for k, v in model.state_dict().items():
14 |     vr = v.reshape(-1).cpu().numpy()
15 |     f.write('{} {}'.format(k, len(vr)))
16 |     for vv in vr:
17 |         f.write(' ')
18 |         f.write(struct.pack('>f', float(vv)).hex())
19 |     f.write('\n')

--------------------------------------------------------------------------------
/logging.h:
--------------------------------------------------------------------------------
1 | /*
2 |  * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License");
5 |  * you may not use this file except in compliance with the License.
6 |  * You may obtain a copy of the License at
7 |  *
8 |  *     http://www.apache.org/licenses/LICENSE-2.0
9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | #ifndef TENSORRT_LOGGING_H
18 | #define TENSORRT_LOGGING_H
19 | 
20 | #include "NvInferRuntimeCommon.h"
21 | #include <cassert>
22 | #include <ctime>
23 | #include <iomanip>
24 | #include <iostream>
25 | #include <ostream>
26 | #include <sstream>
27 | #include <string>
28 | 
29 | using Severity = nvinfer1::ILogger::Severity;
30 | 
31 | class LogStreamConsumerBuffer : public std::stringbuf
32 | {
33 | public:
34 |     LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
35 |         : mOutput(stream)
36 |         , mPrefix(prefix)
37 |         , mShouldLog(shouldLog)
38 |     {
39 |     }
40 | 
41 |     LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
42 |         : mOutput(other.mOutput)
43 |     {
44 |     }
45 | 
46 |     ~LogStreamConsumerBuffer()
47 |     {
48 |         // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
49 |         // std::streambuf::pptr() gives a pointer to the current position of the output sequence
50 |         // if the pointer to the beginning is not equal to the pointer to the current position,
51 |         // call putOutput() to log the output to the stream
52 |         if (pbase() != pptr())
53 |         {
54 |             putOutput();
55 |         }
56 |     }
57 | 
58 |     // synchronizes the stream buffer and returns 0 on success
59 |     // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
60 |     // resetting the buffer and flushing the stream
61 |     virtual int sync()
62 |     {
63 |         putOutput();
64 |         return 0;
65 |     }
66 | 
67 |     void putOutput()
68 |     {
69 |         if (mShouldLog)
70 |         {
71 |             // prepend timestamp
72 |             std::time_t timestamp = std::time(nullptr);
73 |             tm* tm_local = std::localtime(&timestamp);
74 |             std::cout << "[";
75 |             std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/";
76 |             std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/";
77 |             std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-";
78 |             std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":";
79 |             std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":";
80 |             std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] ";
81 |             // std::stringbuf::str() gets the string contents of the buffer
82 |             // insert the buffer contents pre-appended by the appropriate prefix into the stream
83 |             mOutput << mPrefix << str();
84 |             // set the buffer to empty
85 |             str("");
86 |             // flush the stream
87 |             mOutput.flush();
88 |         }
89 |     }
90 | 
91 |     void setShouldLog(bool shouldLog)
92 |     {
93 |         mShouldLog = shouldLog;
94 |     }
95 | 
96 | private:
97 |     std::ostream& mOutput;
98 |     std::string mPrefix;
99 |     bool mShouldLog;
100 | };
101 | 
102 | //!
103 | //! \class LogStreamConsumerBase
104 | //! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer
105 | //!
106 | class LogStreamConsumerBase
107 | {
108 | public:
109 |     LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog)
110 |         : mBuffer(stream, prefix, shouldLog)
111 |     {
112 |     }
113 | 
114 | protected:
115 |     LogStreamConsumerBuffer mBuffer;
116 | };
117 | 
118 | //!
119 | //! \class LogStreamConsumer
120 | //! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages.
121 | //!  Order of base classes is LogStreamConsumerBase and then std::ostream.
122 | //!  This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field
123 | //!  in LogStreamConsumer and then the address of the buffer is passed to std::ostream.
124 | //!
This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream. 125 | //! Please do not change the order of the parent classes. 126 | //! 127 | class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream 128 | { 129 | public: 130 | //! \brief Creates a LogStreamConsumer which logs messages with level severity. 131 | //! Reportable severity determines if the messages are severe enough to be logged. 132 | LogStreamConsumer(Severity reportableSeverity, Severity severity) 133 | : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity) 134 | , std::ostream(&mBuffer) // links the stream buffer with the stream 135 | , mShouldLog(severity <= reportableSeverity) 136 | , mSeverity(severity) 137 | { 138 | } 139 | 140 | LogStreamConsumer(LogStreamConsumer&& other) 141 | : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog) 142 | , std::ostream(&mBuffer) // links the stream buffer with the stream 143 | , mShouldLog(other.mShouldLog) 144 | , mSeverity(other.mSeverity) 145 | { 146 | } 147 | 148 | void setReportableSeverity(Severity reportableSeverity) 149 | { 150 | mShouldLog = mSeverity <= reportableSeverity; 151 | mBuffer.setShouldLog(mShouldLog); 152 | } 153 | 154 | private: 155 | static std::ostream& severityOstream(Severity severity) 156 | { 157 | return severity >= Severity::kINFO ? std::cout : std::cerr; 158 | } 159 | 160 | static std::string severityPrefix(Severity severity) 161 | { 162 | switch (severity) 163 | { 164 | case Severity::kINTERNAL_ERROR: return "[F] "; 165 | case Severity::kERROR: return "[E] "; 166 | case Severity::kWARNING: return "[W] "; 167 | case Severity::kINFO: return "[I] "; 168 | case Severity::kVERBOSE: return "[V] "; 169 | default: assert(0); return ""; 170 | } 171 | } 172 | 173 | bool mShouldLog; 174 | Severity mSeverity; 175 | }; 176 | 177 | //! \class Logger 178 | //! 179 | //! \brief Class which manages logging of TensorRT tools and samples 180 | //! 181 | //! \details This class provides a common interface for TensorRT tools and samples to log information to the console, 182 | //! and supports logging two types of messages: 183 | //! 184 | //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) 185 | //! - Test pass/fail messages 186 | //! 187 | //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is 188 | //! that the logic for controlling the verbosity and formatting of sample output is centralized in one location. 189 | //! 190 | //! In the future, this class could be extended to support dumping test results to a file in some standard format 191 | //! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run). 192 | //! 193 | //! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger 194 | //! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT 195 | //! library and messages coming from the sample. 196 | //! 197 | //! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the 198 | //! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger 199 | //! object. 
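//
// Typical usage (see yolov5.cpp in this repo): a single static instance is passed to the
// TensorRT entry points, e.g.
//     static Logger gLogger;
//     IBuilder* builder = createInferBuilder(gLogger);
//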
200 | 201 | class Logger : public nvinfer1::ILogger 202 | { 203 | public: 204 | Logger(Severity severity = Severity::kWARNING) 205 | : mReportableSeverity(severity) 206 | { 207 | } 208 | 209 | //! 210 | //! \enum TestResult 211 | //! \brief Represents the state of a given test 212 | //! 213 | enum class TestResult 214 | { 215 | kRUNNING, //!< The test is running 216 | kPASSED, //!< The test passed 217 | kFAILED, //!< The test failed 218 | kWAIVED //!< The test was waived 219 | }; 220 | 221 | //! 222 | //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger 223 | //! \return The nvinfer1::ILogger associated with this Logger 224 | //! 225 | //! TODO Once all samples are updated to use this method to register the logger with TensorRT, 226 | //! we can eliminate the inheritance of Logger from ILogger 227 | //! 228 | nvinfer1::ILogger& getTRTLogger() 229 | { 230 | return *this; 231 | } 232 | 233 | //! 234 | //! \brief Implementation of the nvinfer1::ILogger::log() virtual method 235 | //! 236 | //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the 237 | //! inheritance from nvinfer1::ILogger 238 | //! 239 | void log(Severity severity, const char* msg) override 240 | { 241 | LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl; 242 | } 243 | 244 | //! 245 | //! \brief Method for controlling the verbosity of logging output 246 | //! 247 | //! \param severity The logger will only emit messages that have severity of this level or higher. 248 | //! 249 | void setReportableSeverity(Severity severity) 250 | { 251 | mReportableSeverity = severity; 252 | } 253 | 254 | //! 255 | //! \brief Opaque handle that holds logging information for a particular test 256 | //! 257 | //! This object is an opaque handle to information used by the Logger to print test results. 258 | //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used 259 | //! with Logger::reportTest{Start,End}(). 260 | //! 261 | class TestAtom 262 | { 263 | public: 264 | TestAtom(TestAtom&&) = default; 265 | 266 | private: 267 | friend class Logger; 268 | 269 | TestAtom(bool started, const std::string& name, const std::string& cmdline) 270 | : mStarted(started) 271 | , mName(name) 272 | , mCmdline(cmdline) 273 | { 274 | } 275 | 276 | bool mStarted; 277 | std::string mName; 278 | std::string mCmdline; 279 | }; 280 | 281 | //! 282 | //! \brief Define a test for logging 283 | //! 284 | //! \param[in] name The name of the test. This should be a string starting with 285 | //! "TensorRT" and containing dot-separated strings containing 286 | //! the characters [A-Za-z0-9_]. 287 | //! For example, "TensorRT.sample_googlenet" 288 | //! \param[in] cmdline The command line used to reproduce the test 289 | // 290 | //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). 291 | //! 292 | static TestAtom defineTest(const std::string& name, const std::string& cmdline) 293 | { 294 | return TestAtom(false, name, cmdline); 295 | } 296 | 297 | //! 298 | //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments 299 | //! as input 300 | //! 301 | //! \param[in] name The name of the test 302 | //! \param[in] argc The number of command-line arguments 303 | //! \param[in] argv The array of command-line arguments (given as C strings) 304 | //! 305 | //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). 
306 | static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) 307 | { 308 | auto cmdline = genCmdlineString(argc, argv); 309 | return defineTest(name, cmdline); 310 | } 311 | 312 | //! 313 | //! \brief Report that a test has started. 314 | //! 315 | //! \pre reportTestStart() has not been called yet for the given testAtom 316 | //! 317 | //! \param[in] testAtom The handle to the test that has started 318 | //! 319 | static void reportTestStart(TestAtom& testAtom) 320 | { 321 | reportTestResult(testAtom, TestResult::kRUNNING); 322 | assert(!testAtom.mStarted); 323 | testAtom.mStarted = true; 324 | } 325 | 326 | //! 327 | //! \brief Report that a test has ended. 328 | //! 329 | //! \pre reportTestStart() has been called for the given testAtom 330 | //! 331 | //! \param[in] testAtom The handle to the test that has ended 332 | //! \param[in] result The result of the test. Should be one of TestResult::kPASSED, 333 | //! TestResult::kFAILED, TestResult::kWAIVED 334 | //! 335 | static void reportTestEnd(const TestAtom& testAtom, TestResult result) 336 | { 337 | assert(result != TestResult::kRUNNING); 338 | assert(testAtom.mStarted); 339 | reportTestResult(testAtom, result); 340 | } 341 | 342 | static int reportPass(const TestAtom& testAtom) 343 | { 344 | reportTestEnd(testAtom, TestResult::kPASSED); 345 | return EXIT_SUCCESS; 346 | } 347 | 348 | static int reportFail(const TestAtom& testAtom) 349 | { 350 | reportTestEnd(testAtom, TestResult::kFAILED); 351 | return EXIT_FAILURE; 352 | } 353 | 354 | static int reportWaive(const TestAtom& testAtom) 355 | { 356 | reportTestEnd(testAtom, TestResult::kWAIVED); 357 | return EXIT_SUCCESS; 358 | } 359 | 360 | static int reportTest(const TestAtom& testAtom, bool pass) 361 | { 362 | return pass ? reportPass(testAtom) : reportFail(testAtom); 363 | } 364 | 365 | Severity getReportableSeverity() const 366 | { 367 | return mReportableSeverity; 368 | } 369 | 370 | private: 371 | //! 372 | //! \brief returns an appropriate string for prefixing a log message with the given severity 373 | //! 374 | static const char* severityPrefix(Severity severity) 375 | { 376 | switch (severity) 377 | { 378 | case Severity::kINTERNAL_ERROR: return "[F] "; 379 | case Severity::kERROR: return "[E] "; 380 | case Severity::kWARNING: return "[W] "; 381 | case Severity::kINFO: return "[I] "; 382 | case Severity::kVERBOSE: return "[V] "; 383 | default: assert(0); return ""; 384 | } 385 | } 386 | 387 | //! 388 | //! \brief returns an appropriate string for prefixing a test result message with the given result 389 | //! 390 | static const char* testResultString(TestResult result) 391 | { 392 | switch (result) 393 | { 394 | case TestResult::kRUNNING: return "RUNNING"; 395 | case TestResult::kPASSED: return "PASSED"; 396 | case TestResult::kFAILED: return "FAILED"; 397 | case TestResult::kWAIVED: return "WAIVED"; 398 | default: assert(0); return ""; 399 | } 400 | } 401 | 402 | //! 403 | //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity 404 | //! 405 | static std::ostream& severityOstream(Severity severity) 406 | { 407 | return severity >= Severity::kINFO ? std::cout : std::cerr; 408 | } 409 | 410 | //! 411 | //! \brief method that implements logging test results 412 | //! 
413 | static void reportTestResult(const TestAtom& testAtom, TestResult result) 414 | { 415 | severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " 416 | << testAtom.mCmdline << std::endl; 417 | } 418 | 419 | //! 420 | //! \brief generate a command line string from the given (argc, argv) values 421 | //! 422 | static std::string genCmdlineString(int argc, char const* const* argv) 423 | { 424 | std::stringstream ss; 425 | for (int i = 0; i < argc; i++) 426 | { 427 | if (i > 0) 428 | ss << " "; 429 | ss << argv[i]; 430 | } 431 | return ss.str(); 432 | } 433 | 434 | Severity mReportableSeverity; 435 | }; 436 | 437 | namespace 438 | { 439 | 440 | //! 441 | //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE 442 | //! 443 | //! Example usage: 444 | //! 445 | //! LOG_VERBOSE(logger) << "hello world" << std::endl; 446 | //! 447 | inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) 448 | { 449 | return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE); 450 | } 451 | 452 | //! 453 | //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO 454 | //! 455 | //! Example usage: 456 | //! 457 | //! LOG_INFO(logger) << "hello world" << std::endl; 458 | //! 459 | inline LogStreamConsumer LOG_INFO(const Logger& logger) 460 | { 461 | return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO); 462 | } 463 | 464 | //! 465 | //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING 466 | //! 467 | //! Example usage: 468 | //! 469 | //! LOG_WARN(logger) << "hello world" << std::endl; 470 | //! 471 | inline LogStreamConsumer LOG_WARN(const Logger& logger) 472 | { 473 | return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING); 474 | } 475 | 476 | //! 477 | //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR 478 | //! 479 | //! Example usage: 480 | //! 481 | //! LOG_ERROR(logger) << "hello world" << std::endl; 482 | //! 483 | inline LogStreamConsumer LOG_ERROR(const Logger& logger) 484 | { 485 | return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR); 486 | } 487 | 488 | //! 489 | //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR 490 | // ("fatal" severity) 491 | //! 492 | //! Example usage: 493 | //! 494 | //! LOG_FATAL(logger) << "hello world" << std::endl; 495 | //! 
496 | inline LogStreamConsumer LOG_FATAL(const Logger& logger)
497 | {
498 |     return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR);
499 | }
500 | 
501 | } // anonymous namespace
502 | 
503 | #endif // TENSORRT_LOGGING_H
504 | 

--------------------------------------------------------------------------------
/test.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenJetson/tensorrt-yolov5/7c79e973e40c3e41488021c3efcc2f4f3b283ed8/test.png

--------------------------------------------------------------------------------
/utils.h:
--------------------------------------------------------------------------------
1 | #ifndef __TRT_UTILS_H_
2 | #define __TRT_UTILS_H_
3 | 
4 | #include <algorithm>
5 | #include <cassert>
6 | #include <iostream>
7 | #include <string>
8 | #include <vector>
9 | #include <cuda_runtime_api.h>
10 | #include "NvInfer.h"
11 | 
12 | #ifndef CUDA_CHECK
13 | 
14 | #define CUDA_CHECK(callstr)\
15 |     {\
16 |         cudaError_t error_code = callstr;\
17 |         if (error_code != cudaSuccess) {\
18 |             std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__;\
19 |             assert(0);\
20 |         }\
21 |     }
22 | 
23 | #endif
24 | 
25 | namespace Tn
26 | {
27 |     class Profiler : public nvinfer1::IProfiler
28 |     {
29 |     public:
30 |         void printLayerTimes(int iterationsTimes)
31 |         {
32 |             float totalTime = 0;
33 |             for (size_t i = 0; i < mProfile.size(); i++)
34 |             {
35 |                 printf("%-40.40s %4.3fms\n", mProfile[i].first.c_str(), mProfile[i].second / iterationsTimes);
36 |                 totalTime += mProfile[i].second;
37 |             }
38 |             printf("Time over all layers: %4.3f\n", totalTime / iterationsTimes);
39 |         }
40 |     private:
41 |         typedef std::pair<std::string, float> Record;
42 |         std::vector<Record> mProfile;
43 | 
44 |         virtual void reportLayerTime(const char* layerName, float ms)
45 |         {
46 |             auto record = std::find_if(mProfile.begin(), mProfile.end(), [&](const Record& r){ return r.first == layerName; });
47 |             if (record == mProfile.end())
48 |                 mProfile.push_back(std::make_pair(layerName, ms));
49 |             else
50 |                 record->second += ms;
51 |         }
52 |     };
53 | 
54 |     //Logger for TensorRT info/warning/errors
55 |     class Logger : public nvinfer1::ILogger
56 |     {
57 |     public:
58 | 
59 |         Logger(): Logger(Severity::kWARNING) {}
60 | 
61 |         Logger(Severity severity): reportableSeverity(severity) {}
62 | 
63 |         void log(Severity severity, const char* msg) override
64 |         {
65 |             // suppress messages with severity enum value greater than the reportable
66 |             if (severity > reportableSeverity) return;
67 | 
68 |             switch (severity)
69 |             {
70 |             case Severity::kINTERNAL_ERROR: std::cerr << "INTERNAL_ERROR: "; break;
71 |             case Severity::kERROR: std::cerr << "ERROR: "; break;
72 |             case Severity::kWARNING: std::cerr << "WARNING: "; break;
73 |             case Severity::kINFO: std::cerr << "INFO: "; break;
74 |             default: std::cerr << "UNKNOWN: "; break;
75 |             }
76 |             std::cerr << msg << std::endl;
77 |         }
78 | 
79 |         Severity reportableSeverity{Severity::kWARNING};
80 |     };
81 | 
82 |     template <typename T>
83 |     void write(char*& buffer, const T& val)
84 |     {
85 |         *reinterpret_cast<T*>(buffer) = val;
86 |         buffer += sizeof(T);
87 |     }
88 | 
89 |     template <typename T>
90 |     void read(const char*& buffer, T& val)
91 |     {
92 |         val = *reinterpret_cast<const T*>(buffer);
93 |         buffer += sizeof(T);
94 |     }
95 | }
96 | 
97 | #endif

--------------------------------------------------------------------------------
/yololayer.cu:
--------------------------------------------------------------------------------
1 | #include <string.h>
2 | #include "yololayer.h"
3 | #include "utils.h"
4 | 
5 | using namespace Yolo;
6 | 
7 | namespace nvinfer1
8 | {
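    // YoloLayerPlugin decodes the three raw detection heads into Yolo::Detection records on the
    // GPU. Its serialized layout (see serialize()/getSerializationSize()) is: mClassCount,
    // mThreadCount, mKernelCount, mYoloV5NetWidth, mYoloV5NetHeight, mMaxOutObject, followed by
    // mKernelCount raw YoloKernel structs; the deserializing constructor reads the same layout.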
9 |     YoloLayerPlugin::YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut, const std::vector<Yolo::YoloKernel>& vYoloKernel)
10 |     {
11 |         mClassCount = classCount;
12 |         mYoloV5NetWidth = netWidth;
13 |         mYoloV5NetHeight = netHeight;
14 |         mMaxOutObject = maxOut;
15 |         mYoloKernel = vYoloKernel;
16 |         mKernelCount = vYoloKernel.size();
17 | 
18 |         CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*)));
19 |         size_t AnchorLen = sizeof(float) * CHECK_COUNT * 2;
20 |         for (int ii = 0; ii < mKernelCount; ii++)
21 |         {
22 |             CUDA_CHECK(cudaMalloc(&mAnchor[ii], AnchorLen));
23 |             const auto& yolo = mYoloKernel[ii];
24 |             CUDA_CHECK(cudaMemcpy(mAnchor[ii], yolo.anchors, AnchorLen, cudaMemcpyHostToDevice));
25 |         }
26 |     }
27 |     YoloLayerPlugin::~YoloLayerPlugin()
28 |     {
29 |         for (int ii = 0; ii < mKernelCount; ii++)
30 |         {
31 |             CUDA_CHECK(cudaFree(mAnchor[ii]));
32 |         }
33 |         CUDA_CHECK(cudaFreeHost(mAnchor));
34 |     }
35 | 
36 |     // create the plugin at runtime from a byte stream
37 |     YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length)
38 |     {
39 |         using namespace Tn;
40 |         const char *d = reinterpret_cast<const char *>(data), *a = d;
41 |         read(d, mClassCount);
42 |         read(d, mThreadCount);
43 |         read(d, mKernelCount);
44 |         read(d, mYoloV5NetWidth);
45 |         read(d, mYoloV5NetHeight);
46 |         read(d, mMaxOutObject);
47 |         mYoloKernel.resize(mKernelCount);
48 |         auto kernelSize = mKernelCount * sizeof(YoloKernel);
49 |         memcpy(mYoloKernel.data(), d, kernelSize);
50 |         d += kernelSize;
51 |         CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*)));
52 |         size_t AnchorLen = sizeof(float) * CHECK_COUNT * 2;
53 |         for (int ii = 0; ii < mKernelCount; ii++)
54 |         {
55 |             CUDA_CHECK(cudaMalloc(&mAnchor[ii], AnchorLen));
56 |             const auto& yolo = mYoloKernel[ii];
57 |             CUDA_CHECK(cudaMemcpy(mAnchor[ii], yolo.anchors, AnchorLen, cudaMemcpyHostToDevice));
58 |         }
59 |         assert(d == a + length);
60 |     }
61 | 
62 |     void YoloLayerPlugin::serialize(void* buffer) const
63 |     {
64 |         using namespace Tn;
65 |         char* d = static_cast<char*>(buffer), *a = d;
66 |         write(d, mClassCount);
67 |         write(d, mThreadCount);
68 |         write(d, mKernelCount);
69 |         write(d, mYoloV5NetWidth);
70 |         write(d, mYoloV5NetHeight);
71 |         write(d, mMaxOutObject);
72 |         auto kernelSize = mKernelCount * sizeof(YoloKernel);
73 |         memcpy(d, mYoloKernel.data(), kernelSize);
74 |         d += kernelSize;
75 | 
76 |         assert(d == a + getSerializationSize());
77 |     }
78 | 
79 |     size_t YoloLayerPlugin::getSerializationSize() const
80 |     {
81 |         return sizeof(mClassCount) + sizeof(mThreadCount) + sizeof(mKernelCount) + sizeof(Yolo::YoloKernel) * mYoloKernel.size() + sizeof(mYoloV5NetWidth) + sizeof(mYoloV5NetHeight) + sizeof(mMaxOutObject);
82 |     }
83 | 
84 |     int YoloLayerPlugin::initialize()
85 |     {
86 |         return 0;
87 |     }
88 | 
89 |     Dims YoloLayerPlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims)
90 |     {
91 |         //output the result to channel
92 |         int totalsize = mMaxOutObject * sizeof(Detection) / sizeof(float);
93 | 
94 |         return Dims3(totalsize + 1, 1, 1);
95 |     }
96 | 
97 |     // Set plugin namespace
98 |     void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace)
99 |     {
100 |         mPluginNamespace = pluginNamespace;
101 |     }
102 | 
103 |     const char* YoloLayerPlugin::getPluginNamespace() const
104 |     {
105 |         return mPluginNamespace;
106 |     }
107 | 
108 |     // Return the DataType of the plugin output at the requested index
109 |     DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const
110 |     {
111 |         return DataType::kFLOAT;
112 |     }
113 | 
114 |     // Return true if output tensor is broadcast across a
batch. 115 | bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const 116 | { 117 | return false; 118 | } 119 | 120 | // Return true if plugin can use input that is broadcast across batch without replication. 121 | bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const 122 | { 123 | return false; 124 | } 125 | 126 | void YoloLayerPlugin::configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) 127 | { 128 | } 129 | 130 | // Attach the plugin object to an execution context and grant the plugin the access to some context resource. 131 | void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) 132 | { 133 | } 134 | 135 | // Detach the plugin object from its execution context. 136 | void YoloLayerPlugin::detachFromContext() {} 137 | 138 | const char* YoloLayerPlugin::getPluginType() const 139 | { 140 | return "YoloLayer_TRT"; 141 | } 142 | 143 | const char* YoloLayerPlugin::getPluginVersion() const 144 | { 145 | return "1"; 146 | } 147 | 148 | void YoloLayerPlugin::destroy() 149 | { 150 | delete this; 151 | } 152 | 153 | // Clone the plugin 154 | IPluginV2IOExt* YoloLayerPlugin::clone() const 155 | { 156 | YoloLayerPlugin* p = new YoloLayerPlugin(mClassCount, mYoloV5NetWidth, mYoloV5NetHeight, mMaxOutObject, mYoloKernel); 157 | p->setPluginNamespace(mPluginNamespace); 158 | return p; 159 | } 160 | 161 | __device__ float Logist(float data) { return 1.0f / (1.0f + expf(-data)); }; 162 | 163 | __global__ void CalDetection(const float *input, float *output, int noElements, 164 | const int netwidth, const int netheight, int maxoutobject, int yoloWidth, int yoloHeight, const float anchors[CHECK_COUNT * 2], int classes, int outputElem) 165 | { 166 | 167 | int idx = threadIdx.x + blockDim.x * blockIdx.x; 168 | if (idx >= noElements) return; 169 | 170 | int total_grid = yoloWidth * yoloHeight; 171 | int bnIdx = idx / total_grid; 172 | idx = idx - total_grid * bnIdx; 173 | int info_len_i = 5 + classes; 174 | const float* curInput = input + bnIdx * (info_len_i * total_grid * CHECK_COUNT); 175 | 176 | for (int k = 0; k < 3; ++k) { 177 | float box_prob = Logist(curInput[idx + k * info_len_i * total_grid + 4 * total_grid]); 178 | if (box_prob < IGNORE_THRESH) continue; 179 | int class_id = 0; 180 | float max_cls_prob = 0.0; 181 | for (int i = 5; i < info_len_i; ++i) { 182 | float p = Logist(curInput[idx + k * info_len_i * total_grid + i * total_grid]); 183 | if (p > max_cls_prob) { 184 | max_cls_prob = p; 185 | class_id = i - 5; 186 | } 187 | } 188 | float *res_count = output + bnIdx * outputElem; 189 | int count = (int)atomicAdd(res_count, 1); 190 | if (count >= maxoutobject) return; 191 | char* data = (char *)res_count + sizeof(float) + count * sizeof(Detection); 192 | Detection* det = (Detection*)(data); 193 | 194 | int row = idx / yoloWidth; 195 | int col = idx % yoloWidth; 196 | 197 | //Location 198 | // pytorch: 199 | // y = x[i].sigmoid() 200 | // y[..., 0:2] = (y[..., 0:2] * 2. 
- 0.5 + self.grid[i].to(x[i].device)) * self.stride[i] # xy
201 |             // y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i] # wh
202 |             // X: (sigmoid(tx) + cx)/FeaturemapW * netwidth
203 |             det->bbox[0] = (col - 0.5f + 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 0 * total_grid])) * netwidth / yoloWidth;
204 |             det->bbox[1] = (row - 0.5f + 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 1 * total_grid])) * netheight / yoloHeight;
205 | 
206 |             // W: (Pw * e^tw) / FeaturemapW * netwidth
207 |             // v5: https://github.com/ultralytics/yolov5/issues/471
208 |             det->bbox[2] = 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 2 * total_grid]);
209 |             det->bbox[2] = det->bbox[2] * det->bbox[2] * anchors[2 * k];
210 |             det->bbox[3] = 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 3 * total_grid]);
211 |             det->bbox[3] = det->bbox[3] * det->bbox[3] * anchors[2 * k + 1];
212 |             det->conf = box_prob * max_cls_prob;
213 |             det->class_id = class_id;
214 |         }
215 |     }
216 | 
217 |     void YoloLayerPlugin::forwardGpu(const float *const * inputs, float* output, cudaStream_t stream, int batchSize)
218 |     {
219 |         int outputElem = 1 + mMaxOutObject * sizeof(Detection) / sizeof(float);
220 |         for (int idx = 0; idx < batchSize; ++idx) {
221 |             CUDA_CHECK(cudaMemset(output + idx * outputElem, 0, sizeof(float)));
222 |         }
223 |         int numElem = 0;
224 |         for (unsigned int i = 0; i < mYoloKernel.size(); ++i)
225 |         {
226 |             const auto& yolo = mYoloKernel[i];
227 |             numElem = yolo.width * yolo.height * batchSize;
228 |             if (numElem < mThreadCount)
229 |                 mThreadCount = numElem;
230 | 
231 |             //printf("Net: %d %d \n", mYoloV5NetWidth, mYoloV5NetHeight);
232 |             // launch on the stream handed in by enqueue()
233 |             CalDetection<<<(yolo.width * yolo.height * batchSize + mThreadCount - 1) / mThreadCount, mThreadCount, 0, stream>>>
234 |                 (inputs[i], output, numElem, mYoloV5NetWidth, mYoloV5NetHeight, mMaxOutObject, yolo.width, yolo.height, (float *)mAnchor[i], mClassCount, outputElem);
235 |         }
236 |     }
237 | 
238 |     int YoloLayerPlugin::enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream)
239 |     {
240 |         forwardGpu((const float *const *)inputs, (float*)outputs[0], stream, batchSize);
241 |         return 0;
242 |     }
243 | 
244 |     PluginFieldCollection YoloPluginCreator::mFC{};
245 |     std::vector<PluginField> YoloPluginCreator::mPluginAttributes;
246 | 
247 |     YoloPluginCreator::YoloPluginCreator()
248 |     {
249 |         mPluginAttributes.clear();
250 | 
251 |         mFC.nbFields = mPluginAttributes.size();
252 |         mFC.fields = mPluginAttributes.data();
253 |     }
254 | 
255 |     const char* YoloPluginCreator::getPluginName() const
256 |     {
257 |         return "YoloLayer_TRT";
258 |     }
259 | 
260 |     const char* YoloPluginCreator::getPluginVersion() const
261 |     {
262 |         return "1";
263 |     }
264 | 
265 |     const PluginFieldCollection* YoloPluginCreator::getFieldNames()
266 |     {
267 |         return &mFC;
268 |     }
269 | 
270 |     IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc)
271 |     {
272 |         int class_count = 80;
273 |         int input_w = 416;
274 |         int input_h = 416;
275 |         int max_output_object_count = 1000;
276 |         std::vector<YoloKernel> yolo_kernels(3);
277 | 
278 |         const PluginField* fields = fc->fields;
279 |         for (int i = 0; i < fc->nbFields; i++) {
280 |             if (strcmp(fields[i].name, "netdata") == 0) {
281 |                 assert(fields[i].type == PluginFieldType::kFLOAT32);
282 |                 int *tmp = (int*)(fields[i].data);
283 |                 class_count = tmp[0];
284 |                 input_w = tmp[1];
285 |                 input_h = tmp[2];
286 |                 max_output_object_count = tmp[3];
287 |             } else if (strstr(fields[i].name, "yolodata") != NULL)
{
288 |                 assert(fields[i].type == PluginFieldType::kFLOAT32);
289 |                 int *tmp = (int*)(fields[i].data);
290 |                 YoloKernel kernel;
291 |                 kernel.width = tmp[0];
292 |                 kernel.height = tmp[1];
293 |                 for (int j = 0; j < fields[i].length - 2; j++) {
294 |                     kernel.anchors[j] = tmp[j + 2];
295 |                 }
296 |                 yolo_kernels[2 - (fields[i].name[8] - '1')] = kernel;
297 |             }
298 |         }
299 |         YoloLayerPlugin* obj = new YoloLayerPlugin(class_count, input_w, input_h, max_output_object_count, yolo_kernels);
300 |         obj->setPluginNamespace(mNamespace.c_str());
301 |         return obj;
302 |     }
303 | 
304 |     IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength)
305 |     {
306 |         // This object will be deleted when the network is destroyed, which will
307 |         // call YoloLayerPlugin::destroy()
308 |         YoloLayerPlugin* obj = new YoloLayerPlugin(serialData, serialLength);
309 |         obj->setPluginNamespace(mNamespace.c_str());
310 |         return obj;
311 |     }
312 | }
313 | 

--------------------------------------------------------------------------------
/yololayer.h:
--------------------------------------------------------------------------------
1 | #ifndef _YOLO_LAYER_H
2 | #define _YOLO_LAYER_H
3 | 
4 | #include <vector>
5 | #include <string>
6 | #include "NvInfer.h"
7 | 
8 | namespace Yolo
9 | {
10 |     static constexpr int CHECK_COUNT = 3;
11 |     static constexpr float IGNORE_THRESH = 0.1f;
12 |     struct YoloKernel
13 |     {
14 |         int width;
15 |         int height;
16 |         float anchors[CHECK_COUNT * 2];
17 |     };
18 |     static constexpr int MAX_OUTPUT_BBOX_COUNT = 1000;
19 |     static constexpr int CLASS_NUM = 80;
20 |     static constexpr int INPUT_H = 608;
21 |     static constexpr int INPUT_W = 608;
22 | 
23 |     static constexpr int LOCATIONS = 4;
24 |     struct alignas(float) Detection {
25 |         //center_x center_y w h
26 |         float bbox[LOCATIONS];
27 |         float conf;  // bbox_conf * cls_conf
28 |         float class_id;
29 |     };
30 | }
31 | 
32 | namespace nvinfer1
33 | {
34 |     class YoloLayerPlugin : public IPluginV2IOExt
35 |     {
36 |     public:
37 |         YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut, const std::vector<Yolo::YoloKernel>& vYoloKernel);
38 |         YoloLayerPlugin(const void* data, size_t length);
39 |         ~YoloLayerPlugin();
40 | 
41 |         int getNbOutputs() const override
42 |         {
43 |             return 1;
44 |         }
45 | 
46 |         Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override;
47 | 
48 |         int initialize() override;
49 | 
50 |         virtual void terminate() override {}
51 | 
52 |         virtual size_t getWorkspaceSize(int maxBatchSize) const override { return 0; }
53 | 
54 |         virtual int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override;
55 | 
56 |         virtual size_t getSerializationSize() const override;
57 | 
58 |         virtual void serialize(void* buffer) const override;
59 | 
60 |         bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const override {
61 |             return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT;
62 |         }
63 | 
64 |         const char* getPluginType() const override;
65 | 
66 |         const char* getPluginVersion() const override;
67 | 
68 |         void destroy() override;
69 | 
70 |         IPluginV2IOExt* clone() const override;
71 | 
72 |         void setPluginNamespace(const char* pluginNamespace) override;
73 | 
74 |         const char* getPluginNamespace() const override;
75 | 
76 |         DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const override;
77 | 
78 |         bool isOutputBroadcastAcrossBatch(int outputIndex, const bool*
inputIsBroadcasted, int nbInputs) const override; 79 | 80 | bool canBroadcastInputAcrossBatch(int inputIndex) const override; 81 | 82 | void attachToContext( 83 | cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) override; 84 | 85 | void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) override; 86 | 87 | void detachFromContext() override; 88 | 89 | private: 90 | void forwardGpu(const float *const * inputs, float * output, cudaStream_t stream, int batchSize = 1); 91 | int mThreadCount = 256; 92 | const char* mPluginNamespace; 93 | int mKernelCount; 94 | int mClassCount; 95 | int mYoloV5NetWidth; 96 | int mYoloV5NetHeight; 97 | int mMaxOutObject; 98 | std::vector mYoloKernel; 99 | void** mAnchor; 100 | }; 101 | 102 | class YoloPluginCreator : public IPluginCreator 103 | { 104 | public: 105 | YoloPluginCreator(); 106 | 107 | ~YoloPluginCreator() override = default; 108 | 109 | const char* getPluginName() const override; 110 | 111 | const char* getPluginVersion() const override; 112 | 113 | const PluginFieldCollection* getFieldNames() override; 114 | 115 | IPluginV2IOExt* createPlugin(const char* name, const PluginFieldCollection* fc) override; 116 | 117 | IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) override; 118 | 119 | void setPluginNamespace(const char* libNamespace) override 120 | { 121 | mNamespace = libNamespace; 122 | } 123 | 124 | const char* getPluginNamespace() const override 125 | { 126 | return mNamespace.c_str(); 127 | } 128 | 129 | private: 130 | std::string mNamespace; 131 | static PluginFieldCollection mFC; 132 | static std::vector mPluginAttributes; 133 | }; 134 | REGISTER_TENSORRT_PLUGIN(YoloPluginCreator); 135 | }; 136 | 137 | #endif 138 | -------------------------------------------------------------------------------- /yolov5.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "cuda_runtime_api.h" 4 | #include "logging.h" 5 | #include "common.hpp" 6 | 7 | #define USE_FP16 // comment out this if want to use FP32 8 | #define DEVICE 0 // GPU id 9 | #define NMS_THRESH 0.4 10 | #define CONF_THRESH 0.5 11 | #define BATCH_SIZE 1 12 | 13 | #define NET s // s m l x 14 | #define NETSTRUCT(str) createEngine_##str 15 | #define CREATENET(net) NETSTRUCT(net) 16 | #define STR1(x) #x 17 | #define STR2(x) STR1(x) 18 | 19 | // stuff we know about the network and the input/output blobs 20 | static const int INPUT_H = Yolo::INPUT_H; 21 | static const int INPUT_W = Yolo::INPUT_W; 22 | static const int CLASS_NUM = Yolo::CLASS_NUM; 23 | static const int OUTPUT_SIZE = Yolo::MAX_OUTPUT_BBOX_COUNT * sizeof(Yolo::Detection) / sizeof(float) + 1; // we assume the yololayer outputs no more than MAX_OUTPUT_BBOX_COUNT boxes that conf >= 0.1 24 | const char* INPUT_BLOB_NAME = "data"; 25 | const char* OUTPUT_BLOB_NAME = "prob"; 26 | static Logger gLogger; 27 | 28 | char *coco_classes[] = {"person","bicycle","car","motorcycle","airplane","bus","train","truck","boat","traffic light","fire hydrant","stop sign","parking meter","bench","bird","cat","dog","horse","sheep","cow","elephant","bear","zebra","giraffe","backpack","umbrella","handbag","tie","suitcase","frisbee","skis","snowboard","sports ball","kite","baseball bat","baseball glove","skateboard","surfboard", "tennis racket","bottle","wine glass","cup","fork","knife","spoon","bowl","banana","apple","sandwich","orange","broccoli","carrot","hot 
28 | char *coco_classes[] = {"person","bicycle","car","motorcycle","airplane","bus","train","truck","boat","traffic light","fire hydrant","stop sign","parking meter","bench","bird","cat","dog","horse","sheep","cow","elephant","bear","zebra","giraffe","backpack","umbrella","handbag","tie","suitcase","frisbee","skis","snowboard","sports ball","kite","baseball bat","baseball glove","skateboard","surfboard","tennis racket","bottle","wine glass","cup","fork","knife","spoon","bowl","banana","apple","sandwich","orange","broccoli","carrot","hot dog","pizza","donut","cake","chair","couch","potted plant","bed","dining table","toilet","tv","laptop","mouse","remote","keyboard","cell phone","microwave","oven","toaster","sink","refrigerator","book","clock","vase","scissors","teddy bear","hair drier","toothbrush"};
29 |
30 | // Create the engine using only the API and not any parser.
31 | ICudaEngine* createEngine_s(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) {
32 |     INetworkDefinition* network = builder->createNetworkV2(0U);
33 |
34 |     // Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME
35 |     ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{ 3, INPUT_H, INPUT_W });
36 |     assert(data);
37 |
38 |     std::map<std::string, Weights> weightMap = loadWeights("../yolov5s.wts");
39 |     Weights emptywts{ DataType::kFLOAT, nullptr, 0 };
40 |
41 |     // yolov5 backbone
42 |     auto focus0 = focus(network, weightMap, *data, 3, 32, 3, "model.0");
43 |     auto conv1 = convBlock(network, weightMap, *focus0->getOutput(0), 64, 3, 2, 1, "model.1");
44 |     auto bottleneck_CSP2 = bottleneckCSP(network, weightMap, *conv1->getOutput(0), 64, 64, 1, true, 1, 0.5, "model.2");
45 |     auto conv3 = convBlock(network, weightMap, *bottleneck_CSP2->getOutput(0), 128, 3, 2, 1, "model.3");
46 |     auto bottleneck_csp4 = bottleneckCSP(network, weightMap, *conv3->getOutput(0), 128, 128, 3, true, 1, 0.5, "model.4");
47 |     auto conv5 = convBlock(network, weightMap, *bottleneck_csp4->getOutput(0), 256, 3, 2, 1, "model.5");
48 |     auto bottleneck_csp6 = bottleneckCSP(network, weightMap, *conv5->getOutput(0), 256, 256, 3, true, 1, 0.5, "model.6");
49 |     auto conv7 = convBlock(network, weightMap, *bottleneck_csp6->getOutput(0), 512, 3, 2, 1, "model.7");
50 |     auto spp8 = SPP(network, weightMap, *conv7->getOutput(0), 512, 512, 5, 9, 13, "model.8");
51 |
52 |     // yolov5 head
53 |     auto bottleneck_csp9 = bottleneckCSP(network, weightMap, *spp8->getOutput(0), 512, 512, 1, false, 1, 0.5, "model.9");
54 |     auto conv10 = convBlock(network, weightMap, *bottleneck_csp9->getOutput(0), 256, 1, 1, 1, "model.10");
55 |
56 |     float *deval = reinterpret_cast<float*>(malloc(sizeof(float) * 256 * 2 * 2));
57 |     for (int i = 0; i < 256 * 2 * 2; i++) {
58 |         deval[i] = 1.0;
59 |     }
60 |     Weights deconvwts11{ DataType::kFLOAT, deval, 256 * 2 * 2 };
61 |     IDeconvolutionLayer* deconv11 = network->addDeconvolutionNd(*conv10->getOutput(0), 256, DimsHW{ 2, 2 }, deconvwts11, emptywts);
62 |     deconv11->setStrideNd(DimsHW{ 2, 2 });
63 |     deconv11->setNbGroups(256);
64 |     weightMap["deconv11"] = deconvwts11;
65 |
66 |     ITensor* inputTensors12[] = { deconv11->getOutput(0), bottleneck_csp6->getOutput(0) };
67 |     auto cat12 = network->addConcatenation(inputTensors12, 2);
68 |     auto bottleneck_csp13 = bottleneckCSP(network, weightMap, *cat12->getOutput(0), 512, 256, 1, false, 1, 0.5, "model.13");
69 |     auto conv14 = convBlock(network, weightMap, *bottleneck_csp13->getOutput(0), 128, 1, 1, 1, "model.14");
70 |
71 |     Weights deconvwts15{ DataType::kFLOAT, deval, 128 * 2 * 2 };
72 |     IDeconvolutionLayer* deconv15 = network->addDeconvolutionNd(*conv14->getOutput(0), 128, DimsHW{ 2, 2 }, deconvwts15, emptywts);
73 |     deconv15->setStrideNd(DimsHW{ 2, 2 });
74 |     deconv15->setNbGroups(128);
75 |
76 |     ITensor* inputTensors16[] = { deconv15->getOutput(0), bottleneck_csp4->getOutput(0) };
77 |     auto cat16 = network->addConcatenation(inputTensors16, 2);
78 |     auto bottleneck_csp17 = bottleneckCSP(network, weightMap, *cat16->getOutput(0), 256, 128, 1, false, 1, 0.5, "model.17");
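    // The three det heads below are plain 1x1 convolutions with 3 * (Yolo::CLASS_NUM + 5)
    // output channels: 3 anchors per grid cell, and per anchor 4 box values (x, y, w, h),
    // 1 objectness score, and CLASS_NUM class scores. The raw maps are decoded into
    // Detection structs by the YoloLayer plugin added at the end of the network.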
79 |     IConvolutionLayer* det0 = network->addConvolutionNd(*bottleneck_csp17->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.0.weight"], weightMap["model.24.m.0.bias"]);
80 |
81 |     auto conv18 = convBlock(network, weightMap, *bottleneck_csp17->getOutput(0), 128, 3, 2, 1, "model.18");
82 |     ITensor* inputTensors19[] = { conv18->getOutput(0), conv14->getOutput(0) };
83 |     auto cat19 = network->addConcatenation(inputTensors19, 2);
84 |     auto bottleneck_csp20 = bottleneckCSP(network, weightMap, *cat19->getOutput(0), 256, 256, 1, false, 1, 0.5, "model.20");
85 |     IConvolutionLayer* det1 = network->addConvolutionNd(*bottleneck_csp20->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.1.weight"], weightMap["model.24.m.1.bias"]);
86 |
87 |     auto conv21 = convBlock(network, weightMap, *bottleneck_csp20->getOutput(0), 256, 3, 2, 1, "model.21");
88 |     ITensor* inputTensors22[] = { conv21->getOutput(0), conv10->getOutput(0) };
89 |     auto cat22 = network->addConcatenation(inputTensors22, 2);
90 |     auto bottleneck_csp23 = bottleneckCSP(network, weightMap, *cat22->getOutput(0), 512, 512, 1, false, 1, 0.5, "model.23");
91 |     IConvolutionLayer* det2 = network->addConvolutionNd(*bottleneck_csp23->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.2.weight"], weightMap["model.24.m.2.bias"]);
92 |
93 |     auto yolo = addYoLoLayer(network, weightMap, det0, det1, det2);
94 |     yolo->getOutput(0)->setName(OUTPUT_BLOB_NAME);
95 |     network->markOutput(*yolo->getOutput(0));
96 |
97 |     // Build engine
98 |     builder->setMaxBatchSize(maxBatchSize);
99 |     config->setMaxWorkspaceSize(16 * (1 << 20));  // 16MB
100 | #ifdef USE_FP16
101 |     config->setFlag(BuilderFlag::kFP16);
102 | #endif
103 |     std::cout << "Building engine, please wait for a while..." << std::endl;
104 |     ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
105 |     std::cout << "Build engine successfully!" << std::endl;
106 |
107 |     // Don't need the network any more
108 |     network->destroy();
109 |
110 |     // Release host memory
111 |     for (auto& mem : weightMap)
112 |     {
113 |         free((void*)(mem.second.values));
114 |
115 |     }
116 |     return engine;
117 | }
118 |
119 | ICudaEngine* createEngine_m(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) {
120 |     INetworkDefinition* network = builder->createNetworkV2(0U);
121 |
122 |     // Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME
123 |     ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{ 3, INPUT_H, INPUT_W });
124 |     assert(data);
125 |
126 |     std::map<std::string, Weights> weightMap = loadWeights("../yolov5m.wts");
127 |     Weights emptywts{ DataType::kFLOAT, nullptr, 0 };
128 |
129 |     /* ------ yolov5 backbone------ */
130 |     auto focus0 = focus(network, weightMap, *data, 3, 48, 3, "model.0");
131 |     auto conv1 = convBlock(network, weightMap, *focus0->getOutput(0), 96, 3, 2, 1, "model.1");
132 |     auto bottleneck_CSP2 = bottleneckCSP(network, weightMap, *conv1->getOutput(0), 96, 96, 2, true, 1, 0.5, "model.2");
133 |     auto conv3 = convBlock(network, weightMap, *bottleneck_CSP2->getOutput(0), 192, 3, 2, 1, "model.3");
134 |     auto bottleneck_csp4 = bottleneckCSP(network, weightMap, *conv3->getOutput(0), 192, 192, 6, true, 1, 0.5, "model.4");
135 |     auto conv5 = convBlock(network, weightMap, *bottleneck_csp4->getOutput(0), 384, 3, 2, 1, "model.5");
136 |     auto bottleneck_csp6 = bottleneckCSP(network, weightMap, *conv5->getOutput(0), 384, 384, 6, true, 1, 0.5, "model.6");
137 |     auto conv7 = convBlock(network, weightMap, *bottleneck_csp6->getOutput(0), 768, 3, 2, 1, "model.7");
138 |     auto spp8 = SPP(network, weightMap, *conv7->getOutput(0), 768, 768, 5, 9, 13, "model.8");
139 |     /* ------ yolov5 head ------ */
140 |     auto bottleneck_csp9 = bottleneckCSP(network, weightMap, *spp8->getOutput(0), 768, 768, 2, false, 1, 0.5, "model.9");
141 |     auto conv10 = convBlock(network, weightMap, *bottleneck_csp9->getOutput(0), 384, 1, 1, 1, "model.10");
142 |
143 |     float *deval = reinterpret_cast<float*>(malloc(sizeof(float) * 384 * 2 * 2));
144 |     for (int i = 0; i < 384 * 2 * 2; i++) {
145 |         deval[i] = 1.0;
146 |     }
147 |     Weights deconvwts11{ DataType::kFLOAT, deval, 384 * 2 * 2 };
148 |     IDeconvolutionLayer* deconv11 = network->addDeconvolutionNd(*conv10->getOutput(0), 384, DimsHW{ 2, 2 }, deconvwts11, emptywts);
149 |     deconv11->setStrideNd(DimsHW{ 2, 2 });
150 |     deconv11->setNbGroups(384);
151 |     weightMap["deconv11"] = deconvwts11;
152 |     ITensor* inputTensors12[] = { deconv11->getOutput(0), bottleneck_csp6->getOutput(0) };
153 |     auto cat12 = network->addConcatenation(inputTensors12, 2);
154 |
155 |     auto bottleneck_csp13 = bottleneckCSP(network, weightMap, *cat12->getOutput(0), 768, 384, 2, false, 1, 0.5, "model.13");
156 |
157 |     auto conv14 = convBlock(network, weightMap, *bottleneck_csp13->getOutput(0), 192, 1, 1, 1, "model.14");
158 |
159 |     Weights deconvwts15{ DataType::kFLOAT, deval, 192 * 2 * 2 };
160 |     IDeconvolutionLayer* deconv15 = network->addDeconvolutionNd(*conv14->getOutput(0), 192, DimsHW{ 2, 2 }, deconvwts15, emptywts);
161 |     deconv15->setStrideNd(DimsHW{ 2, 2 });
162 |     deconv15->setNbGroups(192);
163 |
164 |     ITensor* inputTensors16[] = { deconv15->getOutput(0), bottleneck_csp4->getOutput(0) };
165 |     auto cat16 = network->addConcatenation(inputTensors16, 2);
166 |     auto bottleneck_csp17 = bottleneckCSP(network, weightMap, *cat16->getOutput(0), 384, 192, 2, false, 1, 0.5, "model.17");
167 |
168 |     //yolo layer 0
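    // Note: the grouped IDeconvolutionLayer blocks used in every createEngine_* head
    // (groups == channels, 2x2 all-ones kernels, stride 2) effectively reproduce the
    // nn.Upsample(scale_factor=2, mode='nearest') layers of the PyTorch model: with no
    // kernel overlap, each input pixel is simply copied into a 2x2 output block.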
169 |     IConvolutionLayer* det0 = network->addConvolutionNd(*bottleneck_csp17->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.0.weight"], weightMap["model.24.m.0.bias"]);
170 |     auto conv18 = convBlock(network, weightMap, *bottleneck_csp17->getOutput(0), 192, 3, 2, 1, "model.18");
171 |     ITensor* inputTensors19[] = { conv18->getOutput(0), conv14->getOutput(0) };
172 |     auto cat19 = network->addConcatenation(inputTensors19, 2);
173 |     auto bottleneck_csp20 = bottleneckCSP(network, weightMap, *cat19->getOutput(0), 384, 384, 2, false, 1, 0.5, "model.20");
174 |
175 |     //yolo layer 1
176 |     IConvolutionLayer* det1 = network->addConvolutionNd(*bottleneck_csp20->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.1.weight"], weightMap["model.24.m.1.bias"]);
177 |     auto conv21 = convBlock(network, weightMap, *bottleneck_csp20->getOutput(0), 384, 3, 2, 1, "model.21");
178 |     ITensor* inputTensors22[] = { conv21->getOutput(0), conv10->getOutput(0) };
179 |     auto cat22 = network->addConcatenation(inputTensors22, 2);
180 |     auto bottleneck_csp23 = bottleneckCSP(network, weightMap, *cat22->getOutput(0), 768, 768, 2, false, 1, 0.5, "model.23");
181 |     // yolo layer 2
182 |     IConvolutionLayer* det2 = network->addConvolutionNd(*bottleneck_csp23->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.2.weight"], weightMap["model.24.m.2.bias"]);
183 |
184 |     auto yolo = addYoLoLayer(network, weightMap, det0, det1, det2);
185 |     yolo->getOutput(0)->setName(OUTPUT_BLOB_NAME);
186 |     network->markOutput(*yolo->getOutput(0));
187 |
188 |     // Build engine
189 |     builder->setMaxBatchSize(maxBatchSize);
190 |     config->setMaxWorkspaceSize(16 * (1 << 20));  // 16MB
191 | #ifdef USE_FP16
192 |     config->setFlag(BuilderFlag::kFP16);
193 | #endif
194 |     std::cout << "Building engine, please wait for a while..." << std::endl;
195 |     ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
196 |     std::cout << "Build engine successfully!" << std::endl;
197 |
198 |     // Don't need the network any more
199 |     network->destroy();
200 |
201 |     // Release host memory
202 |     for (auto& mem : weightMap)
203 |     {
204 |         free((void*)(mem.second.values));
205 |     }
206 |
207 |     return engine;
208 | }
209 |
210 | ICudaEngine* createEngine_l(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) {
211 |     INetworkDefinition* network = builder->createNetworkV2(0U);
212 |
213 |     // Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME
214 |     ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{ 3, INPUT_H, INPUT_W });
215 |     assert(data);
216 |
217 |     std::map<std::string, Weights> weightMap = loadWeights("../yolov5l.wts");
218 |     Weights emptywts{ DataType::kFLOAT, nullptr, 0 };
219 |
220 |     /* ------ yolov5 backbone------ */
221 |     auto focus0 = focus(network, weightMap, *data, 3, 64, 3, "model.0");
222 |     auto conv1 = convBlock(network, weightMap, *focus0->getOutput(0), 128, 3, 2, 1, "model.1");
223 |     auto bottleneck_CSP2 = bottleneckCSP(network, weightMap, *conv1->getOutput(0), 128, 128, 3, true, 1, 0.5, "model.2");
224 |     auto conv3 = convBlock(network, weightMap, *bottleneck_CSP2->getOutput(0), 256, 3, 2, 1, "model.3");
225 |     auto bottleneck_csp4 = bottleneckCSP(network, weightMap, *conv3->getOutput(0), 256, 256, 9, true, 1, 0.5, "model.4");
226 |     auto conv5 = convBlock(network, weightMap, *bottleneck_csp4->getOutput(0), 512, 3, 2, 1, "model.5");
227 |     auto bottleneck_csp6 = bottleneckCSP(network, weightMap, *conv5->getOutput(0), 512, 512, 9, true, 1, 0.5, "model.6");
228 |     auto conv7 = convBlock(network, weightMap, *bottleneck_csp6->getOutput(0), 1024, 3, 2, 1, "model.7");
229 |     auto spp8 = SPP(network, weightMap, *conv7->getOutput(0), 1024, 1024, 5, 9, 13, "model.8");
230 |
231 |     /* ------ yolov5 head ------ */
232 |     auto bottleneck_csp9 = bottleneckCSP(network, weightMap, *spp8->getOutput(0), 1024, 1024, 3, false, 1, 0.5, "model.9");
233 |     auto conv10 = convBlock(network, weightMap, *bottleneck_csp9->getOutput(0), 512, 1, 1, 1, "model.10");
234 |
235 |     float *deval = reinterpret_cast<float*>(malloc(sizeof(float) * 512 * 2 * 2));
236 |     for (int i = 0; i < 512 * 2 * 2; i++) {
237 |         deval[i] = 1.0;
238 |     }
239 |     Weights deconvwts11{ DataType::kFLOAT, deval, 512 * 2 * 2 };
240 |     IDeconvolutionLayer* deconv11 = network->addDeconvolutionNd(*conv10->getOutput(0), 512, DimsHW{ 2, 2 }, deconvwts11, emptywts);
241 |     deconv11->setStrideNd(DimsHW{ 2, 2 });
242 |     deconv11->setNbGroups(512);
243 |     weightMap["deconv11"] = deconvwts11;
244 |
245 |     ITensor* inputTensors12[] = { deconv11->getOutput(0), bottleneck_csp6->getOutput(0) };
246 |     auto cat12 = network->addConcatenation(inputTensors12, 2);
247 |     auto bottleneck_csp13 = bottleneckCSP(network, weightMap, *cat12->getOutput(0), 1024, 512, 3, false, 1, 0.5, "model.13");
248 |     auto conv14 = convBlock(network, weightMap, *bottleneck_csp13->getOutput(0), 256, 1, 1, 1, "model.14");
249 |
250 |     Weights deconvwts15{ DataType::kFLOAT, deval, 256 * 2 * 2 };
251 |     IDeconvolutionLayer* deconv15 = network->addDeconvolutionNd(*conv14->getOutput(0), 256, DimsHW{ 2, 2 }, deconvwts15, emptywts);
252 |     deconv15->setStrideNd(DimsHW{ 2, 2 });
253 |     deconv15->setNbGroups(256);
254 |     ITensor* inputTensors16[] = { deconv15->getOutput(0), bottleneck_csp4->getOutput(0) };
255 |     auto cat16 = network->addConcatenation(inputTensors16, 2);
256 |
257 |     auto bottleneck_csp17 = bottleneckCSP(network, weightMap, *cat16->getOutput(0), 512, 256, 3, false, 1, 0.5, "model.17");
258 |
259 |     // yolo layer 0
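    // det0, det1 and det2 predict on feature maps at 1/8, 1/16 and 1/32 of the input
    // resolution (76x76, 38x38 and 19x19 for a 608x608 input), so small, medium and
    // large objects are each handled at an appropriate scale.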
260 |     IConvolutionLayer* det0 = network->addConvolutionNd(*bottleneck_csp17->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.0.weight"], weightMap["model.24.m.0.bias"]);
261 |     auto conv18 = convBlock(network, weightMap, *bottleneck_csp17->getOutput(0), 256, 3, 2, 1, "model.18");
262 |     ITensor* inputTensors19[] = { conv18->getOutput(0), conv14->getOutput(0) };
263 |     auto cat19 = network->addConcatenation(inputTensors19, 2);
264 |     auto bottleneck_csp20 = bottleneckCSP(network, weightMap, *cat19->getOutput(0), 512, 512, 3, false, 1, 0.5, "model.20");
265 |     //yolo layer 1
266 |     IConvolutionLayer* det1 = network->addConvolutionNd(*bottleneck_csp20->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.1.weight"], weightMap["model.24.m.1.bias"]);
267 |     auto conv21 = convBlock(network, weightMap, *bottleneck_csp20->getOutput(0), 512, 3, 2, 1, "model.21");
268 |     ITensor* inputTensors22[] = { conv21->getOutput(0), conv10->getOutput(0) };
269 |     auto cat22 = network->addConcatenation(inputTensors22, 2);
270 |     auto bottleneck_csp23 = bottleneckCSP(network, weightMap, *cat22->getOutput(0), 1024, 1024, 3, false, 1, 0.5, "model.23");
271 |     IConvolutionLayer* det2 = network->addConvolutionNd(*bottleneck_csp23->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.2.weight"], weightMap["model.24.m.2.bias"]);
272 |
273 |     auto yolo = addYoLoLayer(network, weightMap, det0, det1, det2);
274 |     yolo->getOutput(0)->setName(OUTPUT_BLOB_NAME);
275 |     network->markOutput(*yolo->getOutput(0));
276 |
277 |     // Build engine
278 |     builder->setMaxBatchSize(maxBatchSize);
279 |     config->setMaxWorkspaceSize(16 * (1 << 20));  // 16MB
280 | #ifdef USE_FP16
281 |     config->setFlag(BuilderFlag::kFP16);
282 | #endif
283 |     std::cout << "Building engine, please wait for a while..." << std::endl;
284 |     ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
285 |     std::cout << "Build engine successfully!" << std::endl;
286 |
287 |     // Don't need the network any more
288 |     network->destroy();
289 |
290 |     // Release host memory
291 |     for (auto& mem : weightMap)
292 |     {
293 |         free((void*)(mem.second.values));
294 |     }
295 |
296 |     return engine;
297 | }
298 |
299 | ICudaEngine* createEngine_x(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) {
300 |     INetworkDefinition* network = builder->createNetworkV2(0U);
301 |
302 |     // Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME
303 |     ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{ 3, INPUT_H, INPUT_W });
304 |     assert(data);
305 |
306 |     std::map<std::string, Weights> weightMap = loadWeights("../yolov5x.wts");
307 |     Weights emptywts{ DataType::kFLOAT, nullptr, 0 };
308 |
309 |     /* ------ yolov5 backbone------ */
310 |     auto focus0 = focus(network, weightMap, *data, 3, 80, 3, "model.0");
311 |     auto conv1 = convBlock(network, weightMap, *focus0->getOutput(0), 160, 3, 2, 1, "model.1");
312 |     auto bottleneck_CSP2 = bottleneckCSP(network, weightMap, *conv1->getOutput(0), 160, 160, 4, true, 1, 0.5, "model.2");
313 |     auto conv3 = convBlock(network, weightMap, *bottleneck_CSP2->getOutput(0), 320, 3, 2, 1, "model.3");
314 |     auto bottleneck_csp4 = bottleneckCSP(network, weightMap, *conv3->getOutput(0), 320, 320, 12, true, 1, 0.5, "model.4");
315 |     auto conv5 = convBlock(network, weightMap, *bottleneck_csp4->getOutput(0), 640, 3, 2, 1, "model.5");
316 |     auto bottleneck_csp6 = bottleneckCSP(network, weightMap, *conv5->getOutput(0), 640, 640, 12, true, 1, 0.5, "model.6");
317 |     auto conv7 = convBlock(network, weightMap, *bottleneck_csp6->getOutput(0), 1280, 3, 2, 1, "model.7");
318 |     auto spp8 = SPP(network, weightMap, *conv7->getOutput(0), 1280, 1280, 5, 9, 13, "model.8");
319 |
320 |     /* ------- yolov5 head ------- */
321 |     auto bottleneck_csp9 = bottleneckCSP(network, weightMap, *spp8->getOutput(0), 1280, 1280, 4, false, 1, 0.5, "model.9");
322 |     auto conv10 = convBlock(network, weightMap, *bottleneck_csp9->getOutput(0), 640, 1, 1, 1, "model.10");
323 |
324 |     float *deval = reinterpret_cast<float*>(malloc(sizeof(float) * 640 * 2 * 2));
325 |     for (int i = 0; i < 640 * 2 * 2; i++) {
326 |         deval[i] = 1.0;
327 |     }
328 |     Weights deconvwts11{ DataType::kFLOAT, deval, 640 * 2 * 2 };
329 |     IDeconvolutionLayer* deconv11 = network->addDeconvolutionNd(*conv10->getOutput(0), 640, DimsHW{ 2, 2 }, deconvwts11, emptywts);
330 |     deconv11->setStrideNd(DimsHW{ 2, 2 });
331 |     deconv11->setNbGroups(640);
332 |     weightMap["deconv11"] = deconvwts11;
333 |
334 |     ITensor* inputTensors12[] = { deconv11->getOutput(0), bottleneck_csp6->getOutput(0) };
335 |     auto cat12 = network->addConcatenation(inputTensors12, 2);
336 |
337 |     auto bottleneck_csp13 = bottleneckCSP(network, weightMap, *cat12->getOutput(0), 1280, 640, 4, false, 1, 0.5, "model.13");
338 |     auto conv14 = convBlock(network, weightMap, *bottleneck_csp13->getOutput(0), 320, 1, 1, 1, "model.14");
339 |
340 |     Weights deconvwts15{ DataType::kFLOAT, deval, 320 * 2 * 2 };
341 |     IDeconvolutionLayer* deconv15 = network->addDeconvolutionNd(*conv14->getOutput(0), 320, DimsHW{ 2, 2 }, deconvwts15, emptywts);
342 |     deconv15->setStrideNd(DimsHW{ 2, 2 });
343 |     deconv15->setNbGroups(320);
344 |     ITensor* inputTensors16[] = { deconv15->getOutput(0), bottleneck_csp4->getOutput(0) };
345 |     auto cat16 = network->addConcatenation(inputTensors16, 2);
346 |
347 |     auto bottleneck_csp17 = bottleneckCSP(network, weightMap, *cat16->getOutput(0), 640, 320, 4, false, 1, 0.5, "model.17");
348 |
349 |     // yolo layer 0
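    // createEngine_s/m/l/x share the same layer graph and differ only in width and
    // depth: the base channel count grows 32 -> 48 -> 64 -> 80 and the bottleneckCSP
    // repeat counts grow 1/3 -> 2/6 -> 3/9 -> 4/12, mirroring the width_multiple and
    // depth_multiple settings of the corresponding yolov5 model configs.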
350 |     IConvolutionLayer* det0 = network->addConvolutionNd(*bottleneck_csp17->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.0.weight"], weightMap["model.24.m.0.bias"]);
351 |     auto conv18 = convBlock(network, weightMap, *bottleneck_csp17->getOutput(0), 320, 3, 2, 1, "model.18");
352 |     ITensor* inputTensors19[] = { conv18->getOutput(0), conv14->getOutput(0) };
353 |     auto cat19 = network->addConcatenation(inputTensors19, 2);
354 |     auto bottleneck_csp20 = bottleneckCSP(network, weightMap, *cat19->getOutput(0), 640, 640, 4, false, 1, 0.5, "model.20");
355 |     // yolo layer 1
356 |     IConvolutionLayer* det1 = network->addConvolutionNd(*bottleneck_csp20->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.1.weight"], weightMap["model.24.m.1.bias"]);
357 |     auto conv21 = convBlock(network, weightMap, *bottleneck_csp20->getOutput(0), 640, 3, 2, 1, "model.21");
358 |     ITensor* inputTensors22[] = { conv21->getOutput(0), conv10->getOutput(0) };
359 |     auto cat22 = network->addConcatenation(inputTensors22, 2);
360 |     auto bottleneck_csp23 = bottleneckCSP(network, weightMap, *cat22->getOutput(0), 1280, 1280, 4, false, 1, 0.5, "model.23");
361 |     // yolo layer 2
362 |     IConvolutionLayer* det2 = network->addConvolutionNd(*bottleneck_csp23->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.2.weight"], weightMap["model.24.m.2.bias"]);
363 |
364 |     auto yolo = addYoLoLayer(network, weightMap, det0, det1, det2);
365 |     yolo->getOutput(0)->setName(OUTPUT_BLOB_NAME);
366 |     network->markOutput(*yolo->getOutput(0));
367 |
368 |     // Build engine
369 |     builder->setMaxBatchSize(maxBatchSize);
370 |     config->setMaxWorkspaceSize(16 * (1 << 20));  // 16MB
371 | #ifdef USE_FP16
372 |     config->setFlag(BuilderFlag::kFP16);
373 | #endif
374 |     std::cout << "Building engine, please wait for a while..." << std::endl;
375 |     ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
376 |     std::cout << "Build engine successfully!" << std::endl;
377 |
378 |     // Don't need the network any more
379 |     network->destroy();
380 |
381 |     // Release host memory
382 |     for (auto& mem : weightMap)
383 |     {
384 |         free((void*)(mem.second.values));
385 |     }
386 |
387 |     return engine;
388 | }
389 |
390 | void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) {
391 |     // Create builder
392 |     IBuilder* builder = createInferBuilder(gLogger);
393 |     IBuilderConfig* config = builder->createBuilderConfig();
394 |
395 |     // Create model to populate the network, then set the outputs and create an engine
396 |     ICudaEngine* engine = (CREATENET(NET))(maxBatchSize, builder, config, DataType::kFLOAT);
397 |     //ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT);
398 |     assert(engine != nullptr);
399 |
400 |     // Serialize the engine
401 |     (*modelStream) = engine->serialize();
402 |
403 |     // Close everything down
404 |     engine->destroy();
405 |     builder->destroy();
406 | }
407 |
408 | void doInference(IExecutionContext& context, cudaStream_t& stream, void **buffers, float* input, float* output, int batchSize) {
409 |     // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
410 |     CHECK(cudaMemcpyAsync(buffers[0], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
411 |     context.enqueue(batchSize, buffers, stream, nullptr);
412 |     CHECK(cudaMemcpyAsync(output, buffers[1], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
413 |     cudaStreamSynchronize(stream);
414 | }
415 |
416 | int main(int argc, char** argv) {
417 |     cudaSetDevice(DEVICE);
418 |     // create a model using the API directly and serialize it to a stream
419 |     char *trtModelStream{ nullptr };
420 |     size_t size{ 0 };
421 |     std::string engine_name = STR2(NET);
422 |     engine_name = "yolov5" + engine_name + ".engine";
423 |     if (argc == 2 && std::string(argv[1]) == "-s") {
424 |         IHostMemory* modelStream{ nullptr };
425 |         APIToModel(BATCH_SIZE, &modelStream);
426 |         assert(modelStream != nullptr);
427 |         std::ofstream p(engine_name, std::ios::binary);
428 |         if (!p) {
429 |             std::cerr << "could not open plan output file" << std::endl;
430 |             return -1;
431 |         }
432 |         p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
433 |         modelStream->destroy();
434 |         return 0;
435 |     } else if (argc == 2 && std::string(argv[1]) == "-v") {
436 |         std::ifstream file(engine_name, std::ios::binary);
437 |         if (file.good()) {
438 |             file.seekg(0, file.end);
439 |             size = file.tellg();
440 |             file.seekg(0, file.beg);
441 |             trtModelStream = new char[size];
442 |             assert(trtModelStream);
443 |             file.read(trtModelStream, size);
444 |             file.close();
445 |         }
446 |     } else {
447 |         std::cerr << "arguments not right!" << std::endl;
448 |         std::cerr << "./yolov5 -s  // serialize model to plan file" << std::endl;
449 |         std::cerr << "./yolov5 -v  // run inference with camera" << std::endl;
450 |         return -1;
451 |     }
452 |
453 |     std::vector<std::string> file_names;
454 |     /*if (read_files_in_dir(argv[2], file_names) < 0) {
455 |         std::cout << "read_files_in_dir failed." << std::endl;
456 |         return -1;
457 |     }*/
458 |
459 |     // prepare input data ---------------------------
460 |     static float data[BATCH_SIZE * 3 * INPUT_H * INPUT_W];
461 |     //for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++)
462 |     //    data[i] = 1.0;
463 |     static float prob[BATCH_SIZE * OUTPUT_SIZE];
464 |     IRuntime* runtime = createInferRuntime(gLogger);
465 |     assert(runtime != nullptr);
466 |     ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);
467 |     assert(engine != nullptr);
468 |     IExecutionContext* context = engine->createExecutionContext();
469 |     assert(context != nullptr);
470 |     delete[] trtModelStream;
471 |     assert(engine->getNbBindings() == 2);
472 |     void* buffers[2];
473 |     // In order to bind the buffers, we need to know the names of the input and output tensors.
474 |     // Note that indices are guaranteed to be less than IEngine::getNbBindings()
475 |     const int inputIndex = engine->getBindingIndex(INPUT_BLOB_NAME);
476 |     const int outputIndex = engine->getBindingIndex(OUTPUT_BLOB_NAME);
477 |     assert(inputIndex == 0);
478 |     assert(outputIndex == 1);
479 |     // Create GPU buffers on device
480 |     CHECK(cudaMalloc(&buffers[inputIndex], BATCH_SIZE * 3 * INPUT_H * INPUT_W * sizeof(float)));
481 |     CHECK(cudaMalloc(&buffers[outputIndex], BATCH_SIZE * OUTPUT_SIZE * sizeof(float)));
482 |     // Create stream
483 |     cudaStream_t stream;
484 |     CHECK(cudaStreamCreate(&stream));
485 |
486 |     cv::VideoCapture capture(0);
487 |     //cv::VideoCapture capture("../overpass.mp4");
488 |     //int fourcc = cv::VideoWriter::fourcc('M','J','P','G');
489 |     //capture.set(cv::CAP_PROP_FOURCC, fourcc);
490 |     if (!capture.isOpened()) {
491 |         std::cout << "Error opening video stream or file" << std::endl;
492 |         return -1;
493 |     }
494 |
495 |     int key;
496 |     int fcount = 0;
497 |     while (1)
498 |     {
499 |         cv::Mat frame;
500 |         capture >> frame;
501 |         if (frame.empty())
502 |         {
503 |             std::cout << "Failed to read image from camera!" << std::endl;
504 |             break;
505 |         }
506 |
507 |         fcount++;
508 |         for (int b = 0; b < fcount; b++) {
509 |             cv::Mat img = frame;
510 |             if (img.empty()) continue;
511 |             cv::Mat pr_img = preprocess_img(img);  // letterbox and BGR to RGB
512 |             int i = 0;
513 |             for (int row = 0; row < INPUT_H; ++row) {
514 |                 uchar* uc_pixel = pr_img.data + row * pr_img.step;
515 |                 for (int col = 0; col < INPUT_W; ++col) {
516 |                     data[b * 3 * INPUT_H * INPUT_W + i] = (float)uc_pixel[2] / 255.0;
517 |                     data[b * 3 * INPUT_H * INPUT_W + i + INPUT_H * INPUT_W] = (float)uc_pixel[1] / 255.0;
518 |                     data[b * 3 * INPUT_H * INPUT_W + i + 2 * INPUT_H * INPUT_W] = (float)uc_pixel[0] / 255.0;
519 |                     uc_pixel += 3;
520 |                     ++i;
521 |                 }
522 |             }
523 |         }
524 |
525 |         // Run inference
526 |         auto start = std::chrono::system_clock::now();
527 |         doInference(*context, stream, buffers, data, prob, BATCH_SIZE);
528 |         auto end = std::chrono::system_clock::now();
529 |         int fps = 1000.0 / std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
530 |         //std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
531 |         std::vector<std::vector<Yolo::Detection>> batch_res(fcount);
532 |         for (int b = 0; b < fcount; b++) {
533 |             auto& res = batch_res[b];
534 |             nms(res, &prob[b * OUTPUT_SIZE], CONF_THRESH, NMS_THRESH);
535 |         }
536 |         for (int b = 0; b < fcount; b++) {
537 |             auto& res = batch_res[b];
538 |             //std::cout << res.size() << std::endl;
539 |             for (size_t j = 0; j < res.size(); j++) {
540 |                 cv::Rect r = get_rect(frame, res[j].bbox);
541 |                 cv::rectangle(frame, r, cv::Scalar(0x27, 0xC1, 0x36), 2);
542 |                 std::string label = coco_classes[(int)res[j].class_id];
543 |                 cv::putText(frame, label, cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 1.2);
544 |                 std::string jetson_fps = "Jetson Nano FPS: " + std::to_string(fps);
545 |                 cv::putText(frame, jetson_fps, cv::Point(11, 80), cv::FONT_HERSHEY_PLAIN, 3, cv::Scalar(0, 0, 255), 2, cv::LINE_AA);
546 |             }
547 |         }
548 |
549 |         cv::imshow("yolov5", frame);
550 |         key = cv::waitKey(1);
551 |         if (key == 'q') {
552 |             break;
553 |         }
554 |
555 |         fcount = 0;
556 |     }
557 |
558 |     capture.release();
559 |     // Release stream and buffers
560 |     cudaStreamDestroy(stream);
561 |     CHECK(cudaFree(buffers[inputIndex]));
562 |     CHECK(cudaFree(buffers[outputIndex]));
563 |     // Destroy the engine
564 |     context->destroy();
565 |     engine->destroy();
566 |     runtime->destroy();
567 |
568 |     return 0;
569 | }
570 |
--------------------------------------------------------------------------------
/yolov5_trt.py:
--------------------------------------------------------------------------------
1 | """
2 | An example that uses TensorRT's Python API to make inferences.
3 | """
4 | import ctypes
5 | import os
6 | import random
7 | import sys
8 | import threading
9 | import time
10 |
11 | import cv2
12 | import numpy as np
13 | import pycuda.autoinit
14 | import pycuda.driver as cuda
15 | import tensorrt as trt
16 | import torch
17 | import torchvision
18 |
19 | INPUT_W = 608
20 | INPUT_H = 608
21 | CONF_THRESH = 0.1
22 | IOU_THRESHOLD = 0.4
23 |
24 |
25 | def plot_one_box(x, img, color=None, label=None, line_thickness=None):
26 |     """
27 |     description: Plots one bounding box on image img;
28 |                  this function comes from the YoLov5 project.
29 |     param:
30 |         x: a box like [x1,y1,x2,y2]
31 |         img: an OpenCV image object
32 |         color: color to draw rectangle, such as (0,255,0)
33 |         label: str
34 |         line_thickness: int
35 |     return:
36 |         no return
37 |
38 |     """
39 |     tl = (
40 |         line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1
41 |     )  # line/font thickness
42 |     color = color or [random.randint(0, 255) for _ in range(3)]
43 |     c1, c2 = (int(x[0]), int(x[1])), (int(x[2]), int(x[3]))
44 |     cv2.rectangle(img, c1, c2, color, thickness=tl, lineType=cv2.LINE_AA)
45 |     if label:
46 |         tf = max(tl - 1, 1)  # font thickness
47 |         t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0]
48 |         c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3
49 |         cv2.rectangle(img, c1, c2, color, -1, cv2.LINE_AA)  # filled
50 |         cv2.putText(
51 |             img,
52 |             label,
53 |             (c1[0], c1[1] - 2),
54 |             0,
55 |             tl / 3,
56 |             [225, 255, 255],
57 |             thickness=tf,
58 |             lineType=cv2.LINE_AA,
59 |         )
60 |
61 |
62 | class YoLov5TRT(object):
63 |     """
64 |     description: A YOLOv5 class that wraps TensorRT ops, preprocess and postprocess ops.
65 |     """
66 |
67 |     def __init__(self, engine_file_path):
68 |         # Create a Context on this device,
69 |         self.cfx = cuda.Device(0).make_context()
70 |         stream = cuda.Stream()
71 |         TRT_LOGGER = trt.Logger(trt.Logger.INFO)
72 |         runtime = trt.Runtime(TRT_LOGGER)
73 |
74 |         # Deserialize the engine from file
75 |         with open(engine_file_path, "rb") as f:
76 |             engine = runtime.deserialize_cuda_engine(f.read())
77 |         context = engine.create_execution_context()
78 |
79 |         host_inputs = []
80 |         cuda_inputs = []
81 |         host_outputs = []
82 |         cuda_outputs = []
83 |         bindings = []
84 |
85 |         for binding in engine:
86 |             size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
87 |             dtype = trt.nptype(engine.get_binding_dtype(binding))
88 |             # Allocate host and device buffers
89 |             host_mem = cuda.pagelocked_empty(size, dtype)
90 |             cuda_mem = cuda.mem_alloc(host_mem.nbytes)
91 |             # Append the device buffer to device bindings.
92 |             bindings.append(int(cuda_mem))
93 |             # Append to the appropriate list.
94 |             if engine.binding_is_input(binding):
95 |                 host_inputs.append(host_mem)
96 |                 cuda_inputs.append(cuda_mem)
97 |             else:
98 |                 host_outputs.append(host_mem)
99 |                 cuda_outputs.append(cuda_mem)
100 |
101 |         # Store
102 |         self.stream = stream
103 |         self.context = context
104 |         self.engine = engine
105 |         self.host_inputs = host_inputs
106 |         self.cuda_inputs = cuda_inputs
107 |         self.host_outputs = host_outputs
108 |         self.cuda_outputs = cuda_outputs
109 |         self.bindings = bindings
110 |
111 |     def infer(self, input_image_path):
112 |         threading.Thread.__init__(self)
113 |         # Make self the active context, pushing it on top of the context stack.
114 |         self.cfx.push()
115 |         # Restore
116 |         stream = self.stream
117 |         context = self.context
118 |         engine = self.engine
119 |         host_inputs = self.host_inputs
120 |         cuda_inputs = self.cuda_inputs
121 |         host_outputs = self.host_outputs
122 |         cuda_outputs = self.cuda_outputs
123 |         bindings = self.bindings
124 |         # Do image preprocess
125 |         input_image, image_raw, origin_h, origin_w = self.preprocess_image(
126 |             input_image_path
127 |         )
128 |         # Copy input image to host buffer
129 |         np.copyto(host_inputs[0], input_image.ravel())
130 |         # Transfer input data to the GPU.
131 |         cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
132 |         # Run inference.
133 |         context.execute_async(bindings=bindings, stream_handle=stream.handle)
134 |         # Transfer predictions back from the GPU.
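        # (The flat output buffer follows the YoloLayer plugin's format: element 0
        # holds the number of detections, followed by records of 6 floats each,
        # [center_x, center_y, w, h, conf, class_id]; post_process() below relies
        # on exactly this layout.)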
135 |         cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
136 |         # Synchronize the stream
137 |         stream.synchronize()
138 |         # Remove any context from the top of the context stack, deactivating it.
139 |         self.cfx.pop()
140 |         # Here we use the first row of output since batch_size = 1
141 |         output = host_outputs[0]
142 |         # Do postprocess
143 |         result_boxes, result_scores, result_classid = self.post_process(
144 |             output, origin_h, origin_w
145 |         )
146 |         # Draw rectangles and labels on the original image
147 |         for i in range(len(result_boxes)):
148 |             box = result_boxes[i]
149 |             plot_one_box(
150 |                 box,
151 |                 image_raw,
152 |                 label="{}:{:.2f}".format(
153 |                     categories[int(result_classid[i])], result_scores[i]
154 |                 ),
155 |             )
156 |         parent, filename = os.path.split(input_image_path)
157 |         save_name = os.path.join(parent, "output_" + filename)
158 |         # Save image
159 |         cv2.imwrite(save_name, image_raw)
160 |
161 |     def destroy(self):
162 |         # Remove any context from the top of the context stack, deactivating it.
163 |         self.cfx.pop()
164 |
165 |     def preprocess_image(self, input_image_path):
166 |         """
167 |         description: Read an image from image path, convert it to RGB,
168 |                      resize and pad it to target size, normalize to [0,1],
169 |                      transform to NCHW format.
170 |         param:
171 |             input_image_path: str, image path
172 |         return:
173 |             image: the processed image
174 |             image_raw: the original image
175 |             h: original height
176 |             w: original width
177 |         """
178 |         image_raw = cv2.imread(input_image_path)
179 |         h, w, c = image_raw.shape
180 |         image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB)
181 |         # Calculate width and height and paddings
182 |         r_w = INPUT_W / w
183 |         r_h = INPUT_H / h
184 |         if r_h > r_w:
185 |             tw = INPUT_W
186 |             th = int(r_w * h)
187 |             tx1 = tx2 = 0
188 |             ty1 = int((INPUT_H - th) / 2)
189 |             ty2 = INPUT_H - th - ty1
190 |         else:
191 |             tw = int(r_h * w)
192 |             th = INPUT_H
193 |             tx1 = int((INPUT_W - tw) / 2)
194 |             tx2 = INPUT_W - tw - tx1
195 |             ty1 = ty2 = 0
196 |         # Resize the image with the long side while maintaining ratio
197 |         image = cv2.resize(image, (tw, th))
198 |         # Pad the short side with (128,128,128)
199 |         image = cv2.copyMakeBorder(
200 |             image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, (128, 128, 128)
201 |         )
202 |         image = image.astype(np.float32)
203 |         # Normalize to [0,1]
204 |         image /= 255.0
205 |         # HWC to CHW format:
206 |         image = np.transpose(image, [2, 0, 1])
207 |         # CHW to NCHW format
208 |         image = np.expand_dims(image, axis=0)
209 |         # Convert the image to row-major order, also known as "C order":
210 |         image = np.ascontiguousarray(image)
211 |         return image, image_raw, h, w
212 |
213 |     def xywh2xyxy(self, origin_h, origin_w, x):
214 |         """
215 |         description: Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right
216 |         param:
217 |             origin_h: height of original image
218 |             origin_w: width of original image
219 |             x: A boxes tensor, each row is a box [center_x, center_y, w, h]
220 |         return:
221 |             y: A boxes tensor, each row is a box [x1, y1, x2, y2]
222 |         """
223 |         y = torch.zeros_like(x) if isinstance(x, torch.Tensor) else np.zeros_like(x)
224 |         r_w = INPUT_W / origin_w
225 |         r_h = INPUT_H / origin_h
226 |         if r_h > r_w:
227 |             y[:, 0] = x[:, 0] - x[:, 2] / 2
228 |             y[:, 2] = x[:, 0] + x[:, 2] / 2
229 |             y[:, 1] = x[:, 1] - x[:, 3] / 2 - (INPUT_H - r_w * origin_h) / 2
230 |             y[:, 3] = x[:, 1] + x[:, 3] / 2 - (INPUT_H - r_w * origin_h) / 2
231 |             y /= r_w
232 |         else:
233 |             y[:, 0] = x[:, 0] - x[:, 2] / 2 - (INPUT_W - r_h * origin_w) / 2
234 |             y[:, 2] = x[:, 0] + x[:, 2] / 2 - (INPUT_W - r_h * origin_w) / 2
235 |             y[:, 1] = x[:, 1] - x[:, 3] / 2
236 |             y[:, 3] = x[:, 1] + x[:, 3] / 2
237 |             y /= r_h
238 |
239 |         return y
240 |
241 |     def post_process(self, output, origin_h, origin_w):
242 |         """
243 |         description: postprocess the prediction
244 |         param:
245 |             output: A tensor like [num_boxes, cx, cy, w, h, conf, cls_id, cx, cy, w, h, conf, cls_id, ...]
246 |             origin_h: height of original image
247 |             origin_w: width of original image
248 |         return:
249 |             result_boxes: final boxes, a boxes tensor, each row is a box [x1, y1, x2, y2]
250 |             result_scores: final scores, a tensor, each element is the score corresponding to a box
251 |             result_classid: final classid, a tensor, each element is the classid corresponding to a box
252 |         """
253 |         # Get the num of boxes detected
254 |         num = int(output[0])
255 |         # Reshape to a two-dimensional ndarray
256 |         pred = np.reshape(output[1:], (-1, 6))[:num, :]
257 |         # to a torch Tensor
258 |         pred = torch.Tensor(pred).cuda()
259 |         # Get the boxes
260 |         boxes = pred[:, :4]
261 |         # Get the scores
262 |         scores = pred[:, 4]
263 |         # Get the classid
264 |         classid = pred[:, 5]
265 |         # Choose those boxes that score > CONF_THRESH
266 |         si = scores > CONF_THRESH
267 |         boxes = boxes[si, :]
268 |         scores = scores[si]
269 |         classid = classid[si]
270 |         # Transform bbox from [center_x, center_y, w, h] to [x1, y1, x2, y2]
271 |         boxes = self.xywh2xyxy(origin_h, origin_w, boxes)
272 |         # Do nms
273 |         indices = torchvision.ops.nms(boxes, scores, iou_threshold=IOU_THRESHOLD).cpu()
274 |         result_boxes = boxes[indices, :].cpu()
275 |         result_scores = scores[indices].cpu()
276 |         result_classid = classid[indices].cpu()
277 |         return result_boxes, result_scores, result_classid
278 |
279 |
280 | class myThread(threading.Thread):
281 |     def __init__(self, func, args):
282 |         threading.Thread.__init__(self)
283 |         self.func = func
284 |         self.args = args
285 |
286 |     def run(self):
287 |         self.func(*self.args)
288 |
289 |
290 | if __name__ == "__main__":
291 |     # load custom plugins
292 |     PLUGIN_LIBRARY = "build/libmyplugins.so"
293 |     ctypes.CDLL(PLUGIN_LIBRARY)
294 |     engine_file_path = "build/yolov5s.engine"
295 |
296 |     # load coco labels
297 |
298 |     categories = ["person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
299 |                   "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
300 |                   "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
301 |                   "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
302 |                   "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
303 |                   "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
304 |                   "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
305 |                   "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
306 |                   "hair drier", "toothbrush"]
307 |
308 |     # a YoLov5TRT instance
309 |     yolov5_wrapper = YoLov5TRT(engine_file_path)
310 |
311 |     # from https://github.com/ultralytics/yolov5/tree/master/inference/images
312 |     input_image_paths = ["zidane.jpg", "bus.jpg"]
313 |
314 |     for input_image_path in input_image_paths:
315 |         # create a new thread to do inference
316 |         thread1 = myThread(yolov5_wrapper.infer, [input_image_path])
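        # infer() pushes self.cfx before touching CUDA and pops it afterwards, so it
        # is safe to call from a worker thread; start() followed immediately by
        # join() keeps the inferences sequential here.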
317 |         thread1.start()
318 |         thread1.join()
319 |
320 |     # destroy the instance
321 |     yolov5_wrapper.destroy()
322 |
--------------------------------------------------------------------------------
/yolov5s.engine:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenJetson/tensorrt-yolov5/7c79e973e40c3e41488021c3efcc2f4f3b283ed8/yolov5s.engine
--------------------------------------------------------------------------------