├── README.md ├── yolov5l ├── CMakeLists.txt ├── README.md ├── common.hpp ├── gen_wts.py ├── images │ ├── bus.jpg │ └── zidane.jpg ├── logging.h ├── utils.h ├── yololayer.cu ├── yololayer.h └── yolov5l.cpp ├── yolov5m ├── CMakeLists.txt ├── README.md ├── common.hpp ├── gen_wts.py ├── images │ ├── bus.jpg │ └── zidane.jpg ├── logging.h ├── utils.h ├── yololayer.cu ├── yololayer.h └── yolov5m.cpp ├── yolov5s ├── CMakeLists.txt ├── README.md ├── common.hpp ├── gen_wts.py ├── images │ ├── bus.jpg │ └── zidane.jpg ├── logging.h ├── utils.h ├── yololayer.cu ├── yololayer.h └── yolov5s.cpp └── yolov5x ├── CMakeLists.txt ├── README.md ├── common.hpp ├── gen_wts.py ├── images ├── bus.jpg └── zidane.jpg ├── logging.h ├── utils.h ├── yololayer.cu ├── yololayer.h └── yolov5x.cpp /README.md: -------------------------------------------------------------------------------- 1 | # yolov5_2.0-TensorRt 2 | U版yolov5 2.0的tensorrt加速 3 | 4 | 5 | 6 | 并且对resize和图像处理阶段的操作做了优化,在win环境下debug下速度有很大提升,但是release则没有变化,因为在release时,opencv中会有相应的优化操作。 7 | 8 | ``` 9 | 实际上,at操作符与ptr操作符在Debug版本下都是有内存检查、防止操作越界的操作,而data十分简单粗暴,没有任何检查,由于它的简单粗暴所以使得data操作速度很快。所以在Debug版本下,at操作符与ptr操作符相较于data,速度还是慢了不少。 10 | 11 | 另外在Debug版本下,at操作要比指针操作慢得多,所以对于不连续数据或者单个点处理,可以考虑at操作,对于连续的大量数据,尽量不要使用它。 12 | ``` 13 | 14 | 感谢下面两个开源实现: 15 | 16 | https://github.com/wang-xinyu/tensorrtx 17 | 18 | https://github.com/AIpakchoi/yolov5_tensorrt 19 | -------------------------------------------------------------------------------- /yolov5l/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.6) 2 | 3 | project(yolov5) 4 | 5 | add_definitions(-std=c++11) 6 | 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 8 | set(CMAKE_CXX_STANDARD 11) 9 | set(CMAKE_BUILD_TYPE Debug) 10 | 11 | find_package(CUDA REQUIRED) 12 | 13 | set(CUDA_NVCC_PLAGS ${CUDA_NVCC_PLAGS};-std=c++11;-g;-G;-gencode;arch=compute_30;code=sm_30) 14 | 15 | include_directories(${PROJECT_SOURCE_DIR}/include) 16 | if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") 17 | message("embed_platform on") 18 | include_directories(/usr/local/cuda/targets/aarch64-linux/include) 19 | link_directories(/usr/local/cuda/targets/aarch64-linux/lib) 20 | else() 21 | message("embed_platform off") 22 | include_directories(/usr/local/cuda/include) 23 | link_directories(/usr/local/cuda/lib64) 24 | endif() 25 | 26 | 27 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED") 28 | 29 | cuda_add_library(yololayer SHARED ${PROJECT_SOURCE_DIR}/yololayer.cu) 30 | 31 | ########## opencv configuration ############ 32 | find_package(OpenCV 3.4 REQUIRED) 33 | message(OpenCV_LIBS) 34 | include_directories(OpenCV_INCLUDE_DIRS) 35 | 36 | add_executable(yolov5l ${PROJECT_SOURCE_DIR}/yolov5l.cpp) 37 | target_link_libraries(yolov5l nvinfer) 38 | target_link_libraries(yolov5l cudart) 39 | target_link_libraries(yolov5l yololayer) 40 | target_link_libraries(yolov5l ${OpenCV_LIBS}) 41 | 42 | add_definitions(-O2 -pthread) 43 | 44 | -------------------------------------------------------------------------------- /yolov5l/README.md: -------------------------------------------------------------------------------- 1 | # yolov5 2 | 3 | The Pytorch implementation is [ultralytics/yolov5](https://github.com/ultralytics/yolov5). 4 | 5 | I was using [ultralytics/yolov5](https://github.com/ultralytics/yolov5)(The latest version). Just in case the yolov5 model updated. 6 | 7 | ## How to Run 8 | 9 | ``` 10 | 1. 
generate yolov5l.wts from pytorch implementation with yolov5l.pt 11 | 12 | git clone https://github.com/AIpakchoi/yolov5_tensorrt.git 13 | git clone https://github.com/ultralytics/yolov5.git 14 | // download the weight file 'yolov5l.pt' 15 | cd yolov5 16 | cp ../yolov5_tensorrt/yolov5l/gen_wts.py . 17 | python gen_wts.py 18 | // a file 'yolov5l.wts' will be generated. 19 | 20 | 2. put yolov5l.wts into yolov5l, build and run 21 | 22 | mv yolov5l.wts ../yolov5_tensorrt/yolov5l/ 23 | cd ../yolov5_tensorrt/yolov5l 24 | mkdir build 25 | cd build 26 | cmake .. 27 | make 28 | sudo ./yolov5l -s // serialize model to plan file i.e. 'yolov5l.engine' 29 | sudo ./yolov5l -d ../samples // deserialize plan file and run inference, the images in samples will be processed. 30 | 31 | 3. check the generated images _zidane.jpg and _bus.jpg, shown below. 32 | ``` 33 | 34 |
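The `yolov5l.wts` file written in step 1 is plain text: the first line is the number of weight blobs, and every following line holds a blob name, its element count, and the values as hex-encoded big-endian float32 words (this is what `gen_wts.py` writes and what `loadWeights()` in `common.hpp` parses). Below is a minimal sketch for sanity-checking the export before building the engine; the `check_wts` helper is illustrative and not part of this repo.

```python
import struct

# Minimal sanity check for the .wts layout written by gen_wts.py:
#   line 1:           <number of weight blobs>
#   every other line: <blob name> <value count> <one 8-char hex word per float32 value>
def check_wts(path='yolov5l.wts'):  # illustrative helper, not part of the repo
    with open(path) as f:
        declared = int(f.readline())
        blobs = 0
        for line in f:
            parts = line.split()
            if not parts:
                continue
            name, size = parts[0], int(parts[1])
            values = parts[2:]
            assert len(values) == size, f'{name}: header says {size} values, line has {len(values)}'
            if values:
                # values are big-endian float32, hex-encoded via struct.pack('>f', v).hex()
                struct.unpack('>f', bytes.fromhex(values[0]))
            blobs += 1
    assert blobs == declared, f'header says {declared} blobs, file contains {blobs}'
    print(f'{path}: {declared} weight blobs look consistent')


if __name__ == '__main__':
    check_wts()
```

Catching a truncated or corrupted export at this point is usually cheaper than debugging a failed engine build or nonsensical detections later.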
35 | (the detection result images _zidane.jpg and _bus.jpg were embedded here) 36 | 37 | 38 | 39 | 40 |
41 | 42 | ## Config 43 | 44 | - Input shape defined in yololayer.h 45 | - Number of classes defined in yololayer.h 46 | - FP16/FP32 can be selected by the macro in yolov5l.cpp 47 | - GPU id can be selected by the macro in yolov5l.cpp 48 | - NMS thresh in yolov5l.cpp 49 | - BBox confidence thresh in yolov5l.cpp 50 | - Batch size in yolov5l.cpp 51 | -------------------------------------------------------------------------------- /yolov5l/common.hpp: -------------------------------------------------------------------------------- 1 | #ifndef YOLOV5_COMMON_H_ 2 | #define YOLOV5_COMMON_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "NvInfer.h" 11 | #include "yololayer.h" 12 | 13 | #define CHECK(status) \ 14 | do\ 15 | {\ 16 | auto ret = (status);\ 17 | if (ret != 0)\ 18 | {\ 19 | std::cerr << "Cuda failure: " << ret << std::endl;\ 20 | abort();\ 21 | }\ 22 | } while (0) 23 | 24 | using namespace nvinfer1; 25 | // resize 优化 26 | // 从原图的点映射到输出图像的像素点 27 | // 对outuput的每个点,先根据长宽比计算其在原图中最邻近的像素点, 28 | //然后直接根据最邻近的思想,直接拷贝Channel个字节作为输出图像 29 | void resizeByNN(uchar *input, uchar *output, int height_in, int width_in, int channels, int height_out, int width_out) { 30 | 31 | uchar *data_source = input; 32 | uchar *data_half = output; 33 | 34 | int bpl_source = width_in * 3; 35 | int bpl_dst = width_out * 3; 36 | 37 | int pos = 0; 38 | int sep = 0; 39 | uchar *sr = nullptr; 40 | uchar *hr = nullptr; 41 | float step = 0.0; 42 | float step_x = float(width_in) / float(width_out); 43 | float step_y = float(height_in) / float(height_out); 44 | 45 | for (int i = 0; i < height_out; i++) { 46 | for (int j = 0; j < width_out; j++) { 47 | sep = int(step_y*i); 48 | step = int(j*step_x); 49 | sr = data_source + sep * bpl_source; 50 | hr = data_half + i * bpl_dst + j * channels; 51 | pos = step * channels; 52 | memcpy(hr, sr + pos, channels); 53 | } 54 | } 55 | return; 56 | } 57 | 58 | cv::Mat preprocess_img(cv::Mat& img) { 59 | int w, h, x, y; 60 | float r_w = Yolo::INPUT_W / (img.cols*1.0); 61 | float r_h = Yolo::INPUT_H / (img.rows*1.0); 62 | if (r_h > r_w) { 63 | w = Yolo::INPUT_W; 64 | h = r_w * img.rows; 65 | x = 0; 66 | y = (Yolo::INPUT_H - h) / 2; 67 | } else { 68 | w = r_h* img.cols; 69 | h = Yolo::INPUT_H; 70 | x = (Yolo::INPUT_W - w) / 2; 71 | y = 0; 72 | } 73 | cv::Mat re(h, w, CV_8UC3); 74 | //cv::resize(img, re, re.size(), 0, 0, cv::INTER_CUBIC); 75 | auto start = std::chrono::system_clock::now(); 76 | cv::resize(img, re, re.size(), 0, 0, cv::INTER_LINEAR); 77 | //resizeByNN(img.data, re.data, img.rows, img.cols, img.channels(), re.rows, re.cols); 78 | auto end = std::chrono::system_clock::now(); 79 | std::cout << "img resize: " << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; 80 | 81 | cv::Mat out(Yolo::INPUT_H, Yolo::INPUT_W, CV_8UC3, cv::Scalar(128, 128, 128)); 82 | 83 | re.copyTo(out(cv::Rect(x, y, re.cols, re.rows))); 84 | return out; 85 | } 86 | 87 | cv::Rect get_rect(cv::Mat& img, float bbox[4]) { 88 | int l, r, t, b; 89 | float r_w = Yolo::INPUT_W / (img.cols * 1.0); 90 | float r_h = Yolo::INPUT_H / (img.rows * 1.0); 91 | if (r_h > r_w) { 92 | l = bbox[0] - bbox[2]/2.f; 93 | if (l < 0) 94 | { 95 | l = 0; 96 | } 97 | r = bbox[0] + bbox[2]/2.f; 98 | if (r > img.cols) 99 | { 100 | r = img.cols; 101 | } 102 | t = bbox[1] - bbox[3]/2.f - (Yolo::INPUT_H - r_w * img.rows) / 2; 103 | if (t < 0) 104 | { 105 | t = 0; 106 | } 107 | b = bbox[1] + bbox[3]/2.f - (Yolo::INPUT_H - r_w * img.rows) / 2; 108 | if (b > img.rows) 109 | { 110 | 
b = img.rows; 111 | } 112 | l = l / r_w; 113 | r = r / r_w; 114 | t = t / r_w; 115 | b = b / r_w; 116 | } else { 117 | l = bbox[0] - bbox[2]/2.f - (Yolo::INPUT_W - r_h * img.cols) / 2; 118 | if (l < 0) 119 | { 120 | l = 0; 121 | } 122 | r = bbox[0] + bbox[2]/2.f - (Yolo::INPUT_W - r_h * img.cols) / 2; 123 | if (r > img.cols) 124 | { 125 | r = img.cols; 126 | } 127 | t = bbox[1] - bbox[3]/2.f; 128 | if (t < 0) 129 | { 130 | t = 0; 131 | } 132 | b = bbox[1] + bbox[3]/2.f; 133 | if (b > img.rows) 134 | { 135 | b = img.rows; 136 | } 137 | l = l / r_h; 138 | r = r / r_h; 139 | t = t / r_h; 140 | b = b / r_h; 141 | } 142 | return cv::Rect(l, t, r-l, b-t); 143 | } 144 | 145 | // std::max vs. max 146 | //https://www.cnblogs.com/timesdaughter/p/5894930.html 147 | // Use (std::min) and (std::max) 148 | float iou(float lbox[4], float rbox[4]) { 149 | float interBox[] = { 150 | (std::max)(lbox[0] - lbox[2]/2.f , rbox[0] - rbox[2]/2.f), //left 151 | (std::min)(lbox[0] + lbox[2]/2.f , rbox[0] + rbox[2]/2.f), //right 152 | (std::max)(lbox[1] - lbox[3]/2.f , rbox[1] - rbox[3]/2.f), //top 153 | (std::min)(lbox[1] + lbox[3]/2.f , rbox[1] + rbox[3]/2.f), //bottom 154 | }; 155 | 156 | if(interBox[2] > interBox[3] || interBox[0] > interBox[1]) 157 | return 0.0f; 158 | 159 | float interBoxS =(interBox[1]-interBox[0])*(interBox[3]-interBox[2]); 160 | return interBoxS/(lbox[2]*lbox[3] + rbox[2]*rbox[3] -interBoxS); 161 | } 162 | 163 | bool cmp(Yolo::Detection& a, Yolo::Detection& b) { 164 | return a.conf > b.conf; 165 | } 166 | 167 | void nms(std::vector& res, float *output, float conf_thresh, float nms_thresh = 0.5) { 168 | int det_size = sizeof(Yolo::Detection) / sizeof(float); 169 | std::map> m; 170 | for (int i = 0; i < output[0] && i < 1000; i++) { 171 | if (output[1 + det_size * i + 4] <= conf_thresh) continue; 172 | Yolo::Detection det; 173 | memcpy(&det, &output[1 + det_size * i], det_size * sizeof(float)); 174 | if (m.count(det.class_id) == 0) m.emplace(det.class_id, std::vector()); 175 | m[det.class_id].push_back(det); 176 | } 177 | for (auto it = m.begin(); it != m.end(); it++) { 178 | //std::cout << it->second[0].class_id << " --- " << std::endl; 179 | auto& dets = it->second; 180 | std::sort(dets.begin(), dets.end(), cmp); 181 | for (size_t m = 0; m < dets.size(); ++m) { 182 | auto& item = dets[m]; 183 | res.push_back(item); 184 | for (size_t n = m + 1; n < dets.size(); ++n) { 185 | if (iou(item.bbox, dets[n].bbox) > nms_thresh) { 186 | dets.erase(dets.begin()+n); 187 | --n; 188 | } 189 | } 190 | } 191 | } 192 | } 193 | 194 | // TensorRT weight files have a simple space delimited format: 195 | // [type] [size] 196 | std::map loadWeights(const std::string file) { 197 | std::cout << "Loading weights: " << file << std::endl; 198 | std::map weightMap; 199 | 200 | // Open weights file 201 | std::ifstream input(file); 202 | assert(input.is_open() && "Unable to load weight file."); 203 | 204 | // Read number of weight blobs 205 | int32_t count; 206 | input >> count; 207 | assert(count > 0 && "Invalid weight map file."); 208 | 209 | while (count--) 210 | { 211 | Weights wt{DataType::kFLOAT, nullptr, 0}; 212 | uint32_t size; 213 | 214 | // Read name and type of blob 215 | std::string name; 216 | input >> name >> std::dec >> size; 217 | wt.type = DataType::kFLOAT; 218 | 219 | // Load blob 220 | uint32_t* val = reinterpret_cast(malloc(sizeof(val) * size)); 221 | for (uint32_t x = 0, y = size; x < y; ++x) 222 | { 223 | input >> std::hex >> val[x]; 224 | } 225 | wt.values = val; 226 | 227 | wt.count = size; 228 | 
weightMap[name] = wt; 229 | } 230 | 231 | return weightMap; 232 | } 233 | 234 | IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname, float eps) { 235 | float *gamma = (float*)weightMap[lname + ".weight"].values; 236 | float *beta = (float*)weightMap[lname + ".bias"].values; 237 | float *mean = (float*)weightMap[lname + ".running_mean"].values; 238 | float *var = (float*)weightMap[lname + ".running_var"].values; 239 | int len = weightMap[lname + ".running_var"].count; 240 | 241 | float *scval = reinterpret_cast(malloc(sizeof(float) * len)); 242 | for (int i = 0; i < len; i++) { 243 | scval[i] = gamma[i] / sqrt(var[i] + eps); 244 | } 245 | Weights scale{DataType::kFLOAT, scval, len}; 246 | 247 | float *shval = reinterpret_cast(malloc(sizeof(float) * len)); 248 | for (int i = 0; i < len; i++) { 249 | shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps); 250 | } 251 | Weights shift{DataType::kFLOAT, shval, len}; 252 | 253 | float *pval = reinterpret_cast(malloc(sizeof(float) * len)); 254 | for (int i = 0; i < len; i++) { 255 | pval[i] = 1.0; 256 | } 257 | Weights power{DataType::kFLOAT, pval, len}; 258 | 259 | weightMap[lname + ".scale"] = scale; 260 | weightMap[lname + ".shift"] = shift; 261 | weightMap[lname + ".power"] = power; 262 | IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power); 263 | assert(scale_1); 264 | return scale_1; 265 | } 266 | 267 | ILayer* convBnLeaky(INetworkDefinition *network, std::map& weightMap, ITensor& input, int outch, int ksize, int s, int g, std::string lname) { 268 | Weights emptywts{DataType::kFLOAT, nullptr, 0}; 269 | int p = ksize / 2; 270 | IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, DimsHW{ksize, ksize}, weightMap[lname + ".conv.weight"], emptywts); 271 | assert(conv1); 272 | conv1->setStrideNd(DimsHW{s, s}); 273 | conv1->setPaddingNd(DimsHW{p, p}); 274 | conv1->setNbGroups(g); 275 | //IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + ".bn", 1e-4); 276 | IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + ".bn", 1e-3); 277 | auto lr = network->addActivation(*bn1->getOutput(0), ActivationType::kLEAKY_RELU); 278 | lr->setAlpha(0.1); 279 | return lr; 280 | } 281 | 282 | ILayer* focus(INetworkDefinition *network, std::map& weightMap, ITensor& input, int inch, int outch, int ksize, std::string lname) { 283 | ISliceLayer *s1 = network->addSlice(input, Dims3{0, 0, 0}, Dims3{inch, Yolo::INPUT_H / 2, Yolo::INPUT_W / 2}, Dims3{1, 2, 2}); 284 | ISliceLayer *s2 = network->addSlice(input, Dims3{0, 1, 0}, Dims3{inch, Yolo::INPUT_H / 2, Yolo::INPUT_W / 2}, Dims3{1, 2, 2}); 285 | ISliceLayer *s3 = network->addSlice(input, Dims3{0, 0, 1}, Dims3{inch, Yolo::INPUT_H / 2, Yolo::INPUT_W / 2}, Dims3{1, 2, 2}); 286 | ISliceLayer *s4 = network->addSlice(input, Dims3{0, 1, 1}, Dims3{inch, Yolo::INPUT_H / 2, Yolo::INPUT_W / 2}, Dims3{1, 2, 2}); 287 | ITensor* inputTensors[] = {s1->getOutput(0), s2->getOutput(0), s3->getOutput(0), s4->getOutput(0)}; 288 | auto cat = network->addConcatenation(inputTensors, 4); 289 | auto conv = convBnLeaky(network, weightMap, *cat->getOutput(0), outch, ksize, 1, 1, lname + ".conv"); 290 | return conv; 291 | } 292 | 293 | ILayer* bottleneck(INetworkDefinition *network, std::map& weightMap, ITensor& input, int c1, int c2, bool shortcut, int g, float e, std::string lname) { 294 | auto cv1 = convBnLeaky(network, weightMap, input, (int)((float)c2 * e), 1, 1, 1, 
lname + ".cv1"); 295 | auto cv2 = convBnLeaky(network, weightMap, *cv1->getOutput(0), c2, 3, 1, g, lname + ".cv2"); 296 | if (shortcut && c1 == c2) { 297 | auto ew = network->addElementWise(input, *cv2->getOutput(0), ElementWiseOperation::kSUM); 298 | return ew; 299 | } 300 | return cv2; 301 | } 302 | 303 | ILayer* bottleneckCSP(INetworkDefinition *network, std::map& weightMap, ITensor& input, int c1, int c2, int n, bool shortcut, int g, float e, std::string lname) { 304 | Weights emptywts{DataType::kFLOAT, nullptr, 0}; 305 | int c_ = (int)((float)c2 * e); 306 | auto cv1 = convBnLeaky(network, weightMap, input, c_, 1, 1, 1, lname + ".cv1"); 307 | auto cv2 = network->addConvolutionNd(input, c_, DimsHW{1, 1}, weightMap[lname + ".cv2.weight"], emptywts); 308 | ITensor *y1 = cv1->getOutput(0); 309 | for (int i = 0; i < n; i++) { 310 | auto b = bottleneck(network, weightMap, *y1, c_, c_, shortcut, g, 1.0, lname + ".m." + std::to_string(i)); 311 | y1 = b->getOutput(0); 312 | } 313 | auto cv3 = network->addConvolutionNd(*y1, c_, DimsHW{1, 1}, weightMap[lname + ".cv3.weight"], emptywts); 314 | 315 | ITensor* inputTensors[] = {cv3->getOutput(0), cv2->getOutput(0)}; 316 | auto cat = network->addConcatenation(inputTensors, 2); 317 | 318 | IScaleLayer* bn = addBatchNorm2d(network, weightMap, *cat->getOutput(0), lname + ".bn", 1e-4); 319 | auto lr = network->addActivation(*bn->getOutput(0), ActivationType::kLEAKY_RELU); 320 | lr->setAlpha(0.1); 321 | 322 | auto cv4 = convBnLeaky(network, weightMap, *lr->getOutput(0), c2, 1, 1, 1, lname + ".cv4"); 323 | return cv4; 324 | } 325 | 326 | ILayer* SPP(INetworkDefinition *network, std::map& weightMap, ITensor& input, int c1, int c2, int k1, int k2, int k3, std::string lname) { 327 | int c_ = c1 / 2; 328 | auto cv1 = convBnLeaky(network, weightMap, input, c_, 1, 1, 1, lname + ".cv1"); 329 | 330 | auto pool1 = network->addPoolingNd(*cv1->getOutput(0), PoolingType::kMAX, DimsHW{k1, k1}); 331 | pool1->setPaddingNd(DimsHW{k1 / 2, k1 / 2}); 332 | pool1->setStrideNd(DimsHW{1, 1}); 333 | auto pool2 = network->addPoolingNd(*cv1->getOutput(0), PoolingType::kMAX, DimsHW{k2, k2}); 334 | pool2->setPaddingNd(DimsHW{k2 / 2, k2 / 2}); 335 | pool2->setStrideNd(DimsHW{1, 1}); 336 | auto pool3 = network->addPoolingNd(*cv1->getOutput(0), PoolingType::kMAX, DimsHW{k3, k3}); 337 | pool3->setPaddingNd(DimsHW{k3 / 2, k3 / 2}); 338 | pool3->setStrideNd(DimsHW{1, 1}); 339 | 340 | ITensor* inputTensors[] = {cv1->getOutput(0), pool1->getOutput(0), pool2->getOutput(0), pool3->getOutput(0)}; 341 | auto cat = network->addConcatenation(inputTensors, 4); 342 | 343 | auto cv2 = convBnLeaky(network, weightMap, *cat->getOutput(0), c2, 1, 1, 1, lname + ".cv2"); 344 | return cv2; 345 | } 346 | 347 | int read_files_in_dir(const char *p_dir_name, std::vector &file_names) { 348 | DIR *p_dir = opendir(p_dir_name); 349 | if (p_dir == nullptr) { 350 | return -1; 351 | } 352 | 353 | struct dirent* p_file = nullptr; 354 | while ((p_file = readdir(p_dir)) != nullptr) { 355 | if (strcmp(p_file->d_name, ".") != 0 && 356 | strcmp(p_file->d_name, "..") != 0) { 357 | //std::string cur_file_name(p_dir_name); 358 | //cur_file_name += "/"; 359 | //cur_file_name += p_file->d_name; 360 | std::string cur_file_name(p_file->d_name); 361 | file_names.push_back(cur_file_name); 362 | } 363 | } 364 | 365 | closedir(p_dir); 366 | return 0; 367 | } 368 | 369 | #endif 370 | 371 | -------------------------------------------------------------------------------- /yolov5l/gen_wts.py: 
-------------------------------------------------------------------------------- 1 | from utils.utils import * 2 | import struct 3 | 4 | # Initialize 5 | device = torch_utils.select_device('0') 6 | # Load model 7 | model = torch.load('weights/yolov5l.pt', map_location=device)['model'].float() # load to FP32 8 | model.to(device).eval() 9 | 10 | f = open('yolov5l.wts', 'w') 11 | f.write('{}\n'.format(len(model.state_dict().keys()))) 12 | for k, v in model.state_dict().items(): 13 | vr = v.reshape(-1).cpu().numpy() 14 | f.write('{} {} '.format(k, len(vr))) 15 | for vv in vr: 16 | f.write(' ') 17 | f.write(struct.pack('>f',float(vv)).hex()) 18 | f.write('\n') 19 | -------------------------------------------------------------------------------- /yolov5l/images/bus.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BaofengZan/yolov5_2.0-TensorRt/78193337dd94bf48c2d55c3a77105a534d11c45b/yolov5l/images/bus.jpg -------------------------------------------------------------------------------- /yolov5l/images/zidane.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BaofengZan/yolov5_2.0-TensorRt/78193337dd94bf48c2d55c3a77105a534d11c45b/yolov5l/images/zidane.jpg -------------------------------------------------------------------------------- /yolov5l/logging.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #ifndef TENSORRT_LOGGING_H 18 | #define TENSORRT_LOGGING_H 19 | 20 | #include "NvInferRuntimeCommon.h" 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | 29 | using Severity = nvinfer1::ILogger::Severity; 30 | 31 | class LogStreamConsumerBuffer : public std::stringbuf 32 | { 33 | public: 34 | LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog) 35 | : mOutput(stream) 36 | , mPrefix(prefix) 37 | , mShouldLog(shouldLog) 38 | { 39 | } 40 | 41 | LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) 42 | : mOutput(other.mOutput) 43 | { 44 | } 45 | 46 | ~LogStreamConsumerBuffer() 47 | { 48 | // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence 49 | // std::streambuf::pptr() gives a pointer to the current position of the output sequence 50 | // if the pointer to the beginning is not equal to the pointer to the current position, 51 | // call putOutput() to log the output to the stream 52 | if (pbase() != pptr()) 53 | { 54 | putOutput(); 55 | } 56 | } 57 | 58 | // synchronizes the stream buffer and returns 0 on success 59 | // synchronizing the stream buffer consists of inserting the buffer contents into the stream, 60 | // resetting the buffer and flushing the stream 61 | virtual int sync() 62 | { 63 | putOutput(); 64 | return 0; 65 | } 66 | 67 | void putOutput() 68 | { 69 | if (mShouldLog) 70 | { 71 | // prepend timestamp 72 | std::time_t timestamp = std::time(nullptr); 73 | tm* tm_local = std::localtime(×tamp); 74 | std::cout << "["; 75 | std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/"; 76 | std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/"; 77 | std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-"; 78 | std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":"; 79 | std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":"; 80 | std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] "; 81 | // std::stringbuf::str() gets the string contents of the buffer 82 | // insert the buffer contents pre-appended by the appropriate prefix into the stream 83 | mOutput << mPrefix << str(); 84 | // set the buffer to empty 85 | str(""); 86 | // flush the stream 87 | mOutput.flush(); 88 | } 89 | } 90 | 91 | void setShouldLog(bool shouldLog) 92 | { 93 | mShouldLog = shouldLog; 94 | } 95 | 96 | private: 97 | std::ostream& mOutput; 98 | std::string mPrefix; 99 | bool mShouldLog; 100 | }; 101 | 102 | //! 103 | //! \class LogStreamConsumerBase 104 | //! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer 105 | //! 106 | class LogStreamConsumerBase 107 | { 108 | public: 109 | LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog) 110 | : mBuffer(stream, prefix, shouldLog) 111 | { 112 | } 113 | 114 | protected: 115 | LogStreamConsumerBuffer mBuffer; 116 | }; 117 | 118 | //! 119 | //! \class LogStreamConsumer 120 | //! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages. 121 | //! Order of base classes is LogStreamConsumerBase and then std::ostream. 122 | //! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field 123 | //! in LogStreamConsumer and then the address of the buffer is passed to std::ostream. 124 | //! 
This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream. 125 | //! Please do not change the order of the parent classes. 126 | //! 127 | class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream 128 | { 129 | public: 130 | //! \brief Creates a LogStreamConsumer which logs messages with level severity. 131 | //! Reportable severity determines if the messages are severe enough to be logged. 132 | LogStreamConsumer(Severity reportableSeverity, Severity severity) 133 | : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity) 134 | , std::ostream(&mBuffer) // links the stream buffer with the stream 135 | , mShouldLog(severity <= reportableSeverity) 136 | , mSeverity(severity) 137 | { 138 | } 139 | 140 | LogStreamConsumer(LogStreamConsumer&& other) 141 | : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog) 142 | , std::ostream(&mBuffer) // links the stream buffer with the stream 143 | , mShouldLog(other.mShouldLog) 144 | , mSeverity(other.mSeverity) 145 | { 146 | } 147 | 148 | void setReportableSeverity(Severity reportableSeverity) 149 | { 150 | mShouldLog = mSeverity <= reportableSeverity; 151 | mBuffer.setShouldLog(mShouldLog); 152 | } 153 | 154 | private: 155 | static std::ostream& severityOstream(Severity severity) 156 | { 157 | return severity >= Severity::kINFO ? std::cout : std::cerr; 158 | } 159 | 160 | static std::string severityPrefix(Severity severity) 161 | { 162 | switch (severity) 163 | { 164 | case Severity::kINTERNAL_ERROR: return "[F] "; 165 | case Severity::kERROR: return "[E] "; 166 | case Severity::kWARNING: return "[W] "; 167 | case Severity::kINFO: return "[I] "; 168 | case Severity::kVERBOSE: return "[V] "; 169 | default: assert(0); return ""; 170 | } 171 | } 172 | 173 | bool mShouldLog; 174 | Severity mSeverity; 175 | }; 176 | 177 | //! \class Logger 178 | //! 179 | //! \brief Class which manages logging of TensorRT tools and samples 180 | //! 181 | //! \details This class provides a common interface for TensorRT tools and samples to log information to the console, 182 | //! and supports logging two types of messages: 183 | //! 184 | //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) 185 | //! - Test pass/fail messages 186 | //! 187 | //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is 188 | //! that the logic for controlling the verbosity and formatting of sample output is centralized in one location. 189 | //! 190 | //! In the future, this class could be extended to support dumping test results to a file in some standard format 191 | //! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run). 192 | //! 193 | //! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger 194 | //! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT 195 | //! library and messages coming from the sample. 196 | //! 197 | //! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the 198 | //! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger 199 | //! object. 
200 | 201 | class Logger : public nvinfer1::ILogger 202 | { 203 | public: 204 | Logger(Severity severity = Severity::kWARNING) 205 | : mReportableSeverity(severity) 206 | { 207 | } 208 | 209 | //! 210 | //! \enum TestResult 211 | //! \brief Represents the state of a given test 212 | //! 213 | enum class TestResult 214 | { 215 | kRUNNING, //!< The test is running 216 | kPASSED, //!< The test passed 217 | kFAILED, //!< The test failed 218 | kWAIVED //!< The test was waived 219 | }; 220 | 221 | //! 222 | //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger 223 | //! \return The nvinfer1::ILogger associated with this Logger 224 | //! 225 | //! TODO Once all samples are updated to use this method to register the logger with TensorRT, 226 | //! we can eliminate the inheritance of Logger from ILogger 227 | //! 228 | nvinfer1::ILogger& getTRTLogger() 229 | { 230 | return *this; 231 | } 232 | 233 | //! 234 | //! \brief Implementation of the nvinfer1::ILogger::log() virtual method 235 | //! 236 | //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the 237 | //! inheritance from nvinfer1::ILogger 238 | //! 239 | void log(Severity severity, const char* msg) override 240 | { 241 | LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl; 242 | } 243 | 244 | //! 245 | //! \brief Method for controlling the verbosity of logging output 246 | //! 247 | //! \param severity The logger will only emit messages that have severity of this level or higher. 248 | //! 249 | void setReportableSeverity(Severity severity) 250 | { 251 | mReportableSeverity = severity; 252 | } 253 | 254 | //! 255 | //! \brief Opaque handle that holds logging information for a particular test 256 | //! 257 | //! This object is an opaque handle to information used by the Logger to print test results. 258 | //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used 259 | //! with Logger::reportTest{Start,End}(). 260 | //! 261 | class TestAtom 262 | { 263 | public: 264 | TestAtom(TestAtom&&) = default; 265 | 266 | private: 267 | friend class Logger; 268 | 269 | TestAtom(bool started, const std::string& name, const std::string& cmdline) 270 | : mStarted(started) 271 | , mName(name) 272 | , mCmdline(cmdline) 273 | { 274 | } 275 | 276 | bool mStarted; 277 | std::string mName; 278 | std::string mCmdline; 279 | }; 280 | 281 | //! 282 | //! \brief Define a test for logging 283 | //! 284 | //! \param[in] name The name of the test. This should be a string starting with 285 | //! "TensorRT" and containing dot-separated strings containing 286 | //! the characters [A-Za-z0-9_]. 287 | //! For example, "TensorRT.sample_googlenet" 288 | //! \param[in] cmdline The command line used to reproduce the test 289 | // 290 | //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). 291 | //! 292 | static TestAtom defineTest(const std::string& name, const std::string& cmdline) 293 | { 294 | return TestAtom(false, name, cmdline); 295 | } 296 | 297 | //! 298 | //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments 299 | //! as input 300 | //! 301 | //! \param[in] name The name of the test 302 | //! \param[in] argc The number of command-line arguments 303 | //! \param[in] argv The array of command-line arguments (given as C strings) 304 | //! 305 | //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). 
306 | static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) 307 | { 308 | auto cmdline = genCmdlineString(argc, argv); 309 | return defineTest(name, cmdline); 310 | } 311 | 312 | //! 313 | //! \brief Report that a test has started. 314 | //! 315 | //! \pre reportTestStart() has not been called yet for the given testAtom 316 | //! 317 | //! \param[in] testAtom The handle to the test that has started 318 | //! 319 | static void reportTestStart(TestAtom& testAtom) 320 | { 321 | reportTestResult(testAtom, TestResult::kRUNNING); 322 | assert(!testAtom.mStarted); 323 | testAtom.mStarted = true; 324 | } 325 | 326 | //! 327 | //! \brief Report that a test has ended. 328 | //! 329 | //! \pre reportTestStart() has been called for the given testAtom 330 | //! 331 | //! \param[in] testAtom The handle to the test that has ended 332 | //! \param[in] result The result of the test. Should be one of TestResult::kPASSED, 333 | //! TestResult::kFAILED, TestResult::kWAIVED 334 | //! 335 | static void reportTestEnd(const TestAtom& testAtom, TestResult result) 336 | { 337 | assert(result != TestResult::kRUNNING); 338 | assert(testAtom.mStarted); 339 | reportTestResult(testAtom, result); 340 | } 341 | 342 | static int reportPass(const TestAtom& testAtom) 343 | { 344 | reportTestEnd(testAtom, TestResult::kPASSED); 345 | return EXIT_SUCCESS; 346 | } 347 | 348 | static int reportFail(const TestAtom& testAtom) 349 | { 350 | reportTestEnd(testAtom, TestResult::kFAILED); 351 | return EXIT_FAILURE; 352 | } 353 | 354 | static int reportWaive(const TestAtom& testAtom) 355 | { 356 | reportTestEnd(testAtom, TestResult::kWAIVED); 357 | return EXIT_SUCCESS; 358 | } 359 | 360 | static int reportTest(const TestAtom& testAtom, bool pass) 361 | { 362 | return pass ? reportPass(testAtom) : reportFail(testAtom); 363 | } 364 | 365 | Severity getReportableSeverity() const 366 | { 367 | return mReportableSeverity; 368 | } 369 | 370 | private: 371 | //! 372 | //! \brief returns an appropriate string for prefixing a log message with the given severity 373 | //! 374 | static const char* severityPrefix(Severity severity) 375 | { 376 | switch (severity) 377 | { 378 | case Severity::kINTERNAL_ERROR: return "[F] "; 379 | case Severity::kERROR: return "[E] "; 380 | case Severity::kWARNING: return "[W] "; 381 | case Severity::kINFO: return "[I] "; 382 | case Severity::kVERBOSE: return "[V] "; 383 | default: assert(0); return ""; 384 | } 385 | } 386 | 387 | //! 388 | //! \brief returns an appropriate string for prefixing a test result message with the given result 389 | //! 390 | static const char* testResultString(TestResult result) 391 | { 392 | switch (result) 393 | { 394 | case TestResult::kRUNNING: return "RUNNING"; 395 | case TestResult::kPASSED: return "PASSED"; 396 | case TestResult::kFAILED: return "FAILED"; 397 | case TestResult::kWAIVED: return "WAIVED"; 398 | default: assert(0); return ""; 399 | } 400 | } 401 | 402 | //! 403 | //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity 404 | //! 405 | static std::ostream& severityOstream(Severity severity) 406 | { 407 | return severity >= Severity::kINFO ? std::cout : std::cerr; 408 | } 409 | 410 | //! 411 | //! \brief method that implements logging test results 412 | //! 
413 | static void reportTestResult(const TestAtom& testAtom, TestResult result) 414 | { 415 | severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " 416 | << testAtom.mCmdline << std::endl; 417 | } 418 | 419 | //! 420 | //! \brief generate a command line string from the given (argc, argv) values 421 | //! 422 | static std::string genCmdlineString(int argc, char const* const* argv) 423 | { 424 | std::stringstream ss; 425 | for (int i = 0; i < argc; i++) 426 | { 427 | if (i > 0) 428 | ss << " "; 429 | ss << argv[i]; 430 | } 431 | return ss.str(); 432 | } 433 | 434 | Severity mReportableSeverity; 435 | }; 436 | 437 | namespace 438 | { 439 | 440 | //! 441 | //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE 442 | //! 443 | //! Example usage: 444 | //! 445 | //! LOG_VERBOSE(logger) << "hello world" << std::endl; 446 | //! 447 | inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) 448 | { 449 | return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE); 450 | } 451 | 452 | //! 453 | //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO 454 | //! 455 | //! Example usage: 456 | //! 457 | //! LOG_INFO(logger) << "hello world" << std::endl; 458 | //! 459 | inline LogStreamConsumer LOG_INFO(const Logger& logger) 460 | { 461 | return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO); 462 | } 463 | 464 | //! 465 | //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING 466 | //! 467 | //! Example usage: 468 | //! 469 | //! LOG_WARN(logger) << "hello world" << std::endl; 470 | //! 471 | inline LogStreamConsumer LOG_WARN(const Logger& logger) 472 | { 473 | return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING); 474 | } 475 | 476 | //! 477 | //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR 478 | //! 479 | //! Example usage: 480 | //! 481 | //! LOG_ERROR(logger) << "hello world" << std::endl; 482 | //! 483 | inline LogStreamConsumer LOG_ERROR(const Logger& logger) 484 | { 485 | return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR); 486 | } 487 | 488 | //! 489 | //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR 490 | // ("fatal" severity) 491 | //! 492 | //! Example usage: 493 | //! 494 | //! LOG_FATAL(logger) << "hello world" << std::endl; 495 | //! 
496 | inline LogStreamConsumer LOG_FATAL(const Logger& logger) 497 | { 498 | return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR); 499 | } 500 | 501 | } // anonymous namespace 502 | 503 | #endif // TENSORRT_LOGGING_H 504 | -------------------------------------------------------------------------------- /yolov5l/utils.h: -------------------------------------------------------------------------------- 1 | #ifndef __TRT_UTILS_H_ 2 | #define __TRT_UTILS_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #ifndef CUDA_CHECK 10 | 11 | #define CUDA_CHECK(callstr) \ 12 | { \ 13 | cudaError_t error_code = callstr; \ 14 | if (error_code != cudaSuccess) { \ 15 | std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \ 16 | assert(0); \ 17 | } \ 18 | } 19 | 20 | #endif 21 | 22 | namespace Tn 23 | { 24 | class Profiler : public nvinfer1::IProfiler 25 | { 26 | public: 27 | void printLayerTimes(int itrationsTimes) 28 | { 29 | float totalTime = 0; 30 | for (size_t i = 0; i < mProfile.size(); i++) 31 | { 32 | printf("%-40.40s %4.3fms\n", mProfile[i].first.c_str(), mProfile[i].second / itrationsTimes); 33 | totalTime += mProfile[i].second; 34 | } 35 | printf("Time over all layers: %4.3f\n", totalTime / itrationsTimes); 36 | } 37 | private: 38 | typedef std::pair Record; 39 | std::vector mProfile; 40 | 41 | virtual void reportLayerTime(const char* layerName, float ms) 42 | { 43 | auto record = std::find_if(mProfile.begin(), mProfile.end(), [&](const Record& r){ return r.first == layerName; }); 44 | if (record == mProfile.end()) 45 | mProfile.push_back(std::make_pair(layerName, ms)); 46 | else 47 | record->second += ms; 48 | } 49 | }; 50 | 51 | //Logger for TensorRT info/warning/errors 52 | class Logger : public nvinfer1::ILogger 53 | { 54 | public: 55 | 56 | Logger(): Logger(Severity::kWARNING) {} 57 | 58 | Logger(Severity severity): reportableSeverity(severity) {} 59 | 60 | void log(Severity severity, const char* msg) override 61 | { 62 | // suppress messages with severity enum value greater than the reportable 63 | if (severity > reportableSeverity) return; 64 | 65 | switch (severity) 66 | { 67 | case Severity::kINTERNAL_ERROR: std::cerr << "INTERNAL_ERROR: "; break; 68 | case Severity::kERROR: std::cerr << "ERROR: "; break; 69 | case Severity::kWARNING: std::cerr << "WARNING: "; break; 70 | case Severity::kINFO: std::cerr << "INFO: "; break; 71 | default: std::cerr << "UNKNOWN: "; break; 72 | } 73 | std::cerr << msg << std::endl; 74 | } 75 | 76 | Severity reportableSeverity{Severity::kWARNING}; 77 | }; 78 | 79 | template 80 | void write(char*& buffer, const T& val) 81 | { 82 | *reinterpret_cast(buffer) = val; 83 | buffer += sizeof(T); 84 | } 85 | 86 | template 87 | void read(const char*& buffer, T& val) 88 | { 89 | val = *reinterpret_cast(buffer); 90 | buffer += sizeof(T); 91 | } 92 | } 93 | 94 | #endif -------------------------------------------------------------------------------- /yolov5l/yololayer.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "yololayer.h" 3 | #include "utils.h" 4 | 5 | using namespace Yolo; 6 | 7 | namespace nvinfer1 8 | { 9 | YoloLayerPlugin::YoloLayerPlugin() 10 | { 11 | mClassCount = CLASS_NUM; 12 | mYoloKernel.clear(); 13 | mYoloKernel.push_back(yolo1); 14 | mYoloKernel.push_back(yolo2); 15 | mYoloKernel.push_back(yolo3); 16 | 17 | mKernelCount = mYoloKernel.size(); 18 | 19 | CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*))); 
20 | size_t AnchorLen = sizeof(float)* CHECK_COUNT*2; 21 | for(int ii = 0; ii < mKernelCount; ii ++) 22 | { 23 | CUDA_CHECK(cudaMalloc(&mAnchor[ii],AnchorLen)); 24 | const auto& yolo = mYoloKernel[ii]; 25 | CUDA_CHECK(cudaMemcpy(mAnchor[ii], yolo.anchors, AnchorLen, cudaMemcpyHostToDevice)); 26 | } 27 | } 28 | 29 | YoloLayerPlugin::~YoloLayerPlugin() 30 | { 31 | } 32 | 33 | // create the plugin at runtime from a byte stream 34 | YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) 35 | { 36 | using namespace Tn; 37 | const char *d = reinterpret_cast(data), *a = d; 38 | read(d, mClassCount); 39 | read(d, mThreadCount); 40 | read(d, mKernelCount); 41 | mYoloKernel.resize(mKernelCount); 42 | auto kernelSize = mKernelCount*sizeof(YoloKernel); 43 | memcpy(mYoloKernel.data(),d,kernelSize); 44 | d += kernelSize; 45 | 46 | CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*))); 47 | size_t AnchorLen = sizeof(float)* CHECK_COUNT*2; 48 | for(int ii = 0; ii < mKernelCount; ii ++) 49 | { 50 | CUDA_CHECK(cudaMalloc(&mAnchor[ii],AnchorLen)); 51 | const auto& yolo = mYoloKernel[ii]; 52 | CUDA_CHECK(cudaMemcpy(mAnchor[ii], yolo.anchors, AnchorLen, cudaMemcpyHostToDevice)); 53 | } 54 | 55 | assert(d == a + length); 56 | } 57 | 58 | void YoloLayerPlugin::serialize(void* buffer) const 59 | { 60 | using namespace Tn; 61 | char* d = static_cast(buffer), *a = d; 62 | write(d, mClassCount); 63 | write(d, mThreadCount); 64 | write(d, mKernelCount); 65 | auto kernelSize = mKernelCount*sizeof(YoloKernel); 66 | memcpy(d,mYoloKernel.data(),kernelSize); 67 | d += kernelSize; 68 | 69 | assert(d == a + getSerializationSize()); 70 | } 71 | 72 | size_t YoloLayerPlugin::getSerializationSize() const 73 | { 74 | return sizeof(mClassCount) + sizeof(mThreadCount) + sizeof(mKernelCount) + sizeof(Yolo::YoloKernel) * mYoloKernel.size(); 75 | } 76 | 77 | int YoloLayerPlugin::initialize() 78 | { 79 | return 0; 80 | } 81 | 82 | Dims YoloLayerPlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims) 83 | { 84 | //output the result to channel 85 | int totalsize = MAX_OUTPUT_BBOX_COUNT * sizeof(Detection) / sizeof(float); 86 | 87 | return Dims3(totalsize + 1, 1, 1); 88 | } 89 | 90 | // Set plugin namespace 91 | void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace) 92 | { 93 | mPluginNamespace = pluginNamespace; 94 | } 95 | 96 | const char* YoloLayerPlugin::getPluginNamespace() const 97 | { 98 | return mPluginNamespace; 99 | } 100 | 101 | // Return the DataType of the plugin output at the requested index 102 | DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const 103 | { 104 | return DataType::kFLOAT; 105 | } 106 | 107 | // Return true if output tensor is broadcast across a batch. 108 | bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const 109 | { 110 | return false; 111 | } 112 | 113 | // Return true if plugin can use input that is broadcast across batch without replication. 114 | bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const 115 | { 116 | return false; 117 | } 118 | 119 | void YoloLayerPlugin::configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) 120 | { 121 | } 122 | 123 | // Attach the plugin object to an execution context and grant the plugin the access to some context resource. 
124 | void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) 125 | { 126 | } 127 | 128 | // Detach the plugin object from its execution context. 129 | void YoloLayerPlugin::detachFromContext() {} 130 | 131 | const char* YoloLayerPlugin::getPluginType() const 132 | { 133 | return "YoloLayer_TRT"; 134 | } 135 | 136 | const char* YoloLayerPlugin::getPluginVersion() const 137 | { 138 | return "1"; 139 | } 140 | 141 | void YoloLayerPlugin::destroy() 142 | { 143 | delete this; 144 | } 145 | 146 | // Clone the plugin 147 | IPluginV2IOExt* YoloLayerPlugin::clone() const 148 | { 149 | YoloLayerPlugin *p = new YoloLayerPlugin(); 150 | p->setPluginNamespace(mPluginNamespace); 151 | return p; 152 | } 153 | 154 | __device__ float Logist(float data){ return 1.0f / (1.0f + expf(-data)); }; 155 | 156 | __global__ void CalDetection(const float *input, float *output,int noElements, 157 | int yoloWidth,int yoloHeight,const float anchors[CHECK_COUNT*2],int classes,int outputElem) { 158 | 159 | int idx = threadIdx.x + blockDim.x * blockIdx.x; 160 | if (idx >= noElements) return; 161 | 162 | int total_grid = yoloWidth * yoloHeight; 163 | int bnIdx = idx / total_grid; 164 | idx = idx - total_grid*bnIdx; 165 | int info_len_i = 5 + classes; 166 | const float* curInput = input + bnIdx * (info_len_i * total_grid * CHECK_COUNT); 167 | 168 | for (int k = 0; k < 3; ++k) { 169 | float box_prob = Logist(curInput[idx + k * info_len_i * total_grid + 4 * total_grid]); 170 | if (box_prob < IGNORE_THRESH) continue; 171 | int class_id = 0; 172 | float max_cls_prob = 0.0; 173 | for (int i = 5; i < info_len_i; ++i) { 174 | float p = Logist(curInput[idx + k * info_len_i * total_grid + i * total_grid]); 175 | if (p > max_cls_prob) { 176 | max_cls_prob = p; 177 | class_id = i - 5; 178 | } 179 | } 180 | float *res_count = output + bnIdx*outputElem; 181 | int count = (int)atomicAdd(res_count, 1); 182 | if (count >= MAX_OUTPUT_BBOX_COUNT) return; 183 | char* data = (char *)res_count + sizeof(float) + count * sizeof(Detection); 184 | Detection* det = (Detection*)(data); 185 | 186 | int row = idx / yoloWidth; 187 | int col = idx % yoloWidth; 188 | 189 | //Location 190 | det->bbox[0] = (col - 0.5f + 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 0 * total_grid])) * INPUT_W / yoloWidth; 191 | det->bbox[1] = (row - 0.5f + 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 1 * total_grid])) * INPUT_H / yoloHeight; 192 | det->bbox[2] = 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 2 * total_grid]); 193 | det->bbox[2] = det->bbox[2] * det->bbox[2] * anchors[2*k]; 194 | det->bbox[3] = 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 3 * total_grid]); 195 | det->bbox[3] = det->bbox[3] * det->bbox[3] * anchors[2*k + 1]; 196 | det->conf = box_prob * max_cls_prob; 197 | det->class_id = class_id; 198 | } 199 | } 200 | 201 | void YoloLayerPlugin::forwardGpu(const float *const * inputs, float* output, cudaStream_t stream, int batchSize) { 202 | 203 | int outputElem = 1 + MAX_OUTPUT_BBOX_COUNT * sizeof(Detection) / sizeof(float); 204 | 205 | for(int idx = 0 ; idx < batchSize; ++idx) { 206 | CUDA_CHECK(cudaMemset(output + idx*outputElem, 0, sizeof(float))); 207 | } 208 | int numElem = 0; 209 | for (unsigned int i = 0; i < mYoloKernel.size(); ++i) 210 | { 211 | const auto& yolo = mYoloKernel[i]; 212 | numElem = yolo.width*yolo.height*batchSize; 213 | if (numElem < mThreadCount) 214 | mThreadCount = numElem; 215 | CalDetection<<< 
(yolo.width*yolo.height*batchSize + mThreadCount - 1) / mThreadCount, mThreadCount>>> 216 | (inputs[i], output, numElem, yolo.width, yolo.height, (float *)mAnchor[i], mClassCount, outputElem); 217 | } 218 | 219 | } 220 | 221 | 222 | int YoloLayerPlugin::enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) 223 | { 224 | forwardGpu((const float *const *)inputs, (float*)outputs[0], stream, batchSize); 225 | return 0; 226 | } 227 | 228 | PluginFieldCollection YoloPluginCreator::mFC{}; 229 | std::vector YoloPluginCreator::mPluginAttributes; 230 | 231 | YoloPluginCreator::YoloPluginCreator() 232 | { 233 | mPluginAttributes.clear(); 234 | 235 | mFC.nbFields = mPluginAttributes.size(); 236 | mFC.fields = mPluginAttributes.data(); 237 | } 238 | 239 | const char* YoloPluginCreator::getPluginName() const 240 | { 241 | return "YoloLayer_TRT"; 242 | } 243 | 244 | const char* YoloPluginCreator::getPluginVersion() const 245 | { 246 | return "1"; 247 | } 248 | 249 | const PluginFieldCollection* YoloPluginCreator::getFieldNames() 250 | { 251 | return &mFC; 252 | } 253 | 254 | IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) 255 | { 256 | YoloLayerPlugin* obj = new YoloLayerPlugin(); 257 | obj->setPluginNamespace(mNamespace.c_str()); 258 | return obj; 259 | } 260 | 261 | IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) 262 | { 263 | // This object will be deleted when the network is destroyed, which will 264 | // call MishPlugin::destroy() 265 | YoloLayerPlugin* obj = new YoloLayerPlugin(serialData, serialLength); 266 | obj->setPluginNamespace(mNamespace.c_str()); 267 | return obj; 268 | } 269 | 270 | } 271 | -------------------------------------------------------------------------------- /yolov5l/yololayer.h: -------------------------------------------------------------------------------- 1 | #ifndef _YOLO_LAYER_H 2 | #define _YOLO_LAYER_H 3 | 4 | #include 5 | #include 6 | #include "NvInfer.h" 7 | 8 | namespace Yolo 9 | { 10 | static constexpr int CHECK_COUNT = 3; 11 | static constexpr float IGNORE_THRESH = 0.1f; 12 | static constexpr int MAX_OUTPUT_BBOX_COUNT = 1000; 13 | static constexpr int CLASS_NUM = 80; 14 | static constexpr int INPUT_H = 608; 15 | static constexpr int INPUT_W = 608; 16 | 17 | struct YoloKernel 18 | { 19 | int width; 20 | int height; 21 | float anchors[CHECK_COUNT*2]; 22 | }; 23 | 24 | static constexpr YoloKernel yolo1 = { 25 | INPUT_W / 32, 26 | INPUT_H / 32, 27 | {116,90, 156,198, 373,326} 28 | }; 29 | static constexpr YoloKernel yolo2 = { 30 | INPUT_W / 16, 31 | INPUT_H / 16, 32 | {30,61, 62,45, 59,119} 33 | }; 34 | static constexpr YoloKernel yolo3 = { 35 | INPUT_W / 8, 36 | INPUT_H / 8, 37 | {10,13, 16,30, 33,23} 38 | }; 39 | 40 | static constexpr int LOCATIONS = 4; 41 | struct alignas(float) Detection{ 42 | //center_x center_y w h 43 | float bbox[LOCATIONS]; 44 | float conf; // bbox_conf * cls_conf 45 | float class_id; 46 | }; 47 | } 48 | 49 | namespace nvinfer1 50 | { 51 | class YoloLayerPlugin: public IPluginV2IOExt 52 | { 53 | public: 54 | explicit YoloLayerPlugin(); 55 | YoloLayerPlugin(const void* data, size_t length); 56 | 57 | ~YoloLayerPlugin(); 58 | 59 | int getNbOutputs() const override 60 | { 61 | return 1; 62 | } 63 | 64 | Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override; 65 | 66 | int initialize() override; 67 | 68 | virtual void terminate() override {}; 69 | 70 | 
virtual size_t getWorkspaceSize(int maxBatchSize) const override { return 0;} 71 | 72 | virtual int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override; 73 | 74 | virtual size_t getSerializationSize() const override; 75 | 76 | virtual void serialize(void* buffer) const override; 77 | 78 | bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const override { 79 | return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT; 80 | } 81 | 82 | const char* getPluginType() const override; 83 | 84 | const char* getPluginVersion() const override; 85 | 86 | void destroy() override; 87 | 88 | IPluginV2IOExt* clone() const override; 89 | 90 | void setPluginNamespace(const char* pluginNamespace) override; 91 | 92 | const char* getPluginNamespace() const override; 93 | 94 | DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const override; 95 | 96 | bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const override; 97 | 98 | bool canBroadcastInputAcrossBatch(int inputIndex) const override; 99 | 100 | void attachToContext( 101 | cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) override; 102 | 103 | void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) override; 104 | 105 | void detachFromContext() override; 106 | 107 | private: 108 | void forwardGpu(const float *const * inputs,float * output, cudaStream_t stream,int batchSize = 1); 109 | int mClassCount; 110 | int mKernelCount; 111 | std::vector mYoloKernel; 112 | int mThreadCount = 256; 113 | void** mAnchor; 114 | const char* mPluginNamespace; 115 | }; 116 | 117 | class YoloPluginCreator : public IPluginCreator 118 | { 119 | public: 120 | YoloPluginCreator(); 121 | 122 | ~YoloPluginCreator() override = default; 123 | 124 | const char* getPluginName() const override; 125 | 126 | const char* getPluginVersion() const override; 127 | 128 | const PluginFieldCollection* getFieldNames() override; 129 | 130 | IPluginV2IOExt* createPlugin(const char* name, const PluginFieldCollection* fc) override; 131 | 132 | IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) override; 133 | 134 | void setPluginNamespace(const char* libNamespace) override 135 | { 136 | mNamespace = libNamespace; 137 | } 138 | 139 | const char* getPluginNamespace() const override 140 | { 141 | return mNamespace.c_str(); 142 | } 143 | 144 | private: 145 | std::string mNamespace; 146 | static PluginFieldCollection mFC; 147 | static std::vector mPluginAttributes; 148 | }; 149 | 150 | 151 | 152 | }; 153 | 154 | #endif 155 | -------------------------------------------------------------------------------- /yolov5l/yolov5l.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "cuda_runtime_api.h" 4 | #include "logging.h" 5 | #include "common.hpp" 6 | 7 | #define USE_FP16 // comment out this if want to use FP32 8 | #define DEVICE 0 // GPU id 9 | #define NMS_THRESH 0.5 10 | #define CONF_THRESH 0.25 11 | #define BATCH_SIZE 1 12 | 13 | // stuff we know about the network and the input/output blobs 14 | static const int INPUT_H = Yolo::INPUT_H; 15 | static const int INPUT_W = Yolo::INPUT_W; 16 | static const int OUTPUT_SIZE = Yolo::MAX_OUTPUT_BBOX_COUNT * sizeof(Yolo::Detection) / 
sizeof(float) + 1; // we assume the yololayer outputs no more than 1000 boxes that conf >= 0.1 17 | const char* INPUT_BLOB_NAME = "data"; 18 | const char* OUTPUT_BLOB_NAME = "prob"; 19 | static Logger gLogger; 20 | REGISTER_TENSORRT_PLUGIN(YoloPluginCreator); 21 | 22 | // Creat the engine using only the API and not any parser. 23 | ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) { 24 | INetworkDefinition* network = builder->createNetworkV2(0U); 25 | 26 | // Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME 27 | ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W}); 28 | assert(data); 29 | 30 | std::map weightMap = loadWeights("../yolov5l.wts"); 31 | Weights emptywts{DataType::kFLOAT, nullptr, 0}; 32 | 33 | /* ------ yolov5 backbone------ */ 34 | auto focus0 = focus(network, weightMap, *data, 3, 64, 3, "model.0"); 35 | auto conv1 = convBnLeaky(network, weightMap, *focus0->getOutput(0), 128, 3, 2, 1, "model.1"); 36 | auto bottleneck_CSP2 = bottleneckCSP(network, weightMap, *conv1->getOutput(0), 128, 128, 3, true, 1, 0.5, "model.2"); 37 | auto conv3 = convBnLeaky(network, weightMap, *bottleneck_CSP2->getOutput(0), 256, 3, 2, 1, "model.3"); 38 | auto bottleneck_csp4 = bottleneckCSP(network, weightMap, *conv3->getOutput(0), 256, 256, 9, true, 1, 0.5, "model.4"); 39 | auto conv5 = convBnLeaky(network, weightMap, *bottleneck_csp4->getOutput(0), 512, 3, 2, 1, "model.5"); 40 | auto bottleneck_csp6 = bottleneckCSP(network, weightMap, *conv5->getOutput(0), 512, 512, 9, true, 1, 0.5, "model.6"); 41 | auto conv7 = convBnLeaky(network, weightMap, *bottleneck_csp6->getOutput(0), 1024, 3, 2, 1, "model.7"); 42 | auto spp8 = SPP(network, weightMap, *conv7->getOutput(0), 1024, 1024, 5, 9, 13, "model.8"); 43 | 44 | /* ------ yolov5 head ------ */ 45 | auto bottleneck_csp9 = bottleneckCSP(network, weightMap, *spp8->getOutput(0), 1024, 1024, 3, false, 1, 0.5, "model.9"); 46 | auto conv10 = convBnLeaky(network, weightMap, *bottleneck_csp9->getOutput(0), 512, 1, 1, 1, "model.10"); 47 | 48 | float *deval = reinterpret_cast(malloc(sizeof(float) * 512 * 2 * 2)); 49 | for (int i = 0; i < 512 * 2 * 2; i++) { 50 | deval[i] = 1.0; 51 | } 52 | Weights deconvwts11{DataType::kFLOAT, deval, 512 * 2 * 2}; 53 | IDeconvolutionLayer* deconv11 = network->addDeconvolutionNd(*conv10->getOutput(0), 512, DimsHW{2, 2}, deconvwts11, emptywts); 54 | deconv11->setStrideNd(DimsHW{2, 2}); 55 | deconv11->setNbGroups(512); 56 | weightMap["deconv11"] = deconvwts11; 57 | 58 | ITensor* inputTensors12[] = {deconv11->getOutput(0), bottleneck_csp6->getOutput(0)}; 59 | auto cat12 = network->addConcatenation(inputTensors12, 2); 60 | auto bottleneck_csp13 = bottleneckCSP(network, weightMap, *cat12->getOutput(0), 1024, 512, 3, false, 1, 0.5, "model.13"); 61 | auto conv14 = convBnLeaky(network, weightMap, *bottleneck_csp13->getOutput(0), 256, 1, 1, 1, "model.14"); 62 | 63 | Weights deconvwts15{DataType::kFLOAT, deval, 256 * 2 * 2}; 64 | IDeconvolutionLayer* deconv15 = network->addDeconvolutionNd(*conv14->getOutput(0), 256, DimsHW{2, 2}, deconvwts15, emptywts); 65 | deconv15->setStrideNd(DimsHW{2, 2}); 66 | deconv15->setNbGroups(256); 67 | ITensor* inputTensors16[] = {deconv15->getOutput(0), bottleneck_csp4->getOutput(0)}; 68 | auto cat16 = network->addConcatenation(inputTensors16, 2); 69 | 70 | auto bottleneck_csp17 = bottleneckCSP(network, weightMap, *cat16->getOutput(0), 512, 256, 3, false, 1, 0.5, "model.17"); 71 | 72 | //yolo layer 
1 73 | IConvolutionLayer* conv18 = network->addConvolutionNd(*bottleneck_csp17->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["model.24.m.0.weight"], weightMap["model.24.m.0.bias"]); 74 | 75 | auto conv19 = convBnLeaky(network, weightMap, *bottleneck_csp17->getOutput(0), 256, 3, 2, 1, "model.18"); 76 | 77 | // yolo layer 2 78 | ITensor* inputTensors20[] = {conv19->getOutput(0), conv14->getOutput(0)}; 79 | auto cat20 = network->addConcatenation(inputTensors20, 2); 80 | 81 | auto bottleneck_csp21 = bottleneckCSP(network, weightMap, *cat20->getOutput(0), 512, 512, 3, false, 1, 0.5, "model.20"); 82 | 83 | //yolo layer 3 84 | IConvolutionLayer* conv22 = network->addConvolutionNd(*bottleneck_csp21->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["model.24.m.1.weight"], weightMap["model.24.m.1.bias"]); 85 | 86 | auto conv23 = convBnLeaky(network, weightMap, *bottleneck_csp21->getOutput(0), 512, 3, 2, 1, "model.21"); 87 | 88 | ITensor* inputTensors24[] = {conv23->getOutput(0), conv10->getOutput(0)}; 89 | auto cat24 = network->addConcatenation(inputTensors24, 2); 90 | 91 | auto bottleneck_csp25 = bottleneckCSP(network, weightMap, *cat24->getOutput(0), 1024, 1024, 3, false, 1, 0.5, "model.23"); 92 | 93 | IConvolutionLayer* conv26 = network->addConvolutionNd(*bottleneck_csp25->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["model.24.m.2.weight"], weightMap["model.24.m.2.bias"]); 94 | 95 | auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1"); 96 | const PluginFieldCollection* pluginData = creator->getFieldNames(); 97 | IPluginV2 *pluginObj = creator->createPlugin("yololayer", pluginData); 98 | ITensor* inputTensors_yolo[] = {conv26->getOutput(0), conv22->getOutput(0), conv18->getOutput(0)}; 99 | auto yolo = network->addPluginV2(inputTensors_yolo, 3, *pluginObj); 100 | 101 | yolo->getOutput(0)->setName(OUTPUT_BLOB_NAME); 102 | network->markOutput(*yolo->getOutput(0)); 103 | 104 | // Build engine 105 | builder->setMaxBatchSize(maxBatchSize); 106 | config->setMaxWorkspaceSize(16 * (1 << 20)); // 16MB 107 | #ifdef USE_FP16 108 | config->setFlag(BuilderFlag::kFP16); 109 | #endif 110 | std::cout << "Building engine, please wait for a while..." << std::endl; 111 | ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); 112 | std::cout << "Build engine successfully!" << std::endl; 113 | 114 | // Don't need the network any more 115 | network->destroy(); 116 | 117 | // Release host memory 118 | for (auto& mem : weightMap) 119 | { 120 | free((void*) (mem.second.values)); 121 | } 122 | 123 | return engine; 124 | } 125 | 126 | void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) { 127 | // Create builder 128 | IBuilder* builder = createInferBuilder(gLogger); 129 | IBuilderConfig* config = builder->createBuilderConfig(); 130 | 131 | // Create model to populate the network, then set the outputs and create an engine 132 | ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT); 133 | assert(engine != nullptr); 134 | 135 | // Serialize the engine 136 | (*modelStream) = engine->serialize(); 137 | 138 | // Close everything down 139 | engine->destroy(); 140 | builder->destroy(); 141 | } 142 | 143 | void doInference(IExecutionContext& context, float* input, float* output, int batchSize) { 144 | const ICudaEngine& engine = context.getEngine(); 145 | 146 | // Pointers to input and output device buffers to pass to engine. 
147 | // Engine requires exactly IEngine::getNbBindings() number of buffers. 148 | assert(engine.getNbBindings() == 2); 149 | void* buffers[2]; 150 | 151 | // In order to bind the buffers, we need to know the names of the input and output tensors. 152 | // Note that indices are guaranteed to be less than IEngine::getNbBindings() 153 | const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME); 154 | const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME); 155 | 156 | // Create GPU buffers on device 157 | CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * INPUT_H * INPUT_W * sizeof(float))); 158 | CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float))); 159 | 160 | // Create stream 161 | cudaStream_t stream; 162 | CHECK(cudaStreamCreate(&stream)); 163 | 164 | // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host 165 | CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream)); 166 | context.enqueue(batchSize, buffers, stream, nullptr); 167 | CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream)); 168 | cudaStreamSynchronize(stream); 169 | 170 | // Release stream and buffers 171 | cudaStreamDestroy(stream); 172 | CHECK(cudaFree(buffers[inputIndex])); 173 | CHECK(cudaFree(buffers[outputIndex])); 174 | } 175 | 176 | int main(int argc, char** argv) { 177 | cudaSetDevice(DEVICE); 178 | // create a model using the API directly and serialize it to a stream 179 | char *trtModelStream{nullptr}; 180 | size_t size{0}; 181 | 182 | if (argc == 2 && std::string(argv[1]) == "-s") { 183 | IHostMemory* modelStream{nullptr}; 184 | APIToModel(BATCH_SIZE, &modelStream); 185 | assert(modelStream != nullptr); 186 | std::ofstream p("yolov5l.engine", std::ios::binary); 187 | if (!p) { 188 | std::cerr << "could not open plan output file" << std::endl; 189 | return -1; 190 | } 191 | p.write(reinterpret_cast(modelStream->data()), modelStream->size()); 192 | modelStream->destroy(); 193 | return 0; 194 | } else if (argc == 3 && std::string(argv[1]) == "-d") { 195 | std::ifstream file("yolov5l.engine", std::ios::binary); 196 | if (file.good()) { 197 | file.seekg(0, file.end); 198 | size = file.tellg(); 199 | file.seekg(0, file.beg); 200 | trtModelStream = new char[size]; 201 | assert(trtModelStream); 202 | file.read(trtModelStream, size); 203 | file.close(); 204 | } 205 | } else { 206 | std::cerr << "arguments not right!" << std::endl; 207 | std::cerr << "./yolov5l -s // serialize model to plan file" << std::endl; 208 | std::cerr << "./yolov5l -d ../samples // deserialize plan file and run inference" << std::endl; 209 | return -1; 210 | } 211 | 212 | std::vector file_names; 213 | if (read_files_in_dir(argv[2], file_names) < 0) { 214 | std::cout << "read_files_in_dir failed." 
<< std::endl; 215 | return -1; 216 | } 217 | 218 | // prepare input data --------------------------- 219 | static float data[BATCH_SIZE * 3 * INPUT_H * INPUT_W]; 220 | //for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++) 221 | // data[i] = 1.0; 222 | static float prob[BATCH_SIZE * OUTPUT_SIZE]; 223 | IRuntime* runtime = createInferRuntime(gLogger); 224 | assert(runtime != nullptr); 225 | ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size); 226 | assert(engine != nullptr); 227 | IExecutionContext* context = engine->createExecutionContext(); 228 | assert(context != nullptr); 229 | delete[] trtModelStream; 230 | 231 | int fcount = 0; 232 | for (int f = 0; f < (int)file_names.size(); f++) { 233 | fcount++; 234 | if (fcount < BATCH_SIZE && f + 1 != (int)file_names.size()) continue; 235 | for (int b = 0; b < fcount; b++) { 236 | cv::Mat img = cv::imread(std::string(argv[2]) + "/" + file_names[f - fcount + 1 + b]); 237 | if (img.empty()) continue; 238 | cv::Mat pr_img = preprocess_img(img); 239 | int i = 0; 240 | for (int row = 0; row < INPUT_H; ++row) { 241 | uchar* uc_pixel = pr_img.data + row * pr_img.step; 242 | for (int col = 0; col < INPUT_W; ++col) { 243 | data[b * 3 * INPUT_H * INPUT_W + i] = uc_pixel[2] / 255; 244 | data[b * 3 * INPUT_H * INPUT_W + i + INPUT_H * INPUT_W] = uc_pixel[1] / 255.0; 245 | data[b * 3 * INPUT_H * INPUT_W + i + 2 * INPUT_H * INPUT_W] = uc_pixel[0] / 255.0; 246 | uc_pixel += 3; 247 | ++i; 248 | } 249 | } 250 | } 251 | 252 | // Run inference 253 | auto start = std::chrono::system_clock::now(); 254 | doInference(*context, data, prob, BATCH_SIZE); 255 | auto end = std::chrono::system_clock::now(); 256 | std::cout << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; 257 | std::vector> batch_res(fcount); 258 | for (int b = 0; b < fcount; b++) { 259 | auto& res = batch_res[b]; 260 | nms(res, &prob[b * OUTPUT_SIZE], CONF_THRESH, NMS_THRESH); 261 | } 262 | for (int b = 0; b < fcount; b++) { 263 | auto& res = batch_res[b]; 264 | //std::cout << res.size() << std::endl; 265 | cv::Mat img = cv::imread(std::string(argv[2]) + "/" + file_names[f - fcount + 1 + b]); 266 | for (size_t j = 0; j < res.size(); j++) { 267 | cv::Rect r = get_rect(img, res[j].bbox); 268 | cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2); 269 | cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2); 270 | } 271 | cv::imwrite("_" + file_names[f - fcount + 1 + b], img); 272 | } 273 | fcount = 0; 274 | } 275 | 276 | // Destroy the engine 277 | context->destroy(); 278 | engine->destroy(); 279 | runtime->destroy(); 280 | 281 | // Print histogram of the output distribution 282 | //std::cout << "\nOutput:\n\n"; 283 | //for (unsigned int i = 0; i < OUTPUT_SIZE; i++) 284 | //{ 285 | // std::cout << prob[i] << ", "; 286 | // if (i % 10 == 0) std::cout << std::endl; 287 | //} 288 | //std::cout << std::endl; 289 | 290 | return 0; 291 | } 292 | -------------------------------------------------------------------------------- /yolov5m/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.6) 2 | 3 | project(yolov5) 4 | 5 | add_definitions(-std=c++11) 6 | 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 8 | set(CMAKE_CXX_STANDARD 11) 9 | set(CMAKE_BUILD_TYPE Debug) 10 | 11 | find_package(CUDA REQUIRED) 12 | 13 | set(CUDA_NVCC_PLAGS ${CUDA_NVCC_PLAGS};-std=c++11;-g;-G;-gencode;arch=compute_30;code=sm_30) 14 | 15 | 
include_directories(${PROJECT_SOURCE_DIR}/include) 16 | if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") 17 | message("embed_platform on") 18 | include_directories(/usr/local/cuda/targets/aarch64-linux/include) 19 | link_directories(/usr/local/cuda/targets/aarch64-linux/lib) 20 | else() 21 | message("embed_platform off") 22 | include_directories(/usr/local/cuda/include) 23 | link_directories(/usr/local/cuda/lib64) 24 | endif() 25 | 26 | 27 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED") 28 | 29 | cuda_add_library(yololayer SHARED ${PROJECT_SOURCE_DIR}/yololayer.cu) 30 | 31 | ########## opencv configuration ############ 32 | find_package(OpenCV 3.4 REQUIRED) 33 | message(OpenCV_LIBS) 34 | include_directories(OpenCV_INCLUDE_DIRS) 35 | 36 | add_executable(yolov5m ${PROJECT_SOURCE_DIR}/yolov5m.cpp) 37 | target_link_libraries(yolov5m nvinfer) 38 | target_link_libraries(yolov5m cudart) 39 | target_link_libraries(yolov5m yololayer) 40 | target_link_libraries(yolov5m ${OpenCV_LIBS}) 41 | 42 | add_definitions(-O2 -pthread) 43 | 44 | -------------------------------------------------------------------------------- /yolov5m/README.md: -------------------------------------------------------------------------------- 1 | # yolov5 2 | 3 | The Pytorch implementation is [ultralytics/yolov5](https://github.com/ultralytics/yolov5). 4 | 5 | I was using [ultralytics/yolov5](https://github.com/ultralytics/yolov5)(The latest version). Just in case the yolov5 model updated. 6 | 7 | ## How to Run 8 | 9 | ``` 10 | 1. generate yolov5m.wts from pytorch implementation with yolov5.pt 11 | 12 | git clone https://github.com/AIpakchoi/yolov5_tensorrt.git 13 | git clone https://github.com/ultralytics/yolov5.git 14 | // download its weights 'yolov5m.pt' 15 | cd yolov5 16 | cp ../yolov5_tensorrt/yolov5m/gen_wts.py . 17 | python gen_wts.py 18 | // a file 'yolov5m.wts' will be generated. 19 | 20 | 2. put yolov5m.wts into yolov5m, build and run 21 | 22 | mv yolov5m.wts ../yolov5_tensorrt/yolov5m/ 23 | cd ../yolov5_tensorrt/yolov5m 24 | mkdir build 25 | cd build 26 | cmake .. 27 | make 28 | sudo ./yolov5m -s // serialize model to plan file i.e. 'yolov5m.engine' 29 | sudo ./yolov5m -d ../samples // deserialize plan file and run inference, the images in samples will be processed. 30 | 31 | 3. check the images generated, as follows. _zidane.jpg and _bus.jpg 32 | ``` 33 | 34 |
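The .wts file written by gen_wts.py is plain text: the first line holds the number of weight blobs, and every following line is `<name> <count>` followed by one 8-hex-digit float bit pattern per value, which is exactly what loadWeights() in common.hpp parses back. Below is a minimal stand-alone sketch (not part of this repo; the file name and helper are only an illustration) to sanity-check the generated yolov5m.wts before building the engine:

```cpp
// wts_check.cpp - quick sanity check of a .wts file produced by gen_wts.py
// (hypothetical helper, not part of this repo)
#include <cstdint>
#include <cstring>
#include <fstream>
#include <iostream>
#include <string>

int main(int argc, char** argv) {
    const char* path = argc > 1 ? argv[1] : "yolov5m.wts";
    std::ifstream in(path);
    if (!in.is_open()) { std::cerr << "cannot open " << path << std::endl; return -1; }

    int32_t count = 0;
    in >> count;                                   // first line: number of weight blobs
    std::cout << path << ": " << count << " blobs" << std::endl;

    // Each blob line: <name> <num floats> <8-hex-digit bit pattern per float>
    for (int32_t i = 0; i < count && i < 5; ++i) { // show the first few entries
        std::string name;
        uint32_t size = 0, bits = 0;
        in >> name >> std::dec >> size;
        in >> std::hex >> bits;                    // first value of the blob
        float first = 0.f;
        std::memcpy(&first, &bits, sizeof(first));
        std::cout << "  " << name << " (" << size << " values), first = " << first << std::endl;
        for (uint32_t v = 1; v < size; ++v) in >> std::hex >> bits;  // skip the rest
    }
    return 0;
}
```

If the blob count printed here does not match the number of keys in the PyTorch state_dict, the export step went wrong.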

35 | 36 |

37 | 38 |

39 | 40 |

41 | 42 | ## Config 43 | 44 | - Input shape defined in yololayer.h 45 | - Number of classes defined in yololayer.h 46 | - FP16/FP32 can be selected by the macro in yolov5m.cpp 47 | - GPU id can be selected by the macro in yolov5m.cpp 48 | - NMS thresh in yolov5m.cpp 49 | - BBox confidence thresh in yolov5m.cpp 50 | - Batch size in yolov5m.cpp 51 | -------------------------------------------------------------------------------- /yolov5m/common.hpp: -------------------------------------------------------------------------------- 1 | #ifndef YOLOV5_COMMON_H_ 2 | #define YOLOV5_COMMON_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "NvInfer.h" 11 | #include "yololayer.h" 12 | 13 | #define CHECK(status) \ 14 | do\ 15 | {\ 16 | auto ret = (status);\ 17 | if (ret != 0)\ 18 | {\ 19 | std::cerr << "Cuda failure: " << ret << std::endl;\ 20 | abort();\ 21 | }\ 22 | } while (0) 23 | 24 | using namespace nvinfer1; 25 | // resize 优化 26 | // 从原图的点映射到输出图像的像素点 27 | // 对outuput的每个点,先根据长宽比计算其在原图中最邻近的像素点, 28 | //然后直接根据最邻近的思想,直接拷贝Channel个字节作为输出图像 29 | void resizeByNN(uchar *input, uchar *output, int height_in, int width_in, int channels, int height_out, int width_out) { 30 | 31 | uchar *data_source = input; 32 | uchar *data_half = output; 33 | 34 | int bpl_source = width_in * 3; 35 | int bpl_dst = width_out * 3; 36 | 37 | int pos = 0; 38 | int sep = 0; 39 | uchar *sr = nullptr; 40 | uchar *hr = nullptr; 41 | float step = 0.0; 42 | float step_x = float(width_in) / float(width_out); 43 | float step_y = float(height_in) / float(height_out); 44 | 45 | for (int i = 0; i < height_out; i++) { 46 | for (int j = 0; j < width_out; j++) { 47 | sep = int(step_y*i); 48 | step = int(j*step_x); 49 | sr = data_source + sep * bpl_source; 50 | hr = data_half + i * bpl_dst + j * channels; 51 | pos = step * channels; 52 | memcpy(hr, sr + pos, channels); 53 | } 54 | } 55 | return; 56 | } 57 | 58 | cv::Mat preprocess_img(cv::Mat& img) { 59 | int w, h, x, y; 60 | float r_w = Yolo::INPUT_W / (img.cols*1.0); 61 | float r_h = Yolo::INPUT_H / (img.rows*1.0); 62 | if (r_h > r_w) { 63 | w = Yolo::INPUT_W; 64 | h = r_w * img.rows; 65 | x = 0; 66 | y = (Yolo::INPUT_H - h) / 2; 67 | } else { 68 | w = r_h* img.cols; 69 | h = Yolo::INPUT_H; 70 | x = (Yolo::INPUT_W - w) / 2; 71 | y = 0; 72 | } 73 | cv::Mat re(h, w, CV_8UC3); 74 | //cv::resize(img, re, re.size(), 0, 0, cv::INTER_CUBIC); 75 | auto start = std::chrono::system_clock::now(); 76 | cv::resize(img, re, re.size(), 0, 0, cv::INTER_LINEAR); 77 | //resizeByNN(img.data, re.data, img.rows, img.cols, img.channels(), re.rows, re.cols); 78 | auto end = std::chrono::system_clock::now(); 79 | std::cout << "img resize: " << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; 80 | 81 | cv::Mat out(Yolo::INPUT_H, Yolo::INPUT_W, CV_8UC3, cv::Scalar(128, 128, 128)); 82 | 83 | re.copyTo(out(cv::Rect(x, y, re.cols, re.rows))); 84 | return out; 85 | } 86 | 87 | cv::Rect get_rect(cv::Mat& img, float bbox[4]) { 88 | int l, r, t, b; 89 | float r_w = Yolo::INPUT_W / (img.cols * 1.0); 90 | float r_h = Yolo::INPUT_H / (img.rows * 1.0); 91 | if (r_h > r_w) { 92 | l = bbox[0] - bbox[2]/2.f; 93 | if (l < 0) 94 | { 95 | l = 0; 96 | } 97 | r = bbox[0] + bbox[2]/2.f; 98 | if (r > img.cols) 99 | { 100 | r = img.cols; 101 | } 102 | t = bbox[1] - bbox[3]/2.f - (Yolo::INPUT_H - r_w * img.rows) / 2; 103 | if (t < 0) 104 | { 105 | t = 0; 106 | } 107 | b = bbox[1] + bbox[3]/2.f - (Yolo::INPUT_H - r_w * img.rows) / 2; 108 | if (b > img.rows) 109 | { 110 | 
b = img.rows; 111 | } 112 | l = l / r_w; 113 | r = r / r_w; 114 | t = t / r_w; 115 | b = b / r_w; 116 | } else { 117 | l = bbox[0] - bbox[2]/2.f - (Yolo::INPUT_W - r_h * img.cols) / 2; 118 | if (l < 0) 119 | { 120 | l = 0; 121 | } 122 | r = bbox[0] + bbox[2]/2.f - (Yolo::INPUT_W - r_h * img.cols) / 2; 123 | if (r > img.cols) 124 | { 125 | r = img.cols; 126 | } 127 | t = bbox[1] - bbox[3]/2.f; 128 | if (t < 0) 129 | { 130 | t = 0; 131 | } 132 | b = bbox[1] + bbox[3]/2.f; 133 | if (b > img.rows) 134 | { 135 | b = img.rows; 136 | } 137 | l = l / r_h; 138 | r = r / r_h; 139 | t = t / r_h; 140 | b = b / r_h; 141 | } 142 | return cv::Rect(l, t, r-l, b-t); 143 | } 144 | 145 | // std::max vs. max 146 | //https://www.cnblogs.com/timesdaughter/p/5894930.html 147 | // Use (std::min) and (std::max) 148 | float iou(float lbox[4], float rbox[4]) { 149 | float interBox[] = { 150 | (std::max)(lbox[0] - lbox[2]/2.f , rbox[0] - rbox[2]/2.f), //left 151 | (std::min)(lbox[0] + lbox[2]/2.f , rbox[0] + rbox[2]/2.f), //right 152 | (std::max)(lbox[1] - lbox[3]/2.f , rbox[1] - rbox[3]/2.f), //top 153 | (std::min)(lbox[1] + lbox[3]/2.f , rbox[1] + rbox[3]/2.f), //bottom 154 | }; 155 | 156 | if(interBox[2] > interBox[3] || interBox[0] > interBox[1]) 157 | return 0.0f; 158 | 159 | float interBoxS =(interBox[1]-interBox[0])*(interBox[3]-interBox[2]); 160 | return interBoxS/(lbox[2]*lbox[3] + rbox[2]*rbox[3] -interBoxS); 161 | } 162 | 163 | bool cmp(Yolo::Detection& a, Yolo::Detection& b) { 164 | return a.conf > b.conf; 165 | } 166 | 167 | void nms(std::vector& res, float *output, float conf_thresh, float nms_thresh = 0.5) { 168 | int det_size = sizeof(Yolo::Detection) / sizeof(float); 169 | std::map> m; 170 | for (int i = 0; i < output[0] && i < 1000; i++) { 171 | if (output[1 + det_size * i + 4] <= conf_thresh) continue; 172 | Yolo::Detection det; 173 | memcpy(&det, &output[1 + det_size * i], det_size * sizeof(float)); 174 | if (m.count(det.class_id) == 0) m.emplace(det.class_id, std::vector()); 175 | m[det.class_id].push_back(det); 176 | } 177 | for (auto it = m.begin(); it != m.end(); it++) { 178 | //std::cout << it->second[0].class_id << " --- " << std::endl; 179 | auto& dets = it->second; 180 | std::sort(dets.begin(), dets.end(), cmp); 181 | for (size_t m = 0; m < dets.size(); ++m) { 182 | auto& item = dets[m]; 183 | res.push_back(item); 184 | for (size_t n = m + 1; n < dets.size(); ++n) { 185 | if (iou(item.bbox, dets[n].bbox) > nms_thresh) { 186 | dets.erase(dets.begin()+n); 187 | --n; 188 | } 189 | } 190 | } 191 | } 192 | } 193 | 194 | // TensorRT weight files have a simple space delimited format: 195 | // [type] [size] 196 | std::map loadWeights(const std::string file) { 197 | std::cout << "Loading weights: " << file << std::endl; 198 | std::map weightMap; 199 | 200 | // Open weights file 201 | std::ifstream input(file); 202 | assert(input.is_open() && "Unable to load weight file."); 203 | 204 | // Read number of weight blobs 205 | int32_t count; 206 | input >> count; 207 | assert(count > 0 && "Invalid weight map file."); 208 | 209 | while (count--) 210 | { 211 | Weights wt{DataType::kFLOAT, nullptr, 0}; 212 | uint32_t size; 213 | 214 | // Read name and type of blob 215 | std::string name; 216 | input >> name >> std::dec >> size; 217 | wt.type = DataType::kFLOAT; 218 | 219 | // Load blob 220 | uint32_t* val = reinterpret_cast(malloc(sizeof(val) * size)); 221 | for (uint32_t x = 0, y = size; x < y; ++x) 222 | { 223 | input >> std::hex >> val[x]; 224 | } 225 | wt.values = val; 226 | 227 | wt.count = size; 228 | 
weightMap[name] = wt; 229 | } 230 | 231 | return weightMap; 232 | } 233 | 234 | IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname, float eps) { 235 | float *gamma = (float*)weightMap[lname + ".weight"].values; 236 | float *beta = (float*)weightMap[lname + ".bias"].values; 237 | float *mean = (float*)weightMap[lname + ".running_mean"].values; 238 | float *var = (float*)weightMap[lname + ".running_var"].values; 239 | int len = weightMap[lname + ".running_var"].count; 240 | 241 | float *scval = reinterpret_cast(malloc(sizeof(float) * len)); 242 | for (int i = 0; i < len; i++) { 243 | scval[i] = gamma[i] / sqrt(var[i] + eps); 244 | } 245 | Weights scale{DataType::kFLOAT, scval, len}; 246 | 247 | float *shval = reinterpret_cast(malloc(sizeof(float) * len)); 248 | for (int i = 0; i < len; i++) { 249 | shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps); 250 | } 251 | Weights shift{DataType::kFLOAT, shval, len}; 252 | 253 | float *pval = reinterpret_cast(malloc(sizeof(float) * len)); 254 | for (int i = 0; i < len; i++) { 255 | pval[i] = 1.0; 256 | } 257 | Weights power{DataType::kFLOAT, pval, len}; 258 | 259 | weightMap[lname + ".scale"] = scale; 260 | weightMap[lname + ".shift"] = shift; 261 | weightMap[lname + ".power"] = power; 262 | IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power); 263 | assert(scale_1); 264 | return scale_1; 265 | } 266 | 267 | ILayer* convBnLeaky(INetworkDefinition *network, std::map& weightMap, ITensor& input, int outch, int ksize, int s, int g, std::string lname) { 268 | Weights emptywts{DataType::kFLOAT, nullptr, 0}; 269 | int p = ksize / 2; 270 | IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, DimsHW{ksize, ksize}, weightMap[lname + ".conv.weight"], emptywts); 271 | assert(conv1); 272 | conv1->setStrideNd(DimsHW{s, s}); 273 | conv1->setPaddingNd(DimsHW{p, p}); 274 | conv1->setNbGroups(g); 275 | //IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + ".bn", 1e-4); 276 | IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + ".bn", 1e-3); 277 | auto lr = network->addActivation(*bn1->getOutput(0), ActivationType::kLEAKY_RELU); 278 | lr->setAlpha(0.1); 279 | return lr; 280 | } 281 | 282 | ILayer* focus(INetworkDefinition *network, std::map& weightMap, ITensor& input, int inch, int outch, int ksize, std::string lname) { 283 | ISliceLayer *s1 = network->addSlice(input, Dims3{0, 0, 0}, Dims3{inch, Yolo::INPUT_H / 2, Yolo::INPUT_W / 2}, Dims3{1, 2, 2}); 284 | ISliceLayer *s2 = network->addSlice(input, Dims3{0, 1, 0}, Dims3{inch, Yolo::INPUT_H / 2, Yolo::INPUT_W / 2}, Dims3{1, 2, 2}); 285 | ISliceLayer *s3 = network->addSlice(input, Dims3{0, 0, 1}, Dims3{inch, Yolo::INPUT_H / 2, Yolo::INPUT_W / 2}, Dims3{1, 2, 2}); 286 | ISliceLayer *s4 = network->addSlice(input, Dims3{0, 1, 1}, Dims3{inch, Yolo::INPUT_H / 2, Yolo::INPUT_W / 2}, Dims3{1, 2, 2}); 287 | ITensor* inputTensors[] = {s1->getOutput(0), s2->getOutput(0), s3->getOutput(0), s4->getOutput(0)}; 288 | auto cat = network->addConcatenation(inputTensors, 4); 289 | auto conv = convBnLeaky(network, weightMap, *cat->getOutput(0), outch, ksize, 1, 1, lname + ".conv"); 290 | return conv; 291 | } 292 | 293 | ILayer* bottleneck(INetworkDefinition *network, std::map& weightMap, ITensor& input, int c1, int c2, bool shortcut, int g, float e, std::string lname) { 294 | auto cv1 = convBnLeaky(network, weightMap, input, (int)((float)c2 * e), 1, 1, 1, 
lname + ".cv1"); 295 | auto cv2 = convBnLeaky(network, weightMap, *cv1->getOutput(0), c2, 3, 1, g, lname + ".cv2"); 296 | if (shortcut && c1 == c2) { 297 | auto ew = network->addElementWise(input, *cv2->getOutput(0), ElementWiseOperation::kSUM); 298 | return ew; 299 | } 300 | return cv2; 301 | } 302 | 303 | ILayer* bottleneckCSP(INetworkDefinition *network, std::map& weightMap, ITensor& input, int c1, int c2, int n, bool shortcut, int g, float e, std::string lname) { 304 | Weights emptywts{DataType::kFLOAT, nullptr, 0}; 305 | int c_ = (int)((float)c2 * e); 306 | auto cv1 = convBnLeaky(network, weightMap, input, c_, 1, 1, 1, lname + ".cv1"); 307 | auto cv2 = network->addConvolutionNd(input, c_, DimsHW{1, 1}, weightMap[lname + ".cv2.weight"], emptywts); 308 | ITensor *y1 = cv1->getOutput(0); 309 | for (int i = 0; i < n; i++) { 310 | auto b = bottleneck(network, weightMap, *y1, c_, c_, shortcut, g, 1.0, lname + ".m." + std::to_string(i)); 311 | y1 = b->getOutput(0); 312 | } 313 | auto cv3 = network->addConvolutionNd(*y1, c_, DimsHW{1, 1}, weightMap[lname + ".cv3.weight"], emptywts); 314 | 315 | ITensor* inputTensors[] = {cv3->getOutput(0), cv2->getOutput(0)}; 316 | auto cat = network->addConcatenation(inputTensors, 2); 317 | 318 | IScaleLayer* bn = addBatchNorm2d(network, weightMap, *cat->getOutput(0), lname + ".bn", 1e-4); 319 | auto lr = network->addActivation(*bn->getOutput(0), ActivationType::kLEAKY_RELU); 320 | lr->setAlpha(0.1); 321 | 322 | auto cv4 = convBnLeaky(network, weightMap, *lr->getOutput(0), c2, 1, 1, 1, lname + ".cv4"); 323 | return cv4; 324 | } 325 | 326 | ILayer* SPP(INetworkDefinition *network, std::map& weightMap, ITensor& input, int c1, int c2, int k1, int k2, int k3, std::string lname) { 327 | int c_ = c1 / 2; 328 | auto cv1 = convBnLeaky(network, weightMap, input, c_, 1, 1, 1, lname + ".cv1"); 329 | 330 | auto pool1 = network->addPoolingNd(*cv1->getOutput(0), PoolingType::kMAX, DimsHW{k1, k1}); 331 | pool1->setPaddingNd(DimsHW{k1 / 2, k1 / 2}); 332 | pool1->setStrideNd(DimsHW{1, 1}); 333 | auto pool2 = network->addPoolingNd(*cv1->getOutput(0), PoolingType::kMAX, DimsHW{k2, k2}); 334 | pool2->setPaddingNd(DimsHW{k2 / 2, k2 / 2}); 335 | pool2->setStrideNd(DimsHW{1, 1}); 336 | auto pool3 = network->addPoolingNd(*cv1->getOutput(0), PoolingType::kMAX, DimsHW{k3, k3}); 337 | pool3->setPaddingNd(DimsHW{k3 / 2, k3 / 2}); 338 | pool3->setStrideNd(DimsHW{1, 1}); 339 | 340 | ITensor* inputTensors[] = {cv1->getOutput(0), pool1->getOutput(0), pool2->getOutput(0), pool3->getOutput(0)}; 341 | auto cat = network->addConcatenation(inputTensors, 4); 342 | 343 | auto cv2 = convBnLeaky(network, weightMap, *cat->getOutput(0), c2, 1, 1, 1, lname + ".cv2"); 344 | return cv2; 345 | } 346 | 347 | int read_files_in_dir(const char *p_dir_name, std::vector &file_names) { 348 | DIR *p_dir = opendir(p_dir_name); 349 | if (p_dir == nullptr) { 350 | return -1; 351 | } 352 | 353 | struct dirent* p_file = nullptr; 354 | while ((p_file = readdir(p_dir)) != nullptr) { 355 | if (strcmp(p_file->d_name, ".") != 0 && 356 | strcmp(p_file->d_name, "..") != 0) { 357 | //std::string cur_file_name(p_dir_name); 358 | //cur_file_name += "/"; 359 | //cur_file_name += p_file->d_name; 360 | std::string cur_file_name(p_file->d_name); 361 | file_names.push_back(cur_file_name); 362 | } 363 | } 364 | 365 | closedir(p_dir); 366 | return 0; 367 | } 368 | 369 | #endif 370 | 371 | -------------------------------------------------------------------------------- /yolov5m/gen_wts.py: 
-------------------------------------------------------------------------------- 1 | from utils.utils import * 2 | import struct 3 | 4 | # Initialize 5 | device = torch_utils.select_device('0') 6 | # Load model 7 | model = torch.load('weights/yolov5m.pt', map_location=device)['model'].float() # load to FP32 8 | model.to(device).eval() 9 | 10 | f = open('yolov5m.wts', 'w') 11 | f.write('{}\n'.format(len(model.state_dict().keys()))) 12 | for k, v in model.state_dict().items(): 13 | vr = v.reshape(-1).cpu().numpy() 14 | f.write('{} {} '.format(k, len(vr))) 15 | for vv in vr: 16 | f.write(' ') 17 | f.write(struct.pack('>f',float(vv)).hex()) 18 | f.write('\n') 19 | -------------------------------------------------------------------------------- /yolov5m/images/bus.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BaofengZan/yolov5_2.0-TensorRt/78193337dd94bf48c2d55c3a77105a534d11c45b/yolov5m/images/bus.jpg -------------------------------------------------------------------------------- /yolov5m/images/zidane.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BaofengZan/yolov5_2.0-TensorRt/78193337dd94bf48c2d55c3a77105a534d11c45b/yolov5m/images/zidane.jpg -------------------------------------------------------------------------------- /yolov5m/utils.h: -------------------------------------------------------------------------------- 1 | #ifndef __TRT_UTILS_H_ 2 | #define __TRT_UTILS_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #ifndef CUDA_CHECK 10 | 11 | #define CUDA_CHECK(callstr) \ 12 | { \ 13 | cudaError_t error_code = callstr; \ 14 | if (error_code != cudaSuccess) { \ 15 | std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \ 16 | assert(0); \ 17 | } \ 18 | } 19 | 20 | #endif 21 | 22 | namespace Tn 23 | { 24 | class Profiler : public nvinfer1::IProfiler 25 | { 26 | public: 27 | void printLayerTimes(int itrationsTimes) 28 | { 29 | float totalTime = 0; 30 | for (size_t i = 0; i < mProfile.size(); i++) 31 | { 32 | printf("%-40.40s %4.3fms\n", mProfile[i].first.c_str(), mProfile[i].second / itrationsTimes); 33 | totalTime += mProfile[i].second; 34 | } 35 | printf("Time over all layers: %4.3f\n", totalTime / itrationsTimes); 36 | } 37 | private: 38 | typedef std::pair Record; 39 | std::vector mProfile; 40 | 41 | virtual void reportLayerTime(const char* layerName, float ms) 42 | { 43 | auto record = std::find_if(mProfile.begin(), mProfile.end(), [&](const Record& r){ return r.first == layerName; }); 44 | if (record == mProfile.end()) 45 | mProfile.push_back(std::make_pair(layerName, ms)); 46 | else 47 | record->second += ms; 48 | } 49 | }; 50 | 51 | //Logger for TensorRT info/warning/errors 52 | class Logger : public nvinfer1::ILogger 53 | { 54 | public: 55 | 56 | Logger(): Logger(Severity::kWARNING) {} 57 | 58 | Logger(Severity severity): reportableSeverity(severity) {} 59 | 60 | void log(Severity severity, const char* msg) override 61 | { 62 | // suppress messages with severity enum value greater than the reportable 63 | if (severity > reportableSeverity) return; 64 | 65 | switch (severity) 66 | { 67 | case Severity::kINTERNAL_ERROR: std::cerr << "INTERNAL_ERROR: "; break; 68 | case Severity::kERROR: std::cerr << "ERROR: "; break; 69 | case Severity::kWARNING: std::cerr << "WARNING: "; break; 70 | case Severity::kINFO: std::cerr << "INFO: "; break; 71 | default: std::cerr << "UNKNOWN: "; break; 72 | } 73 | 
std::cerr << msg << std::endl; 74 | } 75 | 76 | Severity reportableSeverity{Severity::kWARNING}; 77 | }; 78 | 79 | template 80 | void write(char*& buffer, const T& val) 81 | { 82 | *reinterpret_cast(buffer) = val; 83 | buffer += sizeof(T); 84 | } 85 | 86 | template 87 | void read(const char*& buffer, T& val) 88 | { 89 | val = *reinterpret_cast(buffer); 90 | buffer += sizeof(T); 91 | } 92 | } 93 | 94 | #endif -------------------------------------------------------------------------------- /yolov5m/yololayer.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "yololayer.h" 3 | #include "utils.h" 4 | 5 | using namespace Yolo; 6 | 7 | namespace nvinfer1 8 | { 9 | YoloLayerPlugin::YoloLayerPlugin() 10 | { 11 | mClassCount = CLASS_NUM; 12 | mYoloKernel.clear(); 13 | mYoloKernel.push_back(yolo1); 14 | mYoloKernel.push_back(yolo2); 15 | mYoloKernel.push_back(yolo3); 16 | 17 | mKernelCount = mYoloKernel.size(); 18 | 19 | CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*))); 20 | size_t AnchorLen = sizeof(float)* CHECK_COUNT*2; 21 | for(int ii = 0; ii < mKernelCount; ii ++) 22 | { 23 | CUDA_CHECK(cudaMalloc(&mAnchor[ii],AnchorLen)); 24 | const auto& yolo = mYoloKernel[ii]; 25 | CUDA_CHECK(cudaMemcpy(mAnchor[ii], yolo.anchors, AnchorLen, cudaMemcpyHostToDevice)); 26 | } 27 | } 28 | 29 | YoloLayerPlugin::~YoloLayerPlugin() 30 | { 31 | } 32 | 33 | // create the plugin at runtime from a byte stream 34 | YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) 35 | { 36 | using namespace Tn; 37 | const char *d = reinterpret_cast(data), *a = d; 38 | read(d, mClassCount); 39 | read(d, mThreadCount); 40 | read(d, mKernelCount); 41 | mYoloKernel.resize(mKernelCount); 42 | auto kernelSize = mKernelCount*sizeof(YoloKernel); 43 | memcpy(mYoloKernel.data(),d,kernelSize); 44 | d += kernelSize; 45 | 46 | CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*))); 47 | size_t AnchorLen = sizeof(float)* CHECK_COUNT*2; 48 | for(int ii = 0; ii < mKernelCount; ii ++) 49 | { 50 | CUDA_CHECK(cudaMalloc(&mAnchor[ii],AnchorLen)); 51 | const auto& yolo = mYoloKernel[ii]; 52 | CUDA_CHECK(cudaMemcpy(mAnchor[ii], yolo.anchors, AnchorLen, cudaMemcpyHostToDevice)); 53 | } 54 | 55 | assert(d == a + length); 56 | } 57 | 58 | void YoloLayerPlugin::serialize(void* buffer) const 59 | { 60 | using namespace Tn; 61 | char* d = static_cast(buffer), *a = d; 62 | write(d, mClassCount); 63 | write(d, mThreadCount); 64 | write(d, mKernelCount); 65 | auto kernelSize = mKernelCount*sizeof(YoloKernel); 66 | memcpy(d,mYoloKernel.data(),kernelSize); 67 | d += kernelSize; 68 | 69 | assert(d == a + getSerializationSize()); 70 | } 71 | 72 | size_t YoloLayerPlugin::getSerializationSize() const 73 | { 74 | return sizeof(mClassCount) + sizeof(mThreadCount) + sizeof(mKernelCount) + sizeof(Yolo::YoloKernel) * mYoloKernel.size(); 75 | } 76 | 77 | int YoloLayerPlugin::initialize() 78 | { 79 | return 0; 80 | } 81 | 82 | Dims YoloLayerPlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims) 83 | { 84 | //output the result to channel 85 | int totalsize = MAX_OUTPUT_BBOX_COUNT * sizeof(Detection) / sizeof(float); 86 | 87 | return Dims3(totalsize + 1, 1, 1); 88 | } 89 | 90 | // Set plugin namespace 91 | void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace) 92 | { 93 | mPluginNamespace = pluginNamespace; 94 | } 95 | 96 | const char* YoloLayerPlugin::getPluginNamespace() const 97 | { 98 | return mPluginNamespace; 99 | } 100 | 101 | // 
Return the DataType of the plugin output at the requested index 102 | DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const 103 | { 104 | return DataType::kFLOAT; 105 | } 106 | 107 | // Return true if output tensor is broadcast across a batch. 108 | bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const 109 | { 110 | return false; 111 | } 112 | 113 | // Return true if plugin can use input that is broadcast across batch without replication. 114 | bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const 115 | { 116 | return false; 117 | } 118 | 119 | void YoloLayerPlugin::configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) 120 | { 121 | } 122 | 123 | // Attach the plugin object to an execution context and grant the plugin the access to some context resource. 124 | void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) 125 | { 126 | } 127 | 128 | // Detach the plugin object from its execution context. 129 | void YoloLayerPlugin::detachFromContext() {} 130 | 131 | const char* YoloLayerPlugin::getPluginType() const 132 | { 133 | return "YoloLayer_TRT"; 134 | } 135 | 136 | const char* YoloLayerPlugin::getPluginVersion() const 137 | { 138 | return "1"; 139 | } 140 | 141 | void YoloLayerPlugin::destroy() 142 | { 143 | delete this; 144 | } 145 | 146 | // Clone the plugin 147 | IPluginV2IOExt* YoloLayerPlugin::clone() const 148 | { 149 | YoloLayerPlugin *p = new YoloLayerPlugin(); 150 | p->setPluginNamespace(mPluginNamespace); 151 | return p; 152 | } 153 | 154 | __device__ float Logist(float data){ return 1.0f / (1.0f + expf(-data)); }; 155 | 156 | __global__ void CalDetection(const float *input, float *output,int noElements, 157 | int yoloWidth,int yoloHeight,const float anchors[CHECK_COUNT*2],int classes,int outputElem) { 158 | 159 | int idx = threadIdx.x + blockDim.x * blockIdx.x; 160 | if (idx >= noElements) return; 161 | 162 | int total_grid = yoloWidth * yoloHeight; 163 | int bnIdx = idx / total_grid; 164 | idx = idx - total_grid*bnIdx; 165 | int info_len_i = 5 + classes; 166 | const float* curInput = input + bnIdx * (info_len_i * total_grid * CHECK_COUNT); 167 | 168 | for (int k = 0; k < 3; ++k) { 169 | float box_prob = Logist(curInput[idx + k * info_len_i * total_grid + 4 * total_grid]); 170 | if (box_prob < IGNORE_THRESH) continue; 171 | int class_id = 0; 172 | float max_cls_prob = 0.0; 173 | for (int i = 5; i < info_len_i; ++i) { 174 | float p = Logist(curInput[idx + k * info_len_i * total_grid + i * total_grid]); 175 | if (p > max_cls_prob) { 176 | max_cls_prob = p; 177 | class_id = i - 5; 178 | } 179 | } 180 | float *res_count = output + bnIdx*outputElem; 181 | int count = (int)atomicAdd(res_count, 1); 182 | if (count >= MAX_OUTPUT_BBOX_COUNT) return; 183 | char* data = (char *)res_count + sizeof(float) + count * sizeof(Detection); 184 | Detection* det = (Detection*)(data); 185 | 186 | int row = idx / yoloWidth; 187 | int col = idx % yoloWidth; 188 | 189 | //Location 190 | det->bbox[0] = (col - 0.5f + 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 0 * total_grid])) * INPUT_W / yoloWidth; 191 | det->bbox[1] = (row - 0.5f + 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 1 * total_grid])) * INPUT_H / yoloHeight; 192 | det->bbox[2] = 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 2 * total_grid]); 193 | 
det->bbox[2] = det->bbox[2] * det->bbox[2] * anchors[2*k]; 194 | det->bbox[3] = 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 3 * total_grid]); 195 | det->bbox[3] = det->bbox[3] * det->bbox[3] * anchors[2*k + 1]; 196 | det->conf = box_prob * max_cls_prob; 197 | det->class_id = class_id; 198 | } 199 | } 200 | 201 | void YoloLayerPlugin::forwardGpu(const float *const * inputs, float* output, cudaStream_t stream, int batchSize) { 202 | 203 | int outputElem = 1 + MAX_OUTPUT_BBOX_COUNT * sizeof(Detection) / sizeof(float); 204 | 205 | for(int idx = 0 ; idx < batchSize; ++idx) { 206 | CUDA_CHECK(cudaMemset(output + idx*outputElem, 0, sizeof(float))); 207 | } 208 | int numElem = 0; 209 | for (unsigned int i = 0; i < mYoloKernel.size(); ++i) 210 | { 211 | const auto& yolo = mYoloKernel[i]; 212 | numElem = yolo.width*yolo.height*batchSize; 213 | if (numElem < mThreadCount) 214 | mThreadCount = numElem; 215 | CalDetection<<< (yolo.width*yolo.height*batchSize + mThreadCount - 1) / mThreadCount, mThreadCount>>> 216 | (inputs[i], output, numElem, yolo.width, yolo.height, (float *)mAnchor[i], mClassCount, outputElem); 217 | } 218 | 219 | } 220 | 221 | 222 | int YoloLayerPlugin::enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) 223 | { 224 | forwardGpu((const float *const *)inputs, (float*)outputs[0], stream, batchSize); 225 | return 0; 226 | } 227 | 228 | PluginFieldCollection YoloPluginCreator::mFC{}; 229 | std::vector YoloPluginCreator::mPluginAttributes; 230 | 231 | YoloPluginCreator::YoloPluginCreator() 232 | { 233 | mPluginAttributes.clear(); 234 | 235 | mFC.nbFields = mPluginAttributes.size(); 236 | mFC.fields = mPluginAttributes.data(); 237 | } 238 | 239 | const char* YoloPluginCreator::getPluginName() const 240 | { 241 | return "YoloLayer_TRT"; 242 | } 243 | 244 | const char* YoloPluginCreator::getPluginVersion() const 245 | { 246 | return "1"; 247 | } 248 | 249 | const PluginFieldCollection* YoloPluginCreator::getFieldNames() 250 | { 251 | return &mFC; 252 | } 253 | 254 | IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) 255 | { 256 | YoloLayerPlugin* obj = new YoloLayerPlugin(); 257 | obj->setPluginNamespace(mNamespace.c_str()); 258 | return obj; 259 | } 260 | 261 | IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) 262 | { 263 | // This object will be deleted when the network is destroyed, which will 264 | // call MishPlugin::destroy() 265 | YoloLayerPlugin* obj = new YoloLayerPlugin(serialData, serialLength); 266 | obj->setPluginNamespace(mNamespace.c_str()); 267 | return obj; 268 | } 269 | 270 | } 271 | -------------------------------------------------------------------------------- /yolov5m/yololayer.h: -------------------------------------------------------------------------------- 1 | #ifndef _YOLO_LAYER_H 2 | #define _YOLO_LAYER_H 3 | 4 | #include 5 | #include 6 | #include "NvInfer.h" 7 | 8 | namespace Yolo 9 | { 10 | static constexpr int CHECK_COUNT = 3; 11 | static constexpr float IGNORE_THRESH = 0.1f; 12 | static constexpr int MAX_OUTPUT_BBOX_COUNT = 1000; 13 | static constexpr int CLASS_NUM = 80; 14 | static constexpr int INPUT_H = 608; 15 | static constexpr int INPUT_W = 608; 16 | 17 | struct YoloKernel 18 | { 19 | int width; 20 | int height; 21 | float anchors[CHECK_COUNT*2]; 22 | }; 23 | 24 | static constexpr YoloKernel yolo1 = { 25 | INPUT_W / 32, 26 | INPUT_H / 32, 27 | {116,90, 156,198, 373,326} 28 | 
}; 29 | static constexpr YoloKernel yolo2 = { 30 | INPUT_W / 16, 31 | INPUT_H / 16, 32 | {30,61, 62,45, 59,119} 33 | }; 34 | static constexpr YoloKernel yolo3 = { 35 | INPUT_W / 8, 36 | INPUT_H / 8, 37 | {10,13, 16,30, 33,23} 38 | }; 39 | 40 | static constexpr int LOCATIONS = 4; 41 | struct alignas(float) Detection{ 42 | //center_x center_y w h 43 | float bbox[LOCATIONS]; 44 | float conf; // bbox_conf * cls_conf 45 | float class_id; 46 | }; 47 | } 48 | 49 | namespace nvinfer1 50 | { 51 | class YoloLayerPlugin: public IPluginV2IOExt 52 | { 53 | public: 54 | explicit YoloLayerPlugin(); 55 | YoloLayerPlugin(const void* data, size_t length); 56 | 57 | ~YoloLayerPlugin(); 58 | 59 | int getNbOutputs() const override 60 | { 61 | return 1; 62 | } 63 | 64 | Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override; 65 | 66 | int initialize() override; 67 | 68 | virtual void terminate() override {}; 69 | 70 | virtual size_t getWorkspaceSize(int maxBatchSize) const override { return 0;} 71 | 72 | virtual int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override; 73 | 74 | virtual size_t getSerializationSize() const override; 75 | 76 | virtual void serialize(void* buffer) const override; 77 | 78 | bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const override { 79 | return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT; 80 | } 81 | 82 | const char* getPluginType() const override; 83 | 84 | const char* getPluginVersion() const override; 85 | 86 | void destroy() override; 87 | 88 | IPluginV2IOExt* clone() const override; 89 | 90 | void setPluginNamespace(const char* pluginNamespace) override; 91 | 92 | const char* getPluginNamespace() const override; 93 | 94 | DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const override; 95 | 96 | bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const override; 97 | 98 | bool canBroadcastInputAcrossBatch(int inputIndex) const override; 99 | 100 | void attachToContext( 101 | cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) override; 102 | 103 | void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) override; 104 | 105 | void detachFromContext() override; 106 | 107 | private: 108 | void forwardGpu(const float *const * inputs,float * output, cudaStream_t stream,int batchSize = 1); 109 | int mClassCount; 110 | int mKernelCount; 111 | std::vector mYoloKernel; 112 | int mThreadCount = 256; 113 | void** mAnchor; 114 | const char* mPluginNamespace; 115 | }; 116 | 117 | class YoloPluginCreator : public IPluginCreator 118 | { 119 | public: 120 | YoloPluginCreator(); 121 | 122 | ~YoloPluginCreator() override = default; 123 | 124 | const char* getPluginName() const override; 125 | 126 | const char* getPluginVersion() const override; 127 | 128 | const PluginFieldCollection* getFieldNames() override; 129 | 130 | IPluginV2IOExt* createPlugin(const char* name, const PluginFieldCollection* fc) override; 131 | 132 | IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) override; 133 | 134 | void setPluginNamespace(const char* libNamespace) override 135 | { 136 | mNamespace = libNamespace; 137 | } 138 | 139 | const char* getPluginNamespace() const override 140 | { 141 | return mNamespace.c_str(); 142 | 
} 143 | 144 | private: 145 | std::string mNamespace; 146 | static PluginFieldCollection mFC; 147 | static std::vector mPluginAttributes; 148 | }; 149 | 150 | 151 | 152 | }; 153 | 154 | #endif 155 | -------------------------------------------------------------------------------- /yolov5m/yolov5m.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "cuda_runtime_api.h" 4 | #include "logging.h" 5 | #include "common.hpp" 6 | 7 | #define USE_FP16 // comment out this if want to use FP32 8 | #define DEVICE 0 // GPU id 9 | #define NMS_THRESH 0.5 10 | #define CONF_THRESH 0.25 11 | #define BATCH_SIZE 1 12 | 13 | // stuff we know about the network and the input/output blobs 14 | static const int INPUT_H = Yolo::INPUT_H; 15 | static const int INPUT_W = Yolo::INPUT_W; 16 | static const int OUTPUT_SIZE = Yolo::MAX_OUTPUT_BBOX_COUNT * sizeof(Yolo::Detection) / sizeof(float) + 1; // we assume the yololayer outputs no more than 1000 boxes that conf >= 0.1 17 | const char* INPUT_BLOB_NAME = "data"; 18 | const char* OUTPUT_BLOB_NAME = "prob"; 19 | static Logger gLogger; 20 | REGISTER_TENSORRT_PLUGIN(YoloPluginCreator); 21 | 22 | // Creat the engine using only the API and not any parser. 23 | ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) { 24 | INetworkDefinition* network = builder->createNetworkV2(0U); 25 | 26 | // Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME 27 | ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W}); 28 | assert(data); 29 | 30 | std::map weightMap = loadWeights("../yolov5m.wts"); 31 | Weights emptywts{DataType::kFLOAT, nullptr, 0}; 32 | 33 | /* ------ yolov5 backbone------ */ 34 | auto focus0 = focus(network, weightMap, *data, 3, 48, 3, "model.0"); 35 | auto conv1 = convBnLeaky(network, weightMap, *focus0->getOutput(0), 96, 3, 2, 1, "model.1"); 36 | auto bottleneck_CSP2 = bottleneckCSP(network, weightMap, *conv1->getOutput(0), 96, 96, 2, true, 1, 0.5, "model.2"); 37 | auto conv3 = convBnLeaky(network, weightMap, *bottleneck_CSP2->getOutput(0), 192, 3, 2, 1, "model.3"); 38 | auto bottleneck_csp4 = bottleneckCSP(network, weightMap, *conv3->getOutput(0), 192, 192, 6, true, 1, 0.5, "model.4"); 39 | auto conv5 = convBnLeaky(network, weightMap, *bottleneck_csp4->getOutput(0), 384, 3, 2, 1, "model.5"); 40 | auto bottleneck_csp6 = bottleneckCSP(network, weightMap, *conv5->getOutput(0), 384, 384, 6, true, 1, 0.5, "model.6"); 41 | auto conv7 = convBnLeaky(network, weightMap, *bottleneck_csp6->getOutput(0), 768, 3, 2, 1, "model.7"); 42 | auto spp8 = SPP(network, weightMap, *conv7->getOutput(0), 768, 768, 5, 9, 13, "model.8"); 43 | /* ------ yolov5 head ------ */ 44 | auto bottleneck_csp9 = bottleneckCSP(network, weightMap, *spp8->getOutput(0), 768, 768, 2, false, 1, 0.5, "model.9"); 45 | auto conv10 = convBnLeaky(network, weightMap, *bottleneck_csp9->getOutput(0), 384, 1, 1, 1, "model.10"); 46 | 47 | float *deval = reinterpret_cast(malloc(sizeof(float) * 384 * 2 * 2)); 48 | for (int i = 0; i < 384 * 2 * 2; i++) { 49 | deval[i] = 1.0; 50 | } 51 | Weights deconvwts11{DataType::kFLOAT, deval, 384 * 2 * 2}; 52 | IDeconvolutionLayer* deconv11 = network->addDeconvolutionNd(*conv10->getOutput(0), 384, DimsHW{2, 2}, deconvwts11, emptywts); 53 | deconv11->setStrideNd(DimsHW{2, 2}); 54 | deconv11->setNbGroups(384); 55 | weightMap["deconv11"] = deconvwts11; 56 | ITensor* inputTensors12[] = {deconv11->getOutput(0), 
bottleneck_csp6->getOutput(0)}; 57 | auto cat12 = network->addConcatenation(inputTensors12, 2); 58 | 59 | auto bottleneck_csp13 = bottleneckCSP(network, weightMap, *cat12->getOutput(0), 768, 384, 2, false, 1, 0.5, "model.13"); 60 | 61 | auto conv14 = convBnLeaky(network, weightMap, *bottleneck_csp13->getOutput(0), 192, 1, 1, 1, "model.14"); 62 | 63 | Weights deconvwts15{DataType::kFLOAT, deval, 192 * 2 * 2}; 64 | IDeconvolutionLayer* deconv15 = network->addDeconvolutionNd(*conv14->getOutput(0), 192, DimsHW{2, 2}, deconvwts15, emptywts); 65 | deconv15->setStrideNd(DimsHW{2, 2}); 66 | deconv15->setNbGroups(192); 67 | 68 | ITensor* inputTensors16[] = {deconv15->getOutput(0), bottleneck_csp4->getOutput(0)}; 69 | auto cat16 = network->addConcatenation(inputTensors16, 2); 70 | 71 | auto bottleneck_csp17 = bottleneckCSP(network, weightMap, *cat16->getOutput(0), 384, 192, 2, false, 1, 0.5, "model.17"); 72 | 73 | //yolo layer 1 74 | IConvolutionLayer* conv18 = network->addConvolutionNd(*bottleneck_csp17->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["model.24.m.0.weight"], weightMap["model.24.m.0.bias"]); 75 | 76 | auto conv19 = convBnLeaky(network, weightMap, *bottleneck_csp17->getOutput(0), 192, 3, 2, 1, "model.18"); 77 | 78 | ITensor* inputTensors20[] = {conv19->getOutput(0), conv14->getOutput(0)}; 79 | auto cat20 = network->addConcatenation(inputTensors20, 2); 80 | 81 | auto bottleneck_csp21 = bottleneckCSP(network, weightMap, *cat20->getOutput(0), 384, 384, 2, false, 1, 0.5, "model.20"); 82 | 83 | //yolo layer 2 84 | IConvolutionLayer* conv22 = network->addConvolutionNd(*bottleneck_csp21->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["model.24.m.1.weight"], weightMap["model.24.m.1.bias"]); 85 | 86 | auto conv23 = convBnLeaky(network, weightMap, *bottleneck_csp21->getOutput(0), 384, 3, 2, 1, "model.21"); 87 | 88 | ITensor* inputTensors24[] = {conv23->getOutput(0), conv10->getOutput(0)}; 89 | auto cat24 = network->addConcatenation(inputTensors24, 2); 90 | 91 | auto bottleneck_csp25 = bottleneckCSP(network, weightMap, *cat24->getOutput(0), 768, 768, 2, false, 1, 0.5, "model.23"); 92 | 93 | // yolo layer 3 94 | IConvolutionLayer* conv26 = network->addConvolutionNd(*bottleneck_csp25->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["model.24.m.2.weight"], weightMap["model.24.m.2.bias"]); 95 | 96 | auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1"); 97 | const PluginFieldCollection* pluginData = creator->getFieldNames(); 98 | IPluginV2 *pluginObj = creator->createPlugin("yololayer", pluginData); 99 | ITensor* inputTensors_yolo[] = {conv26->getOutput(0), conv22->getOutput(0), conv18->getOutput(0)}; 100 | auto yolo = network->addPluginV2(inputTensors_yolo, 3, *pluginObj); 101 | 102 | yolo->getOutput(0)->setName(OUTPUT_BLOB_NAME); 103 | network->markOutput(*yolo->getOutput(0)); 104 | 105 | // Build engine 106 | builder->setMaxBatchSize(maxBatchSize); 107 | config->setMaxWorkspaceSize(16 * (1 << 20)); // 16MB 108 | #ifdef USE_FP16 109 | config->setFlag(BuilderFlag::kFP16); 110 | #endif 111 | std::cout << "Building engine, please wait for a while..." << std::endl; 112 | ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); 113 | std::cout << "Build engine successfully!" 
<< std::endl; 114 | 115 | // Don't need the network any more 116 | network->destroy(); 117 | 118 | // Release host memory 119 | for (auto& mem : weightMap) 120 | { 121 | free((void*) (mem.second.values)); 122 | } 123 | 124 | return engine; 125 | } 126 | 127 | void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) { 128 | // Create builder 129 | IBuilder* builder = createInferBuilder(gLogger); 130 | IBuilderConfig* config = builder->createBuilderConfig(); 131 | 132 | // Create model to populate the network, then set the outputs and create an engine 133 | ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT); 134 | assert(engine != nullptr); 135 | 136 | // Serialize the engine 137 | (*modelStream) = engine->serialize(); 138 | 139 | // Close everything down 140 | engine->destroy(); 141 | builder->destroy(); 142 | } 143 | 144 | void doInference(IExecutionContext& context, float* input, float* output, int batchSize) { 145 | const ICudaEngine& engine = context.getEngine(); 146 | 147 | // Pointers to input and output device buffers to pass to engine. 148 | // Engine requires exactly IEngine::getNbBindings() number of buffers. 149 | assert(engine.getNbBindings() == 2); 150 | void* buffers[2]; 151 | 152 | // In order to bind the buffers, we need to know the names of the input and output tensors. 153 | // Note that indices are guaranteed to be less than IEngine::getNbBindings() 154 | const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME); 155 | const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME); 156 | 157 | // Create GPU buffers on device 158 | CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * INPUT_H * INPUT_W * sizeof(float))); 159 | CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float))); 160 | 161 | // Create stream 162 | cudaStream_t stream; 163 | CHECK(cudaStreamCreate(&stream)); 164 | 165 | // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host 166 | CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream)); 167 | context.enqueue(batchSize, buffers, stream, nullptr); 168 | CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream)); 169 | cudaStreamSynchronize(stream); 170 | 171 | // Release stream and buffers 172 | cudaStreamDestroy(stream); 173 | CHECK(cudaFree(buffers[inputIndex])); 174 | CHECK(cudaFree(buffers[outputIndex])); 175 | } 176 | 177 | int main(int argc, char** argv) { 178 | cudaSetDevice(DEVICE); 179 | // create a model using the API directly and serialize it to a stream 180 | char *trtModelStream{nullptr}; 181 | size_t size{0}; 182 | 183 | if (argc == 2 && std::string(argv[1]) == "-s") { 184 | IHostMemory* modelStream{nullptr}; 185 | APIToModel(BATCH_SIZE, &modelStream); 186 | assert(modelStream != nullptr); 187 | std::ofstream p("yolov5m.engine", std::ios::binary); 188 | if (!p) { 189 | std::cerr << "could not open plan output file" << std::endl; 190 | return -1; 191 | } 192 | p.write(reinterpret_cast(modelStream->data()), modelStream->size()); 193 | modelStream->destroy(); 194 | return 0; 195 | } else if (argc == 3 && std::string(argv[1]) == "-d") { 196 | std::ifstream file("yolov5m.engine", std::ios::binary); 197 | if (file.good()) { 198 | file.seekg(0, file.end); 199 | size = file.tellg(); 200 | file.seekg(0, file.beg); 201 | trtModelStream = new char[size]; 202 | assert(trtModelStream); 203 | 
file.read(trtModelStream, size); 204 | file.close(); 205 | } 206 | } else { 207 | std::cerr << "arguments not right!" << std::endl; 208 | std::cerr << "./yolov5m -s // serialize model to plan file" << std::endl; 209 | std::cerr << "./yolov5m -d ../samples // deserialize plan file and run inference" << std::endl; 210 | return -1; 211 | } 212 | 213 | std::vector file_names; 214 | if (read_files_in_dir(argv[2], file_names) < 0) { 215 | std::cout << "read_files_in_dir failed." << std::endl; 216 | return -1; 217 | } 218 | 219 | // prepare input data --------------------------- 220 | static float data[BATCH_SIZE * 3 * INPUT_H * INPUT_W]; 221 | //for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++) 222 | // data[i] = 1.0; 223 | static float prob[BATCH_SIZE * OUTPUT_SIZE]; 224 | IRuntime* runtime = createInferRuntime(gLogger); 225 | assert(runtime != nullptr); 226 | ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size); 227 | assert(engine != nullptr); 228 | IExecutionContext* context = engine->createExecutionContext(); 229 | assert(context != nullptr); 230 | delete[] trtModelStream; 231 | 232 | int fcount = 0; 233 | for (int f = 0; f < (int)file_names.size(); f++) { 234 | fcount++; 235 | if (fcount < BATCH_SIZE && f + 1 != (int)file_names.size()) continue; 236 | for (int b = 0; b < fcount; b++) { 237 | cv::Mat img = cv::imread(std::string(argv[2]) + "/" + file_names[f - fcount + 1 + b]); 238 | if (img.empty()) continue; 239 | cv::Mat pr_img = preprocess_img(img); 240 | int i = 0; 241 | for (int row = 0; row < INPUT_H; ++row) { 242 | uchar* uc_pixel = pr_img.data + row * pr_img.step; 243 | for (int col = 0; col < INPUT_W; ++col) { 244 | data[b * 3 * INPUT_H * INPUT_W + i] = uc_pixel[2] / 255; 245 | data[b * 3 * INPUT_H * INPUT_W + i + INPUT_H * INPUT_W] = uc_pixel[1] / 255.0; 246 | data[b * 3 * INPUT_H * INPUT_W + i + 2 * INPUT_H * INPUT_W] = uc_pixel[0] / 255.0; 247 | uc_pixel += 3; 248 | ++i; 249 | } 250 | } 251 | } 252 | 253 | // Run inference 254 | auto start = std::chrono::system_clock::now(); 255 | doInference(*context, data, prob, BATCH_SIZE); 256 | auto end = std::chrono::system_clock::now(); 257 | std::cout << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; 258 | std::vector> batch_res(fcount); 259 | for (int b = 0; b < fcount; b++) { 260 | auto& res = batch_res[b]; 261 | nms(res, &prob[b * OUTPUT_SIZE], CONF_THRESH, NMS_THRESH); 262 | } 263 | for (int b = 0; b < fcount; b++) { 264 | auto& res = batch_res[b]; 265 | //std::cout << res.size() << std::endl; 266 | cv::Mat img = cv::imread(std::string(argv[2]) + "/" + file_names[f - fcount + 1 + b]); 267 | for (size_t j = 0; j < res.size(); j++) { 268 | cv::Rect r = get_rect(img, res[j].bbox); 269 | cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2); 270 | cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2); 271 | } 272 | cv::imwrite("_" + file_names[f - fcount + 1 + b], img); 273 | } 274 | fcount = 0; 275 | } 276 | 277 | // Destroy the engine 278 | context->destroy(); 279 | engine->destroy(); 280 | runtime->destroy(); 281 | 282 | // Print histogram of the output distribution 283 | //std::cout << "\nOutput:\n\n"; 284 | //for (unsigned int i = 0; i < OUTPUT_SIZE; i++) 285 | //{ 286 | // std::cout << prob[i] << ", "; 287 | // if (i % 10 == 0) std::cout << std::endl; 288 | //} 289 | //std::cout << std::endl; 290 | 291 | return 0; 292 | } 293 | 
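For reference, the per-pixel loop in main() above converts the letterboxed BGR uchar image into the planar RGB float blob the engine expects. Note that `uc_pixel[2] / 255` divides a uchar by the integer literal 255, which truncates to 0 or 1, while the other two channels divide by 255.0; a float divisor for all three channels is presumably what the normalization intends. A minimal helper sketch under that assumption (blobFromBGR is a made-up name, not part of this repo):

```cpp
// Minimal sketch: convert a letterboxed BGR image (as returned by preprocess_img)
// into the planar RGB float blob fed to doInference. Assumes img is INPUT_H x INPUT_W, CV_8UC3.
#include <opencv2/opencv.hpp>

static void blobFromBGR(const cv::Mat& img, float* dst, int input_h, int input_w) {
    int i = 0;
    for (int row = 0; row < input_h; ++row) {
        const uchar* uc_pixel = img.data + row * img.step;
        for (int col = 0; col < input_w; ++col) {
            dst[i]                         = uc_pixel[2] / 255.0f;  // R
            dst[i + input_h * input_w]     = uc_pixel[1] / 255.0f;  // G
            dst[i + 2 * input_h * input_w] = uc_pixel[0] / 255.0f;  // B
            uc_pixel += 3;
            ++i;
        }
    }
}

// Usage inside the batch loop would look like:
//   cv::Mat pr_img = preprocess_img(img);
//   blobFromBGR(pr_img, &data[b * 3 * INPUT_H * INPUT_W], INPUT_H, INPUT_W);
```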
-------------------------------------------------------------------------------- /yolov5s/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.6) 2 | 3 | project(yolov5) 4 | 5 | add_definitions(-std=c++11) 6 | 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 8 | set(CMAKE_CXX_STANDARD 11) 9 | set(CMAKE_BUILD_TYPE Debug) 10 | 11 | find_package(CUDA REQUIRED) 12 | 13 | set(CUDA_NVCC_PLAGS ${CUDA_NVCC_PLAGS};-std=c++11;-g;-G;-gencode;arch=compute_30;code=sm_30) 14 | 15 | include_directories(${PROJECT_SOURCE_DIR}/include) 16 | if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") 17 | message("embed_platform on") 18 | include_directories(/usr/local/cuda/targets/aarch64-linux/include) 19 | link_directories(/usr/local/cuda/targets/aarch64-linux/lib) 20 | else() 21 | message("embed_platform off") 22 | include_directories(/usr/local/cuda/include) 23 | link_directories(/usr/local/cuda/lib64) 24 | endif() 25 | 26 | 27 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED") 28 | 29 | cuda_add_library(yololayer SHARED ${PROJECT_SOURCE_DIR}/yololayer.cu) 30 | 31 | ########## opencv configuration ############ 32 | find_package(OpenCV 3.4 REQUIRED) 33 | message(OpenCV_LIBS) 34 | include_directories(OpenCV_INCLUDE_DIRS) 35 | 36 | add_executable(yolov5s ${PROJECT_SOURCE_DIR}/yolov5s.cpp) 37 | target_link_libraries(yolov5s nvinfer) 38 | target_link_libraries(yolov5s cudart) 39 | target_link_libraries(yolov5s yololayer) 40 | target_link_libraries(yolov5s ${OpenCV_LIBS}) 41 | 42 | add_definitions(-O2 -pthread) 43 | 44 | -------------------------------------------------------------------------------- /yolov5s/README.md: -------------------------------------------------------------------------------- 1 | # yolov5 2 | 3 | The Pytorch implementation is [ultralytics/yolov5](https://github.com/ultralytics/yolov5). 4 | 5 | I was using [ultralytics/yolov5](https://github.com/ultralytics/yolov5)(The latest version). Just in case the yolov5 model updated. 6 | 7 | ## How to Run 8 | 9 | ``` 10 | 1. generate yolov5m.wts from pytorch implementation with yolov5.pt 11 | 12 | git clone https://github.com/AIpakchoi/yolov5_tensorrt.git 13 | git clone https://github.com/ultralytics/yolov5.git 14 | // download its weights 'yolov5s.pt' 15 | cd yolov5 16 | cp ../yolov5_tensorrt/yolov5s/gen_wts.py . 17 | python gen_wts.py 18 | // a file 'yolov5s.wts' will be generated. 19 | 20 | 2. put yolov5s.wts into yolov5s, build and run 21 | 22 | mv yolov5s.wts ../yolov5_tensorrt/yolov5s/ 23 | cd ../yolov5_tensorrt/yolov5s 24 | mkdir build 25 | cd build 26 | cmake .. 27 | make 28 | sudo ./yolov5s -s // serialize model to plan file i.e. 'yolov5s.engine' 29 | sudo ./yolov5s -d ../samples // deserialize plan file and run inference, the images in samples will be processed. 30 | 31 | 3. check the images generated, as follows. _zidane.jpg and _bus.jpg 32 | ``` 33 | 34 |
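One pitfall worth noting before building: `yolov5s.cpp` in this repo loads the weights through a hard-coded relative path, `loadWeights("../yolov5sV2.wts")` inside `createEngine`, so either rename the generated `yolov5s.wts` to match or edit that path before running `./yolov5s -s`. For a quick sanity check of the generated `.wts` file, the sketch below (illustrative, not part of the repo) reads its header the same way `loadWeights` in `common.hpp` does:

```
#include <cstdint>
#include <fstream>
#include <iostream>
#include <string>

// Illustrative check: print the blob count and the first blob's name/length
// from a .wts file written by gen_wts.py.
int main(int argc, char** argv) {
    std::ifstream in(argc > 1 ? argv[1] : "yolov5s.wts");
    int32_t count = 0;
    in >> count;                          // first token: number of weight blobs
    std::string name; uint32_t len = 0;
    in >> name >> std::dec >> len;        // then "<layer name> <float count> <hex...>"
    std::cout << count << " blobs; first: " << name << " (" << len << " floats)\n";
    return in.good() ? 0 : 1;
}
```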
35 | 36 | 37 | 38 | 39 | 40 |
41 | 42 | ## Config 43 | 44 | - Input shape defined in yololayer.h 45 | - Number of classes defined in yololayer.h 46 | - FP16/FP32 can be selected by the macro in yolov5s.cpp 47 | - GPU id can be selected by the macro in yolov5s.cpp 48 | - NMS thresh in yolov5s.cpp 49 | - BBox confidence thresh in yolov5s.cpp 50 | - Batch size in yolov5s.cpp 51 | -------------------------------------------------------------------------------- /yolov5s/common.hpp: -------------------------------------------------------------------------------- 1 | #ifndef YOLOV5_COMMON_H_ 2 | #define YOLOV5_COMMON_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "NvInfer.h" 11 | #include "yololayer.h" 12 | 13 | #define CHECK(status) \ 14 | do\ 15 | {\ 16 | auto ret = (status);\ 17 | if (ret != 0)\ 18 | {\ 19 | std::cerr << "Cuda failure: " << ret << std::endl;\ 20 | abort();\ 21 | }\ 22 | } while (0) 23 | 24 | using namespace nvinfer1; 25 | // resize 优化 26 | // 从原图的点映射到输出图像的像素点 27 | // 对outuput的每个点,先根据长宽比计算其在原图中最邻近的像素点, 28 | //然后直接根据最邻近的思想,直接拷贝Channel个字节作为输出图像 29 | void resizeByNN(uchar *input, uchar *output, int height_in, int width_in, int channels, int height_out, int width_out) { 30 | 31 | uchar *data_source = input; 32 | uchar *data_half = output; 33 | 34 | int bpl_source = width_in * 3; 35 | int bpl_dst = width_out * 3; 36 | 37 | int pos = 0; 38 | int sep = 0; 39 | uchar *sr = nullptr; 40 | uchar *hr = nullptr; 41 | float step = 0.0; 42 | float step_x = float(width_in) / float(width_out); 43 | float step_y = float(height_in) / float(height_out); 44 | 45 | for (int i = 0; i < height_out; i++) { 46 | for (int j = 0; j < width_out; j++) { 47 | sep = int(step_y*i); 48 | step = int(j*step_x); 49 | sr = data_source + sep * bpl_source; 50 | hr = data_half + i * bpl_dst + j * channels; 51 | pos = step * channels; 52 | memcpy(hr, sr + pos, channels); 53 | } 54 | } 55 | return; 56 | } 57 | 58 | cv::Mat preprocess_img(cv::Mat& img) { 59 | int w, h, x, y; 60 | float r_w = Yolo::INPUT_W / (img.cols*1.0); 61 | float r_h = Yolo::INPUT_H / (img.rows*1.0); 62 | if (r_h > r_w) { 63 | w = Yolo::INPUT_W; 64 | h = r_w * img.rows; 65 | x = 0; 66 | y = (Yolo::INPUT_H - h) / 2; 67 | } else { 68 | w = r_h* img.cols; 69 | h = Yolo::INPUT_H; 70 | x = (Yolo::INPUT_W - w) / 2; 71 | y = 0; 72 | } 73 | cv::Mat re(h, w, CV_8UC3); 74 | //cv::resize(img, re, re.size(), 0, 0, cv::INTER_CUBIC); 75 | auto start = std::chrono::system_clock::now(); 76 | cv::resize(img, re, re.size(), 0, 0, cv::INTER_LINEAR); 77 | //resizeByNN(img.data, re.data, img.rows, img.cols, img.channels(), re.rows, re.cols); 78 | auto end = std::chrono::system_clock::now(); 79 | std::cout << "img resize: " << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; 80 | 81 | cv::Mat out(Yolo::INPUT_H, Yolo::INPUT_W, CV_8UC3, cv::Scalar(128, 128, 128)); 82 | 83 | re.copyTo(out(cv::Rect(x, y, re.cols, re.rows))); 84 | return out; 85 | } 86 | 87 | cv::Rect get_rect(cv::Mat& img, float bbox[4]) { 88 | int l, r, t, b; 89 | float r_w = Yolo::INPUT_W / (img.cols * 1.0); 90 | float r_h = Yolo::INPUT_H / (img.rows * 1.0); 91 | if (r_h > r_w) { 92 | l = bbox[0] - bbox[2]/2.f; 93 | if (l < 0) 94 | { 95 | l = 0; 96 | } 97 | r = bbox[0] + bbox[2]/2.f; 98 | if (r > img.cols) 99 | { 100 | r = img.cols; 101 | } 102 | t = bbox[1] - bbox[3]/2.f - (Yolo::INPUT_H - r_w * img.rows) / 2; 103 | if (t < 0) 104 | { 105 | t = 0; 106 | } 107 | b = bbox[1] + bbox[3]/2.f - (Yolo::INPUT_H - r_w * img.rows) / 2; 108 | if (b > img.rows) 109 | { 110 | 
b = img.rows; 111 | } 112 | l = l / r_w; 113 | r = r / r_w; 114 | t = t / r_w; 115 | b = b / r_w; 116 | } else { 117 | l = bbox[0] - bbox[2]/2.f - (Yolo::INPUT_W - r_h * img.cols) / 2; 118 | if (l < 0) 119 | { 120 | l = 0; 121 | } 122 | r = bbox[0] + bbox[2]/2.f - (Yolo::INPUT_W - r_h * img.cols) / 2; 123 | if (r > img.cols) 124 | { 125 | r = img.cols; 126 | } 127 | t = bbox[1] - bbox[3]/2.f; 128 | if (t < 0) 129 | { 130 | t = 0; 131 | } 132 | b = bbox[1] + bbox[3]/2.f; 133 | if (b > img.rows) 134 | { 135 | b = img.rows; 136 | } 137 | l = l / r_h; 138 | r = r / r_h; 139 | t = t / r_h; 140 | b = b / r_h; 141 | } 142 | return cv::Rect(l, t, r-l, b-t); 143 | } 144 | 145 | // std::max vs. max 146 | //https://www.cnblogs.com/timesdaughter/p/5894930.html 147 | // Use (std::min) and (std::max) 148 | float iou(float lbox[4], float rbox[4]) { 149 | float interBox[] = { 150 | (std::max)(lbox[0] - lbox[2]/2.f , rbox[0] - rbox[2]/2.f), //left 151 | (std::min)(lbox[0] + lbox[2]/2.f , rbox[0] + rbox[2]/2.f), //right 152 | (std::max)(lbox[1] - lbox[3]/2.f , rbox[1] - rbox[3]/2.f), //top 153 | (std::min)(lbox[1] + lbox[3]/2.f , rbox[1] + rbox[3]/2.f), //bottom 154 | }; 155 | 156 | if(interBox[2] > interBox[3] || interBox[0] > interBox[1]) 157 | return 0.0f; 158 | 159 | float interBoxS =(interBox[1]-interBox[0])*(interBox[3]-interBox[2]); 160 | return interBoxS/(lbox[2]*lbox[3] + rbox[2]*rbox[3] -interBoxS); 161 | } 162 | 163 | bool cmp(Yolo::Detection& a, Yolo::Detection& b) { 164 | return a.conf > b.conf; 165 | } 166 | 167 | void nms(std::vector& res, float *output, float conf_thresh, float nms_thresh = 0.5) { 168 | int det_size = sizeof(Yolo::Detection) / sizeof(float); 169 | std::map> m; 170 | for (int i = 0; i < output[0] && i < 1000; i++) { 171 | if (output[1 + det_size * i + 4] <= conf_thresh) continue; 172 | Yolo::Detection det; 173 | memcpy(&det, &output[1 + det_size * i], det_size * sizeof(float)); 174 | if (m.count(det.class_id) == 0) m.emplace(det.class_id, std::vector()); 175 | m[det.class_id].push_back(det); 176 | } 177 | for (auto it = m.begin(); it != m.end(); it++) { 178 | //std::cout << it->second[0].class_id << " --- " << std::endl; 179 | auto& dets = it->second; 180 | std::sort(dets.begin(), dets.end(), cmp); 181 | for (size_t m = 0; m < dets.size(); ++m) { 182 | auto& item = dets[m]; 183 | res.push_back(item); 184 | for (size_t n = m + 1; n < dets.size(); ++n) { 185 | if (iou(item.bbox, dets[n].bbox) > nms_thresh) { 186 | dets.erase(dets.begin()+n); 187 | --n; 188 | } 189 | } 190 | } 191 | } 192 | } 193 | 194 | // TensorRT weight files have a simple space delimited format: 195 | // [type] [size] 196 | std::map loadWeights(const std::string file) { 197 | std::cout << "Loading weights: " << file << std::endl; 198 | std::map weightMap; 199 | 200 | // Open weights file 201 | std::ifstream input(file); 202 | assert(input.is_open() && "Unable to load weight file."); 203 | 204 | // Read number of weight blobs 205 | int32_t count; 206 | input >> count; 207 | assert(count > 0 && "Invalid weight map file."); 208 | 209 | while (count--) 210 | { 211 | Weights wt{DataType::kFLOAT, nullptr, 0}; 212 | uint32_t size; 213 | 214 | // Read name and type of blob 215 | std::string name; 216 | input >> name >> std::dec >> size; 217 | wt.type = DataType::kFLOAT; 218 | 219 | // Load blob 220 | uint32_t* val = reinterpret_cast(malloc(sizeof(val) * size)); 221 | for (uint32_t x = 0, y = size; x < y; ++x) 222 | { 223 | input >> std::hex >> val[x]; 224 | } 225 | wt.values = val; 226 | 227 | wt.count = size; 228 | 
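        // Each hex token was written by gen_wts.py as struct.pack('>f', v).hex(), i.e. the
        // IEEE-754 bit pattern of the float; parsing it with std::hex into a uint32_t and
        // handing the buffer to TensorRT as kFLOAT therefore recovers the original values.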
weightMap[name] = wt; 229 | } 230 | 231 | return weightMap; 232 | } 233 | 234 | IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname, float eps) { 235 | float *gamma = (float*)weightMap[lname + ".weight"].values; 236 | float *beta = (float*)weightMap[lname + ".bias"].values; 237 | float *mean = (float*)weightMap[lname + ".running_mean"].values; 238 | float *var = (float*)weightMap[lname + ".running_var"].values; 239 | int len = weightMap[lname + ".running_var"].count; 240 | 241 | float *scval = reinterpret_cast(malloc(sizeof(float) * len)); 242 | for (int i = 0; i < len; i++) { 243 | scval[i] = gamma[i] / sqrt(var[i] + eps); 244 | } 245 | Weights scale{DataType::kFLOAT, scval, len}; 246 | 247 | float *shval = reinterpret_cast(malloc(sizeof(float) * len)); 248 | for (int i = 0; i < len; i++) { 249 | shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps); 250 | } 251 | Weights shift{DataType::kFLOAT, shval, len}; 252 | 253 | float *pval = reinterpret_cast(malloc(sizeof(float) * len)); 254 | for (int i = 0; i < len; i++) { 255 | pval[i] = 1.0; 256 | } 257 | Weights power{DataType::kFLOAT, pval, len}; 258 | 259 | weightMap[lname + ".scale"] = scale; 260 | weightMap[lname + ".shift"] = shift; 261 | weightMap[lname + ".power"] = power; 262 | IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power); 263 | assert(scale_1); 264 | return scale_1; 265 | } 266 | 267 | ILayer* convBnLeaky(INetworkDefinition *network, std::map& weightMap, ITensor& input, int outch, int ksize, int s, int g, std::string lname) { 268 | Weights emptywts{DataType::kFLOAT, nullptr, 0}; 269 | int p = ksize / 2; 270 | IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, DimsHW{ksize, ksize}, weightMap[lname + ".conv.weight"], emptywts); 271 | assert(conv1); 272 | conv1->setStrideNd(DimsHW{s, s}); 273 | conv1->setPaddingNd(DimsHW{p, p}); 274 | conv1->setNbGroups(g); 275 | //IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + ".bn", 1e-4); 276 | IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + ".bn", 1e-3); 277 | auto lr = network->addActivation(*bn1->getOutput(0), ActivationType::kLEAKY_RELU); 278 | lr->setAlpha(0.1); 279 | return lr; 280 | } 281 | 282 | ILayer* focus(INetworkDefinition *network, std::map& weightMap, ITensor& input, int inch, int outch, int ksize, std::string lname) { 283 | ISliceLayer *s1 = network->addSlice(input, Dims3{0, 0, 0}, Dims3{inch, Yolo::INPUT_H / 2, Yolo::INPUT_W / 2}, Dims3{1, 2, 2}); 284 | ISliceLayer *s2 = network->addSlice(input, Dims3{0, 1, 0}, Dims3{inch, Yolo::INPUT_H / 2, Yolo::INPUT_W / 2}, Dims3{1, 2, 2}); 285 | ISliceLayer *s3 = network->addSlice(input, Dims3{0, 0, 1}, Dims3{inch, Yolo::INPUT_H / 2, Yolo::INPUT_W / 2}, Dims3{1, 2, 2}); 286 | ISliceLayer *s4 = network->addSlice(input, Dims3{0, 1, 1}, Dims3{inch, Yolo::INPUT_H / 2, Yolo::INPUT_W / 2}, Dims3{1, 2, 2}); 287 | ITensor* inputTensors[] = {s1->getOutput(0), s2->getOutput(0), s3->getOutput(0), s4->getOutput(0)}; 288 | auto cat = network->addConcatenation(inputTensors, 4); 289 | auto conv = convBnLeaky(network, weightMap, *cat->getOutput(0), outch, ksize, 1, 1, lname + ".conv"); 290 | return conv; 291 | } 292 | 293 | ILayer* bottleneck(INetworkDefinition *network, std::map& weightMap, ITensor& input, int c1, int c2, bool shortcut, int g, float e, std::string lname) { 294 | auto cv1 = convBnLeaky(network, weightMap, input, (int)((float)c2 * e), 1, 1, 1, 
lname + ".cv1"); 295 | auto cv2 = convBnLeaky(network, weightMap, *cv1->getOutput(0), c2, 3, 1, g, lname + ".cv2"); 296 | if (shortcut && c1 == c2) { 297 | auto ew = network->addElementWise(input, *cv2->getOutput(0), ElementWiseOperation::kSUM); 298 | return ew; 299 | } 300 | return cv2; 301 | } 302 | 303 | ILayer* bottleneckCSP(INetworkDefinition *network, std::map& weightMap, ITensor& input, int c1, int c2, int n, bool shortcut, int g, float e, std::string lname) { 304 | Weights emptywts{DataType::kFLOAT, nullptr, 0}; 305 | int c_ = (int)((float)c2 * e); 306 | auto cv1 = convBnLeaky(network, weightMap, input, c_, 1, 1, 1, lname + ".cv1"); 307 | auto cv2 = network->addConvolutionNd(input, c_, DimsHW{1, 1}, weightMap[lname + ".cv2.weight"], emptywts); 308 | ITensor *y1 = cv1->getOutput(0); 309 | for (int i = 0; i < n; i++) { 310 | auto b = bottleneck(network, weightMap, *y1, c_, c_, shortcut, g, 1.0, lname + ".m." + std::to_string(i)); 311 | y1 = b->getOutput(0); 312 | } 313 | auto cv3 = network->addConvolutionNd(*y1, c_, DimsHW{1, 1}, weightMap[lname + ".cv3.weight"], emptywts); 314 | 315 | ITensor* inputTensors[] = {cv3->getOutput(0), cv2->getOutput(0)}; 316 | auto cat = network->addConcatenation(inputTensors, 2); 317 | 318 | IScaleLayer* bn = addBatchNorm2d(network, weightMap, *cat->getOutput(0), lname + ".bn", 1e-4); 319 | auto lr = network->addActivation(*bn->getOutput(0), ActivationType::kLEAKY_RELU); 320 | lr->setAlpha(0.1); 321 | 322 | auto cv4 = convBnLeaky(network, weightMap, *lr->getOutput(0), c2, 1, 1, 1, lname + ".cv4"); 323 | return cv4; 324 | } 325 | 326 | ILayer* SPP(INetworkDefinition *network, std::map& weightMap, ITensor& input, int c1, int c2, int k1, int k2, int k3, std::string lname) { 327 | int c_ = c1 / 2; 328 | auto cv1 = convBnLeaky(network, weightMap, input, c_, 1, 1, 1, lname + ".cv1"); 329 | 330 | auto pool1 = network->addPoolingNd(*cv1->getOutput(0), PoolingType::kMAX, DimsHW{k1, k1}); 331 | pool1->setPaddingNd(DimsHW{k1 / 2, k1 / 2}); 332 | pool1->setStrideNd(DimsHW{1, 1}); 333 | auto pool2 = network->addPoolingNd(*cv1->getOutput(0), PoolingType::kMAX, DimsHW{k2, k2}); 334 | pool2->setPaddingNd(DimsHW{k2 / 2, k2 / 2}); 335 | pool2->setStrideNd(DimsHW{1, 1}); 336 | auto pool3 = network->addPoolingNd(*cv1->getOutput(0), PoolingType::kMAX, DimsHW{k3, k3}); 337 | pool3->setPaddingNd(DimsHW{k3 / 2, k3 / 2}); 338 | pool3->setStrideNd(DimsHW{1, 1}); 339 | 340 | ITensor* inputTensors[] = {cv1->getOutput(0), pool1->getOutput(0), pool2->getOutput(0), pool3->getOutput(0)}; 341 | auto cat = network->addConcatenation(inputTensors, 4); 342 | 343 | auto cv2 = convBnLeaky(network, weightMap, *cat->getOutput(0), c2, 1, 1, 1, lname + ".cv2"); 344 | return cv2; 345 | } 346 | 347 | int read_files_in_dir(const char *p_dir_name, std::vector &file_names) { 348 | DIR *p_dir = opendir(p_dir_name); 349 | if (p_dir == nullptr) { 350 | return -1; 351 | } 352 | 353 | struct dirent* p_file = nullptr; 354 | while ((p_file = readdir(p_dir)) != nullptr) { 355 | if (strcmp(p_file->d_name, ".") != 0 && 356 | strcmp(p_file->d_name, "..") != 0) { 357 | //std::string cur_file_name(p_dir_name); 358 | //cur_file_name += "/"; 359 | //cur_file_name += p_file->d_name; 360 | std::string cur_file_name(p_file->d_name); 361 | file_names.push_back(cur_file_name); 362 | } 363 | } 364 | 365 | closedir(p_dir); 366 | return 0; 367 | } 368 | 369 | #endif 370 | 371 | -------------------------------------------------------------------------------- /yolov5s/gen_wts.py: 
-------------------------------------------------------------------------------- 1 | from utils.utils import * 2 | import struct 3 | 4 | # Initialize 5 | device = torch_utils.select_device('0') 6 | # Load model 7 | model = torch.load('weights/yolov5s.pt', map_location=device)['model'].float() # load to FP32 8 | model.to(device).eval() 9 | 10 | f = open('yolov5s.wts', 'w') 11 | f.write('{}\n'.format(len(model.state_dict().keys()))) 12 | for k, v in model.state_dict().items(): 13 | vr = v.reshape(-1).cpu().numpy() 14 | f.write('{} {} '.format(k, len(vr))) 15 | for vv in vr: 16 | f.write(' ') 17 | f.write(struct.pack('>f',float(vv)).hex()) 18 | f.write('\n') 19 | -------------------------------------------------------------------------------- /yolov5s/images/bus.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BaofengZan/yolov5_2.0-TensorRt/78193337dd94bf48c2d55c3a77105a534d11c45b/yolov5s/images/bus.jpg -------------------------------------------------------------------------------- /yolov5s/images/zidane.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BaofengZan/yolov5_2.0-TensorRt/78193337dd94bf48c2d55c3a77105a534d11c45b/yolov5s/images/zidane.jpg -------------------------------------------------------------------------------- /yolov5s/utils.h: -------------------------------------------------------------------------------- 1 | #ifndef __TRT_UTILS_H_ 2 | #define __TRT_UTILS_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #ifndef CUDA_CHECK 10 | 11 | #define CUDA_CHECK(callstr) \ 12 | { \ 13 | cudaError_t error_code = callstr; \ 14 | if (error_code != cudaSuccess) { \ 15 | std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \ 16 | assert(0); \ 17 | } \ 18 | } 19 | 20 | #endif 21 | 22 | namespace Tn 23 | { 24 | class Profiler : public nvinfer1::IProfiler 25 | { 26 | public: 27 | void printLayerTimes(int itrationsTimes) 28 | { 29 | float totalTime = 0; 30 | for (size_t i = 0; i < mProfile.size(); i++) 31 | { 32 | printf("%-40.40s %4.3fms\n", mProfile[i].first.c_str(), mProfile[i].second / itrationsTimes); 33 | totalTime += mProfile[i].second; 34 | } 35 | printf("Time over all layers: %4.3f\n", totalTime / itrationsTimes); 36 | } 37 | private: 38 | typedef std::pair Record; 39 | std::vector mProfile; 40 | 41 | virtual void reportLayerTime(const char* layerName, float ms) 42 | { 43 | auto record = std::find_if(mProfile.begin(), mProfile.end(), [&](const Record& r){ return r.first == layerName; }); 44 | if (record == mProfile.end()) 45 | mProfile.push_back(std::make_pair(layerName, ms)); 46 | else 47 | record->second += ms; 48 | } 49 | }; 50 | 51 | //Logger for TensorRT info/warning/errors 52 | class Logger : public nvinfer1::ILogger 53 | { 54 | public: 55 | 56 | Logger(): Logger(Severity::kWARNING) {} 57 | 58 | Logger(Severity severity): reportableSeverity(severity) {} 59 | 60 | void log(Severity severity, const char* msg) override 61 | { 62 | // suppress messages with severity enum value greater than the reportable 63 | if (severity > reportableSeverity) return; 64 | 65 | switch (severity) 66 | { 67 | case Severity::kINTERNAL_ERROR: std::cerr << "INTERNAL_ERROR: "; break; 68 | case Severity::kERROR: std::cerr << "ERROR: "; break; 69 | case Severity::kWARNING: std::cerr << "WARNING: "; break; 70 | case Severity::kINFO: std::cerr << "INFO: "; break; 71 | default: std::cerr << "UNKNOWN: "; break; 72 | } 73 | 
std::cerr << msg << std::endl; 74 | } 75 | 76 | Severity reportableSeverity{Severity::kWARNING}; 77 | }; 78 | 79 | template 80 | void write(char*& buffer, const T& val) 81 | { 82 | *reinterpret_cast(buffer) = val; 83 | buffer += sizeof(T); 84 | } 85 | 86 | template 87 | void read(const char*& buffer, T& val) 88 | { 89 | val = *reinterpret_cast(buffer); 90 | buffer += sizeof(T); 91 | } 92 | } 93 | 94 | #endif -------------------------------------------------------------------------------- /yolov5s/yololayer.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "yololayer.h" 3 | #include "utils.h" 4 | 5 | using namespace Yolo; 6 | 7 | namespace nvinfer1 8 | { 9 | YoloLayerPlugin::YoloLayerPlugin() 10 | { 11 | mClassCount = CLASS_NUM; 12 | mYoloKernel.clear(); 13 | mYoloKernel.push_back(yolo1); 14 | mYoloKernel.push_back(yolo2); 15 | mYoloKernel.push_back(yolo3); 16 | 17 | mKernelCount = mYoloKernel.size(); 18 | 19 | CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*))); 20 | size_t AnchorLen = sizeof(float)* CHECK_COUNT*2; 21 | for(int ii = 0; ii < mKernelCount; ii ++) 22 | { 23 | CUDA_CHECK(cudaMalloc(&mAnchor[ii],AnchorLen)); 24 | const auto& yolo = mYoloKernel[ii]; 25 | CUDA_CHECK(cudaMemcpy(mAnchor[ii], yolo.anchors, AnchorLen, cudaMemcpyHostToDevice)); 26 | } 27 | } 28 | 29 | YoloLayerPlugin::~YoloLayerPlugin() 30 | { 31 | } 32 | 33 | // create the plugin at runtime from a byte stream 34 | YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) 35 | { 36 | using namespace Tn; 37 | const char *d = reinterpret_cast(data), *a = d; 38 | read(d, mClassCount); 39 | read(d, mThreadCount); 40 | read(d, mKernelCount); 41 | mYoloKernel.resize(mKernelCount); 42 | auto kernelSize = mKernelCount*sizeof(YoloKernel); 43 | memcpy(mYoloKernel.data(),d,kernelSize); 44 | d += kernelSize; 45 | 46 | CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*))); 47 | size_t AnchorLen = sizeof(float)* CHECK_COUNT*2; 48 | for(int ii = 0; ii < mKernelCount; ii ++) 49 | { 50 | CUDA_CHECK(cudaMalloc(&mAnchor[ii],AnchorLen)); 51 | const auto& yolo = mYoloKernel[ii]; 52 | CUDA_CHECK(cudaMemcpy(mAnchor[ii], yolo.anchors, AnchorLen, cudaMemcpyHostToDevice)); 53 | } 54 | 55 | assert(d == a + length); 56 | } 57 | 58 | void YoloLayerPlugin::serialize(void* buffer) const 59 | { 60 | using namespace Tn; 61 | char* d = static_cast(buffer), *a = d; 62 | write(d, mClassCount); 63 | write(d, mThreadCount); 64 | write(d, mKernelCount); 65 | auto kernelSize = mKernelCount*sizeof(YoloKernel); 66 | memcpy(d,mYoloKernel.data(),kernelSize); 67 | d += kernelSize; 68 | 69 | assert(d == a + getSerializationSize()); 70 | } 71 | 72 | size_t YoloLayerPlugin::getSerializationSize() const 73 | { 74 | return sizeof(mClassCount) + sizeof(mThreadCount) + sizeof(mKernelCount) + sizeof(Yolo::YoloKernel) * mYoloKernel.size(); 75 | } 76 | 77 | int YoloLayerPlugin::initialize() 78 | { 79 | return 0; 80 | } 81 | 82 | Dims YoloLayerPlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims) 83 | { 84 | //output the result to channel 85 | int totalsize = MAX_OUTPUT_BBOX_COUNT * sizeof(Detection) / sizeof(float); 86 | 87 | return Dims3(totalsize + 1, 1, 1); 88 | } 89 | 90 | // Set plugin namespace 91 | void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace) 92 | { 93 | mPluginNamespace = pluginNamespace; 94 | } 95 | 96 | const char* YoloLayerPlugin::getPluginNamespace() const 97 | { 98 | return mPluginNamespace; 99 | } 100 | 101 | // 
Return the DataType of the plugin output at the requested index 102 | DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const 103 | { 104 | return DataType::kFLOAT; 105 | } 106 | 107 | // Return true if output tensor is broadcast across a batch. 108 | bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const 109 | { 110 | return false; 111 | } 112 | 113 | // Return true if plugin can use input that is broadcast across batch without replication. 114 | bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const 115 | { 116 | return false; 117 | } 118 | 119 | void YoloLayerPlugin::configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) 120 | { 121 | } 122 | 123 | // Attach the plugin object to an execution context and grant the plugin the access to some context resource. 124 | void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) 125 | { 126 | } 127 | 128 | // Detach the plugin object from its execution context. 129 | void YoloLayerPlugin::detachFromContext() {} 130 | 131 | const char* YoloLayerPlugin::getPluginType() const 132 | { 133 | return "YoloLayer_TRT"; 134 | } 135 | 136 | const char* YoloLayerPlugin::getPluginVersion() const 137 | { 138 | return "1"; 139 | } 140 | 141 | void YoloLayerPlugin::destroy() 142 | { 143 | delete this; 144 | } 145 | 146 | // Clone the plugin 147 | IPluginV2IOExt* YoloLayerPlugin::clone() const 148 | { 149 | YoloLayerPlugin *p = new YoloLayerPlugin(); 150 | p->setPluginNamespace(mPluginNamespace); 151 | return p; 152 | } 153 | 154 | __device__ float Logist(float data){ return 1.0f / (1.0f + expf(-data)); }; 155 | 156 | __global__ void CalDetection(const float *input, float *output,int noElements, 157 | int yoloWidth,int yoloHeight,const float anchors[CHECK_COUNT*2],int classes,int outputElem) { 158 | 159 | int idx = threadIdx.x + blockDim.x * blockIdx.x; 160 | if (idx >= noElements) return; 161 | 162 | int total_grid = yoloWidth * yoloHeight; 163 | int bnIdx = idx / total_grid; 164 | idx = idx - total_grid*bnIdx; 165 | int info_len_i = 5 + classes; 166 | const float* curInput = input + bnIdx * (info_len_i * total_grid * CHECK_COUNT); 167 | 168 | for (int k = 0; k < 3; ++k) { 169 | float box_prob = Logist(curInput[idx + k * info_len_i * total_grid + 4 * total_grid]); 170 | if (box_prob < IGNORE_THRESH) continue; 171 | int class_id = 0; 172 | float max_cls_prob = 0.0; 173 | for (int i = 5; i < info_len_i; ++i) { 174 | float p = Logist(curInput[idx + k * info_len_i * total_grid + i * total_grid]); 175 | if (p > max_cls_prob) { 176 | max_cls_prob = p; 177 | class_id = i - 5; 178 | } 179 | } 180 | float *res_count = output + bnIdx*outputElem; 181 | int count = (int)atomicAdd(res_count, 1); 182 | if (count >= MAX_OUTPUT_BBOX_COUNT) return; 183 | char* data = (char *)res_count + sizeof(float) + count * sizeof(Detection); 184 | Detection* det = (Detection*)(data); 185 | 186 | int row = idx / yoloWidth; 187 | int col = idx % yoloWidth; 188 | 189 | //Location 190 | det->bbox[0] = (col - 0.5f + 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 0 * total_grid])) * INPUT_W / yoloWidth; 191 | det->bbox[1] = (row - 0.5f + 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 1 * total_grid])) * INPUT_H / yoloHeight; 192 | det->bbox[2] = 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 2 * total_grid]); 193 | 
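        // YOLOv5 v2.0 box decode: x,y = (2*sigmoid(t) - 0.5 + grid) * stride (above);
        // w,h = (2*sigmoid(t))^2 * anchor, with the squaring and anchor scaling applied just below.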
det->bbox[2] = det->bbox[2] * det->bbox[2] * anchors[2*k]; 194 | det->bbox[3] = 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 3 * total_grid]); 195 | det->bbox[3] = det->bbox[3] * det->bbox[3] * anchors[2*k + 1]; 196 | det->conf = box_prob * max_cls_prob; 197 | det->class_id = class_id; 198 | } 199 | } 200 | 201 | void YoloLayerPlugin::forwardGpu(const float *const * inputs, float* output, cudaStream_t stream, int batchSize) { 202 | 203 | int outputElem = 1 + MAX_OUTPUT_BBOX_COUNT * sizeof(Detection) / sizeof(float); 204 | 205 | for(int idx = 0 ; idx < batchSize; ++idx) { 206 | CUDA_CHECK(cudaMemset(output + idx*outputElem, 0, sizeof(float))); 207 | } 208 | int numElem = 0; 209 | for (unsigned int i = 0; i < mYoloKernel.size(); ++i) 210 | { 211 | const auto& yolo = mYoloKernel[i]; 212 | numElem = yolo.width*yolo.height*batchSize; 213 | if (numElem < mThreadCount) 214 | mThreadCount = numElem; 215 | CalDetection<<< (yolo.width*yolo.height*batchSize + mThreadCount - 1) / mThreadCount, mThreadCount>>> 216 | (inputs[i], output, numElem, yolo.width, yolo.height, (float *)mAnchor[i], mClassCount, outputElem); 217 | } 218 | 219 | } 220 | 221 | 222 | int YoloLayerPlugin::enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) 223 | { 224 | forwardGpu((const float *const *)inputs, (float*)outputs[0], stream, batchSize); 225 | return 0; 226 | } 227 | 228 | PluginFieldCollection YoloPluginCreator::mFC{}; 229 | std::vector YoloPluginCreator::mPluginAttributes; 230 | 231 | YoloPluginCreator::YoloPluginCreator() 232 | { 233 | mPluginAttributes.clear(); 234 | 235 | mFC.nbFields = mPluginAttributes.size(); 236 | mFC.fields = mPluginAttributes.data(); 237 | } 238 | 239 | const char* YoloPluginCreator::getPluginName() const 240 | { 241 | return "YoloLayer_TRT"; 242 | } 243 | 244 | const char* YoloPluginCreator::getPluginVersion() const 245 | { 246 | return "1"; 247 | } 248 | 249 | const PluginFieldCollection* YoloPluginCreator::getFieldNames() 250 | { 251 | return &mFC; 252 | } 253 | 254 | IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) 255 | { 256 | YoloLayerPlugin* obj = new YoloLayerPlugin(); 257 | obj->setPluginNamespace(mNamespace.c_str()); 258 | return obj; 259 | } 260 | 261 | IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) 262 | { 263 | // This object will be deleted when the network is destroyed, which will 264 | // call MishPlugin::destroy() 265 | YoloLayerPlugin* obj = new YoloLayerPlugin(serialData, serialLength); 266 | obj->setPluginNamespace(mNamespace.c_str()); 267 | return obj; 268 | } 269 | 270 | } 271 | -------------------------------------------------------------------------------- /yolov5s/yololayer.h: -------------------------------------------------------------------------------- 1 | #ifndef _YOLO_LAYER_H 2 | #define _YOLO_LAYER_H 3 | 4 | #include 5 | #include 6 | #include "NvInfer.h" 7 | 8 | namespace Yolo 9 | { 10 | static constexpr int CHECK_COUNT = 3; 11 | static constexpr float IGNORE_THRESH = 0.1f; 12 | static constexpr int MAX_OUTPUT_BBOX_COUNT = 1000; 13 | static constexpr int CLASS_NUM = 80; 14 | static constexpr int INPUT_H = 608; 15 | static constexpr int INPUT_W = 608; 16 | 17 | struct YoloKernel 18 | { 19 | int width; 20 | int height; 21 | float anchors[CHECK_COUNT*2]; 22 | }; 23 | 24 | static constexpr YoloKernel yolo1 = { 25 | INPUT_W / 32, 26 | INPUT_H / 32, 27 | {116,90, 156,198, 373,326} 28 | 
}; 29 | static constexpr YoloKernel yolo2 = { 30 | INPUT_W / 16, 31 | INPUT_H / 16, 32 | {30,61, 62,45, 59,119} 33 | }; 34 | static constexpr YoloKernel yolo3 = { 35 | INPUT_W / 8, 36 | INPUT_H / 8, 37 | {10,13, 16,30, 33,23} 38 | }; 39 | 40 | static constexpr int LOCATIONS = 4; 41 | struct alignas(float) Detection{ 42 | //center_x center_y w h 43 | float bbox[LOCATIONS]; 44 | float conf; // bbox_conf * cls_conf 45 | float class_id; 46 | }; 47 | } 48 | 49 | namespace nvinfer1 50 | { 51 | class YoloLayerPlugin: public IPluginV2IOExt 52 | { 53 | public: 54 | explicit YoloLayerPlugin(); 55 | YoloLayerPlugin(const void* data, size_t length); 56 | 57 | ~YoloLayerPlugin(); 58 | 59 | int getNbOutputs() const override 60 | { 61 | return 1; 62 | } 63 | 64 | Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override; 65 | 66 | int initialize() override; 67 | 68 | virtual void terminate() override {}; 69 | 70 | virtual size_t getWorkspaceSize(int maxBatchSize) const override { return 0;} 71 | 72 | virtual int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override; 73 | 74 | virtual size_t getSerializationSize() const override; 75 | 76 | virtual void serialize(void* buffer) const override; 77 | 78 | bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const override { 79 | return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT; 80 | } 81 | 82 | const char* getPluginType() const override; 83 | 84 | const char* getPluginVersion() const override; 85 | 86 | void destroy() override; 87 | 88 | IPluginV2IOExt* clone() const override; 89 | 90 | void setPluginNamespace(const char* pluginNamespace) override; 91 | 92 | const char* getPluginNamespace() const override; 93 | 94 | DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const override; 95 | 96 | bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const override; 97 | 98 | bool canBroadcastInputAcrossBatch(int inputIndex) const override; 99 | 100 | void attachToContext( 101 | cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) override; 102 | 103 | void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) override; 104 | 105 | void detachFromContext() override; 106 | 107 | private: 108 | void forwardGpu(const float *const * inputs,float * output, cudaStream_t stream,int batchSize = 1); 109 | int mClassCount; 110 | int mKernelCount; 111 | std::vector mYoloKernel; 112 | int mThreadCount = 256; 113 | void** mAnchor; 114 | const char* mPluginNamespace; 115 | }; 116 | 117 | class YoloPluginCreator : public IPluginCreator 118 | { 119 | public: 120 | YoloPluginCreator(); 121 | 122 | ~YoloPluginCreator() override = default; 123 | 124 | const char* getPluginName() const override; 125 | 126 | const char* getPluginVersion() const override; 127 | 128 | const PluginFieldCollection* getFieldNames() override; 129 | 130 | IPluginV2IOExt* createPlugin(const char* name, const PluginFieldCollection* fc) override; 131 | 132 | IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) override; 133 | 134 | void setPluginNamespace(const char* libNamespace) override 135 | { 136 | mNamespace = libNamespace; 137 | } 138 | 139 | const char* getPluginNamespace() const override 140 | { 141 | return mNamespace.c_str(); 142 | 
} 143 | 144 | private: 145 | std::string mNamespace; 146 | static PluginFieldCollection mFC; 147 | static std::vector mPluginAttributes; 148 | }; 149 | 150 | 151 | 152 | }; 153 | 154 | #endif 155 | -------------------------------------------------------------------------------- /yolov5s/yolov5s.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "cuda_runtime_api.h" 4 | #include "logging.h" 5 | #include "common.hpp" 6 | 7 | #define USE_FP16 // comment out this if want to use FP32 8 | #define DEVICE 0 // GPU id 9 | #define NMS_THRESH 0.5 10 | #define CONF_THRESH 0.4 11 | #define BATCH_SIZE 1 12 | 13 | // stuff we know about the network and the input/output blobs 14 | static const int INPUT_H = Yolo::INPUT_H; 15 | static const int INPUT_W = Yolo::INPUT_W; 16 | static const int OUTPUT_SIZE = Yolo::MAX_OUTPUT_BBOX_COUNT * sizeof(Yolo::Detection) / sizeof(float) + 1; // we assume the yololayer outputs no more than 1000 boxes that conf >= 0.1 17 | const char* INPUT_BLOB_NAME = "data"; 18 | const char* OUTPUT_BLOB_NAME = "prob"; 19 | static Logger gLogger; 20 | REGISTER_TENSORRT_PLUGIN(YoloPluginCreator); 21 | 22 | // Creat the engine using only the API and not any parser. 23 | ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) { 24 | INetworkDefinition* network = builder->createNetworkV2(0U); 25 | 26 | // Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME 27 | ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W}); 28 | assert(data); 29 | 30 | std::map weightMap = loadWeights("../yolov5sV2.wts"); 31 | Weights emptywts{DataType::kFLOAT, nullptr, 0}; 32 | 33 | // yolov5 backbone 34 | auto focus0 = focus(network, weightMap, *data, 3, 32, 3, "model.0"); 35 | auto conv1 = convBnLeaky(network, weightMap, *focus0->getOutput(0), 64, 3, 2, 1, "model.1"); 36 | auto bottleneck_CSP2 = bottleneckCSP(network, weightMap, *conv1->getOutput(0), 64, 64, 1, true, 1, 0.5, "model.2"); 37 | auto conv3 = convBnLeaky(network, weightMap, *bottleneck_CSP2->getOutput(0), 128, 3, 2, 1, "model.3"); 38 | auto bottleneck_csp4 = bottleneckCSP(network, weightMap, *conv3->getOutput(0), 128, 128, 3, true, 1, 0.5, "model.4"); 39 | auto conv5 = convBnLeaky(network, weightMap, *bottleneck_csp4->getOutput(0), 256, 3, 2, 1, "model.5"); 40 | auto bottleneck_csp6 = bottleneckCSP(network, weightMap, *conv5->getOutput(0), 256, 256, 3, true, 1, 0.5, "model.6"); 41 | auto conv7 = convBnLeaky(network, weightMap, *bottleneck_csp6->getOutput(0), 512, 3, 2, 1, "model.7"); 42 | auto spp8 = SPP(network, weightMap, *conv7->getOutput(0), 512, 512, 5, 9, 13, "model.8"); 43 | 44 | // yolov5 head 45 | auto bottleneck_csp9 = bottleneckCSP(network, weightMap, *spp8->getOutput(0), 512, 512, 1, false, 1, 0.5, "model.9"); 46 | auto conv10 = convBnLeaky(network, weightMap, *bottleneck_csp9->getOutput(0), 256, 1, 1, 1, "model.10"); 47 | 48 | float *deval = reinterpret_cast(malloc(sizeof(float) * 256 * 2 * 2)); 49 | for (int i = 0; i < 256 * 2 * 2; i++) { 50 | deval[i] = 1.0; 51 | } 52 | Weights deconvwts11{DataType::kFLOAT, deval, 256 * 2 * 2}; 53 | IDeconvolutionLayer* deconv11 = network->addDeconvolutionNd(*conv10->getOutput(0), 256, DimsHW{2, 2}, deconvwts11, emptywts); 54 | deconv11->setStrideNd(DimsHW{2, 2}); 55 | deconv11->setNbGroups(256); 56 | weightMap["deconv11"] = deconvwts11; 57 | 58 | ITensor* inputTensors12[] = {deconv11->getOutput(0), 
bottleneck_csp6->getOutput(0)}; 59 | auto cat12 = network->addConcatenation(inputTensors12, 2); 60 | auto bottleneck_csp13 = bottleneckCSP(network, weightMap, *cat12->getOutput(0), 512, 256, 1, false, 1, 0.5, "model.13"); 61 | auto conv14 = convBnLeaky(network, weightMap, *bottleneck_csp13->getOutput(0), 128, 1, 1, 1, "model.14"); 62 | 63 | Weights deconvwts15{DataType::kFLOAT, deval, 128 * 2 * 2}; 64 | IDeconvolutionLayer* deconv15 = network->addDeconvolutionNd(*conv14->getOutput(0), 128, DimsHW{2, 2}, deconvwts15, emptywts); 65 | deconv15->setStrideNd(DimsHW{2, 2}); 66 | deconv15->setNbGroups(128); 67 | //weightMap["deconv15"] = deconvwts15; 68 | 69 | ITensor* inputTensors16[] = {deconv15->getOutput(0), bottleneck_csp4->getOutput(0)}; 70 | auto cat16 = network->addConcatenation(inputTensors16, 2); 71 | auto bottleneck_csp17 = bottleneckCSP(network, weightMap, *cat16->getOutput(0), 256, 128, 1, false, 1, 0.5, "model.17"); 72 | IConvolutionLayer* conv18 = network->addConvolutionNd(*bottleneck_csp17->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["model.24.m.0.weight"], weightMap["model.24.m.0.bias"]); 73 | 74 | auto conv19 = convBnLeaky(network, weightMap, *bottleneck_csp17->getOutput(0), 128, 3, 2, 1, "model.18"); 75 | ITensor* inputTensors20[] = {conv19->getOutput(0), conv14->getOutput(0)}; 76 | auto cat20 = network->addConcatenation(inputTensors20, 2); 77 | auto bottleneck_csp21 = bottleneckCSP(network, weightMap, *cat20->getOutput(0), 256, 256, 1, false, 1, 0.5, "model.20"); 78 | IConvolutionLayer* conv22 = network->addConvolutionNd(*bottleneck_csp21->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["model.24.m.1.weight"], weightMap["model.24.m.1.bias"]); 79 | 80 | auto conv23 = convBnLeaky(network, weightMap, *bottleneck_csp21->getOutput(0), 256, 3, 2, 1, "model.21"); 81 | ITensor* inputTensors24[] = {conv23->getOutput(0), conv10->getOutput(0)}; 82 | auto cat24 = network->addConcatenation(inputTensors24, 2); 83 | auto bottleneck_csp25 = bottleneckCSP(network, weightMap, *cat24->getOutput(0), 512, 512, 1, false, 1, 0.5, "model.23"); 84 | IConvolutionLayer* conv26 = network->addConvolutionNd(*bottleneck_csp25->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["model.24.m.2.weight"], weightMap["model.24.m.2.bias"]); 85 | 86 | auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1"); 87 | const PluginFieldCollection* pluginData = creator->getFieldNames(); 88 | IPluginV2 *pluginObj = creator->createPlugin("yololayer", pluginData); 89 | ITensor* inputTensors_yolo[] = {conv26->getOutput(0), conv22->getOutput(0), conv18->getOutput(0)}; 90 | auto yolo = network->addPluginV2(inputTensors_yolo, 3, *pluginObj); 91 | 92 | yolo->getOutput(0)->setName(OUTPUT_BLOB_NAME); 93 | network->markOutput(*yolo->getOutput(0)); 94 | 95 | // Build engine 96 | builder->setMaxBatchSize(maxBatchSize); 97 | config->setMaxWorkspaceSize(16 * (1 << 20)); // 16MB 98 | #ifdef USE_FP16 99 | config->setFlag(BuilderFlag::kFP16); 100 | #endif 101 | std::cout << "Building engine, please wait for a while..." << std::endl; 102 | ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); 103 | std::cout << "Build engine successfully!" 
<< std::endl; 104 | 105 | // Don't need the network any more 106 | network->destroy(); 107 | 108 | // Release host memory 109 | for (auto& mem : weightMap) 110 | { 111 | free((void*) (mem.second.values)); 112 | } 113 | 114 | return engine; 115 | } 116 | 117 | void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) { 118 | // Create builder 119 | IBuilder* builder = createInferBuilder(gLogger); 120 | IBuilderConfig* config = builder->createBuilderConfig(); 121 | 122 | // Create model to populate the network, then set the outputs and create an engine 123 | ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT); 124 | assert(engine != nullptr); 125 | 126 | // Serialize the engine 127 | (*modelStream) = engine->serialize(); 128 | 129 | // Close everything down 130 | engine->destroy(); 131 | builder->destroy(); 132 | } 133 | 134 | void doInference(IExecutionContext& context, float* input, float* output, int batchSize) { 135 | const ICudaEngine& engine = context.getEngine(); 136 | 137 | // Pointers to input and output device buffers to pass to engine. 138 | // Engine requires exactly IEngine::getNbBindings() number of buffers. 139 | assert(engine.getNbBindings() == 2); 140 | void* buffers[2]; 141 | 142 | // In order to bind the buffers, we need to know the names of the input and output tensors. 143 | // Note that indices are guaranteed to be less than IEngine::getNbBindings() 144 | const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME); 145 | const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME); 146 | 147 | // Create GPU buffers on device 148 | CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * INPUT_H * INPUT_W * sizeof(float))); 149 | CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float))); 150 | 151 | // Create stream 152 | cudaStream_t stream; 153 | CHECK(cudaStreamCreate(&stream)); 154 | 155 | // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host 156 | CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream)); 157 | context.enqueue(batchSize, buffers, stream, nullptr); 158 | CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream)); 159 | cudaStreamSynchronize(stream); 160 | 161 | // Release stream and buffers 162 | cudaStreamDestroy(stream); 163 | CHECK(cudaFree(buffers[inputIndex])); 164 | CHECK(cudaFree(buffers[outputIndex])); 165 | } 166 | 167 | int main(int argc, char** argv) { 168 | cudaSetDevice(DEVICE); 169 | // create a model using the API directly and serialize it to a stream 170 | char *trtModelStream{nullptr}; 171 | size_t size{0}; 172 | 173 | if (argc == 2 && std::string(argv[1]) == "-s") { 174 | IHostMemory* modelStream{nullptr}; 175 | APIToModel(BATCH_SIZE, &modelStream); 176 | assert(modelStream != nullptr); 177 | std::ofstream p("yolov5s.engine", std::ios::binary); 178 | if (!p) { 179 | std::cerr << "could not open plan output file" << std::endl; 180 | return -1; 181 | } 182 | p.write(reinterpret_cast(modelStream->data()), modelStream->size()); 183 | modelStream->destroy(); 184 | return 0; 185 | } else if (argc == 3 && std::string(argv[1]) == "-d") { 186 | std::ifstream file("yolov5s.engine", std::ios::binary); 187 | if (file.good()) { 188 | file.seekg(0, file.end); 189 | size = file.tellg(); 190 | file.seekg(0, file.beg); 191 | trtModelStream = new char[size]; 192 | assert(trtModelStream); 193 | 
file.read(trtModelStream, size); 194 | file.close(); 195 | } 196 | } else { 197 | std::cerr << "arguments not right!" << std::endl; 198 | std::cerr << "./yolov5s -s // serialize model to plan file" << std::endl; 199 | std::cerr << "./yolov5s -d ../samples // deserialize plan file and run inference" << std::endl; 200 | return -1; 201 | } 202 | 203 | std::vector file_names; 204 | if (read_files_in_dir(argv[2], file_names) < 0) { 205 | std::cout << "read_files_in_dir failed." << std::endl; 206 | return -1; 207 | } 208 | 209 | // prepare input data --------------------------- 210 | static float data[BATCH_SIZE * 3 * INPUT_H * INPUT_W]; 211 | //for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++) 212 | // data[i] = 1.0; 213 | static float prob[BATCH_SIZE * OUTPUT_SIZE]; 214 | IRuntime* runtime = createInferRuntime(gLogger); 215 | assert(runtime != nullptr); 216 | ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size); 217 | assert(engine != nullptr); 218 | IExecutionContext* context = engine->createExecutionContext(); 219 | assert(context != nullptr); 220 | delete[] trtModelStream; 221 | 222 | int fcount = 0; 223 | for (int f = 0; f < (int)file_names.size(); f++) { 224 | fcount++; 225 | if (fcount < BATCH_SIZE && f + 1 != (int)file_names.size()) continue; 226 | for (int b = 0; b < fcount; b++) { 227 | cv::Mat img = cv::imread(std::string(argv[2]) + "/" + file_names[f - fcount + 1 + b]); 228 | if (img.empty()) continue; 229 | cv::Mat pr_img = preprocess_img(img); 230 | int i = 0; 231 | for (int row = 0; row < INPUT_H; ++row) { 232 | uchar* uc_pixel = pr_img.data + row * pr_img.step; 233 | for (int col = 0; col < INPUT_W; ++col) { 234 | data[b * 3 * INPUT_H * INPUT_W + i] = uc_pixel[2] / 255; 235 | data[b * 3 * INPUT_H * INPUT_W + i + INPUT_H * INPUT_W] = uc_pixel[1] / 255.0; 236 | data[b * 3 * INPUT_H * INPUT_W + i + 2 * INPUT_H * INPUT_W] = uc_pixel[0] / 255.0; 237 | uc_pixel += 3; 238 | ++i; 239 | } 240 | } 241 | } 242 | 243 | // Run inference 244 | auto start = std::chrono::system_clock::now(); 245 | doInference(*context, data, prob, BATCH_SIZE); 246 | auto end = std::chrono::system_clock::now(); 247 | std::cout << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; 248 | std::vector> batch_res(fcount); 249 | for (int b = 0; b < fcount; b++) { 250 | auto& res = batch_res[b]; 251 | nms(res, &prob[b * OUTPUT_SIZE], CONF_THRESH, NMS_THRESH); 252 | } 253 | for (int b = 0; b < fcount; b++) { 254 | auto& res = batch_res[b]; 255 | //std::cout << res.size() << std::endl; 256 | cv::Mat img = cv::imread(std::string(argv[2]) + "/" + file_names[f - fcount + 1 + b]); 257 | for (size_t j = 0; j < res.size(); j++) { 258 | cv::Rect r = get_rect(img, res[j].bbox); 259 | cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2); 260 | cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2); 261 | } 262 | cv::imwrite("_" + file_names[f - fcount + 1 + b], img); 263 | } 264 | fcount = 0; 265 | } 266 | 267 | // Destroy the engine 268 | context->destroy(); 269 | engine->destroy(); 270 | runtime->destroy(); 271 | 272 | // Print histogram of the output distribution 273 | //std::cout << "\nOutput:\n\n"; 274 | //for (unsigned int i = 0; i < OUTPUT_SIZE; i++) 275 | //{ 276 | // std::cout << prob[i] << ", "; 277 | // if (i % 10 == 0) std::cout << std::endl; 278 | //} 279 | //std::cout << std::endl; 280 | 281 | return 0; 282 | } 283 | 
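For reference, the flat `prob` buffer that `nms()` (in `common.hpp` above) consumes is laid out by the YoloLayer plugin as a leading count followed by fixed-size records matching `Yolo::Detection` in `yololayer.h`. A minimal sketch of reading it directly, under that assumption (the helper name is illustrative, not part of the repo):

```
#include <cstring>
#include <vector>

// Illustrative decoder: prob[0] holds the number of candidate detections counted by
// the kernel (it can exceed the 1000 records actually stored, hence the cap below);
// each record is 6 floats {cx, cy, w, h, conf, class_id}, exactly as nms() reads them.
struct RawDet { float bbox[4]; float conf; float class_id; };

static std::vector<RawDet> decodeRaw(const float* prob, float conf_thresh) {
    std::vector<RawDet> kept;
    const int det_size = sizeof(RawDet) / sizeof(float);    // 6
    const int n = static_cast<int>(prob[0]);
    for (int i = 0; i < n && i < 1000; ++i) {               // same 1000 cap as nms()
        RawDet d;
        std::memcpy(&d, prob + 1 + i * det_size, sizeof(RawDet));
        if (d.conf > conf_thresh) kept.push_back(d);
    }
    return kept;
}
```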
-------------------------------------------------------------------------------- /yolov5x/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.6) 2 | 3 | project(yolov5) 4 | 5 | add_definitions(-std=c++11) 6 | 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 8 | set(CMAKE_CXX_STANDARD 11) 9 | set(CMAKE_BUILD_TYPE Debug) 10 | 11 | find_package(CUDA REQUIRED) 12 | 13 | set(CUDA_NVCC_PLAGS ${CUDA_NVCC_PLAGS};-std=c++11;-g;-G;-gencode;arch=compute_30;code=sm_30) 14 | 15 | include_directories(${PROJECT_SOURCE_DIR}/include) 16 | if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") 17 | message("embed_platform on") 18 | include_directories(/usr/local/cuda/targets/aarch64-linux/include) 19 | link_directories(/usr/local/cuda/targets/aarch64-linux/lib) 20 | else() 21 | message("embed_platform off") 22 | include_directories(/usr/local/cuda/include) 23 | link_directories(/usr/local/cuda/lib64) 24 | endif() 25 | 26 | 27 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED") 28 | 29 | cuda_add_library(yololayer SHARED ${PROJECT_SOURCE_DIR}/yololayer.cu) 30 | 31 | ########## opencv configuration ############ 32 | find_package(OpenCV 3.4 REQUIRED) 33 | message(OpenCV_LIBS) 34 | include_directories(OpenCV_INCLUDE_DIRS) 35 | 36 | add_executable(yolov5x ${PROJECT_SOURCE_DIR}/yolov5x.cpp) 37 | target_link_libraries(yolov5x nvinfer) 38 | target_link_libraries(yolov5x cudart) 39 | target_link_libraries(yolov5x yololayer) 40 | target_link_libraries(yolov5x ${OpenCV_LIBS}) 41 | 42 | add_definitions(-O2 -pthread) 43 | 44 | -------------------------------------------------------------------------------- /yolov5x/README.md: -------------------------------------------------------------------------------- 1 | # yolov5 2 | 3 | The Pytorch implementation is [ultralytics/yolov5](https://github.com/ultralytics/yolov5). 4 | 5 | I was using [ultralytics/yolov5](https://github.com/ultralytics/yolov5)(The latest version). Just in case the yolov5 model updated. 6 | 7 | ## How to Run 8 | 9 | ``` 10 | 1. generate yolov5x.wts from pytorch implementation with yolov5.pt 11 | 12 | git clone https://github.com/AIpakchoi/yolov5_tensorrt.git 13 | git clone https://github.com/ultralytics/yolov5.git 14 | // download its weights 'yolov5x.pt' 15 | cd yolov5 16 | cp ../yolov5_tensorrt/yolov5x/gen_wts.py . 17 | python gen_wts.py 18 | // a file 'yolov5x.wts' will be generated. 19 | 20 | 2. put yolov5x.wts into yolov5x, build and run 21 | 22 | mv yolov5x.wts ../yolov5_tensorrt/yolov5x/ 23 | cd ../yolov5_tensorrt/yolov5x 24 | mkdir build 25 | cd build 26 | cmake .. 27 | make 28 | sudo ./yolov5x -s // serialize model to plan file i.e. 'yolov5x.engine' 29 | sudo ./yolov5x -d ../samples // deserialize plan file and run inference, the images in samples will be processed. 30 | 31 | 3. check the images generated, as follows. _zidane.jpg and _bus.jpg 32 | ``` 33 | 34 |
35 | 36 | 37 | 38 | 39 | 40 |
41 | 42 | ## Config 43 | 44 | - Input shape defined in yololayer.h 45 | - Number of classes defined in yololayer.h 46 | - FP16/FP32 can be selected by the macro in yolov5x.cpp 47 | - GPU id can be selected by the macro in yolov5x.cpp 48 | - NMS thresh in yolov5x.cpp 49 | - BBox confidence thresh in yolov5x.cpp 50 | - Batch size in yolov5x.cpp 51 | -------------------------------------------------------------------------------- /yolov5x/common.hpp: -------------------------------------------------------------------------------- 1 | #ifndef YOLOV5_COMMON_H_ 2 | #define YOLOV5_COMMON_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "NvInfer.h" 11 | #include "yololayer.h" 12 | 13 | #define CHECK(status) \ 14 | do\ 15 | {\ 16 | auto ret = (status);\ 17 | if (ret != 0)\ 18 | {\ 19 | std::cerr << "Cuda failure: " << ret << std::endl;\ 20 | abort();\ 21 | }\ 22 | } while (0) 23 | 24 | using namespace nvinfer1; 25 | // resize 优化 26 | // 从原图的点映射到输出图像的像素点 27 | // 对outuput的每个点,先根据长宽比计算其在原图中最邻近的像素点, 28 | //然后直接根据最邻近的思想,直接拷贝Channel个字节作为输出图像 29 | void resizeByNN(uchar *input, uchar *output, int height_in, int width_in, int channels, int height_out, int width_out) { 30 | 31 | uchar *data_source = input; 32 | uchar *data_half = output; 33 | 34 | int bpl_source = width_in * 3; 35 | int bpl_dst = width_out * 3; 36 | 37 | int pos = 0; 38 | int sep = 0; 39 | uchar *sr = nullptr; 40 | uchar *hr = nullptr; 41 | float step = 0.0; 42 | float step_x = float(width_in) / float(width_out); 43 | float step_y = float(height_in) / float(height_out); 44 | 45 | for (int i = 0; i < height_out; i++) { 46 | for (int j = 0; j < width_out; j++) { 47 | sep = int(step_y*i); 48 | step = int(j*step_x); 49 | sr = data_source + sep * bpl_source; 50 | hr = data_half + i * bpl_dst + j * channels; 51 | pos = step * channels; 52 | memcpy(hr, sr + pos, channels); 53 | } 54 | } 55 | return; 56 | } 57 | 58 | cv::Mat preprocess_img(cv::Mat& img) { 59 | int w, h, x, y; 60 | float r_w = Yolo::INPUT_W / (img.cols*1.0); 61 | float r_h = Yolo::INPUT_H / (img.rows*1.0); 62 | if (r_h > r_w) { 63 | w = Yolo::INPUT_W; 64 | h = r_w * img.rows; 65 | x = 0; 66 | y = (Yolo::INPUT_H - h) / 2; 67 | } else { 68 | w = r_h* img.cols; 69 | h = Yolo::INPUT_H; 70 | x = (Yolo::INPUT_W - w) / 2; 71 | y = 0; 72 | } 73 | cv::Mat re(h, w, CV_8UC3); 74 | //cv::resize(img, re, re.size(), 0, 0, cv::INTER_CUBIC); 75 | auto start = std::chrono::system_clock::now(); 76 | cv::resize(img, re, re.size(), 0, 0, cv::INTER_LINEAR); 77 | //resizeByNN(img.data, re.data, img.rows, img.cols, img.channels(), re.rows, re.cols); 78 | auto end = std::chrono::system_clock::now(); 79 | std::cout << "img resize: " << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; 80 | 81 | cv::Mat out(Yolo::INPUT_H, Yolo::INPUT_W, CV_8UC3, cv::Scalar(128, 128, 128)); 82 | 83 | re.copyTo(out(cv::Rect(x, y, re.cols, re.rows))); 84 | return out; 85 | } 86 | 87 | cv::Rect get_rect(cv::Mat& img, float bbox[4]) { 88 | int l, r, t, b; 89 | float r_w = Yolo::INPUT_W / (img.cols * 1.0); 90 | float r_h = Yolo::INPUT_H / (img.rows * 1.0); 91 | if (r_h > r_w) { 92 | l = bbox[0] - bbox[2]/2.f; 93 | if (l < 0) 94 | { 95 | l = 0; 96 | } 97 | r = bbox[0] + bbox[2]/2.f; 98 | if (r > img.cols) 99 | { 100 | r = img.cols; 101 | } 102 | t = bbox[1] - bbox[3]/2.f - (Yolo::INPUT_H - r_w * img.rows) / 2; 103 | if (t < 0) 104 | { 105 | t = 0; 106 | } 107 | b = bbox[1] + bbox[3]/2.f - (Yolo::INPUT_H - r_w * img.rows) / 2; 108 | if (b > img.rows) 109 | { 110 | 
b = img.rows; 111 | } 112 | l = l / r_w; 113 | r = r / r_w; 114 | t = t / r_w; 115 | b = b / r_w; 116 | } else { 117 | l = bbox[0] - bbox[2]/2.f - (Yolo::INPUT_W - r_h * img.cols) / 2; 118 | if (l < 0) 119 | { 120 | l = 0; 121 | } 122 | r = bbox[0] + bbox[2]/2.f - (Yolo::INPUT_W - r_h * img.cols) / 2; 123 | if (r > img.cols) 124 | { 125 | r = img.cols; 126 | } 127 | t = bbox[1] - bbox[3]/2.f; 128 | if (t < 0) 129 | { 130 | t = 0; 131 | } 132 | b = bbox[1] + bbox[3]/2.f; 133 | if (b > img.rows) 134 | { 135 | b = img.rows; 136 | } 137 | l = l / r_h; 138 | r = r / r_h; 139 | t = t / r_h; 140 | b = b / r_h; 141 | } 142 | return cv::Rect(l, t, r-l, b-t); 143 | } 144 | 145 | // std::max vs. max 146 | //https://www.cnblogs.com/timesdaughter/p/5894930.html 147 | // Use (std::min) and (std::max) 148 | float iou(float lbox[4], float rbox[4]) { 149 | float interBox[] = { 150 | (std::max)(lbox[0] - lbox[2]/2.f , rbox[0] - rbox[2]/2.f), //left 151 | (std::min)(lbox[0] + lbox[2]/2.f , rbox[0] + rbox[2]/2.f), //right 152 | (std::max)(lbox[1] - lbox[3]/2.f , rbox[1] - rbox[3]/2.f), //top 153 | (std::min)(lbox[1] + lbox[3]/2.f , rbox[1] + rbox[3]/2.f), //bottom 154 | }; 155 | 156 | if(interBox[2] > interBox[3] || interBox[0] > interBox[1]) 157 | return 0.0f; 158 | 159 | float interBoxS =(interBox[1]-interBox[0])*(interBox[3]-interBox[2]); 160 | return interBoxS/(lbox[2]*lbox[3] + rbox[2]*rbox[3] -interBoxS); 161 | } 162 | 163 | bool cmp(Yolo::Detection& a, Yolo::Detection& b) { 164 | return a.conf > b.conf; 165 | } 166 | 167 | void nms(std::vector& res, float *output, float conf_thresh, float nms_thresh = 0.5) { 168 | int det_size = sizeof(Yolo::Detection) / sizeof(float); 169 | std::map> m; 170 | for (int i = 0; i < output[0] && i < 1000; i++) { 171 | if (output[1 + det_size * i + 4] <= conf_thresh) continue; 172 | Yolo::Detection det; 173 | memcpy(&det, &output[1 + det_size * i], det_size * sizeof(float)); 174 | if (m.count(det.class_id) == 0) m.emplace(det.class_id, std::vector()); 175 | m[det.class_id].push_back(det); 176 | } 177 | for (auto it = m.begin(); it != m.end(); it++) { 178 | //std::cout << it->second[0].class_id << " --- " << std::endl; 179 | auto& dets = it->second; 180 | std::sort(dets.begin(), dets.end(), cmp); 181 | for (size_t m = 0; m < dets.size(); ++m) { 182 | auto& item = dets[m]; 183 | res.push_back(item); 184 | for (size_t n = m + 1; n < dets.size(); ++n) { 185 | if (iou(item.bbox, dets[n].bbox) > nms_thresh) { 186 | dets.erase(dets.begin()+n); 187 | --n; 188 | } 189 | } 190 | } 191 | } 192 | } 193 | 194 | // TensorRT weight files have a simple space delimited format: 195 | // [type] [size] 196 | std::map loadWeights(const std::string file) { 197 | std::cout << "Loading weights: " << file << std::endl; 198 | std::map weightMap; 199 | 200 | // Open weights file 201 | std::ifstream input(file); 202 | assert(input.is_open() && "Unable to load weight file."); 203 | 204 | // Read number of weight blobs 205 | int32_t count; 206 | input >> count; 207 | assert(count > 0 && "Invalid weight map file."); 208 | 209 | while (count--) 210 | { 211 | Weights wt{DataType::kFLOAT, nullptr, 0}; 212 | uint32_t size; 213 | 214 | // Read name and type of blob 215 | std::string name; 216 | input >> name >> std::dec >> size; 217 | wt.type = DataType::kFLOAT; 218 | 219 | // Load blob 220 | uint32_t* val = reinterpret_cast(malloc(sizeof(val) * size)); 221 | for (uint32_t x = 0, y = size; x < y; ++x) 222 | { 223 | input >> std::hex >> val[x]; 224 | } 225 | wt.values = val; 226 | 227 | wt.count = size; 228 | 
weightMap[name] = wt; 229 | } 230 | 231 | return weightMap; 232 | } 233 | 234 | IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname, float eps) { 235 | float *gamma = (float*)weightMap[lname + ".weight"].values; 236 | float *beta = (float*)weightMap[lname + ".bias"].values; 237 | float *mean = (float*)weightMap[lname + ".running_mean"].values; 238 | float *var = (float*)weightMap[lname + ".running_var"].values; 239 | int len = weightMap[lname + ".running_var"].count; 240 | 241 | float *scval = reinterpret_cast(malloc(sizeof(float) * len)); 242 | for (int i = 0; i < len; i++) { 243 | scval[i] = gamma[i] / sqrt(var[i] + eps); 244 | } 245 | Weights scale{DataType::kFLOAT, scval, len}; 246 | 247 | float *shval = reinterpret_cast(malloc(sizeof(float) * len)); 248 | for (int i = 0; i < len; i++) { 249 | shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps); 250 | } 251 | Weights shift{DataType::kFLOAT, shval, len}; 252 | 253 | float *pval = reinterpret_cast(malloc(sizeof(float) * len)); 254 | for (int i = 0; i < len; i++) { 255 | pval[i] = 1.0; 256 | } 257 | Weights power{DataType::kFLOAT, pval, len}; 258 | 259 | weightMap[lname + ".scale"] = scale; 260 | weightMap[lname + ".shift"] = shift; 261 | weightMap[lname + ".power"] = power; 262 | IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power); 263 | assert(scale_1); 264 | return scale_1; 265 | } 266 | 267 | ILayer* convBnLeaky(INetworkDefinition *network, std::map& weightMap, ITensor& input, int outch, int ksize, int s, int g, std::string lname) { 268 | Weights emptywts{DataType::kFLOAT, nullptr, 0}; 269 | int p = ksize / 2; 270 | IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, DimsHW{ksize, ksize}, weightMap[lname + ".conv.weight"], emptywts); 271 | assert(conv1); 272 | conv1->setStrideNd(DimsHW{s, s}); 273 | conv1->setPaddingNd(DimsHW{p, p}); 274 | conv1->setNbGroups(g); 275 | //IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + ".bn", 1e-4); 276 | IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + ".bn", 1e-3); 277 | auto lr = network->addActivation(*bn1->getOutput(0), ActivationType::kLEAKY_RELU); 278 | lr->setAlpha(0.1); 279 | return lr; 280 | } 281 | 282 | ILayer* focus(INetworkDefinition *network, std::map& weightMap, ITensor& input, int inch, int outch, int ksize, std::string lname) { 283 | ISliceLayer *s1 = network->addSlice(input, Dims3{0, 0, 0}, Dims3{inch, Yolo::INPUT_H / 2, Yolo::INPUT_W / 2}, Dims3{1, 2, 2}); 284 | ISliceLayer *s2 = network->addSlice(input, Dims3{0, 1, 0}, Dims3{inch, Yolo::INPUT_H / 2, Yolo::INPUT_W / 2}, Dims3{1, 2, 2}); 285 | ISliceLayer *s3 = network->addSlice(input, Dims3{0, 0, 1}, Dims3{inch, Yolo::INPUT_H / 2, Yolo::INPUT_W / 2}, Dims3{1, 2, 2}); 286 | ISliceLayer *s4 = network->addSlice(input, Dims3{0, 1, 1}, Dims3{inch, Yolo::INPUT_H / 2, Yolo::INPUT_W / 2}, Dims3{1, 2, 2}); 287 | ITensor* inputTensors[] = {s1->getOutput(0), s2->getOutput(0), s3->getOutput(0), s4->getOutput(0)}; 288 | auto cat = network->addConcatenation(inputTensors, 4); 289 | auto conv = convBnLeaky(network, weightMap, *cat->getOutput(0), outch, ksize, 1, 1, lname + ".conv"); 290 | return conv; 291 | } 292 | 293 | ILayer* bottleneck(INetworkDefinition *network, std::map& weightMap, ITensor& input, int c1, int c2, bool shortcut, int g, float e, std::string lname) { 294 | auto cv1 = convBnLeaky(network, weightMap, input, (int)((float)c2 * e), 1, 1, 1, 
lname + ".cv1"); 295 | auto cv2 = convBnLeaky(network, weightMap, *cv1->getOutput(0), c2, 3, 1, g, lname + ".cv2"); 296 | if (shortcut && c1 == c2) { 297 | auto ew = network->addElementWise(input, *cv2->getOutput(0), ElementWiseOperation::kSUM); 298 | return ew; 299 | } 300 | return cv2; 301 | } 302 | 303 | ILayer* bottleneckCSP(INetworkDefinition *network, std::map& weightMap, ITensor& input, int c1, int c2, int n, bool shortcut, int g, float e, std::string lname) { 304 | Weights emptywts{DataType::kFLOAT, nullptr, 0}; 305 | int c_ = (int)((float)c2 * e); 306 | auto cv1 = convBnLeaky(network, weightMap, input, c_, 1, 1, 1, lname + ".cv1"); 307 | auto cv2 = network->addConvolutionNd(input, c_, DimsHW{1, 1}, weightMap[lname + ".cv2.weight"], emptywts); 308 | ITensor *y1 = cv1->getOutput(0); 309 | for (int i = 0; i < n; i++) { 310 | auto b = bottleneck(network, weightMap, *y1, c_, c_, shortcut, g, 1.0, lname + ".m." + std::to_string(i)); 311 | y1 = b->getOutput(0); 312 | } 313 | auto cv3 = network->addConvolutionNd(*y1, c_, DimsHW{1, 1}, weightMap[lname + ".cv3.weight"], emptywts); 314 | 315 | ITensor* inputTensors[] = {cv3->getOutput(0), cv2->getOutput(0)}; 316 | auto cat = network->addConcatenation(inputTensors, 2); 317 | 318 | IScaleLayer* bn = addBatchNorm2d(network, weightMap, *cat->getOutput(0), lname + ".bn", 1e-4); 319 | auto lr = network->addActivation(*bn->getOutput(0), ActivationType::kLEAKY_RELU); 320 | lr->setAlpha(0.1); 321 | 322 | auto cv4 = convBnLeaky(network, weightMap, *lr->getOutput(0), c2, 1, 1, 1, lname + ".cv4"); 323 | return cv4; 324 | } 325 | 326 | ILayer* SPP(INetworkDefinition *network, std::map& weightMap, ITensor& input, int c1, int c2, int k1, int k2, int k3, std::string lname) { 327 | int c_ = c1 / 2; 328 | auto cv1 = convBnLeaky(network, weightMap, input, c_, 1, 1, 1, lname + ".cv1"); 329 | 330 | auto pool1 = network->addPoolingNd(*cv1->getOutput(0), PoolingType::kMAX, DimsHW{k1, k1}); 331 | pool1->setPaddingNd(DimsHW{k1 / 2, k1 / 2}); 332 | pool1->setStrideNd(DimsHW{1, 1}); 333 | auto pool2 = network->addPoolingNd(*cv1->getOutput(0), PoolingType::kMAX, DimsHW{k2, k2}); 334 | pool2->setPaddingNd(DimsHW{k2 / 2, k2 / 2}); 335 | pool2->setStrideNd(DimsHW{1, 1}); 336 | auto pool3 = network->addPoolingNd(*cv1->getOutput(0), PoolingType::kMAX, DimsHW{k3, k3}); 337 | pool3->setPaddingNd(DimsHW{k3 / 2, k3 / 2}); 338 | pool3->setStrideNd(DimsHW{1, 1}); 339 | 340 | ITensor* inputTensors[] = {cv1->getOutput(0), pool1->getOutput(0), pool2->getOutput(0), pool3->getOutput(0)}; 341 | auto cat = network->addConcatenation(inputTensors, 4); 342 | 343 | auto cv2 = convBnLeaky(network, weightMap, *cat->getOutput(0), c2, 1, 1, 1, lname + ".cv2"); 344 | return cv2; 345 | } 346 | 347 | int read_files_in_dir(const char *p_dir_name, std::vector &file_names) { 348 | DIR *p_dir = opendir(p_dir_name); 349 | if (p_dir == nullptr) { 350 | return -1; 351 | } 352 | 353 | struct dirent* p_file = nullptr; 354 | while ((p_file = readdir(p_dir)) != nullptr) { 355 | if (strcmp(p_file->d_name, ".") != 0 && 356 | strcmp(p_file->d_name, "..") != 0) { 357 | //std::string cur_file_name(p_dir_name); 358 | //cur_file_name += "/"; 359 | //cur_file_name += p_file->d_name; 360 | std::string cur_file_name(p_file->d_name); 361 | file_names.push_back(cur_file_name); 362 | } 363 | } 364 | 365 | closedir(p_dir); 366 | return 0; 367 | } 368 | 369 | #endif 370 | 371 | -------------------------------------------------------------------------------- /yolov5x/gen_wts.py: 
-------------------------------------------------------------------------------- 1 | from utils.utils import * 2 | import struct 3 | 4 | # Initialize 5 | device = torch_utils.select_device('0') 6 | # Load model 7 | model = torch.load('weights/yolov5x.pt', map_location=device)['model'].float() # load to FP32 8 | model.to(device).eval() 9 | 10 | f = open('yolov5x.wts', 'w') 11 | f.write('{}\n'.format(len(model.state_dict().keys()))) 12 | for k, v in model.state_dict().items(): 13 | vr = v.reshape(-1).cpu().numpy() 14 | f.write('{} {} '.format(k, len(vr))) 15 | for vv in vr: 16 | f.write(' ') 17 | f.write(struct.pack('>f',float(vv)).hex()) 18 | f.write('\n') 19 | -------------------------------------------------------------------------------- /yolov5x/images/bus.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BaofengZan/yolov5_2.0-TensorRt/78193337dd94bf48c2d55c3a77105a534d11c45b/yolov5x/images/bus.jpg -------------------------------------------------------------------------------- /yolov5x/images/zidane.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BaofengZan/yolov5_2.0-TensorRt/78193337dd94bf48c2d55c3a77105a534d11c45b/yolov5x/images/zidane.jpg -------------------------------------------------------------------------------- /yolov5x/utils.h: -------------------------------------------------------------------------------- 1 | #ifndef __TRT_UTILS_H_ 2 | #define __TRT_UTILS_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #ifndef CUDA_CHECK 10 | 11 | #define CUDA_CHECK(callstr) \ 12 | { \ 13 | cudaError_t error_code = callstr; \ 14 | if (error_code != cudaSuccess) { \ 15 | std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \ 16 | assert(0); \ 17 | } \ 18 | } 19 | 20 | #endif 21 | 22 | namespace Tn 23 | { 24 | class Profiler : public nvinfer1::IProfiler 25 | { 26 | public: 27 | void printLayerTimes(int itrationsTimes) 28 | { 29 | float totalTime = 0; 30 | for (size_t i = 0; i < mProfile.size(); i++) 31 | { 32 | printf("%-40.40s %4.3fms\n", mProfile[i].first.c_str(), mProfile[i].second / itrationsTimes); 33 | totalTime += mProfile[i].second; 34 | } 35 | printf("Time over all layers: %4.3f\n", totalTime / itrationsTimes); 36 | } 37 | private: 38 | typedef std::pair Record; 39 | std::vector mProfile; 40 | 41 | virtual void reportLayerTime(const char* layerName, float ms) 42 | { 43 | auto record = std::find_if(mProfile.begin(), mProfile.end(), [&](const Record& r){ return r.first == layerName; }); 44 | if (record == mProfile.end()) 45 | mProfile.push_back(std::make_pair(layerName, ms)); 46 | else 47 | record->second += ms; 48 | } 49 | }; 50 | 51 | //Logger for TensorRT info/warning/errors 52 | class Logger : public nvinfer1::ILogger 53 | { 54 | public: 55 | 56 | Logger(): Logger(Severity::kWARNING) {} 57 | 58 | Logger(Severity severity): reportableSeverity(severity) {} 59 | 60 | void log(Severity severity, const char* msg) override 61 | { 62 | // suppress messages with severity enum value greater than the reportable 63 | if (severity > reportableSeverity) return; 64 | 65 | switch (severity) 66 | { 67 | case Severity::kINTERNAL_ERROR: std::cerr << "INTERNAL_ERROR: "; break; 68 | case Severity::kERROR: std::cerr << "ERROR: "; break; 69 | case Severity::kWARNING: std::cerr << "WARNING: "; break; 70 | case Severity::kINFO: std::cerr << "INFO: "; break; 71 | default: std::cerr << "UNKNOWN: "; break; 72 | } 73 | 
std::cerr << msg << std::endl; 74 | } 75 | 76 | Severity reportableSeverity{Severity::kWARNING}; 77 | }; 78 | 79 | template 80 | void write(char*& buffer, const T& val) 81 | { 82 | *reinterpret_cast(buffer) = val; 83 | buffer += sizeof(T); 84 | } 85 | 86 | template 87 | void read(const char*& buffer, T& val) 88 | { 89 | val = *reinterpret_cast(buffer); 90 | buffer += sizeof(T); 91 | } 92 | } 93 | 94 | #endif -------------------------------------------------------------------------------- /yolov5x/yololayer.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "yololayer.h" 3 | #include "utils.h" 4 | 5 | using namespace Yolo; 6 | 7 | namespace nvinfer1 8 | { 9 | YoloLayerPlugin::YoloLayerPlugin() 10 | { 11 | mClassCount = CLASS_NUM; 12 | mYoloKernel.clear(); 13 | mYoloKernel.push_back(yolo1); 14 | mYoloKernel.push_back(yolo2); 15 | mYoloKernel.push_back(yolo3); 16 | 17 | mKernelCount = mYoloKernel.size(); 18 | 19 | CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*))); 20 | size_t AnchorLen = sizeof(float)* CHECK_COUNT*2; 21 | for(int ii = 0; ii < mKernelCount; ii ++) 22 | { 23 | CUDA_CHECK(cudaMalloc(&mAnchor[ii],AnchorLen)); 24 | const auto& yolo = mYoloKernel[ii]; 25 | CUDA_CHECK(cudaMemcpy(mAnchor[ii], yolo.anchors, AnchorLen, cudaMemcpyHostToDevice)); 26 | } 27 | } 28 | 29 | YoloLayerPlugin::~YoloLayerPlugin() 30 | { 31 | } 32 | 33 | // create the plugin at runtime from a byte stream 34 | YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) 35 | { 36 | using namespace Tn; 37 | const char *d = reinterpret_cast(data), *a = d; 38 | read(d, mClassCount); 39 | read(d, mThreadCount); 40 | read(d, mKernelCount); 41 | mYoloKernel.resize(mKernelCount); 42 | auto kernelSize = mKernelCount*sizeof(YoloKernel); 43 | memcpy(mYoloKernel.data(),d,kernelSize); 44 | d += kernelSize; 45 | 46 | CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*))); 47 | size_t AnchorLen = sizeof(float)* CHECK_COUNT*2; 48 | for(int ii = 0; ii < mKernelCount; ii ++) 49 | { 50 | CUDA_CHECK(cudaMalloc(&mAnchor[ii],AnchorLen)); 51 | const auto& yolo = mYoloKernel[ii]; 52 | CUDA_CHECK(cudaMemcpy(mAnchor[ii], yolo.anchors, AnchorLen, cudaMemcpyHostToDevice)); 53 | } 54 | 55 | assert(d == a + length); 56 | } 57 | 58 | void YoloLayerPlugin::serialize(void* buffer) const 59 | { 60 | using namespace Tn; 61 | char* d = static_cast(buffer), *a = d; 62 | write(d, mClassCount); 63 | write(d, mThreadCount); 64 | write(d, mKernelCount); 65 | auto kernelSize = mKernelCount*sizeof(YoloKernel); 66 | memcpy(d,mYoloKernel.data(),kernelSize); 67 | d += kernelSize; 68 | 69 | assert(d == a + getSerializationSize()); 70 | } 71 | 72 | size_t YoloLayerPlugin::getSerializationSize() const 73 | { 74 | return sizeof(mClassCount) + sizeof(mThreadCount) + sizeof(mKernelCount) + sizeof(Yolo::YoloKernel) * mYoloKernel.size(); 75 | } 76 | 77 | int YoloLayerPlugin::initialize() 78 | { 79 | return 0; 80 | } 81 | 82 | Dims YoloLayerPlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims) 83 | { 84 | //output the result to channel 85 | int totalsize = MAX_OUTPUT_BBOX_COUNT * sizeof(Detection) / sizeof(float); 86 | 87 | return Dims3(totalsize + 1, 1, 1); 88 | } 89 | 90 | // Set plugin namespace 91 | void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace) 92 | { 93 | mPluginNamespace = pluginNamespace; 94 | } 95 | 96 | const char* YoloLayerPlugin::getPluginNamespace() const 97 | { 98 | return mPluginNamespace; 99 | } 100 | 101 | // 
Return the DataType of the plugin output at the requested index 102 | DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const 103 | { 104 | return DataType::kFLOAT; 105 | } 106 | 107 | // Return true if output tensor is broadcast across a batch. 108 | bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const 109 | { 110 | return false; 111 | } 112 | 113 | // Return true if plugin can use input that is broadcast across batch without replication. 114 | bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const 115 | { 116 | return false; 117 | } 118 | 119 | void YoloLayerPlugin::configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) 120 | { 121 | } 122 | 123 | // Attach the plugin object to an execution context and grant the plugin the access to some context resource. 124 | void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) 125 | { 126 | } 127 | 128 | // Detach the plugin object from its execution context. 129 | void YoloLayerPlugin::detachFromContext() {} 130 | 131 | const char* YoloLayerPlugin::getPluginType() const 132 | { 133 | return "YoloLayer_TRT"; 134 | } 135 | 136 | const char* YoloLayerPlugin::getPluginVersion() const 137 | { 138 | return "1"; 139 | } 140 | 141 | void YoloLayerPlugin::destroy() 142 | { 143 | delete this; 144 | } 145 | 146 | // Clone the plugin 147 | IPluginV2IOExt* YoloLayerPlugin::clone() const 148 | { 149 | YoloLayerPlugin *p = new YoloLayerPlugin(); 150 | p->setPluginNamespace(mPluginNamespace); 151 | return p; 152 | } 153 | 154 | __device__ float Logist(float data){ return 1.0f / (1.0f + expf(-data)); }; 155 | 156 | __global__ void CalDetection(const float *input, float *output,int noElements, 157 | int yoloWidth,int yoloHeight,const float anchors[CHECK_COUNT*2],int classes,int outputElem) { 158 | 159 | int idx = threadIdx.x + blockDim.x * blockIdx.x; 160 | if (idx >= noElements) return; 161 | 162 | int total_grid = yoloWidth * yoloHeight; 163 | int bnIdx = idx / total_grid; 164 | idx = idx - total_grid*bnIdx; 165 | int info_len_i = 5 + classes; 166 | const float* curInput = input + bnIdx * (info_len_i * total_grid * CHECK_COUNT); 167 | 168 | for (int k = 0; k < 3; ++k) { 169 | float box_prob = Logist(curInput[idx + k * info_len_i * total_grid + 4 * total_grid]); 170 | if (box_prob < IGNORE_THRESH) continue; 171 | int class_id = 0; 172 | float max_cls_prob = 0.0; 173 | for (int i = 5; i < info_len_i; ++i) { 174 | float p = Logist(curInput[idx + k * info_len_i * total_grid + i * total_grid]); 175 | if (p > max_cls_prob) { 176 | max_cls_prob = p; 177 | class_id = i - 5; 178 | } 179 | } 180 | float *res_count = output + bnIdx*outputElem; 181 | int count = (int)atomicAdd(res_count, 1); 182 | if (count >= MAX_OUTPUT_BBOX_COUNT) return; 183 | char* data = (char *)res_count + sizeof(float) + count * sizeof(Detection); 184 | Detection* det = (Detection*)(data); 185 | 186 | int row = idx / yoloWidth; 187 | int col = idx % yoloWidth; 188 | 189 | //Location 190 | det->bbox[0] = (col - 0.5f + 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 0 * total_grid])) * INPUT_W / yoloWidth; 191 | det->bbox[1] = (row - 0.5f + 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 1 * total_grid])) * INPUT_H / yoloHeight; 192 | det->bbox[2] = 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 2 * total_grid]); 193 | 
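        // yolov5 2.0 decodes box size as (2*sigmoid(t))^2 * anchor: square the value and scale it
        // by the k-th anchor's width here (and by its height for bbox[3] below).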
det->bbox[2] = det->bbox[2] * det->bbox[2] * anchors[2*k]; 194 | det->bbox[3] = 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 3 * total_grid]); 195 | det->bbox[3] = det->bbox[3] * det->bbox[3] * anchors[2*k + 1]; 196 | det->conf = box_prob * max_cls_prob; 197 | det->class_id = class_id; 198 | } 199 | } 200 | 201 | void YoloLayerPlugin::forwardGpu(const float *const * inputs, float* output, cudaStream_t stream, int batchSize) { 202 | 203 | int outputElem = 1 + MAX_OUTPUT_BBOX_COUNT * sizeof(Detection) / sizeof(float); 204 | 205 | for(int idx = 0 ; idx < batchSize; ++idx) { 206 | CUDA_CHECK(cudaMemset(output + idx*outputElem, 0, sizeof(float))); 207 | } 208 | int numElem = 0; 209 | for (unsigned int i = 0; i < mYoloKernel.size(); ++i) 210 | { 211 | const auto& yolo = mYoloKernel[i]; 212 | numElem = yolo.width*yolo.height*batchSize; 213 | if (numElem < mThreadCount) 214 | mThreadCount = numElem; 215 | CalDetection<<< (yolo.width*yolo.height*batchSize + mThreadCount - 1) / mThreadCount, mThreadCount>>> 216 | (inputs[i], output, numElem, yolo.width, yolo.height, (float *)mAnchor[i], mClassCount, outputElem); 217 | } 218 | 219 | } 220 | 221 | 222 | int YoloLayerPlugin::enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) 223 | { 224 | forwardGpu((const float *const *)inputs, (float*)outputs[0], stream, batchSize); 225 | return 0; 226 | } 227 | 228 | PluginFieldCollection YoloPluginCreator::mFC{}; 229 | std::vector YoloPluginCreator::mPluginAttributes; 230 | 231 | YoloPluginCreator::YoloPluginCreator() 232 | { 233 | mPluginAttributes.clear(); 234 | 235 | mFC.nbFields = mPluginAttributes.size(); 236 | mFC.fields = mPluginAttributes.data(); 237 | } 238 | 239 | const char* YoloPluginCreator::getPluginName() const 240 | { 241 | return "YoloLayer_TRT"; 242 | } 243 | 244 | const char* YoloPluginCreator::getPluginVersion() const 245 | { 246 | return "1"; 247 | } 248 | 249 | const PluginFieldCollection* YoloPluginCreator::getFieldNames() 250 | { 251 | return &mFC; 252 | } 253 | 254 | IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) 255 | { 256 | YoloLayerPlugin* obj = new YoloLayerPlugin(); 257 | obj->setPluginNamespace(mNamespace.c_str()); 258 | return obj; 259 | } 260 | 261 | IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) 262 | { 263 | // This object will be deleted when the network is destroyed, which will 264 | // call MishPlugin::destroy() 265 | YoloLayerPlugin* obj = new YoloLayerPlugin(serialData, serialLength); 266 | obj->setPluginNamespace(mNamespace.c_str()); 267 | return obj; 268 | } 269 | 270 | } 271 | -------------------------------------------------------------------------------- /yolov5x/yololayer.h: -------------------------------------------------------------------------------- 1 | #ifndef _YOLO_LAYER_H 2 | #define _YOLO_LAYER_H 3 | 4 | #include 5 | #include 6 | #include "NvInfer.h" 7 | 8 | namespace Yolo 9 | { 10 | static constexpr int CHECK_COUNT = 3; 11 | static constexpr float IGNORE_THRESH = 0.1f; 12 | static constexpr int MAX_OUTPUT_BBOX_COUNT = 1000; 13 | static constexpr int CLASS_NUM = 80; 14 | static constexpr int INPUT_H = 608; 15 | static constexpr int INPUT_W = 608; 16 | 17 | struct YoloKernel 18 | { 19 | int width; 20 | int height; 21 | float anchors[CHECK_COUNT*2]; 22 | }; 23 | 24 | static constexpr YoloKernel yolo1 = { 25 | INPUT_W / 32, 26 | INPUT_H / 32, 27 | {116,90, 156,198, 373,326} 28 | 
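        // (w,h) anchor pairs in input-resolution pixels for the coarsest (stride 32) detection head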
}; 29 | static constexpr YoloKernel yolo2 = { 30 | INPUT_W / 16, 31 | INPUT_H / 16, 32 | {30,61, 62,45, 59,119} 33 | }; 34 | static constexpr YoloKernel yolo3 = { 35 | INPUT_W / 8, 36 | INPUT_H / 8, 37 | {10,13, 16,30, 33,23} 38 | }; 39 | 40 | static constexpr int LOCATIONS = 4; 41 | struct alignas(float) Detection{ 42 | //center_x center_y w h 43 | float bbox[LOCATIONS]; 44 | float conf; // bbox_conf * cls_conf 45 | float class_id; 46 | }; 47 | } 48 | 49 | namespace nvinfer1 50 | { 51 | class YoloLayerPlugin: public IPluginV2IOExt 52 | { 53 | public: 54 | explicit YoloLayerPlugin(); 55 | YoloLayerPlugin(const void* data, size_t length); 56 | 57 | ~YoloLayerPlugin(); 58 | 59 | int getNbOutputs() const override 60 | { 61 | return 1; 62 | } 63 | 64 | Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override; 65 | 66 | int initialize() override; 67 | 68 | virtual void terminate() override {}; 69 | 70 | virtual size_t getWorkspaceSize(int maxBatchSize) const override { return 0;} 71 | 72 | virtual int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override; 73 | 74 | virtual size_t getSerializationSize() const override; 75 | 76 | virtual void serialize(void* buffer) const override; 77 | 78 | bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const override { 79 | return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT; 80 | } 81 | 82 | const char* getPluginType() const override; 83 | 84 | const char* getPluginVersion() const override; 85 | 86 | void destroy() override; 87 | 88 | IPluginV2IOExt* clone() const override; 89 | 90 | void setPluginNamespace(const char* pluginNamespace) override; 91 | 92 | const char* getPluginNamespace() const override; 93 | 94 | DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const override; 95 | 96 | bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const override; 97 | 98 | bool canBroadcastInputAcrossBatch(int inputIndex) const override; 99 | 100 | void attachToContext( 101 | cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) override; 102 | 103 | void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) override; 104 | 105 | void detachFromContext() override; 106 | 107 | private: 108 | void forwardGpu(const float *const * inputs,float * output, cudaStream_t stream,int batchSize = 1); 109 | int mClassCount; 110 | int mKernelCount; 111 | std::vector mYoloKernel; 112 | int mThreadCount = 256; 113 | void** mAnchor; 114 | const char* mPluginNamespace; 115 | }; 116 | 117 | class YoloPluginCreator : public IPluginCreator 118 | { 119 | public: 120 | YoloPluginCreator(); 121 | 122 | ~YoloPluginCreator() override = default; 123 | 124 | const char* getPluginName() const override; 125 | 126 | const char* getPluginVersion() const override; 127 | 128 | const PluginFieldCollection* getFieldNames() override; 129 | 130 | IPluginV2IOExt* createPlugin(const char* name, const PluginFieldCollection* fc) override; 131 | 132 | IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) override; 133 | 134 | void setPluginNamespace(const char* libNamespace) override 135 | { 136 | mNamespace = libNamespace; 137 | } 138 | 139 | const char* getPluginNamespace() const override 140 | { 141 | return mNamespace.c_str(); 142 | 
} 143 | 144 | private: 145 | std::string mNamespace; 146 | static PluginFieldCollection mFC; 147 | static std::vector mPluginAttributes; 148 | }; 149 | 150 | 151 | 152 | }; 153 | 154 | #endif 155 | -------------------------------------------------------------------------------- /yolov5x/yolov5x.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "cuda_runtime_api.h" 4 | #include "logging.h" 5 | #include "common.hpp" 6 | 7 | #define USE_FP16 // comment out this if want to use FP32 8 | #define DEVICE 0 // GPU id 9 | #define NMS_THRESH 0.5 10 | #define CONF_THRESH 0.25 11 | #define BATCH_SIZE 1 12 | 13 | // stuff we know about the network and the input/output blobs 14 | static const int INPUT_H = Yolo::INPUT_H; 15 | static const int INPUT_W = Yolo::INPUT_W; 16 | static const int OUTPUT_SIZE = Yolo::MAX_OUTPUT_BBOX_COUNT * sizeof(Yolo::Detection) / sizeof(float) + 1; // we assume the yololayer outputs no more than 1000 boxes that conf >= 0.1 17 | const char* INPUT_BLOB_NAME = "data"; 18 | const char* OUTPUT_BLOB_NAME = "prob"; 19 | static Logger gLogger; 20 | REGISTER_TENSORRT_PLUGIN(YoloPluginCreator); 21 | 22 | // Creat the engine using only the API and not any parser. 23 | ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) { 24 | INetworkDefinition* network = builder->createNetworkV2(0U); 25 | 26 | // Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME 27 | ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W}); 28 | assert(data); 29 | 30 | std::map weightMap = loadWeights("../yolov5x.wts"); 31 | Weights emptywts{DataType::kFLOAT, nullptr, 0}; 32 | 33 | /* ------ yolov5 backbone------ */ 34 | auto focus0 = focus(network, weightMap, *data, 3, 80, 3, "model.0"); 35 | auto conv1 = convBnLeaky(network, weightMap, *focus0->getOutput(0), 160, 3, 2, 1, "model.1"); 36 | auto bottleneck_CSP2 = bottleneckCSP(network, weightMap, *conv1->getOutput(0), 160, 160, 4, true, 1, 0.5, "model.2"); 37 | auto conv3 = convBnLeaky(network, weightMap, *bottleneck_CSP2->getOutput(0), 320, 3, 2, 1, "model.3"); 38 | auto bottleneck_csp4 = bottleneckCSP(network, weightMap, *conv3->getOutput(0), 320, 320, 12, true, 1, 0.5, "model.4"); 39 | auto conv5 = convBnLeaky(network, weightMap, *bottleneck_csp4->getOutput(0), 640, 3, 2, 1, "model.5"); 40 | auto bottleneck_csp6 = bottleneckCSP(network, weightMap, *conv5->getOutput(0), 640, 640, 12, true, 1, 0.5, "model.6"); 41 | auto conv7 = convBnLeaky(network, weightMap, *bottleneck_csp6->getOutput(0), 1280, 3, 2, 1, "model.7"); 42 | auto spp8 = SPP(network, weightMap, *conv7->getOutput(0), 1280, 1280, 5, 9, 13, "model.8"); 43 | 44 | /* ------- yolov5 head ------- */ 45 | auto bottleneck_csp9 = bottleneckCSP(network, weightMap, *spp8->getOutput(0), 1280, 1280, 4, false, 1, 0.5, "model.9"); 46 | auto conv10 = convBnLeaky(network, weightMap, *bottleneck_csp9->getOutput(0), 640, 1, 1, 1, "model.10"); 47 | 48 | float *deval = reinterpret_cast(malloc(sizeof(float) * 640 * 2 * 2)); 49 | for (int i = 0; i < 640 * 2 * 2; i++) { 50 | deval[i] = 1.0; 51 | } 52 | Weights deconvwts11{DataType::kFLOAT, deval, 640 * 2 * 2}; 53 | IDeconvolutionLayer* deconv11 = network->addDeconvolutionNd(*conv10->getOutput(0), 640, DimsHW{2, 2}, deconvwts11, emptywts); 54 | deconv11->setStrideNd(DimsHW{2, 2}); 55 | deconv11->setNbGroups(640); 56 | weightMap["deconv11"] = deconvwts11; 57 | 58 | ITensor* inputTensors12[] = 
{deconv11->getOutput(0), bottleneck_csp6->getOutput(0)}; 59 | auto cat12 = network->addConcatenation(inputTensors12, 2); 60 | 61 | auto bottleneck_csp13 = bottleneckCSP(network, weightMap, *cat12->getOutput(0), 1280, 640, 4, false, 1, 0.5, "model.13"); 62 | auto conv14 = convBnLeaky(network, weightMap, *bottleneck_csp13->getOutput(0), 320, 1, 1, 1, "model.14"); 63 | 64 | Weights deconvwts15{DataType::kFLOAT, deval, 320 * 2 * 2}; 65 | IDeconvolutionLayer* deconv15 = network->addDeconvolutionNd(*conv14->getOutput(0), 320, DimsHW{2, 2}, deconvwts15, emptywts); 66 | deconv15->setStrideNd(DimsHW{2, 2}); 67 | deconv15->setNbGroups(320); 68 | ITensor* inputTensors16[] = {deconv15->getOutput(0), bottleneck_csp4->getOutput(0)}; 69 | auto cat16 = network->addConcatenation(inputTensors16, 2); 70 | 71 | auto bottleneck_csp17 = bottleneckCSP(network, weightMap, *cat16->getOutput(0), 640, 320, 4, false, 1, 0.5, "model.17"); 72 | 73 | // yolo layer 1 74 | IConvolutionLayer* conv18 = network->addConvolutionNd(*bottleneck_csp17->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["model.24.m.0.weight"], weightMap["model.24.m.0.bias"]); 75 | 76 | auto conv19 = convBnLeaky(network, weightMap, *bottleneck_csp17->getOutput(0), 320, 3, 2, 1, "model.18"); 77 | 78 | ITensor* inputTensors20[] = {conv19->getOutput(0), conv14->getOutput(0)}; 79 | auto cat20 = network->addConcatenation(inputTensors20, 2); 80 | 81 | auto bottleneck_csp21 = bottleneckCSP(network, weightMap, *cat20->getOutput(0), 640, 640, 4, false, 1, 0.5, "model.20"); 82 | 83 | // yolo layer 2 84 | IConvolutionLayer* conv22 = network->addConvolutionNd(*bottleneck_csp21->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["model.24.m.1.weight"], weightMap["model.24.m.1.bias"]); 85 | 86 | auto conv23 = convBnLeaky(network, weightMap, *bottleneck_csp21->getOutput(0), 640, 3, 2, 1, "model.21"); 87 | 88 | ITensor* inputTensors24[] = {conv23->getOutput(0), conv10->getOutput(0)}; 89 | auto cat24 = network->addConcatenation(inputTensors24, 2); 90 | 91 | auto bottleneck_csp25 = bottleneckCSP(network, weightMap, *cat24->getOutput(0), 1280, 1280, 4, false, 1, 0.5, "model.23"); 92 | 93 | // yolo layer 3 94 | IConvolutionLayer* conv26 = network->addConvolutionNd(*bottleneck_csp25->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["model.24.m.2.weight"], weightMap["model.24.m.2.bias"]); 95 | 96 | auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1"); 97 | const PluginFieldCollection* pluginData = creator->getFieldNames(); 98 | IPluginV2 *pluginObj = creator->createPlugin("yololayer", pluginData); 99 | ITensor* inputTensors_yolo[] = {conv26->getOutput(0), conv22->getOutput(0), conv18->getOutput(0)}; 100 | auto yolo = network->addPluginV2(inputTensors_yolo, 3, *pluginObj); 101 | 102 | yolo->getOutput(0)->setName(OUTPUT_BLOB_NAME); 103 | network->markOutput(*yolo->getOutput(0)); 104 | 105 | // Build engine 106 | builder->setMaxBatchSize(maxBatchSize); 107 | config->setMaxWorkspaceSize(16 * (1 << 20)); // 16MB 108 | #ifdef USE_FP16 109 | config->setFlag(BuilderFlag::kFP16); 110 | #endif 111 | std::cout << "Building engine, please wait for a while..." << std::endl; 112 | ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); 113 | std::cout << "Build engine successfully!" 
<< std::endl; 114 | 115 | // Don't need the network any more 116 | network->destroy(); 117 | 118 | // Release host memory 119 | for (auto& mem : weightMap) 120 | { 121 | free((void*) (mem.second.values)); 122 | } 123 | 124 | return engine; 125 | } 126 | 127 | void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) { 128 | // Create builder 129 | IBuilder* builder = createInferBuilder(gLogger); 130 | IBuilderConfig* config = builder->createBuilderConfig(); 131 | 132 | // Create model to populate the network, then set the outputs and create an engine 133 | ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT); 134 | assert(engine != nullptr); 135 | 136 | // Serialize the engine 137 | (*modelStream) = engine->serialize(); 138 | 139 | // Close everything down 140 | engine->destroy(); 141 | builder->destroy(); 142 | } 143 | 144 | void doInference(IExecutionContext& context, float* input, float* output, int batchSize) { 145 | const ICudaEngine& engine = context.getEngine(); 146 | 147 | // Pointers to input and output device buffers to pass to engine. 148 | // Engine requires exactly IEngine::getNbBindings() number of buffers. 149 | assert(engine.getNbBindings() == 2); 150 | void* buffers[2]; 151 | 152 | // In order to bind the buffers, we need to know the names of the input and output tensors. 153 | // Note that indices are guaranteed to be less than IEngine::getNbBindings() 154 | const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME); 155 | const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME); 156 | 157 | // Create GPU buffers on device 158 | CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * INPUT_H * INPUT_W * sizeof(float))); 159 | CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float))); 160 | 161 | // Create stream 162 | cudaStream_t stream; 163 | CHECK(cudaStreamCreate(&stream)); 164 | 165 | // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host 166 | CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream)); 167 | context.enqueue(batchSize, buffers, stream, nullptr); 168 | CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream)); 169 | cudaStreamSynchronize(stream); 170 | 171 | // Release stream and buffers 172 | cudaStreamDestroy(stream); 173 | CHECK(cudaFree(buffers[inputIndex])); 174 | CHECK(cudaFree(buffers[outputIndex])); 175 | } 176 | 177 | int main(int argc, char** argv) { 178 | cudaSetDevice(DEVICE); 179 | // create a model using the API directly and serialize it to a stream 180 | char *trtModelStream{nullptr}; 181 | size_t size{0}; 182 | 183 | if (argc == 2 && std::string(argv[1]) == "-s") { 184 | IHostMemory* modelStream{nullptr}; 185 | APIToModel(BATCH_SIZE, &modelStream); 186 | assert(modelStream != nullptr); 187 | std::ofstream p("yolov5x.engine", std::ios::binary); 188 | if (!p) { 189 | std::cerr << "could not open plan output file" << std::endl; 190 | return -1; 191 | } 192 | p.write(reinterpret_cast(modelStream->data()), modelStream->size()); 193 | modelStream->destroy(); 194 | return 0; 195 | } else if (argc == 3 && std::string(argv[1]) == "-d") { 196 | std::ifstream file("yolov5x.engine", std::ios::binary); 197 | if (file.good()) { 198 | file.seekg(0, file.end); 199 | size = file.tellg(); 200 | file.seekg(0, file.beg); 201 | trtModelStream = new char[size]; 202 | assert(trtModelStream); 203 | 
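            // read the whole serialized engine into host memory; it is passed to runtime->deserializeCudaEngine() further down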
file.read(trtModelStream, size); 204 | file.close(); 205 | } 206 | } else { 207 | std::cerr << "arguments not right!" << std::endl; 208 | std::cerr << "./yolov5x -s // serialize model to plan file" << std::endl; 209 | std::cerr << "./yolov5x -d ../samples // deserialize plan file and run inference" << std::endl; 210 | return -1; 211 | } 212 | 213 | std::vector file_names; 214 | if (read_files_in_dir(argv[2], file_names) < 0) { 215 | std::cout << "read_files_in_dir failed." << std::endl; 216 | return -1; 217 | } 218 | 219 | // prepare input data --------------------------- 220 | static float data[BATCH_SIZE * 3 * INPUT_H * INPUT_W]; 221 | //for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++) 222 | // data[i] = 1.0; 223 | static float prob[BATCH_SIZE * OUTPUT_SIZE]; 224 | IRuntime* runtime = createInferRuntime(gLogger); 225 | assert(runtime != nullptr); 226 | ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size); 227 | assert(engine != nullptr); 228 | IExecutionContext* context = engine->createExecutionContext(); 229 | assert(context != nullptr); 230 | delete[] trtModelStream; 231 | 232 | int fcount = 0; 233 | for (int f = 0; f < (int)file_names.size(); f++) { 234 | fcount++; 235 | if (fcount < BATCH_SIZE && f + 1 != (int)file_names.size()) continue; 236 | for (int b = 0; b < fcount; b++) { 237 | cv::Mat img = cv::imread(std::string(argv[2]) + "/" + file_names[f - fcount + 1 + b]); 238 | if (img.empty()) continue; 239 | cv::Mat pr_img = preprocess_img(img); 240 | int i = 0; 241 | for (int row = 0; row < INPUT_H; ++row) { 242 | uchar* uc_pixel = pr_img.data + row * pr_img.step; 243 | for (int col = 0; col < INPUT_W; ++col) { 244 | data[b * 3 * INPUT_H * INPUT_W + i] = uc_pixel[2] / 255; 245 | data[b * 3 * INPUT_H * INPUT_W + i + INPUT_H * INPUT_W] = uc_pixel[1] / 255.0; 246 | data[b * 3 * INPUT_H * INPUT_W + i + 2 * INPUT_H * INPUT_W] = uc_pixel[0] / 255.0; 247 | uc_pixel += 3; 248 | ++i; 249 | } 250 | } 251 | } 252 | 253 | // Run inference 254 | auto start = std::chrono::system_clock::now(); 255 | doInference(*context, data, prob, BATCH_SIZE); 256 | auto end = std::chrono::system_clock::now(); 257 | std::cout << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; 258 | std::vector> batch_res(fcount); 259 | for (int b = 0; b < fcount; b++) { 260 | auto& res = batch_res[b]; 261 | nms(res, &prob[b * OUTPUT_SIZE], CONF_THRESH, NMS_THRESH); 262 | } 263 | for (int b = 0; b < fcount; b++) { 264 | auto& res = batch_res[b]; 265 | //std::cout << res.size() << std::endl; 266 | cv::Mat img = cv::imread(std::string(argv[2]) + "/" + file_names[f - fcount + 1 + b]); 267 | for (size_t j = 0; j < res.size(); j++) { 268 | cv::Rect r = get_rect(img, res[j].bbox); 269 | cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2); 270 | cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2); 271 | } 272 | cv::imwrite("_" + file_names[f - fcount + 1 + b], img); 273 | } 274 | fcount = 0; 275 | } 276 | 277 | // Destroy the engine 278 | context->destroy(); 279 | engine->destroy(); 280 | runtime->destroy(); 281 | 282 | // Print histogram of the output distribution 283 | //std::cout << "\nOutput:\n\n"; 284 | //for (unsigned int i = 0; i < OUTPUT_SIZE; i++) 285 | //{ 286 | // std::cout << prob[i] << ", "; 287 | // if (i % 10 == 0) std::cout << std::endl; 288 | //} 289 | //std::cout << std::endl; 290 | 291 | return 0; 292 | } 293 | 
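// Typical invocation of this sample:
//   ./yolov5x -s              build the network with the API above, load ../yolov5x.wts and serialize yolov5x.engine
//   ./yolov5x -d <image_dir>  deserialize yolov5x.engine, run detection on every image in <image_dir>
//                             and write the annotated copies as _<name>.jpg into the working directory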
--------------------------------------------------------------------------------
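
The `.wts` files produced by `gen_wts.py` and consumed by the `loadWeights()` helper above are plain text: the first line holds the number of blobs, and every following line holds a tensor name, its element count, and the IEEE-754 bits of each float as hex tokens. The standalone sketch below is not part of the repository (file-free, names are illustrative); it round-trips one value through that encoding to show why Python's `struct.pack('>f', v).hex()` output and the C++ `input >> std::hex` parse agree.

```
#include <cassert>
#include <cstdint>
#include <cstring>
#include <iomanip>
#include <iostream>
#include <sstream>

int main() {
    float original = 0.1f;

    // Encode: print the raw 32-bit pattern as 8 hex digits, which matches
    // what struct.pack('>f', v).hex() produces in gen_wts.py.
    uint32_t bits;
    std::memcpy(&bits, &original, sizeof(bits));
    std::ostringstream out;
    out << std::hex << std::setw(8) << std::setfill('0') << bits;
    std::cout << "encoded: " << out.str() << std::endl;   // e.g. "3dcccccd"

    // Decode: parse the hex text back into a 32-bit word and reinterpret it as
    // float, like the `input >> std::hex >> val[x]` loop in loadWeights().
    std::istringstream in(out.str());
    uint32_t parsed = 0;
    in >> std::hex >> parsed;
    float restored;
    std::memcpy(&restored, &parsed, sizeof(restored));

    // The hex text stores exact bit patterns, so the round trip is lossless.
    assert(parsed == bits);
    std::cout << "restored: " << restored << std::endl;   // prints 0.1
    return 0;
}
```

Because the hex tokens carry exact bit patterns rather than a decimal rendering, the weights loaded on the C++ side match the PyTorch checkpoint bit for bit.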