├── README.md ├── yolov5l ├── CMakeLists.txt ├── README.md ├── common.hpp ├── gen_wts.py ├── images │ ├── bus.jpg │ └── zidane.jpg ├── logging.h ├── utils.h ├── yololayer.cu ├── yololayer.h └── yolov5l.cpp ├── yolov5m ├── CMakeLists.txt ├── README.md ├── common.hpp ├── gen_wts.py ├── images │ ├── bus.jpg │ └── zidane.jpg ├── logging.h ├── utils.h ├── yololayer.cu ├── yololayer.h └── yolov5m.cpp ├── yolov5s ├── CMakeLists.txt ├── README.md ├── common.hpp ├── gen_wts.py ├── images │ ├── bus.jpg │ └── zidane.jpg ├── logging.h ├── utils.h ├── yololayer.cu ├── yololayer.h └── yolov5s.cpp └── yolov5x ├── CMakeLists.txt ├── README.md ├── common.hpp ├── gen_wts.py ├── images ├── bus.jpg └── zidane.jpg ├── logging.h ├── utils.h ├── yololayer.cu ├── yololayer.h └── yolov5x.cpp /README.md: -------------------------------------------------------------------------------- 1 | # yolov5_2.0-TensorRt 2 | U版yolov5 2.0的tensorrt加速 3 | 4 | 5 | 6 | 并且对resize和图像处理阶段的操作做了优化,在win环境下debug下速度有很大提升,但是release则没有变化,因为在release时,opencv中会有相应的优化操作。 7 | 8 | ``` 9 | 实际上,at操作符与ptr操作符在Debug版本下都是有内存检查、防止操作越界的操作,而data十分简单粗暴,没有任何检查,由于它的简单粗暴所以使得data操作速度很快。所以在Debug版本下,at操作符与ptr操作符相较于data,速度还是慢了不少。 10 | 11 | 另外在Debug版本下,at操作要比指针操作慢得多,所以对于不连续数据或者单个点处理,可以考虑at操作,对于连续的大量数据,尽量不要使用它。 12 | ``` 13 | 14 | 感谢下面两个开源实现: 15 | 16 | https://github.com/wang-xinyu/tensorrtx 17 | 18 | https://github.com/AIpakchoi/yolov5_tensorrt 19 | -------------------------------------------------------------------------------- /yolov5l/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.6) 2 | 3 | project(yolov5) 4 | 5 | add_definitions(-std=c++11) 6 | 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 8 | set(CMAKE_CXX_STANDARD 11) 9 | set(CMAKE_BUILD_TYPE Debug) 10 | 11 | find_package(CUDA REQUIRED) 12 | 13 | set(CUDA_NVCC_PLAGS ${CUDA_NVCC_PLAGS};-std=c++11;-g;-G;-gencode;arch=compute_30;code=sm_30) 14 | 15 | include_directories(${PROJECT_SOURCE_DIR}/include) 16 | if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") 17 | message("embed_platform on") 18 | include_directories(/usr/local/cuda/targets/aarch64-linux/include) 19 | link_directories(/usr/local/cuda/targets/aarch64-linux/lib) 20 | else() 21 | message("embed_platform off") 22 | include_directories(/usr/local/cuda/include) 23 | link_directories(/usr/local/cuda/lib64) 24 | endif() 25 | 26 | 27 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED") 28 | 29 | cuda_add_library(yololayer SHARED ${PROJECT_SOURCE_DIR}/yololayer.cu) 30 | 31 | ########## opencv configuration ############ 32 | find_package(OpenCV 3.4 REQUIRED) 33 | message(OpenCV_LIBS) 34 | include_directories(OpenCV_INCLUDE_DIRS) 35 | 36 | add_executable(yolov5l ${PROJECT_SOURCE_DIR}/yolov5l.cpp) 37 | target_link_libraries(yolov5l nvinfer) 38 | target_link_libraries(yolov5l cudart) 39 | target_link_libraries(yolov5l yololayer) 40 | target_link_libraries(yolov5l ${OpenCV_LIBS}) 41 | 42 | add_definitions(-O2 -pthread) 43 | 44 | -------------------------------------------------------------------------------- /yolov5l/README.md: -------------------------------------------------------------------------------- 1 | # yolov5 2 | 3 | The Pytorch implementation is [ultralytics/yolov5](https://github.com/ultralytics/yolov5). 4 | 5 | I was using [ultralytics/yolov5](https://github.com/ultralytics/yolov5)(The latest version). Just in case the yolov5 model updated. 6 | 7 | ## How to Run 8 | 9 | ``` 10 | 1. 
generate yolov5l.wts from pytorch implementation with yolov5l.pt 11 | 12 | git clone https://github.com/AIpakchoi/yolov5_tensorrt.git 13 | git clone https://github.com/ultralytics/yolov5.git 14 | // download the weight file 'yolov5l.pt' 15 | cd yolov5 16 | cp ../yolov5_tensorrt/yolov5l/gen_wts.py . 17 | python gen_wts.py 18 | // a file 'yolov5l.wts' will be generated. 19 | 20 | 2. put yolov5l.wts into yolov5l, build and run 21 | 22 | mv yolov5l.wts ../yolov5_tensorrt/yolov5l/ 23 | cd ../yolov5_tensorrt/yolov5l 24 | mkdir build 25 | cd build 26 | cmake .. 27 | make 28 | sudo ./yolov5l -s // serialize model to plan file i.e. 'yolov5l.engine' 29 | sudo ./yolov5l -d ../samples // deserialize plan file and run inference, the images in samples will be processed. 30 | 31 | 3. check the generated images _zidane.jpg and _bus.jpg, shown below. 32 | ``` 33 | 34 |
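The `yolov5l.wts` file written in step 1 is plain text: the first line is the number of weight blobs, and every following line holds a blob name, its element count, and the values as hex-encoded big-endian float32 words (this is what `gen_wts.py` writes and what `loadWeights()` in `common.hpp` parses). Below is a minimal sketch for sanity-checking the export before building the engine; the `check_wts` helper is illustrative and not part of this repo.

```python
import struct

# Minimal sanity check for the .wts layout written by gen_wts.py:
#   line 1:           <number of weight blobs>
#   every other line: <blob name> <value count> <one 8-char hex word per float32 value>
def check_wts(path='yolov5l.wts'):  # illustrative helper, not part of the repo
    with open(path) as f:
        declared = int(f.readline())
        blobs = 0
        for line in f:
            parts = line.split()
            if not parts:
                continue
            name, size = parts[0], int(parts[1])
            values = parts[2:]
            assert len(values) == size, f'{name}: header says {size} values, line has {len(values)}'
            if values:
                # values are big-endian float32, hex-encoded via struct.pack('>f', v).hex()
                struct.unpack('>f', bytes.fromhex(values[0]))
            blobs += 1
    assert blobs == declared, f'header says {declared} blobs, file contains {blobs}'
    print(f'{path}: {declared} weight blobs look consistent')


if __name__ == '__main__':
    check_wts()
```

Catching a truncated or corrupted export at this point is usually cheaper than debugging a failed engine build or nonsensical detections later.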
35 | (the detection result images _zidane.jpg and _bus.jpg were embedded here) 36 | 37 | 38 | 39 | 40 |
41 | 42 | ## Config 43 | 44 | - Input shape defined in yololayer.h 45 | - Number of classes defined in yololayer.h 46 | - FP16/FP32 can be selected by the macro in yolov5l.cpp 47 | - GPU id can be selected by the macro in yolov5l.cpp 48 | - NMS thresh in yolov5l.cpp 49 | - BBox confidence thresh in yolov5l.cpp 50 | - Batch size in yolov5l.cpp 51 | -------------------------------------------------------------------------------- /yolov5l/common.hpp: -------------------------------------------------------------------------------- 1 | #ifndef YOLOV5_COMMON_H_ 2 | #define YOLOV5_COMMON_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "NvInfer.h" 11 | #include "yololayer.h" 12 | 13 | #define CHECK(status) \ 14 | do\ 15 | {\ 16 | auto ret = (status);\ 17 | if (ret != 0)\ 18 | {\ 19 | std::cerr << "Cuda failure: " << ret << std::endl;\ 20 | abort();\ 21 | }\ 22 | } while (0) 23 | 24 | using namespace nvinfer1; 25 | // resize 优化 26 | // 从原图的点映射到输出图像的像素点 27 | // 对outuput的每个点,先根据长宽比计算其在原图中最邻近的像素点, 28 | //然后直接根据最邻近的思想,直接拷贝Channel个字节作为输出图像 29 | void resizeByNN(uchar *input, uchar *output, int height_in, int width_in, int channels, int height_out, int width_out) { 30 | 31 | uchar *data_source = input; 32 | uchar *data_half = output; 33 | 34 | int bpl_source = width_in * 3; 35 | int bpl_dst = width_out * 3; 36 | 37 | int pos = 0; 38 | int sep = 0; 39 | uchar *sr = nullptr; 40 | uchar *hr = nullptr; 41 | float step = 0.0; 42 | float step_x = float(width_in) / float(width_out); 43 | float step_y = float(height_in) / float(height_out); 44 | 45 | for (int i = 0; i < height_out; i++) { 46 | for (int j = 0; j < width_out; j++) { 47 | sep = int(step_y*i); 48 | step = int(j*step_x); 49 | sr = data_source + sep * bpl_source; 50 | hr = data_half + i * bpl_dst + j * channels; 51 | pos = step * channels; 52 | memcpy(hr, sr + pos, channels); 53 | } 54 | } 55 | return; 56 | } 57 | 58 | cv::Mat preprocess_img(cv::Mat& img) { 59 | int w, h, x, y; 60 | float r_w = Yolo::INPUT_W / (img.cols*1.0); 61 | float r_h = Yolo::INPUT_H / (img.rows*1.0); 62 | if (r_h > r_w) { 63 | w = Yolo::INPUT_W; 64 | h = r_w * img.rows; 65 | x = 0; 66 | y = (Yolo::INPUT_H - h) / 2; 67 | } else { 68 | w = r_h* img.cols; 69 | h = Yolo::INPUT_H; 70 | x = (Yolo::INPUT_W - w) / 2; 71 | y = 0; 72 | } 73 | cv::Mat re(h, w, CV_8UC3); 74 | //cv::resize(img, re, re.size(), 0, 0, cv::INTER_CUBIC); 75 | auto start = std::chrono::system_clock::now(); 76 | cv::resize(img, re, re.size(), 0, 0, cv::INTER_LINEAR); 77 | //resizeByNN(img.data, re.data, img.rows, img.cols, img.channels(), re.rows, re.cols); 78 | auto end = std::chrono::system_clock::now(); 79 | std::cout << "img resize: " << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; 80 | 81 | cv::Mat out(Yolo::INPUT_H, Yolo::INPUT_W, CV_8UC3, cv::Scalar(128, 128, 128)); 82 | 83 | re.copyTo(out(cv::Rect(x, y, re.cols, re.rows))); 84 | return out; 85 | } 86 | 87 | cv::Rect get_rect(cv::Mat& img, float bbox[4]) { 88 | int l, r, t, b; 89 | float r_w = Yolo::INPUT_W / (img.cols * 1.0); 90 | float r_h = Yolo::INPUT_H / (img.rows * 1.0); 91 | if (r_h > r_w) { 92 | l = bbox[0] - bbox[2]/2.f; 93 | if (l < 0) 94 | { 95 | l = 0; 96 | } 97 | r = bbox[0] + bbox[2]/2.f; 98 | if (r > img.cols) 99 | { 100 | r = img.cols; 101 | } 102 | t = bbox[1] - bbox[3]/2.f - (Yolo::INPUT_H - r_w * img.rows) / 2; 103 | if (t < 0) 104 | { 105 | t = 0; 106 | } 107 | b = bbox[1] + bbox[3]/2.f - (Yolo::INPUT_H - r_w * img.rows) / 2; 108 | if (b > img.rows) 109 | { 110 | 
b = img.rows; 111 | } 112 | l = l / r_w; 113 | r = r / r_w; 114 | t = t / r_w; 115 | b = b / r_w; 116 | } else { 117 | l = bbox[0] - bbox[2]/2.f - (Yolo::INPUT_W - r_h * img.cols) / 2; 118 | if (l < 0) 119 | { 120 | l = 0; 121 | } 122 | r = bbox[0] + bbox[2]/2.f - (Yolo::INPUT_W - r_h * img.cols) / 2; 123 | if (r > img.cols) 124 | { 125 | r = img.cols; 126 | } 127 | t = bbox[1] - bbox[3]/2.f; 128 | if (t < 0) 129 | { 130 | t = 0; 131 | } 132 | b = bbox[1] + bbox[3]/2.f; 133 | if (b > img.rows) 134 | { 135 | b = img.rows; 136 | } 137 | l = l / r_h; 138 | r = r / r_h; 139 | t = t / r_h; 140 | b = b / r_h; 141 | } 142 | return cv::Rect(l, t, r-l, b-t); 143 | } 144 | 145 | // std::max vs. max 146 | //https://www.cnblogs.com/timesdaughter/p/5894930.html 147 | // Use (std::min) and (std::max) 148 | float iou(float lbox[4], float rbox[4]) { 149 | float interBox[] = { 150 | (std::max)(lbox[0] - lbox[2]/2.f , rbox[0] - rbox[2]/2.f), //left 151 | (std::min)(lbox[0] + lbox[2]/2.f , rbox[0] + rbox[2]/2.f), //right 152 | (std::max)(lbox[1] - lbox[3]/2.f , rbox[1] - rbox[3]/2.f), //top 153 | (std::min)(lbox[1] + lbox[3]/2.f , rbox[1] + rbox[3]/2.f), //bottom 154 | }; 155 | 156 | if(interBox[2] > interBox[3] || interBox[0] > interBox[1]) 157 | return 0.0f; 158 | 159 | float interBoxS =(interBox[1]-interBox[0])*(interBox[3]-interBox[2]); 160 | return interBoxS/(lbox[2]*lbox[3] + rbox[2]*rbox[3] -interBoxS); 161 | } 162 | 163 | bool cmp(Yolo::Detection& a, Yolo::Detection& b) { 164 | return a.conf > b.conf; 165 | } 166 | 167 | void nms(std::vector& res, float *output, float conf_thresh, float nms_thresh = 0.5) { 168 | int det_size = sizeof(Yolo::Detection) / sizeof(float); 169 | std::map> m; 170 | for (int i = 0; i < output[0] && i < 1000; i++) { 171 | if (output[1 + det_size * i + 4] <= conf_thresh) continue; 172 | Yolo::Detection det; 173 | memcpy(&det, &output[1 + det_size * i], det_size * sizeof(float)); 174 | if (m.count(det.class_id) == 0) m.emplace(det.class_id, std::vector()); 175 | m[det.class_id].push_back(det); 176 | } 177 | for (auto it = m.begin(); it != m.end(); it++) { 178 | //std::cout << it->second[0].class_id << " --- " << std::endl; 179 | auto& dets = it->second; 180 | std::sort(dets.begin(), dets.end(), cmp); 181 | for (size_t m = 0; m < dets.size(); ++m) { 182 | auto& item = dets[m]; 183 | res.push_back(item); 184 | for (size_t n = m + 1; n < dets.size(); ++n) { 185 | if (iou(item.bbox, dets[n].bbox) > nms_thresh) { 186 | dets.erase(dets.begin()+n); 187 | --n; 188 | } 189 | } 190 | } 191 | } 192 | } 193 | 194 | // TensorRT weight files have a simple space delimited format: 195 | // [type] [size] 196 | std::map loadWeights(const std::string file) { 197 | std::cout << "Loading weights: " << file << std::endl; 198 | std::map weightMap; 199 | 200 | // Open weights file 201 | std::ifstream input(file); 202 | assert(input.is_open() && "Unable to load weight file."); 203 | 204 | // Read number of weight blobs 205 | int32_t count; 206 | input >> count; 207 | assert(count > 0 && "Invalid weight map file."); 208 | 209 | while (count--) 210 | { 211 | Weights wt{DataType::kFLOAT, nullptr, 0}; 212 | uint32_t size; 213 | 214 | // Read name and type of blob 215 | std::string name; 216 | input >> name >> std::dec >> size; 217 | wt.type = DataType::kFLOAT; 218 | 219 | // Load blob 220 | uint32_t* val = reinterpret_cast(malloc(sizeof(val) * size)); 221 | for (uint32_t x = 0, y = size; x < y; ++x) 222 | { 223 | input >> std::hex >> val[x]; 224 | } 225 | wt.values = val; 226 | 227 | wt.count = size; 228 | 
weightMap[name] = wt; 229 | } 230 | 231 | return weightMap; 232 | } 233 | 234 | IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname, float eps) { 235 | float *gamma = (float*)weightMap[lname + ".weight"].values; 236 | float *beta = (float*)weightMap[lname + ".bias"].values; 237 | float *mean = (float*)weightMap[lname + ".running_mean"].values; 238 | float *var = (float*)weightMap[lname + ".running_var"].values; 239 | int len = weightMap[lname + ".running_var"].count; 240 | 241 | float *scval = reinterpret_cast(malloc(sizeof(float) * len)); 242 | for (int i = 0; i < len; i++) { 243 | scval[i] = gamma[i] / sqrt(var[i] + eps); 244 | } 245 | Weights scale{DataType::kFLOAT, scval, len}; 246 | 247 | float *shval = reinterpret_cast(malloc(sizeof(float) * len)); 248 | for (int i = 0; i < len; i++) { 249 | shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps); 250 | } 251 | Weights shift{DataType::kFLOAT, shval, len}; 252 | 253 | float *pval = reinterpret_cast(malloc(sizeof(float) * len)); 254 | for (int i = 0; i < len; i++) { 255 | pval[i] = 1.0; 256 | } 257 | Weights power{DataType::kFLOAT, pval, len}; 258 | 259 | weightMap[lname + ".scale"] = scale; 260 | weightMap[lname + ".shift"] = shift; 261 | weightMap[lname + ".power"] = power; 262 | IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power); 263 | assert(scale_1); 264 | return scale_1; 265 | } 266 | 267 | ILayer* convBnLeaky(INetworkDefinition *network, std::map& weightMap, ITensor& input, int outch, int ksize, int s, int g, std::string lname) { 268 | Weights emptywts{DataType::kFLOAT, nullptr, 0}; 269 | int p = ksize / 2; 270 | IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, DimsHW{ksize, ksize}, weightMap[lname + ".conv.weight"], emptywts); 271 | assert(conv1); 272 | conv1->setStrideNd(DimsHW{s, s}); 273 | conv1->setPaddingNd(DimsHW{p, p}); 274 | conv1->setNbGroups(g); 275 | //IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + ".bn", 1e-4); 276 | IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + ".bn", 1e-3); 277 | auto lr = network->addActivation(*bn1->getOutput(0), ActivationType::kLEAKY_RELU); 278 | lr->setAlpha(0.1); 279 | return lr; 280 | } 281 | 282 | ILayer* focus(INetworkDefinition *network, std::map& weightMap, ITensor& input, int inch, int outch, int ksize, std::string lname) { 283 | ISliceLayer *s1 = network->addSlice(input, Dims3{0, 0, 0}, Dims3{inch, Yolo::INPUT_H / 2, Yolo::INPUT_W / 2}, Dims3{1, 2, 2}); 284 | ISliceLayer *s2 = network->addSlice(input, Dims3{0, 1, 0}, Dims3{inch, Yolo::INPUT_H / 2, Yolo::INPUT_W / 2}, Dims3{1, 2, 2}); 285 | ISliceLayer *s3 = network->addSlice(input, Dims3{0, 0, 1}, Dims3{inch, Yolo::INPUT_H / 2, Yolo::INPUT_W / 2}, Dims3{1, 2, 2}); 286 | ISliceLayer *s4 = network->addSlice(input, Dims3{0, 1, 1}, Dims3{inch, Yolo::INPUT_H / 2, Yolo::INPUT_W / 2}, Dims3{1, 2, 2}); 287 | ITensor* inputTensors[] = {s1->getOutput(0), s2->getOutput(0), s3->getOutput(0), s4->getOutput(0)}; 288 | auto cat = network->addConcatenation(inputTensors, 4); 289 | auto conv = convBnLeaky(network, weightMap, *cat->getOutput(0), outch, ksize, 1, 1, lname + ".conv"); 290 | return conv; 291 | } 292 | 293 | ILayer* bottleneck(INetworkDefinition *network, std::map& weightMap, ITensor& input, int c1, int c2, bool shortcut, int g, float e, std::string lname) { 294 | auto cv1 = convBnLeaky(network, weightMap, input, (int)((float)c2 * e), 1, 1, 1, 
lname + ".cv1"); 295 | auto cv2 = convBnLeaky(network, weightMap, *cv1->getOutput(0), c2, 3, 1, g, lname + ".cv2"); 296 | if (shortcut && c1 == c2) { 297 | auto ew = network->addElementWise(input, *cv2->getOutput(0), ElementWiseOperation::kSUM); 298 | return ew; 299 | } 300 | return cv2; 301 | } 302 | 303 | ILayer* bottleneckCSP(INetworkDefinition *network, std::map& weightMap, ITensor& input, int c1, int c2, int n, bool shortcut, int g, float e, std::string lname) { 304 | Weights emptywts{DataType::kFLOAT, nullptr, 0}; 305 | int c_ = (int)((float)c2 * e); 306 | auto cv1 = convBnLeaky(network, weightMap, input, c_, 1, 1, 1, lname + ".cv1"); 307 | auto cv2 = network->addConvolutionNd(input, c_, DimsHW{1, 1}, weightMap[lname + ".cv2.weight"], emptywts); 308 | ITensor *y1 = cv1->getOutput(0); 309 | for (int i = 0; i < n; i++) { 310 | auto b = bottleneck(network, weightMap, *y1, c_, c_, shortcut, g, 1.0, lname + ".m." + std::to_string(i)); 311 | y1 = b->getOutput(0); 312 | } 313 | auto cv3 = network->addConvolutionNd(*y1, c_, DimsHW{1, 1}, weightMap[lname + ".cv3.weight"], emptywts); 314 | 315 | ITensor* inputTensors[] = {cv3->getOutput(0), cv2->getOutput(0)}; 316 | auto cat = network->addConcatenation(inputTensors, 2); 317 | 318 | IScaleLayer* bn = addBatchNorm2d(network, weightMap, *cat->getOutput(0), lname + ".bn", 1e-4); 319 | auto lr = network->addActivation(*bn->getOutput(0), ActivationType::kLEAKY_RELU); 320 | lr->setAlpha(0.1); 321 | 322 | auto cv4 = convBnLeaky(network, weightMap, *lr->getOutput(0), c2, 1, 1, 1, lname + ".cv4"); 323 | return cv4; 324 | } 325 | 326 | ILayer* SPP(INetworkDefinition *network, std::map& weightMap, ITensor& input, int c1, int c2, int k1, int k2, int k3, std::string lname) { 327 | int c_ = c1 / 2; 328 | auto cv1 = convBnLeaky(network, weightMap, input, c_, 1, 1, 1, lname + ".cv1"); 329 | 330 | auto pool1 = network->addPoolingNd(*cv1->getOutput(0), PoolingType::kMAX, DimsHW{k1, k1}); 331 | pool1->setPaddingNd(DimsHW{k1 / 2, k1 / 2}); 332 | pool1->setStrideNd(DimsHW{1, 1}); 333 | auto pool2 = network->addPoolingNd(*cv1->getOutput(0), PoolingType::kMAX, DimsHW{k2, k2}); 334 | pool2->setPaddingNd(DimsHW{k2 / 2, k2 / 2}); 335 | pool2->setStrideNd(DimsHW{1, 1}); 336 | auto pool3 = network->addPoolingNd(*cv1->getOutput(0), PoolingType::kMAX, DimsHW{k3, k3}); 337 | pool3->setPaddingNd(DimsHW{k3 / 2, k3 / 2}); 338 | pool3->setStrideNd(DimsHW{1, 1}); 339 | 340 | ITensor* inputTensors[] = {cv1->getOutput(0), pool1->getOutput(0), pool2->getOutput(0), pool3->getOutput(0)}; 341 | auto cat = network->addConcatenation(inputTensors, 4); 342 | 343 | auto cv2 = convBnLeaky(network, weightMap, *cat->getOutput(0), c2, 1, 1, 1, lname + ".cv2"); 344 | return cv2; 345 | } 346 | 347 | int read_files_in_dir(const char *p_dir_name, std::vector &file_names) { 348 | DIR *p_dir = opendir(p_dir_name); 349 | if (p_dir == nullptr) { 350 | return -1; 351 | } 352 | 353 | struct dirent* p_file = nullptr; 354 | while ((p_file = readdir(p_dir)) != nullptr) { 355 | if (strcmp(p_file->d_name, ".") != 0 && 356 | strcmp(p_file->d_name, "..") != 0) { 357 | //std::string cur_file_name(p_dir_name); 358 | //cur_file_name += "/"; 359 | //cur_file_name += p_file->d_name; 360 | std::string cur_file_name(p_file->d_name); 361 | file_names.push_back(cur_file_name); 362 | } 363 | } 364 | 365 | closedir(p_dir); 366 | return 0; 367 | } 368 | 369 | #endif 370 | 371 | -------------------------------------------------------------------------------- /yolov5l/gen_wts.py: 
-------------------------------------------------------------------------------- 1 | from utils.utils import * 2 | import struct 3 | 4 | # Initialize 5 | device = torch_utils.select_device('0') 6 | # Load model 7 | model = torch.load('weights/yolov5l.pt', map_location=device)['model'].float() # load to FP32 8 | model.to(device).eval() 9 | 10 | f = open('yolov5l.wts', 'w') 11 | f.write('{}\n'.format(len(model.state_dict().keys()))) 12 | for k, v in model.state_dict().items(): 13 | vr = v.reshape(-1).cpu().numpy() 14 | f.write('{} {} '.format(k, len(vr))) 15 | for vv in vr: 16 | f.write(' ') 17 | f.write(struct.pack('>f',float(vv)).hex()) 18 | f.write('\n') 19 | -------------------------------------------------------------------------------- /yolov5l/images/bus.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BaofengZan/yolov5_2.0-TensorRt/78193337dd94bf48c2d55c3a77105a534d11c45b/yolov5l/images/bus.jpg -------------------------------------------------------------------------------- /yolov5l/images/zidane.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BaofengZan/yolov5_2.0-TensorRt/78193337dd94bf48c2d55c3a77105a534d11c45b/yolov5l/images/zidane.jpg -------------------------------------------------------------------------------- /yolov5l/logging.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #ifndef TENSORRT_LOGGING_H 18 | #define TENSORRT_LOGGING_H 19 | 20 | #include "NvInferRuntimeCommon.h" 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | 29 | using Severity = nvinfer1::ILogger::Severity; 30 | 31 | class LogStreamConsumerBuffer : public std::stringbuf 32 | { 33 | public: 34 | LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog) 35 | : mOutput(stream) 36 | , mPrefix(prefix) 37 | , mShouldLog(shouldLog) 38 | { 39 | } 40 | 41 | LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) 42 | : mOutput(other.mOutput) 43 | { 44 | } 45 | 46 | ~LogStreamConsumerBuffer() 47 | { 48 | // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence 49 | // std::streambuf::pptr() gives a pointer to the current position of the output sequence 50 | // if the pointer to the beginning is not equal to the pointer to the current position, 51 | // call putOutput() to log the output to the stream 52 | if (pbase() != pptr()) 53 | { 54 | putOutput(); 55 | } 56 | } 57 | 58 | // synchronizes the stream buffer and returns 0 on success 59 | // synchronizing the stream buffer consists of inserting the buffer contents into the stream, 60 | // resetting the buffer and flushing the stream 61 | virtual int sync() 62 | { 63 | putOutput(); 64 | return 0; 65 | } 66 | 67 | void putOutput() 68 | { 69 | if (mShouldLog) 70 | { 71 | // prepend timestamp 72 | std::time_t timestamp = std::time(nullptr); 73 | tm* tm_local = std::localtime(×tamp); 74 | std::cout << "["; 75 | std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/"; 76 | std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/"; 77 | std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-"; 78 | std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":"; 79 | std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":"; 80 | std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] "; 81 | // std::stringbuf::str() gets the string contents of the buffer 82 | // insert the buffer contents pre-appended by the appropriate prefix into the stream 83 | mOutput << mPrefix << str(); 84 | // set the buffer to empty 85 | str(""); 86 | // flush the stream 87 | mOutput.flush(); 88 | } 89 | } 90 | 91 | void setShouldLog(bool shouldLog) 92 | { 93 | mShouldLog = shouldLog; 94 | } 95 | 96 | private: 97 | std::ostream& mOutput; 98 | std::string mPrefix; 99 | bool mShouldLog; 100 | }; 101 | 102 | //! 103 | //! \class LogStreamConsumerBase 104 | //! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer 105 | //! 106 | class LogStreamConsumerBase 107 | { 108 | public: 109 | LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog) 110 | : mBuffer(stream, prefix, shouldLog) 111 | { 112 | } 113 | 114 | protected: 115 | LogStreamConsumerBuffer mBuffer; 116 | }; 117 | 118 | //! 119 | //! \class LogStreamConsumer 120 | //! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages. 121 | //! Order of base classes is LogStreamConsumerBase and then std::ostream. 122 | //! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field 123 | //! in LogStreamConsumer and then the address of the buffer is passed to std::ostream. 124 | //! 
This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream. 125 | //! Please do not change the order of the parent classes. 126 | //! 127 | class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream 128 | { 129 | public: 130 | //! \brief Creates a LogStreamConsumer which logs messages with level severity. 131 | //! Reportable severity determines if the messages are severe enough to be logged. 132 | LogStreamConsumer(Severity reportableSeverity, Severity severity) 133 | : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity) 134 | , std::ostream(&mBuffer) // links the stream buffer with the stream 135 | , mShouldLog(severity <= reportableSeverity) 136 | , mSeverity(severity) 137 | { 138 | } 139 | 140 | LogStreamConsumer(LogStreamConsumer&& other) 141 | : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog) 142 | , std::ostream(&mBuffer) // links the stream buffer with the stream 143 | , mShouldLog(other.mShouldLog) 144 | , mSeverity(other.mSeverity) 145 | { 146 | } 147 | 148 | void setReportableSeverity(Severity reportableSeverity) 149 | { 150 | mShouldLog = mSeverity <= reportableSeverity; 151 | mBuffer.setShouldLog(mShouldLog); 152 | } 153 | 154 | private: 155 | static std::ostream& severityOstream(Severity severity) 156 | { 157 | return severity >= Severity::kINFO ? std::cout : std::cerr; 158 | } 159 | 160 | static std::string severityPrefix(Severity severity) 161 | { 162 | switch (severity) 163 | { 164 | case Severity::kINTERNAL_ERROR: return "[F] "; 165 | case Severity::kERROR: return "[E] "; 166 | case Severity::kWARNING: return "[W] "; 167 | case Severity::kINFO: return "[I] "; 168 | case Severity::kVERBOSE: return "[V] "; 169 | default: assert(0); return ""; 170 | } 171 | } 172 | 173 | bool mShouldLog; 174 | Severity mSeverity; 175 | }; 176 | 177 | //! \class Logger 178 | //! 179 | //! \brief Class which manages logging of TensorRT tools and samples 180 | //! 181 | //! \details This class provides a common interface for TensorRT tools and samples to log information to the console, 182 | //! and supports logging two types of messages: 183 | //! 184 | //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) 185 | //! - Test pass/fail messages 186 | //! 187 | //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is 188 | //! that the logic for controlling the verbosity and formatting of sample output is centralized in one location. 189 | //! 190 | //! In the future, this class could be extended to support dumping test results to a file in some standard format 191 | //! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run). 192 | //! 193 | //! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger 194 | //! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT 195 | //! library and messages coming from the sample. 196 | //! 197 | //! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the 198 | //! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger 199 | //! object. 
200 | 201 | class Logger : public nvinfer1::ILogger 202 | { 203 | public: 204 | Logger(Severity severity = Severity::kWARNING) 205 | : mReportableSeverity(severity) 206 | { 207 | } 208 | 209 | //! 210 | //! \enum TestResult 211 | //! \brief Represents the state of a given test 212 | //! 213 | enum class TestResult 214 | { 215 | kRUNNING, //!< The test is running 216 | kPASSED, //!< The test passed 217 | kFAILED, //!< The test failed 218 | kWAIVED //!< The test was waived 219 | }; 220 | 221 | //! 222 | //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger 223 | //! \return The nvinfer1::ILogger associated with this Logger 224 | //! 225 | //! TODO Once all samples are updated to use this method to register the logger with TensorRT, 226 | //! we can eliminate the inheritance of Logger from ILogger 227 | //! 228 | nvinfer1::ILogger& getTRTLogger() 229 | { 230 | return *this; 231 | } 232 | 233 | //! 234 | //! \brief Implementation of the nvinfer1::ILogger::log() virtual method 235 | //! 236 | //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the 237 | //! inheritance from nvinfer1::ILogger 238 | //! 239 | void log(Severity severity, const char* msg) override 240 | { 241 | LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl; 242 | } 243 | 244 | //! 245 | //! \brief Method for controlling the verbosity of logging output 246 | //! 247 | //! \param severity The logger will only emit messages that have severity of this level or higher. 248 | //! 249 | void setReportableSeverity(Severity severity) 250 | { 251 | mReportableSeverity = severity; 252 | } 253 | 254 | //! 255 | //! \brief Opaque handle that holds logging information for a particular test 256 | //! 257 | //! This object is an opaque handle to information used by the Logger to print test results. 258 | //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used 259 | //! with Logger::reportTest{Start,End}(). 260 | //! 261 | class TestAtom 262 | { 263 | public: 264 | TestAtom(TestAtom&&) = default; 265 | 266 | private: 267 | friend class Logger; 268 | 269 | TestAtom(bool started, const std::string& name, const std::string& cmdline) 270 | : mStarted(started) 271 | , mName(name) 272 | , mCmdline(cmdline) 273 | { 274 | } 275 | 276 | bool mStarted; 277 | std::string mName; 278 | std::string mCmdline; 279 | }; 280 | 281 | //! 282 | //! \brief Define a test for logging 283 | //! 284 | //! \param[in] name The name of the test. This should be a string starting with 285 | //! "TensorRT" and containing dot-separated strings containing 286 | //! the characters [A-Za-z0-9_]. 287 | //! For example, "TensorRT.sample_googlenet" 288 | //! \param[in] cmdline The command line used to reproduce the test 289 | // 290 | //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). 291 | //! 292 | static TestAtom defineTest(const std::string& name, const std::string& cmdline) 293 | { 294 | return TestAtom(false, name, cmdline); 295 | } 296 | 297 | //! 298 | //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments 299 | //! as input 300 | //! 301 | //! \param[in] name The name of the test 302 | //! \param[in] argc The number of command-line arguments 303 | //! \param[in] argv The array of command-line arguments (given as C strings) 304 | //! 305 | //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). 
306 | static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) 307 | { 308 | auto cmdline = genCmdlineString(argc, argv); 309 | return defineTest(name, cmdline); 310 | } 311 | 312 | //! 313 | //! \brief Report that a test has started. 314 | //! 315 | //! \pre reportTestStart() has not been called yet for the given testAtom 316 | //! 317 | //! \param[in] testAtom The handle to the test that has started 318 | //! 319 | static void reportTestStart(TestAtom& testAtom) 320 | { 321 | reportTestResult(testAtom, TestResult::kRUNNING); 322 | assert(!testAtom.mStarted); 323 | testAtom.mStarted = true; 324 | } 325 | 326 | //! 327 | //! \brief Report that a test has ended. 328 | //! 329 | //! \pre reportTestStart() has been called for the given testAtom 330 | //! 331 | //! \param[in] testAtom The handle to the test that has ended 332 | //! \param[in] result The result of the test. Should be one of TestResult::kPASSED, 333 | //! TestResult::kFAILED, TestResult::kWAIVED 334 | //! 335 | static void reportTestEnd(const TestAtom& testAtom, TestResult result) 336 | { 337 | assert(result != TestResult::kRUNNING); 338 | assert(testAtom.mStarted); 339 | reportTestResult(testAtom, result); 340 | } 341 | 342 | static int reportPass(const TestAtom& testAtom) 343 | { 344 | reportTestEnd(testAtom, TestResult::kPASSED); 345 | return EXIT_SUCCESS; 346 | } 347 | 348 | static int reportFail(const TestAtom& testAtom) 349 | { 350 | reportTestEnd(testAtom, TestResult::kFAILED); 351 | return EXIT_FAILURE; 352 | } 353 | 354 | static int reportWaive(const TestAtom& testAtom) 355 | { 356 | reportTestEnd(testAtom, TestResult::kWAIVED); 357 | return EXIT_SUCCESS; 358 | } 359 | 360 | static int reportTest(const TestAtom& testAtom, bool pass) 361 | { 362 | return pass ? reportPass(testAtom) : reportFail(testAtom); 363 | } 364 | 365 | Severity getReportableSeverity() const 366 | { 367 | return mReportableSeverity; 368 | } 369 | 370 | private: 371 | //! 372 | //! \brief returns an appropriate string for prefixing a log message with the given severity 373 | //! 374 | static const char* severityPrefix(Severity severity) 375 | { 376 | switch (severity) 377 | { 378 | case Severity::kINTERNAL_ERROR: return "[F] "; 379 | case Severity::kERROR: return "[E] "; 380 | case Severity::kWARNING: return "[W] "; 381 | case Severity::kINFO: return "[I] "; 382 | case Severity::kVERBOSE: return "[V] "; 383 | default: assert(0); return ""; 384 | } 385 | } 386 | 387 | //! 388 | //! \brief returns an appropriate string for prefixing a test result message with the given result 389 | //! 390 | static const char* testResultString(TestResult result) 391 | { 392 | switch (result) 393 | { 394 | case TestResult::kRUNNING: return "RUNNING"; 395 | case TestResult::kPASSED: return "PASSED"; 396 | case TestResult::kFAILED: return "FAILED"; 397 | case TestResult::kWAIVED: return "WAIVED"; 398 | default: assert(0); return ""; 399 | } 400 | } 401 | 402 | //! 403 | //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity 404 | //! 405 | static std::ostream& severityOstream(Severity severity) 406 | { 407 | return severity >= Severity::kINFO ? std::cout : std::cerr; 408 | } 409 | 410 | //! 411 | //! \brief method that implements logging test results 412 | //! 
413 | static void reportTestResult(const TestAtom& testAtom, TestResult result) 414 | { 415 | severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " 416 | << testAtom.mCmdline << std::endl; 417 | } 418 | 419 | //! 420 | //! \brief generate a command line string from the given (argc, argv) values 421 | //! 422 | static std::string genCmdlineString(int argc, char const* const* argv) 423 | { 424 | std::stringstream ss; 425 | for (int i = 0; i < argc; i++) 426 | { 427 | if (i > 0) 428 | ss << " "; 429 | ss << argv[i]; 430 | } 431 | return ss.str(); 432 | } 433 | 434 | Severity mReportableSeverity; 435 | }; 436 | 437 | namespace 438 | { 439 | 440 | //! 441 | //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE 442 | //! 443 | //! Example usage: 444 | //! 445 | //! LOG_VERBOSE(logger) << "hello world" << std::endl; 446 | //! 447 | inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) 448 | { 449 | return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE); 450 | } 451 | 452 | //! 453 | //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO 454 | //! 455 | //! Example usage: 456 | //! 457 | //! LOG_INFO(logger) << "hello world" << std::endl; 458 | //! 459 | inline LogStreamConsumer LOG_INFO(const Logger& logger) 460 | { 461 | return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO); 462 | } 463 | 464 | //! 465 | //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING 466 | //! 467 | //! Example usage: 468 | //! 469 | //! LOG_WARN(logger) << "hello world" << std::endl; 470 | //! 471 | inline LogStreamConsumer LOG_WARN(const Logger& logger) 472 | { 473 | return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING); 474 | } 475 | 476 | //! 477 | //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR 478 | //! 479 | //! Example usage: 480 | //! 481 | //! LOG_ERROR(logger) << "hello world" << std::endl; 482 | //! 483 | inline LogStreamConsumer LOG_ERROR(const Logger& logger) 484 | { 485 | return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR); 486 | } 487 | 488 | //! 489 | //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR 490 | // ("fatal" severity) 491 | //! 492 | //! Example usage: 493 | //! 494 | //! LOG_FATAL(logger) << "hello world" << std::endl; 495 | //! 
496 | inline LogStreamConsumer LOG_FATAL(const Logger& logger) 497 | { 498 | return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR); 499 | } 500 | 501 | } // anonymous namespace 502 | 503 | #endif // TENSORRT_LOGGING_H 504 | -------------------------------------------------------------------------------- /yolov5l/utils.h: -------------------------------------------------------------------------------- 1 | #ifndef __TRT_UTILS_H_ 2 | #define __TRT_UTILS_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #ifndef CUDA_CHECK 10 | 11 | #define CUDA_CHECK(callstr) \ 12 | { \ 13 | cudaError_t error_code = callstr; \ 14 | if (error_code != cudaSuccess) { \ 15 | std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \ 16 | assert(0); \ 17 | } \ 18 | } 19 | 20 | #endif 21 | 22 | namespace Tn 23 | { 24 | class Profiler : public nvinfer1::IProfiler 25 | { 26 | public: 27 | void printLayerTimes(int itrationsTimes) 28 | { 29 | float totalTime = 0; 30 | for (size_t i = 0; i < mProfile.size(); i++) 31 | { 32 | printf("%-40.40s %4.3fms\n", mProfile[i].first.c_str(), mProfile[i].second / itrationsTimes); 33 | totalTime += mProfile[i].second; 34 | } 35 | printf("Time over all layers: %4.3f\n", totalTime / itrationsTimes); 36 | } 37 | private: 38 | typedef std::pair Record; 39 | std::vector mProfile; 40 | 41 | virtual void reportLayerTime(const char* layerName, float ms) 42 | { 43 | auto record = std::find_if(mProfile.begin(), mProfile.end(), [&](const Record& r){ return r.first == layerName; }); 44 | if (record == mProfile.end()) 45 | mProfile.push_back(std::make_pair(layerName, ms)); 46 | else 47 | record->second += ms; 48 | } 49 | }; 50 | 51 | //Logger for TensorRT info/warning/errors 52 | class Logger : public nvinfer1::ILogger 53 | { 54 | public: 55 | 56 | Logger(): Logger(Severity::kWARNING) {} 57 | 58 | Logger(Severity severity): reportableSeverity(severity) {} 59 | 60 | void log(Severity severity, const char* msg) override 61 | { 62 | // suppress messages with severity enum value greater than the reportable 63 | if (severity > reportableSeverity) return; 64 | 65 | switch (severity) 66 | { 67 | case Severity::kINTERNAL_ERROR: std::cerr << "INTERNAL_ERROR: "; break; 68 | case Severity::kERROR: std::cerr << "ERROR: "; break; 69 | case Severity::kWARNING: std::cerr << "WARNING: "; break; 70 | case Severity::kINFO: std::cerr << "INFO: "; break; 71 | default: std::cerr << "UNKNOWN: "; break; 72 | } 73 | std::cerr << msg << std::endl; 74 | } 75 | 76 | Severity reportableSeverity{Severity::kWARNING}; 77 | }; 78 | 79 | template 80 | void write(char*& buffer, const T& val) 81 | { 82 | *reinterpret_cast(buffer) = val; 83 | buffer += sizeof(T); 84 | } 85 | 86 | template 87 | void read(const char*& buffer, T& val) 88 | { 89 | val = *reinterpret_cast(buffer); 90 | buffer += sizeof(T); 91 | } 92 | } 93 | 94 | #endif -------------------------------------------------------------------------------- /yolov5l/yololayer.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "yololayer.h" 3 | #include "utils.h" 4 | 5 | using namespace Yolo; 6 | 7 | namespace nvinfer1 8 | { 9 | YoloLayerPlugin::YoloLayerPlugin() 10 | { 11 | mClassCount = CLASS_NUM; 12 | mYoloKernel.clear(); 13 | mYoloKernel.push_back(yolo1); 14 | mYoloKernel.push_back(yolo2); 15 | mYoloKernel.push_back(yolo3); 16 | 17 | mKernelCount = mYoloKernel.size(); 18 | 19 | CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*))); 
20 | size_t AnchorLen = sizeof(float)* CHECK_COUNT*2; 21 | for(int ii = 0; ii < mKernelCount; ii ++) 22 | { 23 | CUDA_CHECK(cudaMalloc(&mAnchor[ii],AnchorLen)); 24 | const auto& yolo = mYoloKernel[ii]; 25 | CUDA_CHECK(cudaMemcpy(mAnchor[ii], yolo.anchors, AnchorLen, cudaMemcpyHostToDevice)); 26 | } 27 | } 28 | 29 | YoloLayerPlugin::~YoloLayerPlugin() 30 | { 31 | } 32 | 33 | // create the plugin at runtime from a byte stream 34 | YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) 35 | { 36 | using namespace Tn; 37 | const char *d = reinterpret_cast(data), *a = d; 38 | read(d, mClassCount); 39 | read(d, mThreadCount); 40 | read(d, mKernelCount); 41 | mYoloKernel.resize(mKernelCount); 42 | auto kernelSize = mKernelCount*sizeof(YoloKernel); 43 | memcpy(mYoloKernel.data(),d,kernelSize); 44 | d += kernelSize; 45 | 46 | CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*))); 47 | size_t AnchorLen = sizeof(float)* CHECK_COUNT*2; 48 | for(int ii = 0; ii < mKernelCount; ii ++) 49 | { 50 | CUDA_CHECK(cudaMalloc(&mAnchor[ii],AnchorLen)); 51 | const auto& yolo = mYoloKernel[ii]; 52 | CUDA_CHECK(cudaMemcpy(mAnchor[ii], yolo.anchors, AnchorLen, cudaMemcpyHostToDevice)); 53 | } 54 | 55 | assert(d == a + length); 56 | } 57 | 58 | void YoloLayerPlugin::serialize(void* buffer) const 59 | { 60 | using namespace Tn; 61 | char* d = static_cast(buffer), *a = d; 62 | write(d, mClassCount); 63 | write(d, mThreadCount); 64 | write(d, mKernelCount); 65 | auto kernelSize = mKernelCount*sizeof(YoloKernel); 66 | memcpy(d,mYoloKernel.data(),kernelSize); 67 | d += kernelSize; 68 | 69 | assert(d == a + getSerializationSize()); 70 | } 71 | 72 | size_t YoloLayerPlugin::getSerializationSize() const 73 | { 74 | return sizeof(mClassCount) + sizeof(mThreadCount) + sizeof(mKernelCount) + sizeof(Yolo::YoloKernel) * mYoloKernel.size(); 75 | } 76 | 77 | int YoloLayerPlugin::initialize() 78 | { 79 | return 0; 80 | } 81 | 82 | Dims YoloLayerPlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims) 83 | { 84 | //output the result to channel 85 | int totalsize = MAX_OUTPUT_BBOX_COUNT * sizeof(Detection) / sizeof(float); 86 | 87 | return Dims3(totalsize + 1, 1, 1); 88 | } 89 | 90 | // Set plugin namespace 91 | void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace) 92 | { 93 | mPluginNamespace = pluginNamespace; 94 | } 95 | 96 | const char* YoloLayerPlugin::getPluginNamespace() const 97 | { 98 | return mPluginNamespace; 99 | } 100 | 101 | // Return the DataType of the plugin output at the requested index 102 | DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const 103 | { 104 | return DataType::kFLOAT; 105 | } 106 | 107 | // Return true if output tensor is broadcast across a batch. 108 | bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const 109 | { 110 | return false; 111 | } 112 | 113 | // Return true if plugin can use input that is broadcast across batch without replication. 114 | bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const 115 | { 116 | return false; 117 | } 118 | 119 | void YoloLayerPlugin::configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) 120 | { 121 | } 122 | 123 | // Attach the plugin object to an execution context and grant the plugin the access to some context resource. 
124 | void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) 125 | { 126 | } 127 | 128 | // Detach the plugin object from its execution context. 129 | void YoloLayerPlugin::detachFromContext() {} 130 | 131 | const char* YoloLayerPlugin::getPluginType() const 132 | { 133 | return "YoloLayer_TRT"; 134 | } 135 | 136 | const char* YoloLayerPlugin::getPluginVersion() const 137 | { 138 | return "1"; 139 | } 140 | 141 | void YoloLayerPlugin::destroy() 142 | { 143 | delete this; 144 | } 145 | 146 | // Clone the plugin 147 | IPluginV2IOExt* YoloLayerPlugin::clone() const 148 | { 149 | YoloLayerPlugin *p = new YoloLayerPlugin(); 150 | p->setPluginNamespace(mPluginNamespace); 151 | return p; 152 | } 153 | 154 | __device__ float Logist(float data){ return 1.0f / (1.0f + expf(-data)); }; 155 | 156 | __global__ void CalDetection(const float *input, float *output,int noElements, 157 | int yoloWidth,int yoloHeight,const float anchors[CHECK_COUNT*2],int classes,int outputElem) { 158 | 159 | int idx = threadIdx.x + blockDim.x * blockIdx.x; 160 | if (idx >= noElements) return; 161 | 162 | int total_grid = yoloWidth * yoloHeight; 163 | int bnIdx = idx / total_grid; 164 | idx = idx - total_grid*bnIdx; 165 | int info_len_i = 5 + classes; 166 | const float* curInput = input + bnIdx * (info_len_i * total_grid * CHECK_COUNT); 167 | 168 | for (int k = 0; k < 3; ++k) { 169 | float box_prob = Logist(curInput[idx + k * info_len_i * total_grid + 4 * total_grid]); 170 | if (box_prob < IGNORE_THRESH) continue; 171 | int class_id = 0; 172 | float max_cls_prob = 0.0; 173 | for (int i = 5; i < info_len_i; ++i) { 174 | float p = Logist(curInput[idx + k * info_len_i * total_grid + i * total_grid]); 175 | if (p > max_cls_prob) { 176 | max_cls_prob = p; 177 | class_id = i - 5; 178 | } 179 | } 180 | float *res_count = output + bnIdx*outputElem; 181 | int count = (int)atomicAdd(res_count, 1); 182 | if (count >= MAX_OUTPUT_BBOX_COUNT) return; 183 | char* data = (char *)res_count + sizeof(float) + count * sizeof(Detection); 184 | Detection* det = (Detection*)(data); 185 | 186 | int row = idx / yoloWidth; 187 | int col = idx % yoloWidth; 188 | 189 | //Location 190 | det->bbox[0] = (col - 0.5f + 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 0 * total_grid])) * INPUT_W / yoloWidth; 191 | det->bbox[1] = (row - 0.5f + 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 1 * total_grid])) * INPUT_H / yoloHeight; 192 | det->bbox[2] = 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 2 * total_grid]); 193 | det->bbox[2] = det->bbox[2] * det->bbox[2] * anchors[2*k]; 194 | det->bbox[3] = 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 3 * total_grid]); 195 | det->bbox[3] = det->bbox[3] * det->bbox[3] * anchors[2*k + 1]; 196 | det->conf = box_prob * max_cls_prob; 197 | det->class_id = class_id; 198 | } 199 | } 200 | 201 | void YoloLayerPlugin::forwardGpu(const float *const * inputs, float* output, cudaStream_t stream, int batchSize) { 202 | 203 | int outputElem = 1 + MAX_OUTPUT_BBOX_COUNT * sizeof(Detection) / sizeof(float); 204 | 205 | for(int idx = 0 ; idx < batchSize; ++idx) { 206 | CUDA_CHECK(cudaMemset(output + idx*outputElem, 0, sizeof(float))); 207 | } 208 | int numElem = 0; 209 | for (unsigned int i = 0; i < mYoloKernel.size(); ++i) 210 | { 211 | const auto& yolo = mYoloKernel[i]; 212 | numElem = yolo.width*yolo.height*batchSize; 213 | if (numElem < mThreadCount) 214 | mThreadCount = numElem; 215 | CalDetection<<< 
(yolo.width*yolo.height*batchSize + mThreadCount - 1) / mThreadCount, mThreadCount>>> 216 | (inputs[i], output, numElem, yolo.width, yolo.height, (float *)mAnchor[i], mClassCount, outputElem); 217 | } 218 | 219 | } 220 | 221 | 222 | int YoloLayerPlugin::enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) 223 | { 224 | forwardGpu((const float *const *)inputs, (float*)outputs[0], stream, batchSize); 225 | return 0; 226 | } 227 | 228 | PluginFieldCollection YoloPluginCreator::mFC{}; 229 | std::vector YoloPluginCreator::mPluginAttributes; 230 | 231 | YoloPluginCreator::YoloPluginCreator() 232 | { 233 | mPluginAttributes.clear(); 234 | 235 | mFC.nbFields = mPluginAttributes.size(); 236 | mFC.fields = mPluginAttributes.data(); 237 | } 238 | 239 | const char* YoloPluginCreator::getPluginName() const 240 | { 241 | return "YoloLayer_TRT"; 242 | } 243 | 244 | const char* YoloPluginCreator::getPluginVersion() const 245 | { 246 | return "1"; 247 | } 248 | 249 | const PluginFieldCollection* YoloPluginCreator::getFieldNames() 250 | { 251 | return &mFC; 252 | } 253 | 254 | IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) 255 | { 256 | YoloLayerPlugin* obj = new YoloLayerPlugin(); 257 | obj->setPluginNamespace(mNamespace.c_str()); 258 | return obj; 259 | } 260 | 261 | IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) 262 | { 263 | // This object will be deleted when the network is destroyed, which will 264 | // call MishPlugin::destroy() 265 | YoloLayerPlugin* obj = new YoloLayerPlugin(serialData, serialLength); 266 | obj->setPluginNamespace(mNamespace.c_str()); 267 | return obj; 268 | } 269 | 270 | } 271 | -------------------------------------------------------------------------------- /yolov5l/yololayer.h: -------------------------------------------------------------------------------- 1 | #ifndef _YOLO_LAYER_H 2 | #define _YOLO_LAYER_H 3 | 4 | #include 5 | #include 6 | #include "NvInfer.h" 7 | 8 | namespace Yolo 9 | { 10 | static constexpr int CHECK_COUNT = 3; 11 | static constexpr float IGNORE_THRESH = 0.1f; 12 | static constexpr int MAX_OUTPUT_BBOX_COUNT = 1000; 13 | static constexpr int CLASS_NUM = 80; 14 | static constexpr int INPUT_H = 608; 15 | static constexpr int INPUT_W = 608; 16 | 17 | struct YoloKernel 18 | { 19 | int width; 20 | int height; 21 | float anchors[CHECK_COUNT*2]; 22 | }; 23 | 24 | static constexpr YoloKernel yolo1 = { 25 | INPUT_W / 32, 26 | INPUT_H / 32, 27 | {116,90, 156,198, 373,326} 28 | }; 29 | static constexpr YoloKernel yolo2 = { 30 | INPUT_W / 16, 31 | INPUT_H / 16, 32 | {30,61, 62,45, 59,119} 33 | }; 34 | static constexpr YoloKernel yolo3 = { 35 | INPUT_W / 8, 36 | INPUT_H / 8, 37 | {10,13, 16,30, 33,23} 38 | }; 39 | 40 | static constexpr int LOCATIONS = 4; 41 | struct alignas(float) Detection{ 42 | //center_x center_y w h 43 | float bbox[LOCATIONS]; 44 | float conf; // bbox_conf * cls_conf 45 | float class_id; 46 | }; 47 | } 48 | 49 | namespace nvinfer1 50 | { 51 | class YoloLayerPlugin: public IPluginV2IOExt 52 | { 53 | public: 54 | explicit YoloLayerPlugin(); 55 | YoloLayerPlugin(const void* data, size_t length); 56 | 57 | ~YoloLayerPlugin(); 58 | 59 | int getNbOutputs() const override 60 | { 61 | return 1; 62 | } 63 | 64 | Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override; 65 | 66 | int initialize() override; 67 | 68 | virtual void terminate() override {}; 69 | 70 | 
virtual size_t getWorkspaceSize(int maxBatchSize) const override { return 0;} 71 | 72 | virtual int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override; 73 | 74 | virtual size_t getSerializationSize() const override; 75 | 76 | virtual void serialize(void* buffer) const override; 77 | 78 | bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const override { 79 | return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT; 80 | } 81 | 82 | const char* getPluginType() const override; 83 | 84 | const char* getPluginVersion() const override; 85 | 86 | void destroy() override; 87 | 88 | IPluginV2IOExt* clone() const override; 89 | 90 | void setPluginNamespace(const char* pluginNamespace) override; 91 | 92 | const char* getPluginNamespace() const override; 93 | 94 | DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const override; 95 | 96 | bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const override; 97 | 98 | bool canBroadcastInputAcrossBatch(int inputIndex) const override; 99 | 100 | void attachToContext( 101 | cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) override; 102 | 103 | void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) override; 104 | 105 | void detachFromContext() override; 106 | 107 | private: 108 | void forwardGpu(const float *const * inputs,float * output, cudaStream_t stream,int batchSize = 1); 109 | int mClassCount; 110 | int mKernelCount; 111 | std::vector mYoloKernel; 112 | int mThreadCount = 256; 113 | void** mAnchor; 114 | const char* mPluginNamespace; 115 | }; 116 | 117 | class YoloPluginCreator : public IPluginCreator 118 | { 119 | public: 120 | YoloPluginCreator(); 121 | 122 | ~YoloPluginCreator() override = default; 123 | 124 | const char* getPluginName() const override; 125 | 126 | const char* getPluginVersion() const override; 127 | 128 | const PluginFieldCollection* getFieldNames() override; 129 | 130 | IPluginV2IOExt* createPlugin(const char* name, const PluginFieldCollection* fc) override; 131 | 132 | IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) override; 133 | 134 | void setPluginNamespace(const char* libNamespace) override 135 | { 136 | mNamespace = libNamespace; 137 | } 138 | 139 | const char* getPluginNamespace() const override 140 | { 141 | return mNamespace.c_str(); 142 | } 143 | 144 | private: 145 | std::string mNamespace; 146 | static PluginFieldCollection mFC; 147 | static std::vector mPluginAttributes; 148 | }; 149 | 150 | 151 | 152 | }; 153 | 154 | #endif 155 | -------------------------------------------------------------------------------- /yolov5l/yolov5l.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "cuda_runtime_api.h" 4 | #include "logging.h" 5 | #include "common.hpp" 6 | 7 | #define USE_FP16 // comment out this if want to use FP32 8 | #define DEVICE 0 // GPU id 9 | #define NMS_THRESH 0.5 10 | #define CONF_THRESH 0.25 11 | #define BATCH_SIZE 1 12 | 13 | // stuff we know about the network and the input/output blobs 14 | static const int INPUT_H = Yolo::INPUT_H; 15 | static const int INPUT_W = Yolo::INPUT_W; 16 | static const int OUTPUT_SIZE = Yolo::MAX_OUTPUT_BBOX_COUNT * sizeof(Yolo::Detection) / 
sizeof(float) + 1; // we assume the yololayer outputs no more than 1000 boxes that conf >= 0.1 17 | const char* INPUT_BLOB_NAME = "data"; 18 | const char* OUTPUT_BLOB_NAME = "prob"; 19 | static Logger gLogger; 20 | REGISTER_TENSORRT_PLUGIN(YoloPluginCreator); 21 | 22 | // Creat the engine using only the API and not any parser. 23 | ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) { 24 | INetworkDefinition* network = builder->createNetworkV2(0U); 25 | 26 | // Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME 27 | ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W}); 28 | assert(data); 29 | 30 | std::map weightMap = loadWeights("../yolov5l.wts"); 31 | Weights emptywts{DataType::kFLOAT, nullptr, 0}; 32 | 33 | /* ------ yolov5 backbone------ */ 34 | auto focus0 = focus(network, weightMap, *data, 3, 64, 3, "model.0"); 35 | auto conv1 = convBnLeaky(network, weightMap, *focus0->getOutput(0), 128, 3, 2, 1, "model.1"); 36 | auto bottleneck_CSP2 = bottleneckCSP(network, weightMap, *conv1->getOutput(0), 128, 128, 3, true, 1, 0.5, "model.2"); 37 | auto conv3 = convBnLeaky(network, weightMap, *bottleneck_CSP2->getOutput(0), 256, 3, 2, 1, "model.3"); 38 | auto bottleneck_csp4 = bottleneckCSP(network, weightMap, *conv3->getOutput(0), 256, 256, 9, true, 1, 0.5, "model.4"); 39 | auto conv5 = convBnLeaky(network, weightMap, *bottleneck_csp4->getOutput(0), 512, 3, 2, 1, "model.5"); 40 | auto bottleneck_csp6 = bottleneckCSP(network, weightMap, *conv5->getOutput(0), 512, 512, 9, true, 1, 0.5, "model.6"); 41 | auto conv7 = convBnLeaky(network, weightMap, *bottleneck_csp6->getOutput(0), 1024, 3, 2, 1, "model.7"); 42 | auto spp8 = SPP(network, weightMap, *conv7->getOutput(0), 1024, 1024, 5, 9, 13, "model.8"); 43 | 44 | /* ------ yolov5 head ------ */ 45 | auto bottleneck_csp9 = bottleneckCSP(network, weightMap, *spp8->getOutput(0), 1024, 1024, 3, false, 1, 0.5, "model.9"); 46 | auto conv10 = convBnLeaky(network, weightMap, *bottleneck_csp9->getOutput(0), 512, 1, 1, 1, "model.10"); 47 | 48 | float *deval = reinterpret_cast(malloc(sizeof(float) * 512 * 2 * 2)); 49 | for (int i = 0; i < 512 * 2 * 2; i++) { 50 | deval[i] = 1.0; 51 | } 52 | Weights deconvwts11{DataType::kFLOAT, deval, 512 * 2 * 2}; 53 | IDeconvolutionLayer* deconv11 = network->addDeconvolutionNd(*conv10->getOutput(0), 512, DimsHW{2, 2}, deconvwts11, emptywts); 54 | deconv11->setStrideNd(DimsHW{2, 2}); 55 | deconv11->setNbGroups(512); 56 | weightMap["deconv11"] = deconvwts11; 57 | 58 | ITensor* inputTensors12[] = {deconv11->getOutput(0), bottleneck_csp6->getOutput(0)}; 59 | auto cat12 = network->addConcatenation(inputTensors12, 2); 60 | auto bottleneck_csp13 = bottleneckCSP(network, weightMap, *cat12->getOutput(0), 1024, 512, 3, false, 1, 0.5, "model.13"); 61 | auto conv14 = convBnLeaky(network, weightMap, *bottleneck_csp13->getOutput(0), 256, 1, 1, 1, "model.14"); 62 | 63 | Weights deconvwts15{DataType::kFLOAT, deval, 256 * 2 * 2}; 64 | IDeconvolutionLayer* deconv15 = network->addDeconvolutionNd(*conv14->getOutput(0), 256, DimsHW{2, 2}, deconvwts15, emptywts); 65 | deconv15->setStrideNd(DimsHW{2, 2}); 66 | deconv15->setNbGroups(256); 67 | ITensor* inputTensors16[] = {deconv15->getOutput(0), bottleneck_csp4->getOutput(0)}; 68 | auto cat16 = network->addConcatenation(inputTensors16, 2); 69 | 70 | auto bottleneck_csp17 = bottleneckCSP(network, weightMap, *cat16->getOutput(0), 512, 256, 3, false, 1, 0.5, "model.17"); 71 | 72 | //yolo layer 
1 73 | IConvolutionLayer* conv18 = network->addConvolutionNd(*bottleneck_csp17->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["model.24.m.0.weight"], weightMap["model.24.m.0.bias"]); 74 | 75 | auto conv19 = convBnLeaky(network, weightMap, *bottleneck_csp17->getOutput(0), 256, 3, 2, 1, "model.18"); 76 | 77 | // yolo layer 2 78 | ITensor* inputTensors20[] = {conv19->getOutput(0), conv14->getOutput(0)}; 79 | auto cat20 = network->addConcatenation(inputTensors20, 2); 80 | 81 | auto bottleneck_csp21 = bottleneckCSP(network, weightMap, *cat20->getOutput(0), 512, 512, 3, false, 1, 0.5, "model.20"); 82 | 83 | //yolo layer 3 84 | IConvolutionLayer* conv22 = network->addConvolutionNd(*bottleneck_csp21->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["model.24.m.1.weight"], weightMap["model.24.m.1.bias"]); 85 | 86 | auto conv23 = convBnLeaky(network, weightMap, *bottleneck_csp21->getOutput(0), 512, 3, 2, 1, "model.21"); 87 | 88 | ITensor* inputTensors24[] = {conv23->getOutput(0), conv10->getOutput(0)}; 89 | auto cat24 = network->addConcatenation(inputTensors24, 2); 90 | 91 | auto bottleneck_csp25 = bottleneckCSP(network, weightMap, *cat24->getOutput(0), 1024, 1024, 3, false, 1, 0.5, "model.23"); 92 | 93 | IConvolutionLayer* conv26 = network->addConvolutionNd(*bottleneck_csp25->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["model.24.m.2.weight"], weightMap["model.24.m.2.bias"]); 94 | 95 | auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1"); 96 | const PluginFieldCollection* pluginData = creator->getFieldNames(); 97 | IPluginV2 *pluginObj = creator->createPlugin("yololayer", pluginData); 98 | ITensor* inputTensors_yolo[] = {conv26->getOutput(0), conv22->getOutput(0), conv18->getOutput(0)}; 99 | auto yolo = network->addPluginV2(inputTensors_yolo, 3, *pluginObj); 100 | 101 | yolo->getOutput(0)->setName(OUTPUT_BLOB_NAME); 102 | network->markOutput(*yolo->getOutput(0)); 103 | 104 | // Build engine 105 | builder->setMaxBatchSize(maxBatchSize); 106 | config->setMaxWorkspaceSize(16 * (1 << 20)); // 16MB 107 | #ifdef USE_FP16 108 | config->setFlag(BuilderFlag::kFP16); 109 | #endif 110 | std::cout << "Building engine, please wait for a while..." << std::endl; 111 | ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); 112 | std::cout << "Build engine successfully!" << std::endl; 113 | 114 | // Don't need the network any more 115 | network->destroy(); 116 | 117 | // Release host memory 118 | for (auto& mem : weightMap) 119 | { 120 | free((void*) (mem.second.values)); 121 | } 122 | 123 | return engine; 124 | } 125 | 126 | void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) { 127 | // Create builder 128 | IBuilder* builder = createInferBuilder(gLogger); 129 | IBuilderConfig* config = builder->createBuilderConfig(); 130 | 131 | // Create model to populate the network, then set the outputs and create an engine 132 | ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT); 133 | assert(engine != nullptr); 134 | 135 | // Serialize the engine 136 | (*modelStream) = engine->serialize(); 137 | 138 | // Close everything down 139 | engine->destroy(); 140 | builder->destroy(); 141 | } 142 | 143 | void doInference(IExecutionContext& context, float* input, float* output, int batchSize) { 144 | const ICudaEngine& engine = context.getEngine(); 145 | 146 | // Pointers to input and output device buffers to pass to engine. 
147 | // Engine requires exactly IEngine::getNbBindings() number of buffers. 148 | assert(engine.getNbBindings() == 2); 149 | void* buffers[2]; 150 | 151 | // In order to bind the buffers, we need to know the names of the input and output tensors. 152 | // Note that indices are guaranteed to be less than IEngine::getNbBindings() 153 | const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME); 154 | const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME); 155 | 156 | // Create GPU buffers on device 157 | CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * INPUT_H * INPUT_W * sizeof(float))); 158 | CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float))); 159 | 160 | // Create stream 161 | cudaStream_t stream; 162 | CHECK(cudaStreamCreate(&stream)); 163 | 164 | // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host 165 | CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream)); 166 | context.enqueue(batchSize, buffers, stream, nullptr); 167 | CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream)); 168 | cudaStreamSynchronize(stream); 169 | 170 | // Release stream and buffers 171 | cudaStreamDestroy(stream); 172 | CHECK(cudaFree(buffers[inputIndex])); 173 | CHECK(cudaFree(buffers[outputIndex])); 174 | } 175 | 176 | int main(int argc, char** argv) { 177 | cudaSetDevice(DEVICE); 178 | // create a model using the API directly and serialize it to a stream 179 | char *trtModelStream{nullptr}; 180 | size_t size{0}; 181 | 182 | if (argc == 2 && std::string(argv[1]) == "-s") { 183 | IHostMemory* modelStream{nullptr}; 184 | APIToModel(BATCH_SIZE, &modelStream); 185 | assert(modelStream != nullptr); 186 | std::ofstream p("yolov5l.engine", std::ios::binary); 187 | if (!p) { 188 | std::cerr << "could not open plan output file" << std::endl; 189 | return -1; 190 | } 191 | p.write(reinterpret_cast(modelStream->data()), modelStream->size()); 192 | modelStream->destroy(); 193 | return 0; 194 | } else if (argc == 3 && std::string(argv[1]) == "-d") { 195 | std::ifstream file("yolov5l.engine", std::ios::binary); 196 | if (file.good()) { 197 | file.seekg(0, file.end); 198 | size = file.tellg(); 199 | file.seekg(0, file.beg); 200 | trtModelStream = new char[size]; 201 | assert(trtModelStream); 202 | file.read(trtModelStream, size); 203 | file.close(); 204 | } 205 | } else { 206 | std::cerr << "arguments not right!" << std::endl; 207 | std::cerr << "./yolov5l -s // serialize model to plan file" << std::endl; 208 | std::cerr << "./yolov5l -d ../samples // deserialize plan file and run inference" << std::endl; 209 | return -1; 210 | } 211 | 212 | std::vector file_names; 213 | if (read_files_in_dir(argv[2], file_names) < 0) { 214 | std::cout << "read_files_in_dir failed." 
<< std::endl; 215 | return -1; 216 | } 217 | 218 | // prepare input data --------------------------- 219 | static float data[BATCH_SIZE * 3 * INPUT_H * INPUT_W]; 220 | //for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++) 221 | // data[i] = 1.0; 222 | static float prob[BATCH_SIZE * OUTPUT_SIZE]; 223 | IRuntime* runtime = createInferRuntime(gLogger); 224 | assert(runtime != nullptr); 225 | ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size); 226 | assert(engine != nullptr); 227 | IExecutionContext* context = engine->createExecutionContext(); 228 | assert(context != nullptr); 229 | delete[] trtModelStream; 230 | 231 | int fcount = 0; 232 | for (int f = 0; f < (int)file_names.size(); f++) { 233 | fcount++; 234 | if (fcount < BATCH_SIZE && f + 1 != (int)file_names.size()) continue; 235 | for (int b = 0; b < fcount; b++) { 236 | cv::Mat img = cv::imread(std::string(argv[2]) + "/" + file_names[f - fcount + 1 + b]); 237 | if (img.empty()) continue; 238 | cv::Mat pr_img = preprocess_img(img); 239 | int i = 0; 240 | for (int row = 0; row < INPUT_H; ++row) { 241 | uchar* uc_pixel = pr_img.data + row * pr_img.step; 242 | for (int col = 0; col < INPUT_W; ++col) { 243 | data[b * 3 * INPUT_H * INPUT_W + i] = uc_pixel[2] / 255; 244 | data[b * 3 * INPUT_H * INPUT_W + i + INPUT_H * INPUT_W] = uc_pixel[1] / 255.0; 245 | data[b * 3 * INPUT_H * INPUT_W + i + 2 * INPUT_H * INPUT_W] = uc_pixel[0] / 255.0; 246 | uc_pixel += 3; 247 | ++i; 248 | } 249 | } 250 | } 251 | 252 | // Run inference 253 | auto start = std::chrono::system_clock::now(); 254 | doInference(*context, data, prob, BATCH_SIZE); 255 | auto end = std::chrono::system_clock::now(); 256 | std::cout << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; 257 | std::vector> batch_res(fcount); 258 | for (int b = 0; b < fcount; b++) { 259 | auto& res = batch_res[b]; 260 | nms(res, &prob[b * OUTPUT_SIZE], CONF_THRESH, NMS_THRESH); 261 | } 262 | for (int b = 0; b < fcount; b++) { 263 | auto& res = batch_res[b]; 264 | //std::cout << res.size() << std::endl; 265 | cv::Mat img = cv::imread(std::string(argv[2]) + "/" + file_names[f - fcount + 1 + b]); 266 | for (size_t j = 0; j < res.size(); j++) { 267 | cv::Rect r = get_rect(img, res[j].bbox); 268 | cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2); 269 | cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2); 270 | } 271 | cv::imwrite("_" + file_names[f - fcount + 1 + b], img); 272 | } 273 | fcount = 0; 274 | } 275 | 276 | // Destroy the engine 277 | context->destroy(); 278 | engine->destroy(); 279 | runtime->destroy(); 280 | 281 | // Print histogram of the output distribution 282 | //std::cout << "\nOutput:\n\n"; 283 | //for (unsigned int i = 0; i < OUTPUT_SIZE; i++) 284 | //{ 285 | // std::cout << prob[i] << ", "; 286 | // if (i % 10 == 0) std::cout << std::endl; 287 | //} 288 | //std::cout << std::endl; 289 | 290 | return 0; 291 | } 292 | -------------------------------------------------------------------------------- /yolov5m/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.6) 2 | 3 | project(yolov5) 4 | 5 | add_definitions(-std=c++11) 6 | 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 8 | set(CMAKE_CXX_STANDARD 11) 9 | set(CMAKE_BUILD_TYPE Debug) 10 | 11 | find_package(CUDA REQUIRED) 12 | 13 | set(CUDA_NVCC_PLAGS ${CUDA_NVCC_PLAGS};-std=c++11;-g;-G;-gencode;arch=compute_30;code=sm_30) 14 | 15 | 
include_directories(${PROJECT_SOURCE_DIR}/include) 16 | if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") 17 | message("embed_platform on") 18 | include_directories(/usr/local/cuda/targets/aarch64-linux/include) 19 | link_directories(/usr/local/cuda/targets/aarch64-linux/lib) 20 | else() 21 | message("embed_platform off") 22 | include_directories(/usr/local/cuda/include) 23 | link_directories(/usr/local/cuda/lib64) 24 | endif() 25 | 26 | 27 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED") 28 | 29 | cuda_add_library(yololayer SHARED ${PROJECT_SOURCE_DIR}/yololayer.cu) 30 | 31 | ########## opencv configuration ############ 32 | find_package(OpenCV 3.4 REQUIRED) 33 | message(OpenCV_LIBS) 34 | include_directories(OpenCV_INCLUDE_DIRS) 35 | 36 | add_executable(yolov5m ${PROJECT_SOURCE_DIR}/yolov5m.cpp) 37 | target_link_libraries(yolov5m nvinfer) 38 | target_link_libraries(yolov5m cudart) 39 | target_link_libraries(yolov5m yololayer) 40 | target_link_libraries(yolov5m ${OpenCV_LIBS}) 41 | 42 | add_definitions(-O2 -pthread) 43 | 44 | -------------------------------------------------------------------------------- /yolov5m/README.md: -------------------------------------------------------------------------------- 1 | # yolov5 2 | 3 | The Pytorch implementation is [ultralytics/yolov5](https://github.com/ultralytics/yolov5). 4 | 5 | I was using [ultralytics/yolov5](https://github.com/ultralytics/yolov5)(The latest version). Just in case the yolov5 model updated. 6 | 7 | ## How to Run 8 | 9 | ``` 10 | 1. generate yolov5m.wts from pytorch implementation with yolov5.pt 11 | 12 | git clone https://github.com/AIpakchoi/yolov5_tensorrt.git 13 | git clone https://github.com/ultralytics/yolov5.git 14 | // download its weights 'yolov5m.pt' 15 | cd yolov5 16 | cp ../yolov5_tensorrt/yolov5m/gen_wts.py . 17 | python gen_wts.py 18 | // a file 'yolov5m.wts' will be generated. 19 | 20 | 2. put yolov5m.wts into yolov5m, build and run 21 | 22 | mv yolov5m.wts ../yolov5_tensorrt/yolov5m/ 23 | cd ../yolov5_tensorrt/yolov5m 24 | mkdir build 25 | cd build 26 | cmake .. 27 | make 28 | sudo ./yolov5m -s // serialize model to plan file i.e. 'yolov5m.engine' 29 | sudo ./yolov5m -d ../samples // deserialize plan file and run inference, the images in samples will be processed. 30 | 31 | 3. check the images generated, as follows. _zidane.jpg and _bus.jpg 32 | ``` 33 | 34 |
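The .wts file written by gen_wts.py is plain text: the first line holds the number of weight blobs, and every following line is `<name> <count>` followed by one 8-hex-digit float bit pattern per value, which is exactly what loadWeights() in common.hpp parses back. Below is a minimal stand-alone sketch (not part of this repo; the file name and helper are only an illustration) to sanity-check the generated yolov5m.wts before building the engine:

```cpp
// wts_check.cpp - quick sanity check of a .wts file produced by gen_wts.py
// (hypothetical helper, not part of this repo)
#include <cstdint>
#include <cstring>
#include <fstream>
#include <iostream>
#include <string>

int main(int argc, char** argv) {
    const char* path = argc > 1 ? argv[1] : "yolov5m.wts";
    std::ifstream in(path);
    if (!in.is_open()) { std::cerr << "cannot open " << path << std::endl; return -1; }

    int32_t count = 0;
    in >> count;                                   // first line: number of weight blobs
    std::cout << path << ": " << count << " blobs" << std::endl;

    // Each blob line: <name> <num floats> <8-hex-digit bit pattern per float>
    for (int32_t i = 0; i < count && i < 5; ++i) { // show the first few entries
        std::string name;
        uint32_t size = 0, bits = 0;
        in >> name >> std::dec >> size;
        in >> std::hex >> bits;                    // first value of the blob
        float first = 0.f;
        std::memcpy(&first, &bits, sizeof(first));
        std::cout << "  " << name << " (" << size << " values), first = " << first << std::endl;
        for (uint32_t v = 1; v < size; ++v) in >> std::hex >> bits;  // skip the rest
    }
    return 0;
}
```

If the blob count printed here does not match the number of keys in the PyTorch state_dict, the export step went wrong.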

35 | 36 |

37 | 38 |

39 | 40 |

41 | 42 | ## Config 43 | 44 | - Input shape defined in yololayer.h 45 | - Number of classes defined in yololayer.h 46 | - FP16/FP32 can be selected by the macro in yolov5m.cpp 47 | - GPU id can be selected by the macro in yolov5m.cpp 48 | - NMS thresh in yolov5m.cpp 49 | - BBox confidence thresh in yolov5m.cpp 50 | - Batch size in yolov5m.cpp 51 | -------------------------------------------------------------------------------- /yolov5m/common.hpp: -------------------------------------------------------------------------------- 1 | #ifndef YOLOV5_COMMON_H_ 2 | #define YOLOV5_COMMON_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "NvInfer.h" 11 | #include "yololayer.h" 12 | 13 | #define CHECK(status) \ 14 | do\ 15 | {\ 16 | auto ret = (status);\ 17 | if (ret != 0)\ 18 | {\ 19 | std::cerr << "Cuda failure: " << ret << std::endl;\ 20 | abort();\ 21 | }\ 22 | } while (0) 23 | 24 | using namespace nvinfer1; 25 | // resize 优化 26 | // 从原图的点映射到输出图像的像素点 27 | // 对outuput的每个点,先根据长宽比计算其在原图中最邻近的像素点, 28 | //然后直接根据最邻近的思想,直接拷贝Channel个字节作为输出图像 29 | void resizeByNN(uchar *input, uchar *output, int height_in, int width_in, int channels, int height_out, int width_out) { 30 | 31 | uchar *data_source = input; 32 | uchar *data_half = output; 33 | 34 | int bpl_source = width_in * 3; 35 | int bpl_dst = width_out * 3; 36 | 37 | int pos = 0; 38 | int sep = 0; 39 | uchar *sr = nullptr; 40 | uchar *hr = nullptr; 41 | float step = 0.0; 42 | float step_x = float(width_in) / float(width_out); 43 | float step_y = float(height_in) / float(height_out); 44 | 45 | for (int i = 0; i < height_out; i++) { 46 | for (int j = 0; j < width_out; j++) { 47 | sep = int(step_y*i); 48 | step = int(j*step_x); 49 | sr = data_source + sep * bpl_source; 50 | hr = data_half + i * bpl_dst + j * channels; 51 | pos = step * channels; 52 | memcpy(hr, sr + pos, channels); 53 | } 54 | } 55 | return; 56 | } 57 | 58 | cv::Mat preprocess_img(cv::Mat& img) { 59 | int w, h, x, y; 60 | float r_w = Yolo::INPUT_W / (img.cols*1.0); 61 | float r_h = Yolo::INPUT_H / (img.rows*1.0); 62 | if (r_h > r_w) { 63 | w = Yolo::INPUT_W; 64 | h = r_w * img.rows; 65 | x = 0; 66 | y = (Yolo::INPUT_H - h) / 2; 67 | } else { 68 | w = r_h* img.cols; 69 | h = Yolo::INPUT_H; 70 | x = (Yolo::INPUT_W - w) / 2; 71 | y = 0; 72 | } 73 | cv::Mat re(h, w, CV_8UC3); 74 | //cv::resize(img, re, re.size(), 0, 0, cv::INTER_CUBIC); 75 | auto start = std::chrono::system_clock::now(); 76 | cv::resize(img, re, re.size(), 0, 0, cv::INTER_LINEAR); 77 | //resizeByNN(img.data, re.data, img.rows, img.cols, img.channels(), re.rows, re.cols); 78 | auto end = std::chrono::system_clock::now(); 79 | std::cout << "img resize: " << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; 80 | 81 | cv::Mat out(Yolo::INPUT_H, Yolo::INPUT_W, CV_8UC3, cv::Scalar(128, 128, 128)); 82 | 83 | re.copyTo(out(cv::Rect(x, y, re.cols, re.rows))); 84 | return out; 85 | } 86 | 87 | cv::Rect get_rect(cv::Mat& img, float bbox[4]) { 88 | int l, r, t, b; 89 | float r_w = Yolo::INPUT_W / (img.cols * 1.0); 90 | float r_h = Yolo::INPUT_H / (img.rows * 1.0); 91 | if (r_h > r_w) { 92 | l = bbox[0] - bbox[2]/2.f; 93 | if (l < 0) 94 | { 95 | l = 0; 96 | } 97 | r = bbox[0] + bbox[2]/2.f; 98 | if (r > img.cols) 99 | { 100 | r = img.cols; 101 | } 102 | t = bbox[1] - bbox[3]/2.f - (Yolo::INPUT_H - r_w * img.rows) / 2; 103 | if (t < 0) 104 | { 105 | t = 0; 106 | } 107 | b = bbox[1] + bbox[3]/2.f - (Yolo::INPUT_H - r_w * img.rows) / 2; 108 | if (b > img.rows) 109 | { 110 | 
b = img.rows; 111 | } 112 | l = l / r_w; 113 | r = r / r_w; 114 | t = t / r_w; 115 | b = b / r_w; 116 | } else { 117 | l = bbox[0] - bbox[2]/2.f - (Yolo::INPUT_W - r_h * img.cols) / 2; 118 | if (l < 0) 119 | { 120 | l = 0; 121 | } 122 | r = bbox[0] + bbox[2]/2.f - (Yolo::INPUT_W - r_h * img.cols) / 2; 123 | if (r > img.cols) 124 | { 125 | r = img.cols; 126 | } 127 | t = bbox[1] - bbox[3]/2.f; 128 | if (t < 0) 129 | { 130 | t = 0; 131 | } 132 | b = bbox[1] + bbox[3]/2.f; 133 | if (b > img.rows) 134 | { 135 | b = img.rows; 136 | } 137 | l = l / r_h; 138 | r = r / r_h; 139 | t = t / r_h; 140 | b = b / r_h; 141 | } 142 | return cv::Rect(l, t, r-l, b-t); 143 | } 144 | 145 | // std::max vs. max 146 | //https://www.cnblogs.com/timesdaughter/p/5894930.html 147 | // Use (std::min) and (std::max) 148 | float iou(float lbox[4], float rbox[4]) { 149 | float interBox[] = { 150 | (std::max)(lbox[0] - lbox[2]/2.f , rbox[0] - rbox[2]/2.f), //left 151 | (std::min)(lbox[0] + lbox[2]/2.f , rbox[0] + rbox[2]/2.f), //right 152 | (std::max)(lbox[1] - lbox[3]/2.f , rbox[1] - rbox[3]/2.f), //top 153 | (std::min)(lbox[1] + lbox[3]/2.f , rbox[1] + rbox[3]/2.f), //bottom 154 | }; 155 | 156 | if(interBox[2] > interBox[3] || interBox[0] > interBox[1]) 157 | return 0.0f; 158 | 159 | float interBoxS =(interBox[1]-interBox[0])*(interBox[3]-interBox[2]); 160 | return interBoxS/(lbox[2]*lbox[3] + rbox[2]*rbox[3] -interBoxS); 161 | } 162 | 163 | bool cmp(Yolo::Detection& a, Yolo::Detection& b) { 164 | return a.conf > b.conf; 165 | } 166 | 167 | void nms(std::vector& res, float *output, float conf_thresh, float nms_thresh = 0.5) { 168 | int det_size = sizeof(Yolo::Detection) / sizeof(float); 169 | std::map> m; 170 | for (int i = 0; i < output[0] && i < 1000; i++) { 171 | if (output[1 + det_size * i + 4] <= conf_thresh) continue; 172 | Yolo::Detection det; 173 | memcpy(&det, &output[1 + det_size * i], det_size * sizeof(float)); 174 | if (m.count(det.class_id) == 0) m.emplace(det.class_id, std::vector()); 175 | m[det.class_id].push_back(det); 176 | } 177 | for (auto it = m.begin(); it != m.end(); it++) { 178 | //std::cout << it->second[0].class_id << " --- " << std::endl; 179 | auto& dets = it->second; 180 | std::sort(dets.begin(), dets.end(), cmp); 181 | for (size_t m = 0; m < dets.size(); ++m) { 182 | auto& item = dets[m]; 183 | res.push_back(item); 184 | for (size_t n = m + 1; n < dets.size(); ++n) { 185 | if (iou(item.bbox, dets[n].bbox) > nms_thresh) { 186 | dets.erase(dets.begin()+n); 187 | --n; 188 | } 189 | } 190 | } 191 | } 192 | } 193 | 194 | // TensorRT weight files have a simple space delimited format: 195 | // [type] [size] 196 | std::map loadWeights(const std::string file) { 197 | std::cout << "Loading weights: " << file << std::endl; 198 | std::map weightMap; 199 | 200 | // Open weights file 201 | std::ifstream input(file); 202 | assert(input.is_open() && "Unable to load weight file."); 203 | 204 | // Read number of weight blobs 205 | int32_t count; 206 | input >> count; 207 | assert(count > 0 && "Invalid weight map file."); 208 | 209 | while (count--) 210 | { 211 | Weights wt{DataType::kFLOAT, nullptr, 0}; 212 | uint32_t size; 213 | 214 | // Read name and type of blob 215 | std::string name; 216 | input >> name >> std::dec >> size; 217 | wt.type = DataType::kFLOAT; 218 | 219 | // Load blob 220 | uint32_t* val = reinterpret_cast(malloc(sizeof(val) * size)); 221 | for (uint32_t x = 0, y = size; x < y; ++x) 222 | { 223 | input >> std::hex >> val[x]; 224 | } 225 | wt.values = val; 226 | 227 | wt.count = size; 228 | 
weightMap[name] = wt; 229 | } 230 | 231 | return weightMap; 232 | } 233 | 234 | IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname, float eps) { 235 | float *gamma = (float*)weightMap[lname + ".weight"].values; 236 | float *beta = (float*)weightMap[lname + ".bias"].values; 237 | float *mean = (float*)weightMap[lname + ".running_mean"].values; 238 | float *var = (float*)weightMap[lname + ".running_var"].values; 239 | int len = weightMap[lname + ".running_var"].count; 240 | 241 | float *scval = reinterpret_cast(malloc(sizeof(float) * len)); 242 | for (int i = 0; i < len; i++) { 243 | scval[i] = gamma[i] / sqrt(var[i] + eps); 244 | } 245 | Weights scale{DataType::kFLOAT, scval, len}; 246 | 247 | float *shval = reinterpret_cast(malloc(sizeof(float) * len)); 248 | for (int i = 0; i < len; i++) { 249 | shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps); 250 | } 251 | Weights shift{DataType::kFLOAT, shval, len}; 252 | 253 | float *pval = reinterpret_cast(malloc(sizeof(float) * len)); 254 | for (int i = 0; i < len; i++) { 255 | pval[i] = 1.0; 256 | } 257 | Weights power{DataType::kFLOAT, pval, len}; 258 | 259 | weightMap[lname + ".scale"] = scale; 260 | weightMap[lname + ".shift"] = shift; 261 | weightMap[lname + ".power"] = power; 262 | IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power); 263 | assert(scale_1); 264 | return scale_1; 265 | } 266 | 267 | ILayer* convBnLeaky(INetworkDefinition *network, std::map& weightMap, ITensor& input, int outch, int ksize, int s, int g, std::string lname) { 268 | Weights emptywts{DataType::kFLOAT, nullptr, 0}; 269 | int p = ksize / 2; 270 | IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, DimsHW{ksize, ksize}, weightMap[lname + ".conv.weight"], emptywts); 271 | assert(conv1); 272 | conv1->setStrideNd(DimsHW{s, s}); 273 | conv1->setPaddingNd(DimsHW{p, p}); 274 | conv1->setNbGroups(g); 275 | //IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + ".bn", 1e-4); 276 | IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + ".bn", 1e-3); 277 | auto lr = network->addActivation(*bn1->getOutput(0), ActivationType::kLEAKY_RELU); 278 | lr->setAlpha(0.1); 279 | return lr; 280 | } 281 | 282 | ILayer* focus(INetworkDefinition *network, std::map& weightMap, ITensor& input, int inch, int outch, int ksize, std::string lname) { 283 | ISliceLayer *s1 = network->addSlice(input, Dims3{0, 0, 0}, Dims3{inch, Yolo::INPUT_H / 2, Yolo::INPUT_W / 2}, Dims3{1, 2, 2}); 284 | ISliceLayer *s2 = network->addSlice(input, Dims3{0, 1, 0}, Dims3{inch, Yolo::INPUT_H / 2, Yolo::INPUT_W / 2}, Dims3{1, 2, 2}); 285 | ISliceLayer *s3 = network->addSlice(input, Dims3{0, 0, 1}, Dims3{inch, Yolo::INPUT_H / 2, Yolo::INPUT_W / 2}, Dims3{1, 2, 2}); 286 | ISliceLayer *s4 = network->addSlice(input, Dims3{0, 1, 1}, Dims3{inch, Yolo::INPUT_H / 2, Yolo::INPUT_W / 2}, Dims3{1, 2, 2}); 287 | ITensor* inputTensors[] = {s1->getOutput(0), s2->getOutput(0), s3->getOutput(0), s4->getOutput(0)}; 288 | auto cat = network->addConcatenation(inputTensors, 4); 289 | auto conv = convBnLeaky(network, weightMap, *cat->getOutput(0), outch, ksize, 1, 1, lname + ".conv"); 290 | return conv; 291 | } 292 | 293 | ILayer* bottleneck(INetworkDefinition *network, std::map& weightMap, ITensor& input, int c1, int c2, bool shortcut, int g, float e, std::string lname) { 294 | auto cv1 = convBnLeaky(network, weightMap, input, (int)((float)c2 * e), 1, 1, 1, 
lname + ".cv1"); 295 | auto cv2 = convBnLeaky(network, weightMap, *cv1->getOutput(0), c2, 3, 1, g, lname + ".cv2"); 296 | if (shortcut && c1 == c2) { 297 | auto ew = network->addElementWise(input, *cv2->getOutput(0), ElementWiseOperation::kSUM); 298 | return ew; 299 | } 300 | return cv2; 301 | } 302 | 303 | ILayer* bottleneckCSP(INetworkDefinition *network, std::map& weightMap, ITensor& input, int c1, int c2, int n, bool shortcut, int g, float e, std::string lname) { 304 | Weights emptywts{DataType::kFLOAT, nullptr, 0}; 305 | int c_ = (int)((float)c2 * e); 306 | auto cv1 = convBnLeaky(network, weightMap, input, c_, 1, 1, 1, lname + ".cv1"); 307 | auto cv2 = network->addConvolutionNd(input, c_, DimsHW{1, 1}, weightMap[lname + ".cv2.weight"], emptywts); 308 | ITensor *y1 = cv1->getOutput(0); 309 | for (int i = 0; i < n; i++) { 310 | auto b = bottleneck(network, weightMap, *y1, c_, c_, shortcut, g, 1.0, lname + ".m." + std::to_string(i)); 311 | y1 = b->getOutput(0); 312 | } 313 | auto cv3 = network->addConvolutionNd(*y1, c_, DimsHW{1, 1}, weightMap[lname + ".cv3.weight"], emptywts); 314 | 315 | ITensor* inputTensors[] = {cv3->getOutput(0), cv2->getOutput(0)}; 316 | auto cat = network->addConcatenation(inputTensors, 2); 317 | 318 | IScaleLayer* bn = addBatchNorm2d(network, weightMap, *cat->getOutput(0), lname + ".bn", 1e-4); 319 | auto lr = network->addActivation(*bn->getOutput(0), ActivationType::kLEAKY_RELU); 320 | lr->setAlpha(0.1); 321 | 322 | auto cv4 = convBnLeaky(network, weightMap, *lr->getOutput(0), c2, 1, 1, 1, lname + ".cv4"); 323 | return cv4; 324 | } 325 | 326 | ILayer* SPP(INetworkDefinition *network, std::map& weightMap, ITensor& input, int c1, int c2, int k1, int k2, int k3, std::string lname) { 327 | int c_ = c1 / 2; 328 | auto cv1 = convBnLeaky(network, weightMap, input, c_, 1, 1, 1, lname + ".cv1"); 329 | 330 | auto pool1 = network->addPoolingNd(*cv1->getOutput(0), PoolingType::kMAX, DimsHW{k1, k1}); 331 | pool1->setPaddingNd(DimsHW{k1 / 2, k1 / 2}); 332 | pool1->setStrideNd(DimsHW{1, 1}); 333 | auto pool2 = network->addPoolingNd(*cv1->getOutput(0), PoolingType::kMAX, DimsHW{k2, k2}); 334 | pool2->setPaddingNd(DimsHW{k2 / 2, k2 / 2}); 335 | pool2->setStrideNd(DimsHW{1, 1}); 336 | auto pool3 = network->addPoolingNd(*cv1->getOutput(0), PoolingType::kMAX, DimsHW{k3, k3}); 337 | pool3->setPaddingNd(DimsHW{k3 / 2, k3 / 2}); 338 | pool3->setStrideNd(DimsHW{1, 1}); 339 | 340 | ITensor* inputTensors[] = {cv1->getOutput(0), pool1->getOutput(0), pool2->getOutput(0), pool3->getOutput(0)}; 341 | auto cat = network->addConcatenation(inputTensors, 4); 342 | 343 | auto cv2 = convBnLeaky(network, weightMap, *cat->getOutput(0), c2, 1, 1, 1, lname + ".cv2"); 344 | return cv2; 345 | } 346 | 347 | int read_files_in_dir(const char *p_dir_name, std::vector &file_names) { 348 | DIR *p_dir = opendir(p_dir_name); 349 | if (p_dir == nullptr) { 350 | return -1; 351 | } 352 | 353 | struct dirent* p_file = nullptr; 354 | while ((p_file = readdir(p_dir)) != nullptr) { 355 | if (strcmp(p_file->d_name, ".") != 0 && 356 | strcmp(p_file->d_name, "..") != 0) { 357 | //std::string cur_file_name(p_dir_name); 358 | //cur_file_name += "/"; 359 | //cur_file_name += p_file->d_name; 360 | std::string cur_file_name(p_file->d_name); 361 | file_names.push_back(cur_file_name); 362 | } 363 | } 364 | 365 | closedir(p_dir); 366 | return 0; 367 | } 368 | 369 | #endif 370 | 371 | -------------------------------------------------------------------------------- /yolov5m/gen_wts.py: 
-------------------------------------------------------------------------------- 1 | from utils.utils import * 2 | import struct 3 | 4 | # Initialize 5 | device = torch_utils.select_device('0') 6 | # Load model 7 | model = torch.load('weights/yolov5m.pt', map_location=device)['model'].float() # load to FP32 8 | model.to(device).eval() 9 | 10 | f = open('yolov5m.wts', 'w') 11 | f.write('{}\n'.format(len(model.state_dict().keys()))) 12 | for k, v in model.state_dict().items(): 13 | vr = v.reshape(-1).cpu().numpy() 14 | f.write('{} {} '.format(k, len(vr))) 15 | for vv in vr: 16 | f.write(' ') 17 | f.write(struct.pack('>f',float(vv)).hex()) 18 | f.write('\n') 19 | -------------------------------------------------------------------------------- /yolov5m/images/bus.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BaofengZan/yolov5_2.0-TensorRt/78193337dd94bf48c2d55c3a77105a534d11c45b/yolov5m/images/bus.jpg -------------------------------------------------------------------------------- /yolov5m/images/zidane.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BaofengZan/yolov5_2.0-TensorRt/78193337dd94bf48c2d55c3a77105a534d11c45b/yolov5m/images/zidane.jpg -------------------------------------------------------------------------------- /yolov5m/utils.h: -------------------------------------------------------------------------------- 1 | #ifndef __TRT_UTILS_H_ 2 | #define __TRT_UTILS_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #ifndef CUDA_CHECK 10 | 11 | #define CUDA_CHECK(callstr) \ 12 | { \ 13 | cudaError_t error_code = callstr; \ 14 | if (error_code != cudaSuccess) { \ 15 | std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \ 16 | assert(0); \ 17 | } \ 18 | } 19 | 20 | #endif 21 | 22 | namespace Tn 23 | { 24 | class Profiler : public nvinfer1::IProfiler 25 | { 26 | public: 27 | void printLayerTimes(int itrationsTimes) 28 | { 29 | float totalTime = 0; 30 | for (size_t i = 0; i < mProfile.size(); i++) 31 | { 32 | printf("%-40.40s %4.3fms\n", mProfile[i].first.c_str(), mProfile[i].second / itrationsTimes); 33 | totalTime += mProfile[i].second; 34 | } 35 | printf("Time over all layers: %4.3f\n", totalTime / itrationsTimes); 36 | } 37 | private: 38 | typedef std::pair Record; 39 | std::vector mProfile; 40 | 41 | virtual void reportLayerTime(const char* layerName, float ms) 42 | { 43 | auto record = std::find_if(mProfile.begin(), mProfile.end(), [&](const Record& r){ return r.first == layerName; }); 44 | if (record == mProfile.end()) 45 | mProfile.push_back(std::make_pair(layerName, ms)); 46 | else 47 | record->second += ms; 48 | } 49 | }; 50 | 51 | //Logger for TensorRT info/warning/errors 52 | class Logger : public nvinfer1::ILogger 53 | { 54 | public: 55 | 56 | Logger(): Logger(Severity::kWARNING) {} 57 | 58 | Logger(Severity severity): reportableSeverity(severity) {} 59 | 60 | void log(Severity severity, const char* msg) override 61 | { 62 | // suppress messages with severity enum value greater than the reportable 63 | if (severity > reportableSeverity) return; 64 | 65 | switch (severity) 66 | { 67 | case Severity::kINTERNAL_ERROR: std::cerr << "INTERNAL_ERROR: "; break; 68 | case Severity::kERROR: std::cerr << "ERROR: "; break; 69 | case Severity::kWARNING: std::cerr << "WARNING: "; break; 70 | case Severity::kINFO: std::cerr << "INFO: "; break; 71 | default: std::cerr << "UNKNOWN: "; break; 72 | } 73 | 
std::cerr << msg << std::endl; 74 | } 75 | 76 | Severity reportableSeverity{Severity::kWARNING}; 77 | }; 78 | 79 | template 80 | void write(char*& buffer, const T& val) 81 | { 82 | *reinterpret_cast(buffer) = val; 83 | buffer += sizeof(T); 84 | } 85 | 86 | template 87 | void read(const char*& buffer, T& val) 88 | { 89 | val = *reinterpret_cast(buffer); 90 | buffer += sizeof(T); 91 | } 92 | } 93 | 94 | #endif -------------------------------------------------------------------------------- /yolov5m/yololayer.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "yololayer.h" 3 | #include "utils.h" 4 | 5 | using namespace Yolo; 6 | 7 | namespace nvinfer1 8 | { 9 | YoloLayerPlugin::YoloLayerPlugin() 10 | { 11 | mClassCount = CLASS_NUM; 12 | mYoloKernel.clear(); 13 | mYoloKernel.push_back(yolo1); 14 | mYoloKernel.push_back(yolo2); 15 | mYoloKernel.push_back(yolo3); 16 | 17 | mKernelCount = mYoloKernel.size(); 18 | 19 | CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*))); 20 | size_t AnchorLen = sizeof(float)* CHECK_COUNT*2; 21 | for(int ii = 0; ii < mKernelCount; ii ++) 22 | { 23 | CUDA_CHECK(cudaMalloc(&mAnchor[ii],AnchorLen)); 24 | const auto& yolo = mYoloKernel[ii]; 25 | CUDA_CHECK(cudaMemcpy(mAnchor[ii], yolo.anchors, AnchorLen, cudaMemcpyHostToDevice)); 26 | } 27 | } 28 | 29 | YoloLayerPlugin::~YoloLayerPlugin() 30 | { 31 | } 32 | 33 | // create the plugin at runtime from a byte stream 34 | YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) 35 | { 36 | using namespace Tn; 37 | const char *d = reinterpret_cast(data), *a = d; 38 | read(d, mClassCount); 39 | read(d, mThreadCount); 40 | read(d, mKernelCount); 41 | mYoloKernel.resize(mKernelCount); 42 | auto kernelSize = mKernelCount*sizeof(YoloKernel); 43 | memcpy(mYoloKernel.data(),d,kernelSize); 44 | d += kernelSize; 45 | 46 | CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*))); 47 | size_t AnchorLen = sizeof(float)* CHECK_COUNT*2; 48 | for(int ii = 0; ii < mKernelCount; ii ++) 49 | { 50 | CUDA_CHECK(cudaMalloc(&mAnchor[ii],AnchorLen)); 51 | const auto& yolo = mYoloKernel[ii]; 52 | CUDA_CHECK(cudaMemcpy(mAnchor[ii], yolo.anchors, AnchorLen, cudaMemcpyHostToDevice)); 53 | } 54 | 55 | assert(d == a + length); 56 | } 57 | 58 | void YoloLayerPlugin::serialize(void* buffer) const 59 | { 60 | using namespace Tn; 61 | char* d = static_cast(buffer), *a = d; 62 | write(d, mClassCount); 63 | write(d, mThreadCount); 64 | write(d, mKernelCount); 65 | auto kernelSize = mKernelCount*sizeof(YoloKernel); 66 | memcpy(d,mYoloKernel.data(),kernelSize); 67 | d += kernelSize; 68 | 69 | assert(d == a + getSerializationSize()); 70 | } 71 | 72 | size_t YoloLayerPlugin::getSerializationSize() const 73 | { 74 | return sizeof(mClassCount) + sizeof(mThreadCount) + sizeof(mKernelCount) + sizeof(Yolo::YoloKernel) * mYoloKernel.size(); 75 | } 76 | 77 | int YoloLayerPlugin::initialize() 78 | { 79 | return 0; 80 | } 81 | 82 | Dims YoloLayerPlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims) 83 | { 84 | //output the result to channel 85 | int totalsize = MAX_OUTPUT_BBOX_COUNT * sizeof(Detection) / sizeof(float); 86 | 87 | return Dims3(totalsize + 1, 1, 1); 88 | } 89 | 90 | // Set plugin namespace 91 | void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace) 92 | { 93 | mPluginNamespace = pluginNamespace; 94 | } 95 | 96 | const char* YoloLayerPlugin::getPluginNamespace() const 97 | { 98 | return mPluginNamespace; 99 | } 100 | 101 | // 
Return the DataType of the plugin output at the requested index 102 | DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const 103 | { 104 | return DataType::kFLOAT; 105 | } 106 | 107 | // Return true if output tensor is broadcast across a batch. 108 | bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const 109 | { 110 | return false; 111 | } 112 | 113 | // Return true if plugin can use input that is broadcast across batch without replication. 114 | bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const 115 | { 116 | return false; 117 | } 118 | 119 | void YoloLayerPlugin::configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) 120 | { 121 | } 122 | 123 | // Attach the plugin object to an execution context and grant the plugin the access to some context resource. 124 | void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) 125 | { 126 | } 127 | 128 | // Detach the plugin object from its execution context. 129 | void YoloLayerPlugin::detachFromContext() {} 130 | 131 | const char* YoloLayerPlugin::getPluginType() const 132 | { 133 | return "YoloLayer_TRT"; 134 | } 135 | 136 | const char* YoloLayerPlugin::getPluginVersion() const 137 | { 138 | return "1"; 139 | } 140 | 141 | void YoloLayerPlugin::destroy() 142 | { 143 | delete this; 144 | } 145 | 146 | // Clone the plugin 147 | IPluginV2IOExt* YoloLayerPlugin::clone() const 148 | { 149 | YoloLayerPlugin *p = new YoloLayerPlugin(); 150 | p->setPluginNamespace(mPluginNamespace); 151 | return p; 152 | } 153 | 154 | __device__ float Logist(float data){ return 1.0f / (1.0f + expf(-data)); }; 155 | 156 | __global__ void CalDetection(const float *input, float *output,int noElements, 157 | int yoloWidth,int yoloHeight,const float anchors[CHECK_COUNT*2],int classes,int outputElem) { 158 | 159 | int idx = threadIdx.x + blockDim.x * blockIdx.x; 160 | if (idx >= noElements) return; 161 | 162 | int total_grid = yoloWidth * yoloHeight; 163 | int bnIdx = idx / total_grid; 164 | idx = idx - total_grid*bnIdx; 165 | int info_len_i = 5 + classes; 166 | const float* curInput = input + bnIdx * (info_len_i * total_grid * CHECK_COUNT); 167 | 168 | for (int k = 0; k < 3; ++k) { 169 | float box_prob = Logist(curInput[idx + k * info_len_i * total_grid + 4 * total_grid]); 170 | if (box_prob < IGNORE_THRESH) continue; 171 | int class_id = 0; 172 | float max_cls_prob = 0.0; 173 | for (int i = 5; i < info_len_i; ++i) { 174 | float p = Logist(curInput[idx + k * info_len_i * total_grid + i * total_grid]); 175 | if (p > max_cls_prob) { 176 | max_cls_prob = p; 177 | class_id = i - 5; 178 | } 179 | } 180 | float *res_count = output + bnIdx*outputElem; 181 | int count = (int)atomicAdd(res_count, 1); 182 | if (count >= MAX_OUTPUT_BBOX_COUNT) return; 183 | char* data = (char *)res_count + sizeof(float) + count * sizeof(Detection); 184 | Detection* det = (Detection*)(data); 185 | 186 | int row = idx / yoloWidth; 187 | int col = idx % yoloWidth; 188 | 189 | //Location 190 | det->bbox[0] = (col - 0.5f + 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 0 * total_grid])) * INPUT_W / yoloWidth; 191 | det->bbox[1] = (row - 0.5f + 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 1 * total_grid])) * INPUT_H / yoloHeight; 192 | det->bbox[2] = 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 2 * total_grid]); 193 | 
det->bbox[2] = det->bbox[2] * det->bbox[2] * anchors[2*k]; 194 | det->bbox[3] = 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 3 * total_grid]); 195 | det->bbox[3] = det->bbox[3] * det->bbox[3] * anchors[2*k + 1]; 196 | det->conf = box_prob * max_cls_prob; 197 | det->class_id = class_id; 198 | } 199 | } 200 | 201 | void YoloLayerPlugin::forwardGpu(const float *const * inputs, float* output, cudaStream_t stream, int batchSize) { 202 | 203 | int outputElem = 1 + MAX_OUTPUT_BBOX_COUNT * sizeof(Detection) / sizeof(float); 204 | 205 | for(int idx = 0 ; idx < batchSize; ++idx) { 206 | CUDA_CHECK(cudaMemset(output + idx*outputElem, 0, sizeof(float))); 207 | } 208 | int numElem = 0; 209 | for (unsigned int i = 0; i < mYoloKernel.size(); ++i) 210 | { 211 | const auto& yolo = mYoloKernel[i]; 212 | numElem = yolo.width*yolo.height*batchSize; 213 | if (numElem < mThreadCount) 214 | mThreadCount = numElem; 215 | CalDetection<<< (yolo.width*yolo.height*batchSize + mThreadCount - 1) / mThreadCount, mThreadCount>>> 216 | (inputs[i], output, numElem, yolo.width, yolo.height, (float *)mAnchor[i], mClassCount, outputElem); 217 | } 218 | 219 | } 220 | 221 | 222 | int YoloLayerPlugin::enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) 223 | { 224 | forwardGpu((const float *const *)inputs, (float*)outputs[0], stream, batchSize); 225 | return 0; 226 | } 227 | 228 | PluginFieldCollection YoloPluginCreator::mFC{}; 229 | std::vector YoloPluginCreator::mPluginAttributes; 230 | 231 | YoloPluginCreator::YoloPluginCreator() 232 | { 233 | mPluginAttributes.clear(); 234 | 235 | mFC.nbFields = mPluginAttributes.size(); 236 | mFC.fields = mPluginAttributes.data(); 237 | } 238 | 239 | const char* YoloPluginCreator::getPluginName() const 240 | { 241 | return "YoloLayer_TRT"; 242 | } 243 | 244 | const char* YoloPluginCreator::getPluginVersion() const 245 | { 246 | return "1"; 247 | } 248 | 249 | const PluginFieldCollection* YoloPluginCreator::getFieldNames() 250 | { 251 | return &mFC; 252 | } 253 | 254 | IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) 255 | { 256 | YoloLayerPlugin* obj = new YoloLayerPlugin(); 257 | obj->setPluginNamespace(mNamespace.c_str()); 258 | return obj; 259 | } 260 | 261 | IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) 262 | { 263 | // This object will be deleted when the network is destroyed, which will 264 | // call MishPlugin::destroy() 265 | YoloLayerPlugin* obj = new YoloLayerPlugin(serialData, serialLength); 266 | obj->setPluginNamespace(mNamespace.c_str()); 267 | return obj; 268 | } 269 | 270 | } 271 | -------------------------------------------------------------------------------- /yolov5m/yololayer.h: -------------------------------------------------------------------------------- 1 | #ifndef _YOLO_LAYER_H 2 | #define _YOLO_LAYER_H 3 | 4 | #include 5 | #include 6 | #include "NvInfer.h" 7 | 8 | namespace Yolo 9 | { 10 | static constexpr int CHECK_COUNT = 3; 11 | static constexpr float IGNORE_THRESH = 0.1f; 12 | static constexpr int MAX_OUTPUT_BBOX_COUNT = 1000; 13 | static constexpr int CLASS_NUM = 80; 14 | static constexpr int INPUT_H = 608; 15 | static constexpr int INPUT_W = 608; 16 | 17 | struct YoloKernel 18 | { 19 | int width; 20 | int height; 21 | float anchors[CHECK_COUNT*2]; 22 | }; 23 | 24 | static constexpr YoloKernel yolo1 = { 25 | INPUT_W / 32, 26 | INPUT_H / 32, 27 | {116,90, 156,198, 373,326} 28 | 
}; 29 | static constexpr YoloKernel yolo2 = { 30 | INPUT_W / 16, 31 | INPUT_H / 16, 32 | {30,61, 62,45, 59,119} 33 | }; 34 | static constexpr YoloKernel yolo3 = { 35 | INPUT_W / 8, 36 | INPUT_H / 8, 37 | {10,13, 16,30, 33,23} 38 | }; 39 | 40 | static constexpr int LOCATIONS = 4; 41 | struct alignas(float) Detection{ 42 | //center_x center_y w h 43 | float bbox[LOCATIONS]; 44 | float conf; // bbox_conf * cls_conf 45 | float class_id; 46 | }; 47 | } 48 | 49 | namespace nvinfer1 50 | { 51 | class YoloLayerPlugin: public IPluginV2IOExt 52 | { 53 | public: 54 | explicit YoloLayerPlugin(); 55 | YoloLayerPlugin(const void* data, size_t length); 56 | 57 | ~YoloLayerPlugin(); 58 | 59 | int getNbOutputs() const override 60 | { 61 | return 1; 62 | } 63 | 64 | Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override; 65 | 66 | int initialize() override; 67 | 68 | virtual void terminate() override {}; 69 | 70 | virtual size_t getWorkspaceSize(int maxBatchSize) const override { return 0;} 71 | 72 | virtual int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override; 73 | 74 | virtual size_t getSerializationSize() const override; 75 | 76 | virtual void serialize(void* buffer) const override; 77 | 78 | bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const override { 79 | return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT; 80 | } 81 | 82 | const char* getPluginType() const override; 83 | 84 | const char* getPluginVersion() const override; 85 | 86 | void destroy() override; 87 | 88 | IPluginV2IOExt* clone() const override; 89 | 90 | void setPluginNamespace(const char* pluginNamespace) override; 91 | 92 | const char* getPluginNamespace() const override; 93 | 94 | DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const override; 95 | 96 | bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const override; 97 | 98 | bool canBroadcastInputAcrossBatch(int inputIndex) const override; 99 | 100 | void attachToContext( 101 | cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) override; 102 | 103 | void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) override; 104 | 105 | void detachFromContext() override; 106 | 107 | private: 108 | void forwardGpu(const float *const * inputs,float * output, cudaStream_t stream,int batchSize = 1); 109 | int mClassCount; 110 | int mKernelCount; 111 | std::vector mYoloKernel; 112 | int mThreadCount = 256; 113 | void** mAnchor; 114 | const char* mPluginNamespace; 115 | }; 116 | 117 | class YoloPluginCreator : public IPluginCreator 118 | { 119 | public: 120 | YoloPluginCreator(); 121 | 122 | ~YoloPluginCreator() override = default; 123 | 124 | const char* getPluginName() const override; 125 | 126 | const char* getPluginVersion() const override; 127 | 128 | const PluginFieldCollection* getFieldNames() override; 129 | 130 | IPluginV2IOExt* createPlugin(const char* name, const PluginFieldCollection* fc) override; 131 | 132 | IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) override; 133 | 134 | void setPluginNamespace(const char* libNamespace) override 135 | { 136 | mNamespace = libNamespace; 137 | } 138 | 139 | const char* getPluginNamespace() const override 140 | { 141 | return mNamespace.c_str(); 142 | 
} 143 | 144 | private: 145 | std::string mNamespace; 146 | static PluginFieldCollection mFC; 147 | static std::vector mPluginAttributes; 148 | }; 149 | 150 | 151 | 152 | }; 153 | 154 | #endif 155 | -------------------------------------------------------------------------------- /yolov5m/yolov5m.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "cuda_runtime_api.h" 4 | #include "logging.h" 5 | #include "common.hpp" 6 | 7 | #define USE_FP16 // comment out this if want to use FP32 8 | #define DEVICE 0 // GPU id 9 | #define NMS_THRESH 0.5 10 | #define CONF_THRESH 0.25 11 | #define BATCH_SIZE 1 12 | 13 | // stuff we know about the network and the input/output blobs 14 | static const int INPUT_H = Yolo::INPUT_H; 15 | static const int INPUT_W = Yolo::INPUT_W; 16 | static const int OUTPUT_SIZE = Yolo::MAX_OUTPUT_BBOX_COUNT * sizeof(Yolo::Detection) / sizeof(float) + 1; // we assume the yololayer outputs no more than 1000 boxes that conf >= 0.1 17 | const char* INPUT_BLOB_NAME = "data"; 18 | const char* OUTPUT_BLOB_NAME = "prob"; 19 | static Logger gLogger; 20 | REGISTER_TENSORRT_PLUGIN(YoloPluginCreator); 21 | 22 | // Creat the engine using only the API and not any parser. 23 | ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) { 24 | INetworkDefinition* network = builder->createNetworkV2(0U); 25 | 26 | // Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME 27 | ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W}); 28 | assert(data); 29 | 30 | std::map weightMap = loadWeights("../yolov5m.wts"); 31 | Weights emptywts{DataType::kFLOAT, nullptr, 0}; 32 | 33 | /* ------ yolov5 backbone------ */ 34 | auto focus0 = focus(network, weightMap, *data, 3, 48, 3, "model.0"); 35 | auto conv1 = convBnLeaky(network, weightMap, *focus0->getOutput(0), 96, 3, 2, 1, "model.1"); 36 | auto bottleneck_CSP2 = bottleneckCSP(network, weightMap, *conv1->getOutput(0), 96, 96, 2, true, 1, 0.5, "model.2"); 37 | auto conv3 = convBnLeaky(network, weightMap, *bottleneck_CSP2->getOutput(0), 192, 3, 2, 1, "model.3"); 38 | auto bottleneck_csp4 = bottleneckCSP(network, weightMap, *conv3->getOutput(0), 192, 192, 6, true, 1, 0.5, "model.4"); 39 | auto conv5 = convBnLeaky(network, weightMap, *bottleneck_csp4->getOutput(0), 384, 3, 2, 1, "model.5"); 40 | auto bottleneck_csp6 = bottleneckCSP(network, weightMap, *conv5->getOutput(0), 384, 384, 6, true, 1, 0.5, "model.6"); 41 | auto conv7 = convBnLeaky(network, weightMap, *bottleneck_csp6->getOutput(0), 768, 3, 2, 1, "model.7"); 42 | auto spp8 = SPP(network, weightMap, *conv7->getOutput(0), 768, 768, 5, 9, 13, "model.8"); 43 | /* ------ yolov5 head ------ */ 44 | auto bottleneck_csp9 = bottleneckCSP(network, weightMap, *spp8->getOutput(0), 768, 768, 2, false, 1, 0.5, "model.9"); 45 | auto conv10 = convBnLeaky(network, weightMap, *bottleneck_csp9->getOutput(0), 384, 1, 1, 1, "model.10"); 46 | 47 | float *deval = reinterpret_cast(malloc(sizeof(float) * 384 * 2 * 2)); 48 | for (int i = 0; i < 384 * 2 * 2; i++) { 49 | deval[i] = 1.0; 50 | } 51 | Weights deconvwts11{DataType::kFLOAT, deval, 384 * 2 * 2}; 52 | IDeconvolutionLayer* deconv11 = network->addDeconvolutionNd(*conv10->getOutput(0), 384, DimsHW{2, 2}, deconvwts11, emptywts); 53 | deconv11->setStrideNd(DimsHW{2, 2}); 54 | deconv11->setNbGroups(384); 55 | weightMap["deconv11"] = deconvwts11; 56 | ITensor* inputTensors12[] = {deconv11->getOutput(0), 
bottleneck_csp6->getOutput(0)}; 57 | auto cat12 = network->addConcatenation(inputTensors12, 2); 58 | 59 | auto bottleneck_csp13 = bottleneckCSP(network, weightMap, *cat12->getOutput(0), 768, 384, 2, false, 1, 0.5, "model.13"); 60 | 61 | auto conv14 = convBnLeaky(network, weightMap, *bottleneck_csp13->getOutput(0), 192, 1, 1, 1, "model.14"); 62 | 63 | Weights deconvwts15{DataType::kFLOAT, deval, 192 * 2 * 2}; 64 | IDeconvolutionLayer* deconv15 = network->addDeconvolutionNd(*conv14->getOutput(0), 192, DimsHW{2, 2}, deconvwts15, emptywts); 65 | deconv15->setStrideNd(DimsHW{2, 2}); 66 | deconv15->setNbGroups(192); 67 | 68 | ITensor* inputTensors16[] = {deconv15->getOutput(0), bottleneck_csp4->getOutput(0)}; 69 | auto cat16 = network->addConcatenation(inputTensors16, 2); 70 | 71 | auto bottleneck_csp17 = bottleneckCSP(network, weightMap, *cat16->getOutput(0), 384, 192, 2, false, 1, 0.5, "model.17"); 72 | 73 | //yolo layer 1 74 | IConvolutionLayer* conv18 = network->addConvolutionNd(*bottleneck_csp17->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["model.24.m.0.weight"], weightMap["model.24.m.0.bias"]); 75 | 76 | auto conv19 = convBnLeaky(network, weightMap, *bottleneck_csp17->getOutput(0), 192, 3, 2, 1, "model.18"); 77 | 78 | ITensor* inputTensors20[] = {conv19->getOutput(0), conv14->getOutput(0)}; 79 | auto cat20 = network->addConcatenation(inputTensors20, 2); 80 | 81 | auto bottleneck_csp21 = bottleneckCSP(network, weightMap, *cat20->getOutput(0), 384, 384, 2, false, 1, 0.5, "model.20"); 82 | 83 | //yolo layer 2 84 | IConvolutionLayer* conv22 = network->addConvolutionNd(*bottleneck_csp21->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["model.24.m.1.weight"], weightMap["model.24.m.1.bias"]); 85 | 86 | auto conv23 = convBnLeaky(network, weightMap, *bottleneck_csp21->getOutput(0), 384, 3, 2, 1, "model.21"); 87 | 88 | ITensor* inputTensors24[] = {conv23->getOutput(0), conv10->getOutput(0)}; 89 | auto cat24 = network->addConcatenation(inputTensors24, 2); 90 | 91 | auto bottleneck_csp25 = bottleneckCSP(network, weightMap, *cat24->getOutput(0), 768, 768, 2, false, 1, 0.5, "model.23"); 92 | 93 | // yolo layer 3 94 | IConvolutionLayer* conv26 = network->addConvolutionNd(*bottleneck_csp25->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["model.24.m.2.weight"], weightMap["model.24.m.2.bias"]); 95 | 96 | auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1"); 97 | const PluginFieldCollection* pluginData = creator->getFieldNames(); 98 | IPluginV2 *pluginObj = creator->createPlugin("yololayer", pluginData); 99 | ITensor* inputTensors_yolo[] = {conv26->getOutput(0), conv22->getOutput(0), conv18->getOutput(0)}; 100 | auto yolo = network->addPluginV2(inputTensors_yolo, 3, *pluginObj); 101 | 102 | yolo->getOutput(0)->setName(OUTPUT_BLOB_NAME); 103 | network->markOutput(*yolo->getOutput(0)); 104 | 105 | // Build engine 106 | builder->setMaxBatchSize(maxBatchSize); 107 | config->setMaxWorkspaceSize(16 * (1 << 20)); // 16MB 108 | #ifdef USE_FP16 109 | config->setFlag(BuilderFlag::kFP16); 110 | #endif 111 | std::cout << "Building engine, please wait for a while..." << std::endl; 112 | ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); 113 | std::cout << "Build engine successfully!" 
<< std::endl; 114 | 115 | // Don't need the network any more 116 | network->destroy(); 117 | 118 | // Release host memory 119 | for (auto& mem : weightMap) 120 | { 121 | free((void*) (mem.second.values)); 122 | } 123 | 124 | return engine; 125 | } 126 | 127 | void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) { 128 | // Create builder 129 | IBuilder* builder = createInferBuilder(gLogger); 130 | IBuilderConfig* config = builder->createBuilderConfig(); 131 | 132 | // Create model to populate the network, then set the outputs and create an engine 133 | ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT); 134 | assert(engine != nullptr); 135 | 136 | // Serialize the engine 137 | (*modelStream) = engine->serialize(); 138 | 139 | // Close everything down 140 | engine->destroy(); 141 | builder->destroy(); 142 | } 143 | 144 | void doInference(IExecutionContext& context, float* input, float* output, int batchSize) { 145 | const ICudaEngine& engine = context.getEngine(); 146 | 147 | // Pointers to input and output device buffers to pass to engine. 148 | // Engine requires exactly IEngine::getNbBindings() number of buffers. 149 | assert(engine.getNbBindings() == 2); 150 | void* buffers[2]; 151 | 152 | // In order to bind the buffers, we need to know the names of the input and output tensors. 153 | // Note that indices are guaranteed to be less than IEngine::getNbBindings() 154 | const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME); 155 | const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME); 156 | 157 | // Create GPU buffers on device 158 | CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * INPUT_H * INPUT_W * sizeof(float))); 159 | CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float))); 160 | 161 | // Create stream 162 | cudaStream_t stream; 163 | CHECK(cudaStreamCreate(&stream)); 164 | 165 | // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host 166 | CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream)); 167 | context.enqueue(batchSize, buffers, stream, nullptr); 168 | CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream)); 169 | cudaStreamSynchronize(stream); 170 | 171 | // Release stream and buffers 172 | cudaStreamDestroy(stream); 173 | CHECK(cudaFree(buffers[inputIndex])); 174 | CHECK(cudaFree(buffers[outputIndex])); 175 | } 176 | 177 | int main(int argc, char** argv) { 178 | cudaSetDevice(DEVICE); 179 | // create a model using the API directly and serialize it to a stream 180 | char *trtModelStream{nullptr}; 181 | size_t size{0}; 182 | 183 | if (argc == 2 && std::string(argv[1]) == "-s") { 184 | IHostMemory* modelStream{nullptr}; 185 | APIToModel(BATCH_SIZE, &modelStream); 186 | assert(modelStream != nullptr); 187 | std::ofstream p("yolov5m.engine", std::ios::binary); 188 | if (!p) { 189 | std::cerr << "could not open plan output file" << std::endl; 190 | return -1; 191 | } 192 | p.write(reinterpret_cast(modelStream->data()), modelStream->size()); 193 | modelStream->destroy(); 194 | return 0; 195 | } else if (argc == 3 && std::string(argv[1]) == "-d") { 196 | std::ifstream file("yolov5m.engine", std::ios::binary); 197 | if (file.good()) { 198 | file.seekg(0, file.end); 199 | size = file.tellg(); 200 | file.seekg(0, file.beg); 201 | trtModelStream = new char[size]; 202 | assert(trtModelStream); 203 | 
file.read(trtModelStream, size); 204 | file.close(); 205 | } 206 | } else { 207 | std::cerr << "arguments not right!" << std::endl; 208 | std::cerr << "./yolov5m -s // serialize model to plan file" << std::endl; 209 | std::cerr << "./yolov5m -d ../samples // deserialize plan file and run inference" << std::endl; 210 | return -1; 211 | } 212 | 213 | std::vector file_names; 214 | if (read_files_in_dir(argv[2], file_names) < 0) { 215 | std::cout << "read_files_in_dir failed." << std::endl; 216 | return -1; 217 | } 218 | 219 | // prepare input data --------------------------- 220 | static float data[BATCH_SIZE * 3 * INPUT_H * INPUT_W]; 221 | //for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++) 222 | // data[i] = 1.0; 223 | static float prob[BATCH_SIZE * OUTPUT_SIZE]; 224 | IRuntime* runtime = createInferRuntime(gLogger); 225 | assert(runtime != nullptr); 226 | ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size); 227 | assert(engine != nullptr); 228 | IExecutionContext* context = engine->createExecutionContext(); 229 | assert(context != nullptr); 230 | delete[] trtModelStream; 231 | 232 | int fcount = 0; 233 | for (int f = 0; f < (int)file_names.size(); f++) { 234 | fcount++; 235 | if (fcount < BATCH_SIZE && f + 1 != (int)file_names.size()) continue; 236 | for (int b = 0; b < fcount; b++) { 237 | cv::Mat img = cv::imread(std::string(argv[2]) + "/" + file_names[f - fcount + 1 + b]); 238 | if (img.empty()) continue; 239 | cv::Mat pr_img = preprocess_img(img); 240 | int i = 0; 241 | for (int row = 0; row < INPUT_H; ++row) { 242 | uchar* uc_pixel = pr_img.data + row * pr_img.step; 243 | for (int col = 0; col < INPUT_W; ++col) { 244 | data[b * 3 * INPUT_H * INPUT_W + i] = uc_pixel[2] / 255; 245 | data[b * 3 * INPUT_H * INPUT_W + i + INPUT_H * INPUT_W] = uc_pixel[1] / 255.0; 246 | data[b * 3 * INPUT_H * INPUT_W + i + 2 * INPUT_H * INPUT_W] = uc_pixel[0] / 255.0; 247 | uc_pixel += 3; 248 | ++i; 249 | } 250 | } 251 | } 252 | 253 | // Run inference 254 | auto start = std::chrono::system_clock::now(); 255 | doInference(*context, data, prob, BATCH_SIZE); 256 | auto end = std::chrono::system_clock::now(); 257 | std::cout << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; 258 | std::vector> batch_res(fcount); 259 | for (int b = 0; b < fcount; b++) { 260 | auto& res = batch_res[b]; 261 | nms(res, &prob[b * OUTPUT_SIZE], CONF_THRESH, NMS_THRESH); 262 | } 263 | for (int b = 0; b < fcount; b++) { 264 | auto& res = batch_res[b]; 265 | //std::cout << res.size() << std::endl; 266 | cv::Mat img = cv::imread(std::string(argv[2]) + "/" + file_names[f - fcount + 1 + b]); 267 | for (size_t j = 0; j < res.size(); j++) { 268 | cv::Rect r = get_rect(img, res[j].bbox); 269 | cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2); 270 | cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2); 271 | } 272 | cv::imwrite("_" + file_names[f - fcount + 1 + b], img); 273 | } 274 | fcount = 0; 275 | } 276 | 277 | // Destroy the engine 278 | context->destroy(); 279 | engine->destroy(); 280 | runtime->destroy(); 281 | 282 | // Print histogram of the output distribution 283 | //std::cout << "\nOutput:\n\n"; 284 | //for (unsigned int i = 0; i < OUTPUT_SIZE; i++) 285 | //{ 286 | // std::cout << prob[i] << ", "; 287 | // if (i % 10 == 0) std::cout << std::endl; 288 | //} 289 | //std::cout << std::endl; 290 | 291 | return 0; 292 | } 293 | 
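For reference, the per-pixel loop in main() above converts the letterboxed BGR uchar image into the planar RGB float blob the engine expects. Note that `uc_pixel[2] / 255` divides a uchar by the integer literal 255, which truncates to 0 or 1, while the other two channels divide by 255.0; a float divisor for all three channels is presumably what the normalization intends. A minimal helper sketch under that assumption (blobFromBGR is a made-up name, not part of this repo):

```cpp
// Minimal sketch: convert a letterboxed BGR image (as returned by preprocess_img)
// into the planar RGB float blob fed to doInference. Assumes img is INPUT_H x INPUT_W, CV_8UC3.
#include <opencv2/opencv.hpp>

static void blobFromBGR(const cv::Mat& img, float* dst, int input_h, int input_w) {
    int i = 0;
    for (int row = 0; row < input_h; ++row) {
        const uchar* uc_pixel = img.data + row * img.step;
        for (int col = 0; col < input_w; ++col) {
            dst[i]                         = uc_pixel[2] / 255.0f;  // R
            dst[i + input_h * input_w]     = uc_pixel[1] / 255.0f;  // G
            dst[i + 2 * input_h * input_w] = uc_pixel[0] / 255.0f;  // B
            uc_pixel += 3;
            ++i;
        }
    }
}

// Usage inside the batch loop would look like:
//   cv::Mat pr_img = preprocess_img(img);
//   blobFromBGR(pr_img, &data[b * 3 * INPUT_H * INPUT_W], INPUT_H, INPUT_W);
```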
-------------------------------------------------------------------------------- /yolov5s/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.6) 2 | 3 | project(yolov5) 4 | 5 | add_definitions(-std=c++11) 6 | 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 8 | set(CMAKE_CXX_STANDARD 11) 9 | set(CMAKE_BUILD_TYPE Debug) 10 | 11 | find_package(CUDA REQUIRED) 12 | 13 | set(CUDA_NVCC_PLAGS ${CUDA_NVCC_PLAGS};-std=c++11;-g;-G;-gencode;arch=compute_30;code=sm_30) 14 | 15 | include_directories(${PROJECT_SOURCE_DIR}/include) 16 | if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") 17 | message("embed_platform on") 18 | include_directories(/usr/local/cuda/targets/aarch64-linux/include) 19 | link_directories(/usr/local/cuda/targets/aarch64-linux/lib) 20 | else() 21 | message("embed_platform off") 22 | include_directories(/usr/local/cuda/include) 23 | link_directories(/usr/local/cuda/lib64) 24 | endif() 25 | 26 | 27 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED") 28 | 29 | cuda_add_library(yololayer SHARED ${PROJECT_SOURCE_DIR}/yololayer.cu) 30 | 31 | ########## opencv configuration ############ 32 | find_package(OpenCV 3.4 REQUIRED) 33 | message(OpenCV_LIBS) 34 | include_directories(OpenCV_INCLUDE_DIRS) 35 | 36 | add_executable(yolov5s ${PROJECT_SOURCE_DIR}/yolov5s.cpp) 37 | target_link_libraries(yolov5s nvinfer) 38 | target_link_libraries(yolov5s cudart) 39 | target_link_libraries(yolov5s yololayer) 40 | target_link_libraries(yolov5s ${OpenCV_LIBS}) 41 | 42 | add_definitions(-O2 -pthread) 43 | 44 | -------------------------------------------------------------------------------- /yolov5s/README.md: -------------------------------------------------------------------------------- 1 | # yolov5 2 | 3 | The Pytorch implementation is [ultralytics/yolov5](https://github.com/ultralytics/yolov5). 4 | 5 | I was using [ultralytics/yolov5](https://github.com/ultralytics/yolov5)(The latest version). Just in case the yolov5 model updated. 6 | 7 | ## How to Run 8 | 9 | ``` 10 | 1. generate yolov5m.wts from pytorch implementation with yolov5.pt 11 | 12 | git clone https://github.com/AIpakchoi/yolov5_tensorrt.git 13 | git clone https://github.com/ultralytics/yolov5.git 14 | // download its weights 'yolov5s.pt' 15 | cd yolov5 16 | cp ../yolov5_tensorrt/yolov5s/gen_wts.py . 17 | python gen_wts.py 18 | // a file 'yolov5s.wts' will be generated. 19 | 20 | 2. put yolov5s.wts into yolov5s, build and run 21 | 22 | mv yolov5s.wts ../yolov5_tensorrt/yolov5s/ 23 | cd ../yolov5_tensorrt/yolov5s 24 | mkdir build 25 | cd build 26 | cmake .. 27 | make 28 | sudo ./yolov5s -s // serialize model to plan file i.e. 'yolov5s.engine' 29 | sudo ./yolov5s -d ../samples // deserialize plan file and run inference, the images in samples will be processed. 30 | 31 | 3. check the images generated, as follows. _zidane.jpg and _bus.jpg 32 | ``` 33 | 34 |
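One pitfall worth noting before building: `yolov5s.cpp` in this repo loads the weights through a hard-coded relative path, `loadWeights("../yolov5sV2.wts")` inside `createEngine`, so either rename the generated `yolov5s.wts` to match or edit that path before running `./yolov5s -s`. For a quick sanity check of the generated `.wts` file, the sketch below (illustrative, not part of the repo) reads its header the same way `loadWeights` in `common.hpp` does:

```
#include <cstdint>
#include <fstream>
#include <iostream>
#include <string>

// Illustrative check: print the blob count and the first blob's name/length
// from a .wts file written by gen_wts.py.
int main(int argc, char** argv) {
    std::ifstream in(argc > 1 ? argv[1] : "yolov5s.wts");
    int32_t count = 0;
    in >> count;                          // first token: number of weight blobs
    std::string name; uint32_t len = 0;
    in >> name >> std::dec >> len;        // then "<layer name> <float count> <hex...>"
    std::cout << count << " blobs; first: " << name << " (" << len << " floats)\n";
    return in.good() ? 0 : 1;
}
```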
35 | 36 | 37 | 38 | 39 | 40 |
41 | 42 | ## Config 43 | 44 | - Input shape defined in yololayer.h 45 | - Number of classes defined in yololayer.h 46 | - FP16/FP32 can be selected by the macro in yolov5s.cpp 47 | - GPU id can be selected by the macro in yolov5s.cpp 48 | - NMS thresh in yolov5s.cpp 49 | - BBox confidence thresh in yolov5s.cpp 50 | - Batch size in yolov5s.cpp 51 | -------------------------------------------------------------------------------- /yolov5s/common.hpp: -------------------------------------------------------------------------------- 1 | #ifndef YOLOV5_COMMON_H_ 2 | #define YOLOV5_COMMON_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "NvInfer.h" 11 | #include "yololayer.h" 12 | 13 | #define CHECK(status) \ 14 | do\ 15 | {\ 16 | auto ret = (status);\ 17 | if (ret != 0)\ 18 | {\ 19 | std::cerr << "Cuda failure: " << ret << std::endl;\ 20 | abort();\ 21 | }\ 22 | } while (0) 23 | 24 | using namespace nvinfer1; 25 | // resize 优化 26 | // 从原图的点映射到输出图像的像素点 27 | // 对outuput的每个点,先根据长宽比计算其在原图中最邻近的像素点, 28 | //然后直接根据最邻近的思想,直接拷贝Channel个字节作为输出图像 29 | void resizeByNN(uchar *input, uchar *output, int height_in, int width_in, int channels, int height_out, int width_out) { 30 | 31 | uchar *data_source = input; 32 | uchar *data_half = output; 33 | 34 | int bpl_source = width_in * 3; 35 | int bpl_dst = width_out * 3; 36 | 37 | int pos = 0; 38 | int sep = 0; 39 | uchar *sr = nullptr; 40 | uchar *hr = nullptr; 41 | float step = 0.0; 42 | float step_x = float(width_in) / float(width_out); 43 | float step_y = float(height_in) / float(height_out); 44 | 45 | for (int i = 0; i < height_out; i++) { 46 | for (int j = 0; j < width_out; j++) { 47 | sep = int(step_y*i); 48 | step = int(j*step_x); 49 | sr = data_source + sep * bpl_source; 50 | hr = data_half + i * bpl_dst + j * channels; 51 | pos = step * channels; 52 | memcpy(hr, sr + pos, channels); 53 | } 54 | } 55 | return; 56 | } 57 | 58 | cv::Mat preprocess_img(cv::Mat& img) { 59 | int w, h, x, y; 60 | float r_w = Yolo::INPUT_W / (img.cols*1.0); 61 | float r_h = Yolo::INPUT_H / (img.rows*1.0); 62 | if (r_h > r_w) { 63 | w = Yolo::INPUT_W; 64 | h = r_w * img.rows; 65 | x = 0; 66 | y = (Yolo::INPUT_H - h) / 2; 67 | } else { 68 | w = r_h* img.cols; 69 | h = Yolo::INPUT_H; 70 | x = (Yolo::INPUT_W - w) / 2; 71 | y = 0; 72 | } 73 | cv::Mat re(h, w, CV_8UC3); 74 | //cv::resize(img, re, re.size(), 0, 0, cv::INTER_CUBIC); 75 | auto start = std::chrono::system_clock::now(); 76 | cv::resize(img, re, re.size(), 0, 0, cv::INTER_LINEAR); 77 | //resizeByNN(img.data, re.data, img.rows, img.cols, img.channels(), re.rows, re.cols); 78 | auto end = std::chrono::system_clock::now(); 79 | std::cout << "img resize: " << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; 80 | 81 | cv::Mat out(Yolo::INPUT_H, Yolo::INPUT_W, CV_8UC3, cv::Scalar(128, 128, 128)); 82 | 83 | re.copyTo(out(cv::Rect(x, y, re.cols, re.rows))); 84 | return out; 85 | } 86 | 87 | cv::Rect get_rect(cv::Mat& img, float bbox[4]) { 88 | int l, r, t, b; 89 | float r_w = Yolo::INPUT_W / (img.cols * 1.0); 90 | float r_h = Yolo::INPUT_H / (img.rows * 1.0); 91 | if (r_h > r_w) { 92 | l = bbox[0] - bbox[2]/2.f; 93 | if (l < 0) 94 | { 95 | l = 0; 96 | } 97 | r = bbox[0] + bbox[2]/2.f; 98 | if (r > img.cols) 99 | { 100 | r = img.cols; 101 | } 102 | t = bbox[1] - bbox[3]/2.f - (Yolo::INPUT_H - r_w * img.rows) / 2; 103 | if (t < 0) 104 | { 105 | t = 0; 106 | } 107 | b = bbox[1] + bbox[3]/2.f - (Yolo::INPUT_H - r_w * img.rows) / 2; 108 | if (b > img.rows) 109 | { 110 | 
b = img.rows; 111 | } 112 | l = l / r_w; 113 | r = r / r_w; 114 | t = t / r_w; 115 | b = b / r_w; 116 | } else { 117 | l = bbox[0] - bbox[2]/2.f - (Yolo::INPUT_W - r_h * img.cols) / 2; 118 | if (l < 0) 119 | { 120 | l = 0; 121 | } 122 | r = bbox[0] + bbox[2]/2.f - (Yolo::INPUT_W - r_h * img.cols) / 2; 123 | if (r > img.cols) 124 | { 125 | r = img.cols; 126 | } 127 | t = bbox[1] - bbox[3]/2.f; 128 | if (t < 0) 129 | { 130 | t = 0; 131 | } 132 | b = bbox[1] + bbox[3]/2.f; 133 | if (b > img.rows) 134 | { 135 | b = img.rows; 136 | } 137 | l = l / r_h; 138 | r = r / r_h; 139 | t = t / r_h; 140 | b = b / r_h; 141 | } 142 | return cv::Rect(l, t, r-l, b-t); 143 | } 144 | 145 | // std::max vs. max 146 | //https://www.cnblogs.com/timesdaughter/p/5894930.html 147 | // Use (std::min) and (std::max) 148 | float iou(float lbox[4], float rbox[4]) { 149 | float interBox[] = { 150 | (std::max)(lbox[0] - lbox[2]/2.f , rbox[0] - rbox[2]/2.f), //left 151 | (std::min)(lbox[0] + lbox[2]/2.f , rbox[0] + rbox[2]/2.f), //right 152 | (std::max)(lbox[1] - lbox[3]/2.f , rbox[1] - rbox[3]/2.f), //top 153 | (std::min)(lbox[1] + lbox[3]/2.f , rbox[1] + rbox[3]/2.f), //bottom 154 | }; 155 | 156 | if(interBox[2] > interBox[3] || interBox[0] > interBox[1]) 157 | return 0.0f; 158 | 159 | float interBoxS =(interBox[1]-interBox[0])*(interBox[3]-interBox[2]); 160 | return interBoxS/(lbox[2]*lbox[3] + rbox[2]*rbox[3] -interBoxS); 161 | } 162 | 163 | bool cmp(Yolo::Detection& a, Yolo::Detection& b) { 164 | return a.conf > b.conf; 165 | } 166 | 167 | void nms(std::vector& res, float *output, float conf_thresh, float nms_thresh = 0.5) { 168 | int det_size = sizeof(Yolo::Detection) / sizeof(float); 169 | std::map> m; 170 | for (int i = 0; i < output[0] && i < 1000; i++) { 171 | if (output[1 + det_size * i + 4] <= conf_thresh) continue; 172 | Yolo::Detection det; 173 | memcpy(&det, &output[1 + det_size * i], det_size * sizeof(float)); 174 | if (m.count(det.class_id) == 0) m.emplace(det.class_id, std::vector()); 175 | m[det.class_id].push_back(det); 176 | } 177 | for (auto it = m.begin(); it != m.end(); it++) { 178 | //std::cout << it->second[0].class_id << " --- " << std::endl; 179 | auto& dets = it->second; 180 | std::sort(dets.begin(), dets.end(), cmp); 181 | for (size_t m = 0; m < dets.size(); ++m) { 182 | auto& item = dets[m]; 183 | res.push_back(item); 184 | for (size_t n = m + 1; n < dets.size(); ++n) { 185 | if (iou(item.bbox, dets[n].bbox) > nms_thresh) { 186 | dets.erase(dets.begin()+n); 187 | --n; 188 | } 189 | } 190 | } 191 | } 192 | } 193 | 194 | // TensorRT weight files have a simple space delimited format: 195 | // [type] [size] 196 | std::map loadWeights(const std::string file) { 197 | std::cout << "Loading weights: " << file << std::endl; 198 | std::map weightMap; 199 | 200 | // Open weights file 201 | std::ifstream input(file); 202 | assert(input.is_open() && "Unable to load weight file."); 203 | 204 | // Read number of weight blobs 205 | int32_t count; 206 | input >> count; 207 | assert(count > 0 && "Invalid weight map file."); 208 | 209 | while (count--) 210 | { 211 | Weights wt{DataType::kFLOAT, nullptr, 0}; 212 | uint32_t size; 213 | 214 | // Read name and type of blob 215 | std::string name; 216 | input >> name >> std::dec >> size; 217 | wt.type = DataType::kFLOAT; 218 | 219 | // Load blob 220 | uint32_t* val = reinterpret_cast(malloc(sizeof(val) * size)); 221 | for (uint32_t x = 0, y = size; x < y; ++x) 222 | { 223 | input >> std::hex >> val[x]; 224 | } 225 | wt.values = val; 226 | 227 | wt.count = size; 228 | 
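        // Each hex token was written by gen_wts.py as struct.pack('>f', v).hex(), i.e. the
        // IEEE-754 bit pattern of the float; parsing it with std::hex into a uint32_t and
        // handing the buffer to TensorRT as kFLOAT therefore recovers the original values.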
weightMap[name] = wt; 229 | } 230 | 231 | return weightMap; 232 | } 233 | 234 | IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname, float eps) { 235 | float *gamma = (float*)weightMap[lname + ".weight"].values; 236 | float *beta = (float*)weightMap[lname + ".bias"].values; 237 | float *mean = (float*)weightMap[lname + ".running_mean"].values; 238 | float *var = (float*)weightMap[lname + ".running_var"].values; 239 | int len = weightMap[lname + ".running_var"].count; 240 | 241 | float *scval = reinterpret_cast(malloc(sizeof(float) * len)); 242 | for (int i = 0; i < len; i++) { 243 | scval[i] = gamma[i] / sqrt(var[i] + eps); 244 | } 245 | Weights scale{DataType::kFLOAT, scval, len}; 246 | 247 | float *shval = reinterpret_cast(malloc(sizeof(float) * len)); 248 | for (int i = 0; i < len; i++) { 249 | shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps); 250 | } 251 | Weights shift{DataType::kFLOAT, shval, len}; 252 | 253 | float *pval = reinterpret_cast(malloc(sizeof(float) * len)); 254 | for (int i = 0; i < len; i++) { 255 | pval[i] = 1.0; 256 | } 257 | Weights power{DataType::kFLOAT, pval, len}; 258 | 259 | weightMap[lname + ".scale"] = scale; 260 | weightMap[lname + ".shift"] = shift; 261 | weightMap[lname + ".power"] = power; 262 | IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power); 263 | assert(scale_1); 264 | return scale_1; 265 | } 266 | 267 | ILayer* convBnLeaky(INetworkDefinition *network, std::map& weightMap, ITensor& input, int outch, int ksize, int s, int g, std::string lname) { 268 | Weights emptywts{DataType::kFLOAT, nullptr, 0}; 269 | int p = ksize / 2; 270 | IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, DimsHW{ksize, ksize}, weightMap[lname + ".conv.weight"], emptywts); 271 | assert(conv1); 272 | conv1->setStrideNd(DimsHW{s, s}); 273 | conv1->setPaddingNd(DimsHW{p, p}); 274 | conv1->setNbGroups(g); 275 | //IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + ".bn", 1e-4); 276 | IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + ".bn", 1e-3); 277 | auto lr = network->addActivation(*bn1->getOutput(0), ActivationType::kLEAKY_RELU); 278 | lr->setAlpha(0.1); 279 | return lr; 280 | } 281 | 282 | ILayer* focus(INetworkDefinition *network, std::map& weightMap, ITensor& input, int inch, int outch, int ksize, std::string lname) { 283 | ISliceLayer *s1 = network->addSlice(input, Dims3{0, 0, 0}, Dims3{inch, Yolo::INPUT_H / 2, Yolo::INPUT_W / 2}, Dims3{1, 2, 2}); 284 | ISliceLayer *s2 = network->addSlice(input, Dims3{0, 1, 0}, Dims3{inch, Yolo::INPUT_H / 2, Yolo::INPUT_W / 2}, Dims3{1, 2, 2}); 285 | ISliceLayer *s3 = network->addSlice(input, Dims3{0, 0, 1}, Dims3{inch, Yolo::INPUT_H / 2, Yolo::INPUT_W / 2}, Dims3{1, 2, 2}); 286 | ISliceLayer *s4 = network->addSlice(input, Dims3{0, 1, 1}, Dims3{inch, Yolo::INPUT_H / 2, Yolo::INPUT_W / 2}, Dims3{1, 2, 2}); 287 | ITensor* inputTensors[] = {s1->getOutput(0), s2->getOutput(0), s3->getOutput(0), s4->getOutput(0)}; 288 | auto cat = network->addConcatenation(inputTensors, 4); 289 | auto conv = convBnLeaky(network, weightMap, *cat->getOutput(0), outch, ksize, 1, 1, lname + ".conv"); 290 | return conv; 291 | } 292 | 293 | ILayer* bottleneck(INetworkDefinition *network, std::map& weightMap, ITensor& input, int c1, int c2, bool shortcut, int g, float e, std::string lname) { 294 | auto cv1 = convBnLeaky(network, weightMap, input, (int)((float)c2 * e), 1, 1, 1, 
lname + ".cv1"); 295 | auto cv2 = convBnLeaky(network, weightMap, *cv1->getOutput(0), c2, 3, 1, g, lname + ".cv2"); 296 | if (shortcut && c1 == c2) { 297 | auto ew = network->addElementWise(input, *cv2->getOutput(0), ElementWiseOperation::kSUM); 298 | return ew; 299 | } 300 | return cv2; 301 | } 302 | 303 | ILayer* bottleneckCSP(INetworkDefinition *network, std::map& weightMap, ITensor& input, int c1, int c2, int n, bool shortcut, int g, float e, std::string lname) { 304 | Weights emptywts{DataType::kFLOAT, nullptr, 0}; 305 | int c_ = (int)((float)c2 * e); 306 | auto cv1 = convBnLeaky(network, weightMap, input, c_, 1, 1, 1, lname + ".cv1"); 307 | auto cv2 = network->addConvolutionNd(input, c_, DimsHW{1, 1}, weightMap[lname + ".cv2.weight"], emptywts); 308 | ITensor *y1 = cv1->getOutput(0); 309 | for (int i = 0; i < n; i++) { 310 | auto b = bottleneck(network, weightMap, *y1, c_, c_, shortcut, g, 1.0, lname + ".m." + std::to_string(i)); 311 | y1 = b->getOutput(0); 312 | } 313 | auto cv3 = network->addConvolutionNd(*y1, c_, DimsHW{1, 1}, weightMap[lname + ".cv3.weight"], emptywts); 314 | 315 | ITensor* inputTensors[] = {cv3->getOutput(0), cv2->getOutput(0)}; 316 | auto cat = network->addConcatenation(inputTensors, 2); 317 | 318 | IScaleLayer* bn = addBatchNorm2d(network, weightMap, *cat->getOutput(0), lname + ".bn", 1e-4); 319 | auto lr = network->addActivation(*bn->getOutput(0), ActivationType::kLEAKY_RELU); 320 | lr->setAlpha(0.1); 321 | 322 | auto cv4 = convBnLeaky(network, weightMap, *lr->getOutput(0), c2, 1, 1, 1, lname + ".cv4"); 323 | return cv4; 324 | } 325 | 326 | ILayer* SPP(INetworkDefinition *network, std::map& weightMap, ITensor& input, int c1, int c2, int k1, int k2, int k3, std::string lname) { 327 | int c_ = c1 / 2; 328 | auto cv1 = convBnLeaky(network, weightMap, input, c_, 1, 1, 1, lname + ".cv1"); 329 | 330 | auto pool1 = network->addPoolingNd(*cv1->getOutput(0), PoolingType::kMAX, DimsHW{k1, k1}); 331 | pool1->setPaddingNd(DimsHW{k1 / 2, k1 / 2}); 332 | pool1->setStrideNd(DimsHW{1, 1}); 333 | auto pool2 = network->addPoolingNd(*cv1->getOutput(0), PoolingType::kMAX, DimsHW{k2, k2}); 334 | pool2->setPaddingNd(DimsHW{k2 / 2, k2 / 2}); 335 | pool2->setStrideNd(DimsHW{1, 1}); 336 | auto pool3 = network->addPoolingNd(*cv1->getOutput(0), PoolingType::kMAX, DimsHW{k3, k3}); 337 | pool3->setPaddingNd(DimsHW{k3 / 2, k3 / 2}); 338 | pool3->setStrideNd(DimsHW{1, 1}); 339 | 340 | ITensor* inputTensors[] = {cv1->getOutput(0), pool1->getOutput(0), pool2->getOutput(0), pool3->getOutput(0)}; 341 | auto cat = network->addConcatenation(inputTensors, 4); 342 | 343 | auto cv2 = convBnLeaky(network, weightMap, *cat->getOutput(0), c2, 1, 1, 1, lname + ".cv2"); 344 | return cv2; 345 | } 346 | 347 | int read_files_in_dir(const char *p_dir_name, std::vector &file_names) { 348 | DIR *p_dir = opendir(p_dir_name); 349 | if (p_dir == nullptr) { 350 | return -1; 351 | } 352 | 353 | struct dirent* p_file = nullptr; 354 | while ((p_file = readdir(p_dir)) != nullptr) { 355 | if (strcmp(p_file->d_name, ".") != 0 && 356 | strcmp(p_file->d_name, "..") != 0) { 357 | //std::string cur_file_name(p_dir_name); 358 | //cur_file_name += "/"; 359 | //cur_file_name += p_file->d_name; 360 | std::string cur_file_name(p_file->d_name); 361 | file_names.push_back(cur_file_name); 362 | } 363 | } 364 | 365 | closedir(p_dir); 366 | return 0; 367 | } 368 | 369 | #endif 370 | 371 | -------------------------------------------------------------------------------- /yolov5s/gen_wts.py: 
-------------------------------------------------------------------------------- 1 | from utils.utils import * 2 | import struct 3 | 4 | # Initialize 5 | device = torch_utils.select_device('0') 6 | # Load model 7 | model = torch.load('weights/yolov5s.pt', map_location=device)['model'].float() # load to FP32 8 | model.to(device).eval() 9 | 10 | f = open('yolov5s.wts', 'w') 11 | f.write('{}\n'.format(len(model.state_dict().keys()))) 12 | for k, v in model.state_dict().items(): 13 | vr = v.reshape(-1).cpu().numpy() 14 | f.write('{} {} '.format(k, len(vr))) 15 | for vv in vr: 16 | f.write(' ') 17 | f.write(struct.pack('>f',float(vv)).hex()) 18 | f.write('\n') 19 | -------------------------------------------------------------------------------- /yolov5s/images/bus.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BaofengZan/yolov5_2.0-TensorRt/78193337dd94bf48c2d55c3a77105a534d11c45b/yolov5s/images/bus.jpg -------------------------------------------------------------------------------- /yolov5s/images/zidane.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BaofengZan/yolov5_2.0-TensorRt/78193337dd94bf48c2d55c3a77105a534d11c45b/yolov5s/images/zidane.jpg -------------------------------------------------------------------------------- /yolov5s/utils.h: -------------------------------------------------------------------------------- 1 | #ifndef __TRT_UTILS_H_ 2 | #define __TRT_UTILS_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #ifndef CUDA_CHECK 10 | 11 | #define CUDA_CHECK(callstr) \ 12 | { \ 13 | cudaError_t error_code = callstr; \ 14 | if (error_code != cudaSuccess) { \ 15 | std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \ 16 | assert(0); \ 17 | } \ 18 | } 19 | 20 | #endif 21 | 22 | namespace Tn 23 | { 24 | class Profiler : public nvinfer1::IProfiler 25 | { 26 | public: 27 | void printLayerTimes(int itrationsTimes) 28 | { 29 | float totalTime = 0; 30 | for (size_t i = 0; i < mProfile.size(); i++) 31 | { 32 | printf("%-40.40s %4.3fms\n", mProfile[i].first.c_str(), mProfile[i].second / itrationsTimes); 33 | totalTime += mProfile[i].second; 34 | } 35 | printf("Time over all layers: %4.3f\n", totalTime / itrationsTimes); 36 | } 37 | private: 38 | typedef std::pair Record; 39 | std::vector mProfile; 40 | 41 | virtual void reportLayerTime(const char* layerName, float ms) 42 | { 43 | auto record = std::find_if(mProfile.begin(), mProfile.end(), [&](const Record& r){ return r.first == layerName; }); 44 | if (record == mProfile.end()) 45 | mProfile.push_back(std::make_pair(layerName, ms)); 46 | else 47 | record->second += ms; 48 | } 49 | }; 50 | 51 | //Logger for TensorRT info/warning/errors 52 | class Logger : public nvinfer1::ILogger 53 | { 54 | public: 55 | 56 | Logger(): Logger(Severity::kWARNING) {} 57 | 58 | Logger(Severity severity): reportableSeverity(severity) {} 59 | 60 | void log(Severity severity, const char* msg) override 61 | { 62 | // suppress messages with severity enum value greater than the reportable 63 | if (severity > reportableSeverity) return; 64 | 65 | switch (severity) 66 | { 67 | case Severity::kINTERNAL_ERROR: std::cerr << "INTERNAL_ERROR: "; break; 68 | case Severity::kERROR: std::cerr << "ERROR: "; break; 69 | case Severity::kWARNING: std::cerr << "WARNING: "; break; 70 | case Severity::kINFO: std::cerr << "INFO: "; break; 71 | default: std::cerr << "UNKNOWN: "; break; 72 | } 73 | 
std::cerr << msg << std::endl; 74 | } 75 | 76 | Severity reportableSeverity{Severity::kWARNING}; 77 | }; 78 | 79 | template 80 | void write(char*& buffer, const T& val) 81 | { 82 | *reinterpret_cast(buffer) = val; 83 | buffer += sizeof(T); 84 | } 85 | 86 | template 87 | void read(const char*& buffer, T& val) 88 | { 89 | val = *reinterpret_cast(buffer); 90 | buffer += sizeof(T); 91 | } 92 | } 93 | 94 | #endif -------------------------------------------------------------------------------- /yolov5s/yololayer.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "yololayer.h" 3 | #include "utils.h" 4 | 5 | using namespace Yolo; 6 | 7 | namespace nvinfer1 8 | { 9 | YoloLayerPlugin::YoloLayerPlugin() 10 | { 11 | mClassCount = CLASS_NUM; 12 | mYoloKernel.clear(); 13 | mYoloKernel.push_back(yolo1); 14 | mYoloKernel.push_back(yolo2); 15 | mYoloKernel.push_back(yolo3); 16 | 17 | mKernelCount = mYoloKernel.size(); 18 | 19 | CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*))); 20 | size_t AnchorLen = sizeof(float)* CHECK_COUNT*2; 21 | for(int ii = 0; ii < mKernelCount; ii ++) 22 | { 23 | CUDA_CHECK(cudaMalloc(&mAnchor[ii],AnchorLen)); 24 | const auto& yolo = mYoloKernel[ii]; 25 | CUDA_CHECK(cudaMemcpy(mAnchor[ii], yolo.anchors, AnchorLen, cudaMemcpyHostToDevice)); 26 | } 27 | } 28 | 29 | YoloLayerPlugin::~YoloLayerPlugin() 30 | { 31 | } 32 | 33 | // create the plugin at runtime from a byte stream 34 | YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) 35 | { 36 | using namespace Tn; 37 | const char *d = reinterpret_cast(data), *a = d; 38 | read(d, mClassCount); 39 | read(d, mThreadCount); 40 | read(d, mKernelCount); 41 | mYoloKernel.resize(mKernelCount); 42 | auto kernelSize = mKernelCount*sizeof(YoloKernel); 43 | memcpy(mYoloKernel.data(),d,kernelSize); 44 | d += kernelSize; 45 | 46 | CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*))); 47 | size_t AnchorLen = sizeof(float)* CHECK_COUNT*2; 48 | for(int ii = 0; ii < mKernelCount; ii ++) 49 | { 50 | CUDA_CHECK(cudaMalloc(&mAnchor[ii],AnchorLen)); 51 | const auto& yolo = mYoloKernel[ii]; 52 | CUDA_CHECK(cudaMemcpy(mAnchor[ii], yolo.anchors, AnchorLen, cudaMemcpyHostToDevice)); 53 | } 54 | 55 | assert(d == a + length); 56 | } 57 | 58 | void YoloLayerPlugin::serialize(void* buffer) const 59 | { 60 | using namespace Tn; 61 | char* d = static_cast(buffer), *a = d; 62 | write(d, mClassCount); 63 | write(d, mThreadCount); 64 | write(d, mKernelCount); 65 | auto kernelSize = mKernelCount*sizeof(YoloKernel); 66 | memcpy(d,mYoloKernel.data(),kernelSize); 67 | d += kernelSize; 68 | 69 | assert(d == a + getSerializationSize()); 70 | } 71 | 72 | size_t YoloLayerPlugin::getSerializationSize() const 73 | { 74 | return sizeof(mClassCount) + sizeof(mThreadCount) + sizeof(mKernelCount) + sizeof(Yolo::YoloKernel) * mYoloKernel.size(); 75 | } 76 | 77 | int YoloLayerPlugin::initialize() 78 | { 79 | return 0; 80 | } 81 | 82 | Dims YoloLayerPlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims) 83 | { 84 | //output the result to channel 85 | int totalsize = MAX_OUTPUT_BBOX_COUNT * sizeof(Detection) / sizeof(float); 86 | 87 | return Dims3(totalsize + 1, 1, 1); 88 | } 89 | 90 | // Set plugin namespace 91 | void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace) 92 | { 93 | mPluginNamespace = pluginNamespace; 94 | } 95 | 96 | const char* YoloLayerPlugin::getPluginNamespace() const 97 | { 98 | return mPluginNamespace; 99 | } 100 | 101 | // 
Return the DataType of the plugin output at the requested index 102 | DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const 103 | { 104 | return DataType::kFLOAT; 105 | } 106 | 107 | // Return true if output tensor is broadcast across a batch. 108 | bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const 109 | { 110 | return false; 111 | } 112 | 113 | // Return true if plugin can use input that is broadcast across batch without replication. 114 | bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const 115 | { 116 | return false; 117 | } 118 | 119 | void YoloLayerPlugin::configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) 120 | { 121 | } 122 | 123 | // Attach the plugin object to an execution context and grant the plugin the access to some context resource. 124 | void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) 125 | { 126 | } 127 | 128 | // Detach the plugin object from its execution context. 129 | void YoloLayerPlugin::detachFromContext() {} 130 | 131 | const char* YoloLayerPlugin::getPluginType() const 132 | { 133 | return "YoloLayer_TRT"; 134 | } 135 | 136 | const char* YoloLayerPlugin::getPluginVersion() const 137 | { 138 | return "1"; 139 | } 140 | 141 | void YoloLayerPlugin::destroy() 142 | { 143 | delete this; 144 | } 145 | 146 | // Clone the plugin 147 | IPluginV2IOExt* YoloLayerPlugin::clone() const 148 | { 149 | YoloLayerPlugin *p = new YoloLayerPlugin(); 150 | p->setPluginNamespace(mPluginNamespace); 151 | return p; 152 | } 153 | 154 | __device__ float Logist(float data){ return 1.0f / (1.0f + expf(-data)); }; 155 | 156 | __global__ void CalDetection(const float *input, float *output,int noElements, 157 | int yoloWidth,int yoloHeight,const float anchors[CHECK_COUNT*2],int classes,int outputElem) { 158 | 159 | int idx = threadIdx.x + blockDim.x * blockIdx.x; 160 | if (idx >= noElements) return; 161 | 162 | int total_grid = yoloWidth * yoloHeight; 163 | int bnIdx = idx / total_grid; 164 | idx = idx - total_grid*bnIdx; 165 | int info_len_i = 5 + classes; 166 | const float* curInput = input + bnIdx * (info_len_i * total_grid * CHECK_COUNT); 167 | 168 | for (int k = 0; k < 3; ++k) { 169 | float box_prob = Logist(curInput[idx + k * info_len_i * total_grid + 4 * total_grid]); 170 | if (box_prob < IGNORE_THRESH) continue; 171 | int class_id = 0; 172 | float max_cls_prob = 0.0; 173 | for (int i = 5; i < info_len_i; ++i) { 174 | float p = Logist(curInput[idx + k * info_len_i * total_grid + i * total_grid]); 175 | if (p > max_cls_prob) { 176 | max_cls_prob = p; 177 | class_id = i - 5; 178 | } 179 | } 180 | float *res_count = output + bnIdx*outputElem; 181 | int count = (int)atomicAdd(res_count, 1); 182 | if (count >= MAX_OUTPUT_BBOX_COUNT) return; 183 | char* data = (char *)res_count + sizeof(float) + count * sizeof(Detection); 184 | Detection* det = (Detection*)(data); 185 | 186 | int row = idx / yoloWidth; 187 | int col = idx % yoloWidth; 188 | 189 | //Location 190 | det->bbox[0] = (col - 0.5f + 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 0 * total_grid])) * INPUT_W / yoloWidth; 191 | det->bbox[1] = (row - 0.5f + 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 1 * total_grid])) * INPUT_H / yoloHeight; 192 | det->bbox[2] = 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 2 * total_grid]); 193 | 
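        // YOLOv5 v2.0 box decode: x,y = (2*sigmoid(t) - 0.5 + grid) * stride (above);
        // w,h = (2*sigmoid(t))^2 * anchor, with the squaring and anchor scaling applied just below.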
det->bbox[2] = det->bbox[2] * det->bbox[2] * anchors[2*k]; 194 | det->bbox[3] = 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 3 * total_grid]); 195 | det->bbox[3] = det->bbox[3] * det->bbox[3] * anchors[2*k + 1]; 196 | det->conf = box_prob * max_cls_prob; 197 | det->class_id = class_id; 198 | } 199 | } 200 | 201 | void YoloLayerPlugin::forwardGpu(const float *const * inputs, float* output, cudaStream_t stream, int batchSize) { 202 | 203 | int outputElem = 1 + MAX_OUTPUT_BBOX_COUNT * sizeof(Detection) / sizeof(float); 204 | 205 | for(int idx = 0 ; idx < batchSize; ++idx) { 206 | CUDA_CHECK(cudaMemset(output + idx*outputElem, 0, sizeof(float))); 207 | } 208 | int numElem = 0; 209 | for (unsigned int i = 0; i < mYoloKernel.size(); ++i) 210 | { 211 | const auto& yolo = mYoloKernel[i]; 212 | numElem = yolo.width*yolo.height*batchSize; 213 | if (numElem < mThreadCount) 214 | mThreadCount = numElem; 215 | CalDetection<<< (yolo.width*yolo.height*batchSize + mThreadCount - 1) / mThreadCount, mThreadCount>>> 216 | (inputs[i], output, numElem, yolo.width, yolo.height, (float *)mAnchor[i], mClassCount, outputElem); 217 | } 218 | 219 | } 220 | 221 | 222 | int YoloLayerPlugin::enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) 223 | { 224 | forwardGpu((const float *const *)inputs, (float*)outputs[0], stream, batchSize); 225 | return 0; 226 | } 227 | 228 | PluginFieldCollection YoloPluginCreator::mFC{}; 229 | std::vector YoloPluginCreator::mPluginAttributes; 230 | 231 | YoloPluginCreator::YoloPluginCreator() 232 | { 233 | mPluginAttributes.clear(); 234 | 235 | mFC.nbFields = mPluginAttributes.size(); 236 | mFC.fields = mPluginAttributes.data(); 237 | } 238 | 239 | const char* YoloPluginCreator::getPluginName() const 240 | { 241 | return "YoloLayer_TRT"; 242 | } 243 | 244 | const char* YoloPluginCreator::getPluginVersion() const 245 | { 246 | return "1"; 247 | } 248 | 249 | const PluginFieldCollection* YoloPluginCreator::getFieldNames() 250 | { 251 | return &mFC; 252 | } 253 | 254 | IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) 255 | { 256 | YoloLayerPlugin* obj = new YoloLayerPlugin(); 257 | obj->setPluginNamespace(mNamespace.c_str()); 258 | return obj; 259 | } 260 | 261 | IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) 262 | { 263 | // This object will be deleted when the network is destroyed, which will 264 | // call MishPlugin::destroy() 265 | YoloLayerPlugin* obj = new YoloLayerPlugin(serialData, serialLength); 266 | obj->setPluginNamespace(mNamespace.c_str()); 267 | return obj; 268 | } 269 | 270 | } 271 | -------------------------------------------------------------------------------- /yolov5s/yololayer.h: -------------------------------------------------------------------------------- 1 | #ifndef _YOLO_LAYER_H 2 | #define _YOLO_LAYER_H 3 | 4 | #include 5 | #include 6 | #include "NvInfer.h" 7 | 8 | namespace Yolo 9 | { 10 | static constexpr int CHECK_COUNT = 3; 11 | static constexpr float IGNORE_THRESH = 0.1f; 12 | static constexpr int MAX_OUTPUT_BBOX_COUNT = 1000; 13 | static constexpr int CLASS_NUM = 80; 14 | static constexpr int INPUT_H = 608; 15 | static constexpr int INPUT_W = 608; 16 | 17 | struct YoloKernel 18 | { 19 | int width; 20 | int height; 21 | float anchors[CHECK_COUNT*2]; 22 | }; 23 | 24 | static constexpr YoloKernel yolo1 = { 25 | INPUT_W / 32, 26 | INPUT_H / 32, 27 | {116,90, 156,198, 373,326} 28 | 
}; 29 | static constexpr YoloKernel yolo2 = { 30 | INPUT_W / 16, 31 | INPUT_H / 16, 32 | {30,61, 62,45, 59,119} 33 | }; 34 | static constexpr YoloKernel yolo3 = { 35 | INPUT_W / 8, 36 | INPUT_H / 8, 37 | {10,13, 16,30, 33,23} 38 | }; 39 | 40 | static constexpr int LOCATIONS = 4; 41 | struct alignas(float) Detection{ 42 | //center_x center_y w h 43 | float bbox[LOCATIONS]; 44 | float conf; // bbox_conf * cls_conf 45 | float class_id; 46 | }; 47 | } 48 | 49 | namespace nvinfer1 50 | { 51 | class YoloLayerPlugin: public IPluginV2IOExt 52 | { 53 | public: 54 | explicit YoloLayerPlugin(); 55 | YoloLayerPlugin(const void* data, size_t length); 56 | 57 | ~YoloLayerPlugin(); 58 | 59 | int getNbOutputs() const override 60 | { 61 | return 1; 62 | } 63 | 64 | Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override; 65 | 66 | int initialize() override; 67 | 68 | virtual void terminate() override {}; 69 | 70 | virtual size_t getWorkspaceSize(int maxBatchSize) const override { return 0;} 71 | 72 | virtual int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override; 73 | 74 | virtual size_t getSerializationSize() const override; 75 | 76 | virtual void serialize(void* buffer) const override; 77 | 78 | bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const override { 79 | return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT; 80 | } 81 | 82 | const char* getPluginType() const override; 83 | 84 | const char* getPluginVersion() const override; 85 | 86 | void destroy() override; 87 | 88 | IPluginV2IOExt* clone() const override; 89 | 90 | void setPluginNamespace(const char* pluginNamespace) override; 91 | 92 | const char* getPluginNamespace() const override; 93 | 94 | DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const override; 95 | 96 | bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const override; 97 | 98 | bool canBroadcastInputAcrossBatch(int inputIndex) const override; 99 | 100 | void attachToContext( 101 | cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) override; 102 | 103 | void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) override; 104 | 105 | void detachFromContext() override; 106 | 107 | private: 108 | void forwardGpu(const float *const * inputs,float * output, cudaStream_t stream,int batchSize = 1); 109 | int mClassCount; 110 | int mKernelCount; 111 | std::vector mYoloKernel; 112 | int mThreadCount = 256; 113 | void** mAnchor; 114 | const char* mPluginNamespace; 115 | }; 116 | 117 | class YoloPluginCreator : public IPluginCreator 118 | { 119 | public: 120 | YoloPluginCreator(); 121 | 122 | ~YoloPluginCreator() override = default; 123 | 124 | const char* getPluginName() const override; 125 | 126 | const char* getPluginVersion() const override; 127 | 128 | const PluginFieldCollection* getFieldNames() override; 129 | 130 | IPluginV2IOExt* createPlugin(const char* name, const PluginFieldCollection* fc) override; 131 | 132 | IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) override; 133 | 134 | void setPluginNamespace(const char* libNamespace) override 135 | { 136 | mNamespace = libNamespace; 137 | } 138 | 139 | const char* getPluginNamespace() const override 140 | { 141 | return mNamespace.c_str(); 142 | 
} 143 | 144 | private: 145 | std::string mNamespace; 146 | static PluginFieldCollection mFC; 147 | static std::vector mPluginAttributes; 148 | }; 149 | 150 | 151 | 152 | }; 153 | 154 | #endif 155 | -------------------------------------------------------------------------------- /yolov5s/yolov5s.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "cuda_runtime_api.h" 4 | #include "logging.h" 5 | #include "common.hpp" 6 | 7 | #define USE_FP16 // comment out this if want to use FP32 8 | #define DEVICE 0 // GPU id 9 | #define NMS_THRESH 0.5 10 | #define CONF_THRESH 0.4 11 | #define BATCH_SIZE 1 12 | 13 | // stuff we know about the network and the input/output blobs 14 | static const int INPUT_H = Yolo::INPUT_H; 15 | static const int INPUT_W = Yolo::INPUT_W; 16 | static const int OUTPUT_SIZE = Yolo::MAX_OUTPUT_BBOX_COUNT * sizeof(Yolo::Detection) / sizeof(float) + 1; // we assume the yololayer outputs no more than 1000 boxes that conf >= 0.1 17 | const char* INPUT_BLOB_NAME = "data"; 18 | const char* OUTPUT_BLOB_NAME = "prob"; 19 | static Logger gLogger; 20 | REGISTER_TENSORRT_PLUGIN(YoloPluginCreator); 21 | 22 | // Creat the engine using only the API and not any parser. 23 | ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) { 24 | INetworkDefinition* network = builder->createNetworkV2(0U); 25 | 26 | // Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME 27 | ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W}); 28 | assert(data); 29 | 30 | std::map weightMap = loadWeights("../yolov5sV2.wts"); 31 | Weights emptywts{DataType::kFLOAT, nullptr, 0}; 32 | 33 | // yolov5 backbone 34 | auto focus0 = focus(network, weightMap, *data, 3, 32, 3, "model.0"); 35 | auto conv1 = convBnLeaky(network, weightMap, *focus0->getOutput(0), 64, 3, 2, 1, "model.1"); 36 | auto bottleneck_CSP2 = bottleneckCSP(network, weightMap, *conv1->getOutput(0), 64, 64, 1, true, 1, 0.5, "model.2"); 37 | auto conv3 = convBnLeaky(network, weightMap, *bottleneck_CSP2->getOutput(0), 128, 3, 2, 1, "model.3"); 38 | auto bottleneck_csp4 = bottleneckCSP(network, weightMap, *conv3->getOutput(0), 128, 128, 3, true, 1, 0.5, "model.4"); 39 | auto conv5 = convBnLeaky(network, weightMap, *bottleneck_csp4->getOutput(0), 256, 3, 2, 1, "model.5"); 40 | auto bottleneck_csp6 = bottleneckCSP(network, weightMap, *conv5->getOutput(0), 256, 256, 3, true, 1, 0.5, "model.6"); 41 | auto conv7 = convBnLeaky(network, weightMap, *bottleneck_csp6->getOutput(0), 512, 3, 2, 1, "model.7"); 42 | auto spp8 = SPP(network, weightMap, *conv7->getOutput(0), 512, 512, 5, 9, 13, "model.8"); 43 | 44 | // yolov5 head 45 | auto bottleneck_csp9 = bottleneckCSP(network, weightMap, *spp8->getOutput(0), 512, 512, 1, false, 1, 0.5, "model.9"); 46 | auto conv10 = convBnLeaky(network, weightMap, *bottleneck_csp9->getOutput(0), 256, 1, 1, 1, "model.10"); 47 | 48 | float *deval = reinterpret_cast(malloc(sizeof(float) * 256 * 2 * 2)); 49 | for (int i = 0; i < 256 * 2 * 2; i++) { 50 | deval[i] = 1.0; 51 | } 52 | Weights deconvwts11{DataType::kFLOAT, deval, 256 * 2 * 2}; 53 | IDeconvolutionLayer* deconv11 = network->addDeconvolutionNd(*conv10->getOutput(0), 256, DimsHW{2, 2}, deconvwts11, emptywts); 54 | deconv11->setStrideNd(DimsHW{2, 2}); 55 | deconv11->setNbGroups(256); 56 | weightMap["deconv11"] = deconvwts11; 57 | 58 | ITensor* inputTensors12[] = {deconv11->getOutput(0), 
bottleneck_csp6->getOutput(0)}; 59 | auto cat12 = network->addConcatenation(inputTensors12, 2); 60 | auto bottleneck_csp13 = bottleneckCSP(network, weightMap, *cat12->getOutput(0), 512, 256, 1, false, 1, 0.5, "model.13"); 61 | auto conv14 = convBnLeaky(network, weightMap, *bottleneck_csp13->getOutput(0), 128, 1, 1, 1, "model.14"); 62 | 63 | Weights deconvwts15{DataType::kFLOAT, deval, 128 * 2 * 2}; 64 | IDeconvolutionLayer* deconv15 = network->addDeconvolutionNd(*conv14->getOutput(0), 128, DimsHW{2, 2}, deconvwts15, emptywts); 65 | deconv15->setStrideNd(DimsHW{2, 2}); 66 | deconv15->setNbGroups(128); 67 | //weightMap["deconv15"] = deconvwts15; 68 | 69 | ITensor* inputTensors16[] = {deconv15->getOutput(0), bottleneck_csp4->getOutput(0)}; 70 | auto cat16 = network->addConcatenation(inputTensors16, 2); 71 | auto bottleneck_csp17 = bottleneckCSP(network, weightMap, *cat16->getOutput(0), 256, 128, 1, false, 1, 0.5, "model.17"); 72 | IConvolutionLayer* conv18 = network->addConvolutionNd(*bottleneck_csp17->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["model.24.m.0.weight"], weightMap["model.24.m.0.bias"]); 73 | 74 | auto conv19 = convBnLeaky(network, weightMap, *bottleneck_csp17->getOutput(0), 128, 3, 2, 1, "model.18"); 75 | ITensor* inputTensors20[] = {conv19->getOutput(0), conv14->getOutput(0)}; 76 | auto cat20 = network->addConcatenation(inputTensors20, 2); 77 | auto bottleneck_csp21 = bottleneckCSP(network, weightMap, *cat20->getOutput(0), 256, 256, 1, false, 1, 0.5, "model.20"); 78 | IConvolutionLayer* conv22 = network->addConvolutionNd(*bottleneck_csp21->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["model.24.m.1.weight"], weightMap["model.24.m.1.bias"]); 79 | 80 | auto conv23 = convBnLeaky(network, weightMap, *bottleneck_csp21->getOutput(0), 256, 3, 2, 1, "model.21"); 81 | ITensor* inputTensors24[] = {conv23->getOutput(0), conv10->getOutput(0)}; 82 | auto cat24 = network->addConcatenation(inputTensors24, 2); 83 | auto bottleneck_csp25 = bottleneckCSP(network, weightMap, *cat24->getOutput(0), 512, 512, 1, false, 1, 0.5, "model.23"); 84 | IConvolutionLayer* conv26 = network->addConvolutionNd(*bottleneck_csp25->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["model.24.m.2.weight"], weightMap["model.24.m.2.bias"]); 85 | 86 | auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1"); 87 | const PluginFieldCollection* pluginData = creator->getFieldNames(); 88 | IPluginV2 *pluginObj = creator->createPlugin("yololayer", pluginData); 89 | ITensor* inputTensors_yolo[] = {conv26->getOutput(0), conv22->getOutput(0), conv18->getOutput(0)}; 90 | auto yolo = network->addPluginV2(inputTensors_yolo, 3, *pluginObj); 91 | 92 | yolo->getOutput(0)->setName(OUTPUT_BLOB_NAME); 93 | network->markOutput(*yolo->getOutput(0)); 94 | 95 | // Build engine 96 | builder->setMaxBatchSize(maxBatchSize); 97 | config->setMaxWorkspaceSize(16 * (1 << 20)); // 16MB 98 | #ifdef USE_FP16 99 | config->setFlag(BuilderFlag::kFP16); 100 | #endif 101 | std::cout << "Building engine, please wait for a while..." << std::endl; 102 | ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); 103 | std::cout << "Build engine successfully!" 
<< std::endl; 104 | 105 | // Don't need the network any more 106 | network->destroy(); 107 | 108 | // Release host memory 109 | for (auto& mem : weightMap) 110 | { 111 | free((void*) (mem.second.values)); 112 | } 113 | 114 | return engine; 115 | } 116 | 117 | void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) { 118 | // Create builder 119 | IBuilder* builder = createInferBuilder(gLogger); 120 | IBuilderConfig* config = builder->createBuilderConfig(); 121 | 122 | // Create model to populate the network, then set the outputs and create an engine 123 | ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT); 124 | assert(engine != nullptr); 125 | 126 | // Serialize the engine 127 | (*modelStream) = engine->serialize(); 128 | 129 | // Close everything down 130 | engine->destroy(); 131 | builder->destroy(); 132 | } 133 | 134 | void doInference(IExecutionContext& context, float* input, float* output, int batchSize) { 135 | const ICudaEngine& engine = context.getEngine(); 136 | 137 | // Pointers to input and output device buffers to pass to engine. 138 | // Engine requires exactly IEngine::getNbBindings() number of buffers. 139 | assert(engine.getNbBindings() == 2); 140 | void* buffers[2]; 141 | 142 | // In order to bind the buffers, we need to know the names of the input and output tensors. 143 | // Note that indices are guaranteed to be less than IEngine::getNbBindings() 144 | const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME); 145 | const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME); 146 | 147 | // Create GPU buffers on device 148 | CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * INPUT_H * INPUT_W * sizeof(float))); 149 | CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float))); 150 | 151 | // Create stream 152 | cudaStream_t stream; 153 | CHECK(cudaStreamCreate(&stream)); 154 | 155 | // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host 156 | CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream)); 157 | context.enqueue(batchSize, buffers, stream, nullptr); 158 | CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream)); 159 | cudaStreamSynchronize(stream); 160 | 161 | // Release stream and buffers 162 | cudaStreamDestroy(stream); 163 | CHECK(cudaFree(buffers[inputIndex])); 164 | CHECK(cudaFree(buffers[outputIndex])); 165 | } 166 | 167 | int main(int argc, char** argv) { 168 | cudaSetDevice(DEVICE); 169 | // create a model using the API directly and serialize it to a stream 170 | char *trtModelStream{nullptr}; 171 | size_t size{0}; 172 | 173 | if (argc == 2 && std::string(argv[1]) == "-s") { 174 | IHostMemory* modelStream{nullptr}; 175 | APIToModel(BATCH_SIZE, &modelStream); 176 | assert(modelStream != nullptr); 177 | std::ofstream p("yolov5s.engine", std::ios::binary); 178 | if (!p) { 179 | std::cerr << "could not open plan output file" << std::endl; 180 | return -1; 181 | } 182 | p.write(reinterpret_cast(modelStream->data()), modelStream->size()); 183 | modelStream->destroy(); 184 | return 0; 185 | } else if (argc == 3 && std::string(argv[1]) == "-d") { 186 | std::ifstream file("yolov5s.engine", std::ios::binary); 187 | if (file.good()) { 188 | file.seekg(0, file.end); 189 | size = file.tellg(); 190 | file.seekg(0, file.beg); 191 | trtModelStream = new char[size]; 192 | assert(trtModelStream); 193 | 
file.read(trtModelStream, size); 194 | file.close(); 195 | } 196 | } else { 197 | std::cerr << "arguments not right!" << std::endl; 198 | std::cerr << "./yolov5s -s // serialize model to plan file" << std::endl; 199 | std::cerr << "./yolov5s -d ../samples // deserialize plan file and run inference" << std::endl; 200 | return -1; 201 | } 202 | 203 | std::vector file_names; 204 | if (read_files_in_dir(argv[2], file_names) < 0) { 205 | std::cout << "read_files_in_dir failed." << std::endl; 206 | return -1; 207 | } 208 | 209 | // prepare input data --------------------------- 210 | static float data[BATCH_SIZE * 3 * INPUT_H * INPUT_W]; 211 | //for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++) 212 | // data[i] = 1.0; 213 | static float prob[BATCH_SIZE * OUTPUT_SIZE]; 214 | IRuntime* runtime = createInferRuntime(gLogger); 215 | assert(runtime != nullptr); 216 | ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size); 217 | assert(engine != nullptr); 218 | IExecutionContext* context = engine->createExecutionContext(); 219 | assert(context != nullptr); 220 | delete[] trtModelStream; 221 | 222 | int fcount = 0; 223 | for (int f = 0; f < (int)file_names.size(); f++) { 224 | fcount++; 225 | if (fcount < BATCH_SIZE && f + 1 != (int)file_names.size()) continue; 226 | for (int b = 0; b < fcount; b++) { 227 | cv::Mat img = cv::imread(std::string(argv[2]) + "/" + file_names[f - fcount + 1 + b]); 228 | if (img.empty()) continue; 229 | cv::Mat pr_img = preprocess_img(img); 230 | int i = 0; 231 | for (int row = 0; row < INPUT_H; ++row) { 232 | uchar* uc_pixel = pr_img.data + row * pr_img.step; 233 | for (int col = 0; col < INPUT_W; ++col) { 234 | data[b * 3 * INPUT_H * INPUT_W + i] = uc_pixel[2] / 255; 235 | data[b * 3 * INPUT_H * INPUT_W + i + INPUT_H * INPUT_W] = uc_pixel[1] / 255.0; 236 | data[b * 3 * INPUT_H * INPUT_W + i + 2 * INPUT_H * INPUT_W] = uc_pixel[0] / 255.0; 237 | uc_pixel += 3; 238 | ++i; 239 | } 240 | } 241 | } 242 | 243 | // Run inference 244 | auto start = std::chrono::system_clock::now(); 245 | doInference(*context, data, prob, BATCH_SIZE); 246 | auto end = std::chrono::system_clock::now(); 247 | std::cout << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; 248 | std::vector> batch_res(fcount); 249 | for (int b = 0; b < fcount; b++) { 250 | auto& res = batch_res[b]; 251 | nms(res, &prob[b * OUTPUT_SIZE], CONF_THRESH, NMS_THRESH); 252 | } 253 | for (int b = 0; b < fcount; b++) { 254 | auto& res = batch_res[b]; 255 | //std::cout << res.size() << std::endl; 256 | cv::Mat img = cv::imread(std::string(argv[2]) + "/" + file_names[f - fcount + 1 + b]); 257 | for (size_t j = 0; j < res.size(); j++) { 258 | cv::Rect r = get_rect(img, res[j].bbox); 259 | cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2); 260 | cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2); 261 | } 262 | cv::imwrite("_" + file_names[f - fcount + 1 + b], img); 263 | } 264 | fcount = 0; 265 | } 266 | 267 | // Destroy the engine 268 | context->destroy(); 269 | engine->destroy(); 270 | runtime->destroy(); 271 | 272 | // Print histogram of the output distribution 273 | //std::cout << "\nOutput:\n\n"; 274 | //for (unsigned int i = 0; i < OUTPUT_SIZE; i++) 275 | //{ 276 | // std::cout << prob[i] << ", "; 277 | // if (i % 10 == 0) std::cout << std::endl; 278 | //} 279 | //std::cout << std::endl; 280 | 281 | return 0; 282 | } 283 | 
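For reference, the flat `prob` buffer that `nms()` (in `common.hpp` above) consumes is laid out by the YoloLayer plugin as a leading count followed by fixed-size records matching `Yolo::Detection` in `yololayer.h`. A minimal sketch of reading it directly, under that assumption (the helper name is illustrative, not part of the repo):

```
#include <cstring>
#include <vector>

// Illustrative decoder: prob[0] holds the number of candidate detections counted by
// the kernel (it can exceed the 1000 records actually stored, hence the cap below);
// each record is 6 floats {cx, cy, w, h, conf, class_id}, exactly as nms() reads them.
struct RawDet { float bbox[4]; float conf; float class_id; };

static std::vector<RawDet> decodeRaw(const float* prob, float conf_thresh) {
    std::vector<RawDet> kept;
    const int det_size = sizeof(RawDet) / sizeof(float);    // 6
    const int n = static_cast<int>(prob[0]);
    for (int i = 0; i < n && i < 1000; ++i) {               // same 1000 cap as nms()
        RawDet d;
        std::memcpy(&d, prob + 1 + i * det_size, sizeof(RawDet));
        if (d.conf > conf_thresh) kept.push_back(d);
    }
    return kept;
}
```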
-------------------------------------------------------------------------------- /yolov5x/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.6) 2 | 3 | project(yolov5) 4 | 5 | add_definitions(-std=c++11) 6 | 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 8 | set(CMAKE_CXX_STANDARD 11) 9 | set(CMAKE_BUILD_TYPE Debug) 10 | 11 | find_package(CUDA REQUIRED) 12 | 13 | set(CUDA_NVCC_PLAGS ${CUDA_NVCC_PLAGS};-std=c++11;-g;-G;-gencode;arch=compute_30;code=sm_30) 14 | 15 | include_directories(${PROJECT_SOURCE_DIR}/include) 16 | if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") 17 | message("embed_platform on") 18 | include_directories(/usr/local/cuda/targets/aarch64-linux/include) 19 | link_directories(/usr/local/cuda/targets/aarch64-linux/lib) 20 | else() 21 | message("embed_platform off") 22 | include_directories(/usr/local/cuda/include) 23 | link_directories(/usr/local/cuda/lib64) 24 | endif() 25 | 26 | 27 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED") 28 | 29 | cuda_add_library(yololayer SHARED ${PROJECT_SOURCE_DIR}/yololayer.cu) 30 | 31 | ########## opencv configuration ############ 32 | find_package(OpenCV 3.4 REQUIRED) 33 | message(OpenCV_LIBS) 34 | include_directories(OpenCV_INCLUDE_DIRS) 35 | 36 | add_executable(yolov5x ${PROJECT_SOURCE_DIR}/yolov5x.cpp) 37 | target_link_libraries(yolov5x nvinfer) 38 | target_link_libraries(yolov5x cudart) 39 | target_link_libraries(yolov5x yololayer) 40 | target_link_libraries(yolov5x ${OpenCV_LIBS}) 41 | 42 | add_definitions(-O2 -pthread) 43 | 44 | -------------------------------------------------------------------------------- /yolov5x/README.md: -------------------------------------------------------------------------------- 1 | # yolov5 2 | 3 | The Pytorch implementation is [ultralytics/yolov5](https://github.com/ultralytics/yolov5). 4 | 5 | I was using [ultralytics/yolov5](https://github.com/ultralytics/yolov5)(The latest version). Just in case the yolov5 model updated. 6 | 7 | ## How to Run 8 | 9 | ``` 10 | 1. generate yolov5x.wts from pytorch implementation with yolov5.pt 11 | 12 | git clone https://github.com/AIpakchoi/yolov5_tensorrt.git 13 | git clone https://github.com/ultralytics/yolov5.git 14 | // download its weights 'yolov5x.pt' 15 | cd yolov5 16 | cp ../yolov5_tensorrt/yolov5x/gen_wts.py . 17 | python gen_wts.py 18 | // a file 'yolov5x.wts' will be generated. 19 | 20 | 2. put yolov5x.wts into yolov5x, build and run 21 | 22 | mv yolov5x.wts ../yolov5_tensorrt/yolov5x/ 23 | cd ../yolov5_tensorrt/yolov5x 24 | mkdir build 25 | cd build 26 | cmake .. 27 | make 28 | sudo ./yolov5x -s // serialize model to plan file i.e. 'yolov5x.engine' 29 | sudo ./yolov5x -d ../samples // deserialize plan file and run inference, the images in samples will be processed. 30 | 31 | 3. check the images generated, as follows. _zidane.jpg and _bus.jpg 32 | ``` 33 | 34 |
35 | 36 | 37 | 38 | 39 | 40 |
41 | 42 | ## Config 43 | 44 | - Input shape defined in yololayer.h 45 | - Number of classes defined in yololayer.h 46 | - FP16/FP32 can be selected by the macro in yolov5x.cpp 47 | - GPU id can be selected by the macro in yolov5x.cpp 48 | - NMS thresh in yolov5x.cpp 49 | - BBox confidence thresh in yolov5x.cpp 50 | - Batch size in yolov5x.cpp 51 | -------------------------------------------------------------------------------- /yolov5x/common.hpp: -------------------------------------------------------------------------------- 1 | #ifndef YOLOV5_COMMON_H_ 2 | #define YOLOV5_COMMON_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "NvInfer.h" 11 | #include "yololayer.h" 12 | 13 | #define CHECK(status) \ 14 | do\ 15 | {\ 16 | auto ret = (status);\ 17 | if (ret != 0)\ 18 | {\ 19 | std::cerr << "Cuda failure: " << ret << std::endl;\ 20 | abort();\ 21 | }\ 22 | } while (0) 23 | 24 | using namespace nvinfer1; 25 | // resize 优化 26 | // 从原图的点映射到输出图像的像素点 27 | // 对outuput的每个点,先根据长宽比计算其在原图中最邻近的像素点, 28 | //然后直接根据最邻近的思想,直接拷贝Channel个字节作为输出图像 29 | void resizeByNN(uchar *input, uchar *output, int height_in, int width_in, int channels, int height_out, int width_out) { 30 | 31 | uchar *data_source = input; 32 | uchar *data_half = output; 33 | 34 | int bpl_source = width_in * 3; 35 | int bpl_dst = width_out * 3; 36 | 37 | int pos = 0; 38 | int sep = 0; 39 | uchar *sr = nullptr; 40 | uchar *hr = nullptr; 41 | float step = 0.0; 42 | float step_x = float(width_in) / float(width_out); 43 | float step_y = float(height_in) / float(height_out); 44 | 45 | for (int i = 0; i < height_out; i++) { 46 | for (int j = 0; j < width_out; j++) { 47 | sep = int(step_y*i); 48 | step = int(j*step_x); 49 | sr = data_source + sep * bpl_source; 50 | hr = data_half + i * bpl_dst + j * channels; 51 | pos = step * channels; 52 | memcpy(hr, sr + pos, channels); 53 | } 54 | } 55 | return; 56 | } 57 | 58 | cv::Mat preprocess_img(cv::Mat& img) { 59 | int w, h, x, y; 60 | float r_w = Yolo::INPUT_W / (img.cols*1.0); 61 | float r_h = Yolo::INPUT_H / (img.rows*1.0); 62 | if (r_h > r_w) { 63 | w = Yolo::INPUT_W; 64 | h = r_w * img.rows; 65 | x = 0; 66 | y = (Yolo::INPUT_H - h) / 2; 67 | } else { 68 | w = r_h* img.cols; 69 | h = Yolo::INPUT_H; 70 | x = (Yolo::INPUT_W - w) / 2; 71 | y = 0; 72 | } 73 | cv::Mat re(h, w, CV_8UC3); 74 | //cv::resize(img, re, re.size(), 0, 0, cv::INTER_CUBIC); 75 | auto start = std::chrono::system_clock::now(); 76 | cv::resize(img, re, re.size(), 0, 0, cv::INTER_LINEAR); 77 | //resizeByNN(img.data, re.data, img.rows, img.cols, img.channels(), re.rows, re.cols); 78 | auto end = std::chrono::system_clock::now(); 79 | std::cout << "img resize: " << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; 80 | 81 | cv::Mat out(Yolo::INPUT_H, Yolo::INPUT_W, CV_8UC3, cv::Scalar(128, 128, 128)); 82 | 83 | re.copyTo(out(cv::Rect(x, y, re.cols, re.rows))); 84 | return out; 85 | } 86 | 87 | cv::Rect get_rect(cv::Mat& img, float bbox[4]) { 88 | int l, r, t, b; 89 | float r_w = Yolo::INPUT_W / (img.cols * 1.0); 90 | float r_h = Yolo::INPUT_H / (img.rows * 1.0); 91 | if (r_h > r_w) { 92 | l = bbox[0] - bbox[2]/2.f; 93 | if (l < 0) 94 | { 95 | l = 0; 96 | } 97 | r = bbox[0] + bbox[2]/2.f; 98 | if (r > img.cols) 99 | { 100 | r = img.cols; 101 | } 102 | t = bbox[1] - bbox[3]/2.f - (Yolo::INPUT_H - r_w * img.rows) / 2; 103 | if (t < 0) 104 | { 105 | t = 0; 106 | } 107 | b = bbox[1] + bbox[3]/2.f - (Yolo::INPUT_H - r_w * img.rows) / 2; 108 | if (b > img.rows) 109 | { 110 | 
b = img.rows; 111 | } 112 | l = l / r_w; 113 | r = r / r_w; 114 | t = t / r_w; 115 | b = b / r_w; 116 | } else { 117 | l = bbox[0] - bbox[2]/2.f - (Yolo::INPUT_W - r_h * img.cols) / 2; 118 | if (l < 0) 119 | { 120 | l = 0; 121 | } 122 | r = bbox[0] + bbox[2]/2.f - (Yolo::INPUT_W - r_h * img.cols) / 2; 123 | if (r > img.cols) 124 | { 125 | r = img.cols; 126 | } 127 | t = bbox[1] - bbox[3]/2.f; 128 | if (t < 0) 129 | { 130 | t = 0; 131 | } 132 | b = bbox[1] + bbox[3]/2.f; 133 | if (b > img.rows) 134 | { 135 | b = img.rows; 136 | } 137 | l = l / r_h; 138 | r = r / r_h; 139 | t = t / r_h; 140 | b = b / r_h; 141 | } 142 | return cv::Rect(l, t, r-l, b-t); 143 | } 144 | 145 | // std::max vs. max 146 | //https://www.cnblogs.com/timesdaughter/p/5894930.html 147 | // Use (std::min) and (std::max) 148 | float iou(float lbox[4], float rbox[4]) { 149 | float interBox[] = { 150 | (std::max)(lbox[0] - lbox[2]/2.f , rbox[0] - rbox[2]/2.f), //left 151 | (std::min)(lbox[0] + lbox[2]/2.f , rbox[0] + rbox[2]/2.f), //right 152 | (std::max)(lbox[1] - lbox[3]/2.f , rbox[1] - rbox[3]/2.f), //top 153 | (std::min)(lbox[1] + lbox[3]/2.f , rbox[1] + rbox[3]/2.f), //bottom 154 | }; 155 | 156 | if(interBox[2] > interBox[3] || interBox[0] > interBox[1]) 157 | return 0.0f; 158 | 159 | float interBoxS =(interBox[1]-interBox[0])*(interBox[3]-interBox[2]); 160 | return interBoxS/(lbox[2]*lbox[3] + rbox[2]*rbox[3] -interBoxS); 161 | } 162 | 163 | bool cmp(Yolo::Detection& a, Yolo::Detection& b) { 164 | return a.conf > b.conf; 165 | } 166 | 167 | void nms(std::vector& res, float *output, float conf_thresh, float nms_thresh = 0.5) { 168 | int det_size = sizeof(Yolo::Detection) / sizeof(float); 169 | std::map> m; 170 | for (int i = 0; i < output[0] && i < 1000; i++) { 171 | if (output[1 + det_size * i + 4] <= conf_thresh) continue; 172 | Yolo::Detection det; 173 | memcpy(&det, &output[1 + det_size * i], det_size * sizeof(float)); 174 | if (m.count(det.class_id) == 0) m.emplace(det.class_id, std::vector()); 175 | m[det.class_id].push_back(det); 176 | } 177 | for (auto it = m.begin(); it != m.end(); it++) { 178 | //std::cout << it->second[0].class_id << " --- " << std::endl; 179 | auto& dets = it->second; 180 | std::sort(dets.begin(), dets.end(), cmp); 181 | for (size_t m = 0; m < dets.size(); ++m) { 182 | auto& item = dets[m]; 183 | res.push_back(item); 184 | for (size_t n = m + 1; n < dets.size(); ++n) { 185 | if (iou(item.bbox, dets[n].bbox) > nms_thresh) { 186 | dets.erase(dets.begin()+n); 187 | --n; 188 | } 189 | } 190 | } 191 | } 192 | } 193 | 194 | // TensorRT weight files have a simple space delimited format: 195 | // [type] [size] 196 | std::map loadWeights(const std::string file) { 197 | std::cout << "Loading weights: " << file << std::endl; 198 | std::map weightMap; 199 | 200 | // Open weights file 201 | std::ifstream input(file); 202 | assert(input.is_open() && "Unable to load weight file."); 203 | 204 | // Read number of weight blobs 205 | int32_t count; 206 | input >> count; 207 | assert(count > 0 && "Invalid weight map file."); 208 | 209 | while (count--) 210 | { 211 | Weights wt{DataType::kFLOAT, nullptr, 0}; 212 | uint32_t size; 213 | 214 | // Read name and type of blob 215 | std::string name; 216 | input >> name >> std::dec >> size; 217 | wt.type = DataType::kFLOAT; 218 | 219 | // Load blob 220 | uint32_t* val = reinterpret_cast(malloc(sizeof(val) * size)); 221 | for (uint32_t x = 0, y = size; x < y; ++x) 222 | { 223 | input >> std::hex >> val[x]; 224 | } 225 | wt.values = val; 226 | 227 | wt.count = size; 228 | 
weightMap[name] = wt; 229 | } 230 | 231 | return weightMap; 232 | } 233 | 234 | IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map& weightMap, ITensor& input, std::string lname, float eps) { 235 | float *gamma = (float*)weightMap[lname + ".weight"].values; 236 | float *beta = (float*)weightMap[lname + ".bias"].values; 237 | float *mean = (float*)weightMap[lname + ".running_mean"].values; 238 | float *var = (float*)weightMap[lname + ".running_var"].values; 239 | int len = weightMap[lname + ".running_var"].count; 240 | 241 | float *scval = reinterpret_cast(malloc(sizeof(float) * len)); 242 | for (int i = 0; i < len; i++) { 243 | scval[i] = gamma[i] / sqrt(var[i] + eps); 244 | } 245 | Weights scale{DataType::kFLOAT, scval, len}; 246 | 247 | float *shval = reinterpret_cast(malloc(sizeof(float) * len)); 248 | for (int i = 0; i < len; i++) { 249 | shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps); 250 | } 251 | Weights shift{DataType::kFLOAT, shval, len}; 252 | 253 | float *pval = reinterpret_cast(malloc(sizeof(float) * len)); 254 | for (int i = 0; i < len; i++) { 255 | pval[i] = 1.0; 256 | } 257 | Weights power{DataType::kFLOAT, pval, len}; 258 | 259 | weightMap[lname + ".scale"] = scale; 260 | weightMap[lname + ".shift"] = shift; 261 | weightMap[lname + ".power"] = power; 262 | IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power); 263 | assert(scale_1); 264 | return scale_1; 265 | } 266 | 267 | ILayer* convBnLeaky(INetworkDefinition *network, std::map& weightMap, ITensor& input, int outch, int ksize, int s, int g, std::string lname) { 268 | Weights emptywts{DataType::kFLOAT, nullptr, 0}; 269 | int p = ksize / 2; 270 | IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, DimsHW{ksize, ksize}, weightMap[lname + ".conv.weight"], emptywts); 271 | assert(conv1); 272 | conv1->setStrideNd(DimsHW{s, s}); 273 | conv1->setPaddingNd(DimsHW{p, p}); 274 | conv1->setNbGroups(g); 275 | //IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + ".bn", 1e-4); 276 | IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + ".bn", 1e-3); 277 | auto lr = network->addActivation(*bn1->getOutput(0), ActivationType::kLEAKY_RELU); 278 | lr->setAlpha(0.1); 279 | return lr; 280 | } 281 | 282 | ILayer* focus(INetworkDefinition *network, std::map& weightMap, ITensor& input, int inch, int outch, int ksize, std::string lname) { 283 | ISliceLayer *s1 = network->addSlice(input, Dims3{0, 0, 0}, Dims3{inch, Yolo::INPUT_H / 2, Yolo::INPUT_W / 2}, Dims3{1, 2, 2}); 284 | ISliceLayer *s2 = network->addSlice(input, Dims3{0, 1, 0}, Dims3{inch, Yolo::INPUT_H / 2, Yolo::INPUT_W / 2}, Dims3{1, 2, 2}); 285 | ISliceLayer *s3 = network->addSlice(input, Dims3{0, 0, 1}, Dims3{inch, Yolo::INPUT_H / 2, Yolo::INPUT_W / 2}, Dims3{1, 2, 2}); 286 | ISliceLayer *s4 = network->addSlice(input, Dims3{0, 1, 1}, Dims3{inch, Yolo::INPUT_H / 2, Yolo::INPUT_W / 2}, Dims3{1, 2, 2}); 287 | ITensor* inputTensors[] = {s1->getOutput(0), s2->getOutput(0), s3->getOutput(0), s4->getOutput(0)}; 288 | auto cat = network->addConcatenation(inputTensors, 4); 289 | auto conv = convBnLeaky(network, weightMap, *cat->getOutput(0), outch, ksize, 1, 1, lname + ".conv"); 290 | return conv; 291 | } 292 | 293 | ILayer* bottleneck(INetworkDefinition *network, std::map& weightMap, ITensor& input, int c1, int c2, bool shortcut, int g, float e, std::string lname) { 294 | auto cv1 = convBnLeaky(network, weightMap, input, (int)((float)c2 * e), 1, 1, 1, 
lname + ".cv1"); 295 | auto cv2 = convBnLeaky(network, weightMap, *cv1->getOutput(0), c2, 3, 1, g, lname + ".cv2"); 296 | if (shortcut && c1 == c2) { 297 | auto ew = network->addElementWise(input, *cv2->getOutput(0), ElementWiseOperation::kSUM); 298 | return ew; 299 | } 300 | return cv2; 301 | } 302 | 303 | ILayer* bottleneckCSP(INetworkDefinition *network, std::map& weightMap, ITensor& input, int c1, int c2, int n, bool shortcut, int g, float e, std::string lname) { 304 | Weights emptywts{DataType::kFLOAT, nullptr, 0}; 305 | int c_ = (int)((float)c2 * e); 306 | auto cv1 = convBnLeaky(network, weightMap, input, c_, 1, 1, 1, lname + ".cv1"); 307 | auto cv2 = network->addConvolutionNd(input, c_, DimsHW{1, 1}, weightMap[lname + ".cv2.weight"], emptywts); 308 | ITensor *y1 = cv1->getOutput(0); 309 | for (int i = 0; i < n; i++) { 310 | auto b = bottleneck(network, weightMap, *y1, c_, c_, shortcut, g, 1.0, lname + ".m." + std::to_string(i)); 311 | y1 = b->getOutput(0); 312 | } 313 | auto cv3 = network->addConvolutionNd(*y1, c_, DimsHW{1, 1}, weightMap[lname + ".cv3.weight"], emptywts); 314 | 315 | ITensor* inputTensors[] = {cv3->getOutput(0), cv2->getOutput(0)}; 316 | auto cat = network->addConcatenation(inputTensors, 2); 317 | 318 | IScaleLayer* bn = addBatchNorm2d(network, weightMap, *cat->getOutput(0), lname + ".bn", 1e-4); 319 | auto lr = network->addActivation(*bn->getOutput(0), ActivationType::kLEAKY_RELU); 320 | lr->setAlpha(0.1); 321 | 322 | auto cv4 = convBnLeaky(network, weightMap, *lr->getOutput(0), c2, 1, 1, 1, lname + ".cv4"); 323 | return cv4; 324 | } 325 | 326 | ILayer* SPP(INetworkDefinition *network, std::map& weightMap, ITensor& input, int c1, int c2, int k1, int k2, int k3, std::string lname) { 327 | int c_ = c1 / 2; 328 | auto cv1 = convBnLeaky(network, weightMap, input, c_, 1, 1, 1, lname + ".cv1"); 329 | 330 | auto pool1 = network->addPoolingNd(*cv1->getOutput(0), PoolingType::kMAX, DimsHW{k1, k1}); 331 | pool1->setPaddingNd(DimsHW{k1 / 2, k1 / 2}); 332 | pool1->setStrideNd(DimsHW{1, 1}); 333 | auto pool2 = network->addPoolingNd(*cv1->getOutput(0), PoolingType::kMAX, DimsHW{k2, k2}); 334 | pool2->setPaddingNd(DimsHW{k2 / 2, k2 / 2}); 335 | pool2->setStrideNd(DimsHW{1, 1}); 336 | auto pool3 = network->addPoolingNd(*cv1->getOutput(0), PoolingType::kMAX, DimsHW{k3, k3}); 337 | pool3->setPaddingNd(DimsHW{k3 / 2, k3 / 2}); 338 | pool3->setStrideNd(DimsHW{1, 1}); 339 | 340 | ITensor* inputTensors[] = {cv1->getOutput(0), pool1->getOutput(0), pool2->getOutput(0), pool3->getOutput(0)}; 341 | auto cat = network->addConcatenation(inputTensors, 4); 342 | 343 | auto cv2 = convBnLeaky(network, weightMap, *cat->getOutput(0), c2, 1, 1, 1, lname + ".cv2"); 344 | return cv2; 345 | } 346 | 347 | int read_files_in_dir(const char *p_dir_name, std::vector &file_names) { 348 | DIR *p_dir = opendir(p_dir_name); 349 | if (p_dir == nullptr) { 350 | return -1; 351 | } 352 | 353 | struct dirent* p_file = nullptr; 354 | while ((p_file = readdir(p_dir)) != nullptr) { 355 | if (strcmp(p_file->d_name, ".") != 0 && 356 | strcmp(p_file->d_name, "..") != 0) { 357 | //std::string cur_file_name(p_dir_name); 358 | //cur_file_name += "/"; 359 | //cur_file_name += p_file->d_name; 360 | std::string cur_file_name(p_file->d_name); 361 | file_names.push_back(cur_file_name); 362 | } 363 | } 364 | 365 | closedir(p_dir); 366 | return 0; 367 | } 368 | 369 | #endif 370 | 371 | -------------------------------------------------------------------------------- /yolov5x/gen_wts.py: 
-------------------------------------------------------------------------------- 1 | from utils.utils import * 2 | import struct 3 | 4 | # Initialize 5 | device = torch_utils.select_device('0') 6 | # Load model 7 | model = torch.load('weights/yolov5x.pt', map_location=device)['model'].float() # load to FP32 8 | model.to(device).eval() 9 | 10 | f = open('yolov5x.wts', 'w') 11 | f.write('{}\n'.format(len(model.state_dict().keys()))) 12 | for k, v in model.state_dict().items(): 13 | vr = v.reshape(-1).cpu().numpy() 14 | f.write('{} {} '.format(k, len(vr))) 15 | for vv in vr: 16 | f.write(' ') 17 | f.write(struct.pack('>f',float(vv)).hex()) 18 | f.write('\n') 19 | -------------------------------------------------------------------------------- /yolov5x/images/bus.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BaofengZan/yolov5_2.0-TensorRt/78193337dd94bf48c2d55c3a77105a534d11c45b/yolov5x/images/bus.jpg -------------------------------------------------------------------------------- /yolov5x/images/zidane.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BaofengZan/yolov5_2.0-TensorRt/78193337dd94bf48c2d55c3a77105a534d11c45b/yolov5x/images/zidane.jpg -------------------------------------------------------------------------------- /yolov5x/utils.h: -------------------------------------------------------------------------------- 1 | #ifndef __TRT_UTILS_H_ 2 | #define __TRT_UTILS_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #ifndef CUDA_CHECK 10 | 11 | #define CUDA_CHECK(callstr) \ 12 | { \ 13 | cudaError_t error_code = callstr; \ 14 | if (error_code != cudaSuccess) { \ 15 | std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \ 16 | assert(0); \ 17 | } \ 18 | } 19 | 20 | #endif 21 | 22 | namespace Tn 23 | { 24 | class Profiler : public nvinfer1::IProfiler 25 | { 26 | public: 27 | void printLayerTimes(int itrationsTimes) 28 | { 29 | float totalTime = 0; 30 | for (size_t i = 0; i < mProfile.size(); i++) 31 | { 32 | printf("%-40.40s %4.3fms\n", mProfile[i].first.c_str(), mProfile[i].second / itrationsTimes); 33 | totalTime += mProfile[i].second; 34 | } 35 | printf("Time over all layers: %4.3f\n", totalTime / itrationsTimes); 36 | } 37 | private: 38 | typedef std::pair Record; 39 | std::vector mProfile; 40 | 41 | virtual void reportLayerTime(const char* layerName, float ms) 42 | { 43 | auto record = std::find_if(mProfile.begin(), mProfile.end(), [&](const Record& r){ return r.first == layerName; }); 44 | if (record == mProfile.end()) 45 | mProfile.push_back(std::make_pair(layerName, ms)); 46 | else 47 | record->second += ms; 48 | } 49 | }; 50 | 51 | //Logger for TensorRT info/warning/errors 52 | class Logger : public nvinfer1::ILogger 53 | { 54 | public: 55 | 56 | Logger(): Logger(Severity::kWARNING) {} 57 | 58 | Logger(Severity severity): reportableSeverity(severity) {} 59 | 60 | void log(Severity severity, const char* msg) override 61 | { 62 | // suppress messages with severity enum value greater than the reportable 63 | if (severity > reportableSeverity) return; 64 | 65 | switch (severity) 66 | { 67 | case Severity::kINTERNAL_ERROR: std::cerr << "INTERNAL_ERROR: "; break; 68 | case Severity::kERROR: std::cerr << "ERROR: "; break; 69 | case Severity::kWARNING: std::cerr << "WARNING: "; break; 70 | case Severity::kINFO: std::cerr << "INFO: "; break; 71 | default: std::cerr << "UNKNOWN: "; break; 72 | } 73 | 
std::cerr << msg << std::endl; 74 | } 75 | 76 | Severity reportableSeverity{Severity::kWARNING}; 77 | }; 78 | 79 | template 80 | void write(char*& buffer, const T& val) 81 | { 82 | *reinterpret_cast(buffer) = val; 83 | buffer += sizeof(T); 84 | } 85 | 86 | template 87 | void read(const char*& buffer, T& val) 88 | { 89 | val = *reinterpret_cast(buffer); 90 | buffer += sizeof(T); 91 | } 92 | } 93 | 94 | #endif -------------------------------------------------------------------------------- /yolov5x/yololayer.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "yololayer.h" 3 | #include "utils.h" 4 | 5 | using namespace Yolo; 6 | 7 | namespace nvinfer1 8 | { 9 | YoloLayerPlugin::YoloLayerPlugin() 10 | { 11 | mClassCount = CLASS_NUM; 12 | mYoloKernel.clear(); 13 | mYoloKernel.push_back(yolo1); 14 | mYoloKernel.push_back(yolo2); 15 | mYoloKernel.push_back(yolo3); 16 | 17 | mKernelCount = mYoloKernel.size(); 18 | 19 | CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*))); 20 | size_t AnchorLen = sizeof(float)* CHECK_COUNT*2; 21 | for(int ii = 0; ii < mKernelCount; ii ++) 22 | { 23 | CUDA_CHECK(cudaMalloc(&mAnchor[ii],AnchorLen)); 24 | const auto& yolo = mYoloKernel[ii]; 25 | CUDA_CHECK(cudaMemcpy(mAnchor[ii], yolo.anchors, AnchorLen, cudaMemcpyHostToDevice)); 26 | } 27 | } 28 | 29 | YoloLayerPlugin::~YoloLayerPlugin() 30 | { 31 | } 32 | 33 | // create the plugin at runtime from a byte stream 34 | YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) 35 | { 36 | using namespace Tn; 37 | const char *d = reinterpret_cast(data), *a = d; 38 | read(d, mClassCount); 39 | read(d, mThreadCount); 40 | read(d, mKernelCount); 41 | mYoloKernel.resize(mKernelCount); 42 | auto kernelSize = mKernelCount*sizeof(YoloKernel); 43 | memcpy(mYoloKernel.data(),d,kernelSize); 44 | d += kernelSize; 45 | 46 | CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*))); 47 | size_t AnchorLen = sizeof(float)* CHECK_COUNT*2; 48 | for(int ii = 0; ii < mKernelCount; ii ++) 49 | { 50 | CUDA_CHECK(cudaMalloc(&mAnchor[ii],AnchorLen)); 51 | const auto& yolo = mYoloKernel[ii]; 52 | CUDA_CHECK(cudaMemcpy(mAnchor[ii], yolo.anchors, AnchorLen, cudaMemcpyHostToDevice)); 53 | } 54 | 55 | assert(d == a + length); 56 | } 57 | 58 | void YoloLayerPlugin::serialize(void* buffer) const 59 | { 60 | using namespace Tn; 61 | char* d = static_cast(buffer), *a = d; 62 | write(d, mClassCount); 63 | write(d, mThreadCount); 64 | write(d, mKernelCount); 65 | auto kernelSize = mKernelCount*sizeof(YoloKernel); 66 | memcpy(d,mYoloKernel.data(),kernelSize); 67 | d += kernelSize; 68 | 69 | assert(d == a + getSerializationSize()); 70 | } 71 | 72 | size_t YoloLayerPlugin::getSerializationSize() const 73 | { 74 | return sizeof(mClassCount) + sizeof(mThreadCount) + sizeof(mKernelCount) + sizeof(Yolo::YoloKernel) * mYoloKernel.size(); 75 | } 76 | 77 | int YoloLayerPlugin::initialize() 78 | { 79 | return 0; 80 | } 81 | 82 | Dims YoloLayerPlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims) 83 | { 84 | //output the result to channel 85 | int totalsize = MAX_OUTPUT_BBOX_COUNT * sizeof(Detection) / sizeof(float); 86 | 87 | return Dims3(totalsize + 1, 1, 1); 88 | } 89 | 90 | // Set plugin namespace 91 | void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace) 92 | { 93 | mPluginNamespace = pluginNamespace; 94 | } 95 | 96 | const char* YoloLayerPlugin::getPluginNamespace() const 97 | { 98 | return mPluginNamespace; 99 | } 100 | 101 | // 
Return the DataType of the plugin output at the requested index 102 | DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const 103 | { 104 | return DataType::kFLOAT; 105 | } 106 | 107 | // Return true if output tensor is broadcast across a batch. 108 | bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const 109 | { 110 | return false; 111 | } 112 | 113 | // Return true if plugin can use input that is broadcast across batch without replication. 114 | bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const 115 | { 116 | return false; 117 | } 118 | 119 | void YoloLayerPlugin::configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) 120 | { 121 | } 122 | 123 | // Attach the plugin object to an execution context and grant the plugin the access to some context resource. 124 | void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) 125 | { 126 | } 127 | 128 | // Detach the plugin object from its execution context. 129 | void YoloLayerPlugin::detachFromContext() {} 130 | 131 | const char* YoloLayerPlugin::getPluginType() const 132 | { 133 | return "YoloLayer_TRT"; 134 | } 135 | 136 | const char* YoloLayerPlugin::getPluginVersion() const 137 | { 138 | return "1"; 139 | } 140 | 141 | void YoloLayerPlugin::destroy() 142 | { 143 | delete this; 144 | } 145 | 146 | // Clone the plugin 147 | IPluginV2IOExt* YoloLayerPlugin::clone() const 148 | { 149 | YoloLayerPlugin *p = new YoloLayerPlugin(); 150 | p->setPluginNamespace(mPluginNamespace); 151 | return p; 152 | } 153 | 154 | __device__ float Logist(float data){ return 1.0f / (1.0f + expf(-data)); }; 155 | 156 | __global__ void CalDetection(const float *input, float *output,int noElements, 157 | int yoloWidth,int yoloHeight,const float anchors[CHECK_COUNT*2],int classes,int outputElem) { 158 | 159 | int idx = threadIdx.x + blockDim.x * blockIdx.x; 160 | if (idx >= noElements) return; 161 | 162 | int total_grid = yoloWidth * yoloHeight; 163 | int bnIdx = idx / total_grid; 164 | idx = idx - total_grid*bnIdx; 165 | int info_len_i = 5 + classes; 166 | const float* curInput = input + bnIdx * (info_len_i * total_grid * CHECK_COUNT); 167 | 168 | for (int k = 0; k < 3; ++k) { 169 | float box_prob = Logist(curInput[idx + k * info_len_i * total_grid + 4 * total_grid]); 170 | if (box_prob < IGNORE_THRESH) continue; 171 | int class_id = 0; 172 | float max_cls_prob = 0.0; 173 | for (int i = 5; i < info_len_i; ++i) { 174 | float p = Logist(curInput[idx + k * info_len_i * total_grid + i * total_grid]); 175 | if (p > max_cls_prob) { 176 | max_cls_prob = p; 177 | class_id = i - 5; 178 | } 179 | } 180 | float *res_count = output + bnIdx*outputElem; 181 | int count = (int)atomicAdd(res_count, 1); 182 | if (count >= MAX_OUTPUT_BBOX_COUNT) return; 183 | char* data = (char *)res_count + sizeof(float) + count * sizeof(Detection); 184 | Detection* det = (Detection*)(data); 185 | 186 | int row = idx / yoloWidth; 187 | int col = idx % yoloWidth; 188 | 189 | //Location 190 | det->bbox[0] = (col - 0.5f + 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 0 * total_grid])) * INPUT_W / yoloWidth; 191 | det->bbox[1] = (row - 0.5f + 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 1 * total_grid])) * INPUT_H / yoloHeight; 192 | det->bbox[2] = 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 2 * total_grid]); 193 | 
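        // yolov5 2.0 decodes box size as (2*sigmoid(t))^2 * anchor: square the value and scale it
        // by the k-th anchor's width here (and by its height for bbox[3] below).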
det->bbox[2] = det->bbox[2] * det->bbox[2] * anchors[2*k]; 194 | det->bbox[3] = 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 3 * total_grid]); 195 | det->bbox[3] = det->bbox[3] * det->bbox[3] * anchors[2*k + 1]; 196 | det->conf = box_prob * max_cls_prob; 197 | det->class_id = class_id; 198 | } 199 | } 200 | 201 | void YoloLayerPlugin::forwardGpu(const float *const * inputs, float* output, cudaStream_t stream, int batchSize) { 202 | 203 | int outputElem = 1 + MAX_OUTPUT_BBOX_COUNT * sizeof(Detection) / sizeof(float); 204 | 205 | for(int idx = 0 ; idx < batchSize; ++idx) { 206 | CUDA_CHECK(cudaMemset(output + idx*outputElem, 0, sizeof(float))); 207 | } 208 | int numElem = 0; 209 | for (unsigned int i = 0; i < mYoloKernel.size(); ++i) 210 | { 211 | const auto& yolo = mYoloKernel[i]; 212 | numElem = yolo.width*yolo.height*batchSize; 213 | if (numElem < mThreadCount) 214 | mThreadCount = numElem; 215 | CalDetection<<< (yolo.width*yolo.height*batchSize + mThreadCount - 1) / mThreadCount, mThreadCount>>> 216 | (inputs[i], output, numElem, yolo.width, yolo.height, (float *)mAnchor[i], mClassCount, outputElem); 217 | } 218 | 219 | } 220 | 221 | 222 | int YoloLayerPlugin::enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) 223 | { 224 | forwardGpu((const float *const *)inputs, (float*)outputs[0], stream, batchSize); 225 | return 0; 226 | } 227 | 228 | PluginFieldCollection YoloPluginCreator::mFC{}; 229 | std::vector YoloPluginCreator::mPluginAttributes; 230 | 231 | YoloPluginCreator::YoloPluginCreator() 232 | { 233 | mPluginAttributes.clear(); 234 | 235 | mFC.nbFields = mPluginAttributes.size(); 236 | mFC.fields = mPluginAttributes.data(); 237 | } 238 | 239 | const char* YoloPluginCreator::getPluginName() const 240 | { 241 | return "YoloLayer_TRT"; 242 | } 243 | 244 | const char* YoloPluginCreator::getPluginVersion() const 245 | { 246 | return "1"; 247 | } 248 | 249 | const PluginFieldCollection* YoloPluginCreator::getFieldNames() 250 | { 251 | return &mFC; 252 | } 253 | 254 | IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) 255 | { 256 | YoloLayerPlugin* obj = new YoloLayerPlugin(); 257 | obj->setPluginNamespace(mNamespace.c_str()); 258 | return obj; 259 | } 260 | 261 | IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) 262 | { 263 | // This object will be deleted when the network is destroyed, which will 264 | // call MishPlugin::destroy() 265 | YoloLayerPlugin* obj = new YoloLayerPlugin(serialData, serialLength); 266 | obj->setPluginNamespace(mNamespace.c_str()); 267 | return obj; 268 | } 269 | 270 | } 271 | -------------------------------------------------------------------------------- /yolov5x/yololayer.h: -------------------------------------------------------------------------------- 1 | #ifndef _YOLO_LAYER_H 2 | #define _YOLO_LAYER_H 3 | 4 | #include 5 | #include 6 | #include "NvInfer.h" 7 | 8 | namespace Yolo 9 | { 10 | static constexpr int CHECK_COUNT = 3; 11 | static constexpr float IGNORE_THRESH = 0.1f; 12 | static constexpr int MAX_OUTPUT_BBOX_COUNT = 1000; 13 | static constexpr int CLASS_NUM = 80; 14 | static constexpr int INPUT_H = 608; 15 | static constexpr int INPUT_W = 608; 16 | 17 | struct YoloKernel 18 | { 19 | int width; 20 | int height; 21 | float anchors[CHECK_COUNT*2]; 22 | }; 23 | 24 | static constexpr YoloKernel yolo1 = { 25 | INPUT_W / 32, 26 | INPUT_H / 32, 27 | {116,90, 156,198, 373,326} 28 | 
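        // (w,h) anchor pairs in input-resolution pixels for the coarsest (stride 32) detection head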
}; 29 | static constexpr YoloKernel yolo2 = { 30 | INPUT_W / 16, 31 | INPUT_H / 16, 32 | {30,61, 62,45, 59,119} 33 | }; 34 | static constexpr YoloKernel yolo3 = { 35 | INPUT_W / 8, 36 | INPUT_H / 8, 37 | {10,13, 16,30, 33,23} 38 | }; 39 | 40 | static constexpr int LOCATIONS = 4; 41 | struct alignas(float) Detection{ 42 | //center_x center_y w h 43 | float bbox[LOCATIONS]; 44 | float conf; // bbox_conf * cls_conf 45 | float class_id; 46 | }; 47 | } 48 | 49 | namespace nvinfer1 50 | { 51 | class YoloLayerPlugin: public IPluginV2IOExt 52 | { 53 | public: 54 | explicit YoloLayerPlugin(); 55 | YoloLayerPlugin(const void* data, size_t length); 56 | 57 | ~YoloLayerPlugin(); 58 | 59 | int getNbOutputs() const override 60 | { 61 | return 1; 62 | } 63 | 64 | Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override; 65 | 66 | int initialize() override; 67 | 68 | virtual void terminate() override {}; 69 | 70 | virtual size_t getWorkspaceSize(int maxBatchSize) const override { return 0;} 71 | 72 | virtual int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override; 73 | 74 | virtual size_t getSerializationSize() const override; 75 | 76 | virtual void serialize(void* buffer) const override; 77 | 78 | bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const override { 79 | return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT; 80 | } 81 | 82 | const char* getPluginType() const override; 83 | 84 | const char* getPluginVersion() const override; 85 | 86 | void destroy() override; 87 | 88 | IPluginV2IOExt* clone() const override; 89 | 90 | void setPluginNamespace(const char* pluginNamespace) override; 91 | 92 | const char* getPluginNamespace() const override; 93 | 94 | DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const override; 95 | 96 | bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const override; 97 | 98 | bool canBroadcastInputAcrossBatch(int inputIndex) const override; 99 | 100 | void attachToContext( 101 | cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) override; 102 | 103 | void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) override; 104 | 105 | void detachFromContext() override; 106 | 107 | private: 108 | void forwardGpu(const float *const * inputs,float * output, cudaStream_t stream,int batchSize = 1); 109 | int mClassCount; 110 | int mKernelCount; 111 | std::vector mYoloKernel; 112 | int mThreadCount = 256; 113 | void** mAnchor; 114 | const char* mPluginNamespace; 115 | }; 116 | 117 | class YoloPluginCreator : public IPluginCreator 118 | { 119 | public: 120 | YoloPluginCreator(); 121 | 122 | ~YoloPluginCreator() override = default; 123 | 124 | const char* getPluginName() const override; 125 | 126 | const char* getPluginVersion() const override; 127 | 128 | const PluginFieldCollection* getFieldNames() override; 129 | 130 | IPluginV2IOExt* createPlugin(const char* name, const PluginFieldCollection* fc) override; 131 | 132 | IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) override; 133 | 134 | void setPluginNamespace(const char* libNamespace) override 135 | { 136 | mNamespace = libNamespace; 137 | } 138 | 139 | const char* getPluginNamespace() const override 140 | { 141 | return mNamespace.c_str(); 142 | 
} 143 | 144 | private: 145 | std::string mNamespace; 146 | static PluginFieldCollection mFC; 147 | static std::vector mPluginAttributes; 148 | }; 149 | 150 | 151 | 152 | }; 153 | 154 | #endif 155 | -------------------------------------------------------------------------------- /yolov5x/yolov5x.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "cuda_runtime_api.h" 4 | #include "logging.h" 5 | #include "common.hpp" 6 | 7 | #define USE_FP16 // comment out this if want to use FP32 8 | #define DEVICE 0 // GPU id 9 | #define NMS_THRESH 0.5 10 | #define CONF_THRESH 0.25 11 | #define BATCH_SIZE 1 12 | 13 | // stuff we know about the network and the input/output blobs 14 | static const int INPUT_H = Yolo::INPUT_H; 15 | static const int INPUT_W = Yolo::INPUT_W; 16 | static const int OUTPUT_SIZE = Yolo::MAX_OUTPUT_BBOX_COUNT * sizeof(Yolo::Detection) / sizeof(float) + 1; // we assume the yololayer outputs no more than 1000 boxes that conf >= 0.1 17 | const char* INPUT_BLOB_NAME = "data"; 18 | const char* OUTPUT_BLOB_NAME = "prob"; 19 | static Logger gLogger; 20 | REGISTER_TENSORRT_PLUGIN(YoloPluginCreator); 21 | 22 | // Creat the engine using only the API and not any parser. 23 | ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) { 24 | INetworkDefinition* network = builder->createNetworkV2(0U); 25 | 26 | // Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME 27 | ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W}); 28 | assert(data); 29 | 30 | std::map weightMap = loadWeights("../yolov5x.wts"); 31 | Weights emptywts{DataType::kFLOAT, nullptr, 0}; 32 | 33 | /* ------ yolov5 backbone------ */ 34 | auto focus0 = focus(network, weightMap, *data, 3, 80, 3, "model.0"); 35 | auto conv1 = convBnLeaky(network, weightMap, *focus0->getOutput(0), 160, 3, 2, 1, "model.1"); 36 | auto bottleneck_CSP2 = bottleneckCSP(network, weightMap, *conv1->getOutput(0), 160, 160, 4, true, 1, 0.5, "model.2"); 37 | auto conv3 = convBnLeaky(network, weightMap, *bottleneck_CSP2->getOutput(0), 320, 3, 2, 1, "model.3"); 38 | auto bottleneck_csp4 = bottleneckCSP(network, weightMap, *conv3->getOutput(0), 320, 320, 12, true, 1, 0.5, "model.4"); 39 | auto conv5 = convBnLeaky(network, weightMap, *bottleneck_csp4->getOutput(0), 640, 3, 2, 1, "model.5"); 40 | auto bottleneck_csp6 = bottleneckCSP(network, weightMap, *conv5->getOutput(0), 640, 640, 12, true, 1, 0.5, "model.6"); 41 | auto conv7 = convBnLeaky(network, weightMap, *bottleneck_csp6->getOutput(0), 1280, 3, 2, 1, "model.7"); 42 | auto spp8 = SPP(network, weightMap, *conv7->getOutput(0), 1280, 1280, 5, 9, 13, "model.8"); 43 | 44 | /* ------- yolov5 head ------- */ 45 | auto bottleneck_csp9 = bottleneckCSP(network, weightMap, *spp8->getOutput(0), 1280, 1280, 4, false, 1, 0.5, "model.9"); 46 | auto conv10 = convBnLeaky(network, weightMap, *bottleneck_csp9->getOutput(0), 640, 1, 1, 1, "model.10"); 47 | 48 | float *deval = reinterpret_cast(malloc(sizeof(float) * 640 * 2 * 2)); 49 | for (int i = 0; i < 640 * 2 * 2; i++) { 50 | deval[i] = 1.0; 51 | } 52 | Weights deconvwts11{DataType::kFLOAT, deval, 640 * 2 * 2}; 53 | IDeconvolutionLayer* deconv11 = network->addDeconvolutionNd(*conv10->getOutput(0), 640, DimsHW{2, 2}, deconvwts11, emptywts); 54 | deconv11->setStrideNd(DimsHW{2, 2}); 55 | deconv11->setNbGroups(640); 56 | weightMap["deconv11"] = deconvwts11; 57 | 58 | ITensor* inputTensors12[] = 
{deconv11->getOutput(0), bottleneck_csp6->getOutput(0)}; 59 | auto cat12 = network->addConcatenation(inputTensors12, 2); 60 | 61 | auto bottleneck_csp13 = bottleneckCSP(network, weightMap, *cat12->getOutput(0), 1280, 640, 4, false, 1, 0.5, "model.13"); 62 | auto conv14 = convBnLeaky(network, weightMap, *bottleneck_csp13->getOutput(0), 320, 1, 1, 1, "model.14"); 63 | 64 | Weights deconvwts15{DataType::kFLOAT, deval, 320 * 2 * 2}; 65 | IDeconvolutionLayer* deconv15 = network->addDeconvolutionNd(*conv14->getOutput(0), 320, DimsHW{2, 2}, deconvwts15, emptywts); 66 | deconv15->setStrideNd(DimsHW{2, 2}); 67 | deconv15->setNbGroups(320); 68 | ITensor* inputTensors16[] = {deconv15->getOutput(0), bottleneck_csp4->getOutput(0)}; 69 | auto cat16 = network->addConcatenation(inputTensors16, 2); 70 | 71 | auto bottleneck_csp17 = bottleneckCSP(network, weightMap, *cat16->getOutput(0), 640, 320, 4, false, 1, 0.5, "model.17"); 72 | 73 | // yolo layer 1 74 | IConvolutionLayer* conv18 = network->addConvolutionNd(*bottleneck_csp17->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["model.24.m.0.weight"], weightMap["model.24.m.0.bias"]); 75 | 76 | auto conv19 = convBnLeaky(network, weightMap, *bottleneck_csp17->getOutput(0), 320, 3, 2, 1, "model.18"); 77 | 78 | ITensor* inputTensors20[] = {conv19->getOutput(0), conv14->getOutput(0)}; 79 | auto cat20 = network->addConcatenation(inputTensors20, 2); 80 | 81 | auto bottleneck_csp21 = bottleneckCSP(network, weightMap, *cat20->getOutput(0), 640, 640, 4, false, 1, 0.5, "model.20"); 82 | 83 | // yolo layer 2 84 | IConvolutionLayer* conv22 = network->addConvolutionNd(*bottleneck_csp21->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["model.24.m.1.weight"], weightMap["model.24.m.1.bias"]); 85 | 86 | auto conv23 = convBnLeaky(network, weightMap, *bottleneck_csp21->getOutput(0), 640, 3, 2, 1, "model.21"); 87 | 88 | ITensor* inputTensors24[] = {conv23->getOutput(0), conv10->getOutput(0)}; 89 | auto cat24 = network->addConcatenation(inputTensors24, 2); 90 | 91 | auto bottleneck_csp25 = bottleneckCSP(network, weightMap, *cat24->getOutput(0), 1280, 1280, 4, false, 1, 0.5, "model.23"); 92 | 93 | // yolo layer 3 94 | IConvolutionLayer* conv26 = network->addConvolutionNd(*bottleneck_csp25->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["model.24.m.2.weight"], weightMap["model.24.m.2.bias"]); 95 | 96 | auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1"); 97 | const PluginFieldCollection* pluginData = creator->getFieldNames(); 98 | IPluginV2 *pluginObj = creator->createPlugin("yololayer", pluginData); 99 | ITensor* inputTensors_yolo[] = {conv26->getOutput(0), conv22->getOutput(0), conv18->getOutput(0)}; 100 | auto yolo = network->addPluginV2(inputTensors_yolo, 3, *pluginObj); 101 | 102 | yolo->getOutput(0)->setName(OUTPUT_BLOB_NAME); 103 | network->markOutput(*yolo->getOutput(0)); 104 | 105 | // Build engine 106 | builder->setMaxBatchSize(maxBatchSize); 107 | config->setMaxWorkspaceSize(16 * (1 << 20)); // 16MB 108 | #ifdef USE_FP16 109 | config->setFlag(BuilderFlag::kFP16); 110 | #endif 111 | std::cout << "Building engine, please wait for a while..." << std::endl; 112 | ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); 113 | std::cout << "Build engine successfully!" 
<< std::endl; 114 | 115 | // Don't need the network any more 116 | network->destroy(); 117 | 118 | // Release host memory 119 | for (auto& mem : weightMap) 120 | { 121 | free((void*) (mem.second.values)); 122 | } 123 | 124 | return engine; 125 | } 126 | 127 | void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) { 128 | // Create builder 129 | IBuilder* builder = createInferBuilder(gLogger); 130 | IBuilderConfig* config = builder->createBuilderConfig(); 131 | 132 | // Create model to populate the network, then set the outputs and create an engine 133 | ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT); 134 | assert(engine != nullptr); 135 | 136 | // Serialize the engine 137 | (*modelStream) = engine->serialize(); 138 | 139 | // Close everything down 140 | engine->destroy(); 141 | builder->destroy(); 142 | } 143 | 144 | void doInference(IExecutionContext& context, float* input, float* output, int batchSize) { 145 | const ICudaEngine& engine = context.getEngine(); 146 | 147 | // Pointers to input and output device buffers to pass to engine. 148 | // Engine requires exactly IEngine::getNbBindings() number of buffers. 149 | assert(engine.getNbBindings() == 2); 150 | void* buffers[2]; 151 | 152 | // In order to bind the buffers, we need to know the names of the input and output tensors. 153 | // Note that indices are guaranteed to be less than IEngine::getNbBindings() 154 | const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME); 155 | const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME); 156 | 157 | // Create GPU buffers on device 158 | CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * INPUT_H * INPUT_W * sizeof(float))); 159 | CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float))); 160 | 161 | // Create stream 162 | cudaStream_t stream; 163 | CHECK(cudaStreamCreate(&stream)); 164 | 165 | // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host 166 | CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream)); 167 | context.enqueue(batchSize, buffers, stream, nullptr); 168 | CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream)); 169 | cudaStreamSynchronize(stream); 170 | 171 | // Release stream and buffers 172 | cudaStreamDestroy(stream); 173 | CHECK(cudaFree(buffers[inputIndex])); 174 | CHECK(cudaFree(buffers[outputIndex])); 175 | } 176 | 177 | int main(int argc, char** argv) { 178 | cudaSetDevice(DEVICE); 179 | // create a model using the API directly and serialize it to a stream 180 | char *trtModelStream{nullptr}; 181 | size_t size{0}; 182 | 183 | if (argc == 2 && std::string(argv[1]) == "-s") { 184 | IHostMemory* modelStream{nullptr}; 185 | APIToModel(BATCH_SIZE, &modelStream); 186 | assert(modelStream != nullptr); 187 | std::ofstream p("yolov5x.engine", std::ios::binary); 188 | if (!p) { 189 | std::cerr << "could not open plan output file" << std::endl; 190 | return -1; 191 | } 192 | p.write(reinterpret_cast(modelStream->data()), modelStream->size()); 193 | modelStream->destroy(); 194 | return 0; 195 | } else if (argc == 3 && std::string(argv[1]) == "-d") { 196 | std::ifstream file("yolov5x.engine", std::ios::binary); 197 | if (file.good()) { 198 | file.seekg(0, file.end); 199 | size = file.tellg(); 200 | file.seekg(0, file.beg); 201 | trtModelStream = new char[size]; 202 | assert(trtModelStream); 203 | 
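            // read the whole serialized engine into host memory; it is passed to runtime->deserializeCudaEngine() further down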
file.read(trtModelStream, size); 204 | file.close(); 205 | } 206 | } else { 207 | std::cerr << "arguments not right!" << std::endl; 208 | std::cerr << "./yolov5x -s // serialize model to plan file" << std::endl; 209 | std::cerr << "./yolov5x -d ../samples // deserialize plan file and run inference" << std::endl; 210 | return -1; 211 | } 212 | 213 | std::vector file_names; 214 | if (read_files_in_dir(argv[2], file_names) < 0) { 215 | std::cout << "read_files_in_dir failed." << std::endl; 216 | return -1; 217 | } 218 | 219 | // prepare input data --------------------------- 220 | static float data[BATCH_SIZE * 3 * INPUT_H * INPUT_W]; 221 | //for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++) 222 | // data[i] = 1.0; 223 | static float prob[BATCH_SIZE * OUTPUT_SIZE]; 224 | IRuntime* runtime = createInferRuntime(gLogger); 225 | assert(runtime != nullptr); 226 | ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size); 227 | assert(engine != nullptr); 228 | IExecutionContext* context = engine->createExecutionContext(); 229 | assert(context != nullptr); 230 | delete[] trtModelStream; 231 | 232 | int fcount = 0; 233 | for (int f = 0; f < (int)file_names.size(); f++) { 234 | fcount++; 235 | if (fcount < BATCH_SIZE && f + 1 != (int)file_names.size()) continue; 236 | for (int b = 0; b < fcount; b++) { 237 | cv::Mat img = cv::imread(std::string(argv[2]) + "/" + file_names[f - fcount + 1 + b]); 238 | if (img.empty()) continue; 239 | cv::Mat pr_img = preprocess_img(img); 240 | int i = 0; 241 | for (int row = 0; row < INPUT_H; ++row) { 242 | uchar* uc_pixel = pr_img.data + row * pr_img.step; 243 | for (int col = 0; col < INPUT_W; ++col) { 244 | data[b * 3 * INPUT_H * INPUT_W + i] = uc_pixel[2] / 255; 245 | data[b * 3 * INPUT_H * INPUT_W + i + INPUT_H * INPUT_W] = uc_pixel[1] / 255.0; 246 | data[b * 3 * INPUT_H * INPUT_W + i + 2 * INPUT_H * INPUT_W] = uc_pixel[0] / 255.0; 247 | uc_pixel += 3; 248 | ++i; 249 | } 250 | } 251 | } 252 | 253 | // Run inference 254 | auto start = std::chrono::system_clock::now(); 255 | doInference(*context, data, prob, BATCH_SIZE); 256 | auto end = std::chrono::system_clock::now(); 257 | std::cout << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; 258 | std::vector> batch_res(fcount); 259 | for (int b = 0; b < fcount; b++) { 260 | auto& res = batch_res[b]; 261 | nms(res, &prob[b * OUTPUT_SIZE], CONF_THRESH, NMS_THRESH); 262 | } 263 | for (int b = 0; b < fcount; b++) { 264 | auto& res = batch_res[b]; 265 | //std::cout << res.size() << std::endl; 266 | cv::Mat img = cv::imread(std::string(argv[2]) + "/" + file_names[f - fcount + 1 + b]); 267 | for (size_t j = 0; j < res.size(); j++) { 268 | cv::Rect r = get_rect(img, res[j].bbox); 269 | cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2); 270 | cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2); 271 | } 272 | cv::imwrite("_" + file_names[f - fcount + 1 + b], img); 273 | } 274 | fcount = 0; 275 | } 276 | 277 | // Destroy the engine 278 | context->destroy(); 279 | engine->destroy(); 280 | runtime->destroy(); 281 | 282 | // Print histogram of the output distribution 283 | //std::cout << "\nOutput:\n\n"; 284 | //for (unsigned int i = 0; i < OUTPUT_SIZE; i++) 285 | //{ 286 | // std::cout << prob[i] << ", "; 287 | // if (i % 10 == 0) std::cout << std::endl; 288 | //} 289 | //std::cout << std::endl; 290 | 291 | return 0; 292 | } 293 | 
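// Typical invocation of this sample:
//   ./yolov5x -s              build the network with the API above, load ../yolov5x.wts and serialize yolov5x.engine
//   ./yolov5x -d <image_dir>  deserialize yolov5x.engine, run detection on every image in <image_dir>
//                             and write the annotated copies as _<name>.jpg into the working directory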
--------------------------------------------------------------------------------
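
The `.wts` files produced by `gen_wts.py` and consumed by the `loadWeights()` helper above are plain text: the first line holds the number of blobs, and every following line holds a tensor name, its element count, and the IEEE-754 bits of each float as hex tokens. The standalone sketch below is not part of the repository (file-free, names are illustrative); it round-trips one value through that encoding to show why Python's `struct.pack('>f', v).hex()` output and the C++ `input >> std::hex` parse agree.

```
#include <cassert>
#include <cstdint>
#include <cstring>
#include <iomanip>
#include <iostream>
#include <sstream>

int main() {
    float original = 0.1f;

    // Encode: print the raw 32-bit pattern as 8 hex digits, which matches
    // what struct.pack('>f', v).hex() produces in gen_wts.py.
    uint32_t bits;
    std::memcpy(&bits, &original, sizeof(bits));
    std::ostringstream out;
    out << std::hex << std::setw(8) << std::setfill('0') << bits;
    std::cout << "encoded: " << out.str() << std::endl;   // e.g. "3dcccccd"

    // Decode: parse the hex text back into a 32-bit word and reinterpret it as
    // float, like the `input >> std::hex >> val[x]` loop in loadWeights().
    std::istringstream in(out.str());
    uint32_t parsed = 0;
    in >> std::hex >> parsed;
    float restored;
    std::memcpy(&restored, &parsed, sizeof(restored));

    // The hex text stores exact bit patterns, so the round trip is lossless.
    assert(parsed == bits);
    std::cout << "restored: " << restored << std::endl;   // prints 0.1
    return 0;
}
```

Because the hex tokens carry exact bit patterns rather than a decimal rendering, the weights loaded on the C++ side match the PyTorch checkpoint bit for bit.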