├── CMakeLists.txt
├── README.md
├── common.hpp
├── gen_wts.py
├── logging.h
├── test.png
├── utils.h
├── yololayer.cu
├── yololayer.h
├── yolov5.cpp
├── yolov5_trt.py
├── yolov5s.engine
└── yolov5s.wts

/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | cmake_minimum_required(VERSION 2.6)
2 | 
3 | project(yolov5)
4 | 
5 | add_definitions(-std=c++11)
6 | 
7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
8 | set(CMAKE_CXX_STANDARD 11)
9 | set(CMAKE_BUILD_TYPE Debug)
10 | 
11 | find_package(CUDA REQUIRED)
12 | 
13 | set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-std=c++11;-g;-G;-gencode;arch=compute_53;code=sm_53)
14 | 
15 | include_directories(${PROJECT_SOURCE_DIR}/include)
16 | # include and link dirs of cuda and tensorrt, you need to adapt them if yours are different
17 | # cuda
18 | include_directories(/usr/local/cuda/include)
19 | link_directories(/usr/local/cuda/lib64)
20 | # tensorrt
21 | include_directories(/usr/include/aarch64-linux-gnu/)
22 | link_directories(/usr/lib/aarch64-linux-gnu/)
23 | 
24 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")
25 | 
26 | cuda_add_library(myplugins SHARED ${PROJECT_SOURCE_DIR}/yololayer.cu)
27 | target_link_libraries(myplugins nvinfer cudart)
28 | 
29 | find_package(OpenCV REQUIRED)
30 | include_directories(${OpenCV_INCLUDE_DIRS})
31 | 
32 | add_executable(yolov5 ${PROJECT_SOURCE_DIR}/yolov5.cpp)
33 | target_link_libraries(yolov5 nvinfer)
34 | target_link_libraries(yolov5 cudart)
35 | target_link_libraries(yolov5 myplugins)
36 | target_link_libraries(yolov5 ${OpenCV_LIBS})
37 | 
38 | add_definitions(-O2 -pthread)
39 | 

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # OpenJetson
2 | 
3 | http://openjetson.com/
4 | 
5 | # yolov5
6 | 
7 | The PyTorch implementation is [ultralytics/yolov5](https://github.com/ultralytics/yolov5).
8 | 
9 | Currently, we support yolov5 v1.0 (yolov5s only), v2.0, v3.0 and v3.1.
10 | 
11 | - For yolov5 v3.1, please visit [yolov5 release v3.1](https://github.com/ultralytics/yolov5/releases/tag/v3.1), and use the latest commit of this repo.
12 | - For yolov5 v3.0, please visit [yolov5 release v3.0](https://github.com/ultralytics/yolov5/releases/tag/v3.0), and use the latest commit of this repo.
13 | - For yolov5 v2.0, please visit [yolov5 release v2.0](https://github.com/ultralytics/yolov5/releases/tag/v2.0), and checkout commit ['5cfa444'](https://github.com/wang-xinyu/tensorrtx/commit/5cfa4445170eabaa54acd5ad7f469ef65a8763f1) of this repo.
14 | - For yolov5 v1.0, please visit [yolov5 release v1.0](https://github.com/ultralytics/yolov5/releases/tag/v1.0), and checkout commit ['f09aa3b'](https://github.com/wang-xinyu/tensorrtx/commit/f09aa3bbebf4d4d37b6d3b32a1d39e1f2678a07b) of this repo.
15 | 
16 | ## Config
17 | 
18 | - Choose the model s/m/l/x via the `NET` macro in yolov5.cpp
19 | - Input shape is defined in yololayer.h
20 | - Number of classes is defined in yololayer.h, **DO NOT FORGET TO ADAPT THIS if you are using your own model**
21 | - FP16/FP32 can be selected via the `USE_FP16` macro in yolov5.cpp
22 | - GPU id can be selected via the `DEVICE` macro in yolov5.cpp
23 | - NMS threshold in yolov5.cpp
24 | - BBox confidence threshold in yolov5.cpp
25 | - Batch size in yolov5.cpp
26 | 
27 | ## How to Run, yolov5s as example
28 | 
29 | ```
30 | 1. generate yolov5s.wts from PyTorch with yolov5s.pt, or download the .wts from the model zoo
31 | 
32 | git clone https://github.com/wang-xinyu/tensorrtx.git
33 | git clone https://github.com/ultralytics/yolov5.git
34 | // download its weights 'yolov5s.pt'
35 | // copy tensorrtx/yolov5/gen_wts.py into ultralytics/yolov5
36 | // ensure the input and output file names in gen_wts.py are yolov5s.pt and yolov5s.wts
37 | // go to ultralytics/yolov5
38 | python gen_wts.py
39 | // a file 'yolov5s.wts' will be generated.
40 | 
41 | 2. build tensorrtx/yolov5 and run
42 | 
43 | // put yolov5s.wts into tensorrtx/yolov5
44 | // go to tensorrtx/yolov5
45 | // ensure the macro NET in yolov5.cpp is s
46 | mkdir build
47 | cd build
48 | cmake ..
49 | make
50 | sudo ./yolov5 -s // serialize model to plan file, i.e. 'yolov5s.engine'
51 | sudo ./yolov5 -v // deserialize plan file and run inference with camera or video
52 | 
53 | ```
54 | ![demo](https://raw.githubusercontent.com/OpenJetson/tensorrt-yolov5/main/test.png)
55 | 

--------------------------------------------------------------------------------
/common.hpp:
--------------------------------------------------------------------------------
1 | #ifndef YOLOV5_COMMON_H_
2 | #define YOLOV5_COMMON_H_
3 | 
4 | #include <fstream>
5 | #include <map>
6 | #include <sstream>
7 | #include <vector>
8 | #include <opencv2/opencv.hpp>
9 | #include <dirent.h>
10 | #include "NvInfer.h"
11 | #include "yololayer.h"
12 | 
13 | #define CHECK(status) \
14 |     do\
15 |     {\
16 |         auto ret = (status);\
17 |         if (ret != 0)\
18 |         {\
19 |             std::cerr << "Cuda failure: " << ret << std::endl;\
20 |             abort();\
21 |         }\
22 |     } while (0)
23 | 
24 | using namespace nvinfer1;
25 | 
26 | cv::Mat preprocess_img(cv::Mat& img) {
27 |     int w, h, x, y;
28 |     float r_w = Yolo::INPUT_W / (img.cols*1.0);
29 |     float r_h = Yolo::INPUT_H / (img.rows*1.0);
30 |     if (r_h > r_w) {
31 |         w = Yolo::INPUT_W;
32 |         h = r_w * img.rows;
33 |         x = 0;
34 |         y = (Yolo::INPUT_H - h) / 2;
35 |     } else {
36 |         w = r_h * img.cols;
37 |         h = Yolo::INPUT_H;
38 |         x = (Yolo::INPUT_W - w) / 2;
39 |         y = 0;
40 |     }
41 |     cv::Mat re(h, w, CV_8UC3);
42 |     cv::resize(img, re, re.size(), 0, 0, cv::INTER_LINEAR);
43 |     cv::Mat out(Yolo::INPUT_H, Yolo::INPUT_W, CV_8UC3, cv::Scalar(128, 128, 128));
44 |     re.copyTo(out(cv::Rect(x, y, re.cols, re.rows)));
45 |     return out;
46 | }
47 | 
48 | cv::Rect get_rect(cv::Mat& img, float bbox[4]) {
49 |     int l, r, t, b;
50 |     float r_w = Yolo::INPUT_W / (img.cols * 1.0);
51 |     float r_h = Yolo::INPUT_H / (img.rows * 1.0);
52 |     if (r_h > r_w) {
53 |         l = bbox[0] - bbox[2] / 2.f;
54 |         r = bbox[0] + bbox[2] / 2.f;
55 |         t = bbox[1] - bbox[3] / 2.f - (Yolo::INPUT_H - r_w * img.rows) / 2;
56 |         b = bbox[1] + bbox[3] / 2.f - (Yolo::INPUT_H - r_w * img.rows) / 2;
57 |         l = l / r_w;
58 |         r = r / r_w;
59 |         t = t / r_w;
60 |         b = b / r_w;
61 |     } else {
62 |         l = bbox[0] - bbox[2] / 2.f - (Yolo::INPUT_W - r_h * img.cols) / 2;
63 |         r = bbox[0] + bbox[2] / 2.f - (Yolo::INPUT_W - r_h * img.cols) / 2;
64 |         t = bbox[1] - bbox[3] / 2.f;
65 |         b = bbox[1] + bbox[3] / 2.f;
66 |         l = l / r_h;
67 |         r = r / r_h;
68 |         t = t / r_h;
69 |         b = b / r_h;
70 |     }
71 |     return cv::Rect(l, t, r - l, b - t);
72 | }
73 | 
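// Note: get_rect above is the inverse of the letterboxing done in preprocess_img: it shifts the
// network-space center/width/height box by the padding offset and divides by the resize ratio
// (r_w or r_h, whichever side was scaled), yielding pixel coordinates in the original image.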
74 | float iou(float lbox[4], float rbox[4]) {
75 |     float interBox[] = {
76 |         (std::max)(lbox[0] - lbox[2] / 2.f , rbox[0] - rbox[2] / 2.f), //left
77 |         (std::min)(lbox[0] + lbox[2] / 2.f , rbox[0] + rbox[2] / 2.f), //right
78 |         (std::max)(lbox[1] - lbox[3] / 2.f , rbox[1] - rbox[3] / 2.f), //top
79 |         (std::min)(lbox[1] + lbox[3] / 2.f , rbox[1] + rbox[3] / 2.f), //bottom
80 |     };
81 | 
82 |     if (interBox[2] > interBox[3] || interBox[0] > interBox[1])
83 |         return 0.0f;
84 | 
85 |     float interBoxS = (interBox[1] - interBox[0])*(interBox[3] - interBox[2]);
86 |     return interBoxS / (lbox[2] * lbox[3] + rbox[2] * rbox[3] - interBoxS);
87 | }
88 | 
89 | bool cmp(const Yolo::Detection& a, const Yolo::Detection& b) {
90 |     return a.conf > b.conf;
91 | }
92 | 
93 | void nms(std::vector<Yolo::Detection>& res, float *output, float conf_thresh, float nms_thresh = 0.5) {
94 |     int det_size = sizeof(Yolo::Detection) / sizeof(float);
95 |     std::map<float, std::vector<Yolo::Detection>> m;
96 |     for (int i = 0; i < output[0] && i < Yolo::MAX_OUTPUT_BBOX_COUNT; i++) {
97 |         if (output[1 + det_size * i + 4] <= conf_thresh) continue;
98 |         Yolo::Detection det;
99 |         memcpy(&det, &output[1 + det_size * i], det_size * sizeof(float));
100 |         if (m.count(det.class_id) == 0) m.emplace(det.class_id, std::vector<Yolo::Detection>());
101 |         m[det.class_id].push_back(det);
102 |     }
103 |     for (auto it = m.begin(); it != m.end(); it++) {
104 |         //std::cout << it->second[0].class_id << " --- " << std::endl;
105 |         auto& dets = it->second;
106 |         std::sort(dets.begin(), dets.end(), cmp);
107 |         for (size_t m = 0; m < dets.size(); ++m) {
108 |             auto& item = dets[m];
109 |             res.push_back(item);
110 |             for (size_t n = m + 1; n < dets.size(); ++n) {
111 |                 if (iou(item.bbox, dets[n].bbox) > nms_thresh) {
112 |                     dets.erase(dets.begin() + n);
113 |                     --n;
114 |                 }
115 |             }
116 |         }
117 |     }
118 | }
119 | 
120 | // TensorRT weight files have a simple space delimited format:
121 | // [type] [size] <data x size in hex>
122 | std::map<std::string, Weights> loadWeights(const std::string file) {
123 |     std::cout << "Loading weights: " << file << std::endl;
124 |     std::map<std::string, Weights> weightMap;
125 | 
126 |     // Open weights file
127 |     std::ifstream input(file);
128 |     assert(input.is_open() && "Unable to load weight file. please check if the .wts file path is right!!!!!!");
129 | 
130 |     // Read number of weight blobs
131 |     int32_t count;
132 |     input >> count;
133 |     assert(count > 0 && "Invalid weight map file.");
134 | 
135 |     while (count--)
136 |     {
137 |         Weights wt{ DataType::kFLOAT, nullptr, 0 };
138 |         uint32_t size;
139 | 
140 |         // Read name and type of blob
141 |         std::string name;
142 |         input >> name >> std::dec >> size;
143 |         wt.type = DataType::kFLOAT;
144 | 
145 |         // Load blob
146 |         uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));
147 |         for (uint32_t x = 0, y = size; x < y; ++x)
148 |         {
149 |             input >> std::hex >> val[x];
150 |         }
151 |         wt.values = val;
152 | 
153 |         wt.count = size;
154 |         weightMap[name] = wt;
155 |     }
156 | 
157 |     return weightMap;
158 | }
159 | 
160 | IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, float eps) {
161 |     float *gamma = (float*)weightMap[lname + ".weight"].values;
162 |     float *beta = (float*)weightMap[lname + ".bias"].values;
163 |     float *mean = (float*)weightMap[lname + ".running_mean"].values;
164 |     float *var = (float*)weightMap[lname + ".running_var"].values;
165 |     int len = weightMap[lname + ".running_var"].count;
166 | 
167 |     float *scval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
168 |     for (int i = 0; i < len; i++) {
169 |         scval[i] = gamma[i] / sqrt(var[i] + eps);
170 |     }
171 |     Weights scale{ DataType::kFLOAT, scval, len };
172 | 
173 |     float *shval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
174 |     for (int i = 0; i < len; i++) {
175 |         shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps);
176 |     }
177 |     Weights shift{ DataType::kFLOAT, shval, len };
178 | 
179 |     float *pval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
180 |     for (int i = 0; i < len; i++) {
181 |         pval[i] = 1.0;
182 |     }
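    // The batchnorm is folded into one IScaleLayer computing (x * scale + shift) ^ power per
    // channel, with scale = gamma / sqrt(var + eps), shift = beta - mean * gamma / sqrt(var + eps)
    // and power = 1, which is exactly gamma * (x - mean) / sqrt(var + eps) + beta.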
183 |     Weights power{ DataType::kFLOAT, pval, len };
184 | 
185 |     weightMap[lname + ".scale"] = scale;
186 |     weightMap[lname + ".shift"] = shift;
187 |     weightMap[lname + ".power"] = power;
188 |     IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
189 |     assert(scale_1);
190 |     return scale_1;
191 | }
192 | 
193 | ILayer* convBlock(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int outch, int ksize, int s, int g, std::string lname) {
194 |     Weights emptywts{ DataType::kFLOAT, nullptr, 0 };
195 |     int p = ksize / 2;
196 |     IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, DimsHW{ ksize, ksize }, weightMap[lname + ".conv.weight"], emptywts);
197 |     assert(conv1);
198 |     conv1->setStrideNd(DimsHW{ s, s });
199 |     conv1->setPaddingNd(DimsHW{ p, p });
200 |     conv1->setNbGroups(g);
201 |     IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + ".bn", 1e-3);
202 | 
203 |     // hard_swish = x * hard_sigmoid
204 |     auto hsig = network->addActivation(*bn1->getOutput(0), ActivationType::kHARD_SIGMOID);
205 |     assert(hsig);
206 |     hsig->setAlpha(1.0 / 6.0);
207 |     hsig->setBeta(0.5);
208 |     auto ew = network->addElementWise(*bn1->getOutput(0), *hsig->getOutput(0), ElementWiseOperation::kPROD);
209 |     assert(ew);
210 |     return ew;
211 | }
212 | 
213 | ILayer* focus(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int inch, int outch, int ksize, std::string lname) {
214 |     ISliceLayer *s1 = network->addSlice(input, Dims3{ 0, 0, 0 }, Dims3{ inch, Yolo::INPUT_H / 2, Yolo::INPUT_W / 2 }, Dims3{ 1, 2, 2 });
215 |     ISliceLayer *s2 = network->addSlice(input, Dims3{ 0, 1, 0 }, Dims3{ inch, Yolo::INPUT_H / 2, Yolo::INPUT_W / 2 }, Dims3{ 1, 2, 2 });
216 |     ISliceLayer *s3 = network->addSlice(input, Dims3{ 0, 0, 1 }, Dims3{ inch, Yolo::INPUT_H / 2, Yolo::INPUT_W / 2 }, Dims3{ 1, 2, 2 });
217 |     ISliceLayer *s4 = network->addSlice(input, Dims3{ 0, 1, 1 }, Dims3{ inch, Yolo::INPUT_H / 2, Yolo::INPUT_W / 2 }, Dims3{ 1, 2, 2 });
218 |     ITensor* inputTensors[] = { s1->getOutput(0), s2->getOutput(0), s3->getOutput(0), s4->getOutput(0) };
219 |     auto cat = network->addConcatenation(inputTensors, 4);
220 |     auto conv = convBlock(network, weightMap, *cat->getOutput(0), outch, ksize, 1, 1, lname + ".conv");
221 |     return conv;
222 | }
223 | 
224 | ILayer* bottleneck(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int c1, int c2, bool shortcut, int g, float e, std::string lname) {
225 |     auto cv1 = convBlock(network, weightMap, input, (int)((float)c2 * e), 1, 1, 1, lname + ".cv1");
226 |     auto cv2 = convBlock(network, weightMap, *cv1->getOutput(0), c2, 3, 1, g, lname + ".cv2");
227 |     if (shortcut && c1 == c2) {
228 |         auto ew = network->addElementWise(input, *cv2->getOutput(0), ElementWiseOperation::kSUM);
229 |         return ew;
230 |     }
231 |     return cv2;
232 | }
233 | 
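// bottleneckCSP below builds two parallel paths: cv1 followed by n bottleneck blocks and a 1x1
// conv (cv3), and a bare 1x1 conv (cv2) straight from the input; the two are concatenated,
// batch-normalized, passed through LeakyReLU(0.1) and fused by the final 1x1 convBlock cv4.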
234 | ILayer* bottleneckCSP(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int c1, int c2, int n, bool shortcut, int g, float e, std::string lname) {
235 |     Weights emptywts{ DataType::kFLOAT, nullptr, 0 };
236 |     int c_ = (int)((float)c2 * e);
237 |     auto cv1 = convBlock(network, weightMap, input, c_, 1, 1, 1, lname + ".cv1");
238 |     auto cv2 = network->addConvolutionNd(input, c_, DimsHW{ 1, 1 }, weightMap[lname + ".cv2.weight"], emptywts);
239 |     ITensor *y1 = cv1->getOutput(0);
240 |     for (int i = 0; i < n; i++) {
241 |         auto b = bottleneck(network, weightMap, *y1, c_, c_, shortcut, g, 1.0, lname + ".m." + std::to_string(i));
242 |         y1 = b->getOutput(0);
243 |     }
244 |     auto cv3 = network->addConvolutionNd(*y1, c_, DimsHW{ 1, 1 }, weightMap[lname + ".cv3.weight"], emptywts);
245 | 
246 |     ITensor* inputTensors[] = { cv3->getOutput(0), cv2->getOutput(0) };
247 |     auto cat = network->addConcatenation(inputTensors, 2);
248 | 
249 |     IScaleLayer* bn = addBatchNorm2d(network, weightMap, *cat->getOutput(0), lname + ".bn", 1e-4);
250 |     auto lr = network->addActivation(*bn->getOutput(0), ActivationType::kLEAKY_RELU);
251 |     lr->setAlpha(0.1);
252 | 
253 |     auto cv4 = convBlock(network, weightMap, *lr->getOutput(0), c2, 1, 1, 1, lname + ".cv4");
254 |     return cv4;
255 | }
256 | 
257 | ILayer* SPP(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int c1, int c2, int k1, int k2, int k3, std::string lname) {
258 |     int c_ = c1 / 2;
259 |     auto cv1 = convBlock(network, weightMap, input, c_, 1, 1, 1, lname + ".cv1");
260 | 
261 |     auto pool1 = network->addPoolingNd(*cv1->getOutput(0), PoolingType::kMAX, DimsHW{ k1, k1 });
262 |     pool1->setPaddingNd(DimsHW{ k1 / 2, k1 / 2 });
263 |     pool1->setStrideNd(DimsHW{ 1, 1 });
264 |     auto pool2 = network->addPoolingNd(*cv1->getOutput(0), PoolingType::kMAX, DimsHW{ k2, k2 });
265 |     pool2->setPaddingNd(DimsHW{ k2 / 2, k2 / 2 });
266 |     pool2->setStrideNd(DimsHW{ 1, 1 });
267 |     auto pool3 = network->addPoolingNd(*cv1->getOutput(0), PoolingType::kMAX, DimsHW{ k3, k3 });
268 |     pool3->setPaddingNd(DimsHW{ k3 / 2, k3 / 2 });
269 |     pool3->setStrideNd(DimsHW{ 1, 1 });
270 | 
271 |     ITensor* inputTensors[] = { cv1->getOutput(0), pool1->getOutput(0), pool2->getOutput(0), pool3->getOutput(0) };
272 |     auto cat = network->addConcatenation(inputTensors, 4);
273 | 
274 |     auto cv2 = convBlock(network, weightMap, *cat->getOutput(0), c2, 1, 1, 1, lname + ".cv2");
275 |     return cv2;
276 | }
277 | 
278 | int read_files_in_dir(const char *p_dir_name, std::vector<std::string> &file_names) {
279 |     DIR *p_dir = opendir(p_dir_name);
280 |     if (p_dir == nullptr) {
281 |         return -1;
282 |     }
283 | 
284 |     struct dirent* p_file = nullptr;
285 |     while ((p_file = readdir(p_dir)) != nullptr) {
286 |         if (strcmp(p_file->d_name, ".") != 0 &&
287 |             strcmp(p_file->d_name, "..") != 0) {
288 |             //std::string cur_file_name(p_dir_name);
289 |             //cur_file_name += "/";
290 |             //cur_file_name += p_file->d_name;
291 |             std::string cur_file_name(p_file->d_name);
292 |             file_names.push_back(cur_file_name);
293 |         }
294 |     }
295 | 
296 |     closedir(p_dir);
297 |     return 0;
298 | }
299 | 
300 | std::vector<float> getAnchors(std::map<std::string, Weights>& weightMap)
301 | {
302 |     std::vector<float> anchors_yolo;
303 |     Weights Yolo_Anchors = weightMap["model.24.anchor_grid"];
304 |     assert(Yolo_Anchors.count == 18);
305 |     int each_yololayer_anchorsnum = Yolo_Anchors.count / 3;
306 |     const float* tempAnchors = (const float*)(Yolo_Anchors.values);
307 |     for (int i = 0; i < Yolo_Anchors.count; i++)
308 |     {
309 |         if (i < each_yololayer_anchorsnum)
310 |         {
311 |             anchors_yolo.push_back(const_cast<float*>(tempAnchors)[i]);
312 |         }
313 |         if ((i >= each_yololayer_anchorsnum) && (i < (2 * each_yololayer_anchorsnum)))
314 |         {
315 |             anchors_yolo.push_back(const_cast<float*>(tempAnchors)[i]);
316 |         }
317 |         if (i >= (2 * each_yololayer_anchorsnum))
318 |         {
319 |             anchors_yolo.push_back(const_cast<float*>(tempAnchors)[i]);
320 |         }
321 |     }
322 |     return anchors_yolo;
323 | }
324 | 
325 | IPluginV2Layer* addYoLoLayer(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, IConvolutionLayer* det0, IConvolutionLayer* det1, IConvolutionLayer* det2)
326 | {
327 |     auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1");
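    // The plugin is configured via 4 PluginFields, matching what YoloPluginCreator::createPlugin
    // in yololayer.cu parses: "netdata" = {CLASS_NUM, INPUT_W, INPUT_H, MAX_OUTPUT_BBOX_COUNT},
    // and "yolodata1".."yolodata3" = {grid_w, grid_h, 6 anchor values} for strides 8/16/32.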
328 |     std::vector<float> anchors_yolo = getAnchors(weightMap);
329 |     PluginField pluginMultidata[4];
330 |     int NetData[4];
331 |     NetData[0] = Yolo::CLASS_NUM;
332 |     NetData[1] = Yolo::INPUT_W;
333 |     NetData[2] = Yolo::INPUT_H;
334 |     NetData[3] = Yolo::MAX_OUTPUT_BBOX_COUNT;
335 |     pluginMultidata[0].data = NetData;
336 |     pluginMultidata[0].length = 4;
337 |     pluginMultidata[0].name = "netdata";
338 |     pluginMultidata[0].type = PluginFieldType::kFLOAT32;
339 |     int scale[3] = { 8, 16, 32 };
340 |     int plugindata[3][8];
341 |     std::string names[3];
342 |     for (int k = 1; k < 4; k++)
343 |     {
344 |         plugindata[k - 1][0] = Yolo::INPUT_W / scale[k - 1];
345 |         plugindata[k - 1][1] = Yolo::INPUT_H / scale[k - 1];
346 |         for (int i = 2; i < 8; i++)
347 |         {
348 |             plugindata[k - 1][i] = int(anchors_yolo[(k - 1) * 6 + i - 2]);
349 |         }
350 |         pluginMultidata[k].data = plugindata[k - 1];
351 |         pluginMultidata[k].length = 8;
352 |         names[k - 1] = "yolodata" + std::to_string(k);
353 |         pluginMultidata[k].name = names[k - 1].c_str();
354 |         pluginMultidata[k].type = PluginFieldType::kFLOAT32;
355 |     }
356 |     PluginFieldCollection pluginData;
357 |     pluginData.nbFields = 4;
358 |     pluginData.fields = pluginMultidata;
359 |     IPluginV2 *pluginObj = creator->createPlugin("yololayer", &pluginData);
360 |     ITensor* inputTensors_yolo[] = { det2->getOutput(0), det1->getOutput(0), det0->getOutput(0) };
361 |     auto yolo = network->addPluginV2(inputTensors_yolo, 3, *pluginObj);
362 |     return yolo;
363 | }
364 | #endif
365 | 

--------------------------------------------------------------------------------
/gen_wts.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import struct
3 | from utils.torch_utils import select_device
4 | 
5 | # Initialize
6 | device = select_device('cpu')
7 | # Load model
8 | model = torch.load('weights/yolov5s.pt', map_location=device)['model'].float()  # load to FP32
9 | model.to(device).eval()
10 | 
11 | f = open('yolov5s.wts', 'w')
12 | f.write('{}\n'.format(len(model.state_dict().keys())))
13 | for k, v in model.state_dict().items():
14 |     vr = v.reshape(-1).cpu().numpy()
15 |     f.write('{} {}'.format(k, len(vr)))
16 |     for vv in vr:
17 |         f.write(' ')
18 |         f.write(struct.pack('>f', float(vv)).hex())
19 |     f.write('\n')

--------------------------------------------------------------------------------
/logging.h:
--------------------------------------------------------------------------------
1 | /*
2 |  * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License");
5 |  * you may not use this file except in compliance with the License.
6 |  * You may obtain a copy of the License at
7 |  *
8 |  *     http://www.apache.org/licenses/LICENSE-2.0
9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | #ifndef TENSORRT_LOGGING_H
18 | #define TENSORRT_LOGGING_H
19 | 
20 | #include "NvInferRuntimeCommon.h"
21 | #include <cassert>
22 | #include <ctime>
23 | #include <iomanip>
24 | #include <iostream>
25 | #include <ostream>
26 | #include <sstream>
27 | #include <string>
28 | 
29 | using Severity = nvinfer1::ILogger::Severity;
30 | 
31 | class LogStreamConsumerBuffer : public std::stringbuf
32 | {
33 | public:
34 |     LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
35 |         : mOutput(stream)
36 |         , mPrefix(prefix)
37 |         , mShouldLog(shouldLog)
38 |     {
39 |     }
40 | 
41 |     LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
42 |         : mOutput(other.mOutput)
43 |     {
44 |     }
45 | 
46 |     ~LogStreamConsumerBuffer()
47 |     {
48 |         // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
49 |         // std::streambuf::pptr() gives a pointer to the current position of the output sequence
50 |         // if the pointer to the beginning is not equal to the pointer to the current position,
51 |         // call putOutput() to log the output to the stream
52 |         if (pbase() != pptr())
53 |         {
54 |             putOutput();
55 |         }
56 |     }
57 | 
58 |     // synchronizes the stream buffer and returns 0 on success
59 |     // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
60 |     // resetting the buffer and flushing the stream
61 |     virtual int sync()
62 |     {
63 |         putOutput();
64 |         return 0;
65 |     }
66 | 
67 |     void putOutput()
68 |     {
69 |         if (mShouldLog)
70 |         {
71 |             // prepend timestamp
72 |             std::time_t timestamp = std::time(nullptr);
73 |             tm* tm_local = std::localtime(&timestamp);
74 |             std::cout << "[";
75 |             std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/";
76 |             std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/";
77 |             std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-";
78 |             std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":";
79 |             std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":";
80 |             std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] ";
81 |             // std::stringbuf::str() gets the string contents of the buffer
82 |             // insert the buffer contents pre-appended by the appropriate prefix into the stream
83 |             mOutput << mPrefix << str();
84 |             // set the buffer to empty
85 |             str("");
86 |             // flush the stream
87 |             mOutput.flush();
88 |         }
89 |     }
90 | 
91 |     void setShouldLog(bool shouldLog)
92 |     {
93 |         mShouldLog = shouldLog;
94 |     }
95 | 
96 | private:
97 |     std::ostream& mOutput;
98 |     std::string mPrefix;
99 |     bool mShouldLog;
100 | };
101 | 
102 | //!
103 | //! \class LogStreamConsumerBase
104 | //! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer
105 | //!
106 | class LogStreamConsumerBase
107 | {
108 | public:
109 |     LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog)
110 |         : mBuffer(stream, prefix, shouldLog)
111 |     {
112 |     }
113 | 
114 | protected:
115 |     LogStreamConsumerBuffer mBuffer;
116 | };
117 | 
118 | //!
119 | //! \class LogStreamConsumer
120 | //! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages.
121 | //!  Order of base classes is LogStreamConsumerBase and then std::ostream.
122 | //!  This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field
123 | //!  in LogStreamConsumer and then the address of the buffer is passed to std::ostream.
124 | //!
This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream. 125 | //! Please do not change the order of the parent classes. 126 | //! 127 | class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream 128 | { 129 | public: 130 | //! \brief Creates a LogStreamConsumer which logs messages with level severity. 131 | //! Reportable severity determines if the messages are severe enough to be logged. 132 | LogStreamConsumer(Severity reportableSeverity, Severity severity) 133 | : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity) 134 | , std::ostream(&mBuffer) // links the stream buffer with the stream 135 | , mShouldLog(severity <= reportableSeverity) 136 | , mSeverity(severity) 137 | { 138 | } 139 | 140 | LogStreamConsumer(LogStreamConsumer&& other) 141 | : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog) 142 | , std::ostream(&mBuffer) // links the stream buffer with the stream 143 | , mShouldLog(other.mShouldLog) 144 | , mSeverity(other.mSeverity) 145 | { 146 | } 147 | 148 | void setReportableSeverity(Severity reportableSeverity) 149 | { 150 | mShouldLog = mSeverity <= reportableSeverity; 151 | mBuffer.setShouldLog(mShouldLog); 152 | } 153 | 154 | private: 155 | static std::ostream& severityOstream(Severity severity) 156 | { 157 | return severity >= Severity::kINFO ? std::cout : std::cerr; 158 | } 159 | 160 | static std::string severityPrefix(Severity severity) 161 | { 162 | switch (severity) 163 | { 164 | case Severity::kINTERNAL_ERROR: return "[F] "; 165 | case Severity::kERROR: return "[E] "; 166 | case Severity::kWARNING: return "[W] "; 167 | case Severity::kINFO: return "[I] "; 168 | case Severity::kVERBOSE: return "[V] "; 169 | default: assert(0); return ""; 170 | } 171 | } 172 | 173 | bool mShouldLog; 174 | Severity mSeverity; 175 | }; 176 | 177 | //! \class Logger 178 | //! 179 | //! \brief Class which manages logging of TensorRT tools and samples 180 | //! 181 | //! \details This class provides a common interface for TensorRT tools and samples to log information to the console, 182 | //! and supports logging two types of messages: 183 | //! 184 | //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) 185 | //! - Test pass/fail messages 186 | //! 187 | //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is 188 | //! that the logic for controlling the verbosity and formatting of sample output is centralized in one location. 189 | //! 190 | //! In the future, this class could be extended to support dumping test results to a file in some standard format 191 | //! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run). 192 | //! 193 | //! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger 194 | //! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT 195 | //! library and messages coming from the sample. 196 | //! 197 | //! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the 198 | //! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger 199 | //! object. 
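//
// Typical usage (see yolov5.cpp in this repo): a single static instance is passed to the
// TensorRT entry points, e.g.
//     static Logger gLogger;
//     IBuilder* builder = createInferBuilder(gLogger);
//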
200 | 201 | class Logger : public nvinfer1::ILogger 202 | { 203 | public: 204 | Logger(Severity severity = Severity::kWARNING) 205 | : mReportableSeverity(severity) 206 | { 207 | } 208 | 209 | //! 210 | //! \enum TestResult 211 | //! \brief Represents the state of a given test 212 | //! 213 | enum class TestResult 214 | { 215 | kRUNNING, //!< The test is running 216 | kPASSED, //!< The test passed 217 | kFAILED, //!< The test failed 218 | kWAIVED //!< The test was waived 219 | }; 220 | 221 | //! 222 | //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger 223 | //! \return The nvinfer1::ILogger associated with this Logger 224 | //! 225 | //! TODO Once all samples are updated to use this method to register the logger with TensorRT, 226 | //! we can eliminate the inheritance of Logger from ILogger 227 | //! 228 | nvinfer1::ILogger& getTRTLogger() 229 | { 230 | return *this; 231 | } 232 | 233 | //! 234 | //! \brief Implementation of the nvinfer1::ILogger::log() virtual method 235 | //! 236 | //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the 237 | //! inheritance from nvinfer1::ILogger 238 | //! 239 | void log(Severity severity, const char* msg) override 240 | { 241 | LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl; 242 | } 243 | 244 | //! 245 | //! \brief Method for controlling the verbosity of logging output 246 | //! 247 | //! \param severity The logger will only emit messages that have severity of this level or higher. 248 | //! 249 | void setReportableSeverity(Severity severity) 250 | { 251 | mReportableSeverity = severity; 252 | } 253 | 254 | //! 255 | //! \brief Opaque handle that holds logging information for a particular test 256 | //! 257 | //! This object is an opaque handle to information used by the Logger to print test results. 258 | //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used 259 | //! with Logger::reportTest{Start,End}(). 260 | //! 261 | class TestAtom 262 | { 263 | public: 264 | TestAtom(TestAtom&&) = default; 265 | 266 | private: 267 | friend class Logger; 268 | 269 | TestAtom(bool started, const std::string& name, const std::string& cmdline) 270 | : mStarted(started) 271 | , mName(name) 272 | , mCmdline(cmdline) 273 | { 274 | } 275 | 276 | bool mStarted; 277 | std::string mName; 278 | std::string mCmdline; 279 | }; 280 | 281 | //! 282 | //! \brief Define a test for logging 283 | //! 284 | //! \param[in] name The name of the test. This should be a string starting with 285 | //! "TensorRT" and containing dot-separated strings containing 286 | //! the characters [A-Za-z0-9_]. 287 | //! For example, "TensorRT.sample_googlenet" 288 | //! \param[in] cmdline The command line used to reproduce the test 289 | // 290 | //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). 291 | //! 292 | static TestAtom defineTest(const std::string& name, const std::string& cmdline) 293 | { 294 | return TestAtom(false, name, cmdline); 295 | } 296 | 297 | //! 298 | //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments 299 | //! as input 300 | //! 301 | //! \param[in] name The name of the test 302 | //! \param[in] argc The number of command-line arguments 303 | //! \param[in] argv The array of command-line arguments (given as C strings) 304 | //! 305 | //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). 
306 | static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) 307 | { 308 | auto cmdline = genCmdlineString(argc, argv); 309 | return defineTest(name, cmdline); 310 | } 311 | 312 | //! 313 | //! \brief Report that a test has started. 314 | //! 315 | //! \pre reportTestStart() has not been called yet for the given testAtom 316 | //! 317 | //! \param[in] testAtom The handle to the test that has started 318 | //! 319 | static void reportTestStart(TestAtom& testAtom) 320 | { 321 | reportTestResult(testAtom, TestResult::kRUNNING); 322 | assert(!testAtom.mStarted); 323 | testAtom.mStarted = true; 324 | } 325 | 326 | //! 327 | //! \brief Report that a test has ended. 328 | //! 329 | //! \pre reportTestStart() has been called for the given testAtom 330 | //! 331 | //! \param[in] testAtom The handle to the test that has ended 332 | //! \param[in] result The result of the test. Should be one of TestResult::kPASSED, 333 | //! TestResult::kFAILED, TestResult::kWAIVED 334 | //! 335 | static void reportTestEnd(const TestAtom& testAtom, TestResult result) 336 | { 337 | assert(result != TestResult::kRUNNING); 338 | assert(testAtom.mStarted); 339 | reportTestResult(testAtom, result); 340 | } 341 | 342 | static int reportPass(const TestAtom& testAtom) 343 | { 344 | reportTestEnd(testAtom, TestResult::kPASSED); 345 | return EXIT_SUCCESS; 346 | } 347 | 348 | static int reportFail(const TestAtom& testAtom) 349 | { 350 | reportTestEnd(testAtom, TestResult::kFAILED); 351 | return EXIT_FAILURE; 352 | } 353 | 354 | static int reportWaive(const TestAtom& testAtom) 355 | { 356 | reportTestEnd(testAtom, TestResult::kWAIVED); 357 | return EXIT_SUCCESS; 358 | } 359 | 360 | static int reportTest(const TestAtom& testAtom, bool pass) 361 | { 362 | return pass ? reportPass(testAtom) : reportFail(testAtom); 363 | } 364 | 365 | Severity getReportableSeverity() const 366 | { 367 | return mReportableSeverity; 368 | } 369 | 370 | private: 371 | //! 372 | //! \brief returns an appropriate string for prefixing a log message with the given severity 373 | //! 374 | static const char* severityPrefix(Severity severity) 375 | { 376 | switch (severity) 377 | { 378 | case Severity::kINTERNAL_ERROR: return "[F] "; 379 | case Severity::kERROR: return "[E] "; 380 | case Severity::kWARNING: return "[W] "; 381 | case Severity::kINFO: return "[I] "; 382 | case Severity::kVERBOSE: return "[V] "; 383 | default: assert(0); return ""; 384 | } 385 | } 386 | 387 | //! 388 | //! \brief returns an appropriate string for prefixing a test result message with the given result 389 | //! 390 | static const char* testResultString(TestResult result) 391 | { 392 | switch (result) 393 | { 394 | case TestResult::kRUNNING: return "RUNNING"; 395 | case TestResult::kPASSED: return "PASSED"; 396 | case TestResult::kFAILED: return "FAILED"; 397 | case TestResult::kWAIVED: return "WAIVED"; 398 | default: assert(0); return ""; 399 | } 400 | } 401 | 402 | //! 403 | //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity 404 | //! 405 | static std::ostream& severityOstream(Severity severity) 406 | { 407 | return severity >= Severity::kINFO ? std::cout : std::cerr; 408 | } 409 | 410 | //! 411 | //! \brief method that implements logging test results 412 | //! 
413 | static void reportTestResult(const TestAtom& testAtom, TestResult result) 414 | { 415 | severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " 416 | << testAtom.mCmdline << std::endl; 417 | } 418 | 419 | //! 420 | //! \brief generate a command line string from the given (argc, argv) values 421 | //! 422 | static std::string genCmdlineString(int argc, char const* const* argv) 423 | { 424 | std::stringstream ss; 425 | for (int i = 0; i < argc; i++) 426 | { 427 | if (i > 0) 428 | ss << " "; 429 | ss << argv[i]; 430 | } 431 | return ss.str(); 432 | } 433 | 434 | Severity mReportableSeverity; 435 | }; 436 | 437 | namespace 438 | { 439 | 440 | //! 441 | //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE 442 | //! 443 | //! Example usage: 444 | //! 445 | //! LOG_VERBOSE(logger) << "hello world" << std::endl; 446 | //! 447 | inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) 448 | { 449 | return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE); 450 | } 451 | 452 | //! 453 | //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO 454 | //! 455 | //! Example usage: 456 | //! 457 | //! LOG_INFO(logger) << "hello world" << std::endl; 458 | //! 459 | inline LogStreamConsumer LOG_INFO(const Logger& logger) 460 | { 461 | return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO); 462 | } 463 | 464 | //! 465 | //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING 466 | //! 467 | //! Example usage: 468 | //! 469 | //! LOG_WARN(logger) << "hello world" << std::endl; 470 | //! 471 | inline LogStreamConsumer LOG_WARN(const Logger& logger) 472 | { 473 | return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING); 474 | } 475 | 476 | //! 477 | //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR 478 | //! 479 | //! Example usage: 480 | //! 481 | //! LOG_ERROR(logger) << "hello world" << std::endl; 482 | //! 483 | inline LogStreamConsumer LOG_ERROR(const Logger& logger) 484 | { 485 | return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR); 486 | } 487 | 488 | //! 489 | //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR 490 | // ("fatal" severity) 491 | //! 492 | //! Example usage: 493 | //! 494 | //! LOG_FATAL(logger) << "hello world" << std::endl; 495 | //! 
496 | inline LogStreamConsumer LOG_FATAL(const Logger& logger)
497 | {
498 |     return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR);
499 | }
500 | 
501 | } // anonymous namespace
502 | 
503 | #endif // TENSORRT_LOGGING_H
504 | 

--------------------------------------------------------------------------------
/test.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenJetson/tensorrt-yolov5/7c79e973e40c3e41488021c3efcc2f4f3b283ed8/test.png

--------------------------------------------------------------------------------
/utils.h:
--------------------------------------------------------------------------------
1 | #ifndef __TRT_UTILS_H_
2 | #define __TRT_UTILS_H_
3 | 
4 | #include <algorithm>
5 | #include <cassert>
6 | #include <iostream>
7 | #include <string>
8 | #include <vector>
9 | #include <cuda_runtime_api.h>
10 | #include "NvInfer.h"
11 | 
12 | #ifndef CUDA_CHECK
13 | 
14 | #define CUDA_CHECK(callstr)\
15 |     {\
16 |         cudaError_t error_code = callstr;\
17 |         if (error_code != cudaSuccess) {\
18 |             std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__;\
19 |             assert(0);\
20 |         }\
21 |     }
22 | 
23 | #endif
24 | 
25 | namespace Tn
26 | {
27 |     class Profiler : public nvinfer1::IProfiler
28 |     {
29 |     public:
30 |         void printLayerTimes(int iterationsTimes)
31 |         {
32 |             float totalTime = 0;
33 |             for (size_t i = 0; i < mProfile.size(); i++)
34 |             {
35 |                 printf("%-40.40s %4.3fms\n", mProfile[i].first.c_str(), mProfile[i].second / iterationsTimes);
36 |                 totalTime += mProfile[i].second;
37 |             }
38 |             printf("Time over all layers: %4.3f\n", totalTime / iterationsTimes);
39 |         }
40 |     private:
41 |         typedef std::pair<std::string, float> Record;
42 |         std::vector<Record> mProfile;
43 | 
44 |         virtual void reportLayerTime(const char* layerName, float ms)
45 |         {
46 |             auto record = std::find_if(mProfile.begin(), mProfile.end(), [&](const Record& r){ return r.first == layerName; });
47 |             if (record == mProfile.end())
48 |                 mProfile.push_back(std::make_pair(layerName, ms));
49 |             else
50 |                 record->second += ms;
51 |         }
52 |     };
53 | 
54 |     //Logger for TensorRT info/warning/errors
55 |     class Logger : public nvinfer1::ILogger
56 |     {
57 |     public:
58 | 
59 |         Logger(): Logger(Severity::kWARNING) {}
60 | 
61 |         Logger(Severity severity): reportableSeverity(severity) {}
62 | 
63 |         void log(Severity severity, const char* msg) override
64 |         {
65 |             // suppress messages with severity enum value greater than the reportable
66 |             if (severity > reportableSeverity) return;
67 | 
68 |             switch (severity)
69 |             {
70 |             case Severity::kINTERNAL_ERROR: std::cerr << "INTERNAL_ERROR: "; break;
71 |             case Severity::kERROR: std::cerr << "ERROR: "; break;
72 |             case Severity::kWARNING: std::cerr << "WARNING: "; break;
73 |             case Severity::kINFO: std::cerr << "INFO: "; break;
74 |             default: std::cerr << "UNKNOWN: "; break;
75 |             }
76 |             std::cerr << msg << std::endl;
77 |         }
78 | 
79 |         Severity reportableSeverity{Severity::kWARNING};
80 |     };
81 | 
82 |     template <typename T>
83 |     void write(char*& buffer, const T& val)
84 |     {
85 |         *reinterpret_cast<T*>(buffer) = val;
86 |         buffer += sizeof(T);
87 |     }
88 | 
89 |     template <typename T>
90 |     void read(const char*& buffer, T& val)
91 |     {
92 |         val = *reinterpret_cast<const T*>(buffer);
93 |         buffer += sizeof(T);
94 |     }
95 | }
96 | 
97 | #endif

--------------------------------------------------------------------------------
/yololayer.cu:
--------------------------------------------------------------------------------
1 | #include <string.h>
2 | #include "yololayer.h"
3 | #include "utils.h"
4 | 
5 | using namespace Yolo;
6 | 
7 | namespace nvinfer1
8 | {
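    // YoloLayerPlugin decodes the three raw detection heads into Yolo::Detection records on the
    // GPU. Its serialized layout (see serialize()/getSerializationSize()) is: mClassCount,
    // mThreadCount, mKernelCount, mYoloV5NetWidth, mYoloV5NetHeight, mMaxOutObject, followed by
    // mKernelCount raw YoloKernel structs; the deserializing constructor reads the same layout.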
9 |     YoloLayerPlugin::YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut, const std::vector<Yolo::YoloKernel>& vYoloKernel)
10 |     {
11 |         mClassCount = classCount;
12 |         mYoloV5NetWidth = netWidth;
13 |         mYoloV5NetHeight = netHeight;
14 |         mMaxOutObject = maxOut;
15 |         mYoloKernel = vYoloKernel;
16 |         mKernelCount = vYoloKernel.size();
17 | 
18 |         CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*)));
19 |         size_t AnchorLen = sizeof(float) * CHECK_COUNT * 2;
20 |         for (int ii = 0; ii < mKernelCount; ii++)
21 |         {
22 |             CUDA_CHECK(cudaMalloc(&mAnchor[ii], AnchorLen));
23 |             const auto& yolo = mYoloKernel[ii];
24 |             CUDA_CHECK(cudaMemcpy(mAnchor[ii], yolo.anchors, AnchorLen, cudaMemcpyHostToDevice));
25 |         }
26 |     }
27 |     YoloLayerPlugin::~YoloLayerPlugin()
28 |     {
29 |         for (int ii = 0; ii < mKernelCount; ii++)
30 |         {
31 |             CUDA_CHECK(cudaFree(mAnchor[ii]));
32 |         }
33 |         CUDA_CHECK(cudaFreeHost(mAnchor));
34 |     }
35 | 
36 |     // create the plugin at runtime from a byte stream
37 |     YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length)
38 |     {
39 |         using namespace Tn;
40 |         const char *d = reinterpret_cast<const char *>(data), *a = d;
41 |         read(d, mClassCount);
42 |         read(d, mThreadCount);
43 |         read(d, mKernelCount);
44 |         read(d, mYoloV5NetWidth);
45 |         read(d, mYoloV5NetHeight);
46 |         read(d, mMaxOutObject);
47 |         mYoloKernel.resize(mKernelCount);
48 |         auto kernelSize = mKernelCount * sizeof(YoloKernel);
49 |         memcpy(mYoloKernel.data(), d, kernelSize);
50 |         d += kernelSize;
51 |         CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*)));
52 |         size_t AnchorLen = sizeof(float) * CHECK_COUNT * 2;
53 |         for (int ii = 0; ii < mKernelCount; ii++)
54 |         {
55 |             CUDA_CHECK(cudaMalloc(&mAnchor[ii], AnchorLen));
56 |             const auto& yolo = mYoloKernel[ii];
57 |             CUDA_CHECK(cudaMemcpy(mAnchor[ii], yolo.anchors, AnchorLen, cudaMemcpyHostToDevice));
58 |         }
59 |         assert(d == a + length);
60 |     }
61 | 
62 |     void YoloLayerPlugin::serialize(void* buffer) const
63 |     {
64 |         using namespace Tn;
65 |         char* d = static_cast<char*>(buffer), *a = d;
66 |         write(d, mClassCount);
67 |         write(d, mThreadCount);
68 |         write(d, mKernelCount);
69 |         write(d, mYoloV5NetWidth);
70 |         write(d, mYoloV5NetHeight);
71 |         write(d, mMaxOutObject);
72 |         auto kernelSize = mKernelCount * sizeof(YoloKernel);
73 |         memcpy(d, mYoloKernel.data(), kernelSize);
74 |         d += kernelSize;
75 | 
76 |         assert(d == a + getSerializationSize());
77 |     }
78 | 
79 |     size_t YoloLayerPlugin::getSerializationSize() const
80 |     {
81 |         return sizeof(mClassCount) + sizeof(mThreadCount) + sizeof(mKernelCount) + sizeof(Yolo::YoloKernel) * mYoloKernel.size() + sizeof(mYoloV5NetWidth) + sizeof(mYoloV5NetHeight) + sizeof(mMaxOutObject);
82 |     }
83 | 
84 |     int YoloLayerPlugin::initialize()
85 |     {
86 |         return 0;
87 |     }
88 | 
89 |     Dims YoloLayerPlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims)
90 |     {
91 |         //output the result to channel
92 |         int totalsize = mMaxOutObject * sizeof(Detection) / sizeof(float);
93 | 
94 |         return Dims3(totalsize + 1, 1, 1);
95 |     }
96 | 
97 |     // Set plugin namespace
98 |     void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace)
99 |     {
100 |         mPluginNamespace = pluginNamespace;
101 |     }
102 | 
103 |     const char* YoloLayerPlugin::getPluginNamespace() const
104 |     {
105 |         return mPluginNamespace;
106 |     }
107 | 
108 |     // Return the DataType of the plugin output at the requested index
109 |     DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const
110 |     {
111 |         return DataType::kFLOAT;
112 |     }
113 | 
114 |     // Return true if output tensor is broadcast across a
batch. 115 | bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const 116 | { 117 | return false; 118 | } 119 | 120 | // Return true if plugin can use input that is broadcast across batch without replication. 121 | bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const 122 | { 123 | return false; 124 | } 125 | 126 | void YoloLayerPlugin::configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) 127 | { 128 | } 129 | 130 | // Attach the plugin object to an execution context and grant the plugin the access to some context resource. 131 | void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) 132 | { 133 | } 134 | 135 | // Detach the plugin object from its execution context. 136 | void YoloLayerPlugin::detachFromContext() {} 137 | 138 | const char* YoloLayerPlugin::getPluginType() const 139 | { 140 | return "YoloLayer_TRT"; 141 | } 142 | 143 | const char* YoloLayerPlugin::getPluginVersion() const 144 | { 145 | return "1"; 146 | } 147 | 148 | void YoloLayerPlugin::destroy() 149 | { 150 | delete this; 151 | } 152 | 153 | // Clone the plugin 154 | IPluginV2IOExt* YoloLayerPlugin::clone() const 155 | { 156 | YoloLayerPlugin* p = new YoloLayerPlugin(mClassCount, mYoloV5NetWidth, mYoloV5NetHeight, mMaxOutObject, mYoloKernel); 157 | p->setPluginNamespace(mPluginNamespace); 158 | return p; 159 | } 160 | 161 | __device__ float Logist(float data) { return 1.0f / (1.0f + expf(-data)); }; 162 | 163 | __global__ void CalDetection(const float *input, float *output, int noElements, 164 | const int netwidth, const int netheight, int maxoutobject, int yoloWidth, int yoloHeight, const float anchors[CHECK_COUNT * 2], int classes, int outputElem) 165 | { 166 | 167 | int idx = threadIdx.x + blockDim.x * blockIdx.x; 168 | if (idx >= noElements) return; 169 | 170 | int total_grid = yoloWidth * yoloHeight; 171 | int bnIdx = idx / total_grid; 172 | idx = idx - total_grid * bnIdx; 173 | int info_len_i = 5 + classes; 174 | const float* curInput = input + bnIdx * (info_len_i * total_grid * CHECK_COUNT); 175 | 176 | for (int k = 0; k < 3; ++k) { 177 | float box_prob = Logist(curInput[idx + k * info_len_i * total_grid + 4 * total_grid]); 178 | if (box_prob < IGNORE_THRESH) continue; 179 | int class_id = 0; 180 | float max_cls_prob = 0.0; 181 | for (int i = 5; i < info_len_i; ++i) { 182 | float p = Logist(curInput[idx + k * info_len_i * total_grid + i * total_grid]); 183 | if (p > max_cls_prob) { 184 | max_cls_prob = p; 185 | class_id = i - 5; 186 | } 187 | } 188 | float *res_count = output + bnIdx * outputElem; 189 | int count = (int)atomicAdd(res_count, 1); 190 | if (count >= maxoutobject) return; 191 | char* data = (char *)res_count + sizeof(float) + count * sizeof(Detection); 192 | Detection* det = (Detection*)(data); 193 | 194 | int row = idx / yoloWidth; 195 | int col = idx % yoloWidth; 196 | 197 | //Location 198 | // pytorch: 199 | // y = x[i].sigmoid() 200 | // y[..., 0:2] = (y[..., 0:2] * 2. 
- 0.5 + self.grid[i].to(x[i].device)) * self.stride[i] # xy
201 |             // y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i] # wh
202 |             // X: (sigmoid(tx) + cx)/FeaturemapW * netwidth
203 |             det->bbox[0] = (col - 0.5f + 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 0 * total_grid])) * netwidth / yoloWidth;
204 |             det->bbox[1] = (row - 0.5f + 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 1 * total_grid])) * netheight / yoloHeight;
205 | 
206 |             // W: (Pw * e^tw) / FeaturemapW * netwidth
207 |             // v5: https://github.com/ultralytics/yolov5/issues/471
208 |             det->bbox[2] = 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 2 * total_grid]);
209 |             det->bbox[2] = det->bbox[2] * det->bbox[2] * anchors[2 * k];
210 |             det->bbox[3] = 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 3 * total_grid]);
211 |             det->bbox[3] = det->bbox[3] * det->bbox[3] * anchors[2 * k + 1];
212 |             det->conf = box_prob * max_cls_prob;
213 |             det->class_id = class_id;
214 |         }
215 |     }
216 | 
217 |     void YoloLayerPlugin::forwardGpu(const float *const * inputs, float* output, cudaStream_t stream, int batchSize)
218 |     {
219 |         int outputElem = 1 + mMaxOutObject * sizeof(Detection) / sizeof(float);
220 |         for (int idx = 0; idx < batchSize; ++idx) {
221 |             CUDA_CHECK(cudaMemset(output + idx * outputElem, 0, sizeof(float)));
222 |         }
223 |         int numElem = 0;
224 |         for (unsigned int i = 0; i < mYoloKernel.size(); ++i)
225 |         {
226 |             const auto& yolo = mYoloKernel[i];
227 |             numElem = yolo.width * yolo.height * batchSize;
228 |             if (numElem < mThreadCount)
229 |                 mThreadCount = numElem;
230 | 
231 |             //printf("Net: %d %d \n", mYoloV5NetWidth, mYoloV5NetHeight);
232 |             // launch on the stream handed in by enqueue()
233 |             CalDetection<<<(yolo.width * yolo.height * batchSize + mThreadCount - 1) / mThreadCount, mThreadCount, 0, stream>>>
234 |                 (inputs[i], output, numElem, mYoloV5NetWidth, mYoloV5NetHeight, mMaxOutObject, yolo.width, yolo.height, (float *)mAnchor[i], mClassCount, outputElem);
235 |         }
236 |     }
237 | 
238 |     int YoloLayerPlugin::enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream)
239 |     {
240 |         forwardGpu((const float *const *)inputs, (float*)outputs[0], stream, batchSize);
241 |         return 0;
242 |     }
243 | 
244 |     PluginFieldCollection YoloPluginCreator::mFC{};
245 |     std::vector<PluginField> YoloPluginCreator::mPluginAttributes;
246 | 
247 |     YoloPluginCreator::YoloPluginCreator()
248 |     {
249 |         mPluginAttributes.clear();
250 | 
251 |         mFC.nbFields = mPluginAttributes.size();
252 |         mFC.fields = mPluginAttributes.data();
253 |     }
254 | 
255 |     const char* YoloPluginCreator::getPluginName() const
256 |     {
257 |         return "YoloLayer_TRT";
258 |     }
259 | 
260 |     const char* YoloPluginCreator::getPluginVersion() const
261 |     {
262 |         return "1";
263 |     }
264 | 
265 |     const PluginFieldCollection* YoloPluginCreator::getFieldNames()
266 |     {
267 |         return &mFC;
268 |     }
269 | 
270 |     IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc)
271 |     {
272 |         int class_count = 80;
273 |         int input_w = 416;
274 |         int input_h = 416;
275 |         int max_output_object_count = 1000;
276 |         std::vector<YoloKernel> yolo_kernels(3);
277 | 
278 |         const PluginField* fields = fc->fields;
279 |         for (int i = 0; i < fc->nbFields; i++) {
280 |             if (strcmp(fields[i].name, "netdata") == 0) {
281 |                 assert(fields[i].type == PluginFieldType::kFLOAT32);
282 |                 int *tmp = (int*)(fields[i].data);
283 |                 class_count = tmp[0];
284 |                 input_w = tmp[1];
285 |                 input_h = tmp[2];
286 |                 max_output_object_count = tmp[3];
287 |             } else if (strstr(fields[i].name, "yolodata") != NULL)
{
288 |                 assert(fields[i].type == PluginFieldType::kFLOAT32);
289 |                 int *tmp = (int*)(fields[i].data);
290 |                 YoloKernel kernel;
291 |                 kernel.width = tmp[0];
292 |                 kernel.height = tmp[1];
293 |                 for (int j = 0; j < fields[i].length - 2; j++) {
294 |                     kernel.anchors[j] = tmp[j + 2];
295 |                 }
296 |                 yolo_kernels[2 - (fields[i].name[8] - '1')] = kernel;
297 |             }
298 |         }
299 |         YoloLayerPlugin* obj = new YoloLayerPlugin(class_count, input_w, input_h, max_output_object_count, yolo_kernels);
300 |         obj->setPluginNamespace(mNamespace.c_str());
301 |         return obj;
302 |     }
303 | 
304 |     IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength)
305 |     {
306 |         // This object will be deleted when the network is destroyed, which will
307 |         // call YoloLayerPlugin::destroy()
308 |         YoloLayerPlugin* obj = new YoloLayerPlugin(serialData, serialLength);
309 |         obj->setPluginNamespace(mNamespace.c_str());
310 |         return obj;
311 |     }
312 | }
313 | 

--------------------------------------------------------------------------------
/yololayer.h:
--------------------------------------------------------------------------------
1 | #ifndef _YOLO_LAYER_H
2 | #define _YOLO_LAYER_H
3 | 
4 | #include <vector>
5 | #include <string>
6 | #include "NvInfer.h"
7 | 
8 | namespace Yolo
9 | {
10 |     static constexpr int CHECK_COUNT = 3;
11 |     static constexpr float IGNORE_THRESH = 0.1f;
12 |     struct YoloKernel
13 |     {
14 |         int width;
15 |         int height;
16 |         float anchors[CHECK_COUNT * 2];
17 |     };
18 |     static constexpr int MAX_OUTPUT_BBOX_COUNT = 1000;
19 |     static constexpr int CLASS_NUM = 80;
20 |     static constexpr int INPUT_H = 608;
21 |     static constexpr int INPUT_W = 608;
22 | 
23 |     static constexpr int LOCATIONS = 4;
24 |     struct alignas(float) Detection {
25 |         //center_x center_y w h
26 |         float bbox[LOCATIONS];
27 |         float conf;  // bbox_conf * cls_conf
28 |         float class_id;
29 |     };
30 | }
31 | 
32 | namespace nvinfer1
33 | {
34 |     class YoloLayerPlugin : public IPluginV2IOExt
35 |     {
36 |     public:
37 |         YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut, const std::vector<Yolo::YoloKernel>& vYoloKernel);
38 |         YoloLayerPlugin(const void* data, size_t length);
39 |         ~YoloLayerPlugin();
40 | 
41 |         int getNbOutputs() const override
42 |         {
43 |             return 1;
44 |         }
45 | 
46 |         Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override;
47 | 
48 |         int initialize() override;
49 | 
50 |         virtual void terminate() override {}
51 | 
52 |         virtual size_t getWorkspaceSize(int maxBatchSize) const override { return 0; }
53 | 
54 |         virtual int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override;
55 | 
56 |         virtual size_t getSerializationSize() const override;
57 | 
58 |         virtual void serialize(void* buffer) const override;
59 | 
60 |         bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const override {
61 |             return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT;
62 |         }
63 | 
64 |         const char* getPluginType() const override;
65 | 
66 |         const char* getPluginVersion() const override;
67 | 
68 |         void destroy() override;
69 | 
70 |         IPluginV2IOExt* clone() const override;
71 | 
72 |         void setPluginNamespace(const char* pluginNamespace) override;
73 | 
74 |         const char* getPluginNamespace() const override;
75 | 
76 |         DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const override;
77 | 
78 |         bool isOutputBroadcastAcrossBatch(int outputIndex, const bool*
inputIsBroadcasted, int nbInputs) const override; 79 | 80 | bool canBroadcastInputAcrossBatch(int inputIndex) const override; 81 | 82 | void attachToContext( 83 | cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) override; 84 | 85 | void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) override; 86 | 87 | void detachFromContext() override; 88 | 89 | private: 90 | void forwardGpu(const float *const * inputs, float * output, cudaStream_t stream, int batchSize = 1); 91 | int mThreadCount = 256; 92 | const char* mPluginNamespace; 93 | int mKernelCount; 94 | int mClassCount; 95 | int mYoloV5NetWidth; 96 | int mYoloV5NetHeight; 97 | int mMaxOutObject; 98 | std::vector mYoloKernel; 99 | void** mAnchor; 100 | }; 101 | 102 | class YoloPluginCreator : public IPluginCreator 103 | { 104 | public: 105 | YoloPluginCreator(); 106 | 107 | ~YoloPluginCreator() override = default; 108 | 109 | const char* getPluginName() const override; 110 | 111 | const char* getPluginVersion() const override; 112 | 113 | const PluginFieldCollection* getFieldNames() override; 114 | 115 | IPluginV2IOExt* createPlugin(const char* name, const PluginFieldCollection* fc) override; 116 | 117 | IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) override; 118 | 119 | void setPluginNamespace(const char* libNamespace) override 120 | { 121 | mNamespace = libNamespace; 122 | } 123 | 124 | const char* getPluginNamespace() const override 125 | { 126 | return mNamespace.c_str(); 127 | } 128 | 129 | private: 130 | std::string mNamespace; 131 | static PluginFieldCollection mFC; 132 | static std::vector mPluginAttributes; 133 | }; 134 | REGISTER_TENSORRT_PLUGIN(YoloPluginCreator); 135 | }; 136 | 137 | #endif 138 | -------------------------------------------------------------------------------- /yolov5.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "cuda_runtime_api.h" 4 | #include "logging.h" 5 | #include "common.hpp" 6 | 7 | #define USE_FP16 // comment out this if want to use FP32 8 | #define DEVICE 0 // GPU id 9 | #define NMS_THRESH 0.4 10 | #define CONF_THRESH 0.5 11 | #define BATCH_SIZE 1 12 | 13 | #define NET s // s m l x 14 | #define NETSTRUCT(str) createEngine_##str 15 | #define CREATENET(net) NETSTRUCT(net) 16 | #define STR1(x) #x 17 | #define STR2(x) STR1(x) 18 | 19 | // stuff we know about the network and the input/output blobs 20 | static const int INPUT_H = Yolo::INPUT_H; 21 | static const int INPUT_W = Yolo::INPUT_W; 22 | static const int CLASS_NUM = Yolo::CLASS_NUM; 23 | static const int OUTPUT_SIZE = Yolo::MAX_OUTPUT_BBOX_COUNT * sizeof(Yolo::Detection) / sizeof(float) + 1; // we assume the yololayer outputs no more than MAX_OUTPUT_BBOX_COUNT boxes that conf >= 0.1 24 | const char* INPUT_BLOB_NAME = "data"; 25 | const char* OUTPUT_BLOB_NAME = "prob"; 26 | static Logger gLogger; 27 | 28 | char *coco_classes[] = {"person","bicycle","car","motorcycle","airplane","bus","train","truck","boat","traffic light","fire hydrant","stop sign","parking meter","bench","bird","cat","dog","horse","sheep","cow","elephant","bear","zebra","giraffe","backpack","umbrella","handbag","tie","suitcase","frisbee","skis","snowboard","sports ball","kite","baseball bat","baseball glove","skateboard","surfboard", "tennis racket","bottle","wine glass","cup","fork","knife","spoon","bowl","banana","apple","sandwich","orange","broccoli","carrot","hot 
28 | char *coco_classes[] = {"person","bicycle","car","motorcycle","airplane","bus","train","truck","boat","traffic light","fire hydrant","stop sign","parking meter","bench","bird","cat","dog","horse","sheep","cow","elephant","bear","zebra","giraffe","backpack","umbrella","handbag","tie","suitcase","frisbee","skis","snowboard","sports ball","kite","baseball bat","baseball glove","skateboard","surfboard","tennis racket","bottle","wine glass","cup","fork","knife","spoon","bowl","banana","apple","sandwich","orange","broccoli","carrot","hot dog","pizza","donut","cake","chair","couch","potted plant","bed","dining table","toilet","tv","laptop","mouse","remote","keyboard","cell phone","microwave","oven","toaster","sink","refrigerator","book","clock","vase","scissors","teddy bear","hair drier","toothbrush"};
29 |
30 | // Create the engine using only the API and not any parser.
31 | ICudaEngine* createEngine_s(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) {
32 |     INetworkDefinition* network = builder->createNetworkV2(0U);
33 |
34 |     // Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME
35 |     ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{ 3, INPUT_H, INPUT_W });
36 |     assert(data);
37 |
38 |     std::map<std::string, Weights> weightMap = loadWeights("../yolov5s.wts");
39 |     Weights emptywts{ DataType::kFLOAT, nullptr, 0 };
40 |
41 |     // yolov5 backbone
42 |     auto focus0 = focus(network, weightMap, *data, 3, 32, 3, "model.0");
43 |     auto conv1 = convBlock(network, weightMap, *focus0->getOutput(0), 64, 3, 2, 1, "model.1");
44 |     auto bottleneck_CSP2 = bottleneckCSP(network, weightMap, *conv1->getOutput(0), 64, 64, 1, true, 1, 0.5, "model.2");
45 |     auto conv3 = convBlock(network, weightMap, *bottleneck_CSP2->getOutput(0), 128, 3, 2, 1, "model.3");
46 |     auto bottleneck_csp4 = bottleneckCSP(network, weightMap, *conv3->getOutput(0), 128, 128, 3, true, 1, 0.5, "model.4");
47 |     auto conv5 = convBlock(network, weightMap, *bottleneck_csp4->getOutput(0), 256, 3, 2, 1, "model.5");
48 |     auto bottleneck_csp6 = bottleneckCSP(network, weightMap, *conv5->getOutput(0), 256, 256, 3, true, 1, 0.5, "model.6");
49 |     auto conv7 = convBlock(network, weightMap, *bottleneck_csp6->getOutput(0), 512, 3, 2, 1, "model.7");
50 |     auto spp8 = SPP(network, weightMap, *conv7->getOutput(0), 512, 512, 5, 9, 13, "model.8");
51 |
52 |     // yolov5 head
53 |     auto bottleneck_csp9 = bottleneckCSP(network, weightMap, *spp8->getOutput(0), 512, 512, 1, false, 1, 0.5, "model.9");
54 |     auto conv10 = convBlock(network, weightMap, *bottleneck_csp9->getOutput(0), 256, 1, 1, 1, "model.10");
55 |
56 |     float *deval = reinterpret_cast<float*>(malloc(sizeof(float) * 256 * 2 * 2));
57 |     for (int i = 0; i < 256 * 2 * 2; i++) {
58 |         deval[i] = 1.0;
59 |     }
60 |     Weights deconvwts11{ DataType::kFLOAT, deval, 256 * 2 * 2 };
61 |     IDeconvolutionLayer* deconv11 = network->addDeconvolutionNd(*conv10->getOutput(0), 256, DimsHW{ 2, 2 }, deconvwts11, emptywts);
62 |     deconv11->setStrideNd(DimsHW{ 2, 2 });
63 |     deconv11->setNbGroups(256);
64 |     weightMap["deconv11"] = deconvwts11;
65 |
66 |     ITensor* inputTensors12[] = { deconv11->getOutput(0), bottleneck_csp6->getOutput(0) };
67 |     auto cat12 = network->addConcatenation(inputTensors12, 2);
68 |     auto bottleneck_csp13 = bottleneckCSP(network, weightMap, *cat12->getOutput(0), 512, 256, 1, false, 1, 0.5, "model.13");
69 |     auto conv14 = convBlock(network, weightMap, *bottleneck_csp13->getOutput(0), 128, 1, 1, 1, "model.14");
70 |
71 |     Weights deconvwts15{ DataType::kFLOAT, deval, 128 * 2 * 2 };
72 |     IDeconvolutionLayer* deconv15 = network->addDeconvolutionNd(*conv14->getOutput(0), 128, DimsHW{ 2, 2 }, deconvwts15, emptywts);
73 |     deconv15->setStrideNd(DimsHW{ 2, 2 });
74 |     deconv15->setNbGroups(128);
75 |
76 |     ITensor* inputTensors16[] = { deconv15->getOutput(0), bottleneck_csp4->getOutput(0) };
77 |     auto cat16 = network->addConcatenation(inputTensors16, 2);
78 |     auto bottleneck_csp17 = bottleneckCSP(network, weightMap, *cat16->getOutput(0), 256, 128, 1, false, 1, 0.5, "model.17");
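    // The three det heads below are plain 1x1 convolutions with 3 * (Yolo::CLASS_NUM + 5)
    // output channels: 3 anchors per grid cell, and per anchor 4 box values (x, y, w, h),
    // 1 objectness score, and CLASS_NUM class scores. The raw maps are decoded into
    // Detection structs by the YoloLayer plugin added at the end of the network.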
79 |     IConvolutionLayer* det0 = network->addConvolutionNd(*bottleneck_csp17->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.0.weight"], weightMap["model.24.m.0.bias"]);
80 |
81 |     auto conv18 = convBlock(network, weightMap, *bottleneck_csp17->getOutput(0), 128, 3, 2, 1, "model.18");
82 |     ITensor* inputTensors19[] = { conv18->getOutput(0), conv14->getOutput(0) };
83 |     auto cat19 = network->addConcatenation(inputTensors19, 2);
84 |     auto bottleneck_csp20 = bottleneckCSP(network, weightMap, *cat19->getOutput(0), 256, 256, 1, false, 1, 0.5, "model.20");
85 |     IConvolutionLayer* det1 = network->addConvolutionNd(*bottleneck_csp20->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.1.weight"], weightMap["model.24.m.1.bias"]);
86 |
87 |     auto conv21 = convBlock(network, weightMap, *bottleneck_csp20->getOutput(0), 256, 3, 2, 1, "model.21");
88 |     ITensor* inputTensors22[] = { conv21->getOutput(0), conv10->getOutput(0) };
89 |     auto cat22 = network->addConcatenation(inputTensors22, 2);
90 |     auto bottleneck_csp23 = bottleneckCSP(network, weightMap, *cat22->getOutput(0), 512, 512, 1, false, 1, 0.5, "model.23");
91 |     IConvolutionLayer* det2 = network->addConvolutionNd(*bottleneck_csp23->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.2.weight"], weightMap["model.24.m.2.bias"]);
92 |
93 |     auto yolo = addYoLoLayer(network, weightMap, det0, det1, det2);
94 |     yolo->getOutput(0)->setName(OUTPUT_BLOB_NAME);
95 |     network->markOutput(*yolo->getOutput(0));
96 |
97 |     // Build engine
98 |     builder->setMaxBatchSize(maxBatchSize);
99 |     config->setMaxWorkspaceSize(16 * (1 << 20));  // 16MB
100 | #ifdef USE_FP16
101 |     config->setFlag(BuilderFlag::kFP16);
102 | #endif
103 |     std::cout << "Building engine, please wait for a while..." << std::endl;
104 |     ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
105 |     std::cout << "Build engine successfully!" << std::endl;
106 |
107 |     // Don't need the network any more
108 |     network->destroy();
109 |
110 |     // Release host memory
111 |     for (auto& mem : weightMap)
112 |     {
113 |         free((void*)(mem.second.values));
114 |
115 |     }
116 |     return engine;
117 | }
118 |
119 | ICudaEngine* createEngine_m(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) {
120 |     INetworkDefinition* network = builder->createNetworkV2(0U);
121 |
122 |     // Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME
123 |     ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{ 3, INPUT_H, INPUT_W });
124 |     assert(data);
125 |
126 |     std::map<std::string, Weights> weightMap = loadWeights("../yolov5m.wts");
127 |     Weights emptywts{ DataType::kFLOAT, nullptr, 0 };
128 |
129 |     /* ------ yolov5 backbone------ */
130 |     auto focus0 = focus(network, weightMap, *data, 3, 48, 3, "model.0");
131 |     auto conv1 = convBlock(network, weightMap, *focus0->getOutput(0), 96, 3, 2, 1, "model.1");
132 |     auto bottleneck_CSP2 = bottleneckCSP(network, weightMap, *conv1->getOutput(0), 96, 96, 2, true, 1, 0.5, "model.2");
133 |     auto conv3 = convBlock(network, weightMap, *bottleneck_CSP2->getOutput(0), 192, 3, 2, 1, "model.3");
134 |     auto bottleneck_csp4 = bottleneckCSP(network, weightMap, *conv3->getOutput(0), 192, 192, 6, true, 1, 0.5, "model.4");
135 |     auto conv5 = convBlock(network, weightMap, *bottleneck_csp4->getOutput(0), 384, 3, 2, 1, "model.5");
136 |     auto bottleneck_csp6 = bottleneckCSP(network, weightMap, *conv5->getOutput(0), 384, 384, 6, true, 1, 0.5, "model.6");
137 |     auto conv7 = convBlock(network, weightMap, *bottleneck_csp6->getOutput(0), 768, 3, 2, 1, "model.7");
138 |     auto spp8 = SPP(network, weightMap, *conv7->getOutput(0), 768, 768, 5, 9, 13, "model.8");
139 |     /* ------ yolov5 head ------ */
140 |     auto bottleneck_csp9 = bottleneckCSP(network, weightMap, *spp8->getOutput(0), 768, 768, 2, false, 1, 0.5, "model.9");
141 |     auto conv10 = convBlock(network, weightMap, *bottleneck_csp9->getOutput(0), 384, 1, 1, 1, "model.10");
142 |
143 |     float *deval = reinterpret_cast<float*>(malloc(sizeof(float) * 384 * 2 * 2));
144 |     for (int i = 0; i < 384 * 2 * 2; i++) {
145 |         deval[i] = 1.0;
146 |     }
147 |     Weights deconvwts11{ DataType::kFLOAT, deval, 384 * 2 * 2 };
148 |     IDeconvolutionLayer* deconv11 = network->addDeconvolutionNd(*conv10->getOutput(0), 384, DimsHW{ 2, 2 }, deconvwts11, emptywts);
149 |     deconv11->setStrideNd(DimsHW{ 2, 2 });
150 |     deconv11->setNbGroups(384);
151 |     weightMap["deconv11"] = deconvwts11;
152 |     ITensor* inputTensors12[] = { deconv11->getOutput(0), bottleneck_csp6->getOutput(0) };
153 |     auto cat12 = network->addConcatenation(inputTensors12, 2);
154 |
155 |     auto bottleneck_csp13 = bottleneckCSP(network, weightMap, *cat12->getOutput(0), 768, 384, 2, false, 1, 0.5, "model.13");
156 |
157 |     auto conv14 = convBlock(network, weightMap, *bottleneck_csp13->getOutput(0), 192, 1, 1, 1, "model.14");
158 |
159 |     Weights deconvwts15{ DataType::kFLOAT, deval, 192 * 2 * 2 };
160 |     IDeconvolutionLayer* deconv15 = network->addDeconvolutionNd(*conv14->getOutput(0), 192, DimsHW{ 2, 2 }, deconvwts15, emptywts);
161 |     deconv15->setStrideNd(DimsHW{ 2, 2 });
162 |     deconv15->setNbGroups(192);
163 |
164 |     ITensor* inputTensors16[] = { deconv15->getOutput(0), bottleneck_csp4->getOutput(0) };
165 |     auto cat16 = network->addConcatenation(inputTensors16, 2);
166 |     auto bottleneck_csp17 = bottleneckCSP(network, weightMap, *cat16->getOutput(0), 384, 192, 2, false, 1, 0.5, "model.17");
167 |
168 |     //yolo layer 0
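    // Note: the grouped IDeconvolutionLayer blocks used in every createEngine_* head
    // (groups == channels, 2x2 all-ones kernels, stride 2) effectively reproduce the
    // nn.Upsample(scale_factor=2, mode='nearest') layers of the PyTorch model: with no
    // kernel overlap, each input pixel is simply copied into a 2x2 output block.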
169 |     IConvolutionLayer* det0 = network->addConvolutionNd(*bottleneck_csp17->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.0.weight"], weightMap["model.24.m.0.bias"]);
170 |     auto conv18 = convBlock(network, weightMap, *bottleneck_csp17->getOutput(0), 192, 3, 2, 1, "model.18");
171 |     ITensor* inputTensors19[] = { conv18->getOutput(0), conv14->getOutput(0) };
172 |     auto cat19 = network->addConcatenation(inputTensors19, 2);
173 |     auto bottleneck_csp20 = bottleneckCSP(network, weightMap, *cat19->getOutput(0), 384, 384, 2, false, 1, 0.5, "model.20");
174 |
175 |     //yolo layer 1
176 |     IConvolutionLayer* det1 = network->addConvolutionNd(*bottleneck_csp20->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.1.weight"], weightMap["model.24.m.1.bias"]);
177 |     auto conv21 = convBlock(network, weightMap, *bottleneck_csp20->getOutput(0), 384, 3, 2, 1, "model.21");
178 |     ITensor* inputTensors22[] = { conv21->getOutput(0), conv10->getOutput(0) };
179 |     auto cat22 = network->addConcatenation(inputTensors22, 2);
180 |     auto bottleneck_csp23 = bottleneckCSP(network, weightMap, *cat22->getOutput(0), 768, 768, 2, false, 1, 0.5, "model.23");
181 |     // yolo layer 2
182 |     IConvolutionLayer* det2 = network->addConvolutionNd(*bottleneck_csp23->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.2.weight"], weightMap["model.24.m.2.bias"]);
183 |
184 |     auto yolo = addYoLoLayer(network, weightMap, det0, det1, det2);
185 |     yolo->getOutput(0)->setName(OUTPUT_BLOB_NAME);
186 |     network->markOutput(*yolo->getOutput(0));
187 |
188 |     // Build engine
189 |     builder->setMaxBatchSize(maxBatchSize);
190 |     config->setMaxWorkspaceSize(16 * (1 << 20));  // 16MB
191 | #ifdef USE_FP16
192 |     config->setFlag(BuilderFlag::kFP16);
193 | #endif
194 |     std::cout << "Building engine, please wait for a while..." << std::endl;
195 |     ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
196 |     std::cout << "Build engine successfully!" << std::endl;
197 |
198 |     // Don't need the network any more
199 |     network->destroy();
200 |
201 |     // Release host memory
202 |     for (auto& mem : weightMap)
203 |     {
204 |         free((void*)(mem.second.values));
205 |     }
206 |
207 |     return engine;
208 | }
209 |
210 | ICudaEngine* createEngine_l(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) {
211 |     INetworkDefinition* network = builder->createNetworkV2(0U);
212 |
213 |     // Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME
214 |     ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{ 3, INPUT_H, INPUT_W });
215 |     assert(data);
216 |
217 |     std::map<std::string, Weights> weightMap = loadWeights("../yolov5l.wts");
218 |     Weights emptywts{ DataType::kFLOAT, nullptr, 0 };
219 |
220 |     /* ------ yolov5 backbone------ */
221 |     auto focus0 = focus(network, weightMap, *data, 3, 64, 3, "model.0");
222 |     auto conv1 = convBlock(network, weightMap, *focus0->getOutput(0), 128, 3, 2, 1, "model.1");
223 |     auto bottleneck_CSP2 = bottleneckCSP(network, weightMap, *conv1->getOutput(0), 128, 128, 3, true, 1, 0.5, "model.2");
224 |     auto conv3 = convBlock(network, weightMap, *bottleneck_CSP2->getOutput(0), 256, 3, 2, 1, "model.3");
225 |     auto bottleneck_csp4 = bottleneckCSP(network, weightMap, *conv3->getOutput(0), 256, 256, 9, true, 1, 0.5, "model.4");
226 |     auto conv5 = convBlock(network, weightMap, *bottleneck_csp4->getOutput(0), 512, 3, 2, 1, "model.5");
227 |     auto bottleneck_csp6 = bottleneckCSP(network, weightMap, *conv5->getOutput(0), 512, 512, 9, true, 1, 0.5, "model.6");
228 |     auto conv7 = convBlock(network, weightMap, *bottleneck_csp6->getOutput(0), 1024, 3, 2, 1, "model.7");
229 |     auto spp8 = SPP(network, weightMap, *conv7->getOutput(0), 1024, 1024, 5, 9, 13, "model.8");
230 |
231 |     /* ------ yolov5 head ------ */
232 |     auto bottleneck_csp9 = bottleneckCSP(network, weightMap, *spp8->getOutput(0), 1024, 1024, 3, false, 1, 0.5, "model.9");
233 |     auto conv10 = convBlock(network, weightMap, *bottleneck_csp9->getOutput(0), 512, 1, 1, 1, "model.10");
234 |
235 |     float *deval = reinterpret_cast<float*>(malloc(sizeof(float) * 512 * 2 * 2));
236 |     for (int i = 0; i < 512 * 2 * 2; i++) {
237 |         deval[i] = 1.0;
238 |     }
239 |     Weights deconvwts11{ DataType::kFLOAT, deval, 512 * 2 * 2 };
240 |     IDeconvolutionLayer* deconv11 = network->addDeconvolutionNd(*conv10->getOutput(0), 512, DimsHW{ 2, 2 }, deconvwts11, emptywts);
241 |     deconv11->setStrideNd(DimsHW{ 2, 2 });
242 |     deconv11->setNbGroups(512);
243 |     weightMap["deconv11"] = deconvwts11;
244 |
245 |     ITensor* inputTensors12[] = { deconv11->getOutput(0), bottleneck_csp6->getOutput(0) };
246 |     auto cat12 = network->addConcatenation(inputTensors12, 2);
247 |     auto bottleneck_csp13 = bottleneckCSP(network, weightMap, *cat12->getOutput(0), 1024, 512, 3, false, 1, 0.5, "model.13");
248 |     auto conv14 = convBlock(network, weightMap, *bottleneck_csp13->getOutput(0), 256, 1, 1, 1, "model.14");
249 |
250 |     Weights deconvwts15{ DataType::kFLOAT, deval, 256 * 2 * 2 };
251 |     IDeconvolutionLayer* deconv15 = network->addDeconvolutionNd(*conv14->getOutput(0), 256, DimsHW{ 2, 2 }, deconvwts15, emptywts);
252 |     deconv15->setStrideNd(DimsHW{ 2, 2 });
253 |     deconv15->setNbGroups(256);
254 |     ITensor* inputTensors16[] = { deconv15->getOutput(0), bottleneck_csp4->getOutput(0) };
255 |     auto cat16 = network->addConcatenation(inputTensors16, 2);
256 |
257 |     auto bottleneck_csp17 = bottleneckCSP(network, weightMap, *cat16->getOutput(0), 512, 256, 3, false, 1, 0.5, "model.17");
258 |
259 |     // yolo layer 0
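    // det0, det1 and det2 predict on feature maps at 1/8, 1/16 and 1/32 of the input
    // resolution (76x76, 38x38 and 19x19 for a 608x608 input), so small, medium and
    // large objects are each handled at an appropriate scale.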
260 |     IConvolutionLayer* det0 = network->addConvolutionNd(*bottleneck_csp17->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.0.weight"], weightMap["model.24.m.0.bias"]);
261 |     auto conv18 = convBlock(network, weightMap, *bottleneck_csp17->getOutput(0), 256, 3, 2, 1, "model.18");
262 |     ITensor* inputTensors19[] = { conv18->getOutput(0), conv14->getOutput(0) };
263 |     auto cat19 = network->addConcatenation(inputTensors19, 2);
264 |     auto bottleneck_csp20 = bottleneckCSP(network, weightMap, *cat19->getOutput(0), 512, 512, 3, false, 1, 0.5, "model.20");
265 |     //yolo layer 1
266 |     IConvolutionLayer* det1 = network->addConvolutionNd(*bottleneck_csp20->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.1.weight"], weightMap["model.24.m.1.bias"]);
267 |     auto conv21 = convBlock(network, weightMap, *bottleneck_csp20->getOutput(0), 512, 3, 2, 1, "model.21");
268 |     ITensor* inputTensors22[] = { conv21->getOutput(0), conv10->getOutput(0) };
269 |     auto cat22 = network->addConcatenation(inputTensors22, 2);
270 |     auto bottleneck_csp23 = bottleneckCSP(network, weightMap, *cat22->getOutput(0), 1024, 1024, 3, false, 1, 0.5, "model.23");
271 |     IConvolutionLayer* det2 = network->addConvolutionNd(*bottleneck_csp23->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.2.weight"], weightMap["model.24.m.2.bias"]);
272 |
273 |     auto yolo = addYoLoLayer(network, weightMap, det0, det1, det2);
274 |     yolo->getOutput(0)->setName(OUTPUT_BLOB_NAME);
275 |     network->markOutput(*yolo->getOutput(0));
276 |
277 |     // Build engine
278 |     builder->setMaxBatchSize(maxBatchSize);
279 |     config->setMaxWorkspaceSize(16 * (1 << 20));  // 16MB
280 | #ifdef USE_FP16
281 |     config->setFlag(BuilderFlag::kFP16);
282 | #endif
283 |     std::cout << "Building engine, please wait for a while..." << std::endl;
284 |     ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
285 |     std::cout << "Build engine successfully!" << std::endl;
286 |
287 |     // Don't need the network any more
288 |     network->destroy();
289 |
290 |     // Release host memory
291 |     for (auto& mem : weightMap)
292 |     {
293 |         free((void*)(mem.second.values));
294 |     }
295 |
296 |     return engine;
297 | }
298 |
299 | ICudaEngine* createEngine_x(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) {
300 |     INetworkDefinition* network = builder->createNetworkV2(0U);
301 |
302 |     // Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME
303 |     ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{ 3, INPUT_H, INPUT_W });
304 |     assert(data);
305 |
306 |     std::map<std::string, Weights> weightMap = loadWeights("../yolov5x.wts");
307 |     Weights emptywts{ DataType::kFLOAT, nullptr, 0 };
308 |
309 |     /* ------ yolov5 backbone------ */
310 |     auto focus0 = focus(network, weightMap, *data, 3, 80, 3, "model.0");
311 |     auto conv1 = convBlock(network, weightMap, *focus0->getOutput(0), 160, 3, 2, 1, "model.1");
312 |     auto bottleneck_CSP2 = bottleneckCSP(network, weightMap, *conv1->getOutput(0), 160, 160, 4, true, 1, 0.5, "model.2");
313 |     auto conv3 = convBlock(network, weightMap, *bottleneck_CSP2->getOutput(0), 320, 3, 2, 1, "model.3");
314 |     auto bottleneck_csp4 = bottleneckCSP(network, weightMap, *conv3->getOutput(0), 320, 320, 12, true, 1, 0.5, "model.4");
315 |     auto conv5 = convBlock(network, weightMap, *bottleneck_csp4->getOutput(0), 640, 3, 2, 1, "model.5");
316 |     auto bottleneck_csp6 = bottleneckCSP(network, weightMap, *conv5->getOutput(0), 640, 640, 12, true, 1, 0.5, "model.6");
317 |     auto conv7 = convBlock(network, weightMap, *bottleneck_csp6->getOutput(0), 1280, 3, 2, 1, "model.7");
318 |     auto spp8 = SPP(network, weightMap, *conv7->getOutput(0), 1280, 1280, 5, 9, 13, "model.8");
319 |
320 |     /* ------- yolov5 head ------- */
321 |     auto bottleneck_csp9 = bottleneckCSP(network, weightMap, *spp8->getOutput(0), 1280, 1280, 4, false, 1, 0.5, "model.9");
322 |     auto conv10 = convBlock(network, weightMap, *bottleneck_csp9->getOutput(0), 640, 1, 1, 1, "model.10");
323 |
324 |     float *deval = reinterpret_cast<float*>(malloc(sizeof(float) * 640 * 2 * 2));
325 |     for (int i = 0; i < 640 * 2 * 2; i++) {
326 |         deval[i] = 1.0;
327 |     }
328 |     Weights deconvwts11{ DataType::kFLOAT, deval, 640 * 2 * 2 };
329 |     IDeconvolutionLayer* deconv11 = network->addDeconvolutionNd(*conv10->getOutput(0), 640, DimsHW{ 2, 2 }, deconvwts11, emptywts);
330 |     deconv11->setStrideNd(DimsHW{ 2, 2 });
331 |     deconv11->setNbGroups(640);
332 |     weightMap["deconv11"] = deconvwts11;
333 |
334 |     ITensor* inputTensors12[] = { deconv11->getOutput(0), bottleneck_csp6->getOutput(0) };
335 |     auto cat12 = network->addConcatenation(inputTensors12, 2);
336 |
337 |     auto bottleneck_csp13 = bottleneckCSP(network, weightMap, *cat12->getOutput(0), 1280, 640, 4, false, 1, 0.5, "model.13");
338 |     auto conv14 = convBlock(network, weightMap, *bottleneck_csp13->getOutput(0), 320, 1, 1, 1, "model.14");
339 |
340 |     Weights deconvwts15{ DataType::kFLOAT, deval, 320 * 2 * 2 };
341 |     IDeconvolutionLayer* deconv15 = network->addDeconvolutionNd(*conv14->getOutput(0), 320, DimsHW{ 2, 2 }, deconvwts15, emptywts);
342 |     deconv15->setStrideNd(DimsHW{ 2, 2 });
343 |     deconv15->setNbGroups(320);
344 |     ITensor* inputTensors16[] = { deconv15->getOutput(0), bottleneck_csp4->getOutput(0) };
345 |     auto cat16 = network->addConcatenation(inputTensors16, 2);
346 |
347 |     auto bottleneck_csp17 = bottleneckCSP(network, weightMap, *cat16->getOutput(0), 640, 320, 4, false, 1, 0.5, "model.17");
348 |
349 |     // yolo layer 0
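    // createEngine_s/m/l/x share the same layer graph and differ only in width and
    // depth: the base channel count grows 32 -> 48 -> 64 -> 80 and the bottleneckCSP
    // repeat counts grow 1/3 -> 2/6 -> 3/9 -> 4/12, mirroring the width_multiple and
    // depth_multiple settings of the corresponding yolov5 model configs.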
350 |     IConvolutionLayer* det0 = network->addConvolutionNd(*bottleneck_csp17->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.0.weight"], weightMap["model.24.m.0.bias"]);
351 |     auto conv18 = convBlock(network, weightMap, *bottleneck_csp17->getOutput(0), 320, 3, 2, 1, "model.18");
352 |     ITensor* inputTensors19[] = { conv18->getOutput(0), conv14->getOutput(0) };
353 |     auto cat19 = network->addConcatenation(inputTensors19, 2);
354 |     auto bottleneck_csp20 = bottleneckCSP(network, weightMap, *cat19->getOutput(0), 640, 640, 4, false, 1, 0.5, "model.20");
355 |     // yolo layer 1
356 |     IConvolutionLayer* det1 = network->addConvolutionNd(*bottleneck_csp20->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.1.weight"], weightMap["model.24.m.1.bias"]);
357 |     auto conv21 = convBlock(network, weightMap, *bottleneck_csp20->getOutput(0), 640, 3, 2, 1, "model.21");
358 |     ITensor* inputTensors22[] = { conv21->getOutput(0), conv10->getOutput(0) };
359 |     auto cat22 = network->addConcatenation(inputTensors22, 2);
360 |     auto bottleneck_csp23 = bottleneckCSP(network, weightMap, *cat22->getOutput(0), 1280, 1280, 4, false, 1, 0.5, "model.23");
361 |     // yolo layer 2
362 |     IConvolutionLayer* det2 = network->addConvolutionNd(*bottleneck_csp23->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.2.weight"], weightMap["model.24.m.2.bias"]);
363 |
364 |     auto yolo = addYoLoLayer(network, weightMap, det0, det1, det2);
365 |     yolo->getOutput(0)->setName(OUTPUT_BLOB_NAME);
366 |     network->markOutput(*yolo->getOutput(0));
367 |
368 |     // Build engine
369 |     builder->setMaxBatchSize(maxBatchSize);
370 |     config->setMaxWorkspaceSize(16 * (1 << 20));  // 16MB
371 | #ifdef USE_FP16
372 |     config->setFlag(BuilderFlag::kFP16);
373 | #endif
374 |     std::cout << "Building engine, please wait for a while..." << std::endl;
375 |     ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
376 |     std::cout << "Build engine successfully!" << std::endl;
377 |
378 |     // Don't need the network any more
379 |     network->destroy();
380 |
381 |     // Release host memory
382 |     for (auto& mem : weightMap)
383 |     {
384 |         free((void*)(mem.second.values));
385 |     }
386 |
387 |     return engine;
388 | }
389 |
390 | void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) {
391 |     // Create builder
392 |     IBuilder* builder = createInferBuilder(gLogger);
393 |     IBuilderConfig* config = builder->createBuilderConfig();
394 |
395 |     // Create model to populate the network, then set the outputs and create an engine
396 |     ICudaEngine* engine = (CREATENET(NET))(maxBatchSize, builder, config, DataType::kFLOAT);
397 |     //ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT);
398 |     assert(engine != nullptr);
399 |
400 |     // Serialize the engine
401 |     (*modelStream) = engine->serialize();
402 |
403 |     // Close everything down
404 |     engine->destroy();
405 |     builder->destroy();
406 | }
407 |
408 | void doInference(IExecutionContext& context, cudaStream_t& stream, void **buffers, float* input, float* output, int batchSize) {
409 |     // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
410 |     CHECK(cudaMemcpyAsync(buffers[0], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
411 |     context.enqueue(batchSize, buffers, stream, nullptr);
412 |     CHECK(cudaMemcpyAsync(output, buffers[1], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
413 |     cudaStreamSynchronize(stream);
414 | }
415 |
416 | int main(int argc, char** argv) {
417 |     cudaSetDevice(DEVICE);
418 |     // create a model using the API directly and serialize it to a stream
419 |     char *trtModelStream{ nullptr };
420 |     size_t size{ 0 };
421 |     std::string engine_name = STR2(NET);
422 |     engine_name = "yolov5" + engine_name + ".engine";
423 |     if (argc == 2 && std::string(argv[1]) == "-s") {
424 |         IHostMemory* modelStream{ nullptr };
425 |         APIToModel(BATCH_SIZE, &modelStream);
426 |         assert(modelStream != nullptr);
427 |         std::ofstream p(engine_name, std::ios::binary);
428 |         if (!p) {
429 |             std::cerr << "could not open plan output file" << std::endl;
430 |             return -1;
431 |         }
432 |         p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
433 |         modelStream->destroy();
434 |         return 0;
435 |     } else if (argc == 2 && std::string(argv[1]) == "-v") {
436 |         std::ifstream file(engine_name, std::ios::binary);
437 |         if (file.good()) {
438 |             file.seekg(0, file.end);
439 |             size = file.tellg();
440 |             file.seekg(0, file.beg);
441 |             trtModelStream = new char[size];
442 |             assert(trtModelStream);
443 |             file.read(trtModelStream, size);
444 |             file.close();
445 |         }
446 |     } else {
447 |         std::cerr << "arguments not right!" << std::endl;
448 |         std::cerr << "./yolov5 -s  // serialize model to plan file" << std::endl;
449 |         std::cerr << "./yolov5 -v  // run inference with camera" << std::endl;
450 |         return -1;
451 |     }
452 |
453 |     std::vector<std::string> file_names;
454 |     /*if (read_files_in_dir(argv[2], file_names) < 0) {
455 |         std::cout << "read_files_in_dir failed." << std::endl;
456 |         return -1;
457 |     }*/
458 |
459 |     // prepare input data ---------------------------
460 |     static float data[BATCH_SIZE * 3 * INPUT_H * INPUT_W];
461 |     //for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++)
462 |     //    data[i] = 1.0;
463 |     static float prob[BATCH_SIZE * OUTPUT_SIZE];
464 |     IRuntime* runtime = createInferRuntime(gLogger);
465 |     assert(runtime != nullptr);
466 |     ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);
467 |     assert(engine != nullptr);
468 |     IExecutionContext* context = engine->createExecutionContext();
469 |     assert(context != nullptr);
470 |     delete[] trtModelStream;
471 |     assert(engine->getNbBindings() == 2);
472 |     void* buffers[2];
473 |     // In order to bind the buffers, we need to know the names of the input and output tensors.
474 |     // Note that indices are guaranteed to be less than IEngine::getNbBindings()
475 |     const int inputIndex = engine->getBindingIndex(INPUT_BLOB_NAME);
476 |     const int outputIndex = engine->getBindingIndex(OUTPUT_BLOB_NAME);
477 |     assert(inputIndex == 0);
478 |     assert(outputIndex == 1);
479 |     // Create GPU buffers on device
480 |     CHECK(cudaMalloc(&buffers[inputIndex], BATCH_SIZE * 3 * INPUT_H * INPUT_W * sizeof(float)));
481 |     CHECK(cudaMalloc(&buffers[outputIndex], BATCH_SIZE * OUTPUT_SIZE * sizeof(float)));
482 |     // Create stream
483 |     cudaStream_t stream;
484 |     CHECK(cudaStreamCreate(&stream));
485 |
486 |     cv::VideoCapture capture(0);
487 |     //cv::VideoCapture capture("../overpass.mp4");
488 |     //int fourcc = cv::VideoWriter::fourcc('M','J','P','G');
489 |     //capture.set(cv::CAP_PROP_FOURCC, fourcc);
490 |     if (!capture.isOpened()) {
491 |         std::cout << "Error opening video stream or file" << std::endl;
492 |         return -1;
493 |     }
494 |
495 |     int key;
496 |     int fcount = 0;
497 |     while (1)
498 |     {
499 |         cv::Mat frame;
500 |         capture >> frame;
501 |         if (frame.empty())
502 |         {
503 |             std::cout << "Failed to read image from camera!" << std::endl;
504 |             break;
505 |         }
506 |
507 |         fcount++;
508 |         for (int b = 0; b < fcount; b++) {
509 |             cv::Mat img = frame;
510 |             if (img.empty()) continue;
511 |             cv::Mat pr_img = preprocess_img(img);  // letterbox and BGR to RGB
512 |             int i = 0;
513 |             for (int row = 0; row < INPUT_H; ++row) {
514 |                 uchar* uc_pixel = pr_img.data + row * pr_img.step;
515 |                 for (int col = 0; col < INPUT_W; ++col) {
516 |                     data[b * 3 * INPUT_H * INPUT_W + i] = (float)uc_pixel[2] / 255.0;
517 |                     data[b * 3 * INPUT_H * INPUT_W + i + INPUT_H * INPUT_W] = (float)uc_pixel[1] / 255.0;
518 |                     data[b * 3 * INPUT_H * INPUT_W + i + 2 * INPUT_H * INPUT_W] = (float)uc_pixel[0] / 255.0;
519 |                     uc_pixel += 3;
520 |                     ++i;
521 |                 }
522 |             }
523 |         }
524 |
525 |         // Run inference
526 |         auto start = std::chrono::system_clock::now();
527 |         doInference(*context, stream, buffers, data, prob, BATCH_SIZE);
528 |         auto end = std::chrono::system_clock::now();
529 |         int fps = 1000.0 / std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
530 |         //std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
531 |         std::vector<std::vector<Yolo::Detection>> batch_res(fcount);
532 |         for (int b = 0; b < fcount; b++) {
533 |             auto& res = batch_res[b];
534 |             nms(res, &prob[b * OUTPUT_SIZE], CONF_THRESH, NMS_THRESH);
535 |         }
536 |         for (int b = 0; b < fcount; b++) {
537 |             auto& res = batch_res[b];
538 |             //std::cout << res.size() << std::endl;
539 |             for (size_t j = 0; j < res.size(); j++) {
540 |                 cv::Rect r = get_rect(frame, res[j].bbox);
541 |                 cv::rectangle(frame, r, cv::Scalar(0x27, 0xC1, 0x36), 2);
542 |                 std::string label = coco_classes[(int)res[j].class_id];
543 |                 cv::putText(frame, label, cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 1.2);
544 |                 std::string jetson_fps = "Jetson Nano FPS: " + std::to_string(fps);
545 |                 cv::putText(frame, jetson_fps, cv::Point(11, 80), cv::FONT_HERSHEY_PLAIN, 3, cv::Scalar(0, 0, 255), 2, cv::LINE_AA);
546 |             }
547 |         }
548 |
549 |         cv::imshow("yolov5", frame);
550 |         key = cv::waitKey(1);
551 |         if (key == 'q') {
552 |             break;
553 |         }
554 |
555 |         fcount = 0;
556 |     }
557 |
558 |     capture.release();
559 |     // Release stream and buffers
560 |     cudaStreamDestroy(stream);
561 |     CHECK(cudaFree(buffers[inputIndex]));
562 |     CHECK(cudaFree(buffers[outputIndex]));
563 |     // Destroy the engine
564 |     context->destroy();
565 |     engine->destroy();
566 |     runtime->destroy();
567 |
568 |     return 0;
569 | }
570 |
--------------------------------------------------------------------------------
/yolov5_trt.py:
--------------------------------------------------------------------------------
1 | """
2 | An example that uses TensorRT's Python API to make inferences.
3 | """
4 | import ctypes
5 | import os
6 | import random
7 | import sys
8 | import threading
9 | import time
10 |
11 | import cv2
12 | import numpy as np
13 | import pycuda.autoinit
14 | import pycuda.driver as cuda
15 | import tensorrt as trt
16 | import torch
17 | import torchvision
18 |
19 | INPUT_W = 608
20 | INPUT_H = 608
21 | CONF_THRESH = 0.1
22 | IOU_THRESHOLD = 0.4
23 |
24 |
25 | def plot_one_box(x, img, color=None, label=None, line_thickness=None):
26 |     """
27 |     description: Plots one bounding box on image img;
28 |                  this function comes from the YoLov5 project.
29 |     param:
30 |         x: a box like [x1,y1,x2,y2]
31 |         img: an OpenCV image object
32 |         color: color to draw rectangle, such as (0,255,0)
33 |         label: str
34 |         line_thickness: int
35 |     return:
36 |         no return
37 |
38 |     """
39 |     tl = (
40 |         line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1
41 |     )  # line/font thickness
42 |     color = color or [random.randint(0, 255) for _ in range(3)]
43 |     c1, c2 = (int(x[0]), int(x[1])), (int(x[2]), int(x[3]))
44 |     cv2.rectangle(img, c1, c2, color, thickness=tl, lineType=cv2.LINE_AA)
45 |     if label:
46 |         tf = max(tl - 1, 1)  # font thickness
47 |         t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0]
48 |         c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3
49 |         cv2.rectangle(img, c1, c2, color, -1, cv2.LINE_AA)  # filled
50 |         cv2.putText(
51 |             img,
52 |             label,
53 |             (c1[0], c1[1] - 2),
54 |             0,
55 |             tl / 3,
56 |             [225, 255, 255],
57 |             thickness=tf,
58 |             lineType=cv2.LINE_AA,
59 |         )
60 |
61 |
62 | class YoLov5TRT(object):
63 |     """
64 |     description: A YOLOv5 class that wraps TensorRT ops, preprocess and postprocess ops.
65 |     """
66 |
67 |     def __init__(self, engine_file_path):
68 |         # Create a Context on this device,
69 |         self.cfx = cuda.Device(0).make_context()
70 |         stream = cuda.Stream()
71 |         TRT_LOGGER = trt.Logger(trt.Logger.INFO)
72 |         runtime = trt.Runtime(TRT_LOGGER)
73 |
74 |         # Deserialize the engine from file
75 |         with open(engine_file_path, "rb") as f:
76 |             engine = runtime.deserialize_cuda_engine(f.read())
77 |         context = engine.create_execution_context()
78 |
79 |         host_inputs = []
80 |         cuda_inputs = []
81 |         host_outputs = []
82 |         cuda_outputs = []
83 |         bindings = []
84 |
85 |         for binding in engine:
86 |             size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
87 |             dtype = trt.nptype(engine.get_binding_dtype(binding))
88 |             # Allocate host and device buffers
89 |             host_mem = cuda.pagelocked_empty(size, dtype)
90 |             cuda_mem = cuda.mem_alloc(host_mem.nbytes)
91 |             # Append the device buffer to device bindings.
92 |             bindings.append(int(cuda_mem))
93 |             # Append to the appropriate list.
94 |             if engine.binding_is_input(binding):
95 |                 host_inputs.append(host_mem)
96 |                 cuda_inputs.append(cuda_mem)
97 |             else:
98 |                 host_outputs.append(host_mem)
99 |                 cuda_outputs.append(cuda_mem)
100 |
101 |         # Store
102 |         self.stream = stream
103 |         self.context = context
104 |         self.engine = engine
105 |         self.host_inputs = host_inputs
106 |         self.cuda_inputs = cuda_inputs
107 |         self.host_outputs = host_outputs
108 |         self.cuda_outputs = cuda_outputs
109 |         self.bindings = bindings
110 |
111 |     def infer(self, input_image_path):
112 |         threading.Thread.__init__(self)
113 |         # Make self the active context, pushing it on top of the context stack.
114 |         self.cfx.push()
115 |         # Restore
116 |         stream = self.stream
117 |         context = self.context
118 |         engine = self.engine
119 |         host_inputs = self.host_inputs
120 |         cuda_inputs = self.cuda_inputs
121 |         host_outputs = self.host_outputs
122 |         cuda_outputs = self.cuda_outputs
123 |         bindings = self.bindings
124 |         # Do image preprocess
125 |         input_image, image_raw, origin_h, origin_w = self.preprocess_image(
126 |             input_image_path
127 |         )
128 |         # Copy input image to host buffer
129 |         np.copyto(host_inputs[0], input_image.ravel())
130 |         # Transfer input data to the GPU.
131 |         cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
132 |         # Run inference.
133 |         context.execute_async(bindings=bindings, stream_handle=stream.handle)
134 |         # Transfer predictions back from the GPU.
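        # (The flat output buffer follows the YoloLayer plugin's format: element 0
        # holds the number of detections, followed by records of 6 floats each,
        # [center_x, center_y, w, h, conf, class_id]; post_process() below relies
        # on exactly this layout.)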
135 |         cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
136 |         # Synchronize the stream
137 |         stream.synchronize()
138 |         # Remove any context from the top of the context stack, deactivating it.
139 |         self.cfx.pop()
140 |         # Here we use the first row of output since batch_size = 1
141 |         output = host_outputs[0]
142 |         # Do postprocess
143 |         result_boxes, result_scores, result_classid = self.post_process(
144 |             output, origin_h, origin_w
145 |         )
146 |         # Draw rectangles and labels on the original image
147 |         for i in range(len(result_boxes)):
148 |             box = result_boxes[i]
149 |             plot_one_box(
150 |                 box,
151 |                 image_raw,
152 |                 label="{}:{:.2f}".format(
153 |                     categories[int(result_classid[i])], result_scores[i]
154 |                 ),
155 |             )
156 |         parent, filename = os.path.split(input_image_path)
157 |         save_name = os.path.join(parent, "output_" + filename)
158 |         # Save image
159 |         cv2.imwrite(save_name, image_raw)
160 |
161 |     def destroy(self):
162 |         # Remove any context from the top of the context stack, deactivating it.
163 |         self.cfx.pop()
164 |
165 |     def preprocess_image(self, input_image_path):
166 |         """
167 |         description: Read an image from image path, convert it to RGB,
168 |                      resize and pad it to target size, normalize to [0,1],
169 |                      transform to NCHW format.
170 |         param:
171 |             input_image_path: str, image path
172 |         return:
173 |             image: the processed image
174 |             image_raw: the original image
175 |             h: original height
176 |             w: original width
177 |         """
178 |         image_raw = cv2.imread(input_image_path)
179 |         h, w, c = image_raw.shape
180 |         image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB)
181 |         # Calculate width and height and paddings
182 |         r_w = INPUT_W / w
183 |         r_h = INPUT_H / h
184 |         if r_h > r_w:
185 |             tw = INPUT_W
186 |             th = int(r_w * h)
187 |             tx1 = tx2 = 0
188 |             ty1 = int((INPUT_H - th) / 2)
189 |             ty2 = INPUT_H - th - ty1
190 |         else:
191 |             tw = int(r_h * w)
192 |             th = INPUT_H
193 |             tx1 = int((INPUT_W - tw) / 2)
194 |             tx2 = INPUT_W - tw - tx1
195 |             ty1 = ty2 = 0
196 |         # Resize the image with the long side while maintaining ratio
197 |         image = cv2.resize(image, (tw, th))
198 |         # Pad the short side with (128,128,128)
199 |         image = cv2.copyMakeBorder(
200 |             image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, (128, 128, 128)
201 |         )
202 |         image = image.astype(np.float32)
203 |         # Normalize to [0,1]
204 |         image /= 255.0
205 |         # HWC to CHW format:
206 |         image = np.transpose(image, [2, 0, 1])
207 |         # CHW to NCHW format
208 |         image = np.expand_dims(image, axis=0)
209 |         # Convert the image to row-major order, also known as "C order":
210 |         image = np.ascontiguousarray(image)
211 |         return image, image_raw, h, w
212 |
213 |     def xywh2xyxy(self, origin_h, origin_w, x):
214 |         """
215 |         description: Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right
216 |         param:
217 |             origin_h: height of original image
218 |             origin_w: width of original image
219 |             x: A boxes tensor, each row is a box [center_x, center_y, w, h]
220 |         return:
221 |             y: A boxes tensor, each row is a box [x1, y1, x2, y2]
222 |         """
223 |         y = torch.zeros_like(x) if isinstance(x, torch.Tensor) else np.zeros_like(x)
224 |         r_w = INPUT_W / origin_w
225 |         r_h = INPUT_H / origin_h
226 |         if r_h > r_w:
227 |             y[:, 0] = x[:, 0] - x[:, 2] / 2
228 |             y[:, 2] = x[:, 0] + x[:, 2] / 2
229 |             y[:, 1] = x[:, 1] - x[:, 3] / 2 - (INPUT_H - r_w * origin_h) / 2
230 |             y[:, 3] = x[:, 1] + x[:, 3] / 2 - (INPUT_H - r_w * origin_h) / 2
231 |             y /= r_w
232 |         else:
233 |             y[:, 0] = x[:, 0] - x[:, 2] / 2 - (INPUT_W - r_h * origin_w) / 2
234 |             y[:, 2] = x[:, 0] + x[:, 2] / 2 - (INPUT_W - r_h * origin_w) / 2
235 |             y[:, 1] = x[:, 1] - x[:, 3] / 2
236 |             y[:, 3] = x[:, 1] + x[:, 3] / 2
237 |             y /= r_h
238 |
239 |         return y
240 |
241 |     def post_process(self, output, origin_h, origin_w):
242 |         """
243 |         description: postprocess the prediction
244 |         param:
245 |             output: A tensor like [num_boxes, cx, cy, w, h, conf, cls_id, cx, cy, w, h, conf, cls_id, ...]
246 |             origin_h: height of original image
247 |             origin_w: width of original image
248 |         return:
249 |             result_boxes: final boxes, a boxes tensor, each row is a box [x1, y1, x2, y2]
250 |             result_scores: final scores, a tensor, each element is the score corresponding to a box
251 |             result_classid: final classid, a tensor, each element is the classid corresponding to a box
252 |         """
253 |         # Get the num of boxes detected
254 |         num = int(output[0])
255 |         # Reshape to a two-dimensional ndarray
256 |         pred = np.reshape(output[1:], (-1, 6))[:num, :]
257 |         # to a torch Tensor
258 |         pred = torch.Tensor(pred).cuda()
259 |         # Get the boxes
260 |         boxes = pred[:, :4]
261 |         # Get the scores
262 |         scores = pred[:, 4]
263 |         # Get the classid
264 |         classid = pred[:, 5]
265 |         # Choose those boxes that score > CONF_THRESH
266 |         si = scores > CONF_THRESH
267 |         boxes = boxes[si, :]
268 |         scores = scores[si]
269 |         classid = classid[si]
270 |         # Transform bbox from [center_x, center_y, w, h] to [x1, y1, x2, y2]
271 |         boxes = self.xywh2xyxy(origin_h, origin_w, boxes)
272 |         # Do nms
273 |         indices = torchvision.ops.nms(boxes, scores, iou_threshold=IOU_THRESHOLD).cpu()
274 |         result_boxes = boxes[indices, :].cpu()
275 |         result_scores = scores[indices].cpu()
276 |         result_classid = classid[indices].cpu()
277 |         return result_boxes, result_scores, result_classid
278 |
279 |
280 | class myThread(threading.Thread):
281 |     def __init__(self, func, args):
282 |         threading.Thread.__init__(self)
283 |         self.func = func
284 |         self.args = args
285 |
286 |     def run(self):
287 |         self.func(*self.args)
288 |
289 |
290 | if __name__ == "__main__":
291 |     # load custom plugins
292 |     PLUGIN_LIBRARY = "build/libmyplugins.so"
293 |     ctypes.CDLL(PLUGIN_LIBRARY)
294 |     engine_file_path = "build/yolov5s.engine"
295 |
296 |     # load coco labels
297 |
298 |     categories = ["person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
299 |                   "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
300 |                   "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
301 |                   "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
302 |                   "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
303 |                   "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
304 |                   "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
305 |                   "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
306 |                   "hair drier", "toothbrush"]
307 |
308 |     # a YoLov5TRT instance
309 |     yolov5_wrapper = YoLov5TRT(engine_file_path)
310 |
311 |     # from https://github.com/ultralytics/yolov5/tree/master/inference/images
312 |     input_image_paths = ["zidane.jpg", "bus.jpg"]
313 |
314 |     for input_image_path in input_image_paths:
315 |         # create a new thread to do inference
316 |         thread1 = myThread(yolov5_wrapper.infer, [input_image_path])
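        # infer() pushes self.cfx before touching CUDA and pops it afterwards, so it
        # is safe to call from a worker thread; start() followed immediately by
        # join() keeps the inferences sequential here.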
317 |         thread1.start()
318 |         thread1.join()
319 |
320 |     # destroy the instance
321 |     yolov5_wrapper.destroy()
322 |
--------------------------------------------------------------------------------
/yolov5s.engine:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenJetson/tensorrt-yolov5/7c79e973e40c3e41488021c3efcc2f4f3b283ed8/yolov5s.engine
--------------------------------------------------------------------------------