├── CMakeLists.txt ├── README.md ├── calibrator.cpp ├── calibrator.h ├── common.hpp ├── cuda_utils.h ├── gen_engine.sh ├── gen_wts.py ├── logging.h ├── macros.h ├── samples │   ├── bus.jpg │   └── zidane.jpg ├── utils.h ├── yololayer.cu ├── yololayer.h ├── yolov5.cpp ├── yolov5_trt.py └── yolov5s.wts /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.6) 2 | 3 | project(yolov5) 4 | 5 | add_definitions(-std=c++11) 6 | add_definitions(-DAPI_EXPORTS) 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 8 | set(CMAKE_CXX_STANDARD 11) 9 | set(CMAKE_BUILD_TYPE Debug) 10 | 11 | find_package(CUDA REQUIRED) 12 | 13 | if(WIN32) 14 | enable_language(CUDA) 15 | endif(WIN32) 16 | 17 | include_directories(${PROJECT_SOURCE_DIR}/include) 18 | # include and link dirs of cuda and tensorrt, you need to adapt them if yours are different 19 | # cuda 20 | include_directories(/usr/local/cuda/include) 21 | link_directories(/usr/local/cuda/lib64) 22 | # tensorrt 23 | include_directories(/usr/include/x86_64-linux-gnu/) 24 | link_directories(/usr/lib/x86_64-linux-gnu/) 25 | 26 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED") 27 | 28 | cuda_add_library(yoloplugin SHARED ${PROJECT_SOURCE_DIR}/yololayer.cu) 29 | target_link_libraries(yoloplugin nvinfer cudart) 30 | 31 | find_package(OpenCV) 32 | include_directories(${OpenCV_INCLUDE_DIRS}) 33 | 34 | add_executable(yolov5 ${PROJECT_SOURCE_DIR}/calibrator.cpp ${PROJECT_SOURCE_DIR}/yolov5.cpp) 35 | target_link_libraries(yolov5 nvinfer) 36 | target_link_libraries(yolov5 nvinfer_plugin) 37 | target_link_libraries(yolov5 cudart) 38 | target_link_libraries(yolov5 yoloplugin) 39 | target_link_libraries(yolov5 ${OpenCV_LIBS}) 40 | 41 | if(UNIX) 42 | add_definitions(-O2 -pthread) 43 | endif(UNIX) 44 | 45 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # yolov5-5.0 2 | The original code is from [tensorrtx](https://github.com/wang-xinyu/tensorrtx). I modified the yolo layer and integrated the batchedNMSPlugin. A `yolov5s.wts` is provided for a quick demo; see https://github.com/wang-xinyu/tensorrtx/tree/master/yolov5 for how to generate a `.wts` file. 3 | 4 | ## Run with docker 5 | 6 | 1. Start the docker container, assuming the repo path is `/data/YoLov5-TensorRT-NMS` 7 | ``` 8 | sudo docker run --gpus all -v /data/YoLov5-TensorRT-NMS:/work -it registry.cn-guangzhou.aliyuncs.com/nvidia-images/tensorrt20.03-py3:v3 /bin/bash 9 | cd /work 10 | mkdir build 11 | cd build 12 | cmake .. 13 | make 14 | sudo ./yolov5 -s // serialize the model to a plan file, i.e. 'yolov5s.engine' 15 | sudo ./yolov5 -d ../samples // deserialize the plan file and run inference; the images in samples will be processed 16 | ``` 17 | 18 | 19 | 20 | ## How to Run (yolov5s as an example) 21 | 22 | 1. Build and run 23 | ``` 24 | mkdir build 25 | cd build 26 | cmake .. 27 | make 28 | sudo ./yolov5 -s // serialize the model to a plan file, i.e. 'yolov5s.engine' 29 | sudo ./yolov5 -d ../samples // deserialize the plan file and run inference; the images in samples will be processed 30 | ``` 31 | 2. Check the generated images, `_zidane.jpg` and `_bus.jpg`. 32 | 33 |
35 | 36 | 37 | 38 | 39 | 40 |
41 | 42 | 3. Run the Python example. Install the TensorRT Python bindings and PyCUDA first, then run ``` 43 | python yolov5_trt.py 44 | ``` 45 | ## More Information 46 | 47 | See the readme on the [tensorrtx home page](https://github.com/wang-xinyu/tensorrtx). 48 | 49 | ## Known issues 50 | 51 | None! -------------------------------------------------------------------------------- /calibrator.cpp: -------------------------------------------------------------------------------- 1 | #include <iostream> 2 | #include <fstream> 3 | #include <iterator> 4 | #include <opencv2/dnn/dnn.hpp> 5 | #include "calibrator.h" 6 | #include "cuda_utils.h" 7 | #include "utils.h" 8 | 9 | Int8EntropyCalibrator2::Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, const char* input_blob_name, bool read_cache) 10 | : batchsize_(batchsize) 11 | , input_w_(input_w) 12 | , input_h_(input_h) 13 | , img_idx_(0) 14 | , img_dir_(img_dir) 15 | , calib_table_name_(calib_table_name) 16 | , input_blob_name_(input_blob_name) 17 | , read_cache_(read_cache) 18 | { 19 | input_count_ = 3 * input_w * input_h * batchsize; 20 | CUDA_CHECK(cudaMalloc(&device_input_, input_count_ * sizeof(float))); 21 | read_files_in_dir(img_dir, img_files_); 22 | } 23 | 24 | Int8EntropyCalibrator2::~Int8EntropyCalibrator2() 25 | { 26 | CUDA_CHECK(cudaFree(device_input_)); 27 | } 28 | 29 | int Int8EntropyCalibrator2::getBatchSize() const TRT_NOEXCEPT 30 | { 31 | return batchsize_; 32 | } 33 | 34 | bool Int8EntropyCalibrator2::getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT 35 | { 36 | if (img_idx_ + batchsize_ > (int)img_files_.size()) { 37 | return false; 38 | } 39 | 40 | std::vector<cv::Mat> input_imgs_; 41 | for (int i = img_idx_; i < img_idx_ + batchsize_; i++) { 42 | std::cout << img_files_[i] << " " << i << std::endl; 43 | cv::Mat temp = cv::imread(img_dir_ + img_files_[i]); 44 | if (temp.empty()){ 45 | std::cerr << "Fatal error: cannot open image!" << std::endl; 46 | return false; 47 | } 48 | cv::Mat pr_img = preprocess_img(temp, input_w_, input_h_); 49 | input_imgs_.push_back(pr_img); 50 | } 51 | img_idx_ += batchsize_; 52 | cv::Mat blob = cv::dnn::blobFromImages(input_imgs_, 1.0 / 255.0, cv::Size(input_w_, input_h_), cv::Scalar(0, 0, 0), true, false); 53 | 54 | CUDA_CHECK(cudaMemcpy(device_input_, blob.ptr<float>(0), input_count_ * sizeof(float), cudaMemcpyHostToDevice)); 55 | assert(!strcmp(names[0], input_blob_name_)); 56 | bindings[0] = device_input_; 57 | return true; 58 | } 59 | 60 | const void* Int8EntropyCalibrator2::readCalibrationCache(size_t& length) TRT_NOEXCEPT 61 | { 62 | std::cout << "reading calib cache: " << calib_table_name_ << std::endl; 63 | calib_cache_.clear(); 64 | std::ifstream input(calib_table_name_, std::ios::binary); 65 | input >> std::noskipws; 66 | if (read_cache_ && input.good()) 67 | { 68 | std::copy(std::istream_iterator<char>(input), std::istream_iterator<char>(), std::back_inserter(calib_cache_)); 69 | } 70 | length = calib_cache_.size(); 71 | return length ?
calib_cache_.data() : nullptr; 72 | } 73 | 74 | void Int8EntropyCalibrator2::writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT 75 | { 76 | std::cout << "writing calib cache: " << calib_table_name_ << " size: " << length << std::endl; 77 | std::ofstream output(calib_table_name_, std::ios::binary); 78 | output.write(reinterpret_cast(cache), length); 79 | } 80 | 81 | -------------------------------------------------------------------------------- /calibrator.h: -------------------------------------------------------------------------------- 1 | #ifndef ENTROPY_CALIBRATOR_H 2 | #define ENTROPY_CALIBRATOR_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "macros.h" 8 | 9 | //! \class Int8EntropyCalibrator2 10 | //! 11 | //! \brief Implements Entropy calibrator 2. 12 | //! CalibrationAlgoType is kENTROPY_CALIBRATION_2. 13 | //! 14 | class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 15 | { 16 | public: 17 | Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, const char* input_blob_name, bool read_cache = true); 18 | 19 | virtual ~Int8EntropyCalibrator2(); 20 | int getBatchSize() const TRT_NOEXCEPT override; 21 | bool getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT override; 22 | const void* readCalibrationCache(size_t& length) TRT_NOEXCEPT override; 23 | void writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT override; 24 | 25 | private: 26 | int batchsize_; 27 | int input_w_; 28 | int input_h_; 29 | int img_idx_; 30 | std::string img_dir_; 31 | std::vector img_files_; 32 | size_t input_count_; 33 | std::string calib_table_name_; 34 | const char* input_blob_name_; 35 | bool read_cache_; 36 | void* device_input_; 37 | std::vector calib_cache_; 38 | }; 39 | 40 | #endif // ENTROPY_CALIBRATOR_H 41 | -------------------------------------------------------------------------------- /common.hpp: -------------------------------------------------------------------------------- 1 | #ifndef YOLOV5_COMMON_H_ 2 | #define YOLOV5_COMMON_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "NvInfer.h" 10 | #include "NvInferPlugin.h" 11 | #include "yololayer.h" 12 | 13 | using namespace nvinfer1; 14 | 15 | cv::Rect get_rect(cv::Mat &img, float *bbox) 16 | { 17 | int l, r, t, b; 18 | float r_w = Yolo::INPUT_W / (img.cols * 1.0); 19 | float r_h = Yolo::INPUT_H / (img.rows * 1.0); 20 | if (r_h > r_w) 21 | { 22 | l = bbox[0]; 23 | r = bbox[2]; 24 | t = bbox[1] - (Yolo::INPUT_H - r_w * img.rows) / 2; 25 | b = bbox[3] - (Yolo::INPUT_H - r_w * img.rows) / 2; 26 | l = l / r_w; 27 | r = r / r_w; 28 | t = t / r_w; 29 | b = b / r_w; 30 | } 31 | else 32 | { 33 | l = bbox[0] - (Yolo::INPUT_W - r_h * img.cols) / 2; 34 | r = bbox[2] - (Yolo::INPUT_W - r_h * img.cols) / 2; 35 | t = bbox[1]; 36 | b = bbox[3]; 37 | l = l / r_h; 38 | r = r / r_h; 39 | t = t / r_h; 40 | b = b / r_h; 41 | } 42 | return cv::Rect(l, t, r - l, b - t); 43 | } 44 | 45 | // TensorRT weight files have a simple space delimited format: 46 | // [type] [size] 47 | std::map loadWeights(const std::string file) 48 | { 49 | std::cout << "Loading weights: " << file << std::endl; 50 | std::map weightMap; 51 | 52 | // Open weights file 53 | std::ifstream input(file); 54 | assert(input.is_open() && "Unable to load weight file. 
please check if the .wts file path is right!!!!!!"); 55 | 56 | // Read number of weight blobs 57 | int32_t count; 58 | input >> count; 59 | assert(count > 0 && "Invalid weight map file."); 60 | 61 | while (count--) 62 | { 63 | Weights wt{DataType::kFLOAT, nullptr, 0}; 64 | uint32_t size; 65 | 66 | // Read name and type of blob 67 | std::string name; 68 | input >> name >> std::dec >> size; 69 | wt.type = DataType::kFLOAT; 70 | 71 | // Load blob 72 | uint32_t *val = reinterpret_cast(malloc(sizeof(val) * size)); 73 | for (uint32_t x = 0, y = size; x < y; ++x) 74 | { 75 | input >> std::hex >> val[x]; 76 | } 77 | wt.values = val; 78 | 79 | wt.count = size; 80 | weightMap[name] = wt; 81 | } 82 | 83 | return weightMap; 84 | } 85 | 86 | IScaleLayer *addBatchNorm2d(INetworkDefinition *network, std::map &weightMap, ITensor &input, std::string lname, float eps) 87 | { 88 | float *gamma = (float *)weightMap[lname + ".weight"].values; 89 | float *beta = (float *)weightMap[lname + ".bias"].values; 90 | float *mean = (float *)weightMap[lname + ".running_mean"].values; 91 | float *var = (float *)weightMap[lname + ".running_var"].values; 92 | int len = weightMap[lname + ".running_var"].count; 93 | 94 | float *scval = reinterpret_cast(malloc(sizeof(float) * len)); 95 | for (int i = 0; i < len; i++) 96 | { 97 | scval[i] = gamma[i] / sqrt(var[i] + eps); 98 | } 99 | Weights scale{DataType::kFLOAT, scval, len}; 100 | 101 | float *shval = reinterpret_cast(malloc(sizeof(float) * len)); 102 | for (int i = 0; i < len; i++) 103 | { 104 | shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps); 105 | } 106 | Weights shift{DataType::kFLOAT, shval, len}; 107 | 108 | float *pval = reinterpret_cast(malloc(sizeof(float) * len)); 109 | for (int i = 0; i < len; i++) 110 | { 111 | pval[i] = 1.0; 112 | } 113 | Weights power{DataType::kFLOAT, pval, len}; 114 | 115 | weightMap[lname + ".scale"] = scale; 116 | weightMap[lname + ".shift"] = shift; 117 | weightMap[lname + ".power"] = power; 118 | IScaleLayer *scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power); 119 | assert(scale_1); 120 | return scale_1; 121 | } 122 | 123 | ILayer *convBlock(INetworkDefinition *network, std::map &weightMap, ITensor &input, int outch, int ksize, int s, int g, std::string lname) 124 | { 125 | Weights emptywts{DataType::kFLOAT, nullptr, 0}; 126 | int p = ksize / 2; 127 | IConvolutionLayer *conv1 = network->addConvolutionNd(input, outch, DimsHW{ksize, ksize}, weightMap[lname + ".conv.weight"], emptywts); 128 | assert(conv1); 129 | conv1->setStrideNd(DimsHW{s, s}); 130 | conv1->setPaddingNd(DimsHW{p, p}); 131 | conv1->setNbGroups(g); 132 | IScaleLayer *bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), lname + ".bn", 1e-3); 133 | 134 | // silu = x * sigmoid 135 | auto sig = network->addActivation(*bn1->getOutput(0), ActivationType::kSIGMOID); 136 | assert(sig); 137 | auto ew = network->addElementWise(*bn1->getOutput(0), *sig->getOutput(0), ElementWiseOperation::kPROD); 138 | assert(ew); 139 | return ew; 140 | } 141 | 142 | ILayer *focus(INetworkDefinition *network, std::map &weightMap, ITensor &input, int inch, int outch, int ksize, std::string lname) 143 | { 144 | ISliceLayer *s1 = network->addSlice(input, Dims3{0, 0, 0}, Dims3{inch, Yolo::INPUT_H / 2, Yolo::INPUT_W / 2}, Dims3{1, 2, 2}); 145 | ISliceLayer *s2 = network->addSlice(input, Dims3{0, 1, 0}, Dims3{inch, Yolo::INPUT_H / 2, Yolo::INPUT_W / 2}, Dims3{1, 2, 2}); 146 | ISliceLayer *s3 = network->addSlice(input, Dims3{0, 0, 1}, Dims3{inch, Yolo::INPUT_H 
/ 2, Yolo::INPUT_W / 2}, Dims3{1, 2, 2}); 147 | ISliceLayer *s4 = network->addSlice(input, Dims3{0, 1, 1}, Dims3{inch, Yolo::INPUT_H / 2, Yolo::INPUT_W / 2}, Dims3{1, 2, 2}); 148 | ITensor *inputTensors[] = {s1->getOutput(0), s2->getOutput(0), s3->getOutput(0), s4->getOutput(0)}; 149 | auto cat = network->addConcatenation(inputTensors, 4); 150 | auto conv = convBlock(network, weightMap, *cat->getOutput(0), outch, ksize, 1, 1, lname + ".conv"); 151 | return conv; 152 | } 153 | 154 | ILayer *bottleneck(INetworkDefinition *network, std::map &weightMap, ITensor &input, int c1, int c2, bool shortcut, int g, float e, std::string lname) 155 | { 156 | auto cv1 = convBlock(network, weightMap, input, (int)((float)c2 * e), 1, 1, 1, lname + ".cv1"); 157 | auto cv2 = convBlock(network, weightMap, *cv1->getOutput(0), c2, 3, 1, g, lname + ".cv2"); 158 | if (shortcut && c1 == c2) 159 | { 160 | auto ew = network->addElementWise(input, *cv2->getOutput(0), ElementWiseOperation::kSUM); 161 | return ew; 162 | } 163 | return cv2; 164 | } 165 | 166 | ILayer *bottleneckCSP(INetworkDefinition *network, std::map &weightMap, ITensor &input, int c1, int c2, int n, bool shortcut, int g, float e, std::string lname) 167 | { 168 | Weights emptywts{DataType::kFLOAT, nullptr, 0}; 169 | int c_ = (int)((float)c2 * e); 170 | auto cv1 = convBlock(network, weightMap, input, c_, 1, 1, 1, lname + ".cv1"); 171 | auto cv2 = network->addConvolutionNd(input, c_, DimsHW{1, 1}, weightMap[lname + ".cv2.weight"], emptywts); 172 | ITensor *y1 = cv1->getOutput(0); 173 | for (int i = 0; i < n; i++) 174 | { 175 | auto b = bottleneck(network, weightMap, *y1, c_, c_, shortcut, g, 1.0, lname + ".m." + std::to_string(i)); 176 | y1 = b->getOutput(0); 177 | } 178 | auto cv3 = network->addConvolutionNd(*y1, c_, DimsHW{1, 1}, weightMap[lname + ".cv3.weight"], emptywts); 179 | 180 | ITensor *inputTensors[] = {cv3->getOutput(0), cv2->getOutput(0)}; 181 | auto cat = network->addConcatenation(inputTensors, 2); 182 | 183 | IScaleLayer *bn = addBatchNorm2d(network, weightMap, *cat->getOutput(0), lname + ".bn", 1e-4); 184 | auto lr = network->addActivation(*bn->getOutput(0), ActivationType::kLEAKY_RELU); 185 | lr->setAlpha(0.1); 186 | 187 | auto cv4 = convBlock(network, weightMap, *lr->getOutput(0), c2, 1, 1, 1, lname + ".cv4"); 188 | return cv4; 189 | } 190 | 191 | ILayer *C3(INetworkDefinition *network, std::map &weightMap, ITensor &input, int c1, int c2, int n, bool shortcut, int g, float e, std::string lname) 192 | { 193 | int c_ = (int)((float)c2 * e); 194 | auto cv1 = convBlock(network, weightMap, input, c_, 1, 1, 1, lname + ".cv1"); 195 | auto cv2 = convBlock(network, weightMap, input, c_, 1, 1, 1, lname + ".cv2"); 196 | ITensor *y1 = cv1->getOutput(0); 197 | for (int i = 0; i < n; i++) 198 | { 199 | auto b = bottleneck(network, weightMap, *y1, c_, c_, shortcut, g, 1.0, lname + ".m." 
+ std::to_string(i)); 200 | y1 = b->getOutput(0); 201 | } 202 | 203 | ITensor *inputTensors[] = {y1, cv2->getOutput(0)}; 204 | auto cat = network->addConcatenation(inputTensors, 2); 205 | 206 | auto cv3 = convBlock(network, weightMap, *cat->getOutput(0), c2, 1, 1, 1, lname + ".cv3"); 207 | return cv3; 208 | } 209 | 210 | ILayer *SPP(INetworkDefinition *network, std::map &weightMap, ITensor &input, int c1, int c2, int k1, int k2, int k3, std::string lname) 211 | { 212 | int c_ = c1 / 2; 213 | auto cv1 = convBlock(network, weightMap, input, c_, 1, 1, 1, lname + ".cv1"); 214 | 215 | auto pool1 = network->addPoolingNd(*cv1->getOutput(0), PoolingType::kMAX, DimsHW{k1, k1}); 216 | pool1->setPaddingNd(DimsHW{k1 / 2, k1 / 2}); 217 | pool1->setStrideNd(DimsHW{1, 1}); 218 | auto pool2 = network->addPoolingNd(*cv1->getOutput(0), PoolingType::kMAX, DimsHW{k2, k2}); 219 | pool2->setPaddingNd(DimsHW{k2 / 2, k2 / 2}); 220 | pool2->setStrideNd(DimsHW{1, 1}); 221 | auto pool3 = network->addPoolingNd(*cv1->getOutput(0), PoolingType::kMAX, DimsHW{k3, k3}); 222 | pool3->setPaddingNd(DimsHW{k3 / 2, k3 / 2}); 223 | pool3->setStrideNd(DimsHW{1, 1}); 224 | 225 | ITensor *inputTensors[] = {cv1->getOutput(0), pool1->getOutput(0), pool2->getOutput(0), pool3->getOutput(0)}; 226 | auto cat = network->addConcatenation(inputTensors, 4); 227 | 228 | auto cv2 = convBlock(network, weightMap, *cat->getOutput(0), c2, 1, 1, 1, lname + ".cv2"); 229 | return cv2; 230 | } 231 | 232 | std::vector> getAnchors(std::map &weightMap, std::string lname) 233 | { 234 | std::vector> anchors; 235 | Weights wts = weightMap[lname + ".anchor_grid"]; 236 | int anchor_len = Yolo::CHECK_COUNT * 2; 237 | for (int i = 0; i < wts.count / anchor_len; i++) 238 | { 239 | auto *p = (const float *)wts.values + i * anchor_len; 240 | std::vector anchor(p, p + anchor_len); 241 | anchors.push_back(anchor); 242 | } 243 | return anchors; 244 | } 245 | 246 | IPluginV2Layer *addYoLoLayer(INetworkDefinition *network, std::map &weightMap, std::string lname, std::vector dets) 247 | { 248 | auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1"); 249 | auto anchors = getAnchors(weightMap, lname); 250 | PluginField plugin_fields[2]; 251 | int netinfo[4] = {Yolo::CLASS_NUM, Yolo::INPUT_W, Yolo::INPUT_H, Yolo::MAX_OUTPUT_BBOX_COUNT}; 252 | plugin_fields[0].data = netinfo; 253 | plugin_fields[0].length = 4; 254 | plugin_fields[0].name = "netinfo"; 255 | plugin_fields[0].type = PluginFieldType::kFLOAT32; 256 | int scale = 8; 257 | std::vector kernels; 258 | for (size_t i = 0; i < anchors.size(); i++) 259 | { 260 | Yolo::YoloKernel kernel; 261 | kernel.width = Yolo::INPUT_W / scale; 262 | kernel.height = Yolo::INPUT_H / scale; 263 | memcpy(kernel.anchors, &anchors[i][0], anchors[i].size() * sizeof(float)); 264 | kernels.push_back(kernel); 265 | scale *= 2; 266 | } 267 | plugin_fields[1].data = &kernels[0]; 268 | plugin_fields[1].length = kernels.size(); 269 | plugin_fields[1].name = "kernels"; 270 | plugin_fields[1].type = PluginFieldType::kFLOAT32; 271 | PluginFieldCollection plugin_data; 272 | plugin_data.nbFields = 2; 273 | plugin_data.fields = plugin_fields; 274 | IPluginV2 *plugin_obj = creator->createPlugin("yololayer", &plugin_data); 275 | std::vector input_tensors; 276 | for (auto det : dets) 277 | { 278 | input_tensors.push_back(det->getOutput(0)); 279 | } 280 | auto yolo = network->addPluginV2(&input_tensors[0], input_tensors.size(), *plugin_obj); 281 | return yolo; 282 | } 283 | 284 | IPluginV2Layer *addBatchedNMSLayer(INetworkDefinition 
*network, IPluginV2Layer *yolo, int num_classes, int top_k, int keep_top_k, float score_thresh, float iou_thresh, bool is_normalized = false, bool clip_boxes = false) 285 | { 286 | auto creator = getPluginRegistry()->getPluginCreator("BatchedNMS_TRT", "1"); 287 | // Set plugin fields and the field collection 288 | const bool share_location = true; 289 | const int background_id = -1; 290 | PluginField fields[9] = { 291 | PluginField{"shareLocation", &share_location, 292 | PluginFieldType::kINT32, 1}, 293 | PluginField{"backgroundLabelId", &background_id, 294 | PluginFieldType::kINT32, 1}, 295 | PluginField{"numClasses", &num_classes, 296 | PluginFieldType::kINT32, 1}, 297 | PluginField{"topK", &top_k, PluginFieldType::kINT32, 298 | 1}, 299 | PluginField{"keepTopK", &keep_top_k, 300 | PluginFieldType::kINT32, 1}, 301 | PluginField{"scoreThreshold", &score_thresh, 302 | PluginFieldType::kFLOAT32, 1}, 303 | PluginField{"iouThreshold", &iou_thresh, 304 | PluginFieldType::kFLOAT32, 1}, 305 | PluginField{"isNormalized", &is_normalized, 306 | PluginFieldType::kINT32, 1}, 307 | PluginField{"clipBoxes", &clip_boxes, 308 | PluginFieldType::kINT32, 1}, 309 | }; 310 | PluginFieldCollection pfc{9, fields}; 311 | IPluginV2 *pluginObj = creator->createPlugin("batchednms", &pfc); 312 | ITensor *inputTensors[] = {yolo->getOutput(0), yolo->getOutput(1)}; 313 | auto batchednmslayer = network->addPluginV2(inputTensors, 2, *pluginObj); 314 | batchednmslayer->setName("nms_layer"); 315 | assert(batchednmslayer); 316 | return batchednmslayer; 317 | } 318 | 319 | #endif 320 | -------------------------------------------------------------------------------- /cuda_utils.h: -------------------------------------------------------------------------------- 1 | #ifndef TRTX_CUDA_UTILS_H_ 2 | #define TRTX_CUDA_UTILS_H_ 3 | 4 | #include 5 | 6 | #ifndef CUDA_CHECK 7 | #define CUDA_CHECK(callstr)\ 8 | {\ 9 | cudaError_t error_code = callstr;\ 10 | if (error_code != cudaSuccess) {\ 11 | std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__;\ 12 | assert(0);\ 13 | }\ 14 | } 15 | #endif // CUDA_CHECK 16 | 17 | #endif // TRTX_CUDA_UTILS_H_ 18 | 19 | -------------------------------------------------------------------------------- /gen_engine.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CURRENT_PATH=$(pwd) 4 | 5 | MODEL_FILE=yolov5s.pt 6 | MODEL_NAME=yolov5s #s,m,l,x 7 | WTS_FILE=${MODEL_NAME}.wts 8 | ENGINE_FILE=${MODEL_NAME}.engine 9 | 10 | sudo docker run --gpus all \ 11 | -v ${CURRENT_PATH}:/work registry.cn-guangzhou.aliyuncs.com/nvidia-images/yolov5:4.0 \ 12 | python3 gen_wts.py --model=/work/${MODEL_FILE} --wts=/work/${WTS_FILE} 13 | 14 | 15 | # sudo docker run --gpus all -v /data/zww/tasks/suit-classification/v1/export:/work -it 657 bash 16 | 17 | 18 | # sudo docker run --gpus all \ 19 | # -v ${CURRENT_PATH}:/work \ 20 | # -w /work \ 21 | # -it registry.cn-guangzhou.aliyuncs.com/nvidia-images/tensorrt:21.06-py3-opencv \ 22 | # bash -c 'cd yolov5-4.0-nms-person && bash run.sh' -------------------------------------------------------------------------------- /gen_wts.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import argparse 3 | import os 4 | import struct 5 | import torch 6 | from utils.torch_utils import select_device 7 | 8 | 9 | def parse_args(): 10 | parser = argparse.ArgumentParser(description='Convert .pt file to .wts') 11 | parser.add_argument('-w', '--weights', 
required=True, help='Input weights (.pt) file path (required)') 12 | parser.add_argument('-o', '--output', help='Output (.wts) file path (optional)') 13 | args = parser.parse_args() 14 | if not os.path.isfile(args.weights): 15 | raise SystemExit('Invalid input file') 16 | if not args.output: 17 | args.output = os.path.splitext(args.weights)[0] + '.wts' 18 | elif os.path.isdir(args.output): 19 | args.output = os.path.join( 20 | args.output, 21 | os.path.splitext(os.path.basename(args.weights))[0] + '.wts') 22 | return args.weights, args.output 23 | 24 | 25 | pt_file, wts_file = parse_args() 26 | 27 | # Initialize 28 | device = select_device('cpu') 29 | # Load model 30 | model = torch.load(pt_file, map_location=device)['model'].float() # load to FP32 31 | model.to(device).eval() 32 | 33 | with open(wts_file, 'w') as f: 34 | f.write('{}\n'.format(len(model.state_dict().keys()))) 35 | for k, v in model.state_dict().items(): 36 | vr = v.reshape(-1).cpu().numpy() 37 | f.write('{} {} '.format(k, len(vr))) 38 | for vv in vr: 39 | f.write(' ') 40 | f.write(struct.pack('>f' ,float(vv)).hex()) 41 | f.write('\n') 42 | -------------------------------------------------------------------------------- /logging.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #ifndef TENSORRT_LOGGING_H 18 | #define TENSORRT_LOGGING_H 19 | 20 | #include "NvInferRuntimeCommon.h" 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include "macros.h" 29 | 30 | using Severity = nvinfer1::ILogger::Severity; 31 | 32 | class LogStreamConsumerBuffer : public std::stringbuf 33 | { 34 | public: 35 | LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog) 36 | : mOutput(stream) 37 | , mPrefix(prefix) 38 | , mShouldLog(shouldLog) 39 | { 40 | } 41 | 42 | LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) 43 | : mOutput(other.mOutput) 44 | { 45 | } 46 | 47 | ~LogStreamConsumerBuffer() 48 | { 49 | // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence 50 | // std::streambuf::pptr() gives a pointer to the current position of the output sequence 51 | // if the pointer to the beginning is not equal to the pointer to the current position, 52 | // call putOutput() to log the output to the stream 53 | if (pbase() != pptr()) 54 | { 55 | putOutput(); 56 | } 57 | } 58 | 59 | // synchronizes the stream buffer and returns 0 on success 60 | // synchronizing the stream buffer consists of inserting the buffer contents into the stream, 61 | // resetting the buffer and flushing the stream 62 | virtual int sync() 63 | { 64 | putOutput(); 65 | return 0; 66 | } 67 | 68 | void putOutput() 69 | { 70 | if (mShouldLog) 71 | { 72 | // prepend timestamp 73 | std::time_t timestamp = std::time(nullptr); 74 | tm* tm_local = std::localtime(×tamp); 75 | std::cout << "["; 76 | std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/"; 77 | std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/"; 78 | std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-"; 79 | std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":"; 80 | std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":"; 81 | std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] "; 82 | // std::stringbuf::str() gets the string contents of the buffer 83 | // insert the buffer contents pre-appended by the appropriate prefix into the stream 84 | mOutput << mPrefix << str(); 85 | // set the buffer to empty 86 | str(""); 87 | // flush the stream 88 | mOutput.flush(); 89 | } 90 | } 91 | 92 | void setShouldLog(bool shouldLog) 93 | { 94 | mShouldLog = shouldLog; 95 | } 96 | 97 | private: 98 | std::ostream& mOutput; 99 | std::string mPrefix; 100 | bool mShouldLog; 101 | }; 102 | 103 | //! 104 | //! \class LogStreamConsumerBase 105 | //! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer 106 | //! 107 | class LogStreamConsumerBase 108 | { 109 | public: 110 | LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog) 111 | : mBuffer(stream, prefix, shouldLog) 112 | { 113 | } 114 | 115 | protected: 116 | LogStreamConsumerBuffer mBuffer; 117 | }; 118 | 119 | //! 120 | //! \class LogStreamConsumer 121 | //! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages. 122 | //! Order of base classes is LogStreamConsumerBase and then std::ostream. 123 | //! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field 124 | //! in LogStreamConsumer and then the address of the buffer is passed to std::ostream. 125 | //! 
This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream. 126 | //! Please do not change the order of the parent classes. 127 | //! 128 | class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream 129 | { 130 | public: 131 | //! \brief Creates a LogStreamConsumer which logs messages with level severity. 132 | //! Reportable severity determines if the messages are severe enough to be logged. 133 | LogStreamConsumer(Severity reportableSeverity, Severity severity) 134 | : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity) 135 | , std::ostream(&mBuffer) // links the stream buffer with the stream 136 | , mShouldLog(severity <= reportableSeverity) 137 | , mSeverity(severity) 138 | { 139 | } 140 | 141 | LogStreamConsumer(LogStreamConsumer&& other) 142 | : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog) 143 | , std::ostream(&mBuffer) // links the stream buffer with the stream 144 | , mShouldLog(other.mShouldLog) 145 | , mSeverity(other.mSeverity) 146 | { 147 | } 148 | 149 | void setReportableSeverity(Severity reportableSeverity) 150 | { 151 | mShouldLog = mSeverity <= reportableSeverity; 152 | mBuffer.setShouldLog(mShouldLog); 153 | } 154 | 155 | private: 156 | static std::ostream& severityOstream(Severity severity) 157 | { 158 | return severity >= Severity::kINFO ? std::cout : std::cerr; 159 | } 160 | 161 | static std::string severityPrefix(Severity severity) 162 | { 163 | switch (severity) 164 | { 165 | case Severity::kINTERNAL_ERROR: return "[F] "; 166 | case Severity::kERROR: return "[E] "; 167 | case Severity::kWARNING: return "[W] "; 168 | case Severity::kINFO: return "[I] "; 169 | case Severity::kVERBOSE: return "[V] "; 170 | default: assert(0); return ""; 171 | } 172 | } 173 | 174 | bool mShouldLog; 175 | Severity mSeverity; 176 | }; 177 | 178 | //! \class Logger 179 | //! 180 | //! \brief Class which manages logging of TensorRT tools and samples 181 | //! 182 | //! \details This class provides a common interface for TensorRT tools and samples to log information to the console, 183 | //! and supports logging two types of messages: 184 | //! 185 | //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) 186 | //! - Test pass/fail messages 187 | //! 188 | //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is 189 | //! that the logic for controlling the verbosity and formatting of sample output is centralized in one location. 190 | //! 191 | //! In the future, this class could be extended to support dumping test results to a file in some standard format 192 | //! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run). 193 | //! 194 | //! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger 195 | //! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT 196 | //! library and messages coming from the sample. 197 | //! 198 | //! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the 199 | //! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger 200 | //! object. 
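//!
//! Example usage (a minimal sketch; `gLogger` matches the instance defined in yolov5.cpp, and
//! createInferBuilder() is the standard TensorRT entry point — adapt to your own build flow):
//!
//!     static Logger gLogger;
//!     nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(gLogger.getTRTLogger());
//!     // ... build or deserialize an engine; TensorRT messages at or above the reportable
//!     //     severity are printed by Logger::log() with a "[TRT]" prefix ...
//!     builder->destroy();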
201 | 202 | class Logger : public nvinfer1::ILogger 203 | { 204 | public: 205 | Logger(Severity severity = Severity::kWARNING) 206 | : mReportableSeverity(severity) 207 | { 208 | } 209 | 210 | //! 211 | //! \enum TestResult 212 | //! \brief Represents the state of a given test 213 | //! 214 | enum class TestResult 215 | { 216 | kRUNNING, //!< The test is running 217 | kPASSED, //!< The test passed 218 | kFAILED, //!< The test failed 219 | kWAIVED //!< The test was waived 220 | }; 221 | 222 | //! 223 | //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger 224 | //! \return The nvinfer1::ILogger associated with this Logger 225 | //! 226 | //! TODO Once all samples are updated to use this method to register the logger with TensorRT, 227 | //! we can eliminate the inheritance of Logger from ILogger 228 | //! 229 | nvinfer1::ILogger& getTRTLogger() 230 | { 231 | return *this; 232 | } 233 | 234 | //! 235 | //! \brief Implementation of the nvinfer1::ILogger::log() virtual method 236 | //! 237 | //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the 238 | //! inheritance from nvinfer1::ILogger 239 | //! 240 | void log(Severity severity, const char* msg) TRT_NOEXCEPT override 241 | { 242 | LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl; 243 | } 244 | 245 | //! 246 | //! \brief Method for controlling the verbosity of logging output 247 | //! 248 | //! \param severity The logger will only emit messages that have severity of this level or higher. 249 | //! 250 | void setReportableSeverity(Severity severity) 251 | { 252 | mReportableSeverity = severity; 253 | } 254 | 255 | //! 256 | //! \brief Opaque handle that holds logging information for a particular test 257 | //! 258 | //! This object is an opaque handle to information used by the Logger to print test results. 259 | //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used 260 | //! with Logger::reportTest{Start,End}(). 261 | //! 262 | class TestAtom 263 | { 264 | public: 265 | TestAtom(TestAtom&&) = default; 266 | 267 | private: 268 | friend class Logger; 269 | 270 | TestAtom(bool started, const std::string& name, const std::string& cmdline) 271 | : mStarted(started) 272 | , mName(name) 273 | , mCmdline(cmdline) 274 | { 275 | } 276 | 277 | bool mStarted; 278 | std::string mName; 279 | std::string mCmdline; 280 | }; 281 | 282 | //! 283 | //! \brief Define a test for logging 284 | //! 285 | //! \param[in] name The name of the test. This should be a string starting with 286 | //! "TensorRT" and containing dot-separated strings containing 287 | //! the characters [A-Za-z0-9_]. 288 | //! For example, "TensorRT.sample_googlenet" 289 | //! \param[in] cmdline The command line used to reproduce the test 290 | // 291 | //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). 292 | //! 293 | static TestAtom defineTest(const std::string& name, const std::string& cmdline) 294 | { 295 | return TestAtom(false, name, cmdline); 296 | } 297 | 298 | //! 299 | //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments 300 | //! as input 301 | //! 302 | //! \param[in] name The name of the test 303 | //! \param[in] argc The number of command-line arguments 304 | //! \param[in] argv The array of command-line arguments (given as C strings) 305 | //! 306 | //! 
\return a TestAtom that can be used in Logger::reportTest{Start,End}(). 307 | static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) 308 | { 309 | auto cmdline = genCmdlineString(argc, argv); 310 | return defineTest(name, cmdline); 311 | } 312 | 313 | //! 314 | //! \brief Report that a test has started. 315 | //! 316 | //! \pre reportTestStart() has not been called yet for the given testAtom 317 | //! 318 | //! \param[in] testAtom The handle to the test that has started 319 | //! 320 | static void reportTestStart(TestAtom& testAtom) 321 | { 322 | reportTestResult(testAtom, TestResult::kRUNNING); 323 | assert(!testAtom.mStarted); 324 | testAtom.mStarted = true; 325 | } 326 | 327 | //! 328 | //! \brief Report that a test has ended. 329 | //! 330 | //! \pre reportTestStart() has been called for the given testAtom 331 | //! 332 | //! \param[in] testAtom The handle to the test that has ended 333 | //! \param[in] result The result of the test. Should be one of TestResult::kPASSED, 334 | //! TestResult::kFAILED, TestResult::kWAIVED 335 | //! 336 | static void reportTestEnd(const TestAtom& testAtom, TestResult result) 337 | { 338 | assert(result != TestResult::kRUNNING); 339 | assert(testAtom.mStarted); 340 | reportTestResult(testAtom, result); 341 | } 342 | 343 | static int reportPass(const TestAtom& testAtom) 344 | { 345 | reportTestEnd(testAtom, TestResult::kPASSED); 346 | return EXIT_SUCCESS; 347 | } 348 | 349 | static int reportFail(const TestAtom& testAtom) 350 | { 351 | reportTestEnd(testAtom, TestResult::kFAILED); 352 | return EXIT_FAILURE; 353 | } 354 | 355 | static int reportWaive(const TestAtom& testAtom) 356 | { 357 | reportTestEnd(testAtom, TestResult::kWAIVED); 358 | return EXIT_SUCCESS; 359 | } 360 | 361 | static int reportTest(const TestAtom& testAtom, bool pass) 362 | { 363 | return pass ? reportPass(testAtom) : reportFail(testAtom); 364 | } 365 | 366 | Severity getReportableSeverity() const 367 | { 368 | return mReportableSeverity; 369 | } 370 | 371 | private: 372 | //! 373 | //! \brief returns an appropriate string for prefixing a log message with the given severity 374 | //! 375 | static const char* severityPrefix(Severity severity) 376 | { 377 | switch (severity) 378 | { 379 | case Severity::kINTERNAL_ERROR: return "[F] "; 380 | case Severity::kERROR: return "[E] "; 381 | case Severity::kWARNING: return "[W] "; 382 | case Severity::kINFO: return "[I] "; 383 | case Severity::kVERBOSE: return "[V] "; 384 | default: assert(0); return ""; 385 | } 386 | } 387 | 388 | //! 389 | //! \brief returns an appropriate string for prefixing a test result message with the given result 390 | //! 391 | static const char* testResultString(TestResult result) 392 | { 393 | switch (result) 394 | { 395 | case TestResult::kRUNNING: return "RUNNING"; 396 | case TestResult::kPASSED: return "PASSED"; 397 | case TestResult::kFAILED: return "FAILED"; 398 | case TestResult::kWAIVED: return "WAIVED"; 399 | default: assert(0); return ""; 400 | } 401 | } 402 | 403 | //! 404 | //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity 405 | //! 406 | static std::ostream& severityOstream(Severity severity) 407 | { 408 | return severity >= Severity::kINFO ? std::cout : std::cerr; 409 | } 410 | 411 | //! 412 | //! \brief method that implements logging test results 413 | //! 
414 | static void reportTestResult(const TestAtom& testAtom, TestResult result) 415 | { 416 | severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " 417 | << testAtom.mCmdline << std::endl; 418 | } 419 | 420 | //! 421 | //! \brief generate a command line string from the given (argc, argv) values 422 | //! 423 | static std::string genCmdlineString(int argc, char const* const* argv) 424 | { 425 | std::stringstream ss; 426 | for (int i = 0; i < argc; i++) 427 | { 428 | if (i > 0) 429 | ss << " "; 430 | ss << argv[i]; 431 | } 432 | return ss.str(); 433 | } 434 | 435 | Severity mReportableSeverity; 436 | }; 437 | 438 | namespace 439 | { 440 | 441 | //! 442 | //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE 443 | //! 444 | //! Example usage: 445 | //! 446 | //! LOG_VERBOSE(logger) << "hello world" << std::endl; 447 | //! 448 | inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) 449 | { 450 | return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE); 451 | } 452 | 453 | //! 454 | //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO 455 | //! 456 | //! Example usage: 457 | //! 458 | //! LOG_INFO(logger) << "hello world" << std::endl; 459 | //! 460 | inline LogStreamConsumer LOG_INFO(const Logger& logger) 461 | { 462 | return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO); 463 | } 464 | 465 | //! 466 | //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING 467 | //! 468 | //! Example usage: 469 | //! 470 | //! LOG_WARN(logger) << "hello world" << std::endl; 471 | //! 472 | inline LogStreamConsumer LOG_WARN(const Logger& logger) 473 | { 474 | return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING); 475 | } 476 | 477 | //! 478 | //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR 479 | //! 480 | //! Example usage: 481 | //! 482 | //! LOG_ERROR(logger) << "hello world" << std::endl; 483 | //! 484 | inline LogStreamConsumer LOG_ERROR(const Logger& logger) 485 | { 486 | return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR); 487 | } 488 | 489 | //! 490 | //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR 491 | // ("fatal" severity) 492 | //! 493 | //! Example usage: 494 | //! 495 | //! LOG_FATAL(logger) << "hello world" << std::endl; 496 | //! 
497 | inline LogStreamConsumer LOG_FATAL(const Logger& logger) 498 | { 499 | return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR); 500 | } 501 | 502 | } // anonymous namespace 503 | 504 | #endif // TENSORRT_LOGGING_H 505 | -------------------------------------------------------------------------------- /macros.h: -------------------------------------------------------------------------------- 1 | #ifndef __MACROS_H 2 | #define __MACROS_H 3 | 4 | #ifdef API_EXPORTS 5 | #if defined(_MSC_VER) 6 | #define API __declspec(dllexport) 7 | #else 8 | #define API __attribute__((visibility("default"))) 9 | #endif 10 | #else 11 | 12 | #if defined(_MSC_VER) 13 | #define API __declspec(dllimport) 14 | #else 15 | #define API 16 | #endif 17 | #endif // API_EXPORTS 18 | 19 | #if NV_TENSORRT_MAJOR >= 8 20 | #define TRT_NOEXCEPT noexcept 21 | #define TRT_CONST_ENQUEUE const 22 | #else 23 | #define TRT_NOEXCEPT 24 | #define TRT_CONST_ENQUEUE 25 | #endif 26 | 27 | #endif // __MACROS_H 28 | -------------------------------------------------------------------------------- /samples/bus.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/upczww/YoLov5-TensorRT-NMS/62833cc748c80dad24fe4ed8e003416e4c47f430/samples/bus.jpg -------------------------------------------------------------------------------- /samples/zidane.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/upczww/YoLov5-TensorRT-NMS/62833cc748c80dad24fe4ed8e003416e4c47f430/samples/zidane.jpg -------------------------------------------------------------------------------- /utils.h: -------------------------------------------------------------------------------- 1 | #ifndef TRTX_YOLOV5_UTILS_H_ 2 | #define TRTX_YOLOV5_UTILS_H_ 3 | 4 | #include 5 | #include 6 | 7 | static inline cv::Mat preprocess_img(cv::Mat &img, int input_w, int input_h) 8 | { 9 | int w, h, x, y; 10 | float r_w = input_w / (img.cols * 1.0); 11 | float r_h = input_h / (img.rows * 1.0); 12 | if (r_h > r_w) 13 | { 14 | w = input_w; 15 | h = r_w * img.rows; 16 | x = 0; 17 | y = (input_h - h) / 2; 18 | } 19 | else 20 | { 21 | w = r_h * img.cols; 22 | h = input_h; 23 | x = (input_w - w) / 2; 24 | y = 0; 25 | } 26 | cv::Mat re(h, w, CV_8UC3); 27 | cv::resize(img, re, re.size(), 0, 0, cv::INTER_LINEAR); 28 | cv::Mat out(input_h, input_w, CV_8UC3, cv::Scalar(128, 128, 128)); 29 | re.copyTo(out(cv::Rect(x, y, re.cols, re.rows))); 30 | return out; 31 | } 32 | 33 | static inline int read_files_in_dir(const char *p_dir_name, std::vector &file_names) 34 | { 35 | DIR *p_dir = opendir(p_dir_name); 36 | if (p_dir == nullptr) 37 | { 38 | return -1; 39 | } 40 | 41 | struct dirent *p_file = nullptr; 42 | while ((p_file = readdir(p_dir)) != nullptr) 43 | { 44 | if (strcmp(p_file->d_name, ".") != 0 && 45 | strcmp(p_file->d_name, "..") != 0) 46 | { 47 | //std::string cur_file_name(p_dir_name); 48 | //cur_file_name += "/"; 49 | //cur_file_name += p_file->d_name; 50 | std::string cur_file_name(p_file->d_name); 51 | file_names.push_back(cur_file_name); 52 | } 53 | } 54 | 55 | closedir(p_dir); 56 | return 0; 57 | } 58 | 59 | #endif // TRTX_YOLOV5_UTILS_H_ 60 | -------------------------------------------------------------------------------- /yololayer.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "yololayer.h" 5 | #include "cuda_utils.h" 6 | 7 | namespace Tn 8 | { 
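// Serialization helpers: the write()/read() templates below copy plain-old-data fields to and
// from the plugin's serialization byte stream and advance the buffer pointer by sizeof(T).
// They are used by YoloLayerPlugin::serialize() and the deserializing constructor.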
9 | template 10 | void write(char*& buffer, const T& val) 11 | { 12 | *reinterpret_cast(buffer) = val; 13 | buffer += sizeof(T); 14 | } 15 | 16 | template 17 | void read(const char*& buffer, T& val) 18 | { 19 | val = *reinterpret_cast(buffer); 20 | buffer += sizeof(T); 21 | } 22 | } 23 | 24 | using namespace Yolo; 25 | 26 | namespace nvinfer1 27 | { 28 | YoloLayerPlugin::YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut, const std::vector &vYoloKernel) 29 | { 30 | mClassCount = classCount; 31 | mYoloV5NetWidth = netWidth; 32 | mYoloV5NetHeight = netHeight; 33 | mMaxOutObject = maxOut; 34 | mYoloKernel = vYoloKernel; 35 | mKernelCount = vYoloKernel.size(); 36 | 37 | CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*))); 38 | size_t AnchorLen = sizeof(float)* CHECK_COUNT * 2; 39 | for (int ii = 0; ii < mKernelCount; ii++) 40 | { 41 | CUDA_CHECK(cudaMalloc(&mAnchor[ii], AnchorLen)); 42 | const auto& yolo = mYoloKernel[ii]; 43 | CUDA_CHECK(cudaMemcpy(mAnchor[ii], yolo.anchors, AnchorLen, cudaMemcpyHostToDevice)); 44 | } 45 | } 46 | YoloLayerPlugin::~YoloLayerPlugin() 47 | { 48 | for (int ii = 0; ii < mKernelCount; ii++) 49 | { 50 | CUDA_CHECK(cudaFree(mAnchor[ii])); 51 | } 52 | CUDA_CHECK(cudaFreeHost(mAnchor)); 53 | } 54 | 55 | // create the plugin at runtime from a byte stream 56 | YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) 57 | { 58 | using namespace Tn; 59 | const char *d = reinterpret_cast(data), *a = d; 60 | read(d, mClassCount); 61 | read(d, mThreadCount); 62 | read(d, mKernelCount); 63 | read(d, mYoloV5NetWidth); 64 | read(d, mYoloV5NetHeight); 65 | read(d, mMaxOutObject); 66 | mYoloKernel.resize(mKernelCount); 67 | auto kernelSize = mKernelCount * sizeof(YoloKernel); 68 | memcpy(mYoloKernel.data(), d, kernelSize); 69 | d += kernelSize; 70 | CUDA_CHECK(cudaMallocHost(&mAnchor, mKernelCount * sizeof(void*))); 71 | size_t AnchorLen = sizeof(float)* CHECK_COUNT * 2; 72 | for (int ii = 0; ii < mKernelCount; ii++) 73 | { 74 | CUDA_CHECK(cudaMalloc(&mAnchor[ii], AnchorLen)); 75 | const auto& yolo = mYoloKernel[ii]; 76 | CUDA_CHECK(cudaMemcpy(mAnchor[ii], yolo.anchors, AnchorLen, cudaMemcpyHostToDevice)); 77 | } 78 | assert(d == a + length); 79 | } 80 | 81 | void YoloLayerPlugin::serialize(void* buffer) const TRT_NOEXCEPT 82 | { 83 | using namespace Tn; 84 | char* d = static_cast(buffer), *a = d; 85 | write(d, mClassCount); 86 | write(d, mThreadCount); 87 | write(d, mKernelCount); 88 | write(d, mYoloV5NetWidth); 89 | write(d, mYoloV5NetHeight); 90 | write(d, mMaxOutObject); 91 | auto kernelSize = mKernelCount * sizeof(YoloKernel); 92 | memcpy(d, mYoloKernel.data(), kernelSize); 93 | d += kernelSize; 94 | 95 | assert(d == a + getSerializationSize()); 96 | } 97 | 98 | size_t YoloLayerPlugin::getSerializationSize() const TRT_NOEXCEPT 99 | { 100 | return sizeof(mClassCount) + sizeof(mThreadCount) + sizeof(mKernelCount) + sizeof(Yolo::YoloKernel) * mYoloKernel.size() + sizeof(mYoloV5NetWidth) + sizeof(mYoloV5NetHeight) + sizeof(mMaxOutObject); 101 | } 102 | 103 | int YoloLayerPlugin::initialize() TRT_NOEXCEPT 104 | { 105 | return 0; 106 | } 107 | 108 | Dims YoloLayerPlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims) TRT_NOEXCEPT 109 | { assert(index < 2); 110 | //output the result to channel 111 | if (index == 0) 112 | { 113 | return Dims3(mMaxOutObject, 1, 4); 114 | } 115 | return DimsHW(mMaxOutObject, mClassCount); 116 | } 117 | 118 | // Set plugin namespace 119 | void YoloLayerPlugin::setPluginNamespace(const 
char* pluginNamespace) TRT_NOEXCEPT 120 | { 121 | mPluginNamespace = pluginNamespace; 122 | } 123 | 124 | const char* YoloLayerPlugin::getPluginNamespace() const TRT_NOEXCEPT 125 | { 126 | return mPluginNamespace; 127 | } 128 | 129 | // Return the DataType of the plugin output at the requested index 130 | DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT 131 | { 132 | return DataType::kFLOAT; 133 | } 134 | 135 | // Return true if output tensor is broadcast across a batch. 136 | bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT 137 | { 138 | return false; 139 | } 140 | 141 | // Return true if plugin can use input that is broadcast across batch without replication. 142 | bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT 143 | { 144 | return false; 145 | } 146 | 147 | void YoloLayerPlugin::configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) TRT_NOEXCEPT 148 | { 149 | } 150 | 151 | // Attach the plugin object to an execution context and grant the plugin the access to some context resource. 152 | void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT 153 | { 154 | } 155 | 156 | // Detach the plugin object from its execution context. 157 | void YoloLayerPlugin::detachFromContext() TRT_NOEXCEPT {} 158 | 159 | const char* YoloLayerPlugin::getPluginType() const TRT_NOEXCEPT 160 | { 161 | return "YoloLayer_TRT"; 162 | } 163 | 164 | const char* YoloLayerPlugin::getPluginVersion() const TRT_NOEXCEPT 165 | { 166 | return "1"; 167 | } 168 | 169 | void YoloLayerPlugin::destroy() TRT_NOEXCEPT 170 | { 171 | delete this; 172 | } 173 | 174 | // Clone the plugin 175 | IPluginV2IOExt* YoloLayerPlugin::clone() const TRT_NOEXCEPT 176 | { 177 | YoloLayerPlugin* p = new YoloLayerPlugin(mClassCount, mYoloV5NetWidth, mYoloV5NetHeight, mMaxOutObject, mYoloKernel); 178 | p->setPluginNamespace(mPluginNamespace); 179 | return p; 180 | } 181 | 182 | __device__ float Logist(float data) { return 1.0f / (1.0f + expf(-data)); }; 183 | 184 | __global__ void CalDetection(const float *input, float *bboxData, float *scoreData, int *countData, int noElements, 185 | const int netwidth, const int netheight, int maxoutobject, int yoloWidth, int yoloHeight, const float anchors[CHECK_COUNT * 2], int classes) 186 | { 187 | 188 | int idx = threadIdx.x + blockDim.x * blockIdx.x; 189 | if (idx >= noElements) return; 190 | 191 | int total_grid = yoloWidth * yoloHeight; 192 | int bnIdx = idx / total_grid; 193 | idx = idx - total_grid * bnIdx; 194 | int info_len_i = 5 + classes; // 85 195 | const float* curInput = input + bnIdx * (info_len_i * total_grid * CHECK_COUNT); // b*h*w*3*85 196 | 197 | for (int k = 0; k < CHECK_COUNT; ++k) { 198 | float box_prob = Logist(curInput[idx + k * info_len_i * total_grid + 4 * total_grid]); 199 | if (box_prob < IGNORE_THRESH) continue; 200 | int *res_count = countData + bnIdx; 201 | int count = (int)atomicAdd(res_count, 1); 202 | if (count >= maxoutobject) return; 203 | 204 | float *curBbox = bboxData + bnIdx * maxoutobject * 4 + count * 4; 205 | float *curScore = scoreData + bnIdx * maxoutobject * classes + count * classes; 206 | 207 | for (int i = 5; i < info_len_i; ++i) 208 | { 209 | float p = Logist(curInput[idx + k * info_len_i * total_grid + i * total_grid]); 210 | curScore[i - 5] 
= p * box_prob; 211 | } 212 | int row = idx / yoloWidth; 213 | int col = idx % yoloWidth; 214 | 215 | //Location 216 | // pytorch: 217 | // y = x[i].sigmoid() 218 | // y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + self.grid[i].to(x[i].device)) * self.stride[i] # xy 219 | // y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i] # wh 220 | // X: (sigmoid(tx) + cx)/FeaturemapW * netwidth 221 | float cx = (col - 0.5f + 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 0 * total_grid])) * netwidth / yoloWidth; 222 | float cy = (row - 0.5f + 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 1 * total_grid])) * netheight / yoloHeight; 223 | 224 | // W: (Pw * e^tw) / FeaturemapW * netwidth 225 | // v5: https://github.com/ultralytics/yolov5/issues/471 226 | float w = 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 2 * total_grid]); 227 | w = w * w * anchors[2 * k]; 228 | float h = 2.0f * Logist(curInput[idx + k * info_len_i * total_grid + 3 * total_grid]); 229 | h = h * h * anchors[2 * k + 1]; 230 | // cx,cy,w,h to x1,y1,x2,y2 231 | curBbox[0] = cx - 0.5 * w; 232 | curBbox[1] = cy - 0.5 * h; 233 | curBbox[2] = cx + 0.5 * w; 234 | curBbox[3] = cy + 0.5 * h; 235 | } 236 | } 237 | 238 | void YoloLayerPlugin::forwardGpu(const float* const* inputs, void** outputs, void* workspace, cudaStream_t stream, int batchSize) 239 | { 240 | float *bboxData = (float *)outputs[0]; 241 | float *scoreData = (float *)outputs[1]; 242 | int *countData = (int *)workspace; 243 | 244 | 245 | CUDA_CHECK(cudaMemset(countData, 0, sizeof(int) * batchSize)); 246 | CUDA_CHECK(cudaMemset(bboxData, 0, sizeof(float) * mMaxOutObject * 4 * batchSize)); 247 | CUDA_CHECK(cudaMemset(scoreData, 0, sizeof(float) * mMaxOutObject * mClassCount * batchSize)); 248 | 249 | int numElem = 0; 250 | for (unsigned int i = 0; i < mYoloKernel.size(); ++i){ 251 | const auto& yolo = mYoloKernel[i]; 252 | numElem = yolo.width * yolo.height * batchSize; 253 | if (numElem < mThreadCount) mThreadCount = numElem; 254 | CalDetection<<< (numElem + mThreadCount - 1) / mThreadCount, mThreadCount>>> 255 | (inputs[i], bboxData, scoreData, countData, numElem, mYoloV5NetWidth, mYoloV5NetHeight, mMaxOutObject, yolo.width, yolo.height, (float*)mAnchor[i], mClassCount); 256 | } 257 | } 258 | 259 | 260 | int YoloLayerPlugin::enqueue(int batchSize, const void *const *inputs, void ** outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT 261 | { 262 | forwardGpu((const float* const*)inputs, outputs, workspace, stream, batchSize); 263 | return 0; 264 | } 265 | 266 | PluginFieldCollection YoloPluginCreator::mFC{}; 267 | std::vector YoloPluginCreator::mPluginAttributes; 268 | 269 | YoloPluginCreator::YoloPluginCreator() 270 | { 271 | mPluginAttributes.clear(); 272 | 273 | mFC.nbFields = mPluginAttributes.size(); 274 | mFC.fields = mPluginAttributes.data(); 275 | } 276 | 277 | const char* YoloPluginCreator::getPluginName() const TRT_NOEXCEPT 278 | { 279 | return "YoloLayer_TRT"; 280 | } 281 | 282 | const char* YoloPluginCreator::getPluginVersion() const TRT_NOEXCEPT 283 | { 284 | return "1"; 285 | } 286 | 287 | const PluginFieldCollection* YoloPluginCreator::getFieldNames() TRT_NOEXCEPT 288 | { 289 | return &mFC; 290 | } 291 | 292 | IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT 293 | { 294 | assert(fc->nbFields == 2); 295 | assert(strcmp(fc->fields[0].name, "netinfo") == 0); 296 | assert(strcmp(fc->fields[1].name, "kernels") == 0); 297 | int *p_netinfo = 
(int*)(fc->fields[0].data); 298 | int class_count = p_netinfo[0]; 299 | int input_w = p_netinfo[1]; 300 | int input_h = p_netinfo[2]; 301 | int max_output_object_count = p_netinfo[3]; 302 | std::vector kernels(fc->fields[1].length); 303 | memcpy(&kernels[0], fc->fields[1].data, kernels.size() * sizeof(Yolo::YoloKernel)); 304 | YoloLayerPlugin* obj = new YoloLayerPlugin(class_count, input_w, input_h, max_output_object_count, kernels); 305 | obj->setPluginNamespace(mNamespace.c_str()); 306 | return obj; 307 | } 308 | 309 | IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT 310 | { 311 | // This object will be deleted when the network is destroyed, which will 312 | // call YoloLayerPlugin::destroy() 313 | YoloLayerPlugin* obj = new YoloLayerPlugin(serialData, serialLength); 314 | obj->setPluginNamespace(mNamespace.c_str()); 315 | return obj; 316 | } 317 | } 318 | 319 | -------------------------------------------------------------------------------- /yololayer.h: -------------------------------------------------------------------------------- 1 | #ifndef _YOLO_LAYER_H 2 | #define _YOLO_LAYER_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "macros.h" 8 | 9 | namespace Yolo 10 | { 11 | static constexpr int CHECK_COUNT = 3; 12 | static constexpr float IGNORE_THRESH = 0.1f; 13 | struct YoloKernel 14 | { 15 | int width; 16 | int height; 17 | float anchors[CHECK_COUNT * 2]; 18 | }; 19 | static constexpr int MAX_OUTPUT_BBOX_COUNT = 2048; 20 | static constexpr int CLASS_NUM = 80; 21 | static constexpr int INPUT_H = 640; // yolov5's input height and width must be divisible by 32. 22 | static constexpr int INPUT_W = 640; 23 | 24 | } 25 | 26 | namespace nvinfer1 27 | { 28 | class YoloLayerPlugin : public IPluginV2IOExt 29 | { 30 | public: 31 | YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut, const std::vector &vYoloKernel); 32 | YoloLayerPlugin(const void *data, size_t length); 33 | ~YoloLayerPlugin(); 34 | 35 | int getNbOutputs() const TRT_NOEXCEPT override 36 | { 37 | return 2; 38 | } 39 | 40 | Dims getOutputDimensions(int index, const Dims *inputs, int nbInputDims) TRT_NOEXCEPT override; 41 | 42 | int initialize() TRT_NOEXCEPT override; 43 | 44 | virtual void terminate() TRT_NOEXCEPT override{}; 45 | 46 | virtual size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override 47 | { 48 | return maxBatchSize * sizeof(int); 49 | } 50 | 51 | virtual int enqueue(int batchSize, const void *const *inputs, void *TRT_CONST_ENQUEUE *outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT override; 52 | 53 | virtual size_t getSerializationSize() const TRT_NOEXCEPT override; 54 | 55 | virtual void serialize(void *buffer) const TRT_NOEXCEPT override; 56 | 57 | bool supportsFormatCombination(int pos, const PluginTensorDesc *inOut, int nbInputs, int nbOutputs) const TRT_NOEXCEPT override 58 | { 59 | return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT; 60 | } 61 | 62 | const char *getPluginType() const TRT_NOEXCEPT override; 63 | 64 | const char *getPluginVersion() const TRT_NOEXCEPT override; 65 | 66 | void destroy() TRT_NOEXCEPT override; 67 | 68 | IPluginV2IOExt *clone() const TRT_NOEXCEPT override; 69 | 70 | void setPluginNamespace(const char *pluginNamespace) TRT_NOEXCEPT override; 71 | 72 | const char *getPluginNamespace() const TRT_NOEXCEPT override; 73 | 74 | DataType getOutputDataType(int index, const nvinfer1::DataType *inputTypes, int nbInputs) 
const TRT_NOEXCEPT override; 75 | 76 | bool isOutputBroadcastAcrossBatch(int outputIndex, const bool *inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT override; 77 | 78 | bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override; 79 | 80 | void attachToContext( 81 | cudnnContext *cudnnContext, cublasContext *cublasContext, IGpuAllocator *gpuAllocator) TRT_NOEXCEPT override; 82 | 83 | void configurePlugin(const PluginTensorDesc *in, int nbInput, const PluginTensorDesc *out, int nbOutput) TRT_NOEXCEPT override; 84 | 85 | void detachFromContext() TRT_NOEXCEPT override; 86 | 87 | private: 88 | void forwardGpu(const float *const *inputs, void **outputs, void *workspace, cudaStream_t stream, int batchSize = 1); 89 | int mThreadCount = 256; 90 | const char *mPluginNamespace; 91 | int mKernelCount; 92 | int mClassCount; 93 | int mYoloV5NetWidth; 94 | int mYoloV5NetHeight; 95 | int mMaxOutObject; 96 | std::vector mYoloKernel; 97 | void **mAnchor; 98 | }; 99 | 100 | class API YoloPluginCreator : public IPluginCreator 101 | { 102 | public: 103 | YoloPluginCreator(); 104 | 105 | ~YoloPluginCreator() override = default; 106 | 107 | const char *getPluginName() const TRT_NOEXCEPT override; 108 | 109 | const char *getPluginVersion() const TRT_NOEXCEPT override; 110 | 111 | const PluginFieldCollection *getFieldNames() TRT_NOEXCEPT override; 112 | 113 | IPluginV2IOExt *createPlugin(const char *name, const PluginFieldCollection *fc) TRT_NOEXCEPT override; 114 | 115 | IPluginV2IOExt *deserializePlugin(const char *name, const void *serialData, size_t serialLength) TRT_NOEXCEPT override; 116 | 117 | void setPluginNamespace(const char *libNamespace) TRT_NOEXCEPT override 118 | { 119 | mNamespace = libNamespace; 120 | } 121 | 122 | const char *getPluginNamespace() const TRT_NOEXCEPT override 123 | { 124 | return mNamespace.c_str(); 125 | } 126 | 127 | private: 128 | std::string mNamespace; 129 | static PluginFieldCollection mFC; 130 | static std::vector mPluginAttributes; 131 | }; 132 | REGISTER_TENSORRT_PLUGIN(YoloPluginCreator); 133 | }; 134 | 135 | #endif // _YOLO_LAYER_H 136 | -------------------------------------------------------------------------------- /yolov5.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "cuda_utils.h" 5 | #include "logging.h" 6 | #include "common.hpp" 7 | #include "utils.h" 8 | #include "calibrator.h" 9 | 10 | #define USE_FP16 // set USE_INT8 or USE_FP16 or USE_FP32 11 | #define DEVICE 0 // GPU id 12 | #define NMS_THRESH 0.45 13 | #define CONF_THRESH 0.25 14 | #define BATCH_SIZE 2 15 | #define KEEP_TOPK 20 16 | 17 | #define NET m // s m l x 18 | #define NETSTRUCT(str) createEngine_##str 19 | #define CREATENET(net) NETSTRUCT(net) 20 | #define STR1(x) #x 21 | #define STR2(x) STR1(x) 22 | 23 | // stuff we know about the network and the input/output blobs 24 | static const int INPUT_H = Yolo::INPUT_H; 25 | static const int INPUT_W = Yolo::INPUT_W; 26 | static const int CLASS_NUM = Yolo::CLASS_NUM; 27 | const char *INPUT_NAME = "data"; 28 | const char *OUTPUT_COUNTS = "count"; 29 | const char *OUTPUT_BOXES = "box"; 30 | const char *OUTPUT_SCORES = "score"; 31 | const char *OUTPUT_CLASSES = "class"; 32 | 33 | static Logger gLogger; 34 | 35 | static int get_width(int x, float gw, int divisor = 8) 36 | { 37 | return int(ceil((x * gw) / divisor)) * divisor; 38 | } 39 | 40 | static int get_depth(int x, float gd) 41 | { 42 | if (x == 1) 43 | return 1; 44 | int r = round(x * gd); 45 | 
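// get_width() rounds the scaled channel count up to a multiple of `divisor` (8 by
// default); get_depth() scales the number of bottleneck repeats by gd. The check
// below handles the case where x * gd lands exactly on .5 with an even integer
// part: C++ round(2.5) gives 3, so r is decremented, apparently to reproduce the
// repeat counts ultralytics computes with Python's round-half-to-even.
// Worked example for yolov5m (gd = 0.67, gw = 0.75, as set in parse_args below):
//   get_width(128, 0.75) = ceil(96 / 8) * 8 = 96 channels
//   get_depth(9, 0.67)   = round(6.03)      = 6 repeats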
if (x * gd - int(x * gd) == 0.5 && (int(x * gd) % 2) == 0) 46 | { 47 | --r; 48 | } 49 | return std::max(r, 1); 50 | } 51 | 52 | ICudaEngine *build_engine(unsigned int maxBatchSize, IBuilder *builder, IBuilderConfig *config, DataType dt, float &gd, float &gw, std::string &wts_name) 53 | { 54 | INetworkDefinition *network = builder->createNetworkV2(0U); 55 | 56 | // Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_NAME 57 | ITensor *data = network->addInput(INPUT_NAME, dt, Dims3{3, INPUT_H, INPUT_W}); 58 | assert(data); 59 | 60 | Weights Div_255{DataType::kFLOAT, nullptr, 3}; 61 | float *wgt = new float[3]; 62 | for (unsigned int i = 0; i < 3; i++) 63 | wgt[i] = 255.0f; 64 | Div_255.values = wgt; 65 | IConstantLayer *d = network->addConstant(Dims3{3, 1, 1}, Div_255); 66 | auto input = network->addElementWise(*data, *d->getOutput(0), ElementWiseOperation::kDIV); 67 | 68 | std::map weightMap = loadWeights(wts_name); 69 | 70 | /* ------ yolov5 backbone------ */ 71 | auto focus0 = focus(network, weightMap, *input->getOutput(0), 3, get_width(64, gw), 3, "model.0"); 72 | auto conv1 = convBlock(network, weightMap, *focus0->getOutput(0), get_width(128, gw), 3, 2, 1, "model.1"); 73 | auto bottleneck_CSP2 = C3(network, weightMap, *conv1->getOutput(0), get_width(128, gw), get_width(128, gw), get_depth(3, gd), true, 1, 0.5, "model.2"); 74 | auto conv3 = convBlock(network, weightMap, *bottleneck_CSP2->getOutput(0), get_width(256, gw), 3, 2, 1, "model.3"); 75 | auto bottleneck_csp4 = C3(network, weightMap, *conv3->getOutput(0), get_width(256, gw), get_width(256, gw), get_depth(9, gd), true, 1, 0.5, "model.4"); 76 | auto conv5 = convBlock(network, weightMap, *bottleneck_csp4->getOutput(0), get_width(512, gw), 3, 2, 1, "model.5"); 77 | auto bottleneck_csp6 = C3(network, weightMap, *conv5->getOutput(0), get_width(512, gw), get_width(512, gw), get_depth(9, gd), true, 1, 0.5, "model.6"); 78 | auto conv7 = convBlock(network, weightMap, *bottleneck_csp6->getOutput(0), get_width(1024, gw), 3, 2, 1, "model.7"); 79 | auto spp8 = SPP(network, weightMap, *conv7->getOutput(0), get_width(1024, gw), get_width(1024, gw), 5, 9, 13, "model.8"); 80 | 81 | /* ------ yolov5 head ------ */ 82 | auto bottleneck_csp9 = C3(network, weightMap, *spp8->getOutput(0), get_width(1024, gw), get_width(1024, gw), get_depth(3, gd), false, 1, 0.5, "model.9"); 83 | auto conv10 = convBlock(network, weightMap, *bottleneck_csp9->getOutput(0), get_width(512, gw), 1, 1, 1, "model.10"); 84 | 85 | auto upsample11 = network->addResize(*conv10->getOutput(0)); 86 | assert(upsample11); 87 | upsample11->setResizeMode(ResizeMode::kNEAREST); 88 | upsample11->setOutputDimensions(bottleneck_csp6->getOutput(0)->getDimensions()); 89 | 90 | ITensor *inputTensors12[] = {upsample11->getOutput(0), bottleneck_csp6->getOutput(0)}; 91 | auto cat12 = network->addConcatenation(inputTensors12, 2); 92 | auto bottleneck_csp13 = C3(network, weightMap, *cat12->getOutput(0), get_width(1024, gw), get_width(512, gw), get_depth(3, gd), false, 1, 0.5, "model.13"); 93 | auto conv14 = convBlock(network, weightMap, *bottleneck_csp13->getOutput(0), get_width(256, gw), 1, 1, 1, "model.14"); 94 | 95 | auto upsample15 = network->addResize(*conv14->getOutput(0)); 96 | assert(upsample15); 97 | upsample15->setResizeMode(ResizeMode::kNEAREST); 98 | upsample15->setOutputDimensions(bottleneck_csp4->getOutput(0)->getDimensions()); 99 | 100 | ITensor *inputTensors16[] = {upsample15->getOutput(0), bottleneck_csp4->getOutput(0)}; 101 | auto cat16 = 
network->addConcatenation(inputTensors16, 2); 102 | 103 | auto bottleneck_csp17 = C3(network, weightMap, *cat16->getOutput(0), get_width(512, gw), get_width(256, gw), get_depth(3, gd), false, 1, 0.5, "model.17"); 104 | 105 | /* ------ detect ------ */ 106 | IConvolutionLayer *det0 = network->addConvolutionNd(*bottleneck_csp17->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["model.24.m.0.weight"], weightMap["model.24.m.0.bias"]); 107 | auto conv18 = convBlock(network, weightMap, *bottleneck_csp17->getOutput(0), get_width(256, gw), 3, 2, 1, "model.18"); 108 | ITensor *inputTensors19[] = {conv18->getOutput(0), conv14->getOutput(0)}; 109 | auto cat19 = network->addConcatenation(inputTensors19, 2); 110 | auto bottleneck_csp20 = C3(network, weightMap, *cat19->getOutput(0), get_width(512, gw), get_width(512, gw), get_depth(3, gd), false, 1, 0.5, "model.20"); 111 | IConvolutionLayer *det1 = network->addConvolutionNd(*bottleneck_csp20->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["model.24.m.1.weight"], weightMap["model.24.m.1.bias"]); 112 | auto conv21 = convBlock(network, weightMap, *bottleneck_csp20->getOutput(0), get_width(512, gw), 3, 2, 1, "model.21"); 113 | ITensor *inputTensors22[] = {conv21->getOutput(0), conv10->getOutput(0)}; 114 | auto cat22 = network->addConcatenation(inputTensors22, 2); 115 | auto bottleneck_csp23 = C3(network, weightMap, *cat22->getOutput(0), get_width(1024, gw), get_width(1024, gw), get_depth(3, gd), false, 1, 0.5, "model.23"); 116 | IConvolutionLayer *det2 = network->addConvolutionNd(*bottleneck_csp23->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["model.24.m.2.weight"], weightMap["model.24.m.2.bias"]); 117 | 118 | auto yolo = addYoLoLayer(network, weightMap, "model.24", std::vector{det0, det1, det2}); 119 | auto nms = addBatchedNMSLayer(network, yolo, Yolo::CLASS_NUM, Yolo::MAX_OUTPUT_BBOX_COUNT, KEEP_TOPK, CONF_THRESH, NMS_THRESH); 120 | 121 | nms->getOutput(0)->setName(OUTPUT_COUNTS); 122 | network->markOutput(*nms->getOutput(0)); 123 | 124 | nms->getOutput(1)->setName(OUTPUT_BOXES); 125 | network->markOutput(*nms->getOutput(1)); 126 | 127 | nms->getOutput(2)->setName(OUTPUT_SCORES); 128 | network->markOutput(*nms->getOutput(2)); 129 | 130 | nms->getOutput(3)->setName(OUTPUT_CLASSES); 131 | network->markOutput(*nms->getOutput(3)); 132 | 133 | // Build engine 134 | builder->setMaxBatchSize(maxBatchSize); 135 | config->setMaxWorkspaceSize(16 * (1 << 20)); // 16MB 136 | #if defined(USE_FP16) 137 | config->setFlag(BuilderFlag::kFP16); 138 | #elif defined(USE_INT8) 139 | std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl; 140 | assert(builder->platformHasFastInt8()); 141 | config->setFlag(BuilderFlag::kINT8); 142 | Int8EntropyCalibrator2 *calibrator = new Int8EntropyCalibrator2(1, INPUT_W, INPUT_H, "./coco_calib/", "int8calib.table", INPUT_NAME); 143 | config->setInt8Calibrator(calibrator); 144 | #endif 145 | 146 | std::cout << "Building engine, please wait for a while..." << std::endl; 147 | ICudaEngine *engine = builder->buildEngineWithConfig(*network, *config); 148 | std::cout << "Build engine successfully!" 
<< std::endl; 149 | 150 | // Don't need the network any more 151 | network->destroy(); 152 | 153 | // Release host memory 154 | for (auto &mem : weightMap) 155 | { 156 | free((void *)(mem.second.values)); 157 | } 158 | 159 | return engine; 160 | } 161 | 162 | ICudaEngine *build_engine_p6(unsigned int maxBatchSize, IBuilder *builder, IBuilderConfig *config, DataType dt, float &gd, float &gw, std::string &wts_name) 163 | { 164 | INetworkDefinition *network = builder->createNetworkV2(0U); 165 | 166 | // Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_NAME 167 | ITensor *data = network->addInput(INPUT_NAME, dt, Dims3{3, INPUT_H, INPUT_W}); 168 | assert(data); 169 | 170 | Weights Div_255{DataType::kFLOAT, nullptr, 3}; 171 | float *wgt = new float[3]; 172 | for (unsigned int i = 0; i < 3; i++) 173 | wgt[i] = 255.0f; 174 | Div_255.values = wgt; 175 | IConstantLayer *d = network->addConstant(Dims3{3, 1, 1}, Div_255); 176 | auto input = network->addElementWise(*data, *d->getOutput(0), ElementWiseOperation::kDIV); 177 | 178 | std::map weightMap = loadWeights(wts_name); 179 | 180 | /* ------ yolov5 backbone------ */ 181 | auto focus0 = focus(network, weightMap, *input->getOutput(0), 3, get_width(64, gw), 3, "model.0"); 182 | auto conv1 = convBlock(network, weightMap, *focus0->getOutput(0), get_width(128, gw), 3, 2, 1, "model.1"); 183 | auto c3_2 = C3(network, weightMap, *conv1->getOutput(0), get_width(128, gw), get_width(128, gw), get_depth(3, gd), true, 1, 0.5, "model.2"); 184 | auto conv3 = convBlock(network, weightMap, *c3_2->getOutput(0), get_width(256, gw), 3, 2, 1, "model.3"); 185 | auto c3_4 = C3(network, weightMap, *conv3->getOutput(0), get_width(256, gw), get_width(256, gw), get_depth(9, gd), true, 1, 0.5, "model.4"); 186 | auto conv5 = convBlock(network, weightMap, *c3_4->getOutput(0), get_width(512, gw), 3, 2, 1, "model.5"); 187 | auto c3_6 = C3(network, weightMap, *conv5->getOutput(0), get_width(512, gw), get_width(512, gw), get_depth(9, gd), true, 1, 0.5, "model.6"); 188 | auto conv7 = convBlock(network, weightMap, *c3_6->getOutput(0), get_width(768, gw), 3, 2, 1, "model.7"); 189 | auto c3_8 = C3(network, weightMap, *conv7->getOutput(0), get_width(768, gw), get_width(768, gw), get_depth(3, gd), true, 1, 0.5, "model.8"); 190 | auto conv9 = convBlock(network, weightMap, *c3_8->getOutput(0), get_width(1024, gw), 3, 2, 1, "model.9"); 191 | auto spp10 = SPP(network, weightMap, *conv9->getOutput(0), get_width(1024, gw), get_width(1024, gw), 3, 5, 7, "model.10"); 192 | auto c3_11 = C3(network, weightMap, *spp10->getOutput(0), get_width(1024, gw), get_width(1024, gw), get_depth(3, gd), false, 1, 0.5, "model.11"); 193 | 194 | /* ------ yolov5 head ------ */ 195 | auto conv12 = convBlock(network, weightMap, *c3_11->getOutput(0), get_width(768, gw), 1, 1, 1, "model.12"); 196 | auto upsample13 = network->addResize(*conv12->getOutput(0)); 197 | assert(upsample13); 198 | upsample13->setResizeMode(ResizeMode::kNEAREST); 199 | upsample13->setOutputDimensions(c3_8->getOutput(0)->getDimensions()); 200 | ITensor *inputTensors14[] = {upsample13->getOutput(0), c3_8->getOutput(0)}; 201 | auto cat14 = network->addConcatenation(inputTensors14, 2); 202 | auto c3_15 = C3(network, weightMap, *cat14->getOutput(0), get_width(1536, gw), get_width(768, gw), get_depth(3, gd), false, 1, 0.5, "model.15"); 203 | 204 | auto conv16 = convBlock(network, weightMap, *c3_15->getOutput(0), get_width(512, gw), 1, 1, 1, "model.16"); 205 | auto upsample17 = network->addResize(*conv16->getOutput(0)); 206 | 
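// As with upsample13 above, the resize layer is configured below as nearest-neighbour
// and forced to the spatial dimensions of the skip tensor (c3_6 here), so that the
// following addConcatenation() can join the two feature maps (channel-wise by default).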
assert(upsample17); 207 | upsample17->setResizeMode(ResizeMode::kNEAREST); 208 | upsample17->setOutputDimensions(c3_6->getOutput(0)->getDimensions()); 209 | ITensor *inputTensors18[] = {upsample17->getOutput(0), c3_6->getOutput(0)}; 210 | auto cat18 = network->addConcatenation(inputTensors18, 2); 211 | auto c3_19 = C3(network, weightMap, *cat18->getOutput(0), get_width(1024, gw), get_width(512, gw), get_depth(3, gd), false, 1, 0.5, "model.19"); 212 | 213 | auto conv20 = convBlock(network, weightMap, *c3_19->getOutput(0), get_width(256, gw), 1, 1, 1, "model.20"); 214 | auto upsample21 = network->addResize(*conv20->getOutput(0)); 215 | assert(upsample21); 216 | upsample21->setResizeMode(ResizeMode::kNEAREST); 217 | upsample21->setOutputDimensions(c3_4->getOutput(0)->getDimensions()); 218 | ITensor *inputTensors21[] = {upsample21->getOutput(0), c3_4->getOutput(0)}; 219 | auto cat22 = network->addConcatenation(inputTensors21, 2); 220 | auto c3_23 = C3(network, weightMap, *cat22->getOutput(0), get_width(512, gw), get_width(256, gw), get_depth(3, gd), false, 1, 0.5, "model.23"); 221 | 222 | auto conv24 = convBlock(network, weightMap, *c3_23->getOutput(0), get_width(256, gw), 3, 2, 1, "model.24"); 223 | ITensor *inputTensors25[] = {conv24->getOutput(0), conv20->getOutput(0)}; 224 | auto cat25 = network->addConcatenation(inputTensors25, 2); 225 | auto c3_26 = C3(network, weightMap, *cat25->getOutput(0), get_width(1024, gw), get_width(512, gw), get_depth(3, gd), false, 1, 0.5, "model.26"); 226 | 227 | auto conv27 = convBlock(network, weightMap, *c3_26->getOutput(0), get_width(512, gw), 3, 2, 1, "model.27"); 228 | ITensor *inputTensors28[] = {conv27->getOutput(0), conv16->getOutput(0)}; 229 | auto cat28 = network->addConcatenation(inputTensors28, 2); 230 | auto c3_29 = C3(network, weightMap, *cat28->getOutput(0), get_width(1536, gw), get_width(768, gw), get_depth(3, gd), false, 1, 0.5, "model.29"); 231 | 232 | auto conv30 = convBlock(network, weightMap, *c3_29->getOutput(0), get_width(768, gw), 3, 2, 1, "model.30"); 233 | ITensor *inputTensors31[] = {conv30->getOutput(0), conv12->getOutput(0)}; 234 | auto cat31 = network->addConcatenation(inputTensors31, 2); 235 | auto c3_32 = C3(network, weightMap, *cat31->getOutput(0), get_width(2048, gw), get_width(1024, gw), get_depth(3, gd), false, 1, 0.5, "model.32"); 236 | 237 | /* ------ detect ------ */ 238 | IConvolutionLayer *det0 = network->addConvolutionNd(*c3_23->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["model.33.m.0.weight"], weightMap["model.33.m.0.bias"]); 239 | IConvolutionLayer *det1 = network->addConvolutionNd(*c3_26->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["model.33.m.1.weight"], weightMap["model.33.m.1.bias"]); 240 | IConvolutionLayer *det2 = network->addConvolutionNd(*c3_29->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["model.33.m.2.weight"], weightMap["model.33.m.2.bias"]); 241 | IConvolutionLayer *det3 = network->addConvolutionNd(*c3_32->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["model.33.m.3.weight"], weightMap["model.33.m.3.bias"]); 242 | 243 | auto yolo = addYoLoLayer(network, weightMap, "model.33", std::vector{det0, det1, det2, det3}); 244 | auto nms = addBatchedNMSLayer(network, yolo, Yolo::CLASS_NUM, Yolo::MAX_OUTPUT_BBOX_COUNT, KEEP_TOPK, CONF_THRESH, NMS_THRESH); 245 | 246 | nms->getOutput(0)->setName(OUTPUT_COUNTS); 247 | network->markOutput(*nms->getOutput(0)); 248 | 249 | nms->getOutput(1)->setName(OUTPUT_BOXES); 250 | 
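// The batched NMS layer exposes four outputs per image: a detection count (int32),
// KEEP_TOPK boxes as [x1, y1, x2, y2], KEEP_TOPK scores and KEEP_TOPK class ids;
// these are the same shapes as the host/device buffers allocated in main() and
// copied back in doInference().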
network->markOutput(*nms->getOutput(1)); 251 | 252 | nms->getOutput(2)->setName(OUTPUT_SCORES); 253 | network->markOutput(*nms->getOutput(2)); 254 | 255 | nms->getOutput(3)->setName(OUTPUT_CLASSES); 256 | network->markOutput(*nms->getOutput(3)); 257 | 258 | // Build engine 259 | builder->setMaxBatchSize(maxBatchSize); 260 | config->setMaxWorkspaceSize(16 * (1 << 20)); // 16MB 261 | #if defined(USE_FP16) 262 | config->setFlag(BuilderFlag::kFP16); 263 | #elif defined(USE_INT8) 264 | std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl; 265 | assert(builder->platformHasFastInt8()); 266 | config->setFlag(BuilderFlag::kINT8); 267 | Int8EntropyCalibrator2 *calibrator = new Int8EntropyCalibrator2(1, INPUT_W, INPUT_H, "./coco_calib/", "int8calib.table", INPUT_NAME); 268 | config->setInt8Calibrator(calibrator); 269 | #endif 270 | 271 | std::cout << "Building engine, please wait for a while..." << std::endl; 272 | ICudaEngine *engine = builder->buildEngineWithConfig(*network, *config); 273 | std::cout << "Build engine successfully!" << std::endl; 274 | 275 | // Don't need the network any more 276 | network->destroy(); 277 | 278 | // Release host memory 279 | for (auto &mem : weightMap) 280 | { 281 | free((void *)(mem.second.values)); 282 | } 283 | 284 | return engine; 285 | } 286 | 287 | void APIToModel(unsigned int maxBatchSize, IHostMemory **modelStream, bool &is_p6, float &gd, float &gw, std::string &wts_name) 288 | { 289 | // Create builder 290 | IBuilder *builder = createInferBuilder(gLogger); 291 | IBuilderConfig *config = builder->createBuilderConfig(); 292 | 293 | // Create model to populate the network, then set the outputs and create an engine 294 | ICudaEngine *engine = nullptr; 295 | if (is_p6) 296 | { 297 | engine = build_engine_p6(maxBatchSize, builder, config, DataType::kFLOAT, gd, gw, wts_name); 298 | } 299 | else 300 | { 301 | engine = build_engine(maxBatchSize, builder, config, DataType::kFLOAT, gd, gw, wts_name); 302 | } 303 | assert(engine != nullptr); 304 | 305 | // Serialize the engine 306 | (*modelStream) = engine->serialize(); 307 | 308 | // Close everything down 309 | engine->destroy(); 310 | builder->destroy(); 311 | config->destroy(); 312 | } 313 | 314 | void doInference(IExecutionContext &context, cudaStream_t &stream, void **buffers, float *input, int *counts, float *boxes, float *scores, float *classes, int inputIndex, int countIndex, int bboxIndex, int scoreIndex, int classIndex, int batchSize) 315 | { 316 | // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host 317 | CUDA_CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream)); 318 | context.enqueue(batchSize, buffers, stream, nullptr); 319 | CUDA_CHECK(cudaMemcpyAsync(counts, buffers[countIndex], batchSize * sizeof(int), cudaMemcpyDeviceToHost, stream)); 320 | CUDA_CHECK(cudaMemcpyAsync(boxes, buffers[bboxIndex], batchSize * KEEP_TOPK * 4 * sizeof(float), cudaMemcpyDeviceToHost, stream)); 321 | CUDA_CHECK(cudaMemcpyAsync(scores, buffers[scoreIndex], batchSize * KEEP_TOPK * sizeof(float), cudaMemcpyDeviceToHost, stream)); 322 | CUDA_CHECK(cudaMemcpyAsync(classes, buffers[classIndex], batchSize * KEEP_TOPK * sizeof(float), cudaMemcpyDeviceToHost, stream)); 323 | 324 | cudaStreamSynchronize(stream); 325 | } 326 | 327 | bool parse_args(int argc, char **argv, std::string &wts, std::string &engine, bool &is_p6, float &gd, float &gw, std::string 
&img_dir) 328 | { 329 | if (argc < 4) 330 | return false; 331 | if (std::string(argv[1]) == "-s" && (argc == 5 || argc == 7)) 332 | { 333 | wts = std::string(argv[2]); 334 | engine = std::string(argv[3]); 335 | auto net = std::string(argv[4]); 336 | if (net[0] == 's') 337 | { 338 | gd = 0.33; 339 | gw = 0.50; 340 | } 341 | else if (net[0] == 'm') 342 | { 343 | gd = 0.67; 344 | gw = 0.75; 345 | } 346 | else if (net[0] == 'l') 347 | { 348 | gd = 1.0; 349 | gw = 1.0; 350 | } 351 | else if (net[0] == 'x') 352 | { 353 | gd = 1.33; 354 | gw = 1.25; 355 | } 356 | else if (net[0] == 'c' && argc == 7) 357 | { 358 | gd = atof(argv[5]); 359 | gw = atof(argv[6]); 360 | } 361 | else 362 | { 363 | return false; 364 | } 365 | if (net.size() == 2 && net[1] == '6') 366 | { 367 | is_p6 = true; 368 | } 369 | } 370 | else if (std::string(argv[1]) == "-d" && argc == 4) 371 | { 372 | engine = std::string(argv[2]); 373 | img_dir = std::string(argv[3]); 374 | } 375 | else 376 | { 377 | return false; 378 | } 379 | return true; 380 | } 381 | 382 | int main(int argc, char **argv) 383 | { 384 | cudaSetDevice(DEVICE); 385 | initLibNvInferPlugins(&gLogger, ""); 386 | 387 | std::string wts_name = ""; 388 | std::string engine_name = ""; 389 | bool is_p6 = false; 390 | float gd = 0.0f, gw = 0.0f; 391 | std::string img_dir; 392 | if (!parse_args(argc, argv, wts_name, engine_name, is_p6, gd, gw, img_dir)) 393 | { 394 | std::cerr << "arguments not right!" << std::endl; 395 | std::cerr << "./yolov5 -s [.wts] [.engine] [s/m/l/x/s6/m6/l6/x6 or c/c6 gd gw] // serialize model to plan file" << std::endl; 396 | std::cerr << "./yolov5 -d [.engine] ../samples // deserialize plan file and run inference" << std::endl; 397 | return -1; 398 | } 399 | 400 | // create a model using the API directly and serialize it to a stream 401 | if (!wts_name.empty()) 402 | { 403 | IHostMemory *modelStream{nullptr}; 404 | APIToModel(BATCH_SIZE, &modelStream, is_p6, gd, gw, wts_name); 405 | assert(modelStream != nullptr); 406 | std::ofstream p(engine_name, std::ios::binary); 407 | if (!p) 408 | { 409 | std::cerr << "could not open plan output file" << std::endl; 410 | return -1; 411 | } 412 | p.write(reinterpret_cast(modelStream->data()), modelStream->size()); 413 | modelStream->destroy(); 414 | return 0; 415 | } 416 | 417 | // deserialize the .engine and run inference 418 | std::ifstream file(engine_name, std::ios::binary); 419 | if (!file.good()) 420 | { 421 | std::cerr << "read " << engine_name << " error!" << std::endl; 422 | return -1; 423 | } 424 | char *trtModelStream = nullptr; 425 | size_t size = 0; 426 | file.seekg(0, file.end); 427 | size = file.tellg(); 428 | file.seekg(0, file.beg); 429 | trtModelStream = new char[size]; 430 | assert(trtModelStream); 431 | file.read(trtModelStream, size); 432 | file.close(); 433 | 434 | std::vector file_names; 435 | if (read_files_in_dir(img_dir.c_str(), file_names) < 0) 436 | { 437 | std::cerr << "read_files_in_dir failed." 
<< std::endl; 438 | return -1; 439 | } 440 | 441 | // prepare input data --------------------------- 442 | static float data[BATCH_SIZE * 3 * INPUT_H * INPUT_W]; 443 | static int counts[BATCH_SIZE]; 444 | static float boxes[BATCH_SIZE * KEEP_TOPK * 4]; 445 | static float scores[BATCH_SIZE * KEEP_TOPK]; 446 | static float classes[BATCH_SIZE * KEEP_TOPK]; 447 | 448 | IRuntime *runtime = createInferRuntime(gLogger); 449 | assert(runtime != nullptr); 450 | ICudaEngine *engine = runtime->deserializeCudaEngine(trtModelStream, size); 451 | assert(engine != nullptr); 452 | IExecutionContext *context = engine->createExecutionContext(); 453 | assert(context != nullptr); 454 | delete[] trtModelStream; 455 | assert(engine->getNbBindings() == 5); 456 | void *buffers[5]; 457 | // In order to bind the buffers, we need to know the names of the input and output tensors. 458 | // Note that indices are guaranteed to be less than IEngine::getNbBindings() 459 | const int inputIndex = engine->getBindingIndex(INPUT_NAME); 460 | const int countIndex = engine->getBindingIndex(OUTPUT_COUNTS); 461 | const int bboxIndex = engine->getBindingIndex(OUTPUT_BOXES); 462 | const int scoreIndex = engine->getBindingIndex(OUTPUT_SCORES); 463 | const int classIndex = engine->getBindingIndex(OUTPUT_CLASSES); 464 | // Create GPU buffers on device 465 | CUDA_CHECK(cudaMalloc(&buffers[inputIndex], BATCH_SIZE * 3 * INPUT_H * INPUT_W * sizeof(float))); 466 | CUDA_CHECK(cudaMalloc(&buffers[countIndex], BATCH_SIZE * sizeof(int))); 467 | CUDA_CHECK(cudaMalloc(&buffers[bboxIndex], BATCH_SIZE * KEEP_TOPK * 4 * sizeof(float))); 468 | CUDA_CHECK(cudaMalloc(&buffers[scoreIndex], BATCH_SIZE * KEEP_TOPK * sizeof(float))); 469 | CUDA_CHECK(cudaMalloc(&buffers[classIndex], BATCH_SIZE * KEEP_TOPK * sizeof(float))); 470 | 471 | // Create stream 472 | cudaStream_t stream; 473 | CUDA_CHECK(cudaStreamCreate(&stream)); 474 | 475 | int fcount = 0; 476 | for (int f = 0; f < (int)file_names.size(); f++) 477 | { 478 | fcount++; 479 | if (fcount < BATCH_SIZE && f + 1 != (int)file_names.size()) 480 | continue; 481 | for (int b = 0; b < fcount; b++) 482 | { 483 | cv::Mat img = cv::imread(img_dir + "/" + file_names[f - fcount + 1 + b]); 484 | if (img.empty()) 485 | continue; 486 | cv::Mat pr_img = preprocess_img(img, INPUT_W, INPUT_H); // letterbox BGR to RGB 487 | int i = 0; 488 | for (int row = 0; row < INPUT_H; ++row) 489 | { 490 | uchar *uc_pixel = pr_img.data + row * pr_img.step; 491 | for (int col = 0; col < INPUT_W; ++col) 492 | { 493 | data[b * 3 * INPUT_H * INPUT_W + i] = (float)uc_pixel[2]; 494 | data[b * 3 * INPUT_H * INPUT_W + i + INPUT_H * INPUT_W] = (float)uc_pixel[1]; 495 | data[b * 3 * INPUT_H * INPUT_W + i + 2 * INPUT_H * INPUT_W] = (float)uc_pixel[0]; 496 | uc_pixel += 3; 497 | ++i; 498 | } 499 | } 500 | } 501 | 502 | // Run inference 503 | auto start = std::chrono::system_clock::now(); 504 | doInference(*context, stream, buffers, data, counts, boxes, scores, classes, inputIndex, countIndex, bboxIndex, scoreIndex, classIndex, BATCH_SIZE); 505 | auto end = std::chrono::system_clock::now(); 506 | std::cout << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; 507 | for (int b = 0; b < fcount; b++) 508 | { 509 | std::cout << "detect count " << counts[b] << std::endl; 510 | cv::Mat img = cv::imread(std::string(argv[3]) + "/" + file_names[f - fcount + 1 + b]); 511 | for (int j = 0; j < counts[b]; j++) 512 | { 513 | float *curBbox = boxes + (b * KEEP_TOPK + j) * 4; 514 | float *curScore = scores + (b * KEEP_TOPK + j); 515 | 
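// boxes/scores/classes are flattened [BATCH_SIZE][KEEP_TOPK][...] host arrays, so
// detection j of image b starts at offset (b * KEEP_TOPK + j) * 4 in boxes and at
// (b * KEEP_TOPK + j) in scores and classes; counts[b] tells how many of the
// KEEP_TOPK slots actually contain detections.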
float *curClass = classes + (b * KEEP_TOPK + j); 516 | for (int i = 0; i < 4; i++) 517 | { 518 | std::cout << curBbox[i] << " " << std::endl; 519 | } 520 | cv::Rect r = get_rect(img, curBbox); 521 | cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2); 522 | cv::putText(img, std::to_string(int(*curClass)) + " " + std::to_string(*curScore), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2); 523 | } 524 | cv::imwrite("_" + file_names[f - fcount + 1 + b], img); 525 | } 526 | fcount = 0; 527 | } 528 | 529 | // Release stream and buffers 530 | cudaStreamDestroy(stream); 531 | CUDA_CHECK(cudaFree(buffers[inputIndex])); 532 | CUDA_CHECK(cudaFree(buffers[countIndex])); 533 | CUDA_CHECK(cudaFree(buffers[bboxIndex])); 534 | CUDA_CHECK(cudaFree(buffers[scoreIndex])); 535 | CUDA_CHECK(cudaFree(buffers[classIndex])); // Destroy the engine 536 | context->destroy(); 537 | engine->destroy(); 538 | runtime->destroy(); 539 | 540 | return 0; 541 | } 542 | -------------------------------------------------------------------------------- /yolov5_trt.py: -------------------------------------------------------------------------------- 1 | """ 2 | An example that uses TensorRT's Python api to make inferences. 3 | """ 4 | import ctypes 5 | import os 6 | import shutil 7 | import random 8 | import sys 9 | import threading 10 | import time 11 | import cv2 12 | import numpy as np 13 | import pycuda.autoinit 14 | import pycuda.driver as cuda 15 | import tensorrt as trt 16 | 17 | 18 | def get_img_path_batches(batch_size, img_dir): 19 | ret = [] 20 | batch = [] 21 | for root, dirs, files in os.walk(img_dir): 22 | for name in files: 23 | if len(batch) == batch_size: 24 | ret.append(batch) 25 | batch = [] 26 | batch.append(os.path.join(root, name)) 27 | if len(batch) > 0: 28 | ret.append(batch) 29 | return ret 30 | 31 | 32 | def plot_one_box(x, img, color=None, label=None, line_thickness=None): 33 | """ 34 | description: Plots one bounding box on image img, 35 | this function comes from YoLov5 project. 36 | param: 37 | x: a box likes [x1,y1,x2,y2] 38 | img: a opencv image object 39 | color: color to draw rectangle, such as (0,255,0) 40 | label: str 41 | line_thickness: int 42 | return: 43 | no return 44 | 45 | """ 46 | tl = ( 47 | line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1 48 | ) # line/font thickness 49 | color = color or [random.randint(0, 255) for _ in range(3)] 50 | c1, c2 = (int(x[0]), int(x[1])), (int(x[2]), int(x[3])) 51 | cv2.rectangle(img, c1, c2, color, thickness=tl, lineType=cv2.LINE_AA) 52 | if label: 53 | tf = max(tl - 1, 1) # font thickness 54 | t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0] 55 | c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3 56 | cv2.rectangle(img, c1, c2, color, -1, cv2.LINE_AA) # filled 57 | cv2.putText( 58 | img, 59 | label, 60 | (c1[0], c1[1] - 2), 61 | 0, 62 | tl / 3, 63 | [225, 255, 255], 64 | thickness=tf, 65 | lineType=cv2.LINE_AA, 66 | ) 67 | 68 | 69 | class YoLov5TRT(object): 70 | """ 71 | description: A YOLOv5 class that warps TensorRT ops, preprocess and postprocess ops. 
72 | """ 73 | 74 | def __init__(self, engine_file_path): 75 | # Create a Context on this device, 76 | self.ctx = cuda.Device(0).make_context() 77 | stream = cuda.Stream() 78 | runtime = trt.Runtime(TRT_LOGGER) 79 | 80 | # Deserialize the engine from file 81 | with open(engine_file_path, "rb") as f: 82 | engine = runtime.deserialize_cuda_engine(f.read()) 83 | context = engine.create_execution_context() 84 | 85 | host_inputs = [] 86 | cuda_inputs = [] 87 | host_outputs = [] 88 | cuda_outputs = [] 89 | bindings = [] 90 | 91 | for binding in engine: 92 | print('bingding:', binding, engine.get_binding_shape(binding)) 93 | size = trt.volume(engine.get_binding_shape( 94 | binding)) * engine.max_batch_size 95 | dtype = trt.nptype(engine.get_binding_dtype(binding)) 96 | # Allocate host and device buffers 97 | host_mem = cuda.pagelocked_empty(size, dtype) 98 | cuda_mem = cuda.mem_alloc(host_mem.nbytes) 99 | # Append the device buffer to device bindings. 100 | bindings.append(int(cuda_mem)) 101 | # Append to the appropriate list. 102 | if engine.binding_is_input(binding): 103 | self.input_w = engine.get_binding_shape(binding)[-1] 104 | self.input_h = engine.get_binding_shape(binding)[-2] 105 | host_inputs.append(host_mem) 106 | cuda_inputs.append(cuda_mem) 107 | else: 108 | host_outputs.append(host_mem) 109 | cuda_outputs.append(cuda_mem) 110 | 111 | # Store 112 | self.stream = stream 113 | self.context = context 114 | self.engine = engine 115 | self.host_inputs = host_inputs 116 | self.cuda_inputs = cuda_inputs 117 | self.host_outputs = host_outputs 118 | self.cuda_outputs = cuda_outputs 119 | self.bindings = bindings 120 | self.batch_size = engine.max_batch_size 121 | 122 | def infer(self, raw_image_generator): 123 | threading.Thread.__init__(self) 124 | # Make self the active context, pushing it on top of the context stack. 125 | self.ctx.push() 126 | # Do image preprocess 127 | batch_image_raw = [] 128 | batch_origin_h = [] 129 | batch_origin_w = [] 130 | batch_input_image = np.empty( 131 | shape=[self.batch_size, 3, self.input_h, self.input_w]) 132 | for i, image_raw in enumerate(raw_image_generator): 133 | input_image, image_raw, origin_h, origin_w = self.preprocess_image( 134 | image_raw) 135 | batch_image_raw.append(image_raw) 136 | batch_origin_h.append(origin_h) 137 | batch_origin_w.append(origin_w) 138 | np.copyto(batch_input_image[i], input_image) 139 | batch_input_image = np.ascontiguousarray(batch_input_image) 140 | batch_size = len(batch_input_image) 141 | # Copy input image to host buffer 142 | np.copyto(self.host_inputs[0], batch_input_image.ravel()) 143 | start = time.time() 144 | # Transfer input data to the GPU. 145 | cuda.memcpy_htod_async( 146 | self.cuda_inputs[0], self.host_inputs[0], self.stream) 147 | # Run inference. 148 | self.context.execute_async(batch_size=self.batch_size, 149 | bindings=self.bindings, stream_handle=self.stream.handle) 150 | # Transfer predictions back from the GPU. 151 | for self.host_output, self.cuda_output in zip(self.host_outputs, self.cuda_outputs): 152 | cuda.memcpy_dtoh_async( 153 | self.host_output, self.cuda_output, self.stream) 154 | # Synchronize the stream 155 | self.stream.synchronize() 156 | end = time.time() 157 | # Remove any context from the top of the context stack, deactivating it. 
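# (self.stream.synchronize() above guarantees the device-to-host copies have
# finished, so host_outputs already hold valid results when the context is
# popped and parsed below.)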
158 | self.ctx.pop() 159 | output_counts = self.host_outputs[0] # [b,1] 160 | output_boxes = self.host_outputs[1].reshape( 161 | batch_size, -1, 4) # [b,keep_topk,4] 162 | output_scores = self.host_outputs[2].reshape( 163 | batch_size, -1) # [b,keep_topk] 164 | output_classes = self.host_outputs[3].reshape( 165 | batch_size, -1) # [b,keep_topk] 166 | # Do postprocess 167 | for i in range(batch_size): 168 | result_count = output_counts[i] 169 | result_boxes = output_boxes[i][:result_count] 170 | result_scores = output_scores[i][:result_count] 171 | result_classes = output_classes[i][:result_count] 172 | # Draw rectangles and labels on the original image 173 | for j in range(len(result_boxes)): 174 | box = self.get_rect( 175 | result_boxes[j], batch_origin_h[i], batch_origin_w[i], self.input_h, self.input_w) 176 | plot_one_box( 177 | box, 178 | batch_image_raw[i], 179 | label="{}:{:.2f}".format( 180 | categories[int(result_classes[j])], result_scores[j] 181 | ), 182 | ) 183 | return batch_image_raw, end - start 184 | 185 | def destroy(self): 186 | # Remove any context from the top of the context stack, deactivating it. 187 | self.ctx.pop() 188 | 189 | def get_raw_image(self, image_path_batch): 190 | """ 191 | description: Read an image from image path 192 | """ 193 | for img_path in image_path_batch: 194 | yield cv2.imread(img_path) 195 | 196 | def get_raw_image_zeros(self, image_path_batch=None): 197 | """ 198 | description: Ready data for warmup 199 | """ 200 | for _ in range(self.batch_size): 201 | yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8) 202 | 203 | def preprocess_image(self, raw_bgr_image): 204 | """ 205 | description: Convert BGR image to RGB, 206 | resize and pad it to target size, normalize to [0,1], 207 | transform to NCHW format. 
208 | param: 209 | input_image_path: str, image path 210 | return: 211 | image: the processed image 212 | image_raw: the original image 213 | h: original height 214 | w: original width 215 | """ 216 | image_raw = raw_bgr_image 217 | h, w, c = image_raw.shape 218 | image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB) 219 | # Calculate widht and height and paddings 220 | r_w = self.input_w / w 221 | r_h = self.input_h / h 222 | if r_h > r_w: 223 | tw = self.input_w 224 | th = int(r_w * h) 225 | tx1 = tx2 = 0 226 | ty1 = int((self.input_h - th) / 2) 227 | ty2 = self.input_h - th - ty1 228 | else: 229 | tw = int(r_h * w) 230 | th = self.input_h 231 | tx1 = int((self.input_w - tw) / 2) 232 | tx2 = self.input_w - tw - tx1 233 | ty1 = ty2 = 0 234 | # Resize the image with long side while maintaining ratio 235 | image = cv2.resize(image, (tw, th)) 236 | # Pad the short side with (128,128,128) 237 | image = cv2.copyMakeBorder( 238 | image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, value=( 239 | 128, 128, 128) 240 | ) 241 | image = image.astype(np.float32) 242 | # HWC to CHW format: 243 | image = np.transpose(image, [2, 0, 1]) 244 | # CHW to NCHW format 245 | image = np.expand_dims(image, axis=0) 246 | # Convert the image to row-major order, also known as "C order": 247 | image = np.ascontiguousarray(image) 248 | return image, image_raw, h, w 249 | 250 | def get_rect(self, bbox, image_h, image_w, input_h, input_w): 251 | """ 252 | description: postprocess the bbox 253 | param: 254 | bbox: [x1,y1,x2,y2] 255 | image_h: height of original image 256 | image_w: width of original image 257 | input_h: height of network input 258 | input_w: width of network input 259 | return: 260 | result_bbox: finally box 261 | """ 262 | 263 | result_bbox = [0, 0, 0, 0] 264 | r_w = input_w / (image_w * 1.0) 265 | r_h = input_h / (image_h * 1.0) 266 | if r_h > r_w: 267 | l = bbox[0] / r_w 268 | r = bbox[2] / r_w 269 | t = (bbox[1] - (input_h - r_w * image_h) / 2) / r_w 270 | b = (bbox[3] - (input_h - r_w * image_h) / 2) / r_w 271 | else: 272 | l = (bbox[0] - (input_w - r_h * image_w) / 2) / r_h 273 | r = (bbox[2] - (input_w - r_h * image_w) / 2) / r_h 274 | t = bbox[1] / r_h 275 | b = bbox[3] / r_h 276 | result_bbox[0] = l 277 | result_bbox[1] = t 278 | result_bbox[2] = r 279 | result_bbox[3] = b 280 | return result_bbox 281 | 282 | 283 | class inferThread(threading.Thread): 284 | def __init__(self, yolov5_wrapper, image_path_batch): 285 | threading.Thread.__init__(self) 286 | self.yolov5_wrapper = yolov5_wrapper 287 | self.image_path_batch = image_path_batch 288 | 289 | def run(self): 290 | batch_image_raw, use_time = self.yolov5_wrapper.infer( 291 | self.yolov5_wrapper.get_raw_image(self.image_path_batch)) 292 | for i, img_path in enumerate(self.image_path_batch): 293 | filename = os.path.basename(img_path) 294 | save_name = os.path.join('output', filename) 295 | # Save image 296 | cv2.imwrite(save_name, batch_image_raw[i]) 297 | print('input->{}, time->{:.2f}ms, saving into output/'.format( 298 | self.image_path_batch, use_time * 1000)) 299 | 300 | 301 | class warmUpThread(threading.Thread): 302 | def __init__(self, yolov5_wrapper): 303 | threading.Thread.__init__(self) 304 | self.yolov5_wrapper = yolov5_wrapper 305 | 306 | def run(self): 307 | batch_image_raw, use_time = self.yolov5_wrapper.infer( 308 | self.yolov5_wrapper.get_raw_image_zeros()) 309 | print( 310 | 'warm_up->{}, time->{:.2f}ms'.format(batch_image_raw[0].shape, use_time * 1000)) 311 | 312 | 313 | if __name__ == "__main__": 314 | TRT_LOGGER = 
trt.Logger(trt.Logger.INFO) 315 | # load tensorrt plugins 316 | trt.init_libnvinfer_plugins(TRT_LOGGER, "") 317 | # load custom plugin and engine 318 | PLUGIN_LIBRARY = "build/libyoloplugin.so" 319 | engine_file_path = "build/yolov5s.engine" 320 | 321 | if len(sys.argv) > 1: 322 | engine_file_path = sys.argv[1] 323 | if len(sys.argv) > 2: 324 | PLUGIN_LIBRARY = sys.argv[2] 325 | 326 | ctypes.CDLL(PLUGIN_LIBRARY) 327 | 328 | # load coco labels 329 | 330 | categories = ["person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", 331 | "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", 332 | "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", 333 | "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", 334 | "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", 335 | "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", 336 | "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", 337 | "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", 338 | "hair drier", "toothbrush"] 339 | 340 | if os.path.exists('output/'): 341 | shutil.rmtree('output/') 342 | os.makedirs('output/') 343 | # a YoLov5TRT instance 344 | yolov5_wrapper = YoLov5TRT(engine_file_path) 345 | try: 346 | print('batch size is', yolov5_wrapper.batch_size) 347 | 348 | image_dir = "samples/" 349 | image_path_batches = get_img_path_batches( 350 | yolov5_wrapper.batch_size, image_dir) 351 | 352 | # for i in range(10): 353 | # # create a new thread to do warm_up 354 | # thread1 = warmUpThread(yolov5_wrapper) 355 | # thread1.start() 356 | # thread1.join() 357 | for batch in image_path_batches: 358 | # create a new thread to do inference 359 | thread1 = inferThread(yolov5_wrapper, batch) 360 | thread1.start() 361 | thread1.join() 362 | finally: 363 | # destroy the instance 364 | yolov5_wrapper.destroy() 365 | --------------------------------------------------------------------------------