├── 1.jpg ├── 2.jpg ├── 3.jpg ├── calibaration.txt ├── README.md ├── yolov3-tiny-trt-fp32.cpp └── yolov3-tiny-trt-int8.cpp /1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BBuf/yolov3-tiny-onnx-TensorRT/HEAD/1.jpg -------------------------------------------------------------------------------- /2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BBuf/yolov3-tiny-onnx-TensorRT/HEAD/2.jpg -------------------------------------------------------------------------------- /3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BBuf/yolov3-tiny-onnx-TensorRT/HEAD/3.jpg -------------------------------------------------------------------------------- /calibaration.txt: -------------------------------------------------------------------------------- 1 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000001.jpg 2 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000002.jpg 3 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000003.jpg 4 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000004.jpg 5 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000005.jpg 6 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000006.jpg 7 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000007.jpg 8 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000008.jpg 9 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000009.jpg 10 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000010.jpg 11 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000011.jpg 12 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000012.jpg 13 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000013.jpg 14 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000014.jpg 15 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000015.jpg 16 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000016.jpg 17 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000017.jpg 18 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000018.jpg 19 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000019.jpg 20 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000020.jpg 21 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000021.jpg 22 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000022.jpg 23 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000023.jpg 24 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000024.jpg 25 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000025.jpg 26 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000026.jpg 27 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000027.jpg 28 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000028.jpg 29 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000029.jpg 30 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000030.jpg 31 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000031.jpg 32 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000032.jpg 33 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000033.jpg 34 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000034.jpg 35 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000035.jpg 36 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000036.jpg 37 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000037.jpg 38 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000038.jpg 39 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000039.jpg 40 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000040.jpg 41 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # yolov3-tiny-onnx-TensorRT 2 | ## Requirements 3 | 4 | - 
TensorRT 6.0
- VS 2015 / CLion
- CUDA 9.0 + cuDNN 7.6

## Model Converter

To convert your Darknet yolov3-tiny model to ONNX, follow these steps.

## Requirements

- python=2.7
- numpy=1.16.1
- onnx=1.4.1 (important)
- pycuda=2019.1.1
- Pillow=6.1.0
- wget=3.2

## custom settings

data_processing.py:
  line14: LABEL_FILE_PATH = '/home/nvidia/yolov3-tiny2onnx2trt/coco_labels.txt'
  line19: CATEGORY_NUM = 80

yolov3_to_onnx.py:
  line778: img_size = 416
  line784: cfg_file_path = '/home/nvidia/yolov3-tiny2onnx2trt/yolov3-tiny.cfg'
  line811: weights_file_path = '/home/nvidia/yolov3-tiny2onnx2trt/yolov3-tiny.weights'
  line826: output_file_path = 'yolov3-tiny.onnx'

onnx_to_tensorrt.py:
  line39: input_size = 416
  line40: batch_size = 1
  line42~line46:
    onnx_file_path = 'yolov3-tiny.onnx'
    engine_file_path = 'yolov3-tiny.trt'
    input_file_list = '/home/nvidia/yolov3-tiny2onnx2trt/imagelist.txt'
    IMAGE_PATH = '/home/nvidia/yolov3-tiny2onnx2trt/images/'
    save_path = '/home/nvidia/yolov3-tiny2onnx2trt/'

## notes (very important!)

0. The onnx version must be 1.4.1. If it is not, run:

   pip uninstall onnx
   pip install onnx==1.4.1

1. The last line of the cfg file must be blank. If there is no blank line at the end of the file, press Enter to add one.

## steps

0. Put your .weights file in the folder:

   |-yolov3-tiny2onnx2trt
     |-yolov3-tiny.weights

1. Change your settings as described in "custom settings".

2. Run:

   cd yolov3-tiny2onnx2trt
   python yolov3_to_onnx.py

   You will get a yolov3-tiny.onnx file.

3. Run:

   python onnx_to_tensorrt.py

   You will get a yolov3-tiny.trt file and some images with the inference results drawn on them.
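The yolov3-tiny.trt file produced by onnx_to_tensorrt.py is a serialized TensorRT plan, and the C++ samples below simply deserialize it before running inference. As a quick orientation, here is a minimal sketch of that load path, assuming the TensorRT 6 API; `loadEngine` is only an illustrative name, and unlike `readTrtFile` in yolov3-tiny-trt-fp32.cpp it passes a null plugin factory instead of the ONNX parser's one:

```c++
#include <fstream>
#include <string>
#include <vector>
#include "NvInfer.h"

// Read a serialized TensorRT engine (plan file) from disk and deserialize it.
nvinfer1::ICudaEngine* loadEngine(const std::string& path, nvinfer1::ILogger& logger)
{
    std::ifstream file(path, std::ios::binary | std::ios::ate);
    if (!file) return nullptr;
    size_t size = static_cast<size_t>(file.tellg());
    file.seekg(0, std::ios::beg);
    std::vector<char> blob(size);
    file.read(blob.data(), size);

    nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(logger);
    // TensorRT 6 signature; the last argument is an optional plugin factory.
    return runtime->deserializeCudaEngine(blob.data(), size, nullptr);
}
```

In the full samples the runtime and engine are kept alive for the whole inference pass and destroyed explicitly at the end.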
# TensorRT FP32 Inference

- Run yolov3-tiny-trt-fp32.cpp (you can change the number of categories yourself).

- The visualization results are as follows:

![FP32 detection result](https://img-blog.csdnimg.cn/20200409142919546.jpg?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L2p1c3Rfc29ydA==,size_16,color_FFFFFF,t_70#pic_center)

![FP32 detection result](https://img-blog.csdnimg.cn/20200409143133305.jpg?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L2p1c3Rfc29ydA==,size_16,color_FFFFFF,t_70#pic_center)

![FP32 detection result](https://img-blog.csdnimg.cn/20200409143229456.jpg?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L2p1c3Rfc29ydA==,size_16,color_FFFFFF,t_70#pic_center)

# TensorRT INT8 Calibration

- Prepare the calibration data (a *.txt list of image paths), like this:

![calibration image list](https://img-blog.csdnimg.cn/20200409151326680.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L2p1c3Rfc29ydA==,size_16,color_FFFFFF,t_70)

- Create a class that inherits IInt8EntropyCalibrator; the code is as follows:

```c++
namespace nvinfer1 {
    class int8EntroyCalibrator : public nvinfer1::IInt8EntropyCalibrator {
    public:
        int8EntroyCalibrator(const int &bacthSize,
            const std::string &imgPath,
            const std::string &calibTablePath);

        virtual ~int8EntroyCalibrator();

        int getBatchSize() const override { return batchSize; }

        bool getBatch(void *bindings[], const char *names[], int nbBindings) override;

        const void *readCalibrationCache(std::size_t &length) override;

        void writeCalibrationCache(const void *ptr, std::size_t length) override;

    private:
        bool forwardFace;

        int batchSize;
        size_t inputCount;
        size_t imageIndex;

        std::string calibTablePath;
        std::vector<std::string> imgPaths;

        float *batchData{ nullptr };
        void *deviceInput{ nullptr };

        bool readCache;
        std::vector<char> calibrationCache;
    };

    int8EntroyCalibrator::int8EntroyCalibrator(const int &bacthSize, const std::string &imgPath,
        const std::string &calibTablePath) : batchSize(bacthSize), calibTablePath(calibTablePath), imageIndex(0), forwardFace(false),
        readCache(true) {   // reuse an existing calibration table if one is found
        int inputChannel = 3;
        int inputH = 416;
        int inputW = 416;
        inputCount = bacthSize * inputChannel * inputH * inputW;
        std::fstream f(imgPath);
        if (f.is_open()) {
            std::string temp;
            while (std::getline(f, temp)) imgPaths.push_back(temp);
        }
        int len = imgPaths.size();
        for (int i = 0; i < len; i++) {
            cout << imgPaths[i] << endl;
        }
        batchData = new float[inputCount];
        CHECK(cudaMalloc(&deviceInput, inputCount * sizeof(float)));
    }

    int8EntroyCalibrator::~int8EntroyCalibrator() {
        CHECK(cudaFree(deviceInput));
        if (batchData)
            delete[] batchData;
    }

    bool int8EntroyCalibrator::getBatch(void **bindings, const char **names, int nbBindings) {
        cout << imageIndex << " " << batchSize << endl;
        cout << imgPaths.size() << endl;
        if (imageIndex + batchSize > int(imgPaths.size()))
            return false;
        // load batch: pack each preprocessed image back-to-back into the host buffer
        float* ptr = batchData;
        for (size_t j = imageIndex; j < imageIndex + batchSize; ++j)
        {
            //cout << imgPaths[j] << endl;
            Mat img = cv::imread(imgPaths[j]);
            vector<float> inputData = prepareImage(img);
            cout << inputData.size() << endl;
            cout << inputCount << endl;
            if ((int)(inputData.size()) != inputCount)
            {
                std::cout << "InputSize error. check include/ctdetConfig.h" << std::endl;
                return false;
            }
            assert(inputData.size() == inputCount);
            int len = (int)(inputData.size());
            memcpy(ptr, inputData.data(), len * sizeof(float));

            ptr += inputData.size();
            std::cout << "load image " << imgPaths[j] << " " << (j + 1)*100. / imgPaths.size() << "%" << std::endl;
        }
        imageIndex += batchSize;
        CHECK(cudaMemcpy(deviceInput, batchData, inputCount * sizeof(float), cudaMemcpyHostToDevice));
        bindings[0] = deviceInput;
        return true;
    }

    const void* int8EntroyCalibrator::readCalibrationCache(std::size_t &length)
    {
        calibrationCache.clear();
        std::ifstream input(calibTablePath, std::ios::binary);
        input >> std::noskipws;
        if (readCache && input.good())
            std::copy(std::istream_iterator<char>(input), std::istream_iterator<char>(),
                std::back_inserter(calibrationCache));

        length = calibrationCache.size();
        return length ? &calibrationCache[0] : nullptr;
    }

    void int8EntroyCalibrator::writeCalibrationCache(const void *cache, std::size_t length)
    {
        std::ofstream output(calibTablePath, std::ios::binary);
        output.write(reinterpret_cast<const char*>(cache), length);
    }
}
```

- Change the onnxToTRTModel function from yolov3-tiny-trt-fp32.cpp; the code is as follows:

```c++
bool onnxToTRTModel(const std::string& modelFile,
    const std::string& filename,
    IHostMemory*& trtModelStream) // output buffer for the TensorRT model
{
    IBuilder* builder = createInferBuilder(gLogger.getTRTLogger());
    assert(builder != nullptr);
    nvinfer1::INetworkDefinition* network = builder->createNetwork();

    if (!builder->platformHasFastInt8()) return false;

    auto parser = nvonnxparser::createParser(*network, gLogger.getTRTLogger());

    //config->setPrintLayerInfo(true);
    //parser->reportParsingInfo();

    if (!parser->parseFromFile(modelFile.c_str(), static_cast<int>(gLogger.getReportableSeverity())))
    {
        gLogError << "Failure while parsing ONNX file" << std::endl;
        return false;
    }

    builder->setMaxBatchSize(BATCH_SIZE);
    builder->setMaxWorkspaceSize(1 << 30);

    nvinfer1::int8EntroyCalibrator *calibrator = nullptr;
    if (calibFile.size() > 0) calibrator = new nvinfer1::int8EntroyCalibrator(BATCH_SIZE, calibFile, "F:/TensorRT-6.0.1.5/data/v3tiny/calib.table");

    //builder->setFp16Mode(true);
    std::cout << "setInt8Mode" << std::endl;
    if (!builder->platformHasFastInt8())
        std::cout << "Notice: the platform does not have fast INT8" << std::endl;
    builder->setInt8Mode(true);
    builder->setInt8Calibrator(calibrator);
    /*if (gArgs.runInInt8)
    {
        samplesCommon::setAllTensorScales(network, 127.0f, 127.0f);
    }*/
    //samplesCommon::setAllTensorScales(network, 1.0f, 1.0f);
    cout << "start building engine" << endl;
    ICudaEngine* engine = builder->buildCudaEngine(*network);
    cout << "build engine done" << endl;
    assert(engine);
    if
(calibrator) { 271 | delete calibrator; 272 | calibrator = nullptr; 273 | } 274 | parser->destroy(); 275 | 276 | trtModelStream = engine->serialize(); 277 | 278 | nvinfer1::IHostMemory* data = engine->serialize(); 279 | std::ofstream file; 280 | file.open(filename, std::ios::binary | std::ios::out); 281 | cout << "writing engine file..." << endl; 282 | file.write((const char*)data->data(), data->size()); 283 | cout << "save engine file done" << endl; 284 | file.close(); 285 | 286 | engine->destroy(); 287 | network->destroy(); 288 | builder->destroy(); 289 | 290 | return true 291 | ``` 292 | 293 | 294 | 295 | - Finally you can get a INT8 TensorRT model,enjoy it。 296 | 297 | 298 | 299 | # Accuracy And Speed 300 | 301 | - GTX 1050 Ti 302 | 303 | | YOLOV3-Tiny TRT模型 | mAP(50) | Inference Time | 304 | | ------------------- | ------- | -------------- | 305 | | FP32 | 95.0% | 42ms | 306 | | INT8 | 95.0% | 10ms | 307 | 308 | 309 | 310 | # Reference 311 | 312 | - https://github.com/zombie0117/yolov3-tiny-onnx-TensorRT 313 | - https://mp.weixin.qq.com/s/rYuodkH-tf-q4uZ0QAkuAw 314 | - https://mp.weixin.qq.com/s/huP2J565irXXU7SSIk-Hwg 315 | - https://mp.weixin.qq.com/s/9WKJi4AnOFKKqvK8R9ph1g 316 | - https://mp.weixin.qq.com/s/QcotYLHVVkf5sEvgKZKemg 317 | - https://mp.weixin.qq.com/s/WiVhlR9-rpe-O9J9ULc_bA -------------------------------------------------------------------------------- /yolov3-tiny-trt-fp32.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include "NvInfer.h" 14 | #include "NvOnnxParser.h" 15 | #include "NvOnnxParserRuntime.h" 16 | #include "argsParser.h" 17 | #include "logger.h" 18 | #include "common.h" 19 | 20 | using namespace std; 21 | using namespace nvinfer1; 22 | using namespace nvonnxparser; 23 | using namespace cv; 24 | 25 | 26 | // origin params 27 | samplesCommon::Args gArgs; 28 | 29 | // Res params 30 | string onnxFile = "F:/TensorRT-6.0.1.5/data/v3tiny/yolov3-tiny.onnx"; 31 | string engineFile = "F:/TensorRT-6.0.1.5/data/v3tiny/yolov3-tiny.trt"; 32 | 33 | vector labels = { "abnormal" }; 34 | const int Classes = 13; 35 | vector > output_shape = { { 1, 15 + 3 * Classes , 13, 13 },{ 1, 15 + 3 * Classes, 26, 26 } }; 36 | vector > g_masks = { { 3, 4, 5 },{ 0, 1, 2 } }; 37 | vector > g_anchors = { { 10, 14 },{ 23, 27 },{ 37, 58 },{ 81, 82 },{ 135, 169 },{ 344, 319 } }; 38 | float obj_threshold = 0.10; 39 | float nms_threshold = 0.45; 40 | 41 | int CATEGORY = 1; 42 | int BATCH_SIZE = 1; 43 | int INPUT_CHANNEL = 3; 44 | int DETECT_WIDTH = 416; 45 | int DETECT_HEIGHT = 416; 46 | 47 | // Res struct & function 48 | typedef struct DetectionRes { 49 | float x, y, w, h, prob; 50 | } DetectionRes; 51 | 52 | float sigmoid(float in) { 53 | return 1.f / (1.f + exp(-in)); 54 | } 55 | float exponential(float in) { 56 | return exp(in); 57 | } 58 | 59 | float* merge(float* out1, float* out2, int bsize_out1, int bsize_out2) 60 | { 61 | float* out_total = new float[bsize_out1 + bsize_out2]; 62 | 63 | for (int j = 0; j < bsize_out1; ++j) 64 | { 65 | int index = j; 66 | out_total[index] = out1[j]; 67 | } 68 | 69 | for (int j = 0; j < bsize_out2; ++j) 70 | { 71 | int index = j + bsize_out1; 72 | out_total[index] = out2[j]; 73 | } 74 | return out_total; 75 | } 76 | 77 | vector split(const string& str, char delim) 78 | { 79 | stringstream ss(str); 80 | string token; 81 | vector container; 82 | while (getline(ss, 
token, delim)) 83 | { 84 | container.push_back(token); 85 | } 86 | 87 | return container; 88 | } 89 | 90 | 91 | 92 | void DoNms(vector& detections, float nmsThresh) { 93 | auto iouCompute = [](float * lbox, float* rbox) { 94 | float interBox[] = { 95 | max(lbox[0], rbox[0]), //left 96 | min(lbox[0] + lbox[2], rbox[0] + rbox[2]), //right 97 | max(lbox[1], rbox[1]), //top 98 | min(lbox[1] + lbox[3], rbox[1] + rbox[3]), //bottom 99 | }; 100 | 101 | if (interBox[2] >= interBox[3] || interBox[0] >= interBox[1]) 102 | return 0.0f; 103 | 104 | float interBoxS = (interBox[1] - interBox[0] + 1) * (interBox[3] - interBox[2] + 1); 105 | return interBoxS / (lbox[2] * lbox[3] + rbox[2] * rbox[3] - interBoxS); 106 | }; 107 | 108 | sort(detections.begin(), detections.end(), [=](const DetectionRes & left, const DetectionRes & right) { 109 | return left.prob > right.prob; 110 | }); 111 | 112 | vector result; 113 | for (unsigned int m = 0; m < detections.size(); ++m) { 114 | result.push_back(detections[m]); 115 | for (unsigned int n = m + 1; n < detections.size(); ++n) { 116 | if (iouCompute((float *)(&detections[m]), (float *)(&detections[n])) > nmsThresh) { 117 | detections.erase(detections.begin() + n); 118 | --n; 119 | } 120 | } 121 | } 122 | detections = move(result); 123 | } 124 | 125 | vector postProcess(cv::Mat& image, float * output) { 126 | vector detections; 127 | int total_size = 0; 128 | for (int i = 0; i < output_shape.size(); i++) { 129 | auto shape = output_shape[i]; 130 | int size = 1; 131 | for (int j = 0; j < shape.size(); j++) { 132 | size *= shape[j]; 133 | } 134 | total_size += size; 135 | } 136 | 137 | int offset = 0; 138 | float * transposed_output = new float[total_size]; 139 | float * transposed_output_t = transposed_output; 140 | for (int i = 0; i < output_shape.size(); i++) { 141 | auto shape = output_shape[i]; // nchw 142 | int chw = shape[1] * shape[2] * shape[3]; 143 | int hw = shape[2] * shape[3]; 144 | for (int n = 0; n < shape[0]; n++) { 145 | int offset_n = offset + n * chw; 146 | for (int h = 0; h < shape[2]; h++) { 147 | for (int w = 0; w < shape[3]; w++) { 148 | int h_w = h * shape[3] + w; 149 | for (int c = 0; c < shape[1]; c++) { 150 | int offset_c = offset_n + hw * c + h_w; 151 | *transposed_output_t++ = output[offset_c]; 152 | } 153 | } 154 | } 155 | } 156 | offset += shape[0] * chw; 157 | } 158 | vector > shapes; 159 | for (int i = 0; i < output_shape.size(); i++) { 160 | auto shape = output_shape[i]; 161 | vector tmp = { shape[2], shape[3], 3, 5 + Classes }; 162 | shapes.push_back(tmp); 163 | } 164 | 165 | offset = 0; 166 | for (int i = 0; i < output_shape.size(); i++) { 167 | auto masks = g_masks[i]; 168 | vector > anchors; 169 | for (auto mask : masks) 170 | anchors.push_back(g_anchors[mask]); 171 | auto shape = shapes[i]; 172 | for (int h = 0; h < shape[0]; h++) { 173 | int offset_h = offset + h * shape[1] * shape[2] * shape[3]; 174 | for (int w = 0; w < shape[1]; w++) { 175 | int offset_w = offset_h + w * shape[2] * shape[3]; 176 | for (int c = 0; c < shape[2]; c++) { 177 | int offset_c = offset_w + c * shape[3]; 178 | float * ptr = transposed_output + offset_c; 179 | //ptr[4] = sigmoid(ptr[4]); 180 | //ptr[5] = sigmoid(ptr[5]); 181 | float score = 0; 182 | for (int i = 5; i < 5 + Classes; i++) { 183 | score = max(score, sigmoid(ptr[4]) * sigmoid(ptr[i])); 184 | } 185 | 186 | if (score < obj_threshold) 187 | continue; 188 | ptr[0] = sigmoid(ptr[0]); 189 | ptr[1] = sigmoid(ptr[1]); 190 | ptr[2] = exponential(ptr[2]) * anchors[c][0]; 191 | ptr[3] = 
exponential(ptr[3]) * anchors[c][1]; 192 | 193 | ptr[0] += w; 194 | ptr[1] += h; 195 | ptr[0] /= shape[0]; 196 | ptr[1] /= shape[1]; 197 | ptr[2] /= DETECT_WIDTH; 198 | ptr[3] /= DETECT_WIDTH; 199 | ptr[0] -= ptr[2] / 2; 200 | ptr[1] -= ptr[3] / 2; 201 | 202 | DetectionRes det;; 203 | det.x = ptr[0]; 204 | det.y = ptr[1]; 205 | det.w = ptr[2]; 206 | det.h = ptr[3]; 207 | det.prob = score; 208 | detections.push_back(det); 209 | } 210 | } 211 | } 212 | offset += shape[0] * shape[1] * shape[2] * shape[3]; 213 | } 214 | delete[]transposed_output; 215 | 216 | int h = DETECT_WIDTH; //net h 217 | int w = DETECT_WIDTH; //net w 218 | 219 | //scale bbox to img 220 | int width = image.cols; 221 | int height = image.rows; 222 | float scale = min(float(w) / width, float(h) / height); 223 | float scaleSize[] = { width * scale, height * scale }; 224 | 225 | //correct box 226 | for (auto& bbox : detections) { 227 | bbox.x = (bbox.x * w - (w - scaleSize[0]) / 2.f) / scale; 228 | bbox.y = (bbox.y * h - (h - scaleSize[1]) / 2.f) / scale; 229 | bbox.w *= w; 230 | bbox.h *= h; 231 | bbox.w /= scale; 232 | bbox.h /= scale; 233 | } 234 | 235 | //nms 236 | float nmsThresh = nms_threshold; 237 | if (nmsThresh > 0) 238 | DoNms(detections, nmsThresh); 239 | 240 | return detections; 241 | } 242 | 243 | 244 | // prepare img 245 | vector prepareImage(cv::Mat& img) { 246 | int c = 3; 247 | int h = DETECT_WIDTH; //net h 248 | int w = DETECT_WIDTH; //net w 249 | 250 | float scale = min(float(w) / img.cols, float(h) / img.rows); 251 | auto scaleSize = cv::Size(img.cols * scale, img.rows * scale); 252 | 253 | cv::Mat rgb; 254 | cv::cvtColor(img, rgb, CV_BGR2RGB); 255 | cv::Mat resized; 256 | cv::resize(rgb, resized, scaleSize, 0, 0, INTER_CUBIC); 257 | 258 | cv::Mat cropped(h, w, CV_8UC3, 127); 259 | Rect rect((w - scaleSize.width) / 2, (h - scaleSize.height) / 2, scaleSize.width, scaleSize.height); 260 | resized.copyTo(cropped(rect)); 261 | 262 | cv::Mat img_float; 263 | cropped.convertTo(img_float, CV_32FC3, 1.f / 255.0); 264 | 265 | 266 | //HWC TO CHW 267 | vector input_channels(c); 268 | cv::split(img_float, input_channels); 269 | 270 | vector result(h * w * c); 271 | auto data = result.data(); 272 | int channelLength = h * w; 273 | for (int i = 0; i < c; ++i) { 274 | memcpy(data, input_channels[i].data, channelLength * sizeof(float)); 275 | data += channelLength; 276 | } 277 | return result; 278 | } 279 | 280 | 281 | // load engine file 282 | bool readTrtFile(const std::string& engineFile, //name of the engine file 283 | IHostMemory*& trtModelStream) //output buffer for the TensorRT model 284 | { 285 | using namespace std; 286 | fstream file; 287 | cout << "loading filename from:" << engineFile << endl; 288 | nvinfer1::IRuntime* trtRuntime; 289 | nvonnxparser::IPluginFactory* onnxPlugin = createPluginFactory(gLogger.getTRTLogger()); 290 | file.open(engineFile, ios::binary | ios::in); 291 | file.seekg(0, ios::end); 292 | int length = file.tellg(); 293 | //cout << "length:" << length << endl; 294 | file.seekg(0, ios::beg); 295 | std::unique_ptr data(new char[length]); 296 | file.read(data.get(), length); 297 | file.close(); 298 | cout << "load engine done" << endl; 299 | std::cout << "deserializing" << endl; 300 | trtRuntime = createInferRuntime(gLogger.getTRTLogger()); 301 | ICudaEngine* engine = trtRuntime->deserializeCudaEngine(data.get(), length, onnxPlugin); 302 | cout << "deserialize done" << endl; 303 | trtModelStream = engine->serialize(); 304 | 305 | return true; 306 | } 307 | 308 | 309 | // ONNX模型转为TensorRT引擎 
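// Convert the ONNX model into a TensorRT engine: create a builder and network,
// parse the ONNX file, set the max batch size and workspace size, optionally
// enable FP16/INT8, build the engine, then serialize it both to the output
// stream and to an engine file on disk.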
310 | bool onnxToTRTModel(const std::string& modelFile, // onnx文件的名字 311 | const std::string& filename, // TensorRT引擎的名字 312 | IHostMemory*& trtModelStream) // output buffer for the TensorRT model 313 | { 314 | // 创建builder 315 | IBuilder* builder = createInferBuilder(gLogger.getTRTLogger()); 316 | assert(builder != nullptr); 317 | nvinfer1::INetworkDefinition* network = builder->createNetwork(); 318 | 319 | // 解析ONNX模型 320 | auto parser = nvonnxparser::createParser(*network, gLogger.getTRTLogger()); 321 | 322 | 323 | //可选的 - 取消下面的注释可以查看网络中每层的星系信息 324 | //config->setPrintLayerInfo(true); 325 | //parser->reportParsingInfo(); 326 | 327 | //判断是否成功解析ONNX模型 328 | if (!parser->parseFromFile(modelFile.c_str(), static_cast(gLogger.getReportableSeverity()))) 329 | { 330 | gLogError << "Failure while parsing ONNX file" << std::endl; 331 | return false; 332 | } 333 | 334 | // 建立推理引擎 335 | builder->setMaxBatchSize(BATCH_SIZE); 336 | builder->setMaxWorkspaceSize(1 << 30); 337 | builder->setFp16Mode(true); 338 | builder->setInt8Mode(gArgs.runInInt8); 339 | 340 | if (gArgs.runInInt8) 341 | { 342 | samplesCommon::setAllTensorScales(network, 127.0f, 127.0f); 343 | } 344 | 345 | cout << "start building engine" << endl; 346 | ICudaEngine* engine = builder->buildCudaEngine(*network); 347 | cout << "build engine done" << endl; 348 | assert(engine); 349 | 350 | // 销毁模型解释器 351 | parser->destroy(); 352 | 353 | // 序列化引擎 354 | trtModelStream = engine->serialize(); 355 | 356 | // 保存引擎 357 | nvinfer1::IHostMemory* data = engine->serialize(); 358 | std::ofstream file; 359 | file.open(filename, std::ios::binary | std::ios::out); 360 | cout << "writing engine file..." << endl; 361 | file.write((const char*)data->data(), data->size()); 362 | cout << "save engine file done" << endl; 363 | file.close(); 364 | 365 | // 销毁所有相关的东西 366 | engine->destroy(); 367 | network->destroy(); 368 | builder->destroy(); 369 | 370 | return true; 371 | } 372 | 373 | inline int64_t volume(const nvinfer1::Dims& d) 374 | { 375 | return std::accumulate(d.d, d.d + d.nbDims, 1, std::multiplies()); 376 | } 377 | 378 | inline unsigned int getElementSize(nvinfer1::DataType t) 379 | { 380 | switch (t) 381 | { 382 | case nvinfer1::DataType::kINT32: return 4; 383 | case nvinfer1::DataType::kFLOAT: return 4; 384 | case nvinfer1::DataType::kHALF: return 2; 385 | case nvinfer1::DataType::kINT8: return 1; 386 | } 387 | throw std::runtime_error("Invalid DataType."); 388 | return 0; 389 | } 390 | 391 | //执行前向推理 392 | void doInferenceFrieza(IHostMemory* trtModelStream) 393 | { 394 | //get engine 395 | assert(trtModelStream != nullptr); 396 | IRuntime* runtime = createInferRuntime(gLogger); 397 | nvonnxparser::IPluginFactory* onnxPlugin = createPluginFactory(gLogger.getTRTLogger()); 398 | assert(runtime != nullptr); 399 | if (gArgs.useDLACore >= 0) 400 | { 401 | runtime->setDLACore(gArgs.useDLACore); 402 | } 403 | ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream->data(), trtModelStream->size(), onnxPlugin); 404 | 405 | // 创建推理引擎 406 | assert(engine != nullptr); 407 | trtModelStream->destroy(); 408 | IExecutionContext* context = engine->createExecutionContext(); 409 | assert(context != nullptr); 410 | 411 | //读取输入数据到缓冲区管理对象中 412 | assert(engine->getNbBindings() == 3); 413 | void* buffers[3]; 414 | std::vector bufferSize; 415 | int nbBindings = engine->getNbBindings(); 416 | bufferSize.resize(nbBindings); 417 | 418 | for (int i = 0; i < nbBindings; ++i) 419 | { 420 | nvinfer1::Dims dims = engine->getBindingDimensions(i); 421 | 
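// Binding 0 is the 3x416x416 input; bindings 1 and 2 are the two YOLO output
// heads (13x13 and 26x26 grids, matching output_shape above). Each buffer is
// sized as volume(dims) * element size.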
nvinfer1::DataType dtype = engine->getBindingDataType(i); 422 | int64_t totalSize = volume(dims) * 1 * getElementSize(dtype); 423 | bufferSize[i] = totalSize; 424 | CHECK(cudaMalloc(&buffers[i], totalSize)); 425 | } 426 | 427 | // 创建CUDA流以执行此推断 428 | cudaStream_t stream; 429 | CHECK(cudaStreamCreate(&stream)); 430 | 431 | //define inputImgs inputData outputDetections ... 432 | //vector inputData; 433 | //inputData.reserve(DETECT_HEIGHT*DETECT_WIDTH*INPUT_CHANNEL*BATCH_SIZE); 434 | vector inputImgs; 435 | vector outputs; 436 | int outSize1 = bufferSize[1] / sizeof(float); 437 | int outSize2 = bufferSize[2] / sizeof(float); 438 | float* out1 = new float[outSize1]; 439 | float* out2 = new float[outSize2]; 440 | 441 | int index = 1, 442 | batchCount = 0; 443 | 444 | cv::Mat img = cv::imread("F:/TensorRT-6.0.1.5/data/v3tiny/4.jpg"); 445 | inputImgs.push_back(img); 446 | auto t_start_pre = std::chrono::high_resolution_clock::now(); 447 | vector curInput = prepareImage(img); 448 | auto t_end_pre = std::chrono::high_resolution_clock::now(); 449 | float total_pre = std::chrono::duration(t_end_pre - t_start_pre).count(); 450 | std::cout << "prepare image take: " << total_pre << " ms." << endl; 451 | 452 | /* 453 | inputData.insert(inputData.end(), curInput.begin(), curInput.end()); 454 | batchCount++; 455 | if (batchCount < BATCH_SIZE && i + 1 < fileNames.size()) 456 | continue; 457 | */ 458 | 459 | // DMA the input to the GPU, execute the batch asynchronously, and DMA it back: 460 | // 将数据从主机输入缓冲区异步复制到设备输入缓冲区 461 | CHECK(cudaMemcpyAsync(buffers[0], curInput.data(), bufferSize[0], cudaMemcpyHostToDevice, stream)); 462 | 463 | // 执行推理 464 | auto t_start = std::chrono::high_resolution_clock::now(); 465 | context->execute(BATCH_SIZE, buffers); 466 | auto t_end = std::chrono::high_resolution_clock::now(); 467 | float total = std::chrono::duration(t_end - t_start).count(); 468 | std::cout << "Inference take: " << total << " ms." << endl; 469 | 470 | CHECK(cudaMemcpyAsync(out1, buffers[1], bufferSize[1], cudaMemcpyDeviceToHost, stream)); 471 | CHECK(cudaMemcpyAsync(out2, buffers[2], bufferSize[2], cudaMemcpyDeviceToHost, stream)); 472 | cudaStreamSynchronize(stream); 473 | 474 | float* out = new float[outSize1 + outSize2]; 475 | out = merge(out1, out2, outSize1, outSize2); 476 | 477 | // postprocess 478 | auto t_start_post = std::chrono::high_resolution_clock::now(); 479 | auto boxes = postProcess(img, out); 480 | auto t_end_post = std::chrono::high_resolution_clock::now(); 481 | float total_post = std::chrono::duration(t_end_post - t_start_post).count(); 482 | std::cout << "Postprocess take: " << total_post << " ms." 
<< endl; 483 | 484 | //print boxes 485 | for (int i = 0; i < boxes.size(); ++i) 486 | { 487 | cout << boxes[i].prob << ", " << boxes[i].x << ", " << boxes[i].y << ", " << boxes[i].w << ", " << boxes[i].h << endl; 488 | int x = boxes[i].x, 489 | y = boxes[i].y, 490 | w = boxes[i].w, 491 | h = boxes[i].h; 492 | cv::Rect rect = { x, y, w, h }; 493 | cv::rectangle(img, rect, cv::Scalar(255, 255, 0), 2); 494 | } 495 | 496 | cout << "\n" << endl; 497 | 498 | 499 | // release the stream and the buffers 500 | cudaStreamDestroy(stream); 501 | CHECK(cudaFree(buffers[0])); 502 | CHECK(cudaFree(buffers[1])); 503 | CHECK(cudaFree(buffers[2])); 504 | 505 | // destroy the engine 506 | context->destroy(); 507 | engine->destroy(); 508 | runtime->destroy(); 509 | 510 | cv::imshow("result", img); 511 | cv::imwrite("F:\\res.jpg", img); 512 | waitKey(0); 513 | 514 | } 515 | 516 | int main() 517 | { 518 | // read imgs list 519 | 520 | // create a TensorRT model from the onnx model and serialize it to a stream 521 | IHostMemory* trtModelStream{ nullptr }; 522 | 523 | // create and load engine 524 | fstream existEngine; 525 | existEngine.open(engineFile, ios::in); 526 | if (existEngine) 527 | { 528 | readTrtFile(engineFile, trtModelStream); 529 | assert(trtModelStream != nullptr); 530 | } 531 | else 532 | { 533 | onnxToTRTModel(onnxFile, engineFile, trtModelStream); 534 | assert(trtModelStream != nullptr); 535 | } 536 | 537 | //onnxToTRTModel(onnxFile, engineFile, trtModelStream); 538 | 539 | //do inference 540 | doInferenceFrieza(trtModelStream); 541 | 542 | return 0; 543 | } -------------------------------------------------------------------------------- /yolov3-tiny-trt-int8.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "EntropyCalibrator.h" 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #include "NvInfer.h" 15 | #include "NvOnnxParser.h" 16 | #include "NvOnnxParserRuntime.h" 17 | #include "argsParser.h" 18 | #include "logger.h" 19 | #include "common.h" 20 | 21 | using namespace std; 22 | using namespace nvinfer1; 23 | using namespace nvonnxparser; 24 | using namespace cv; 25 | #include "NvInfer.h" 26 | #include 27 | #include 28 | 29 | // origin params 30 | samplesCommon::Args gArgs; 31 | 32 | // Res params 33 | string onnxFile = "F:/TensorRT-6.0.1.5/data/v3tiny/yolov3-tiny.onnx"; 34 | string engineFile = "F:/TensorRT-6.0.1.5/data/v3tiny/yolov3-tiny.trt"; 35 | string calibFile = "F:/TensorRT-6.0.1.5/data/v3tiny/calibaration.txt"; 36 | 37 | vector labels = { "abnormal" }; 38 | const int Classes = 13; 39 | vector > output_shape = { { 1, 15 + 3 * Classes , 13, 13 },{ 1, 15 + 3 * Classes, 26, 26 } }; 40 | vector > g_masks = { { 3, 4, 5 },{ 0, 1, 2 } }; 41 | vector > g_anchors = { { 10, 14 },{ 23, 27 },{ 37, 58 },{ 81, 82 },{ 135, 169 },{ 344, 319 } }; 42 | float obj_threshold = 0.10; 43 | float nms_threshold = 0.45; 44 | 45 | int CATEGORY = 1; 46 | int BATCH_SIZE = 1; 47 | int INPUT_CHANNEL = 3; 48 | int DETECT_WIDTH = 416; 49 | int DETECT_HEIGHT = 416; 50 | 51 | // 准备输入图片,返回预处理后的Vector 52 | vector prepareImage(cv::Mat& img) { 53 | int c = 3; 54 | int h = DETECT_WIDTH; //net h 55 | int w = DETECT_WIDTH; //net w 56 | 57 | float scale = min(float(w) / img.cols, float(h) / img.rows); 58 | auto scaleSize = cv::Size(img.cols * scale, img.rows * scale); 59 | 60 | cv::Mat rgb; 61 | cv::cvtColor(img, rgb, CV_BGR2RGB); 62 | cv::Mat resized; 63 | cv::resize(rgb, 
resized, scaleSize, 0, 0, INTER_CUBIC); 64 | 65 | cv::Mat cropped(h, w, CV_8UC3, 127); 66 | Rect rect((w - scaleSize.width) / 2, (h - scaleSize.height) / 2, scaleSize.width, scaleSize.height); 67 | resized.copyTo(cropped(rect)); 68 | 69 | cv::Mat img_float; 70 | cropped.convertTo(img_float, CV_32FC3, 1.f / 255.0); 71 | 72 | 73 | //HWC TO CHW 74 | vector input_channels(c); 75 | cv::split(img_float, input_channels); 76 | 77 | vector result(h * w * c); 78 | auto data = result.data(); 79 | int channelLength = h * w; 80 | for (int i = 0; i < c; ++i) { 81 | memcpy(data, input_channels[i].data, channelLength * sizeof(float)); 82 | data += channelLength; 83 | } 84 | return result; 85 | } 86 | 87 | namespace nvinfer1 { 88 | class int8EntroyCalibrator : public nvinfer1::IInt8EntropyCalibrator { 89 | public: 90 | int8EntroyCalibrator(const int &bacthSize, 91 | const std::string &imgPath, 92 | const std::string &calibTablePath); 93 | 94 | virtual ~int8EntroyCalibrator(); 95 | 96 | int getBatchSize() const override { return batchSize; } 97 | 98 | bool getBatch(void *bindings[], const char *names[], int nbBindings) override; 99 | 100 | const void *readCalibrationCache(std::size_t &length) override; 101 | 102 | void writeCalibrationCache(const void *ptr, std::size_t length) override; 103 | 104 | private: 105 | 106 | bool forwardFace; 107 | 108 | int batchSize; 109 | size_t inputCount; 110 | size_t imageIndex; 111 | 112 | std::string calibTablePath; 113 | std::vector imgPaths; 114 | 115 | float *batchData{ nullptr }; 116 | void *deviceInput{ nullptr }; 117 | 118 | 119 | 120 | bool readCache; 121 | std::vector calibrationCache; 122 | }; 123 | 124 | int8EntroyCalibrator::int8EntroyCalibrator(const int &bacthSize, const std::string &imgPath, 125 | const std::string &calibTablePath) :batchSize(bacthSize), calibTablePath(calibTablePath), imageIndex(0), forwardFace( 126 | false) { 127 | int inputChannel = 3; 128 | int inputH = 416; 129 | int inputW = 416; 130 | inputCount = bacthSize*inputChannel*inputH*inputW; 131 | std::fstream f(imgPath); 132 | if (f.is_open()) { 133 | std::string temp; 134 | while (std::getline(f, temp)) imgPaths.push_back(temp); 135 | } 136 | int len = imgPaths.size(); 137 | for (int i = 0; i < len; i++) { 138 | cout << imgPaths[i] << endl; 139 | } 140 | batchData = new float[inputCount]; 141 | CHECK(cudaMalloc(&deviceInput, inputCount * sizeof(float))); 142 | } 143 | 144 | int8EntroyCalibrator::~int8EntroyCalibrator() { 145 | CHECK(cudaFree(deviceInput)); 146 | if (batchData) 147 | delete[] batchData; 148 | } 149 | 150 | bool int8EntroyCalibrator::getBatch(void **bindings, const char **names, int nbBindings) { 151 | cout << imageIndex << " " << batchSize << endl; 152 | cout << imgPaths.size() << endl; 153 | if (imageIndex + batchSize > int(imgPaths.size())) 154 | return false; 155 | // load batch 156 | float* ptr = batchData; 157 | for (size_t j = imageIndex; j < imageIndex + batchSize; ++j) 158 | { 159 | //cout << imgPaths[j] << endl; 160 | Mat img = cv::imread(imgPaths[j]); 161 | vectorinputData = prepareImage(img); 162 | cout << inputData.size() << endl; 163 | cout << inputCount << endl; 164 | if ((int)(inputData.size()) != inputCount) 165 | { 166 | std::cout << "InputSize error. 
check include/ctdetConfig.h" << std::endl; 167 | return false; 168 | } 169 | assert(inputData.size() == inputCount); 170 | int len = (int)(inputData.size()); 171 | memcpy(ptr, inputData.data(), len * sizeof(float)); 172 | 173 | ptr += inputData.size(); 174 | std::cout << "load image " << imgPaths[j] << " " << (j + 1)*100. / imgPaths.size() << "%" << std::endl; 175 | } 176 | imageIndex += batchSize; 177 | CHECK(cudaMemcpy(deviceInput, batchData, inputCount * sizeof(float), cudaMemcpyHostToDevice)); 178 | bindings[0] = deviceInput; 179 | return true; 180 | } 181 | const void* int8EntroyCalibrator::readCalibrationCache(std::size_t &length) 182 | { 183 | calibrationCache.clear(); 184 | std::ifstream input(calibTablePath, std::ios::binary); 185 | input >> std::noskipws; 186 | if (readCache && input.good()) 187 | std::copy(std::istream_iterator(input), std::istream_iterator(), 188 | std::back_inserter(calibrationCache)); 189 | 190 | length = calibrationCache.size(); 191 | return length ? &calibrationCache[0] : nullptr; 192 | } 193 | 194 | void int8EntroyCalibrator::writeCalibrationCache(const void *cache, std::size_t length) 195 | { 196 | std::ofstream output(calibTablePath, std::ios::binary); 197 | output.write(reinterpret_cast(cache), length); 198 | } 199 | } 200 | 201 | 202 | // Res struct & function 203 | typedef struct DetectionRes { 204 | float x, y, w, h, prob; 205 | } DetectionRes; 206 | 207 | float sigmoid(float in) { 208 | return 1.f / (1.f + exp(-in)); 209 | } 210 | float exponential(float in) { 211 | return exp(in); 212 | } 213 | 214 | float* merge(float* out1, float* out2, int bsize_out1, int bsize_out2) 215 | { 216 | float* out_total = new float[bsize_out1 + bsize_out2]; 217 | 218 | for (int j = 0; j < bsize_out1; ++j) 219 | { 220 | int index = j; 221 | out_total[index] = out1[j]; 222 | } 223 | 224 | for (int j = 0; j < bsize_out2; ++j) 225 | { 226 | int index = j + bsize_out1; 227 | out_total[index] = out2[j]; 228 | } 229 | return out_total; 230 | } 231 | 232 | vector split(const string& str, char delim) 233 | { 234 | stringstream ss(str); 235 | string token; 236 | vector container; 237 | while (getline(ss, token, delim)) 238 | { 239 | container.push_back(token); 240 | } 241 | 242 | return container; 243 | } 244 | 245 | 246 | // 执行NMS 247 | void DoNms(vector& detections, float nmsThresh) { 248 | auto iouCompute = [](float * lbox, float* rbox) { 249 | float interBox[] = { 250 | max(lbox[0], rbox[0]), //left 251 | min(lbox[0] + lbox[2], rbox[0] + rbox[2]), //right 252 | max(lbox[1], rbox[1]), //top 253 | min(lbox[1] + lbox[3], rbox[1] + rbox[3]), //bottom 254 | }; 255 | 256 | if (interBox[2] >= interBox[3] || interBox[0] >= interBox[1]) 257 | return 0.0f; 258 | 259 | float interBoxS = (interBox[1] - interBox[0] + 1) * (interBox[3] - interBox[2] + 1); 260 | return interBoxS / (lbox[2] * lbox[3] + rbox[2] * rbox[3] - interBoxS); 261 | }; 262 | 263 | sort(detections.begin(), detections.end(), [=](const DetectionRes & left, const DetectionRes & right) { 264 | return left.prob > right.prob; 265 | }); 266 | 267 | vector result; 268 | for (unsigned int m = 0; m < detections.size(); ++m) { 269 | result.push_back(detections[m]); 270 | for (unsigned int n = m + 1; n < detections.size(); ++n) { 271 | if (iouCompute((float *)(&detections[m]), (float *)(&detections[n])) > nmsThresh) { 272 | detections.erase(detections.begin() + n); 273 | --n; 274 | } 275 | } 276 | } 277 | detections = move(result); 278 | } 279 | 280 | //后处理 281 | vector postProcess(cv::Mat& image, float * output) { 282 | vector 
detections; 283 | int total_size = 0; 284 | for (int i = 0; i < output_shape.size(); i++) { 285 | auto shape = output_shape[i]; 286 | int size = 1; 287 | for (int j = 0; j < shape.size(); j++) { 288 | size *= shape[j]; 289 | } 290 | total_size += size; 291 | } 292 | 293 | int offset = 0; 294 | float * transposed_output = new float[total_size]; 295 | float * transposed_output_t = transposed_output; 296 | for (int i = 0; i < output_shape.size(); i++) { 297 | auto shape = output_shape[i]; // nchw 298 | int chw = shape[1] * shape[2] * shape[3]; 299 | int hw = shape[2] * shape[3]; 300 | for (int n = 0; n < shape[0]; n++) { 301 | int offset_n = offset + n * chw; 302 | for (int h = 0; h < shape[2]; h++) { 303 | for (int w = 0; w < shape[3]; w++) { 304 | int h_w = h * shape[3] + w; 305 | for (int c = 0; c < shape[1]; c++) { 306 | int offset_c = offset_n + hw * c + h_w; 307 | *transposed_output_t++ = output[offset_c]; 308 | } 309 | } 310 | } 311 | } 312 | offset += shape[0] * chw; 313 | } 314 | vector > shapes; 315 | for (int i = 0; i < output_shape.size(); i++) { 316 | auto shape = output_shape[i]; 317 | vector tmp = { shape[2], shape[3], 3, 5 + Classes }; 318 | shapes.push_back(tmp); 319 | } 320 | 321 | offset = 0; 322 | for (int i = 0; i < output_shape.size(); i++) { 323 | auto masks = g_masks[i]; 324 | vector > anchors; 325 | for (auto mask : masks) 326 | anchors.push_back(g_anchors[mask]); 327 | auto shape = shapes[i]; 328 | for (int h = 0; h < shape[0]; h++) { 329 | int offset_h = offset + h * shape[1] * shape[2] * shape[3]; 330 | for (int w = 0; w < shape[1]; w++) { 331 | int offset_w = offset_h + w * shape[2] * shape[3]; 332 | for (int c = 0; c < shape[2]; c++) { 333 | int offset_c = offset_w + c * shape[3]; 334 | float * ptr = transposed_output + offset_c; 335 | //ptr[4] = sigmoid(ptr[4]); 336 | //ptr[5] = sigmoid(ptr[5]); 337 | float score = 0; 338 | for (int i = 5; i < 5 + Classes; i++) { 339 | score = max(score, sigmoid(ptr[4]) * sigmoid(ptr[i])); 340 | } 341 | 342 | if (score < obj_threshold) 343 | continue; 344 | ptr[0] = sigmoid(ptr[0]); 345 | ptr[1] = sigmoid(ptr[1]); 346 | ptr[2] = exponential(ptr[2]) * anchors[c][0]; 347 | ptr[3] = exponential(ptr[3]) * anchors[c][1]; 348 | 349 | ptr[0] += w; 350 | ptr[1] += h; 351 | ptr[0] /= shape[0]; 352 | ptr[1] /= shape[1]; 353 | ptr[2] /= DETECT_WIDTH; 354 | ptr[3] /= DETECT_WIDTH; 355 | ptr[0] -= ptr[2] / 2; 356 | ptr[1] -= ptr[3] / 2; 357 | 358 | DetectionRes det;; 359 | det.x = ptr[0]; 360 | det.y = ptr[1]; 361 | det.w = ptr[2]; 362 | det.h = ptr[3]; 363 | det.prob = score; 364 | detections.push_back(det); 365 | } 366 | } 367 | } 368 | offset += shape[0] * shape[1] * shape[2] * shape[3]; 369 | } 370 | delete[]transposed_output; 371 | 372 | int h = DETECT_WIDTH; //net h 373 | int w = DETECT_WIDTH; //net w 374 | 375 | //scale bbox to img 376 | int width = image.cols; 377 | int height = image.rows; 378 | float scale = min(float(w) / width, float(h) / height); 379 | float scaleSize[] = { width * scale, height * scale }; 380 | 381 | //correct box 382 | for (auto& bbox : detections) { 383 | bbox.x = (bbox.x * w - (w - scaleSize[0]) / 2.f) / scale; 384 | bbox.y = (bbox.y * h - (h - scaleSize[1]) / 2.f) / scale; 385 | bbox.w *= w; 386 | bbox.h *= h; 387 | bbox.w /= scale; 388 | bbox.h /= scale; 389 | } 390 | 391 | //nms 392 | float nmsThresh = nms_threshold; 393 | if (nmsThresh > 0) 394 | DoNms(detections, nmsThresh); 395 | 396 | return detections; 397 | } 398 | 399 | 400 | // 加载TensorRT引擎文件 401 | bool readTrtFile(const std::string& engineFile, 
//引擎文件的名字 402 | IHostMemory*& trtModelStream) //TensorRT模型的输出Buffer 403 | { 404 | fstream file; 405 | cout << "loading filename from:" << engineFile << endl; 406 | nvinfer1::IRuntime* trtRuntime; 407 | nvonnxparser::IPluginFactory* onnxPlugin = createPluginFactory(gLogger.getTRTLogger()); // 408 | file.open(engineFile, ios::binary | ios::in); 409 | file.seekg(0, ios::end); 410 | int length = file.tellg(); 411 | //cout << "length:" << length << endl; 412 | file.seekg(0, ios::beg); 413 | std::unique_ptr data(new char[length]); 414 | file.read(data.get(), length); 415 | file.close(); 416 | cout << "load engine done" << endl; 417 | std::cout << "deserializing" << endl; 418 | trtRuntime = createInferRuntime(gLogger.getTRTLogger()); 419 | ICudaEngine* engine = trtRuntime->deserializeCudaEngine(data.get(), length, onnxPlugin); 420 | cout << "deserialize done" << endl; 421 | trtModelStream = engine->serialize(); 422 | 423 | return true; 424 | } 425 | 426 | 427 | // ONNX模型转为TensorRT引擎 428 | bool onnxToTRTModel(const std::string& modelFile, // onnx文件的名字 429 | const std::string& filename, // TensorRT引擎的名字 430 | IHostMemory*& trtModelStream) // output buffer for the TensorRT model 431 | { 432 | // 创建builder 433 | IBuilder* builder = createInferBuilder(gLogger.getTRTLogger()); 434 | assert(builder != nullptr); 435 | nvinfer1::INetworkDefinition* network = builder->createNetwork(); 436 | 437 | if (!builder->platformHasFastInt8()) return false; 438 | 439 | // 解析ONNX模型 440 | auto parser = nvonnxparser::createParser(*network, gLogger.getTRTLogger()); 441 | 442 | 443 | //可选的 - 取消下面的注释可以查看网络中每层的详细信息 444 | //config->setPrintLayerInfo(true); 445 | //parser->reportParsingInfo(); 446 | 447 | //判断是否成功解析ONNX模型 448 | if (!parser->parseFromFile(modelFile.c_str(), static_cast(gLogger.getReportableSeverity()))) 449 | { 450 | gLogError << "Failure while parsing ONNX file" << std::endl; 451 | return false; 452 | } 453 | 454 | 455 | // 建立推理引擎 456 | builder->setMaxBatchSize(BATCH_SIZE); 457 | builder->setMaxWorkspaceSize(1 << 30); 458 | 459 | nvinfer1::int8EntroyCalibrator *calibrator = nullptr; 460 | if (calibFile.size()>0) calibrator = new nvinfer1::int8EntroyCalibrator(BATCH_SIZE, calibFile, "F:/TensorRT-6.0.1.5/data/v3tiny/calib.table"); 461 | 462 | 463 | //builder->setFp16Mode(true); 464 | std::cout << "setInt8Mode" << std::endl; 465 | if (!builder->platformHasFastInt8()) 466 | std::cout << "Notice: the platform do not has fast for int8" << std::endl; 467 | builder->setInt8Mode(true); 468 | builder->setInt8Calibrator(calibrator); 469 | /*if (gArgs.runInInt8) 470 | { 471 | samplesCommon::setAllTensorScales(network, 127.0f, 127.0f); 472 | }*/ 473 | //samplesCommon::setAllTensorScales(network, 1.0f, 1.0f); 474 | cout << "start building engine" << endl; 475 | ICudaEngine* engine = builder->buildCudaEngine(*network); 476 | cout << "build engine done" << endl; 477 | assert(engine); 478 | if (calibrator) { 479 | delete calibrator; 480 | calibrator = nullptr; 481 | } 482 | // 销毁模型解释器 483 | parser->destroy(); 484 | 485 | // 序列化引擎 486 | trtModelStream = engine->serialize(); 487 | 488 | // 保存引擎 489 | nvinfer1::IHostMemory* data = engine->serialize(); 490 | std::ofstream file; 491 | file.open(filename, std::ios::binary | std::ios::out); 492 | cout << "writing engine file..." 
<< endl; 493 | file.write((const char*)data->data(), data->size()); 494 | cout << "save engine file done" << endl; 495 | file.close(); 496 | 497 | // 销毁所有相关的东西 498 | engine->destroy(); 499 | network->destroy(); 500 | builder->destroy(); 501 | 502 | return true; 503 | } 504 | 505 | inline int64_t volume(const nvinfer1::Dims& d) 506 | { 507 | return std::accumulate(d.d, d.d + d.nbDims, 1, std::multiplies()); 508 | } 509 | 510 | inline unsigned int getElementSize(nvinfer1::DataType t) 511 | { 512 | switch (t) 513 | { 514 | case nvinfer1::DataType::kINT32: return 4; 515 | case nvinfer1::DataType::kFLOAT: return 4; 516 | case nvinfer1::DataType::kHALF: return 2; 517 | case nvinfer1::DataType::kINT8: return 1; 518 | } 519 | throw std::runtime_error("Invalid DataType."); 520 | return 0; 521 | } 522 | 523 | //执行前向推理 524 | void doInferenceFrieza(IHostMemory* trtModelStream) 525 | { 526 | //get engine 527 | assert(trtModelStream != nullptr); 528 | IRuntime* runtime = createInferRuntime(gLogger); 529 | nvonnxparser::IPluginFactory* onnxPlugin = createPluginFactory(gLogger.getTRTLogger()); 530 | assert(runtime != nullptr); 531 | if (gArgs.useDLACore >= 0) 532 | { 533 | runtime->setDLACore(gArgs.useDLACore); 534 | } 535 | ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream->data(), trtModelStream->size(), onnxPlugin); 536 | 537 | // 创建推理引擎 538 | assert(engine != nullptr); 539 | trtModelStream->destroy(); 540 | IExecutionContext* context = engine->createExecutionContext(); 541 | assert(context != nullptr); 542 | 543 | //读取输入数据到缓冲区管理对象中 544 | assert(engine->getNbBindings() == 3); 545 | void* buffers[3]; 546 | std::vector bufferSize; 547 | int nbBindings = engine->getNbBindings(); 548 | bufferSize.resize(nbBindings); 549 | 550 | for (int i = 0; i < nbBindings; ++i) 551 | { 552 | nvinfer1::Dims dims = engine->getBindingDimensions(i); 553 | nvinfer1::DataType dtype = engine->getBindingDataType(i); 554 | int64_t totalSize = volume(dims) * 1 * getElementSize(dtype); 555 | bufferSize[i] = totalSize; 556 | CHECK(cudaMalloc(&buffers[i], totalSize)); 557 | } 558 | 559 | // 创建CUDA流以执行此推断 560 | cudaStream_t stream; 561 | CHECK(cudaStreamCreate(&stream)); 562 | 563 | //define inputImgs inputData outputDetections ... 564 | //vector inputData; 565 | //inputData.reserve(DETECT_HEIGHT*DETECT_WIDTH*INPUT_CHANNEL*BATCH_SIZE); 566 | vector inputImgs; 567 | vector outputs; 568 | int outSize1 = bufferSize[1] / sizeof(float); 569 | int outSize2 = bufferSize[2] / sizeof(float); 570 | float* out1 = new float[outSize1]; 571 | float* out2 = new float[outSize2]; 572 | 573 | int index = 1, 574 | batchCount = 0; 575 | 576 | cv::Mat img = cv::imread("F:/TensorRT-6.0.1.5/data/v3tiny/1.jpg"); 577 | inputImgs.push_back(img); 578 | auto t_start_pre = std::chrono::high_resolution_clock::now(); 579 | vector curInput = prepareImage(img); 580 | auto t_end_pre = std::chrono::high_resolution_clock::now(); 581 | float total_pre = std::chrono::duration(t_end_pre - t_start_pre).count(); 582 | std::cout << "prepare image take: " << total_pre << " ms." 
<< endl; 583 | 584 | /* 585 | inputData.insert(inputData.end(), curInput.begin(), curInput.end()); 586 | batchCount++; 587 | if (batchCount < BATCH_SIZE && i + 1 < fileNames.size()) 588 | continue; 589 | */ 590 | 591 | // DMA the input to the GPU, execute the batch asynchronously, and DMA it back: 592 | // 将数据从主机输入缓冲区异步复制到设备输入缓冲区 593 | CHECK(cudaMemcpyAsync(buffers[0], curInput.data(), bufferSize[0], cudaMemcpyHostToDevice, stream)); 594 | 595 | // 执行推理 596 | 597 | auto t_start = std::chrono::high_resolution_clock::now(); 598 | for (int i = 0; i < 20; i++) { 599 | context->execute(BATCH_SIZE, buffers); 600 | } 601 | auto t_end = std::chrono::high_resolution_clock::now(); 602 | float total = std::chrono::duration(t_end - t_start).count(); 603 | std::cout << "Inference take: " << total / 20.0 << " ms." << endl; 604 | 605 | CHECK(cudaMemcpyAsync(out1, buffers[1], bufferSize[1], cudaMemcpyDeviceToHost, stream)); 606 | CHECK(cudaMemcpyAsync(out2, buffers[2], bufferSize[2], cudaMemcpyDeviceToHost, stream)); 607 | cudaStreamSynchronize(stream); 608 | 609 | float* out = new float[outSize1 + outSize2]; 610 | out = merge(out1, out2, outSize1, outSize2); 611 | 612 | // postprocess 613 | auto t_start_post = std::chrono::high_resolution_clock::now(); 614 | auto boxes = postProcess(img, out); 615 | auto t_end_post = std::chrono::high_resolution_clock::now(); 616 | float total_post = std::chrono::duration(t_end_post - t_start_post).count(); 617 | std::cout << "Postprocess take: " << total_post << " ms." << endl; 618 | 619 | //print boxes 620 | for (int i = 0; i < boxes.size(); ++i) 621 | { 622 | cout << boxes[i].prob << ", " << boxes[i].x << ", " << boxes[i].y << ", " << boxes[i].w << ", " << boxes[i].h << endl; 623 | int x = boxes[i].x, 624 | y = boxes[i].y, 625 | w = boxes[i].w, 626 | h = boxes[i].h; 627 | cv::Rect rect = { x, y, w, h }; 628 | cv::rectangle(img, rect, cv::Scalar(255, 255, 0), 2); 629 | } 630 | 631 | cout << "\n" << endl; 632 | 633 | 634 | // release the stream and the buffers 635 | cudaStreamDestroy(stream); 636 | CHECK(cudaFree(buffers[0])); 637 | CHECK(cudaFree(buffers[1])); 638 | CHECK(cudaFree(buffers[2])); 639 | 640 | // destroy the engine 641 | context->destroy(); 642 | engine->destroy(); 643 | runtime->destroy(); 644 | 645 | cv::imshow("result", img); 646 | waitKey(0); 647 | 648 | } 649 | 650 | int main() 651 | { 652 | // read imgs list 653 | 654 | // create a TensorRT model from the onnx model and serialize it to a stream 655 | IHostMemory* trtModelStream{ nullptr }; 656 | 657 | // create and load engine 658 | fstream existEngine; 659 | existEngine.open(engineFile, ios::in); 660 | if (existEngine) 661 | { 662 | readTrtFile(engineFile, trtModelStream); 663 | assert(trtModelStream != nullptr); 664 | } 665 | else 666 | { 667 | onnxToTRTModel(onnxFile, engineFile, trtModelStream); 668 | assert(trtModelStream != nullptr); 669 | } 670 | 671 | //onnxToTRTModel(onnxFile, engineFile, trtModelStream); 672 | gArgs.runInInt8 = true; 673 | //do inference 674 | doInferenceFrieza(trtModelStream); 675 | 676 | return 0; 677 | } --------------------------------------------------------------------------------