├── 1.jpg
├── 2.jpg
├── 3.jpg
├── calibaration.txt
├── README.md
├── yolov3-tiny-trt-fp32.cpp
└── yolov3-tiny-trt-int8.cpp


/1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/yolov3-tiny-onnx-TensorRT/HEAD/1.jpg


--------------------------------------------------------------------------------
/2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/yolov3-tiny-onnx-TensorRT/HEAD/2.jpg


--------------------------------------------------------------------------------
/3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BBuf/yolov3-tiny-onnx-TensorRT/HEAD/3.jpg


--------------------------------------------------------------------------------
/calibaration.txt:
--------------------------------------------------------------------------------
 1 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000001.jpg
 2 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000002.jpg
 3 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000003.jpg
 4 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000004.jpg
 5 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000005.jpg
 6 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000006.jpg
 7 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000007.jpg
 8 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000008.jpg
 9 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000009.jpg
10 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000010.jpg
11 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000011.jpg
12 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000012.jpg
13 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000013.jpg
14 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000014.jpg
15 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000015.jpg
16 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000016.jpg
17 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000017.jpg
18 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000018.jpg
19 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000019.jpg
20 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000020.jpg
21 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000021.jpg
22 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000022.jpg
23 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000023.jpg
24 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000024.jpg
25 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000025.jpg
26 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000026.jpg
27 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000027.jpg
28 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000028.jpg
29 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000029.jpg
30 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000030.jpg
31 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000031.jpg
32 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000032.jpg
33 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000033.jpg
34 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000034.jpg
35 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000035.jpg
36 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000036.jpg
37 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000037.jpg
38 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000038.jpg
39 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000039.jpg
40 | F:/TensorRT-6.0.1.5/data/v3tiny/calbration/000040.jpg
41 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # yolov3-tiny-onnx-TensorRT
  2 | ## Requirements
  3 | 
  4 | - TensorRT 6.0
  5 | - VS 2015/Clion
  6 | - Cuda 9.0 + Cudnn 7.6
  7 | 
  8 | 
  9 | 
 10 | ## Model Converter
 11 | 
 12 | Convert your Darknet yolov3-tiny model to onnx，please follow these steps：
 13 | 
 14 | ## Requirements
 15 | 
 16 | 	python=2.7
 17 | 	numpy=1.16.1
 18 | 	onnx=1.4.1 (important)
 19 | 	pycuda=2019.1.1
 20 | 	Pillow=6.1.0
 21 | 	wget=3.2
 22 | 
 23 | ## custom settings
 24 | 
 25 | 	data_processing.py:
 26 | 		line14: LABEL_FILE_PATH = '/home/nvidia/yolov3-tiny2onnx2trt/coco_labels.txt'
 27 | 		line19: CATEGORY_NUM = 80
 28 | 	
 29 | 	yolov3_to_onnx.py:
 30 | 		line778: img_size = 416
 31 | 		line784: cfg_file_path = '/home/nvidia/yolov3-tiny2onnx2trt/yolov3-tiny.cfg'
 32 | 		line811: weights_file_path = '/home/nvidia/yolov3-tiny2onnx2trt/yolov3-tiny.weights'
 33 | 		line826: output_file_path = 'yolov3-tiny.onnx'
 34 | 	
 35 | 	onnx_to_tensorrt.py:
 36 | 		line39: input_size = 416
 37 | 		line40: batch_size = 1
 38 | 		line42~line46:
 39 | 		    onnx_file_path = 'yolov3-tiny.onnx'
 40 | 		    engine_file_path = 'yolov3-tiny.trt'
 41 | 		    input_file_list = '/home/nvidia/yolov3-tiny2onnx2trt/imagelist.txt'
 42 | 		    IMAGE_PATH = '/home/nvidia/yolov3-tiny2onnx2trt/images/'
 43 | 		    save_path = '/home/nvidia/yolov3-tiny2onnx2trt/'
 44 | ## notes (very important!)
 45 | 
 46 | 	0.The onnx version must be 1.4.1. If it is not, please run the commands:
 47 | 		pip uninstall onnx
 48 | 		pip install onnx==1.4.1
 49 | 	
 50 | 	1.The cfg-file's last line must be a blank line. You should press Enter to add a blank line if there is no blank line at the end of the file.
 51 | 
 52 | ## steps
 53 | 
 54 | 	0.Put your .weights file in the folder
 55 | 		|-yolov3-tiny2onnx2trt
 56 | 			|-yolov3-tiny.weights
 57 | 	
 58 | 	1.Change your settings as "#custom settings"
 59 | 	
 60 | 	2.Run commands:
 61 | 		cd yolov3-tiny2onnx2trt
 62 | 		python yolov3_to_onnx.py
 63 | 	
 64 | 		you will get a yolov3-tiny.onnx file
 65 | 	
 66 | 	3.Run commands:	
 67 | 	  	python onnx_to_tensorrt.py:
 68 | 	
 69 | 		you will get a yolov3-tiny.trt file and some inferenced images.
 70 | 
 71 | 
 72 | # TensorRT FP32 Inference
 73 | 
 74 | - run yolov3-tiny-trt-fp32.cpp（ You can modify the number of categories by yourself ）。
 75 | 
 76 | - The visualization results are as follows：
 77 | 
 78 | 
 79 | 
 80 | ![在这里插入图片描述](https://img-blog.csdnimg.cn/20200409142919546.jpg?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L2p1c3Rfc29ydA==,size_16,color_FFFFFF,t_70#pic_center)
 81 | 
 82 | 
 83 | 
 84 | 
 85 | 
 86 | ![在这里插入图片描述](https://img-blog.csdnimg.cn/20200409143133305.jpg?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L2p1c3Rfc29ydA==,size_16,color_FFFFFF,t_70#pic_center)
 87 | 
 88 | 
 89 | 
 90 | ![在这里插入图片描述](https://img-blog.csdnimg.cn/20200409143229456.jpg?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L2p1c3Rfc29ydA==,size_16,color_FFFFFF,t_70#pic_center)
 91 | 
 92 | 
 93 | 
 94 | # TensorRT INT8 Calibaration
 95 | 
 96 | - Prepare calibaration data(*.txt)，like this：
 97 | 
 98 | ![在这里插入图片描述](https://img-blog.csdnimg.cn/20200409151326680.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L2p1c3Rfc29ydA==,size_16,color_FFFFFF,t_70)
 99 | 
100 | - Create a class that inherits INT8EntropyCalibrator, the code is as follows：
101 | 
102 | 
103 | 
104 | ```c++
105 | namespace nvinfer1 {
106 | 	class int8EntroyCalibrator : public nvinfer1::IInt8EntropyCalibrator {
107 | 	public:
108 | 		int8EntroyCalibrator(const int &bacthSize,
109 | 			const std::string &imgPath,
110 | 			const std::string &calibTablePath);
111 | 
112 | 		virtual ~int8EntroyCalibrator();
113 | 
114 | 		int getBatchSize() const override { return batchSize; }
115 | 
116 | 		bool getBatch(void *bindings[], const char *names[], int nbBindings) override;
117 | 
118 | 		const void *readCalibrationCache(std::size_t &length) override;
119 | 
120 | 		void writeCalibrationCache(const void *ptr, std::size_t length) override;
121 | 
122 | 	private:
123 | 
124 | 		bool forwardFace;
125 | 
126 | 		int batchSize;
127 | 		size_t inputCount;
128 | 		size_t imageIndex;
129 | 
130 | 		std::string calibTablePath;
131 | 		std::vector<std::string> imgPaths;
132 | 
133 | 		float *batchData{ nullptr };
134 | 		void  *deviceInput{ nullptr };
135 | 
136 | 
137 | 
138 | 		bool readCache;
139 | 		std::vector<char> calibrationCache;
140 | 	};
141 | 
142 | 	int8EntroyCalibrator::int8EntroyCalibrator(const int &bacthSize, const std::string &imgPath,
143 | 		const std::string &calibTablePath) :batchSize(bacthSize), calibTablePath(calibTablePath), imageIndex(0), forwardFace(
144 | 			false) {
145 | 		int inputChannel = 3;
146 | 		int inputH = 416;
147 | 		int inputW = 416;
148 | 		inputCount = bacthSize*inputChannel*inputH*inputW;
149 | 		std::fstream f(imgPath);
150 | 		if (f.is_open()) {
151 | 			std::string temp;
152 | 			while (std::getline(f, temp)) imgPaths.push_back(temp);
153 | 		}
154 | 		int len = imgPaths.size();
155 | 		for (int i = 0; i < len; i++) {
156 | 			cout << imgPaths[i] << endl;
157 | 		}
158 | 		batchData = new float[inputCount];
159 | 		CHECK(cudaMalloc(&deviceInput, inputCount * sizeof(float)));
160 | 	}
161 | 
162 | 	int8EntroyCalibrator::~int8EntroyCalibrator() {
163 | 		CHECK(cudaFree(deviceInput));
164 | 		if (batchData)
165 | 			delete[] batchData;
166 | 	}
167 | 
168 | 	bool int8EntroyCalibrator::getBatch(void **bindings, const char **names, int nbBindings) {
169 | 		cout << imageIndex << " " << batchSize << endl;
170 | 		cout << imgPaths.size() << endl;
171 | 		if (imageIndex + batchSize > int(imgPaths.size()))
172 | 			return false;
173 | 		// load batch
174 | 		float* ptr = batchData;
175 | 		for (size_t j = imageIndex; j < imageIndex + batchSize; ++j)
176 | 		{
177 | 			//cout << imgPaths[j] << endl;
178 | 			Mat img = cv::imread(imgPaths[j]);
179 | 			vector<float>inputData = prepareImage(img);
180 | 			cout << inputData.size() << endl;
181 | 			cout << inputCount << endl;
182 | 			if ((int)(inputData.size()) != inputCount)
183 | 			{
184 | 				std::cout << "InputSize error. check include/ctdetConfig.h" << std::endl;
185 | 				return false;
186 | 			}
187 | 			assert(inputData.size() == inputCount);
188 | 			int len = (int)(inputData.size());
189 | 			memcpy(ptr, inputData.data(), len * sizeof(float));
190 | 
191 | 			ptr += inputData.size();
192 | 			std::cout << "load image " << imgPaths[j] << "  " << (j + 1)*100. / imgPaths.size() << "%" << std::endl;
193 | 		}
194 | 		imageIndex += batchSize;
195 | 		CHECK(cudaMemcpy(deviceInput, batchData, inputCount * sizeof(float), cudaMemcpyHostToDevice));
196 | 		bindings[0] = deviceInput;
197 | 		return true;
198 | 	}
199 | 	const void* int8EntroyCalibrator::readCalibrationCache(std::size_t &length)
200 | 	{
201 | 		calibrationCache.clear();
202 | 		std::ifstream input(calibTablePath, std::ios::binary);
203 | 		input >> std::noskipws;
204 | 		if (readCache && input.good())
205 | 			std::copy(std::istream_iterator<char>(input), std::istream_iterator<char>(),
206 | 				std::back_inserter(calibrationCache));
207 | 
208 | 		length = calibrationCache.size();
209 | 		return length ? &calibrationCache[0] : nullptr;
210 | 	}
211 | 
212 | 	void int8EntroyCalibrator::writeCalibrationCache(const void *cache, std::size_t length)
213 | 	{
214 | 		std::ofstream output(calibTablePath, std::ios::binary);
215 | 		output.write(reinterpret_cast<const char*>(cache), length);
216 | 	}
217 | }
218 | ```
219 | 
220 | 
221 | 
222 | - Change onnxToTRTModel function in yolov3-tiny-trt-fp32.cpp，the code is as follows：
223 | 
224 | ```c++
225 | bool onnxToTRTModel(const std::string& modelFile,
226 | 	const std::string& filename,  
227 | 	IHostMemory*& trtModelStream) // output buffer for the TensorRT model
228 | {
229 | 	IBuilder* builder = createInferBuilder(gLogger.getTRTLogger());
230 | 	assert(builder != nullptr);
231 | 	nvinfer1::INetworkDefinition* network = builder->createNetwork();
232 | 
233 | 	if (!builder->platformHasFastInt8()) return false;
234 | 
235 | 	auto parser = nvonnxparser::createParser(*network, gLogger.getTRTLogger());
236 | 
237 | 
238 | 	//config->setPrintLayerInfo(true);
239 | 	//parser->reportParsingInfo();
240 | 
241 | 	if (!parser->parseFromFile(modelFile.c_str(), static_cast<int>(gLogger.getReportableSeverity())))
242 | 	{
243 | 		gLogError << "Failure while parsing ONNX file" << std::endl;
244 | 		return false;
245 | 	}
246 | 
247 | 	
248 | 	builder->setMaxBatchSize(BATCH_SIZE);
249 | 	builder->setMaxWorkspaceSize(1 << 30);
250 | 
251 | 	nvinfer1::int8EntroyCalibrator *calibrator = nullptr;
252 | 	if (calibFile.size()>0) calibrator = new nvinfer1::int8EntroyCalibrator(BATCH_SIZE, calibFile, "F:/TensorRT-6.0.1.5/data/v3tiny/calib.table");
253 | 
254 | 
255 | 	//builder->setFp16Mode(true);
256 | 	std::cout << "setInt8Mode" << std::endl;
257 | 	if (!builder->platformHasFastInt8())
258 | 		std::cout << "Notice: the platform do not has fast for int8" << std::endl;
259 | 	builder->setInt8Mode(true);
260 | 	builder->setInt8Calibrator(calibrator);
261 | 	/*if (gArgs.runInInt8)
262 | 	{
263 | 		samplesCommon::setAllTensorScales(network, 127.0f, 127.0f);
264 | 	}*/
265 | 	//samplesCommon::setAllTensorScales(network, 1.0f, 1.0f);
266 | 	cout << "start building engine" << endl;
267 | 	ICudaEngine* engine = builder->buildCudaEngine(*network);
268 | 	cout << "build engine done" << endl;
269 | 	assert(engine);
270 | 	if (calibrator) {
271 | 		delete calibrator;
272 | 		calibrator = nullptr;
273 | 	}
274 | 	parser->destroy();
275 | 
276 | 	trtModelStream = engine->serialize();
277 | 
278 | 	nvinfer1::IHostMemory* data = engine->serialize();
279 | 	std::ofstream file;
280 | 	file.open(filename, std::ios::binary | std::ios::out);
281 | 	cout << "writing engine file..." << endl;
282 | 	file.write((const char*)data->data(), data->size());
283 | 	cout << "save engine file done" << endl;
284 | 	file.close();
285 | 
286 | 	engine->destroy();
287 | 	network->destroy();
288 | 	builder->destroy();
289 | 
290 | 	return true
291 | ```
292 | 
293 | 
294 | 
295 | - Finally you can get a INT8 TensorRT model，enjoy it。
296 | 
297 | 
298 | 
299 | # Accuracy And Speed
300 | 
301 | - GTX 1050 Ti
302 | 
303 | | YOLOV3-Tiny TRT模型 | mAP(50) | Inference Time |
304 | | ------------------- | ------- | -------------- |
305 | | FP32                | 95.0%   | 42ms           |
306 | | INT8                | 95.0%   | 10ms           |
307 | 
308 | 
309 | 
310 | # Reference
311 | 
312 | - https://github.com/zombie0117/yolov3-tiny-onnx-TensorRT
313 | - https://mp.weixin.qq.com/s/rYuodkH-tf-q4uZ0QAkuAw
314 | - https://mp.weixin.qq.com/s/huP2J565irXXU7SSIk-Hwg
315 | - https://mp.weixin.qq.com/s/9WKJi4AnOFKKqvK8R9ph1g
316 | - https://mp.weixin.qq.com/s/QcotYLHVVkf5sEvgKZKemg
317 | - https://mp.weixin.qq.com/s/WiVhlR9-rpe-O9J9ULc_bA


--------------------------------------------------------------------------------
/yolov3-tiny-trt-fp32.cpp:
--------------------------------------------------------------------------------
  1 | #include <algorithm>
  2 | #include <opencv2/opencv.hpp>
  3 | #include <assert.h>
  4 | #include <cmath>
  5 | #include <cuda_runtime_api.h>
  6 | #include <fstream>
  7 | #include <iomanip>
  8 | #include <iostream>
  9 | #include <sstream>
 10 | #include <sys/stat.h>
 11 | #include <time.h>
 12 | 
 13 | #include "NvInfer.h"
 14 | #include "NvOnnxParser.h"
 15 | #include "NvOnnxParserRuntime.h"
 16 | #include "argsParser.h"
 17 | #include "logger.h"
 18 | #include "common.h"
 19 | 
 20 | using namespace std;
 21 | using namespace nvinfer1;
 22 | using namespace nvonnxparser;
 23 | using namespace cv;
 24 | 
 25 | 
 26 | // origin params
 27 | samplesCommon::Args gArgs;
 28 | 
 29 | // Res params
 30 | string onnxFile = "F:/TensorRT-6.0.1.5/data/v3tiny/yolov3-tiny.onnx";
 31 | string engineFile = "F:/TensorRT-6.0.1.5/data/v3tiny/yolov3-tiny.trt";
 32 | 
 33 | vector<string> labels = { "abnormal" };
 34 | const int Classes = 13;
 35 | vector<vector<int> > output_shape = { { 1, 15 + 3 * Classes , 13, 13 },{ 1, 15 + 3 * Classes, 26, 26 } };
 36 | vector<vector<int> > g_masks = { { 3, 4, 5 },{ 0, 1, 2 } };
 37 | vector<vector<int> > g_anchors = { { 10, 14 },{ 23, 27 },{ 37, 58 },{ 81, 82 },{ 135, 169 },{ 344, 319 } };
 38 | float obj_threshold = 0.10;
 39 | float nms_threshold = 0.45;
 40 | 
 41 | int CATEGORY = 1;
 42 | int BATCH_SIZE = 1;
 43 | int INPUT_CHANNEL = 3;
 44 | int DETECT_WIDTH = 416;
 45 | int DETECT_HEIGHT = 416;
 46 | 
 47 | // Res struct & function
 48 | typedef struct DetectionRes {
 49 | 	float x, y, w, h, prob;
 50 | } DetectionRes;
 51 | 
 52 | float sigmoid(float in) {
 53 | 	return 1.f / (1.f + exp(-in));
 54 | }
 55 | float exponential(float in) {
 56 | 	return exp(in);
 57 | }
 58 | 
 59 | float* merge(float* out1, float* out2, int bsize_out1, int bsize_out2)
 60 | {
 61 | 	float* out_total = new float[bsize_out1 + bsize_out2];
 62 | 
 63 | 	for (int j = 0; j < bsize_out1; ++j)
 64 | 	{
 65 | 		int index = j;
 66 | 		out_total[index] = out1[j];
 67 | 	}
 68 | 
 69 | 	for (int j = 0; j < bsize_out2; ++j)
 70 | 	{
 71 | 		int index = j + bsize_out1;
 72 | 		out_total[index] = out2[j];
 73 | 	}
 74 | 	return out_total;
 75 | }
 76 | 
 77 | vector<string> split(const string& str, char delim)
 78 | {
 79 | 	stringstream ss(str);
 80 | 	string token;
 81 | 	vector<string> container;
 82 | 	while (getline(ss, token, delim))
 83 | 	{
 84 | 		container.push_back(token);
 85 | 	}
 86 | 
 87 | 	return container;
 88 | }
 89 | 
 90 | 
 91 | 
 92 | void DoNms(vector<DetectionRes>& detections, float nmsThresh) {
 93 | 	auto iouCompute = [](float * lbox, float* rbox) {
 94 | 		float interBox[] = {
 95 | 			max(lbox[0], rbox[0]), //left
 96 | 			min(lbox[0] + lbox[2], rbox[0] + rbox[2]), //right
 97 | 			max(lbox[1], rbox[1]), //top
 98 | 			min(lbox[1] + lbox[3], rbox[1] + rbox[3]), //bottom
 99 | 		};
100 | 
101 | 		if (interBox[2] >= interBox[3] || interBox[0] >= interBox[1])
102 | 			return 0.0f;
103 | 
104 | 		float interBoxS = (interBox[1] - interBox[0] + 1) * (interBox[3] - interBox[2] + 1);
105 | 		return interBoxS / (lbox[2] * lbox[3] + rbox[2] * rbox[3] - interBoxS);
106 | 	};
107 | 
108 | 	sort(detections.begin(), detections.end(), [=](const DetectionRes & left, const DetectionRes & right) {
109 | 		return left.prob > right.prob;
110 | 	});
111 | 
112 | 	vector<DetectionRes> result;
113 | 	for (unsigned int m = 0; m < detections.size(); ++m) {
114 | 		result.push_back(detections[m]);
115 | 		for (unsigned int n = m + 1; n < detections.size(); ++n) {
116 | 			if (iouCompute((float *)(&detections[m]), (float *)(&detections[n])) > nmsThresh) {
117 | 				detections.erase(detections.begin() + n);
118 | 				--n;
119 | 			}
120 | 		}
121 | 	}
122 | 	detections = move(result);
123 | }
124 | 
125 | vector<DetectionRes> postProcess(cv::Mat& image, float * output) {
126 | 	vector<DetectionRes> detections;
127 | 	int total_size = 0;
128 | 	for (int i = 0; i < output_shape.size(); i++) {
129 | 		auto shape = output_shape[i];
130 | 		int size = 1;
131 | 		for (int j = 0; j < shape.size(); j++) {
132 | 			size *= shape[j];
133 | 		}
134 | 		total_size += size;
135 | 	}
136 | 
137 | 	int offset = 0;
138 | 	float * transposed_output = new float[total_size];
139 | 	float * transposed_output_t = transposed_output;
140 | 	for (int i = 0; i < output_shape.size(); i++) {
141 | 		auto shape = output_shape[i];  // nchw
142 | 		int chw = shape[1] * shape[2] * shape[3];
143 | 		int hw = shape[2] * shape[3];
144 | 		for (int n = 0; n < shape[0]; n++) {
145 | 			int offset_n = offset + n * chw;
146 | 			for (int h = 0; h < shape[2]; h++) {
147 | 				for (int w = 0; w < shape[3]; w++) {
148 | 					int h_w = h * shape[3] + w;
149 | 					for (int c = 0; c < shape[1]; c++) {
150 | 						int offset_c = offset_n + hw * c + h_w;
151 | 						*transposed_output_t++ = output[offset_c];
152 | 					}
153 | 				}
154 | 			}
155 | 		}
156 | 		offset += shape[0] * chw;
157 | 	}
158 | 	vector<vector<int> > shapes;
159 | 	for (int i = 0; i < output_shape.size(); i++) {
160 | 		auto shape = output_shape[i];
161 | 		vector<int> tmp = { shape[2], shape[3], 3, 5 + Classes };
162 | 		shapes.push_back(tmp);
163 | 	}
164 | 
165 | 	offset = 0;
166 | 	for (int i = 0; i < output_shape.size(); i++) {
167 | 		auto masks = g_masks[i];
168 | 		vector<vector<int> > anchors;
169 | 		for (auto mask : masks)
170 | 			anchors.push_back(g_anchors[mask]);
171 | 		auto shape = shapes[i];
172 | 		for (int h = 0; h < shape[0]; h++) {
173 | 			int offset_h = offset + h * shape[1] * shape[2] * shape[3];
174 | 			for (int w = 0; w < shape[1]; w++) {
175 | 				int offset_w = offset_h + w * shape[2] * shape[3];
176 | 				for (int c = 0; c < shape[2]; c++) {
177 | 					int offset_c = offset_w + c * shape[3];
178 | 					float * ptr = transposed_output + offset_c;
179 | 					//ptr[4] = sigmoid(ptr[4]);
180 | 					//ptr[5] = sigmoid(ptr[5]);
181 | 					float score = 0;
182 | 					for (int i = 5; i < 5 + Classes; i++) {
183 | 						score = max(score, sigmoid(ptr[4]) * sigmoid(ptr[i]));
184 | 					}
185 | 
186 | 					if (score < obj_threshold)
187 | 						continue;
188 | 					ptr[0] = sigmoid(ptr[0]);
189 | 					ptr[1] = sigmoid(ptr[1]);
190 | 					ptr[2] = exponential(ptr[2]) * anchors[c][0];
191 | 					ptr[3] = exponential(ptr[3]) * anchors[c][1];
192 | 
193 | 					ptr[0] += w;
194 | 					ptr[1] += h;
195 | 					ptr[0] /= shape[0];
196 | 					ptr[1] /= shape[1];
197 | 					ptr[2] /= DETECT_WIDTH;
198 | 					ptr[3] /= DETECT_WIDTH;
199 | 					ptr[0] -= ptr[2] / 2;
200 | 					ptr[1] -= ptr[3] / 2;
201 | 
202 | 					DetectionRes det;;
203 | 					det.x = ptr[0];
204 | 					det.y = ptr[1];
205 | 					det.w = ptr[2];
206 | 					det.h = ptr[3];
207 | 					det.prob = score;
208 | 					detections.push_back(det);
209 | 				}
210 | 			}
211 | 		}
212 | 		offset += shape[0] * shape[1] * shape[2] * shape[3];
213 | 	}
214 | 	delete[]transposed_output;
215 | 
216 | 	int h = DETECT_WIDTH;   //net h
217 | 	int w = DETECT_WIDTH;   //net w
218 | 
219 | 							//scale bbox to img
220 | 	int width = image.cols;
221 | 	int height = image.rows;
222 | 	float scale = min(float(w) / width, float(h) / height);
223 | 	float scaleSize[] = { width * scale, height * scale };
224 | 
225 | 	//correct box
226 | 	for (auto& bbox : detections) {
227 | 		bbox.x = (bbox.x * w - (w - scaleSize[0]) / 2.f) / scale;
228 | 		bbox.y = (bbox.y * h - (h - scaleSize[1]) / 2.f) / scale;
229 | 		bbox.w *= w;
230 | 		bbox.h *= h;
231 | 		bbox.w /= scale;
232 | 		bbox.h /= scale;
233 | 	}
234 | 
235 | 	//nms
236 | 	float nmsThresh = nms_threshold;
237 | 	if (nmsThresh > 0)
238 | 		DoNms(detections, nmsThresh);
239 | 
240 | 	return detections;
241 | }
242 | 
243 | 
244 | // prepare img
245 | vector<float> prepareImage(cv::Mat& img) {
246 | 	int c = 3;
247 | 	int h = DETECT_WIDTH;   //net h
248 | 	int w = DETECT_WIDTH;   //net w
249 | 
250 | 	float scale = min(float(w) / img.cols, float(h) / img.rows);
251 | 	auto scaleSize = cv::Size(img.cols * scale, img.rows * scale);
252 | 
253 | 	cv::Mat rgb;
254 | 	cv::cvtColor(img, rgb, CV_BGR2RGB);
255 | 	cv::Mat resized;
256 | 	cv::resize(rgb, resized, scaleSize, 0, 0, INTER_CUBIC);
257 | 
258 | 	cv::Mat cropped(h, w, CV_8UC3, 127);
259 | 	Rect rect((w - scaleSize.width) / 2, (h - scaleSize.height) / 2, scaleSize.width, scaleSize.height);
260 | 	resized.copyTo(cropped(rect));
261 | 
262 | 	cv::Mat img_float;
263 | 	cropped.convertTo(img_float, CV_32FC3, 1.f / 255.0);
264 | 
265 | 
266 | 	//HWC TO CHW
267 | 	vector<Mat> input_channels(c);
268 | 	cv::split(img_float, input_channels);
269 | 
270 | 	vector<float> result(h * w * c);
271 | 	auto data = result.data();
272 | 	int channelLength = h * w;
273 | 	for (int i = 0; i < c; ++i) {
274 | 		memcpy(data, input_channels[i].data, channelLength * sizeof(float));
275 | 		data += channelLength;
276 | 	}
277 | 	return result;
278 | }
279 | 
280 | 
281 | // load engine file
282 | bool readTrtFile(const std::string& engineFile, //name of the engine file
283 | 	IHostMemory*& trtModelStream)  //output buffer for the TensorRT model
284 | {
285 | 	using namespace std;
286 | 	fstream file;
287 | 	cout << "loading filename from:" << engineFile << endl;
288 | 	nvinfer1::IRuntime* trtRuntime;
289 | 	nvonnxparser::IPluginFactory* onnxPlugin = createPluginFactory(gLogger.getTRTLogger());
290 | 	file.open(engineFile, ios::binary | ios::in);
291 | 	file.seekg(0, ios::end);
292 | 	int length = file.tellg();
293 | 	//cout << "length:" << length << endl;
294 | 	file.seekg(0, ios::beg);
295 | 	std::unique_ptr<char[]> data(new char[length]);
296 | 	file.read(data.get(), length);
297 | 	file.close();
298 | 	cout << "load engine done" << endl;
299 | 	std::cout << "deserializing" << endl;
300 | 	trtRuntime = createInferRuntime(gLogger.getTRTLogger());
301 | 	ICudaEngine* engine = trtRuntime->deserializeCudaEngine(data.get(), length, onnxPlugin);
302 | 	cout << "deserialize done" << endl;
303 | 	trtModelStream = engine->serialize();
304 | 
305 | 	return true;
306 | }
307 | 
308 | 
309 | // ONNX模型转为TensorRT引擎
310 | bool onnxToTRTModel(const std::string& modelFile, // onnx文件的名字
311 | 	const std::string& filename,  // TensorRT引擎的名字 
312 | 	IHostMemory*& trtModelStream) // output buffer for the TensorRT model
313 | {
314 | 	// 创建builder
315 | 	IBuilder* builder = createInferBuilder(gLogger.getTRTLogger());
316 | 	assert(builder != nullptr);
317 | 	nvinfer1::INetworkDefinition* network = builder->createNetwork();
318 | 
319 | 	// 解析ONNX模型
320 | 	auto parser = nvonnxparser::createParser(*network, gLogger.getTRTLogger());
321 | 
322 | 
323 | 	//可选的 - 取消下面的注释可以查看网络中每层的星系信息
324 | 	//config->setPrintLayerInfo(true);
325 | 	//parser->reportParsingInfo();
326 | 
327 | 	//判断是否成功解析ONNX模型
328 | 	if (!parser->parseFromFile(modelFile.c_str(), static_cast<int>(gLogger.getReportableSeverity())))
329 | 	{
330 | 		gLogError << "Failure while parsing ONNX file" << std::endl;
331 | 		return false;
332 | 	}
333 | 
334 | 	// 建立推理引擎
335 | 	builder->setMaxBatchSize(BATCH_SIZE);
336 | 	builder->setMaxWorkspaceSize(1 << 30);
337 | 	builder->setFp16Mode(true);
338 | 	builder->setInt8Mode(gArgs.runInInt8);
339 | 
340 | 	if (gArgs.runInInt8)
341 | 	{
342 | 		samplesCommon::setAllTensorScales(network, 127.0f, 127.0f);
343 | 	}
344 | 
345 | 	cout << "start building engine" << endl;
346 | 	ICudaEngine* engine = builder->buildCudaEngine(*network);
347 | 	cout << "build engine done" << endl;
348 | 	assert(engine);
349 | 
350 | 	// 销毁模型解释器
351 | 	parser->destroy();
352 | 
353 | 	// 序列化引擎
354 | 	trtModelStream = engine->serialize();
355 | 
356 | 	// 保存引擎
357 | 	nvinfer1::IHostMemory* data = engine->serialize();
358 | 	std::ofstream file;
359 | 	file.open(filename, std::ios::binary | std::ios::out);
360 | 	cout << "writing engine file..." << endl;
361 | 	file.write((const char*)data->data(), data->size());
362 | 	cout << "save engine file done" << endl;
363 | 	file.close();
364 | 
365 | 	// 销毁所有相关的东西
366 | 	engine->destroy();
367 | 	network->destroy();
368 | 	builder->destroy();
369 | 
370 | 	return true;
371 | }
372 | 
373 | inline int64_t volume(const nvinfer1::Dims& d)
374 | {
375 | 	return std::accumulate(d.d, d.d + d.nbDims, 1, std::multiplies<int64_t>());
376 | }
377 | 
378 | inline unsigned int getElementSize(nvinfer1::DataType t)
379 | {
380 | 	switch (t)
381 | 	{
382 | 	case nvinfer1::DataType::kINT32: return 4;
383 | 	case nvinfer1::DataType::kFLOAT: return 4;
384 | 	case nvinfer1::DataType::kHALF: return 2;
385 | 	case nvinfer1::DataType::kINT8: return 1;
386 | 	}
387 | 	throw std::runtime_error("Invalid DataType.");
388 | 	return 0;
389 | }
390 | 
391 | //执行前向推理
392 | void doInferenceFrieza(IHostMemory* trtModelStream)
393 | {
394 | 	//get engine
395 | 	assert(trtModelStream != nullptr);
396 | 	IRuntime* runtime = createInferRuntime(gLogger);
397 | 	nvonnxparser::IPluginFactory* onnxPlugin = createPluginFactory(gLogger.getTRTLogger());
398 | 	assert(runtime != nullptr);
399 | 	if (gArgs.useDLACore >= 0)
400 | 	{
401 | 		runtime->setDLACore(gArgs.useDLACore);
402 | 	}
403 | 	ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream->data(), trtModelStream->size(), onnxPlugin);
404 | 
405 | 	// 创建推理引擎
406 | 	assert(engine != nullptr);
407 | 	trtModelStream->destroy();
408 | 	IExecutionContext* context = engine->createExecutionContext();
409 | 	assert(context != nullptr);
410 | 
411 | 	//读取输入数据到缓冲区管理对象中
412 | 	assert(engine->getNbBindings() == 3);
413 | 	void* buffers[3];
414 | 	std::vector<int64_t> bufferSize;
415 | 	int nbBindings = engine->getNbBindings();
416 | 	bufferSize.resize(nbBindings);
417 | 
418 | 	for (int i = 0; i < nbBindings; ++i)
419 | 	{
420 | 		nvinfer1::Dims dims = engine->getBindingDimensions(i);
421 | 		nvinfer1::DataType dtype = engine->getBindingDataType(i);
422 | 		int64_t totalSize = volume(dims) * 1 * getElementSize(dtype);
423 | 		bufferSize[i] = totalSize;
424 | 		CHECK(cudaMalloc(&buffers[i], totalSize));
425 | 	}
426 | 
427 | 	// 创建CUDA流以执行此推断
428 | 	cudaStream_t stream;
429 | 	CHECK(cudaStreamCreate(&stream));
430 | 
431 | 	//define inputImgs inputData outputDetections ...
432 | 	//vector<float> inputData;
433 | 	//inputData.reserve(DETECT_HEIGHT*DETECT_WIDTH*INPUT_CHANNEL*BATCH_SIZE);
434 | 	vector<cv::Mat> inputImgs;
435 | 	vector<DetectionRes> outputs;
436 | 	int outSize1 = bufferSize[1] / sizeof(float);
437 | 	int outSize2 = bufferSize[2] / sizeof(float);
438 | 	float* out1 = new float[outSize1];
439 | 	float* out2 = new float[outSize2];
440 | 
441 | 	int index = 1,
442 | 		batchCount = 0;
443 | 
444 | 	cv::Mat img = cv::imread("F:/TensorRT-6.0.1.5/data/v3tiny/4.jpg");
445 | 	inputImgs.push_back(img);
446 | 	auto t_start_pre = std::chrono::high_resolution_clock::now();
447 | 	vector<float> curInput = prepareImage(img);
448 | 	auto t_end_pre = std::chrono::high_resolution_clock::now();
449 | 	float total_pre = std::chrono::duration<float, std::milli>(t_end_pre - t_start_pre).count();
450 | 	std::cout << "prepare image take: " << total_pre << " ms." << endl;
451 | 
452 | 	/*
453 | 	inputData.insert(inputData.end(), curInput.begin(), curInput.end());
454 | 	batchCount++;
455 | 	if (batchCount < BATCH_SIZE && i + 1 < fileNames.size())
456 | 	continue;
457 | 	*/
458 | 
459 | 	// DMA the input to the GPU,  execute the batch asynchronously, and DMA it back:
460 | 	// 将数据从主机输入缓冲区异步复制到设备输入缓冲区
461 | 	CHECK(cudaMemcpyAsync(buffers[0], curInput.data(), bufferSize[0], cudaMemcpyHostToDevice, stream));
462 | 
463 | 	// 执行推理
464 | 	auto t_start = std::chrono::high_resolution_clock::now();
465 | 	context->execute(BATCH_SIZE, buffers);
466 | 	auto t_end = std::chrono::high_resolution_clock::now();
467 | 	float total = std::chrono::duration<float, std::milli>(t_end - t_start).count();
468 | 	std::cout << "Inference take: " << total << " ms." << endl;
469 | 
470 | 	CHECK(cudaMemcpyAsync(out1, buffers[1], bufferSize[1], cudaMemcpyDeviceToHost, stream));
471 | 	CHECK(cudaMemcpyAsync(out2, buffers[2], bufferSize[2], cudaMemcpyDeviceToHost, stream));
472 | 	cudaStreamSynchronize(stream);
473 | 
474 | 	float* out = new float[outSize1 + outSize2];
475 | 	out = merge(out1, out2, outSize1, outSize2);
476 | 
477 | 	// postprocess
478 | 	auto t_start_post = std::chrono::high_resolution_clock::now();
479 | 	auto boxes = postProcess(img, out);
480 | 	auto t_end_post = std::chrono::high_resolution_clock::now();
481 | 	float total_post = std::chrono::duration<float, std::milli>(t_end_post - t_start_post).count();
482 | 	std::cout << "Postprocess take: " << total_post << " ms." << endl;
483 | 
484 | 	//print boxes
485 | 	for (int i = 0; i < boxes.size(); ++i)
486 | 	{
487 | 		cout << boxes[i].prob << ", " << boxes[i].x << ", " << boxes[i].y << ", " << boxes[i].w << ", " << boxes[i].h << endl;
488 | 		int x = boxes[i].x,
489 | 			y = boxes[i].y,
490 | 			w = boxes[i].w,
491 | 			h = boxes[i].h;
492 | 		cv::Rect rect = { x, y, w, h };
493 | 		cv::rectangle(img, rect, cv::Scalar(255, 255, 0), 2);
494 | 	}
495 | 
496 | 	cout << "\n" << endl;
497 | 
498 | 
499 | 	// release the stream and the buffers
500 | 	cudaStreamDestroy(stream);
501 | 	CHECK(cudaFree(buffers[0]));
502 | 	CHECK(cudaFree(buffers[1]));
503 | 	CHECK(cudaFree(buffers[2]));
504 | 
505 | 	// destroy the engine
506 | 	context->destroy();
507 | 	engine->destroy();
508 | 	runtime->destroy();
509 | 
510 | 	cv::imshow("result", img);
511 | 	cv::imwrite("F:\\res.jpg", img);
512 | 	waitKey(0);
513 | 
514 | }
515 | 
516 | int main()
517 | {
518 | 	// read imgs list
519 | 
520 | 	// create a TensorRT model from the onnx model and serialize it to a stream
521 | 	IHostMemory* trtModelStream{ nullptr };
522 | 
523 | 	// create and load engine
524 | 	fstream existEngine;
525 | 	existEngine.open(engineFile, ios::in);
526 | 	if (existEngine)
527 | 	{
528 | 		readTrtFile(engineFile, trtModelStream);
529 | 		assert(trtModelStream != nullptr);
530 | 	}
531 | 	else
532 | 	{
533 | 		onnxToTRTModel(onnxFile, engineFile, trtModelStream);
534 | 		assert(trtModelStream != nullptr);
535 | 	}
536 | 
537 | 	//onnxToTRTModel(onnxFile, engineFile, trtModelStream);
538 | 
539 | 	//do inference
540 | 	doInferenceFrieza(trtModelStream);
541 | 
542 | 	return 0;
543 | }


--------------------------------------------------------------------------------
/yolov3-tiny-trt-int8.cpp:
--------------------------------------------------------------------------------
  1 | #include <algorithm>
  2 | #include <opencv2/opencv.hpp>
  3 | #include <assert.h>
  4 | #include <cmath>
  5 | #include <cuda_runtime_api.h>
  6 | #include "EntropyCalibrator.h"
  7 | #include <fstream>
  8 | #include <iomanip>
  9 | #include <iostream>
 10 | #include <sstream>
 11 | #include <sys/stat.h>
 12 | #include <time.h>
 13 | 
 14 | #include "NvInfer.h"
 15 | #include "NvOnnxParser.h"
 16 | #include "NvOnnxParserRuntime.h"
 17 | #include "argsParser.h"
 18 | #include "logger.h"
 19 | #include "common.h"
 20 | 
 21 | using namespace std;
 22 | using namespace nvinfer1;
 23 | using namespace nvonnxparser;
 24 | using namespace cv;
 25 | #include "NvInfer.h"
 26 | #include <vector>
 27 | #include <string>
 28 | 
 29 | // origin params
 30 | samplesCommon::Args gArgs;
 31 | 
 32 | // Res params
 33 | string onnxFile = "F:/TensorRT-6.0.1.5/data/v3tiny/yolov3-tiny.onnx";
 34 | string engineFile = "F:/TensorRT-6.0.1.5/data/v3tiny/yolov3-tiny.trt";
 35 | string calibFile = "F:/TensorRT-6.0.1.5/data/v3tiny/calibaration.txt";
 36 | 
 37 | vector<string> labels = { "abnormal" };
 38 | const int Classes = 13;
 39 | vector<vector<int> > output_shape = { { 1, 15 + 3 * Classes , 13, 13 },{ 1, 15 + 3 * Classes, 26, 26 } };
 40 | vector<vector<int> > g_masks = { { 3, 4, 5 },{ 0, 1, 2 } };
 41 | vector<vector<int> > g_anchors = { { 10, 14 },{ 23, 27 },{ 37, 58 },{ 81, 82 },{ 135, 169 },{ 344, 319 } };
 42 | float obj_threshold = 0.10;
 43 | float nms_threshold = 0.45;
 44 | 
 45 | int CATEGORY = 1;
 46 | int BATCH_SIZE = 1;
 47 | int INPUT_CHANNEL = 3;
 48 | int DETECT_WIDTH = 416;
 49 | int DETECT_HEIGHT = 416;
 50 | 
 51 | // 准备输入图片，返回预处理后的Vector
 52 | vector<float> prepareImage(cv::Mat& img) {
 53 | 	int c = 3;
 54 | 	int h = DETECT_WIDTH;   //net h
 55 | 	int w = DETECT_WIDTH;   //net w
 56 | 
 57 | 	float scale = min(float(w) / img.cols, float(h) / img.rows);
 58 | 	auto scaleSize = cv::Size(img.cols * scale, img.rows * scale);
 59 | 
 60 | 	cv::Mat rgb;
 61 | 	cv::cvtColor(img, rgb, CV_BGR2RGB);
 62 | 	cv::Mat resized;
 63 | 	cv::resize(rgb, resized, scaleSize, 0, 0, INTER_CUBIC);
 64 | 
 65 | 	cv::Mat cropped(h, w, CV_8UC3, 127);
 66 | 	Rect rect((w - scaleSize.width) / 2, (h - scaleSize.height) / 2, scaleSize.width, scaleSize.height);
 67 | 	resized.copyTo(cropped(rect));
 68 | 
 69 | 	cv::Mat img_float;
 70 | 	cropped.convertTo(img_float, CV_32FC3, 1.f / 255.0);
 71 | 
 72 | 
 73 | 	//HWC TO CHW
 74 | 	vector<Mat> input_channels(c);
 75 | 	cv::split(img_float, input_channels);
 76 | 
 77 | 	vector<float> result(h * w * c);
 78 | 	auto data = result.data();
 79 | 	int channelLength = h * w;
 80 | 	for (int i = 0; i < c; ++i) {
 81 | 		memcpy(data, input_channels[i].data, channelLength * sizeof(float));
 82 | 		data += channelLength;
 83 | 	}
 84 | 	return result;
 85 | }
 86 | 
 87 | namespace nvinfer1 {
 88 | 	class int8EntroyCalibrator : public nvinfer1::IInt8EntropyCalibrator {
 89 | 	public:
 90 | 		int8EntroyCalibrator(const int &bacthSize,
 91 | 			const std::string &imgPath,
 92 | 			const std::string &calibTablePath);
 93 | 
 94 | 		virtual ~int8EntroyCalibrator();
 95 | 
 96 | 		int getBatchSize() const override { return batchSize; }
 97 | 
 98 | 		bool getBatch(void *bindings[], const char *names[], int nbBindings) override;
 99 | 
100 | 		const void *readCalibrationCache(std::size_t &length) override;
101 | 
102 | 		void writeCalibrationCache(const void *ptr, std::size_t length) override;
103 | 
104 | 	private:
105 | 
106 | 		bool forwardFace;
107 | 
108 | 		int batchSize;
109 | 		size_t inputCount;
110 | 		size_t imageIndex;
111 | 
112 | 		std::string calibTablePath;
113 | 		std::vector<std::string> imgPaths;
114 | 
115 | 		float *batchData{ nullptr };
116 | 		void  *deviceInput{ nullptr };
117 | 
118 | 
119 | 
120 | 		bool readCache;
121 | 		std::vector<char> calibrationCache;
122 | 	};
123 | 
124 | 	int8EntroyCalibrator::int8EntroyCalibrator(const int &bacthSize, const std::string &imgPath,
125 | 		const std::string &calibTablePath) :batchSize(bacthSize), calibTablePath(calibTablePath), imageIndex(0), forwardFace(
126 | 			false) {
127 | 		int inputChannel = 3;
128 | 		int inputH = 416;
129 | 		int inputW = 416;
130 | 		inputCount = bacthSize*inputChannel*inputH*inputW;
131 | 		std::fstream f(imgPath);
132 | 		if (f.is_open()) {
133 | 			std::string temp;
134 | 			while (std::getline(f, temp)) imgPaths.push_back(temp);
135 | 		}
136 | 		int len = imgPaths.size();
137 | 		for (int i = 0; i < len; i++) {
138 | 			cout << imgPaths[i] << endl;
139 | 		}
140 | 		batchData = new float[inputCount];
141 | 		CHECK(cudaMalloc(&deviceInput, inputCount * sizeof(float)));
142 | 	}
143 | 
144 | 	int8EntroyCalibrator::~int8EntroyCalibrator() {
145 | 		CHECK(cudaFree(deviceInput));
146 | 		if (batchData)
147 | 			delete[] batchData;
148 | 	}
149 | 
150 | 	bool int8EntroyCalibrator::getBatch(void **bindings, const char **names, int nbBindings) {
151 | 		cout << imageIndex << " " << batchSize << endl;
152 | 		cout << imgPaths.size() << endl;
153 | 		if (imageIndex + batchSize > int(imgPaths.size()))
154 | 			return false;
155 | 		// load batch
156 | 		float* ptr = batchData;
157 | 		for (size_t j = imageIndex; j < imageIndex + batchSize; ++j)
158 | 		{
159 | 			//cout << imgPaths[j] << endl;
160 | 			Mat img = cv::imread(imgPaths[j]);
161 | 			vector<float>inputData = prepareImage(img);
162 | 			cout << inputData.size() << endl;
163 | 			cout << inputCount << endl;
164 | 			if ((int)(inputData.size()) != inputCount)
165 | 			{
166 | 				std::cout << "InputSize error. check include/ctdetConfig.h" << std::endl;
167 | 				return false;
168 | 			}
169 | 			assert(inputData.size() == inputCount);
170 | 			int len = (int)(inputData.size());
171 | 			memcpy(ptr, inputData.data(), len * sizeof(float));
172 | 
173 | 			ptr += inputData.size();
174 | 			std::cout << "load image " << imgPaths[j] << "  " << (j + 1)*100. / imgPaths.size() << "%" << std::endl;
175 | 		}
176 | 		imageIndex += batchSize;
177 | 		CHECK(cudaMemcpy(deviceInput, batchData, inputCount * sizeof(float), cudaMemcpyHostToDevice));
178 | 		bindings[0] = deviceInput;
179 | 		return true;
180 | 	}
181 | 	const void* int8EntroyCalibrator::readCalibrationCache(std::size_t &length)
182 | 	{
183 | 		calibrationCache.clear();
184 | 		std::ifstream input(calibTablePath, std::ios::binary);
185 | 		input >> std::noskipws;
186 | 		if (readCache && input.good())
187 | 			std::copy(std::istream_iterator<char>(input), std::istream_iterator<char>(),
188 | 				std::back_inserter(calibrationCache));
189 | 
190 | 		length = calibrationCache.size();
191 | 		return length ? &calibrationCache[0] : nullptr;
192 | 	}
193 | 
194 | 	void int8EntroyCalibrator::writeCalibrationCache(const void *cache, std::size_t length)
195 | 	{
196 | 		std::ofstream output(calibTablePath, std::ios::binary);
197 | 		output.write(reinterpret_cast<const char*>(cache), length);
198 | 	}
199 | }
200 | 
201 | 
202 | // Res struct & function
203 | typedef struct DetectionRes {
204 | 	float x, y, w, h, prob;
205 | } DetectionRes;
206 | 
207 | float sigmoid(float in) {
208 | 	return 1.f / (1.f + exp(-in));
209 | }
210 | float exponential(float in) {
211 | 	return exp(in);
212 | }
213 | 
214 | float* merge(float* out1, float* out2, int bsize_out1, int bsize_out2)
215 | {
216 | 	float* out_total = new float[bsize_out1 + bsize_out2];
217 | 
218 | 	for (int j = 0; j < bsize_out1; ++j)
219 | 	{
220 | 		int index = j;
221 | 		out_total[index] = out1[j];
222 | 	}
223 | 
224 | 	for (int j = 0; j < bsize_out2; ++j)
225 | 	{
226 | 		int index = j + bsize_out1;
227 | 		out_total[index] = out2[j];
228 | 	}
229 | 	return out_total;
230 | }
231 | 
232 | vector<string> split(const string& str, char delim)
233 | {
234 | 	stringstream ss(str);
235 | 	string token;
236 | 	vector<string> container;
237 | 	while (getline(ss, token, delim))
238 | 	{
239 | 		container.push_back(token);
240 | 	}
241 | 
242 | 	return container;
243 | }
244 | 
245 | 
246 | // 执行NMS
247 | void DoNms(vector<DetectionRes>& detections, float nmsThresh) {
248 | 	auto iouCompute = [](float * lbox, float* rbox) {
249 | 		float interBox[] = {
250 | 			max(lbox[0], rbox[0]), //left
251 | 			min(lbox[0] + lbox[2], rbox[0] + rbox[2]), //right
252 | 			max(lbox[1], rbox[1]), //top
253 | 			min(lbox[1] + lbox[3], rbox[1] + rbox[3]), //bottom
254 | 		};
255 | 
256 | 		if (interBox[2] >= interBox[3] || interBox[0] >= interBox[1])
257 | 			return 0.0f;
258 | 
259 | 		float interBoxS = (interBox[1] - interBox[0] + 1) * (interBox[3] - interBox[2] + 1);
260 | 		return interBoxS / (lbox[2] * lbox[3] + rbox[2] * rbox[3] - interBoxS);
261 | 	};
262 | 
263 | 	sort(detections.begin(), detections.end(), [=](const DetectionRes & left, const DetectionRes & right) {
264 | 		return left.prob > right.prob;
265 | 	});
266 | 
267 | 	vector<DetectionRes> result;
268 | 	for (unsigned int m = 0; m < detections.size(); ++m) {
269 | 		result.push_back(detections[m]);
270 | 		for (unsigned int n = m + 1; n < detections.size(); ++n) {
271 | 			if (iouCompute((float *)(&detections[m]), (float *)(&detections[n])) > nmsThresh) {
272 | 				detections.erase(detections.begin() + n);
273 | 				--n;
274 | 			}
275 | 		}
276 | 	}
277 | 	detections = move(result);
278 | }
279 | 
280 | //后处理
281 | vector<DetectionRes> postProcess(cv::Mat& image, float * output) {
282 | 	vector<DetectionRes> detections;
283 | 	int total_size = 0;
284 | 	for (int i = 0; i < output_shape.size(); i++) {
285 | 		auto shape = output_shape[i];
286 | 		int size = 1;
287 | 		for (int j = 0; j < shape.size(); j++) {
288 | 			size *= shape[j];
289 | 		}
290 | 		total_size += size;
291 | 	}
292 | 
293 | 	int offset = 0;
294 | 	float * transposed_output = new float[total_size];
295 | 	float * transposed_output_t = transposed_output;
296 | 	for (int i = 0; i < output_shape.size(); i++) {
297 | 		auto shape = output_shape[i];  // nchw
298 | 		int chw = shape[1] * shape[2] * shape[3];
299 | 		int hw = shape[2] * shape[3];
300 | 		for (int n = 0; n < shape[0]; n++) {
301 | 			int offset_n = offset + n * chw;
302 | 			for (int h = 0; h < shape[2]; h++) {
303 | 				for (int w = 0; w < shape[3]; w++) {
304 | 					int h_w = h * shape[3] + w;
305 | 					for (int c = 0; c < shape[1]; c++) {
306 | 						int offset_c = offset_n + hw * c + h_w;
307 | 						*transposed_output_t++ = output[offset_c];
308 | 					}
309 | 				}
310 | 			}
311 | 		}
312 | 		offset += shape[0] * chw;
313 | 	}
314 | 	vector<vector<int> > shapes;
315 | 	for (int i = 0; i < output_shape.size(); i++) {
316 | 		auto shape = output_shape[i];
317 | 		vector<int> tmp = { shape[2], shape[3], 3, 5 + Classes };
318 | 		shapes.push_back(tmp);
319 | 	}
320 | 
321 | 	offset = 0;
322 | 	for (int i = 0; i < output_shape.size(); i++) {
323 | 		auto masks = g_masks[i];
324 | 		vector<vector<int> > anchors;
325 | 		for (auto mask : masks)
326 | 			anchors.push_back(g_anchors[mask]);
327 | 		auto shape = shapes[i];
328 | 		for (int h = 0; h < shape[0]; h++) {
329 | 			int offset_h = offset + h * shape[1] * shape[2] * shape[3];
330 | 			for (int w = 0; w < shape[1]; w++) {
331 | 				int offset_w = offset_h + w * shape[2] * shape[3];
332 | 				for (int c = 0; c < shape[2]; c++) {
333 | 					int offset_c = offset_w + c * shape[3];
334 | 					float * ptr = transposed_output + offset_c;
335 | 					//ptr[4] = sigmoid(ptr[4]);
336 | 					//ptr[5] = sigmoid(ptr[5]);
337 | 					float score = 0;
338 | 					for (int i = 5; i < 5 + Classes; i++) {
339 | 						score = max(score, sigmoid(ptr[4]) * sigmoid(ptr[i]));
340 | 					}
341 | 
342 | 					if (score < obj_threshold)
343 | 						continue;
344 | 					ptr[0] = sigmoid(ptr[0]);
345 | 					ptr[1] = sigmoid(ptr[1]);
346 | 					ptr[2] = exponential(ptr[2]) * anchors[c][0];
347 | 					ptr[3] = exponential(ptr[3]) * anchors[c][1];
348 | 
349 | 					ptr[0] += w;
350 | 					ptr[1] += h;
351 | 					ptr[0] /= shape[0];
352 | 					ptr[1] /= shape[1];
353 | 					ptr[2] /= DETECT_WIDTH;
354 | 					ptr[3] /= DETECT_WIDTH;
355 | 					ptr[0] -= ptr[2] / 2;
356 | 					ptr[1] -= ptr[3] / 2;
357 | 
358 | 					DetectionRes det;;
359 | 					det.x = ptr[0];
360 | 					det.y = ptr[1];
361 | 					det.w = ptr[2];
362 | 					det.h = ptr[3];
363 | 					det.prob = score;
364 | 					detections.push_back(det);
365 | 				}
366 | 			}
367 | 		}
368 | 		offset += shape[0] * shape[1] * shape[2] * shape[3];
369 | 	}
370 | 	delete[]transposed_output;
371 | 
372 | 	int h = DETECT_WIDTH;   //net h
373 | 	int w = DETECT_WIDTH;   //net w
374 | 
375 | 							//scale bbox to img
376 | 	int width = image.cols;
377 | 	int height = image.rows;
378 | 	float scale = min(float(w) / width, float(h) / height);
379 | 	float scaleSize[] = { width * scale, height * scale };
380 | 
381 | 	//correct box
382 | 	for (auto& bbox : detections) {
383 | 		bbox.x = (bbox.x * w - (w - scaleSize[0]) / 2.f) / scale;
384 | 		bbox.y = (bbox.y * h - (h - scaleSize[1]) / 2.f) / scale;
385 | 		bbox.w *= w;
386 | 		bbox.h *= h;
387 | 		bbox.w /= scale;
388 | 		bbox.h /= scale;
389 | 	}
390 | 
391 | 	//nms
392 | 	float nmsThresh = nms_threshold;
393 | 	if (nmsThresh > 0)
394 | 		DoNms(detections, nmsThresh);
395 | 
396 | 	return detections;
397 | }
398 | 
399 | 
400 | // 加载TensorRT引擎文件
401 | bool readTrtFile(const std::string& engineFile, //引擎文件的名字
402 | 	IHostMemory*& trtModelStream)  //TensorRT模型的输出Buffer
403 | {
404 | 	fstream file;
405 | 	cout << "loading filename from:" << engineFile << endl;
406 | 	nvinfer1::IRuntime* trtRuntime;
407 | 	nvonnxparser::IPluginFactory* onnxPlugin = createPluginFactory(gLogger.getTRTLogger()); //
408 | 	file.open(engineFile, ios::binary | ios::in);
409 | 	file.seekg(0, ios::end);
410 | 	int length = file.tellg();
411 | 	//cout << "length:" << length << endl;
412 | 	file.seekg(0, ios::beg);
413 | 	std::unique_ptr<char[]> data(new char[length]);
414 | 	file.read(data.get(), length);
415 | 	file.close();
416 | 	cout << "load engine done" << endl;
417 | 	std::cout << "deserializing" << endl;
418 | 	trtRuntime = createInferRuntime(gLogger.getTRTLogger());
419 | 	ICudaEngine* engine = trtRuntime->deserializeCudaEngine(data.get(), length, onnxPlugin);
420 | 	cout << "deserialize done" << endl;
421 | 	trtModelStream = engine->serialize();
422 | 
423 | 	return true;
424 | }
425 | 
426 | 
427 | // ONNX模型转为TensorRT引擎
428 | bool onnxToTRTModel(const std::string& modelFile, // onnx文件的名字
429 | 	const std::string& filename,  // TensorRT引擎的名字 
430 | 	IHostMemory*& trtModelStream) // output buffer for the TensorRT model
431 | {
432 | 	// 创建builder
433 | 	IBuilder* builder = createInferBuilder(gLogger.getTRTLogger());
434 | 	assert(builder != nullptr);
435 | 	nvinfer1::INetworkDefinition* network = builder->createNetwork();
436 | 
437 | 	if (!builder->platformHasFastInt8()) return false;
438 | 
439 | 	// 解析ONNX模型
440 | 	auto parser = nvonnxparser::createParser(*network, gLogger.getTRTLogger());
441 | 
442 | 
443 | 	//可选的 - 取消下面的注释可以查看网络中每层的详细信息
444 | 	//config->setPrintLayerInfo(true);
445 | 	//parser->reportParsingInfo();
446 | 
447 | 	//判断是否成功解析ONNX模型
448 | 	if (!parser->parseFromFile(modelFile.c_str(), static_cast<int>(gLogger.getReportableSeverity())))
449 | 	{
450 | 		gLogError << "Failure while parsing ONNX file" << std::endl;
451 | 		return false;
452 | 	}
453 | 
454 | 
455 | 	// 建立推理引擎
456 | 	builder->setMaxBatchSize(BATCH_SIZE);
457 | 	builder->setMaxWorkspaceSize(1 << 30);
458 | 
459 | 	nvinfer1::int8EntroyCalibrator *calibrator = nullptr;
460 | 	if (calibFile.size()>0) calibrator = new nvinfer1::int8EntroyCalibrator(BATCH_SIZE, calibFile, "F:/TensorRT-6.0.1.5/data/v3tiny/calib.table");
461 | 
462 | 
463 | 	//builder->setFp16Mode(true);
464 | 	std::cout << "setInt8Mode" << std::endl;
465 | 	if (!builder->platformHasFastInt8())
466 | 		std::cout << "Notice: the platform do not has fast for int8" << std::endl;
467 | 	builder->setInt8Mode(true);
468 | 	builder->setInt8Calibrator(calibrator);
469 | 	/*if (gArgs.runInInt8)
470 | 	{
471 | 	samplesCommon::setAllTensorScales(network, 127.0f, 127.0f);
472 | 	}*/
473 | 	//samplesCommon::setAllTensorScales(network, 1.0f, 1.0f);
474 | 	cout << "start building engine" << endl;
475 | 	ICudaEngine* engine = builder->buildCudaEngine(*network);
476 | 	cout << "build engine done" << endl;
477 | 	assert(engine);
478 | 	if (calibrator) {
479 | 		delete calibrator;
480 | 		calibrator = nullptr;
481 | 	}
482 | 	// 销毁模型解释器
483 | 	parser->destroy();
484 | 
485 | 	// 序列化引擎
486 | 	trtModelStream = engine->serialize();
487 | 
488 | 	// 保存引擎
489 | 	nvinfer1::IHostMemory* data = engine->serialize();
490 | 	std::ofstream file;
491 | 	file.open(filename, std::ios::binary | std::ios::out);
492 | 	cout << "writing engine file..." << endl;
493 | 	file.write((const char*)data->data(), data->size());
494 | 	cout << "save engine file done" << endl;
495 | 	file.close();
496 | 
497 | 	// 销毁所有相关的东西
498 | 	engine->destroy();
499 | 	network->destroy();
500 | 	builder->destroy();
501 | 
502 | 	return true;
503 | }
504 | 
505 | inline int64_t volume(const nvinfer1::Dims& d)
506 | {
507 | 	return std::accumulate(d.d, d.d + d.nbDims, 1, std::multiplies<int64_t>());
508 | }
509 | 
510 | inline unsigned int getElementSize(nvinfer1::DataType t)
511 | {
512 | 	switch (t)
513 | 	{
514 | 	case nvinfer1::DataType::kINT32: return 4;
515 | 	case nvinfer1::DataType::kFLOAT: return 4;
516 | 	case nvinfer1::DataType::kHALF: return 2;
517 | 	case nvinfer1::DataType::kINT8: return 1;
518 | 	}
519 | 	throw std::runtime_error("Invalid DataType.");
520 | 	return 0;
521 | }
522 | 
523 | //执行前向推理
524 | void doInferenceFrieza(IHostMemory* trtModelStream)
525 | {
526 | 	//get engine
527 | 	assert(trtModelStream != nullptr);
528 | 	IRuntime* runtime = createInferRuntime(gLogger);
529 | 	nvonnxparser::IPluginFactory* onnxPlugin = createPluginFactory(gLogger.getTRTLogger());
530 | 	assert(runtime != nullptr);
531 | 	if (gArgs.useDLACore >= 0)
532 | 	{
533 | 		runtime->setDLACore(gArgs.useDLACore);
534 | 	}
535 | 	ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream->data(), trtModelStream->size(), onnxPlugin);
536 | 
537 | 	// 创建推理引擎
538 | 	assert(engine != nullptr);
539 | 	trtModelStream->destroy();
540 | 	IExecutionContext* context = engine->createExecutionContext();
541 | 	assert(context != nullptr);
542 | 
543 | 	//读取输入数据到缓冲区管理对象中
544 | 	assert(engine->getNbBindings() == 3);
545 | 	void* buffers[3];
546 | 	std::vector<int64_t> bufferSize;
547 | 	int nbBindings = engine->getNbBindings();
548 | 	bufferSize.resize(nbBindings);
549 | 
550 | 	for (int i = 0; i < nbBindings; ++i)
551 | 	{
552 | 		nvinfer1::Dims dims = engine->getBindingDimensions(i);
553 | 		nvinfer1::DataType dtype = engine->getBindingDataType(i);
554 | 		int64_t totalSize = volume(dims) * 1 * getElementSize(dtype);
555 | 		bufferSize[i] = totalSize;
556 | 		CHECK(cudaMalloc(&buffers[i], totalSize));
557 | 	}
558 | 
559 | 	// 创建CUDA流以执行此推断
560 | 	cudaStream_t stream;
561 | 	CHECK(cudaStreamCreate(&stream));
562 | 
563 | 	//define inputImgs inputData outputDetections ...
564 | 	//vector<float> inputData;
565 | 	//inputData.reserve(DETECT_HEIGHT*DETECT_WIDTH*INPUT_CHANNEL*BATCH_SIZE);
566 | 	vector<cv::Mat> inputImgs;
567 | 	vector<DetectionRes> outputs;
568 | 	int outSize1 = bufferSize[1] / sizeof(float);
569 | 	int outSize2 = bufferSize[2] / sizeof(float);
570 | 	float* out1 = new float[outSize1];
571 | 	float* out2 = new float[outSize2];
572 | 
573 | 	int index = 1,
574 | 		batchCount = 0;
575 | 
576 | 	cv::Mat img = cv::imread("F:/TensorRT-6.0.1.5/data/v3tiny/1.jpg");
577 | 	inputImgs.push_back(img);
578 | 	auto t_start_pre = std::chrono::high_resolution_clock::now();
579 | 	vector<float> curInput = prepareImage(img);
580 | 	auto t_end_pre = std::chrono::high_resolution_clock::now();
581 | 	float total_pre = std::chrono::duration<float, std::milli>(t_end_pre - t_start_pre).count();
582 | 	std::cout << "prepare image take: " << total_pre << " ms." << endl;
583 | 
584 | 	/*
585 | 	inputData.insert(inputData.end(), curInput.begin(), curInput.end());
586 | 	batchCount++;
587 | 	if (batchCount < BATCH_SIZE && i + 1 < fileNames.size())
588 | 	continue;
589 | 	*/
590 | 
591 | 	// DMA the input to the GPU,  execute the batch asynchronously, and DMA it back:
592 | 	// 将数据从主机输入缓冲区异步复制到设备输入缓冲区
593 | 	CHECK(cudaMemcpyAsync(buffers[0], curInput.data(), bufferSize[0], cudaMemcpyHostToDevice, stream));
594 | 
595 | 	// 执行推理
596 | 
597 | 	auto t_start = std::chrono::high_resolution_clock::now();
598 | 	for (int i = 0; i < 20; i++) {
599 | 		context->execute(BATCH_SIZE, buffers);
600 | 	}
601 | 	auto t_end = std::chrono::high_resolution_clock::now();
602 | 	float total = std::chrono::duration<float, std::milli>(t_end - t_start).count();
603 | 	std::cout << "Inference take: " << total / 20.0 << " ms." << endl;
604 | 
605 | 	CHECK(cudaMemcpyAsync(out1, buffers[1], bufferSize[1], cudaMemcpyDeviceToHost, stream));
606 | 	CHECK(cudaMemcpyAsync(out2, buffers[2], bufferSize[2], cudaMemcpyDeviceToHost, stream));
607 | 	cudaStreamSynchronize(stream);
608 | 
609 | 	float* out = new float[outSize1 + outSize2];
610 | 	out = merge(out1, out2, outSize1, outSize2);
611 | 
612 | 	// postprocess
613 | 	auto t_start_post = std::chrono::high_resolution_clock::now();
614 | 	auto boxes = postProcess(img, out);
615 | 	auto t_end_post = std::chrono::high_resolution_clock::now();
616 | 	float total_post = std::chrono::duration<float, std::milli>(t_end_post - t_start_post).count();
617 | 	std::cout << "Postprocess take: " << total_post << " ms." << endl;
618 | 
619 | 	//print boxes
620 | 	for (int i = 0; i < boxes.size(); ++i)
621 | 	{
622 | 		cout << boxes[i].prob << ", " << boxes[i].x << ", " << boxes[i].y << ", " << boxes[i].w << ", " << boxes[i].h << endl;
623 | 		int x = boxes[i].x,
624 | 			y = boxes[i].y,
625 | 			w = boxes[i].w,
626 | 			h = boxes[i].h;
627 | 		cv::Rect rect = { x, y, w, h };
628 | 		cv::rectangle(img, rect, cv::Scalar(255, 255, 0), 2);
629 | 	}
630 | 
631 | 	cout << "\n" << endl;
632 | 
633 | 
634 | 	// release the stream and the buffers
635 | 	cudaStreamDestroy(stream);
636 | 	CHECK(cudaFree(buffers[0]));
637 | 	CHECK(cudaFree(buffers[1]));
638 | 	CHECK(cudaFree(buffers[2]));
639 | 
640 | 	// destroy the engine
641 | 	context->destroy();
642 | 	engine->destroy();
643 | 	runtime->destroy();
644 | 
645 | 	cv::imshow("result", img);
646 | 	waitKey(0);
647 | 
648 | }
649 | 
650 | int main()
651 | {
652 | 	// read imgs list
653 | 
654 | 	// create a TensorRT model from the onnx model and serialize it to a stream
655 | 	IHostMemory* trtModelStream{ nullptr };
656 | 
657 | 	// create and load engine
658 | 	fstream existEngine;
659 | 	existEngine.open(engineFile, ios::in);
660 | 	if (existEngine)
661 | 	{
662 | 		readTrtFile(engineFile, trtModelStream);
663 | 		assert(trtModelStream != nullptr);
664 | 	}
665 | 	else
666 | 	{
667 | 		onnxToTRTModel(onnxFile, engineFile, trtModelStream);
668 | 		assert(trtModelStream != nullptr);
669 | 	}
670 | 
671 | 	//onnxToTRTModel(onnxFile, engineFile, trtModelStream);
672 | 	gArgs.runInInt8 = true;
673 | 	//do inference
674 | 	doInferenceFrieza(trtModelStream);
675 | 
676 | 	return 0;
677 | }


--------------------------------------------------------------------------------