├── 1_trt_base ├── trt_yolox │ ├── readme.md │ ├── cpp │ │ ├── CMakeLists.txt │ │ └── yolox_end2end.cpp │ └── py │ │ ├── tools.py │ │ ├── trt.py │ │ └── trt_end2end.py ├── readme.md ├── trt_plugin │ ├── demo01 │ │ ├── CMakeLists.txt │ │ ├── demo01.h │ │ ├── test.py │ │ └── demo01.cu │ ├── 新建插件.md │ └── yolox_end2end │ │ ├── CMakeLists.txt │ │ └── end2end.md ├── trt_demo │ ├── trt_cpp │ │ ├── CMakeLists.txt │ │ └── main.cpp │ └── trt_py │ │ ├── model2onnx.py │ │ └── trt_python.py └── trt_rtdetr │ ├── readme.md │ ├── rtdetr_onnx.py │ └── rtdetr_trt.py ├── .gitmodules ├── 3_faster_ncnn ├── img │ └── 000026.jpg ├── readme.md ├── CMakeLists.txt └── src │ ├── apps │ ├── yolo │ │ ├── yolo.h │ │ └── yolo.cpp │ ├── common.h │ └── common.cpp │ ├── base │ ├── tools.hpp │ └── infer_base.hpp │ └── main.cpp ├── 2_faster_tensorrt ├── inference │ ├── 1.jpg │ └── 2.jpg ├── sources │ ├── ori.jpg │ ├── 2streamv1.jpg │ ├── 2streamv2.jpg │ ├── ori_queue.jpg │ └── 2steam_overview.jpg ├── src │ ├── eval │ │ ├── get_imgid_txt.py │ │ ├── save.hpp │ │ ├── eval.py │ │ └── eval.cpp │ ├── onnx_model │ │ ├── rtdetr_sim_export_trt.py │ │ └── v8onnx_tranpose.py │ ├── apps │ │ ├── rtdetr │ │ │ ├── rtdetr.h │ │ │ └── rtdetr.cpp │ │ ├── common.hpp │ │ └── yolo │ │ │ └── yolo.h │ ├── base │ │ ├── trt_base.hpp │ │ ├── monopoly_accocator.hpp │ │ ├── trt_base.cpp │ │ ├── tools.hpp │ │ ├── memory_tensor.hpp │ │ ├── infer_base.cpp │ │ └── infer_base.hpp │ └── kernels │ │ └── cuda_kernel.cuh ├── CMakeLists.txt └── readme.md ├── .gitignore ├── LICENSE ├── README.md └── .clang-format /1_trt_base/trt_yolox/readme.md: -------------------------------------------------------------------------------- 1 | yolox的推理。我没有记错的话focus被我换成了普通卷积更好导出和优化。 2 | 别的无非就是要注意后处理是不是要在head中增加,其实都是比较简单的。 3 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "4_faster_rknn"] 2 | path = 4_faster_rknn 3 | url = https://github.com/0zzx0/zzx_rknn.git 4 | -------------------------------------------------------------------------------- /3_faster_ncnn/img/000026.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0zzx0/faster_deployment/HEAD/3_faster_ncnn/img/000026.jpg -------------------------------------------------------------------------------- /2_faster_tensorrt/inference/1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0zzx0/faster_deployment/HEAD/2_faster_tensorrt/inference/1.jpg -------------------------------------------------------------------------------- /2_faster_tensorrt/inference/2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0zzx0/faster_deployment/HEAD/2_faster_tensorrt/inference/2.jpg -------------------------------------------------------------------------------- /2_faster_tensorrt/sources/ori.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0zzx0/faster_deployment/HEAD/2_faster_tensorrt/sources/ori.jpg -------------------------------------------------------------------------------- /2_faster_tensorrt/sources/2streamv1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0zzx0/faster_deployment/HEAD/2_faster_tensorrt/sources/2streamv1.jpg 
-------------------------------------------------------------------------------- /2_faster_tensorrt/sources/2streamv2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0zzx0/faster_deployment/HEAD/2_faster_tensorrt/sources/2streamv2.jpg -------------------------------------------------------------------------------- /2_faster_tensorrt/sources/ori_queue.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0zzx0/faster_deployment/HEAD/2_faster_tensorrt/sources/ori_queue.jpg -------------------------------------------------------------------------------- /2_faster_tensorrt/sources/2steam_overview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0zzx0/faster_deployment/HEAD/2_faster_tensorrt/sources/2steam_overview.jpg -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/ 2 | 3 | # c++编译 4 | **/build/* 5 | 6 | # 模型权重及转换中间文件 动态链接库文件 7 | *.engine 8 | *.onnx 9 | *.so 10 | 11 | *.bin 12 | *.param 13 | 14 | # 备份 15 | 2_faster_tensorrt/src/un_used_code.cpp 16 | 17 | # 本地more测试文件夹 18 | 5_yolov8/ 19 | 6_trt_more/ -------------------------------------------------------------------------------- /1_trt_base/readme.md: -------------------------------------------------------------------------------- 1 | # TensorRT 2 | 3 | > 测试平台: i9-9900K + 2080Ti + 32G + Ubuntu18.04 + cuda10.2 + cudnn8.7 + trt8.5.3 4 | 5 | 1. `trt_demo`: 使用python api和c++ api进行trt模型转换和推理的demo。 6 | 2. `trt_plugin`: trt增加自定义plugin的基本demo,以及yolox的nms过程采用trt nms plugin的使用方法。 7 | 3. `trt_yolox`: 采用python和c++ 推理yolox的demo。 8 | 4. 
`trt_rtdetr`: paddlepaddle版本rtdetr的trt转换和推理。 9 | -------------------------------------------------------------------------------- /2_faster_tensorrt/src/eval/get_imgid_txt.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | with open('/home/zzx/Experiment/Data/UTDAC2020/annotations/instances_val2017.json', 'r') as f: 4 | coco_data = json.load(f) 5 | 6 | # 获取标签信息 7 | images = coco_data['images'] 8 | print('图片数量:', len(images)) 9 | 10 | with open('img_id.txt', 'w') as f: 11 | for image in images: 12 | f.write(f"{image['id']} {image['file_name']}\n") -------------------------------------------------------------------------------- /3_faster_ncnn/readme.md: -------------------------------------------------------------------------------- 1 | # Faster_NCNN 2 | 3 | 本仓库在ncnn上的推理加速是在有限,由于我主要是在cpu上运行ncnn,所以预处理相对整个推理占比极小,吞吐量提速不是很明显。但是接口依然是十分简单易用,并且相对容易扩展。 4 | 5 | ## 模型转换 6 | 7 | ncnn模型转换可以参考[ncnn仓库](https://github.com/Tencent/ncnn) 8 | 9 | ## 模型推理 10 | 11 | ```cpp 12 | // 创建模型 13 | auto yolo = YoloNCNN::create_infer(param_path, model_path, confidence_threshold, nms_threshold); 14 | 15 | // 推理图片 16 | auto objs = yolo->commit(image); 17 | 18 | // 得到结果 19 | auto res = objs.get(); 20 | ``` 21 | 22 | 23 | -------------------------------------------------------------------------------- /2_faster_tensorrt/src/onnx_model/rtdetr_sim_export_trt.py: -------------------------------------------------------------------------------- 1 | import os 2 | import onnx_graphsurgeon as gs 3 | import onnx 4 | 5 | # paddle上的修改可以参考这位大佬的文章 6 | # https://zhuanlan.zhihu.com/p/623794029 7 | 8 | model = onnx.load("./rtdetr_r18vd_6x_coco.onnx") 9 | graph = gs.import_onnx(model) 10 | graph.outputs[0].name = "output" 11 | # print(graph.outputs) 12 | 13 | onnx.save(gs.export_onnx(graph), "rtdetr_r18vd_6x_coco_output.onnx") 14 | 15 | os.system("onnxsim rtdetr_r18vd_6x_coco_output.onnx rtdetr_r18vd_6x_coco_output_sim.onnx") 16 | 17 | os.system("trtexec --onnx=./rtdetr_r18vd_6x_coco_output_sim.onnx --workspace=4096 --shapes=image:1x3x640x640 --saveEngine=rtdetr_r18vd_6x_coco.trt --fp16") 18 | 19 | -------------------------------------------------------------------------------- /1_trt_base/trt_plugin/demo01/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.10) 2 | 3 | set(CMAKE_CXX_STANDARD 11) 4 | set(CMAKE_BUILD_TYPE Debug) 5 | 6 | project(trt_cpp LANGUAGES CXX CUDA) 7 | 8 | # tensorrt 9 | include_directories(/home/zzx/Github/zzx_yolo/EXTRA_PKG/TensorRT-8.5.3.1/include) 10 | link_directories(/home/zzx/Github/zzx_yolo/EXTRA_PKG/TensorRT-8.5.3.1/lib) 11 | 12 | # add CUDA 13 | find_package(CUDA REQUIRED) 14 | message("CUDA_LIBRARIES:${CUDA_LIBRARIES}") 15 | message("CUDA_INCLUDE_DIRS:${CUDA_INCLUDE_DIRS}") 16 | include_directories(${CUDA_INCLUDE_DIRS}) 17 | 18 | 19 | SET(LIBHELLO_SRC demo01.cu demo01.h cookbookHelper.cuh) 20 | ADD_LIBRARY(demo01 SHARED ${LIBHELLO_SRC}) 21 | 22 | # link 23 | target_link_libraries(demo01 nvinfer ${CUDA_LIBRARIES}) -------------------------------------------------------------------------------- /1_trt_base/trt_demo/trt_cpp/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.10) 2 | 3 | set(CMAKE_CXX_STANDARD 11) 4 | set(CMAKE_BUILD_TYPE Debug) 5 | 6 | project(trt_cpp LANGUAGES CXX CUDA) 7 | 8 | # tensorrt 9 | include_directories(/home/zzx/Github/zzx_yolo/EXTRA_PKG/TensorRT-8.5.3.1/include) 
10 | link_directories(/home/zzx/Github/zzx_yolo/EXTRA_PKG/TensorRT-8.5.3.1/lib) 11 | 12 | # add CUDA 13 | find_package(CUDA REQUIRED) 14 | message("CUDA_LIBRARIES:${CUDA_LIBRARIES}") 15 | message("CUDA_INCLUDE_DIRS:${CUDA_INCLUDE_DIRS}") 16 | include_directories(${CUDA_INCLUDE_DIRS}) 17 | 18 | add_executable(main main.cpp cookbookHelper.cuh) 19 | # link 20 | # target_link_libraries(main ${LIBS} ${CUDA_LIBRARIES}) 21 | target_link_libraries(main nvinfer nvonnxparser ${CUDA_LIBRARIES}) 22 | -------------------------------------------------------------------------------- /3_faster_ncnn/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.0) 2 | 3 | project(yolo_ncnn) 4 | 5 | set(CMAKE_CXX_STANDARD 11) 6 | set(CMAKE_BUILD_TYPE Debug) 7 | 8 | find_package(OpenCV REQUIRED) 9 | find_package(ncnn REQUIRED) 10 | 11 | include_directories(${OpenCV_INCLUDE_DIRS}) 12 | include_directories(${PROJECT_SOURCE_DIR}/src) 13 | 14 | include_directories(/home/zzx/Github/zzx_yolo/EXTRA_PKG/ncnn/build/install/include/ncnn) 15 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -O0 -Wfatal-errors -pthread -w -g") 16 | 17 | add_executable(yolox_ncnn 18 | ${PROJECT_SOURCE_DIR}/src/main.cpp 19 | ${PROJECT_SOURCE_DIR}/src/apps/common.cpp 20 | ${PROJECT_SOURCE_DIR}/src/apps/yolo/yolo.cpp 21 | ) 22 | target_link_libraries(yolox_ncnn ${OpenCV_LIBS} ncnn pthread) 23 | -------------------------------------------------------------------------------- /2_faster_tensorrt/src/eval/save.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | class SaveResult { 8 | public: 9 | SaveResult(std::string &filename) { 10 | out.open(filename); 11 | } 12 | ~SaveResult() { 13 | if(out.is_open()) { 14 | out.close(); 15 | } 16 | } 17 | 18 | void save_one_line(std::string &img_name, std::string & image_id, int category_id, float score, std::vector &result) { 19 | if(out.is_open()) { 20 | out << img_name << " "<< image_id << " "<< category_id << " " << score << " "; 21 | for(auto &i : result) { 22 | out << i << " "; 23 | } 24 | out << "\n"; 25 | } 26 | } 27 | 28 | private: 29 | std::ofstream out; 30 | }; 31 | 32 | 33 | -------------------------------------------------------------------------------- /2_faster_tensorrt/src/onnx_model/v8onnx_tranpose.py: -------------------------------------------------------------------------------- 1 | import onnx 2 | import numpy as np 3 | import onnx_graphsurgeon as gs 4 | 5 | 6 | """ 7 | pip install nvidia-pyindex 8 | pip install onnx-graphsurgeon 9 | 10 | """ 11 | 12 | model = onnx.load("./onnx_weights/v8n.onnx") 13 | graph = gs.import_onnx(model) 14 | 15 | # graph.outputs[0].name = "output" 16 | old_shape = graph.outputs[0].shape 17 | output_tensort = gs.Variable("output", graph.outputs[0].dtype, [old_shape[0], old_shape[2], old_shape[1]] ) 18 | 19 | graph.nodes[-1].outputs[0].name = "oldoutput" 20 | 21 | reshape_node = gs.Node( 22 | op="Transpose", 23 | name="outputtranspose", 24 | inputs=[graph.nodes[-1].outputs[0]], 25 | outputs=[output_tensort], 26 | attrs={"perm": [0, 2, 1]} 27 | ) 28 | 29 | # print(type(graph.nodes)) # list 30 | graph.nodes.append(reshape_node) 31 | 32 | 33 | graph.outputs = reshape_node.outputs 34 | for node in graph.outputs: 35 | print(node) 36 | 37 | 38 | graph.cleanup().toposort() 39 | onnx.save(gs.export_onnx(graph), "v8_transpose.onnx") 40 | 41 | 
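# ---------------------------------------------------------------------------
# 补充示意(非原脚本内容,仅供参考):转换完成后简单验证一下新输出是否符合预期。
# 假设环境里装了 onnxruntime,且模型输入为 1x3x640x640,与实际模型不符请自行调整。
import onnxruntime as ort

sess = ort.InferenceSession("v8_transpose.onnx", providers=["CPUExecutionProvider"])
inp = sess.get_inputs()[0]
out_meta = sess.get_outputs()[0]
print("input :", inp.name, inp.shape)
print("output:", out_meta.name, out_meta.shape)  # 期望为 output, [1, N, 4+类别数]

dummy = np.random.rand(1, 3, 640, 640).astype(np.float32)  # np 在文件开头已导入
res = sess.run(None, {inp.name: dummy})[0]
print("runtime output shape:", res.shape)  # 最后一维是 4+类别数, 说明 Transpose 已生效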
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 zzx_ncepu_bit 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /1_trt_base/trt_rtdetr/readme.md: -------------------------------------------------------------------------------- 1 | # RT-DETR的tensorrt转换 2 | 3 | 百度家的这个新模型是真不错,尤其是出了r18的可以类比yolo系列的s模型了。 4 | 5 | ## paddle infer 6 | ```shell 7 | python tools/infer.py -c configs/rtdetr/rtdetr_r18vd_6x_coco.yml \ 8 | -o weights=0zzx/rtdetr_r18vd_dec3_6x_coco.pdparams \ 9 | --infer_img=./demo/000000570688.jpg 10 | ``` 11 | 12 | 13 | ## paddle onnx 14 | paddlepaddle-gpu需要大于2.4.1要不报错。 15 | 首先需要先导出 16 | ```shell 17 | python tools/export_model.py -c configs/rtdetr/rtdetr_r18vd_6x_coco.yml \ 18 | -o weights=rtdetr_r18vd_6x_coco.pdparams trt=True \ 19 | --output_dir=output_inference 20 | ``` 21 | 然后转成onnx 22 | ```shell 23 | paddle2onnx --model_dir=rtdetr_r18vd_6x_coco \ 24 | --model_filename model.pdmodel \ 25 | --params_filename model.pdiparams \ 26 | --opset_version 16 \ 27 | --save_file rtdetr_r18vd_6x_coco.onnx 28 | ``` 29 | 30 | ## trt转换 31 | ```shell 32 | trtexec --onnx=./rtdetr_r18vd_6x_coco.onnx \ 33 | --workspace=4096 \ 34 | --shapes=image:1x3x640x640 \ 35 | --saveEngine=rtdetr_r18vd_6x_coco.trt \ 36 | --avgRuns=100 \ 37 | --fp16 38 | ``` -------------------------------------------------------------------------------- /1_trt_base/trt_plugin/新建插件.md: -------------------------------------------------------------------------------- 1 | # Tensorrt 插件 2 | 3 | 4 | ## 从registry加载plugin 5 | 6 | ```c++ 7 | // 从注册器根据名字和版本找到需要的plugin 8 | auto creator = getPluginRegistry()->getPluginCreator(pluginName, pluginVersion); 9 | const PluginFieldCollection* pluginFC = creator->getFieldNames(); 10 | 11 | // Populate the fields parameters for the plugin layer 12 | // PluginFieldCollection *pluginData = parseAndFillFields(pluginFC, layerFields); 13 | 14 | // 使用layer和data创建对象 15 | IPluginV2 *pluginObj = creator->createPlugin(layerName, pluginData); // 内部有new 申请 必须destory 16 | 17 | // 增加这一层到网络里面 18 | auto layer = network.addPluginV2(&inputs[0], int(inputs.size()), pluginObj); 19 | … (build rest of the network and serialize engine) 20 | // Destroy the plugin object 21 | pluginObj->destroy() 22 | … (free allocated pluginData) 23 | ``` 24 | 
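python 侧的流程类似:先用 ctypes 把编译好的插件动态库加载进来,再从 registry 拿到 creator 创建插件。下面是一个最小示意(以 demo01 的 `ZZX_ADDScalar` 为例;`./build/libdemo01.so` 的路径和 `scalar` 字段名是按 CMakeLists 和 demo01.h 推测的,实际以 creator 的实现为准):

```python
import ctypes
import numpy as np
import tensorrt as trt

# 加载自定义插件的动态库(路径为示意,按实际编译产物修改)
ctypes.CDLL("./build/libdemo01.so")

logger = trt.Logger(trt.Logger.ERROR)
trt.init_libnvinfer_plugins(logger, "")  # 注册内置插件和已加载的自定义插件

# 根据名字和版本找到 creator,然后带参数创建 plugin 对象
creator = trt.get_plugin_registry().get_plugin_creator("ZZX_ADDScalar", "1", "")
field = trt.PluginField("scalar", np.array([1.0], dtype=np.float32), trt.PluginFieldType.FLOAT32)
plugin = creator.create_plugin("zzx_add_scalar", trt.PluginFieldCollection([field]))

# 之后与上面 C++ 的写法一样,用 network.add_plugin_v2([input_tensor], plugin) 把它加到网络里
```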
25 | 26 | 从一个parser解析出来的模型,加载插件,ONNX解析器会自动尝试将无法识别的节点作为插件导入。如果在插件注册表中找到与节点具有相同op_type的插件,则解析器将节点的属性作为插件字段参数转发给插件创建者,以便创建插件。默认情况下,解析器使用`1`作为插件版本,使用`""`作为插件命名空间。可以通过在相应的ONNX节点中设置`plugin_version`和`plugin_namespace`字符串属性来覆盖此行为。 27 | 28 | 29 | 30 | 31 | ## 自定义一个plugin 32 | 33 | https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#extending 34 | -------------------------------------------------------------------------------- /2_faster_tensorrt/src/eval/eval.py: -------------------------------------------------------------------------------- 1 | from pycocotools.coco import COCO 2 | from pycocotools.cocoeval import COCOeval 3 | import numpy as np 4 | import json 5 | 6 | 7 | def get_coco_from_txt(txtfile, json_file, clsid2catid): 8 | 9 | dataset_res = [] 10 | 11 | with open(txtfile, 'r') as f: 12 | datas = f.readlines() 13 | # print(len(datas)) 14 | 15 | for data in datas: 16 | info = data.split(" ")[:-1] 17 | result = {} 18 | result["image_id"] = int(info[1]) 19 | result["category_id"] = clsid2catid[int(info[2])] 20 | result["bbox"] = [int(info[4]), int(info[5]), int(info[6]), int(info[7])] 21 | result["score"] = float(info[3]) 22 | dataset_res.append(result) 23 | 24 | with open(json_file, "w") as f: 25 | json.dump(dataset_res, f) 26 | print("json 保存成功") 27 | 28 | 29 | annFile = "/home/zzx/Experiment/Data/UTDAC2020/annotations/instances_val2017.json" 30 | resFile = "./results.txt" 31 | resJson = 'eval_results.json' 32 | 33 | cocoGt=COCO(annFile) 34 | clsid2catid = cocoGt.getCatIds() 35 | 36 | get_coco_from_txt(resFile, resJson, clsid2catid) 37 | cocoDt = cocoGt.loadRes(resJson) 38 | 39 | cocoEval = COCOeval(cocoGt, cocoDt, 'bbox') 40 | cocoEval.params.imgIds = cocoGt.getImgIds() 41 | cocoEval.evaluate() 42 | cocoEval.accumulate() 43 | cocoEval.summarize() 44 | 45 | 46 | -------------------------------------------------------------------------------- /1_trt_base/trt_plugin/yolox_end2end/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.0) 2 | 3 | project(yolox) 4 | 5 | add_definitions(-std=c++11) 6 | 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 8 | set(CMAKE_CXX_STANDARD 11) 9 | set(CMAKE_BUILD_TYPE Debug) 10 | 11 | find_package(CUDA REQUIRED) 12 | 13 | include_directories(${PROJECT_SOURCE_DIR}/include) 14 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different 15 | # cuda 16 | include_directories(/usr/local/cuda-10.2/include) 17 | link_directories(/usr/local/cuda-10.2/lib64) 18 | # cudnn 19 | include_directories(/home/zzx/Github/zzx_yolo/EXTRA_PKG/cudnn8.7.0.84_cuda10/include) 20 | link_directories(/home/zzx/Github/zzx_yolo/EXTRA_PKG/cudnn8.7.0.84_cuda10/lib64) 21 | # tensorrt 22 | include_directories(/home/zzx/Github/zzx_yolo/EXTRA_PKG/TensorRT-8.5.3.1/include) 23 | link_directories(/home/zzx/Github/zzx_yolo/EXTRA_PKG/TensorRT-8.5.3.1/lib) 24 | 25 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED") 26 | 27 | find_package(OpenCV) 28 | include_directories(${OpenCV_INCLUDE_DIRS}) 29 | 30 | add_executable(yolox_end2end ${PROJECT_SOURCE_DIR}/yolox_end2end.cpp) 31 | target_link_libraries(yolox_end2end nvinfer nvinfer_plugin) 32 | target_link_libraries(yolox_end2end cudart) 33 | target_link_libraries(yolox_end2end ${OpenCV_LIBS}) 34 | 35 | add_definitions(-O2 -pthread) 36 | 37 | -------------------------------------------------------------------------------- /3_faster_ncnn/src/apps/yolo/yolo.h: 
-------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "opencv2/opencv.hpp" 4 | 5 | #include "../common.h" 6 | #include "../../base/infer_base.hpp" 7 | #include "../../base/tools.hpp" 8 | 9 | 10 | namespace YoloNCNN{ 11 | 12 | using namespace FasterNCNN; 13 | 14 | 15 | using Infer = InferBase>; 16 | using Det = DetBase>; 17 | // 推理 18 | class InferImpl : public Infer, Det{ 19 | 20 | public: 21 | 22 | 23 | bool startup(const std::string ¶m_path, 24 | const std::string &model_path, 25 | float confidence, float iou_thr); 26 | virtual void worker(std::promise &pro) override; 27 | virtual bool preprocess(Job &job, const cv::Mat &input) override; 28 | 29 | virtual std::shared_future> commit(const cv::Mat &input) override; 30 | 31 | private: 32 | 33 | std::string param_path_; 34 | std::string model_path_; 35 | float confidence_; 36 | float iou_thr_; 37 | 38 | std::shared_ptr postprocess_; 39 | std::vector results_; 40 | 41 | int infer_thread_ = 8; 42 | int class_num = 4; 43 | 44 | }; 45 | 46 | 47 | 48 | std::shared_ptr create_infer(const std::string ¶m_path, 49 | const std::string &model_path, 50 | float confidence, 51 | float iou_thr 52 | ); 53 | 54 | } 55 | -------------------------------------------------------------------------------- /1_trt_base/trt_yolox/cpp/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.0) 2 | 3 | project(yolox) 4 | 5 | add_definitions(-std=c++11) 6 | 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 8 | set(CMAKE_CXX_STANDARD 11) 9 | set(CMAKE_BUILD_TYPE Debug) 10 | 11 | find_package(CUDA REQUIRED) 12 | 13 | include_directories(${PROJECT_SOURCE_DIR}/include) 14 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different 15 | # cuda 16 | include_directories(/usr/local/cuda-10.2/include) 17 | link_directories(/usr/local/cuda-10.2/lib64) 18 | # cudnn 19 | include_directories(/home/zzx/Github/zzx_yolo/EXTRA_PKG/cudnn8.7.0.84_cuda10/include) 20 | link_directories(/home/zzx/Github/zzx_yolo/EXTRA_PKG/cudnn8.7.0.84_cuda10/lib64) 21 | # tensorrt 22 | include_directories(/home/zzx/Github/zzx_yolo/EXTRA_PKG/TensorRT-8.5.3.1/include) 23 | link_directories(/home/zzx/Github/zzx_yolo/EXTRA_PKG/TensorRT-8.5.3.1/lib) 24 | 25 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED") 26 | 27 | find_package(OpenCV) 28 | include_directories(${OpenCV_INCLUDE_DIRS}) 29 | 30 | add_executable(yolox_end2end ${PROJECT_SOURCE_DIR}/yolox_end2end.cpp) 31 | target_link_libraries(yolox_end2end nvinfer nvinfer_plugin) 32 | target_link_libraries(yolox_end2end cudart) 33 | target_link_libraries(yolox_end2end ${OpenCV_LIBS}) 34 | 35 | add_executable(yolox ${PROJECT_SOURCE_DIR}/yolox.cpp) 36 | target_link_libraries(yolox nvinfer) 37 | target_link_libraries(yolox cudart) 38 | target_link_libraries(yolox ${OpenCV_LIBS}) 39 | 40 | add_definitions(-O2 -pthread) 41 | 42 | -------------------------------------------------------------------------------- /3_faster_ncnn/src/apps/common.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include "net.h" 7 | 8 | #include "../base/tools.hpp" 9 | 10 | namespace FasterNCNN { 11 | 12 | 13 | // bboxes 14 | struct ObjBox{ 15 | 16 | float GetWidth() { return (x2 - x1); }; 17 | float GetHeight() { return (y2 - y1); }; 18 | float area() { return GetWidth() * 
GetHeight(); }; 19 | 20 | int x1; 21 | int y1; 22 | int x2; 23 | int y2; 24 | 25 | int category; 26 | float score; 27 | }; 28 | 29 | struct GridAndStride{ 30 | int grid0; 31 | int grid1; 32 | int stride; 33 | }; 34 | 35 | 36 | static float InterSectionArea(const ObjBox &a, const ObjBox &b); 37 | static bool ScoreSort(ObjBox a, ObjBox b); 38 | static void nms(std::vector &src_boxes, std::vector &dst_boxes, float threshold); 39 | 40 | 41 | class postProcess { 42 | 43 | public: 44 | enum class postProcessType : int{ 45 | 46 | yolox = 0, 47 | yolov8 = 1, 48 | }; 49 | 50 | 51 | public: 52 | postProcess(postProcessType type, float input_h, float input_w, float conf_thr, float nms_thr); 53 | ~postProcess() { }; 54 | 55 | void forward(ncnn::Mat &output_); 56 | void yolox_generate_grids_and_stride(); 57 | void yolox_decode(ncnn::Mat &output_); 58 | 59 | 60 | protected: 61 | int input_h_; 62 | int input_w_; 63 | float conf_thr_; 64 | float nms_thr_; 65 | 66 | // std::vector out_boxes; 67 | // std::vector nms_boxes; 68 | 69 | const std::vector strides{8, 16, 32}; 70 | std::vector grid_strides; 71 | 72 | public: 73 | std::vector out_boxes; 74 | std::vector nms_boxes; 75 | 76 | }; 77 | 78 | } 79 | -------------------------------------------------------------------------------- /2_faster_tensorrt/src/apps/rtdetr/rtdetr.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file rtdetr.h 3 | * @author 0zzx0 4 | * @brief RTDETR推理 5 | * @version 0.1 6 | * @date 2023-08-21 7 | * 8 | * @copyright Copyright (c) 2023 9 | * 10 | */ 11 | 12 | #ifndef RTDETR_H 13 | #define RTDETR_H 14 | 15 | #include "../common.hpp" 16 | 17 | namespace RTDETR { 18 | using namespace FasterTRT; 19 | 20 | // 线程安全模板类设置模板类型 21 | using ThreadSafedAsyncInferImpl = 22 | ThreadSafedAsyncInfer, // start param 25 | AffineMatrix // additional 26 | >; 27 | using Infer = InferBase; 28 | 29 | /** 30 | * @brief 推理类的实现,继承必备父类,重写父类方法 31 | * 32 | */ 33 | class RtDetrTRTInferImpl : public Infer, public ThreadSafedAsyncInferImpl { 34 | public: 35 | ~RtDetrTRTInferImpl(); 36 | 37 | virtual bool startup(const std::string &file, int gpuid, int batch_size, 38 | float confidence_threshold); 39 | virtual void worker(std::promise &result) override; 40 | virtual bool preprocess(Job &job, const cv::Mat &image) override; 41 | 42 | virtual std::vector> commits( 43 | const std::vector &images) override; 44 | virtual std::shared_future commit(const cv::Mat &image) override; 45 | 46 | private: 47 | int input_width_ = 0; 48 | int input_height_ = 0; 49 | int gpu_ = 0; 50 | float confidence_threshold_ = 0; 51 | cudaStream_t stream_ = nullptr; 52 | cudaStream_t stream_pro_ = nullptr; 53 | Norm normalize_; 54 | int batch_size_ = 1; 55 | }; 56 | 57 | // 创建推理器 58 | std::shared_ptr create_infer(const std::string &engine_file, int gpuid, int batch_size, 59 | float confidence_threshold = 0.2f); 60 | 61 | } // namespace RTDETR 62 | 63 | #endif 64 | -------------------------------------------------------------------------------- /2_faster_tensorrt/src/eval/eval.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "../apps/yolo/yolo.h" 7 | #include "save.hpp" 8 | 9 | using namespace std; 10 | 11 | const string base_path = "/home/zzx/Experiment/Data/UTDAC2020/val2017/"; 12 | YOLO::YoloType type = YOLO::YoloType::X; 13 | const string model_file = "../yolox_b16.engine"; 14 | const int deviceid = 0; 15 | 16 | const float 
confidence_threshold = 0.5f; 17 | const float nms_threshold = 0.65f; 18 | 19 | int main() { 20 | int batch_size = 1; 21 | YOLO::set_device(deviceid); 22 | 23 | auto yolo = YOLO::create_infer(model_file, type, deviceid, batch_size, confidence_threshold, 24 | nms_threshold); 25 | 26 | ifstream img_id("../src/eval/img_id.txt"); 27 | vector all_id; 28 | vector all_img; 29 | 30 | while(!img_id.eof()) { 31 | string id; 32 | string name; 33 | img_id >> id; 34 | img_id >> name; 35 | if(id.size() == 0) break; 36 | 37 | all_id.push_back(id); 38 | all_img.push_back(name); 39 | // cout << id << " " << name << endl; 40 | } 41 | img_id.close(); 42 | 43 | string resfile_name = "../src/eval/results.txt"; 44 | SaveResult resfile(resfile_name); 45 | 46 | assert(all_id.size() == all_img.size()); 47 | 48 | for(int i = 0; i < all_id.size(); i++) { 49 | string cur_img = base_path + all_img[i]; 50 | auto image = cv::imread(cur_img); 51 | auto objs = yolo->commit(image); 52 | auto res = objs.get(); 53 | for(auto& one : res) { 54 | int x = one.left; 55 | int y = one.top; 56 | int w = one.right - one.left; 57 | int h = one.bottom - one.top; 58 | vector xywh{x, y, w, h}; 59 | // cout << one.left << one.right << one.bottom << one.top << endl; 60 | resfile.save_one_line(all_img[i], all_id[i], one.class_label, one.confidence, xywh); 61 | } 62 | } 63 | return 0; 64 | } -------------------------------------------------------------------------------- /2_faster_tensorrt/src/apps/common.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file common.h 3 | * @author 0zzx0 4 | * @brief 5 | * @version 0.1 6 | * @date 2023-08-21 7 | * 8 | * @copyright Copyright (c) 2023 9 | * 10 | */ 11 | 12 | #ifndef COMMON_H 13 | #define COMMON_H 14 | 15 | #include 16 | #include 17 | #include 18 | 19 | #include 20 | #include 21 | #include 22 | 23 | #include "../base/tools.hpp" 24 | #include "../base/trt_base.hpp" 25 | #include "../base/infer_base.hpp" 26 | #include "../base/memory_tensor.hpp" 27 | #include "../base/monopoly_accocator.hpp" 28 | 29 | namespace FasterTRT { 30 | 31 | // 推理结果格式 32 | struct Box { 33 | float left, top, right, bottom, confidence; 34 | int class_label; 35 | 36 | Box() = default; 37 | Box(float left, float top, float right, float bottom, float confidence, int class_label) 38 | : left(left), 39 | top(top), 40 | right(right), 41 | bottom(bottom), 42 | confidence(confidence), 43 | class_label(class_label) {} 44 | }; 45 | typedef std::vector BoxArray; 46 | 47 | // 仿射变换矩阵 48 | struct AffineMatrix { 49 | float i2d[6]; // image to dst(network), 2x3 matrix 50 | float d2i[6]; // dst to image, 2x3 matrix 51 | 52 | void compute(const cv::Size &from, const cv::Size &to) { 53 | float scale_x = to.width / (float)from.width; 54 | float scale_y = to.height / (float)from.height; 55 | float scale = std::min(scale_x, scale_y); 56 | i2d[0] = scale; 57 | i2d[1] = 0; 58 | i2d[2] = -scale * from.width * 0.5 + to.width * 0.5 + scale * 0.5 - 0.5; 59 | i2d[3] = 0; 60 | i2d[4] = scale; 61 | i2d[5] = -scale * from.height * 0.5 + to.height * 0.5 + scale * 0.5 - 0.5; 62 | 63 | cv::Mat m2x3_i2d(2, 3, CV_32F, i2d); 64 | cv::Mat m2x3_d2i(2, 3, CV_32F, d2i); 65 | cv::invertAffineTransform(m2x3_i2d, m2x3_d2i); 66 | } 67 | 68 | cv::Mat i2d_mat() { return cv::Mat(2, 3, CV_32F, i2d); } 69 | }; 70 | 71 | } // namespace FasterTRT 72 | 73 | #endif -------------------------------------------------------------------------------- /2_faster_tensorrt/src/base/trt_base.hpp: 
-------------------------------------------------------------------------------- 1 | /** 2 | * @file trt_base.hpp 3 | * @author 0zzx0 4 | * @brief trt base 5 | * @version 0.1 6 | * @date 2023-6-11 2023-8-21 7 | * 8 | * @copyright Copyright (c) 2023 9 | * 10 | */ 11 | 12 | #ifndef TRT_BASE_H 13 | #define TRT_BASE_H 14 | 15 | #include 16 | 17 | #include "memory_tensor.hpp" 18 | #include "monopoly_accocator.hpp" 19 | #include "infer_base.hpp" 20 | 21 | namespace FasterTRT { 22 | 23 | // 推理数据类型 24 | enum class Mode : int { FP32, FP16, INT8 }; 25 | const char* mode_string(Mode type); 26 | 27 | ////////////////////量化用的/////////////////////////// 28 | typedef std::function& files, 29 | std::shared_ptr& tensor)> 30 | Int8Process; 31 | 32 | /** 33 | * @brief int8 量化 未测试 34 | * 35 | */ 36 | class Int8EntropyCalibrator : public IInt8EntropyCalibrator2 { 37 | public: 38 | Int8EntropyCalibrator(const std::vector& imagefiles, nvinfer1::Dims dims, 39 | const Int8Process& preprocess); 40 | Int8EntropyCalibrator(const std::vector& entropyCalibratorData, nvinfer1::Dims dims, 41 | const Int8Process& preprocess); 42 | virtual ~Int8EntropyCalibrator(); 43 | 44 | int getBatchSize() const noexcept; 45 | bool next(); 46 | bool getBatch(void* bindings[], const char* names[], int nbBindings) noexcept; 47 | 48 | const std::vector& getEntropyCalibratorData(); 49 | const void* readCalibrationCache(size_t& length) noexcept; 50 | virtual void writeCalibrationCache(const void* cache, size_t length) noexcept; 51 | 52 | private: 53 | Int8Process preprocess_; 54 | std::vector allimgs_; 55 | size_t batchCudaSize_ = 0; 56 | int cursor_ = 0; 57 | nvinfer1::Dims dims_; 58 | std::vector files_; 59 | std::shared_ptr tensor_; 60 | std::vector entropyCalibratorData_; 61 | bool fromCalibratorData_ = false; 62 | cudaStream_t stream_ = nullptr; 63 | }; 64 | 65 | // 检索目录下的所有图像:"*.jpg;*.png;*.bmp;*.jpeg;*.tiff" 66 | std::vector glob_image_files(const std::string& directory); 67 | 68 | } // namespace FasterTRT 69 | 70 | #endif 71 | -------------------------------------------------------------------------------- /1_trt_base/trt_demo/trt_py/model2onnx.py: -------------------------------------------------------------------------------- 1 | import tensorrt as trt 2 | import torch 3 | import torch.nn as nn 4 | import onnx 5 | 6 | class MyModule(nn.Module): 7 | def __init__(self) -> None: 8 | super().__init__() 9 | self.conv = nn.Conv2d(in_channels=3,out_channels=3,kernel_size=3,stride=1,padding=1) 10 | self.bn = nn.BatchNorm2d(3) 11 | self.act = nn.ReLU(inplace=True) 12 | self.pool = nn.MaxPool2d(2, 2) 13 | 14 | def forward(self, x: torch.Tensor) -> torch.Tensor: 15 | x = self.act(self.bn(self.conv(x))) 16 | return self.pool(x) 17 | 18 | 19 | device = torch.device('cuda:0') 20 | onnx_model_name = '../files/model.onnx' 21 | torch.onnx.export(MyModule(), 22 | torch.randn(1, 3, 224, 224), 23 | onnx_model_name, 24 | input_names=['input'], 25 | output_names=['output'], 26 | opset_version=11) 27 | 28 | 29 | def ger_engine(): 30 | torch.onnx.export(MyModule(), torch.randn(1, 3, 112, 112), onnx_model_name, input_names=['input'], 31 | output_names=['output'], opset_version=11) 32 | 33 | onnx_model = onnx.load(onnx_model_name) 34 | 35 | logger = trt.Logger(trt.Logger.ERROR) 36 | builder = trt.Builder(logger) 37 | # EXPLICIT_BATCH 显式batch 38 | EXPLICIT_BATCH = 1 << (int)( 39 | trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) 40 | network = builder.create_network(EXPLICIT_BATCH) # 创建network 41 | 42 | parser = trt.OnnxParser(network, logger) # 解析onnx 43 
| 44 | if not parser.parse(onnx_model.SerializePartialToString()): 45 | error_mags = ' ' 46 | for error in range(parser.num_errors): 47 | error_mags += error 48 | raise RuntimeError(f"解析失败辣: {error_mags}") 49 | 50 | config = builder.create_builder_config() 51 | config.max_workspace_size = 1 << 20 52 | profile = builder.create_optimization_profile() 53 | 54 | profile.set_shape('input', [1,3 ,112 ,112],[1,3 ,112 ,112],[1,3 ,112 ,112]) 55 | config.add_optimization_profile(profile) 56 | # create engine 57 | with torch.cuda.device(device): 58 | engine = builder.build_engine(network, config) 59 | 60 | with open('model.engine', mode='wb') as f: 61 | f.write(bytearray(engine.serialize())) 62 | print("generating file done!") 63 | 64 | 65 | -------------------------------------------------------------------------------- /2_faster_tensorrt/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.0) 2 | 3 | project(yolo_trt) 4 | 5 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 6 | set(CMAKE_CXX_STANDARD 11) 7 | set(CMAKE_BUILD_TYPE Debug) 8 | 9 | find_package(CUDA REQUIRED) 10 | find_package(OpenCV REQUIRED) 11 | 12 | include_directories(${OpenCV_INCLUDE_DIRS}) 13 | include_directories(${PROJECT_SOURCE_DIR}/src) 14 | 15 | # cuda 16 | include_directories(/usr/local/cuda-10.2/include) 17 | link_directories(/usr/local/cuda-10.2/lib64) 18 | # cudnn 19 | include_directories(/home/zzx/Github/zzx_yolo/EXTRA_PKG/cudnn8.7.0.84_cuda10/include) 20 | link_directories(/home/zzx/Github/zzx_yolo/EXTRA_PKG/cudnn8.7.0.84_cuda10/lib) 21 | # tensorrt 22 | include_directories(/home/zzx/Github/zzx_yolo/EXTRA_PKG/TensorRT-8.5.3.1/include) 23 | link_directories(/home/zzx/Github/zzx_yolo/EXTRA_PKG/TensorRT-8.5.3.1/lib) 24 | 25 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -O0 -Wfatal-errors -pthread -w -g") 26 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -std=c++11 -O0 -Xcompiler -fPIC -g -w ${CUDA_GEN_CODE}") 27 | 28 | file(GLOB_RECURSE cuda_srcs ${PROJECT_SOURCE_DIR}/src/kernels/*.cu) 29 | 30 | cuda_add_library(cuda_kernels SHARED ${cuda_srcs}) 31 | target_link_libraries(cuda_kernels cuda cudart) 32 | target_link_libraries(cuda_kernels ${OpenCV_LIBS}) 33 | 34 | add_executable(yolo 35 | ${PROJECT_SOURCE_DIR}/src/main.cpp 36 | ${PROJECT_SOURCE_DIR}/src/base/infer_base.cpp 37 | ${PROJECT_SOURCE_DIR}/src/base/memory_tensor.cpp 38 | ${PROJECT_SOURCE_DIR}/src/base/trt_base.cpp 39 | ${PROJECT_SOURCE_DIR}/src/apps/yolo/yolo.cpp 40 | ${PROJECT_SOURCE_DIR}/src/apps/rtdetr/rtdetr.cpp 41 | ) 42 | target_link_libraries(yolo cuda_kernels) 43 | target_link_libraries(yolo nvinfer nvinfer_plugin nvonnxparser) 44 | target_link_libraries(yolo cuda cublas cudart cudnn) 45 | target_link_libraries(yolo pthread) 46 | target_link_libraries(yolo ${OpenCV_LIBS}) 47 | 48 | 49 | add_executable(eval 50 | ${PROJECT_SOURCE_DIR}/src/eval/eval.cpp 51 | ${PROJECT_SOURCE_DIR}/src/eval/save.hpp 52 | ${PROJECT_SOURCE_DIR}/src/base/infer_base.cpp 53 | ${PROJECT_SOURCE_DIR}/src/base/memory_tensor.cpp 54 | ${PROJECT_SOURCE_DIR}/src/base/trt_base.cpp 55 | ${PROJECT_SOURCE_DIR}/src/apps/yolo/yolo.cpp 56 | ) 57 | target_link_libraries(eval cuda_kernels) 58 | target_link_libraries(eval nvinfer nvinfer_plugin nvonnxparser) 59 | target_link_libraries(eval cuda cublas cudart cudnn) 60 | target_link_libraries(eval pthread) 61 | target_link_libraries(eval ${OpenCV_LIBS}) 62 | -------------------------------------------------------------------------------- 
/3_faster_ncnn/src/apps/yolo/yolo.cpp: -------------------------------------------------------------------------------- 1 | #include "yolo.h" 2 | 3 | 4 | namespace YoloNCNN{ 5 | 6 | 7 | bool InferImpl::startup(const std::string ¶m_path, const std::string &model_path, float confidence, float iou_thr){ 8 | param_path_ = param_path; 9 | model_path_ = model_path; 10 | confidence_ = confidence; 11 | iou_thr_ = iou_thr; 12 | 13 | // 等待线程创建和里面的初始化完成 14 | return Det::startup(); 15 | } 16 | 17 | void InferImpl::worker(std::promise &pro){ 18 | 19 | input_w_ = 640; 20 | input_h_ = 640; 21 | input_name_ = "images"; 22 | output_name_ = "output"; 23 | 24 | net_.load_param(param_path_.c_str()); 25 | net_.load_model(model_path_.c_str()); 26 | postprocess_ = std::make_shared(postProcess::postProcessType::yolox, input_h_, input_w_, confidence_, iou_thr_); 27 | 28 | INFO("ncnn模型加载成功! "); 29 | 30 | pro.set_value(true); // satrtup 函数结束 31 | 32 | // std::vector fetch_jobs; 33 | Job fetch_job; 34 | while(get_job_and_wait(fetch_job)){ 35 | 36 | input_ = fetch_job.input; 37 | forward(); 38 | postprocess_->forward(output_); 39 | fetch_job.pro->set_value(postprocess_->nms_boxes); 40 | } 41 | 42 | INFO("推理结束!"); 43 | } 44 | 45 | 46 | std::shared_future> InferImpl::commit(const cv::Mat &input){ 47 | return Det::commit(input); 48 | } 49 | 50 | 51 | bool InferImpl::preprocess(Job &job, const cv::Mat &input) { 52 | int img_w = input.cols; 53 | int img_h = input.rows; 54 | 55 | int w = img_w; 56 | int h = img_h; 57 | float scale = 1.f; 58 | if (w > h){ 59 | scale = (float)input_w_ / w; 60 | w = input_w_; 61 | h = h * scale; 62 | } else{ 63 | scale = (float)input_h_ / h; 64 | h = input_h_; 65 | w = w * scale; 66 | } 67 | ncnn::Mat in = ncnn::Mat::from_pixels_resize(input.data, ncnn::Mat::PIXEL_BGR, img_w, img_h, w, h); 68 | 69 | // pad to YOLOX_TARGET_SIZE rectangle 70 | int wpad = input_w_ - w; 71 | int hpad = input_h_ - h; 72 | 73 | ncnn::copy_make_border(in, job.input, 0, hpad, 0, wpad, ncnn::BORDER_CONSTANT, 114.f); 74 | // input_.substract_mean_normalize(mean_vals_, norm_vals_); 75 | return true; 76 | 77 | } 78 | 79 | 80 | std::shared_ptr create_infer(const std::string ¶m_path, const std::string &model_path, float confidence, float iou_thr){ 81 | std::shared_ptr instance = std::make_shared(); 82 | if(!instance->startup(param_path, model_path, confidence, iou_thr)){ 83 | instance.reset(); 84 | } 85 | return instance; // 创建子类对象 返回父类指针,这样实现封着。外部只能调用commit 86 | } 87 | 88 | 89 | } //end namespace -------------------------------------------------------------------------------- /3_faster_ncnn/src/base/tools.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | namespace FasterNCNN { 12 | 13 | /* 14 | logger 15 | */ 16 | enum class LogLevel : int{ 17 | Debug = 5, 18 | Verbose = 4, 19 | Info = 3, 20 | Warning = 2, 21 | Error = 1, 22 | Fatal = 0 23 | }; 24 | 25 | 26 | static const char* level_string(LogLevel level); 27 | static void __log_func(const char* file, int line, LogLevel level, const char* fmt, ...); 28 | static std::string file_name(const std::string& path, bool include_suffix); 29 | 30 | 31 | /* 修改这个level来实现修改日志输出级别 */ 32 | #define CURRENT_LOG_LEVEL LogLevel::Info 33 | #define INFOD(...) __log_func(__FILE__, __LINE__, LogLevel::Debug, __VA_ARGS__) 34 | #define INFOV(...) __log_func(__FILE__, __LINE__, LogLevel::Verbose, __VA_ARGS__) 35 | #define INFO(...) 
__log_func(__FILE__, __LINE__, LogLevel::Info, __VA_ARGS__) 36 | #define INFOW(...) __log_func(__FILE__, __LINE__, LogLevel::Warning, __VA_ARGS__) 37 | #define INFOE(...) __log_func(__FILE__, __LINE__, LogLevel::Error, __VA_ARGS__) 38 | #define INFOF(...) __log_func(__FILE__, __LINE__, LogLevel::Fatal, __VA_ARGS__) 39 | 40 | static const char* level_string(LogLevel level){ 41 | switch (level){ 42 | case LogLevel::Debug: return "debug"; 43 | case LogLevel::Verbose: return "verbo"; 44 | case LogLevel::Info: return "info"; 45 | case LogLevel::Warning: return "warn"; 46 | case LogLevel::Error: return "error"; 47 | case LogLevel::Fatal: return "fatal"; 48 | default: return "unknow"; 49 | } 50 | } 51 | 52 | static void __log_func(const char* file, int line, LogLevel level, const char* fmt, ...){ 53 | 54 | if(level > CURRENT_LOG_LEVEL) 55 | return; 56 | 57 | va_list vl; 58 | va_start(vl, fmt); 59 | 60 | char buffer[2048]; 61 | std::string filename = file_name(file, true); 62 | int n = snprintf(buffer, sizeof(buffer), "[%s][%s:%d]:", level_string(level), filename.c_str(), line); 63 | vsnprintf(buffer + n, sizeof(buffer) - n, fmt, vl); 64 | 65 | fprintf(stdout, "%s\n", buffer); 66 | if (level == LogLevel::Fatal) { 67 | fflush(stdout); 68 | abort(); 69 | } 70 | } 71 | 72 | static std::string file_name(const std::string& path, bool include_suffix){ 73 | 74 | if (path.empty()) return ""; 75 | int p = path.rfind('/'); 76 | p += 1; 77 | 78 | //include suffix 79 | if (include_suffix) 80 | return path.substr(p); 81 | 82 | int u = path.rfind('.'); 83 | if (u == -1) 84 | return path.substr(p); 85 | 86 | if (u <= p) u = path.size(); 87 | return path.substr(p, u - p); 88 | } 89 | 90 | 91 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Faster Deployment 2 | 3 | > 作者本人能力和想法都十分有限,确实可能很多情况没有想到,欢迎大家讨论! 4 | 5 | 本仓库主要是针对深度学习模型的TensorRT、ncnn、rknn等的后端推理框架部署工作,有较好的接口便捷性和推理性能。当前主要主要应用在单目机器人,所以benchmark一般设置`batch=1`,采用单幅图片连续输入或者单视频流输入的方式进行测试,模拟实际情况。 6 | 7 | 目前建议优先使用faster_tensorrt,因为这是本仓库主要的提升方向,支持的算法最多,后续更新也会更快。另外两个也以也会慢慢更新。 8 | 9 | - 1_trt_base: 主要是tensorrt的基础操作,包括模型转换、推理、构建插件以及一些运行和优化的demo 10 | - 2_faster_tensorrt: 主要是tensorrt的封装和优化 11 | - 3_faster_ncnn: 参考`2_faster_tensort`封装ncnn推理过程 12 | - 4_faser_rknn: 封装rknn的推理过程 13 | 14 | ## 0 致谢 15 | 16 | 首先需要感谢手写ai团队开源的[TensorRT_Pro](https://github.com/shouxieai/tensorRT_Pro),让我受益良多,本仓库中tensorrt的代码也均是在其基础上进行优化,以及按照该仓库的整体思路优化ncnn和rknn的推理。 17 | 18 | 25 | 26 | ## 1 当前支持 27 | 28 | ### 1.1 faster_tensorrt 29 | 30 | #### 目标检测 31 | 32 | - [x] yolox 33 | - [x] yolov8 34 | - [x] rtdetr 35 | 36 | #### 单目深度估计 37 | 38 | - [ ] [lite-mono](https://github.com/noahzn/Lite-Mono) 39 | 40 | 41 | 42 | 43 | 44 | ### 1.2 faster_ncnn 45 | 46 | #### 目标检测 47 | - [x] yolox 48 | - [ ] yolov8 49 | 50 | 51 | ### 1.3 faster_rknn 52 | 53 | #### 目标检测 54 | - [x] yolox 55 | - [ ] yolov8 56 | 57 | 58 | ## 2 问题&分析 59 | 60 | 我们首先应该思考在实际机器人的视频流推理上需要的是什么? 61 | 62 | 明确前提:此时的模型已经充分优化过了,包括剪枝、量化之类的操作以及后端推理器已经对模型算子进行了自动或手动的融合、量化、图优化等优化。*简而言之,可以认为单幅图像的纯inference时间是不可能再缩短了。* 63 | 64 | **🔥高吞吐和低延迟!!!** 65 | 66 | > 本文中两个词的意义 :
67 | > **延迟**:图片从诞生到推理完成需要的时间。
68 | > **吞吐**:相等时间内处理图片的数量。
69 | > 发现一个更好的解释,来自trt的文档[性能评估](https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#measure-performance) 70 | 71 | 72 | 推理的的目标肯定是吞吐量特别大,同时延迟超级小,在一定条件下,其实这两个是互斥的。这也是表明程序性能(软件性能+硬件性能)两种方式,吞吐量代表并行能力,延迟代表串行效果。 73 | > 吞吐和延迟互斥是因为,在服务器侧部署时,为了提高吞吐量,往往会先缓存一些数据,组成batch进行推理(也是tensorrt_pro的策略)。实现在相同时间内,更多图像的推理,同时毫无疑问这在并行能力高的GPU设备上是十分有效的,可以极大提升吞吐量(显存和算力足够情况下可以轻松提升10倍以上)。但是,这对于每一帧图像来说,它从输入到输出的延迟就会提高了!这对实际机器人🤖来说是显然无法接受的,因为控制的核心还是反馈,当信号量频率低或者不稳定时,对于机器人的控制和决策来说难度很大。 74 | 75 | 需要注意,单幅图像的延迟,在这个层面是无法缩短的哦!因为一幅图像到推理器后端模型推理步骤的时候,必须经过`预处理->推理->后处理`三个步骤,这三个步骤的时间耗时在本阶段是无法缩小的(这是在之前优化网络结构和模型转换的时候考虑的),本项目是希望在不增加单图延迟的基础上,尽量高的提升模型吞吐量,也就是尽量重合、去掉一些无用、重复、耗时的操作,对于机器人来说就是可以拥有更高频率的目标位置信息等。 76 | 77 | 78 | 另外采用多线程或者线程池推理,对机器人处理图像带来的问题是:在输入图像帧率很高的时候,无法保证图像按照输入顺序输出,这对于机器人这种需要根据目标前后运动状态进行决策的智能体来说是有问题的,可能会导致误判。不过我认为如果可以实现这是一个跨越量级的提高吞吐量的方法,当然前提是不增加延迟并且输出有序,我也会继续尝试。 79 | 80 | ## 3 实现 81 | 82 | 具体的实现过程在这里[2_faster_tensorrt](./2_faster_tensorrt/readme.md),有完整的代码解释、模型推理接口、增加模型方法等的说明。 83 | 84 | 85 | ## 4 总结 86 | 87 | 那本仓库要做的是什么? 88 | 89 | 1. 首先暂时抛弃了多batch,因为当前使用的机器人不需要多输入。 90 | 2. 设置任务队列(超过则阻塞,尽量不增加延迟),可以根据模型预处理和推理耗时手动调整,保证最优的吞吐和延迟。 91 | 3. 尽量在可加速的硬件上执行预处理和后处理。 92 | 93 | 94 | 95 | ## other 96 | 97 | >类似百度的fastdeploy、mmdeploy等部署仓库都是有好有坏。首先它们对自家框架都支持的比较好比如paddledetection, mmlab系列的模型仓库等,但是缺点就是现在后端框架的api的更新可能比较快,有时可能无法用到最新的版本和接口,而且开源仓库的维护成本确实比较大,而且人员有限,所以他们可能更新的会稍微慢一点(我在工作确定之后也会参与到相应的开源项目中去,添砖加瓦)。不过另一方面不愧是大厂,这些代码仓库整体设计思路和实验确实都是非常好,可扩展性都贼拉好(相对本仓库),后期我也会慢慢学习学习,来优化本仓库。 98 | 99 | [FastDeploy](https://github.com/PaddlePaddle/FastDeploy) 100 | 101 | [mmdeploy](https://github.com/open-mmlab/mmdeploy) 102 | 103 | 104 | 105 | -------------------------------------------------------------------------------- /1_trt_base/trt_plugin/demo01/demo01.h: -------------------------------------------------------------------------------- 1 | #include "cookbookHelper.cuh" 2 | 3 | namespace 4 | { 5 | static const char *PLUGIN_NAME {"ZZX_ADDScalar"}; 6 | static const char *PLUGIN_VERSION {"1"}; 7 | } 8 | 9 | namespace nvinfer1 10 | { 11 | class ZZX_ADDScalar : public IPluginV2DynamicExt 12 | { 13 | private: 14 | const std::string name_; 15 | std::string namespace_; 16 | struct 17 | { 18 | float scalar; 19 | } m_; 20 | 21 | public: 22 | ZZX_ADDScalar() = delete; // 删除默认构造函数 23 | ZZX_ADDScalar(const std::string &name, float scalar); 24 | ZZX_ADDScalar(const std::string &name, const void *buffer, size_t length); 25 | ~ZZX_ADDScalar(); 26 | 27 | // 继承自IPluginV2的方法 28 | const char *getPluginType() const noexcept override; 29 | const char *getPluginVersion() const noexcept override; 30 | int32_t getNbOutputs() const noexcept override; 31 | int32_t initialize() noexcept override; 32 | void terminate() noexcept override; 33 | size_t getSerializationSize() const noexcept override; 34 | void serialize(void *buffer) const noexcept override; 35 | void destroy() noexcept override; 36 | void setPluginNamespace(const char *pluginNamespace) noexcept override; 37 | const char *getPluginNamespace() const noexcept override; 38 | 39 | // 继承自IPluginV2Ext的方法 40 | DataType getOutputDataType(int32_t index, DataType const *inputTypes, int32_t nbInputs) const noexcept override; 41 | void attachToContext(cudnnContext *contextCudnn, cublasContext *contextCublas, IGpuAllocator *gpuAllocator) noexcept override; 42 | void detachFromContext() noexcept override; 43 | 44 | // 继承自IPluginV2DynamicExt的方法 45 | IPluginV2DynamicExt *clone() const noexcept override; 46 | DimsExprs getOutputDimensions(int32_t outputIndex, const DimsExprs *inputs, int32_t nbInputs, IExprBuilder 
&exprBuilder) noexcept override; 47 | bool supportsFormatCombination(int32_t pos, const PluginTensorDesc *inOut, int32_t nbInputs, int32_t nbOutputs) noexcept override; 48 | void configurePlugin(const DynamicPluginTensorDesc *in, int32_t nbInputs, const DynamicPluginTensorDesc *out, int32_t nbOutputs) noexcept override; 49 | size_t getWorkspaceSize(const PluginTensorDesc *inputs, int32_t nbInputs, const PluginTensorDesc *outputs, int32_t nbOutputs) const noexcept override; 50 | int32_t enqueue(const PluginTensorDesc *inputDesc, const PluginTensorDesc *outputDesc, const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream) noexcept override; 51 | 52 | protected: 53 | // 防止一些编译警告 54 | using nvinfer1::IPluginV2::enqueue; 55 | using nvinfer1::IPluginV2::getOutputDimensions; 56 | using nvinfer1::IPluginV2::getWorkspaceSize; 57 | using nvinfer1::IPluginV2Ext::configurePlugin; 58 | }; 59 | 60 | 61 | class ZZXAddScalarPluginCreator : public IPluginCreator 62 | { 63 | private: 64 | static PluginFieldCollection fc_; 65 | static std::vector attr_; 66 | std::string namespace_; 67 | public: 68 | ZZXAddScalarPluginCreator(); 69 | ~ZZXAddScalarPluginCreator(); 70 | const char * getPluginName() const noexcept override; 71 | const char * getPluginVersion() const noexcept override; 72 | const PluginFieldCollection *getFieldNames() noexcept override; 73 | IPluginV2DynamicExt * createPlugin(const char *name, const PluginFieldCollection *fc) noexcept override; 74 | IPluginV2DynamicExt * deserializePlugin(const char *name, const void *serialData, size_t serialLength) noexcept override; 75 | void setPluginNamespace(const char *pluginNamespace) noexcept override; 76 | const char * getPluginNamespace() const noexcept override; 77 | 78 | }; 79 | 80 | } 81 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | # google风格改 3 | 4 | Language: Cpp 5 | # BasedOnStyle: Google 6 | AccessModifierOffset: -4 7 | AlignAfterOpenBracket: Align 8 | AlignConsecutiveAssignments: false 9 | AlignConsecutiveDeclarations: false 10 | AlignEscapedNewlines: Left 11 | AlignOperands: true 12 | AlignTrailingComments: true 13 | AllowAllParametersOfDeclarationOnNextLine: true 14 | AllowShortBlocksOnASingleLine: false 15 | AllowShortCaseLabelsOnASingleLine: false 16 | AllowShortFunctionsOnASingleLine: Inline 17 | AllowShortIfStatementsOnASingleLine: true 18 | AllowShortLoopsOnASingleLine: true 19 | AlwaysBreakAfterDefinitionReturnType: None 20 | AlwaysBreakAfterReturnType: None 21 | AlwaysBreakBeforeMultilineStrings: true 22 | AlwaysBreakTemplateDeclarations: true 23 | BinPackArguments: true 24 | BinPackParameters: true 25 | BraceWrapping: 26 | AfterClass: false 27 | AfterControlStatement: false 28 | AfterEnum: false 29 | AfterFunction: false 30 | AfterNamespace: false 31 | AfterObjCDeclaration: false 32 | AfterStruct: false 33 | AfterUnion: false 34 | AfterExternBlock: false 35 | BeforeCatch: false 36 | BeforeElse: false 37 | IndentBraces: false 38 | SplitEmptyFunction: true 39 | SplitEmptyRecord: true 40 | SplitEmptyNamespace: true 41 | BreakBeforeBinaryOperators: None 42 | BreakBeforeBraces: Attach 43 | BreakBeforeInheritanceComma: false 44 | BreakBeforeTernaryOperators: true 45 | BreakConstructorInitializersBeforeComma: false 46 | BreakConstructorInitializers: BeforeColon 47 | BreakAfterJavaFieldAnnotations: false 48 | BreakStringLiterals: true 49 | ColumnLimit: 100 50 | 
CommentPragmas: '^ IWYU pragma:' 51 | CompactNamespaces: false 52 | ConstructorInitializerAllOnOneLineOrOnePerLine: true 53 | ConstructorInitializerIndentWidth: 4 54 | ContinuationIndentWidth: 4 55 | Cpp11BracedListStyle: true 56 | DerivePointerAlignment: true 57 | DisableFormat: false 58 | ExperimentalAutoDetectBinPacking: false 59 | FixNamespaceComments: true 60 | ForEachMacros: 61 | - foreach 62 | - Q_FOREACH 63 | - BOOST_FOREACH 64 | IncludeBlocks: Preserve 65 | IncludeCategories: 66 | - Regex: '^' 67 | Priority: 2 68 | - Regex: '^<.*\.h>' 69 | Priority: 1 70 | - Regex: '^<.*' 71 | Priority: 2 72 | - Regex: '.*' 73 | Priority: 3 74 | IncludeIsMainRegex: '([-_](test|unittest))?$' 75 | IndentCaseLabels: true 76 | IndentPPDirectives: None 77 | IndentWidth: 4 78 | IndentWrappedFunctionNames: false 79 | JavaScriptQuotes: Leave 80 | JavaScriptWrapImports: true 81 | KeepEmptyLinesAtTheStartOfBlocks: false 82 | MacroBlockBegin: '' 83 | MacroBlockEnd: '' 84 | MaxEmptyLinesToKeep: 1 85 | NamespaceIndentation: None 86 | ObjCBlockIndentWidth: 2 87 | ObjCSpaceAfterProperty: false 88 | ObjCSpaceBeforeProtocolList: false 89 | PenaltyBreakAssignment: 2 90 | PenaltyBreakBeforeFirstCallParameter: 1 91 | PenaltyBreakComment: 300 92 | PenaltyBreakFirstLessLess: 120 93 | PenaltyBreakString: 1000 94 | PenaltyExcessCharacter: 1000000 95 | PenaltyReturnTypeOnItsOwnLine: 200 96 | PointerAlignment: Right 97 | RawStringFormats: 98 | - Delimiter: pb 99 | Language: TextProto 100 | BasedOnStyle: google 101 | ReflowComments: true 102 | SortIncludes: false 103 | SortUsingDeclarations: false 104 | SpaceAfterCStyleCast: false 105 | SpaceAfterTemplateKeyword: true 106 | SpaceBeforeAssignmentOperators: true 107 | SpaceBeforeParens: Never # ControlStatements 108 | SpaceInEmptyParentheses: false 109 | SpacesBeforeTrailingComments: 2 110 | SpacesInAngles: false 111 | SpacesInContainerLiterals: true 112 | SpacesInCStyleCastParentheses: false 113 | SpacesInParentheses: false 114 | SpacesInSquareBrackets: false 115 | Standard: Cpp11 116 | TabWidth: 4 117 | UseTab: Never 118 | ... 
119 | 120 | -------------------------------------------------------------------------------- /3_faster_ncnn/src/base/infer_base.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include 12 | #include "net.h" 13 | 14 | #include "tools.hpp" 15 | 16 | namespace FasterNCNN{ 17 | 18 | 19 | template 20 | class DetBase { 21 | 22 | public: 23 | struct Job{ 24 | ncnn::Mat input; 25 | Output output; 26 | std::shared_ptr> pro; 27 | }; 28 | 29 | 30 | virtual ~DetBase() {stop();}; 31 | void stop() { 32 | run_ = false; 33 | cond_.notify_all(); 34 | 35 | /// cleanup jobs 36 | { 37 | std::unique_lock l(jobs_lock_); 38 | while(!jobs_.empty()){ 39 | auto& item = jobs_.front(); 40 | if(item.pro) 41 | item.pro->set_value(Output()); 42 | jobs_.pop(); 43 | } 44 | }; 45 | 46 | if(worker_){ 47 | worker_->join(); 48 | worker_.reset(); 49 | } 50 | } 51 | 52 | // 启动 初始化线程 用一个promise等待worker中的初始化结束 53 | bool startup() { 54 | run_ = true; 55 | 56 | std::promise pro; 57 | worker_ = std::make_shared(&DetBase::worker, this, std::ref(pro)); 58 | return pro.get_future().get(); 59 | } 60 | 61 | // 工作线程(纯虚) 62 | virtual void worker(std::promise& result) = 0; 63 | // 预处理(纯虚) 64 | virtual bool preprocess(Job& job, const Input& input) = 0; 65 | 66 | 67 | 68 | virtual void forward() { 69 | auto ex = net_.create_extractor(); 70 | // INFO("inputname: %s", input_name); 71 | // INFO("outputname: %s", output_name); 72 | // if(ncnn_use_vulkan_compute_) ex.set_vulkan_compute(true); 73 | ex.set_num_threads(ncnn_num_threads_); 74 | ex.input(input_name_.c_str(), input_); 75 | ex.extract(output_name_.c_str(), output_); 76 | } 77 | virtual std::shared_future commit(const Input& input) { 78 | Job job; 79 | job.pro = std::make_shared>(); 80 | if(!preprocess(job, input)){ 81 | job.pro->set_value(Output()); 82 | return job.pro->get_future(); 83 | } 84 | 85 | ////////////////////上锁并且推进队列//////////////////////////// 86 | { 87 | std::unique_lock l(jobs_lock_); 88 | // jobs_.push(job); 89 | jobs_.emplace(job); 90 | }; 91 | cond_.notify_one(); 92 | return job.pro->get_future(); 93 | } 94 | 95 | // 获取任务 等待之前的任务执行完毕 96 | virtual bool get_job_and_wait(Job& fetch_job) { 97 | std::unique_lock l(jobs_lock_); 98 | cond_.wait(l, [&](){ 99 | return !run_ || !jobs_.empty(); 100 | }); 101 | 102 | if(!run_) return false; 103 | 104 | fetch_job = std::move(jobs_.front()); 105 | jobs_.pop(); 106 | return true; 107 | } 108 | 109 | 110 | protected: 111 | // ncnn 112 | ncnn::Net net_; 113 | int input_w_; 114 | int input_h_; 115 | std::string input_name_; 116 | std::string output_name_; 117 | ncnn::Mat input_; 118 | ncnn::Mat output_; 119 | int ncnn_num_threads_ = 8; 120 | bool ncnn_use_vulkan_compute_; 121 | 122 | // multi threads 123 | std::atomic run_; 124 | std::mutex jobs_lock_; 125 | std::queue jobs_; 126 | std::shared_ptr worker_; 127 | std::condition_variable cond_; 128 | 129 | }; 130 | 131 | // 接口类 需要重写里面的纯虚函数 132 | template 133 | class InferBase{ 134 | public: 135 | virtual std::shared_future commit(const Input &input) = 0; 136 | }; 137 | 138 | 139 | } 140 | 141 | -------------------------------------------------------------------------------- /2_faster_tensorrt/src/apps/yolo/yolo.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file yolo.h 3 | * @author 0zzx0 4 | * @brief 重写改进,集成高性能yolo推理接口 5 | * @version 1.0 6 | * @date 2023-6-11 7 | * 8 | * 
@copyright Copyright (c) 2023 9 | * 10 | */ 11 | 12 | #ifndef YOLO_HPP 13 | #define YOLO_HPP 14 | 15 | #include "../common.hpp" 16 | 17 | namespace YOLO { 18 | using namespace FasterTRT; 19 | 20 | // 模型选择 21 | enum class YoloType : int { V5 = 0, X = 1, V8 = 2 }; 22 | 23 | // 模型名字 24 | const char *type_name(YoloType type); 25 | 26 | /** 27 | * @brief Decode配置的实现 28 | * 不同模型输出的decode一般都不一样,即使是yolo系列也有一些区别, 29 | * 这里需要实现不同模型的decode。尤其是anchor base 和anchor free的区别 30 | */ 31 | struct DecodeMeta { 32 | int num_anchor; 33 | int num_level; 34 | float w[16], h[16]; 35 | int strides[16]; 36 | 37 | // static DecodeMeta v5_p5_default_meta(); 38 | static DecodeMeta x_default_meta(); 39 | static DecodeMeta v8_default_meta(); 40 | }; 41 | 42 | // 线程安全模板类设置模板类型 43 | using ThreadSafedAsyncInferImpl = 44 | ThreadSafedAsyncInfer, // start param 47 | AffineMatrix // additional 48 | >; 49 | using Infer = InferBase; 50 | 51 | /** 52 | * @brief 推理类的实现,继承必备父类,重写父类方法 53 | * 54 | */ 55 | class YoloTRTInferImpl : public Infer, public ThreadSafedAsyncInferImpl { 56 | public: 57 | // 析构 调用来自基类ThreadSafedAsyncInferImpl的stop函数 58 | ~YoloTRTInferImpl(); 59 | 60 | virtual bool startup(const std::string &file, YoloType type, int gpuid, int batch_size, 61 | float confidence_threshold, float nms_threshold, 62 | bool is_use_trtnNMSPlugin = false); 63 | virtual void worker(std::promise &result) override; 64 | virtual bool preprocess(Job &job, const cv::Mat &image) override; 65 | 66 | virtual std::vector> commits( 67 | const std::vector &images) override; 68 | virtual std::shared_future commit(const cv::Mat &image) override; 69 | 70 | void init_yolox_prior_box(Tensor &prior_box); 71 | void init_yolov8_prior_box(Tensor &prior_box); 72 | void init_yolov5_prior_box(Tensor &prior_box); 73 | 74 | private: 75 | int input_width_ = 0; 76 | int input_height_ = 0; 77 | int gpu_ = 0; 78 | float confidence_threshold_ = 0; 79 | float nms_threshold_ = 0; 80 | cudaStream_t stream_ = nullptr; 81 | cudaStream_t stream_pro_ = nullptr; 82 | Norm normalize_; 83 | YoloType type_; 84 | DecodeMeta meta_; 85 | int batch_size_ = 1; 86 | bool is_use_trtnNMSPlugin_ = false; 87 | }; 88 | 89 | /* 90 | trt模型编译(不过我实际建议直接用trtexec转换,嘻嘻0_0) 91 | max max_batch_size:为最大可以允许的batch数量 92 | source_onnx_file:onnx文件 93 | save_engine_file:储存的tensorRT模型 94 | max_workspace_size:最大工作空间大小,一般给1GB,在嵌入式可以改为256MB,单位是byte 95 | int8 images 96 | folder:对于Mode为INT8时,需要提供图像数据进行标定,请提供文件夹,会自动检索下面的jpg/jpeg/tiff/png/bmp 97 | int8_entropy_calibrator_cache_file:对于int8模式下,熵文件可以缓存,避免二次加载数据,可以跨平台使用,是一个txt文件 98 | */ 99 | bool compile(Mode mode, YoloType type, unsigned int max_batch_size, 100 | const std::string &source_onnx_file, const std::string &save_engine_file, 101 | size_t max_workspace_size = 1 << 30, const std::string &int8_images_folder = "", 102 | const std::string &int8_entropy_calibrator_cache_file = ""); 103 | 104 | // image转成tensor 105 | void image_to_tensor(const cv::Mat &image, std::shared_ptr &tensor, YoloType type, 106 | int ibatch); 107 | 108 | // 创建推理器 109 | std::shared_ptr create_infer(const std::string &engine_file, YoloType type, int gpuid, 110 | int batch_size, float confidence_threshold = 0.2f, 111 | float nms_threshold = 0.5f); 112 | 113 | }; // end namespace YOLO 114 | 115 | #endif -------------------------------------------------------------------------------- /2_faster_tensorrt/src/base/monopoly_accocator.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file monopoly_accocator.hpp 3 | * @author 0zzx0 4 | * @brief 
独占分配器 5 | * @version 0.1 6 | * @date 2023-6-11 2023-8-21 7 | * 8 | * @copyright Copyright (c) 2023 9 | * 10 | */ 11 | 12 | #ifndef MONOPOLY_ALLOCATOR_HPP 13 | #define MONOPOLY_ALLOCATOR_HPP 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | namespace FasterTRT { 21 | ///////////////////////////class MonopolyAllocator/////////////////////////// 22 | /* 独占分配器 23 | 通过对tensor做独占管理,具有max_batch * 2个tensor,通过query获取一个tensor 24 | 当推理结束后,该tensor释放使用权,即可交给下一个图像使用,内存实现复用 25 | 26 | * 1. tensor复用 27 | * 2. tensor的预处理和推理并行 28 | * 29 | * 输入图像时,具有2倍batch的空间进行预处理用于缓存 30 | * 引擎推理时,每次拿1个batch的数据进行推理 31 | * 当引擎推理速度慢而预处理速度快时,输入图像需要进行等候。 32 | **/ 33 | template 34 | class MonopolyAllocator { 35 | public: 36 | /* MonopolyData是数据容器类 37 | 允许query获取的item执行item->release释放自身所有权,该对象可以被复用 38 | 通过item->data()获取储存的对象的指针 39 | */ 40 | class MonopolyData { 41 | public: 42 | std::shared_ptr<_ItemType>& data() { return data_; } 43 | void release() { manager_->release_one(this); } 44 | 45 | private: 46 | MonopolyData(MonopolyAllocator* pmanager) { manager_ = pmanager; } 47 | 48 | private: 49 | friend class MonopolyAllocator; 50 | MonopolyAllocator* manager_ = nullptr; 51 | std::shared_ptr<_ItemType> data_; 52 | bool available_ = true; 53 | }; 54 | typedef std::shared_ptr MonopolyDataPointer; 55 | 56 | // 构造函数 初始化尺寸 57 | MonopolyAllocator(int size) { 58 | capacity_ = size; 59 | num_available_ = size; 60 | datas_.resize(size); 61 | 62 | for(int i = 0; i < size; ++i) 63 | datas_[i] = std::shared_ptr(new MonopolyData(this)); 64 | } 65 | 66 | // 析构 67 | virtual ~MonopolyAllocator() { 68 | run_ = false; 69 | cv_.notify_all(); 70 | 71 | std::unique_lock l(lock_); 72 | cv_exit_.wait(l, [&]() { return num_wait_thread_ == 0; }); 73 | } 74 | /* 获取一个可用的对象 75 | timeout:超时时间,如果没有可用的对象,将会进入阻塞等待,如果等待超时则返回空指针 76 | 请求得到一个对象后,该对象被占用,除非他执行了release释放该对象所有权 77 | */ 78 | MonopolyDataPointer query(int timeout = 10000) { 79 | std::unique_lock l(lock_); 80 | if(!run_) return nullptr; 81 | 82 | if(num_available_ == 0) { 83 | num_wait_thread_++; 84 | 85 | auto state = cv_.wait_for(l, std::chrono::milliseconds(timeout), 86 | [&]() { return num_available_ > 0 || !run_; }); 87 | 88 | num_wait_thread_--; 89 | cv_exit_.notify_one(); 90 | 91 | // timeout, no available, exit program 92 | if(!state || num_available_ == 0 || !run_) return nullptr; 93 | } 94 | 95 | auto item = std::find_if(datas_.begin(), datas_.end(), 96 | [](MonopolyDataPointer& item) { return item->available_; }); 97 | if(item == datas_.end()) return nullptr; 98 | 99 | (*item)->available_ = false; 100 | num_available_--; 101 | return *item; 102 | } 103 | 104 | // 有效数量 105 | int num_available() { return num_available_; } 106 | 107 | // 空间大小 108 | int capacity() { return capacity_; } 109 | 110 | private: 111 | // 释放一个对象的所有权 112 | void release_one(MonopolyData* prq) { 113 | std::unique_lock l(lock_); 114 | if(!prq->available_) { 115 | prq->available_ = true; 116 | num_available_++; 117 | cv_.notify_one(); 118 | } 119 | } 120 | 121 | private: 122 | std::mutex lock_; 123 | std::condition_variable cv_; 124 | std::condition_variable cv_exit_; 125 | std::vector datas_; 126 | int capacity_ = 0; 127 | volatile int num_available_ = 0; 128 | volatile int num_wait_thread_ = 0; 129 | volatile bool run_ = true; 130 | }; 131 | 132 | }; // namespace FasterTRT 133 | 134 | #endif -------------------------------------------------------------------------------- /3_faster_ncnn/src/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | 
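// Benchmark entry points for the NCNN YOLOX pipeline:
//   t1(): synchronous path - every frame calls infer->commit(img).get() and waits for the result.
//   t2(): keeps one future in a queue so the next commit() overlaps with the previous frame's inference.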
#include "apps/yolo/yolo.h" 4 | 5 | using namespace std; 6 | 7 | // 打印结果因袭 8 | static void printBox(vector &boxs) { 9 | printf("obj nums: %d \n", boxs.size()); 10 | for(int i=0;i>> out_queue; 32 | 33 | auto start = std::chrono::system_clock::now(); 34 | for(int i=0;i<100;i++){ 35 | 36 | // auto start = std::chrono::system_clock::now(); 37 | auto fut = infer->commit(img); // 任务提交 38 | // auto end = std::chrono::system_clock::now(); 39 | // cout << chrono::duration_cast(end - start).count()<< "ms" << endl; 40 | 41 | out_queue.push(fut); 42 | if(out_queue.size() <= 1){ 43 | continue; 44 | } 45 | auto res = out_queue.front().get(); 46 | out_queue.pop(); 47 | } 48 | while(!out_queue.empty()){ 49 | auto res = out_queue.front().get(); 50 | out_queue.pop(); 51 | } 52 | 53 | auto end = std::chrono::system_clock::now(); 54 | cout << chrono::duration_cast(end - start).count() / 100.0f<< "ms" << endl; 55 | 56 | // cout << res.size() << endl; 57 | // for (size_t i = 0; i < res.size(); i++){ 58 | // NCNN_DET::ObjBox box = res[i]; 59 | // cout<class_names[box.category], cv::Point(box.x1, 63 | // // box.y1), 64 | // // cv::FONT_HERSHEY_SIMPLEX, 0.75, cv::Scalar(0, 255, 0), 2); 65 | // } 66 | // cv::imwrite("result_test.jpg", img); 67 | 68 | return ; 69 | } 70 | 71 | void t1(){ 72 | string param_path = "../model.param"; 73 | string model_path = "../model.bin"; 74 | 75 | auto infer = YoloNCNN::create_infer(param_path, model_path, 0.5, 0.45); 76 | 77 | if (infer == nullptr){ 78 | printf("Infer is nullptr.\n"); 79 | return ; 80 | } 81 | 82 | string img_path = "../img/000026.jpg"; 83 | cv::Mat img = cv::imread(img_path); 84 | 85 | // warmup 86 | for(int i=0;i<10;i++){ 87 | auto res = infer->commit(img).get(); // 将任务提交给推理器(推理器执行commit) 88 | } 89 | 90 | int count = 10; 91 | auto start = std::chrono::system_clock::now(); 92 | for(int i=0;icommit(img).get(); // 将任务提交给推理器(推理器执行commit) 94 | } 95 | 96 | auto end = std::chrono::system_clock::now(); 97 | float cost_time = chrono::duration_cast(end - start).count(); 98 | cout << cost_time / (count * 1.0) << "ms" << endl; 99 | 100 | auto res = infer->commit(img).get(); 101 | printBox(res); 102 | // for (size_t i = 0; i < res.size(); i++){ 103 | // auto box = res[i]; 104 | // cout << box.score << endl; 105 | // // cv::rectangle(img, cv::Point(box.x1, box.y1), cv::Point(box.x2, box.y2), 106 | // // cv::Scalar(0, 0, 255), 2); 107 | // // cv::putText(img, pred->class_names[box.category], cv::Point(box.x1, 108 | // // box.y1), 109 | // // cv::FONT_HERSHEY_SIMPLEX, 0.75, cv::Scalar(0, 255, 0), 2); 110 | // } 111 | // cv::imwrite("result_test.jpg", img); 112 | return ; 113 | 114 | } 115 | 116 | int main(){ 117 | t1(); 118 | // t2(); 119 | 120 | } 121 | 122 | -------------------------------------------------------------------------------- /3_faster_ncnn/src/apps/common.cpp: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | 3 | namespace FasterNCNN{ 4 | 5 | float InterSectionArea(const ObjBox &a, const ObjBox &b){ 6 | if(a.x1 > b.x2 || a.x2 < b.x1 || a.y1 > b.y2 || a.y2 < b.y1){ 7 | return 0.0f; 8 | } 9 | float inter_w = std::min(a.x2, b.x2) - std::min(a.x1, b.x1); 10 | float inter_h = std::min(a.y2, b.y2) - std::min(a.y1, b.y1); 11 | 12 | return inter_w * inter_h; 13 | } 14 | 15 | bool ScoreSort(ObjBox a, ObjBox b){ 16 | return (a.score > b.score); 17 | } 18 | 19 | void nms(std::vector &src_boxes, std::vector &dst_boxes, float threshold){ 20 | std::vector picked; 21 | std::sort(src_boxes.begin(), src_boxes.end(), 
ScoreSort); 22 | 23 | for (int i = 0; i < src_boxes.size(); i++){ 24 | int keep = 1; 25 | for(int j=0; j < picked.size(); j++){ 26 | float inter_area = InterSectionArea(src_boxes[i], src_boxes[picked[j]]); 27 | float union_area = src_boxes[i].area() + src_boxes[picked[j]].area() - inter_area; 28 | float iou = inter_area / union_area; 29 | if((iou > threshold) && (src_boxes[i].category == src_boxes[picked[j]].category)){ 30 | keep = 0; 31 | break; 32 | } 33 | } 34 | if(keep){ 35 | picked.push_back(i); 36 | } 37 | } 38 | for(int i=0;i conf_thr_){ 113 | 114 | ObjBox obj; 115 | obj.x1 = x0; 116 | obj.y1 = y0; 117 | obj.x2 = x1; 118 | obj.y2 = y1; 119 | 120 | obj.category = class_idx; 121 | obj.score = box_prob; 122 | 123 | out_boxes.push_back(obj); 124 | } 125 | 126 | } // class loop 127 | feat_ptr += output_.w; 128 | 129 | } // point anchor loop 130 | } 131 | 132 | 133 | } 134 | -------------------------------------------------------------------------------- /2_faster_tensorrt/src/base/trt_base.cpp: -------------------------------------------------------------------------------- 1 | #include "trt_base.hpp" 2 | 3 | namespace FasterTRT { 4 | 5 | // 返回mode的名字 6 | const char* mode_string(Mode type) { 7 | switch(type) { 8 | case Mode::FP32: 9 | return "FP32"; 10 | case Mode::FP16: 11 | return "FP16"; 12 | case Mode::INT8: 13 | return "INT8"; 14 | default: 15 | return "UnknowCompileMode"; 16 | } 17 | } 18 | 19 | ////////////////////////////////////////////////////////////////////// 20 | ////////////////////// Int8EntropyCalibrator ///////////////////////// 21 | ////////////////////////////////////////////////////////////////////// 22 | Int8EntropyCalibrator::Int8EntropyCalibrator(const std::vector& imagefiles, 23 | nvinfer1::Dims dims, const Int8Process& preprocess) { 24 | Assert(preprocess != nullptr); 25 | this->dims_ = dims; 26 | this->allimgs_ = imagefiles; 27 | this->preprocess_ = preprocess; 28 | this->fromCalibratorData_ = false; 29 | files_.resize(dims.d[0]); 30 | checkCudaRuntime(cudaStreamCreate(&stream_)); 31 | } 32 | 33 | Int8EntropyCalibrator::Int8EntropyCalibrator(const std::vector& entropyCalibratorData, 34 | nvinfer1::Dims dims, const Int8Process& preprocess) { 35 | Assert(preprocess != nullptr); 36 | 37 | this->dims_ = dims; 38 | this->entropyCalibratorData_ = entropyCalibratorData; 39 | this->preprocess_ = preprocess; 40 | this->fromCalibratorData_ = true; 41 | files_.resize(dims.d[0]); 42 | checkCudaRuntime(cudaStreamCreate(&stream_)); 43 | } 44 | 45 | Int8EntropyCalibrator::~Int8EntropyCalibrator() { 46 | checkCudaRuntime(cudaStreamDestroy(stream_)); 47 | } 48 | 49 | int Int8EntropyCalibrator::getBatchSize() const noexcept { 50 | return dims_.d[0]; 51 | } 52 | 53 | bool Int8EntropyCalibrator::next() { 54 | int batch_size = dims_.d[0]; 55 | if(cursor_ + batch_size > allimgs_.size()) return false; 56 | 57 | int old_cursor = cursor_; 58 | for(int i = 0; i < batch_size; ++i) files_[i] = allimgs_[cursor_++]; 59 | 60 | if(!tensor_) { 61 | tensor_.reset(new Tensor(dims_.nbDims, dims_.d)); 62 | tensor_->set_stream(stream_); 63 | tensor_->set_workspace(std::make_shared()); 64 | } 65 | 66 | preprocess_(old_cursor, allimgs_.size(), files_, tensor_); 67 | return true; 68 | } 69 | 70 | bool Int8EntropyCalibrator::getBatch(void* bindings[], const char* names[], 71 | int nbBindings) noexcept { 72 | if(!next()) return false; 73 | bindings[0] = tensor_->gpu(); 74 | return true; 75 | } 76 | 77 | const std::vector& Int8EntropyCalibrator::getEntropyCalibratorData() { 78 | return 
entropyCalibratorData_; 79 | } 80 | 81 | const void* Int8EntropyCalibrator::readCalibrationCache(size_t& length) noexcept { 82 | if(fromCalibratorData_) { 83 | length = this->entropyCalibratorData_.size(); 84 | return this->entropyCalibratorData_.data(); 85 | } 86 | 87 | length = 0; 88 | return nullptr; 89 | } 90 | 91 | void Int8EntropyCalibrator::writeCalibrationCache(const void* cache, size_t length) noexcept { 92 | entropyCalibratorData_.assign((uint8_t*)cache, (uint8_t*)cache + length); 93 | } 94 | 95 | std::vector glob_image_files(const std::string& directory) { 96 | /* 检索目录下的所有图像:"*.jpg;*.png;*.bmp;*.jpeg;*.tiff" */ 97 | std::vector files, output; 98 | std::set pattern_set{"jpg", "png", "bmp", "jpeg", "tiff"}; 99 | 100 | if(directory.empty()) { 101 | INFOE("Glob images from folder failed, folder is empty"); 102 | return output; 103 | } 104 | 105 | try { 106 | std::vector files_; 107 | files_.reserve(10000); 108 | cv::glob(directory + "/*", files_, true); 109 | files.insert(files.end(), files_.begin(), files_.end()); 110 | } catch(...) { 111 | INFOE("Glob %s failed", directory.c_str()); 112 | return output; 113 | } 114 | 115 | for(int i = 0; i < files.size(); ++i) { 116 | auto& file = files[i]; 117 | int p = file.rfind("."); 118 | if(p == -1) continue; 119 | 120 | auto suffix = file.substr(p + 1); 121 | std::transform(suffix.begin(), suffix.end(), suffix.begin(), [](char c) { 122 | if(c >= 'A' && c <= 'Z') c -= 'A' + 'a'; 123 | return c; 124 | }); 125 | if(pattern_set.find(suffix) != pattern_set.end()) output.push_back(file); 126 | } 127 | return output; 128 | } 129 | 130 | } // namespace FasterTRT -------------------------------------------------------------------------------- /1_trt_base/trt_yolox/py/tools.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cv2 3 | from typing import Tuple 4 | 5 | 6 | # 图像预处理 7 | def preproc(img: np.ndarray, input_size: tuple, swap: tuple=(2, 0, 1))->Tuple[np.ndarray, float]: 8 | padded_img = np.ones((input_size[0], input_size[1], 3), dtype=np.uint8) * 114 9 | 10 | r = min(input_size[0] / img.shape[0], input_size[1] / img.shape[1]) 11 | resized_img = cv2.resize(img, 12 | (int(img.shape[1] * r), int(img.shape[0] * r)), 13 | interpolation=cv2.INTER_LINEAR, 14 | ).astype(np.uint8) 15 | padded_img[: int(img.shape[0] * r), : int(img.shape[1] * r)] = resized_img 16 | 17 | padded_img = padded_img.transpose(swap) 18 | padded_img = np.ascontiguousarray(padded_img, dtype=np.float32) 19 | return padded_img, r 20 | 21 | # 后处理 不包括nms 22 | def demo_postprocess(outputs: np.ndarray, img_size: tuple, p6: bool=False): 23 | grids = [] 24 | expanded_strides = [] 25 | strides = [8, 16, 32] if not p6 else [8, 16, 32, 64] 26 | 27 | hsizes = [img_size[0] // stride for stride in strides] 28 | wsizes = [img_size[1] // stride for stride in strides] 29 | 30 | for hsize, wsize, stride in zip(hsizes, wsizes, strides): 31 | xv, yv = np.meshgrid(np.arange(wsize), np.arange(hsize)) 32 | grid = np.stack((xv, yv), 2).reshape(1, -1, 2) 33 | grids.append(grid) 34 | shape = grid.shape[:2] 35 | expanded_strides.append(np.full((*shape, 1), stride)) 36 | 37 | grids = np.concatenate(grids, 1) 38 | expanded_strides = np.concatenate(expanded_strides, 1) 39 | outputs[..., :2] = (outputs[..., :2] + grids) * expanded_strides 40 | outputs[..., 2:4] = np.exp(outputs[..., 2:4]) * expanded_strides 41 | 42 | return outputs 43 | 44 | 45 | 46 | def nms(boxes, scores, nms_thr): 47 | """Single class NMS implemented in Numpy.""" 48 | x1 = 
boxes[:, 0] 49 | y1 = boxes[:, 1] 50 | x2 = boxes[:, 2] 51 | y2 = boxes[:, 3] 52 | 53 | areas = (x2 - x1 + 1) * (y2 - y1 + 1) 54 | order = scores.argsort()[::-1] 55 | 56 | keep = [] 57 | while order.size > 0: 58 | i = order[0] 59 | keep.append(i) 60 | xx1 = np.maximum(x1[i], x1[order[1:]]) 61 | yy1 = np.maximum(y1[i], y1[order[1:]]) 62 | xx2 = np.minimum(x2[i], x2[order[1:]]) 63 | yy2 = np.minimum(y2[i], y2[order[1:]]) 64 | 65 | w = np.maximum(0.0, xx2 - xx1 + 1) 66 | h = np.maximum(0.0, yy2 - yy1 + 1) 67 | inter = w * h 68 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 69 | 70 | inds = np.where(ovr <= nms_thr)[0] 71 | order = order[inds + 1] 72 | 73 | return keep 74 | 75 | 76 | def multiclass_nms(boxes, scores, nms_thr, score_thr, class_agnostic=True): 77 | """Multiclass NMS implemented in Numpy""" 78 | if class_agnostic: 79 | nms_method = multiclass_nms_class_agnostic 80 | else: 81 | nms_method = multiclass_nms_class_aware 82 | return nms_method(boxes, scores, nms_thr, score_thr) 83 | 84 | 85 | def multiclass_nms_class_aware(boxes, scores, nms_thr, score_thr): 86 | """Multiclass NMS implemented in Numpy. Class-aware version.""" 87 | final_dets = [] 88 | num_classes = scores.shape[1] 89 | for cls_ind in range(num_classes): 90 | cls_scores = scores[:, cls_ind] 91 | valid_score_mask = cls_scores > score_thr 92 | if valid_score_mask.sum() == 0: 93 | continue 94 | else: 95 | valid_scores = cls_scores[valid_score_mask] 96 | valid_boxes = boxes[valid_score_mask] 97 | keep = nms(valid_boxes, valid_scores, nms_thr) 98 | if len(keep) > 0: 99 | cls_inds = np.ones((len(keep), 1)) * cls_ind 100 | dets = np.concatenate( 101 | [valid_boxes[keep], valid_scores[keep, None], cls_inds], 1 102 | ) 103 | final_dets.append(dets) 104 | if len(final_dets) == 0: 105 | return None 106 | return np.concatenate(final_dets, 0) 107 | 108 | 109 | def multiclass_nms_class_agnostic(boxes, scores, nms_thr, score_thr): 110 | """Multiclass NMS implemented in Numpy. Class-agnostic version.""" 111 | cls_inds = scores.argmax(1) 112 | cls_scores = scores[np.arange(len(cls_inds)), cls_inds] 113 | 114 | valid_score_mask = cls_scores > score_thr 115 | if valid_score_mask.sum() == 0: 116 | return None 117 | valid_scores = cls_scores[valid_score_mask] 118 | valid_boxes = boxes[valid_score_mask] 119 | valid_cls_inds = cls_inds[valid_score_mask] 120 | keep = nms(valid_boxes, valid_scores, nms_thr) 121 | if keep: 122 | dets = np.concatenate( 123 | [valid_boxes[keep], valid_scores[keep, None], valid_cls_inds[keep, None]], 1 124 | ) 125 | return dets 126 | 127 | -------------------------------------------------------------------------------- /1_trt_base/trt_plugin/yolox_end2end/end2end.md: -------------------------------------------------------------------------------- 1 | # Yolo Convert End2End Tensorrt 2 | 3 | yolox默认的后处理是在cpu上直接进行了,所以在经过tensorrt加速后其实后处理还是在cpu上进行。于是考虑吧nms的过程加入到tensorrt生成序列化文件的过程中。因为trt已经有了nms的插件,所以加进去就行了,目前主要有两种思路: 4 | 5 | 1. 参考mmyolo的easydeploy,设置一个TRT::EfficientNMS 的op,在export onnx的时候同时转过去,然后trtexec在模型转换中会自动识别到NMS的插件并替换,实现端到端的操作。 6 | 7 | 2. 生成序列化文件后,通过增加插件层,来让模型在运行中调用NMS插件,实现端到端。 8 | 9 | ## 0. 一些实验 10 | 首先是发现end2end后反而慢了很多再找问题 11 | 1. 测试模型转换。利用trtexec和torch2trt导出测试结果合速度都基本一致。排除此问题。非端到端处理的情况下运行速度为2.84-2.85ms 12 | 2. 经过测试,发现转成端到端的模型后,在c++环境下有加速效果5.045ms->4.577ms. 
但是在py环境下反而速度降低 13 | 14 | 下面是当前的测试结果(通过增加op的方法),处理时间包括前处理(除imread以外的所有操作),后处理(包括nms,不包括绘制矩形和保存图片)。程序首先预热50轮,然后连续运行1000轮,计算平均耗时。 15 | > 平台: 2080ti + i9-9900K + trt8.5 + cuda10.2 + cudnn8.7 16 | 17 | | 程序 | normal | end2end | 18 | | :-----: | :-----: | :-----: | 19 | | python | 4.00ms | 3.26ms | 20 | | C++ | 5.045ms | 4.577ms | 21 | 22 | 23 | 这里看到c++要不py慢这么多,经过试验发现主要是前处理的差距太大了。如果不包括前处理阶段,这样就合理很多了 24 | | 程序 | end2end | 25 | | :-----: | :-----: | 26 | | python | 1.94ms | 27 | | C++ | 1.8ms | 28 | 29 | ## 1. 增加OP 30 | 首先定义一个文件,作为转onnx的过渡op 31 | ```python 32 | import torch 33 | from torch import Tensor 34 | # refer https://github.com/open-mmlab/mmyolo/blob/dev/projects/easydeploy/nms/trt_nms.py 35 | 36 | # onnx 自定义节点 37 | class TRTEfficientNMSop(torch.autograd.Function): 38 | 39 | @staticmethod 40 | def forward( 41 | ctx, 42 | boxes: Tensor, 43 | scores: Tensor, 44 | background_class: int = -1, 45 | box_coding: int = 0, 46 | iou_threshold: float = 0.45, 47 | max_output_boxes: int = 100, 48 | plugin_version: str = '1', 49 | score_activation: int = 0, 50 | score_threshold: float = 0.25, 51 | ): 52 | batch_size, _, num_classes = scores.shape 53 | num_det = torch.randint( 54 | 0, max_output_boxes, (batch_size, 1), dtype=torch.int32) 55 | det_boxes = torch.randn(batch_size, max_output_boxes, 4) 56 | det_scores = torch.randn(batch_size, max_output_boxes) 57 | det_classes = torch.randint( 58 | 0, num_classes, (batch_size, max_output_boxes), dtype=torch.int32) 59 | return num_det, det_boxes, det_scores, det_classes 60 | 61 | @staticmethod 62 | def symbolic(g, 63 | boxes: Tensor, 64 | scores: Tensor, 65 | background_class: int = -1, 66 | box_coding: int = 0, 67 | iou_threshold: float = 0.45, 68 | max_output_boxes: int = 100, 69 | plugin_version: str = '1', 70 | score_activation: int = 0, 71 | score_threshold: float = 0.25): 72 | out = g.op( 73 | 'TRT::EfficientNMS_TRT', 74 | boxes, 75 | scores, 76 | background_class_i=background_class, 77 | box_coding_i=box_coding, 78 | iou_threshold_f=iou_threshold, 79 | max_output_boxes_i=max_output_boxes, 80 | plugin_version_s=plugin_version, 81 | score_activation_i=score_activation, 82 | score_threshold_f=score_threshold, 83 | outputs=4) 84 | num_det, det_boxes, det_scores, det_classes = out 85 | return num_det, det_boxes, det_scores, det_classes 86 | 87 | 88 | def efficient_nms( 89 | boxes: Tensor, 90 | scores: Tensor, 91 | max_output_boxes_per_class: int = 1000, 92 | iou_threshold: float = 0.5, 93 | score_threshold: float = 0.05, 94 | pre_top_k: int = -1, 95 | keep_top_k: int = 100, 96 | box_coding: int = 0, 97 | ): 98 | 99 | num_det, det_boxes, det_scores, det_classes = TRTEfficientNMSop.apply( 100 | boxes, scores, -1, box_coding, iou_threshold, keep_top_k, '1', 0, 101 | score_threshold) 102 | return num_det, det_boxes, det_scores, det_classes 103 | 104 | ``` 105 | 106 | 针对yolox输出的模型,建立一个部署模型来增加后处理op,这个地方需要注意nms的输入需要对应到官方插件库的输入输出说明[nms tensorrt-plugin](https://github.com/NVIDIA/TensorRT/tree/release/8.5/plugin/efficientNMSPlugin),对这个新模型进行export即可, 107 | ```python 108 | class DeployModel(nn.Module): 109 | def __init__(self, baseModel: nn.Module): 110 | super().__init__() 111 | self.baseModel = baseModel 112 | 113 | self.pre_top_k = 1000 114 | self.keep_top_k = 100 115 | self.iou_threshold = 0.45 116 | self.score_threshold = 0.1 117 | 118 | def forward(self, inputs: Tensor): 119 | outputs = self.baseModel(inputs) 120 | 121 | bboxes = outputs[:, :, :4] 122 | scores = outputs[:, :, 4:5] * outputs[:, :, 5:] 123 | 124 | return efficient_nms(bboxes, scores, 
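            # Positional arguments follow efficient_nms() defined above:
            # max_output_boxes_per_class, iou_threshold, score_threshold, pre_top_k, keep_top_k.
            # box_coding=1 selects the plugin's center-size box encoding, since the YOLOX head
            # predicts boxes as (cx, cy, w, h).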
self.keep_top_k, self.iou_threshold, 125 | self.score_threshold, self.pre_top_k, self.keep_top_k, box_coding=1) 126 | ``` 127 | 128 | 然后就可以直接使用trtexec转成trt的engine文件了。 129 | 130 | -------------------------------------------------------------------------------- /1_trt_base/trt_demo/trt_py/trt_python.py: -------------------------------------------------------------------------------- 1 | import tensorrt as trt 2 | # import pycuda.autoinit 3 | # import pycuda.driver as cuda 4 | import numpy as np 5 | from cuda import cudart 6 | 7 | model_path = '../files/model.onnx' 8 | model_engine_path = '../files/model_py.engine' 9 | 10 | 11 | def get_engine(): 12 | # 构建阶段 13 | logger = trt.Logger(trt.Logger.WARNING) # logger 14 | builder = trt.Builder(logger) # builder 15 | 16 | # 创建网络定义 17 | network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)) 18 | profile = builder.create_optimization_profile() # 动态尺寸的话需要这个 19 | config = builder.create_builder_config() # 配置 20 | config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 20) # 1 MiB 21 | 22 | # 创建解析器 23 | parser = trt.OnnxParser(network, logger) 24 | success = parser.parse_from_file(model_path) # 加载文件 25 | for idx in range(parser.num_errors): 26 | print(parser.get_error(idx)) 27 | 28 | # 构建engine 29 | serialized_engine = builder.build_serialized_network(network, config) 30 | 31 | with open(model_engine_path, "wb") as f: 32 | f.write(serialized_engine) 33 | 34 | def inferV1(): 35 | logger = trt.Logger(trt.Logger.WARNING) # logger 36 | 37 | runtime = trt.Runtime(logger) 38 | with open(model_engine_path, "rb") as f: 39 | serialized_engine = f.read() 40 | engine = runtime.deserialize_cuda_engine(serialized_engine) 41 | 42 | 43 | stream = cuda.Stream() 44 | context = engine.create_execution_context() 45 | 46 | h_input = cuda.pagelocked_empty(trt.volume(context.get_binding_shape(0)), dtype=np.float32) 47 | h_output = cuda.pagelocked_empty(trt.volume(context.get_binding_shape(1)), dtype=np.float32) 48 | d_input = cuda.mem_alloc(h_input.nbytes) 49 | d_output = cuda.mem_alloc(h_output.nbytes) 50 | 51 | with engine.create_execution_context() as context: 52 | # Transfer input data to the GPU. 53 | cuda.memcpy_htod_async(d_input, h_input, stream) 54 | # Run inference. 55 | context.execute_async_v2(bindings=[int(d_input), int(d_output)], stream_handle=stream.handle) 56 | # Transfer predictions back from the GPU. 57 | cuda.memcpy_dtoh_async(h_output, d_output, stream) 58 | # Synchronize the stream 59 | stream.synchronize() 60 | # Return the host output. 
该数据等同于原始模型的输出数据 61 | return h_output 62 | 63 | 64 | def inferV2(): 65 | logger = trt.Logger(trt.Logger.WARNING) # logger 66 | 67 | runtime = trt.Runtime(logger) 68 | with open(model_engine_path, "rb") as f: 69 | serialized_engine = f.read() 70 | engine = runtime.deserialize_cuda_engine(serialized_engine) 71 | 72 | nIO = engine.num_io_tensors # io变量数量 73 | lTensorName = [engine.get_tensor_name(i) for i in range(nIO)] # 获取io变量名字 74 | nInput = [engine.get_tensor_mode(lTensorName[i]) for i in range(nIO)].count(trt.TensorIOMode.INPUT) # 输入tensor数量 75 | Output = [engine.get_tensor_mode(lTensorName[i]) for i in range(nIO)].count(trt.TensorIOMode.OUTPUT) 76 | 77 | context = engine.create_execution_context() 78 | print("===============INPUT/OUTPUT=================== ") 79 | for i in range(nIO): 80 | print(f"[{i}]{'Input ' if i < nInput else 'Output'} -> "+ 81 | f"{engine.get_tensor_dtype(lTensorName[i])} " + # 数据类型 82 | f"{engine.get_tensor_shape(lTensorName[i])} " + # engine形状 83 | f"{context.get_tensor_shape(lTensorName[i])} " + # context形状 84 | f"{lTensorName[i]} ") # 名字 85 | print("============================================ ") 86 | 87 | data = np.arange(3 * 224 * 224, dtype=np.float32).reshape(1, 3, 224, 224) 88 | 89 | # cpu端数据 90 | bufferH = [] 91 | bufferH.append(np.ascontiguousarray(data)) # 输入数据转内存连续 92 | for i in range(nInput, nIO): # 输出数据 93 | bufferH.append(np.empty(context.get_tensor_shape(lTensorName[i]), dtype=trt.nptype(engine.get_tensor_dtype(lTensorName[i])))) 94 | 95 | # gpu端数据申请显存 96 | bufferD = [] 97 | for i in range(nIO): 98 | bufferD.append(cudart.cudaMalloc(bufferH[i].nbytes)[1]) 99 | 100 | # 输入数据复制到显存 101 | for i in range(nInput): 102 | cudart.cudaMemcpy(bufferD[i], bufferH[i].ctypes.data, bufferH[i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice) 103 | 104 | # 设置输入输出数据的地址(buffer) 105 | for i in range(nIO): 106 | context.set_tensor_address(lTensorName[i], int(bufferD[i])) 107 | 108 | # 推理 109 | context.execute_async_v3(0) 110 | 111 | for i in range(nInput, nIO): # 数据拷会cpu 112 | cudart.cudaMemcpy(bufferH[i].ctypes.data, bufferD[i], bufferH[i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost) 113 | 114 | for i in range(nIO): 115 | print(f'{lTensorName[i]}:\t {bufferH[i].shape}') 116 | 117 | 118 | for b in bufferD: # 释放显存 119 | cudart.cudaFree(b) 120 | 121 | 122 | if __name__ == '__main__': 123 | # get_engine() 124 | # print(infer().shape) 125 | inferV2() # 推荐 126 | -------------------------------------------------------------------------------- /1_trt_base/trt_plugin/demo01/test.py: -------------------------------------------------------------------------------- 1 | import ctypes 2 | from cuda import cudart 3 | import numpy as np 4 | import os 5 | import tensorrt as trt 6 | 7 | SOFILE = './build/libdemo01.so' 8 | np.set_printoptions(precision=3, linewidth=100, suppress=True) # 控制Python中小数的显示精度 9 | np.random.seed(123456) 10 | cudart.cudaDeviceSynchronize() 11 | 12 | def printArrayInfomation(x, info="", n=5): 13 | print(f"{info}: {x.shape}, SumAbs={np.sum(abs(x)) :.5e}, Var={np.var(x) :.5f}, \ 14 | Max={np.max(x) :.5f},Min={np.min(x) :.5f},SAD={np.sum(np.abs(np.diff(x.reshape(-1)))) :.5f}") 15 | print('\t', x.reshape(-1)[:n], x.reshape(-1)[-n:]) 16 | 17 | def check(a, b, weak=False, checkEpsilon=1e-5): 18 | if weak: 19 | res = np.all(np.abs(a - b) < checkEpsilon) 20 | else: 21 | res = np.all(a == b) 22 | diff0 = np.max(np.abs(a - b)) 23 | diff1 = np.max(np.abs(a - b) / (np.abs(b) + checkEpsilon)) 24 | print(f"check:{res}, absDiff={diff0}, relDiff={diff1}") 25 | 26 | 
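# CPU reference for the AddScalar plugin: adds `scalar` element-wise to the first input,
# used by check() below to validate the output produced by the TensorRT plugin.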
def addScalarCPU(inputH, scalar): 27 | return [inputH[0] + scalar] 28 | 29 | def getAddScalarPlugin(scalar): 30 | for c in trt.get_plugin_registry().plugin_creator_list: 31 | if c.name == "ZZX_ADDScalar": 32 | parameterList = [] 33 | parameterList.append(trt.PluginField("scalar", np.float32(scalar), trt.PluginFieldType.FLOAT32)) 34 | return c.create_plugin(c.name, trt.PluginFieldCollection(parameterList)) 35 | return None 36 | 37 | 38 | def run(shape, scalar): 39 | testCase = f"" 40 | trtFile = f"./model-Dim{len(shape)}.plan" 41 | print(f"Test {testCase}") 42 | logger = trt.Logger(trt.Logger.ERROR) 43 | trt.init_libnvinfer_plugins(logger, '') 44 | ctypes.cdll.LoadLibrary(SOFILE) 45 | 46 | if os.path.isfile(trtFile): 47 | with open(trtFile, "rb") as f: 48 | engine = trt.Runtime(logger).deserialize_cuda_engine(f.read()) 49 | if engine == None: 50 | print("Failed loading engine!") 51 | return 52 | print("Succeeded loading engine!") 53 | else: 54 | builder = trt.Builder(logger) 55 | network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)) 56 | profile = builder.create_optimization_profile() 57 | config = builder.create_builder_config() 58 | 59 | inputT0 = network.add_input("inputT0", trt.float32, [-1 for i in shape]) 60 | profile.set_shape(inputT0.name, [1 for i in shape], [8 for i in shape], [32 for i in shape]) 61 | config.add_optimization_profile(profile) 62 | 63 | pluginLayer = network.add_plugin_v2([inputT0], getAddScalarPlugin(scalar)) 64 | network.mark_output(pluginLayer.get_output(0)) 65 | engineString = builder.build_serialized_network(network, config) 66 | if engineString == None: 67 | print("Failed building engine!") 68 | return 69 | print("Succeeded building engine!") 70 | with open(trtFile, "wb") as f: 71 | f.write(engineString) 72 | engine = trt.Runtime(logger).deserialize_cuda_engine(engineString) 73 | 74 | nIO = engine.num_io_tensors 75 | lTensorName = [engine.get_tensor_name(i) for i in range(nIO)] 76 | nInput = [engine.get_tensor_mode(lTensorName[i]) for i in range(nIO)].count(trt.TensorIOMode.INPUT) 77 | 78 | context = engine.create_execution_context() 79 | context.set_input_shape(lTensorName[0], shape) 80 | #for i in range(nIO): 81 | # print("[%2d]%s->" % (i, "Input " if i < nInput else "Output"), engine.get_tensor_dtype(lTensorName[i]), engine.get_tensor_shape(lTensorName[i]), context.get_tensor_shape(lTensorName[i]), lTensorName[i]) 82 | 83 | bufferH = [] 84 | bufferH.append(np.arange(np.prod(shape), dtype=np.float32).reshape(shape)) 85 | for i in range(nInput, nIO): 86 | bufferH.append(np.empty(context.get_tensor_shape(lTensorName[i]), dtype=trt.nptype(engine.get_tensor_dtype(lTensorName[i])))) 87 | bufferD = [] 88 | for i in range(nIO): 89 | bufferD.append(cudart.cudaMalloc(bufferH[i].nbytes)[1]) 90 | 91 | for i in range(nInput): 92 | cudart.cudaMemcpy(bufferD[i], bufferH[i].ctypes.data, bufferH[i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice) 93 | 94 | for i in range(nIO): 95 | context.set_tensor_address(lTensorName[i], int(bufferD[i])) 96 | 97 | context.execute_async_v3(0) 98 | 99 | for i in range(nInput, nIO): 100 | cudart.cudaMemcpy(bufferH[i].ctypes.data, bufferD[i], bufferH[i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost) 101 | 102 | outputCPU = addScalarCPU(bufferH[:nInput], scalar) 103 | """ 104 | for i in range(nInput): 105 | printArrayInfomation(bufferH[i]) 106 | for i in range(nInput, nIO): 107 | printArrayInfomation(bufferH[i]) 108 | for i in range(nInput, nIO): 109 | printArrayInfomation(outputCPU[i - 
nInput]) 110 | """ 111 | check(bufferH[nInput:][0], outputCPU[0], True) 112 | 113 | for b in bufferD: 114 | cudart.cudaFree(b) 115 | print(f"Test {testCase} finish!") 116 | 117 | if __name__ == "__main__": 118 | os.system("rm -rf ./*.plan") 119 | 120 | run([32], 1) 121 | run([32, 32], 1) 122 | run([16, 16, 16], 1) 123 | run([8, 8, 8, 8], 1) 124 | run([32], 1) 125 | run([32, 32], 1) 126 | run([16, 16, 16], 1) 127 | run([8, 8, 8, 8], 1) 128 | 129 | print("Test all finish!") 130 | -------------------------------------------------------------------------------- /1_trt_base/trt_yolox/py/trt.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import time 3 | import numpy as np 4 | import tensorrt as trt 5 | from cuda import cudart 6 | 7 | from tools import preproc, demo_postprocess, multiclass_nms 8 | 9 | model_path = '/home/zzx/Github/zzx_yolo/EXTRA_PKG/TensorRT-8.5.3.1/bin/yolox.engine' 10 | img_path = "../../imgs/000026.jpg" 11 | score_thr = 0.5 12 | 13 | COCO_CLASSES = ('echinus', 'starfish', 'holothurian', 'scallop') 14 | _COLORS = np.array( 15 | [ 16 | 0.000, 0.447, 0.741, 17 | 0.850, 0.325, 0.098, 18 | 0.929, 0.694, 0.125, 19 | 0.494, 0.184, 0.556, 20 | ] 21 | ).astype(np.float32).reshape(-1, 3) 22 | 23 | 24 | def vis(img, boxes, scores, cls_ids, conf=0.5, class_names=None): 25 | for i in range(len(boxes)): 26 | box = boxes[i] 27 | cls_id = int(cls_ids[i]) 28 | score = scores[i] 29 | if score < conf: 30 | continue 31 | x0 = int(box[0]) 32 | y0 = int(box[1]) 33 | x1 = int(box[2]) 34 | y1 = int(box[3]) 35 | 36 | color = (_COLORS[cls_id] * 255).astype(np.uint8).tolist() 37 | text = '{}:{:.1f}%'.format(class_names[cls_id], score * 100) 38 | txt_color = (0, 0, 0) if np.mean(_COLORS[cls_id]) > 0.5 else (255, 255, 255) 39 | font = cv2.FONT_HERSHEY_SIMPLEX 40 | 41 | txt_size = cv2.getTextSize(text, font, 0.4, 1)[0] 42 | cv2.rectangle(img, (x0, y0), (x1, y1), color, 2) 43 | 44 | txt_bk_color = (_COLORS[cls_id] * 255 * 0.7).astype(np.uint8).tolist() 45 | cv2.rectangle( 46 | img, 47 | (x0, y0 + 1), 48 | (x0 + txt_size[0] + 1, y0 + int(1.5*txt_size[1])), 49 | txt_bk_color, 50 | -1 51 | ) 52 | cv2.putText(img, text, (x0, y0 + txt_size[1]), font, 0.4, txt_color, thickness=1) 53 | 54 | return img 55 | 56 | 57 | 58 | class YoloTRT: 59 | def __init__(self) -> None: 60 | # 构建阶段 61 | self.logger = trt.Logger(trt.Logger.WARNING) # logger 62 | trt.init_libnvinfer_plugins( self.logger, namespace='') # 加载插件 63 | 64 | self.runtime = trt.Runtime(self.logger) 65 | 66 | with open(model_path, "rb") as f: 67 | serialized_engine = f.read() 68 | self.engine = self.runtime.deserialize_cuda_engine(serialized_engine) 69 | self.context = self.engine.create_execution_context() 70 | 71 | self.nIO = self.engine.num_io_tensors # io变量数量 72 | self.lTensorName = [self.engine.get_tensor_name(i) for i in range(self.nIO)] # 获取io变量名字 73 | self.nInput = [self.engine.get_tensor_mode(self.lTensorName[i]) for i in range(self.nIO)].count(trt.TensorIOMode.INPUT) # 输入tensor数量 74 | self.Output = [self.engine.get_tensor_mode(self.lTensorName[i]) for i in range(self.nIO)].count(trt.TensorIOMode.OUTPUT) 75 | 76 | print("===============INPUT/OUTPUT=================== ") 77 | for i in range(self.nIO): 78 | print(f"[{i}]{'Input ' if i < self.nInput else 'Output'} -> "+ 79 | f"{self.engine.get_tensor_dtype(self.lTensorName[i])} " + # 数据类型 80 | f"{self.engine.get_tensor_shape(self.lTensorName[i])} " + # engine形状 81 | f"{self.context.get_tensor_shape(self.lTensorName[i])} " + # context形状 82 | 
f"{self.lTensorName[i]} ") # 名字 83 | print("============================================ ") 84 | 85 | # cpu端数据 86 | self.bufferH = [] 87 | for i in range(self.nIO): 88 | self.bufferH.append(np.empty(self.context.get_tensor_shape(self.lTensorName[i]), 89 | dtype=trt.nptype(self.engine.get_tensor_dtype(self.lTensorName[i])))) 90 | 91 | # # gpu端数据申请显存 92 | self.bufferD = [] 93 | for i in range(self.nIO): 94 | self.bufferD.append(cudart.cudaMalloc(self.bufferH[i].nbytes)[1]) 95 | 96 | def infer(self, origin_img): 97 | data, ratio = preproc(origin_img, (640, 640)) 98 | # cpu端数据 99 | self.bufferH[0] = data 100 | 101 | for i in range(self.nInput, self.nIO): # 输出数据 102 | self.bufferH.append(np.empty(self.context.get_tensor_shape(self.lTensorName[i]), 103 | dtype=trt.nptype(self.engine.get_tensor_dtype(self.lTensorName[i])))) 104 | 105 | # 输入数据复制到显存 106 | for i in range(self.nInput): 107 | cudart.cudaMemcpy(self.bufferD[i], self.bufferH[i].ctypes.data, self.bufferH[i].nbytes, 108 | cudart.cudaMemcpyKind.cudaMemcpyHostToDevice) 109 | 110 | # # 推理 111 | self.context.execute_v2(self.bufferD) # batchsize bingings 112 | for i in range(self.nInput, self.nIO): # 数据拷会cpu 113 | cudart.cudaMemcpy(self.bufferH[i].ctypes.data, self.bufferD[i], self.bufferH[i].nbytes, 114 | cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost) 115 | 116 | predictions = demo_postprocess(self.bufferH[1], (640, 640))[0] 117 | boxes = predictions[:, :4] 118 | scores = predictions[:, 4:5] * predictions[:, 5:] 119 | boxes_xyxy = np.ones_like(boxes) 120 | boxes_xyxy[:, 0] = boxes[:, 0] - boxes[:, 2]/2. 121 | boxes_xyxy[:, 1] = boxes[:, 1] - boxes[:, 3]/2. 122 | boxes_xyxy[:, 2] = boxes[:, 0] + boxes[:, 2]/2. 123 | boxes_xyxy[:, 3] = boxes[:, 1] + boxes[:, 3]/2. 124 | boxes_xyxy /= ratio 125 | dets = multiclass_nms(boxes_xyxy, scores, nms_thr=0.45, score_thr=0.1) 126 | 127 | return dets 128 | 129 | def plot_save(self, origin_img, dets): 130 | final_boxes, final_scores, final_cls_inds = dets[:, :4], dets[:, 4], dets[:, 5] 131 | origin_img = vis(origin_img, final_boxes, final_scores, final_cls_inds, 132 | conf=score_thr, class_names=COCO_CLASSES) 133 | cv2.imwrite("ans.jpg", origin_img) 134 | 135 | def myfree(self): 136 | for i in self.bufferD: # 释放显存 137 | cudart.cudaFree(i) 138 | 139 | 140 | 141 | if __name__ == '__main__': 142 | origin_img = cv2.imread(img_path) 143 | 144 | yolo_trt = YoloTRT() 145 | for _ in range(50): 146 | yolo_trt.infer(origin_img) 147 | 148 | time_b = time.perf_counter() 149 | for _ in range(1000): 150 | dets = yolo_trt.infer(origin_img) 151 | 152 | time_e = time.perf_counter() 153 | print(f"cost time: {(time_e-time_b)*1000 / 1000.0 :.2f}ms") 154 | 155 | yolo_trt.plot_save(origin_img, dets) 156 | 157 | 158 | 159 | yolo_trt.myfree() 160 | 161 | -------------------------------------------------------------------------------- /1_trt_base/trt_demo/trt_cpp/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include"NvInfer.h" 5 | #include "NvOnnxParser.h" 6 | #include "cookbookHelper.cuh" 7 | 8 | using namespace nvinfer1; 9 | // using namespace nvonnxparser; 10 | 11 | // const std::string trtfile {"../files/model_cpp.engine"}; 12 | // const std::string onnxfile {"../files/model.onnx"}; 13 | const char* trtfile="../../files/model_cpp.engine"; 14 | const char* onnxfile="../../files/model.onnx"; 15 | 16 | 17 | // logger 18 | class MyLogger : public ILogger 19 | { 20 | void log(Severity severity, const char* msg) noexcept override 
//noexcept不会抛出异常。override虚函数重写 21 | { 22 | // suppress info-level messages 23 | if (severity <= Severity::kWARNING) 24 | std::cout << msg << std::endl; 25 | } 26 | } logger; 27 | 28 | 29 | void get_engine(){ 30 | IBuilder* builder = createInferBuilder(logger); 31 | INetworkDefinition* network = builder->createNetworkV2(1U << int(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH)); 32 | IOptimizationProfile* profile = builder->createOptimizationProfile(); 33 | IBuilderConfig* config = builder->createBuilderConfig(); 34 | config->setMemoryPoolLimit(MemoryPoolType::kWORKSPACE, 1 << 30); 35 | /* 这个是指定输入尺寸和输入名字的 36 | ITensor *inputTensor = network->addInput("inputT0", DataType::kFLOAT, Dims32 {3, {-1, -1, -1}}); 37 | profile->setDimensions(inputTensor->getName(), OptProfileSelector::kMIN, Dims32 {3, {1, 1, 1}}); 38 | profile->setDimensions(inputTensor->getName(), OptProfileSelector::kOPT, Dims32 {3, {3, 4, 5}}); 39 | profile->setDimensions(inputTensor->getName(), OptProfileSelector::kMAX, Dims32 {3, {6, 8, 10}}); 40 | config->addOptimizationProfile(profile); 41 | */ 42 | // onnx解析器 43 | nvonnxparser::IParser* parser = nvonnxparser::createParser(*network, logger); 44 | parser->parseFromFile(onnxfile, static_cast(ILogger::Severity::kWARNING)); 45 | for(int32_t i = 0; i < parser->getNbErrors(); ++i){ 46 | std::cout << parser->getError(i)->desc() << std::endl; 47 | } 48 | 49 | IHostMemory* engineString = builder->buildSerializedNetwork(*network, *config); //创建engine 50 | if(engineString == nullptr || engineString->size()==0){ 51 | std::cout<<"building 序列化 engine失败"<(engineString->data()), engineString->size()); 57 | if (engineFile.fail()) 58 | { 59 | std::cout << "保存失败" << std::endl; 60 | return; 61 | } 62 | std::cout << "生成成功!" << std::endl; 63 | } 64 | 65 | 66 | 67 | void infer(){ 68 | ICudaEngine *engine = nullptr; 69 | 70 | std::ifstream engineFile(trtfile, std::ios::binary); 71 | long int fsize = 0; 72 | 73 | engineFile.seekg(0, engineFile.end); // 指针设到文件最后,也就是继续写入 74 | fsize = engineFile.tellg(); // 返回当前定位指针的位置,也代表着输入流的大小 75 | engineFile.seekg(0, engineFile.beg); // 文件开头 76 | std::vector engineString(fsize); 77 | engineFile.read(engineString.data(), fsize); 78 | 79 | if(engineString.size() == 0){ 80 | std::cout<<"读取序列化数据失败"<deserializeCudaEngine(engineString.data(), fsize); 86 | if(engine == nullptr){ 87 | std::cout<<"反序列化失败"<getNbIOTensors(); // io数量 92 | int nInput = 0; 93 | int nOutput = 0; 94 | std::vector vTensorName(nIO); 95 | for (int i = 0; i < nIO; ++i){ 96 | vTensorName[i] = std::string(engine->getIOTensorName(i)); 97 | nInput += int(engine->getTensorIOMode(vTensorName[i].c_str()) == TensorIOMode::kINPUT); 98 | nOutput += int(engine->getTensorIOMode(vTensorName[i].c_str()) == TensorIOMode::kOUTPUT); 99 | } 100 | 101 | IExecutionContext* context = engine->createExecutionContext(); 102 | // context->setInputShape(vTensorName[0].c_str(), Dims32 {3, 3, 4, 5}); 103 | 104 | // 打印输出输出形状啥的 105 | for (int i = 0; i < nIO; ++i){ 106 | std::cout< "); 108 | std::cout << dataTypeToString(engine->getTensorDataType(vTensorName[i].c_str())) << std::string(" "); 109 | std::cout << shapeToString(engine->getTensorShape(vTensorName[i].c_str())) << std::string(" "); 110 | std::cout << shapeToString(context->getTensorShape(vTensorName[i].c_str())) << std::string(" "); 111 | std::cout << vTensorName[i] << std::endl; 112 | } 113 | 114 | std::vector vTensorSize(nIO, 0); 115 | for (int i = 0; i < nIO; ++i){ 116 | Dims32 dim = context->getTensorShape(vTensorName[i].c_str()); 117 | int size = 1; 118 | for (int j = 
0; j < dim.nbDims; ++j){ 119 | size *= dim.d[j]; 120 | } 121 | vTensorSize[i] = size * dataTypeToSize(engine->getTensorDataType(vTensorName[i].c_str())); 122 | } 123 | 124 | std::vector vBufferH {nIO, nullptr}; // cpu 125 | std::vector vBufferD {nIO, nullptr}; // gpu 126 | // gpu分配内存 127 | for (int i = 0; i < nIO; ++i){ 128 | vBufferH[i] = (void *)new char[vTensorSize[i]]; 129 | CHECK(cudaMalloc(&vBufferD[i], vTensorSize[i])); 130 | } 131 | 132 | // 赋值 133 | float *pData = (float *)vBufferH[0]; 134 | for (int i = 0; i < vTensorSize[0] / dataTypeToSize(engine->getTensorDataType(vTensorName[0].c_str())); ++i){ 135 | pData[i] = float(i); 136 | } 137 | 138 | // 数据复制 cpu -> gpu 139 | for (int i = 0; i < nInput; ++i){ 140 | CHECK(cudaMemcpy(vBufferD[i], vBufferH[i], vTensorSize[i], cudaMemcpyHostToDevice)); 141 | } 142 | 143 | // gpu上名字对应地址 144 | for (int i = 0; i < nIO; ++i){ 145 | context->setTensorAddress(vTensorName[i].c_str(), vBufferD[i]); 146 | } 147 | 148 | // 推理 149 | context->enqueueV3(0); 150 | 151 | // 数据复制 gpu -> cpu 152 | for (int i = nInput; i < nIO; ++i){ 153 | CHECK(cudaMemcpy(vBufferH[i], vBufferD[i], vTensorSize[i], cudaMemcpyDeviceToHost)); 154 | } 155 | 156 | // 打印输出 157 | // for (int i = 0; i < nIO; ++i){ 158 | // printArrayInfomation((float *)vBufferH[i], context->getTensorShape(vTensorName[i].c_str()), vTensorName[i], true); 159 | // } 160 | 161 | // 释放内存 释放gpu显存 162 | for (int i = 0; i < nIO; ++i){ 163 | delete[] vBufferH[i]; 164 | CHECK(cudaFree(vBufferD[i])); 165 | } 166 | 167 | return; 168 | } 169 | 170 | 171 | int main(){ 172 | CHECK(cudaSetDevice(0)); 173 | // get_engine(); 174 | infer(); 175 | return 0; 176 | } -------------------------------------------------------------------------------- /1_trt_base/trt_yolox/py/trt_end2end.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | import cv2 3 | import time 4 | import numpy as np 5 | import tensorrt as trt 6 | from cuda import cudart 7 | 8 | 9 | model_path = "/home/zzx/Github/zzx_yolo/EXTRA_PKG/TensorRT-8.5.3.1/bin/yolox_end2end.engine" 10 | img_path = "../../imgs/000026.jpg" 11 | score_thr = 0.5 12 | 13 | COCO_CLASSES = ('echinus', 'starfish', 'holothurian', 'scallop') 14 | _COLORS = np.array( 15 | [ 16 | 0.000, 0.447, 0.741, 17 | 0.850, 0.325, 0.098, 18 | 0.929, 0.694, 0.125, 19 | 0.494, 0.184, 0.556, 20 | ] 21 | ).astype(np.float32).reshape(-1, 3) 22 | 23 | 24 | def preproc(img: np.ndarray, input_size: tuple, swap: tuple=(2, 0, 1))->Tuple[np.ndarray, float]: 25 | padded_img = np.ones((input_size[0], input_size[1], 3), dtype=np.uint8) * 114 26 | 27 | r = min(input_size[0] / img.shape[0], input_size[1] / img.shape[1]) 28 | resized_img = cv2.resize(img, 29 | (int(img.shape[1] * r), int(img.shape[0] * r)), 30 | interpolation=cv2.INTER_LINEAR, 31 | ).astype(np.uint8) 32 | padded_img[: int(img.shape[0] * r), : int(img.shape[1] * r)] = resized_img 33 | 34 | padded_img = padded_img.transpose(swap) 35 | padded_img = np.ascontiguousarray(padded_img, dtype=np.float32) 36 | return padded_img, r 37 | 38 | 39 | def vis(img, boxes, scores, cls_ids, conf=0.5, class_names=None): 40 | for i in range(len(boxes)): 41 | box = boxes[i] 42 | cls_id = int(cls_ids[i]) 43 | score = scores[i] 44 | if score < conf: 45 | continue 46 | x0 = int(box[0]) 47 | y0 = int(box[1]) 48 | x1 = int(box[2]) 49 | y1 = int(box[3]) 50 | 51 | color = (_COLORS[cls_id] * 255).astype(np.uint8).tolist() 52 | text = '{}:{:.1f}%'.format(class_names[cls_id], score * 100) 53 | txt_color = (0, 0, 0) if 
np.mean(_COLORS[cls_id]) > 0.5 else (255, 255, 255) 54 | font = cv2.FONT_HERSHEY_SIMPLEX 55 | 56 | txt_size = cv2.getTextSize(text, font, 0.4, 1)[0] 57 | cv2.rectangle(img, (x0, y0), (x1, y1), color, 2) 58 | 59 | txt_bk_color = (_COLORS[cls_id] * 255 * 0.7).astype(np.uint8).tolist() 60 | cv2.rectangle( 61 | img, 62 | (x0, y0 + 1), 63 | (x0 + txt_size[0] + 1, y0 + int(1.5*txt_size[1])), 64 | txt_bk_color, 65 | -1 66 | ) 67 | cv2.putText(img, text, (x0, y0 + txt_size[1]), font, 0.4, txt_color, thickness=1) 68 | 69 | return img 70 | 71 | 72 | 73 | class YoloTRT: 74 | def __init__(self) -> None: 75 | # 构建阶段 76 | self.logger = trt.Logger(trt.Logger.WARNING) # logger 77 | trt.init_libnvinfer_plugins( self.logger, namespace='') # 加载插件 78 | 79 | self.runtime = trt.Runtime(self.logger) 80 | 81 | with open(model_path, "rb") as f: 82 | serialized_engine = f.read() 83 | self.engine = self.runtime.deserialize_cuda_engine(serialized_engine) 84 | self.context = self.engine.create_execution_context() 85 | 86 | self.nIO = self.engine.num_io_tensors # io变量数量 87 | self.lTensorName = [self.engine.get_tensor_name(i) for i in range(self.nIO)] # 获取io变量名字 88 | self.nInput = [self.engine.get_tensor_mode(self.lTensorName[i]) for i in range(self.nIO)].count(trt.TensorIOMode.INPUT) # 输入tensor数量 89 | self.Output = [self.engine.get_tensor_mode(self.lTensorName[i]) for i in range(self.nIO)].count(trt.TensorIOMode.OUTPUT) 90 | 91 | print("===============INPUT/OUTPUT=================== ") 92 | for i in range(self.nIO): 93 | print(f"[{i}]{'Input ' if i < self.nInput else 'Output'} -> "+ 94 | f"{self.engine.get_tensor_dtype(self.lTensorName[i])} " + # 数据类型 95 | f"{self.engine.get_tensor_shape(self.lTensorName[i])} " + # engine形状 96 | f"{self.context.get_tensor_shape(self.lTensorName[i])} " + # context形状 97 | f"{self.lTensorName[i]} ") # 名字 98 | print("============================================ ") 99 | 100 | # cpu端数据 101 | self.bufferH = [] 102 | for i in range(self.nIO): 103 | self.bufferH.append(np.empty(self.context.get_tensor_shape(self.lTensorName[i]), 104 | dtype=trt.nptype(self.engine.get_tensor_dtype(self.lTensorName[i])))) 105 | 106 | # # gpu端数据申请显存 107 | self.bufferD = [] 108 | for i in range(self.nIO): 109 | self.bufferD.append(cudart.cudaMalloc(self.bufferH[i].nbytes)[1]) 110 | 111 | def infer(self, origin_img): 112 | data, ratio = preproc(origin_img, (640, 640)) 113 | # cpu端数据 114 | self.bufferH = [] 115 | self.bufferH.append(data) # 输入数据转内存连续 116 | # self.bufferH.append(np.ascontiguousarray(data)) # 输入数据转内存连续 117 | 118 | for i in range(self.nInput, self.nIO): # 输出数据 119 | self.bufferH.append(np.empty(self.context.get_tensor_shape(self.lTensorName[i]), 120 | dtype=trt.nptype(self.engine.get_tensor_dtype(self.lTensorName[i])))) 121 | 122 | # 输入数据复制到显存 123 | for i in range(self.nInput): 124 | cudart.cudaMemcpy(self.bufferD[i], self.bufferH[i].ctypes.data, self.bufferH[i].nbytes, 125 | cudart.cudaMemcpyKind.cudaMemcpyHostToDevice) 126 | 127 | # # 推理 128 | self.context.execute_v2(self.bufferD) # batchsize bingings 129 | for i in range(self.nInput, self.nIO): # 数据拷会cpu 130 | cudart.cudaMemcpy(self.bufferH[i].ctypes.data, self.bufferD[i], self.bufferH[i].nbytes, 131 | cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost) 132 | 133 | dets = self.bufferH[self.nInput:self.nIO] 134 | 135 | return dets 136 | 137 | # def plot_save(self, origin_img, dets): 138 | # final_boxes, final_scores, final_cls_inds = dets[:, :4], dets[:, 4], dets[:, 5] 139 | # origin_img = vis(origin_img, final_boxes, final_scores, final_cls_inds, 140 | # 
conf=score_thr, class_names=COCO_CLASSES) 141 | # cv2.imwrite("ans.jpg", origin_img) 142 | 143 | def myfree(self): 144 | for i in self.bufferD: # 释放显存 145 | cudart.cudaFree(i) 146 | 147 | 148 | 149 | if __name__ == '__main__': 150 | origin_img = cv2.imread(img_path) 151 | 152 | yolo_trt = YoloTRT() 153 | for _ in range(50): 154 | yolo_trt.infer(origin_img) 155 | 156 | time_b = time.perf_counter() 157 | for _ in range(1000): 158 | dets = yolo_trt.infer(origin_img) 159 | 160 | time_e = time.perf_counter() 161 | print(f"cost time: {(time_e-time_b)*1000 / 1000.0 :.2f}ms") 162 | # print(dets) 163 | 164 | # yolo_trt.plot_save(origin_img, dets) 165 | 166 | yolo_trt.myfree() 167 | -------------------------------------------------------------------------------- /2_faster_tensorrt/src/kernels/cuda_kernel.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * @file cuda_kernel.cuh 3 | * @author 0zzx0 4 | * @brief 定义一些自定义的CUDA操作,主要是预处理部分和后处理部分的加速 5 | * @version 1.0 6 | * @date 2023-6-11 7 | * 8 | * @copyright Copyright (c) 2023 9 | * 10 | */ 11 | 12 | #ifndef CUDA_KERNEL_CUH 13 | #define CUDA_KERNEL_CUH 14 | 15 | #include 16 | 17 | #include "../base/tools.hpp" 18 | 19 | namespace FasterTRT { 20 | 21 | #define GPU_BLOCK_THREADS 512 // gpu 每个block线程数量 22 | const int NUM_BOX_ELEMENT = 7; // left, top, right, bottom, confidence, class, keepflag 23 | 24 | // 用于插值计算的常量和函数 25 | #define INTER_RESIZE_COEF_BITS 11 26 | #define INTER_RESIZE_COEF_SCALE (1 << INTER_RESIZE_COEF_BITS) 27 | #define CAST_BITS (INTER_RESIZE_COEF_BITS << 1) 28 | 29 | template 30 | static __inline__ __device__ _T limit(_T value, _T low, _T high) { 31 | return value < low ? low : (value > high ? high : value); 32 | } 33 | static __inline__ __device__ int resize_cast(int value) { 34 | return (value + (1 << (CAST_BITS - 1))) >> CAST_BITS; 35 | } 36 | 37 | // sigmoid 和 逆sigmoid 具体是否使用看模型里面输出前有没有sigmoid吧 38 | static __host__ inline float desigmoid(float y) { 39 | return -log(1.0f / y - 1.0f); 40 | } 41 | 42 | static __device__ inline float sigmoid(float x) { 43 | return 1.0f / (1.0f + exp(-x)); 44 | } 45 | 46 | static dim3 grid_dims(int numJobs); 47 | static dim3 block_dims(int numJobs); 48 | 49 | //////////////////////归一化策略///////////////// 50 | enum class NormType : int { None = 0, MeanStd = 1, AlphaBeta = 2 }; 51 | 52 | //////////////////////通道策略///////////////// 53 | enum class ChannelType : int { None = 0, SwapRB = 1 }; 54 | 55 | /* 归一化操作,可以支持均值标准差,alpha beta 以及输入图片通道部分swap RB */ 56 | struct Norm { 57 | float mean[3]; 58 | float std[3]; 59 | float alpha, beta; 60 | NormType type = NormType::None; 61 | ChannelType channel_type = ChannelType::None; 62 | 63 | // out = (x * alpha - mean) / std 64 | static Norm mean_std(const float mean[3], const float std[3], float alpha = 1 / 255.0f, 65 | ChannelType channel_type = ChannelType::None); 66 | 67 | // out = x * alpha + beta 68 | static Norm alpha_beta(float alpha, float beta = 0, 69 | ChannelType channel_type = ChannelType::None); 70 | 71 | // None 72 | static Norm None(); 73 | }; 74 | 75 | // 仿射变换 76 | static __device__ void affine_project(float* matrix, float x, float y, float* ox, float* oy); 77 | 78 | // 计算iou 79 | static __device__ float box_iou(float aleft, float atop, float aright, float abottom, float bleft, 80 | float btop, float bright, float bbottom); 81 | // nms kernel 82 | static __global__ void fast_nms_kernel(float* bboxes, int max_objects, float threshold); 83 | 84 | // yolox的解码kernel 85 | static __global__ void 
yolox_decode_kernel(float* predict, int num_bboxes, int fm_area, 86 | int num_classes, float confidence_threshold, 87 | float* invert_affine_matrix, float* parray, 88 | const float* prior_box, int max_objects); 89 | 90 | // yolox的解码 91 | void yolox_decode_kernel_invoker(float* predict, int num_bboxes, int fm_area, int num_classes, 92 | float confidence_threshold, float nms_threshold, 93 | float* invert_affine_matrix, float* parray, const float* prior_box, 94 | int max_objects, cudaStream_t stream); 95 | 96 | // yolov8的解码kernel 97 | static __global__ void yolov8_decode_kernel(float* predict, int num_bboxes, int fm_area, 98 | int num_classes, float confidence_threshold, 99 | float* invert_affine_matrix, float* parray, 100 | const float* prior_box, int max_objects); 101 | 102 | // yolov8的解码 103 | void yolov8_decode_kernel_invoker(float* predict, int num_bboxes, int fm_area, int num_classes, 104 | float confidence_threshold, float nms_threshold, 105 | float* invert_affine_matrix, float* parray, 106 | const float* prior_box, int max_objects, cudaStream_t stream); 107 | 108 | // rtdetr的解码kernel 109 | static __global__ void rtdetr_decode_kernel(float* predict, int num_bboxes, int fm_area, 110 | int num_classes, float confidence_threshold, 111 | float* invert_affine_matrix, float* parray, 112 | int max_objects, int input_size); 113 | 114 | // rtdetr的解码 115 | void rtdetr_decode_kernel_invoker(float* predict, int num_bboxes, int fm_area, int num_classes, 116 | float confidence_threshold, float* invert_affine_matrix, 117 | float* parray, int max_objects, int input_size, cudaStream_t stream); 118 | 119 | /** 120 | * @brief 通过仿射变换完成双线性插值resize并且归一化的kernel 121 | * 122 | * @param src 原始图像数据 123 | * @param src_line_size 图像长度(宽度*3) 124 | * @param src_width 原始图像宽 125 | * @param src_height 原始图像高 126 | * @param dst 目标图像数据 127 | * @param dst_width 目标图像宽 128 | * @param dst_height 目标图像高 129 | * @param const_value_st padding值 130 | * @param warp_affine_matrix_2_3 仿射变化矩阵 131 | * @param norm 归一化策略 132 | * @param edge 目标图像范围 133 | */ 134 | static __global__ void warp_affine_bilinear_and_normalize_plane_kernel( 135 | uint8_t* src, int src_line_size, int src_width, int src_height, float* dst, int dst_width, 136 | int dst_height, uint8_t const_value_st, float* warp_affine_matrix_2_3, Norm norm, int edge); 137 | 138 | /** 139 | * @brief 通过仿射变换完成双线性插值resize并且归一化 140 | * 141 | * @param src 原始图像数据 142 | * @param src_line_size 图像长度(宽度*3) 143 | * @param src_width 原始图像宽 144 | * @param src_height 原始图像高 145 | * @param dst 目标图像数据 146 | * @param dst_width 目标图像宽 147 | * @param dst_height 目标图像高 148 | * @param matrix_2_3 仿射变化矩阵 149 | * @param const_value padding值 150 | * @param norm 归一化策略 151 | * @param stream cuda stream 152 | */ 153 | void warp_affine_bilinear_and_normalize_plane(uint8_t* src, int src_line_size, int src_width, 154 | int src_height, float* dst, int dst_width, 155 | int dst_height, float* matrix_2_3, 156 | uint8_t const_value, const Norm& norm, 157 | cudaStream_t stream); 158 | 159 | static __global__ void resize_bilinear_and_normalize_kernel(uint8_t* src, int src_line_size, 160 | int src_width, int src_height, 161 | float* dst, int dst_width, 162 | int dst_height, float sx, float sy, 163 | Norm norm, int edge); 164 | 165 | void resize_bilinear_and_normalize(uint8_t* src, int src_line_size, int src_width, int src_height, 166 | float* dst, int dst_width, int dst_height, const Norm& norm, 167 | cudaStream_t stream); 168 | 169 | }; // namespace FasterTRT 170 | 171 | #endif 172 | 
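// Typical call order for the kernels declared above (a sketch only; the device buffers and the
// AffineMatrix that fills `matrix_2_3` are created in the apps/ and base/ layers, so every
// `d_*` name below is a placeholder rather than part of this header):
//
//   warp_affine_bilinear_and_normalize_plane(d_src_u8, src_w * 3, src_w, src_h,
//                                            d_net_input, net_w, net_h,
//                                            d_affine_2_3, 114, norm, stream);
//   /* ...enqueue TensorRT inference on the same stream... */
//   yolox_decode_kernel_invoker(d_predict, num_bboxes, fm_area, num_classes,
//                               conf_thr, nms_thr, d_invert_affine, d_output,
//                               d_prior_box, max_objects, stream);
//
// Each decoded candidate in d_output is written as NUM_BOX_ELEMENT floats
// (left, top, right, bottom, confidence, class, keepflag); keepflag is consumed by the NMS stage.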
-------------------------------------------------------------------------------- /2_faster_tensorrt/src/base/tools.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file tools.hpp 3 | * @author 0zzx0 4 | * @brief 一些工具函数 包括CUDA检查 输出文件保存读取等函数 全部在tools里面定义并直接实现 5 | * @version 0.1 6 | * @date 2023-6-11 2023-8-21 7 | * 8 | * @copyright Copyright (c) 2023 9 | * 10 | */ 11 | 12 | #ifndef TOOLS_HPP 13 | #define TOOLS_HPP 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | 26 | #include 27 | 28 | namespace FasterTRT { 29 | 30 | enum class LogLevel : int { Debug = 5, Verbose = 4, Info = 3, Warning = 2, Error = 1, Fatal = 0 }; 31 | 32 | #define CURRENT_DEVICE_ID -1 // 当前设备 33 | static bool check_runtime(cudaError_t e, const char* call, int line, const char* file); 34 | static const char* level_string(LogLevel level); 35 | static std::string file_name(const std::string& path, bool include_suffix); 36 | static void __log_func(const char* file, int line, LogLevel level, const char* fmt, ...); 37 | 38 | ///////////////////////TRT///////////////////////////// 39 | #define TRT_STR(v) #v 40 | #define TRT_VERSION_STRING(major, minor, patch, build) \ 41 | TRT_STR(major) "." TRT_STR(minor) "." TRT_STR(patch) "." TRT_STR(build) 42 | static const char* trt_version() { 43 | return TRT_VERSION_STRING(NV_TENSORRT_MAJOR, NV_TENSORRT_MINOR, NV_TENSORRT_PATCH, 44 | NV_TENSORRT_BUILD); 45 | } 46 | 47 | /* 修改这个level来实现修改日志输出级别 */ 48 | #define CURRENT_LOG_LEVEL LogLevel::Info 49 | #define INFOD(...) __log_func(__FILE__, __LINE__, LogLevel::Debug, __VA_ARGS__) 50 | #define INFOV(...) __log_func(__FILE__, __LINE__, LogLevel::Verbose, __VA_ARGS__) 51 | #define INFO(...) __log_func(__FILE__, __LINE__, LogLevel::Info, __VA_ARGS__) 52 | #define INFOW(...) __log_func(__FILE__, __LINE__, LogLevel::Warning, __VA_ARGS__) 53 | #define INFOE(...) __log_func(__FILE__, __LINE__, LogLevel::Error, __VA_ARGS__) 54 | #define INFOF(...) __log_func(__FILE__, __LINE__, LogLevel::Fatal, __VA_ARGS__) 55 | 56 | #define KernelPositionBlock \ 57 | int position = (blockDim.x * blockIdx.x + threadIdx.x); \ 58 | if(position >= (edge)) return; 59 | 60 | #define checkCudaRuntime(call) check_runtime(call, #call, __LINE__, __FILE__) 61 | 62 | #define checkCudaKernel(...) 
\ 63 | __VA_ARGS__; \ 64 | do { \ 65 | cudaError_t cudaStatus = cudaPeekAtLastError(); \ 66 | if(cudaStatus != cudaSuccess) { \ 67 | INFOE("launch failed: %s", cudaGetErrorString(cudaStatus)); \ 68 | } \ 69 | } while(0); 70 | 71 | #define Assert(op) \ 72 | do { \ 73 | bool cond = !(!(op)); \ 74 | if(!cond) { \ 75 | INFOF("Assert failed, " #op); \ 76 | } \ 77 | } while(false) 78 | 79 | static bool check_runtime(cudaError_t e, const char* call, int line, const char* file) { 80 | if(e != cudaSuccess) { 81 | INFOE("CUDA Runtime error %s # %s, code = %s [ %d ] in file %s:%d", call, 82 | cudaGetErrorString(e), cudaGetErrorName(e), e, file, line); 83 | return false; 84 | } 85 | return true; 86 | } 87 | 88 | static const char* level_string(LogLevel level) { 89 | switch(level) { 90 | case LogLevel::Debug: 91 | return "debug"; 92 | case LogLevel::Verbose: 93 | return "verbo"; 94 | case LogLevel::Info: 95 | return "info"; 96 | case LogLevel::Warning: 97 | return "warn"; 98 | case LogLevel::Error: 99 | return "error"; 100 | case LogLevel::Fatal: 101 | return "fatal"; 102 | default: 103 | return "unknow"; 104 | } 105 | } 106 | 107 | static std::string file_name(const std::string& path, bool include_suffix) { 108 | if(path.empty()) return ""; 109 | int p = path.rfind('/'); 110 | p += 1; 111 | 112 | // include suffix 113 | if(include_suffix) return path.substr(p); 114 | 115 | int u = path.rfind('.'); 116 | if(u == -1) return path.substr(p); 117 | 118 | if(u <= p) u = path.size(); 119 | return path.substr(p, u - p); 120 | } 121 | 122 | static void __log_func(const char* file, int line, LogLevel level, const char* fmt, ...) { 123 | if(level > CURRENT_LOG_LEVEL) return; 124 | 125 | va_list vl; 126 | va_start(vl, fmt); 127 | 128 | char buffer[2048]; 129 | std::string filename = file_name(file, true); 130 | int n = snprintf(buffer, sizeof(buffer), "[%s][%s:%d]:", level_string(level), filename.c_str(), 131 | line); 132 | vsnprintf(buffer + n, sizeof(buffer) - n, fmt, vl); 133 | 134 | fprintf(stdout, "%s\n", buffer); 135 | if(level == LogLevel::Fatal) { 136 | fflush(stdout); 137 | abort(); 138 | } 139 | } 140 | 141 | static bool exists(const std::string& path) { 142 | return access(path.c_str(), R_OK) == 0; 143 | } 144 | 145 | static bool save_file(const std::string& file, const void* data, size_t length) { 146 | FILE* f = fopen(file.c_str(), "wb"); 147 | if(!f) return false; 148 | 149 | if(data && length > 0) { 150 | if(fwrite(data, 1, length, f) != length) { 151 | fclose(f); 152 | return false; 153 | } 154 | } 155 | fclose(f); 156 | return true; 157 | } 158 | 159 | static bool save_file(const std::string& file, const std::vector& data) { 160 | return save_file(file, data.data(), data.size()); 161 | } 162 | 163 | /* 构造时设置当前gpuid,析构时修改为原来的gpuid */ 164 | class AutoDevice { 165 | public: 166 | AutoDevice(int device_id = 0) { 167 | cudaGetDevice(&old_); 168 | checkCudaRuntime(cudaSetDevice(device_id)); 169 | } 170 | 171 | virtual ~AutoDevice() { checkCudaRuntime(cudaSetDevice(old_)); } 172 | 173 | private: 174 | int old_ = -1; 175 | }; 176 | 177 | static bool check_device_id(int device_id) { 178 | int device_count = -1; 179 | checkCudaRuntime(cudaGetDeviceCount(&device_count)); 180 | if(device_id < 0 || device_id >= device_count) { 181 | INFOE("Invalid device id: %d, count = %d", device_id, device_count); 182 | return false; 183 | } 184 | return true; 185 | } 186 | 187 | static int get_device(int device_id) { 188 | if(device_id != CURRENT_DEVICE_ID) { 189 | check_device_id(device_id); 190 | return device_id; 191 | 
} 192 | checkCudaRuntime(cudaGetDevice(&device_id)); 193 | return device_id; 194 | } 195 | 196 | static std::vector load_file(const std::string& file) { 197 | std::ifstream in(file, std::ios::in | std::ios::binary); 198 | if(!in.is_open()) return {}; 199 | 200 | in.seekg(0, std::ios::end); 201 | size_t length = in.tellg(); 202 | 203 | std::vector data; 204 | if(length > 0) { 205 | in.seekg(0, std::ios::beg); 206 | data.resize(length); 207 | 208 | in.read((char*)&data[0], length); 209 | } 210 | in.close(); 211 | return data; 212 | } 213 | 214 | inline int upbound(int n, int align = 32) { 215 | return (n + align - 1) / align * align; 216 | } 217 | 218 | template 219 | static std::string join_dims(const std::vector<_T>& dims) { 220 | std::stringstream output; 221 | char buf[64]; 222 | const char* fmts[] = {"%d", " x %d"}; 223 | for(int i = 0; i < dims.size(); ++i) { 224 | snprintf(buf, sizeof(buf), fmts[i != 0], dims[i]); 225 | output << buf; 226 | } 227 | return output.str(); 228 | } 229 | 230 | // 设置推理设备 231 | static void set_device(int device_id) { 232 | if(device_id == -1) return; 233 | checkCudaRuntime(cudaSetDevice(device_id)); 234 | } 235 | 236 | }; // namespace FasterTRT 237 | 238 | #endif -------------------------------------------------------------------------------- /1_trt_base/trt_rtdetr/rtdetr_onnx.py: -------------------------------------------------------------------------------- 1 | import time 2 | import cv2 3 | import numpy as np 4 | import argparse 5 | import onnxruntime as ort 6 | from pathlib import Path 7 | from tqdm import tqdm 8 | 9 | 10 | COCO_CLASSES = ["person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", 11 | "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", 12 | "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", 13 | "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", 14 | "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", 15 | "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", 16 | "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", 17 | "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", 18 | "scissors", "teddy bear", "hair drier", "toothbrush", ] 19 | 20 | class PicoDet(): 21 | def __init__(self, 22 | model_pb_path, 23 | prob_threshold=0.5): 24 | self.classes = COCO_CLASSES 25 | self.num_classes = len(self.classes) 26 | self.prob_threshold = prob_threshold 27 | self.mean = np.array( 28 | [103.53, 116.28, 123.675], dtype=np.float32).reshape(1, 1, 3) 29 | self.std = np.array( 30 | [57.375, 57.12, 58.395], dtype=np.float32).reshape(1, 1, 3) 31 | so = ort.SessionOptions() 32 | so.log_severity_level = 3 33 | self.net = ort.InferenceSession(model_pb_path, so) 34 | inputs_name = [a.name for a in self.net.get_inputs()] 35 | inputs_shape = { 36 | k: v.shape 37 | for k, v in zip(inputs_name, self.net.get_inputs()) 38 | } 39 | self.input_shape = inputs_shape['image'][2:] 40 | 41 | def _normalize(self, img): 42 | img = img.astype(np.float32) 43 | img = (img / 255.0 - self.mean / 255.0) / (self.std / 255.0) 44 | return img 45 | 46 | def resize_image(self, srcimg, keep_ratio=False): 47 | top, left, newh, neww = 0, 0, self.input_shape[0], self.input_shape[1] 48 | origin_shape = 
srcimg.shape[:2] 49 | im_scale_y = newh / float(origin_shape[0]) 50 | im_scale_x = neww / float(origin_shape[1]) 51 | img_shape = np.array([ 52 | [float(self.input_shape[0]), float(self.input_shape[1])] 53 | ]).astype('float32') 54 | scale_factor = np.array([[im_scale_y, im_scale_x]]).astype('float32') 55 | 56 | if keep_ratio and srcimg.shape[0] != srcimg.shape[1]: 57 | hw_scale = srcimg.shape[0] / srcimg.shape[1] 58 | if hw_scale > 1: 59 | newh, neww = self.input_shape[0], int(self.input_shape[1] / 60 | hw_scale) 61 | img = cv2.resize( 62 | srcimg, (neww, newh), interpolation=cv2.INTER_AREA) 63 | left = int((self.input_shape[1] - neww) * 0.5) 64 | img = cv2.copyMakeBorder( 65 | img, 66 | 0, 67 | 0, 68 | left, 69 | self.input_shape[1] - neww - left, 70 | cv2.BORDER_CONSTANT, 71 | value=0) # add border 72 | else: 73 | newh, neww = int(self.input_shape[0] * 74 | hw_scale), self.input_shape[1] 75 | img = cv2.resize( 76 | srcimg, (neww, newh), interpolation=cv2.INTER_AREA) 77 | top = int((self.input_shape[0] - newh) * 0.5) 78 | img = cv2.copyMakeBorder( 79 | img, 80 | top, 81 | self.input_shape[0] - newh - top, 82 | 0, 83 | 0, 84 | cv2.BORDER_CONSTANT, 85 | value=0) 86 | else: 87 | img = cv2.resize( 88 | srcimg, self.input_shape, interpolation=cv2.INTER_LINEAR) 89 | 90 | return img, img_shape, scale_factor 91 | 92 | def get_color_map_list(self, num_classes): 93 | color_map = num_classes * [0, 0, 0] 94 | for i in range(0, num_classes): 95 | j = 0 96 | lab = i 97 | while lab: 98 | color_map[i * 3] |= (((lab >> 0) & 1) << (7 - j)) 99 | color_map[i * 3 + 1] |= (((lab >> 1) & 1) << (7 - j)) 100 | color_map[i * 3 + 2] |= (((lab >> 2) & 1) << (7 - j)) 101 | j += 1 102 | lab >>= 3 103 | color_map = [color_map[i:i + 3] for i in range(0, len(color_map), 3)] 104 | return color_map 105 | 106 | def detect(self, srcimg): 107 | img, im_shape, scale_factor = self.resize_image(srcimg) 108 | img = self._normalize(img) 109 | 110 | blob = np.expand_dims(np.transpose(img, (2, 0, 1)), axis=0) 111 | 112 | inputs_dict = { 113 | 'im_shape': im_shape, 114 | 'image': blob, 115 | 'scale_factor': scale_factor 116 | } 117 | inputs_name = [a.name for a in self.net.get_inputs()] 118 | net_inputs = {k: inputs_dict[k] for k in inputs_name} 119 | 120 | outs = self.net.run(None, net_inputs) 121 | 122 | outs = np.array(outs[0]) 123 | expect_boxes = (outs[:, 1] > 0.5) & (outs[:, 0] > -1) 124 | np_boxes = outs[expect_boxes, :] 125 | 126 | print(np_boxes) 127 | 128 | # color_list = self.get_color_map_list(self.num_classes) 129 | # clsid2color = {} 130 | 131 | # for i in range(np_boxes.shape[0]): 132 | # classid, conf = int(np_boxes[i, 0]), np_boxes[i, 1] 133 | # xmin, ymin, xmax, ymax = int(np_boxes[i, 2]), int(np_boxes[ 134 | # i, 3]), int(np_boxes[i, 4]), int(np_boxes[i, 5]) 135 | 136 | # if classid not in clsid2color: 137 | # clsid2color[classid] = color_list[classid] 138 | # color = tuple(clsid2color[classid]) 139 | 140 | # cv2.rectangle( 141 | # srcimg, (xmin, ymin), (xmax, ymax), color, thickness=2) 142 | # print(self.classes[classid] + ': ' + str(round(conf, 3))) 143 | # cv2.putText( 144 | # srcimg, 145 | # self.classes[classid] + ':' + str(round(conf, 3)), (xmin, 146 | # ymin - 10), 147 | # cv2.FONT_HERSHEY_SIMPLEX, 148 | # 0.8, (0, 255, 0), 149 | # thickness=2) 150 | 151 | return srcimg 152 | 153 | def detect_folder(self, img_fold, result_path): 154 | img_fold = Path(img_fold) 155 | result_path = Path(result_path) 156 | result_path.mkdir(parents=True, exist_ok=True) 157 | 158 | img_name_list = filter( 159 | lambda x: 
str(x).endswith(".png") or str(x).endswith(".jpg"), 160 | img_fold.iterdir(), ) 161 | img_name_list = list(img_name_list) 162 | print(f"find {len(img_name_list)} images") 163 | 164 | for img_path in tqdm(img_name_list): 165 | img = cv2.imread(str(img_path), 1) 166 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 167 | 168 | srcimg = net.detect(img) 169 | save_path = str(result_path / img_path.name.replace(".png", ".jpg")) 170 | cv2.imwrite(save_path, srcimg) 171 | 172 | 173 | if __name__ == '__main__': 174 | 175 | model_path = "rtdetr_r18vd_6x_coco/rtdetr_r18vd_6x_coco.onnx" 176 | img_file = "../demo/000000570688.jpg" 177 | conf = 0.5 178 | net = PicoDet(model_path, conf) 179 | 180 | img = cv2.imread(img_file) 181 | t1 = time.perf_counter() 182 | for _ in range(100): 183 | net.detect(img) 184 | t2 = time.perf_counter() 185 | print(f"time: {(t2-t1)*1000/100.0}ms") 186 | 187 | 188 | -------------------------------------------------------------------------------- /1_trt_base/trt_plugin/demo01/demo01.cu: -------------------------------------------------------------------------------- 1 | #include "demo01.h" 2 | 3 | __global__ void addScalarKernel(const float *input, float *output, const float scalar, const int nElement) 4 | { 5 | const int index = blockIdx.x * blockDim.x + threadIdx.x; 6 | if (index >= nElement) 7 | return; 8 | 9 | float _1 = input[index]; 10 | float _2 = _1 + scalar; 11 | output[index] = _2; 12 | } 13 | 14 | namespace nvinfer1 15 | { 16 | ZZX_ADDScalar::ZZX_ADDScalar(const std::string &name, float scalar): name_(name) 17 | { 18 | WHERE_AM_I(); // debug用的 19 | m_.scalar = scalar; 20 | } 21 | 22 | ZZX_ADDScalar::ZZX_ADDScalar(const std::string &name, const void *buffer, size_t length): name_(name) 23 | { 24 | WHERE_AM_I(); 25 | memcpy(&m_, buffer, sizeof(m_)); 26 | } 27 | 28 | ZZX_ADDScalar::~ZZX_ADDScalar() 29 | { 30 | WHERE_AM_I(); 31 | } 32 | 33 | // 深拷贝 34 | IPluginV2DynamicExt *ZZX_ADDScalar::clone() const noexcept 35 | { 36 | WHERE_AM_I(); 37 | auto p = new ZZX_ADDScalar(name_, &m_, sizeof(m_)); 38 | p->setPluginNamespace(namespace_.c_str()); 39 | return p; 40 | } 41 | 42 | // 获得输出数量 自定义为1 43 | int32_t ZZX_ADDScalar::getNbOutputs() const noexcept 44 | { 45 | WHERE_AM_I(); 46 | return 1; 47 | } 48 | 49 | // 获得输出数据类型 自定义为和输入一样 50 | DataType ZZX_ADDScalar::getOutputDataType(int32_t index, DataType const *inputTypes, int32_t nbInputs) const noexcept 51 | { 52 | WHERE_AM_I(); 53 | return inputTypes[0]; 54 | } 55 | 56 | // 获取输出维度 57 | DimsExprs ZZX_ADDScalar::getOutputDimensions(int32_t outputIndex, const DimsExprs *inputs, int32_t nbInputs, IExprBuilder &exprBuilder) noexcept 58 | { 59 | WHERE_AM_I(); 60 | return inputs[0]; 61 | } 62 | 63 | 64 | bool ZZX_ADDScalar::supportsFormatCombination(int32_t pos, const PluginTensorDesc *inOut, int32_t nbInputs, int32_t nbOutputs) noexcept 65 | { 66 | WHERE_AM_I(); 67 | bool res; 68 | switch (pos) 69 | { 70 | case 0: 71 | res = inOut[0].type == DataType::kFLOAT && inOut[0].format == TensorFormat::kLINEAR; 72 | break; 73 | case 1: 74 | res = inOut[1].type == inOut[0].type && inOut[1].format == inOut[0].format; 75 | break; 76 | default: // should NOT be here! 
77 | res = false; 78 | } 79 | #ifdef DEBUG 80 | std::cout << "\tpos=" << pos << ",res=" << res << "->["; 81 | for (int i = 0; i < nbInputs + nbOutputs; ++i) 82 | { 83 | std::cout << formatToString(inOut[i].format) << ","; 84 | } 85 | std::cout << "],["; 86 | for (int i = 0; i < nbInputs + nbOutputs; ++i) 87 | { 88 | std::cout << dataTypeToString(inOut[i].type) << ","; 89 | } 90 | std::cout << "]" << std::endl; 91 | #endif 92 | return res; 93 | } 94 | 95 | // 推理前调用 96 | void ZZX_ADDScalar::configurePlugin(const DynamicPluginTensorDesc *in, int32_t nbInputs, const DynamicPluginTensorDesc *out, int32_t nbOutputs) noexcept 97 | { 98 | WHERE_AM_I(); 99 | return; 100 | } 101 | 102 | // 告诉trt需要多大中间变量储存空间, 便于后续优化 103 | size_t ZZX_ADDScalar::getWorkspaceSize(const PluginTensorDesc *inputs, int32_t nbInputs, const PluginTensorDesc *outputs, int32_t nbOutputs) const noexcept 104 | { 105 | WHERE_AM_I(); 106 | return 0; 107 | } 108 | 109 | // 核心,调用核函数 不要在这里使用cudaMalloc*等函数(导致巨大的申请开销) 110 | int32_t ZZX_ADDScalar::enqueue(const PluginTensorDesc *inputDesc, const PluginTensorDesc *outputDesc, const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream) noexcept 111 | { 112 | WHERE_AM_I(); 113 | int nElement = 1; 114 | for (int i = 0; i < inputDesc[0].dims.nbDims; ++i) 115 | { 116 | nElement *= inputDesc[0].dims.d[i]; 117 | } 118 | dim3 grid(CEIL_DIVIDE(nElement, 256), 1, 1), block(256, 1, 1); 119 | addScalarKernel<<>>(reinterpret_cast(inputs[0]), reinterpret_cast(outputs[0]), m_.scalar, nElement); 120 | return 0; 121 | } 122 | 123 | // context/engine 销毁的时候调用 124 | void ZZX_ADDScalar::destroy() noexcept 125 | { 126 | WHERE_AM_I(); 127 | delete this; 128 | return; 129 | } 130 | 131 | // engine 创建时被调用,用于初始化 Plugin 132 | int32_t ZZX_ADDScalar::initialize() noexcept 133 | { 134 | WHERE_AM_I(); 135 | return 0; 136 | } 137 | 138 | // terminate (engine 销毁时被调用,用于释放 initialize 函数申请的资源 139 | void ZZX_ADDScalar::terminate() noexcept 140 | { 141 | WHERE_AM_I(); 142 | return; 143 | } 144 | 145 | // 序列化 146 | // (报告序列化需要的空间大小,单位 Byte 147 | size_t ZZX_ADDScalar::getSerializationSize() const noexcept 148 | { 149 | WHERE_AM_I(); 150 | return sizeof(m_); 151 | } 152 | 153 | // (将 Plugin 数据序列化到给定的 buffer 中) 154 | void ZZX_ADDScalar::serialize(void *buffer) const noexcept 155 | { 156 | WHERE_AM_I(); 157 | memcpy(buffer, &m_, sizeof(m_)); 158 | return; 159 | } 160 | 161 | void ZZX_ADDScalar::setPluginNamespace(const char *pluginNamespace) noexcept 162 | { 163 | WHERE_AM_I(); 164 | namespace_ = pluginNamespace; 165 | return; 166 | } 167 | 168 | const char *ZZX_ADDScalar::getPluginNamespace() const noexcept 169 | { 170 | WHERE_AM_I(); 171 | return namespace_.c_str(); 172 | } 173 | 174 | const char *ZZX_ADDScalar::getPluginType() const noexcept 175 | { 176 | WHERE_AM_I(); 177 | return PLUGIN_NAME; 178 | } 179 | 180 | const char *ZZX_ADDScalar::getPluginVersion() const noexcept 181 | { 182 | WHERE_AM_I(); 183 | return PLUGIN_VERSION; 184 | } 185 | 186 | // (申请使用 context 独占的 cudnn 或 cublas 187 | void ZZX_ADDScalar::attachToContext(cudnnContext *contextCudnn, cublasContext *contextCublas, IGpuAllocator *gpuAllocator) noexcept 188 | { 189 | WHERE_AM_I(); 190 | return; 191 | } 192 | 193 | //(销毁 context 独占的 cudnn 或 cublas 资 194 | void ZZX_ADDScalar::detachFromContext() noexcept 195 | { 196 | WHERE_AM_I(); 197 | return; 198 | } 199 | 200 | 201 | 202 | 203 | // class AddScalarPluginCreator 204 | PluginFieldCollection ZZXAddScalarPluginCreator::fc_ {}; 205 | std::vector ZZXAddScalarPluginCreator::attr_; 206 | 207 
| ZZXAddScalarPluginCreator::ZZXAddScalarPluginCreator() 208 | { 209 | WHERE_AM_I(); 210 | attr_.clear(); 211 | attr_.emplace_back(PluginField("scalar", nullptr, PluginFieldType::kFLOAT32, 1)); 212 | fc_.nbFields = attr_.size(); 213 | fc_.fields = attr_.data(); 214 | } 215 | 216 | ZZXAddScalarPluginCreator::~ZZXAddScalarPluginCreator() 217 | { 218 | WHERE_AM_I(); 219 | } 220 | 221 | // 接受权重,构造这个算子 222 | IPluginV2DynamicExt* ZZXAddScalarPluginCreator::createPlugin(const char *name, const PluginFieldCollection *fc) noexcept 223 | { 224 | WHERE_AM_I(); 225 | float scalar = 0; 226 | std::map parameterMap {{"scalar", &scalar}}; 227 | 228 | for (int i = 0; i < fc->nbFields; ++i) 229 | { 230 | if (parameterMap.find(fc->fields[i].name) != parameterMap.end()) 231 | { 232 | *parameterMap[fc->fields[i].name] = *reinterpret_cast(fc->fields[i].data); 233 | } 234 | } 235 | ZZX_ADDScalar *pObj = new ZZX_ADDScalar(name, scalar); 236 | pObj->setPluginNamespace(namespace_.c_str()); 237 | return pObj; 238 | } 239 | 240 | // 反序列化 241 | IPluginV2DynamicExt *ZZXAddScalarPluginCreator::deserializePlugin(const char *name, const void *serialData, size_t serialLength) noexcept 242 | { 243 | WHERE_AM_I(); 244 | ZZX_ADDScalar *pObj = new ZZX_ADDScalar(name, serialData, serialLength); 245 | pObj->setPluginNamespace(namespace_.c_str()); 246 | return pObj; 247 | } 248 | 249 | void ZZXAddScalarPluginCreator::setPluginNamespace(const char *pluginNamespace) noexcept 250 | { 251 | WHERE_AM_I(); 252 | namespace_ = pluginNamespace; 253 | return; 254 | } 255 | 256 | const char *ZZXAddScalarPluginCreator::getPluginNamespace() const noexcept 257 | { 258 | WHERE_AM_I(); 259 | return namespace_.c_str(); 260 | } 261 | 262 | const char *ZZXAddScalarPluginCreator::getPluginName() const noexcept 263 | { 264 | WHERE_AM_I(); 265 | return PLUGIN_NAME; 266 | } 267 | 268 | const char *ZZXAddScalarPluginCreator::getPluginVersion() const noexcept 269 | { 270 | WHERE_AM_I(); 271 | return PLUGIN_VERSION; 272 | } 273 | 274 | const PluginFieldCollection *ZZXAddScalarPluginCreator::getFieldNames() noexcept 275 | { 276 | WHERE_AM_I(); 277 | return &fc_; 278 | } 279 | 280 | REGISTER_TENSORRT_PLUGIN(ZZXAddScalarPluginCreator); 281 | 282 | } 283 | -------------------------------------------------------------------------------- /2_faster_tensorrt/src/apps/rtdetr/rtdetr.cpp: -------------------------------------------------------------------------------- 1 | #include "rtdetr.h" 2 | 3 | namespace RTDETR { 4 | 5 | RtDetrTRTInferImpl::~RtDetrTRTInferImpl() { 6 | stop(); 7 | } 8 | 9 | // 启动 但不是重写基类的startup 参数不一样 里面会去调用基类 10 | bool RtDetrTRTInferImpl::startup(const std::string& file, int gpuid, int batch_size, 11 | float confidence_threshold) { 12 | // const float mean_norm[3] = {103.53, 116.28, 123.675}; 13 | // const float std_norm[3] = {57.375, 57.12, 58.395}; 14 | // normalize_ = Norm::mean_std(mean_norm, std_norm, 1 / 255.0f, ChannelType::SwapRB); 15 | normalize_ = Norm::alpha_beta(1 / 255.0f, 0.0f, ChannelType::SwapRB); 16 | confidence_threshold_ = confidence_threshold; 17 | batch_size_ = batch_size; 18 | return ThreadSafedAsyncInferImpl::startup(std::make_tuple(file, gpuid)); 19 | } 20 | 21 | // 重写基类worker 工作线程 22 | void RtDetrTRTInferImpl::worker(std::promise& result) { 23 | std::string file = std::get<0>(start_param_); 24 | int gpuid = std::get<1>(start_param_); 25 | 26 | set_device(gpuid); 27 | auto engine = load_infer(file, batch_size_); 28 | if(engine == nullptr) { 29 | INFOE("Engine %s load failed", file.c_str()); 30 | result.set_value(false); 
31 | return; 32 | } 33 | 34 | engine->print(); 35 | 36 | const int MAX_IMAGE_BBOX = 100; 37 | const int NUM_BOX_ELEMENT = 7; // left, top, right, bottom, confidence, class, keepflag 38 | Tensor affin_matrix_device(FasterTRT::DataType::Float); 39 | Tensor output_array_device(FasterTRT::DataType::Float); 40 | 41 | // 输入输出 42 | int max_batch_size = engine->get_max_batch_size(); 43 | auto input = engine->tensor("image"); 44 | auto output = engine->tensor("output"); 45 | 46 | // decode数据 47 | int num_classes, output_num_bboxes, output_fm_area; 48 | output_num_bboxes = output->size(0) * output->size(1); 49 | output_fm_area = output->size(2); 50 | num_classes = output->size(2) - 4; 51 | 52 | // 输入 53 | input_width_ = input->size(3); 54 | input_height_ = input->size(2); 55 | tensor_allocator_ = std::make_shared>(max_batch_size * 2); 56 | stream_ = engine->get_stream(); 57 | gpu_ = gpuid; 58 | result.set_value(true); // 初始化完成 返回给startup函数结束 59 | 60 | input->resize_single_dim(0, max_batch_size).to_gpu(); 61 | affin_matrix_device.set_stream(stream_); 62 | 63 | // 这里8个值的目的是保证 8 * sizeof(float) % 32 == 0 64 | affin_matrix_device.resize(max_batch_size, 8).to_gpu(); 65 | 66 | // 输出数据 67 | output_array_device.set_stream(stream_); 68 | output_array_device.resize(max_batch_size, 1 + MAX_IMAGE_BBOX * NUM_BOX_ELEMENT).to_gpu(); 69 | 70 | auto decode_kernel_invoker = rtdetr_decode_kernel_invoker; 71 | 72 | // 循环等待&检测 73 | std::vector fetch_jobs; 74 | while(get_jobs_and_wait(fetch_jobs, max_batch_size)) { 75 | int infer_batch_size = fetch_jobs.size(); 76 | input->resize_single_dim(0, infer_batch_size); 77 | 78 | for(int ibatch = 0; ibatch < infer_batch_size; ++ibatch) { 79 | auto& job = fetch_jobs[ibatch]; 80 | auto& mono = job.mono_tensor->data(); 81 | affin_matrix_device.copy_from_gpu(affin_matrix_device.offset(ibatch), 82 | mono->get_workspace()->gpu(), 6); 83 | input->copy_from_gpu(input->offset(ibatch), mono->gpu(), mono->count()); 84 | job.mono_tensor->release(); 85 | } 86 | 87 | engine->forward(false); 88 | 89 | output_array_device.to_gpu(false); 90 | for(int ibatch = 0; ibatch < infer_batch_size; ++ibatch) { 91 | // auto& job = fetch_jobs[ibatch]; 92 | float* image_based_output = output->gpu(ibatch); 93 | float* output_array_ptr = output_array_device.gpu(ibatch); 94 | auto affine_matrix = affin_matrix_device.gpu(ibatch); 95 | checkCudaRuntime(cudaMemsetAsync(output_array_ptr, 0, sizeof(int), stream_)); 96 | decode_kernel_invoker(image_based_output, output_num_bboxes, output_fm_area, 97 | num_classes, confidence_threshold_, affine_matrix, 98 | output_array_ptr, MAX_IMAGE_BBOX, input_width_, stream_); 99 | } 100 | 101 | output_array_device.to_cpu(); 102 | for(int ibatch = 0; ibatch < infer_batch_size; ++ibatch) { 103 | float* parray = output_array_device.cpu(ibatch); 104 | int count = std::min(MAX_IMAGE_BBOX, (int)*parray); 105 | auto& job = fetch_jobs[ibatch]; 106 | auto& image_based_boxes = job.output; 107 | for(int i = 0; i < count; ++i) { 108 | float* pbox = parray + 1 + i * NUM_BOX_ELEMENT; 109 | int label = pbox[5]; 110 | int keepflag = pbox[6]; 111 | if(keepflag == 1) { 112 | image_based_boxes.emplace_back(pbox[0], pbox[1], pbox[2], pbox[3], pbox[4], 113 | label); 114 | } 115 | } 116 | job.pro->set_value(image_based_boxes); 117 | } 118 | fetch_jobs.clear(); 119 | } 120 | stream_ = nullptr; 121 | // TODO 这个流是否要考虑换个地方释放? 
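// note: stream_pro_ is the preprocessing stream created lazily in preprocess() (on the thread that calls commit); the worker only releases it here, after the job loop has exited.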
122 | checkCudaRuntime(cudaStreamDestroy(stream_pro_)); 123 | stream_pro_ = nullptr; 124 | tensor_allocator_.reset(); 125 | INFO("Engine destroy."); 126 | } 127 | 128 | // 预处理 129 | bool RtDetrTRTInferImpl::preprocess(Job& job, const cv::Mat& image) { 130 | if(tensor_allocator_ == nullptr) { 131 | INFOE("tensor_allocator_ is nullptr"); 132 | return false; 133 | } 134 | 135 | job.mono_tensor = tensor_allocator_->query(); 136 | if(job.mono_tensor == nullptr) { 137 | INFOE("Tensor allocator query failed."); 138 | return false; 139 | } 140 | 141 | if(stream_pro_ == nullptr) { 142 | checkCudaRuntime(cudaStreamCreate(&stream_pro_)); 143 | } 144 | 145 | AutoDevice auto_device(gpu_); 146 | auto& tensor = job.mono_tensor->data(); 147 | if(tensor == nullptr) { 148 | // not init 149 | tensor = std::make_shared(); 150 | tensor->set_workspace(std::make_shared()); 151 | } 152 | 153 | cv::Size input_size(input_width_, input_height_); 154 | job.additional.compute(image.size(), input_size); 155 | 156 | tensor->set_stream(stream_pro_); 157 | tensor->resize(1, 3, input_height_, input_width_); 158 | 159 | size_t size_image = image.cols * image.rows * 3; 160 | size_t size_matrix = upbound(sizeof(job.additional.d2i), 32); 161 | auto workspace = tensor->get_workspace(); 162 | uint8_t* gpu_workspace = (uint8_t*)workspace->gpu(size_matrix + size_image); 163 | float* affine_matrix_device = (float*)gpu_workspace; 164 | uint8_t* image_device = size_matrix + gpu_workspace; 165 | 166 | uint8_t* cpu_workspace = (uint8_t*)workspace->cpu(size_matrix + size_image); 167 | float* affine_matrix_host = (float*)cpu_workspace; 168 | uint8_t* image_host = size_matrix + cpu_workspace; 169 | 170 | memcpy(image_host, image.data, size_image); 171 | memcpy(affine_matrix_host, job.additional.d2i, sizeof(job.additional.d2i)); 172 | checkCudaRuntime( 173 | cudaMemcpyAsync(image_device, image_host, size_image, cudaMemcpyHostToDevice, stream_pro_)); 174 | checkCudaRuntime(cudaMemcpyAsync(affine_matrix_device, affine_matrix_host, 175 | sizeof(job.additional.d2i), cudaMemcpyHostToDevice, 176 | stream_pro_)); 177 | 178 | warp_affine_bilinear_and_normalize_plane(image_device, image.cols * 3, image.cols, image.rows, 179 | tensor->gpu(), input_width_, input_height_, 180 | affine_matrix_device, .0, normalize_, stream_pro_); 181 | // 这个地方需要同步,确保数据放到gpu后才可以吧任务提交到队列中。 182 | cudaStreamSynchronize(stream_pro_); 183 | 184 | return true; 185 | } 186 | 187 | // 提交任务 188 | std::vector> RtDetrTRTInferImpl::commits( 189 | const std::vector& images) { 190 | return ThreadSafedAsyncInferImpl::commits(images); 191 | } 192 | 193 | // 提交cv::Mat任务 194 | std::shared_future RtDetrTRTInferImpl::commit(const cv::Mat& image) { 195 | return ThreadSafedAsyncInferImpl::commit(image); 196 | } 197 | 198 | // 创建推理器 199 | std::shared_ptr create_infer(const std::string& engine_file, int gpuid, int batch_size, 200 | float confidence_threshold) { 201 | std::shared_ptr instance(new RtDetrTRTInferImpl()); 202 | if(!instance->startup(engine_file, gpuid, batch_size, confidence_threshold)) { 203 | instance.reset(); 204 | } 205 | return instance; 206 | } 207 | } // namespace RTDETR 208 | -------------------------------------------------------------------------------- /1_trt_base/trt_rtdetr/rtdetr_trt.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import time 3 | import numpy as np 4 | import tensorrt as trt 5 | from cuda import cudart 6 | 7 | model_path = "rtdetr_r18vd_6x_coco/rtdetr_r18vd_6x_coco.trt" 8 | img_path = 
"../demo/000000570688.jpg" 9 | 10 | mean = np.array( 11 | [103.53, 116.28, 123.675], dtype=np.float32).reshape(1, 1, 3) 12 | std = np.array( 13 | [57.375, 57.12, 58.395], dtype=np.float32).reshape(1, 1, 3) 14 | def normalize(img): 15 | img = img.astype(np.float32) 16 | img = (img / 255.0 - mean / 255.0) / (std / 255.0) 17 | return img 18 | 19 | def resize_image(srcimg, input_shape, keep_ratio=False): 20 | top, left, newh, neww = 0, 0, input_shape[0], input_shape[1] 21 | origin_shape = srcimg.shape[:2] 22 | im_scale_y = newh / float(origin_shape[0]) 23 | im_scale_x = neww / float(origin_shape[1]) 24 | img_shape = np.array([ 25 | [float(input_shape[0]), float(input_shape[1])] 26 | ]).astype('float32') 27 | scale_factor = np.array([[im_scale_y, im_scale_x]]).astype('float32') 28 | 29 | if keep_ratio and srcimg.shape[0] != srcimg.shape[1]: 30 | hw_scale = srcimg.shape[0] / srcimg.shape[1] 31 | if hw_scale > 1: 32 | newh, neww = input_shape[0], int(input_shape[1] / 33 | hw_scale) 34 | img = cv2.resize( 35 | srcimg, (neww, newh), interpolation=cv2.INTER_AREA) 36 | left = int((input_shape[1] - neww) * 0.5) 37 | img = cv2.copyMakeBorder( 38 | img, 39 | 0, 40 | 0, 41 | left, 42 | input_shape[1] - neww - left, 43 | cv2.BORDER_CONSTANT, 44 | value=0) # add border 45 | else: 46 | newh, neww = int(input_shape[0] * 47 | hw_scale), input_shape[1] 48 | img = cv2.resize( 49 | srcimg, (neww, newh), interpolation=cv2.INTER_AREA) 50 | top = int((input_shape[0] - newh) * 0.5) 51 | img = cv2.copyMakeBorder( 52 | img, 53 | top, 54 | input_shape[0] - newh - top, 55 | 0, 56 | 0, 57 | cv2.BORDER_CONSTANT, 58 | value=0) 59 | else: 60 | img = cv2.resize( 61 | srcimg, input_shape, interpolation=cv2.INTER_LINEAR) 62 | 63 | return img, img_shape, scale_factor 64 | 65 | 66 | 67 | 68 | class RtdetrTrt: 69 | def __init__(self) -> None: 70 | self.logger = trt.Logger(trt.Logger.WARNING) 71 | trt.init_libnvinfer_plugins( self.logger, namespace='') # 加载插件 72 | 73 | self.runtime = trt.Runtime(self.logger) 74 | 75 | with open(model_path, "rb") as f: 76 | serialized_engine = f.read() 77 | self.engine = self.runtime.deserialize_cuda_engine(serialized_engine) 78 | self.context = self.engine.create_execution_context() 79 | 80 | self.nIO = self.engine.num_io_tensors # io变量数量 81 | self.lTensorName = [self.engine.get_tensor_name(i) for i in range(self.nIO)] # 获取io变量名字 82 | self.nInput = [self.engine.get_tensor_mode(self.lTensorName[i]) for i in range(self.nIO)].count(trt.TensorIOMode.INPUT) # 输入tensor数量 83 | self.Output = [self.engine.get_tensor_mode(self.lTensorName[i]) for i in range(self.nIO)].count(trt.TensorIOMode.OUTPUT) 84 | 85 | print("===============INPUT/OUTPUT=================== ") 86 | for i in range(self.nIO): 87 | print(f"[{i}]{'Input ' if i < self.nInput else 'Output'} -> "+ 88 | f"{self.engine.get_tensor_dtype(self.lTensorName[i])} " + # 数据类型 89 | f"{self.engine.get_tensor_shape(self.lTensorName[i])} " + # engine形状 90 | f"{self.context.get_tensor_shape(self.lTensorName[i])} " + # context形状 91 | f"{self.lTensorName[i]} ") # 名字 92 | print("============================================== ") 93 | 94 | # cpu端数据 95 | self.bufferH = [] 96 | for i in range(self.nIO): 97 | self.bufferH.append(np.empty(self.context.get_tensor_shape(self.lTensorName[i]), 98 | dtype=trt.nptype(self.engine.get_tensor_dtype(self.lTensorName[i])))) 99 | 100 | # # gpu端数据申请显存 101 | self.bufferD = [] 102 | for i in range(self.nIO): 103 | self.bufferD.append(cudart.cudaMalloc(self.bufferH[i].nbytes)[1]) 104 | 105 | 106 | def infer(self, origin_img): 107 | 
108 | img, img_shape, scale_factor = resize_image(origin_img, (640, 640)) 109 | img = normalize(img) 110 | blob = np.expand_dims(np.transpose(img, (2, 0, 1)), axis=0) 111 | 112 | # cpu端数据 113 | self.bufferH[0] = np.ascontiguousarray(img_shape) 114 | self.bufferH[1] = np.ascontiguousarray(blob) 115 | self.bufferH[2] = np.ascontiguousarray(scale_factor) 116 | 117 | 118 | for i in range(self.nInput, self.nIO): # 输出数据 119 | self.bufferH[i] = np.empty(self.context.get_tensor_shape(self.lTensorName[i]), 120 | dtype=trt.nptype(self.engine.get_tensor_dtype(self.lTensorName[i]))) 121 | 122 | # 输入数据复制到显存 123 | for i in range(self.nInput): 124 | cudart.cudaMemcpy(self.bufferD[i], self.bufferH[i].ctypes.data, self.bufferH[i].nbytes, 125 | cudart.cudaMemcpyKind.cudaMemcpyHostToDevice) 126 | 127 | # # 推理 execute_async_v2 execute_v2 128 | self.context.execute_v2(self.bufferD) # batchsize bindings 129 | for i in range(self.nInput, self.nIO): # 数据拷回cpu 130 | cudart.cudaMemcpy(self.bufferH[i].ctypes.data, self.bufferD[i], self.bufferH[i].nbytes, 131 | cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost) 132 | 133 | # for i in range(self.nInput, self.nIO): 134 | # print(len(self.bufferH[i])) 135 | dets = [] 136 | for i in self.bufferH[-1]: 137 | if i[1] > 0.5: 138 | dets.append(i) 139 | return dets 140 | 141 | 142 | def infer_(self, img_shape, blob, scale_factor): 143 | # cpu端数据 144 | self.bufferH[0] = np.ascontiguousarray(img_shape) 145 | self.bufferH[1] = np.ascontiguousarray(blob) 146 | self.bufferH[2] = np.ascontiguousarray(scale_factor) 147 | 148 | 149 | for i in range(self.nInput, self.nIO): # 输出数据 150 | self.bufferH[i] = np.empty(self.context.get_tensor_shape(self.lTensorName[i]), 151 | dtype=trt.nptype(self.engine.get_tensor_dtype(self.lTensorName[i]))) 152 | 153 | # 输入数据复制到显存 154 | for i in range(self.nInput): 155 | cudart.cudaMemcpy(self.bufferD[i], self.bufferH[i].ctypes.data, self.bufferH[i].nbytes, 156 | cudart.cudaMemcpyKind.cudaMemcpyHostToDevice) 157 | 158 | # # 推理 execute_async_v2 execute_v2 159 | self.context.execute_v2(self.bufferD) # batchsize bindings 160 | for i in range(self.nInput, self.nIO): # 数据拷回cpu 161 | cudart.cudaMemcpy(self.bufferH[i].ctypes.data, self.bufferD[i], self.bufferH[i].nbytes, 162 | cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost) 163 | 164 | # for i in range(self.nInput, self.nIO): 165 | # print(len(self.bufferH[i])) 166 | dets = [] 167 | for i in self.bufferH[-1]: 168 | if i[1] > 0.5: 169 | dets.append(i) 170 | return dets 171 | 172 | 173 | def myfree(self): 174 | for i in self.bufferD: # 释放显存 175 | cudart.cudaFree(i) 176 | 177 | 178 | if __name__ == '__main__': 179 | origin_img = cv2.imread(img_path) 180 | 181 | img, img_shape, scale_factor = resize_image(origin_img, (640, 640)) 182 | img = normalize(img) 183 | blob = np.expand_dims(np.transpose(img, (2, 0, 1)), axis=0) 184 | 185 | 186 | yolo_trt = RtdetrTrt() 187 | yolo_trt.infer(origin_img) 188 | for _ in range(50): 189 | # yolo_trt.infer(origin_img) 190 | yolo_trt.infer_(img_shape, blob, scale_factor) 191 | 192 | time_b = time.perf_counter() 193 | for _ in range(1000): 194 | # dets = yolo_trt.infer(origin_img) 195 | dets = yolo_trt.infer_(img_shape, blob, scale_factor) 196 | 197 | time_e = time.perf_counter() 198 | print(f"cost time: {(time_e-time_b)*1000 / 1000.0 :.2f}ms") 199 | 200 | for i in dets: 201 | print(f"class: {i[0]:.0f}\tscore: {i[1] :.2f}\tx1: {i[2] :.0f}\ty1: {i[3] :.0f}\tx2: {i[4] :.0f}\ty2: {i[5] :.0f}") 202 | 203 | 204 | 205 | yolo_trt.myfree() 206 | 207 |
-------------------------------------------------------------------------------- /2_faster_tensorrt/src/base/memory_tensor.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file memory_tensor.hpp 3 | * @author 0zzx0 4 | * @brief 内存显存相关 5 | * @version 0.1 6 | * @date 2023-6-11 2023-8-21 7 | * 8 | * @copyright Copyright (c) 2023 9 | * 10 | */ 11 | 12 | #ifndef MEMORY_TENSOR_HPP 13 | #define MEMORY_TENSOR_HPP 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | #include "tools.hpp" 21 | 22 | namespace FasterTRT { 23 | 24 | typedef struct { 25 | unsigned short _; 26 | } float16; 27 | enum class DataType : int { Unknow = -1, Float = 0, Float16 = 1, Int32 = 2, UInt8 = 3 }; 28 | 29 | enum class DataHead : int { Init = 0, Device = 1, Host = 2 }; 30 | 31 | float float16_to_float(float16 value); 32 | float16 float_to_float16(float value); 33 | int data_type_size(DataType dt); 34 | const char* data_head_string(DataHead dh); 35 | const char* data_type_string(DataType dt); 36 | 37 | /** 38 | * @brief MixMemory: gpu/cpu内存管理 39 | 实现对gpu和cpu内存进行分配和释放 40 | cpu使用的是pinned memory,当对gpu做内存复制时,性能比较好 41 | * 42 | */ 43 | class MixMemory { 44 | public: 45 | MixMemory(int device_id = CURRENT_DEVICE_ID); 46 | MixMemory(void* cpu, size_t cpu_size, void* gpu, size_t gpu_size); 47 | virtual ~MixMemory(); 48 | void* gpu(size_t size); 49 | void* cpu(size_t size); 50 | void release_gpu(); 51 | void release_cpu(); 52 | void release_all(); 53 | 54 | inline bool owner_gpu() const { return owner_gpu_; } 55 | inline bool owner_cpu() const { return owner_cpu_; } 56 | 57 | inline size_t cpu_size() const { return cpu_size_; } 58 | inline size_t gpu_size() const { return gpu_size_; } 59 | inline int device_id() const { return device_id_; } 60 | 61 | inline void* gpu() const { return gpu_; } 62 | 63 | // Pinned Memory 64 | inline void* cpu() const { return cpu_; } 65 | 66 | void reference_data(void* cpu, size_t cpu_size, void* gpu, size_t gpu_size); 67 | 68 | private: 69 | int device_id_ = 0; 70 | 71 | void* cpu_ = nullptr; 72 | size_t cpu_size_ = 0; 73 | bool owner_cpu_ = true; 74 | 75 | void* gpu_ = nullptr; 76 | size_t gpu_size_ = 0; 77 | bool owner_gpu_ = true; 78 | }; 79 | 80 | /** 81 | * @brief Tensor类,实现张量的管理 82 | 由于NN多用张量,必须有个类进行管理才方便,实现内存自动分配,计算索引等等 83 | 如果要调试,可以执行save_to_file,储存为文件后,在python中加载并查看 84 | * 85 | */ 86 | class Tensor { 87 | public: 88 | Tensor(const Tensor& other) = delete; 89 | Tensor& operator=(const Tensor& other) = delete; 90 | 91 | explicit Tensor(DataType dtype = DataType::Float, std::shared_ptr data = nullptr, 92 | int device_id = CURRENT_DEVICE_ID); 93 | explicit Tensor(int n, int c, int h, int w, DataType dtype = DataType::Float, 94 | std::shared_ptr data = nullptr, int device_id = CURRENT_DEVICE_ID); 95 | explicit Tensor(int ndims, const int* dims, DataType dtype = DataType::Float, 96 | std::shared_ptr data = nullptr, int device_id = CURRENT_DEVICE_ID); 97 | explicit Tensor(const std::vector& dims, DataType dtype = DataType::Float, 98 | std::shared_ptr data = nullptr, int device_id = CURRENT_DEVICE_ID); 99 | virtual ~Tensor(); 100 | 101 | int numel() const; 102 | inline int ndims() const { return shape_.size(); } 103 | inline int size(int index) const { return shape_[index]; } 104 | inline int shape(int index) const { return shape_[index]; } 105 | 106 | inline int batch() const { return shape_[0]; } 107 | inline int channel() const { return shape_[1]; } 108 | inline int height() const { return shape_[2]; } 109 | inline int width() 
const { return shape_[3]; } 110 | 111 | inline DataType type() const { return dtype_; } 112 | inline const std::vector& dims() const { return shape_; } 113 | inline const std::vector& strides() const { return strides_; } 114 | inline int bytes() const { return bytes_; } 115 | inline int bytes(int start_axis) const { return count(start_axis) * element_size(); } 116 | inline int element_size() const { return data_type_size(dtype_); } 117 | inline DataHead head() const { return head_; } 118 | 119 | std::shared_ptr clone() const; 120 | Tensor& release(); 121 | Tensor& set_to(float value); 122 | bool empty() const; 123 | 124 | ///////////////偏置部分 125 | template 126 | int offset(int index, _Args... index_args) const { 127 | const int index_array[] = {index, index_args...}; 128 | return offset_array(sizeof...(index_args) + 1, index_array); 129 | } 130 | 131 | int offset_array(const std::vector& index) const; 132 | int offset_array(size_t size, const int* index_array) const; 133 | 134 | ////////////////resize部分 135 | template 136 | Tensor& resize(int dim_size, _Args... dim_size_args) { 137 | const int dim_size_array[] = {dim_size, dim_size_args...}; 138 | return resize(sizeof...(dim_size_args) + 1, dim_size_array); 139 | } 140 | 141 | Tensor& resize(int ndims, const int* dims); 142 | Tensor& resize(const std::vector& dims); 143 | Tensor& resize_single_dim(int idim, int size); 144 | int count(int start_axis = 0) const; 145 | int device() const { return device_id_; } 146 | 147 | ////////////////////数据操作部分 148 | Tensor& to_gpu(bool copy = true); 149 | Tensor& to_cpu(bool copy = true); 150 | 151 | Tensor& to_half(); 152 | Tensor& to_float(); 153 | 154 | inline void* cpu() const { 155 | ((Tensor*)this)->to_cpu(); 156 | return data_->cpu(); 157 | } 158 | inline void* gpu() const { 159 | ((Tensor*)this)->to_gpu(); 160 | return data_->gpu(); 161 | } 162 | 163 | template 164 | inline const DType* cpu() const { 165 | return (DType*)cpu(); 166 | } 167 | template 168 | inline DType* cpu() { 169 | return (DType*)cpu(); 170 | } 171 | 172 | template 173 | inline DType* cpu(int i, _Args&&... args) { 174 | return cpu() + offset(i, args...); 175 | } 176 | 177 | template 178 | inline const DType* gpu() const { 179 | return (DType*)gpu(); 180 | } 181 | template 182 | inline DType* gpu() { 183 | return (DType*)gpu(); 184 | } 185 | 186 | template 187 | inline DType* gpu(int i, _Args&&... args) { 188 | return gpu() + offset(i, args...); 189 | } 190 | 191 | template 192 | inline DType& at(int i, _Args&&... 
args) { 193 | return *(cpu() + offset(i, args...)); 194 | } 195 | 196 | std::shared_ptr get_data() const { return data_; } 197 | std::shared_ptr get_workspace() const { return workspace_; } 198 | Tensor& set_workspace(std::shared_ptr workspace) { 199 | workspace_ = workspace; 200 | return *this; 201 | } 202 | 203 | bool is_stream_owner() const { return stream_owner_; } 204 | cudaStream_t get_stream() const { return stream_; } 205 | Tensor& set_stream(cudaStream_t stream, bool owner = false) { 206 | stream_ = stream; 207 | stream_owner_ = owner; 208 | return *this; 209 | } 210 | 211 | Tensor& set_mat(int n, const cv::Mat& image); 212 | Tensor& set_norm_mat(int n, const cv::Mat& image, float mean[3], float std[3]); 213 | cv::Mat at_mat(int n = 0, int c = 0) { 214 | return cv::Mat(height(), width(), CV_32F, cpu(n, c)); 215 | } 216 | 217 | Tensor& synchronize(); 218 | const char* shape_string() const { return shape_string_; } 219 | const char* descriptor() const; 220 | 221 | Tensor& copy_from_gpu(size_t offset, const void* src, size_t num_element, 222 | int device_id = CURRENT_DEVICE_ID); 223 | Tensor& copy_from_cpu(size_t offset, const void* src, size_t num_element); 224 | 225 | void reference_data(const std::vector& shape, void* cpu_data, size_t cpu_size, 226 | void* gpu_data, size_t gpu_size, DataType dtype); 227 | /** 228 | 229 | # 以下代码是python中加载Tensor 230 | import numpy as np 231 | 232 | def load_tensor(file): 233 | 234 | with open(file, "rb") as f: 235 | binary_data = f.read() 236 | 237 | magic_number, ndims, dtype = np.frombuffer(binary_data, np.uint32, count=3, offset=0) 238 | assert magic_number == 0xFCCFE2E2, f"{file} not a tensor file." 239 | 240 | dims = np.frombuffer(binary_data, np.uint32, count=ndims, offset=3 * 4) 241 | 242 | if dtype == 0: 243 | np_dtype = np.float32 244 | elif dtype == 1: 245 | np_dtype = np.float16 246 | else: 247 | assert False, f"Unsupport dtype = {dtype}, can not convert to numpy dtype" 248 | 249 | return np.frombuffer(binary_data, np_dtype, offset=(ndims + 3) * 4).reshape(*dims) 250 | 251 | **/ 252 | bool save_to_file(const std::string& file) const; 253 | bool load_from_file(const std::string& file); 254 | 255 | private: 256 | Tensor& compute_shape_string(); 257 | Tensor& adajust_memory_by_update_dims_or_type(); 258 | void setup_data(std::shared_ptr data); 259 | 260 | private: 261 | std::vector shape_; 262 | std::vector strides_; 263 | size_t bytes_ = 0; 264 | DataHead head_ = DataHead::Init; 265 | DataType dtype_ = DataType::Float; 266 | cudaStream_t stream_ = nullptr; 267 | int device_id_ = 0; 268 | char shape_string_[100]; 269 | char descriptor_string_[100]; 270 | std::shared_ptr data_; 271 | std::shared_ptr workspace_; 272 | 273 | bool stream_owner_ = false; 274 | }; 275 | 276 | }; // namespace FasterTRT 277 | 278 | #endif -------------------------------------------------------------------------------- /2_faster_tensorrt/src/base/infer_base.cpp: -------------------------------------------------------------------------------- 1 | #include "infer_base.hpp" 2 | 3 | namespace FasterTRT { 4 | /////////////////////////////////////////////////////////////////////// 5 | /////////////////////////// TRTInferImpl ////////////////////////////// 6 | /////////////////////////////////////////////////////////////////////// 7 | 8 | TRTInferImpl::~TRTInferImpl() { 9 | destroy(); 10 | } 11 | 12 | // 销毁对象(析构默认调用) 13 | void TRTInferImpl::destroy() { 14 | int old_device = 0; 15 | checkCudaRuntime(cudaGetDevice(&old_device)); 16 | checkCudaRuntime(cudaSetDevice(device_)); 
17 | this->context_.reset(); 18 | this->blobsNameMapper_.clear(); 19 | this->outputs_.clear(); 20 | this->inputs_.clear(); 21 | this->inputs_name_.clear(); 22 | this->outputs_name_.clear(); 23 | checkCudaRuntime(cudaSetDevice(old_device)); 24 | } 25 | 26 | // 打印信息 输入输出信息 27 | void TRTInferImpl::print() { 28 | if(!context_) { 29 | INFOW("Infer print, nullptr."); 30 | return; 31 | } 32 | 33 | INFO("Infer %p I/O detail", this); 34 | INFO("\tMax Batch Size: %d", this->get_max_batch_size()); 35 | INFO("\tInputs count: %d", inputs_.size()); 36 | for(int i = 0; i < inputs_.size(); ++i) { 37 | INFO("\t\t%d.%s : shape {%s}", i, inputs_name_[i].c_str(), inputs_[i]->shape_string()); 38 | } 39 | 40 | INFO("\tOutputs count: %d", outputs_.size()); 41 | for(int i = 0; i < outputs_.size(); ++i) { 42 | INFO("\t\t%d.%s : shape {%s}", i, outputs_name_[i].c_str(), outputs_[i]->shape_string()); 43 | } 44 | } 45 | 46 | // 序列化engine 47 | std::shared_ptr> TRTInferImpl::serial_engine() { 48 | auto memory = this->context_->engine_->serialize(); 49 | auto output = std::make_shared>((uint8_t*)memory->data(), 50 | (uint8_t*)memory->data() + memory->size()); 51 | memory->destroy(); 52 | return output; 53 | } 54 | 55 | // 从内存加载 56 | bool TRTInferImpl::load_from_memory(const void* pdata, size_t size) { 57 | if(pdata == nullptr || size == 0) return false; 58 | 59 | context_.reset(new EngineContext()); 60 | 61 | // build model 62 | if(!context_->build_model(pdata, size)) { 63 | context_.reset(); 64 | return false; 65 | } 66 | 67 | workspace_.reset(new MixMemory()); 68 | cudaGetDevice(&device_); 69 | build_engine_input_and_outputs_mapper(); 70 | return true; 71 | } 72 | 73 | // 从文件加载 74 | bool TRTInferImpl::load(const std::string& file, int batch_size) { 75 | auto data = load_file(file); 76 | if(data.empty()) return false; 77 | 78 | context_.reset(new EngineContext()); 79 | 80 | // build model 81 | if(!context_->build_model(data.data(), data.size())) { 82 | context_.reset(); 83 | return false; 84 | } 85 | batch_max_size_ = batch_size; 86 | 87 | workspace_.reset(new MixMemory()); 88 | cudaGetDevice(&device_); 89 | build_engine_input_and_outputs_mapper(); 90 | return true; 91 | } 92 | 93 | // 获取设备的内存大小 94 | size_t TRTInferImpl::get_device_memory_size() { 95 | EngineContext* context = (EngineContext*)this->context_.get(); 96 | return context->context_->getEngine().getDeviceMemorySize(); 97 | } 98 | 99 | // 获取输入输出等信息 100 | void TRTInferImpl::build_engine_input_and_outputs_mapper() { 101 | EngineContext* context = (EngineContext*)this->context_.get(); 102 | int nbBindings = context->engine_->getNbBindings(); 103 | // int max_batchsize = context->engine_->getMaxBatchSize(); 104 | int max_batchsize = batch_max_size_; 105 | 106 | inputs_.clear(); 107 | inputs_name_.clear(); 108 | outputs_.clear(); 109 | outputs_name_.clear(); 110 | orderdBlobs_.clear(); 111 | bindingsPtr_.clear(); 112 | blobsNameMapper_.clear(); 113 | for(int i = 0; i < nbBindings; ++i) { 114 | auto dims = context->engine_->getBindingDimensions(i); 115 | auto type = context->engine_->getBindingDataType(i); 116 | const char* bindingName = context->engine_->getBindingName(i); 117 | dims.d[0] = max_batchsize; 118 | auto newTensor = std::make_shared(dims.nbDims, dims.d); 119 | newTensor->set_stream(this->context_->stream_); 120 | newTensor->set_workspace(this->workspace_); 121 | if(context->engine_->bindingIsInput(i)) { 122 | // if is input 123 | inputs_.push_back(newTensor); 124 | inputs_name_.push_back(bindingName); 125 | 
inputs_map_to_ordered_index_.push_back(orderdBlobs_.size()); 126 | } else { 127 | // if is output 128 | outputs_.push_back(newTensor); 129 | outputs_name_.push_back(bindingName); 130 | outputs_map_to_ordered_index_.push_back(orderdBlobs_.size()); 131 | } 132 | blobsNameMapper_[bindingName] = i; 133 | orderdBlobs_.push_back(newTensor); 134 | } 135 | bindingsPtr_.resize(orderdBlobs_.size()); 136 | } 137 | 138 | // 数据和推理引擎设置cuda流 139 | void TRTInferImpl::set_stream(cudaStream_t stream) { 140 | this->context_->set_stream(stream); 141 | 142 | for(auto& t : orderdBlobs_) t->set_stream(stream); 143 | } 144 | 145 | // 获取当前cuda流 146 | cudaStream_t TRTInferImpl::get_stream() { 147 | return this->context_->stream_; 148 | } 149 | 150 | // 获取当前设备 151 | int TRTInferImpl::device() { 152 | return device_; 153 | } 154 | 155 | // 等待同步 156 | void TRTInferImpl::synchronize() { 157 | checkCudaRuntime(cudaStreamSynchronize(context_->stream_)); 158 | } 159 | 160 | // 判断是否属于输出 161 | bool TRTInferImpl::is_output_name(const std::string& name) { 162 | return std::find(outputs_name_.begin(), outputs_name_.end(), name) != outputs_name_.end(); 163 | } 164 | 165 | // 判断是否属于输入 166 | bool TRTInferImpl::is_input_name(const std::string& name) { 167 | return std::find(inputs_name_.begin(), inputs_name_.end(), name) != inputs_name_.end(); 168 | } 169 | 170 | // 推理 171 | void TRTInferImpl::forward(bool sync) { 172 | EngineContext* context = (EngineContext*)context_.get(); 173 | int inputBatchSize = inputs_[0]->size(0); 174 | for(int i = 0; i < context->engine_->getNbBindings(); ++i) { 175 | auto dims = context->engine_->getBindingDimensions(i); 176 | auto type = context->engine_->getBindingDataType(i); 177 | dims.d[0] = inputBatchSize; 178 | if(context->engine_->bindingIsInput(i)) { 179 | context->context_->setBindingDimensions(i, dims); 180 | } 181 | } 182 | 183 | for(int i = 0; i < outputs_.size(); ++i) { 184 | outputs_[i]->resize_single_dim(0, inputBatchSize); 185 | outputs_[i]->to_gpu(false); 186 | } 187 | 188 | for(int i = 0; i < orderdBlobs_.size(); ++i) bindingsPtr_[i] = orderdBlobs_[i]->gpu(); 189 | 190 | void** bindingsptr = bindingsPtr_.data(); 191 | // bool execute_result = context->context_->enqueue(inputBatchSize, bindingsptr, 192 | // context->stream_, nullptr); 193 | bool execute_result = context->context_->enqueueV2(bindingsptr, context->stream_, nullptr); 194 | if(!execute_result) { 195 | auto code = cudaGetLastError(); 196 | INFOF("execute fail, code %d[%s], message %s", code, cudaGetErrorName(code), 197 | cudaGetErrorString(code)); 198 | } 199 | 200 | if(sync) { 201 | synchronize(); 202 | } 203 | } 204 | 205 | // 获取workspace_(这是一个内存管理类的指针) 206 | std::shared_ptr TRTInferImpl::get_workspace() { 207 | return workspace_; 208 | } 209 | 210 | // 返回输入数量 211 | int TRTInferImpl::num_input() { 212 | return this->inputs_.size(); 213 | } 214 | 215 | // 返回输出数量 216 | int TRTInferImpl::num_output() { 217 | return this->outputs_.size(); 218 | } 219 | 220 | // 设置第index的输入 221 | void TRTInferImpl::set_input(int index, std::shared_ptr tensor) { 222 | Assert(index >= 0 && index < inputs_.size()); 223 | this->inputs_[index] = tensor; 224 | 225 | int order_index = inputs_map_to_ordered_index_[index]; 226 | this->orderdBlobs_[order_index] = tensor; 227 | } 228 | 229 | // 设置第index的输出 230 | void TRTInferImpl::set_output(int index, std::shared_ptr tensor) { 231 | Assert(index >= 0 && index < outputs_.size()); 232 | this->outputs_[index] = tensor; 233 | 234 | int order_index = outputs_map_to_ordered_index_[index]; 235 | 
this->orderdBlobs_[order_index] = tensor; 236 | } 237 | 238 | // 返回第index输入tensor 239 | std::shared_ptr TRTInferImpl::input(int index) { 240 | Assert(index >= 0 && index < inputs_name_.size()); 241 | return this->inputs_[index]; 242 | } 243 | 244 | // 返回第index输入tensor名字 245 | std::string TRTInferImpl::get_input_name(int index) { 246 | Assert(index >= 0 && index < inputs_name_.size()); 247 | return inputs_name_[index]; 248 | } 249 | 250 | // 返回第index输输出tensor 251 | std::shared_ptr TRTInferImpl::output(int index) { 252 | Assert(index >= 0 && index < outputs_.size()); 253 | return outputs_[index]; 254 | } 255 | 256 | // 返回第index输出tensor名字 257 | std::string TRTInferImpl::get_output_name(int index) { 258 | Assert(index >= 0 && index < outputs_name_.size()); 259 | return outputs_name_[index]; 260 | } 261 | 262 | // 获取最大batchsize 263 | int TRTInferImpl::get_max_batch_size() { 264 | Assert(this->context_ != nullptr); 265 | // return this->context_->engine_->getMaxBatchSize(); 266 | return batch_max_size_; 267 | } 268 | 269 | // 根据名字查找tensor 270 | std::shared_ptr TRTInferImpl::tensor(const std::string& name) { 271 | Assert(this->blobsNameMapper_.find(name) != this->blobsNameMapper_.end()); 272 | return orderdBlobs_[blobsNameMapper_[name]]; 273 | } 274 | 275 | /////////////////////////////////////////////////////////////////////// 276 | /////////////////////////加载文件初始化对象 ///////////////////////////// 277 | /////////////////////////////////////////////////////////////////////// 278 | std::shared_ptr load_infer(const std::string& file, int batch_size) { 279 | std::shared_ptr infer(new TRTInferImpl()); 280 | if(!infer->load(file, batch_size)) infer.reset(); 281 | return infer; 282 | } 283 | 284 | }; // namespace FasterTRT 285 | -------------------------------------------------------------------------------- /2_faster_tensorrt/readme.md: -------------------------------------------------------------------------------- 1 | # Faster tensorrt 2 | 3 | ## 前言 4 | 5 | 使用之前你应该已经了解trt的构建和推理流程,所以此处不再涉及基础使用。你应该修改的最少有 6 | ```txt 7 | 1. CMakeLists.txt中的cuda、cudnn、tensorrt环境路径 8 | 2. main.cpp中的测试推理图片/视频的路径、trt二进制文件路径,推理类别等 9 | 3. 预处理和后处理也要根据实际使用模型修改,本文代码以yolox为例 10 | ``` 11 | 12 | 原始的TensorRT_Pro有十分优秀的性能,并且接口的设计也很巧妙。但是我在复现和使用的时候发现部分可能不太适用于我当前使用的机器人。 13 | 1. 它的加速是在将需要推理的所有图像全部commit, 然后它内部每个batch的加载和推理。但是在单目机器人上往往是视频流输入,此时是一般是不能输入batch数据的,所以此时实际上是不会比直接推理快多少。 14 | 15 | 2. 图像commit后的结果等待没有任务队列管理 16 | 17 | 3. 它用的是自写的CUDA NMS,但是实际上TensorRT8上有很多官方的NMS插件,可以替换。两者的实际效果对比待测试。 18 | 19 | 20 | ## 1. 文件说明 21 | 22 | 我在大多数地方都已经加了中文注释,应该能够容易看懂。当然注释可能也会有写错或者理解错误啥的,还是需要有自己的思考的,也欢迎一起交流。在`src`目录下一共有五大部分,分别是`apps`,`base`,`eval`,`kernels`,`onnx_model`,`main.cpp`。 23 | 24 | ### 1.1 base 25 | 26 | 这里主要是一些整体框架的基础,根据仔细需要求修改。 27 | 28 | 1. `tools.hpp`: 一些工具函数 包括log日志打印,CUDA检查,输出文件保存读取等定义并直接实现 29 | 2. `memory_tensor.hpp`: 定义`MixMemory`实现内存和显存的申请和释放;定义`Tensor`实现张量的管理、扩容、拷贝等 30 | 3. `memory_tensor.cpp`: `MixMemory`和`Tensor`的实现 31 | 4. `monopoly_accocator.hpp`: 定义内存独占管理分配器,最终实现预处理和推理并行的重要工具 32 | 5. `infer_base.hpp`: 定义trt引擎管理类和异步安全推理类 33 | 6. `infer_base.cpp`: trt引擎管理类和异步安全推理类的实现 34 | 7. `trt_base.hpp`: 定义trt引擎构建和量化 35 | 8. `trt_base.hpp`: trt引擎构建和量化实现 36 | 37 | ### 1.2 kernels 38 | 39 | 推荐把cuda相关实现放在此文件夹中。 40 | 41 | 1. `cuda_kernel.cuh`: cuda核函数的定义 42 | 1. `cuda_kernel.cu`: cuda核函数的实现,预处理和后处理相关的cuda加速代码 43 | 44 | 45 | ### 1.3 eval 46 | 47 | 这里一个评估相关代码,可以测试相关数据集(coco格式)使用trt推理的map,暂时主要针对目标检测。 48 | 49 | 1. `save.hpp`: 一个保存检测结果到文件里的类 50 | 2. `get_imgid_txt.py`: 读取`eval_results.json`,来保存图片name和id到文件`img_id.txt` 51 | 3. 
`eval.cpp`: 读取`img_id.txt`中的图片,进行推理,并保存相应结果到`results.txt` 52 | 4. `img_id.txt`: img的id和img的name的对应,便于评估 53 | 5. `results.txt`: 检测的结果 54 | 6. `eval_results.json`: 检测结果保存到json文件 55 | 7. `eval.py`: 最终的评估程序,打印结果 56 | 57 | 58 | ### 1.4 apps 59 | 60 | 这里是实际模型的实现地方,定义模型的结构,推理过程,预处理和后处理流程等,推荐每个模型新建一个文件夹实现。 61 | 62 | 1. `common.hpp`: 一些视觉任务中都会用到的功能挡在这里,比如bbox定义、图片仿射变换的计算等 63 | 64 | 然后就一些具体模型的实现了 65 | 2. `yolo/yolo.h`: 定义yolo的推理 66 | 3. `yolo/yolo.cpp`: yolo推理的实现 67 | 68 | ### 1.5 onnx_model 69 | 70 | 有一些模型的onnx文件需要一些操作才能被本仓库正确检测,这个地方存放编辑onnx的python文件。 71 | 72 | ### 1.6 main 73 | 74 | 1. `main.cpp`: 主函数,调用和实现功能都在此处,动态的控制队列也是在此处实际推理中实现。 75 | 76 | 77 | ## 2. 使用教程 78 | 79 | ### 2.1 模型转换 80 | 81 | #### 2.1.1 trtexec 82 | 模型转换部分,在不需要增加自定义算子的时候,想要导出tensorrt的engine,**trtexec is all you need!** 83 | 84 | ```shell 85 | # 构建模型时 86 | trtexec 87 | --onnx = ./model NCHW.onnx # 指定onnx模型文件名 88 | # --output=y:0 # 指定输出张量名(使用 Onnx 时该选项无效) 89 | --minShapes =x:0:1x1x28x28 90 | --optShapes =x:0:4x1x28x28 91 | --maxShapes =x:0:16x1x28x28 # 指定输入形状的范围最小值、最常见值、最大值 92 | --workspace = 1024 # 以后要用 memPoolSize 优化过程可使用显存最大值 93 | --fp16 # 指定引擎精度和稀疏性等属性 int8 noTF32 best sparsity 94 | --saveEngine=model.plan # 指定输出引擎文件名 95 | --skipInference # 只创建引擎不运行 旧版本叫buildonly 96 | --verbose # 打印详细日志 97 | --timingCacheFile=timing.cache # 指定输出优化计时缓存文件名 98 | --profilingVerbosity =detailed # 构建期保留更多的逐层信息 99 | --dumpLayerInfo # 打印层信息 100 | --exportLayerInfo=layerInfo.txt # 导出引擎逐层信息,可与 profilingVerbosity 合用 101 | 102 | # 模型推理时 103 | trtexec 104 | --loadEngine=model.plan # 读取 engine 文件 105 | --shapes=x:1x1x28x28 # 指定输入张量形状 106 | --warmUp=1000 # 热身阶段最短运行时间(单位: ms 107 | --duration=10 # 测试阶段最短运行时间(单位: s 108 | --iterations=100 # 指定测试阶段运行的最小迭代次数 109 | --useCudaGraph # 使用 CUDAGraph 来捕获和执行推理过程 110 | --noDataTransfers # 关闭 Host 和 Device 之间的数据传输 111 | --streams=2 # 使用多个 stream 来运行推理 112 | --threads # 使用多线程 113 | --verbose # 打印详细日志 114 | --dumpProfile 115 | --exportProfile=layerProfile.txt # 保存逐层性能数据信息 116 | ``` 117 | 118 | 119 | #### 2.1.2 polygraphy 120 | 很牛的工具! 121 | 122 | polygraphy工具,可以多后端运行对比,对比不同后端结果,生成engine等(重要),还可以判断那些算子不能被trt加速,并把这些切割出来 123 | Build TensorRT engine using the ONNX file, and compare the output of each layer between Onnxruntime and TensorRT 124 | ```shell 125 | polygraphy run model.onnx \ 126 | --onnxrt --trt \ 127 | --workspace 1000000000 \ 128 | --save-engine=model-FP32-MarkAll.plan \ 129 | --atol 1e-3 --rtol 1e-3 \ 130 | --verbose \ 131 | --onnx-outputs mark all \ 132 | --trt-outputs mark all \ 133 | --trt-min-shapes 'tensor-0:[1,1,28,28]' \ 134 | --trt-opt-shapes 'tensor-0:[4,1,28,28]' \ 135 | --trt-max-shapes 'tensor-0:[16,1,28,28]' \ 136 | --input-shapes 'tensor-0:[4,1,28,28]' 137 | > result-run-FP32-MarkAll.log 2>&1 138 | 139 | ``` 140 | 141 | #### 2.1.3 trt api 142 | 除此之外,tensorrt_pro中也给出了一个complie的模型转换接口,我也搬运了过来 143 | ```cpp 144 | 145 | bool compile( 146 | Mode mode, 147 | YoloType type, 148 | unsigned int max_batch_size, 149 | const string& source_onnx_file, 150 | const string& save_engine_file, 151 | size_t max_workspace_size = 1<<30, 152 | const string& int8_images_folder="", 153 | const string& int8_entropy_calibrator_cache_file="" 154 | ); 155 | ``` 156 | 157 | 158 | ### 2.2 模型推理 159 | 160 | 目前已经仓库里支持了 161 | 1. yolox: 基本是官方默认的吧,我把fcous换成了conv 162 | 2. yolov8:yolov8导出的onnx模型需要经过编辑,主要是输出增加一个维度调整,方便和yolox的一起处理。可以参考代码[v8onnx_tranpose.py](./src/onnx_model/v8onnx_tranpose.py) 163 | 3. rtdetr:百度家出的检测器,导出可以参考[rtdetr_sim_export_trt.py](./src/onnx_model/rtdetr_sim_export_trt.py) 164 | ... 
165 | 166 | 167 | 本仓库就突出一个接口简单。 168 | 169 | ```cpp 170 | // 创建模型 171 | auto yolo = YOLO::create_infer(model_file, type, deviceid, batch_size, confidence_threshold, nms_threshold); 172 | 173 | // 推理图片 174 | auto objs = yolo->commit(image); 175 | 176 | // 得到结果 177 | auto res = objs.get(); 178 | 179 | ``` 180 | 控制队列形式 181 | ```cpp 182 | 183 | queue> out_queue; 184 | 185 | for(int i=0;i<10;i++) { 186 | auto objs = yolo->commit(image); 187 | out_queue.emplace(objs); 188 | if(out_queue.size() < keep_queue_long) { 189 | continue; 190 | } 191 | auto res = out_queue.front().get(); 192 | out_queue.pop(); 193 | } 194 | while(!out_queue.empty()) { 195 | auto res = out_queue.front().get(); 196 | out_queue.pop(); 197 | } 198 | ``` 199 | 200 | ### 2.3 模型测评 201 | 202 | 使用c++的推理结果来实现coco格式的eval格式,进而便于对比加速前后精度的变化。稍微有点麻烦,整体思想是保存c++的推理结果,然后用python的pycocotools来实现结果的计算。 203 | 204 | 首先运行`eval/get_imgid_txt.py`,得到`img_id.txt`文件,包含了图片名称和图片id的对应 205 | ``` 206 | 0 005894.jpg 207 | 1 004755.jpg 208 | ``` 209 | 210 | 然后默认cmake会编译eval文件夹的内容,当需要模型评测时,运行`build/eval`可以得到`results.txt`,包含推理结果 211 | ``` 212 | 005894.jpg 0 0 0.836939 1175 609 229 181 213 | 005894.jpg 0 1 0.768631 2468 1880 99 162 214 | 005894.jpg 0 2 0.70347 1938 607 216 141 215 | 005894.jpg 0 2 0.781555 944 1442 163 203 216 | 004755.jpg 1 1 0.557236 622 361 59 45 217 | 004755.jpg 1 1 0.676005 383 79 64 44 218 | ``` 219 | 最后运行`eval/eval.py`,得到最终的coco格式的map 220 | ``` 221 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.447 222 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.751 223 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.486 224 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.175 225 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.404 226 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.503 227 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.166 228 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.465 229 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.507 230 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.244 231 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.471 232 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.558 233 | ``` 234 | 235 | 236 | ### 2.4 自定义模型 237 | 238 | #### 2.4.1 新建文件夹(bushi) 239 | 240 | 建议在`src/apps`目录下新建一个文件夹,此处以暂未出现的`YoloDetr`称呼。创建相应的头文件和源文件`yolo_detr.h`,`yolo_detr.cpp`,相关的预处理和后处理之类的肯定是要和训练期间的设置保持一致的,不过既然叫`***detr`了明显是不需要后处理的😏。预处理的话建议在gpu上完成,可以在`src/kernels`文件夹中新创建你需要的预处理,或者找之前满足你要求的已经实现的预处理。 241 | 242 | #### 2.5.2 实现 243 | 244 | 首先包含相关头文件,并使用命名空间 245 | ```cpp 246 | #include "../common.hpp" 247 | using namespace FasterTRT; 248 | ``` 249 | 然后新建一个推理类,并实现相关方法。 250 | ```cpp 251 | // 线程安全模板类设置模板类型 252 | using ThreadSafedAsyncInferImpl = ThreadSafedAsyncInfer 253 | < 254 | cv::Mat, // input 255 | BoxArray, // output 256 | tuple, // start param 257 | AffineMatrix // additional 258 | >; 259 | // 推理基类 260 | using Infer = InferBase; 261 | 262 | // 创建该模型的类 263 | class YoloDETR : public Infer, public ThreadSafedAsyncInferImpl { 264 | 265 | // 最少实现 266 | // 1. 初始化 里面需要调用ThreadSafedAsyncInferImpl::startup(make_tuple(file, gpuid)); 267 | virtual bool startup(const string& file, YoloType type, int gpuid, int batch_size, float confidence_threshold, float nms_threshold); 268 | 269 | // 2. 
工作线程 里面指定输入输出 并初始化内存显存,指定推理顺序等等 270 | virtual void worker(promise& result) override; 271 | 272 | // 3. 预处理操作 273 | virtual bool preprocess(Job& job, const Mat& image) override; 274 | 275 | // 4. 推理,包括组推理和单个推理 276 | virtual vector> commits(const vector& images) override; 277 | virtual shared_future commit(const Mat& image) override; 278 | 279 | } 280 | 281 | // 封装接口,最终暴露给用户的只有commit和commits方法。 282 | // 当然也可以选择把所有接口都开放,不使用这里初始化就行了。 283 | shared_ptr create_infer(...){ 284 | shared_ptr instance(new YoloDETR()); 285 | if(!instance->startup(...)){ 286 | instance.reset(); 287 | } 288 | return instance; 289 | } 290 | 291 | ``` 292 | 293 | 294 | ## 3. 推理性能 295 | 296 | 使用Nsight Systems看看cuda处理过程,我后来才发现trt_pro中的fast_yolo是没有多流的,而完整版本是有多流选项的。不过我自己也已经实现了多流了,下面是我优化的过程。 297 | 298 | 首先是原始版本,多线程但是单流,这时候向流中提交任务还是串行实现的,虽然整体效率yolox官方给的高很多,但是还有提升空间。 299 | ![](./sources/ori.jpg) 300 | 这个时候cpu侧的双线程已经没用了,及时加上控制队列反而会造成这样的后果。 301 | ![](./sources/ori_queue.jpg) 302 | 303 | 然后,通过上图可以发现,H2D十分耗时,于是考虑使用双流,一流推理,另外一流专门执行H2D,顺带完成预处理工作。 304 | ![](./sources/2streamv1.jpg) 305 | 306 | 这个时候可能会疑惑,哎我现在已经是双流了,数据处理也确实在两条stream上了,为啥还不能并行呢?这是因为双流有一个问题是需要保证数据流的顺序。图片输入在提交任务到steam1(预处理流)后,数据异步拷贝到gpu,但是这个时候生产者已经把任务放到了任务队列中,所以推理线程会立即开始着手取数据和推理,这个时候由于是双流可能访问同一显存,就十分不安全了。于是我在stream1的最后执行了一个流同步的操作。 307 | 那怎么实现多流并行呢?我是用的方法是采用一个控制队列,保存推理线程返回future,然后立即推流下一帧,这个时候数据也安全,推理也安全!并且可以实现并行。 308 | ![](./sources/2streamv2.jpg) 309 | 310 | 并且整体的gpu利用率也更近紧凑了,下图左边是加控制队列后,右边是加控制队列前。 311 | ![](./sources/2steam_overview.jpg) 312 | 313 | 314 | 在2080Ti(8.5)上推理图片,不包含图像的读取和画框,warmup500,跑2000轮,平均耗时 315 | 316 | | method | ori | ori+queue | ori+2stream | ori+queue+2stream | 317 | | :----: | :----: | :----: | :----: | :----: | 318 | | cost time | 2.25ms | 1.84ms | 2.28ms | 1.41ms | 319 | | FPS | 444.64 | 542.89 | 438.6 | 709.98 | 320 | 321 | 326 | 327 | 328 | ## 4. 
More优化 329 | 330 | - [ ] gpu内存异步操作内核进一步融合,使用一个gpu内核实现运算符组合,减少数据传输和内核启动延迟 331 | - [ ] 一个tensorrt的engine可以创建多个context,实现多线程调用。只占用一个engine显存的大小,同时供多个推理运算 332 | - [ ] 向量化全局内存访问,提高内存访问效率 333 | - [ ] transformer系列算法加速支持 by fastertransformers 334 | -------------------------------------------------------------------------------- /2_faster_tensorrt/src/base/infer_base.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file infer_base.hpp 3 | * @author 0zzx0 4 | * @brief 推理器基类 5 | * @version 0.1 6 | * @date 2023-6-11 2023-8-21 7 | * 8 | * @copyright Copyright (c) 2023 9 | * 10 | */ 11 | 12 | #ifndef INFER_BASE_HPP 13 | #define INFER_BASE_HPP 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | #include 24 | 25 | #include "memory_tensor.hpp" 26 | #include "monopoly_accocator.hpp" 27 | #include "../kernels/cuda_kernel.cuh" 28 | 29 | namespace FasterTRT { 30 | 31 | using namespace nvinfer1; 32 | 33 | /////////////////////////// TRT logger /////////////////////////// 34 | class Logger : public ILogger { 35 | public: 36 | virtual void log(Severity severity, const char* msg) noexcept override { 37 | if(severity == Severity::kINTERNAL_ERROR) { 38 | INFOE("NVInfer INTERNAL_ERROR: %s", msg); 39 | abort(); 40 | } else if(severity == Severity::kERROR) { 41 | INFOE("NVInfer: %s", msg); 42 | } else if(severity == Severity::kWARNING) { 43 | INFOW("NVInfer: %s", msg); 44 | } else if(severity == Severity::kINFO) { 45 | INFOD("NVInfer: %s", msg); 46 | } else { 47 | INFOD("%s", msg); 48 | } 49 | } 50 | }; 51 | static Logger gLogger; 52 | 53 | // 销毁tensorrt中间指针对象的函数模板 54 | template 55 | static void destroy_nvidia_pointer(_T* ptr) { 56 | if(ptr) ptr->destroy(); 57 | } 58 | 59 | /** 60 | * @brief 封装trt运行过程中的部分资源 方便一起创建和销毁 61 | * 62 | */ 63 | class EngineContext { 64 | public: 65 | virtual ~EngineContext() { destroy(); } 66 | 67 | // 设置stream 如果已经存在销毁旧的,添加新的 68 | void set_stream(cudaStream_t stream) { 69 | if(owner_stream_) { 70 | if(stream_) { 71 | cudaStreamDestroy(stream_); 72 | } 73 | owner_stream_ = false; 74 | } 75 | stream_ = stream; 76 | } 77 | 78 | // 使用智能指针创建runtime engine context和初始化stream 79 | bool build_model(const void* pdata, size_t size) { 80 | destroy(); 81 | 82 | if(pdata == nullptr || size == 0) return false; 83 | 84 | owner_stream_ = true; 85 | checkCudaRuntime(cudaStreamCreate(&stream_)); 86 | if(stream_ == nullptr) return false; 87 | 88 | runtime_ = std::shared_ptr(createInferRuntime(gLogger), 89 | destroy_nvidia_pointer); 90 | if(runtime_ == nullptr) return false; 91 | 92 | engine_ = 93 | std::shared_ptr(runtime_->deserializeCudaEngine(pdata, size, nullptr), 94 | destroy_nvidia_pointer); 95 | if(engine_ == nullptr) return false; 96 | 97 | // runtime_->setDLACore(0); 98 | context_ = std::shared_ptr(engine_->createExecutionContext(), 99 | destroy_nvidia_pointer); 100 | return context_ != nullptr; 101 | } 102 | 103 | private: 104 | // 销毁这些指针 通过让智能指针引用计数减一 105 | void destroy() { 106 | context_.reset(); 107 | engine_.reset(); 108 | runtime_.reset(); 109 | 110 | if(owner_stream_) { 111 | if(stream_) { 112 | cudaStreamDestroy(stream_); 113 | } 114 | } 115 | stream_ = nullptr; 116 | } 117 | 118 | public: 119 | cudaStream_t stream_ = nullptr; 120 | bool owner_stream_ = false; 121 | std::shared_ptr context_; 122 | std::shared_ptr engine_; 123 | std::shared_ptr runtime_ = nullptr; 124 | }; 125 | 126 | /** 127 | * @brief 推理引擎的创建和推理 128 | 可以获取推理模型的各类输入输出信息 129 | * 130 | */ 131 | class TRTInferImpl { 132 
| public: 133 | virtual ~TRTInferImpl(); 134 | bool load(const std::string& file, int batch_size); 135 | bool load_from_memory(const void* pdata, size_t size); 136 | void destroy(); 137 | 138 | void forward(bool sync); 139 | 140 | int get_max_batch_size(); 141 | cudaStream_t get_stream(); 142 | void set_stream(cudaStream_t stream); 143 | void synchronize(); 144 | size_t get_device_memory_size(); 145 | std::shared_ptr get_workspace(); 146 | std::shared_ptr input(int index = 0); 147 | std::string get_input_name(int index = 0); 148 | std::shared_ptr output(int index = 0); 149 | std::string get_output_name(int index = 0); 150 | std::shared_ptr tensor(const std::string& name); 151 | bool is_output_name(const std::string& name); 152 | bool is_input_name(const std::string& name); 153 | void set_input(int index, std::shared_ptr tensor); 154 | void set_output(int index, std::shared_ptr tensor); 155 | std::shared_ptr> serial_engine(); 156 | 157 | void print(); 158 | 159 | int num_output(); 160 | int num_input(); 161 | int device(); 162 | 163 | private: 164 | void build_engine_input_and_outputs_mapper(); 165 | 166 | private: 167 | std::vector> inputs_; 168 | std::vector> outputs_; 169 | std::vector inputs_map_to_ordered_index_; 170 | std::vector outputs_map_to_ordered_index_; 171 | std::vector inputs_name_; 172 | std::vector outputs_name_; 173 | std::vector> orderdBlobs_; 174 | std::map blobsNameMapper_; 175 | std::shared_ptr context_; 176 | std::vector bindingsPtr_; 177 | std::shared_ptr workspace_; 178 | int device_ = 0; 179 | int batch_max_size_ = 1; 180 | }; 181 | 182 | /** 183 | * @brief 异步线程安全的推理器(虚基类 子类至少重写preprocess work) 184 | 通过异步线程启动,使得调用方允许任意线程调用把图像做输入,并通过future来获取异步结果 185 | 模板类 186 | * 187 | * @tparam Input 输入 188 | * @tparam Output 输入 189 | * @tparam StartParam 参数 190 | * @tparam JobAdditional job参数 191 | */ 192 | template , 193 | class JobAdditional = int> 194 | class ThreadSafedAsyncInfer { 195 | public: 196 | // Job数据类型。 197 | struct Job { 198 | Input input; 199 | Output output; 200 | JobAdditional additional; 201 | MonopolyAllocator::MonopolyDataPointer mono_tensor; 202 | std::shared_ptr> pro; 203 | }; 204 | 205 | virtual ~ThreadSafedAsyncInfer() { stop(); } 206 | 207 | // 停止 由析构函数调用 208 | void stop() { 209 | run_ = false; 210 | cond_.notify_all(); 211 | 212 | /// cleanup jobs 213 | { 214 | std::unique_lock l(jobs_lock_); 215 | while(!jobs_.empty()) { 216 | auto& item = jobs_.front(); 217 | if(item.pro) item.pro->set_value(Output()); 218 | jobs_.pop(); 219 | } 220 | }; 221 | 222 | if(worker_) { 223 | worker_->join(); 224 | worker_.reset(); 225 | } 226 | } 227 | 228 | // 启动 初始化线程 用一个promise等待worker中的初始化结束 229 | bool startup(const StartParam& param) { 230 | run_ = true; 231 | 232 | std::promise pro; 233 | start_param_ = param; 234 | worker_ = 235 | std::make_shared(&ThreadSafedAsyncInfer::worker, this, std::ref(pro)); 236 | return pro.get_future().get(); 237 | } 238 | 239 | // 单输入commit 先预处理input 然后上锁推进工作队列 cond_ 提醒 然后开始等待output 240 | virtual std::shared_future commit(const Input& input) { 241 | Job job; 242 | job.pro = std::make_shared>(); 243 | if(!preprocess(job, input)) { 244 | job.pro->set_value(Output()); 245 | return job.pro->get_future(); 246 | } 247 | 248 | ////////////////////上锁并且推进队列//////////////////////////// 249 | { 250 | std::unique_lock l(jobs_lock_); 251 | // jobs_.push(job); 252 | jobs_.emplace(job); 253 | }; 254 | cond_.notify_one(); 255 | return job.pro->get_future(); 256 | } 257 | 258 | // vector 输入commit 259 | virtual std::vector> commits(const std::vector& 
inputs) { 260 | int batch_size = std::min((int)inputs.size(), this->tensor_allocator_->capacity()); 261 | std::vector jobs(inputs.size()); 262 | std::vector> results(inputs.size()); 263 | 264 | int nepoch = (inputs.size() + batch_size - 1) / batch_size; 265 | for(int epoch = 0; epoch < nepoch; ++epoch) { 266 | int begin = epoch * batch_size; 267 | int end = std::min((int)inputs.size(), begin + batch_size); 268 | 269 | for(int i = begin; i < end; ++i) { 270 | Job& job = jobs[i]; 271 | job.pro = std::make_shared>(); 272 | if(!preprocess(job, inputs[i])) { 273 | job.pro->set_value(Output()); 274 | } 275 | results[i] = job.pro->get_future(); 276 | } 277 | 278 | /////////////////////////////////////////////////////////// 279 | { 280 | std::unique_lock l(jobs_lock_); 281 | for(int i = begin; i < end; ++i) { 282 | jobs_.emplace(std::move(jobs[i])); 283 | }; 284 | } 285 | cond_.notify_one(); 286 | } 287 | return results; 288 | } 289 | 290 | protected: 291 | // 工作线程(纯虚) 292 | virtual void worker(std::promise& result) = 0; 293 | // 预处理(纯虚) 294 | virtual bool preprocess(Job& job, const Input& input) = 0; 295 | 296 | // 获取任务组 等待之前的任务执行完毕 297 | virtual bool get_jobs_and_wait(std::vector& fetch_jobs, int max_size) { 298 | std::unique_lock l(jobs_lock_); 299 | cond_.wait(l, [&]() { 300 | return !run_ || !jobs_.empty(); 301 | }); // 当前run=true 且 job为empty(队列中的任务做完)的时候才会等待 302 | 303 | if(!run_) return false; 304 | 305 | fetch_jobs.clear(); 306 | for(int i = 0; i < max_size && !jobs_.empty(); ++i) { 307 | fetch_jobs.emplace_back(std::move(jobs_.front())); 308 | jobs_.pop(); 309 | } 310 | return true; 311 | } 312 | 313 | // 获取任务 等待之前的任务执行完毕 314 | virtual bool get_job_and_wait(Job& fetch_job) { 315 | std::unique_lock l(jobs_lock_); 316 | cond_.wait(l, [&]() { return !run_ || !jobs_.empty(); }); 317 | 318 | if(!run_) return false; 319 | 320 | fetch_job = std::move(jobs_.front()); 321 | jobs_.pop(); 322 | return true; 323 | } 324 | 325 | protected: 326 | StartParam start_param_; 327 | std::atomic run_; 328 | std::mutex jobs_lock_; 329 | std::queue jobs_; 330 | std::shared_ptr worker_; 331 | std::condition_variable cond_; 332 | std::shared_ptr> tensor_allocator_; 333 | }; 334 | 335 | /** 336 | * @brief 推理的虚基类 最终暴露给用户的接口,实际推理的类应该继承并实现本类中的方法 337 | * 338 | * @tparam Intput 输入 339 | * @tparam Output 输出 340 | */ 341 | template 342 | class InferBase { 343 | public: 344 | virtual std::shared_future commit(const Intput& image) = 0; 345 | virtual std::vector> commits(const std::vector& images) = 0; 346 | }; 347 | 348 | // 产生一个trt推理的智能指针 参数是序列化文件路径 349 | std::shared_ptr load_infer(const std::string& file, int batch_size); 350 | 351 | }; // namespace FasterTRT 352 | 353 | #endif 354 | -------------------------------------------------------------------------------- /1_trt_base/trt_yolox/cpp/yolox_end2end.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "NvInfer.h" 11 | #include "NvInferPlugin.h" 12 | #include "cuda_runtime_api.h" 13 | #include "logging.h" 14 | 15 | #define CHECK(status) \ 16 | do\ 17 | {\ 18 | auto ret = (status);\ 19 | if (ret != 0)\ 20 | {\ 21 | std::cerr << "Cuda failure: " << ret << std::endl;\ 22 | abort();\ 23 | }\ 24 | } while (0) 25 | 26 | #define DEVICE 0 // GPU id 27 | static const int INPUT_W = 640; 28 | static const int INPUT_H = 640; 29 | const char* INPUT_BLOB_NAME = "images"; 30 | const char* OUTPUT_BLOB_NAME1 = "output"; 31 | const 
char* OUTPUT_BLOB_NAME2 = "943"; 32 | const char* OUTPUT_BLOB_NAME3 = "944"; 33 | const char* OUTPUT_BLOB_NAME4 = "945"; 34 | 35 | const std::vector class_names={"echinus", "starfish", "holothurian", "scallop"}; 36 | const std::vector> color_list = 37 | { 38 | {0.000, 0.447, 0.741}, 39 | {0.850, 0.325, 0.098}, 40 | {0.929, 0.694, 0.125}, 41 | {0.494, 0.184, 0.556} 42 | }; 43 | 44 | using namespace nvinfer1; 45 | 46 | // class Logger : public nvinfer1::ILogger { 47 | // public: 48 | // void log(Severity severity, const char* msg) noexcept override { 49 | // if (severity != Severity::kINFO) { 50 | // std::cout << msg << std::endl; 51 | // } 52 | // } 53 | // }; 54 | 55 | 56 | class YoloEnd2End{ 57 | public: 58 | YoloEnd2End(const std::string model_path); 59 | cv::Mat static_resize(cv::Mat& image); 60 | float* blobFromImage(cv::Mat& img); 61 | void draw_objects(const cv::Mat& img, float* Boxes, int* ClassIndexs, int* BboxNum); 62 | void Infer(cv::Mat& img, float* Boxes, float* score, int* ClassIndexs, int* BboxNum); 63 | ~YoloEnd2End(); 64 | 65 | private: 66 | nvinfer1::ICudaEngine* engine = nullptr; 67 | nvinfer1::IRuntime* runtime = nullptr; 68 | nvinfer1::IExecutionContext* context = nullptr; 69 | cudaStream_t stream = nullptr; 70 | void* buffs[5]; 71 | int iH, iW, in_size, out_size1, out_size2, out_size3, out_size4; 72 | Logger gLogger; 73 | }; 74 | 75 | // resize 76 | cv::Mat YoloEnd2End::static_resize(cv::Mat& img) { 77 | float r = std::min(INPUT_W / (img.cols*1.0), INPUT_H / (img.rows*1.0)); 78 | // r = std::min(r, 1.0f); 79 | int unpad_w = r * img.cols; 80 | int unpad_h = r * img.rows; 81 | cv::Mat re(unpad_h, unpad_w, CV_8UC3); 82 | cv::resize(img, re, re.size()); 83 | cv::Mat out(INPUT_H, INPUT_W, CV_8UC3, cv::Scalar(114, 114, 114)); 84 | re.copyTo(out(cv::Rect(0, 0, re.cols, re.rows))); 85 | return out; 86 | } 87 | 88 | // float* YoloEnd2End::blobFromImage(cv::Mat& img){ 89 | // float* blob = new float[img.total()*3]; 90 | // // std::memcpy(blob, img.data, img.total() * 3* sizeof(float)); 91 | // int img_h = img.rows; 92 | // int img_w = img.cols; 93 | // int channelLength = img_w * img_h; 94 | // std::vector split_img = { 95 | // cv::Mat(img_h, img_w, CV_32FC1, blob + channelLength * 0), 96 | // cv::Mat(img_h, img_w, CV_32FC1, blob + channelLength * 1), 97 | // cv::Mat(img_h, img_w, CV_32FC1, blob + channelLength * 2) 98 | // }; 99 | // cv::split(img, split_img); 100 | // return blob; 101 | // } 102 | 103 | 104 | float* YoloEnd2End::blobFromImage(cv::Mat& img){ 105 | float* blob = new float[img.total()*3]; 106 | int channels = 3; 107 | int img_h = img.rows; 108 | int img_w = img.cols; 109 | for (size_t c = 0; c < channels; c++) 110 | { 111 | for (size_t h = 0; h < img_h; h++) 112 | { 113 | for (size_t w = 0; w < img_w; w++) 114 | { 115 | blob[c * img_w * img_h + h * img_w + w] = 116 | (float)img.at(h, w)[c]; 117 | } 118 | } 119 | } 120 | return blob; 121 | } 122 | 123 | YoloEnd2End::YoloEnd2End(const std::string model_path) { 124 | std::ifstream ifile(model_path, std::ios::in | std::ios::binary); 125 | if (!ifile) { 126 | std::cout << "read serialized file failed\n"; 127 | std::abort(); 128 | } 129 | 130 | ifile.seekg(0, std::ios::end); 131 | const int mdsize = ifile.tellg(); 132 | ifile.clear(); 133 | ifile.seekg(0, std::ios::beg); 134 | std::vector buf(mdsize); 135 | ifile.read(&buf[0], mdsize); 136 | ifile.close(); 137 | std::cout << "model size: " << mdsize << std::endl; 138 | 139 | runtime = nvinfer1::createInferRuntime(gLogger); 140 | initLibNvInferPlugins(&gLogger, ""); 141 
| engine = runtime->deserializeCudaEngine((void*)&buf[0], mdsize, nullptr); 142 | 143 | auto in_dims = engine->getTensorShape(INPUT_BLOB_NAME); 144 | // auto in_dims = engine->getBindingDimensions(engine->getBindingIndex("images")); 145 | iH = in_dims.d[2]; 146 | iW = in_dims.d[3]; 147 | in_size = 1; 148 | for (int j = 0; j < in_dims.nbDims; j++) { 149 | in_size *= in_dims.d[j]; 150 | } 151 | auto out_dims1 = engine->getTensorShape(OUTPUT_BLOB_NAME1); 152 | // auto out_dims1 = engine->getBindingDimensions(engine->getBindingIndex("output")); 153 | out_size1 = 1; 154 | for (int j = 0; j < out_dims1.nbDims; j++) { 155 | out_size1 *= out_dims1.d[j]; 156 | } 157 | auto out_dims2 = engine->getTensorShape(OUTPUT_BLOB_NAME2); 158 | // auto out_dims2 = engine->getBindingDimensions(engine->getBindingIndex("943")); 159 | out_size2 = 1; 160 | for (int j = 0; j < out_dims2.nbDims; j++) { 161 | out_size2 *= out_dims2.d[j]; 162 | } 163 | auto out_dims3 = engine->getTensorShape(OUTPUT_BLOB_NAME3); 164 | // auto out_dims3 = engine->getBindingDimensions(engine->getBindingIndex("944")); 165 | out_size3 = 1; 166 | for (int j = 0; j < out_dims3.nbDims; j++) { 167 | out_size3 *= out_dims3.d[j]; 168 | } 169 | auto out_dims4 = engine->getTensorShape(OUTPUT_BLOB_NAME4); 170 | // auto out_dims4 = engine->getBindingDimensions(engine->getBindingIndex("945")); 171 | out_size4 = 1; 172 | for (int j = 0; j < out_dims4.nbDims; j++) { 173 | out_size4 *= out_dims4.d[j]; 174 | } 175 | 176 | context = engine->createExecutionContext(); 177 | if (!context) { 178 | std::cout << "create execution context failed\n"; 179 | std::abort(); 180 | } 181 | 182 | CHECK(cudaMalloc(&buffs[0], in_size * sizeof(float))); 183 | CHECK(cudaMalloc(&buffs[1], out_size1 * sizeof(int))); 184 | CHECK(cudaMalloc(&buffs[2], out_size2 * sizeof(float))); 185 | CHECK(cudaMalloc(&buffs[3], out_size3 * sizeof(float))); 186 | CHECK(cudaMalloc(&buffs[4], out_size4 * sizeof(int))); 187 | CHECK(cudaStreamCreate(&stream)); 188 | } 189 | 190 | void YoloEnd2End::Infer(cv::Mat& img, float* Boxes, float* score, int* ClassIndexs, int* BboxNum) { 191 | 192 | cv::Mat pr_img; 193 | pr_img = this->static_resize(img); 194 | float* blob = this->blobFromImage(pr_img); 195 | float scale = std::min(INPUT_W / (img.cols*1.0), INPUT_H / (img.rows*1.0)); 196 | 197 | static int* num_dets = new int[out_size1]; 198 | static float* det_boxes = new float[out_size2]; 199 | static float* det_scores = new float[out_size3]; 200 | static int* det_classes = new int[out_size4]; 201 | 202 | CHECK(cudaMemcpyAsync(buffs[0], &blob[0], in_size * sizeof(float), cudaMemcpyHostToDevice, stream)); 203 | 204 | context->enqueueV2(&buffs[0], stream, nullptr); 205 | 206 | CHECK(cudaMemcpyAsync(num_dets, buffs[1], out_size1 * sizeof(int), cudaMemcpyDeviceToHost, stream)); 207 | CHECK(cudaMemcpyAsync(det_boxes, buffs[2], out_size2 * sizeof(float), cudaMemcpyDeviceToHost, stream)); 208 | CHECK(cudaMemcpyAsync(det_scores, buffs[3], out_size3 * sizeof(float), cudaMemcpyDeviceToHost, stream)); 209 | CHECK(cudaMemcpyAsync(det_classes, buffs[4], out_size4 * sizeof(int), cudaMemcpyDeviceToHost, stream)); 210 | 211 | BboxNum[0] = num_dets[0]; 212 | int img_w = img.cols; 213 | int img_h = img.rows; 214 | for (size_t i = 0; i < num_dets[0]; i++) { 215 | float x0 = (det_boxes[i * 4]) / scale; 216 | float y0 = (det_boxes[i * 4 + 1]) / scale; 217 | float w = (det_boxes[i * 4 + 2]) / scale; 218 | float h = (det_boxes[i * 4 + 3]) / scale; 219 | 220 | x0 = x0 - w/2.0; 221 | y0 = y0 - h/2.0; 222 | x0 = 
std::max(std::min(x0, (float)(img_w - 1)), 0.f); 223 | y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f); 224 | w = std::max(w, 0.f); 225 | h = std::max(h, 0.f); 226 | Boxes[i * 4] = x0; 227 | Boxes[i * 4 + 1] = y0; 228 | Boxes[i * 4 + 2] = w; 229 | Boxes[i * 4 + 3] = h; 230 | ClassIndexs[i] = det_classes[i]; 231 | score[i*4] = det_scores[i]; 232 | } 233 | delete blob; 234 | } 235 | 236 | void YoloEnd2End::draw_objects(const cv::Mat& img, float* Boxes, int* ClassIndexs, int* BboxNum) { 237 | cv::Mat image = img.clone(); 238 | for (int j = 0; j < BboxNum[0]; j++) { 239 | cv::Rect rect(Boxes[j * 4], Boxes[j * 4 + 1], Boxes[j * 4 + 2], Boxes[j * 4 + 3]); 240 | 241 | cv::Scalar color = cv::Scalar(color_list[ClassIndexs[j]][0], 242 | color_list[ClassIndexs[j]][1], 243 | color_list[ClassIndexs[j]][2]); 244 | 245 | cv::rectangle(image, rect, color * 255, 2); 246 | cv::putText( 247 | image, 248 | class_names[ClassIndexs[j]], 249 | cv::Point(rect.x, rect.y - 1), 250 | cv::FONT_HERSHEY_PLAIN, 251 | 1.2, 252 | color * 255, 253 | 2); 254 | cv::imwrite("result.jpg", image); 255 | } 256 | } 257 | 258 | YoloEnd2End::~YoloEnd2End() { 259 | std::cout<<"释放内存、显存"<destroy(); 271 | // engine->destroy(); 272 | // runtime->destroy(); 273 | } 274 | 275 | 276 | int main(int argc, char** argv) { 277 | cudaSetDevice(DEVICE); 278 | const std::string input_image_path = "../../../../2_faster_tensorrt/inference/1.jpg"; 279 | const std::string engine_file_path="../../../../2_faster_tensorrt/yolox_end2end.engine"; 280 | 281 | 282 | float* Boxes = new float[400]; 283 | float* Scores = new float[100]; 284 | int* BboxNum = new int[1]; 285 | int* ClassIndexs = new int[100]; 286 | YoloEnd2End yolo_end2end(engine_file_path); 287 | cv::Mat img; 288 | img = cv::imread(input_image_path); 289 | // warmup 290 | for (int num =0; num < 500; num++) { 291 | yolo_end2end.Infer(img, Boxes, Scores, ClassIndexs, BboxNum); 292 | } 293 | // inference 294 | auto start = std::chrono::system_clock::now(); 295 | for (int num = 0; num < 1000; num++) { 296 | yolo_end2end.Infer(img, Boxes, Scores, ClassIndexs, BboxNum); 297 | } 298 | auto end = std::chrono::system_clock::now(); 299 | std::cout << std::chrono::duration_cast(end - start).count() /1000.0<< "ms" << std::endl; 300 | 301 | // std::cout<