├── 1_trt_base ├── trt_yolox │ ├── readme.md │ ├── cpp │ │ ├── CMakeLists.txt │ │ └── yolox_end2end.cpp │ └── py │ │ ├── tools.py │ │ ├── trt.py │ │ └── trt_end2end.py ├── readme.md ├── trt_plugin │ ├── demo01 │ │ ├── CMakeLists.txt │ │ ├── demo01.h │ │ ├── test.py │ │ └── demo01.cu │ ├── 新建插件.md │ └── yolox_end2end │ │ ├── CMakeLists.txt │ │ └── end2end.md ├── trt_demo │ ├── trt_cpp │ │ ├── CMakeLists.txt │ │ └── main.cpp │ └── trt_py │ │ ├── model2onnx.py │ │ └── trt_python.py └── trt_rtdetr │ ├── readme.md │ ├── rtdetr_onnx.py │ └── rtdetr_trt.py ├── .gitmodules ├── 3_faster_ncnn ├── img │ └── 000026.jpg ├── readme.md ├── CMakeLists.txt └── src │ ├── apps │ ├── yolo │ │ ├── yolo.h │ │ └── yolo.cpp │ ├── common.h │ └── common.cpp │ ├── base │ ├── tools.hpp │ └── infer_base.hpp │ └── main.cpp ├── 2_faster_tensorrt ├── inference │ ├── 1.jpg │ └── 2.jpg ├── sources │ ├── ori.jpg │ ├── 2streamv1.jpg │ ├── 2streamv2.jpg │ ├── ori_queue.jpg │ └── 2steam_overview.jpg ├── src │ ├── eval │ │ ├── get_imgid_txt.py │ │ ├── save.hpp │ │ ├── eval.py │ │ └── eval.cpp │ ├── onnx_model │ │ ├── rtdetr_sim_export_trt.py │ │ └── v8onnx_tranpose.py │ ├── apps │ │ ├── rtdetr │ │ │ ├── rtdetr.h │ │ │ └── rtdetr.cpp │ │ ├── common.hpp │ │ └── yolo │ │ │ └── yolo.h │ ├── base │ │ ├── trt_base.hpp │ │ ├── monopoly_accocator.hpp │ │ ├── trt_base.cpp │ │ ├── tools.hpp │ │ ├── memory_tensor.hpp │ │ ├── infer_base.cpp │ │ └── infer_base.hpp │ └── kernels │ │ └── cuda_kernel.cuh ├── CMakeLists.txt └── readme.md ├── .gitignore ├── LICENSE ├── README.md └── .clang-format /1_trt_base/trt_yolox/readme.md: -------------------------------------------------------------------------------- 1 | yolox的推理。我没有记错的话focus被我换成了普通卷积更好导出和优化。 2 | 别的无非就是要注意后处理是不是要在head中增加,其实都是比较简单的。 3 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "4_faster_rknn"] 2 | path = 4_faster_rknn 3 | url = https://github.com/0zzx0/zzx_rknn.git 4 | -------------------------------------------------------------------------------- /3_faster_ncnn/img/000026.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0zzx0/faster_deployment/HEAD/3_faster_ncnn/img/000026.jpg -------------------------------------------------------------------------------- /2_faster_tensorrt/inference/1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0zzx0/faster_deployment/HEAD/2_faster_tensorrt/inference/1.jpg -------------------------------------------------------------------------------- /2_faster_tensorrt/inference/2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0zzx0/faster_deployment/HEAD/2_faster_tensorrt/inference/2.jpg -------------------------------------------------------------------------------- /2_faster_tensorrt/sources/ori.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0zzx0/faster_deployment/HEAD/2_faster_tensorrt/sources/ori.jpg -------------------------------------------------------------------------------- /2_faster_tensorrt/sources/2streamv1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0zzx0/faster_deployment/HEAD/2_faster_tensorrt/sources/2streamv1.jpg 
-------------------------------------------------------------------------------- /2_faster_tensorrt/sources/2streamv2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0zzx0/faster_deployment/HEAD/2_faster_tensorrt/sources/2streamv2.jpg -------------------------------------------------------------------------------- /2_faster_tensorrt/sources/ori_queue.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0zzx0/faster_deployment/HEAD/2_faster_tensorrt/sources/ori_queue.jpg -------------------------------------------------------------------------------- /2_faster_tensorrt/sources/2steam_overview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0zzx0/faster_deployment/HEAD/2_faster_tensorrt/sources/2steam_overview.jpg -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/ 2 | 3 | # c++编译 4 | **/build/* 5 | 6 | # 模型权重及转换中间文件 动态链接库文件 7 | *.engine 8 | *.onnx 9 | *.so 10 | 11 | *.bin 12 | *.param 13 | 14 | # 备份 15 | 2_faster_tensorrt/src/un_used_code.cpp 16 | 17 | # 本地more测试文件夹 18 | 5_yolov8/ 19 | 6_trt_more/ -------------------------------------------------------------------------------- /1_trt_base/readme.md: -------------------------------------------------------------------------------- 1 | # TensorRT 2 | 3 | > 测试平台: i9-9900K + 2080Ti + 32G + Ubuntu18.04 + cuda10.2 + cudnn8.7 + trt8.5.3 4 | 5 | 1. `trt_demo`: 使用python api和c++ api进行trt模型转换和推理的demo。 6 | 2. `trt_plugin`: trt增加自定义plugin的基本demo,以及yolox的nms过程采用trt nms plugin的使用方法。 7 | 3. `trt_yolox`: 采用python和c++ 推理yolox的demo。 8 | 4. 
`trt_rtdetr`: paddlepaddle版本rtdetr的trt转换和推理。 9 | -------------------------------------------------------------------------------- /2_faster_tensorrt/src/eval/get_imgid_txt.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | with open('/home/zzx/Experiment/Data/UTDAC2020/annotations/instances_val2017.json', 'r') as f: 4 | coco_data = json.load(f) 5 | 6 | # 获取标签信息 7 | images = coco_data['images'] 8 | print('图片数量:', len(images)) 9 | 10 | with open('img_id.txt', 'w') as f: 11 | for image in images: 12 | f.write(f"{image['id']} {image['file_name']}\n") -------------------------------------------------------------------------------- /3_faster_ncnn/readme.md: -------------------------------------------------------------------------------- 1 | # Faster_NCNN 2 | 3 | 本仓库在ncnn上的推理加速是在有限,由于我主要是在cpu上运行ncnn,所以预处理相对整个推理占比极小,吞吐量提速不是很明显。但是接口依然是十分简单易用,并且相对容易扩展。 4 | 5 | ## 模型转换 6 | 7 | ncnn模型转换可以参考[ncnn仓库](https://github.com/Tencent/ncnn) 8 | 9 | ## 模型推理 10 | 11 | ```cpp 12 | // 创建模型 13 | auto yolo = YoloNCNN::create_infer(param_path, model_path, confidence_threshold, nms_threshold); 14 | 15 | // 推理图片 16 | auto objs = yolo->commit(image); 17 | 18 | // 得到结果 19 | auto res = objs.get(); 20 | ``` 21 | 22 | 23 | -------------------------------------------------------------------------------- /2_faster_tensorrt/src/onnx_model/rtdetr_sim_export_trt.py: -------------------------------------------------------------------------------- 1 | import os 2 | import onnx_graphsurgeon as gs 3 | import onnx 4 | 5 | # paddle上的修改可以参考这位大佬的文章 6 | # https://zhuanlan.zhihu.com/p/623794029 7 | 8 | model = onnx.load("./rtdetr_r18vd_6x_coco.onnx") 9 | graph = gs.import_onnx(model) 10 | graph.outputs[0].name = "output" 11 | # print(graph.outputs) 12 | 13 | onnx.save(gs.export_onnx(graph), "rtdetr_r18vd_6x_coco_output.onnx") 14 | 15 | os.system("onnxsim rtdetr_r18vd_6x_coco_output.onnx rtdetr_r18vd_6x_coco_output_sim.onnx") 16 | 17 | os.system("trtexec --onnx=./rtdetr_r18vd_6x_coco_output_sim.onnx --workspace=4096 --shapes=image:1x3x640x640 --saveEngine=rtdetr_r18vd_6x_coco.trt --fp16") 18 | 19 | -------------------------------------------------------------------------------- /1_trt_base/trt_plugin/demo01/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.10) 2 | 3 | set(CMAKE_CXX_STANDARD 11) 4 | set(CMAKE_BUILD_TYPE Debug) 5 | 6 | project(trt_cpp LANGUAGES CXX CUDA) 7 | 8 | # tensorrt 9 | include_directories(/home/zzx/Github/zzx_yolo/EXTRA_PKG/TensorRT-8.5.3.1/include) 10 | link_directories(/home/zzx/Github/zzx_yolo/EXTRA_PKG/TensorRT-8.5.3.1/lib) 11 | 12 | # add CUDA 13 | find_package(CUDA REQUIRED) 14 | message("CUDA_LIBRARIES:${CUDA_LIBRARIES}") 15 | message("CUDA_INCLUDE_DIRS:${CUDA_INCLUDE_DIRS}") 16 | include_directories(${CUDA_INCLUDE_DIRS}) 17 | 18 | 19 | SET(LIBHELLO_SRC demo01.cu demo01.h cookbookHelper.cuh) 20 | ADD_LIBRARY(demo01 SHARED ${LIBHELLO_SRC}) 21 | 22 | # link 23 | target_link_libraries(demo01 nvinfer ${CUDA_LIBRARIES}) -------------------------------------------------------------------------------- /1_trt_base/trt_demo/trt_cpp/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.10) 2 | 3 | set(CMAKE_CXX_STANDARD 11) 4 | set(CMAKE_BUILD_TYPE Debug) 5 | 6 | project(trt_cpp LANGUAGES CXX CUDA) 7 | 8 | # tensorrt 9 | include_directories(/home/zzx/Github/zzx_yolo/EXTRA_PKG/TensorRT-8.5.3.1/include) 
10 | link_directories(/home/zzx/Github/zzx_yolo/EXTRA_PKG/TensorRT-8.5.3.1/lib) 11 | 12 | # add CUDA 13 | find_package(CUDA REQUIRED) 14 | message("CUDA_LIBRARIES:${CUDA_LIBRARIES}") 15 | message("CUDA_INCLUDE_DIRS:${CUDA_INCLUDE_DIRS}") 16 | include_directories(${CUDA_INCLUDE_DIRS}) 17 | 18 | add_executable(main main.cpp cookbookHelper.cuh) 19 | # link 20 | # target_link_libraries(main ${LIBS} ${CUDA_LIBRARIES}) 21 | target_link_libraries(main nvinfer nvonnxparser ${CUDA_LIBRARIES}) 22 | -------------------------------------------------------------------------------- /3_faster_ncnn/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.0) 2 | 3 | project(yolo_ncnn) 4 | 5 | set(CMAKE_CXX_STANDARD 11) 6 | set(CMAKE_BUILD_TYPE Debug) 7 | 8 | find_package(OpenCV REQUIRED) 9 | find_package(ncnn REQUIRED) 10 | 11 | include_directories(${OpenCV_INCLUDE_DIRS}) 12 | include_directories(${PROJECT_SOURCE_DIR}/src) 13 | 14 | include_directories(/home/zzx/Github/zzx_yolo/EXTRA_PKG/ncnn/build/install/include/ncnn) 15 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -O0 -Wfatal-errors -pthread -w -g") 16 | 17 | add_executable(yolox_ncnn 18 | ${PROJECT_SOURCE_DIR}/src/main.cpp 19 | ${PROJECT_SOURCE_DIR}/src/apps/common.cpp 20 | ${PROJECT_SOURCE_DIR}/src/apps/yolo/yolo.cpp 21 | ) 22 | target_link_libraries(yolox_ncnn ${OpenCV_LIBS} ncnn pthread) 23 | -------------------------------------------------------------------------------- /2_faster_tensorrt/src/eval/save.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | class SaveResult { 8 | public: 9 | SaveResult(std::string &filename) { 10 | out.open(filename); 11 | } 12 | ~SaveResult() { 13 | if(out.is_open()) { 14 | out.close(); 15 | } 16 | } 17 | 18 | void save_one_line(std::string &img_name, std::string & image_id, int category_id, float score, std::vector &result) { 19 | if(out.is_open()) { 20 | out << img_name << " "<< image_id << " "<< category_id << " " << score << " "; 21 | for(auto &i : result) { 22 | out << i << " "; 23 | } 24 | out << "\n"; 25 | } 26 | } 27 | 28 | private: 29 | std::ofstream out; 30 | }; 31 | 32 | 33 | -------------------------------------------------------------------------------- /2_faster_tensorrt/src/onnx_model/v8onnx_tranpose.py: -------------------------------------------------------------------------------- 1 | import onnx 2 | import numpy as np 3 | import onnx_graphsurgeon as gs 4 | 5 | 6 | """ 7 | pip install nvidia-pyindex 8 | pip install onnx-graphsurgeon 9 | 10 | """ 11 | 12 | model = onnx.load("./onnx_weights/v8n.onnx") 13 | graph = gs.import_onnx(model) 14 | 15 | # graph.outputs[0].name = "output" 16 | old_shape = graph.outputs[0].shape 17 | output_tensort = gs.Variable("output", graph.outputs[0].dtype, [old_shape[0], old_shape[2], old_shape[1]] ) 18 | 19 | graph.nodes[-1].outputs[0].name = "oldoutput" 20 | 21 | reshape_node = gs.Node( 22 | op="Transpose", 23 | name="outputtranspose", 24 | inputs=[graph.nodes[-1].outputs[0]], 25 | outputs=[output_tensort], 26 | attrs={"perm": [0, 2, 1]} 27 | ) 28 | 29 | # print(type(graph.nodes)) # list 30 | graph.nodes.append(reshape_node) 31 | 32 | 33 | graph.outputs = reshape_node.outputs 34 | for node in graph.outputs: 35 | print(node) 36 | 37 | 38 | graph.cleanup().toposort() 39 | onnx.save(gs.export_onnx(graph), "v8_transpose.onnx") 40 | 41 | 
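# ---------------------------------------------------------------------------
# 补充示意(非原脚本内容,仅供参考):转换完成后简单验证一下新输出是否符合预期。
# 假设环境里装了 onnxruntime,且模型输入为 1x3x640x640,与实际模型不符请自行调整。
import onnxruntime as ort

sess = ort.InferenceSession("v8_transpose.onnx", providers=["CPUExecutionProvider"])
inp = sess.get_inputs()[0]
out_meta = sess.get_outputs()[0]
print("input :", inp.name, inp.shape)
print("output:", out_meta.name, out_meta.shape)  # 期望为 output, [1, N, 4+类别数]

dummy = np.random.rand(1, 3, 640, 640).astype(np.float32)  # np 在文件开头已导入
res = sess.run(None, {inp.name: dummy})[0]
print("runtime output shape:", res.shape)  # 最后一维是 4+类别数, 说明 Transpose 已生效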
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 zzx_ncepu_bit 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /1_trt_base/trt_rtdetr/readme.md: -------------------------------------------------------------------------------- 1 | # RT-DETR的tensorrt转换 2 | 3 | 百度家的这个新模型是真不错,尤其是出了r18的可以类比yolo系列的s模型了。 4 | 5 | ## paddle infer 6 | ```shell 7 | python tools/infer.py -c configs/rtdetr/rtdetr_r18vd_6x_coco.yml \ 8 | -o weights=0zzx/rtdetr_r18vd_dec3_6x_coco.pdparams \ 9 | --infer_img=./demo/000000570688.jpg 10 | ``` 11 | 12 | 13 | ## paddle onnx 14 | paddlepaddle-gpu需要大于2.4.1要不报错。 15 | 首先需要先导出 16 | ```shell 17 | python tools/export_model.py -c configs/rtdetr/rtdetr_r18vd_6x_coco.yml \ 18 | -o weights=rtdetr_r18vd_6x_coco.pdparams trt=True \ 19 | --output_dir=output_inference 20 | ``` 21 | 然后转成onnx 22 | ```shell 23 | paddle2onnx --model_dir=rtdetr_r18vd_6x_coco \ 24 | --model_filename model.pdmodel \ 25 | --params_filename model.pdiparams \ 26 | --opset_version 16 \ 27 | --save_file rtdetr_r18vd_6x_coco.onnx 28 | ``` 29 | 30 | ## trt转换 31 | ```shell 32 | trtexec --onnx=./rtdetr_r18vd_6x_coco.onnx \ 33 | --workspace=4096 \ 34 | --shapes=image:1x3x640x640 \ 35 | --saveEngine=rtdetr_r18vd_6x_coco.trt \ 36 | --avgRuns=100 \ 37 | --fp16 38 | ``` -------------------------------------------------------------------------------- /1_trt_base/trt_plugin/新建插件.md: -------------------------------------------------------------------------------- 1 | # Tensorrt 插件 2 | 3 | 4 | ## 从registry加载plugin 5 | 6 | ```c++ 7 | // 从注册器根据名字和版本找到需要的plugin 8 | auto creator = getPluginRegistry()->getPluginCreator(pluginName, pluginVersion); 9 | const PluginFieldCollection* pluginFC = creator->getFieldNames(); 10 | 11 | // Populate the fields parameters for the plugin layer 12 | // PluginFieldCollection *pluginData = parseAndFillFields(pluginFC, layerFields); 13 | 14 | // 使用layer和data创建对象 15 | IPluginV2 *pluginObj = creator->createPlugin(layerName, pluginData); // 内部有new 申请 必须destory 16 | 17 | // 增加这一层到网络里面 18 | auto layer = network.addPluginV2(&inputs[0], int(inputs.size()), pluginObj); 19 | … (build rest of the network and serialize engine) 20 | // Destroy the plugin object 21 | pluginObj->destroy() 22 | … (free allocated pluginData) 23 | ``` 24 | 
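python 侧的流程类似:先用 ctypes 把编译好的插件动态库加载进来,再从 registry 拿到 creator 创建插件。下面是一个最小示意(以 demo01 的 `ZZX_ADDScalar` 为例;`./build/libdemo01.so` 的路径和 `scalar` 字段名是按 CMakeLists 和 demo01.h 推测的,实际以 creator 的实现为准):

```python
import ctypes
import numpy as np
import tensorrt as trt

# 加载自定义插件的动态库(路径为示意,按实际编译产物修改)
ctypes.CDLL("./build/libdemo01.so")

logger = trt.Logger(trt.Logger.ERROR)
trt.init_libnvinfer_plugins(logger, "")  # 注册内置插件和已加载的自定义插件

# 根据名字和版本找到 creator,然后带参数创建 plugin 对象
creator = trt.get_plugin_registry().get_plugin_creator("ZZX_ADDScalar", "1", "")
field = trt.PluginField("scalar", np.array([1.0], dtype=np.float32), trt.PluginFieldType.FLOAT32)
plugin = creator.create_plugin("zzx_add_scalar", trt.PluginFieldCollection([field]))

# 之后与上面 C++ 的写法一样,用 network.add_plugin_v2([input_tensor], plugin) 把它加到网络里
```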
25 | 26 | 从一个parser解析出来的模型,加载插件,ONNX解析器会自动尝试将无法识别的节点作为插件导入。如果在插件注册表中找到与节点具有相同op_type的插件,则解析器将节点的属性作为插件字段参数转发给插件创建者,以便创建插件。默认情况下,解析器使用`1`作为插件版本,使用`""`作为插件命名空间。可以通过在相应的ONNX节点中设置`plugin_version`和`plugin_namespace`字符串属性来覆盖此行为。 27 | 28 | 29 | 30 | 31 | ## 自定义一个plugin 32 | 33 | https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#extending 34 | -------------------------------------------------------------------------------- /2_faster_tensorrt/src/eval/eval.py: -------------------------------------------------------------------------------- 1 | from pycocotools.coco import COCO 2 | from pycocotools.cocoeval import COCOeval 3 | import numpy as np 4 | import json 5 | 6 | 7 | def get_coco_from_txt(txtfile, json_file, clsid2catid): 8 | 9 | dataset_res = [] 10 | 11 | with open(txtfile, 'r') as f: 12 | datas = f.readlines() 13 | # print(len(datas)) 14 | 15 | for data in datas: 16 | info = data.split(" ")[:-1] 17 | result = {} 18 | result["image_id"] = int(info[1]) 19 | result["category_id"] = clsid2catid[int(info[2])] 20 | result["bbox"] = [int(info[4]), int(info[5]), int(info[6]), int(info[7])] 21 | result["score"] = float(info[3]) 22 | dataset_res.append(result) 23 | 24 | with open(json_file, "w") as f: 25 | json.dump(dataset_res, f) 26 | print("json 保存成功") 27 | 28 | 29 | annFile = "/home/zzx/Experiment/Data/UTDAC2020/annotations/instances_val2017.json" 30 | resFile = "./results.txt" 31 | resJson = 'eval_results.json' 32 | 33 | cocoGt=COCO(annFile) 34 | clsid2catid = cocoGt.getCatIds() 35 | 36 | get_coco_from_txt(resFile, resJson, clsid2catid) 37 | cocoDt = cocoGt.loadRes(resJson) 38 | 39 | cocoEval = COCOeval(cocoGt, cocoDt, 'bbox') 40 | cocoEval.params.imgIds = cocoGt.getImgIds() 41 | cocoEval.evaluate() 42 | cocoEval.accumulate() 43 | cocoEval.summarize() 44 | 45 | 46 | -------------------------------------------------------------------------------- /1_trt_base/trt_plugin/yolox_end2end/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.0) 2 | 3 | project(yolox) 4 | 5 | add_definitions(-std=c++11) 6 | 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 8 | set(CMAKE_CXX_STANDARD 11) 9 | set(CMAKE_BUILD_TYPE Debug) 10 | 11 | find_package(CUDA REQUIRED) 12 | 13 | include_directories(${PROJECT_SOURCE_DIR}/include) 14 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different 15 | # cuda 16 | include_directories(/usr/local/cuda-10.2/include) 17 | link_directories(/usr/local/cuda-10.2/lib64) 18 | # cudnn 19 | include_directories(/home/zzx/Github/zzx_yolo/EXTRA_PKG/cudnn8.7.0.84_cuda10/include) 20 | link_directories(/home/zzx/Github/zzx_yolo/EXTRA_PKG/cudnn8.7.0.84_cuda10/lib64) 21 | # tensorrt 22 | include_directories(/home/zzx/Github/zzx_yolo/EXTRA_PKG/TensorRT-8.5.3.1/include) 23 | link_directories(/home/zzx/Github/zzx_yolo/EXTRA_PKG/TensorRT-8.5.3.1/lib) 24 | 25 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED") 26 | 27 | find_package(OpenCV) 28 | include_directories(${OpenCV_INCLUDE_DIRS}) 29 | 30 | add_executable(yolox_end2end ${PROJECT_SOURCE_DIR}/yolox_end2end.cpp) 31 | target_link_libraries(yolox_end2end nvinfer nvinfer_plugin) 32 | target_link_libraries(yolox_end2end cudart) 33 | target_link_libraries(yolox_end2end ${OpenCV_LIBS}) 34 | 35 | add_definitions(-O2 -pthread) 36 | 37 | -------------------------------------------------------------------------------- /3_faster_ncnn/src/apps/yolo/yolo.h: 
-------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "opencv2/opencv.hpp" 4 | 5 | #include "../common.h" 6 | #include "../../base/infer_base.hpp" 7 | #include "../../base/tools.hpp" 8 | 9 | 10 | namespace YoloNCNN{ 11 | 12 | using namespace FasterNCNN; 13 | 14 | 15 | using Infer = InferBase>; 16 | using Det = DetBase>; 17 | // 推理 18 | class InferImpl : public Infer, Det{ 19 | 20 | public: 21 | 22 | 23 | bool startup(const std::string ¶m_path, 24 | const std::string &model_path, 25 | float confidence, float iou_thr); 26 | virtual void worker(std::promise &pro) override; 27 | virtual bool preprocess(Job &job, const cv::Mat &input) override; 28 | 29 | virtual std::shared_future> commit(const cv::Mat &input) override; 30 | 31 | private: 32 | 33 | std::string param_path_; 34 | std::string model_path_; 35 | float confidence_; 36 | float iou_thr_; 37 | 38 | std::shared_ptr postprocess_; 39 | std::vector results_; 40 | 41 | int infer_thread_ = 8; 42 | int class_num = 4; 43 | 44 | }; 45 | 46 | 47 | 48 | std::shared_ptr create_infer(const std::string ¶m_path, 49 | const std::string &model_path, 50 | float confidence, 51 | float iou_thr 52 | ); 53 | 54 | } 55 | -------------------------------------------------------------------------------- /1_trt_base/trt_yolox/cpp/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.0) 2 | 3 | project(yolox) 4 | 5 | add_definitions(-std=c++11) 6 | 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 8 | set(CMAKE_CXX_STANDARD 11) 9 | set(CMAKE_BUILD_TYPE Debug) 10 | 11 | find_package(CUDA REQUIRED) 12 | 13 | include_directories(${PROJECT_SOURCE_DIR}/include) 14 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different 15 | # cuda 16 | include_directories(/usr/local/cuda-10.2/include) 17 | link_directories(/usr/local/cuda-10.2/lib64) 18 | # cudnn 19 | include_directories(/home/zzx/Github/zzx_yolo/EXTRA_PKG/cudnn8.7.0.84_cuda10/include) 20 | link_directories(/home/zzx/Github/zzx_yolo/EXTRA_PKG/cudnn8.7.0.84_cuda10/lib64) 21 | # tensorrt 22 | include_directories(/home/zzx/Github/zzx_yolo/EXTRA_PKG/TensorRT-8.5.3.1/include) 23 | link_directories(/home/zzx/Github/zzx_yolo/EXTRA_PKG/TensorRT-8.5.3.1/lib) 24 | 25 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED") 26 | 27 | find_package(OpenCV) 28 | include_directories(${OpenCV_INCLUDE_DIRS}) 29 | 30 | add_executable(yolox_end2end ${PROJECT_SOURCE_DIR}/yolox_end2end.cpp) 31 | target_link_libraries(yolox_end2end nvinfer nvinfer_plugin) 32 | target_link_libraries(yolox_end2end cudart) 33 | target_link_libraries(yolox_end2end ${OpenCV_LIBS}) 34 | 35 | add_executable(yolox ${PROJECT_SOURCE_DIR}/yolox.cpp) 36 | target_link_libraries(yolox nvinfer) 37 | target_link_libraries(yolox cudart) 38 | target_link_libraries(yolox ${OpenCV_LIBS}) 39 | 40 | add_definitions(-O2 -pthread) 41 | 42 | -------------------------------------------------------------------------------- /3_faster_ncnn/src/apps/common.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include "net.h" 7 | 8 | #include "../base/tools.hpp" 9 | 10 | namespace FasterNCNN { 11 | 12 | 13 | // bboxes 14 | struct ObjBox{ 15 | 16 | float GetWidth() { return (x2 - x1); }; 17 | float GetHeight() { return (y2 - y1); }; 18 | float area() { return GetWidth() * 
GetHeight(); }; 19 | 20 | int x1; 21 | int y1; 22 | int x2; 23 | int y2; 24 | 25 | int category; 26 | float score; 27 | }; 28 | 29 | struct GridAndStride{ 30 | int grid0; 31 | int grid1; 32 | int stride; 33 | }; 34 | 35 | 36 | static float InterSectionArea(const ObjBox &a, const ObjBox &b); 37 | static bool ScoreSort(ObjBox a, ObjBox b); 38 | static void nms(std::vector &src_boxes, std::vector &dst_boxes, float threshold); 39 | 40 | 41 | class postProcess { 42 | 43 | public: 44 | enum class postProcessType : int{ 45 | 46 | yolox = 0, 47 | yolov8 = 1, 48 | }; 49 | 50 | 51 | public: 52 | postProcess(postProcessType type, float input_h, float input_w, float conf_thr, float nms_thr); 53 | ~postProcess() { }; 54 | 55 | void forward(ncnn::Mat &output_); 56 | void yolox_generate_grids_and_stride(); 57 | void yolox_decode(ncnn::Mat &output_); 58 | 59 | 60 | protected: 61 | int input_h_; 62 | int input_w_; 63 | float conf_thr_; 64 | float nms_thr_; 65 | 66 | // std::vector out_boxes; 67 | // std::vector nms_boxes; 68 | 69 | const std::vector strides{8, 16, 32}; 70 | std::vector grid_strides; 71 | 72 | public: 73 | std::vector out_boxes; 74 | std::vector nms_boxes; 75 | 76 | }; 77 | 78 | } 79 | -------------------------------------------------------------------------------- /2_faster_tensorrt/src/apps/rtdetr/rtdetr.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file rtdetr.h 3 | * @author 0zzx0 4 | * @brief RTDETR推理 5 | * @version 0.1 6 | * @date 2023-08-21 7 | * 8 | * @copyright Copyright (c) 2023 9 | * 10 | */ 11 | 12 | #ifndef RTDETR_H 13 | #define RTDETR_H 14 | 15 | #include "../common.hpp" 16 | 17 | namespace RTDETR { 18 | using namespace FasterTRT; 19 | 20 | // 线程安全模板类设置模板类型 21 | using ThreadSafedAsyncInferImpl = 22 | ThreadSafedAsyncInfer, // start param 25 | AffineMatrix // additional 26 | >; 27 | using Infer = InferBase; 28 | 29 | /** 30 | * @brief 推理类的实现,继承必备父类,重写父类方法 31 | * 32 | */ 33 | class RtDetrTRTInferImpl : public Infer, public ThreadSafedAsyncInferImpl { 34 | public: 35 | ~RtDetrTRTInferImpl(); 36 | 37 | virtual bool startup(const std::string &file, int gpuid, int batch_size, 38 | float confidence_threshold); 39 | virtual void worker(std::promise &result) override; 40 | virtual bool preprocess(Job &job, const cv::Mat &image) override; 41 | 42 | virtual std::vector> commits( 43 | const std::vector &images) override; 44 | virtual std::shared_future commit(const cv::Mat &image) override; 45 | 46 | private: 47 | int input_width_ = 0; 48 | int input_height_ = 0; 49 | int gpu_ = 0; 50 | float confidence_threshold_ = 0; 51 | cudaStream_t stream_ = nullptr; 52 | cudaStream_t stream_pro_ = nullptr; 53 | Norm normalize_; 54 | int batch_size_ = 1; 55 | }; 56 | 57 | // 创建推理器 58 | std::shared_ptr create_infer(const std::string &engine_file, int gpuid, int batch_size, 59 | float confidence_threshold = 0.2f); 60 | 61 | } // namespace RTDETR 62 | 63 | #endif 64 | -------------------------------------------------------------------------------- /2_faster_tensorrt/src/eval/eval.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "../apps/yolo/yolo.h" 7 | #include "save.hpp" 8 | 9 | using namespace std; 10 | 11 | const string base_path = "/home/zzx/Experiment/Data/UTDAC2020/val2017/"; 12 | YOLO::YoloType type = YOLO::YoloType::X; 13 | const string model_file = "../yolox_b16.engine"; 14 | const int deviceid = 0; 15 | 16 | const float 
confidence_threshold = 0.5f; 17 | const float nms_threshold = 0.65f; 18 | 19 | int main() { 20 | int batch_size = 1; 21 | YOLO::set_device(deviceid); 22 | 23 | auto yolo = YOLO::create_infer(model_file, type, deviceid, batch_size, confidence_threshold, 24 | nms_threshold); 25 | 26 | ifstream img_id("../src/eval/img_id.txt"); 27 | vector all_id; 28 | vector all_img; 29 | 30 | while(!img_id.eof()) { 31 | string id; 32 | string name; 33 | img_id >> id; 34 | img_id >> name; 35 | if(id.size() == 0) break; 36 | 37 | all_id.push_back(id); 38 | all_img.push_back(name); 39 | // cout << id << " " << name << endl; 40 | } 41 | img_id.close(); 42 | 43 | string resfile_name = "../src/eval/results.txt"; 44 | SaveResult resfile(resfile_name); 45 | 46 | assert(all_id.size() == all_img.size()); 47 | 48 | for(int i = 0; i < all_id.size(); i++) { 49 | string cur_img = base_path + all_img[i]; 50 | auto image = cv::imread(cur_img); 51 | auto objs = yolo->commit(image); 52 | auto res = objs.get(); 53 | for(auto& one : res) { 54 | int x = one.left; 55 | int y = one.top; 56 | int w = one.right - one.left; 57 | int h = one.bottom - one.top; 58 | vector xywh{x, y, w, h}; 59 | // cout << one.left << one.right << one.bottom << one.top << endl; 60 | resfile.save_one_line(all_img[i], all_id[i], one.class_label, one.confidence, xywh); 61 | } 62 | } 63 | return 0; 64 | } -------------------------------------------------------------------------------- /2_faster_tensorrt/src/apps/common.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file common.h 3 | * @author 0zzx0 4 | * @brief 5 | * @version 0.1 6 | * @date 2023-08-21 7 | * 8 | * @copyright Copyright (c) 2023 9 | * 10 | */ 11 | 12 | #ifndef COMMON_H 13 | #define COMMON_H 14 | 15 | #include 16 | #include 17 | #include 18 | 19 | #include 20 | #include 21 | #include 22 | 23 | #include "../base/tools.hpp" 24 | #include "../base/trt_base.hpp" 25 | #include "../base/infer_base.hpp" 26 | #include "../base/memory_tensor.hpp" 27 | #include "../base/monopoly_accocator.hpp" 28 | 29 | namespace FasterTRT { 30 | 31 | // 推理结果格式 32 | struct Box { 33 | float left, top, right, bottom, confidence; 34 | int class_label; 35 | 36 | Box() = default; 37 | Box(float left, float top, float right, float bottom, float confidence, int class_label) 38 | : left(left), 39 | top(top), 40 | right(right), 41 | bottom(bottom), 42 | confidence(confidence), 43 | class_label(class_label) {} 44 | }; 45 | typedef std::vector BoxArray; 46 | 47 | // 仿射变换矩阵 48 | struct AffineMatrix { 49 | float i2d[6]; // image to dst(network), 2x3 matrix 50 | float d2i[6]; // dst to image, 2x3 matrix 51 | 52 | void compute(const cv::Size &from, const cv::Size &to) { 53 | float scale_x = to.width / (float)from.width; 54 | float scale_y = to.height / (float)from.height; 55 | float scale = std::min(scale_x, scale_y); 56 | i2d[0] = scale; 57 | i2d[1] = 0; 58 | i2d[2] = -scale * from.width * 0.5 + to.width * 0.5 + scale * 0.5 - 0.5; 59 | i2d[3] = 0; 60 | i2d[4] = scale; 61 | i2d[5] = -scale * from.height * 0.5 + to.height * 0.5 + scale * 0.5 - 0.5; 62 | 63 | cv::Mat m2x3_i2d(2, 3, CV_32F, i2d); 64 | cv::Mat m2x3_d2i(2, 3, CV_32F, d2i); 65 | cv::invertAffineTransform(m2x3_i2d, m2x3_d2i); 66 | } 67 | 68 | cv::Mat i2d_mat() { return cv::Mat(2, 3, CV_32F, i2d); } 69 | }; 70 | 71 | } // namespace FasterTRT 72 | 73 | #endif -------------------------------------------------------------------------------- /2_faster_tensorrt/src/base/trt_base.hpp: 
-------------------------------------------------------------------------------- 1 | /** 2 | * @file trt_base.hpp 3 | * @author 0zzx0 4 | * @brief trt base 5 | * @version 0.1 6 | * @date 2023-6-11 2023-8-21 7 | * 8 | * @copyright Copyright (c) 2023 9 | * 10 | */ 11 | 12 | #ifndef TRT_BASE_H 13 | #define TRT_BASE_H 14 | 15 | #include 16 | 17 | #include "memory_tensor.hpp" 18 | #include "monopoly_accocator.hpp" 19 | #include "infer_base.hpp" 20 | 21 | namespace FasterTRT { 22 | 23 | // 推理数据类型 24 | enum class Mode : int { FP32, FP16, INT8 }; 25 | const char* mode_string(Mode type); 26 | 27 | ////////////////////量化用的/////////////////////////// 28 | typedef std::function& files, 29 | std::shared_ptr& tensor)> 30 | Int8Process; 31 | 32 | /** 33 | * @brief int8 量化 未测试 34 | * 35 | */ 36 | class Int8EntropyCalibrator : public IInt8EntropyCalibrator2 { 37 | public: 38 | Int8EntropyCalibrator(const std::vector& imagefiles, nvinfer1::Dims dims, 39 | const Int8Process& preprocess); 40 | Int8EntropyCalibrator(const std::vector& entropyCalibratorData, nvinfer1::Dims dims, 41 | const Int8Process& preprocess); 42 | virtual ~Int8EntropyCalibrator(); 43 | 44 | int getBatchSize() const noexcept; 45 | bool next(); 46 | bool getBatch(void* bindings[], const char* names[], int nbBindings) noexcept; 47 | 48 | const std::vector& getEntropyCalibratorData(); 49 | const void* readCalibrationCache(size_t& length) noexcept; 50 | virtual void writeCalibrationCache(const void* cache, size_t length) noexcept; 51 | 52 | private: 53 | Int8Process preprocess_; 54 | std::vector allimgs_; 55 | size_t batchCudaSize_ = 0; 56 | int cursor_ = 0; 57 | nvinfer1::Dims dims_; 58 | std::vector files_; 59 | std::shared_ptr tensor_; 60 | std::vector entropyCalibratorData_; 61 | bool fromCalibratorData_ = false; 62 | cudaStream_t stream_ = nullptr; 63 | }; 64 | 65 | // 检索目录下的所有图像:"*.jpg;*.png;*.bmp;*.jpeg;*.tiff" 66 | std::vector glob_image_files(const std::string& directory); 67 | 68 | } // namespace FasterTRT 69 | 70 | #endif 71 | -------------------------------------------------------------------------------- /1_trt_base/trt_demo/trt_py/model2onnx.py: -------------------------------------------------------------------------------- 1 | import tensorrt as trt 2 | import torch 3 | import torch.nn as nn 4 | import onnx 5 | 6 | class MyModule(nn.Module): 7 | def __init__(self) -> None: 8 | super().__init__() 9 | self.conv = nn.Conv2d(in_channels=3,out_channels=3,kernel_size=3,stride=1,padding=1) 10 | self.bn = nn.BatchNorm2d(3) 11 | self.act = nn.ReLU(inplace=True) 12 | self.pool = nn.MaxPool2d(2, 2) 13 | 14 | def forward(self, x: torch.Tensor) -> torch.Tensor: 15 | x = self.act(self.bn(self.conv(x))) 16 | return self.pool(x) 17 | 18 | 19 | device = torch.device('cuda:0') 20 | onnx_model_name = '../files/model.onnx' 21 | torch.onnx.export(MyModule(), 22 | torch.randn(1, 3, 224, 224), 23 | onnx_model_name, 24 | input_names=['input'], 25 | output_names=['output'], 26 | opset_version=11) 27 | 28 | 29 | def ger_engine(): 30 | torch.onnx.export(MyModule(), torch.randn(1, 3, 112, 112), onnx_model_name, input_names=['input'], 31 | output_names=['output'], opset_version=11) 32 | 33 | onnx_model = onnx.load(onnx_model_name) 34 | 35 | logger = trt.Logger(trt.Logger.ERROR) 36 | builder = trt.Builder(logger) 37 | # EXPLICIT_BATCH 显式batch 38 | EXPLICIT_BATCH = 1 << (int)( 39 | trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) 40 | network = builder.create_network(EXPLICIT_BATCH) # 创建network 41 | 42 | parser = trt.OnnxParser(network, logger) # 解析onnx 43 
| 44 | if not parser.parse(onnx_model.SerializePartialToString()): 45 | error_mags = ' ' 46 | for error in range(parser.num_errors): 47 | error_mags += error 48 | raise RuntimeError(f"解析失败辣: {error_mags}") 49 | 50 | config = builder.create_builder_config() 51 | config.max_workspace_size = 1 << 20 52 | profile = builder.create_optimization_profile() 53 | 54 | profile.set_shape('input', [1,3 ,112 ,112],[1,3 ,112 ,112],[1,3 ,112 ,112]) 55 | config.add_optimization_profile(profile) 56 | # create engine 57 | with torch.cuda.device(device): 58 | engine = builder.build_engine(network, config) 59 | 60 | with open('model.engine', mode='wb') as f: 61 | f.write(bytearray(engine.serialize())) 62 | print("generating file done!") 63 | 64 | 65 | -------------------------------------------------------------------------------- /2_faster_tensorrt/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.0) 2 | 3 | project(yolo_trt) 4 | 5 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 6 | set(CMAKE_CXX_STANDARD 11) 7 | set(CMAKE_BUILD_TYPE Debug) 8 | 9 | find_package(CUDA REQUIRED) 10 | find_package(OpenCV REQUIRED) 11 | 12 | include_directories(${OpenCV_INCLUDE_DIRS}) 13 | include_directories(${PROJECT_SOURCE_DIR}/src) 14 | 15 | # cuda 16 | include_directories(/usr/local/cuda-10.2/include) 17 | link_directories(/usr/local/cuda-10.2/lib64) 18 | # cudnn 19 | include_directories(/home/zzx/Github/zzx_yolo/EXTRA_PKG/cudnn8.7.0.84_cuda10/include) 20 | link_directories(/home/zzx/Github/zzx_yolo/EXTRA_PKG/cudnn8.7.0.84_cuda10/lib) 21 | # tensorrt 22 | include_directories(/home/zzx/Github/zzx_yolo/EXTRA_PKG/TensorRT-8.5.3.1/include) 23 | link_directories(/home/zzx/Github/zzx_yolo/EXTRA_PKG/TensorRT-8.5.3.1/lib) 24 | 25 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -O0 -Wfatal-errors -pthread -w -g") 26 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -std=c++11 -O0 -Xcompiler -fPIC -g -w ${CUDA_GEN_CODE}") 27 | 28 | file(GLOB_RECURSE cuda_srcs ${PROJECT_SOURCE_DIR}/src/kernels/*.cu) 29 | 30 | cuda_add_library(cuda_kernels SHARED ${cuda_srcs}) 31 | target_link_libraries(cuda_kernels cuda cudart) 32 | target_link_libraries(cuda_kernels ${OpenCV_LIBS}) 33 | 34 | add_executable(yolo 35 | ${PROJECT_SOURCE_DIR}/src/main.cpp 36 | ${PROJECT_SOURCE_DIR}/src/base/infer_base.cpp 37 | ${PROJECT_SOURCE_DIR}/src/base/memory_tensor.cpp 38 | ${PROJECT_SOURCE_DIR}/src/base/trt_base.cpp 39 | ${PROJECT_SOURCE_DIR}/src/apps/yolo/yolo.cpp 40 | ${PROJECT_SOURCE_DIR}/src/apps/rtdetr/rtdetr.cpp 41 | ) 42 | target_link_libraries(yolo cuda_kernels) 43 | target_link_libraries(yolo nvinfer nvinfer_plugin nvonnxparser) 44 | target_link_libraries(yolo cuda cublas cudart cudnn) 45 | target_link_libraries(yolo pthread) 46 | target_link_libraries(yolo ${OpenCV_LIBS}) 47 | 48 | 49 | add_executable(eval 50 | ${PROJECT_SOURCE_DIR}/src/eval/eval.cpp 51 | ${PROJECT_SOURCE_DIR}/src/eval/save.hpp 52 | ${PROJECT_SOURCE_DIR}/src/base/infer_base.cpp 53 | ${PROJECT_SOURCE_DIR}/src/base/memory_tensor.cpp 54 | ${PROJECT_SOURCE_DIR}/src/base/trt_base.cpp 55 | ${PROJECT_SOURCE_DIR}/src/apps/yolo/yolo.cpp 56 | ) 57 | target_link_libraries(eval cuda_kernels) 58 | target_link_libraries(eval nvinfer nvinfer_plugin nvonnxparser) 59 | target_link_libraries(eval cuda cublas cudart cudnn) 60 | target_link_libraries(eval pthread) 61 | target_link_libraries(eval ${OpenCV_LIBS}) 62 | -------------------------------------------------------------------------------- 
/3_faster_ncnn/src/apps/yolo/yolo.cpp: -------------------------------------------------------------------------------- 1 | #include "yolo.h" 2 | 3 | 4 | namespace YoloNCNN{ 5 | 6 | 7 | bool InferImpl::startup(const std::string ¶m_path, const std::string &model_path, float confidence, float iou_thr){ 8 | param_path_ = param_path; 9 | model_path_ = model_path; 10 | confidence_ = confidence; 11 | iou_thr_ = iou_thr; 12 | 13 | // 等待线程创建和里面的初始化完成 14 | return Det::startup(); 15 | } 16 | 17 | void InferImpl::worker(std::promise &pro){ 18 | 19 | input_w_ = 640; 20 | input_h_ = 640; 21 | input_name_ = "images"; 22 | output_name_ = "output"; 23 | 24 | net_.load_param(param_path_.c_str()); 25 | net_.load_model(model_path_.c_str()); 26 | postprocess_ = std::make_shared(postProcess::postProcessType::yolox, input_h_, input_w_, confidence_, iou_thr_); 27 | 28 | INFO("ncnn模型加载成功! "); 29 | 30 | pro.set_value(true); // satrtup 函数结束 31 | 32 | // std::vector fetch_jobs; 33 | Job fetch_job; 34 | while(get_job_and_wait(fetch_job)){ 35 | 36 | input_ = fetch_job.input; 37 | forward(); 38 | postprocess_->forward(output_); 39 | fetch_job.pro->set_value(postprocess_->nms_boxes); 40 | } 41 | 42 | INFO("推理结束!"); 43 | } 44 | 45 | 46 | std::shared_future> InferImpl::commit(const cv::Mat &input){ 47 | return Det::commit(input); 48 | } 49 | 50 | 51 | bool InferImpl::preprocess(Job &job, const cv::Mat &input) { 52 | int img_w = input.cols; 53 | int img_h = input.rows; 54 | 55 | int w = img_w; 56 | int h = img_h; 57 | float scale = 1.f; 58 | if (w > h){ 59 | scale = (float)input_w_ / w; 60 | w = input_w_; 61 | h = h * scale; 62 | } else{ 63 | scale = (float)input_h_ / h; 64 | h = input_h_; 65 | w = w * scale; 66 | } 67 | ncnn::Mat in = ncnn::Mat::from_pixels_resize(input.data, ncnn::Mat::PIXEL_BGR, img_w, img_h, w, h); 68 | 69 | // pad to YOLOX_TARGET_SIZE rectangle 70 | int wpad = input_w_ - w; 71 | int hpad = input_h_ - h; 72 | 73 | ncnn::copy_make_border(in, job.input, 0, hpad, 0, wpad, ncnn::BORDER_CONSTANT, 114.f); 74 | // input_.substract_mean_normalize(mean_vals_, norm_vals_); 75 | return true; 76 | 77 | } 78 | 79 | 80 | std::shared_ptr create_infer(const std::string ¶m_path, const std::string &model_path, float confidence, float iou_thr){ 81 | std::shared_ptr instance = std::make_shared(); 82 | if(!instance->startup(param_path, model_path, confidence, iou_thr)){ 83 | instance.reset(); 84 | } 85 | return instance; // 创建子类对象 返回父类指针,这样实现封着。外部只能调用commit 86 | } 87 | 88 | 89 | } //end namespace -------------------------------------------------------------------------------- /3_faster_ncnn/src/base/tools.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | namespace FasterNCNN { 12 | 13 | /* 14 | logger 15 | */ 16 | enum class LogLevel : int{ 17 | Debug = 5, 18 | Verbose = 4, 19 | Info = 3, 20 | Warning = 2, 21 | Error = 1, 22 | Fatal = 0 23 | }; 24 | 25 | 26 | static const char* level_string(LogLevel level); 27 | static void __log_func(const char* file, int line, LogLevel level, const char* fmt, ...); 28 | static std::string file_name(const std::string& path, bool include_suffix); 29 | 30 | 31 | /* 修改这个level来实现修改日志输出级别 */ 32 | #define CURRENT_LOG_LEVEL LogLevel::Info 33 | #define INFOD(...) __log_func(__FILE__, __LINE__, LogLevel::Debug, __VA_ARGS__) 34 | #define INFOV(...) __log_func(__FILE__, __LINE__, LogLevel::Verbose, __VA_ARGS__) 35 | #define INFO(...) 
__log_func(__FILE__, __LINE__, LogLevel::Info, __VA_ARGS__) 36 | #define INFOW(...) __log_func(__FILE__, __LINE__, LogLevel::Warning, __VA_ARGS__) 37 | #define INFOE(...) __log_func(__FILE__, __LINE__, LogLevel::Error, __VA_ARGS__) 38 | #define INFOF(...) __log_func(__FILE__, __LINE__, LogLevel::Fatal, __VA_ARGS__) 39 | 40 | static const char* level_string(LogLevel level){ 41 | switch (level){ 42 | case LogLevel::Debug: return "debug"; 43 | case LogLevel::Verbose: return "verbo"; 44 | case LogLevel::Info: return "info"; 45 | case LogLevel::Warning: return "warn"; 46 | case LogLevel::Error: return "error"; 47 | case LogLevel::Fatal: return "fatal"; 48 | default: return "unknow"; 49 | } 50 | } 51 | 52 | static void __log_func(const char* file, int line, LogLevel level, const char* fmt, ...){ 53 | 54 | if(level > CURRENT_LOG_LEVEL) 55 | return; 56 | 57 | va_list vl; 58 | va_start(vl, fmt); 59 | 60 | char buffer[2048]; 61 | std::string filename = file_name(file, true); 62 | int n = snprintf(buffer, sizeof(buffer), "[%s][%s:%d]:", level_string(level), filename.c_str(), line); 63 | vsnprintf(buffer + n, sizeof(buffer) - n, fmt, vl); 64 | 65 | fprintf(stdout, "%s\n", buffer); 66 | if (level == LogLevel::Fatal) { 67 | fflush(stdout); 68 | abort(); 69 | } 70 | } 71 | 72 | static std::string file_name(const std::string& path, bool include_suffix){ 73 | 74 | if (path.empty()) return ""; 75 | int p = path.rfind('/'); 76 | p += 1; 77 | 78 | //include suffix 79 | if (include_suffix) 80 | return path.substr(p); 81 | 82 | int u = path.rfind('.'); 83 | if (u == -1) 84 | return path.substr(p); 85 | 86 | if (u <= p) u = path.size(); 87 | return path.substr(p, u - p); 88 | } 89 | 90 | 91 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Faster Deployment 2 | 3 | > 作者本人能力和想法都十分有限,确实可能很多情况没有想到,欢迎大家讨论! 4 | 5 | 本仓库主要是针对深度学习模型的TensorRT、ncnn、rknn等的后端推理框架部署工作,有较好的接口便捷性和推理性能。当前主要主要应用在单目机器人,所以benchmark一般设置`batch=1`,采用单幅图片连续输入或者单视频流输入的方式进行测试,模拟实际情况。 6 | 7 | 目前建议优先使用faster_tensorrt,因为这是本仓库主要的提升方向,支持的算法最多,后续更新也会更快。另外两个也以也会慢慢更新。 8 | 9 | - 1_trt_base: 主要是tensorrt的基础操作,包括模型转换、推理、构建插件以及一些运行和优化的demo 10 | - 2_faster_tensorrt: 主要是tensorrt的封装和优化 11 | - 3_faster_ncnn: 参考`2_faster_tensort`封装ncnn推理过程 12 | - 4_faser_rknn: 封装rknn的推理过程 13 | 14 | ## 0 致谢 15 | 16 | 首先需要感谢手写ai团队开源的[TensorRT_Pro](https://github.com/shouxieai/tensorRT_Pro),让我受益良多,本仓库中tensorrt的代码也均是在其基础上进行优化,以及按照该仓库的整体思路优化ncnn和rknn的推理。 17 | 18 | 25 | 26 | ## 1 当前支持 27 | 28 | ### 1.1 faster_tensorrt 29 | 30 | #### 目标检测 31 | 32 | - [x] yolox 33 | - [x] yolov8 34 | - [x] rtdetr 35 | 36 | #### 单目深度估计 37 | 38 | - [ ] [lite-mono](https://github.com/noahzn/Lite-Mono) 39 | 40 | 41 | 42 | 43 | 44 | ### 1.2 faster_ncnn 45 | 46 | #### 目标检测 47 | - [x] yolox 48 | - [ ] yolov8 49 | 50 | 51 | ### 1.3 faster_rknn 52 | 53 | #### 目标检测 54 | - [x] yolox 55 | - [ ] yolov8 56 | 57 | 58 | ## 2 问题&分析 59 | 60 | 我们首先应该思考在实际机器人的视频流推理上需要的是什么? 61 | 62 | 明确前提:此时的模型已经充分优化过了,包括剪枝、量化之类的操作以及后端推理器已经对模型算子进行了自动或手动的融合、量化、图优化等优化。*简而言之,可以认为单幅图像的纯inference时间是不可能再缩短了。* 63 | 64 | **🔥高吞吐和低延迟!!!** 65 | 66 | > 本文中两个词的意义 :
67 | > **延迟**:图片从诞生到推理完成需要的时间。
68 | > **吞吐**:相等时间内处理图片的数量。
69 | > 发现一个更好的解释,来自trt的文档[性能评估](https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#measure-performance) 70 | 71 | 72 | 推理的的目标肯定是吞吐量特别大,同时延迟超级小,在一定条件下,其实这两个是互斥的。这也是表明程序性能(软件性能+硬件性能)两种方式,吞吐量代表并行能力,延迟代表串行效果。 73 | > 吞吐和延迟互斥是因为,在服务器侧部署时,为了提高吞吐量,往往会先缓存一些数据,组成batch进行推理(也是tensorrt_pro的策略)。实现在相同时间内,更多图像的推理,同时毫无疑问这在并行能力高的GPU设备上是十分有效的,可以极大提升吞吐量(显存和算力足够情况下可以轻松提升10倍以上)。但是,这对于每一帧图像来说,它从输入到输出的延迟就会提高了!这对实际机器人🤖来说是显然无法接受的,因为控制的核心还是反馈,当信号量频率低或者不稳定时,对于机器人的控制和决策来说难度很大。 74 | 75 | 需要注意,单幅图像的延迟,在这个层面是无法缩短的哦!因为一幅图像到推理器后端模型推理步骤的时候,必须经过`预处理->推理->后处理`三个步骤,这三个步骤的时间耗时在本阶段是无法缩小的(这是在之前优化网络结构和模型转换的时候考虑的),本项目是希望在不增加单图延迟的基础上,尽量高的提升模型吞吐量,也就是尽量重合、去掉一些无用、重复、耗时的操作,对于机器人来说就是可以拥有更高频率的目标位置信息等。 76 | 77 | 78 | 另外采用多线程或者线程池推理,对机器人处理图像带来的问题是:在输入图像帧率很高的时候,无法保证图像按照输入顺序输出,这对于机器人这种需要根据目标前后运动状态进行决策的智能体来说是有问题的,可能会导致误判。不过我认为如果可以实现这是一个跨越量级的提高吞吐量的方法,当然前提是不增加延迟并且输出有序,我也会继续尝试。 79 | 80 | ## 3 实现 81 | 82 | 具体的实现过程在这里[2_faster_tensorrt](./2_faster_tensorrt/readme.md),有完整的代码解释、模型推理接口、增加模型方法等的说明。 83 | 84 | 85 | ## 4 总结 86 | 87 | 那本仓库要做的是什么? 88 | 89 | 1. 首先暂时抛弃了多batch,因为当前使用的机器人不需要多输入。 90 | 2. 设置任务队列(超过则阻塞,尽量不增加延迟),可以根据模型预处理和推理耗时手动调整,保证最优的吞吐和延迟。 91 | 3. 尽量在可加速的硬件上执行预处理和后处理。 92 | 93 | 94 | 95 | ## other 96 | 97 | >类似百度的fastdeploy、mmdeploy等部署仓库都是有好有坏。首先它们对自家框架都支持的比较好比如paddledetection, mmlab系列的模型仓库等,但是缺点就是现在后端框架的api的更新可能比较快,有时可能无法用到最新的版本和接口,而且开源仓库的维护成本确实比较大,而且人员有限,所以他们可能更新的会稍微慢一点(我在工作确定之后也会参与到相应的开源项目中去,添砖加瓦)。不过另一方面不愧是大厂,这些代码仓库整体设计思路和实验确实都是非常好,可扩展性都贼拉好(相对本仓库),后期我也会慢慢学习学习,来优化本仓库。 98 | 99 | [FastDeploy](https://github.com/PaddlePaddle/FastDeploy) 100 | 101 | [mmdeploy](https://github.com/open-mmlab/mmdeploy) 102 | 103 | 104 | 105 | -------------------------------------------------------------------------------- /1_trt_base/trt_plugin/demo01/demo01.h: -------------------------------------------------------------------------------- 1 | #include "cookbookHelper.cuh" 2 | 3 | namespace 4 | { 5 | static const char *PLUGIN_NAME {"ZZX_ADDScalar"}; 6 | static const char *PLUGIN_VERSION {"1"}; 7 | } 8 | 9 | namespace nvinfer1 10 | { 11 | class ZZX_ADDScalar : public IPluginV2DynamicExt 12 | { 13 | private: 14 | const std::string name_; 15 | std::string namespace_; 16 | struct 17 | { 18 | float scalar; 19 | } m_; 20 | 21 | public: 22 | ZZX_ADDScalar() = delete; // 删除默认构造函数 23 | ZZX_ADDScalar(const std::string &name, float scalar); 24 | ZZX_ADDScalar(const std::string &name, const void *buffer, size_t length); 25 | ~ZZX_ADDScalar(); 26 | 27 | // 继承自IPluginV2的方法 28 | const char *getPluginType() const noexcept override; 29 | const char *getPluginVersion() const noexcept override; 30 | int32_t getNbOutputs() const noexcept override; 31 | int32_t initialize() noexcept override; 32 | void terminate() noexcept override; 33 | size_t getSerializationSize() const noexcept override; 34 | void serialize(void *buffer) const noexcept override; 35 | void destroy() noexcept override; 36 | void setPluginNamespace(const char *pluginNamespace) noexcept override; 37 | const char *getPluginNamespace() const noexcept override; 38 | 39 | // 继承自IPluginV2Ext的方法 40 | DataType getOutputDataType(int32_t index, DataType const *inputTypes, int32_t nbInputs) const noexcept override; 41 | void attachToContext(cudnnContext *contextCudnn, cublasContext *contextCublas, IGpuAllocator *gpuAllocator) noexcept override; 42 | void detachFromContext() noexcept override; 43 | 44 | // 继承自IPluginV2DynamicExt的方法 45 | IPluginV2DynamicExt *clone() const noexcept override; 46 | DimsExprs getOutputDimensions(int32_t outputIndex, const DimsExprs *inputs, int32_t nbInputs, IExprBuilder 
&exprBuilder) noexcept override; 47 | bool supportsFormatCombination(int32_t pos, const PluginTensorDesc *inOut, int32_t nbInputs, int32_t nbOutputs) noexcept override; 48 | void configurePlugin(const DynamicPluginTensorDesc *in, int32_t nbInputs, const DynamicPluginTensorDesc *out, int32_t nbOutputs) noexcept override; 49 | size_t getWorkspaceSize(const PluginTensorDesc *inputs, int32_t nbInputs, const PluginTensorDesc *outputs, int32_t nbOutputs) const noexcept override; 50 | int32_t enqueue(const PluginTensorDesc *inputDesc, const PluginTensorDesc *outputDesc, const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream) noexcept override; 51 | 52 | protected: 53 | // 防止一些编译警告 54 | using nvinfer1::IPluginV2::enqueue; 55 | using nvinfer1::IPluginV2::getOutputDimensions; 56 | using nvinfer1::IPluginV2::getWorkspaceSize; 57 | using nvinfer1::IPluginV2Ext::configurePlugin; 58 | }; 59 | 60 | 61 | class ZZXAddScalarPluginCreator : public IPluginCreator 62 | { 63 | private: 64 | static PluginFieldCollection fc_; 65 | static std::vector attr_; 66 | std::string namespace_; 67 | public: 68 | ZZXAddScalarPluginCreator(); 69 | ~ZZXAddScalarPluginCreator(); 70 | const char * getPluginName() const noexcept override; 71 | const char * getPluginVersion() const noexcept override; 72 | const PluginFieldCollection *getFieldNames() noexcept override; 73 | IPluginV2DynamicExt * createPlugin(const char *name, const PluginFieldCollection *fc) noexcept override; 74 | IPluginV2DynamicExt * deserializePlugin(const char *name, const void *serialData, size_t serialLength) noexcept override; 75 | void setPluginNamespace(const char *pluginNamespace) noexcept override; 76 | const char * getPluginNamespace() const noexcept override; 77 | 78 | }; 79 | 80 | } 81 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | # google风格改 3 | 4 | Language: Cpp 5 | # BasedOnStyle: Google 6 | AccessModifierOffset: -4 7 | AlignAfterOpenBracket: Align 8 | AlignConsecutiveAssignments: false 9 | AlignConsecutiveDeclarations: false 10 | AlignEscapedNewlines: Left 11 | AlignOperands: true 12 | AlignTrailingComments: true 13 | AllowAllParametersOfDeclarationOnNextLine: true 14 | AllowShortBlocksOnASingleLine: false 15 | AllowShortCaseLabelsOnASingleLine: false 16 | AllowShortFunctionsOnASingleLine: Inline 17 | AllowShortIfStatementsOnASingleLine: true 18 | AllowShortLoopsOnASingleLine: true 19 | AlwaysBreakAfterDefinitionReturnType: None 20 | AlwaysBreakAfterReturnType: None 21 | AlwaysBreakBeforeMultilineStrings: true 22 | AlwaysBreakTemplateDeclarations: true 23 | BinPackArguments: true 24 | BinPackParameters: true 25 | BraceWrapping: 26 | AfterClass: false 27 | AfterControlStatement: false 28 | AfterEnum: false 29 | AfterFunction: false 30 | AfterNamespace: false 31 | AfterObjCDeclaration: false 32 | AfterStruct: false 33 | AfterUnion: false 34 | AfterExternBlock: false 35 | BeforeCatch: false 36 | BeforeElse: false 37 | IndentBraces: false 38 | SplitEmptyFunction: true 39 | SplitEmptyRecord: true 40 | SplitEmptyNamespace: true 41 | BreakBeforeBinaryOperators: None 42 | BreakBeforeBraces: Attach 43 | BreakBeforeInheritanceComma: false 44 | BreakBeforeTernaryOperators: true 45 | BreakConstructorInitializersBeforeComma: false 46 | BreakConstructorInitializers: BeforeColon 47 | BreakAfterJavaFieldAnnotations: false 48 | BreakStringLiterals: true 49 | ColumnLimit: 100 50 | 
CommentPragmas: '^ IWYU pragma:' 51 | CompactNamespaces: false 52 | ConstructorInitializerAllOnOneLineOrOnePerLine: true 53 | ConstructorInitializerIndentWidth: 4 54 | ContinuationIndentWidth: 4 55 | Cpp11BracedListStyle: true 56 | DerivePointerAlignment: true 57 | DisableFormat: false 58 | ExperimentalAutoDetectBinPacking: false 59 | FixNamespaceComments: true 60 | ForEachMacros: 61 | - foreach 62 | - Q_FOREACH 63 | - BOOST_FOREACH 64 | IncludeBlocks: Preserve 65 | IncludeCategories: 66 | - Regex: '^' 67 | Priority: 2 68 | - Regex: '^<.*\.h>' 69 | Priority: 1 70 | - Regex: '^<.*' 71 | Priority: 2 72 | - Regex: '.*' 73 | Priority: 3 74 | IncludeIsMainRegex: '([-_](test|unittest))?$' 75 | IndentCaseLabels: true 76 | IndentPPDirectives: None 77 | IndentWidth: 4 78 | IndentWrappedFunctionNames: false 79 | JavaScriptQuotes: Leave 80 | JavaScriptWrapImports: true 81 | KeepEmptyLinesAtTheStartOfBlocks: false 82 | MacroBlockBegin: '' 83 | MacroBlockEnd: '' 84 | MaxEmptyLinesToKeep: 1 85 | NamespaceIndentation: None 86 | ObjCBlockIndentWidth: 2 87 | ObjCSpaceAfterProperty: false 88 | ObjCSpaceBeforeProtocolList: false 89 | PenaltyBreakAssignment: 2 90 | PenaltyBreakBeforeFirstCallParameter: 1 91 | PenaltyBreakComment: 300 92 | PenaltyBreakFirstLessLess: 120 93 | PenaltyBreakString: 1000 94 | PenaltyExcessCharacter: 1000000 95 | PenaltyReturnTypeOnItsOwnLine: 200 96 | PointerAlignment: Right 97 | RawStringFormats: 98 | - Delimiter: pb 99 | Language: TextProto 100 | BasedOnStyle: google 101 | ReflowComments: true 102 | SortIncludes: false 103 | SortUsingDeclarations: false 104 | SpaceAfterCStyleCast: false 105 | SpaceAfterTemplateKeyword: true 106 | SpaceBeforeAssignmentOperators: true 107 | SpaceBeforeParens: Never # ControlStatements 108 | SpaceInEmptyParentheses: false 109 | SpacesBeforeTrailingComments: 2 110 | SpacesInAngles: false 111 | SpacesInContainerLiterals: true 112 | SpacesInCStyleCastParentheses: false 113 | SpacesInParentheses: false 114 | SpacesInSquareBrackets: false 115 | Standard: Cpp11 116 | TabWidth: 4 117 | UseTab: Never 118 | ... 
119 | 120 | -------------------------------------------------------------------------------- /3_faster_ncnn/src/base/infer_base.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include 12 | #include "net.h" 13 | 14 | #include "tools.hpp" 15 | 16 | namespace FasterNCNN{ 17 | 18 | 19 | template 20 | class DetBase { 21 | 22 | public: 23 | struct Job{ 24 | ncnn::Mat input; 25 | Output output; 26 | std::shared_ptr> pro; 27 | }; 28 | 29 | 30 | virtual ~DetBase() {stop();}; 31 | void stop() { 32 | run_ = false; 33 | cond_.notify_all(); 34 | 35 | /// cleanup jobs 36 | { 37 | std::unique_lock l(jobs_lock_); 38 | while(!jobs_.empty()){ 39 | auto& item = jobs_.front(); 40 | if(item.pro) 41 | item.pro->set_value(Output()); 42 | jobs_.pop(); 43 | } 44 | }; 45 | 46 | if(worker_){ 47 | worker_->join(); 48 | worker_.reset(); 49 | } 50 | } 51 | 52 | // 启动 初始化线程 用一个promise等待worker中的初始化结束 53 | bool startup() { 54 | run_ = true; 55 | 56 | std::promise pro; 57 | worker_ = std::make_shared(&DetBase::worker, this, std::ref(pro)); 58 | return pro.get_future().get(); 59 | } 60 | 61 | // 工作线程(纯虚) 62 | virtual void worker(std::promise& result) = 0; 63 | // 预处理(纯虚) 64 | virtual bool preprocess(Job& job, const Input& input) = 0; 65 | 66 | 67 | 68 | virtual void forward() { 69 | auto ex = net_.create_extractor(); 70 | // INFO("inputname: %s", input_name); 71 | // INFO("outputname: %s", output_name); 72 | // if(ncnn_use_vulkan_compute_) ex.set_vulkan_compute(true); 73 | ex.set_num_threads(ncnn_num_threads_); 74 | ex.input(input_name_.c_str(), input_); 75 | ex.extract(output_name_.c_str(), output_); 76 | } 77 | virtual std::shared_future commit(const Input& input) { 78 | Job job; 79 | job.pro = std::make_shared>(); 80 | if(!preprocess(job, input)){ 81 | job.pro->set_value(Output()); 82 | return job.pro->get_future(); 83 | } 84 | 85 | ////////////////////上锁并且推进队列//////////////////////////// 86 | { 87 | std::unique_lock l(jobs_lock_); 88 | // jobs_.push(job); 89 | jobs_.emplace(job); 90 | }; 91 | cond_.notify_one(); 92 | return job.pro->get_future(); 93 | } 94 | 95 | // 获取任务 等待之前的任务执行完毕 96 | virtual bool get_job_and_wait(Job& fetch_job) { 97 | std::unique_lock l(jobs_lock_); 98 | cond_.wait(l, [&](){ 99 | return !run_ || !jobs_.empty(); 100 | }); 101 | 102 | if(!run_) return false; 103 | 104 | fetch_job = std::move(jobs_.front()); 105 | jobs_.pop(); 106 | return true; 107 | } 108 | 109 | 110 | protected: 111 | // ncnn 112 | ncnn::Net net_; 113 | int input_w_; 114 | int input_h_; 115 | std::string input_name_; 116 | std::string output_name_; 117 | ncnn::Mat input_; 118 | ncnn::Mat output_; 119 | int ncnn_num_threads_ = 8; 120 | bool ncnn_use_vulkan_compute_; 121 | 122 | // multi threads 123 | std::atomic run_; 124 | std::mutex jobs_lock_; 125 | std::queue jobs_; 126 | std::shared_ptr worker_; 127 | std::condition_variable cond_; 128 | 129 | }; 130 | 131 | // 接口类 需要重写里面的纯虚函数 132 | template 133 | class InferBase{ 134 | public: 135 | virtual std::shared_future commit(const Input &input) = 0; 136 | }; 137 | 138 | 139 | } 140 | 141 | -------------------------------------------------------------------------------- /2_faster_tensorrt/src/apps/yolo/yolo.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file yolo.h 3 | * @author 0zzx0 4 | * @brief 重写改进,集成高性能yolo推理接口 5 | * @version 1.0 6 | * @date 2023-6-11 7 | * 8 | * 
@copyright Copyright (c) 2023 9 | * 10 | */ 11 | 12 | #ifndef YOLO_HPP 13 | #define YOLO_HPP 14 | 15 | #include "../common.hpp" 16 | 17 | namespace YOLO { 18 | using namespace FasterTRT; 19 | 20 | // 模型选择 21 | enum class YoloType : int { V5 = 0, X = 1, V8 = 2 }; 22 | 23 | // 模型名字 24 | const char *type_name(YoloType type); 25 | 26 | /** 27 | * @brief Decode配置的实现 28 | * 不同模型输出的decode一般都不一样,即使是yolo系列也有一些区别, 29 | * 这里需要实现不同模型的decode。尤其是anchor base 和anchor free的区别 30 | */ 31 | struct DecodeMeta { 32 | int num_anchor; 33 | int num_level; 34 | float w[16], h[16]; 35 | int strides[16]; 36 | 37 | // static DecodeMeta v5_p5_default_meta(); 38 | static DecodeMeta x_default_meta(); 39 | static DecodeMeta v8_default_meta(); 40 | }; 41 | 42 | // 线程安全模板类设置模板类型 43 | using ThreadSafedAsyncInferImpl = 44 | ThreadSafedAsyncInfer, // start param 47 | AffineMatrix // additional 48 | >; 49 | using Infer = InferBase; 50 | 51 | /** 52 | * @brief 推理类的实现,继承必备父类,重写父类方法 53 | * 54 | */ 55 | class YoloTRTInferImpl : public Infer, public ThreadSafedAsyncInferImpl { 56 | public: 57 | // 析构 调用来自基类ThreadSafedAsyncInferImpl的stop函数 58 | ~YoloTRTInferImpl(); 59 | 60 | virtual bool startup(const std::string &file, YoloType type, int gpuid, int batch_size, 61 | float confidence_threshold, float nms_threshold, 62 | bool is_use_trtnNMSPlugin = false); 63 | virtual void worker(std::promise &result) override; 64 | virtual bool preprocess(Job &job, const cv::Mat &image) override; 65 | 66 | virtual std::vector> commits( 67 | const std::vector &images) override; 68 | virtual std::shared_future commit(const cv::Mat &image) override; 69 | 70 | void init_yolox_prior_box(Tensor &prior_box); 71 | void init_yolov8_prior_box(Tensor &prior_box); 72 | void init_yolov5_prior_box(Tensor &prior_box); 73 | 74 | private: 75 | int input_width_ = 0; 76 | int input_height_ = 0; 77 | int gpu_ = 0; 78 | float confidence_threshold_ = 0; 79 | float nms_threshold_ = 0; 80 | cudaStream_t stream_ = nullptr; 81 | cudaStream_t stream_pro_ = nullptr; 82 | Norm normalize_; 83 | YoloType type_; 84 | DecodeMeta meta_; 85 | int batch_size_ = 1; 86 | bool is_use_trtnNMSPlugin_ = false; 87 | }; 88 | 89 | /* 90 | trt模型编译(不过我实际建议直接用trtexec转换,嘻嘻0_0) 91 | max max_batch_size:为最大可以允许的batch数量 92 | source_onnx_file:onnx文件 93 | save_engine_file:储存的tensorRT模型 94 | max_workspace_size:最大工作空间大小,一般给1GB,在嵌入式可以改为256MB,单位是byte 95 | int8 images 96 | folder:对于Mode为INT8时,需要提供图像数据进行标定,请提供文件夹,会自动检索下面的jpg/jpeg/tiff/png/bmp 97 | int8_entropy_calibrator_cache_file:对于int8模式下,熵文件可以缓存,避免二次加载数据,可以跨平台使用,是一个txt文件 98 | */ 99 | bool compile(Mode mode, YoloType type, unsigned int max_batch_size, 100 | const std::string &source_onnx_file, const std::string &save_engine_file, 101 | size_t max_workspace_size = 1 << 30, const std::string &int8_images_folder = "", 102 | const std::string &int8_entropy_calibrator_cache_file = ""); 103 | 104 | // image转成tensor 105 | void image_to_tensor(const cv::Mat &image, std::shared_ptr &tensor, YoloType type, 106 | int ibatch); 107 | 108 | // 创建推理器 109 | std::shared_ptr create_infer(const std::string &engine_file, YoloType type, int gpuid, 110 | int batch_size, float confidence_threshold = 0.2f, 111 | float nms_threshold = 0.5f); 112 | 113 | }; // end namespace YOLO 114 | 115 | #endif -------------------------------------------------------------------------------- /2_faster_tensorrt/src/base/monopoly_accocator.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file monopoly_accocator.hpp 3 | * @author 0zzx0 4 | * @brief 
独占分配器 5 | * @version 0.1 6 | * @date 2023-6-11 2023-8-21 7 | * 8 | * @copyright Copyright (c) 2023 9 | * 10 | */ 11 | 12 | #ifndef MONOPOLY_ALLOCATOR_HPP 13 | #define MONOPOLY_ALLOCATOR_HPP 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | namespace FasterTRT { 21 | ///////////////////////////class MonopolyAllocator/////////////////////////// 22 | /* 独占分配器 23 | 通过对tensor做独占管理,具有max_batch * 2个tensor,通过query获取一个tensor 24 | 当推理结束后,该tensor释放使用权,即可交给下一个图像使用,内存实现复用 25 | 26 | * 1. tensor复用 27 | * 2. tensor的预处理和推理并行 28 | * 29 | * 输入图像时,具有2倍batch的空间进行预处理用于缓存 30 | * 引擎推理时,每次拿1个batch的数据进行推理 31 | * 当引擎推理速度慢而预处理速度快时,输入图像需要进行等候。 32 | **/ 33 | template 34 | class MonopolyAllocator { 35 | public: 36 | /* MonopolyData是数据容器类 37 | 允许query获取的item执行item->release释放自身所有权,该对象可以被复用 38 | 通过item->data()获取储存的对象的指针 39 | */ 40 | class MonopolyData { 41 | public: 42 | std::shared_ptr<_ItemType>& data() { return data_; } 43 | void release() { manager_->release_one(this); } 44 | 45 | private: 46 | MonopolyData(MonopolyAllocator* pmanager) { manager_ = pmanager; } 47 | 48 | private: 49 | friend class MonopolyAllocator; 50 | MonopolyAllocator* manager_ = nullptr; 51 | std::shared_ptr<_ItemType> data_; 52 | bool available_ = true; 53 | }; 54 | typedef std::shared_ptr MonopolyDataPointer; 55 | 56 | // 构造函数 初始化尺寸 57 | MonopolyAllocator(int size) { 58 | capacity_ = size; 59 | num_available_ = size; 60 | datas_.resize(size); 61 | 62 | for(int i = 0; i < size; ++i) 63 | datas_[i] = std::shared_ptr(new MonopolyData(this)); 64 | } 65 | 66 | // 析构 67 | virtual ~MonopolyAllocator() { 68 | run_ = false; 69 | cv_.notify_all(); 70 | 71 | std::unique_lock l(lock_); 72 | cv_exit_.wait(l, [&]() { return num_wait_thread_ == 0; }); 73 | } 74 | /* 获取一个可用的对象 75 | timeout:超时时间,如果没有可用的对象,将会进入阻塞等待,如果等待超时则返回空指针 76 | 请求得到一个对象后,该对象被占用,除非他执行了release释放该对象所有权 77 | */ 78 | MonopolyDataPointer query(int timeout = 10000) { 79 | std::unique_lock l(lock_); 80 | if(!run_) return nullptr; 81 | 82 | if(num_available_ == 0) { 83 | num_wait_thread_++; 84 | 85 | auto state = cv_.wait_for(l, std::chrono::milliseconds(timeout), 86 | [&]() { return num_available_ > 0 || !run_; }); 87 | 88 | num_wait_thread_--; 89 | cv_exit_.notify_one(); 90 | 91 | // timeout, no available, exit program 92 | if(!state || num_available_ == 0 || !run_) return nullptr; 93 | } 94 | 95 | auto item = std::find_if(datas_.begin(), datas_.end(), 96 | [](MonopolyDataPointer& item) { return item->available_; }); 97 | if(item == datas_.end()) return nullptr; 98 | 99 | (*item)->available_ = false; 100 | num_available_--; 101 | return *item; 102 | } 103 | 104 | // 有效数量 105 | int num_available() { return num_available_; } 106 | 107 | // 空间大小 108 | int capacity() { return capacity_; } 109 | 110 | private: 111 | // 释放一个对象的所有权 112 | void release_one(MonopolyData* prq) { 113 | std::unique_lock l(lock_); 114 | if(!prq->available_) { 115 | prq->available_ = true; 116 | num_available_++; 117 | cv_.notify_one(); 118 | } 119 | } 120 | 121 | private: 122 | std::mutex lock_; 123 | std::condition_variable cv_; 124 | std::condition_variable cv_exit_; 125 | std::vector datas_; 126 | int capacity_ = 0; 127 | volatile int num_available_ = 0; 128 | volatile int num_wait_thread_ = 0; 129 | volatile bool run_ = true; 130 | }; 131 | 132 | }; // namespace FasterTRT 133 | 134 | #endif -------------------------------------------------------------------------------- /3_faster_ncnn/src/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | 
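// Benchmark entry points for the NCNN YOLOX pipeline:
//   t1(): synchronous path - every frame calls infer->commit(img).get() and waits for the result.
//   t2(): keeps one future in a queue so the next commit() overlaps with the previous frame's inference.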
#include "apps/yolo/yolo.h" 4 | 5 | using namespace std; 6 | 7 | // 打印结果因袭 8 | static void printBox(vector &boxs) { 9 | printf("obj nums: %d \n", boxs.size()); 10 | for(int i=0;i>> out_queue; 32 | 33 | auto start = std::chrono::system_clock::now(); 34 | for(int i=0;i<100;i++){ 35 | 36 | // auto start = std::chrono::system_clock::now(); 37 | auto fut = infer->commit(img); // 任务提交 38 | // auto end = std::chrono::system_clock::now(); 39 | // cout << chrono::duration_cast(end - start).count()<< "ms" << endl; 40 | 41 | out_queue.push(fut); 42 | if(out_queue.size() <= 1){ 43 | continue; 44 | } 45 | auto res = out_queue.front().get(); 46 | out_queue.pop(); 47 | } 48 | while(!out_queue.empty()){ 49 | auto res = out_queue.front().get(); 50 | out_queue.pop(); 51 | } 52 | 53 | auto end = std::chrono::system_clock::now(); 54 | cout << chrono::duration_cast(end - start).count() / 100.0f<< "ms" << endl; 55 | 56 | // cout << res.size() << endl; 57 | // for (size_t i = 0; i < res.size(); i++){ 58 | // NCNN_DET::ObjBox box = res[i]; 59 | // cout<class_names[box.category], cv::Point(box.x1, 63 | // // box.y1), 64 | // // cv::FONT_HERSHEY_SIMPLEX, 0.75, cv::Scalar(0, 255, 0), 2); 65 | // } 66 | // cv::imwrite("result_test.jpg", img); 67 | 68 | return ; 69 | } 70 | 71 | void t1(){ 72 | string param_path = "../model.param"; 73 | string model_path = "../model.bin"; 74 | 75 | auto infer = YoloNCNN::create_infer(param_path, model_path, 0.5, 0.45); 76 | 77 | if (infer == nullptr){ 78 | printf("Infer is nullptr.\n"); 79 | return ; 80 | } 81 | 82 | string img_path = "../img/000026.jpg"; 83 | cv::Mat img = cv::imread(img_path); 84 | 85 | // warmup 86 | for(int i=0;i<10;i++){ 87 | auto res = infer->commit(img).get(); // 将任务提交给推理器(推理器执行commit) 88 | } 89 | 90 | int count = 10; 91 | auto start = std::chrono::system_clock::now(); 92 | for(int i=0;icommit(img).get(); // 将任务提交给推理器(推理器执行commit) 94 | } 95 | 96 | auto end = std::chrono::system_clock::now(); 97 | float cost_time = chrono::duration_cast(end - start).count(); 98 | cout << cost_time / (count * 1.0) << "ms" << endl; 99 | 100 | auto res = infer->commit(img).get(); 101 | printBox(res); 102 | // for (size_t i = 0; i < res.size(); i++){ 103 | // auto box = res[i]; 104 | // cout << box.score << endl; 105 | // // cv::rectangle(img, cv::Point(box.x1, box.y1), cv::Point(box.x2, box.y2), 106 | // // cv::Scalar(0, 0, 255), 2); 107 | // // cv::putText(img, pred->class_names[box.category], cv::Point(box.x1, 108 | // // box.y1), 109 | // // cv::FONT_HERSHEY_SIMPLEX, 0.75, cv::Scalar(0, 255, 0), 2); 110 | // } 111 | // cv::imwrite("result_test.jpg", img); 112 | return ; 113 | 114 | } 115 | 116 | int main(){ 117 | t1(); 118 | // t2(); 119 | 120 | } 121 | 122 | -------------------------------------------------------------------------------- /3_faster_ncnn/src/apps/common.cpp: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | 3 | namespace FasterNCNN{ 4 | 5 | float InterSectionArea(const ObjBox &a, const ObjBox &b){ 6 | if(a.x1 > b.x2 || a.x2 < b.x1 || a.y1 > b.y2 || a.y2 < b.y1){ 7 | return 0.0f; 8 | } 9 | float inter_w = std::min(a.x2, b.x2) - std::min(a.x1, b.x1); 10 | float inter_h = std::min(a.y2, b.y2) - std::min(a.y1, b.y1); 11 | 12 | return inter_w * inter_h; 13 | } 14 | 15 | bool ScoreSort(ObjBox a, ObjBox b){ 16 | return (a.score > b.score); 17 | } 18 | 19 | void nms(std::vector &src_boxes, std::vector &dst_boxes, float threshold){ 20 | std::vector picked; 21 | std::sort(src_boxes.begin(), src_boxes.end(), 
ScoreSort); 22 | 23 | for (int i = 0; i < src_boxes.size(); i++){ 24 | int keep = 1; 25 | for(int j=0; j < picked.size(); j++){ 26 | float inter_area = InterSectionArea(src_boxes[i], src_boxes[picked[j]]); 27 | float union_area = src_boxes[i].area() + src_boxes[picked[j]].area() - inter_area; 28 | float iou = inter_area / union_area; 29 | if((iou > threshold) && (src_boxes[i].category == src_boxes[picked[j]].category)){ 30 | keep = 0; 31 | break; 32 | } 33 | } 34 | if(keep){ 35 | picked.push_back(i); 36 | } 37 | } 38 | for(int i=0;i conf_thr_){ 113 | 114 | ObjBox obj; 115 | obj.x1 = x0; 116 | obj.y1 = y0; 117 | obj.x2 = x1; 118 | obj.y2 = y1; 119 | 120 | obj.category = class_idx; 121 | obj.score = box_prob; 122 | 123 | out_boxes.push_back(obj); 124 | } 125 | 126 | } // class loop 127 | feat_ptr += output_.w; 128 | 129 | } // point anchor loop 130 | } 131 | 132 | 133 | } 134 | -------------------------------------------------------------------------------- /2_faster_tensorrt/src/base/trt_base.cpp: -------------------------------------------------------------------------------- 1 | #include "trt_base.hpp" 2 | 3 | namespace FasterTRT { 4 | 5 | // 返回mode的名字 6 | const char* mode_string(Mode type) { 7 | switch(type) { 8 | case Mode::FP32: 9 | return "FP32"; 10 | case Mode::FP16: 11 | return "FP16"; 12 | case Mode::INT8: 13 | return "INT8"; 14 | default: 15 | return "UnknowCompileMode"; 16 | } 17 | } 18 | 19 | ////////////////////////////////////////////////////////////////////// 20 | ////////////////////// Int8EntropyCalibrator ///////////////////////// 21 | ////////////////////////////////////////////////////////////////////// 22 | Int8EntropyCalibrator::Int8EntropyCalibrator(const std::vector& imagefiles, 23 | nvinfer1::Dims dims, const Int8Process& preprocess) { 24 | Assert(preprocess != nullptr); 25 | this->dims_ = dims; 26 | this->allimgs_ = imagefiles; 27 | this->preprocess_ = preprocess; 28 | this->fromCalibratorData_ = false; 29 | files_.resize(dims.d[0]); 30 | checkCudaRuntime(cudaStreamCreate(&stream_)); 31 | } 32 | 33 | Int8EntropyCalibrator::Int8EntropyCalibrator(const std::vector& entropyCalibratorData, 34 | nvinfer1::Dims dims, const Int8Process& preprocess) { 35 | Assert(preprocess != nullptr); 36 | 37 | this->dims_ = dims; 38 | this->entropyCalibratorData_ = entropyCalibratorData; 39 | this->preprocess_ = preprocess; 40 | this->fromCalibratorData_ = true; 41 | files_.resize(dims.d[0]); 42 | checkCudaRuntime(cudaStreamCreate(&stream_)); 43 | } 44 | 45 | Int8EntropyCalibrator::~Int8EntropyCalibrator() { 46 | checkCudaRuntime(cudaStreamDestroy(stream_)); 47 | } 48 | 49 | int Int8EntropyCalibrator::getBatchSize() const noexcept { 50 | return dims_.d[0]; 51 | } 52 | 53 | bool Int8EntropyCalibrator::next() { 54 | int batch_size = dims_.d[0]; 55 | if(cursor_ + batch_size > allimgs_.size()) return false; 56 | 57 | int old_cursor = cursor_; 58 | for(int i = 0; i < batch_size; ++i) files_[i] = allimgs_[cursor_++]; 59 | 60 | if(!tensor_) { 61 | tensor_.reset(new Tensor(dims_.nbDims, dims_.d)); 62 | tensor_->set_stream(stream_); 63 | tensor_->set_workspace(std::make_shared()); 64 | } 65 | 66 | preprocess_(old_cursor, allimgs_.size(), files_, tensor_); 67 | return true; 68 | } 69 | 70 | bool Int8EntropyCalibrator::getBatch(void* bindings[], const char* names[], 71 | int nbBindings) noexcept { 72 | if(!next()) return false; 73 | bindings[0] = tensor_->gpu(); 74 | return true; 75 | } 76 | 77 | const std::vector& Int8EntropyCalibrator::getEntropyCalibratorData() { 78 | return 
entropyCalibratorData_; 79 | } 80 | 81 | const void* Int8EntropyCalibrator::readCalibrationCache(size_t& length) noexcept { 82 | if(fromCalibratorData_) { 83 | length = this->entropyCalibratorData_.size(); 84 | return this->entropyCalibratorData_.data(); 85 | } 86 | 87 | length = 0; 88 | return nullptr; 89 | } 90 | 91 | void Int8EntropyCalibrator::writeCalibrationCache(const void* cache, size_t length) noexcept { 92 | entropyCalibratorData_.assign((uint8_t*)cache, (uint8_t*)cache + length); 93 | } 94 | 95 | std::vector glob_image_files(const std::string& directory) { 96 | /* 检索目录下的所有图像:"*.jpg;*.png;*.bmp;*.jpeg;*.tiff" */ 97 | std::vector files, output; 98 | std::set pattern_set{"jpg", "png", "bmp", "jpeg", "tiff"}; 99 | 100 | if(directory.empty()) { 101 | INFOE("Glob images from folder failed, folder is empty"); 102 | return output; 103 | } 104 | 105 | try { 106 | std::vector files_; 107 | files_.reserve(10000); 108 | cv::glob(directory + "/*", files_, true); 109 | files.insert(files.end(), files_.begin(), files_.end()); 110 | } catch(...) { 111 | INFOE("Glob %s failed", directory.c_str()); 112 | return output; 113 | } 114 | 115 | for(int i = 0; i < files.size(); ++i) { 116 | auto& file = files[i]; 117 | int p = file.rfind("."); 118 | if(p == -1) continue; 119 | 120 | auto suffix = file.substr(p + 1); 121 | std::transform(suffix.begin(), suffix.end(), suffix.begin(), [](char c) { 122 | if(c >= 'A' && c <= 'Z') c -= 'A' + 'a'; 123 | return c; 124 | }); 125 | if(pattern_set.find(suffix) != pattern_set.end()) output.push_back(file); 126 | } 127 | return output; 128 | } 129 | 130 | } // namespace FasterTRT -------------------------------------------------------------------------------- /1_trt_base/trt_yolox/py/tools.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cv2 3 | from typing import Tuple 4 | 5 | 6 | # 图像预处理 7 | def preproc(img: np.ndarray, input_size: tuple, swap: tuple=(2, 0, 1))->Tuple[np.ndarray, float]: 8 | padded_img = np.ones((input_size[0], input_size[1], 3), dtype=np.uint8) * 114 9 | 10 | r = min(input_size[0] / img.shape[0], input_size[1] / img.shape[1]) 11 | resized_img = cv2.resize(img, 12 | (int(img.shape[1] * r), int(img.shape[0] * r)), 13 | interpolation=cv2.INTER_LINEAR, 14 | ).astype(np.uint8) 15 | padded_img[: int(img.shape[0] * r), : int(img.shape[1] * r)] = resized_img 16 | 17 | padded_img = padded_img.transpose(swap) 18 | padded_img = np.ascontiguousarray(padded_img, dtype=np.float32) 19 | return padded_img, r 20 | 21 | # 后处理 不包括nms 22 | def demo_postprocess(outputs: np.ndarray, img_size: tuple, p6: bool=False): 23 | grids = [] 24 | expanded_strides = [] 25 | strides = [8, 16, 32] if not p6 else [8, 16, 32, 64] 26 | 27 | hsizes = [img_size[0] // stride for stride in strides] 28 | wsizes = [img_size[1] // stride for stride in strides] 29 | 30 | for hsize, wsize, stride in zip(hsizes, wsizes, strides): 31 | xv, yv = np.meshgrid(np.arange(wsize), np.arange(hsize)) 32 | grid = np.stack((xv, yv), 2).reshape(1, -1, 2) 33 | grids.append(grid) 34 | shape = grid.shape[:2] 35 | expanded_strides.append(np.full((*shape, 1), stride)) 36 | 37 | grids = np.concatenate(grids, 1) 38 | expanded_strides = np.concatenate(expanded_strides, 1) 39 | outputs[..., :2] = (outputs[..., :2] + grids) * expanded_strides 40 | outputs[..., 2:4] = np.exp(outputs[..., 2:4]) * expanded_strides 41 | 42 | return outputs 43 | 44 | 45 | 46 | def nms(boxes, scores, nms_thr): 47 | """Single class NMS implemented in Numpy.""" 48 | x1 = 
boxes[:, 0] 49 | y1 = boxes[:, 1] 50 | x2 = boxes[:, 2] 51 | y2 = boxes[:, 3] 52 | 53 | areas = (x2 - x1 + 1) * (y2 - y1 + 1) 54 | order = scores.argsort()[::-1] 55 | 56 | keep = [] 57 | while order.size > 0: 58 | i = order[0] 59 | keep.append(i) 60 | xx1 = np.maximum(x1[i], x1[order[1:]]) 61 | yy1 = np.maximum(y1[i], y1[order[1:]]) 62 | xx2 = np.minimum(x2[i], x2[order[1:]]) 63 | yy2 = np.minimum(y2[i], y2[order[1:]]) 64 | 65 | w = np.maximum(0.0, xx2 - xx1 + 1) 66 | h = np.maximum(0.0, yy2 - yy1 + 1) 67 | inter = w * h 68 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 69 | 70 | inds = np.where(ovr <= nms_thr)[0] 71 | order = order[inds + 1] 72 | 73 | return keep 74 | 75 | 76 | def multiclass_nms(boxes, scores, nms_thr, score_thr, class_agnostic=True): 77 | """Multiclass NMS implemented in Numpy""" 78 | if class_agnostic: 79 | nms_method = multiclass_nms_class_agnostic 80 | else: 81 | nms_method = multiclass_nms_class_aware 82 | return nms_method(boxes, scores, nms_thr, score_thr) 83 | 84 | 85 | def multiclass_nms_class_aware(boxes, scores, nms_thr, score_thr): 86 | """Multiclass NMS implemented in Numpy. Class-aware version.""" 87 | final_dets = [] 88 | num_classes = scores.shape[1] 89 | for cls_ind in range(num_classes): 90 | cls_scores = scores[:, cls_ind] 91 | valid_score_mask = cls_scores > score_thr 92 | if valid_score_mask.sum() == 0: 93 | continue 94 | else: 95 | valid_scores = cls_scores[valid_score_mask] 96 | valid_boxes = boxes[valid_score_mask] 97 | keep = nms(valid_boxes, valid_scores, nms_thr) 98 | if len(keep) > 0: 99 | cls_inds = np.ones((len(keep), 1)) * cls_ind 100 | dets = np.concatenate( 101 | [valid_boxes[keep], valid_scores[keep, None], cls_inds], 1 102 | ) 103 | final_dets.append(dets) 104 | if len(final_dets) == 0: 105 | return None 106 | return np.concatenate(final_dets, 0) 107 | 108 | 109 | def multiclass_nms_class_agnostic(boxes, scores, nms_thr, score_thr): 110 | """Multiclass NMS implemented in Numpy. Class-agnostic version.""" 111 | cls_inds = scores.argmax(1) 112 | cls_scores = scores[np.arange(len(cls_inds)), cls_inds] 113 | 114 | valid_score_mask = cls_scores > score_thr 115 | if valid_score_mask.sum() == 0: 116 | return None 117 | valid_scores = cls_scores[valid_score_mask] 118 | valid_boxes = boxes[valid_score_mask] 119 | valid_cls_inds = cls_inds[valid_score_mask] 120 | keep = nms(valid_boxes, valid_scores, nms_thr) 121 | if keep: 122 | dets = np.concatenate( 123 | [valid_boxes[keep], valid_scores[keep, None], valid_cls_inds[keep, None]], 1 124 | ) 125 | return dets 126 | 127 | -------------------------------------------------------------------------------- /1_trt_base/trt_plugin/yolox_end2end/end2end.md: -------------------------------------------------------------------------------- 1 | # Yolo Convert End2End Tensorrt 2 | 3 | yolox默认的后处理是在cpu上直接进行了,所以在经过tensorrt加速后其实后处理还是在cpu上进行。于是考虑吧nms的过程加入到tensorrt生成序列化文件的过程中。因为trt已经有了nms的插件,所以加进去就行了,目前主要有两种思路: 4 | 5 | 1. 参考mmyolo的easydeploy,设置一个TRT::EfficientNMS 的op,在export onnx的时候同时转过去,然后trtexec在模型转换中会自动识别到NMS的插件并替换,实现端到端的操作。 6 | 7 | 2. 生成序列化文件后,通过增加插件层,来让模型在运行中调用NMS插件,实现端到端。 8 | 9 | ## 0. 一些实验 10 | 首先是发现end2end后反而慢了很多再找问题 11 | 1. 测试模型转换。利用trtexec和torch2trt导出测试结果合速度都基本一致。排除此问题。非端到端处理的情况下运行速度为2.84-2.85ms 12 | 2. 经过测试,发现转成端到端的模型后,在c++环境下有加速效果5.045ms->4.577ms. 
但是在py环境下反而速度降低 13 | 14 | 下面是当前的测试结果(通过增加op的方法),处理时间包括前处理(除imread以外的所有操作),后处理(包括nms,不包括绘制矩形和保存图片)。程序首先预热50轮,然后连续运行1000轮,计算平均耗时。 15 | > 平台: 2080ti + i9-9900K + trt8.5 + cuda10.2 + cudnn8.7 16 | 17 | | 程序 | normal | end2end | 18 | | :-----: | :-----: | :-----: | 19 | | python | 4.00ms | 3.26ms | 20 | | C++ | 5.045ms | 4.577ms | 21 | 22 | 23 | 这里看到c++要不py慢这么多,经过试验发现主要是前处理的差距太大了。如果不包括前处理阶段,这样就合理很多了 24 | | 程序 | end2end | 25 | | :-----: | :-----: | 26 | | python | 1.94ms | 27 | | C++ | 1.8ms | 28 | 29 | ## 1. 增加OP 30 | 首先定义一个文件,作为转onnx的过渡op 31 | ```python 32 | import torch 33 | from torch import Tensor 34 | # refer https://github.com/open-mmlab/mmyolo/blob/dev/projects/easydeploy/nms/trt_nms.py 35 | 36 | # onnx 自定义节点 37 | class TRTEfficientNMSop(torch.autograd.Function): 38 | 39 | @staticmethod 40 | def forward( 41 | ctx, 42 | boxes: Tensor, 43 | scores: Tensor, 44 | background_class: int = -1, 45 | box_coding: int = 0, 46 | iou_threshold: float = 0.45, 47 | max_output_boxes: int = 100, 48 | plugin_version: str = '1', 49 | score_activation: int = 0, 50 | score_threshold: float = 0.25, 51 | ): 52 | batch_size, _, num_classes = scores.shape 53 | num_det = torch.randint( 54 | 0, max_output_boxes, (batch_size, 1), dtype=torch.int32) 55 | det_boxes = torch.randn(batch_size, max_output_boxes, 4) 56 | det_scores = torch.randn(batch_size, max_output_boxes) 57 | det_classes = torch.randint( 58 | 0, num_classes, (batch_size, max_output_boxes), dtype=torch.int32) 59 | return num_det, det_boxes, det_scores, det_classes 60 | 61 | @staticmethod 62 | def symbolic(g, 63 | boxes: Tensor, 64 | scores: Tensor, 65 | background_class: int = -1, 66 | box_coding: int = 0, 67 | iou_threshold: float = 0.45, 68 | max_output_boxes: int = 100, 69 | plugin_version: str = '1', 70 | score_activation: int = 0, 71 | score_threshold: float = 0.25): 72 | out = g.op( 73 | 'TRT::EfficientNMS_TRT', 74 | boxes, 75 | scores, 76 | background_class_i=background_class, 77 | box_coding_i=box_coding, 78 | iou_threshold_f=iou_threshold, 79 | max_output_boxes_i=max_output_boxes, 80 | plugin_version_s=plugin_version, 81 | score_activation_i=score_activation, 82 | score_threshold_f=score_threshold, 83 | outputs=4) 84 | num_det, det_boxes, det_scores, det_classes = out 85 | return num_det, det_boxes, det_scores, det_classes 86 | 87 | 88 | def efficient_nms( 89 | boxes: Tensor, 90 | scores: Tensor, 91 | max_output_boxes_per_class: int = 1000, 92 | iou_threshold: float = 0.5, 93 | score_threshold: float = 0.05, 94 | pre_top_k: int = -1, 95 | keep_top_k: int = 100, 96 | box_coding: int = 0, 97 | ): 98 | 99 | num_det, det_boxes, det_scores, det_classes = TRTEfficientNMSop.apply( 100 | boxes, scores, -1, box_coding, iou_threshold, keep_top_k, '1', 0, 101 | score_threshold) 102 | return num_det, det_boxes, det_scores, det_classes 103 | 104 | ``` 105 | 106 | 针对yolox输出的模型,建立一个部署模型来增加后处理op,这个地方需要注意nms的输入需要对应到官方插件库的输入输出说明[nms tensorrt-plugin](https://github.com/NVIDIA/TensorRT/tree/release/8.5/plugin/efficientNMSPlugin),对这个新模型进行export即可, 107 | ```python 108 | class DeployModel(nn.Module): 109 | def __init__(self, baseModel: nn.Module): 110 | super().__init__() 111 | self.baseModel = baseModel 112 | 113 | self.pre_top_k = 1000 114 | self.keep_top_k = 100 115 | self.iou_threshold = 0.45 116 | self.score_threshold = 0.1 117 | 118 | def forward(self, inputs: Tensor): 119 | outputs = self.baseModel(inputs) 120 | 121 | bboxes = outputs[:, :, :4] 122 | scores = outputs[:, :, 4:5] * outputs[:, :, 5:] 123 | 124 | return efficient_nms(bboxes, scores, 
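            # Positional arguments follow efficient_nms() defined above:
            # max_output_boxes_per_class, iou_threshold, score_threshold, pre_top_k, keep_top_k.
            # box_coding=1 selects the plugin's center-size box encoding, since the YOLOX head
            # predicts boxes as (cx, cy, w, h).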
self.keep_top_k, self.iou_threshold, 125 | self.score_threshold, self.pre_top_k, self.keep_top_k, box_coding=1) 126 | ``` 127 | 128 | 然后就可以直接使用trtexec转成trt的engine文件了。 129 | 130 | -------------------------------------------------------------------------------- /1_trt_base/trt_demo/trt_py/trt_python.py: -------------------------------------------------------------------------------- 1 | import tensorrt as trt 2 | # import pycuda.autoinit 3 | # import pycuda.driver as cuda 4 | import numpy as np 5 | from cuda import cudart 6 | 7 | model_path = '../files/model.onnx' 8 | model_engine_path = '../files/model_py.engine' 9 | 10 | 11 | def get_engine(): 12 | # 构建阶段 13 | logger = trt.Logger(trt.Logger.WARNING) # logger 14 | builder = trt.Builder(logger) # builder 15 | 16 | # 创建网络定义 17 | network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)) 18 | profile = builder.create_optimization_profile() # 动态尺寸的话需要这个 19 | config = builder.create_builder_config() # 配置 20 | config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 20) # 1 MiB 21 | 22 | # 创建解析器 23 | parser = trt.OnnxParser(network, logger) 24 | success = parser.parse_from_file(model_path) # 加载文件 25 | for idx in range(parser.num_errors): 26 | print(parser.get_error(idx)) 27 | 28 | # 构建engine 29 | serialized_engine = builder.build_serialized_network(network, config) 30 | 31 | with open(model_engine_path, "wb") as f: 32 | f.write(serialized_engine) 33 | 34 | def inferV1(): 35 | logger = trt.Logger(trt.Logger.WARNING) # logger 36 | 37 | runtime = trt.Runtime(logger) 38 | with open(model_engine_path, "rb") as f: 39 | serialized_engine = f.read() 40 | engine = runtime.deserialize_cuda_engine(serialized_engine) 41 | 42 | 43 | stream = cuda.Stream() 44 | context = engine.create_execution_context() 45 | 46 | h_input = cuda.pagelocked_empty(trt.volume(context.get_binding_shape(0)), dtype=np.float32) 47 | h_output = cuda.pagelocked_empty(trt.volume(context.get_binding_shape(1)), dtype=np.float32) 48 | d_input = cuda.mem_alloc(h_input.nbytes) 49 | d_output = cuda.mem_alloc(h_output.nbytes) 50 | 51 | with engine.create_execution_context() as context: 52 | # Transfer input data to the GPU. 53 | cuda.memcpy_htod_async(d_input, h_input, stream) 54 | # Run inference. 55 | context.execute_async_v2(bindings=[int(d_input), int(d_output)], stream_handle=stream.handle) 56 | # Transfer predictions back from the GPU. 57 | cuda.memcpy_dtoh_async(h_output, d_output, stream) 58 | # Synchronize the stream 59 | stream.synchronize() 60 | # Return the host output. 
该数据等同于原始模型的输出数据 61 | return h_output 62 | 63 | 64 | def inferV2(): 65 | logger = trt.Logger(trt.Logger.WARNING) # logger 66 | 67 | runtime = trt.Runtime(logger) 68 | with open(model_engine_path, "rb") as f: 69 | serialized_engine = f.read() 70 | engine = runtime.deserialize_cuda_engine(serialized_engine) 71 | 72 | nIO = engine.num_io_tensors # io变量数量 73 | lTensorName = [engine.get_tensor_name(i) for i in range(nIO)] # 获取io变量名字 74 | nInput = [engine.get_tensor_mode(lTensorName[i]) for i in range(nIO)].count(trt.TensorIOMode.INPUT) # 输入tensor数量 75 | Output = [engine.get_tensor_mode(lTensorName[i]) for i in range(nIO)].count(trt.TensorIOMode.OUTPUT) 76 | 77 | context = engine.create_execution_context() 78 | print("===============INPUT/OUTPUT=================== ") 79 | for i in range(nIO): 80 | print(f"[{i}]{'Input ' if i < nInput else 'Output'} -> "+ 81 | f"{engine.get_tensor_dtype(lTensorName[i])} " + # 数据类型 82 | f"{engine.get_tensor_shape(lTensorName[i])} " + # engine形状 83 | f"{context.get_tensor_shape(lTensorName[i])} " + # context形状 84 | f"{lTensorName[i]} ") # 名字 85 | print("============================================ ") 86 | 87 | data = np.arange(3 * 224 * 224, dtype=np.float32).reshape(1, 3, 224, 224) 88 | 89 | # cpu端数据 90 | bufferH = [] 91 | bufferH.append(np.ascontiguousarray(data)) # 输入数据转内存连续 92 | for i in range(nInput, nIO): # 输出数据 93 | bufferH.append(np.empty(context.get_tensor_shape(lTensorName[i]), dtype=trt.nptype(engine.get_tensor_dtype(lTensorName[i])))) 94 | 95 | # gpu端数据申请显存 96 | bufferD = [] 97 | for i in range(nIO): 98 | bufferD.append(cudart.cudaMalloc(bufferH[i].nbytes)[1]) 99 | 100 | # 输入数据复制到显存 101 | for i in range(nInput): 102 | cudart.cudaMemcpy(bufferD[i], bufferH[i].ctypes.data, bufferH[i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice) 103 | 104 | # 设置输入输出数据的地址(buffer) 105 | for i in range(nIO): 106 | context.set_tensor_address(lTensorName[i], int(bufferD[i])) 107 | 108 | # 推理 109 | context.execute_async_v3(0) 110 | 111 | for i in range(nInput, nIO): # 数据拷会cpu 112 | cudart.cudaMemcpy(bufferH[i].ctypes.data, bufferD[i], bufferH[i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost) 113 | 114 | for i in range(nIO): 115 | print(f'{lTensorName[i]}:\t {bufferH[i].shape}') 116 | 117 | 118 | for b in bufferD: # 释放显存 119 | cudart.cudaFree(b) 120 | 121 | 122 | if __name__ == '__main__': 123 | # get_engine() 124 | # print(infer().shape) 125 | inferV2() # 推荐 126 | -------------------------------------------------------------------------------- /1_trt_base/trt_plugin/demo01/test.py: -------------------------------------------------------------------------------- 1 | import ctypes 2 | from cuda import cudart 3 | import numpy as np 4 | import os 5 | import tensorrt as trt 6 | 7 | SOFILE = './build/libdemo01.so' 8 | np.set_printoptions(precision=3, linewidth=100, suppress=True) # 控制Python中小数的显示精度 9 | np.random.seed(123456) 10 | cudart.cudaDeviceSynchronize() 11 | 12 | def printArrayInfomation(x, info="", n=5): 13 | print(f"{info}: {x.shape}, SumAbs={np.sum(abs(x)) :.5e}, Var={np.var(x) :.5f}, \ 14 | Max={np.max(x) :.5f},Min={np.min(x) :.5f},SAD={np.sum(np.abs(np.diff(x.reshape(-1)))) :.5f}") 15 | print('\t', x.reshape(-1)[:n], x.reshape(-1)[-n:]) 16 | 17 | def check(a, b, weak=False, checkEpsilon=1e-5): 18 | if weak: 19 | res = np.all(np.abs(a - b) < checkEpsilon) 20 | else: 21 | res = np.all(a == b) 22 | diff0 = np.max(np.abs(a - b)) 23 | diff1 = np.max(np.abs(a - b) / (np.abs(b) + checkEpsilon)) 24 | print(f"check:{res}, absDiff={diff0}, relDiff={diff1}") 25 | 26 | 
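# CPU reference for the AddScalar plugin: adds `scalar` element-wise to the first input,
# used by check() below to validate the output produced by the TensorRT plugin.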
def addScalarCPU(inputH, scalar): 27 | return [inputH[0] + scalar] 28 | 29 | def getAddScalarPlugin(scalar): 30 | for c in trt.get_plugin_registry().plugin_creator_list: 31 | if c.name == "ZZX_ADDScalar": 32 | parameterList = [] 33 | parameterList.append(trt.PluginField("scalar", np.float32(scalar), trt.PluginFieldType.FLOAT32)) 34 | return c.create_plugin(c.name, trt.PluginFieldCollection(parameterList)) 35 | return None 36 | 37 | 38 | def run(shape, scalar): 39 | testCase = f"" 40 | trtFile = f"./model-Dim{len(shape)}.plan" 41 | print(f"Test {testCase}") 42 | logger = trt.Logger(trt.Logger.ERROR) 43 | trt.init_libnvinfer_plugins(logger, '') 44 | ctypes.cdll.LoadLibrary(SOFILE) 45 | 46 | if os.path.isfile(trtFile): 47 | with open(trtFile, "rb") as f: 48 | engine = trt.Runtime(logger).deserialize_cuda_engine(f.read()) 49 | if engine == None: 50 | print("Failed loading engine!") 51 | return 52 | print("Succeeded loading engine!") 53 | else: 54 | builder = trt.Builder(logger) 55 | network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)) 56 | profile = builder.create_optimization_profile() 57 | config = builder.create_builder_config() 58 | 59 | inputT0 = network.add_input("inputT0", trt.float32, [-1 for i in shape]) 60 | profile.set_shape(inputT0.name, [1 for i in shape], [8 for i in shape], [32 for i in shape]) 61 | config.add_optimization_profile(profile) 62 | 63 | pluginLayer = network.add_plugin_v2([inputT0], getAddScalarPlugin(scalar)) 64 | network.mark_output(pluginLayer.get_output(0)) 65 | engineString = builder.build_serialized_network(network, config) 66 | if engineString == None: 67 | print("Failed building engine!") 68 | return 69 | print("Succeeded building engine!") 70 | with open(trtFile, "wb") as f: 71 | f.write(engineString) 72 | engine = trt.Runtime(logger).deserialize_cuda_engine(engineString) 73 | 74 | nIO = engine.num_io_tensors 75 | lTensorName = [engine.get_tensor_name(i) for i in range(nIO)] 76 | nInput = [engine.get_tensor_mode(lTensorName[i]) for i in range(nIO)].count(trt.TensorIOMode.INPUT) 77 | 78 | context = engine.create_execution_context() 79 | context.set_input_shape(lTensorName[0], shape) 80 | #for i in range(nIO): 81 | # print("[%2d]%s->" % (i, "Input " if i < nInput else "Output"), engine.get_tensor_dtype(lTensorName[i]), engine.get_tensor_shape(lTensorName[i]), context.get_tensor_shape(lTensorName[i]), lTensorName[i]) 82 | 83 | bufferH = [] 84 | bufferH.append(np.arange(np.prod(shape), dtype=np.float32).reshape(shape)) 85 | for i in range(nInput, nIO): 86 | bufferH.append(np.empty(context.get_tensor_shape(lTensorName[i]), dtype=trt.nptype(engine.get_tensor_dtype(lTensorName[i])))) 87 | bufferD = [] 88 | for i in range(nIO): 89 | bufferD.append(cudart.cudaMalloc(bufferH[i].nbytes)[1]) 90 | 91 | for i in range(nInput): 92 | cudart.cudaMemcpy(bufferD[i], bufferH[i].ctypes.data, bufferH[i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice) 93 | 94 | for i in range(nIO): 95 | context.set_tensor_address(lTensorName[i], int(bufferD[i])) 96 | 97 | context.execute_async_v3(0) 98 | 99 | for i in range(nInput, nIO): 100 | cudart.cudaMemcpy(bufferH[i].ctypes.data, bufferD[i], bufferH[i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost) 101 | 102 | outputCPU = addScalarCPU(bufferH[:nInput], scalar) 103 | """ 104 | for i in range(nInput): 105 | printArrayInfomation(bufferH[i]) 106 | for i in range(nInput, nIO): 107 | printArrayInfomation(bufferH[i]) 108 | for i in range(nInput, nIO): 109 | printArrayInfomation(outputCPU[i - 
nInput]) 110 | """ 111 | check(bufferH[nInput:][0], outputCPU[0], True) 112 | 113 | for b in bufferD: 114 | cudart.cudaFree(b) 115 | print(f"Test {testCase} finish!") 116 | 117 | if __name__ == "__main__": 118 | os.system("rm -rf ./*.plan") 119 | 120 | run([32], 1) 121 | run([32, 32], 1) 122 | run([16, 16, 16], 1) 123 | run([8, 8, 8, 8], 1) 124 | run([32], 1) 125 | run([32, 32], 1) 126 | run([16, 16, 16], 1) 127 | run([8, 8, 8, 8], 1) 128 | 129 | print("Test all finish!") 130 | -------------------------------------------------------------------------------- /1_trt_base/trt_yolox/py/trt.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import time 3 | import numpy as np 4 | import tensorrt as trt 5 | from cuda import cudart 6 | 7 | from tools import preproc, demo_postprocess, multiclass_nms 8 | 9 | model_path = '/home/zzx/Github/zzx_yolo/EXTRA_PKG/TensorRT-8.5.3.1/bin/yolox.engine' 10 | img_path = "../../imgs/000026.jpg" 11 | score_thr = 0.5 12 | 13 | COCO_CLASSES = ('echinus', 'starfish', 'holothurian', 'scallop') 14 | _COLORS = np.array( 15 | [ 16 | 0.000, 0.447, 0.741, 17 | 0.850, 0.325, 0.098, 18 | 0.929, 0.694, 0.125, 19 | 0.494, 0.184, 0.556, 20 | ] 21 | ).astype(np.float32).reshape(-1, 3) 22 | 23 | 24 | def vis(img, boxes, scores, cls_ids, conf=0.5, class_names=None): 25 | for i in range(len(boxes)): 26 | box = boxes[i] 27 | cls_id = int(cls_ids[i]) 28 | score = scores[i] 29 | if score < conf: 30 | continue 31 | x0 = int(box[0]) 32 | y0 = int(box[1]) 33 | x1 = int(box[2]) 34 | y1 = int(box[3]) 35 | 36 | color = (_COLORS[cls_id] * 255).astype(np.uint8).tolist() 37 | text = '{}:{:.1f}%'.format(class_names[cls_id], score * 100) 38 | txt_color = (0, 0, 0) if np.mean(_COLORS[cls_id]) > 0.5 else (255, 255, 255) 39 | font = cv2.FONT_HERSHEY_SIMPLEX 40 | 41 | txt_size = cv2.getTextSize(text, font, 0.4, 1)[0] 42 | cv2.rectangle(img, (x0, y0), (x1, y1), color, 2) 43 | 44 | txt_bk_color = (_COLORS[cls_id] * 255 * 0.7).astype(np.uint8).tolist() 45 | cv2.rectangle( 46 | img, 47 | (x0, y0 + 1), 48 | (x0 + txt_size[0] + 1, y0 + int(1.5*txt_size[1])), 49 | txt_bk_color, 50 | -1 51 | ) 52 | cv2.putText(img, text, (x0, y0 + txt_size[1]), font, 0.4, txt_color, thickness=1) 53 | 54 | return img 55 | 56 | 57 | 58 | class YoloTRT: 59 | def __init__(self) -> None: 60 | # 构建阶段 61 | self.logger = trt.Logger(trt.Logger.WARNING) # logger 62 | trt.init_libnvinfer_plugins( self.logger, namespace='') # 加载插件 63 | 64 | self.runtime = trt.Runtime(self.logger) 65 | 66 | with open(model_path, "rb") as f: 67 | serialized_engine = f.read() 68 | self.engine = self.runtime.deserialize_cuda_engine(serialized_engine) 69 | self.context = self.engine.create_execution_context() 70 | 71 | self.nIO = self.engine.num_io_tensors # io变量数量 72 | self.lTensorName = [self.engine.get_tensor_name(i) for i in range(self.nIO)] # 获取io变量名字 73 | self.nInput = [self.engine.get_tensor_mode(self.lTensorName[i]) for i in range(self.nIO)].count(trt.TensorIOMode.INPUT) # 输入tensor数量 74 | self.Output = [self.engine.get_tensor_mode(self.lTensorName[i]) for i in range(self.nIO)].count(trt.TensorIOMode.OUTPUT) 75 | 76 | print("===============INPUT/OUTPUT=================== ") 77 | for i in range(self.nIO): 78 | print(f"[{i}]{'Input ' if i < self.nInput else 'Output'} -> "+ 79 | f"{self.engine.get_tensor_dtype(self.lTensorName[i])} " + # 数据类型 80 | f"{self.engine.get_tensor_shape(self.lTensorName[i])} " + # engine形状 81 | f"{self.context.get_tensor_shape(self.lTensorName[i])} " + # context形状 82 | 
f"{self.lTensorName[i]} ") # 名字 83 | print("============================================ ") 84 | 85 | # cpu端数据 86 | self.bufferH = [] 87 | for i in range(self.nIO): 88 | self.bufferH.append(np.empty(self.context.get_tensor_shape(self.lTensorName[i]), 89 | dtype=trt.nptype(self.engine.get_tensor_dtype(self.lTensorName[i])))) 90 | 91 | # # gpu端数据申请显存 92 | self.bufferD = [] 93 | for i in range(self.nIO): 94 | self.bufferD.append(cudart.cudaMalloc(self.bufferH[i].nbytes)[1]) 95 | 96 | def infer(self, origin_img): 97 | data, ratio = preproc(origin_img, (640, 640)) 98 | # cpu端数据 99 | self.bufferH[0] = data 100 | 101 | for i in range(self.nInput, self.nIO): # 输出数据 102 | self.bufferH.append(np.empty(self.context.get_tensor_shape(self.lTensorName[i]), 103 | dtype=trt.nptype(self.engine.get_tensor_dtype(self.lTensorName[i])))) 104 | 105 | # 输入数据复制到显存 106 | for i in range(self.nInput): 107 | cudart.cudaMemcpy(self.bufferD[i], self.bufferH[i].ctypes.data, self.bufferH[i].nbytes, 108 | cudart.cudaMemcpyKind.cudaMemcpyHostToDevice) 109 | 110 | # # 推理 111 | self.context.execute_v2(self.bufferD) # batchsize bingings 112 | for i in range(self.nInput, self.nIO): # 数据拷会cpu 113 | cudart.cudaMemcpy(self.bufferH[i].ctypes.data, self.bufferD[i], self.bufferH[i].nbytes, 114 | cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost) 115 | 116 | predictions = demo_postprocess(self.bufferH[1], (640, 640))[0] 117 | boxes = predictions[:, :4] 118 | scores = predictions[:, 4:5] * predictions[:, 5:] 119 | boxes_xyxy = np.ones_like(boxes) 120 | boxes_xyxy[:, 0] = boxes[:, 0] - boxes[:, 2]/2. 121 | boxes_xyxy[:, 1] = boxes[:, 1] - boxes[:, 3]/2. 122 | boxes_xyxy[:, 2] = boxes[:, 0] + boxes[:, 2]/2. 123 | boxes_xyxy[:, 3] = boxes[:, 1] + boxes[:, 3]/2. 124 | boxes_xyxy /= ratio 125 | dets = multiclass_nms(boxes_xyxy, scores, nms_thr=0.45, score_thr=0.1) 126 | 127 | return dets 128 | 129 | def plot_save(self, origin_img, dets): 130 | final_boxes, final_scores, final_cls_inds = dets[:, :4], dets[:, 4], dets[:, 5] 131 | origin_img = vis(origin_img, final_boxes, final_scores, final_cls_inds, 132 | conf=score_thr, class_names=COCO_CLASSES) 133 | cv2.imwrite("ans.jpg", origin_img) 134 | 135 | def myfree(self): 136 | for i in self.bufferD: # 释放显存 137 | cudart.cudaFree(i) 138 | 139 | 140 | 141 | if __name__ == '__main__': 142 | origin_img = cv2.imread(img_path) 143 | 144 | yolo_trt = YoloTRT() 145 | for _ in range(50): 146 | yolo_trt.infer(origin_img) 147 | 148 | time_b = time.perf_counter() 149 | for _ in range(1000): 150 | dets = yolo_trt.infer(origin_img) 151 | 152 | time_e = time.perf_counter() 153 | print(f"cost time: {(time_e-time_b)*1000 / 1000.0 :.2f}ms") 154 | 155 | yolo_trt.plot_save(origin_img, dets) 156 | 157 | 158 | 159 | yolo_trt.myfree() 160 | 161 | -------------------------------------------------------------------------------- /1_trt_base/trt_demo/trt_cpp/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include"NvInfer.h" 5 | #include "NvOnnxParser.h" 6 | #include "cookbookHelper.cuh" 7 | 8 | using namespace nvinfer1; 9 | // using namespace nvonnxparser; 10 | 11 | // const std::string trtfile {"../files/model_cpp.engine"}; 12 | // const std::string onnxfile {"../files/model.onnx"}; 13 | const char* trtfile="../../files/model_cpp.engine"; 14 | const char* onnxfile="../../files/model.onnx"; 15 | 16 | 17 | // logger 18 | class MyLogger : public ILogger 19 | { 20 | void log(Severity severity, const char* msg) noexcept override 
//noexcept不会抛出异常。override虚函数重写 21 | { 22 | // suppress info-level messages 23 | if (severity <= Severity::kWARNING) 24 | std::cout << msg << std::endl; 25 | } 26 | } logger; 27 | 28 | 29 | void get_engine(){ 30 | IBuilder* builder = createInferBuilder(logger); 31 | INetworkDefinition* network = builder->createNetworkV2(1U << int(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH)); 32 | IOptimizationProfile* profile = builder->createOptimizationProfile(); 33 | IBuilderConfig* config = builder->createBuilderConfig(); 34 | config->setMemoryPoolLimit(MemoryPoolType::kWORKSPACE, 1 << 30); 35 | /* 这个是指定输入尺寸和输入名字的 36 | ITensor *inputTensor = network->addInput("inputT0", DataType::kFLOAT, Dims32 {3, {-1, -1, -1}}); 37 | profile->setDimensions(inputTensor->getName(), OptProfileSelector::kMIN, Dims32 {3, {1, 1, 1}}); 38 | profile->setDimensions(inputTensor->getName(), OptProfileSelector::kOPT, Dims32 {3, {3, 4, 5}}); 39 | profile->setDimensions(inputTensor->getName(), OptProfileSelector::kMAX, Dims32 {3, {6, 8, 10}}); 40 | config->addOptimizationProfile(profile); 41 | */ 42 | // onnx解析器 43 | nvonnxparser::IParser* parser = nvonnxparser::createParser(*network, logger); 44 | parser->parseFromFile(onnxfile, static_cast(ILogger::Severity::kWARNING)); 45 | for(int32_t i = 0; i < parser->getNbErrors(); ++i){ 46 | std::cout << parser->getError(i)->desc() << std::endl; 47 | } 48 | 49 | IHostMemory* engineString = builder->buildSerializedNetwork(*network, *config); //创建engine 50 | if(engineString == nullptr || engineString->size()==0){ 51 | std::cout<<"building 序列化 engine失败"<(engineString->data()), engineString->size()); 57 | if (engineFile.fail()) 58 | { 59 | std::cout << "保存失败" << std::endl; 60 | return; 61 | } 62 | std::cout << "生成成功!" << std::endl; 63 | } 64 | 65 | 66 | 67 | void infer(){ 68 | ICudaEngine *engine = nullptr; 69 | 70 | std::ifstream engineFile(trtfile, std::ios::binary); 71 | long int fsize = 0; 72 | 73 | engineFile.seekg(0, engineFile.end); // 指针设到文件最后,也就是继续写入 74 | fsize = engineFile.tellg(); // 返回当前定位指针的位置,也代表着输入流的大小 75 | engineFile.seekg(0, engineFile.beg); // 文件开头 76 | std::vector engineString(fsize); 77 | engineFile.read(engineString.data(), fsize); 78 | 79 | if(engineString.size() == 0){ 80 | std::cout<<"读取序列化数据失败"<deserializeCudaEngine(engineString.data(), fsize); 86 | if(engine == nullptr){ 87 | std::cout<<"反序列化失败"<getNbIOTensors(); // io数量 92 | int nInput = 0; 93 | int nOutput = 0; 94 | std::vector vTensorName(nIO); 95 | for (int i = 0; i < nIO; ++i){ 96 | vTensorName[i] = std::string(engine->getIOTensorName(i)); 97 | nInput += int(engine->getTensorIOMode(vTensorName[i].c_str()) == TensorIOMode::kINPUT); 98 | nOutput += int(engine->getTensorIOMode(vTensorName[i].c_str()) == TensorIOMode::kOUTPUT); 99 | } 100 | 101 | IExecutionContext* context = engine->createExecutionContext(); 102 | // context->setInputShape(vTensorName[0].c_str(), Dims32 {3, 3, 4, 5}); 103 | 104 | // 打印输出输出形状啥的 105 | for (int i = 0; i < nIO; ++i){ 106 | std::cout< "); 108 | std::cout << dataTypeToString(engine->getTensorDataType(vTensorName[i].c_str())) << std::string(" "); 109 | std::cout << shapeToString(engine->getTensorShape(vTensorName[i].c_str())) << std::string(" "); 110 | std::cout << shapeToString(context->getTensorShape(vTensorName[i].c_str())) << std::string(" "); 111 | std::cout << vTensorName[i] << std::endl; 112 | } 113 | 114 | std::vector vTensorSize(nIO, 0); 115 | for (int i = 0; i < nIO; ++i){ 116 | Dims32 dim = context->getTensorShape(vTensorName[i].c_str()); 117 | int size = 1; 118 | for (int j = 
0; j < dim.nbDims; ++j){ 119 | size *= dim.d[j]; 120 | } 121 | vTensorSize[i] = size * dataTypeToSize(engine->getTensorDataType(vTensorName[i].c_str())); 122 | } 123 | 124 | std::vector vBufferH {nIO, nullptr}; // cpu 125 | std::vector vBufferD {nIO, nullptr}; // gpu 126 | // gpu分配内存 127 | for (int i = 0; i < nIO; ++i){ 128 | vBufferH[i] = (void *)new char[vTensorSize[i]]; 129 | CHECK(cudaMalloc(&vBufferD[i], vTensorSize[i])); 130 | } 131 | 132 | // 赋值 133 | float *pData = (float *)vBufferH[0]; 134 | for (int i = 0; i < vTensorSize[0] / dataTypeToSize(engine->getTensorDataType(vTensorName[0].c_str())); ++i){ 135 | pData[i] = float(i); 136 | } 137 | 138 | // 数据复制 cpu -> gpu 139 | for (int i = 0; i < nInput; ++i){ 140 | CHECK(cudaMemcpy(vBufferD[i], vBufferH[i], vTensorSize[i], cudaMemcpyHostToDevice)); 141 | } 142 | 143 | // gpu上名字对应地址 144 | for (int i = 0; i < nIO; ++i){ 145 | context->setTensorAddress(vTensorName[i].c_str(), vBufferD[i]); 146 | } 147 | 148 | // 推理 149 | context->enqueueV3(0); 150 | 151 | // 数据复制 gpu -> cpu 152 | for (int i = nInput; i < nIO; ++i){ 153 | CHECK(cudaMemcpy(vBufferH[i], vBufferD[i], vTensorSize[i], cudaMemcpyDeviceToHost)); 154 | } 155 | 156 | // 打印输出 157 | // for (int i = 0; i < nIO; ++i){ 158 | // printArrayInfomation((float *)vBufferH[i], context->getTensorShape(vTensorName[i].c_str()), vTensorName[i], true); 159 | // } 160 | 161 | // 释放内存 释放gpu显存 162 | for (int i = 0; i < nIO; ++i){ 163 | delete[] vBufferH[i]; 164 | CHECK(cudaFree(vBufferD[i])); 165 | } 166 | 167 | return; 168 | } 169 | 170 | 171 | int main(){ 172 | CHECK(cudaSetDevice(0)); 173 | // get_engine(); 174 | infer(); 175 | return 0; 176 | } -------------------------------------------------------------------------------- /1_trt_base/trt_yolox/py/trt_end2end.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | import cv2 3 | import time 4 | import numpy as np 5 | import tensorrt as trt 6 | from cuda import cudart 7 | 8 | 9 | model_path = "/home/zzx/Github/zzx_yolo/EXTRA_PKG/TensorRT-8.5.3.1/bin/yolox_end2end.engine" 10 | img_path = "../../imgs/000026.jpg" 11 | score_thr = 0.5 12 | 13 | COCO_CLASSES = ('echinus', 'starfish', 'holothurian', 'scallop') 14 | _COLORS = np.array( 15 | [ 16 | 0.000, 0.447, 0.741, 17 | 0.850, 0.325, 0.098, 18 | 0.929, 0.694, 0.125, 19 | 0.494, 0.184, 0.556, 20 | ] 21 | ).astype(np.float32).reshape(-1, 3) 22 | 23 | 24 | def preproc(img: np.ndarray, input_size: tuple, swap: tuple=(2, 0, 1))->Tuple[np.ndarray, float]: 25 | padded_img = np.ones((input_size[0], input_size[1], 3), dtype=np.uint8) * 114 26 | 27 | r = min(input_size[0] / img.shape[0], input_size[1] / img.shape[1]) 28 | resized_img = cv2.resize(img, 29 | (int(img.shape[1] * r), int(img.shape[0] * r)), 30 | interpolation=cv2.INTER_LINEAR, 31 | ).astype(np.uint8) 32 | padded_img[: int(img.shape[0] * r), : int(img.shape[1] * r)] = resized_img 33 | 34 | padded_img = padded_img.transpose(swap) 35 | padded_img = np.ascontiguousarray(padded_img, dtype=np.float32) 36 | return padded_img, r 37 | 38 | 39 | def vis(img, boxes, scores, cls_ids, conf=0.5, class_names=None): 40 | for i in range(len(boxes)): 41 | box = boxes[i] 42 | cls_id = int(cls_ids[i]) 43 | score = scores[i] 44 | if score < conf: 45 | continue 46 | x0 = int(box[0]) 47 | y0 = int(box[1]) 48 | x1 = int(box[2]) 49 | y1 = int(box[3]) 50 | 51 | color = (_COLORS[cls_id] * 255).astype(np.uint8).tolist() 52 | text = '{}:{:.1f}%'.format(class_names[cls_id], score * 100) 53 | txt_color = (0, 0, 0) if 
np.mean(_COLORS[cls_id]) > 0.5 else (255, 255, 255) 54 | font = cv2.FONT_HERSHEY_SIMPLEX 55 | 56 | txt_size = cv2.getTextSize(text, font, 0.4, 1)[0] 57 | cv2.rectangle(img, (x0, y0), (x1, y1), color, 2) 58 | 59 | txt_bk_color = (_COLORS[cls_id] * 255 * 0.7).astype(np.uint8).tolist() 60 | cv2.rectangle( 61 | img, 62 | (x0, y0 + 1), 63 | (x0 + txt_size[0] + 1, y0 + int(1.5*txt_size[1])), 64 | txt_bk_color, 65 | -1 66 | ) 67 | cv2.putText(img, text, (x0, y0 + txt_size[1]), font, 0.4, txt_color, thickness=1) 68 | 69 | return img 70 | 71 | 72 | 73 | class YoloTRT: 74 | def __init__(self) -> None: 75 | # 构建阶段 76 | self.logger = trt.Logger(trt.Logger.WARNING) # logger 77 | trt.init_libnvinfer_plugins( self.logger, namespace='') # 加载插件 78 | 79 | self.runtime = trt.Runtime(self.logger) 80 | 81 | with open(model_path, "rb") as f: 82 | serialized_engine = f.read() 83 | self.engine = self.runtime.deserialize_cuda_engine(serialized_engine) 84 | self.context = self.engine.create_execution_context() 85 | 86 | self.nIO = self.engine.num_io_tensors # io变量数量 87 | self.lTensorName = [self.engine.get_tensor_name(i) for i in range(self.nIO)] # 获取io变量名字 88 | self.nInput = [self.engine.get_tensor_mode(self.lTensorName[i]) for i in range(self.nIO)].count(trt.TensorIOMode.INPUT) # 输入tensor数量 89 | self.Output = [self.engine.get_tensor_mode(self.lTensorName[i]) for i in range(self.nIO)].count(trt.TensorIOMode.OUTPUT) 90 | 91 | print("===============INPUT/OUTPUT=================== ") 92 | for i in range(self.nIO): 93 | print(f"[{i}]{'Input ' if i < self.nInput else 'Output'} -> "+ 94 | f"{self.engine.get_tensor_dtype(self.lTensorName[i])} " + # 数据类型 95 | f"{self.engine.get_tensor_shape(self.lTensorName[i])} " + # engine形状 96 | f"{self.context.get_tensor_shape(self.lTensorName[i])} " + # context形状 97 | f"{self.lTensorName[i]} ") # 名字 98 | print("============================================ ") 99 | 100 | # cpu端数据 101 | self.bufferH = [] 102 | for i in range(self.nIO): 103 | self.bufferH.append(np.empty(self.context.get_tensor_shape(self.lTensorName[i]), 104 | dtype=trt.nptype(self.engine.get_tensor_dtype(self.lTensorName[i])))) 105 | 106 | # # gpu端数据申请显存 107 | self.bufferD = [] 108 | for i in range(self.nIO): 109 | self.bufferD.append(cudart.cudaMalloc(self.bufferH[i].nbytes)[1]) 110 | 111 | def infer(self, origin_img): 112 | data, ratio = preproc(origin_img, (640, 640)) 113 | # cpu端数据 114 | self.bufferH = [] 115 | self.bufferH.append(data) # 输入数据转内存连续 116 | # self.bufferH.append(np.ascontiguousarray(data)) # 输入数据转内存连续 117 | 118 | for i in range(self.nInput, self.nIO): # 输出数据 119 | self.bufferH.append(np.empty(self.context.get_tensor_shape(self.lTensorName[i]), 120 | dtype=trt.nptype(self.engine.get_tensor_dtype(self.lTensorName[i])))) 121 | 122 | # 输入数据复制到显存 123 | for i in range(self.nInput): 124 | cudart.cudaMemcpy(self.bufferD[i], self.bufferH[i].ctypes.data, self.bufferH[i].nbytes, 125 | cudart.cudaMemcpyKind.cudaMemcpyHostToDevice) 126 | 127 | # # 推理 128 | self.context.execute_v2(self.bufferD) # batchsize bingings 129 | for i in range(self.nInput, self.nIO): # 数据拷会cpu 130 | cudart.cudaMemcpy(self.bufferH[i].ctypes.data, self.bufferD[i], self.bufferH[i].nbytes, 131 | cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost) 132 | 133 | dets = self.bufferH[self.nInput:self.nIO] 134 | 135 | return dets 136 | 137 | # def plot_save(self, origin_img, dets): 138 | # final_boxes, final_scores, final_cls_inds = dets[:, :4], dets[:, 4], dets[:, 5] 139 | # origin_img = vis(origin_img, final_boxes, final_scores, final_cls_inds, 140 | # 
conf=score_thr, class_names=COCO_CLASSES) 141 | # cv2.imwrite("ans.jpg", origin_img) 142 | 143 | def myfree(self): 144 | for i in self.bufferD: # 释放显存 145 | cudart.cudaFree(i) 146 | 147 | 148 | 149 | if __name__ == '__main__': 150 | origin_img = cv2.imread(img_path) 151 | 152 | yolo_trt = YoloTRT() 153 | for _ in range(50): 154 | yolo_trt.infer(origin_img) 155 | 156 | time_b = time.perf_counter() 157 | for _ in range(1000): 158 | dets = yolo_trt.infer(origin_img) 159 | 160 | time_e = time.perf_counter() 161 | print(f"cost time: {(time_e-time_b)*1000 / 1000.0 :.2f}ms") 162 | # print(dets) 163 | 164 | # yolo_trt.plot_save(origin_img, dets) 165 | 166 | yolo_trt.myfree() 167 | -------------------------------------------------------------------------------- /2_faster_tensorrt/src/kernels/cuda_kernel.cuh: -------------------------------------------------------------------------------- 1 | /** 2 | * @file cuda_kernel.cuh 3 | * @author 0zzx0 4 | * @brief 定义一些自定义的CUDA操作,主要是预处理部分和后处理部分的加速 5 | * @version 1.0 6 | * @date 2023-6-11 7 | * 8 | * @copyright Copyright (c) 2023 9 | * 10 | */ 11 | 12 | #ifndef CUDA_KERNEL_CUH 13 | #define CUDA_KERNEL_CUH 14 | 15 | #include 16 | 17 | #include "../base/tools.hpp" 18 | 19 | namespace FasterTRT { 20 | 21 | #define GPU_BLOCK_THREADS 512 // gpu 每个block线程数量 22 | const int NUM_BOX_ELEMENT = 7; // left, top, right, bottom, confidence, class, keepflag 23 | 24 | // 用于插值计算的常量和函数 25 | #define INTER_RESIZE_COEF_BITS 11 26 | #define INTER_RESIZE_COEF_SCALE (1 << INTER_RESIZE_COEF_BITS) 27 | #define CAST_BITS (INTER_RESIZE_COEF_BITS << 1) 28 | 29 | template 30 | static __inline__ __device__ _T limit(_T value, _T low, _T high) { 31 | return value < low ? low : (value > high ? high : value); 32 | } 33 | static __inline__ __device__ int resize_cast(int value) { 34 | return (value + (1 << (CAST_BITS - 1))) >> CAST_BITS; 35 | } 36 | 37 | // sigmoid 和 逆sigmoid 具体是否使用看模型里面输出前有没有sigmoid吧 38 | static __host__ inline float desigmoid(float y) { 39 | return -log(1.0f / y - 1.0f); 40 | } 41 | 42 | static __device__ inline float sigmoid(float x) { 43 | return 1.0f / (1.0f + exp(-x)); 44 | } 45 | 46 | static dim3 grid_dims(int numJobs); 47 | static dim3 block_dims(int numJobs); 48 | 49 | //////////////////////归一化策略///////////////// 50 | enum class NormType : int { None = 0, MeanStd = 1, AlphaBeta = 2 }; 51 | 52 | //////////////////////通道策略///////////////// 53 | enum class ChannelType : int { None = 0, SwapRB = 1 }; 54 | 55 | /* 归一化操作,可以支持均值标准差,alpha beta 以及输入图片通道部分swap RB */ 56 | struct Norm { 57 | float mean[3]; 58 | float std[3]; 59 | float alpha, beta; 60 | NormType type = NormType::None; 61 | ChannelType channel_type = ChannelType::None; 62 | 63 | // out = (x * alpha - mean) / std 64 | static Norm mean_std(const float mean[3], const float std[3], float alpha = 1 / 255.0f, 65 | ChannelType channel_type = ChannelType::None); 66 | 67 | // out = x * alpha + beta 68 | static Norm alpha_beta(float alpha, float beta = 0, 69 | ChannelType channel_type = ChannelType::None); 70 | 71 | // None 72 | static Norm None(); 73 | }; 74 | 75 | // 仿射变换 76 | static __device__ void affine_project(float* matrix, float x, float y, float* ox, float* oy); 77 | 78 | // 计算iou 79 | static __device__ float box_iou(float aleft, float atop, float aright, float abottom, float bleft, 80 | float btop, float bright, float bbottom); 81 | // nms kernel 82 | static __global__ void fast_nms_kernel(float* bboxes, int max_objects, float threshold); 83 | 84 | // yolox的解码kernel 85 | static __global__ void 
yolox_decode_kernel(float* predict, int num_bboxes, int fm_area, 86 | int num_classes, float confidence_threshold, 87 | float* invert_affine_matrix, float* parray, 88 | const float* prior_box, int max_objects); 89 | 90 | // yolox的解码 91 | void yolox_decode_kernel_invoker(float* predict, int num_bboxes, int fm_area, int num_classes, 92 | float confidence_threshold, float nms_threshold, 93 | float* invert_affine_matrix, float* parray, const float* prior_box, 94 | int max_objects, cudaStream_t stream); 95 | 96 | // yolov8的解码kernel 97 | static __global__ void yolov8_decode_kernel(float* predict, int num_bboxes, int fm_area, 98 | int num_classes, float confidence_threshold, 99 | float* invert_affine_matrix, float* parray, 100 | const float* prior_box, int max_objects); 101 | 102 | // yolov8的解码 103 | void yolov8_decode_kernel_invoker(float* predict, int num_bboxes, int fm_area, int num_classes, 104 | float confidence_threshold, float nms_threshold, 105 | float* invert_affine_matrix, float* parray, 106 | const float* prior_box, int max_objects, cudaStream_t stream); 107 | 108 | // rtdetr的解码kernel 109 | static __global__ void rtdetr_decode_kernel(float* predict, int num_bboxes, int fm_area, 110 | int num_classes, float confidence_threshold, 111 | float* invert_affine_matrix, float* parray, 112 | int max_objects, int input_size); 113 | 114 | // rtdetr的解码 115 | void rtdetr_decode_kernel_invoker(float* predict, int num_bboxes, int fm_area, int num_classes, 116 | float confidence_threshold, float* invert_affine_matrix, 117 | float* parray, int max_objects, int input_size, cudaStream_t stream); 118 | 119 | /** 120 | * @brief 通过仿射变换完成双线性插值resize并且归一化的kernel 121 | * 122 | * @param src 原始图像数据 123 | * @param src_line_size 图像长度(宽度*3) 124 | * @param src_width 原始图像宽 125 | * @param src_height 原始图像高 126 | * @param dst 目标图像数据 127 | * @param dst_width 目标图像宽 128 | * @param dst_height 目标图像高 129 | * @param const_value_st padding值 130 | * @param warp_affine_matrix_2_3 仿射变化矩阵 131 | * @param norm 归一化策略 132 | * @param edge 目标图像范围 133 | */ 134 | static __global__ void warp_affine_bilinear_and_normalize_plane_kernel( 135 | uint8_t* src, int src_line_size, int src_width, int src_height, float* dst, int dst_width, 136 | int dst_height, uint8_t const_value_st, float* warp_affine_matrix_2_3, Norm norm, int edge); 137 | 138 | /** 139 | * @brief 通过仿射变换完成双线性插值resize并且归一化 140 | * 141 | * @param src 原始图像数据 142 | * @param src_line_size 图像长度(宽度*3) 143 | * @param src_width 原始图像宽 144 | * @param src_height 原始图像高 145 | * @param dst 目标图像数据 146 | * @param dst_width 目标图像宽 147 | * @param dst_height 目标图像高 148 | * @param matrix_2_3 仿射变化矩阵 149 | * @param const_value padding值 150 | * @param norm 归一化策略 151 | * @param stream cuda stream 152 | */ 153 | void warp_affine_bilinear_and_normalize_plane(uint8_t* src, int src_line_size, int src_width, 154 | int src_height, float* dst, int dst_width, 155 | int dst_height, float* matrix_2_3, 156 | uint8_t const_value, const Norm& norm, 157 | cudaStream_t stream); 158 | 159 | static __global__ void resize_bilinear_and_normalize_kernel(uint8_t* src, int src_line_size, 160 | int src_width, int src_height, 161 | float* dst, int dst_width, 162 | int dst_height, float sx, float sy, 163 | Norm norm, int edge); 164 | 165 | void resize_bilinear_and_normalize(uint8_t* src, int src_line_size, int src_width, int src_height, 166 | float* dst, int dst_width, int dst_height, const Norm& norm, 167 | cudaStream_t stream); 168 | 169 | }; // namespace FasterTRT 170 | 171 | #endif 172 | 
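// Typical call order for the kernels declared above (a sketch only; the device buffers and the
// AffineMatrix that fills `matrix_2_3` are created in the apps/ and base/ layers, so every
// `d_*` name below is a placeholder rather than part of this header):
//
//   warp_affine_bilinear_and_normalize_plane(d_src_u8, src_w * 3, src_w, src_h,
//                                            d_net_input, net_w, net_h,
//                                            d_affine_2_3, 114, norm, stream);
//   /* ...enqueue TensorRT inference on the same stream... */
//   yolox_decode_kernel_invoker(d_predict, num_bboxes, fm_area, num_classes,
//                               conf_thr, nms_thr, d_invert_affine, d_output,
//                               d_prior_box, max_objects, stream);
//
// Each decoded candidate in d_output is written as NUM_BOX_ELEMENT floats
// (left, top, right, bottom, confidence, class, keepflag); keepflag is consumed by the NMS stage.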
-------------------------------------------------------------------------------- /2_faster_tensorrt/src/base/tools.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file tools.hpp 3 | * @author 0zzx0 4 | * @brief 一些工具函数 包括CUDA检查 输出文件保存读取等函数 全部在tools里面定义并直接实现 5 | * @version 0.1 6 | * @date 2023-6-11 2023-8-21 7 | * 8 | * @copyright Copyright (c) 2023 9 | * 10 | */ 11 | 12 | #ifndef TOOLS_HPP 13 | #define TOOLS_HPP 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | 26 | #include 27 | 28 | namespace FasterTRT { 29 | 30 | enum class LogLevel : int { Debug = 5, Verbose = 4, Info = 3, Warning = 2, Error = 1, Fatal = 0 }; 31 | 32 | #define CURRENT_DEVICE_ID -1 // 当前设备 33 | static bool check_runtime(cudaError_t e, const char* call, int line, const char* file); 34 | static const char* level_string(LogLevel level); 35 | static std::string file_name(const std::string& path, bool include_suffix); 36 | static void __log_func(const char* file, int line, LogLevel level, const char* fmt, ...); 37 | 38 | ///////////////////////TRT///////////////////////////// 39 | #define TRT_STR(v) #v 40 | #define TRT_VERSION_STRING(major, minor, patch, build) \ 41 | TRT_STR(major) "." TRT_STR(minor) "." TRT_STR(patch) "." TRT_STR(build) 42 | static const char* trt_version() { 43 | return TRT_VERSION_STRING(NV_TENSORRT_MAJOR, NV_TENSORRT_MINOR, NV_TENSORRT_PATCH, 44 | NV_TENSORRT_BUILD); 45 | } 46 | 47 | /* 修改这个level来实现修改日志输出级别 */ 48 | #define CURRENT_LOG_LEVEL LogLevel::Info 49 | #define INFOD(...) __log_func(__FILE__, __LINE__, LogLevel::Debug, __VA_ARGS__) 50 | #define INFOV(...) __log_func(__FILE__, __LINE__, LogLevel::Verbose, __VA_ARGS__) 51 | #define INFO(...) __log_func(__FILE__, __LINE__, LogLevel::Info, __VA_ARGS__) 52 | #define INFOW(...) __log_func(__FILE__, __LINE__, LogLevel::Warning, __VA_ARGS__) 53 | #define INFOE(...) __log_func(__FILE__, __LINE__, LogLevel::Error, __VA_ARGS__) 54 | #define INFOF(...) __log_func(__FILE__, __LINE__, LogLevel::Fatal, __VA_ARGS__) 55 | 56 | #define KernelPositionBlock \ 57 | int position = (blockDim.x * blockIdx.x + threadIdx.x); \ 58 | if(position >= (edge)) return; 59 | 60 | #define checkCudaRuntime(call) check_runtime(call, #call, __LINE__, __FILE__) 61 | 62 | #define checkCudaKernel(...) 
\ 63 | __VA_ARGS__; \ 64 | do { \ 65 | cudaError_t cudaStatus = cudaPeekAtLastError(); \ 66 | if(cudaStatus != cudaSuccess) { \ 67 | INFOE("launch failed: %s", cudaGetErrorString(cudaStatus)); \ 68 | } \ 69 | } while(0); 70 | 71 | #define Assert(op) \ 72 | do { \ 73 | bool cond = !(!(op)); \ 74 | if(!cond) { \ 75 | INFOF("Assert failed, " #op); \ 76 | } \ 77 | } while(false) 78 | 79 | static bool check_runtime(cudaError_t e, const char* call, int line, const char* file) { 80 | if(e != cudaSuccess) { 81 | INFOE("CUDA Runtime error %s # %s, code = %s [ %d ] in file %s:%d", call, 82 | cudaGetErrorString(e), cudaGetErrorName(e), e, file, line); 83 | return false; 84 | } 85 | return true; 86 | } 87 | 88 | static const char* level_string(LogLevel level) { 89 | switch(level) { 90 | case LogLevel::Debug: 91 | return "debug"; 92 | case LogLevel::Verbose: 93 | return "verbo"; 94 | case LogLevel::Info: 95 | return "info"; 96 | case LogLevel::Warning: 97 | return "warn"; 98 | case LogLevel::Error: 99 | return "error"; 100 | case LogLevel::Fatal: 101 | return "fatal"; 102 | default: 103 | return "unknow"; 104 | } 105 | } 106 | 107 | static std::string file_name(const std::string& path, bool include_suffix) { 108 | if(path.empty()) return ""; 109 | int p = path.rfind('/'); 110 | p += 1; 111 | 112 | // include suffix 113 | if(include_suffix) return path.substr(p); 114 | 115 | int u = path.rfind('.'); 116 | if(u == -1) return path.substr(p); 117 | 118 | if(u <= p) u = path.size(); 119 | return path.substr(p, u - p); 120 | } 121 | 122 | static void __log_func(const char* file, int line, LogLevel level, const char* fmt, ...) { 123 | if(level > CURRENT_LOG_LEVEL) return; 124 | 125 | va_list vl; 126 | va_start(vl, fmt); 127 | 128 | char buffer[2048]; 129 | std::string filename = file_name(file, true); 130 | int n = snprintf(buffer, sizeof(buffer), "[%s][%s:%d]:", level_string(level), filename.c_str(), 131 | line); 132 | vsnprintf(buffer + n, sizeof(buffer) - n, fmt, vl); 133 | 134 | fprintf(stdout, "%s\n", buffer); 135 | if(level == LogLevel::Fatal) { 136 | fflush(stdout); 137 | abort(); 138 | } 139 | } 140 | 141 | static bool exists(const std::string& path) { 142 | return access(path.c_str(), R_OK) == 0; 143 | } 144 | 145 | static bool save_file(const std::string& file, const void* data, size_t length) { 146 | FILE* f = fopen(file.c_str(), "wb"); 147 | if(!f) return false; 148 | 149 | if(data && length > 0) { 150 | if(fwrite(data, 1, length, f) != length) { 151 | fclose(f); 152 | return false; 153 | } 154 | } 155 | fclose(f); 156 | return true; 157 | } 158 | 159 | static bool save_file(const std::string& file, const std::vector& data) { 160 | return save_file(file, data.data(), data.size()); 161 | } 162 | 163 | /* 构造时设置当前gpuid,析构时修改为原来的gpuid */ 164 | class AutoDevice { 165 | public: 166 | AutoDevice(int device_id = 0) { 167 | cudaGetDevice(&old_); 168 | checkCudaRuntime(cudaSetDevice(device_id)); 169 | } 170 | 171 | virtual ~AutoDevice() { checkCudaRuntime(cudaSetDevice(old_)); } 172 | 173 | private: 174 | int old_ = -1; 175 | }; 176 | 177 | static bool check_device_id(int device_id) { 178 | int device_count = -1; 179 | checkCudaRuntime(cudaGetDeviceCount(&device_count)); 180 | if(device_id < 0 || device_id >= device_count) { 181 | INFOE("Invalid device id: %d, count = %d", device_id, device_count); 182 | return false; 183 | } 184 | return true; 185 | } 186 | 187 | static int get_device(int device_id) { 188 | if(device_id != CURRENT_DEVICE_ID) { 189 | check_device_id(device_id); 190 | return device_id; 191 | 
} 192 | checkCudaRuntime(cudaGetDevice(&device_id)); 193 | return device_id; 194 | } 195 | 196 | static std::vector load_file(const std::string& file) { 197 | std::ifstream in(file, std::ios::in | std::ios::binary); 198 | if(!in.is_open()) return {}; 199 | 200 | in.seekg(0, std::ios::end); 201 | size_t length = in.tellg(); 202 | 203 | std::vector data; 204 | if(length > 0) { 205 | in.seekg(0, std::ios::beg); 206 | data.resize(length); 207 | 208 | in.read((char*)&data[0], length); 209 | } 210 | in.close(); 211 | return data; 212 | } 213 | 214 | inline int upbound(int n, int align = 32) { 215 | return (n + align - 1) / align * align; 216 | } 217 | 218 | template 219 | static std::string join_dims(const std::vector<_T>& dims) { 220 | std::stringstream output; 221 | char buf[64]; 222 | const char* fmts[] = {"%d", " x %d"}; 223 | for(int i = 0; i < dims.size(); ++i) { 224 | snprintf(buf, sizeof(buf), fmts[i != 0], dims[i]); 225 | output << buf; 226 | } 227 | return output.str(); 228 | } 229 | 230 | // 设置推理设备 231 | static void set_device(int device_id) { 232 | if(device_id == -1) return; 233 | checkCudaRuntime(cudaSetDevice(device_id)); 234 | } 235 | 236 | }; // namespace FasterTRT 237 | 238 | #endif -------------------------------------------------------------------------------- /1_trt_base/trt_rtdetr/rtdetr_onnx.py: -------------------------------------------------------------------------------- 1 | import time 2 | import cv2 3 | import numpy as np 4 | import argparse 5 | import onnxruntime as ort 6 | from pathlib import Path 7 | from tqdm import tqdm 8 | 9 | 10 | COCO_CLASSES = ["person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", 11 | "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", 12 | "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", 13 | "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", 14 | "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", 15 | "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", 16 | "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", 17 | "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", 18 | "scissors", "teddy bear", "hair drier", "toothbrush", ] 19 | 20 | class PicoDet(): 21 | def __init__(self, 22 | model_pb_path, 23 | prob_threshold=0.5): 24 | self.classes = COCO_CLASSES 25 | self.num_classes = len(self.classes) 26 | self.prob_threshold = prob_threshold 27 | self.mean = np.array( 28 | [103.53, 116.28, 123.675], dtype=np.float32).reshape(1, 1, 3) 29 | self.std = np.array( 30 | [57.375, 57.12, 58.395], dtype=np.float32).reshape(1, 1, 3) 31 | so = ort.SessionOptions() 32 | so.log_severity_level = 3 33 | self.net = ort.InferenceSession(model_pb_path, so) 34 | inputs_name = [a.name for a in self.net.get_inputs()] 35 | inputs_shape = { 36 | k: v.shape 37 | for k, v in zip(inputs_name, self.net.get_inputs()) 38 | } 39 | self.input_shape = inputs_shape['image'][2:] 40 | 41 | def _normalize(self, img): 42 | img = img.astype(np.float32) 43 | img = (img / 255.0 - self.mean / 255.0) / (self.std / 255.0) 44 | return img 45 | 46 | def resize_image(self, srcimg, keep_ratio=False): 47 | top, left, newh, neww = 0, 0, self.input_shape[0], self.input_shape[1] 48 | origin_shape = 
srcimg.shape[:2] 49 | im_scale_y = newh / float(origin_shape[0]) 50 | im_scale_x = neww / float(origin_shape[1]) 51 | img_shape = np.array([ 52 | [float(self.input_shape[0]), float(self.input_shape[1])] 53 | ]).astype('float32') 54 | scale_factor = np.array([[im_scale_y, im_scale_x]]).astype('float32') 55 | 56 | if keep_ratio and srcimg.shape[0] != srcimg.shape[1]: 57 | hw_scale = srcimg.shape[0] / srcimg.shape[1] 58 | if hw_scale > 1: 59 | newh, neww = self.input_shape[0], int(self.input_shape[1] / 60 | hw_scale) 61 | img = cv2.resize( 62 | srcimg, (neww, newh), interpolation=cv2.INTER_AREA) 63 | left = int((self.input_shape[1] - neww) * 0.5) 64 | img = cv2.copyMakeBorder( 65 | img, 66 | 0, 67 | 0, 68 | left, 69 | self.input_shape[1] - neww - left, 70 | cv2.BORDER_CONSTANT, 71 | value=0) # add border 72 | else: 73 | newh, neww = int(self.input_shape[0] * 74 | hw_scale), self.input_shape[1] 75 | img = cv2.resize( 76 | srcimg, (neww, newh), interpolation=cv2.INTER_AREA) 77 | top = int((self.input_shape[0] - newh) * 0.5) 78 | img = cv2.copyMakeBorder( 79 | img, 80 | top, 81 | self.input_shape[0] - newh - top, 82 | 0, 83 | 0, 84 | cv2.BORDER_CONSTANT, 85 | value=0) 86 | else: 87 | img = cv2.resize( 88 | srcimg, self.input_shape, interpolation=cv2.INTER_LINEAR) 89 | 90 | return img, img_shape, scale_factor 91 | 92 | def get_color_map_list(self, num_classes): 93 | color_map = num_classes * [0, 0, 0] 94 | for i in range(0, num_classes): 95 | j = 0 96 | lab = i 97 | while lab: 98 | color_map[i * 3] |= (((lab >> 0) & 1) << (7 - j)) 99 | color_map[i * 3 + 1] |= (((lab >> 1) & 1) << (7 - j)) 100 | color_map[i * 3 + 2] |= (((lab >> 2) & 1) << (7 - j)) 101 | j += 1 102 | lab >>= 3 103 | color_map = [color_map[i:i + 3] for i in range(0, len(color_map), 3)] 104 | return color_map 105 | 106 | def detect(self, srcimg): 107 | img, im_shape, scale_factor = self.resize_image(srcimg) 108 | img = self._normalize(img) 109 | 110 | blob = np.expand_dims(np.transpose(img, (2, 0, 1)), axis=0) 111 | 112 | inputs_dict = { 113 | 'im_shape': im_shape, 114 | 'image': blob, 115 | 'scale_factor': scale_factor 116 | } 117 | inputs_name = [a.name for a in self.net.get_inputs()] 118 | net_inputs = {k: inputs_dict[k] for k in inputs_name} 119 | 120 | outs = self.net.run(None, net_inputs) 121 | 122 | outs = np.array(outs[0]) 123 | expect_boxes = (outs[:, 1] > 0.5) & (outs[:, 0] > -1) 124 | np_boxes = outs[expect_boxes, :] 125 | 126 | print(np_boxes) 127 | 128 | # color_list = self.get_color_map_list(self.num_classes) 129 | # clsid2color = {} 130 | 131 | # for i in range(np_boxes.shape[0]): 132 | # classid, conf = int(np_boxes[i, 0]), np_boxes[i, 1] 133 | # xmin, ymin, xmax, ymax = int(np_boxes[i, 2]), int(np_boxes[ 134 | # i, 3]), int(np_boxes[i, 4]), int(np_boxes[i, 5]) 135 | 136 | # if classid not in clsid2color: 137 | # clsid2color[classid] = color_list[classid] 138 | # color = tuple(clsid2color[classid]) 139 | 140 | # cv2.rectangle( 141 | # srcimg, (xmin, ymin), (xmax, ymax), color, thickness=2) 142 | # print(self.classes[classid] + ': ' + str(round(conf, 3))) 143 | # cv2.putText( 144 | # srcimg, 145 | # self.classes[classid] + ':' + str(round(conf, 3)), (xmin, 146 | # ymin - 10), 147 | # cv2.FONT_HERSHEY_SIMPLEX, 148 | # 0.8, (0, 255, 0), 149 | # thickness=2) 150 | 151 | return srcimg 152 | 153 | def detect_folder(self, img_fold, result_path): 154 | img_fold = Path(img_fold) 155 | result_path = Path(result_path) 156 | result_path.mkdir(parents=True, exist_ok=True) 157 | 158 | img_name_list = filter( 159 | lambda x: 
str(x).endswith(".png") or str(x).endswith(".jpg"), 160 | img_fold.iterdir(), ) 161 | img_name_list = list(img_name_list) 162 | print(f"find {len(img_name_list)} images") 163 | 164 | for img_path in tqdm(img_name_list): 165 | img = cv2.imread(str(img_path), 1) 166 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 167 | 168 | srcimg = net.detect(img) 169 | save_path = str(result_path / img_path.name.replace(".png", ".jpg")) 170 | cv2.imwrite(save_path, srcimg) 171 | 172 | 173 | if __name__ == '__main__': 174 | 175 | model_path = "rtdetr_r18vd_6x_coco/rtdetr_r18vd_6x_coco.onnx" 176 | img_file = "../demo/000000570688.jpg" 177 | conf = 0.5 178 | net = PicoDet(model_path, conf) 179 | 180 | img = cv2.imread(img_file) 181 | t1 = time.perf_counter() 182 | for _ in range(100): 183 | net.detect(img) 184 | t2 = time.perf_counter() 185 | print(f"time: {(t2-t1)*1000/100.0}ms") 186 | 187 | 188 | -------------------------------------------------------------------------------- /1_trt_base/trt_plugin/demo01/demo01.cu: -------------------------------------------------------------------------------- 1 | #include "demo01.h" 2 | 3 | __global__ void addScalarKernel(const float *input, float *output, const float scalar, const int nElement) 4 | { 5 | const int index = blockIdx.x * blockDim.x + threadIdx.x; 6 | if (index >= nElement) 7 | return; 8 | 9 | float _1 = input[index]; 10 | float _2 = _1 + scalar; 11 | output[index] = _2; 12 | } 13 | 14 | namespace nvinfer1 15 | { 16 | ZZX_ADDScalar::ZZX_ADDScalar(const std::string &name, float scalar): name_(name) 17 | { 18 | WHERE_AM_I(); // debug用的 19 | m_.scalar = scalar; 20 | } 21 | 22 | ZZX_ADDScalar::ZZX_ADDScalar(const std::string &name, const void *buffer, size_t length): name_(name) 23 | { 24 | WHERE_AM_I(); 25 | memcpy(&m_, buffer, sizeof(m_)); 26 | } 27 | 28 | ZZX_ADDScalar::~ZZX_ADDScalar() 29 | { 30 | WHERE_AM_I(); 31 | } 32 | 33 | // 深拷贝 34 | IPluginV2DynamicExt *ZZX_ADDScalar::clone() const noexcept 35 | { 36 | WHERE_AM_I(); 37 | auto p = new ZZX_ADDScalar(name_, &m_, sizeof(m_)); 38 | p->setPluginNamespace(namespace_.c_str()); 39 | return p; 40 | } 41 | 42 | // 获得输出数量 自定义为1 43 | int32_t ZZX_ADDScalar::getNbOutputs() const noexcept 44 | { 45 | WHERE_AM_I(); 46 | return 1; 47 | } 48 | 49 | // 获得输出数据类型 自定义为和输入一样 50 | DataType ZZX_ADDScalar::getOutputDataType(int32_t index, DataType const *inputTypes, int32_t nbInputs) const noexcept 51 | { 52 | WHERE_AM_I(); 53 | return inputTypes[0]; 54 | } 55 | 56 | // 获取输出维度 57 | DimsExprs ZZX_ADDScalar::getOutputDimensions(int32_t outputIndex, const DimsExprs *inputs, int32_t nbInputs, IExprBuilder &exprBuilder) noexcept 58 | { 59 | WHERE_AM_I(); 60 | return inputs[0]; 61 | } 62 | 63 | 64 | bool ZZX_ADDScalar::supportsFormatCombination(int32_t pos, const PluginTensorDesc *inOut, int32_t nbInputs, int32_t nbOutputs) noexcept 65 | { 66 | WHERE_AM_I(); 67 | bool res; 68 | switch (pos) 69 | { 70 | case 0: 71 | res = inOut[0].type == DataType::kFLOAT && inOut[0].format == TensorFormat::kLINEAR; 72 | break; 73 | case 1: 74 | res = inOut[1].type == inOut[0].type && inOut[1].format == inOut[0].format; 75 | break; 76 | default: // should NOT be here! 
77 | res = false; 78 | } 79 | #ifdef DEBUG 80 | std::cout << "\tpos=" << pos << ",res=" << res << "->["; 81 | for (int i = 0; i < nbInputs + nbOutputs; ++i) 82 | { 83 | std::cout << formatToString(inOut[i].format) << ","; 84 | } 85 | std::cout << "],["; 86 | for (int i = 0; i < nbInputs + nbOutputs; ++i) 87 | { 88 | std::cout << dataTypeToString(inOut[i].type) << ","; 89 | } 90 | std::cout << "]" << std::endl; 91 | #endif 92 | return res; 93 | } 94 | 95 | // 推理前调用 96 | void ZZX_ADDScalar::configurePlugin(const DynamicPluginTensorDesc *in, int32_t nbInputs, const DynamicPluginTensorDesc *out, int32_t nbOutputs) noexcept 97 | { 98 | WHERE_AM_I(); 99 | return; 100 | } 101 | 102 | // 告诉trt需要多大中间变量储存空间, 便于后续优化 103 | size_t ZZX_ADDScalar::getWorkspaceSize(const PluginTensorDesc *inputs, int32_t nbInputs, const PluginTensorDesc *outputs, int32_t nbOutputs) const noexcept 104 | { 105 | WHERE_AM_I(); 106 | return 0; 107 | } 108 | 109 | // 核心,调用核函数 不要在这里使用cudaMalloc*等函数(导致巨大的申请开销) 110 | int32_t ZZX_ADDScalar::enqueue(const PluginTensorDesc *inputDesc, const PluginTensorDesc *outputDesc, const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream) noexcept 111 | { 112 | WHERE_AM_I(); 113 | int nElement = 1; 114 | for (int i = 0; i < inputDesc[0].dims.nbDims; ++i) 115 | { 116 | nElement *= inputDesc[0].dims.d[i]; 117 | } 118 | dim3 grid(CEIL_DIVIDE(nElement, 256), 1, 1), block(256, 1, 1); 119 | addScalarKernel<<>>(reinterpret_cast(inputs[0]), reinterpret_cast(outputs[0]), m_.scalar, nElement); 120 | return 0; 121 | } 122 | 123 | // context/engine 销毁的时候调用 124 | void ZZX_ADDScalar::destroy() noexcept 125 | { 126 | WHERE_AM_I(); 127 | delete this; 128 | return; 129 | } 130 | 131 | // engine 创建时被调用,用于初始化 Plugin 132 | int32_t ZZX_ADDScalar::initialize() noexcept 133 | { 134 | WHERE_AM_I(); 135 | return 0; 136 | } 137 | 138 | // terminate (engine 销毁时被调用,用于释放 initialize 函数申请的资源 139 | void ZZX_ADDScalar::terminate() noexcept 140 | { 141 | WHERE_AM_I(); 142 | return; 143 | } 144 | 145 | // 序列化 146 | // (报告序列化需要的空间大小,单位 Byte 147 | size_t ZZX_ADDScalar::getSerializationSize() const noexcept 148 | { 149 | WHERE_AM_I(); 150 | return sizeof(m_); 151 | } 152 | 153 | // (将 Plugin 数据序列化到给定的 buffer 中) 154 | void ZZX_ADDScalar::serialize(void *buffer) const noexcept 155 | { 156 | WHERE_AM_I(); 157 | memcpy(buffer, &m_, sizeof(m_)); 158 | return; 159 | } 160 | 161 | void ZZX_ADDScalar::setPluginNamespace(const char *pluginNamespace) noexcept 162 | { 163 | WHERE_AM_I(); 164 | namespace_ = pluginNamespace; 165 | return; 166 | } 167 | 168 | const char *ZZX_ADDScalar::getPluginNamespace() const noexcept 169 | { 170 | WHERE_AM_I(); 171 | return namespace_.c_str(); 172 | } 173 | 174 | const char *ZZX_ADDScalar::getPluginType() const noexcept 175 | { 176 | WHERE_AM_I(); 177 | return PLUGIN_NAME; 178 | } 179 | 180 | const char *ZZX_ADDScalar::getPluginVersion() const noexcept 181 | { 182 | WHERE_AM_I(); 183 | return PLUGIN_VERSION; 184 | } 185 | 186 | // (申请使用 context 独占的 cudnn 或 cublas 187 | void ZZX_ADDScalar::attachToContext(cudnnContext *contextCudnn, cublasContext *contextCublas, IGpuAllocator *gpuAllocator) noexcept 188 | { 189 | WHERE_AM_I(); 190 | return; 191 | } 192 | 193 | //(销毁 context 独占的 cudnn 或 cublas 资 194 | void ZZX_ADDScalar::detachFromContext() noexcept 195 | { 196 | WHERE_AM_I(); 197 | return; 198 | } 199 | 200 | 201 | 202 | 203 | // class AddScalarPluginCreator 204 | PluginFieldCollection ZZXAddScalarPluginCreator::fc_ {}; 205 | std::vector ZZXAddScalarPluginCreator::attr_; 206 | 207 
| ZZXAddScalarPluginCreator::ZZXAddScalarPluginCreator() 208 | { 209 | WHERE_AM_I(); 210 | attr_.clear(); 211 | attr_.emplace_back(PluginField("scalar", nullptr, PluginFieldType::kFLOAT32, 1)); 212 | fc_.nbFields = attr_.size(); 213 | fc_.fields = attr_.data(); 214 | } 215 | 216 | ZZXAddScalarPluginCreator::~ZZXAddScalarPluginCreator() 217 | { 218 | WHERE_AM_I(); 219 | } 220 | 221 | // 接受权重,构造这个算子 222 | IPluginV2DynamicExt* ZZXAddScalarPluginCreator::createPlugin(const char *name, const PluginFieldCollection *fc) noexcept 223 | { 224 | WHERE_AM_I(); 225 | float scalar = 0; 226 | std::map parameterMap {{"scalar", &scalar}}; 227 | 228 | for (int i = 0; i < fc->nbFields; ++i) 229 | { 230 | if (parameterMap.find(fc->fields[i].name) != parameterMap.end()) 231 | { 232 | *parameterMap[fc->fields[i].name] = *reinterpret_cast(fc->fields[i].data); 233 | } 234 | } 235 | ZZX_ADDScalar *pObj = new ZZX_ADDScalar(name, scalar); 236 | pObj->setPluginNamespace(namespace_.c_str()); 237 | return pObj; 238 | } 239 | 240 | // 反序列化 241 | IPluginV2DynamicExt *ZZXAddScalarPluginCreator::deserializePlugin(const char *name, const void *serialData, size_t serialLength) noexcept 242 | { 243 | WHERE_AM_I(); 244 | ZZX_ADDScalar *pObj = new ZZX_ADDScalar(name, serialData, serialLength); 245 | pObj->setPluginNamespace(namespace_.c_str()); 246 | return pObj; 247 | } 248 | 249 | void ZZXAddScalarPluginCreator::setPluginNamespace(const char *pluginNamespace) noexcept 250 | { 251 | WHERE_AM_I(); 252 | namespace_ = pluginNamespace; 253 | return; 254 | } 255 | 256 | const char *ZZXAddScalarPluginCreator::getPluginNamespace() const noexcept 257 | { 258 | WHERE_AM_I(); 259 | return namespace_.c_str(); 260 | } 261 | 262 | const char *ZZXAddScalarPluginCreator::getPluginName() const noexcept 263 | { 264 | WHERE_AM_I(); 265 | return PLUGIN_NAME; 266 | } 267 | 268 | const char *ZZXAddScalarPluginCreator::getPluginVersion() const noexcept 269 | { 270 | WHERE_AM_I(); 271 | return PLUGIN_VERSION; 272 | } 273 | 274 | const PluginFieldCollection *ZZXAddScalarPluginCreator::getFieldNames() noexcept 275 | { 276 | WHERE_AM_I(); 277 | return &fc_; 278 | } 279 | 280 | REGISTER_TENSORRT_PLUGIN(ZZXAddScalarPluginCreator); 281 | 282 | } 283 | -------------------------------------------------------------------------------- /2_faster_tensorrt/src/apps/rtdetr/rtdetr.cpp: -------------------------------------------------------------------------------- 1 | #include "rtdetr.h" 2 | 3 | namespace RTDETR { 4 | 5 | RtDetrTRTInferImpl::~RtDetrTRTInferImpl() { 6 | stop(); 7 | } 8 | 9 | // 启动 但不是重写基类的startup 参数不一样 里面会去调用基类 10 | bool RtDetrTRTInferImpl::startup(const std::string& file, int gpuid, int batch_size, 11 | float confidence_threshold) { 12 | // const float mean_norm[3] = {103.53, 116.28, 123.675}; 13 | // const float std_norm[3] = {57.375, 57.12, 58.395}; 14 | // normalize_ = Norm::mean_std(mean_norm, std_norm, 1 / 255.0f, ChannelType::SwapRB); 15 | normalize_ = Norm::alpha_beta(1 / 255.0f, 0.0f, ChannelType::SwapRB); 16 | confidence_threshold_ = confidence_threshold; 17 | batch_size_ = batch_size; 18 | return ThreadSafedAsyncInferImpl::startup(std::make_tuple(file, gpuid)); 19 | } 20 | 21 | // 重写基类worker 工作线程 22 | void RtDetrTRTInferImpl::worker(std::promise& result) { 23 | std::string file = std::get<0>(start_param_); 24 | int gpuid = std::get<1>(start_param_); 25 | 26 | set_device(gpuid); 27 | auto engine = load_infer(file, batch_size_); 28 | if(engine == nullptr) { 29 | INFOE("Engine %s load failed", file.c_str()); 30 | result.set_value(false); 
31 | return; 32 | } 33 | 34 | engine->print(); 35 | 36 | const int MAX_IMAGE_BBOX = 100; 37 | const int NUM_BOX_ELEMENT = 7; // left, top, right, bottom, confidence, class, keepflag 38 | Tensor affin_matrix_device(FasterTRT::DataType::Float); 39 | Tensor output_array_device(FasterTRT::DataType::Float); 40 | 41 | // 输入输出 42 | int max_batch_size = engine->get_max_batch_size(); 43 | auto input = engine->tensor("image"); 44 | auto output = engine->tensor("output"); 45 | 46 | // decode数据 47 | int num_classes, output_num_bboxes, output_fm_area; 48 | output_num_bboxes = output->size(0) * output->size(1); 49 | output_fm_area = output->size(2); 50 | num_classes = output->size(2) - 4; 51 | 52 | // 输入 53 | input_width_ = input->size(3); 54 | input_height_ = input->size(2); 55 | tensor_allocator_ = std::make_shared>(max_batch_size * 2); 56 | stream_ = engine->get_stream(); 57 | gpu_ = gpuid; 58 | result.set_value(true); // 初始化完成 返回给startup函数结束 59 | 60 | input->resize_single_dim(0, max_batch_size).to_gpu(); 61 | affin_matrix_device.set_stream(stream_); 62 | 63 | // 这里8个值的目的是保证 8 * sizeof(float) % 32 == 0 64 | affin_matrix_device.resize(max_batch_size, 8).to_gpu(); 65 | 66 | // 输出数据 67 | output_array_device.set_stream(stream_); 68 | output_array_device.resize(max_batch_size, 1 + MAX_IMAGE_BBOX * NUM_BOX_ELEMENT).to_gpu(); 69 | 70 | auto decode_kernel_invoker = rtdetr_decode_kernel_invoker; 71 | 72 | // 循环等待&检测 73 | std::vector fetch_jobs; 74 | while(get_jobs_and_wait(fetch_jobs, max_batch_size)) { 75 | int infer_batch_size = fetch_jobs.size(); 76 | input->resize_single_dim(0, infer_batch_size); 77 | 78 | for(int ibatch = 0; ibatch < infer_batch_size; ++ibatch) { 79 | auto& job = fetch_jobs[ibatch]; 80 | auto& mono = job.mono_tensor->data(); 81 | affin_matrix_device.copy_from_gpu(affin_matrix_device.offset(ibatch), 82 | mono->get_workspace()->gpu(), 6); 83 | input->copy_from_gpu(input->offset(ibatch), mono->gpu(), mono->count()); 84 | job.mono_tensor->release(); 85 | } 86 | 87 | engine->forward(false); 88 | 89 | output_array_device.to_gpu(false); 90 | for(int ibatch = 0; ibatch < infer_batch_size; ++ibatch) { 91 | // auto& job = fetch_jobs[ibatch]; 92 | float* image_based_output = output->gpu(ibatch); 93 | float* output_array_ptr = output_array_device.gpu(ibatch); 94 | auto affine_matrix = affin_matrix_device.gpu(ibatch); 95 | checkCudaRuntime(cudaMemsetAsync(output_array_ptr, 0, sizeof(int), stream_)); 96 | decode_kernel_invoker(image_based_output, output_num_bboxes, output_fm_area, 97 | num_classes, confidence_threshold_, affine_matrix, 98 | output_array_ptr, MAX_IMAGE_BBOX, input_width_, stream_); 99 | } 100 | 101 | output_array_device.to_cpu(); 102 | for(int ibatch = 0; ibatch < infer_batch_size; ++ibatch) { 103 | float* parray = output_array_device.cpu(ibatch); 104 | int count = std::min(MAX_IMAGE_BBOX, (int)*parray); 105 | auto& job = fetch_jobs[ibatch]; 106 | auto& image_based_boxes = job.output; 107 | for(int i = 0; i < count; ++i) { 108 | float* pbox = parray + 1 + i * NUM_BOX_ELEMENT; 109 | int label = pbox[5]; 110 | int keepflag = pbox[6]; 111 | if(keepflag == 1) { 112 | image_based_boxes.emplace_back(pbox[0], pbox[1], pbox[2], pbox[3], pbox[4], 113 | label); 114 | } 115 | } 116 | job.pro->set_value(image_based_boxes); 117 | } 118 | fetch_jobs.clear(); 119 | } 120 | stream_ = nullptr; 121 | // TODO 这个流是否要考虑换个地方释放? 
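// note: stream_pro_ is the preprocessing stream created lazily in preprocess() (on the thread that calls commit); the worker only releases it here, after the job loop has exited.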
122 | checkCudaRuntime(cudaStreamDestroy(stream_pro_)); 123 | stream_pro_ = nullptr; 124 | tensor_allocator_.reset(); 125 | INFO("Engine destroy."); 126 | } 127 | 128 | // 预处理 129 | bool RtDetrTRTInferImpl::preprocess(Job& job, const cv::Mat& image) { 130 | if(tensor_allocator_ == nullptr) { 131 | INFOE("tensor_allocator_ is nullptr"); 132 | return false; 133 | } 134 | 135 | job.mono_tensor = tensor_allocator_->query(); 136 | if(job.mono_tensor == nullptr) { 137 | INFOE("Tensor allocator query failed."); 138 | return false; 139 | } 140 | 141 | if(stream_pro_ == nullptr) { 142 | checkCudaRuntime(cudaStreamCreate(&stream_pro_)); 143 | } 144 | 145 | AutoDevice auto_device(gpu_); 146 | auto& tensor = job.mono_tensor->data(); 147 | if(tensor == nullptr) { 148 | // not init 149 | tensor = std::make_shared(); 150 | tensor->set_workspace(std::make_shared()); 151 | } 152 | 153 | cv::Size input_size(input_width_, input_height_); 154 | job.additional.compute(image.size(), input_size); 155 | 156 | tensor->set_stream(stream_pro_); 157 | tensor->resize(1, 3, input_height_, input_width_); 158 | 159 | size_t size_image = image.cols * image.rows * 3; 160 | size_t size_matrix = upbound(sizeof(job.additional.d2i), 32); 161 | auto workspace = tensor->get_workspace(); 162 | uint8_t* gpu_workspace = (uint8_t*)workspace->gpu(size_matrix + size_image); 163 | float* affine_matrix_device = (float*)gpu_workspace; 164 | uint8_t* image_device = size_matrix + gpu_workspace; 165 | 166 | uint8_t* cpu_workspace = (uint8_t*)workspace->cpu(size_matrix + size_image); 167 | float* affine_matrix_host = (float*)cpu_workspace; 168 | uint8_t* image_host = size_matrix + cpu_workspace; 169 | 170 | memcpy(image_host, image.data, size_image); 171 | memcpy(affine_matrix_host, job.additional.d2i, sizeof(job.additional.d2i)); 172 | checkCudaRuntime( 173 | cudaMemcpyAsync(image_device, image_host, size_image, cudaMemcpyHostToDevice, stream_pro_)); 174 | checkCudaRuntime(cudaMemcpyAsync(affine_matrix_device, affine_matrix_host, 175 | sizeof(job.additional.d2i), cudaMemcpyHostToDevice, 176 | stream_pro_)); 177 | 178 | warp_affine_bilinear_and_normalize_plane(image_device, image.cols * 3, image.cols, image.rows, 179 | tensor->gpu(), input_width_, input_height_, 180 | affine_matrix_device, .0, normalize_, stream_pro_); 181 | // 这个地方需要同步,确保数据放到gpu后才可以吧任务提交到队列中。 182 | cudaStreamSynchronize(stream_pro_); 183 | 184 | return true; 185 | } 186 | 187 | // 提交任务 188 | std::vector> RtDetrTRTInferImpl::commits( 189 | const std::vector& images) { 190 | return ThreadSafedAsyncInferImpl::commits(images); 191 | } 192 | 193 | // 提交cv::Mat任务 194 | std::shared_future RtDetrTRTInferImpl::commit(const cv::Mat& image) { 195 | return ThreadSafedAsyncInferImpl::commit(image); 196 | } 197 | 198 | // 创建推理器 199 | std::shared_ptr create_infer(const std::string& engine_file, int gpuid, int batch_size, 200 | float confidence_threshold) { 201 | std::shared_ptr instance(new RtDetrTRTInferImpl()); 202 | if(!instance->startup(engine_file, gpuid, batch_size, confidence_threshold)) { 203 | instance.reset(); 204 | } 205 | return instance; 206 | } 207 | } // namespace RTDETR 208 | -------------------------------------------------------------------------------- /1_trt_base/trt_rtdetr/rtdetr_trt.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import time 3 | import numpy as np 4 | import tensorrt as trt 5 | from cuda import cudart 6 | 7 | model_path = "rtdetr_r18vd_6x_coco/rtdetr_r18vd_6x_coco.trt" 8 | img_path = 
"../demo/000000570688.jpg" 9 | 10 | mean = np.array( 11 | [103.53, 116.28, 123.675], dtype=np.float32).reshape(1, 1, 3) 12 | std = np.array( 13 | [57.375, 57.12, 58.395], dtype=np.float32).reshape(1, 1, 3) 14 | def normalize(img): 15 | img = img.astype(np.float32) 16 | img = (img / 255.0 - mean / 255.0) / (std / 255.0) 17 | return img 18 | 19 | def resize_image(srcimg, input_shape, keep_ratio=False): 20 | top, left, newh, neww = 0, 0, input_shape[0], input_shape[1] 21 | origin_shape = srcimg.shape[:2] 22 | im_scale_y = newh / float(origin_shape[0]) 23 | im_scale_x = neww / float(origin_shape[1]) 24 | img_shape = np.array([ 25 | [float(input_shape[0]), float(input_shape[1])] 26 | ]).astype('float32') 27 | scale_factor = np.array([[im_scale_y, im_scale_x]]).astype('float32') 28 | 29 | if keep_ratio and srcimg.shape[0] != srcimg.shape[1]: 30 | hw_scale = srcimg.shape[0] / srcimg.shape[1] 31 | if hw_scale > 1: 32 | newh, neww = input_shape[0], int(input_shape[1] / 33 | hw_scale) 34 | img = cv2.resize( 35 | srcimg, (neww, newh), interpolation=cv2.INTER_AREA) 36 | left = int((input_shape[1] - neww) * 0.5) 37 | img = cv2.copyMakeBorder( 38 | img, 39 | 0, 40 | 0, 41 | left, 42 | input_shape[1] - neww - left, 43 | cv2.BORDER_CONSTANT, 44 | value=0) # add border 45 | else: 46 | newh, neww = int(input_shape[0] * 47 | hw_scale), input_shape[1] 48 | img = cv2.resize( 49 | srcimg, (neww, newh), interpolation=cv2.INTER_AREA) 50 | top = int((input_shape[0] - newh) * 0.5) 51 | img = cv2.copyMakeBorder( 52 | img, 53 | top, 54 | input_shape[0] - newh - top, 55 | 0, 56 | 0, 57 | cv2.BORDER_CONSTANT, 58 | value=0) 59 | else: 60 | img = cv2.resize( 61 | srcimg, input_shape, interpolation=cv2.INTER_LINEAR) 62 | 63 | return img, img_shape, scale_factor 64 | 65 | 66 | 67 | 68 | class RtdetrTrt: 69 | def __init__(self) -> None: 70 | self.logger = trt.Logger(trt.Logger.WARNING) 71 | trt.init_libnvinfer_plugins( self.logger, namespace='') # 加载插件 72 | 73 | self.runtime = trt.Runtime(self.logger) 74 | 75 | with open(model_path, "rb") as f: 76 | serialized_engine = f.read() 77 | self.engine = self.runtime.deserialize_cuda_engine(serialized_engine) 78 | self.context = self.engine.create_execution_context() 79 | 80 | self.nIO = self.engine.num_io_tensors # io变量数量 81 | self.lTensorName = [self.engine.get_tensor_name(i) for i in range(self.nIO)] # 获取io变量名字 82 | self.nInput = [self.engine.get_tensor_mode(self.lTensorName[i]) for i in range(self.nIO)].count(trt.TensorIOMode.INPUT) # 输入tensor数量 83 | self.Output = [self.engine.get_tensor_mode(self.lTensorName[i]) for i in range(self.nIO)].count(trt.TensorIOMode.OUTPUT) 84 | 85 | print("===============INPUT/OUTPUT=================== ") 86 | for i in range(self.nIO): 87 | print(f"[{i}]{'Input ' if i < self.nInput else 'Output'} -> "+ 88 | f"{self.engine.get_tensor_dtype(self.lTensorName[i])} " + # 数据类型 89 | f"{self.engine.get_tensor_shape(self.lTensorName[i])} " + # engine形状 90 | f"{self.context.get_tensor_shape(self.lTensorName[i])} " + # context形状 91 | f"{self.lTensorName[i]} ") # 名字 92 | print("============================================== ") 93 | 94 | # cpu端数据 95 | self.bufferH = [] 96 | for i in range(self.nIO): 97 | self.bufferH.append(np.empty(self.context.get_tensor_shape(self.lTensorName[i]), 98 | dtype=trt.nptype(self.engine.get_tensor_dtype(self.lTensorName[i])))) 99 | 100 | # # gpu端数据申请显存 101 | self.bufferD = [] 102 | for i in range(self.nIO): 103 | self.bufferD.append(cudart.cudaMalloc(self.bufferH[i].nbytes)[1]) 104 | 105 | 106 | def infer(self, origin_img): 107 | 
108 | img, img_shape, scale_factor = resize_image(origin_img, (640, 640)) 109 | img = normalize(img) 110 | blob = np.expand_dims(np.transpose(img, (2, 0, 1)), axis=0) 111 | 112 | # cpu端数据 113 | self.bufferH[0] = np.ascontiguousarray(img_shape) 114 | self.bufferH[1] = np.ascontiguousarray(blob) 115 | self.bufferH[2] = np.ascontiguousarray(scale_factor) 116 | 117 | 118 | for i in range(self.nInput, self.nIO): # 输出数据 119 | self.bufferH[i] = np.empty(self.context.get_tensor_shape(self.lTensorName[i]), 120 | dtype=trt.nptype(self.engine.get_tensor_dtype(self.lTensorName[i]))) 121 | 122 | # 输入数据复制到显存 123 | for i in range(self.nInput): 124 | cudart.cudaMemcpy(self.bufferD[i], self.bufferH[i].ctypes.data, self.bufferH[i].nbytes, 125 | cudart.cudaMemcpyKind.cudaMemcpyHostToDevice) 126 | 127 | # # 推理 execute_async_v2 execute_v2 128 | self.context.execute_v2(self.bufferD) # batchsize bindings 129 | for i in range(self.nInput, self.nIO): # 数据拷回cpu 130 | cudart.cudaMemcpy(self.bufferH[i].ctypes.data, self.bufferD[i], self.bufferH[i].nbytes, 131 | cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost) 132 | 133 | # for i in range(self.nInput, self.nIO): 134 | # print(len(self.bufferH[i])) 135 | dets = [] 136 | for i in self.bufferH[-1]: 137 | if i[1] > 0.5: 138 | dets.append(i) 139 | return dets 140 | 141 | 142 | def infer_(self, img_shape, blob, scale_factor): 143 | # cpu端数据 144 | self.bufferH[0] = np.ascontiguousarray(img_shape) 145 | self.bufferH[1] = np.ascontiguousarray(blob) 146 | self.bufferH[2] = np.ascontiguousarray(scale_factor) 147 | 148 | 149 | for i in range(self.nInput, self.nIO): # 输出数据 150 | self.bufferH[i] = np.empty(self.context.get_tensor_shape(self.lTensorName[i]), 151 | dtype=trt.nptype(self.engine.get_tensor_dtype(self.lTensorName[i]))) 152 | 153 | # 输入数据复制到显存 154 | for i in range(self.nInput): 155 | cudart.cudaMemcpy(self.bufferD[i], self.bufferH[i].ctypes.data, self.bufferH[i].nbytes, 156 | cudart.cudaMemcpyKind.cudaMemcpyHostToDevice) 157 | 158 | # # 推理 execute_async_v2 execute_v2 159 | self.context.execute_v2(self.bufferD) # batchsize bindings 160 | for i in range(self.nInput, self.nIO): # 数据拷回cpu 161 | cudart.cudaMemcpy(self.bufferH[i].ctypes.data, self.bufferD[i], self.bufferH[i].nbytes, 162 | cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost) 163 | 164 | # for i in range(self.nInput, self.nIO): 165 | # print(len(self.bufferH[i])) 166 | dets = [] 167 | for i in self.bufferH[-1]: 168 | if i[1] > 0.5: 169 | dets.append(i) 170 | return dets 171 | 172 | 173 | def myfree(self): 174 | for i in self.bufferD: # 释放显存 175 | cudart.cudaFree(i) 176 | 177 | 178 | if __name__ == '__main__': 179 | origin_img = cv2.imread(img_path) 180 | 181 | img, img_shape, scale_factor = resize_image(origin_img, (640, 640)) 182 | img = normalize(img) 183 | blob = np.expand_dims(np.transpose(img, (2, 0, 1)), axis=0) 184 | 185 | 186 | yolo_trt = RtdetrTrt() 187 | yolo_trt.infer(origin_img) 188 | for _ in range(50): 189 | # yolo_trt.infer(origin_img) 190 | yolo_trt.infer_(img_shape, blob, scale_factor) 191 | 192 | time_b = time.perf_counter() 193 | for _ in range(1000): 194 | # dets = yolo_trt.infer(origin_img) 195 | dets = yolo_trt.infer_(img_shape, blob, scale_factor) 196 | 197 | time_e = time.perf_counter() 198 | print(f"cost time: {(time_e-time_b)*1000 / 1000.0 :.2f}ms") 199 | 200 | for i in dets: 201 | print(f"class: {i[0]:.0f}\tscore: {i[1] :.2f}\tx1: {i[2] :.0f}\ty1: {i[3] :.0f}\tx2: {i[4] :.0f}\ty2: {i[5] :.0f}") 202 | 203 | 204 | 205 | yolo_trt.myfree() 206 | 207 |
-------------------------------------------------------------------------------- /2_faster_tensorrt/src/base/memory_tensor.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file memory_tensor.hpp 3 | * @author 0zzx0 4 | * @brief 内存显存相关 5 | * @version 0.1 6 | * @date 2023-6-11 2023-8-21 7 | * 8 | * @copyright Copyright (c) 2023 9 | * 10 | */ 11 | 12 | #ifndef MEMORY_TENSOR_HPP 13 | #define MEMORY_TENSOR_HPP 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | #include "tools.hpp" 21 | 22 | namespace FasterTRT { 23 | 24 | typedef struct { 25 | unsigned short _; 26 | } float16; 27 | enum class DataType : int { Unknow = -1, Float = 0, Float16 = 1, Int32 = 2, UInt8 = 3 }; 28 | 29 | enum class DataHead : int { Init = 0, Device = 1, Host = 2 }; 30 | 31 | float float16_to_float(float16 value); 32 | float16 float_to_float16(float value); 33 | int data_type_size(DataType dt); 34 | const char* data_head_string(DataHead dh); 35 | const char* data_type_string(DataType dt); 36 | 37 | /** 38 | * @brief MixMemory: gpu/cpu内存管理 39 | 实现对gpu和cpu内存进行分配和释放 40 | cpu使用的是pinned memory,当对gpu做内存复制时,性能比较好 41 | * 42 | */ 43 | class MixMemory { 44 | public: 45 | MixMemory(int device_id = CURRENT_DEVICE_ID); 46 | MixMemory(void* cpu, size_t cpu_size, void* gpu, size_t gpu_size); 47 | virtual ~MixMemory(); 48 | void* gpu(size_t size); 49 | void* cpu(size_t size); 50 | void release_gpu(); 51 | void release_cpu(); 52 | void release_all(); 53 | 54 | inline bool owner_gpu() const { return owner_gpu_; } 55 | inline bool owner_cpu() const { return owner_cpu_; } 56 | 57 | inline size_t cpu_size() const { return cpu_size_; } 58 | inline size_t gpu_size() const { return gpu_size_; } 59 | inline int device_id() const { return device_id_; } 60 | 61 | inline void* gpu() const { return gpu_; } 62 | 63 | // Pinned Memory 64 | inline void* cpu() const { return cpu_; } 65 | 66 | void reference_data(void* cpu, size_t cpu_size, void* gpu, size_t gpu_size); 67 | 68 | private: 69 | int device_id_ = 0; 70 | 71 | void* cpu_ = nullptr; 72 | size_t cpu_size_ = 0; 73 | bool owner_cpu_ = true; 74 | 75 | void* gpu_ = nullptr; 76 | size_t gpu_size_ = 0; 77 | bool owner_gpu_ = true; 78 | }; 79 | 80 | /** 81 | * @brief Tensor类,实现张量的管理 82 | 由于NN多用张量,必须有个类进行管理才方便,实现内存自动分配,计算索引等等 83 | 如果要调试,可以执行save_to_file,储存为文件后,在python中加载并查看 84 | * 85 | */ 86 | class Tensor { 87 | public: 88 | Tensor(const Tensor& other) = delete; 89 | Tensor& operator=(const Tensor& other) = delete; 90 | 91 | explicit Tensor(DataType dtype = DataType::Float, std::shared_ptr data = nullptr, 92 | int device_id = CURRENT_DEVICE_ID); 93 | explicit Tensor(int n, int c, int h, int w, DataType dtype = DataType::Float, 94 | std::shared_ptr data = nullptr, int device_id = CURRENT_DEVICE_ID); 95 | explicit Tensor(int ndims, const int* dims, DataType dtype = DataType::Float, 96 | std::shared_ptr data = nullptr, int device_id = CURRENT_DEVICE_ID); 97 | explicit Tensor(const std::vector& dims, DataType dtype = DataType::Float, 98 | std::shared_ptr data = nullptr, int device_id = CURRENT_DEVICE_ID); 99 | virtual ~Tensor(); 100 | 101 | int numel() const; 102 | inline int ndims() const { return shape_.size(); } 103 | inline int size(int index) const { return shape_[index]; } 104 | inline int shape(int index) const { return shape_[index]; } 105 | 106 | inline int batch() const { return shape_[0]; } 107 | inline int channel() const { return shape_[1]; } 108 | inline int height() const { return shape_[2]; } 109 | inline int width() 
const { return shape_[3]; } 110 | 111 | inline DataType type() const { return dtype_; } 112 | inline const std::vector& dims() const { return shape_; } 113 | inline const std::vector& strides() const { return strides_; } 114 | inline int bytes() const { return bytes_; } 115 | inline int bytes(int start_axis) const { return count(start_axis) * element_size(); } 116 | inline int element_size() const { return data_type_size(dtype_); } 117 | inline DataHead head() const { return head_; } 118 | 119 | std::shared_ptr clone() const; 120 | Tensor& release(); 121 | Tensor& set_to(float value); 122 | bool empty() const; 123 | 124 | ///////////////偏置部分 125 | template 126 | int offset(int index, _Args... index_args) const { 127 | const int index_array[] = {index, index_args...}; 128 | return offset_array(sizeof...(index_args) + 1, index_array); 129 | } 130 | 131 | int offset_array(const std::vector& index) const; 132 | int offset_array(size_t size, const int* index_array) const; 133 | 134 | ////////////////resize部分 135 | template 136 | Tensor& resize(int dim_size, _Args... dim_size_args) { 137 | const int dim_size_array[] = {dim_size, dim_size_args...}; 138 | return resize(sizeof...(dim_size_args) + 1, dim_size_array); 139 | } 140 | 141 | Tensor& resize(int ndims, const int* dims); 142 | Tensor& resize(const std::vector& dims); 143 | Tensor& resize_single_dim(int idim, int size); 144 | int count(int start_axis = 0) const; 145 | int device() const { return device_id_; } 146 | 147 | ////////////////////数据操作部分 148 | Tensor& to_gpu(bool copy = true); 149 | Tensor& to_cpu(bool copy = true); 150 | 151 | Tensor& to_half(); 152 | Tensor& to_float(); 153 | 154 | inline void* cpu() const { 155 | ((Tensor*)this)->to_cpu(); 156 | return data_->cpu(); 157 | } 158 | inline void* gpu() const { 159 | ((Tensor*)this)->to_gpu(); 160 | return data_->gpu(); 161 | } 162 | 163 | template 164 | inline const DType* cpu() const { 165 | return (DType*)cpu(); 166 | } 167 | template 168 | inline DType* cpu() { 169 | return (DType*)cpu(); 170 | } 171 | 172 | template 173 | inline DType* cpu(int i, _Args&&... args) { 174 | return cpu() + offset(i, args...); 175 | } 176 | 177 | template 178 | inline const DType* gpu() const { 179 | return (DType*)gpu(); 180 | } 181 | template 182 | inline DType* gpu() { 183 | return (DType*)gpu(); 184 | } 185 | 186 | template 187 | inline DType* gpu(int i, _Args&&... args) { 188 | return gpu() + offset(i, args...); 189 | } 190 | 191 | template 192 | inline DType& at(int i, _Args&&... 
args) { 193 | return *(cpu() + offset(i, args...)); 194 | } 195 | 196 | std::shared_ptr get_data() const { return data_; } 197 | std::shared_ptr get_workspace() const { return workspace_; } 198 | Tensor& set_workspace(std::shared_ptr workspace) { 199 | workspace_ = workspace; 200 | return *this; 201 | } 202 | 203 | bool is_stream_owner() const { return stream_owner_; } 204 | cudaStream_t get_stream() const { return stream_; } 205 | Tensor& set_stream(cudaStream_t stream, bool owner = false) { 206 | stream_ = stream; 207 | stream_owner_ = owner; 208 | return *this; 209 | } 210 | 211 | Tensor& set_mat(int n, const cv::Mat& image); 212 | Tensor& set_norm_mat(int n, const cv::Mat& image, float mean[3], float std[3]); 213 | cv::Mat at_mat(int n = 0, int c = 0) { 214 | return cv::Mat(height(), width(), CV_32F, cpu(n, c)); 215 | } 216 | 217 | Tensor& synchronize(); 218 | const char* shape_string() const { return shape_string_; } 219 | const char* descriptor() const; 220 | 221 | Tensor& copy_from_gpu(size_t offset, const void* src, size_t num_element, 222 | int device_id = CURRENT_DEVICE_ID); 223 | Tensor& copy_from_cpu(size_t offset, const void* src, size_t num_element); 224 | 225 | void reference_data(const std::vector& shape, void* cpu_data, size_t cpu_size, 226 | void* gpu_data, size_t gpu_size, DataType dtype); 227 | /** 228 | 229 | # 以下代码是python中加载Tensor 230 | import numpy as np 231 | 232 | def load_tensor(file): 233 | 234 | with open(file, "rb") as f: 235 | binary_data = f.read() 236 | 237 | magic_number, ndims, dtype = np.frombuffer(binary_data, np.uint32, count=3, offset=0) 238 | assert magic_number == 0xFCCFE2E2, f"{file} not a tensor file." 239 | 240 | dims = np.frombuffer(binary_data, np.uint32, count=ndims, offset=3 * 4) 241 | 242 | if dtype == 0: 243 | np_dtype = np.float32 244 | elif dtype == 1: 245 | np_dtype = np.float16 246 | else: 247 | assert False, f"Unsupport dtype = {dtype}, can not convert to numpy dtype" 248 | 249 | return np.frombuffer(binary_data, np_dtype, offset=(ndims + 3) * 4).reshape(*dims) 250 | 251 | **/ 252 | bool save_to_file(const std::string& file) const; 253 | bool load_from_file(const std::string& file); 254 | 255 | private: 256 | Tensor& compute_shape_string(); 257 | Tensor& adajust_memory_by_update_dims_or_type(); 258 | void setup_data(std::shared_ptr data); 259 | 260 | private: 261 | std::vector shape_; 262 | std::vector strides_; 263 | size_t bytes_ = 0; 264 | DataHead head_ = DataHead::Init; 265 | DataType dtype_ = DataType::Float; 266 | cudaStream_t stream_ = nullptr; 267 | int device_id_ = 0; 268 | char shape_string_[100]; 269 | char descriptor_string_[100]; 270 | std::shared_ptr data_; 271 | std::shared_ptr workspace_; 272 | 273 | bool stream_owner_ = false; 274 | }; 275 | 276 | }; // namespace FasterTRT 277 | 278 | #endif -------------------------------------------------------------------------------- /2_faster_tensorrt/src/base/infer_base.cpp: -------------------------------------------------------------------------------- 1 | #include "infer_base.hpp" 2 | 3 | namespace FasterTRT { 4 | /////////////////////////////////////////////////////////////////////// 5 | /////////////////////////// TRTInferImpl ////////////////////////////// 6 | /////////////////////////////////////////////////////////////////////// 7 | 8 | TRTInferImpl::~TRTInferImpl() { 9 | destroy(); 10 | } 11 | 12 | // 销毁对象(析构默认调用) 13 | void TRTInferImpl::destroy() { 14 | int old_device = 0; 15 | checkCudaRuntime(cudaGetDevice(&old_device)); 16 | checkCudaRuntime(cudaSetDevice(device_)); 
17 | this->context_.reset(); 18 | this->blobsNameMapper_.clear(); 19 | this->outputs_.clear(); 20 | this->inputs_.clear(); 21 | this->inputs_name_.clear(); 22 | this->outputs_name_.clear(); 23 | checkCudaRuntime(cudaSetDevice(old_device)); 24 | } 25 | 26 | // 打印信息 输入输出信息 27 | void TRTInferImpl::print() { 28 | if(!context_) { 29 | INFOW("Infer print, nullptr."); 30 | return; 31 | } 32 | 33 | INFO("Infer %p I/O detail", this); 34 | INFO("\tMax Batch Size: %d", this->get_max_batch_size()); 35 | INFO("\tInputs count: %d", inputs_.size()); 36 | for(int i = 0; i < inputs_.size(); ++i) { 37 | INFO("\t\t%d.%s : shape {%s}", i, inputs_name_[i].c_str(), inputs_[i]->shape_string()); 38 | } 39 | 40 | INFO("\tOutputs count: %d", outputs_.size()); 41 | for(int i = 0; i < outputs_.size(); ++i) { 42 | INFO("\t\t%d.%s : shape {%s}", i, outputs_name_[i].c_str(), outputs_[i]->shape_string()); 43 | } 44 | } 45 | 46 | // 序列化engine 47 | std::shared_ptr> TRTInferImpl::serial_engine() { 48 | auto memory = this->context_->engine_->serialize(); 49 | auto output = std::make_shared>((uint8_t*)memory->data(), 50 | (uint8_t*)memory->data() + memory->size()); 51 | memory->destroy(); 52 | return output; 53 | } 54 | 55 | // 从内存加载 56 | bool TRTInferImpl::load_from_memory(const void* pdata, size_t size) { 57 | if(pdata == nullptr || size == 0) return false; 58 | 59 | context_.reset(new EngineContext()); 60 | 61 | // build model 62 | if(!context_->build_model(pdata, size)) { 63 | context_.reset(); 64 | return false; 65 | } 66 | 67 | workspace_.reset(new MixMemory()); 68 | cudaGetDevice(&device_); 69 | build_engine_input_and_outputs_mapper(); 70 | return true; 71 | } 72 | 73 | // 从文件加载 74 | bool TRTInferImpl::load(const std::string& file, int batch_size) { 75 | auto data = load_file(file); 76 | if(data.empty()) return false; 77 | 78 | context_.reset(new EngineContext()); 79 | 80 | // build model 81 | if(!context_->build_model(data.data(), data.size())) { 82 | context_.reset(); 83 | return false; 84 | } 85 | batch_max_size_ = batch_size; 86 | 87 | workspace_.reset(new MixMemory()); 88 | cudaGetDevice(&device_); 89 | build_engine_input_and_outputs_mapper(); 90 | return true; 91 | } 92 | 93 | // 获取设备的内存大小 94 | size_t TRTInferImpl::get_device_memory_size() { 95 | EngineContext* context = (EngineContext*)this->context_.get(); 96 | return context->context_->getEngine().getDeviceMemorySize(); 97 | } 98 | 99 | // 获取输入输出等信息 100 | void TRTInferImpl::build_engine_input_and_outputs_mapper() { 101 | EngineContext* context = (EngineContext*)this->context_.get(); 102 | int nbBindings = context->engine_->getNbBindings(); 103 | // int max_batchsize = context->engine_->getMaxBatchSize(); 104 | int max_batchsize = batch_max_size_; 105 | 106 | inputs_.clear(); 107 | inputs_name_.clear(); 108 | outputs_.clear(); 109 | outputs_name_.clear(); 110 | orderdBlobs_.clear(); 111 | bindingsPtr_.clear(); 112 | blobsNameMapper_.clear(); 113 | for(int i = 0; i < nbBindings; ++i) { 114 | auto dims = context->engine_->getBindingDimensions(i); 115 | auto type = context->engine_->getBindingDataType(i); 116 | const char* bindingName = context->engine_->getBindingName(i); 117 | dims.d[0] = max_batchsize; 118 | auto newTensor = std::make_shared(dims.nbDims, dims.d); 119 | newTensor->set_stream(this->context_->stream_); 120 | newTensor->set_workspace(this->workspace_); 121 | if(context->engine_->bindingIsInput(i)) { 122 | // if is input 123 | inputs_.push_back(newTensor); 124 | inputs_name_.push_back(bindingName); 125 | 
inputs_map_to_ordered_index_.push_back(orderdBlobs_.size()); 126 | } else { 127 | // if is output 128 | outputs_.push_back(newTensor); 129 | outputs_name_.push_back(bindingName); 130 | outputs_map_to_ordered_index_.push_back(orderdBlobs_.size()); 131 | } 132 | blobsNameMapper_[bindingName] = i; 133 | orderdBlobs_.push_back(newTensor); 134 | } 135 | bindingsPtr_.resize(orderdBlobs_.size()); 136 | } 137 | 138 | // 数据和推理引擎设置cuda流 139 | void TRTInferImpl::set_stream(cudaStream_t stream) { 140 | this->context_->set_stream(stream); 141 | 142 | for(auto& t : orderdBlobs_) t->set_stream(stream); 143 | } 144 | 145 | // 获取当前cuda流 146 | cudaStream_t TRTInferImpl::get_stream() { 147 | return this->context_->stream_; 148 | } 149 | 150 | // 获取当前设备 151 | int TRTInferImpl::device() { 152 | return device_; 153 | } 154 | 155 | // 等待同步 156 | void TRTInferImpl::synchronize() { 157 | checkCudaRuntime(cudaStreamSynchronize(context_->stream_)); 158 | } 159 | 160 | // 判断是否属于输出 161 | bool TRTInferImpl::is_output_name(const std::string& name) { 162 | return std::find(outputs_name_.begin(), outputs_name_.end(), name) != outputs_name_.end(); 163 | } 164 | 165 | // 判断是否属于输入 166 | bool TRTInferImpl::is_input_name(const std::string& name) { 167 | return std::find(inputs_name_.begin(), inputs_name_.end(), name) != inputs_name_.end(); 168 | } 169 | 170 | // 推理 171 | void TRTInferImpl::forward(bool sync) { 172 | EngineContext* context = (EngineContext*)context_.get(); 173 | int inputBatchSize = inputs_[0]->size(0); 174 | for(int i = 0; i < context->engine_->getNbBindings(); ++i) { 175 | auto dims = context->engine_->getBindingDimensions(i); 176 | auto type = context->engine_->getBindingDataType(i); 177 | dims.d[0] = inputBatchSize; 178 | if(context->engine_->bindingIsInput(i)) { 179 | context->context_->setBindingDimensions(i, dims); 180 | } 181 | } 182 | 183 | for(int i = 0; i < outputs_.size(); ++i) { 184 | outputs_[i]->resize_single_dim(0, inputBatchSize); 185 | outputs_[i]->to_gpu(false); 186 | } 187 | 188 | for(int i = 0; i < orderdBlobs_.size(); ++i) bindingsPtr_[i] = orderdBlobs_[i]->gpu(); 189 | 190 | void** bindingsptr = bindingsPtr_.data(); 191 | // bool execute_result = context->context_->enqueue(inputBatchSize, bindingsptr, 192 | // context->stream_, nullptr); 193 | bool execute_result = context->context_->enqueueV2(bindingsptr, context->stream_, nullptr); 194 | if(!execute_result) { 195 | auto code = cudaGetLastError(); 196 | INFOF("execute fail, code %d[%s], message %s", code, cudaGetErrorName(code), 197 | cudaGetErrorString(code)); 198 | } 199 | 200 | if(sync) { 201 | synchronize(); 202 | } 203 | } 204 | 205 | // 获取workspace_(这是一个内存管理类的指针) 206 | std::shared_ptr TRTInferImpl::get_workspace() { 207 | return workspace_; 208 | } 209 | 210 | // 返回输入数量 211 | int TRTInferImpl::num_input() { 212 | return this->inputs_.size(); 213 | } 214 | 215 | // 返回输出数量 216 | int TRTInferImpl::num_output() { 217 | return this->outputs_.size(); 218 | } 219 | 220 | // 设置第index的输入 221 | void TRTInferImpl::set_input(int index, std::shared_ptr tensor) { 222 | Assert(index >= 0 && index < inputs_.size()); 223 | this->inputs_[index] = tensor; 224 | 225 | int order_index = inputs_map_to_ordered_index_[index]; 226 | this->orderdBlobs_[order_index] = tensor; 227 | } 228 | 229 | // 设置第index的输出 230 | void TRTInferImpl::set_output(int index, std::shared_ptr tensor) { 231 | Assert(index >= 0 && index < outputs_.size()); 232 | this->outputs_[index] = tensor; 233 | 234 | int order_index = outputs_map_to_ordered_index_[index]; 235 | 
this->orderdBlobs_[order_index] = tensor; 236 | } 237 | 238 | // 返回第index输入tensor 239 | std::shared_ptr TRTInferImpl::input(int index) { 240 | Assert(index >= 0 && index < inputs_name_.size()); 241 | return this->inputs_[index]; 242 | } 243 | 244 | // 返回第index输入tensor名字 245 | std::string TRTInferImpl::get_input_name(int index) { 246 | Assert(index >= 0 && index < inputs_name_.size()); 247 | return inputs_name_[index]; 248 | } 249 | 250 | // 返回第index输输出tensor 251 | std::shared_ptr TRTInferImpl::output(int index) { 252 | Assert(index >= 0 && index < outputs_.size()); 253 | return outputs_[index]; 254 | } 255 | 256 | // 返回第index输出tensor名字 257 | std::string TRTInferImpl::get_output_name(int index) { 258 | Assert(index >= 0 && index < outputs_name_.size()); 259 | return outputs_name_[index]; 260 | } 261 | 262 | // 获取最大batchsize 263 | int TRTInferImpl::get_max_batch_size() { 264 | Assert(this->context_ != nullptr); 265 | // return this->context_->engine_->getMaxBatchSize(); 266 | return batch_max_size_; 267 | } 268 | 269 | // 根据名字查找tensor 270 | std::shared_ptr TRTInferImpl::tensor(const std::string& name) { 271 | Assert(this->blobsNameMapper_.find(name) != this->blobsNameMapper_.end()); 272 | return orderdBlobs_[blobsNameMapper_[name]]; 273 | } 274 | 275 | /////////////////////////////////////////////////////////////////////// 276 | /////////////////////////加载文件初始化对象 ///////////////////////////// 277 | /////////////////////////////////////////////////////////////////////// 278 | std::shared_ptr load_infer(const std::string& file, int batch_size) { 279 | std::shared_ptr infer(new TRTInferImpl()); 280 | if(!infer->load(file, batch_size)) infer.reset(); 281 | return infer; 282 | } 283 | 284 | }; // namespace FasterTRT 285 | -------------------------------------------------------------------------------- /2_faster_tensorrt/readme.md: -------------------------------------------------------------------------------- 1 | # Faster tensorrt 2 | 3 | ## 前言 4 | 5 | 使用之前你应该已经了解trt的构建和推理流程,所以此处不再涉及基础使用。你应该修改的最少有 6 | ```txt 7 | 1. CMakeLists.txt中的cuda、cudnn、tensorrt环境路径 8 | 2. main.cpp中的测试推理图片/视频的路径、trt二进制文件路径,推理类别等 9 | 3. 预处理和后处理也要根据实际使用模型修改,本文代码以yolox为例 10 | ``` 11 | 12 | 原始的TensorRT_Pro有十分优秀的性能,并且接口的设计也很巧妙。但是我在复现和使用的时候发现部分可能不太适用于我当前使用的机器人。 13 | 1. 它的加速是在将需要推理的所有图像全部commit, 然后它内部每个batch的加载和推理。但是在单目机器人上往往是视频流输入,此时是一般是不能输入batch数据的,所以此时实际上是不会比直接推理快多少。 14 | 15 | 2. 图像commit后的结果等待没有任务队列管理 16 | 17 | 3. 它用的是自写的CUDA NMS,但是实际上TensorRT8上有很多官方的NMS插件,可以替换。两者的实际效果对比待测试。 18 | 19 | 20 | ## 1. 文件说明 21 | 22 | 我在大多数地方都已经加了中文注释,应该能够容易看懂。当然注释可能也会有写错或者理解错误啥的,还是需要有自己的思考的,也欢迎一起交流。在`src`目录下一共有五大部分,分别是`apps`,`base`,`eval`,`kernels`,`onnx_model`,`main.cpp`。 23 | 24 | ### 1.1 base 25 | 26 | 这里主要是一些整体框架的基础,根据仔细需要求修改。 27 | 28 | 1. `tools.hpp`: 一些工具函数 包括log日志打印,CUDA检查,输出文件保存读取等定义并直接实现 29 | 2. `memory_tensor.hpp`: 定义`MixMemory`实现内存和显存的申请和释放;定义`Tensor`实现张量的管理、扩容、拷贝等 30 | 3. `memory_tensor.cpp`: `MixMemory`和`Tensor`的实现 31 | 4. `monopoly_accocator.hpp`: 定义内存独占管理分配器,最终实现预处理和推理并行的重要工具 32 | 5. `infer_base.hpp`: 定义trt引擎管理类和异步安全推理类 33 | 6. `infer_base.cpp`: trt引擎管理类和异步安全推理类的实现 34 | 7. `trt_base.hpp`: 定义trt引擎构建和量化 35 | 8. `trt_base.hpp`: trt引擎构建和量化实现 36 | 37 | ### 1.2 kernels 38 | 39 | 推荐把cuda相关实现放在此文件夹中。 40 | 41 | 1. `cuda_kernel.cuh`: cuda核函数的定义 42 | 1. `cuda_kernel.cu`: cuda核函数的实现,预处理和后处理相关的cuda加速代码 43 | 44 | 45 | ### 1.3 eval 46 | 47 | 这里一个评估相关代码,可以测试相关数据集(coco格式)使用trt推理的map,暂时主要针对目标检测。 48 | 49 | 1. `save.hpp`: 一个保存检测结果到文件里的类 50 | 2. `get_imgid_txt.py`: 读取`eval_results.json`,来保存图片name和id到文件`img_id.txt` 51 | 3. 
`eval.cpp`: 读取`img_id.txt`中的图片,进行推理,并保存相应结果到`results.txt` 52 | 4. `img_id.txt`: img的id和img的name的对应,便于评估 53 | 5. `results.txt`: 检测的结果 54 | 6. `eval_results.json`: 检测结果保存到json文件 55 | 7. `eval.py`: 最终的评估程序,打印结果 56 | 57 | 58 | ### 1.4 apps 59 | 60 | 这里是实际模型的实现地方,定义模型的结构,推理过程,预处理和后处理流程等,推荐每个模型新建一个文件夹实现。 61 | 62 | 1. `common.hpp`: 一些视觉任务中都会用到的功能挡在这里,比如bbox定义、图片仿射变换的计算等 63 | 64 | 然后就一些具体模型的实现了 65 | 2. `yolo/yolo.h`: 定义yolo的推理 66 | 3. `yolo/yolo.cpp`: yolo推理的实现 67 | 68 | ### 1.5 onnx_model 69 | 70 | 有一些模型的onnx文件需要一些操作才能被本仓库正确检测,这个地方存放编辑onnx的python文件。 71 | 72 | ### 1.6 main 73 | 74 | 1. `main.cpp`: 主函数,调用和实现功能都在此处,动态的控制队列也是在此处实际推理中实现。 75 | 76 | 77 | ## 2. 使用教程 78 | 79 | ### 2.1 模型转换 80 | 81 | #### 2.1.1 trtexec 82 | 模型转换部分,在不需要增加自定义算子的时候,想要导出tensorrt的engine,**trtexec is all you need!** 83 | 84 | ```shell 85 | # 构建模型时 86 | trtexec 87 | --onnx = ./model NCHW.onnx # 指定onnx模型文件名 88 | # --output=y:0 # 指定输出张量名(使用 Onnx 时该选项无效) 89 | --minShapes =x:0:1x1x28x28 90 | --optShapes =x:0:4x1x28x28 91 | --maxShapes =x:0:16x1x28x28 # 指定输入形状的范围最小值、最常见值、最大值 92 | --workspace = 1024 # 以后要用 memPoolSize 优化过程可使用显存最大值 93 | --fp16 # 指定引擎精度和稀疏性等属性 int8 noTF32 best sparsity 94 | --saveEngine=model.plan # 指定输出引擎文件名 95 | --skipInference # 只创建引擎不运行 旧版本叫buildonly 96 | --verbose # 打印详细日志 97 | --timingCacheFile=timing.cache # 指定输出优化计时缓存文件名 98 | --profilingVerbosity =detailed # 构建期保留更多的逐层信息 99 | --dumpLayerInfo # 打印层信息 100 | --exportLayerInfo=layerInfo.txt # 导出引擎逐层信息,可与 profilingVerbosity 合用 101 | 102 | # 模型推理时 103 | trtexec 104 | --loadEngine=model.plan # 读取 engine 文件 105 | --shapes=x:1x1x28x28 # 指定输入张量形状 106 | --warmUp=1000 # 热身阶段最短运行时间(单位: ms 107 | --duration=10 # 测试阶段最短运行时间(单位: s 108 | --iterations=100 # 指定测试阶段运行的最小迭代次数 109 | --useCudaGraph # 使用 CUDAGraph 来捕获和执行推理过程 110 | --noDataTransfers # 关闭 Host 和 Device 之间的数据传输 111 | --streams=2 # 使用多个 stream 来运行推理 112 | --threads # 使用多线程 113 | --verbose # 打印详细日志 114 | --dumpProfile 115 | --exportProfile=layerProfile.txt # 保存逐层性能数据信息 116 | ``` 117 | 118 | 119 | #### 2.1.2 polygraphy 120 | 很牛的工具! 121 | 122 | polygraphy工具,可以多后端运行对比,对比不同后端结果,生成engine等(重要),还可以判断那些算子不能被trt加速,并把这些切割出来 123 | Build TensorRT engine using the ONNX file, and compare the output of each layer between Onnxruntime and TensorRT 124 | ```shell 125 | polygraphy run model.onnx \ 126 | --onnxrt --trt \ 127 | --workspace 1000000000 \ 128 | --save-engine=model-FP32-MarkAll.plan \ 129 | --atol 1e-3 --rtol 1e-3 \ 130 | --verbose \ 131 | --onnx-outputs mark all \ 132 | --trt-outputs mark all \ 133 | --trt-min-shapes 'tensor-0:[1,1,28,28]' \ 134 | --trt-opt-shapes 'tensor-0:[4,1,28,28]' \ 135 | --trt-max-shapes 'tensor-0:[16,1,28,28]' \ 136 | --input-shapes 'tensor-0:[4,1,28,28]' 137 | > result-run-FP32-MarkAll.log 2>&1 138 | 139 | ``` 140 | 141 | #### 2.1.3 trt api 142 | 除此之外,tensorrt_pro中也给出了一个complie的模型转换接口,我也搬运了过来 143 | ```cpp 144 | 145 | bool compile( 146 | Mode mode, 147 | YoloType type, 148 | unsigned int max_batch_size, 149 | const string& source_onnx_file, 150 | const string& save_engine_file, 151 | size_t max_workspace_size = 1<<30, 152 | const string& int8_images_folder="", 153 | const string& int8_entropy_calibrator_cache_file="" 154 | ); 155 | ``` 156 | 157 | 158 | ### 2.2 模型推理 159 | 160 | 目前已经仓库里支持了 161 | 1. yolox: 基本是官方默认的吧,我把fcous换成了conv 162 | 2. yolov8:yolov8导出的onnx模型需要经过编辑,主要是输出增加一个维度调整,方便和yolox的一起处理。可以参考代码[v8onnx_tranpose.py](./src/onnx_model/v8onnx_tranpose.py) 163 | 3. rtdetr:百度家出的检测器,导出可以参考[rtdetr_sim_export_trt.py](./src/onnx_model/rtdetr_sim_export_trt.py) 164 | ... 
165 | 166 | 167 | 本仓库就突出一个接口简单。 168 | 169 | ```cpp 170 | // 创建模型 171 | auto yolo = YOLO::create_infer(model_file, type, deviceid, batch_size, confidence_threshold, nms_threshold); 172 | 173 | // 推理图片 174 | auto objs = yolo->commit(image); 175 | 176 | // 得到结果 177 | auto res = objs.get(); 178 | 179 | ``` 180 | 控制队列形式 181 | ```cpp 182 | 183 | queue> out_queue; 184 | 185 | for(int i=0;i<10;i++) { 186 | auto objs = yolo->commit(image); 187 | out_queue.emplace(objs); 188 | if(out_queue.size() < keep_queue_long) { 189 | continue; 190 | } 191 | auto res = out_queue.front().get(); 192 | out_queue.pop(); 193 | } 194 | while(!out_queue.empty()) { 195 | auto res = out_queue.front().get(); 196 | out_queue.pop(); 197 | } 198 | ``` 199 | 200 | ### 2.3 模型测评 201 | 202 | 使用c++的推理结果来实现coco格式的eval格式,进而便于对比加速前后精度的变化。稍微有点麻烦,整体思想是保存c++的推理结果,然后用python的pycocotools来实现结果的计算。 203 | 204 | 首先运行`eval/get_imgid_txt.py`,得到`img_id.txt`文件,包含了图片名称和图片id的对应 205 | ``` 206 | 0 005894.jpg 207 | 1 004755.jpg 208 | ``` 209 | 210 | 然后默认cmake会编译eval文件夹的内容,当需要模型评测时,运行`build/eval`可以得到`results.txt`,包含推理结果 211 | ``` 212 | 005894.jpg 0 0 0.836939 1175 609 229 181 213 | 005894.jpg 0 1 0.768631 2468 1880 99 162 214 | 005894.jpg 0 2 0.70347 1938 607 216 141 215 | 005894.jpg 0 2 0.781555 944 1442 163 203 216 | 004755.jpg 1 1 0.557236 622 361 59 45 217 | 004755.jpg 1 1 0.676005 383 79 64 44 218 | ``` 219 | 最后运行`eval/eval.py`,得到最终的coco格式的map 220 | ``` 221 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.447 222 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.751 223 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.486 224 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.175 225 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.404 226 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.503 227 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.166 228 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.465 229 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.507 230 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.244 231 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.471 232 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.558 233 | ``` 234 | 235 | 236 | ### 2.4 自定义模型 237 | 238 | #### 2.4.1 新建文件夹(bushi) 239 | 240 | 建议在`src/apps`目录下新建一个文件夹,此处以暂未出现的`YoloDetr`称呼。创建相应的头文件和源文件`yolo_detr.h`,`yolo_detr.cpp`,相关的预处理和后处理之类的肯定是要和训练期间的设置保持一致的,不过既然叫`***detr`了明显是不需要后处理的😏。预处理的话建议在gpu上完成,可以在`src/kernels`文件夹中新创建你需要的预处理,或者找之前满足你要求的已经实现的预处理。 241 | 242 | #### 2.5.2 实现 243 | 244 | 首先包含相关头文件,并使用命名空间 245 | ```cpp 246 | #include "../common.hpp" 247 | using namespace FasterTRT; 248 | ``` 249 | 然后新建一个推理类,并实现相关方法。 250 | ```cpp 251 | // 线程安全模板类设置模板类型 252 | using ThreadSafedAsyncInferImpl = ThreadSafedAsyncInfer 253 | < 254 | cv::Mat, // input 255 | BoxArray, // output 256 | tuple, // start param 257 | AffineMatrix // additional 258 | >; 259 | // 推理基类 260 | using Infer = InferBase; 261 | 262 | // 创建该模型的类 263 | class YoloDETR : public Infer, public ThreadSafedAsyncInferImpl { 264 | 265 | // 最少实现 266 | // 1. 初始化 里面需要调用ThreadSafedAsyncInferImpl::startup(make_tuple(file, gpuid)); 267 | virtual bool startup(const string& file, YoloType type, int gpuid, int batch_size, float confidence_threshold, float nms_threshold); 268 | 269 | // 2. 
工作线程 里面指定输入输出 并初始化内存显存,指定推理顺序等等 270 | virtual void worker(promise& result) override; 271 | 272 | // 3. 预处理操作 273 | virtual bool preprocess(Job& job, const Mat& image) override; 274 | 275 | // 4. 推理,包括组推理和单个推理 276 | virtual vector> commits(const vector& images) override; 277 | virtual shared_future commit(const Mat& image) override; 278 | 279 | } 280 | 281 | // 封装接口,最终暴露给用户的只有commit和commits方法。 282 | // 当然也可以选择把所有接口都开放,不使用这里初始化就行了。 283 | shared_ptr create_infer(...){ 284 | shared_ptr instance(new YoloDETR()); 285 | if(!instance->startup(...)){ 286 | instance.reset(); 287 | } 288 | return instance; 289 | } 290 | 291 | ``` 292 | 293 | 294 | ## 3. 推理性能 295 | 296 | 使用Nsight Systems看看cuda处理过程,我后来才发现trt_pro中的fast_yolo是没有多流的,而完整版本是有多流选项的。不过我自己也已经实现了多流了,下面是我优化的过程。 297 | 298 | 首先是原始版本,多线程但是单流,这时候向流中提交任务还是串行实现的,虽然整体效率yolox官方给的高很多,但是还有提升空间。 299 | ![](./sources/ori.jpg) 300 | 这个时候cpu侧的双线程已经没用了,及时加上控制队列反而会造成这样的后果。 301 | ![](./sources/ori_queue.jpg) 302 | 303 | 然后,通过上图可以发现,H2D十分耗时,于是考虑使用双流,一流推理,另外一流专门执行H2D,顺带完成预处理工作。 304 | ![](./sources/2streamv1.jpg) 305 | 306 | 这个时候可能会疑惑,哎我现在已经是双流了,数据处理也确实在两条stream上了,为啥还不能并行呢?这是因为双流有一个问题是需要保证数据流的顺序。图片输入在提交任务到steam1(预处理流)后,数据异步拷贝到gpu,但是这个时候生产者已经把任务放到了任务队列中,所以推理线程会立即开始着手取数据和推理,这个时候由于是双流可能访问同一显存,就十分不安全了。于是我在stream1的最后执行了一个流同步的操作。 307 | 那怎么实现多流并行呢?我是用的方法是采用一个控制队列,保存推理线程返回future,然后立即推流下一帧,这个时候数据也安全,推理也安全!并且可以实现并行。 308 | ![](./sources/2streamv2.jpg) 309 | 310 | 并且整体的gpu利用率也更近紧凑了,下图左边是加控制队列后,右边是加控制队列前。 311 | ![](./sources/2steam_overview.jpg) 312 | 313 | 314 | 在2080Ti(8.5)上推理图片,不包含图像的读取和画框,warmup500,跑2000轮,平均耗时 315 | 316 | | method | ori | ori+queue | ori+2stream | ori+queue+2stream | 317 | | :----: | :----: | :----: | :----: | :----: | 318 | | cost time | 2.25ms | 1.84ms | 2.28ms | 1.41ms | 319 | | FPS | 444.64 | 542.89 | 438.6 | 709.98 | 320 | 321 | 326 | 327 | 328 | ## 4. 
More优化 329 | 330 | - [ ] gpu内存异步操作内核进一步融合,使用一个gpu内核实现运算符组合,减少数据传输和内核启动延迟 331 | - [ ] 一个tensorrt的engine可以创建多个context,实现多线程调用。只占用一个engine显存的大小,同时供多个推理运算 332 | - [ ] 向量化全局内存访问,提高内存访问效率 333 | - [ ] transformer系列算法加速支持 by fastertransformers 334 | -------------------------------------------------------------------------------- /2_faster_tensorrt/src/base/infer_base.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file infer_base.hpp 3 | * @author 0zzx0 4 | * @brief 推理器基类 5 | * @version 0.1 6 | * @date 2023-6-11 2023-8-21 7 | * 8 | * @copyright Copyright (c) 2023 9 | * 10 | */ 11 | 12 | #ifndef INFER_BASE_HPP 13 | #define INFER_BASE_HPP 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | #include 24 | 25 | #include "memory_tensor.hpp" 26 | #include "monopoly_accocator.hpp" 27 | #include "../kernels/cuda_kernel.cuh" 28 | 29 | namespace FasterTRT { 30 | 31 | using namespace nvinfer1; 32 | 33 | /////////////////////////// TRT logger /////////////////////////// 34 | class Logger : public ILogger { 35 | public: 36 | virtual void log(Severity severity, const char* msg) noexcept override { 37 | if(severity == Severity::kINTERNAL_ERROR) { 38 | INFOE("NVInfer INTERNAL_ERROR: %s", msg); 39 | abort(); 40 | } else if(severity == Severity::kERROR) { 41 | INFOE("NVInfer: %s", msg); 42 | } else if(severity == Severity::kWARNING) { 43 | INFOW("NVInfer: %s", msg); 44 | } else if(severity == Severity::kINFO) { 45 | INFOD("NVInfer: %s", msg); 46 | } else { 47 | INFOD("%s", msg); 48 | } 49 | } 50 | }; 51 | static Logger gLogger; 52 | 53 | // 销毁tensorrt中间指针对象的函数模板 54 | template 55 | static void destroy_nvidia_pointer(_T* ptr) { 56 | if(ptr) ptr->destroy(); 57 | } 58 | 59 | /** 60 | * @brief 封装trt运行过程中的部分资源 方便一起创建和销毁 61 | * 62 | */ 63 | class EngineContext { 64 | public: 65 | virtual ~EngineContext() { destroy(); } 66 | 67 | // 设置stream 如果已经存在销毁旧的,添加新的 68 | void set_stream(cudaStream_t stream) { 69 | if(owner_stream_) { 70 | if(stream_) { 71 | cudaStreamDestroy(stream_); 72 | } 73 | owner_stream_ = false; 74 | } 75 | stream_ = stream; 76 | } 77 | 78 | // 使用智能指针创建runtime engine context和初始化stream 79 | bool build_model(const void* pdata, size_t size) { 80 | destroy(); 81 | 82 | if(pdata == nullptr || size == 0) return false; 83 | 84 | owner_stream_ = true; 85 | checkCudaRuntime(cudaStreamCreate(&stream_)); 86 | if(stream_ == nullptr) return false; 87 | 88 | runtime_ = std::shared_ptr(createInferRuntime(gLogger), 89 | destroy_nvidia_pointer); 90 | if(runtime_ == nullptr) return false; 91 | 92 | engine_ = 93 | std::shared_ptr(runtime_->deserializeCudaEngine(pdata, size, nullptr), 94 | destroy_nvidia_pointer); 95 | if(engine_ == nullptr) return false; 96 | 97 | // runtime_->setDLACore(0); 98 | context_ = std::shared_ptr(engine_->createExecutionContext(), 99 | destroy_nvidia_pointer); 100 | return context_ != nullptr; 101 | } 102 | 103 | private: 104 | // 销毁这些指针 通过让智能指针引用计数减一 105 | void destroy() { 106 | context_.reset(); 107 | engine_.reset(); 108 | runtime_.reset(); 109 | 110 | if(owner_stream_) { 111 | if(stream_) { 112 | cudaStreamDestroy(stream_); 113 | } 114 | } 115 | stream_ = nullptr; 116 | } 117 | 118 | public: 119 | cudaStream_t stream_ = nullptr; 120 | bool owner_stream_ = false; 121 | std::shared_ptr context_; 122 | std::shared_ptr engine_; 123 | std::shared_ptr runtime_ = nullptr; 124 | }; 125 | 126 | /** 127 | * @brief 推理引擎的创建和推理 128 | 可以获取推理模型的各类输入输出信息 129 | * 130 | */ 131 | class TRTInferImpl { 132 
| public: 133 | virtual ~TRTInferImpl(); 134 | bool load(const std::string& file, int batch_size); 135 | bool load_from_memory(const void* pdata, size_t size); 136 | void destroy(); 137 | 138 | void forward(bool sync); 139 | 140 | int get_max_batch_size(); 141 | cudaStream_t get_stream(); 142 | void set_stream(cudaStream_t stream); 143 | void synchronize(); 144 | size_t get_device_memory_size(); 145 | std::shared_ptr get_workspace(); 146 | std::shared_ptr input(int index = 0); 147 | std::string get_input_name(int index = 0); 148 | std::shared_ptr output(int index = 0); 149 | std::string get_output_name(int index = 0); 150 | std::shared_ptr tensor(const std::string& name); 151 | bool is_output_name(const std::string& name); 152 | bool is_input_name(const std::string& name); 153 | void set_input(int index, std::shared_ptr tensor); 154 | void set_output(int index, std::shared_ptr tensor); 155 | std::shared_ptr> serial_engine(); 156 | 157 | void print(); 158 | 159 | int num_output(); 160 | int num_input(); 161 | int device(); 162 | 163 | private: 164 | void build_engine_input_and_outputs_mapper(); 165 | 166 | private: 167 | std::vector> inputs_; 168 | std::vector> outputs_; 169 | std::vector inputs_map_to_ordered_index_; 170 | std::vector outputs_map_to_ordered_index_; 171 | std::vector inputs_name_; 172 | std::vector outputs_name_; 173 | std::vector> orderdBlobs_; 174 | std::map blobsNameMapper_; 175 | std::shared_ptr context_; 176 | std::vector bindingsPtr_; 177 | std::shared_ptr workspace_; 178 | int device_ = 0; 179 | int batch_max_size_ = 1; 180 | }; 181 | 182 | /** 183 | * @brief 异步线程安全的推理器(虚基类 子类至少重写preprocess work) 184 | 通过异步线程启动,使得调用方允许任意线程调用把图像做输入,并通过future来获取异步结果 185 | 模板类 186 | * 187 | * @tparam Input 输入 188 | * @tparam Output 输入 189 | * @tparam StartParam 参数 190 | * @tparam JobAdditional job参数 191 | */ 192 | template , 193 | class JobAdditional = int> 194 | class ThreadSafedAsyncInfer { 195 | public: 196 | // Job数据类型。 197 | struct Job { 198 | Input input; 199 | Output output; 200 | JobAdditional additional; 201 | MonopolyAllocator::MonopolyDataPointer mono_tensor; 202 | std::shared_ptr> pro; 203 | }; 204 | 205 | virtual ~ThreadSafedAsyncInfer() { stop(); } 206 | 207 | // 停止 由析构函数调用 208 | void stop() { 209 | run_ = false; 210 | cond_.notify_all(); 211 | 212 | /// cleanup jobs 213 | { 214 | std::unique_lock l(jobs_lock_); 215 | while(!jobs_.empty()) { 216 | auto& item = jobs_.front(); 217 | if(item.pro) item.pro->set_value(Output()); 218 | jobs_.pop(); 219 | } 220 | }; 221 | 222 | if(worker_) { 223 | worker_->join(); 224 | worker_.reset(); 225 | } 226 | } 227 | 228 | // 启动 初始化线程 用一个promise等待worker中的初始化结束 229 | bool startup(const StartParam& param) { 230 | run_ = true; 231 | 232 | std::promise pro; 233 | start_param_ = param; 234 | worker_ = 235 | std::make_shared(&ThreadSafedAsyncInfer::worker, this, std::ref(pro)); 236 | return pro.get_future().get(); 237 | } 238 | 239 | // 单输入commit 先预处理input 然后上锁推进工作队列 cond_ 提醒 然后开始等待output 240 | virtual std::shared_future commit(const Input& input) { 241 | Job job; 242 | job.pro = std::make_shared>(); 243 | if(!preprocess(job, input)) { 244 | job.pro->set_value(Output()); 245 | return job.pro->get_future(); 246 | } 247 | 248 | ////////////////////上锁并且推进队列//////////////////////////// 249 | { 250 | std::unique_lock l(jobs_lock_); 251 | // jobs_.push(job); 252 | jobs_.emplace(job); 253 | }; 254 | cond_.notify_one(); 255 | return job.pro->get_future(); 256 | } 257 | 258 | // vector 输入commit 259 | virtual std::vector> commits(const std::vector& 
inputs) { 260 | int batch_size = std::min((int)inputs.size(), this->tensor_allocator_->capacity()); 261 | std::vector jobs(inputs.size()); 262 | std::vector> results(inputs.size()); 263 | 264 | int nepoch = (inputs.size() + batch_size - 1) / batch_size; 265 | for(int epoch = 0; epoch < nepoch; ++epoch) { 266 | int begin = epoch * batch_size; 267 | int end = std::min((int)inputs.size(), begin + batch_size); 268 | 269 | for(int i = begin; i < end; ++i) { 270 | Job& job = jobs[i]; 271 | job.pro = std::make_shared>(); 272 | if(!preprocess(job, inputs[i])) { 273 | job.pro->set_value(Output()); 274 | } 275 | results[i] = job.pro->get_future(); 276 | } 277 | 278 | /////////////////////////////////////////////////////////// 279 | { 280 | std::unique_lock l(jobs_lock_); 281 | for(int i = begin; i < end; ++i) { 282 | jobs_.emplace(std::move(jobs[i])); 283 | }; 284 | } 285 | cond_.notify_one(); 286 | } 287 | return results; 288 | } 289 | 290 | protected: 291 | // 工作线程(纯虚) 292 | virtual void worker(std::promise& result) = 0; 293 | // 预处理(纯虚) 294 | virtual bool preprocess(Job& job, const Input& input) = 0; 295 | 296 | // 获取任务组 等待之前的任务执行完毕 297 | virtual bool get_jobs_and_wait(std::vector& fetch_jobs, int max_size) { 298 | std::unique_lock l(jobs_lock_); 299 | cond_.wait(l, [&]() { 300 | return !run_ || !jobs_.empty(); 301 | }); // 当前run=true 且 job为empty(队列中的任务做完)的时候才会等待 302 | 303 | if(!run_) return false; 304 | 305 | fetch_jobs.clear(); 306 | for(int i = 0; i < max_size && !jobs_.empty(); ++i) { 307 | fetch_jobs.emplace_back(std::move(jobs_.front())); 308 | jobs_.pop(); 309 | } 310 | return true; 311 | } 312 | 313 | // 获取任务 等待之前的任务执行完毕 314 | virtual bool get_job_and_wait(Job& fetch_job) { 315 | std::unique_lock l(jobs_lock_); 316 | cond_.wait(l, [&]() { return !run_ || !jobs_.empty(); }); 317 | 318 | if(!run_) return false; 319 | 320 | fetch_job = std::move(jobs_.front()); 321 | jobs_.pop(); 322 | return true; 323 | } 324 | 325 | protected: 326 | StartParam start_param_; 327 | std::atomic run_; 328 | std::mutex jobs_lock_; 329 | std::queue jobs_; 330 | std::shared_ptr worker_; 331 | std::condition_variable cond_; 332 | std::shared_ptr> tensor_allocator_; 333 | }; 334 | 335 | /** 336 | * @brief 推理的虚基类 最终暴露给用户的接口,实际推理的类应该继承并实现本类中的方法 337 | * 338 | * @tparam Intput 输入 339 | * @tparam Output 输出 340 | */ 341 | template 342 | class InferBase { 343 | public: 344 | virtual std::shared_future commit(const Intput& image) = 0; 345 | virtual std::vector> commits(const std::vector& images) = 0; 346 | }; 347 | 348 | // 产生一个trt推理的智能指针 参数是序列化文件路径 349 | std::shared_ptr load_infer(const std::string& file, int batch_size); 350 | 351 | }; // namespace FasterTRT 352 | 353 | #endif 354 | -------------------------------------------------------------------------------- /1_trt_base/trt_yolox/cpp/yolox_end2end.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "NvInfer.h" 11 | #include "NvInferPlugin.h" 12 | #include "cuda_runtime_api.h" 13 | #include "logging.h" 14 | 15 | #define CHECK(status) \ 16 | do\ 17 | {\ 18 | auto ret = (status);\ 19 | if (ret != 0)\ 20 | {\ 21 | std::cerr << "Cuda failure: " << ret << std::endl;\ 22 | abort();\ 23 | }\ 24 | } while (0) 25 | 26 | #define DEVICE 0 // GPU id 27 | static const int INPUT_W = 640; 28 | static const int INPUT_H = 640; 29 | const char* INPUT_BLOB_NAME = "images"; 30 | const char* OUTPUT_BLOB_NAME1 = "output"; 31 | const 
char* OUTPUT_BLOB_NAME2 = "943"; 32 | const char* OUTPUT_BLOB_NAME3 = "944"; 33 | const char* OUTPUT_BLOB_NAME4 = "945"; 34 | 35 | const std::vector class_names={"echinus", "starfish", "holothurian", "scallop"}; 36 | const std::vector> color_list = 37 | { 38 | {0.000, 0.447, 0.741}, 39 | {0.850, 0.325, 0.098}, 40 | {0.929, 0.694, 0.125}, 41 | {0.494, 0.184, 0.556} 42 | }; 43 | 44 | using namespace nvinfer1; 45 | 46 | // class Logger : public nvinfer1::ILogger { 47 | // public: 48 | // void log(Severity severity, const char* msg) noexcept override { 49 | // if (severity != Severity::kINFO) { 50 | // std::cout << msg << std::endl; 51 | // } 52 | // } 53 | // }; 54 | 55 | 56 | class YoloEnd2End{ 57 | public: 58 | YoloEnd2End(const std::string model_path); 59 | cv::Mat static_resize(cv::Mat& image); 60 | float* blobFromImage(cv::Mat& img); 61 | void draw_objects(const cv::Mat& img, float* Boxes, int* ClassIndexs, int* BboxNum); 62 | void Infer(cv::Mat& img, float* Boxes, float* score, int* ClassIndexs, int* BboxNum); 63 | ~YoloEnd2End(); 64 | 65 | private: 66 | nvinfer1::ICudaEngine* engine = nullptr; 67 | nvinfer1::IRuntime* runtime = nullptr; 68 | nvinfer1::IExecutionContext* context = nullptr; 69 | cudaStream_t stream = nullptr; 70 | void* buffs[5]; 71 | int iH, iW, in_size, out_size1, out_size2, out_size3, out_size4; 72 | Logger gLogger; 73 | }; 74 | 75 | // resize 76 | cv::Mat YoloEnd2End::static_resize(cv::Mat& img) { 77 | float r = std::min(INPUT_W / (img.cols*1.0), INPUT_H / (img.rows*1.0)); 78 | // r = std::min(r, 1.0f); 79 | int unpad_w = r * img.cols; 80 | int unpad_h = r * img.rows; 81 | cv::Mat re(unpad_h, unpad_w, CV_8UC3); 82 | cv::resize(img, re, re.size()); 83 | cv::Mat out(INPUT_H, INPUT_W, CV_8UC3, cv::Scalar(114, 114, 114)); 84 | re.copyTo(out(cv::Rect(0, 0, re.cols, re.rows))); 85 | return out; 86 | } 87 | 88 | // float* YoloEnd2End::blobFromImage(cv::Mat& img){ 89 | // float* blob = new float[img.total()*3]; 90 | // // std::memcpy(blob, img.data, img.total() * 3* sizeof(float)); 91 | // int img_h = img.rows; 92 | // int img_w = img.cols; 93 | // int channelLength = img_w * img_h; 94 | // std::vector split_img = { 95 | // cv::Mat(img_h, img_w, CV_32FC1, blob + channelLength * 0), 96 | // cv::Mat(img_h, img_w, CV_32FC1, blob + channelLength * 1), 97 | // cv::Mat(img_h, img_w, CV_32FC1, blob + channelLength * 2) 98 | // }; 99 | // cv::split(img, split_img); 100 | // return blob; 101 | // } 102 | 103 | 104 | float* YoloEnd2End::blobFromImage(cv::Mat& img){ 105 | float* blob = new float[img.total()*3]; 106 | int channels = 3; 107 | int img_h = img.rows; 108 | int img_w = img.cols; 109 | for (size_t c = 0; c < channels; c++) 110 | { 111 | for (size_t h = 0; h < img_h; h++) 112 | { 113 | for (size_t w = 0; w < img_w; w++) 114 | { 115 | blob[c * img_w * img_h + h * img_w + w] = 116 | (float)img.at(h, w)[c]; 117 | } 118 | } 119 | } 120 | return blob; 121 | } 122 | 123 | YoloEnd2End::YoloEnd2End(const std::string model_path) { 124 | std::ifstream ifile(model_path, std::ios::in | std::ios::binary); 125 | if (!ifile) { 126 | std::cout << "read serialized file failed\n"; 127 | std::abort(); 128 | } 129 | 130 | ifile.seekg(0, std::ios::end); 131 | const int mdsize = ifile.tellg(); 132 | ifile.clear(); 133 | ifile.seekg(0, std::ios::beg); 134 | std::vector buf(mdsize); 135 | ifile.read(&buf[0], mdsize); 136 | ifile.close(); 137 | std::cout << "model size: " << mdsize << std::endl; 138 | 139 | runtime = nvinfer1::createInferRuntime(gLogger); 140 | initLibNvInferPlugins(&gLogger, ""); 141 
| engine = runtime->deserializeCudaEngine((void*)&buf[0], mdsize, nullptr); 142 | 143 | auto in_dims = engine->getTensorShape(INPUT_BLOB_NAME); 144 | // auto in_dims = engine->getBindingDimensions(engine->getBindingIndex("images")); 145 | iH = in_dims.d[2]; 146 | iW = in_dims.d[3]; 147 | in_size = 1; 148 | for (int j = 0; j < in_dims.nbDims; j++) { 149 | in_size *= in_dims.d[j]; 150 | } 151 | auto out_dims1 = engine->getTensorShape(OUTPUT_BLOB_NAME1); 152 | // auto out_dims1 = engine->getBindingDimensions(engine->getBindingIndex("output")); 153 | out_size1 = 1; 154 | for (int j = 0; j < out_dims1.nbDims; j++) { 155 | out_size1 *= out_dims1.d[j]; 156 | } 157 | auto out_dims2 = engine->getTensorShape(OUTPUT_BLOB_NAME2); 158 | // auto out_dims2 = engine->getBindingDimensions(engine->getBindingIndex("943")); 159 | out_size2 = 1; 160 | for (int j = 0; j < out_dims2.nbDims; j++) { 161 | out_size2 *= out_dims2.d[j]; 162 | } 163 | auto out_dims3 = engine->getTensorShape(OUTPUT_BLOB_NAME3); 164 | // auto out_dims3 = engine->getBindingDimensions(engine->getBindingIndex("944")); 165 | out_size3 = 1; 166 | for (int j = 0; j < out_dims3.nbDims; j++) { 167 | out_size3 *= out_dims3.d[j]; 168 | } 169 | auto out_dims4 = engine->getTensorShape(OUTPUT_BLOB_NAME4); 170 | // auto out_dims4 = engine->getBindingDimensions(engine->getBindingIndex("945")); 171 | out_size4 = 1; 172 | for (int j = 0; j < out_dims4.nbDims; j++) { 173 | out_size4 *= out_dims4.d[j]; 174 | } 175 | 176 | context = engine->createExecutionContext(); 177 | if (!context) { 178 | std::cout << "create execution context failed\n"; 179 | std::abort(); 180 | } 181 | 182 | CHECK(cudaMalloc(&buffs[0], in_size * sizeof(float))); 183 | CHECK(cudaMalloc(&buffs[1], out_size1 * sizeof(int))); 184 | CHECK(cudaMalloc(&buffs[2], out_size2 * sizeof(float))); 185 | CHECK(cudaMalloc(&buffs[3], out_size3 * sizeof(float))); 186 | CHECK(cudaMalloc(&buffs[4], out_size4 * sizeof(int))); 187 | CHECK(cudaStreamCreate(&stream)); 188 | } 189 | 190 | void YoloEnd2End::Infer(cv::Mat& img, float* Boxes, float* score, int* ClassIndexs, int* BboxNum) { 191 | 192 | cv::Mat pr_img; 193 | pr_img = this->static_resize(img); 194 | float* blob = this->blobFromImage(pr_img); 195 | float scale = std::min(INPUT_W / (img.cols*1.0), INPUT_H / (img.rows*1.0)); 196 | 197 | static int* num_dets = new int[out_size1]; 198 | static float* det_boxes = new float[out_size2]; 199 | static float* det_scores = new float[out_size3]; 200 | static int* det_classes = new int[out_size4]; 201 | 202 | CHECK(cudaMemcpyAsync(buffs[0], &blob[0], in_size * sizeof(float), cudaMemcpyHostToDevice, stream)); 203 | 204 | context->enqueueV2(&buffs[0], stream, nullptr); 205 | 206 | CHECK(cudaMemcpyAsync(num_dets, buffs[1], out_size1 * sizeof(int), cudaMemcpyDeviceToHost, stream)); 207 | CHECK(cudaMemcpyAsync(det_boxes, buffs[2], out_size2 * sizeof(float), cudaMemcpyDeviceToHost, stream)); 208 | CHECK(cudaMemcpyAsync(det_scores, buffs[3], out_size3 * sizeof(float), cudaMemcpyDeviceToHost, stream)); 209 | CHECK(cudaMemcpyAsync(det_classes, buffs[4], out_size4 * sizeof(int), cudaMemcpyDeviceToHost, stream)); 210 | 211 | BboxNum[0] = num_dets[0]; 212 | int img_w = img.cols; 213 | int img_h = img.rows; 214 | for (size_t i = 0; i < num_dets[0]; i++) { 215 | float x0 = (det_boxes[i * 4]) / scale; 216 | float y0 = (det_boxes[i * 4 + 1]) / scale; 217 | float w = (det_boxes[i * 4 + 2]) / scale; 218 | float h = (det_boxes[i * 4 + 3]) / scale; 219 | 220 | x0 = x0 - w/2.0; 221 | y0 = y0 - h/2.0; 222 | x0 = 
std::max(std::min(x0, (float)(img_w - 1)), 0.f); 223 | y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f); 224 | w = std::max(w, 0.f); 225 | h = std::max(h, 0.f); 226 | Boxes[i * 4] = x0; 227 | Boxes[i * 4 + 1] = y0; 228 | Boxes[i * 4 + 2] = w; 229 | Boxes[i * 4 + 3] = h; 230 | ClassIndexs[i] = det_classes[i]; 231 | score[i*4] = det_scores[i]; 232 | } 233 | delete blob; 234 | } 235 | 236 | void YoloEnd2End::draw_objects(const cv::Mat& img, float* Boxes, int* ClassIndexs, int* BboxNum) { 237 | cv::Mat image = img.clone(); 238 | for (int j = 0; j < BboxNum[0]; j++) { 239 | cv::Rect rect(Boxes[j * 4], Boxes[j * 4 + 1], Boxes[j * 4 + 2], Boxes[j * 4 + 3]); 240 | 241 | cv::Scalar color = cv::Scalar(color_list[ClassIndexs[j]][0], 242 | color_list[ClassIndexs[j]][1], 243 | color_list[ClassIndexs[j]][2]); 244 | 245 | cv::rectangle(image, rect, color * 255, 2); 246 | cv::putText( 247 | image, 248 | class_names[ClassIndexs[j]], 249 | cv::Point(rect.x, rect.y - 1), 250 | cv::FONT_HERSHEY_PLAIN, 251 | 1.2, 252 | color * 255, 253 | 2); 254 | cv::imwrite("result.jpg", image); 255 | } 256 | } 257 | 258 | YoloEnd2End::~YoloEnd2End() { 259 | std::cout<<"释放内存、显存"<destroy(); 271 | // engine->destroy(); 272 | // runtime->destroy(); 273 | } 274 | 275 | 276 | int main(int argc, char** argv) { 277 | cudaSetDevice(DEVICE); 278 | const std::string input_image_path = "../../../../2_faster_tensorrt/inference/1.jpg"; 279 | const std::string engine_file_path="../../../../2_faster_tensorrt/yolox_end2end.engine"; 280 | 281 | 282 | float* Boxes = new float[400]; 283 | float* Scores = new float[100]; 284 | int* BboxNum = new int[1]; 285 | int* ClassIndexs = new int[100]; 286 | YoloEnd2End yolo_end2end(engine_file_path); 287 | cv::Mat img; 288 | img = cv::imread(input_image_path); 289 | // warmup 290 | for (int num =0; num < 500; num++) { 291 | yolo_end2end.Infer(img, Boxes, Scores, ClassIndexs, BboxNum); 292 | } 293 | // inference 294 | auto start = std::chrono::system_clock::now(); 295 | for (int num = 0; num < 1000; num++) { 296 | yolo_end2end.Infer(img, Boxes, Scores, ClassIndexs, BboxNum); 297 | } 298 | auto end = std::chrono::system_clock::now(); 299 | std::cout << std::chrono::duration_cast(end - start).count() /1000.0<< "ms" << std::endl; 300 | 301 | // std::cout<