├── .gitignore ├── Benchmark.py ├── LICENSE ├── README.md ├── calibration └── README.md ├── cpp ├── README.md ├── jetson_csi │ ├── CMakeLists.txt │ ├── README.md │ ├── csi_detect.cpp │ ├── gstreamer.cpp │ ├── gstreamer.h │ ├── labels_coco.yaml │ ├── preprocess.cu │ ├── preprocess.h │ ├── trt_infer.cpp │ ├── trt_infer.h │ ├── utils_detection.cpp │ └── utils_detection.h ├── kp_jetson_csi │ ├── CMakeLists.txt │ ├── README.md │ ├── csi_kp_detect.cpp │ ├── gstreamer.cpp │ ├── gstreamer.h │ ├── labels_det.yaml │ ├── points_link.yaml │ ├── preprocess.cu │ ├── preprocess.h │ ├── trt_infer.cpp │ ├── trt_infer.h │ ├── utils_detection.cpp │ └── utils_detection.h └── video_detect │ ├── CMakeLists.txt │ ├── README.md │ ├── labels_coco.yaml │ ├── main.cpp │ ├── preprocess.cu │ ├── preprocess.h │ ├── trt_infer.cpp │ ├── trt_infer.h │ ├── utils_detection.cpp │ └── utils_detection.h ├── doc └── yolov5s_det.png ├── labels_coco.yaml ├── labels_voc.yaml ├── models_onnx └── README.md ├── models_trt └── README.md ├── onnx2trt.py ├── requirements.txt ├── utils ├── calibrator.py ├── trt_infer.py └── utils_detection.py ├── yolo_detect_v1.py ├── yolo_detect_v2.py └── yolox_detect.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | __pycache__ 3 | *.mp4 4 | *.onnx 5 | *.engine 6 | calibration -------------------------------------------------------------------------------- /Benchmark.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------- 2 | # 这个脚本向你展示了如何使用 tensorRT 对导出的模型进行推理,并进行速度测试 3 | # 目前 GPU 上 tensorRT 是跑的最快的部署框架 ... 4 | # --------------------------------------------------------------- 5 | 6 | import time 7 | import numpy as np 8 | import tensorrt as trt 9 | 10 | from tqdm import tqdm 11 | from utils import trt_infer 12 | 13 | # int8 / fp32 ~ 70% 14 | # trt > ppq > fp32 15 | 16 | # Nvidia Nsight Performance Profile 17 | ENGINE_PATH = './models_trt/yolov5s.engine' 18 | BATCH_SIZE = 1 19 | INPUT_SHAPE = [BATCH_SIZE, 3, 512, 512] 20 | BENCHMARK_SAMPLES = 12800 21 | 22 | print(f'Benchmark with {ENGINE_PATH}') 23 | logger = trt.Logger(trt.Logger.ERROR) 24 | with open(ENGINE_PATH, 'rb') as f, trt.Runtime(logger) as runtime: 25 | engine = runtime.deserialize_cuda_engine(f.read()) 26 | 27 | with engine.create_execution_context() as context: 28 | inputs, outputs, bindings, stream = trt_infer.allocate_buffers(context.engine) 29 | inputs[0].host = np.zeros(shape=INPUT_SHAPE, dtype=np.float32) 30 | 31 | t1 = time.time() 32 | for _ in tqdm(range(BENCHMARK_SAMPLES), desc=f'Benchmark ...'): 33 | trt_infer.do_inference( 34 | context, bindings=bindings, inputs=inputs, 35 | outputs=outputs, stream=stream, batch_size=BATCH_SIZE) 36 | 37 | t2 = time.time() 38 | t = (t2 - t1)*1000/BENCHMARK_SAMPLES 39 | print(f"{t:0.5f}ms") 40 | 41 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 
14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # yolov5_TensorRT_inference 2 | 记录yolov5的TensorRT量化(fp16, int8)及推理代码。经实测可运行于Jetson平台,可将yolov5s、yolov8s这类的小模型部署在Jetson nano 4g上用于摄像头的检测。 3 |
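As a quick orientation before the detailed steps below, here is a minimal Python sketch of how an exported `.engine` file is consumed for inference. It mirrors the `utils.trt_infer` calls used by `Benchmark.py` in this repo (`allocate_buffers` / `do_inference`); the engine path and the 1×3×512×512 zero input are just the example values taken from that script — a real run would copy a preprocessed video frame into the input buffer.

```python
import numpy as np
import tensorrt as trt
from utils import trt_infer

ENGINE_PATH = './models_trt/yolov5s.engine'   # produced by onnx2trt.py

logger = trt.Logger(trt.Logger.ERROR)
with open(ENGINE_PATH, 'rb') as f, trt.Runtime(logger) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())

with engine.create_execution_context() as context:
    inputs, outputs, bindings, stream = trt_infer.allocate_buffers(context.engine)
    # a preprocessed frame (resized, normalized, CHW layout) would be copied in here
    inputs[0].host = np.zeros((1, 3, 512, 512), dtype=np.float32)
    preds = trt_infer.do_inference(
        context, bindings=bindings, inputs=inputs,
        outputs=outputs, stream=stream, batch_size=1)
```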
4 | 5 |
6 | 7 | CPP: 8 | [视频目标检测](https://github.com/MadaoFY/yolov5_TensorRT_inference/tree/main/cpp/video_detect) 9 | [jetson nano摄像头目标检测](https://github.com/MadaoFY/yolov5_TensorRT_inference/tree/main/cpp/jetson_csi) 10 | [jetson nano摄像头人体关键点检测](https://github.com/MadaoFY/yolov5_TensorRT_inference/tree/main/cpp/kp_jetson_csi) 11 | 12 | 模型支持: 13 | yolov5 14 | yolov7 15 | yolov8 16 | yolox(不可在生成的engine中添加nms模块) 17 | 18 | 温馨提示:本人使用的TensrRT版本为8.4.3.1,为保证成功运行,你的TensorRT大版本最好在8.4。具体环境依赖请参考```requirements.txt``` 19 | 20 | 项目文件如下: 21 | ```bash 22 | |-yolov5_TensorRT_inference 23 | |-calibration # 默认情况下用于存放int8量化校准集的文件夹 24 | |-cpp # c++推理代码,有jetson nano上用的代码 25 | |-doc # 单纯用来存放文本的文件夹 26 | |-models_onnx # 默认情况下用于存放onnx模型的文件夹 27 | |-models_trt # 默认情况下用于存放量化后生成的trt模型的文件夹 28 | |-utils # 存放utils的文件夹 29 | |-Benchmark.py # 测试trt模型速度的脚本 30 | |-labels_coco.yaml # coco数据集类别标签 31 | |-labels_voc.yaml # voc数据集类别标签 32 | |-onnx2trt.py # onnx模型转engine的脚本,已添加EfficientNMS算子的支持 33 | |-yolo_detect_v1.py # 不带nms算子的视频检测脚本 34 | |-yolo_detect_v2.py # 带nms算子的视频检测脚本,该脚本使用的trt模型添加了EfficientNMS算子 35 | |-yolox_detect.py # yolovx的视频检测脚本 36 | ``` 37 | 38 | 以下将使用yolov5s模型演示如何量化及用于视频的推理。 39 | ## 数据准备 40 | 使用yolov5官方提供的coco训练模型,已导出为onnx。这里使用voc2012作为校准集,仅用来演示,你可以下载coco数据集作为你的校准集。 41 | 42 | yolov5s.onnx:https://pan.baidu.com/s/1eYaU3ndVpwexL4k6goxjHg 43 | 提取码: sduf 44 | 45 | voc2012:https://pan.baidu.com/s/1rICWiczIv_GyrYIrEj1p3Q 46 | 提取码: 4pgx 47 | 48 | 视频源:https://pan.baidu.com/s/1HBIjz6019vn9qfoKPIuV2A 49 | 提取码: fbfh 50 | 51 | ## 量化(onnx2trt.py) 52 | 你需要从yolov5、yolov7、yolox的官方库导出相应onnx模型,从第三方实现的库中导出的yolo onnx模型不保证适用,注意导出的onnx不包含nms部分。如果你想把nms算子加入到engine中,add_nms设置为True。默认将onnx模型放置于models_onnx文件夹,导出的trt模型可保存于models_trt文件夹。如果你想使用int8量化,你需要从训练集中准备至少500张图片作为校准集,图片放置于calibration文件夹。 53 | 54 | ```shell 55 | python onnx2trt.py --onnx_dir ./models_onnx/yolov5s.onnx --engine_dir ./models_trt/yolov5s.engine --int8 True --imgs_dir ./calibration 56 | ``` 57 | 参数说明: 58 | - ```--onnx_dir``` onnx模型路径 59 | - ```--engine_dir``` trt模型的保存路径 60 | - ```--min_shape``` 最小的shape 61 | - ```--opt_shape``` 优化的shape 62 | - ```--max_shape``` 最大的shape 63 | - ```--fp16``` 是否使用fp16量化 64 | - ```--int8``` 是否使用int8量化 65 | - ```--imgs_dir``` 校准集路径 66 | - ```--n_iteration``` int8量化校准轮次 67 | - ```--cache_file``` 是否生成cache 68 | - ```--yolov8_head``` 是否为yolov8的检测头(注意,yolov8的输出与yolov5不一样) 69 | - ```--add_nms``` 添加EfficientNMS算子 70 | - ```--conf_thres``` nms的置信度设置 71 | - ```--iou_thres``` nms的iou设置 72 | - ```--max_det``` nms输出的最大检测数量 73 | 74 | 更详细参数说明可以在脚本中查看。 75 | 76 | ## 视频推理 77 | ### 1.不带EfficientNMS算子的推理脚本(yolo_detect_v1.py) 78 | 你需要准备一个模型输出类别的labels文件,具体可参考仓库的labels_coco.yaml文件。本演示中用到模型为coco训练的yolov5s模型,所以需要用到相对应的coco类别。如果你使用的是yolov5、yolov7模型,运行yolo_detect_v1.py脚本,yolox模型运行yolox_detect.py脚本。以yolov5s.engine推理为例。 79 | ```shell 80 | python yolo_detect_v1.py --video_dir ./sample_1080p_h265.mp4 --engine_dir ./models_trt/yolov5s.engine --labels ./labels_coco.yaml 81 | ``` 82 | 83 | - ```--video_dir``` 视频源路径 84 | - ```--engine_dir``` trt模型路径 85 | - ```--labels``` 模型labels文件 86 | - ```--conf_thres``` nms的置信度设置 87 | - ```--iou_thres``` nms的iou设置 88 | - ```--max_det``` nms输出的最大检测数量 89 | 90 | ### 2.带EfficientNMS算子的推理脚本(yolo_detect_v2.py) 91 | yolo_detect_v2.py脚本里的所使用trt模型已添加EfficientNMS算子,所以无需在对nms参数进行设置 92 | ```shell 93 | python yolo_detect_v2.py --video_dir ./sample_1080p_h265.mp4 --engine_dir ./models_trt/yolov7_nms.engine --labels ./labels_coco.yaml 94 | ``` 95 | 96 | - ```--video_dir``` 视频源路径 97 | - ```--engine_dir``` trt模型路径 98 | - ```--labels``` 模型labels文件 99 | 100 | 101 | ## 
其他相关 102 | 可能TensoRT安装是最消耗时间的事情、、、 103 | TensoRT:https://docs.nvidia.com/deeplearning/tensorrt/install-guide/index.html#installing 104 | https://developer.nvidia.com/tensorrt 105 | 106 | Trt_sample: https://github.com/NVIDIA/trt-samples-for-hackathon-cn/tree/master/cookbook 107 | 108 | yolox:https://github.com/Megvii-BaseDetection/YOLOX 109 | yolov5:https://github.com/ultralytics/yolov5 110 | yolov7:https://github.com/WongKinYiu/yolov7 111 | yolov8: https://github.com/ultralytics/ultralytics 112 | 113 | 114 | -------------------------------------------------------------------------------- /calibration/README.md: -------------------------------------------------------------------------------- 1 | 用于存放量化用的校准集 2 | -------------------------------------------------------------------------------- /cpp/README.md: -------------------------------------------------------------------------------- 1 | # cpp_inference 2 | c++的TensorRT推理代码。jetson_csi为jetson nano的摄像头检测代码。 3 | 4 | 模型支持:yolov5、yolov7、yolov8 5 | 6 | -------------------------------------------------------------------------------- /cpp/jetson_csi/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.10) 2 | 3 | set(CMAKE_CXX_STANDARD 14) 4 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 5 | set(CMAKE_CXX_EXTENSIONS ON) 6 | 7 | project(yolo_detect C CXX) 8 | 9 | add_definitions(-DAPI_EXPORTS) 10 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 11 | # SET(CMAKE_CXX_FLAGS_DEBUG "$ENV{CXXFLAGS} -O0 -g2 -ggdb") 12 | if(NOT CMAKE_BUILD_TYPE) 13 | SET(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} -O3") 14 | endif() 15 | set(src_list csi_detect.cpp utils_detection.cpp utils_detection.h trt_infer.cpp trt_infer.h gstreamer.cpp gstreamer.h preprocess.cu preprocess.h) 16 | 17 | # CUDA 18 | # TODO(Call for PR): make cmake compatible with Windows 19 | set(CMAKE_CUDA_COMPILER /usr/local/cuda-10.2/bin/nvcc) 20 | enable_language(CUDA) 21 | find_package(CUDA REQUIRED) 22 | message(STATUS " libraries: ${CUDA_LIBRARIES}") 23 | message(STATUS " include path: ${CUDA_INCLUDE_DIRS}") 24 | 25 | 26 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different 27 | include_directories(/usr/local/cuda-10.2/include/) 28 | link_directories(/usr/local/cuda-10.2/lib64/) 29 | 30 | 31 | # tensorrt 32 | # set(TRT_DIR J:/tensorrt/TensorRT-8.4.3.1) 33 | # set(TRT_INCLUDE_DIRS ${TRT_DIR}/include/) 34 | # set(TRT_LIB_DIRS ${TRT_DIR}/lib/) 35 | 36 | # include_directories(${TRT_INCLUDE_DIRS}) 37 | # link_directories(${TRT_LIB_DIRS}) 38 | 39 | #include_directories(${PROJECT_SOURCE_DIR}/) 40 | #file(GLOB_RECURSE SRCS ${PROJECT_SOURCE_DIR}/*.cpp ${PROJECT_SOURCE_DIR}/*.h) 41 | 42 | # opencv 43 | find_package(OpenCV REQUIRED) 44 | include_directories( ${OpenCV_INCLUDE_DIRS} ) 45 | 46 | add_executable(${PROJECT_NAME} ${src_list}) 47 | target_link_libraries(${PROJECT_NAME} nvinfer) 48 | target_link_libraries(${PROJECT_NAME} cudart) 49 | target_link_libraries(${PROJECT_NAME} ${OpenCV_LIBS}) 50 | -------------------------------------------------------------------------------- /cpp/jetson_csi/README.md: -------------------------------------------------------------------------------- 1 | # jetson_csi 2 | 用jetson nano摄像头目标检测的c++代码。 3 | 用cmake编译后,运行yolo_detect。 4 | 5 | ```shell 6 | yolo_detect --engine_dir=./yolov5s.engine --labels=./labels_coco.yaml 7 | ``` 8 | 9 | 参数说明: 10 | - ```--engine_dir``` trt模型的保存路径 11 | - ```--labels``` 模型labels文件 12 | - ```--conf_thres``` nms的置信度设置 13 | - ```--iou_thres``` 
nms的iou设置 14 | - ```--max_det``` nms输出的最大检测数量 15 | 16 | 更详细参数说明可以在csi_detect.cpp中查看。 17 | -------------------------------------------------------------------------------- /cpp/jetson_csi/csi_detect.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MadaoFY/yolov5_TensorRT_inference/4cc1ec7316c63b101da3c842c1f98dc82c90e70c/cpp/jetson_csi/csi_detect.cpp -------------------------------------------------------------------------------- /cpp/jetson_csi/gstreamer.cpp: -------------------------------------------------------------------------------- 1 | #include "gstreamer.h" 2 | 3 | 4 | std::string gs_pipeline(int capture_width, int capture_height, int display_width, int display_height, int framerate, int flip_method) 5 | { 6 | std::string result = "nvarguscamerasrc ! video/x-raw(memory:NVMM), width=(int)" + std::to_string(capture_width) + 7 | ", height=(int)" + std::to_string(capture_height) + 8 | ", format=(string)NV12, framerate=(fraction)" + std::to_string(framerate) + 9 | "/1 ! nvvidconv flip-method=" + std::to_string(flip_method) + 10 | " ! video/x-raw, width=(int)" + std::to_string(display_width) + 11 | ", height=(int)" + std::to_string(display_height) + 12 | ", format=(string)BGRx ! videoconvert ! video/x-raw, format=(string)BGR ! appsink"; 13 | 14 | return result; 15 | } 16 | -------------------------------------------------------------------------------- /cpp/jetson_csi/gstreamer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | std::string gs_pipeline(int capture_width, int capture_height, int display_width, int display_height, int framerate, int flip_method); 6 | -------------------------------------------------------------------------------- /cpp/jetson_csi/labels_coco.yaml: -------------------------------------------------------------------------------- 1 | 0: person 2 | 1: bicycle 3 | 2: car 4 | 3: motorcycle 5 | 4: airplane 6 | 5: bus 7 | 6: train 8 | 7: truck 9 | 8: boat 10 | 9: traffic light 11 | 10: fire hydrant 12 | 11: stop sign 13 | 12: parking meter 14 | 13: bench 15 | 14: bird 16 | 15: cat 17 | 16: dog 18 | 17: horse 19 | 18: sheep 20 | 19: cow 21 | 20: elephant 22 | 21: bear 23 | 22: zebra 24 | 23: giraffe 25 | 24: backpack 26 | 25: umbrella 27 | 26: handbag 28 | 27: tie 29 | 28: suitcase 30 | 29: frisbee 31 | 30: skis 32 | 31: snowboard 33 | 32: sports ball 34 | 33: kite 35 | 34: baseball bat 36 | 35: baseball glove 37 | 36: skateboard 38 | 37: surfboard 39 | 38: tennis racket 40 | 39: bottle 41 | 40: wine glass 42 | 41: cup 43 | 42: fork 44 | 43: knife 45 | 44: spoon 46 | 45: bowl 47 | 46: banana 48 | 47: apple 49 | 48: sandwich 50 | 49: orange 51 | 50: broccoli 52 | 51: carrot 53 | 52: hot dog 54 | 53: pizza 55 | 54: donut 56 | 55: cake 57 | 56: chair 58 | 57: couch 59 | 58: potted plant 60 | 59: bed 61 | 60: dining table 62 | 61: toilet 63 | 62: tv 64 | 63: laptop 65 | 64: mouse 66 | 65: remote 67 | 66: keyboard 68 | 67: cell phone 69 | 68: microwave 70 | 69: oven 71 | 70: toaster 72 | 71: sink 73 | 72: refrigerator 74 | 73: book 75 | 74: clock 76 | 75: vase 77 | 76: scissors 78 | 77: teddy bear 79 | 78: hair drier 80 | 79: toothbrush 81 | -------------------------------------------------------------------------------- /cpp/jetson_csi/preprocess.cu: -------------------------------------------------------------------------------- 1 | #include "preprocess.h" 2 | 3 | #include 4 | 5 | 6 | __global__ void 
warpaffine_nearest_bgrbgr2rrggbb_kernel( 7 | uint8_t* src, int src_step_size, int src_width, 8 | int src_height, float* dst, int dst_width, 9 | int dst_height, uint8_t const_value_st, 10 | AffineMatrix d2s, int h_p, int w_p) 11 | { 12 | int dx = blockDim.x * blockIdx.x + threadIdx.x; 13 | int dy = blockDim.y * blockIdx.y + threadIdx.y; 14 | if (dx >= dst_width || dy >= dst_height) return; 15 | 16 | float m_x1 = d2s.value[0]; 17 | float m_y1 = d2s.value[1]; 18 | float m_z1 = d2s.value[2]; 19 | float m_x2 = d2s.value[3]; 20 | float m_y2 = d2s.value[4]; 21 | float m_z2 = d2s.value[5]; 22 | 23 | float c0, c1, c2; 24 | if (dy < h_p || dy >(dst_height - h_p) || dx < w_p || dx >(dst_width - w_p)) 25 | { 26 | // out of range 27 | c0 = const_value_st; 28 | c1 = const_value_st; 29 | c2 = const_value_st; 30 | } 31 | else 32 | { 33 | float src_x = m_x1 * (dx + 0.5f) + m_y1 * dy + m_z1 - 0.5f; 34 | float src_y = m_x2 * dx + m_y2 * (dy + 0.5f) + m_z2 - 0.5f; 35 | 36 | int sy_1 = floorf(src_y + 0.5f); 37 | int sx_1 = floorf(src_x + 0.5f); 38 | 39 | uint8_t const_value[] = { const_value_st, const_value_st, const_value_st }; 40 | uint8_t* p = const_value; 41 | 42 | if (sy_1 >= 0 && sy_1 <= src_height && sx_1 >=0 && sx_1 <= src_width) 43 | { 44 | p = src + sy_1 * src_step_size + sx_1 * 3; 45 | } 46 | 47 | c0 = p[0]; 48 | c1 = p[1]; 49 | c2 = p[2]; 50 | } 51 | 52 | // normalization 53 | c0 /= 255.0f; 54 | c1 /= 255.0f; 55 | c2 /= 255.0f; 56 | 57 | // bgrbgrbgr to rrrgggbbb 58 | int area = dst_width * dst_height; 59 | float* pdst_c0 = dst + dy * dst_width + dx; 60 | pdst_c0[0] = c2; 61 | pdst_c0[area] = c1; 62 | pdst_c0[2 * area] = c0; 63 | } 64 | 65 | __global__ void warpaffine_bilinear_bgrbgr2rrggbb_kernel( 66 | uint8_t* src, int src_step_size, int src_width, 67 | int src_height, float* dst, int dst_width, 68 | int dst_height, uint8_t const_value_st, 69 | AffineMatrix d2s, int h_p, int w_p) 70 | { 71 | int dx = blockDim.x * blockIdx.x + threadIdx.x; 72 | int dy = blockDim.y * blockIdx.y + threadIdx.y; 73 | if (dx >= dst_width || dy >= dst_height) return; 74 | 75 | float m_x1 = d2s.value[0]; 76 | float m_y1 = d2s.value[1]; 77 | float m_z1 = d2s.value[2]; 78 | float m_x2 = d2s.value[3]; 79 | float m_y2 = d2s.value[4]; 80 | float m_z2 = d2s.value[5]; 81 | 82 | float c0, c1, c2; 83 | if (dy < h_p || dy >(dst_height - h_p) || dx < w_p || dx >(dst_width - w_p)) 84 | { 85 | // out of range 86 | c0 = const_value_st; 87 | c1 = const_value_st; 88 | c2 = const_value_st; 89 | } 90 | else 91 | { 92 | float src_x = m_x1 * (dx + 0.5f) + m_y1 * dy + m_z1 - 0.5f; 93 | float src_y = m_x2 * dx + m_y2 * (dy + 0.5f) + m_z2 - 0.5f; 94 | 95 | int sy_1 = floorf(src_y); 96 | int sx_1 = floorf(src_x); 97 | int sy_2 = sy_1 + 1; 98 | int sx_2 = sx_1 + 1; 99 | 100 | uint8_t const_value[] = { const_value_st, const_value_st, const_value_st }; 101 | float a2 = src_y - sy_1; 102 | float a1 = 1.0f - a2; 103 | float b2 = src_x - sx_1; 104 | float b1 = 1.0f - b2; 105 | float w11 = a1 * b1; 106 | float w12 = a1 * b2; 107 | float w21 = a2 * b1; 108 | float w22 = a2 * b2; 109 | uint8_t* p11 = const_value; 110 | uint8_t* p12 = const_value; 111 | uint8_t* p21 = const_value; 112 | uint8_t* p22 = const_value; 113 | 114 | /*if (sy_1 >= 0) { 115 | if (sx_1 >= 0)*/ 116 | p11 = src + sy_1 * src_step_size + sx_1 * 3; 117 | 118 | //if (sx_2 < src_width) 119 | p12 = src + sy_1 * src_step_size + sx_2 * 3; 120 | //} 121 | 122 | /*if (sy_2 < src_height) { 123 | if (sx_1 >= 0)*/ 124 | p21 = src + sy_2 * src_step_size + sx_1 * 3; 125 | 126 | /*if (sx_2 < 
src_width)*/ 127 | p22 = src + sy_2 * src_step_size + sx_2 * 3; 128 | //} 129 | 130 | c0 = w11 * p11[0] + w12 * p12[0] + w21 * p21[0] + w22 * p22[0] + 0.5f; 131 | c1 = w11 * p11[1] + w12 * p12[1] + w21 * p21[1] + w22 * p22[1] + 0.5f; 132 | c2 = w11 * p11[2] + w12 * p12[2] + w21 * p21[2] + w22 * p22[2] + 0.5f; 133 | } 134 | 135 | // normalization 136 | c0 /= 255.0f; 137 | c1 /= 255.0f; 138 | c2 /= 255.0f; 139 | 140 | // bgrbgrbgr to rrrgggbbb 141 | int area = dst_width * dst_height; 142 | float* pdst_c0 = dst + dy * dst_width + dx; 143 | pdst_c0[0] = c2; 144 | pdst_c0[area] = c1; 145 | pdst_c0[2 * area] = c0; 146 | } 147 | 148 | 149 | void cuda_preprocess(cv::Mat& image, preproc_struct& image_trans, std::vector& bufferH, 150 | std::vector& bufferD, std::vector& bindingsize, cudaStream_t& stream, cv::Size resize) 151 | { 152 | int h, w, h_p, w_p; 153 | 154 | float scale = cv::min((float)resize.height / (float)image.rows, (float)resize.width / (float)image.cols); 155 | scale = cv::min(scale, 1.1f); 156 | 157 | h = image.rows * scale; 158 | w = image.cols * scale; 159 | h_p = (resize.height - h) * 0.5f; 160 | w_p = (resize.width - w) * 0.5f; 161 | 162 | image_trans.scale = scale; 163 | image_trans.h_p = h_p; 164 | image_trans.w_p = w_p; 165 | 166 | // copy data to device memory 167 | // memcpy(bufferH[2], image.data, bindingsize[2]); 168 | // cudaMemcpyAsync(bufferD[2], bufferH[2], bindingsize[2], cudaMemcpyHostToDevice, stream); 169 | cudaMemcpyAsync(bufferD[2], image.data, bindingsize[2], cudaMemcpyHostToDevice, stream); 170 | 171 | AffineMatrix s2d, d2s; 172 | 173 | /*s2d.value[0] = scale; 174 | s2d.value[1] = 0; 175 | s2d.value[2] = (resize.width - scale * image.cols + scale - 1) * 0.5f; 176 | s2d.value[3] = 0; 177 | s2d.value[4] = scale; 178 | s2d.value[5] = (resize.height - scale * image.rows + scale - 1) * 0.5f;*/ 179 | 180 | d2s.value[0] = 1.0f / scale; 181 | d2s.value[1] = 0; 182 | d2s.value[2] = (image.cols - resize.width / scale + d2s.value[0] - 1) * 0.5f; 183 | d2s.value[3] = 0; 184 | d2s.value[4] = 1.0f / scale; 185 | d2s.value[5] = (image.rows - resize.height / scale + d2s.value[0] - 1) * 0.5f; 186 | 187 | /*cv::Mat m2x3_s2d(2, 3, CV_32F, s2d.value); 188 | cv::Mat m2x3_d2s(2, 3, CV_32F, d2s.value); 189 | cv::invertAffineTransform(m2x3_s2d, m2x3_d2s); 190 | memcpy(d2s.value, m2x3_d2s.ptr(0), sizeof(d2s.value));*/ 191 | 192 | dim3 block(128, 1); 193 | dim3 grid((resize.width + block.x - 1) / block.x, (resize.height + block.y - 1) / block.y); 194 | 195 | warpaffine_nearest_bgrbgr2rrggbb_kernel <<< grid, block, 0, stream >>> ( 196 | (uint8_t*)bufferD[2], image.cols * 3, image.cols, 197 | image.rows, (float*)bufferD[0], resize.width, 198 | resize.height, 0, d2s, h_p, w_p); 199 | } 200 | -------------------------------------------------------------------------------- /cpp/jetson_csi/preprocess.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "utils_detection.h" 4 | 5 | #include 6 | #include 7 | 8 | struct AffineMatrix 9 | { 10 | float value[6]; 11 | }; 12 | 13 | void cuda_preprocess(cv::Mat& image, preproc_struct& image_trans, std::vector& bufferH, 14 | std::vector& bufferD, std::vector& bindingsize, cudaStream_t& stream, cv::Size resize); 15 | -------------------------------------------------------------------------------- /cpp/jetson_csi/trt_infer.cpp: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/MadaoFY/yolov5_TensorRT_inference/4cc1ec7316c63b101da3c842c1f98dc82c90e70c/cpp/jetson_csi/trt_infer.cpp -------------------------------------------------------------------------------- /cpp/jetson_csi/trt_infer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | 11 | 12 | class Logger : public nvinfer1::ILogger 13 | { 14 | public: 15 | Severity reportableSeverity; 16 | 17 | Logger(Severity severity = Severity::kINFO); 18 | void log(Severity severity, const char* msg) noexcept override; 19 | }; 20 | 21 | 22 | bool load_engine(nvinfer1::IRuntime*& runtime, nvinfer1::ICudaEngine*& engine, const std::string& engine_dir, 23 | nvinfer1::ILogger& gLogger); 24 | 25 | void allocate_buffers(nvinfer1::ICudaEngine*& engine, std::vector& bufferH, std::vector& bufferD, std::vector& bindingsize, 26 | cv::Size img_size); 27 | 28 | float* do_inference(nvinfer1::IExecutionContext*& context, std::vector& bufferH, const std::vector& bufferD, 29 | cudaStream_t& stream, const std::vector& BindingSize); 30 | 31 | 32 | class yolo_trt_det 33 | { 34 | private: 35 | 36 | nvinfer1::IRuntime* _runtime = nullptr; 37 | nvinfer1::ICudaEngine* _engine = nullptr; 38 | nvinfer1::IExecutionContext* _context = nullptr; 39 | 40 | std::unordered_map catid_labels; 41 | color_dicts catid_colors; 42 | cv::Size img_size; 43 | cv::Size set_size; 44 | bool v8_head; 45 | 46 | std::vector cpu_buffer; 47 | std::vector gpu_buffer; 48 | std::vector BindingSize; 49 | cudaStream_t stream; 50 | 51 | public: 52 | yolo_trt_det(const std::string& engine_dir, const std::string& labels_dir, cv::Size img_size); 53 | ~yolo_trt_det(); 54 | 55 | std::vector draw_batch(std::vector& image_list, float conf, float iou, int max_det); 56 | 57 | cv::Mat draw(cv::Mat& image, float conf, float iou, int max_det); 58 | }; 59 | -------------------------------------------------------------------------------- /cpp/jetson_csi/utils_detection.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MadaoFY/yolov5_TensorRT_inference/4cc1ec7316c63b101da3c842c1f98dc82c90e70c/cpp/jetson_csi/utils_detection.cpp -------------------------------------------------------------------------------- /cpp/jetson_csi/utils_detection.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | 10 | 11 | struct color_dicts 12 | { 13 | std::unordered_map> color_map; 14 | std::vector catid; 15 | 16 | color_dicts(const std::unordered_map& catid_labels); 17 | }; 18 | 19 | 20 | struct preproc_struct 21 | { 22 | float* img = nullptr; 23 | float scale; 24 | int h_p; 25 | int w_p; 26 | 27 | ~preproc_struct(); 28 | }; 29 | 30 | 31 | 32 | std::unordered_map yaml_load_labels(const std::string& dir = "data.yaml"); 33 | 34 | void preprocess(cv::Mat& image, preproc_struct& image_trans, cv::Size resize); 35 | 36 | void fliter_boxes(float* const boxes, bool v8_head, const std::array& output_shape, float conf_thres, 37 | std::vector& keep_boxes, std::vector& keep_scores, std::vector& keep_classes); 38 | 39 | void scale_boxes(cv::Rect& box, const preproc_struct& preproc_res); 40 | 41 | void draw_boxes(cv::Mat image, const cv::Rect& box, float score, int class_id, 42 | std::unordered_map catid_labels, color_dicts& color_dicts); 43 | 44 | void 
imgresize(const cv::Mat& image, cv::Mat& input_image, float scale, cv::Size resize); 45 | 46 | template 47 | static bool SortScorePairDescend(const std::pair& pair1, const std::pair& pair2); 48 | 49 | template 50 | void max_score_idx(const std::vector& scores, float score_thres, T scores_idxs); 51 | 52 | float get_iou(const cv::Rect& bbox1, const cv::Rect& bbox2); 53 | 54 | void base_nms(const std::vector& bboxes, const std::vector& scores, const std::vector& catid, 55 | float score_threshold, float nms_threshold, std::vector& indices, int limit); 56 | -------------------------------------------------------------------------------- /cpp/kp_jetson_csi/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.10) 2 | 3 | set(CMAKE_CXX_STANDARD 14) 4 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 5 | set(CMAKE_CXX_EXTENSIONS ON) 6 | 7 | project(yolo_detect C CXX) 8 | 9 | add_definitions(-DAPI_EXPORTS) 10 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 11 | # SET(CMAKE_CXX_FLAGS_DEBUG "$ENV{CXXFLAGS} -O0 -g2 -ggdb") 12 | if(NOT CMAKE_BUILD_TYPE) 13 | SET(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} -O3") 14 | endif() 15 | set(src_list csi_kp_detect.cpp utils_detection.cpp utils_detection.h trt_infer.cpp trt_infer.h gstreamer.cpp gstreamer.h preprocess.cu preprocess.h) 16 | 17 | # CUDA 18 | # TODO(Call for PR): make cmake compatible with Windows 19 | set(CMAKE_CUDA_COMPILER /usr/local/cuda-10.2/bin/nvcc) 20 | enable_language(CUDA) 21 | find_package(CUDA REQUIRED) 22 | message(STATUS " libraries: ${CUDA_LIBRARIES}") 23 | message(STATUS " include path: ${CUDA_INCLUDE_DIRS}") 24 | 25 | 26 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different 27 | include_directories(/usr/local/cuda-10.2/include/) 28 | link_directories(/usr/local/cuda-10.2/lib64/) 29 | 30 | 31 | # tensorrt 32 | # set(TRT_DIR J:/tensorrt/TensorRT-8.4.3.1) 33 | # set(TRT_INCLUDE_DIRS ${TRT_DIR}/include/) 34 | # set(TRT_LIB_DIRS ${TRT_DIR}/lib/) 35 | 36 | # include_directories(${TRT_INCLUDE_DIRS}) 37 | # link_directories(${TRT_LIB_DIRS}) 38 | 39 | #include_directories(${PROJECT_SOURCE_DIR}/) 40 | #file(GLOB_RECURSE SRCS ${PROJECT_SOURCE_DIR}/*.cpp ${PROJECT_SOURCE_DIR}/*.h) 41 | 42 | # opencv 43 | find_package(OpenCV REQUIRED) 44 | include_directories( ${OpenCV_INCLUDE_DIRS} ) 45 | 46 | add_executable(${PROJECT_NAME} ${src_list}) 47 | target_link_libraries(${PROJECT_NAME} nvinfer) 48 | target_link_libraries(${PROJECT_NAME} cudart) 49 | target_link_libraries(${PROJECT_NAME} ${OpenCV_LIBS}) 50 | -------------------------------------------------------------------------------- /cpp/kp_jetson_csi/README.md: -------------------------------------------------------------------------------- 1 | # kp_jetson_csi 2 | 在jetson nano 4g上使用yolov5和hrnet进行摄像头人体关键点检测。 3 | 我对hrnet进行了轻量化改造,使其能在算力有限的平台上运行。替换上mobilenetv2的backbone后用coco2017数据集进行了训练,可满足单目标的人体关键点检测需求。 4 | 后续有时间可能会更新关键点检测模型,当然如果没时间魔改出更快更准的模型的话就算了... 
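The keypoint model outputs one heatmap per keypoint (17 for COCO, matching the indices used in `points_link.yaml`); the C++ code decodes these heatmaps back into image coordinates in `get_final_preds()` (see `utils_detection.cpp`). Below is a rough, language-agnostic illustration of that decoding step, shown in Python for brevity. It assumes a `(17, H, W)` heatmap array and the detected person box, and it ignores the letterbox-padding correction that the real code applies via `preproc_struct`.

```python
import numpy as np

def decode_heatmaps(heatmaps, box_xywh):
    """Map per-keypoint heatmap peaks back into the original image.

    heatmaps : (K, H, W) float array from the keypoint model (K = 17 for COCO).
    box_xywh : (x, y, w, h) of the detected person box in the original image.
    Returns keypoint coordinates (K, 2) and their confidence scores (K,).
    """
    x0, y0, bw, bh = box_xywh
    k, h, w = heatmaps.shape
    coords = np.zeros((k, 2), dtype=np.float32)
    scores = np.zeros(k, dtype=np.float32)
    for i in range(k):
        # peak location of the i-th heatmap = most likely position of keypoint i
        py, px = np.unravel_index(np.argmax(heatmaps[i]), (h, w))
        scores[i] = heatmaps[i, py, px]
        # scale the heatmap peak into box coordinates, then offset by the box origin
        coords[i] = (x0 + (px + 0.5) * bw / w, y0 + (py + 0.5) * bh / h)
    return coords, scores
```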
5 | 6 | ## 数据准备 7 | 相比于目标检测,这里需要多提供一个关键点检测的engine和关键点链接信息。 8 | 你可以使用我提供的以下两个onnx模型,在运行的设备上生成engine。或者自己训练一个专门用于检测人的yolo模型,和一个用于关键点检测的hrnet模型。 9 | 10 | yolov5s_person.onnx:https://pan.baidu.com/s/1mgbFLOENiIaTmfsyc2RtVw 11 | 提取码:qei0 12 | 13 | Myhrnet.onnx:https://pan.baidu.com/s/1rIR_CjOuu6qzaWsoirfP3A 14 | 提取码:43dw 15 | 16 | points_link.yaml文件里记录的是关键点的链接信息,用于绘图。 17 | 18 | 用cmake编译后,运行yolo_detect。 19 | 20 | ```shell 21 | yolo_detect --det_engine_dir=./yolov5s_person.engine --kp_engine_dir=./Myhrnet.engine --labels=./labels_det.yaml --pointlinker=./points_link.yaml 22 | ``` 23 | 24 | 参数说明: 25 | - ```--det_engine_dir``` 目标检测trt模型的保存路径 26 | - ```--kp_engine_dir``` 关键点检测trt模型的保存路径 27 | - ```--labels``` 模型labels的yaml文件 28 | - ```--pointlinker``` 关键点链接的yaml文件 29 | - ```--conf_thres``` nms的置信度设置 30 | - ```--iou_thres``` nms的iou设置 31 | - ```--max_det``` 输出的最大检测数量 32 | - ```--skip``` 隔帧检测帧数 33 | 34 | 更详细参数说明可以在csi_kp_detect.cpp中查看。 35 | -------------------------------------------------------------------------------- /cpp/kp_jetson_csi/csi_kp_detect.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MadaoFY/yolov5_TensorRT_inference/4cc1ec7316c63b101da3c842c1f98dc82c90e70c/cpp/kp_jetson_csi/csi_kp_detect.cpp -------------------------------------------------------------------------------- /cpp/kp_jetson_csi/gstreamer.cpp: -------------------------------------------------------------------------------- 1 | #include "gstreamer.h" 2 | 3 | 4 | std::string gs_pipeline(int capture_width, int capture_height, int display_width, int display_height, int framerate, int flip_method) 5 | { 6 | std::string result = "nvarguscamerasrc ! video/x-raw(memory:NVMM), width=(int)" + std::to_string(capture_width) + 7 | ", height=(int)" + std::to_string(capture_height) + 8 | ", format=(string)NV12, framerate=(fraction)" + std::to_string(framerate) + 9 | "/1 ! nvvidconv flip-method=" + std::to_string(flip_method) + 10 | " ! video/x-raw, width=(int)" + std::to_string(display_width) + 11 | ", height=(int)" + std::to_string(display_height) + 12 | ", format=(string)BGRx ! videoconvert ! video/x-raw, format=(string)BGR ! 
appsink"; 13 | 14 | return result; 15 | } 16 | -------------------------------------------------------------------------------- /cpp/kp_jetson_csi/gstreamer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | std::string gs_pipeline(int capture_width, int capture_height, int display_width, int display_height, int framerate, int flip_method); 6 | -------------------------------------------------------------------------------- /cpp/kp_jetson_csi/labels_det.yaml: -------------------------------------------------------------------------------- 1 | 0: person 2 | -------------------------------------------------------------------------------- /cpp/kp_jetson_csi/points_link.yaml: -------------------------------------------------------------------------------- 1 | 15: 13 2 | 13: 11 3 | 16: 14 4 | 14: 12 5 | 12: 11 6 | 11: 5 7 | 6: 12 8 | 5: 6 9 | 7: 5 10 | 8: 6 11 | 9: 7 12 | 10: 8 13 | 1: 2 14 | 0: 1 15 | 0: 2 16 | 1: 3 17 | 2: 4 18 | 3: 5 19 | 4: 6 20 | -------------------------------------------------------------------------------- /cpp/kp_jetson_csi/preprocess.cu: -------------------------------------------------------------------------------- 1 | #include "preprocess.h" 2 | 3 | #include 4 | 5 | 6 | __global__ void warpaffine_nearest_bgrbgr2rrggbb_kernel( 7 | uint8_t* src, int src_step_size, int src_width, 8 | int src_height, float* dst, int dst_width, 9 | int dst_height, uint8_t const_value_st, 10 | AffineMatrix d2s, int h_p, int w_p) 11 | { 12 | int dx = blockDim.x * blockIdx.x + threadIdx.x; 13 | int dy = blockDim.y * blockIdx.y + threadIdx.y; 14 | if (dx >= dst_width || dy >= dst_height) return; 15 | 16 | float m_x1 = d2s.value[0]; 17 | float m_y1 = d2s.value[1]; 18 | float m_z1 = d2s.value[2]; 19 | float m_x2 = d2s.value[3]; 20 | float m_y2 = d2s.value[4]; 21 | float m_z2 = d2s.value[5]; 22 | 23 | float c0, c1, c2; 24 | if (dy < h_p || dy >(dst_height - h_p) || dx < w_p || dx >(dst_width - w_p)) 25 | { 26 | // out of range 27 | c0 = const_value_st; 28 | c1 = const_value_st; 29 | c2 = const_value_st; 30 | } 31 | else 32 | { 33 | float src_x = m_x1 * (dx + 0.5f) + m_y1 * dy + m_z1 - 0.5f; 34 | float src_y = m_x2 * dx + m_y2 * (dy + 0.5f) + m_z2 - 0.5f; 35 | 36 | int sy_1 = floorf(src_y + 0.5f); 37 | int sx_1 = floorf(src_x + 0.5f); 38 | 39 | uint8_t const_value[] = { const_value_st, const_value_st, const_value_st }; 40 | uint8_t* p = const_value; 41 | 42 | if (sy_1 >= 0 && sy_1 <= src_height && sx_1 >=0 && sx_1 <= src_width) 43 | { 44 | p = src + sy_1 * src_step_size + sx_1 * 3; 45 | } 46 | 47 | c0 = p[0]; 48 | c1 = p[1]; 49 | c2 = p[2]; 50 | } 51 | 52 | // normalization 53 | c0 /= 255.0f; 54 | c1 /= 255.0f; 55 | c2 /= 255.0f; 56 | 57 | // bgrbgrbgr to rrrgggbbb 58 | int area = dst_width * dst_height; 59 | float* pdst_c0 = dst + dy * dst_width + dx; 60 | pdst_c0[0] = c2; 61 | pdst_c0[area] = c1; 62 | pdst_c0[2 * area] = c0; 63 | } 64 | 65 | __global__ void warpaffine_bilinear_bgrbgr2rrggbb_kernel( 66 | uint8_t* src, int src_step_size, int src_width, 67 | int src_height, float* dst, int dst_width, 68 | int dst_height, uint8_t const_value_st, 69 | AffineMatrix d2s, int h_p, int w_p) 70 | { 71 | int dx = blockDim.x * blockIdx.x + threadIdx.x; 72 | int dy = blockDim.y * blockIdx.y + threadIdx.y; 73 | if (dx >= dst_width || dy >= dst_height) return; 74 | 75 | float m_x1 = d2s.value[0]; 76 | float m_y1 = d2s.value[1]; 77 | float m_z1 = d2s.value[2]; 78 | float m_x2 = d2s.value[3]; 79 | float m_y2 = d2s.value[4]; 80 | float 
m_z2 = d2s.value[5]; 81 | 82 | float c0, c1, c2; 83 | if (dy < h_p || dy >(dst_height - h_p) || dx < w_p || dx >(dst_width - w_p)) 84 | { 85 | // out of range 86 | c0 = const_value_st; 87 | c1 = const_value_st; 88 | c2 = const_value_st; 89 | } 90 | else 91 | { 92 | float src_x = m_x1 * (dx + 0.5f) + m_y1 * dy + m_z1 - 0.5f; 93 | float src_y = m_x2 * dx + m_y2 * (dy + 0.5f) + m_z2 - 0.5f; 94 | 95 | int sy_1 = floorf(src_y); 96 | int sx_1 = floorf(src_x); 97 | int sy_2 = sy_1 + 1; 98 | int sx_2 = sx_1 + 1; 99 | 100 | uint8_t const_value[] = { const_value_st, const_value_st, const_value_st }; 101 | float a2 = src_y - sy_1; 102 | float a1 = 1.0f - a2; 103 | float b2 = src_x - sx_1; 104 | float b1 = 1.0f - b2; 105 | float w11 = a1 * b1; 106 | float w12 = a1 * b2; 107 | float w21 = a2 * b1; 108 | float w22 = a2 * b2; 109 | uint8_t* p11 = const_value; 110 | uint8_t* p12 = const_value; 111 | uint8_t* p21 = const_value; 112 | uint8_t* p22 = const_value; 113 | 114 | /*if (sy_1 >= 0) { 115 | if (sx_1 >= 0)*/ 116 | p11 = src + sy_1 * src_step_size + sx_1 * 3; 117 | 118 | //if (sx_2 < src_width) 119 | p12 = src + sy_1 * src_step_size + sx_2 * 3; 120 | //} 121 | 122 | /*if (sy_2 < src_height) { 123 | if (sx_1 >= 0)*/ 124 | p21 = src + sy_2 * src_step_size + sx_1 * 3; 125 | 126 | /*if (sx_2 < src_width)*/ 127 | p22 = src + sy_2 * src_step_size + sx_2 * 3; 128 | //} 129 | 130 | c0 = w11 * p11[0] + w12 * p12[0] + w21 * p21[0] + w22 * p22[0] + 0.5f; 131 | c1 = w11 * p11[1] + w12 * p12[1] + w21 * p21[1] + w22 * p22[1] + 0.5f; 132 | c2 = w11 * p11[2] + w12 * p12[2] + w21 * p21[2] + w22 * p22[2] + 0.5f; 133 | } 134 | 135 | // normalization 136 | c0 /= 255.0f; 137 | c1 /= 255.0f; 138 | c2 /= 255.0f; 139 | 140 | // bgrbgrbgr to rrrgggbbb 141 | int area = dst_width * dst_height; 142 | float* pdst_c0 = dst + dy * dst_width + dx; 143 | pdst_c0[0] = c2; 144 | pdst_c0[area] = c1; 145 | pdst_c0[2 * area] = c0; 146 | } 147 | 148 | 149 | void cuda_preprocess(cv::Mat& image, preproc_struct& image_trans, std::vector& bufferH, 150 | std::vector& bufferD, std::vector& bindingsize, cudaStream_t& stream, cv::Size resize) 151 | { 152 | int h, w, h_p, w_p; 153 | 154 | float scale = cv::min((float)resize.height / (float)image.rows, (float)resize.width / (float)image.cols); 155 | scale = cv::min(scale, 1.1f); 156 | 157 | h = image.rows * scale; 158 | w = image.cols * scale; 159 | h_p = (resize.height - h) * 0.5f; 160 | w_p = (resize.width - w) * 0.5f; 161 | 162 | image_trans.scale = scale; 163 | image_trans.ori_h = image.rows; 164 | image_trans.ori_w = image.cols; 165 | image_trans.h_p = h_p; 166 | image_trans.w_p = w_p; 167 | 168 | // copy data to device memory 169 | cudaMemcpyAsync(bufferD[2], image.data, bindingsize[2], cudaMemcpyHostToDevice, stream); 170 | 171 | AffineMatrix d2s; 172 | 173 | d2s.value[0] = 1.0f / scale; 174 | d2s.value[1] = 0; 175 | d2s.value[2] = (image.cols - resize.width / scale + d2s.value[0] - 1) * 0.5f; 176 | d2s.value[3] = 0; 177 | d2s.value[4] = 1.0f / scale; 178 | d2s.value[5] = (image.rows - resize.height / scale + d2s.value[0] - 1) * 0.5f; 179 | 180 | // AffineMatrix s2d; 181 | 182 | /*s2d.value[0] = scale; 183 | s2d.value[1] = 0; 184 | s2d.value[2] = (resize.width - scale * image.cols + scale - 1) * 0.5f; 185 | 186 | s2d.value[3] = 0; 187 | s2d.value[4] = scale; 188 | s2d.value[5] = (resize.height - scale * image.rows + scale - 1) * 0.5f;*/ 189 | 190 | /*cv::Mat m2x3_s2d(2, 3, CV_32F, s2d.value); 191 | cv::Mat m2x3_d2s(2, 3, CV_32F, d2s.value); 192 | cv::invertAffineTransform(m2x3_s2d, m2x3_d2s); 
193 | memcpy(d2s.value, m2x3_d2s.ptr(0), sizeof(d2s.value));*/ 194 | 195 | dim3 block(128, 1); 196 | dim3 grid((resize.width + block.x - 1) / block.x, (resize.height + block.y - 1) / block.y); 197 | 198 | warpaffine_nearest_bgrbgr2rrggbb_kernel <<< grid, block, 0, stream >>> ( 199 | (uint8_t*)bufferD[2], image.cols * 3, image.cols, 200 | image.rows, (float*)bufferD[0], resize.width, 201 | resize.height, 0, d2s, h_p, w_p); 202 | } 203 | -------------------------------------------------------------------------------- /cpp/kp_jetson_csi/preprocess.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "utils_detection.h" 4 | 5 | #include 6 | #include 7 | 8 | struct AffineMatrix 9 | { 10 | float value[6]; 11 | }; 12 | 13 | void cuda_preprocess(cv::Mat& image, preproc_struct& image_trans, std::vector& bufferH, 14 | std::vector& bufferD, std::vector& bindingsize, cudaStream_t& stream, cv::Size resize); 15 | -------------------------------------------------------------------------------- /cpp/kp_jetson_csi/trt_infer.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MadaoFY/yolov5_TensorRT_inference/4cc1ec7316c63b101da3c842c1f98dc82c90e70c/cpp/kp_jetson_csi/trt_infer.cpp -------------------------------------------------------------------------------- /cpp/kp_jetson_csi/trt_infer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | 11 | 12 | class Logger : public nvinfer1::ILogger 13 | { 14 | public: 15 | Severity reportableSeverity; 16 | 17 | Logger(Severity severity = Severity::kINFO); 18 | void log(Severity severity, const char* msg) noexcept override; 19 | }; 20 | 21 | 22 | bool load_engine(nvinfer1::IRuntime*& runtime, nvinfer1::ICudaEngine*& engine, const std::string& engine_dir, 23 | nvinfer1::ILogger& gLogger); 24 | 25 | void allocate_buffers(nvinfer1::ICudaEngine*& engine, std::vector& bufferH, std::vector& bufferD, std::vector& bindingsize); 26 | 27 | float* det_inference(nvinfer1::IExecutionContext*& context, std::vector& bufferH, const std::vector& bufferD, 28 | const std::vector& BindingSize, cudaStream_t& stream); 29 | 30 | float* kp_inference(nvinfer1::IExecutionContext*& context, std::vector& bufferH, const std::vector& bufferD, 31 | const std::vector& BindingSize, cudaStream_t& stream); 32 | 33 | 34 | class yolo_trt_det 35 | { 36 | private: 37 | 38 | nvinfer1::IRuntime* det_runtime = nullptr; 39 | nvinfer1::ICudaEngine* det_engine = nullptr; 40 | nvinfer1::IExecutionContext* det_context = nullptr; 41 | 42 | nvinfer1::IRuntime* kp_runtime = nullptr; 43 | nvinfer1::ICudaEngine* kp_engine = nullptr; 44 | nvinfer1::IExecutionContext* kp_context = nullptr; 45 | 46 | std::unordered_map catid_labels; 47 | std::vector> points_linker; 48 | color_dicts catid_colors; 49 | cv::Size img_resize; 50 | cv::Size kp_img_resize; 51 | 52 | bool v8_head; 53 | 54 | std::vector det_bufferh; 55 | std::vector det_bufferd; 56 | std::vector det_bindingsize; 57 | 58 | std::vector kp_bufferh; 59 | std::vector kp_bufferd; 60 | std::vector kp_bindingsize; 61 | cudaStream_t stream; 62 | 63 | int skip; 64 | std::vector< int > nms_idx; 65 | std::vector nms_boxes; 66 | std::vector nms_scores; 67 | std::vector nms_catid; 68 | 69 | uint64_t infer_times; 70 | uint32_t frams_num; 71 | 72 | public: 73 | yolo_trt_det() = default; 74 | yolo_trt_det(const 
std::string & det_engine_dir, const std::string & kp_engine_dir, const std::string & labels_dir, 75 | const std::string & pointlinker_dir, cv::Size img_size); 76 | ~yolo_trt_det(); 77 | 78 | cv::Mat draw(cv::Mat & image, float conf, float iou, int max_det, int skip); 79 | }; 80 | -------------------------------------------------------------------------------- /cpp/kp_jetson_csi/utils_detection.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MadaoFY/yolov5_TensorRT_inference/4cc1ec7316c63b101da3c842c1f98dc82c90e70c/cpp/kp_jetson_csi/utils_detection.cpp -------------------------------------------------------------------------------- /cpp/kp_jetson_csi/utils_detection.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | 10 | 11 | struct color_dicts 12 | { 13 | std::unordered_map> color_map; 14 | std::vector catid; 15 | 16 | color_dicts() {}; 17 | color_dicts(const std::unordered_map& catid_labels); 18 | }; 19 | 20 | 21 | struct preproc_struct 22 | { 23 | float* img = nullptr; 24 | float scale; 25 | int ori_h; 26 | int ori_w; 27 | int h_p; 28 | int w_p; 29 | 30 | ~preproc_struct(); 31 | }; 32 | 33 | 34 | 35 | std::unordered_map yaml_load_labels(const std::string& dir = "data.yaml"); 36 | 37 | std::vector> yaml_load_points_link(const std::string& dir); 38 | 39 | void preprocess(cv::Mat& image, preproc_struct& image_trans, const cv::Size& resize); 40 | 41 | void fliter_boxes(float* const boxes, bool v8_head, const std::array& output_shape, const float& conf_thres, 42 | std::vector& keep_boxes, std::vector& keep_scores, std::vector& keep_classes); 43 | 44 | void scale_boxes(cv::Rect& box, const preproc_struct& preproc_res); 45 | 46 | void draw_boxes(cv::Mat image, const cv::Rect& box, const float& score, const int& class_id, 47 | std::unordered_map catid_labels, color_dicts& color_dicts); 48 | 49 | void imgresize(const cv::Mat& image, cv::Mat& input_image, const float& scale, cv::Size resize); 50 | 51 | template 52 | static bool SortScorePairDescend(const std::pair& pair1, const std::pair& pair2); 53 | 54 | template 55 | void max_score_idx(const std::vector& scores, const float& score_thres, T& scores_idxs); 56 | 57 | float get_iou(const cv::Rect& bbox1, const cv::Rect& bbox2); 58 | 59 | void base_nms(const std::vector& bboxes, const std::vector& scores, const std::vector& catid, 60 | const float& score_threshold, const float& nms_threshold, std::vector& indices, const int& limit); 61 | 62 | void get_final_preds(float* const heatmaps, preproc_struct& keypoints_trans, const std::array& output_shape, 63 | const cv::Rect& bbox, std::vector& keypoints_scorce, std::vector& keypoints); 64 | 65 | void draw_keypoints(cv::Mat image, const std::vector& keypoints, std::vector& keypoints_score, 66 | float score, const std::vector>& points_linker); 67 | -------------------------------------------------------------------------------- /cpp/video_detect/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.10) 2 | 3 | set(CMAKE_CXX_STANDARD 14) 4 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 5 | set(CMAKE_CXX_EXTENSIONS ON) 6 | 7 | project(yolo_tensorrt C CXX) 8 | 9 | add_definitions(-DAPI_EXPORTS) 10 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 11 | 12 | if(NOT CMAKE_BUILD_TYPE) 13 | set(CMAKE_BUILD_TYPE Release CACHE STRING "Build Type" FORCE) 14 | 
set(CMAKE_CXX_FLAGS_Release "$ENV{CXXFLAGS} -O3 -Wall") 15 | endif() 16 | 17 | 18 | set(src_list main.cpp utils_detection.cpp utils_detection.h trt_infer.cpp trt_infer.h preprocess.cu preprocess.h) 19 | 20 | # TODO(Call for PR): make cmake compatible with Windows 21 | set(CMAKE_CUDA_COMPILER E:/NV/cuda11.7/bin/nvcc) 22 | enable_language(CUDA) 23 | 24 | # CUDA 25 | # TODO(Call for PR): make cmake compatible with Windows 26 | find_package(CUDA REQUIRED) 27 | message(STATUS " libraries: ${CUDA_LIBRARIES}") 28 | message(STATUS " include path: ${CUDA_INCLUDE_DIRS}") 29 | 30 | 31 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different 32 | include_directories(E:/NV/cuda11.7/include/) 33 | link_directories(E:/NV/cuda11.7/lib/x64/) 34 | 35 | 36 | # tensorrt 37 | set(TRT_DIR J:/tensorrt/TensorRT-8.4.3.1) 38 | set(TRT_INCLUDE_DIRS ${TRT_DIR}/include/) 39 | set(TRT_LIB_DIRS ${TRT_DIR}/lib/) 40 | 41 | include_directories(${TRT_INCLUDE_DIRS}) 42 | 43 | 44 | # opencv 45 | set(CMAKE_PREFIX_PATH E:/opencv/build/x64/vc16/lib) 46 | find_package(OpenCV REQUIRED) 47 | include_directories( ${OpenCV_INCLUDE_DIRS} ) 48 | 49 | add_executable(${PROJECT_NAME} ${src_list}) 50 | target_link_libraries(${PROJECT_NAME} nvinfer) 51 | target_link_libraries(${PROJECT_NAME} cudart) 52 | target_link_libraries(${PROJECT_NAME} ${OpenCV_LIBS}) -------------------------------------------------------------------------------- /cpp/video_detect/README.md: -------------------------------------------------------------------------------- 1 | # video_detect 2 | 用于视频目标检测的c++代码。 3 | 用cmake编译后,运行yolo_detect。 4 | 5 | ```shell 6 | yolo_detect --engine_dir=./yolov5s.engine --video_dir=./sample_1080p_h265.mp4 --labels=./labels_coco.yaml 7 | ``` 8 | 9 | 参数说明: 10 | - ```--engine_dir``` trt模型的保存路径 11 | - ```--video_dir``` 视频源路径 12 | - ```--labels``` 模型labels文件 13 | - ```--conf_thres``` nms的置信度设置 14 | - ```--iou_thres``` nms的iou设置 15 | - ```--max_det``` nms输出的最大检测数量 16 | 17 | 更详细参数说明可以在main.cpp中查看。 18 | -------------------------------------------------------------------------------- /cpp/video_detect/labels_coco.yaml: -------------------------------------------------------------------------------- 1 | 0: person 2 | 1: bicycle 3 | 2: car 4 | 3: motorcycle 5 | 4: airplane 6 | 5: bus 7 | 6: train 8 | 7: truck 9 | 8: boat 10 | 9: traffic light 11 | 10: fire hydrant 12 | 11: stop sign 13 | 12: parking meter 14 | 13: bench 15 | 14: bird 16 | 15: cat 17 | 16: dog 18 | 17: horse 19 | 18: sheep 20 | 19: cow 21 | 20: elephant 22 | 21: bear 23 | 22: zebra 24 | 23: giraffe 25 | 24: backpack 26 | 25: umbrella 27 | 26: handbag 28 | 27: tie 29 | 28: suitcase 30 | 29: frisbee 31 | 30: skis 32 | 31: snowboard 33 | 32: sports ball 34 | 33: kite 35 | 34: baseball bat 36 | 35: baseball glove 37 | 36: skateboard 38 | 37: surfboard 39 | 38: tennis racket 40 | 39: bottle 41 | 40: wine glass 42 | 41: cup 43 | 42: fork 44 | 43: knife 45 | 44: spoon 46 | 45: bowl 47 | 46: banana 48 | 47: apple 49 | 48: sandwich 50 | 49: orange 51 | 50: broccoli 52 | 51: carrot 53 | 52: hot dog 54 | 53: pizza 55 | 54: donut 56 | 55: cake 57 | 56: chair 58 | 57: couch 59 | 58: potted plant 60 | 59: bed 61 | 60: dining table 62 | 61: toilet 63 | 62: tv 64 | 63: laptop 65 | 64: mouse 66 | 65: remote 67 | 66: keyboard 68 | 67: cell phone 69 | 68: microwave 70 | 69: oven 71 | 70: toaster 72 | 71: sink 73 | 72: refrigerator 74 | 73: book 75 | 74: clock 76 | 75: vase 77 | 76: scissors 78 | 77: teddy bear 79 | 78: hair drier 80 | 79: toothbrush 81 | 
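The map above is the file passed with `--labels`: each entry is simply `class id: class name`. The Python scripts read it as YAML, and the C++ programs declare `yaml_load_labels()` for the same purpose. A minimal sketch of loading it (assuming PyYAML is available; the repo's own loaders may differ in detail):

```python
import yaml

with open('labels_coco.yaml', 'r', encoding='utf-8') as f:
    catid_labels = yaml.safe_load(f)   # -> {0: 'person', 1: 'bicycle', ..., 79: 'toothbrush'}

print(catid_labels[0])   # person
```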
-------------------------------------------------------------------------------- /cpp/video_detect/main.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MadaoFY/yolov5_TensorRT_inference/4cc1ec7316c63b101da3c842c1f98dc82c90e70c/cpp/video_detect/main.cpp -------------------------------------------------------------------------------- /cpp/video_detect/preprocess.cu: -------------------------------------------------------------------------------- 1 | #include "preprocess.h" 2 | 3 | #include 4 | 5 | 6 | __global__ void warpaffine_nearest_bgrbgr2rrggbb_kernel( 7 | uint8_t* src, int src_step_size, int src_width, 8 | int src_height, float* dst, int dst_width, 9 | int dst_height, uint8_t const_value_st, 10 | AffineMatrix d2s, int h_p, int w_p) 11 | { 12 | int dx = blockDim.x * blockIdx.x + threadIdx.x; 13 | int dy = blockDim.y * blockIdx.y + threadIdx.y; 14 | if (dx >= dst_width || dy >= dst_height) return; 15 | 16 | float m_x1 = d2s.value[0]; 17 | float m_y1 = d2s.value[1]; 18 | float m_z1 = d2s.value[2]; 19 | float m_x2 = d2s.value[3]; 20 | float m_y2 = d2s.value[4]; 21 | float m_z2 = d2s.value[5]; 22 | 23 | float c0, c1, c2; 24 | if (dy < h_p || dy >(dst_height - h_p) || dx < w_p || dx >(dst_width - w_p)) 25 | { 26 | // out of range 27 | c0 = const_value_st; 28 | c1 = const_value_st; 29 | c2 = const_value_st; 30 | } 31 | else 32 | { 33 | float src_x = m_x1 * (dx + 0.5f) + m_y1 * dy + m_z1 - 0.5f; 34 | float src_y = m_x2 * dx + m_y2 * (dy + 0.5f) + m_z2 - 0.5f; 35 | 36 | int sy_1 = floorf(src_y + 0.5f); 37 | int sx_1 = floorf(src_x + 0.5f); 38 | 39 | uint8_t const_value[] = { const_value_st, const_value_st, const_value_st }; 40 | uint8_t* p = const_value; 41 | 42 | if (sy_1 >= 0 && sy_1 <= src_height && sx_1 >=0 && sx_1 <= src_width) 43 | { 44 | p = src + sy_1 * src_step_size + sx_1 * 3; 45 | } 46 | 47 | c0 = p[0]; 48 | c1 = p[1]; 49 | c2 = p[2]; 50 | } 51 | 52 | // normalization 53 | c0 = c0 / 255.0f; 54 | c1 = c1 / 255.0f; 55 | c2 = c2 / 255.0f; 56 | 57 | // bgrbgrbgr to rrrgggbbb 58 | int area = dst_width * dst_height; 59 | float* pdst_c0 = dst + dy * dst_width + dx; 60 | pdst_c0[0] = c2; 61 | pdst_c0[area] = c1; 62 | pdst_c0[2 * area] = c0; 63 | } 64 | 65 | __global__ void warpaffine_bilinear_bgrbgr2rrggbb_kernel( 66 | uint8_t* src, int src_step_size, int src_width, 67 | int src_height, float* dst, int dst_width, 68 | int dst_height, uint8_t const_value_st, 69 | AffineMatrix d2s, int h_p, int w_p) 70 | { 71 | int dx = blockDim.x * blockIdx.x + threadIdx.x; 72 | int dy = blockDim.y * blockIdx.y + threadIdx.y; 73 | if (dx >= dst_width || dy >= dst_height) return; 74 | 75 | float m_x1 = d2s.value[0]; 76 | float m_y1 = d2s.value[1]; 77 | float m_z1 = d2s.value[2]; 78 | float m_x2 = d2s.value[3]; 79 | float m_y2 = d2s.value[4]; 80 | float m_z2 = d2s.value[5]; 81 | 82 | float c0, c1, c2; 83 | if (dy < h_p || dy >(dst_height - h_p) || dx < w_p || dx >(dst_width - w_p)) 84 | { 85 | // out of range 86 | c0 = const_value_st; 87 | c1 = const_value_st; 88 | c2 = const_value_st; 89 | } 90 | else 91 | { 92 | float src_x = m_x1 * (dx + 0.5f) + m_y1 * dy + m_z1 - 0.5f; 93 | float src_y = m_x2 * dx + m_y2 * (dy + 0.5f) + m_z2 - 0.5f; 94 | 95 | int sy_1 = floorf(src_y); 96 | int sx_1 = floorf(src_x); 97 | int sy_2 = sy_1 + 1; 98 | int sx_2 = sx_1 + 1; 99 | 100 | uint8_t const_value[] = { const_value_st, const_value_st, const_value_st }; 101 | float a2 = src_y - sy_1; 102 | float a1 = 1.0f - a2; 103 | float b2 = src_x - sx_1; 104 | float 
b1 = 1.0f - b2; 105 | float w11 = a1 * b1; 106 | float w12 = a1 * b2; 107 | float w21 = a2 * b1; 108 | float w22 = a2 * b2; 109 | uint8_t* p11 = const_value; 110 | uint8_t* p12 = const_value; 111 | uint8_t* p21 = const_value; 112 | uint8_t* p22 = const_value; 113 | 114 | /*if (sy_1 >= 0) { 115 | if (sx_1 >= 0)*/ 116 | p11 = src + sy_1 * src_step_size + sx_1 * 3; 117 | 118 | //if (sx_2 < src_width) 119 | p12 = src + sy_1 * src_step_size + sx_2 * 3; 120 | //} 121 | 122 | /*if (sy_2 < src_height) { 123 | if (sx_1 >= 0)*/ 124 | p21 = src + sy_2 * src_step_size + sx_1 * 3; 125 | 126 | /*if (sx_2 < src_width)*/ 127 | p22 = src + sy_2 * src_step_size + sx_2 * 3; 128 | //} 129 | 130 | c0 = w11 * p11[0] + w12 * p12[0] + w21 * p21[0] + w22 * p22[0] + 0.5f; 131 | c1 = w11 * p11[1] + w12 * p12[1] + w21 * p21[1] + w22 * p22[1] + 0.5f; 132 | c2 = w11 * p11[2] + w12 * p12[2] + w21 * p21[2] + w22 * p22[2] + 0.5f; 133 | } 134 | 135 | // normalization 136 | c0 /= 255.0f; 137 | c1 /= 255.0f; 138 | c2 /= 255.0f; 139 | 140 | // bgrbgrbgr to rrrgggbbb 141 | int area = dst_width * dst_height; 142 | float* pdst_c0 = dst + dy * dst_width + dx; 143 | dst[dy * dst_width + dx] = c2; 144 | dst[dy * dst_width + dx + area] = c1; 145 | dst[dy * dst_width + dx + 2 * area] = c0; 146 | } 147 | 148 | 149 | void cuda_preprocess(cv::Mat& image, preproc_struct& image_trans, std::vector& bufferH, 150 | std::vector& bufferD, std::vector& bindingsize, cudaStream_t& stream, cv::Size resize) 151 | { 152 | int h, w, h_p, w_p; 153 | 154 | float scale = cv::min((float)resize.height / (float)image.rows, (float)resize.width / (float)image.cols); 155 | scale = cv::min(scale, 1.1f); 156 | 157 | h = image.rows * scale; 158 | w = image.cols * scale; 159 | h_p = (resize.height - h) * 0.5f; 160 | w_p = (resize.width - w) * 0.5f; 161 | 162 | image_trans.scale = scale; 163 | image_trans.h_p = h_p; 164 | image_trans.w_p = w_p; 165 | 166 | // copy data to device memory 167 | memcpy(bufferH[2], image.data, bindingsize[2]); 168 | cudaMemcpyAsync(bufferD[2], bufferH[2], bindingsize[2], cudaMemcpyHostToDevice, stream); 169 | 170 | // AffineMatrix s2d; 171 | /*s2d.value[0] = scale; 172 | s2d.value[1] = 0; 173 | s2d.value[2] = (resize.width - scale * image.cols + scale - 1) * 0.5f; 174 | s2d.value[3] = 0; 175 | s2d.value[4] = scale; 176 | s2d.value[5] = (resize.height - scale * image.rows + scale - 1) * 0.5f;*/ 177 | 178 | /*cv::Mat m2x3_s2d(2, 3, CV_32F, s2d.value); 179 | cv::Mat m2x3_d2s(2, 3, CV_32F, d2s.value); 180 | cv::invertAffineTransform(m2x3_s2d, m2x3_d2s); 181 | memcpy(d2s.value, m2x3_d2s.ptr(0), sizeof(d2s.value));*/ 182 | 183 | AffineMatrix d2s; 184 | 185 | d2s.value[0] = 1.0f / scale; 186 | d2s.value[1] = 0; 187 | d2s.value[2] = (image.cols - resize.width / scale + d2s.value[0] - 1) * 0.5f; 188 | d2s.value[3] = 0; 189 | d2s.value[4] = 1.0f / scale; 190 | d2s.value[5] = (image.rows - resize.height / scale + d2s.value[0] - 1) * 0.5f; 191 | 192 | dim3 block(128, 1); 193 | dim3 grid((resize.width + block.x - 1) / block.x, (resize.height + block.y - 1) / block.y); 194 | 195 | warpaffine_nearest_bgrbgr2rrggbb_kernel <<< grid, block, 0, stream >>> ( 196 | (uint8_t*)bufferD[2], image.cols * 3, image.cols, 197 | image.rows, (float*)bufferD[0], resize.width, 198 | resize.height, 0, d2s, h_p, w_p); 199 | } -------------------------------------------------------------------------------- /cpp/video_detect/preprocess.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "utils_detection.h" 
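// Summary of the GPU preprocessing implemented in preprocess.cu above: the frame is
// letterboxed with scale = min(resize / original, 1.1), padded symmetrically by h_p / w_p
// pixels, sampled through the inverse (dst -> src) affine matrix d2s, converted from
// interleaved BGR to planar RGB, and divided by 255. scale, h_p and w_p are written back
// into preproc_struct so scale_boxes() can later map detections onto the original frame.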
4 | 5 | #include 6 | #include 7 | 8 | struct AffineMatrix 9 | { 10 | float value[6]; 11 | }; 12 | 13 | void cuda_preprocess(cv::Mat& image, preproc_struct& image_trans, std::vector& bufferH, 14 | std::vector& bufferD, std::vector& bindingsize, cudaStream_t& stream, cv::Size resize); -------------------------------------------------------------------------------- /cpp/video_detect/trt_infer.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MadaoFY/yolov5_TensorRT_inference/4cc1ec7316c63b101da3c842c1f98dc82c90e70c/cpp/video_detect/trt_infer.cpp -------------------------------------------------------------------------------- /cpp/video_detect/trt_infer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | 11 | 12 | class Logger : public nvinfer1::ILogger 13 | { 14 | public: 15 | Severity reportableSeverity; 16 | 17 | Logger(Severity severity = Severity::kINFO); 18 | void log(Severity severity, const char* msg) noexcept override; 19 | }; 20 | 21 | 22 | bool load_engine(nvinfer1::IRuntime*& runtime, nvinfer1::ICudaEngine*& engine, const std::string& engine_dir, 23 | nvinfer1::ILogger& gLogger); 24 | 25 | void allocate_buffers(nvinfer1::ICudaEngine*& engine, 26 | std::vector& bufferH, std::vector& bufferD, std::vector& bindingsize, cv::Size img_size); 27 | 28 | float* do_inference(nvinfer1::IExecutionContext*& context, std::vector& bufferH, const std::vector& bufferD, 29 | cudaStream_t& stream, const std::vector& BindingSize); 30 | 31 | 32 | class yolo_trt_det 33 | { 34 | private: 35 | 36 | nvinfer1::IRuntime* _runtime = nullptr; 37 | nvinfer1::ICudaEngine* _engine = nullptr; 38 | nvinfer1::IExecutionContext* _context = nullptr; 39 | 40 | std::unordered_map catid_labels; 41 | color_dicts catid_colors; 42 | cv::Size set_size; 43 | bool v8_head; 44 | 45 | std::vector cpu_buffer; 46 | std::vector gpu_buffer; 47 | std::vector BindingSize; 48 | cudaStream_t stream; 49 | 50 | public: 51 | yolo_trt_det(const std::string& engine_dir, const std::string& labels_dir, cv::Size img_size); 52 | ~yolo_trt_det(); 53 | 54 | //std::vector draw_batch(std::vector& image_list, float conf, float iou, int max_det); 55 | 56 | cv::Mat draw(cv::Mat& image, float conf, float iou, int max_det); 57 | }; -------------------------------------------------------------------------------- /cpp/video_detect/utils_detection.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MadaoFY/yolov5_TensorRT_inference/4cc1ec7316c63b101da3c842c1f98dc82c90e70c/cpp/video_detect/utils_detection.cpp -------------------------------------------------------------------------------- /cpp/video_detect/utils_detection.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | 10 | 11 | struct color_dicts 12 | { 13 | std::unordered_map> color_map; 14 | std::vector catid; 15 | 16 | color_dicts(const std::unordered_map& catid_labels); 17 | }; 18 | 19 | 20 | struct preproc_struct 21 | { 22 | float* img = nullptr; 23 | float scale; 24 | int h_p; 25 | int w_p; 26 | 27 | ~preproc_struct(); 28 | }; 29 | 30 | 31 | 32 | std::unordered_map yaml_load_labels(const std::string& dir = "data.yaml"); 33 | 34 | void preprocess(cv::Mat& image, preproc_struct& image_trans, cv::Size 
resize); 35 | 36 | void fliter_boxes(float* const boxes, bool v8_head, const std::array& output_shape, float conf_thres, 37 | std::vector& keep_boxes, std::vector& keep_scores, std::vector& keep_classes); 38 | 39 | void scale_boxes(cv::Rect& box, const preproc_struct& preproc_res); 40 | 41 | void draw_boxes(cv::Mat image, const cv::Rect& box, float score, int class_id, 42 | std::unordered_map catid_labels, color_dicts& color_dicts); 43 | 44 | void imgresize(const cv::Mat& image, cv::Mat& input_image, float scale, cv::Size resize); 45 | 46 | template 47 | static bool SortScorePairDescend(const std::pair& pair1, const std::pair& pair2); 48 | 49 | template 50 | void max_score_idx(const std::vector& scores, float score_thres, T scores_idxs); 51 | 52 | float get_iou(const cv::Rect& bbox1, const cv::Rect& bbox2); 53 | 54 | void base_nms(const std::vector& bboxes, const std::vector& scores, const std::vector& catid, float score_threshold, float nms_threshold, std::vector& indices, int limit); 55 | -------------------------------------------------------------------------------- /doc/yolov5s_det.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MadaoFY/yolov5_TensorRT_inference/4cc1ec7316c63b101da3c842c1f98dc82c90e70c/doc/yolov5s_det.png -------------------------------------------------------------------------------- /labels_coco.yaml: -------------------------------------------------------------------------------- 1 | 0: person 2 | 1: bicycle 3 | 2: car 4 | 3: motorcycle 5 | 4: airplane 6 | 5: bus 7 | 6: train 8 | 7: truck 9 | 8: boat 10 | 9: traffic light 11 | 10: fire hydrant 12 | 11: stop sign 13 | 12: parking meter 14 | 13: bench 15 | 14: bird 16 | 15: cat 17 | 16: dog 18 | 17: horse 19 | 18: sheep 20 | 19: cow 21 | 20: elephant 22 | 21: bear 23 | 22: zebra 24 | 23: giraffe 25 | 24: backpack 26 | 25: umbrella 27 | 26: handbag 28 | 27: tie 29 | 28: suitcase 30 | 29: frisbee 31 | 30: skis 32 | 31: snowboard 33 | 32: sports ball 34 | 33: kite 35 | 34: baseball bat 36 | 35: baseball glove 37 | 36: skateboard 38 | 37: surfboard 39 | 38: tennis racket 40 | 39: bottle 41 | 40: wine glass 42 | 41: cup 43 | 42: fork 44 | 43: knife 45 | 44: spoon 46 | 45: bowl 47 | 46: banana 48 | 47: apple 49 | 48: sandwich 50 | 49: orange 51 | 50: broccoli 52 | 51: carrot 53 | 52: hot dog 54 | 53: pizza 55 | 54: donut 56 | 55: cake 57 | 56: chair 58 | 57: couch 59 | 58: potted plant 60 | 59: bed 61 | 60: dining table 62 | 61: toilet 63 | 62: tv 64 | 63: laptop 65 | 64: mouse 66 | 65: remote 67 | 66: keyboard 68 | 67: cell phone 69 | 68: microwave 70 | 69: oven 71 | 70: toaster 72 | 71: sink 73 | 72: refrigerator 74 | 73: book 75 | 74: clock 76 | 75: vase 77 | 76: scissors 78 | 77: teddy bear 79 | 78: hair drier 80 | 79: toothbrush 81 | -------------------------------------------------------------------------------- /labels_voc.yaml: -------------------------------------------------------------------------------- 1 | 0: bus 2 | 1: train 3 | 2: cow 4 | 3: diningtable 5 | 4: motorbike 6 | 5: horse 7 | 6: sofa 8 | 7: bicycle 9 | 8: tvmonitor 10 | 9: aeroplane 11 | 10: boat 12 | 11: sheep 13 | 12: pottedplant 14 | 13: bird 15 | 14: cat 16 | 15: bottle 17 | 16: dog 18 | 17: car 19 | 18: chair 20 | 19: person 21 | -------------------------------------------------------------------------------- /models_onnx/README.md: -------------------------------------------------------------------------------- 1 | 用于存放待量化的onnx 2 | 
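Before moving on to the Python tooling, it helps to spell out the first filtering stage shared by both implementations. `fliter_boxes` (declared in `cpp/video_detect/utils_detection.h` above) and `non_max_suppression` (in `utils/utils_detection.py` below) start from the raw YOLO output of shape `[bs, num_boxes, 4 + 1 + num_classes]` for a v5-style head. Below is a NumPy sketch of that step under the v5 layout (xywh box, objectness, per-class scores); the function and variable names are illustrative, not part of the repo:

```python
import numpy as np

def filter_boxes_v5(pred, conf_thres=0.25):
    """pred: (num_boxes, 4 + 1 + nc) raw v5-style output for one image."""
    obj = pred[:, 4:5]                    # objectness
    cls_conf = pred[:, 5:] * obj          # conf = obj_conf * cls_conf
    class_id = cls_conf.argmax(axis=1)
    score = cls_conf[np.arange(len(class_id)), class_id]
    keep = score > conf_thres
    # boxes are still xywh here; NMS and the xywh -> xyxy conversion happen afterwards
    return pred[keep, :4], score[keep], class_id[keep]

# Hypothetical usage with a (8500, 85) COCO output:
# boxes, scores, classes = filter_boxes_v5(raw_output[0])
```

For the v8-style head there is no objectness column, so the class scores start at column 4, which is exactly the `v8_head` switch carried through both code paths.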
-------------------------------------------------------------------------------- /models_trt/README.md: -------------------------------------------------------------------------------- 1 | 用于存放导出后的engine 2 | -------------------------------------------------------------------------------- /onnx2trt.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import numpy as np 4 | import tensorrt as trt 5 | 6 | 7 | from utils import calibrator 8 | 9 | __all__ = [ 10 | 'build_engine', 11 | 'onnx2trt' 12 | ] 13 | 14 | 15 | def AddEfficientNMSPlugin(conf_thres=0.25, iou_thres=0.45, max_det=200, box_coding=1): 16 | """ 17 | 添加efficientNMS 18 | 19 | score_threshold: score_thresh 20 | iou_threshold: iou_thresh 21 | max_output_boxes: detections_per_img 22 | box_coding: 0->[x1, y1, x2, y2], 1->[x, y, w, h] 23 | """ 24 | for c in trt.get_plugin_registry().plugin_creator_list: 25 | if c.name == "EfficientNMS_TRT": 26 | print(f'Succeeded finding {c.name}') 27 | parameter = [ 28 | trt.PluginField("score_threshold", np.float32(conf_thres), trt.PluginFieldType.FLOAT32), 29 | trt.PluginField("iou_threshold", np.float32(iou_thres), trt.PluginFieldType.FLOAT32), 30 | trt.PluginField("max_output_boxes", np.int32(max_det), trt.PluginFieldType.INT32), 31 | trt.PluginField("background_class", np.int32(-1), trt.PluginFieldType.INT32), # background_class: -1, no background class 32 | trt.PluginField("score_activation", np.int32(0), trt.PluginFieldType.INT32), # score_activation: 0->False, 1->True 33 | trt.PluginField("box_coding", np.int32(box_coding), trt.PluginFieldType.INT32) 34 | ] 35 | return c.create_plugin(c.name, trt.PluginFieldCollection(parameter)) 36 | return None 37 | 38 | 39 | def build_engine( 40 | onnx_file, model_engine, min_shape, opt_shape, max_shape, 41 | fp16=False, int8=False, imgs_dir=None, imgs_list=None, n_iteration=128, cache_file=None, 42 | v8_head=False, add_nms=False, conf_thres=0.25, iou_thres=0.45, max_det=200, box_coding=1 43 | ): 44 | logger = trt.Logger(trt.Logger.ERROR) 45 | trt.init_libnvinfer_plugins(logger, namespace="") 46 | builder = trt.Builder(logger) 47 | network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)) 48 | config = builder.create_builder_config() 49 | config.max_workspace_size = (4 << 30) 50 | # config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 4 << 30) 51 | 52 | # Parse model file 53 | parser = trt.OnnxParser(network, logger) 54 | if not os.path.exists(onnx_file): 55 | print("ONNX file is not exists!") 56 | exit() 57 | print("Succeeded finding .onnx file!") 58 | with open(onnx_file, "rb") as model: 59 | if not parser.parse(model.read()): 60 | print("Failed parsing .onnx file!") 61 | for error in range(parser.num_errors): 62 | print(parser.get_error(error)) 63 | exit() 64 | else: 65 | print("Succeeded parsing .onnx file!") 66 | 67 | if v8_head: 68 | outputTensor = network.get_output(0) 69 | print(f'v8 {outputTensor.name} shape:{outputTensor.shape}') 70 | network.unmark_output(outputTensor) 71 | outputTensor = network.add_shuffle(outputTensor) 72 | outputTensor.first_transpose = (0, 2, 1) 73 | network.mark_output(outputTensor.get_output(0)) 74 | 75 | # 添加nms算子 76 | if add_nms: 77 | """ 78 | 对原输出进行预处理,拆分成 目标框数据 和 类别置信度数据 两个矩阵,背景置信度要与类别置信度相乘。 79 | [1, 8500, 4 + 1 + 80] ——> [1, 8500, 4] + [1, 8500, 1 + 80] ——> [1, 8500, 4] + [1, 8500, 80] 80 | """ 81 | outputTensor = network.get_output(0) 82 | print(f'{outputTensor.name} shape:{outputTensor.shape}') 83 | bs, 
num_boxes, det_res = outputTensor.shape 84 | network.unmark_output(outputTensor) 85 | xycwh = network.add_slice(outputTensor, (0, 0, 0), (bs, num_boxes, 4), (1, 1, 1)) 86 | if v8_head: 87 | obj = network.add_slice( 88 | outputTensor, (0, 0, 4), (bs, num_boxes, det_res - 4), (1, 1, 1) 89 | ) 90 | else: 91 | scores = network.add_slice(outputTensor, (0, 0, 4), (bs, num_boxes, 1), (1, 1, 1)) 92 | obj = network.add_slice(outputTensor, (0, 0, 5), (bs, num_boxes, det_res - 5), (1, 1, 1)) 93 | obj = network.add_elementwise( 94 | scores.get_output(0), obj.get_output(0), trt.ElementWiseOperation.PROD 95 | ) 96 | print('Add EfficientNMS_TRT!') 97 | nms = AddEfficientNMSPlugin(conf_thres, iou_thres, max_det, box_coding) 98 | pluginlayer = network.add_plugin_v2([xycwh.get_output(0), obj.get_output(0)], nms) 99 | pluginlayer.get_output(0).name = "num_dets" 100 | pluginlayer.get_output(1).name = "det_boxes" 101 | pluginlayer.get_output(2).name = "det_scores" 102 | pluginlayer.get_output(3).name = "det_classes" 103 | for i in range(4): 104 | network.mark_output(pluginlayer.get_output(i)) 105 | 106 | inputTensor = network.get_input(0) 107 | print(f'{inputTensor.name} shape:{inputTensor.shape}') 108 | batch, c, h, w = inputTensor.shape 109 | if batch != -1: 110 | min_shape[0], opt_shape[0], max_shape[0] = batch, batch, batch 111 | if c != -1: 112 | min_shape[1], opt_shape[1], max_shape[1] = c, c, c 113 | if h != -1: 114 | min_shape[-2], opt_shape[-2], max_shape[-2] = h, h, h 115 | if w != -1: 116 | min_shape[-1], opt_shape[-1], max_shape[-1] = w, w, w 117 | 118 | profile = builder.create_optimization_profile() 119 | profile.set_shape(inputTensor.name, min_shape, opt_shape, max_shape) 120 | config.add_optimization_profile(profile) 121 | 122 | # Quantization 123 | if fp16: 124 | config.set_flag(trt.BuilderFlag.FP16) 125 | if int8 and imgs_dir: 126 | config.set_flag(trt.BuilderFlag.INT8) 127 | if imgs_list is None: 128 | imgs_list = os.listdir(imgs_dir) 129 | config.int8_calibrator = calibrator.MyCalibrator( 130 | calibrationpath=imgs_dir, 131 | imgslist=imgs_list, 132 | nCalibration=n_iteration, 133 | inputShape=max_shape, 134 | cacheFile=cache_file 135 | ) 136 | 137 | 138 | print('Now, engine is building!') 139 | plan = builder.build_serialized_network(network, config) 140 | if plan is None: 141 | print("Failed building engine!") 142 | # exit() 143 | with open(model_engine, "wb") as f: 144 | f.write(plan) 145 | print('Engine has been built!!!') 146 | 147 | runtime = trt.Runtime(logger) 148 | return runtime.deserialize_cuda_engine(plan) 149 | 150 | 151 | class onnx2trt: 152 | """ 153 | Parses an ONNX graph and builds a TensorRT engine from it. 
154 | """ 155 | def __init__(self, verbose=False): 156 | 157 | self.logger = trt.Logger(trt.Logger.ERROR) 158 | if verbose: 159 | self.logger = trt.Logger(trt.Logger.INFO) 160 | self.logger.min_severity = trt.Logger.Severity.VERBOSE 161 | 162 | trt.init_libnvinfer_plugins(self.logger, namespace="") 163 | 164 | self.builder = trt.Builder(self.logger) 165 | self.config = self.builder.create_builder_config() 166 | self.config.max_workspace_size = (4 << 30) 167 | # self.config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 4 << 30) 168 | 169 | self.network = None 170 | self.profile = None 171 | self.parser = None 172 | 173 | self.FP16 = False 174 | self.INT8 = False 175 | 176 | def create_network( 177 | self, onnx_dir, v8_head=False, add_nms=False, conf_thres=0.25, iou_thres=0.45, max_det=200, box_coding=1 178 | ): 179 | 180 | self.network = self.builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)) 181 | # Parse model file 182 | self.parser = trt.OnnxParser(self.network, self.logger) 183 | if not os.path.exists(onnx_dir): 184 | print("ONNX file is not exists!") 185 | exit() 186 | print("Succeeded finding .onnx file!") 187 | with open(onnx_dir, "rb") as model: 188 | if not self.parser.parse(model.read()): 189 | print("Failed parsing .onnx file!") 190 | for error in range(self.parser.num_errors): 191 | print(self.parser.get_error(error)) 192 | exit() 193 | else: 194 | print("Succeeded parsing .onnx file!") 195 | 196 | if v8_head: 197 | outputTensor = self.network.get_output(0) 198 | print(f'v8 {outputTensor.name} shape:{outputTensor.shape}') 199 | self.network.unmark_output(outputTensor) 200 | outputTensor = self.network.add_shuffle(outputTensor) 201 | # (bs, det_res, num_boxes ) -> (bs, num_boxes, det_res) 202 | outputTensor.first_transpose = (0, 2, 1) 203 | self.network.mark_output(outputTensor.get_output(0)) 204 | 205 | # 添加nms算子 206 | if add_nms: 207 | """ 208 | 对原输出进行预处理,拆分成 目标框数据 和 类别置信度数据 两个矩阵,背景置信度要与类别置信度相乘。 209 | [1, 8500, 4 + 1 + 80] ——> [1, 8500, 4] + [1, 8500, 1 + 80] ——> [1, 8500, 4] + [1, 8500, 80] 210 | """ 211 | outputTensor = self.network.get_output(0) 212 | print(f'{outputTensor.name} shape:{outputTensor.shape}') 213 | bs, num_boxes, det_res = outputTensor.shape 214 | self.network.unmark_output(outputTensor) 215 | xycwh = self.network.add_slice(outputTensor, (0, 0, 0), (bs, num_boxes, 4), (1, 1, 1)) 216 | if v8_head: 217 | obj = self.network.add_slice( 218 | outputTensor, (0, 0, 4), (bs, num_boxes, det_res - 4), (1, 1, 1) 219 | ) 220 | else: 221 | scores = self.network.add_slice(outputTensor, (0, 0, 4), (bs, num_boxes, 1), (1, 1, 1)) 222 | obj = self.network.add_slice(outputTensor, (0, 0, 5), (bs, num_boxes, det_res - 5), (1, 1, 1)) 223 | obj = self.network.add_elementwise( 224 | scores.get_output(0), obj.get_output(0), trt.ElementWiseOperation.PROD 225 | ) 226 | print('Add EfficientNMS_TRT!') 227 | nms = AddEfficientNMSPlugin(conf_thres, iou_thres, max_det, box_coding) 228 | pluginlayer = self.network.add_plugin_v2([xycwh.get_output(0), obj.get_output(0)], nms) 229 | pluginlayer.get_output(0).name = "num_dets" 230 | pluginlayer.get_output(1).name = "det_boxes" 231 | pluginlayer.get_output(2).name = "det_scores" 232 | pluginlayer.get_output(3).name = "det_classes" 233 | for i in range(4): 234 | self.network.mark_output(pluginlayer.get_output(i)) 235 | 236 | 237 | def create_engine(self, engine_dir, min_shape, opt_shape, max_shape, fp16=False, int8=False, 238 | imgs_dir=None, n_iteration=128, cache_file=None): 239 | 240 | self.FP16 = fp16 241 | 
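# Note: FP16 and INT8 may be enabled together, in which case the builder picks the
# faster precision per layer; INT8 additionally requires the calibrator configured
# below, with imgs_dir pointing at a set of representative images.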
self.INT8 = int8 242 | 243 | inputTensor = self.network.get_input(0) 244 | print(f'{inputTensor.name} shape:{inputTensor.shape}') 245 | batch, c, h, w = inputTensor.shape 246 | if batch != -1: 247 | min_shape[0], opt_shape[0], max_shape[0] = batch, batch, batch 248 | if c != -1: 249 | min_shape[1], opt_shape[1], max_shape[1] = c, c, c 250 | if h != -1: 251 | min_shape[-2], opt_shape[-2], max_shape[-2] = h, h, h 252 | if w != -1: 253 | min_shape[-1], opt_shape[-1], max_shape[-1] = w, w, w 254 | 255 | self.profile = self.builder.create_optimization_profile() 256 | self.profile.set_shape(inputTensor.name, min_shape, opt_shape, max_shape) 257 | self.config.add_optimization_profile(self.profile) 258 | 259 | # Quantization 260 | if self.FP16: 261 | self.config.set_flag(trt.BuilderFlag.FP16) 262 | if self.INT8: 263 | assert imgs_dir ,'If you choice int8, you should also set imgs_dir for the calibration' 264 | self.config.set_flag(trt.BuilderFlag.INT8) 265 | imgs_list = os.listdir(imgs_dir) 266 | calib = calibrator.MyCalibrator( 267 | calibrationpath=imgs_dir, 268 | imgslist=imgs_list, 269 | nCalibration=n_iteration, 270 | inputShape=max_shape, 271 | cacheFile=cache_file 272 | ) 273 | self.config.int8_calibrator = calib 274 | 275 | print('Now, engine is building...') 276 | t1 = time.time() 277 | plan = self.builder.build_serialized_network(self.network, self.config) 278 | t2 = time.time() 279 | print(f'{(t2 - t1)/60:0.2f}min') 280 | if plan is None: 281 | print("Failed building engine!") 282 | # exit() 283 | with open(engine_dir, "wb") as f: 284 | f.write(plan) 285 | print('Engine has been built!!!') 286 | 287 | runtime = trt.Runtime(self.logger) 288 | return runtime.deserialize_cuda_engine(plan) 289 | 290 | 291 | def main(args): 292 | 293 | onnx_dir = args.onnx_dir 294 | engine_dir = args.engine_dir 295 | if engine_dir is None: 296 | engine_dir = f"./models_trt/{onnx_dir.split('/')[-1].replace('onnx', 'engine')}" 297 | 298 | yolo_engine = onnx2trt() 299 | yolo_engine.create_network( 300 | onnx_dir, 301 | v8_head=args.yolov8_head, 302 | add_nms=args.add_nms, 303 | conf_thres=args.conf_thres, 304 | iou_thres=args.iou_thres, 305 | max_det=args.max_det 306 | ) 307 | 308 | yolo_engine.create_engine( 309 | engine_dir, 310 | min_shape=args.min_shape, 311 | opt_shape=args.opt_shape, 312 | max_shape=args.max_shape, 313 | fp16=args.fp16, 314 | int8=args.int8, 315 | imgs_dir=args.imgs_dir, 316 | n_iteration=args.n_iteration, 317 | cache_file=args.cache_file 318 | ) 319 | 320 | 321 | if __name__ == '__main__': 322 | import argparse 323 | 324 | parser = argparse.ArgumentParser(description=__doc__) 325 | # onnx模型 326 | parser.add_argument('--onnx_dir', type=str, default='./models_onnx/yolov5s.onnx', help='onnx path') 327 | # engine模型保存地址 328 | parser.add_argument('--engine_dir', type=str, default=None, help='engine path') 329 | # 最小的输入shape 330 | parser.add_argument('--min_shape', nargs='+', type=int, default=[1, 3, 512, 512], 331 | help='input min shape [batch, channel, height, width]') 332 | # 最佳优化的输入shape 333 | parser.add_argument('--opt_shape', nargs='+', type=int, default=[1, 3, 512, 512], 334 | help='input opt shape [batch, channel, height, width]') 335 | # 最大的输入shape 336 | parser.add_argument('--max_shape', nargs='+', type=int, default=[1, 3, 512, 512], 337 | help='input max shape [batch, channel, height, width]') 338 | # 是否使用fp16量化 339 | parser.add_argument('--fp16', type=bool, default=True, choices=[True, False], 340 | help='TensorRt FP16 half-precision export') 341 | # 是否使用int8量化 342 | 
parser.add_argument('--int8', type=bool, default=False, choices=[True, False], 343 | help='TensorRt INT8 quantization') 344 | # int8量化校准集位置 345 | parser.add_argument('--imgs_dir', default='./calibration', help='Dataset for int8 calibration') 346 | # 校准的轮次 347 | parser.add_argument('--n_iteration', type=int, default=512, help='Iteration for int8 calibration') 348 | # cache保存位置 349 | parser.add_argument('--cache_file', default=None, help='Int8 cache path') 350 | # 是否为yolov8的检测头 351 | parser.add_argument('--yolov8_head', type=bool, default=True, choices=[True, False], help='yolov8_head or not') 352 | # 是否添加nms 353 | parser.add_argument('--add_nms', type=bool, default=False, choices=[True, False], help='add efficientNMS') 354 | # 只有得分大于置信度的预测框会被保留下来 355 | parser.add_argument('--conf_thres', type=float, default=0.25, help='confidence threshold') 356 | # 非极大抑制所用到的nms_iou大小 357 | parser.add_argument('--iou_thres', type=float, default=0.45, help='NMS IoU threshold') 358 | # 目标框数量限制 359 | parser.add_argument('--max_det', type=int, default=200, help='maximum detections per image') 360 | 361 | args = parser.parse_args() 362 | print(args) 363 | 364 | main(args) 365 | 366 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | PyYAML 2 | numpy>=1.21.0 3 | opencv-python>=4.1.1 4 | onnx>=1.10.2 5 | torch>=1.10.2+cu113 6 | torchvision>=0.11.3 7 | 8 | 9 | pycuda<2021.1 # old CUDA python API (not recommended), replaced by cuda-python 10 | nvidia-pyindex 11 | tensorrt == 8.4.3.1 # https://docs.nvidia.com/deeplearning/tensorrt/install-guide/index.html#downloading 12 | cuda-python 13 | -------------------------------------------------------------------------------- /utils/calibrator.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 as cv 3 | import numpy as np 4 | import pycuda.autoinit 5 | import pycuda.driver as cuda 6 | from cuda import cudart 7 | import tensorrt as trt 8 | 9 | if cudart: 10 | cudart.cudaDeviceSynchronize() 11 | 12 | __all__ = [ 13 | 'MyCalibrator', 14 | 'MyCalibrator_v2' 15 | ] 16 | 17 | def trans(img, size): 18 | crop_shape = min(img.shape[:2]) 19 | img = img[:crop_shape - 1, :crop_shape - 1, :] 20 | img = cv.resize(img, size) 21 | img /= 255.0 22 | return img 23 | 24 | 25 | class MyCalibrator(trt.IInt8EntropyCalibrator2): 26 | """pycuda""" 27 | def __init__(self, calibrationpath, imgslist, nCalibration, inputShape, cacheFile): 28 | trt.IInt8EntropyCalibrator2.__init__(self) 29 | self.calibrationpath = calibrationpath 30 | self.imgslist = imgslist 31 | self.nCalibration = nCalibration 32 | self.shape = inputShape # (N,C,H,W) 33 | self.buffeSize = trt.volume(inputShape) * trt.float32.itemsize 34 | self.cacheFile = cacheFile 35 | self.dIn = cuda.mem_alloc(self.buffeSize) 36 | self.oneBatch = self.batchGenerator() 37 | 38 | print(int(self.dIn)) 39 | 40 | # def __del__(self): 41 | # cudart.cudaFree(self.dIn) 42 | 43 | def batchGenerator(self): 44 | for i in range(self.nCalibration): 45 | print("> calibration %d" % i) 46 | subImageList = np.random.choice(self.imgslist, self.shape[0], replace=False) 47 | # self.imgslist = list(set(self.imgslist) - set(subImageList)) 48 | yield np.ascontiguousarray(self.loadImages(subImageList)) 49 | 50 | def loadImages(self, imageList): 51 | res = np.empty(self.shape, dtype=np.float32) 52 | for i in range(self.shape[0]): 53 | path = os.path.join(self.calibrationpath, 
imageList[i]) 54 | img = cv.imread(path) 55 | img = cv.cvtColor(img, cv.COLOR_BGR2RGB).astype(np.float32) 56 | img = trans(img, self.shape[-2:]).transpose((2, 0, 1)) 57 | res[i] = img 58 | return res 59 | 60 | def get_batch_size(self): # do NOT change name 61 | return self.shape[0] 62 | 63 | def get_batch(self, nameList=None, inputNodeName=None): # do NOT change name 64 | try: 65 | data = next(self.oneBatch) 66 | # cudart.cudaMemcpy(self.dIn, data.ctypes.data, self.buffeSize, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice) 67 | cuda.memcpy_htod(self.dIn, data.ravel()) 68 | return [int(self.dIn)] 69 | except StopIteration: 70 | return None 71 | 72 | def read_calibration_cache(self): # do NOT change name 73 | if os.path.exists(self.cacheFile): 74 | print("Succeed finding cahce file: %s" % (self.cacheFile)) 75 | with open(self.cacheFile, "rb") as f: 76 | cache = f.read() 77 | return cache 78 | else: 79 | print("Failed finding int8 cache!") 80 | return 81 | 82 | def write_calibration_cache(self, cache): # do NOT change name 83 | with open(self.cacheFile, "wb") as f: 84 | f.write(cache) 85 | print("Succeed saving int8 cache!") 86 | 87 | 88 | class MyCalibrator_v2(trt.IInt8EntropyCalibrator2): 89 | """cuda-python""" 90 | def __init__(self, calibrationpath, imgslist, nCalibration, inputShape, cacheFile): 91 | trt.IInt8EntropyCalibrator2.__init__(self) 92 | self.calibrationpath = calibrationpath 93 | self.imgslist = imgslist 94 | self.nCalibration = nCalibration 95 | self.shape = inputShape # (N,C,H,W) 96 | self.buffeSize = trt.volume(inputShape) * trt.float32.itemsize 97 | self.cacheFile = cacheFile 98 | _, self.dIn = cudart.cudaMalloc(self.buffeSize) 99 | self.oneBatch = self.batchGenerator() 100 | 101 | print(int(self.dIn)) 102 | 103 | def __del__(self): 104 | cudart.cudaFree(self.dIn) 105 | 106 | def batchGenerator(self): 107 | for i in range(self.nCalibration): 108 | print("> calibration %d" % i) 109 | subImageList = np.random.choice(self.imgslist, self.shape[0], replace=False) 110 | # self.imgslist = list(set(self.imgslist) - set(subImageList)) 111 | yield np.ascontiguousarray(self.loadImages(subImageList)) 112 | 113 | def loadImages(self, imageList): 114 | res = np.empty(self.shape, dtype=np.float32) 115 | for i in range(self.shape[0]): 116 | path = os.path.join(self.calibrationpath, imageList[i]) 117 | img = cv.imread(path) 118 | img = cv.cvtColor(img, cv.COLOR_BGR2RGB).astype(np.float32) 119 | img = trans(img, self.shape[-2:]).transpose((2, 0, 1)) 120 | res[i] = img 121 | return res 122 | 123 | def get_batch_size(self): # do NOT change name 124 | return self.shape[0] 125 | 126 | def get_batch(self, nameList=None, inputNodeName=None): # do NOT change name 127 | try: 128 | data = next(self.oneBatch) 129 | cudart.cudaMemcpy(self.dIn, data.ctypes.data, self.buffeSize, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice) 130 | return [int(self.dIn)] 131 | except StopIteration: 132 | return None 133 | 134 | def read_calibration_cache(self): # do NOT change name 135 | if os.path.exists(self.cacheFile): 136 | print("Succeed finding cahce file: %s" % (self.cacheFile)) 137 | with open(self.cacheFile, "rb") as f: 138 | cache = f.read() 139 | return cache 140 | else: 141 | print("Failed finding int8 cache!") 142 | return 143 | 144 | def write_calibration_cache(self, cache): # do NOT change name 145 | with open(self.cacheFile, "wb") as f: 146 | f.write(cache) 147 | print("Succeed saving int8 cache!") 148 | -------------------------------------------------------------------------------- /utils/trt_infer.py: 
-------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | import argparse 19 | import logging 20 | import os 21 | import sys 22 | 23 | import numpy as np 24 | import pycuda.autoinit 25 | import pycuda.driver as cuda 26 | import tensorrt as trt 27 | 28 | try: 29 | # Sometimes python does not understand FileNotFoundError 30 | FileNotFoundError 31 | except NameError: 32 | FileNotFoundError = IOError 33 | 34 | EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) 35 | logging.basicConfig(level=logging.INFO) 36 | logging.getLogger("EngineBuilder").setLevel(logging.INFO) 37 | log = logging.getLogger("EngineBuilder") 38 | 39 | def GiB(val): 40 | return val * 1 << 30 41 | 42 | 43 | def add_help(description): 44 | parser = argparse.ArgumentParser(description=description, formatter_class=argparse.ArgumentDefaultsHelpFormatter) 45 | args, _ = parser.parse_known_args() 46 | 47 | 48 | def find_sample_data(description="Runs a TensorRT Python sample", subfolder="", find_files=[], err_msg=""): 49 | ''' 50 | Parses sample arguments. 51 | 52 | Args: 53 | description (str): Description of the sample. 54 | subfolder (str): The subfolder containing data relevant to this sample 55 | find_files (str): A list of filenames to find. Each filename will be replaced with an absolute path. 56 | 57 | Returns: 58 | str: Path of data directory. 59 | ''' 60 | 61 | # Standard command-line arguments for all samples. 62 | kDEFAULT_DATA_ROOT = os.path.join(os.sep, "usr", "src", "tensorrt", "data") 63 | parser = argparse.ArgumentParser(description=description, formatter_class=argparse.ArgumentDefaultsHelpFormatter) 64 | parser.add_argument("-d", "--datadir", help="Location of the TensorRT sample data directory, and any additional data directories.", action="append", default=[kDEFAULT_DATA_ROOT]) 65 | args, _ = parser.parse_known_args() 66 | 67 | def get_data_path(data_dir): 68 | # If the subfolder exists, append it to the path, otherwise use the provided path as-is. 69 | data_path = os.path.join(data_dir, subfolder) 70 | if not os.path.exists(data_path): 71 | if data_dir != kDEFAULT_DATA_ROOT: 72 | print("WARNING: " + data_path + " does not exist. Trying " + data_dir + " instead.") 73 | data_path = data_dir 74 | # Make sure data directory exists. 75 | if not (os.path.exists(data_path)) and data_dir != kDEFAULT_DATA_ROOT: 76 | print("WARNING: {:} does not exist. 
Please provide the correct data path with the -d option.".format(data_path)) 77 | return data_path 78 | 79 | data_paths = [get_data_path(data_dir) for data_dir in args.datadir] 80 | return data_paths, locate_files(data_paths, find_files, err_msg) 81 | 82 | def locate_files(data_paths, filenames, err_msg=""): 83 | """ 84 | Locates the specified files in the specified data directories. 85 | If a file exists in multiple data directories, the first directory is used. 86 | 87 | Args: 88 | data_paths (List[str]): The data directories. 89 | filename (List[str]): The names of the files to find. 90 | 91 | Returns: 92 | List[str]: The absolute paths of the files. 93 | 94 | Raises: 95 | FileNotFoundError if a file could not be located. 96 | """ 97 | found_files = [None] * len(filenames) 98 | for data_path in data_paths: 99 | # Find all requested files. 100 | for index, (found, filename) in enumerate(zip(found_files, filenames)): 101 | if not found: 102 | file_path = os.path.abspath(os.path.join(data_path, filename)) 103 | if os.path.exists(file_path): 104 | found_files[index] = file_path 105 | 106 | # Check that all files were found 107 | for f, filename in zip(found_files, filenames): 108 | if not f or not os.path.exists(f): 109 | raise FileNotFoundError("Could not find {:}. Searched in data paths: {:}\n{:}".format(filename, data_paths, err_msg)) 110 | return found_files 111 | 112 | def load_engine(engine_path): 113 | # TRT_LOGGER = trt.Logger(trt.Logger.WARNING) # INFO 114 | logger = trt.Logger(trt.Logger.ERROR) 115 | trt.init_libnvinfer_plugins(logger, '') 116 | with open(engine_path, 'rb') as f, trt.Runtime(logger) as runtime: 117 | return runtime.deserialize_cuda_engine(f.read()) 118 | 119 | # Simple helper data class that's a little nicer to use than a 2-tuple. 120 | class HostDeviceMem(object): 121 | def __init__(self, host_mem, device_mem): 122 | self.host = host_mem 123 | self.device = device_mem 124 | 125 | def __str__(self): 126 | return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device) 127 | 128 | def __repr__(self): 129 | return self.__str__() 130 | 131 | 132 | # Allocates all buffers required for an engine, i.e. host/device inputs/outputs. 133 | def allocate_buffers(engine): 134 | inputs = [] 135 | outputs = [] 136 | bindings = [] 137 | stream = cuda.Stream() 138 | for binding in engine: 139 | size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size 140 | dtype = trt.nptype(engine.get_binding_dtype(binding)) 141 | # Allocate host and device buffers 142 | host_mem = cuda.pagelocked_empty(size, dtype) 143 | device_mem = cuda.mem_alloc(host_mem.nbytes) 144 | # Append the device buffer to device bindings. 145 | bindings.append(int(device_mem)) 146 | # Append to the appropriate list. 
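# Note: engine.get_binding_shape() reports -1 for dynamic dimensions, so the sizes
# computed above are only valid for engines built with fully static shapes;
# allocate_buffers_v2() below asks the execution context for the concrete shapes instead.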
147 | if engine.binding_is_input(binding): 148 | inputs.append(HostDeviceMem(host_mem, device_mem)) 149 | else: 150 | outputs.append(HostDeviceMem(host_mem, device_mem)) 151 | return inputs, outputs, bindings, stream 152 | 153 | 154 | def allocate_buffers_v2(context): 155 | inputs = [] 156 | outputs = [] 157 | bindings = [] 158 | stream = cuda.Stream() 159 | for idx, binding in enumerate(context.engine): 160 | # size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size 161 | size = trt.volume(context.get_binding_shape(idx)) 162 | dtype = trt.nptype(context.engine.get_binding_dtype(idx)) 163 | # Allocate host and device buffers 164 | host_mem = cuda.pagelocked_empty(size, dtype) 165 | device_mem = cuda.mem_alloc(host_mem.nbytes) 166 | # Append the device buffer to device bindings. 167 | bindings.append(int(device_mem)) 168 | # Append to the appropriate list. 169 | if context.engine.binding_is_input(binding): 170 | inputs.append(HostDeviceMem(host_mem, device_mem)) 171 | else: 172 | outputs.append(HostDeviceMem(host_mem, device_mem)) 173 | return inputs, outputs, bindings, stream 174 | 175 | 176 | # This function is generalized for multiple inputs/outputs. 177 | # inputs and outputs are expected to be lists of HostDeviceMem objects. 178 | def do_inference(context, bindings, inputs, outputs, stream, batch_size=1): 179 | # Transfer input data to the GPU. 180 | [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs] 181 | # Run inference. 182 | context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle) 183 | # Transfer predictions back from the GPU. 184 | [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs] 185 | # Synchronize the stream 186 | stream.synchronize() 187 | # Return only the host outputs. 188 | return [out.host for out in outputs] 189 | 190 | # This function is generalized for multiple inputs/outputs for full dimension networks. 191 | # inputs and outputs are expected to be lists of HostDeviceMem objects. 192 | def do_inference_v2(context, bindings, inputs, outputs, stream): 193 | # Transfer input data to the GPU. 194 | [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs] 195 | # Run inference. 196 | context.execute_async_v2(bindings=bindings, stream_handle=stream.handle) 197 | # Transfer predictions back from the GPU. 198 | [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs] 199 | # Synchronize the stream 200 | stream.synchronize() 201 | # Return only the host outputs. 202 | return [out.host for out in outputs] 203 | 204 | 205 | class EngineBuilder: 206 | """ 207 | Parses an ONNX graph and builds a TensorRT engine from it. 208 | """ 209 | 210 | def __init__(self, verbose=False): 211 | """ 212 | :param verbose: If enabled, a higher verbosity level will be set on the TensorRT logger. 213 | """ 214 | self.trt_logger = trt.Logger(trt.Logger.INFO) 215 | if verbose: 216 | self.trt_logger.min_severity = trt.Logger.Severity.VERBOSE 217 | 218 | trt.init_libnvinfer_plugins(self.trt_logger, namespace="") 219 | 220 | self.builder = trt.Builder(self.trt_logger) 221 | self.config = self.builder.create_builder_config() 222 | self.config.max_workspace_size = 8 * (2 ** 30) # 8 GB 223 | 224 | self.batch_size = None 225 | self.network = None 226 | self.parser = None 227 | 228 | def create_network(self, onnx_path): 229 | """ 230 | Parse the ONNX graph and create the corresponding TensorRT network definition. 231 | :param onnx_path: The path to the ONNX graph to load. 
232 | """ 233 | network_flags = (1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)) 234 | 235 | self.network = self.builder.create_network(network_flags) 236 | self.parser = trt.OnnxParser(self.network, self.trt_logger) 237 | 238 | onnx_path = os.path.realpath(onnx_path) 239 | with open(onnx_path, "rb") as f: 240 | if not self.parser.parse(f.read()): 241 | log.error("Failed to load ONNX file: {}".format(onnx_path)) 242 | for error in range(self.parser.num_errors): 243 | log.error(self.parser.get_error(error)) 244 | sys.exit(1) 245 | 246 | inputs = [self.network.get_input(i) for i in range(self.network.num_inputs)] 247 | outputs = [self.network.get_output(i) for i in range(self.network.num_outputs)] 248 | 249 | log.info("Network Description") 250 | for input in inputs: 251 | self.batch_size = input.shape[0] 252 | log.info("Input '{}' with shape {} and dtype {}".format(input.name, input.shape, input.dtype)) 253 | for output in outputs: 254 | log.info("Output '{}' with shape {} and dtype {}".format(output.name, output.shape, output.dtype)) 255 | assert self.batch_size > 0 256 | self.builder.max_batch_size = self.batch_size 257 | 258 | def create_engine(self, engine_path, precision, calib_input=None, calib_cache=None, calib_num_images=25000, 259 | calib_batch_size=8, calib_preprocessor=None): 260 | """ 261 | Build the TensorRT engine and serialize it to disk. 262 | :param engine_path: The path where to serialize the engine to. 263 | :param precision: The datatype to use for the engine, either 'fp32', 'fp16' or 'int8'. 264 | :param calib_input: The path to a directory holding the calibration images. 265 | :param calib_cache: The path where to write the calibration cache to, or if it already exists, load it from. 266 | :param calib_num_images: The maximum number of images to use for calibration. 267 | :param calib_batch_size: The batch size to use for the calibration process. 268 | :param calib_preprocessor: The ImageBatcher preprocessor algorithm to use. 269 | """ 270 | engine_path = os.path.realpath(engine_path) 271 | engine_dir = os.path.dirname(engine_path) 272 | os.makedirs(engine_dir, exist_ok=True) 273 | log.info("Building {} Engine in {}".format(precision, engine_path)) 274 | 275 | inputs = [self.network.get_input(i) for i in range(self.network.num_inputs)] 276 | 277 | if precision == "fp16": 278 | if not self.builder.platform_has_fast_fp16: 279 | log.warning("FP16 is not supported natively on this platform/device") 280 | else: 281 | self.config.set_flag(trt.BuilderFlag.FP16) 282 | 283 | with self.builder.build_engine(self.network, self.config) as engine, open(engine_path, "wb") as f: 284 | log.info("Serializing engine to file: {:}".format(engine_path)) 285 | f.write(engine.serialize()) 286 | 287 | 288 | class EngineBuilder_v2: 289 | """ 290 | Parses an ONNX graph and builds a TensorRT engine from it. 291 | """ 292 | 293 | def __init__(self, verbose=False): 294 | """ 295 | :param verbose: If enabled, a higher verbosity level will be set on the TensorRT logger. 
296 | """ 297 | self.trt_logger = trt.Logger(trt.Logger.INFO) 298 | if verbose: 299 | self.trt_logger.min_severity = trt.Logger.Severity.VERBOSE 300 | 301 | trt.init_libnvinfer_plugins(self.trt_logger, namespace="") 302 | 303 | self.builder = trt.Builder(self.trt_logger) 304 | self.config = self.builder.create_builder_config() 305 | self.config.max_workspace_size = 8 * (2 ** 30) # 8 GB 306 | 307 | self.batch_size = None 308 | self.network = None 309 | self.parser = None 310 | 311 | def create_network(self, onnx_path, get_inputs): 312 | """ 313 | Parse the ONNX graph and create the corresponding TensorRT network definition. 314 | :param onnx_path: The path to the ONNX graph to load. 315 | """ 316 | network_flags = (1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)) 317 | 318 | self.network = self.builder.create_network(network_flags) 319 | self.parser = trt.OnnxParser(self.network, self.trt_logger) 320 | 321 | onnx_path = os.path.realpath(onnx_path) 322 | with open(onnx_path, "rb") as f: 323 | if not self.parser.parse(f.read()): 324 | log.error("Failed to load ONNX file: {}".format(onnx_path)) 325 | for error in range(self.parser.num_errors): 326 | log.error(self.parser.get_error(error)) 327 | sys.exit(1) 328 | 329 | inputs = [] 330 | for i, shape in enumerate(get_inputs): 331 | self.network.get_input(i).shape = shape 332 | inputs.append(shape) 333 | # inputs = [self.network.get_input(i) for i in range(self.network.num_inputs)] 334 | outputs = [self.network.get_output(i) for i in range(self.network.num_outputs)] 335 | 336 | log.info("Network Description") 337 | for input in inputs: 338 | self.batch_size = input.shape[0] 339 | log.info("Input '{}' with shape {} and dtype {}".format(input.name, input.shape, input.dtype)) 340 | for output in outputs: 341 | log.info("Output '{}' with shape {} and dtype {}".format(output.name, output.shape, output.dtype)) 342 | assert self.batch_size > 0 343 | self.builder.max_batch_size = self.batch_size 344 | 345 | def create_engine(self, engine_path, precision, calib_input=None, calib_cache=None, calib_num_images=25000, 346 | calib_batch_size=8, calib_preprocessor=None): 347 | """ 348 | Build the TensorRT engine and serialize it to disk. 349 | :param engine_path: The path where to serialize the engine to. 350 | :param precision: The datatype to use for the engine, either 'fp32', 'fp16' or 'int8'. 351 | :param calib_input: The path to a directory holding the calibration images. 352 | :param calib_cache: The path where to write the calibration cache to, or if it already exists, load it from. 353 | :param calib_num_images: The maximum number of images to use for calibration. 354 | :param calib_batch_size: The batch size to use for the calibration process. 355 | :param calib_preprocessor: The ImageBatcher preprocessor algorithm to use. 
356 | """ 357 | engine_path = os.path.realpath(engine_path) 358 | engine_dir = os.path.dirname(engine_path) 359 | os.makedirs(engine_dir, exist_ok=True) 360 | log.info("Building {} Engine in {}".format(precision, engine_path)) 361 | 362 | inputs = [self.network.get_input(i) for i in range(self.network.num_inputs)] 363 | 364 | if precision == "fp16": 365 | if not self.builder.platform_has_fast_fp16: 366 | log.warning("FP16 is not supported natively on this platform/device") 367 | else: 368 | self.config.set_flag(trt.BuilderFlag.FP16) 369 | 370 | with self.builder.build_engine(self.network, self.config) as engine, open(engine_path, "wb") as f: 371 | log.info("Serializing engine to file: {:}".format(engine_path)) 372 | f.write(engine.serialize()) -------------------------------------------------------------------------------- /utils/utils_detection.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | import json 3 | import torch 4 | import cv2 as cv 5 | import numpy as np 6 | import torchvision 7 | 8 | 9 | def yaml_load(file='data.yaml'): 10 | # Single-line safe yaml loading 11 | with open(file, errors='ignore') as f: 12 | return yaml.safe_load(f) 13 | 14 | 15 | def json_load(file='data.json'): 16 | with open(file, "r") as f: 17 | return json.load(f) 18 | 19 | 20 | def xyxy2xywh(x): 21 | # Convert nx4 boxes from [x1, y1, x2, y2] to [x, y, w, h] where xy1=top-left, xy2=bottom-right 22 | y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x) 23 | y[:, 0] = (x[:, 0] + x[:, 2]) / 2 # x center 24 | y[:, 1] = (x[:, 1] + x[:, 3]) / 2 # y center 25 | y[:, 2] = x[:, 2] - x[:, 0] # width 26 | y[:, 3] = x[:, 3] - x[:, 1] # height 27 | return y 28 | 29 | 30 | def xywh2xyxy(x): 31 | # Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right 32 | y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x) 33 | y[:, 0] = x[:, 0] - x[:, 2] / 2 # top left x 34 | y[:, 1] = x[:, 1] - x[:, 3] / 2 # top left y 35 | y[:, 2] = x[:, 0] + x[:, 2] / 2 # bottom right x 36 | y[:, 3] = x[:, 1] + x[:, 3] / 2 # bottom right y 37 | return y 38 | 39 | 40 | def xywhn2xyxy(x, w=640, h=640, padw=0, padh=0): 41 | # Convert nx4 boxes from [x, y, w, h] normalized to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right 42 | y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x) 43 | y[:, 0] = w * (x[:, 0] - x[:, 2] / 2) + padw # top left x 44 | y[:, 1] = h * (x[:, 1] - x[:, 3] / 2) + padh # top left y 45 | y[:, 2] = w * (x[:, 0] + x[:, 2] / 2) + padw # bottom right x 46 | y[:, 3] = h * (x[:, 1] + x[:, 3] / 2) + padh # bottom right y 47 | return y 48 | 49 | 50 | def xyxy2xywhn(x, w=640, h=640, clip=False, eps=0.0): 51 | # Convert nx4 boxes from [x1, y1, x2, y2] to [x, y, w, h] normalized where xy1=top-left, xy2=bottom-right 52 | if clip: 53 | clip_boxes(x, (h - eps, w - eps)) # warning: inplace clip 54 | y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x) 55 | y[:, 0] = ((x[:, 0] + x[:, 2]) / 2) / w # x center 56 | y[:, 1] = ((x[:, 1] + x[:, 3]) / 2) / h # y center 57 | y[:, 2] = (x[:, 2] - x[:, 0]) / w # width 58 | y[:, 3] = (x[:, 3] - x[:, 1]) / h # height 59 | return y 60 | 61 | 62 | def xyn2xy(x, w=640, h=640, padw=0, padh=0): 63 | # Convert normalized segments into pixel segments, shape (n,2) 64 | y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x) 65 | y[:, 0] = w * x[:, 0] + padw # top left x 66 | y[:, 1] = h * x[:, 1] + padh # top left y 67 | return y 68 | 69 | 70 | def letterbox_image(image, 
return_padding=False): 71 | """ 72 | 为保持h,w的一致,对图片短边两侧进行等距离padding 73 | """ 74 | h, w = image.shape[:2] 75 | 76 | if h > w: 77 | p = int((h - w) // 2) 78 | image = cv.copyMakeBorder(image, 0, 0, p, (h - w - p), cv.BORDER_CONSTANT, value=0) 79 | else: 80 | p = int((w - h) // 2) 81 | image = cv.copyMakeBorder(image, p, (w - h - p), 0, 0, cv.BORDER_CONSTANT, value=0) 82 | 83 | if return_padding: 84 | return image, p 85 | else: 86 | return image 87 | 88 | def image_trans(img, size): 89 | scale = min((size[0] / img.shape[0]), (size[1] / img.shape[1]), 1.1) 90 | new_size = (int(img.shape[1] * scale), int(img.shape[0] * scale)) 91 | # img_new = cv.resize(img, new_size, interpolation=cv.INTER_NEAREST) 92 | img_new = cv.resize(img, new_size, interpolation=cv.INTER_LINEAR) 93 | top = round((size[0] - new_size[1]) * 0.5) 94 | bottom = (size[0] - new_size[1]) - top 95 | left = round((size[1] - new_size[0]) * 0.5) 96 | right = (size[1] - new_size[0]) - left 97 | img_new = cv.copyMakeBorder(img_new, top, bottom, left, right, cv.BORDER_CONSTANT, value=0) 98 | img_new = img_new.transpose((2, 0, 1))[::-1] 99 | img_new = np.expand_dims(img_new, 0) 100 | img_new = np.ascontiguousarray(img_new).astype(np.float32) 101 | img_new = img_new / 255.0 102 | return img_new 103 | 104 | 105 | def scale_bboxes(bboxes, img_ori_hw, img_det_hw): 106 | assert len(img_ori_hw) == len(img_ori_hw) 107 | 108 | scale = max(img_ori_hw[0] / img_det_hw[0], img_ori_hw[1] / img_det_hw[1]) 109 | bboxes[:, :4] = bboxes[:, :4] * scale 110 | 111 | h_bias = (max(img_ori_hw) - img_ori_hw[0]) / 2.0 112 | w_bias = (max(img_ori_hw) - img_ori_hw[1]) / 2.0 113 | 114 | bboxes[:, [0, 2]] -= w_bias 115 | bboxes[:, [1, 3]] -= h_bias 116 | 117 | clip_boxes(bboxes, img_ori_hw) 118 | 119 | return bboxes 120 | 121 | 122 | def scale_bboxes_v2(bboxes, img_ori_hw, img_det_hw, p): 123 | assert len(img_ori_hw) == len(img_ori_hw) 124 | 125 | scale = max(img_ori_hw[0] / img_det_hw[0], img_ori_hw[1] / img_det_hw[1]) 126 | bboxes[:, :4] = bboxes[:, :4] * scale 127 | if img_ori_hw[0] > img_ori_hw[1]: 128 | bboxes[:, [0, 2]] -= p 129 | else: 130 | bboxes[:, [1, 3]] -= p 131 | 132 | clip_boxes(bboxes, img_ori_hw) 133 | 134 | return bboxes 135 | 136 | 137 | def clip_boxes(boxes, shape): 138 | # Clip boxes (xyxy) to image shape (height, width) 139 | if isinstance(boxes, torch.Tensor): # faster individually 140 | boxes[:, 0].clamp_(0, shape[1]) # x1 141 | boxes[:, 1].clamp_(0, shape[0]) # y1 142 | boxes[:, 2].clamp_(0, shape[1]) # x2 143 | boxes[:, 3].clamp_(0, shape[0]) # y2 144 | else: # np.array (faster grouped) 145 | boxes[:, [0, 2]] = boxes[:, [0, 2]].clip(0, shape[1]) # x1, x2 146 | boxes[:, [1, 3]] = boxes[:, [1, 3]].clip(0, shape[0]) # y1, y2 147 | 148 | 149 | 150 | def box_area(box): 151 | # box = xyxy(4,n) 152 | return (box[2] - box[0]) * (box[3] - box[1]) 153 | 154 | 155 | def box_iou(box1, box2, eps=1e-7): 156 | # inter(N,M) = (rb(N,M,2) - lt(N,M,2)).clamp(0).prod(2) 157 | (a1, a2), (b1, b2) = box1[:, None].chunk(2, 2), box2.chunk(2, 1) 158 | inter = (torch.min(a2, b2) - torch.max(a1, b1)).clamp(0).prod(2) 159 | 160 | # IoU = inter / (area1 + area2 - inter) 161 | return inter / (box_area(box1.T)[:, None] + box_area(box2.T) - inter + eps) 162 | 163 | 164 | def draw_boxes(img, boxes, scores, labels, catid_labels, textscale=1, color_dicts=None): 165 | boxes = tuple(boxes.astype('int')) 166 | if color_dicts is None: 167 | color_dicts = {k:(0,0,255) for k in labels.keys} 168 | 169 | text_size, _ = cv.getTextSize(f'{catid_labels[labels]}:{scores:.2f}', 
fontFace=cv.FONT_HERSHEY_DUPLEX, 170 | fontScale=textscale, thickness=1) 171 | text_w, text_h = text_size 172 | img0 = cv.rectangle(img, boxes[:2], boxes[2:], thickness=2, lineType=cv.LINE_AA, color=color_dicts[labels]) 173 | img0 = cv.rectangle(img0, boxes[:2], (boxes[0] + text_w + 1, boxes[1] + text_h + 2), 174 | thickness=-1, color=color_dicts[labels]) 175 | img0 = cv.putText(img0, f'{catid_labels[labels]}:{scores:.2f}', 176 | (boxes[0], boxes[1] + text_h), 177 | fontFace=cv.FONT_HERSHEY_DUPLEX, fontScale=textscale, thickness=1, 178 | lineType=cv.LINE_AA, 179 | color=(255, 255, 255) 180 | ) 181 | return img0 182 | 183 | 184 | def non_max_suppression(prediction, 185 | v8_head=False, 186 | conf_thres=0.25, 187 | iou_thres=0.45, 188 | agnostic=False, 189 | max_det=300): 190 | bs = prediction.shape[0] # batch size 191 | # Settings 192 | # min_wh = 2 # (pixels) minimum box width and height 193 | max_wh = 7680 # (pixels) maximum box width and height 194 | max_nms = 30000 # maximum number of boxes into torchvision.ops.nms() 195 | redundant = True # require redundant detections 196 | merge = False # use merge-NMS 197 | output = [np.zeros((0, 6), dtype=np.float32)] * bs 198 | if not v8_head: 199 | xc = prediction[..., 4] > conf_thres # candidates 200 | else: 201 | xc = prediction[..., 4:].max(2) > conf_thres # candidates 202 | for xi, x in enumerate(prediction): # image index, image inference 203 | # If none remain process next image 204 | if not x.shape[0]: 205 | continue 206 | # Apply constraints 207 | # x[((x[..., 2:4] < min_wh) | (x[..., 2:4] > max_wh)).any(1), 4] = 0 # width-height 208 | x = x[xc[xi]] # confidence 209 | if not v8_head: 210 | # Compute conf 211 | x[:, 5:] *= x[:, 4:5] # conf = obj_conf * cls_conf 212 | j = x[:, 5:].argmax(axis=1, keepdims=True) 213 | conf = x[:, 5:] 214 | else: 215 | j = x[:, 4:].argmax(axis=1, keepdims=True) 216 | conf = x[:, 4:] 217 | conf = conf[range(len(j)), j.ravel()].reshape(-1, 1) 218 | # Detections matrix nx6 (xywh, conf, cls) 219 | x = np.concatenate((x[:,:4], conf, j), 1)[conf.ravel() > conf_thres] 220 | # Check shape 221 | n = x.shape[0] # number of boxes 222 | if not n: # no boxes 223 | continue 224 | elif n > max_nms: # excess boxes 225 | x = x[x[:, 4].argsort()[::-1][:max_nms]] # sort by confidence 226 | 227 | # Batched NMS 228 | # c = x[:, 5:6] * (0 if agnostic else max_wh) # classes 229 | # boxes, scores = x[:, :4] + c, x[:, 4] # boxes (offset by class), scores 230 | # i = cv.dnn.NMSBoxes(boxes, scores, conf_thres, iou_thres) 231 | c = x[:, 5].ravel().astype("int32") 232 | i = cv.dnn.NMSBoxesBatched(x[:, :4], x[:, 4], c, conf_thres, iou_thres, None, 0) 233 | if i.shape[0] > max_det: # limit detections 234 | i = i[:max_det] 235 | if merge and (1 < n < 3E3): # Merge NMS (boxes merged using weighted mean) 236 | # update boxes as boxes(i,4) = weights(i,n) * boxes(n,4) 237 | iou = box_iou(x[:, :4][i], x[:, :4]) > iou_thres # iou matrix 238 | weights = iou * x[:, 4][None] # box weights 239 | x[i, :4] = np.matmul(weights, x[:, :4]) / weights.sum(1, keepdim=True) # merged boxes 240 | if redundant: 241 | i = i[iou.sum(1) > 1] # require redundancy 242 | 243 | output[xi] = xywh2xyxy(x[i]) 244 | return output 245 | 246 | 247 | def non_max_suppression_torch(prediction, 248 | v8_head=False, 249 | conf_thres=0.25, 250 | iou_thres=0.45, 251 | agnostic=False, 252 | max_det=300): 253 | bs = prediction.shape[0] # batch size 254 | # Settings 255 | # min_wh = 2 # (pixels) minimum box width and height 256 | max_wh = 7680 # (pixels) maximum box width and height 
257 | max_nms = 30000 # maximum number of boxes into torchvision.ops.nms() 258 | redundant = True # require redundant detections 259 | merge = False # use merge-NMS 260 | output = [torch.zeros((0, 6), device=prediction.device)] * bs 261 | if not v8_head: 262 | xc = prediction[..., 4] > conf_thres # candidates 263 | else: 264 | xc = prediction[..., 4:].max(2) > conf_thres # candidates 265 | for xi, x in enumerate(prediction): # image index, image inference 266 | # If none remain process next image 267 | if not x.shape[0]: 268 | continue 269 | # Apply constraints 270 | # x[((x[..., 2:4] < min_wh) | (x[..., 2:4] > max_wh)).any(1), 4] = 0 # width-height 271 | x = x[xc[xi]] # confidence 272 | if not v8_head: 273 | # Compute conf 274 | x[:, 5:] *= x[:, 4:5] # conf = obj_conf * cls_conf 275 | # Detections matrix nx6 (xywh, conf, cls) 276 | conf, j = x[:, 5:].max(1, keepdim=True) 277 | else: 278 | # Detections matrix nx6 (xywh, conf, cls) 279 | conf, j = x[:, 4:].max(1, keepdim=True) 280 | 281 | # Box (center x, center y, width, height) to (x1, y1, x2, y2) 282 | box = xywh2xyxy(x[:, :4]) 283 | x = torch.cat((box, conf, j.float()), 1)[conf.view(-1) > conf_thres] 284 | # Check shape 285 | n = x.shape[0] # number of boxes 286 | if not n: # no boxes 287 | continue 288 | elif n > max_nms: # excess boxes 289 | x = x[x[:, 4].argsort(descending=True)[:max_nms]] # sort by confidence 290 | 291 | # Batched NMS 292 | c = x[:, 5:6] * (0 if agnostic else max_wh) # classes 293 | boxes, scores = x[:, :4] + c, x[:, 4] # boxes (offset by class), scores 294 | i = torchvision.ops.nms(boxes, scores, iou_thres) # NMS 295 | if i.shape[0] > max_det: # limit detections 296 | i = i[:max_det] 297 | if merge and (1 < n < 3E3): # Merge NMS (boxes merged using weighted mean) 298 | # update boxes as boxes(i,4) = weights(i,n) * boxes(n,4) 299 | iou = box_iou(boxes[i], boxes) > iou_thres # iou matrix 300 | weights = iou * scores[None] # box weights 301 | x[i, :4] = torch.mm(weights, x[:, :4]).float() / weights.sum(1, keepdim=True) # merged boxes 302 | if redundant: 303 | i = i[iou.sum(1) > 1] # require redundancy 304 | 305 | output[xi] = x[i] 306 | return output 307 | 308 | 309 | def yolox_postprocess(outputs, img_size, p6=False): 310 | 311 | grids = [] 312 | expanded_strides = [] 313 | 314 | if not p6: 315 | strides = [8, 16, 32] 316 | else: 317 | strides = [8, 16, 32, 64] 318 | 319 | hsizes = [img_size[0] // stride for stride in strides] 320 | wsizes = [img_size[1] // stride for stride in strides] 321 | 322 | for hsize, wsize, stride in zip(hsizes, wsizes, strides): 323 | xv, yv = np.meshgrid(np.arange(wsize), np.arange(hsize)) 324 | grid = np.stack((xv, yv), 2).reshape(1, -1, 2) 325 | grids.append(grid) 326 | shape = grid.shape[:2] 327 | expanded_strides.append(np.full((*shape, 1), stride)) 328 | 329 | grids = np.concatenate(grids, 1) 330 | expanded_strides = np.concatenate(expanded_strides, 1) 331 | outputs[..., :2] = (outputs[..., :2] + grids) * expanded_strides 332 | outputs[..., 2:4] = np.exp(outputs[..., 2:4]) * expanded_strides 333 | 334 | return outputs 335 | 336 | 337 | class Colors: 338 | # Ultralytics color palette https://ultralytics.com/ 339 | def __init__(self, id_and_obj): 340 | base_hexs = ('FF3838', 'FF9D97', 'FF701F', 'FFB21D', 'CFD231', '48F90A', '92CC17', '3DDB86', '1A9334', '00D4BB', 341 | '2C99A8', '00C2FF', '344593', '6473FF', '0018EC', '8438FF', '520085', 'CB38FF', 'FF95C8', 'FF37C7') 342 | n = len(id_and_obj) / len(base_hexs) 343 | if n > 1: 344 | n = int(n) + 1 345 | base_hexs *= n 346 | 347 | 
self.obj_id = tuple(id_and_obj.keys())
348 |         self.hex = base_hexs[:len(self.obj_id)]
349 |         self.id_and_hex = {k: v for k, v in zip(self.obj_id, self.hex)}
350 |
351 |     def get_id_and_colors(self):
352 |         id_and_colors = {k: self.hex2rgb(f'#{v}') for k, v in self.id_and_hex.items()}
353 |         return id_and_colors
354 |
355 |     def hex2rgb(self, h):  # rgb order
356 |         return tuple(int(h[1 + i:1 + i + 2], 16) for i in (0, 2, 4))
357 |
358 |
359 |
--------------------------------------------------------------------------------
/yolo_detect_v1.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | import torch
4 | import cv2 as cv
5 | import numpy as np
6 |
7 | from utils import trt_infer
8 | from utils.trt_infer import load_engine
9 | from utils.utils_detection import yaml_load, image_trans, scale_bboxes, non_max_suppression, Colors, draw_boxes, \
10 |     non_max_suppression_torch
11 |
12 |
13 | class yolo_engine_det:
14 |     def __init__(self, engine_dir, catid_labels):
15 |         self.engine = load_engine(engine_dir)
16 |         self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
17 |         self.context = self.engine.create_execution_context()
18 |         self.resize = self.engine.get_binding_shape(0)[2:]
19 |         self.colors = self.get_colors_dict(catid_labels)
20 |         self.labels = catid_labels
21 |         self.v8_head = False
22 |         self.nms = non_max_suppression
23 |
24 |         if self.engine.get_binding_shape(1)[-1] - len(catid_labels) == 4:
25 |             self.v8_head = True
26 |
27 |         # self.context.set_binding_shape(0, [1, 3, self.resize[0], self.resize[1]])
28 |         self.inputs = None
29 |         self.outputs = None
30 |         self.bindings = None
31 |         self.stream = None
32 |
33 |         self.inputs, self.outputs, self.bindings, self.stream = trt_infer.allocate_buffers_v2(self.context)
34 |
35 |     @staticmethod
36 |     def get_colors_dict(catid_labels):
37 |         color_dicts = Colors(catid_labels)
38 |         return color_dicts.get_id_and_colors()
39 |
40 |
41 |     def draw(self, frame, conf=0.25, iou=0.45, max_det=200):
42 |         x = image_trans(frame, self.resize)
43 |         np.copyto(self.inputs[0].host, x.ravel())
44 |         # self.inputs[0].host = x.ravel()
45 |         t1 = time.time()
46 |         pred = trt_infer.do_inference_v2(
47 |             self.context, bindings=self.bindings, inputs=self.inputs, outputs=self.outputs, stream=self.stream
48 |         )
49 |         pred = pred[0].reshape(self.context.get_binding_shape(1))
50 |         pred = self.nms(pred, v8_head=self.v8_head, conf_thres=conf, iou_thres=iou, agnostic=False, max_det=max_det)[0]
51 |         t2 = time.time()
52 |         fps = round(1.0 / (t2 - t1))
53 |         times = round((t2 - t1) * 1000, 3)
54 |         pred = scale_bboxes(pred, frame.shape[:2], self.resize)
55 |         for i in pred:
56 |             # pred: x1, y1, x2, y2, conf, labels
57 |             frame = draw_boxes(frame, i[:4], i[4], i[5], self.labels, 0.7, self.colors)
58 |         frame = cv.putText(frame, f'fps: {fps}', (10, 30), fontFace=cv.FONT_HERSHEY_SIMPLEX, fontScale=1, thickness=2,
59 |                            lineType=cv.LINE_AA, color=(255, 0, 255))
60 |         return frame, times, pred
61 |
62 |
63 |
64 | def main(args):
65 |     times = []
66 |     # detection class labels
67 |     catid_labels = yaml_load(args.labels)
68 |     # video source
69 |     vc = cv.VideoCapture(args.video_dir)
70 |     # load the engine
71 |     yolo_draw = yolo_engine_det(args.engine_dir, catid_labels)
72 |
73 |     # read the video frame by frame
74 |     while vc.isOpened():
75 |         ret, frame = vc.read()
76 |
77 |         if ret is True:
78 |             frame, t, _ = yolo_draw.draw(frame, conf=args.conf_thres, iou=args.iou_thres, max_det=args.max_det)
79 |             print(f'{t}ms')
80 |             times.append(t)
81 |             cv.imshow('video', frame)
82 |
83 |             if cv.waitKey(30) & 0xFF == 27:
84 |                 break
85 |         else:
86 |             break
87 |     print(np.mean(times))
88 |     vc.release()
89 |     cv.destroyAllWindows()
90 |
91 |
92 | if __name__ == "__main__":
93 |     import argparse
94 |
95 |     parser = argparse.ArgumentParser(description=__doc__)
96 |     # object class labels
97 |     parser.add_argument('--labels', type=str, default='./labels_coco.yaml', help='obj labels')
98 |     # video path
99 |     parser.add_argument('--video_dir', type=str, default='sample_1080p_h265.mp4',
100 |                         help='video path')
101 |     # engine model path
102 |     parser.add_argument('--engine_dir', type=str, default='./models_trt/yolov5s.engine',
103 |                         help='engine path')
104 |     # only predictions with a score above the confidence threshold are kept
105 |     parser.add_argument('--conf_thres', type=float, default=0.25, help='confidence threshold')
106 |     # IoU threshold used by non-maximum suppression
107 |     parser.add_argument('--iou_thres', type=float, default=0.45, help='NMS IoU threshold')
108 |     # maximum number of detection boxes per image
109 |     parser.add_argument('--max_det', type=int, default=200, help='maximum detections per image')
110 |
111 |     args = parser.parse_args()
112 |     print(args)
113 |
114 |     main(args)
115 |
116 |
--------------------------------------------------------------------------------
/yolo_detect_v2.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | import torch
4 | import cv2 as cv
5 | import numpy as np
6 |
7 | from utils import trt_infer
8 | from utils.trt_infer import load_engine
9 | from utils.utils_detection import yaml_load, image_trans, scale_bboxes, non_max_suppression, Colors, draw_boxes
10 |
11 |
12 | class yolo_engine_det:
13 |     def __init__(self, engine_dir, catid_labels):
14 |         self.engine = load_engine(engine_dir)
15 |         self.context = self.engine.create_execution_context()
16 |         self.resize = self.engine.get_binding_shape(0)[2:]
17 |         self.colors = self.get_colors_dict(catid_labels)
18 |         self.labels = catid_labels
19 |
20 |         # self.context.set_binding_shape(0, [1, 3, self.resize[0], self.resize[1]])
21 |         self.inputs = None
22 |         self.outputs = None
23 |         self.bindings = None
24 |         self.stream = None
25 |
26 |         self.inputs, self.outputs, self.bindings, self.stream = trt_infer.allocate_buffers_v2(self.context)
27 |
28 |     @staticmethod
29 |     def get_colors_dict(catid_labels):
30 |         color_dicts = Colors(catid_labels)
31 |         return color_dicts.get_id_and_colors()
32 |
33 |
34 |     def draw(self, frame):
35 |         x = image_trans(frame, self.resize)
36 |         np.copyto(self.inputs[0].host, x.ravel())
37 |         t1 = time.time()
38 |         pred = trt_infer.do_inference_v2(
39 |             self.context, bindings=self.bindings, inputs=self.inputs, outputs=self.outputs, stream=self.stream
40 |         )
41 |         t2 = time.time()
42 |         fps = int(1.0 / (t2 - t1))
43 |         times = round((t2 - t1) * 1000, 3)
44 |         num_det, boxes, conf, labels = pred
45 |         num_det = num_det[0]
46 |         if num_det > 0:
47 |             # conf = conf[:num_det]
48 |             # labels = labels[:num_det]
49 |             boxes = boxes[:num_det * 4].reshape(-1, 4)
50 |             boxes = scale_bboxes(boxes, frame.shape[:2], self.resize)
51 |             for i in range(num_det):
52 |                 frame = draw_boxes(frame, boxes[i], conf[i], labels[i], self.labels, 0.7, self.colors)
53 |         frame = cv.putText(frame, f'fps: {fps}', (10, 30), fontFace=cv.FONT_HERSHEY_SIMPLEX, fontScale=1, thickness=2,
54 |                            lineType=cv.LINE_AA, color=(255, 0, 255))
55 |         return frame, times
56 |
57 |
58 | def main(args):
59 |     times = []
60 |     # detection class labels
61 |     catid_labels = yaml_load(args.labels)
62 |     # video source
63 |     vc = cv.VideoCapture(args.video_dir)
64 |     # load the engine
65 |     yolo_draw = yolo_engine_det(
66 |         args.engine_dir, catid_labels
67 |     )
68 |
69 | # 循环读取视频中的每一帧 70 | while vc.isOpened(): 71 | ret, frame = vc.read() 72 | 73 | if ret is True: 74 | frame, t = yolo_draw.draw(frame) 75 | print(f'{t}ms') 76 | times.append(t) 77 | cv.imshow('video', frame) 78 | 79 | if cv.waitKey(30) & 0xFF == 27: 80 | break 81 | else: 82 | break 83 | print(np.mean(times)) 84 | vc.release() 85 | cv.destroyAllWindows() 86 | 87 | 88 | if __name__ == "__main__": 89 | import argparse 90 | 91 | parser = argparse.ArgumentParser(description=__doc__) 92 | # 目标类别标签 93 | parser.add_argument('--labels', type=str, default='./labels_coco.yaml', help='obj labels') 94 | # video地址 95 | parser.add_argument('--video_dir', type=str, default='sample_1080p_h265.mp4', 96 | help='video path') 97 | # engine模型地址 98 | parser.add_argument('--engine_dir', type=str, default='./models_trt/yolov7_nms.engine', 99 | help='engine path') 100 | 101 | 102 | args = parser.parse_args() 103 | print(args) 104 | 105 | main(args) 106 | 107 | -------------------------------------------------------------------------------- /yolox_detect.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import torch 4 | import cv2 as cv 5 | import numpy as np 6 | 7 | from utils import trt_infer 8 | from utils.trt_infer import load_engine 9 | from utils.utils_detection import yaml_load, image_trans, scale_bboxes, non_max_suppression_torch, yolox_postprocess, \ 10 | Colors, draw_boxes 11 | 12 | 13 | class yolox_engine_det: 14 | def __init__(self, engine_dir, catid_labels): 15 | self.engine = load_engine(engine_dir) 16 | self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') 17 | self.context = self.engine.create_execution_context() 18 | self.resize = self.engine.get_binding_shape(0)[2:] 19 | self.colors = self.get_colors_dict(catid_labels) 20 | self.labels = catid_labels 21 | self.nms = non_max_suppression_torch 22 | 23 | # self.context.set_binding_shape(0, [1, 3, self.resize[0], self.resize[1]]) 24 | self.inputs = None 25 | self.outputs = None 26 | self.bindings = None 27 | self.stream = None 28 | 29 | self.inputs, self.outputs, self.bindings, self.stream = trt_infer.allocate_buffers_v2(self.context) 30 | 31 | @staticmethod 32 | def get_colors_dict(catid_labels): 33 | color_dicts = Colors(catid_labels) 34 | return color_dicts.get_id_and_colors() 35 | 36 | 37 | def draw(self, frame, conf=0.25, iou=0.45, max_det=200): 38 | x = image_trans(frame, self.resize) 39 | np.copyto(self.inputs[0].host, x.ravel()) 40 | t1 = time.time() 41 | pred = trt_infer.do_inference_v2( 42 | self.context, bindings=self.bindings, inputs=self.inputs, outputs=self.outputs, stream=self.stream 43 | ) 44 | pred = pred[0].reshape(self.context.get_binding_shape(1)) 45 | pred = yolox_postprocess(pred, self.resize, p6=False) 46 | pred = torch.from_numpy(pred).to(self.device) 47 | pred = self.nms(pred, False, conf_thres=conf, iou_thres=iou, agnostic=False, max_det=max_det)[0] 48 | t2 = time.time() 49 | fps = int(1.0 / (t2 - t1)) 50 | pred = scale_bboxes(pred, frame.shape[:2], self.resize) 51 | pred = pred.cpu().numpy() 52 | for i in pred: 53 | # pred: x1, y1, x2, y2, conf, labels 54 | # bbox = tuple(i[:4].astype('int')) 55 | # frame = cv.rectangle(frame, bbox[:2], bbox[2:], thickness=2, lineType=cv.LINE_AA, 56 | # color=self.colors[i[-1]] 57 | # ) 58 | # frame = cv.putText(frame, f'{self.labels[i[-1]]}:{i[-2]:.2f}', (bbox[0] + 5, bbox[1] + 30), 59 | # fontFace=cv.FONT_HERSHEY_DUPLEX, fontScale=1, thickness=1, lineType=cv.LINE_AA, 60 | # color = 
61 |             #                    )
62 |             frame = draw_boxes(frame, i[:4], i[4], i[5], self.labels, 0.7, self.colors)
63 |         frame = cv.putText(frame, f'fps: {fps}', (10, 30), fontFace=cv.FONT_HERSHEY_SIMPLEX, fontScale=1, thickness=2,
64 |                            lineType=cv.LINE_AA, color=(255, 0, 255))
65 |         return frame
66 |
67 |
68 | def main(args):
69 |     # detection class labels
70 |     catid_labels = yaml_load(args.labels)['labels']
71 |     # video source
72 |     vc = cv.VideoCapture(args.video_dir)
73 |     # load the engine
74 |     yolo_draw = yolox_engine_det(args.engine_dir, catid_labels)
75 |
76 |     # read the video frame by frame
77 |     while vc.isOpened():
78 |         ret, frame = vc.read()
79 |
80 |         if ret is True:
81 |             frame = yolo_draw.draw(
82 |                 frame, conf=args.conf_thres, iou=args.iou_thres, max_det=args.max_det
83 |             )
84 |             cv.imshow('video', frame)
85 |
86 |             if cv.waitKey(30) & 0xFF == 27:
87 |                 break
88 |         else:
89 |             break
90 |
91 |     vc.release()
92 |     cv.destroyAllWindows()
93 |
94 |
95 | if __name__ == "__main__":
96 |     import argparse
97 |
98 |     parser = argparse.ArgumentParser(description=__doc__)
99 |     # object class labels
100 |     parser.add_argument('--labels', type=str, default='./labels_coco.yaml', help='obj labels')
101 |     # video path
102 |     parser.add_argument('--video_dir', type=str, default='sample_1080p_h265.mp4',
103 |                         help='video path')
104 |     # engine model path
105 |     parser.add_argument('--engine_dir', type=str, default='./models_trt/yolox_s.engine',
106 |                         help='engine path')
107 |     # only predictions with a score above the confidence threshold are kept
108 |     parser.add_argument('--conf_thres', type=float, default=0.25, help='confidence threshold')
109 |     # IoU threshold used by non-maximum suppression
110 |     parser.add_argument('--iou_thres', type=float, default=0.45, help='NMS IoU threshold')
111 |     # maximum number of detection boxes per image
112 |     parser.add_argument('--max_det', type=int, default=200, help='maximum detections per image')
113 |
114 |     args = parser.parse_args()
115 |     print(args)
116 |
117 |     main(args)
118 |
--------------------------------------------------------------------------------
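
For quick experimentation, the video scripts above can be reduced to a single-image run. The sketch below is assembled only from the utilities shipped in this repository (load_engine, allocate_buffers_v2, do_inference_v2, image_trans, non_max_suppression, scale_bboxes, draw_boxes) and mirrors yolo_detect_v1.py; the engine path, image path, output filename, and thresholds are placeholders, and it assumes a YOLOv5-style engine with the same input/output bindings.

import cv2 as cv
import numpy as np

from utils import trt_infer
from utils.trt_infer import load_engine
from utils.utils_detection import yaml_load, image_trans, scale_bboxes, non_max_suppression, Colors, draw_boxes

ENGINE_PATH = './models_trt/yolov5s.engine'  # placeholder engine path
IMAGE_PATH = 'sample.jpg'                    # placeholder image path

catid_labels = yaml_load('./labels_coco.yaml')
colors = Colors(catid_labels).get_id_and_colors()

# load the engine and allocate host/device buffers once
engine = load_engine(ENGINE_PATH)
context = engine.create_execution_context()
resize = engine.get_binding_shape(0)[2:]
inputs, outputs, bindings, stream = trt_infer.allocate_buffers_v2(context)

# preprocess: letterbox to the engine input size, CHW, normalized float32
frame = cv.imread(IMAGE_PATH)
x = image_trans(frame, resize)
np.copyto(inputs[0].host, x.ravel())

# run inference and decode with the NumPy NMS used by yolo_detect_v1.py
pred = trt_infer.do_inference_v2(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)
pred = pred[0].reshape(context.get_binding_shape(1))
pred = non_max_suppression(pred, v8_head=False, conf_thres=0.25, iou_thres=0.45, max_det=200)[0]

# map boxes back to the original image and draw them
pred = scale_bboxes(pred, frame.shape[:2], resize)
for det in pred:  # det: x1, y1, x2, y2, conf, cls
    frame = draw_boxes(frame, det[:4], det[4], det[5], catid_labels, 0.7, colors)
cv.imwrite('result.jpg', frame)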