├── .gitignore
├── Benchmark.py
├── LICENSE
├── README.md
├── calibration
│   └── README.md
├── cpp
│   ├── README.md
│   ├── jetson_csi
│   │   ├── CMakeLists.txt
│   │   ├── README.md
│   │   ├── csi_detect.cpp
│   │   ├── gstreamer.cpp
│   │   ├── gstreamer.h
│   │   ├── labels_coco.yaml
│   │   ├── preprocess.cu
│   │   ├── preprocess.h
│   │   ├── trt_infer.cpp
│   │   ├── trt_infer.h
│   │   ├── utils_detection.cpp
│   │   └── utils_detection.h
│   ├── kp_jetson_csi
│   │   ├── CMakeLists.txt
│   │   ├── README.md
│   │   ├── csi_kp_detect.cpp
│   │   ├── gstreamer.cpp
│   │   ├── gstreamer.h
│   │   ├── labels_det.yaml
│   │   ├── points_link.yaml
│   │   ├── preprocess.cu
│   │   ├── preprocess.h
│   │   ├── trt_infer.cpp
│   │   ├── trt_infer.h
│   │   ├── utils_detection.cpp
│   │   └── utils_detection.h
│   └── video_detect
│       ├── CMakeLists.txt
│       ├── README.md
│       ├── labels_coco.yaml
│       ├── main.cpp
│       ├── preprocess.cu
│       ├── preprocess.h
│       ├── trt_infer.cpp
│       ├── trt_infer.h
│       ├── utils_detection.cpp
│       └── utils_detection.h
├── doc
│   └── yolov5s_det.png
├── labels_coco.yaml
├── labels_voc.yaml
├── models_onnx
│   └── README.md
├── models_trt
│   └── README.md
├── onnx2trt.py
├── requirements.txt
├── utils
│   ├── calibrator.py
│   ├── trt_infer.py
│   └── utils_detection.py
├── yolo_detect_v1.py
├── yolo_detect_v2.py
└── yolox_detect.py
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | __pycache__
3 | *.mp4
4 | *.onnx
5 | *.engine
6 | calibration
--------------------------------------------------------------------------------
/Benchmark.py:
--------------------------------------------------------------------------------
1 | # ---------------------------------------------------------------
2 | # This script shows how to run inference on the exported model with TensorRT and benchmark its speed
3 | # TensorRT is currently the fastest deployment framework on GPU ...
4 | # ---------------------------------------------------------------
5 |
6 | import time
7 | import numpy as np
8 | import tensorrt as trt
9 |
10 | from tqdm import tqdm
11 | from utils import trt_infer
12 |
13 | # int8 / fp32 ~ 70%
14 | # trt > ppq > fp32
15 |
16 | # Nvidia Nsight Performance Profile
17 | ENGINE_PATH = './models_trt/yolov5s.engine'
18 | BATCH_SIZE = 1
19 | INPUT_SHAPE = [BATCH_SIZE, 3, 512, 512]
20 | BENCHMARK_SAMPLES = 12800
21 |
22 | print(f'Benchmark with {ENGINE_PATH}')
23 | logger = trt.Logger(trt.Logger.ERROR)
24 | with open(ENGINE_PATH, 'rb') as f, trt.Runtime(logger) as runtime:
25 | engine = runtime.deserialize_cuda_engine(f.read())
26 |
27 | with engine.create_execution_context() as context:
28 | inputs, outputs, bindings, stream = trt_infer.allocate_buffers(context.engine)
29 | inputs[0].host = np.zeros(shape=INPUT_SHAPE, dtype=np.float32)
30 |
31 | t1 = time.time()
32 | for _ in tqdm(range(BENCHMARK_SAMPLES), desc=f'Benchmark ...'):
33 | trt_infer.do_inference(
34 | context, bindings=bindings, inputs=inputs,
35 | outputs=outputs, stream=stream, batch_size=BATCH_SIZE)
36 |
37 | t2 = time.time()
38 | t = (t2 - t1)*1000/BENCHMARK_SAMPLES
39 | print(f"{t:0.5f}ms")
40 |
41 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # yolov5_TensorRT_inference
2 | TensorRT quantization (fp16, int8) and inference code for yolov5. Verified on the Jetson platform: small models such as yolov5s and yolov8s can be deployed on a Jetson Nano 4G for camera detection.
3 |
4 |
5 |
6 |
7 | CPP:
8 | [Video object detection](https://github.com/MadaoFY/yolov5_TensorRT_inference/tree/main/cpp/video_detect)
9 | [Jetson Nano camera object detection](https://github.com/MadaoFY/yolov5_TensorRT_inference/tree/main/cpp/jetson_csi)
10 | [Jetson Nano camera human keypoint detection](https://github.com/MadaoFY/yolov5_TensorRT_inference/tree/main/cpp/kp_jetson_csi)
11 |
12 | Supported models:
13 | yolov5
14 | yolov7
15 | yolov8
16 | yolox (the nms module cannot be added to the generated engine)
17 |
18 | Note: this project was built against TensorRT 8.4.3.1; to make sure everything runs, your TensorRT version should be in the 8.4 line. See ```requirements.txt``` for the full environment.
19 |
20 | Project layout:
21 | ```bash
22 | |-yolov5_TensorRT_inference
23 | |-calibration # default folder for the int8 calibration set
24 | |-cpp # c++ inference code, including the code used on jetson nano
25 | |-doc # folder that just holds documentation assets
26 | |-models_onnx # default folder for onnx models
27 | |-models_trt # default folder for the trt engines produced by quantization
28 | |-utils # utils folder
29 | |-Benchmark.py # script for benchmarking trt engine speed
30 | |-labels_coco.yaml # coco dataset class labels
31 | |-labels_voc.yaml # voc dataset class labels
32 | |-onnx2trt.py # script that converts an onnx model to an engine, with EfficientNMS support added
33 | |-yolo_detect_v1.py # video detection script for engines without the nms op
34 | |-yolo_detect_v2.py # video detection script for engines with the nms op (the trt engine it uses has EfficientNMS added)
35 | |-yolox_detect.py # video detection script for yolox
36 | ```
37 |
38 | The following walks through quantizing the yolov5s model and running video inference with it.
39 | ## Data preparation
40 | We use the coco-trained model released by the official yolov5 repo, already exported to onnx. voc2012 is used as the calibration set here purely for demonstration; you can download the coco dataset and use it as your calibration set instead.
41 |
42 | yolov5s.onnx: https://pan.baidu.com/s/1eYaU3ndVpwexL4k6goxjHg
43 | Extraction code: sduf
44 |
45 | voc2012: https://pan.baidu.com/s/1rICWiczIv_GyrYIrEj1p3Q
46 | Extraction code: 4pgx
47 |
48 | Video source: https://pan.baidu.com/s/1HBIjz6019vn9qfoKPIuV2A
49 | Extraction code: fbfh
50 |
51 | ## Quantization (onnx2trt.py)
52 | Export the onnx model from the official yolov5, yolov7 or yolox repository; onnx models exported from third-party implementations are not guaranteed to work, and the exported onnx must not contain the nms part. If you want the nms op baked into the engine, set add_nms to True. By default onnx models are placed in the models_onnx folder and the exported trt engines are saved in the models_trt folder. For int8 quantization, prepare at least 500 images from the training set as a calibration set and place them in the calibration folder. (A sketch of the underlying build calls appears after the argument list below.)
53 |
54 | ```shell
55 | python onnx2trt.py --onnx_dir ./models_onnx/yolov5s.onnx --engine_dir ./models_trt/yolov5s.engine --int8 True --imgs_dir ./calibration
56 | ```
57 | Arguments:
58 | - ```--onnx_dir``` path to the onnx model
59 | - ```--engine_dir``` path where the trt engine is saved
60 | - ```--min_shape``` minimum input shape
61 | - ```--opt_shape``` optimal input shape
62 | - ```--max_shape``` maximum input shape
63 | - ```--fp16``` whether to use fp16 quantization
64 | - ```--int8``` whether to use int8 quantization
65 | - ```--imgs_dir``` path to the calibration set
66 | - ```--n_iteration``` number of int8 calibration iterations
67 | - ```--cache_file``` whether to generate a calibration cache
68 | - ```--yolov8_head``` whether the model uses the yolov8 detection head (note that yolov8's output differs from yolov5's)
69 | - ```--add_nms``` add the EfficientNMS op
70 | - ```--conf_thres``` nms confidence threshold
71 | - ```--iou_thres``` nms iou threshold
72 | - ```--max_det``` maximum number of detections returned by nms
73 |
74 | See the script for more detailed argument descriptions.
75 |
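For orientation, here is a minimal sketch of the build steps that onnx2trt.py wraps, written against the TensorRT 8.x Python API. The paths, the fixed 1x3x512x512 profile and the fp16 flag are illustrative assumptions; the script's int8 calibration (utils/calibrator.py) and EfficientNMS insertion are not shown.

```python
# Hypothetical sketch of onnx -> engine conversion with the TensorRT 8.x Python API.
import tensorrt as trt

logger = trt.Logger(trt.Logger.ERROR)
builder = trt.Builder(logger)
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
parser = trt.OnnxParser(network, logger)

with open('./models_onnx/yolov5s.onnx', 'rb') as f:
    if not parser.parse(f.read()):
        for i in range(parser.num_errors):
            print(parser.get_error(i))
        raise RuntimeError('failed to parse the onnx file')

config = builder.create_builder_config()
config.set_flag(trt.BuilderFlag.FP16)            # fp16; int8 additionally needs a calibrator
profile = builder.create_optimization_profile()  # mirrors --min_shape / --opt_shape / --max_shape
profile.set_shape(network.get_input(0).name, (1, 3, 512, 512), (1, 3, 512, 512), (1, 3, 512, 512))
config.add_optimization_profile(profile)

engine_bytes = builder.build_serialized_network(network, config)
with open('./models_trt/yolov5s.engine', 'wb') as f:
    f.write(engine_bytes)
```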
76 | ## Video inference
77 | ### 1. Inference script without the EfficientNMS op (yolo_detect_v1.py)
78 | You need a labels file for the model's output classes; see labels_coco.yaml in this repository for reference. The model used in this demo is the coco-trained yolov5s, so the corresponding coco classes are needed. Run yolo_detect_v1.py for yolov5 and yolov7 models, and yolox_detect.py for yolox models. Taking yolov5s.engine as an example (a Python loading-and-inference sketch follows the argument list below):
79 | ```shell
80 | python yolo_detect_v1.py --video_dir ./sample_1080p_h265.mp4 --engine_dir ./models_trt/yolov5s.engine --labels ./labels_coco.yaml
81 | ```
82 |
83 | - ```--video_dir``` path to the video source
84 | - ```--engine_dir``` path to the trt engine
85 | - ```--labels``` model labels file
86 | - ```--conf_thres``` nms confidence threshold
87 | - ```--iou_thres``` nms iou threshold
88 | - ```--max_det``` maximum number of detections returned by nms
89 |
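For reference, a minimal sketch of deserializing an engine and running a single inference from Python, using the same utils.trt_infer helpers that Benchmark.py relies on. The 1x3x512x512 input is an assumption borrowed from Benchmark.py; preprocessing, output decoding and NMS, which yolo_detect_v1.py handles, are omitted.

```python
import numpy as np
import tensorrt as trt
from utils import trt_infer

logger = trt.Logger(trt.Logger.ERROR)
with open('./models_trt/yolov5s.engine', 'rb') as f, trt.Runtime(logger) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())

with engine.create_execution_context() as context:
    inputs, outputs, bindings, stream = trt_infer.allocate_buffers(context.engine)
    inputs[0].host = np.zeros((1, 3, 512, 512), dtype=np.float32)  # put the preprocessed frame here
    preds = trt_infer.do_inference(
        context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream, batch_size=1)
    print([p.shape for p in preds])  # raw network outputs; decoding + NMS happen in yolo_detect_v1.py
```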
90 | ### 2. Inference script with the EfficientNMS op (yolo_detect_v2.py)
91 | The trt engine used by yolo_detect_v2.py already contains the EfficientNMS op, so the nms parameters do not need to be set again.
92 | ```shell
93 | python yolo_detect_v2.py --video_dir ./sample_1080p_h265.mp4 --engine_dir ./models_trt/yolov7_nms.engine --labels ./labels_coco.yaml
94 | ```
95 |
96 | - ```--video_dir``` path to the video source
97 | - ```--engine_dir``` path to the trt engine
98 | - ```--labels``` model labels file
99 |
100 |
101 | ## Related links
102 | Installing TensorRT is probably the most time-consuming part of the whole setup...
103 | TensorRT: https://docs.nvidia.com/deeplearning/tensorrt/install-guide/index.html#installing
104 | https://developer.nvidia.com/tensorrt
105 |
106 | Trt_sample: https://github.com/NVIDIA/trt-samples-for-hackathon-cn/tree/master/cookbook
107 |
108 | yolox:https://github.com/Megvii-BaseDetection/YOLOX
109 | yolov5:https://github.com/ultralytics/yolov5
110 | yolov7:https://github.com/WongKinYiu/yolov7
111 | yolov8: https://github.com/ultralytics/ultralytics
112 |
113 |
114 |
--------------------------------------------------------------------------------
/calibration/README.md:
--------------------------------------------------------------------------------
1 | Folder for the calibration set used for quantization
2 |
--------------------------------------------------------------------------------
/cpp/README.md:
--------------------------------------------------------------------------------
1 | # cpp_inference
2 | C++ TensorRT inference code. jetson_csi is the camera detection code for Jetson Nano.
3 |
4 | Supported models: yolov5, yolov7, yolov8
5 |
6 |
--------------------------------------------------------------------------------
/cpp/jetson_csi/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | cmake_minimum_required(VERSION 3.10)
2 |
3 | set(CMAKE_CXX_STANDARD 14)
4 | set(CMAKE_CXX_STANDARD_REQUIRED ON)
5 | set(CMAKE_CXX_EXTENSIONS ON)
6 |
7 | project(yolo_detect C CXX)
8 |
9 | add_definitions(-DAPI_EXPORTS)
10 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
11 | # SET(CMAKE_CXX_FLAGS_DEBUG "$ENV{CXXFLAGS} -O0 -g2 -ggdb")
12 | if(NOT CMAKE_BUILD_TYPE)
13 | SET(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} -O3")
14 | endif()
15 | set(src_list csi_detect.cpp utils_detection.cpp utils_detection.h trt_infer.cpp trt_infer.h gstreamer.cpp gstreamer.h preprocess.cu preprocess.h)
16 |
17 | # CUDA
18 | # TODO(Call for PR): make cmake compatible with Windows
19 | set(CMAKE_CUDA_COMPILER /usr/local/cuda-10.2/bin/nvcc)
20 | enable_language(CUDA)
21 | find_package(CUDA REQUIRED)
22 | message(STATUS " libraries: ${CUDA_LIBRARIES}")
23 | message(STATUS " include path: ${CUDA_INCLUDE_DIRS}")
24 |
25 |
26 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different
27 | include_directories(/usr/local/cuda-10.2/include/)
28 | link_directories(/usr/local/cuda-10.2/lib64/)
29 |
30 |
31 | # tensorrt
32 | # set(TRT_DIR J:/tensorrt/TensorRT-8.4.3.1)
33 | # set(TRT_INCLUDE_DIRS ${TRT_DIR}/include/)
34 | # set(TRT_LIB_DIRS ${TRT_DIR}/lib/)
35 |
36 | # include_directories(${TRT_INCLUDE_DIRS})
37 | # link_directories(${TRT_LIB_DIRS})
38 |
39 | #include_directories(${PROJECT_SOURCE_DIR}/)
40 | #file(GLOB_RECURSE SRCS ${PROJECT_SOURCE_DIR}/*.cpp ${PROJECT_SOURCE_DIR}/*.h)
41 |
42 | # opencv
43 | find_package(OpenCV REQUIRED)
44 | include_directories( ${OpenCV_INCLUDE_DIRS} )
45 |
46 | add_executable(${PROJECT_NAME} ${src_list})
47 | target_link_libraries(${PROJECT_NAME} nvinfer)
48 | target_link_libraries(${PROJECT_NAME} cudart)
49 | target_link_libraries(${PROJECT_NAME} ${OpenCV_LIBS})
50 |
--------------------------------------------------------------------------------
/cpp/jetson_csi/README.md:
--------------------------------------------------------------------------------
1 | # jetson_csi
2 | C++ code for object detection with the Jetson Nano CSI camera.
3 | Build with cmake, then run yolo_detect.
4 |
5 | ```shell
6 | yolo_detect --engine_dir=./yolov5s.engine --labels=./labels_coco.yaml
7 | ```
8 |
9 | Arguments:
10 | - ```--engine_dir``` path to the trt engine
11 | - ```--labels``` model labels file
12 | - ```--conf_thres``` nms confidence threshold
13 | - ```--iou_thres``` nms iou threshold
14 | - ```--max_det``` maximum number of detections returned by nms
15 |
16 | See csi_detect.cpp for more detailed argument descriptions.
17 |
--------------------------------------------------------------------------------
/cpp/jetson_csi/csi_detect.cpp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MadaoFY/yolov5_TensorRT_inference/4cc1ec7316c63b101da3c842c1f98dc82c90e70c/cpp/jetson_csi/csi_detect.cpp
--------------------------------------------------------------------------------
/cpp/jetson_csi/gstreamer.cpp:
--------------------------------------------------------------------------------
1 | #include "gstreamer.h"
2 |
3 |
4 | std::string gs_pipeline(int capture_width, int capture_height, int display_width, int display_height, int framerate, int flip_method)
5 | {
6 | std::string result = "nvarguscamerasrc ! video/x-raw(memory:NVMM), width=(int)" + std::to_string(capture_width) +
7 | ", height=(int)" + std::to_string(capture_height) +
8 | ", format=(string)NV12, framerate=(fraction)" + std::to_string(framerate) +
9 | "/1 ! nvvidconv flip-method=" + std::to_string(flip_method) +
10 | " ! video/x-raw, width=(int)" + std::to_string(display_width) +
11 | ", height=(int)" + std::to_string(display_height) +
12 | ", format=(string)BGRx ! videoconvert ! video/x-raw, format=(string)BGR ! appsink";
13 |
14 | return result;
15 | }
16 |
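For reference, the same pipeline can be consumed from Python through OpenCV's GStreamer backend. This is only a sketch: the 1280x720 at 30 fps defaults are assumptions, not values taken from csi_detect.cpp.

```python
import cv2

def gs_pipeline(capture_width=1280, capture_height=720, display_width=1280, display_height=720,
                framerate=30, flip_method=0):
    # same element chain as gs_pipeline() in gstreamer.cpp
    return (
        f"nvarguscamerasrc ! video/x-raw(memory:NVMM), width=(int){capture_width}, "
        f"height=(int){capture_height}, format=(string)NV12, framerate=(fraction){framerate}/1 ! "
        f"nvvidconv flip-method={flip_method} ! video/x-raw, width=(int){display_width}, "
        f"height=(int){display_height}, format=(string)BGRx ! videoconvert ! "
        f"video/x-raw, format=(string)BGR ! appsink"
    )

cap = cv2.VideoCapture(gs_pipeline(), cv2.CAP_GSTREAMER)
ok, frame = cap.read()  # frame is a BGR image, the same layout the C++ code receives
cap.release()
```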
--------------------------------------------------------------------------------
/cpp/jetson_csi/gstreamer.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include <string>
4 |
5 | std::string gs_pipeline(int capture_width, int capture_height, int display_width, int display_height, int framerate, int flip_method);
6 |
--------------------------------------------------------------------------------
/cpp/jetson_csi/labels_coco.yaml:
--------------------------------------------------------------------------------
1 | 0: person
2 | 1: bicycle
3 | 2: car
4 | 3: motorcycle
5 | 4: airplane
6 | 5: bus
7 | 6: train
8 | 7: truck
9 | 8: boat
10 | 9: traffic light
11 | 10: fire hydrant
12 | 11: stop sign
13 | 12: parking meter
14 | 13: bench
15 | 14: bird
16 | 15: cat
17 | 16: dog
18 | 17: horse
19 | 18: sheep
20 | 19: cow
21 | 20: elephant
22 | 21: bear
23 | 22: zebra
24 | 23: giraffe
25 | 24: backpack
26 | 25: umbrella
27 | 26: handbag
28 | 27: tie
29 | 28: suitcase
30 | 29: frisbee
31 | 30: skis
32 | 31: snowboard
33 | 32: sports ball
34 | 33: kite
35 | 34: baseball bat
36 | 35: baseball glove
37 | 36: skateboard
38 | 37: surfboard
39 | 38: tennis racket
40 | 39: bottle
41 | 40: wine glass
42 | 41: cup
43 | 42: fork
44 | 43: knife
45 | 44: spoon
46 | 45: bowl
47 | 46: banana
48 | 47: apple
49 | 48: sandwich
50 | 49: orange
51 | 50: broccoli
52 | 51: carrot
53 | 52: hot dog
54 | 53: pizza
55 | 54: donut
56 | 55: cake
57 | 56: chair
58 | 57: couch
59 | 58: potted plant
60 | 59: bed
61 | 60: dining table
62 | 61: toilet
63 | 62: tv
64 | 63: laptop
65 | 64: mouse
66 | 65: remote
67 | 66: keyboard
68 | 67: cell phone
69 | 68: microwave
70 | 69: oven
71 | 70: toaster
72 | 71: sink
73 | 72: refrigerator
74 | 73: book
75 | 74: clock
76 | 75: vase
77 | 76: scissors
78 | 77: teddy bear
79 | 78: hair drier
80 | 79: toothbrush
81 |
--------------------------------------------------------------------------------
/cpp/jetson_csi/preprocess.cu:
--------------------------------------------------------------------------------
1 | #include "preprocess.h"
2 |
3 | #include <cuda_runtime.h>
4 |
5 |
6 | __global__ void warpaffine_nearest_bgrbgr2rrggbb_kernel(
7 | uint8_t* src, int src_step_size, int src_width,
8 | int src_height, float* dst, int dst_width,
9 | int dst_height, uint8_t const_value_st,
10 | AffineMatrix d2s, int h_p, int w_p)
11 | {
12 | int dx = blockDim.x * blockIdx.x + threadIdx.x;
13 | int dy = blockDim.y * blockIdx.y + threadIdx.y;
14 | if (dx >= dst_width || dy >= dst_height) return;
15 |
16 | float m_x1 = d2s.value[0];
17 | float m_y1 = d2s.value[1];
18 | float m_z1 = d2s.value[2];
19 | float m_x2 = d2s.value[3];
20 | float m_y2 = d2s.value[4];
21 | float m_z2 = d2s.value[5];
22 |
23 | float c0, c1, c2;
24 | if (dy < h_p || dy >(dst_height - h_p) || dx < w_p || dx >(dst_width - w_p))
25 | {
26 | // out of range
27 | c0 = const_value_st;
28 | c1 = const_value_st;
29 | c2 = const_value_st;
30 | }
31 | else
32 | {
33 | float src_x = m_x1 * (dx + 0.5f) + m_y1 * dy + m_z1 - 0.5f;
34 | float src_y = m_x2 * dx + m_y2 * (dy + 0.5f) + m_z2 - 0.5f;
35 |
36 | int sy_1 = floorf(src_y + 0.5f);
37 | int sx_1 = floorf(src_x + 0.5f);
38 |
39 | uint8_t const_value[] = { const_value_st, const_value_st, const_value_st };
40 | uint8_t* p = const_value;
41 |
42 | if (sy_1 >= 0 && sy_1 <= src_height && sx_1 >=0 && sx_1 <= src_width)
43 | {
44 | p = src + sy_1 * src_step_size + sx_1 * 3;
45 | }
46 |
47 | c0 = p[0];
48 | c1 = p[1];
49 | c2 = p[2];
50 | }
51 |
52 | // normalization
53 | c0 /= 255.0f;
54 | c1 /= 255.0f;
55 | c2 /= 255.0f;
56 |
57 | // bgrbgrbgr to rrrgggbbb
58 | int area = dst_width * dst_height;
59 | float* pdst_c0 = dst + dy * dst_width + dx;
60 | pdst_c0[0] = c2;
61 | pdst_c0[area] = c1;
62 | pdst_c0[2 * area] = c0;
63 | }
64 |
65 | __global__ void warpaffine_bilinear_bgrbgr2rrggbb_kernel(
66 | uint8_t* src, int src_step_size, int src_width,
67 | int src_height, float* dst, int dst_width,
68 | int dst_height, uint8_t const_value_st,
69 | AffineMatrix d2s, int h_p, int w_p)
70 | {
71 | int dx = blockDim.x * blockIdx.x + threadIdx.x;
72 | int dy = blockDim.y * blockIdx.y + threadIdx.y;
73 | if (dx >= dst_width || dy >= dst_height) return;
74 |
75 | float m_x1 = d2s.value[0];
76 | float m_y1 = d2s.value[1];
77 | float m_z1 = d2s.value[2];
78 | float m_x2 = d2s.value[3];
79 | float m_y2 = d2s.value[4];
80 | float m_z2 = d2s.value[5];
81 |
82 | float c0, c1, c2;
83 | if (dy < h_p || dy >(dst_height - h_p) || dx < w_p || dx >(dst_width - w_p))
84 | {
85 | // out of range
86 | c0 = const_value_st;
87 | c1 = const_value_st;
88 | c2 = const_value_st;
89 | }
90 | else
91 | {
92 | float src_x = m_x1 * (dx + 0.5f) + m_y1 * dy + m_z1 - 0.5f;
93 | float src_y = m_x2 * dx + m_y2 * (dy + 0.5f) + m_z2 - 0.5f;
94 |
95 | int sy_1 = floorf(src_y);
96 | int sx_1 = floorf(src_x);
97 | int sy_2 = sy_1 + 1;
98 | int sx_2 = sx_1 + 1;
99 |
100 | uint8_t const_value[] = { const_value_st, const_value_st, const_value_st };
101 | float a2 = src_y - sy_1;
102 | float a1 = 1.0f - a2;
103 | float b2 = src_x - sx_1;
104 | float b1 = 1.0f - b2;
105 | float w11 = a1 * b1;
106 | float w12 = a1 * b2;
107 | float w21 = a2 * b1;
108 | float w22 = a2 * b2;
109 | uint8_t* p11 = const_value;
110 | uint8_t* p12 = const_value;
111 | uint8_t* p21 = const_value;
112 | uint8_t* p22 = const_value;
113 |
114 | /*if (sy_1 >= 0) {
115 | if (sx_1 >= 0)*/
116 | p11 = src + sy_1 * src_step_size + sx_1 * 3;
117 |
118 | //if (sx_2 < src_width)
119 | p12 = src + sy_1 * src_step_size + sx_2 * 3;
120 | //}
121 |
122 | /*if (sy_2 < src_height) {
123 | if (sx_1 >= 0)*/
124 | p21 = src + sy_2 * src_step_size + sx_1 * 3;
125 |
126 | /*if (sx_2 < src_width)*/
127 | p22 = src + sy_2 * src_step_size + sx_2 * 3;
128 | //}
129 |
130 | c0 = w11 * p11[0] + w12 * p12[0] + w21 * p21[0] + w22 * p22[0] + 0.5f;
131 | c1 = w11 * p11[1] + w12 * p12[1] + w21 * p21[1] + w22 * p22[1] + 0.5f;
132 | c2 = w11 * p11[2] + w12 * p12[2] + w21 * p21[2] + w22 * p22[2] + 0.5f;
133 | }
134 |
135 | // normalization
136 | c0 /= 255.0f;
137 | c1 /= 255.0f;
138 | c2 /= 255.0f;
139 |
140 | // bgrbgrbgr to rrrgggbbb
141 | int area = dst_width * dst_height;
142 | float* pdst_c0 = dst + dy * dst_width + dx;
143 | pdst_c0[0] = c2;
144 | pdst_c0[area] = c1;
145 | pdst_c0[2 * area] = c0;
146 | }
147 |
148 |
149 | void cuda_preprocess(cv::Mat& image, preproc_struct& image_trans, std::vector<void*>& bufferH,
150 | std::vector<void*>& bufferD, std::vector<int>& bindingsize, cudaStream_t& stream, cv::Size resize)
151 | {
152 | int h, w, h_p, w_p;
153 |
154 | float scale = cv::min((float)resize.height / (float)image.rows, (float)resize.width / (float)image.cols);
155 | scale = cv::min(scale, 1.1f);
156 |
157 | h = image.rows * scale;
158 | w = image.cols * scale;
159 | h_p = (resize.height - h) * 0.5f;
160 | w_p = (resize.width - w) * 0.5f;
161 |
162 | image_trans.scale = scale;
163 | image_trans.h_p = h_p;
164 | image_trans.w_p = w_p;
165 |
166 | // copy data to device memory
167 | // memcpy(bufferH[2], image.data, bindingsize[2]);
168 | // cudaMemcpyAsync(bufferD[2], bufferH[2], bindingsize[2], cudaMemcpyHostToDevice, stream);
169 | cudaMemcpyAsync(bufferD[2], image.data, bindingsize[2], cudaMemcpyHostToDevice, stream);
170 |
171 | AffineMatrix s2d, d2s;
172 |
173 | /*s2d.value[0] = scale;
174 | s2d.value[1] = 0;
175 | s2d.value[2] = (resize.width - scale * image.cols + scale - 1) * 0.5f;
176 | s2d.value[3] = 0;
177 | s2d.value[4] = scale;
178 | s2d.value[5] = (resize.height - scale * image.rows + scale - 1) * 0.5f;*/
179 |
180 | d2s.value[0] = 1.0f / scale;
181 | d2s.value[1] = 0;
182 | d2s.value[2] = (image.cols - resize.width / scale + d2s.value[0] - 1) * 0.5f;
183 | d2s.value[3] = 0;
184 | d2s.value[4] = 1.0f / scale;
185 | d2s.value[5] = (image.rows - resize.height / scale + d2s.value[0] - 1) * 0.5f;
186 |
187 | /*cv::Mat m2x3_s2d(2, 3, CV_32F, s2d.value);
188 | cv::Mat m2x3_d2s(2, 3, CV_32F, d2s.value);
189 | cv::invertAffineTransform(m2x3_s2d, m2x3_d2s);
190 | memcpy(d2s.value, m2x3_d2s.ptr(0), sizeof(d2s.value));*/
191 |
192 | dim3 block(128, 1);
193 | dim3 grid((resize.width + block.x - 1) / block.x, (resize.height + block.y - 1) / block.y);
194 |
195 | warpaffine_nearest_bgrbgr2rrggbb_kernel <<< grid, block, 0, stream >>> (
196 | (uint8_t*)bufferD[2], image.cols * 3, image.cols,
197 | image.rows, (float*)bufferD[0], resize.width,
198 | resize.height, 0, d2s, h_p, w_p);
199 | }
200 |
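As a cross-check, a small NumPy sketch of the scale, padding and destination-to-source affine bookkeeping that cuda_preprocess sets up above; it mirrors the same formulas, including the 1.1 scale cap, and is only meant for verifying the d2s coefficients on the CPU.

```python
import numpy as np

def letterbox_params(src_h, src_w, dst_h, dst_w):
    scale = min(dst_h / src_h, dst_w / src_w)
    scale = min(scale, 1.1)                        # same cap as in cuda_preprocess
    h_p = int((dst_h - src_h * scale) * 0.5)       # vertical padding in the destination image
    w_p = int((dst_w - src_w * scale) * 0.5)       # horizontal padding
    # destination -> source affine coefficients, as in d2s.value[0..5]
    d2s = np.array([1.0 / scale, 0.0, (src_w - dst_w / scale + 1.0 / scale - 1) * 0.5,
                    0.0, 1.0 / scale, (src_h - dst_h / scale + 1.0 / scale - 1) * 0.5])
    return scale, h_p, w_p, d2s

def dst_to_src(dx, dy, d2s):
    # maps a destination pixel back to source coordinates, mirroring the kernel's src_x / src_y
    src_x = d2s[0] * (dx + 0.5) + d2s[1] * dy + d2s[2] - 0.5
    src_y = d2s[3] * dx + d2s[4] * (dy + 0.5) + d2s[5] - 0.5
    return src_x, src_y
```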
--------------------------------------------------------------------------------
/cpp/jetson_csi/preprocess.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include "utils_detection.h"
4 |
5 | #include <opencv2/opencv.hpp>
6 | #include <cuda_runtime.h>
7 |
8 | struct AffineMatrix
9 | {
10 | float value[6];
11 | };
12 |
13 | void cuda_preprocess(cv::Mat& image, preproc_struct& image_trans, std::vector<void*>& bufferH,
14 | std::vector<void*>& bufferD, std::vector<int>& bindingsize, cudaStream_t& stream, cv::Size resize);
15 |
--------------------------------------------------------------------------------
/cpp/jetson_csi/trt_infer.cpp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MadaoFY/yolov5_TensorRT_inference/4cc1ec7316c63b101da3c842c1f98dc82c90e70c/cpp/jetson_csi/trt_infer.cpp
--------------------------------------------------------------------------------
/cpp/jetson_csi/trt_infer.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include <NvInfer.h>
3 |
4 | #include <cuda_runtime.h>
5 | #include <opencv2/opencv.hpp>
6 | #include <string>
7 | #include <vector>
8 | #include <unordered_map>
9 |
10 |
11 |
12 | class Logger : public nvinfer1::ILogger
13 | {
14 | public:
15 | Severity reportableSeverity;
16 |
17 | Logger(Severity severity = Severity::kINFO);
18 | void log(Severity severity, const char* msg) noexcept override;
19 | };
20 |
21 |
22 | bool load_engine(nvinfer1::IRuntime*& runtime, nvinfer1::ICudaEngine*& engine, const std::string& engine_dir,
23 | nvinfer1::ILogger& gLogger);
24 |
25 | void allocate_buffers(nvinfer1::ICudaEngine*& engine, std::vector& bufferH, std::vector& bufferD, std::vector& bindingsize,
26 | cv::Size img_size);
27 |
28 | float* do_inference(nvinfer1::IExecutionContext*& context, std::vector& bufferH, const std::vector& bufferD,
29 | cudaStream_t& stream, const std::vector& BindingSize);
30 |
31 |
32 | class yolo_trt_det
33 | {
34 | private:
35 |
36 | nvinfer1::IRuntime* _runtime = nullptr;
37 | nvinfer1::ICudaEngine* _engine = nullptr;
38 | nvinfer1::IExecutionContext* _context = nullptr;
39 |
40 | std::unordered_map<int, std::string> catid_labels;
41 | color_dicts catid_colors;
42 | cv::Size img_size;
43 | cv::Size set_size;
44 | bool v8_head;
45 |
46 | std::vector<void*> cpu_buffer;
47 | std::vector<void*> gpu_buffer;
48 | std::vector<int> BindingSize;
49 | cudaStream_t stream;
50 |
51 | public:
52 | yolo_trt_det(const std::string& engine_dir, const std::string& labels_dir, cv::Size img_size);
53 | ~yolo_trt_det();
54 |
55 | std::vector<cv::Mat> draw_batch(std::vector<cv::Mat>& image_list, float conf, float iou, int max_det);
56 |
57 | cv::Mat draw(cv::Mat& image, float conf, float iou, int max_det);
58 | };
59 |
--------------------------------------------------------------------------------
/cpp/jetson_csi/utils_detection.cpp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MadaoFY/yolov5_TensorRT_inference/4cc1ec7316c63b101da3c842c1f98dc82c90e70c/cpp/jetson_csi/utils_detection.cpp
--------------------------------------------------------------------------------
/cpp/jetson_csi/utils_detection.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include <opencv2/opencv.hpp>
3 |
4 | #include <string>
5 | #include <vector>
6 | #include <array>
7 | #include <unordered_map>
8 |
9 |
10 |
11 | struct color_dicts
12 | {
13 | std::unordered_map<int, std::vector<int>> color_map;
14 | std::vector<int> catid;
15 |
16 | color_dicts(const std::unordered_map<int, std::string>& catid_labels);
17 | };
18 |
19 |
20 | struct preproc_struct
21 | {
22 | float* img = nullptr;
23 | float scale;
24 | int h_p;
25 | int w_p;
26 |
27 | ~preproc_struct();
28 | };
29 |
30 |
31 |
32 | std::unordered_map<int, std::string> yaml_load_labels(const std::string& dir = "data.yaml");
33 |
34 | void preprocess(cv::Mat& image, preproc_struct& image_trans, cv::Size resize);
35 |
36 | void fliter_boxes(float* const boxes, bool v8_head, const std::array<int, 3>& output_shape, float conf_thres,
37 | std::vector<cv::Rect>& keep_boxes, std::vector<float>& keep_scores, std::vector<int>& keep_classes);
38 |
39 | void scale_boxes(cv::Rect& box, const preproc_struct& preproc_res);
40 |
41 | void draw_boxes(cv::Mat image, const cv::Rect& box, float score, int class_id,
42 | std::unordered_map<int, std::string> catid_labels, color_dicts& color_dicts);
43 |
44 | void imgresize(const cv::Mat& image, cv::Mat& input_image, float scale, cv::Size resize);
45 |
46 | template <typename T>
47 | static bool SortScorePairDescend(const std::pair<float, T>& pair1, const std::pair<float, T>& pair2);
48 |
49 | template <typename T>
50 | void max_score_idx(const std::vector<float>& scores, float score_thres, T scores_idxs);
51 |
52 | float get_iou(const cv::Rect& bbox1, const cv::Rect& bbox2);
53 |
54 | void base_nms(const std::vector<cv::Rect>& bboxes, const std::vector<float>& scores, const std::vector<int>& catid,
55 | float score_threshold, float nms_threshold, std::vector<int>& indices, int limit);
56 |
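For reference, a minimal NumPy sketch of the IoU computation and greedy NMS that get_iou and base_nms declare above. Boxes are assumed to be [x, y, w, h] rectangles in the cv::Rect convention; the per-class handling of catid, the score threshold and the max-detections limit are left out.

```python
import numpy as np

def iou(a, b):
    # a, b are [x, y, w, h]
    ax2, ay2 = a[0] + a[2], a[1] + a[3]
    bx2, by2 = b[0] + b[2], b[1] + b[3]
    iw = max(0.0, min(ax2, bx2) - max(a[0], b[0]))
    ih = max(0.0, min(ay2, by2) - max(a[1], b[1]))
    inter = iw * ih
    union = a[2] * a[3] + b[2] * b[3] - inter
    return inter / union if union > 0 else 0.0

def greedy_nms(boxes, scores, iou_thres):
    order = np.argsort(scores)[::-1]          # highest score first
    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(int(i))
        rest = [j for j in order[1:] if iou(boxes[i], boxes[j]) <= iou_thres]
        order = np.array(rest, dtype=int)
    return keep
```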
--------------------------------------------------------------------------------
/cpp/kp_jetson_csi/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | cmake_minimum_required(VERSION 3.10)
2 |
3 | set(CMAKE_CXX_STANDARD 14)
4 | set(CMAKE_CXX_STANDARD_REQUIRED ON)
5 | set(CMAKE_CXX_EXTENSIONS ON)
6 |
7 | project(yolo_detect C CXX)
8 |
9 | add_definitions(-DAPI_EXPORTS)
10 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
11 | # SET(CMAKE_CXX_FLAGS_DEBUG "$ENV{CXXFLAGS} -O0 -g2 -ggdb")
12 | if(NOT CMAKE_BUILD_TYPE)
13 | SET(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} -O3")
14 | endif()
15 | set(src_list csi_kp_detect.cpp utils_detection.cpp utils_detection.h trt_infer.cpp trt_infer.h gstreamer.cpp gstreamer.h preprocess.cu preprocess.h)
16 |
17 | # CUDA
18 | # TODO(Call for PR): make cmake compatible with Windows
19 | set(CMAKE_CUDA_COMPILER /usr/local/cuda-10.2/bin/nvcc)
20 | enable_language(CUDA)
21 | find_package(CUDA REQUIRED)
22 | message(STATUS " libraries: ${CUDA_LIBRARIES}")
23 | message(STATUS " include path: ${CUDA_INCLUDE_DIRS}")
24 |
25 |
26 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different
27 | include_directories(/usr/local/cuda-10.2/include/)
28 | link_directories(/usr/local/cuda-10.2/lib64/)
29 |
30 |
31 | # tensorrt
32 | # set(TRT_DIR J:/tensorrt/TensorRT-8.4.3.1)
33 | # set(TRT_INCLUDE_DIRS ${TRT_DIR}/include/)
34 | # set(TRT_LIB_DIRS ${TRT_DIR}/lib/)
35 |
36 | # include_directories(${TRT_INCLUDE_DIRS})
37 | # link_directories(${TRT_LIB_DIRS})
38 |
39 | #include_directories(${PROJECT_SOURCE_DIR}/)
40 | #file(GLOB_RECURSE SRCS ${PROJECT_SOURCE_DIR}/*.cpp ${PROJECT_SOURCE_DIR}/*.h)
41 |
42 | # opencv
43 | find_package(OpenCV REQUIRED)
44 | include_directories( ${OpenCV_INCLUDE_DIRS} )
45 |
46 | add_executable(${PROJECT_NAME} ${src_list})
47 | target_link_libraries(${PROJECT_NAME} nvinfer)
48 | target_link_libraries(${PROJECT_NAME} cudart)
49 | target_link_libraries(${PROJECT_NAME} ${OpenCV_LIBS})
50 |
--------------------------------------------------------------------------------
/cpp/kp_jetson_csi/README.md:
--------------------------------------------------------------------------------
1 | # kp_jetson_csi
2 | Camera-based human keypoint detection with yolov5 and hrnet on a Jetson Nano 4G.
3 | I slimmed hrnet down so it can run on platforms with limited compute: the backbone was replaced with mobilenetv2 and the model was retrained on coco2017, which is enough for single-person keypoint detection.
4 | I may update the keypoint model later; then again, if I never find the time to hack together something faster and more accurate, so be it...
5 |
6 | ## Data preparation
7 | Compared with plain object detection, you additionally need a keypoint-detection engine and the keypoint link information.
8 | You can take the two onnx models below and build the engines on the target device, or train your own person-only yolo detector plus an hrnet model for keypoint detection.
9 |
10 | yolov5s_person.onnx: https://pan.baidu.com/s/1mgbFLOENiIaTmfsyc2RtVw
11 | Extraction code: qei0
12 |
13 | Myhrnet.onnx: https://pan.baidu.com/s/1rIR_CjOuu6qzaWsoirfP3A
14 | Extraction code: 43dw
15 |
16 | points_link.yaml stores the keypoint link (skeleton) information used for drawing.
17 |
18 | Build with cmake, then run yolo_detect.
19 |
20 | ```shell
21 | yolo_detect --det_engine_dir=./yolov5s_person.engine --kp_engine_dir=./Myhrnet.engine --labels=./labels_det.yaml --pointlinker=./points_link.yaml
22 | ```
23 |
24 | Arguments:
25 | - ```--det_engine_dir``` path to the detection trt engine
26 | - ```--kp_engine_dir``` path to the keypoint trt engine
27 | - ```--labels``` model labels yaml file
28 | - ```--pointlinker``` keypoint link yaml file
29 | - ```--conf_thres``` nms confidence threshold
30 | - ```--iou_thres``` nms iou threshold
31 | - ```--max_det``` maximum number of detections returned
32 | - ```--skip``` number of frames skipped between detections
33 |
34 | See csi_kp_detect.cpp for more detailed argument descriptions.
35 |
--------------------------------------------------------------------------------
/cpp/kp_jetson_csi/csi_kp_detect.cpp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MadaoFY/yolov5_TensorRT_inference/4cc1ec7316c63b101da3c842c1f98dc82c90e70c/cpp/kp_jetson_csi/csi_kp_detect.cpp
--------------------------------------------------------------------------------
/cpp/kp_jetson_csi/gstreamer.cpp:
--------------------------------------------------------------------------------
1 | #include "gstreamer.h"
2 |
3 |
4 | std::string gs_pipeline(int capture_width, int capture_height, int display_width, int display_height, int framerate, int flip_method)
5 | {
6 | std::string result = "nvarguscamerasrc ! video/x-raw(memory:NVMM), width=(int)" + std::to_string(capture_width) +
7 | ", height=(int)" + std::to_string(capture_height) +
8 | ", format=(string)NV12, framerate=(fraction)" + std::to_string(framerate) +
9 | "/1 ! nvvidconv flip-method=" + std::to_string(flip_method) +
10 | " ! video/x-raw, width=(int)" + std::to_string(display_width) +
11 | ", height=(int)" + std::to_string(display_height) +
12 | ", format=(string)BGRx ! videoconvert ! video/x-raw, format=(string)BGR ! appsink";
13 |
14 | return result;
15 | }
16 |
--------------------------------------------------------------------------------
/cpp/kp_jetson_csi/gstreamer.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include <string>
4 |
5 | std::string gs_pipeline(int capture_width, int capture_height, int display_width, int display_height, int framerate, int flip_method);
6 |
--------------------------------------------------------------------------------
/cpp/kp_jetson_csi/labels_det.yaml:
--------------------------------------------------------------------------------
1 | 0: person
2 |
--------------------------------------------------------------------------------
/cpp/kp_jetson_csi/points_link.yaml:
--------------------------------------------------------------------------------
1 | 15: 13
2 | 13: 11
3 | 16: 14
4 | 14: 12
5 | 12: 11
6 | 11: 5
7 | 6: 12
8 | 5: 6
9 | 7: 5
10 | 8: 6
11 | 9: 7
12 | 10: 8
13 | 1: 2
14 | 0: 1
15 | 0: 2
16 | 1: 3
17 | 2: 4
18 | 3: 5
19 | 4: 6
20 |
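For reference, a small Python sketch of how this link table can be used to draw a skeleton; the C++ equivalents are yaml_load_points_link and draw_keypoints. It is only an assumption sketch: keypoints are taken to be integer (x, y) pairs in COCO order, and the file is parsed line by line because a plain YAML mapping would drop the repeated 0: and 1: keys.

```python
import cv2

def load_points_link(path):
    # returns a list of (from_idx, to_idx) pairs, preserving duplicated keys
    links = []
    with open(path) as f:
        for line in f:
            line = line.strip()
            if line and not line.startswith('#'):
                a, b = line.split(':')
                links.append((int(a), int(b)))
    return links

def draw_skeleton(image, keypoints, links):
    # keypoints: 17 integer (x, y) pairs in COCO keypoint order (assumption)
    for a, b in links:
        cv2.line(image, tuple(keypoints[a]), tuple(keypoints[b]), (0, 255, 0), 2)
    return image
```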
--------------------------------------------------------------------------------
/cpp/kp_jetson_csi/preprocess.cu:
--------------------------------------------------------------------------------
1 | #include "preprocess.h"
2 |
3 | #include <cuda_runtime.h>
4 |
5 |
6 | __global__ void warpaffine_nearest_bgrbgr2rrggbb_kernel(
7 | uint8_t* src, int src_step_size, int src_width,
8 | int src_height, float* dst, int dst_width,
9 | int dst_height, uint8_t const_value_st,
10 | AffineMatrix d2s, int h_p, int w_p)
11 | {
12 | int dx = blockDim.x * blockIdx.x + threadIdx.x;
13 | int dy = blockDim.y * blockIdx.y + threadIdx.y;
14 | if (dx >= dst_width || dy >= dst_height) return;
15 |
16 | float m_x1 = d2s.value[0];
17 | float m_y1 = d2s.value[1];
18 | float m_z1 = d2s.value[2];
19 | float m_x2 = d2s.value[3];
20 | float m_y2 = d2s.value[4];
21 | float m_z2 = d2s.value[5];
22 |
23 | float c0, c1, c2;
24 | if (dy < h_p || dy >(dst_height - h_p) || dx < w_p || dx >(dst_width - w_p))
25 | {
26 | // out of range
27 | c0 = const_value_st;
28 | c1 = const_value_st;
29 | c2 = const_value_st;
30 | }
31 | else
32 | {
33 | float src_x = m_x1 * (dx + 0.5f) + m_y1 * dy + m_z1 - 0.5f;
34 | float src_y = m_x2 * dx + m_y2 * (dy + 0.5f) + m_z2 - 0.5f;
35 |
36 | int sy_1 = floorf(src_y + 0.5f);
37 | int sx_1 = floorf(src_x + 0.5f);
38 |
39 | uint8_t const_value[] = { const_value_st, const_value_st, const_value_st };
40 | uint8_t* p = const_value;
41 |
42 | if (sy_1 >= 0 && sy_1 <= src_height && sx_1 >=0 && sx_1 <= src_width)
43 | {
44 | p = src + sy_1 * src_step_size + sx_1 * 3;
45 | }
46 |
47 | c0 = p[0];
48 | c1 = p[1];
49 | c2 = p[2];
50 | }
51 |
52 | // normalization
53 | c0 /= 255.0f;
54 | c1 /= 255.0f;
55 | c2 /= 255.0f;
56 |
57 | // bgrbgrbgr to rrrgggbbb
58 | int area = dst_width * dst_height;
59 | float* pdst_c0 = dst + dy * dst_width + dx;
60 | pdst_c0[0] = c2;
61 | pdst_c0[area] = c1;
62 | pdst_c0[2 * area] = c0;
63 | }
64 |
65 | __global__ void warpaffine_bilinear_bgrbgr2rrggbb_kernel(
66 | uint8_t* src, int src_step_size, int src_width,
67 | int src_height, float* dst, int dst_width,
68 | int dst_height, uint8_t const_value_st,
69 | AffineMatrix d2s, int h_p, int w_p)
70 | {
71 | int dx = blockDim.x * blockIdx.x + threadIdx.x;
72 | int dy = blockDim.y * blockIdx.y + threadIdx.y;
73 | if (dx >= dst_width || dy >= dst_height) return;
74 |
75 | float m_x1 = d2s.value[0];
76 | float m_y1 = d2s.value[1];
77 | float m_z1 = d2s.value[2];
78 | float m_x2 = d2s.value[3];
79 | float m_y2 = d2s.value[4];
80 | float m_z2 = d2s.value[5];
81 |
82 | float c0, c1, c2;
83 | if (dy < h_p || dy >(dst_height - h_p) || dx < w_p || dx >(dst_width - w_p))
84 | {
85 | // out of range
86 | c0 = const_value_st;
87 | c1 = const_value_st;
88 | c2 = const_value_st;
89 | }
90 | else
91 | {
92 | float src_x = m_x1 * (dx + 0.5f) + m_y1 * dy + m_z1 - 0.5f;
93 | float src_y = m_x2 * dx + m_y2 * (dy + 0.5f) + m_z2 - 0.5f;
94 |
95 | int sy_1 = floorf(src_y);
96 | int sx_1 = floorf(src_x);
97 | int sy_2 = sy_1 + 1;
98 | int sx_2 = sx_1 + 1;
99 |
100 | uint8_t const_value[] = { const_value_st, const_value_st, const_value_st };
101 | float a2 = src_y - sy_1;
102 | float a1 = 1.0f - a2;
103 | float b2 = src_x - sx_1;
104 | float b1 = 1.0f - b2;
105 | float w11 = a1 * b1;
106 | float w12 = a1 * b2;
107 | float w21 = a2 * b1;
108 | float w22 = a2 * b2;
109 | uint8_t* p11 = const_value;
110 | uint8_t* p12 = const_value;
111 | uint8_t* p21 = const_value;
112 | uint8_t* p22 = const_value;
113 |
114 | /*if (sy_1 >= 0) {
115 | if (sx_1 >= 0)*/
116 | p11 = src + sy_1 * src_step_size + sx_1 * 3;
117 |
118 | //if (sx_2 < src_width)
119 | p12 = src + sy_1 * src_step_size + sx_2 * 3;
120 | //}
121 |
122 | /*if (sy_2 < src_height) {
123 | if (sx_1 >= 0)*/
124 | p21 = src + sy_2 * src_step_size + sx_1 * 3;
125 |
126 | /*if (sx_2 < src_width)*/
127 | p22 = src + sy_2 * src_step_size + sx_2 * 3;
128 | //}
129 |
130 | c0 = w11 * p11[0] + w12 * p12[0] + w21 * p21[0] + w22 * p22[0] + 0.5f;
131 | c1 = w11 * p11[1] + w12 * p12[1] + w21 * p21[1] + w22 * p22[1] + 0.5f;
132 | c2 = w11 * p11[2] + w12 * p12[2] + w21 * p21[2] + w22 * p22[2] + 0.5f;
133 | }
134 |
135 | // normalization
136 | c0 /= 255.0f;
137 | c1 /= 255.0f;
138 | c2 /= 255.0f;
139 |
140 | // bgrbgrbgr to rrrgggbbb
141 | int area = dst_width * dst_height;
142 | float* pdst_c0 = dst + dy * dst_width + dx;
143 | pdst_c0[0] = c2;
144 | pdst_c0[area] = c1;
145 | pdst_c0[2 * area] = c0;
146 | }
147 |
148 |
149 | void cuda_preprocess(cv::Mat& image, preproc_struct& image_trans, std::vector<void*>& bufferH,
150 | std::vector<void*>& bufferD, std::vector<int>& bindingsize, cudaStream_t& stream, cv::Size resize)
151 | {
152 | int h, w, h_p, w_p;
153 |
154 | float scale = cv::min((float)resize.height / (float)image.rows, (float)resize.width / (float)image.cols);
155 | scale = cv::min(scale, 1.1f);
156 |
157 | h = image.rows * scale;
158 | w = image.cols * scale;
159 | h_p = (resize.height - h) * 0.5f;
160 | w_p = (resize.width - w) * 0.5f;
161 |
162 | image_trans.scale = scale;
163 | image_trans.ori_h = image.rows;
164 | image_trans.ori_w = image.cols;
165 | image_trans.h_p = h_p;
166 | image_trans.w_p = w_p;
167 |
168 | // copy data to device memory
169 | cudaMemcpyAsync(bufferD[2], image.data, bindingsize[2], cudaMemcpyHostToDevice, stream);
170 |
171 | AffineMatrix d2s;
172 |
173 | d2s.value[0] = 1.0f / scale;
174 | d2s.value[1] = 0;
175 | d2s.value[2] = (image.cols - resize.width / scale + d2s.value[0] - 1) * 0.5f;
176 | d2s.value[3] = 0;
177 | d2s.value[4] = 1.0f / scale;
178 | d2s.value[5] = (image.rows - resize.height / scale + d2s.value[0] - 1) * 0.5f;
179 |
180 | // AffineMatrix s2d;
181 |
182 | /*s2d.value[0] = scale;
183 | s2d.value[1] = 0;
184 | s2d.value[2] = (resize.width - scale * image.cols + scale - 1) * 0.5f;
185 |
186 | s2d.value[3] = 0;
187 | s2d.value[4] = scale;
188 | s2d.value[5] = (resize.height - scale * image.rows + scale - 1) * 0.5f;*/
189 |
190 | /*cv::Mat m2x3_s2d(2, 3, CV_32F, s2d.value);
191 | cv::Mat m2x3_d2s(2, 3, CV_32F, d2s.value);
192 | cv::invertAffineTransform(m2x3_s2d, m2x3_d2s);
193 | memcpy(d2s.value, m2x3_d2s.ptr(0), sizeof(d2s.value));*/
194 |
195 | dim3 block(128, 1);
196 | dim3 grid((resize.width + block.x - 1) / block.x, (resize.height + block.y - 1) / block.y);
197 |
198 | warpaffine_nearest_bgrbgr2rrggbb_kernel <<< grid, block, 0, stream >>> (
199 | (uint8_t*)bufferD[2], image.cols * 3, image.cols,
200 | image.rows, (float*)bufferD[0], resize.width,
201 | resize.height, 0, d2s, h_p, w_p);
202 | }
203 |
--------------------------------------------------------------------------------
/cpp/kp_jetson_csi/preprocess.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include "utils_detection.h"
4 |
5 | #include <opencv2/opencv.hpp>
6 | #include <cuda_runtime.h>
7 |
8 | struct AffineMatrix
9 | {
10 | float value[6];
11 | };
12 |
13 | void cuda_preprocess(cv::Mat& image, preproc_struct& image_trans, std::vector<void*>& bufferH,
14 | std::vector<void*>& bufferD, std::vector<int>& bindingsize, cudaStream_t& stream, cv::Size resize);
15 |
--------------------------------------------------------------------------------
/cpp/kp_jetson_csi/trt_infer.cpp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MadaoFY/yolov5_TensorRT_inference/4cc1ec7316c63b101da3c842c1f98dc82c90e70c/cpp/kp_jetson_csi/trt_infer.cpp
--------------------------------------------------------------------------------
/cpp/kp_jetson_csi/trt_infer.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include <NvInfer.h>
3 |
4 | #include <cuda_runtime.h>
5 | #include <opencv2/opencv.hpp>
6 | #include <string>
7 | #include <vector>
8 | #include <unordered_map>
9 |
10 |
11 |
12 | class Logger : public nvinfer1::ILogger
13 | {
14 | public:
15 | Severity reportableSeverity;
16 |
17 | Logger(Severity severity = Severity::kINFO);
18 | void log(Severity severity, const char* msg) noexcept override;
19 | };
20 |
21 |
22 | bool load_engine(nvinfer1::IRuntime*& runtime, nvinfer1::ICudaEngine*& engine, const std::string& engine_dir,
23 | nvinfer1::ILogger& gLogger);
24 |
25 | void allocate_buffers(nvinfer1::ICudaEngine*& engine, std::vector<void*>& bufferH, std::vector<void*>& bufferD, std::vector<int>& bindingsize);
26 |
27 | float* det_inference(nvinfer1::IExecutionContext*& context, std::vector<void*>& bufferH, const std::vector<void*>& bufferD,
28 | const std::vector<int>& BindingSize, cudaStream_t& stream);
29 |
30 | float* kp_inference(nvinfer1::IExecutionContext*& context, std::vector<void*>& bufferH, const std::vector<void*>& bufferD,
31 | const std::vector<int>& BindingSize, cudaStream_t& stream);
32 |
33 |
34 | class yolo_trt_det
35 | {
36 | private:
37 |
38 | nvinfer1::IRuntime* det_runtime = nullptr;
39 | nvinfer1::ICudaEngine* det_engine = nullptr;
40 | nvinfer1::IExecutionContext* det_context = nullptr;
41 |
42 | nvinfer1::IRuntime* kp_runtime = nullptr;
43 | nvinfer1::ICudaEngine* kp_engine = nullptr;
44 | nvinfer1::IExecutionContext* kp_context = nullptr;
45 |
46 | std::unordered_map<int, std::string> catid_labels;
47 | std::vector<std::pair<int, int>> points_linker;
48 | color_dicts catid_colors;
49 | cv::Size img_resize;
50 | cv::Size kp_img_resize;
51 |
52 | bool v8_head;
53 |
54 | std::vector<void*> det_bufferh;
55 | std::vector<void*> det_bufferd;
56 | std::vector<int> det_bindingsize;
57 |
58 | std::vector<void*> kp_bufferh;
59 | std::vector<void*> kp_bufferd;
60 | std::vector<int> kp_bindingsize;
61 | cudaStream_t stream;
62 |
63 | int skip;
64 | std::vector< int > nms_idx;
65 | std::vector<cv::Rect> nms_boxes;
66 | std::vector<float> nms_scores;
67 | std::vector<int> nms_catid;
68 |
69 | uint64_t infer_times;
70 | uint32_t frams_num;
71 |
72 | public:
73 | yolo_trt_det() = default;
74 | yolo_trt_det(const std::string & det_engine_dir, const std::string & kp_engine_dir, const std::string & labels_dir,
75 | const std::string & pointlinker_dir, cv::Size img_size);
76 | ~yolo_trt_det();
77 |
78 | cv::Mat draw(cv::Mat & image, float conf, float iou, int max_det, int skip);
79 | };
80 |
--------------------------------------------------------------------------------
/cpp/kp_jetson_csi/utils_detection.cpp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MadaoFY/yolov5_TensorRT_inference/4cc1ec7316c63b101da3c842c1f98dc82c90e70c/cpp/kp_jetson_csi/utils_detection.cpp
--------------------------------------------------------------------------------
/cpp/kp_jetson_csi/utils_detection.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include <opencv2/opencv.hpp>
3 |
4 | #include <string>
5 | #include <vector>
6 | #include <array>
7 | #include <unordered_map>
8 |
9 |
10 |
11 | struct color_dicts
12 | {
13 | std::unordered_map<int, std::vector<int>> color_map;
14 | std::vector<int> catid;
15 |
16 | color_dicts() {};
17 | color_dicts(const std::unordered_map<int, std::string>& catid_labels);
18 | };
19 |
20 |
21 | struct preproc_struct
22 | {
23 | float* img = nullptr;
24 | float scale;
25 | int ori_h;
26 | int ori_w;
27 | int h_p;
28 | int w_p;
29 |
30 | ~preproc_struct();
31 | };
32 |
33 |
34 |
35 | std::unordered_map<int, std::string> yaml_load_labels(const std::string& dir = "data.yaml");
36 |
37 | std::vector<std::pair<int, int>> yaml_load_points_link(const std::string& dir);
38 |
39 | void preprocess(cv::Mat& image, preproc_struct& image_trans, const cv::Size& resize);
40 |
41 | void fliter_boxes(float* const boxes, bool v8_head, const std::array<int, 3>& output_shape, const float& conf_thres,
42 | std::vector<cv::Rect>& keep_boxes, std::vector<float>& keep_scores, std::vector<int>& keep_classes);
43 |
44 | void scale_boxes(cv::Rect& box, const preproc_struct& preproc_res);
45 |
46 | void draw_boxes(cv::Mat image, const cv::Rect& box, const float& score, const int& class_id,
47 | std::unordered_map<int, std::string> catid_labels, color_dicts& color_dicts);
48 |
49 | void imgresize(const cv::Mat& image, cv::Mat& input_image, const float& scale, cv::Size resize);
50 |
51 | template <typename T>
52 | static bool SortScorePairDescend(const std::pair<float, T>& pair1, const std::pair<float, T>& pair2);
53 |
54 | template <typename T>
55 | void max_score_idx(const std::vector<float>& scores, const float& score_thres, T& scores_idxs);
56 |
57 | float get_iou(const cv::Rect& bbox1, const cv::Rect& bbox2);
58 |
59 | void base_nms(const std::vector<cv::Rect>& bboxes, const std::vector<float>& scores, const std::vector<int>& catid,
60 | const float& score_threshold, const float& nms_threshold, std::vector<int>& indices, const int& limit);
61 |
62 | void get_final_preds(float* const heatmaps, preproc_struct& keypoints_trans, const std::array<int, 3>& output_shape,
63 | const cv::Rect& bbox, std::vector<float>& keypoints_scorce, std::vector<cv::Point>& keypoints);
64 |
65 | void draw_keypoints(cv::Mat image, const std::vector<cv::Point>& keypoints, std::vector<float>& keypoints_score,
66 | float score, const std::vector<std::pair<int, int>>& points_linker);
67 |
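For reference, a sketch of the usual heatmap decoding that get_final_preds performs: a per-joint argmax scaled back into the person box. The (num_joints, h, w) heatmap layout is an assumption, and any sub-pixel refinement done by the C++ code is omitted.

```python
import numpy as np

def decode_heatmaps(heatmaps, box_x, box_y, box_w, box_h):
    # heatmaps: (num_joints, h, w) array of per-joint confidence maps (assumed layout)
    num_joints, hm_h, hm_w = heatmaps.shape
    keypoints, scores = [], []
    for j in range(num_joints):
        idx = np.argmax(heatmaps[j])
        y, x = np.unravel_index(idx, (hm_h, hm_w))
        scores.append(float(heatmaps[j, y, x]))
        # map heatmap coordinates back into the detection box on the original image
        keypoints.append((box_x + (x + 0.5) / hm_w * box_w,
                          box_y + (y + 0.5) / hm_h * box_h))
    return keypoints, scores
```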
--------------------------------------------------------------------------------
/cpp/video_detect/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | cmake_minimum_required(VERSION 3.10)
2 |
3 | set(CMAKE_CXX_STANDARD 14)
4 | set(CMAKE_CXX_STANDARD_REQUIRED ON)
5 | set(CMAKE_CXX_EXTENSIONS ON)
6 |
7 | project(yolo_tensorrt C CXX)
8 |
9 | add_definitions(-DAPI_EXPORTS)
10 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
11 |
12 | if(NOT CMAKE_BUILD_TYPE)
13 | set(CMAKE_BUILD_TYPE Release CACHE STRING "Build Type" FORCE)
14 | set(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} -O3 -Wall")
15 | endif()
16 |
17 |
18 | set(src_list main.cpp utils_detection.cpp utils_detection.h trt_infer.cpp trt_infer.h preprocess.cu preprocess.h)
19 |
20 | # TODO(Call for PR): make cmake compatible with Windows
21 | set(CMAKE_CUDA_COMPILER E:/NV/cuda11.7/bin/nvcc)
22 | enable_language(CUDA)
23 |
24 | # CUDA
25 | # TODO(Call for PR): make cmake compatible with Windows
26 | find_package(CUDA REQUIRED)
27 | message(STATUS " libraries: ${CUDA_LIBRARIES}")
28 | message(STATUS " include path: ${CUDA_INCLUDE_DIRS}")
29 |
30 |
31 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different
32 | include_directories(E:/NV/cuda11.7/include/)
33 | link_directories(E:/NV/cuda11.7/lib/x64/)
34 |
35 |
36 | # tensorrt
37 | set(TRT_DIR J:/tensorrt/TensorRT-8.4.3.1)
38 | set(TRT_INCLUDE_DIRS ${TRT_DIR}/include/)
39 | set(TRT_LIB_DIRS ${TRT_DIR}/lib/)
40 |
41 | include_directories(${TRT_INCLUDE_DIRS})
42 |
43 |
44 | # opencv
45 | set(CMAKE_PREFIX_PATH E:/opencv/build/x64/vc16/lib)
46 | find_package(OpenCV REQUIRED)
47 | include_directories( ${OpenCV_INCLUDE_DIRS} )
48 |
49 | add_executable(${PROJECT_NAME} ${src_list})
50 | target_link_libraries(${PROJECT_NAME} nvinfer)
51 | target_link_libraries(${PROJECT_NAME} cudart)
52 | target_link_libraries(${PROJECT_NAME} ${OpenCV_LIBS})
--------------------------------------------------------------------------------
/cpp/video_detect/README.md:
--------------------------------------------------------------------------------
1 | # video_detect
2 | C++ code for video object detection.
3 | Build with cmake, then run yolo_tensorrt (the target name set in this folder's CMakeLists.txt).
4 |
5 | ```shell
6 | yolo_tensorrt --engine_dir=./yolov5s.engine --video_dir=./sample_1080p_h265.mp4 --labels=./labels_coco.yaml
7 | ```
8 |
9 | Parameters:
10 | - ```--engine_dir``` path of the TensorRT engine
11 | - ```--video_dir``` path of the source video
12 | - ```--labels``` label file of the model
13 | - ```--conf_thres``` confidence threshold used by NMS
14 | - ```--iou_thres``` IoU threshold used by NMS
15 | - ```--max_det``` maximum number of detections output by NMS
16 |
17 | More detailed parameter descriptions can be found in main.cpp.
18 |
--------------------------------------------------------------------------------
/cpp/video_detect/labels_coco.yaml:
--------------------------------------------------------------------------------
1 | 0: person
2 | 1: bicycle
3 | 2: car
4 | 3: motorcycle
5 | 4: airplane
6 | 5: bus
7 | 6: train
8 | 7: truck
9 | 8: boat
10 | 9: traffic light
11 | 10: fire hydrant
12 | 11: stop sign
13 | 12: parking meter
14 | 13: bench
15 | 14: bird
16 | 15: cat
17 | 16: dog
18 | 17: horse
19 | 18: sheep
20 | 19: cow
21 | 20: elephant
22 | 21: bear
23 | 22: zebra
24 | 23: giraffe
25 | 24: backpack
26 | 25: umbrella
27 | 26: handbag
28 | 27: tie
29 | 28: suitcase
30 | 29: frisbee
31 | 30: skis
32 | 31: snowboard
33 | 32: sports ball
34 | 33: kite
35 | 34: baseball bat
36 | 35: baseball glove
37 | 36: skateboard
38 | 37: surfboard
39 | 38: tennis racket
40 | 39: bottle
41 | 40: wine glass
42 | 41: cup
43 | 42: fork
44 | 43: knife
45 | 44: spoon
46 | 45: bowl
47 | 46: banana
48 | 47: apple
49 | 48: sandwich
50 | 49: orange
51 | 50: broccoli
52 | 51: carrot
53 | 52: hot dog
54 | 53: pizza
55 | 54: donut
56 | 55: cake
57 | 56: chair
58 | 57: couch
59 | 58: potted plant
60 | 59: bed
61 | 60: dining table
62 | 61: toilet
63 | 62: tv
64 | 63: laptop
65 | 64: mouse
66 | 65: remote
67 | 66: keyboard
68 | 67: cell phone
69 | 68: microwave
70 | 69: oven
71 | 70: toaster
72 | 71: sink
73 | 72: refrigerator
74 | 73: book
75 | 74: clock
76 | 75: vase
77 | 76: scissors
78 | 77: teddy bear
79 | 78: hair drier
80 | 79: toothbrush
81 |
--------------------------------------------------------------------------------
/cpp/video_detect/main.cpp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MadaoFY/yolov5_TensorRT_inference/4cc1ec7316c63b101da3c842c1f98dc82c90e70c/cpp/video_detect/main.cpp
--------------------------------------------------------------------------------
/cpp/video_detect/preprocess.cu:
--------------------------------------------------------------------------------
1 | #include "preprocess.h"
2 |
3 | #include
4 |
5 |
6 | __global__ void warpaffine_nearest_bgrbgr2rrggbb_kernel(
7 | uint8_t* src, int src_step_size, int src_width,
8 | int src_height, float* dst, int dst_width,
9 | int dst_height, uint8_t const_value_st,
10 | AffineMatrix d2s, int h_p, int w_p)
11 | {
12 | int dx = blockDim.x * blockIdx.x + threadIdx.x;
13 | int dy = blockDim.y * blockIdx.y + threadIdx.y;
14 | if (dx >= dst_width || dy >= dst_height) return;
15 |
16 | float m_x1 = d2s.value[0];
17 | float m_y1 = d2s.value[1];
18 | float m_z1 = d2s.value[2];
19 | float m_x2 = d2s.value[3];
20 | float m_y2 = d2s.value[4];
21 | float m_z2 = d2s.value[5];
22 |
23 | float c0, c1, c2;
24 | if (dy < h_p || dy >(dst_height - h_p) || dx < w_p || dx >(dst_width - w_p))
25 | {
26 | // out of range
27 | c0 = const_value_st;
28 | c1 = const_value_st;
29 | c2 = const_value_st;
30 | }
31 | else
32 | {
33 | float src_x = m_x1 * (dx + 0.5f) + m_y1 * dy + m_z1 - 0.5f;
34 | float src_y = m_x2 * dx + m_y2 * (dy + 0.5f) + m_z2 - 0.5f;
35 |
36 | int sy_1 = floorf(src_y + 0.5f);
37 | int sx_1 = floorf(src_x + 0.5f);
38 |
39 | uint8_t const_value[] = { const_value_st, const_value_st, const_value_st };
40 | uint8_t* p = const_value;
41 |
42 | if (sy_1 >= 0 && sy_1 < src_height && sx_1 >= 0 && sx_1 < src_width)  // strict bounds: valid indices are [0, size - 1]
43 | {
44 | p = src + sy_1 * src_step_size + sx_1 * 3;
45 | }
46 |
47 | c0 = p[0];
48 | c1 = p[1];
49 | c2 = p[2];
50 | }
51 |
52 | // normalization
53 | c0 = c0 / 255.0f;
54 | c1 = c1 / 255.0f;
55 | c2 = c2 / 255.0f;
56 |
57 | // bgrbgrbgr to rrrgggbbb
58 | int area = dst_width * dst_height;
59 | float* pdst_c0 = dst + dy * dst_width + dx;
60 | pdst_c0[0] = c2;
61 | pdst_c0[area] = c1;
62 | pdst_c0[2 * area] = c0;
63 | }
64 |
65 | __global__ void warpaffine_bilinear_bgrbgr2rrggbb_kernel(
66 | uint8_t* src, int src_step_size, int src_width,
67 | int src_height, float* dst, int dst_width,
68 | int dst_height, uint8_t const_value_st,
69 | AffineMatrix d2s, int h_p, int w_p)
70 | {
71 | int dx = blockDim.x * blockIdx.x + threadIdx.x;
72 | int dy = blockDim.y * blockIdx.y + threadIdx.y;
73 | if (dx >= dst_width || dy >= dst_height) return;
74 |
75 | float m_x1 = d2s.value[0];
76 | float m_y1 = d2s.value[1];
77 | float m_z1 = d2s.value[2];
78 | float m_x2 = d2s.value[3];
79 | float m_y2 = d2s.value[4];
80 | float m_z2 = d2s.value[5];
81 |
82 | float c0, c1, c2;
83 | if (dy < h_p || dy >(dst_height - h_p) || dx < w_p || dx >(dst_width - w_p))
84 | {
85 | // out of range
86 | c0 = const_value_st;
87 | c1 = const_value_st;
88 | c2 = const_value_st;
89 | }
90 | else
91 | {
92 | float src_x = m_x1 * (dx + 0.5f) + m_y1 * dy + m_z1 - 0.5f;
93 | float src_y = m_x2 * dx + m_y2 * (dy + 0.5f) + m_z2 - 0.5f;
94 |
95 | int sy_1 = floorf(src_y);
96 | int sx_1 = floorf(src_x);
97 | int sy_2 = sy_1 + 1;
98 | int sx_2 = sx_1 + 1;
99 |
100 | uint8_t const_value[] = { const_value_st, const_value_st, const_value_st };
101 | float a2 = src_y - sy_1;
102 | float a1 = 1.0f - a2;
103 | float b2 = src_x - sx_1;
104 | float b1 = 1.0f - b2;
105 | float w11 = a1 * b1;
106 | float w12 = a1 * b2;
107 | float w21 = a2 * b1;
108 | float w22 = a2 * b2;
109 | uint8_t* p11 = const_value;
110 | uint8_t* p12 = const_value;
111 | uint8_t* p21 = const_value;
112 | uint8_t* p22 = const_value;
113 |
114 | /*if (sy_1 >= 0) {
115 | if (sx_1 >= 0)*/
116 | p11 = src + sy_1 * src_step_size + sx_1 * 3;
117 |
118 | //if (sx_2 < src_width)
119 | p12 = src + sy_1 * src_step_size + sx_2 * 3;
120 | //}
121 |
122 | /*if (sy_2 < src_height) {
123 | if (sx_1 >= 0)*/
124 | p21 = src + sy_2 * src_step_size + sx_1 * 3;
125 |
126 | /*if (sx_2 < src_width)*/
127 | p22 = src + sy_2 * src_step_size + sx_2 * 3;
128 | //}
129 |
130 | c0 = w11 * p11[0] + w12 * p12[0] + w21 * p21[0] + w22 * p22[0] + 0.5f;
131 | c1 = w11 * p11[1] + w12 * p12[1] + w21 * p21[1] + w22 * p22[1] + 0.5f;
132 | c2 = w11 * p11[2] + w12 * p12[2] + w21 * p21[2] + w22 * p22[2] + 0.5f;
133 | }
134 |
135 | // normalization
136 | c0 /= 255.0f;
137 | c1 /= 255.0f;
138 | c2 /= 255.0f;
139 |
140 | // bgrbgrbgr to rrrgggbbb
141 | int area = dst_width * dst_height;
142 | float* pdst_c0 = dst + dy * dst_width + dx;
143 | dst[dy * dst_width + dx] = c2;
144 | dst[dy * dst_width + dx + area] = c1;
145 | dst[dy * dst_width + dx + 2 * area] = c0;
146 | }
147 |
148 |
149 | void cuda_preprocess(cv::Mat& image, preproc_struct& image_trans, std::vector& bufferH,
150 | std::vector& bufferD, std::vector& bindingsize, cudaStream_t& stream, cv::Size resize)
151 | {
152 | int h, w, h_p, w_p;
153 |
154 | float scale = cv::min((float)resize.height / (float)image.rows, (float)resize.width / (float)image.cols);
155 | scale = cv::min(scale, 1.1f);
156 |
157 | h = image.rows * scale;
158 | w = image.cols * scale;
159 | h_p = (resize.height - h) * 0.5f;
160 | w_p = (resize.width - w) * 0.5f;
161 |
162 | image_trans.scale = scale;
163 | image_trans.h_p = h_p;
164 | image_trans.w_p = w_p;
165 |
166 | // copy data to device memory
167 | memcpy(bufferH[2], image.data, bindingsize[2]);
168 | cudaMemcpyAsync(bufferD[2], bufferH[2], bindingsize[2], cudaMemcpyHostToDevice, stream);
169 |
170 | // AffineMatrix s2d;
171 | /*s2d.value[0] = scale;
172 | s2d.value[1] = 0;
173 | s2d.value[2] = (resize.width - scale * image.cols + scale - 1) * 0.5f;
174 | s2d.value[3] = 0;
175 | s2d.value[4] = scale;
176 | s2d.value[5] = (resize.height - scale * image.rows + scale - 1) * 0.5f;*/
177 |
178 | /*cv::Mat m2x3_s2d(2, 3, CV_32F, s2d.value);
179 | cv::Mat m2x3_d2s(2, 3, CV_32F, d2s.value);
180 | cv::invertAffineTransform(m2x3_s2d, m2x3_d2s);
181 | memcpy(d2s.value, m2x3_d2s.ptr(0), sizeof(d2s.value));*/
182 |
183 | AffineMatrix d2s;
184 |
185 | d2s.value[0] = 1.0f / scale;
186 | d2s.value[1] = 0;
187 | d2s.value[2] = (image.cols - resize.width / scale + d2s.value[0] - 1) * 0.5f;
188 | d2s.value[3] = 0;
189 | d2s.value[4] = 1.0f / scale;
190 | d2s.value[5] = (image.rows - resize.height / scale + d2s.value[0] - 1) * 0.5f;
191 |
192 | dim3 block(128, 1);
193 | dim3 grid((resize.width + block.x - 1) / block.x, (resize.height + block.y - 1) / block.y);
194 |
195 | warpaffine_nearest_bgrbgr2rrggbb_kernel <<< grid, block, 0, stream >>> (
196 | (uint8_t*)bufferD[2], image.cols * 3, image.cols,
197 | image.rows, (float*)bufferD[0], resize.width,
198 | resize.height, 0, d2s, h_p, w_p);
199 | }
--------------------------------------------------------------------------------
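The launch in cuda_preprocess() only passes the destination-to-source mapping to the kernel; the closed-form `d2s` it fills in is the inverse of the resize-and-center-pad matrix sketched in the commented-out `s2d` block. A minimal numpy check of that relationship, with illustrative frame and input sizes (not code from this repository):

```python
import cv2 as cv
import numpy as np

# Illustrative sizes: a 1080p frame letterboxed into a 512x512 network input.
src_h, src_w = 1080, 1920
dst_h, dst_w = 512, 512
scale = min(dst_h / src_h, dst_w / src_w, 1.1)

# Source -> destination: scale the image, then pad it to the center (the commented-out s2d).
s2d = np.array([[scale, 0.0, (dst_w - scale * src_w + scale - 1) * 0.5],
                [0.0, scale, (dst_h - scale * src_h + scale - 1) * 0.5]])

# Destination -> source, written in closed form exactly as in cuda_preprocess().
d2s = np.array([[1 / scale, 0.0, (src_w - dst_w / scale + 1 / scale - 1) * 0.5],
                [0.0, 1 / scale, (src_h - dst_h / scale + 1 / scale - 1) * 0.5]])

# d2s is simply s2d inverted, which is why the kernel never needs the forward matrix.
print(np.allclose(cv.invertAffineTransform(s2d), d2s))  # True
```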
/cpp/video_detect/preprocess.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include "utils_detection.h"
4 |
5 | #include
6 | #include
7 |
8 | struct AffineMatrix
9 | {
10 | float value[6];
11 | };
12 |
13 | void cuda_preprocess(cv::Mat& image, preproc_struct& image_trans, std::vector& bufferH,
14 | std::vector& bufferD, std::vector& bindingsize, cudaStream_t& stream, cv::Size resize);
--------------------------------------------------------------------------------
/cpp/video_detect/trt_infer.cpp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MadaoFY/yolov5_TensorRT_inference/4cc1ec7316c63b101da3c842c1f98dc82c90e70c/cpp/video_detect/trt_infer.cpp
--------------------------------------------------------------------------------
/cpp/video_detect/trt_infer.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include
3 |
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include
9 |
10 |
11 |
12 | class Logger : public nvinfer1::ILogger
13 | {
14 | public:
15 | Severity reportableSeverity;
16 |
17 | Logger(Severity severity = Severity::kINFO);
18 | void log(Severity severity, const char* msg) noexcept override;
19 | };
20 |
21 |
22 | bool load_engine(nvinfer1::IRuntime*& runtime, nvinfer1::ICudaEngine*& engine, const std::string& engine_dir,
23 | nvinfer1::ILogger& gLogger);
24 |
25 | void allocate_buffers(nvinfer1::ICudaEngine*& engine,
26 | std::vector& bufferH, std::vector& bufferD, std::vector& bindingsize, cv::Size img_size);
27 |
28 | float* do_inference(nvinfer1::IExecutionContext*& context, std::vector& bufferH, const std::vector& bufferD,
29 | cudaStream_t& stream, const std::vector& BindingSize);
30 |
31 |
32 | class yolo_trt_det
33 | {
34 | private:
35 |
36 | nvinfer1::IRuntime* _runtime = nullptr;
37 | nvinfer1::ICudaEngine* _engine = nullptr;
38 | nvinfer1::IExecutionContext* _context = nullptr;
39 |
40 | std::unordered_map catid_labels;
41 | color_dicts catid_colors;
42 | cv::Size set_size;
43 | bool v8_head;
44 |
45 | std::vector cpu_buffer;
46 | std::vector gpu_buffer;
47 | std::vector BindingSize;
48 | cudaStream_t stream;
49 |
50 | public:
51 | yolo_trt_det(const std::string& engine_dir, const std::string& labels_dir, cv::Size img_size);
52 | ~yolo_trt_det();
53 |
54 | //std::vector draw_batch(std::vector& image_list, float conf, float iou, int max_det);
55 |
56 | cv::Mat draw(cv::Mat& image, float conf, float iou, int max_det);
57 | };
--------------------------------------------------------------------------------
/cpp/video_detect/utils_detection.cpp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MadaoFY/yolov5_TensorRT_inference/4cc1ec7316c63b101da3c842c1f98dc82c90e70c/cpp/video_detect/utils_detection.cpp
--------------------------------------------------------------------------------
/cpp/video_detect/utils_detection.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include
3 |
4 | #include
5 | #include
6 | #include
7 | #include
8 |
9 |
10 |
11 | struct color_dicts
12 | {
13 | std::unordered_map> color_map;
14 | std::vector catid;
15 |
16 | color_dicts(const std::unordered_map& catid_labels);
17 | };
18 |
19 |
20 | struct preproc_struct
21 | {
22 | float* img = nullptr;
23 | float scale;
24 | int h_p;
25 | int w_p;
26 |
27 | ~preproc_struct();
28 | };
29 |
30 |
31 |
32 | std::unordered_map yaml_load_labels(const std::string& dir = "data.yaml");
33 |
34 | void preprocess(cv::Mat& image, preproc_struct& image_trans, cv::Size resize);
35 |
36 | void fliter_boxes(float* const boxes, bool v8_head, const std::array& output_shape, float conf_thres,
37 | std::vector& keep_boxes, std::vector& keep_scores, std::vector& keep_classes);
38 |
39 | void scale_boxes(cv::Rect& box, const preproc_struct& preproc_res);
40 |
41 | void draw_boxes(cv::Mat image, const cv::Rect& box, float score, int class_id,
42 | std::unordered_map catid_labels, color_dicts& color_dicts);
43 |
44 | void imgresize(const cv::Mat& image, cv::Mat& input_image, float scale, cv::Size resize);
45 |
46 | template
47 | static bool SortScorePairDescend(const std::pair& pair1, const std::pair& pair2);
48 |
49 | template
50 | void max_score_idx(const std::vector& scores, float score_thres, T scores_idxs);
51 |
52 | float get_iou(const cv::Rect& bbox1, const cv::Rect& bbox2);
53 |
54 | void base_nms(const std::vector& bboxes, const std::vector& scores, const std::vector& catid, float score_threshold, float nms_threshold, std::vector& indices, int limit);
55 |
--------------------------------------------------------------------------------
/doc/yolov5s_det.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MadaoFY/yolov5_TensorRT_inference/4cc1ec7316c63b101da3c842c1f98dc82c90e70c/doc/yolov5s_det.png
--------------------------------------------------------------------------------
/labels_coco.yaml:
--------------------------------------------------------------------------------
1 | 0: person
2 | 1: bicycle
3 | 2: car
4 | 3: motorcycle
5 | 4: airplane
6 | 5: bus
7 | 6: train
8 | 7: truck
9 | 8: boat
10 | 9: traffic light
11 | 10: fire hydrant
12 | 11: stop sign
13 | 12: parking meter
14 | 13: bench
15 | 14: bird
16 | 15: cat
17 | 16: dog
18 | 17: horse
19 | 18: sheep
20 | 19: cow
21 | 20: elephant
22 | 21: bear
23 | 22: zebra
24 | 23: giraffe
25 | 24: backpack
26 | 25: umbrella
27 | 26: handbag
28 | 27: tie
29 | 28: suitcase
30 | 29: frisbee
31 | 30: skis
32 | 31: snowboard
33 | 32: sports ball
34 | 33: kite
35 | 34: baseball bat
36 | 35: baseball glove
37 | 36: skateboard
38 | 37: surfboard
39 | 38: tennis racket
40 | 39: bottle
41 | 40: wine glass
42 | 41: cup
43 | 42: fork
44 | 43: knife
45 | 44: spoon
46 | 45: bowl
47 | 46: banana
48 | 47: apple
49 | 48: sandwich
50 | 49: orange
51 | 50: broccoli
52 | 51: carrot
53 | 52: hot dog
54 | 53: pizza
55 | 54: donut
56 | 55: cake
57 | 56: chair
58 | 57: couch
59 | 58: potted plant
60 | 59: bed
61 | 60: dining table
62 | 61: toilet
63 | 62: tv
64 | 63: laptop
65 | 64: mouse
66 | 65: remote
67 | 66: keyboard
68 | 67: cell phone
69 | 68: microwave
70 | 69: oven
71 | 70: toaster
72 | 71: sink
73 | 72: refrigerator
74 | 73: book
75 | 74: clock
76 | 75: vase
77 | 76: scissors
78 | 77: teddy bear
79 | 78: hair drier
80 | 79: toothbrush
81 |
--------------------------------------------------------------------------------
/labels_voc.yaml:
--------------------------------------------------------------------------------
1 | 0: bus
2 | 1: train
3 | 2: cow
4 | 3: diningtable
5 | 4: motorbike
6 | 5: horse
7 | 6: sofa
8 | 7: bicycle
9 | 8: tvmonitor
10 | 9: aeroplane
11 | 10: boat
12 | 11: sheep
13 | 12: pottedplant
14 | 13: bird
15 | 14: cat
16 | 15: bottle
17 | 16: dog
18 | 17: car
19 | 18: chair
20 | 19: person
21 |
--------------------------------------------------------------------------------
/models_onnx/README.md:
--------------------------------------------------------------------------------
1 | Directory for the ONNX models to be quantized and converted.
2 |
--------------------------------------------------------------------------------
/models_trt/README.md:
--------------------------------------------------------------------------------
1 | Directory for the exported engines.
2 |
--------------------------------------------------------------------------------
/onnx2trt.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | import numpy as np
4 | import tensorrt as trt
5 |
6 |
7 | from utils import calibrator
8 |
9 | __all__ = [
10 | 'build_engine',
11 | 'onnx2trt'
12 | ]
13 |
14 |
15 | def AddEfficientNMSPlugin(conf_thres=0.25, iou_thres=0.45, max_det=200, box_coding=1):
16 | """
17 | Add the EfficientNMS_TRT plugin.
18 | 
19 | score_threshold: conf_thres
20 | iou_threshold: iou_thres
21 | max_output_boxes: max_det
22 | box_coding: 0->[x1, y1, x2, y2], 1->[x, y, w, h]
23 | """
24 | for c in trt.get_plugin_registry().plugin_creator_list:
25 | if c.name == "EfficientNMS_TRT":
26 | print(f'Succeeded finding {c.name}')
27 | parameter = [
28 | trt.PluginField("score_threshold", np.float32(conf_thres), trt.PluginFieldType.FLOAT32),
29 | trt.PluginField("iou_threshold", np.float32(iou_thres), trt.PluginFieldType.FLOAT32),
30 | trt.PluginField("max_output_boxes", np.int32(max_det), trt.PluginFieldType.INT32),
31 | trt.PluginField("background_class", np.int32(-1), trt.PluginFieldType.INT32), # background_class: -1, no background class
32 | trt.PluginField("score_activation", np.int32(0), trt.PluginFieldType.INT32), # score_activation: 0->False, 1->True
33 | trt.PluginField("box_coding", np.int32(box_coding), trt.PluginFieldType.INT32)
34 | ]
35 | return c.create_plugin(c.name, trt.PluginFieldCollection(parameter))
36 | return None
37 |
38 |
39 | def build_engine(
40 | onnx_file, model_engine, min_shape, opt_shape, max_shape,
41 | fp16=False, int8=False, imgs_dir=None, imgs_list=None, n_iteration=128, cache_file=None,
42 | v8_head=False, add_nms=False, conf_thres=0.25, iou_thres=0.45, max_det=200, box_coding=1
43 | ):
44 | logger = trt.Logger(trt.Logger.ERROR)
45 | trt.init_libnvinfer_plugins(logger, namespace="")
46 | builder = trt.Builder(logger)
47 | network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
48 | config = builder.create_builder_config()
49 | config.max_workspace_size = (4 << 30)
50 | # config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 4 << 30)
51 |
52 | # Parse model file
53 | parser = trt.OnnxParser(network, logger)
54 | if not os.path.exists(onnx_file):
55 | print("ONNX file is not exists!")
56 | exit()
57 | print("Succeeded finding .onnx file!")
58 | with open(onnx_file, "rb") as model:
59 | if not parser.parse(model.read()):
60 | print("Failed parsing .onnx file!")
61 | for error in range(parser.num_errors):
62 | print(parser.get_error(error))
63 | exit()
64 | else:
65 | print("Succeeded parsing .onnx file!")
66 |
67 | if v8_head:
68 | outputTensor = network.get_output(0)
69 | print(f'v8 {outputTensor.name} shape:{outputTensor.shape}')
70 | network.unmark_output(outputTensor)
71 | outputTensor = network.add_shuffle(outputTensor)
72 | outputTensor.first_transpose = (0, 2, 1)
73 | network.mark_output(outputTensor.get_output(0))
74 |
75 | # add the NMS operator
76 | if add_nms:
77 | """
78 | Preprocess the raw output: split it into a box matrix and a class-score matrix; the objectness score is multiplied into the class scores.
79 | [1, 8500, 4 + 1 + 80] ——> [1, 8500, 4] + [1, 8500, 1 + 80] ——> [1, 8500, 4] + [1, 8500, 80]
80 | """
81 | outputTensor = network.get_output(0)
82 | print(f'{outputTensor.name} shape:{outputTensor.shape}')
83 | bs, num_boxes, det_res = outputTensor.shape
84 | network.unmark_output(outputTensor)
85 | xycwh = network.add_slice(outputTensor, (0, 0, 0), (bs, num_boxes, 4), (1, 1, 1))
86 | if v8_head:
87 | obj = network.add_slice(
88 | outputTensor, (0, 0, 4), (bs, num_boxes, det_res - 4), (1, 1, 1)
89 | )
90 | else:
91 | scores = network.add_slice(outputTensor, (0, 0, 4), (bs, num_boxes, 1), (1, 1, 1))
92 | obj = network.add_slice(outputTensor, (0, 0, 5), (bs, num_boxes, det_res - 5), (1, 1, 1))
93 | obj = network.add_elementwise(
94 | scores.get_output(0), obj.get_output(0), trt.ElementWiseOperation.PROD
95 | )
96 | print('Add EfficientNMS_TRT!')
97 | nms = AddEfficientNMSPlugin(conf_thres, iou_thres, max_det, box_coding)
98 | pluginlayer = network.add_plugin_v2([xycwh.get_output(0), obj.get_output(0)], nms)
99 | pluginlayer.get_output(0).name = "num_dets"
100 | pluginlayer.get_output(1).name = "det_boxes"
101 | pluginlayer.get_output(2).name = "det_scores"
102 | pluginlayer.get_output(3).name = "det_classes"
103 | for i in range(4):
104 | network.mark_output(pluginlayer.get_output(i))
105 |
106 | inputTensor = network.get_input(0)
107 | print(f'{inputTensor.name} shape:{inputTensor.shape}')
108 | batch, c, h, w = inputTensor.shape
109 | if batch != -1:
110 | min_shape[0], opt_shape[0], max_shape[0] = batch, batch, batch
111 | if c != -1:
112 | min_shape[1], opt_shape[1], max_shape[1] = c, c, c
113 | if h != -1:
114 | min_shape[-2], opt_shape[-2], max_shape[-2] = h, h, h
115 | if w != -1:
116 | min_shape[-1], opt_shape[-1], max_shape[-1] = w, w, w
117 |
118 | profile = builder.create_optimization_profile()
119 | profile.set_shape(inputTensor.name, min_shape, opt_shape, max_shape)
120 | config.add_optimization_profile(profile)
121 |
122 | # Quantization
123 | if fp16:
124 | config.set_flag(trt.BuilderFlag.FP16)
125 | if int8 and imgs_dir:
126 | config.set_flag(trt.BuilderFlag.INT8)
127 | if imgs_list is None:
128 | imgs_list = os.listdir(imgs_dir)
129 | config.int8_calibrator = calibrator.MyCalibrator(
130 | calibrationpath=imgs_dir,
131 | imgslist=imgs_list,
132 | nCalibration=n_iteration,
133 | inputShape=max_shape,
134 | cacheFile=cache_file
135 | )
136 |
137 |
138 | print('Now, engine is building!')
139 | plan = builder.build_serialized_network(network, config)
140 | if plan is None:
141 | print("Failed building engine!")
142 | exit()  # stop here instead of writing an empty engine file
143 | with open(model_engine, "wb") as f:
144 | f.write(plan)
145 | print('Engine has been built!!!')
146 |
147 | runtime = trt.Runtime(logger)
148 | return runtime.deserialize_cuda_engine(plan)
149 |
150 |
151 | class onnx2trt:
152 | """
153 | Parses an ONNX graph and builds a TensorRT engine from it.
154 | """
155 | def __init__(self, verbose=False):
156 |
157 | self.logger = trt.Logger(trt.Logger.ERROR)
158 | if verbose:
159 | self.logger = trt.Logger(trt.Logger.INFO)
160 | self.logger.min_severity = trt.Logger.Severity.VERBOSE
161 |
162 | trt.init_libnvinfer_plugins(self.logger, namespace="")
163 |
164 | self.builder = trt.Builder(self.logger)
165 | self.config = self.builder.create_builder_config()
166 | self.config.max_workspace_size = (4 << 30)
167 | # self.config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 4 << 30)
168 |
169 | self.network = None
170 | self.profile = None
171 | self.parser = None
172 |
173 | self.FP16 = False
174 | self.INT8 = False
175 |
176 | def create_network(
177 | self, onnx_dir, v8_head=False, add_nms=False, conf_thres=0.25, iou_thres=0.45, max_det=200, box_coding=1
178 | ):
179 |
180 | self.network = self.builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
181 | # Parse model file
182 | self.parser = trt.OnnxParser(self.network, self.logger)
183 | if not os.path.exists(onnx_dir):
184 | print("ONNX file is not exists!")
185 | exit()
186 | print("Succeeded finding .onnx file!")
187 | with open(onnx_dir, "rb") as model:
188 | if not self.parser.parse(model.read()):
189 | print("Failed parsing .onnx file!")
190 | for error in range(self.parser.num_errors):
191 | print(self.parser.get_error(error))
192 | exit()
193 | else:
194 | print("Succeeded parsing .onnx file!")
195 |
196 | if v8_head:
197 | outputTensor = self.network.get_output(0)
198 | print(f'v8 {outputTensor.name} shape:{outputTensor.shape}')
199 | self.network.unmark_output(outputTensor)
200 | outputTensor = self.network.add_shuffle(outputTensor)
201 | # (bs, det_res, num_boxes ) -> (bs, num_boxes, det_res)
202 | outputTensor.first_transpose = (0, 2, 1)
203 | self.network.mark_output(outputTensor.get_output(0))
204 |
205 | # add the NMS operator
206 | if add_nms:
207 | """
208 | Preprocess the raw output: split it into a box matrix and a class-score matrix; the objectness score is multiplied into the class scores.
209 | [1, 8500, 4 + 1 + 80] ——> [1, 8500, 4] + [1, 8500, 1 + 80] ——> [1, 8500, 4] + [1, 8500, 80]
210 | """
211 | outputTensor = self.network.get_output(0)
212 | print(f'{outputTensor.name} shape:{outputTensor.shape}')
213 | bs, num_boxes, det_res = outputTensor.shape
214 | self.network.unmark_output(outputTensor)
215 | xycwh = self.network.add_slice(outputTensor, (0, 0, 0), (bs, num_boxes, 4), (1, 1, 1))
216 | if v8_head:
217 | obj = self.network.add_slice(
218 | outputTensor, (0, 0, 4), (bs, num_boxes, det_res - 4), (1, 1, 1)
219 | )
220 | else:
221 | scores = self.network.add_slice(outputTensor, (0, 0, 4), (bs, num_boxes, 1), (1, 1, 1))
222 | obj = self.network.add_slice(outputTensor, (0, 0, 5), (bs, num_boxes, det_res - 5), (1, 1, 1))
223 | obj = self.network.add_elementwise(
224 | scores.get_output(0), obj.get_output(0), trt.ElementWiseOperation.PROD
225 | )
226 | print('Add EfficientNMS_TRT!')
227 | nms = AddEfficientNMSPlugin(conf_thres, iou_thres, max_det, box_coding)
228 | pluginlayer = self.network.add_plugin_v2([xycwh.get_output(0), obj.get_output(0)], nms)
229 | pluginlayer.get_output(0).name = "num_dets"
230 | pluginlayer.get_output(1).name = "det_boxes"
231 | pluginlayer.get_output(2).name = "det_scores"
232 | pluginlayer.get_output(3).name = "det_classes"
233 | for i in range(4):
234 | self.network.mark_output(pluginlayer.get_output(i))
235 |
236 |
237 | def create_engine(self, engine_dir, min_shape, opt_shape, max_shape, fp16=False, int8=False,
238 | imgs_dir=None, n_iteration=128, cache_file=None):
239 |
240 | self.FP16 = fp16
241 | self.INT8 = int8
242 |
243 | inputTensor = self.network.get_input(0)
244 | print(f'{inputTensor.name} shape:{inputTensor.shape}')
245 | batch, c, h, w = inputTensor.shape
246 | if batch != -1:
247 | min_shape[0], opt_shape[0], max_shape[0] = batch, batch, batch
248 | if c != -1:
249 | min_shape[1], opt_shape[1], max_shape[1] = c, c, c
250 | if h != -1:
251 | min_shape[-2], opt_shape[-2], max_shape[-2] = h, h, h
252 | if w != -1:
253 | min_shape[-1], opt_shape[-1], max_shape[-1] = w, w, w
254 |
255 | self.profile = self.builder.create_optimization_profile()
256 | self.profile.set_shape(inputTensor.name, min_shape, opt_shape, max_shape)
257 | self.config.add_optimization_profile(self.profile)
258 |
259 | # Quantization
260 | if self.FP16:
261 | self.config.set_flag(trt.BuilderFlag.FP16)
262 | if self.INT8:
263 | assert imgs_dir, 'If you choose int8, you should also set imgs_dir for the calibration'
264 | self.config.set_flag(trt.BuilderFlag.INT8)
265 | imgs_list = os.listdir(imgs_dir)
266 | calib = calibrator.MyCalibrator(
267 | calibrationpath=imgs_dir,
268 | imgslist=imgs_list,
269 | nCalibration=n_iteration,
270 | inputShape=max_shape,
271 | cacheFile=cache_file
272 | )
273 | self.config.int8_calibrator = calib
274 |
275 | print('Now, engine is building...')
276 | t1 = time.time()
277 | plan = self.builder.build_serialized_network(self.network, self.config)
278 | t2 = time.time()
279 | print(f'{(t2 - t1)/60:0.2f}min')
280 | if plan is None:
281 | print("Failed building engine!")
282 | exit()  # stop here instead of writing an empty engine file
283 | with open(engine_dir, "wb") as f:
284 | f.write(plan)
285 | print('Engine has been built!!!')
286 |
287 | runtime = trt.Runtime(self.logger)
288 | return runtime.deserialize_cuda_engine(plan)
289 |
290 |
291 | def main(args):
292 |
293 | onnx_dir = args.onnx_dir
294 | engine_dir = args.engine_dir
295 | if engine_dir is None:
296 | engine_dir = f"./models_trt/{onnx_dir.split('/')[-1].replace('onnx', 'engine')}"
297 |
298 | yolo_engine = onnx2trt()
299 | yolo_engine.create_network(
300 | onnx_dir,
301 | v8_head=args.yolov8_head,
302 | add_nms=args.add_nms,
303 | conf_thres=args.conf_thres,
304 | iou_thres=args.iou_thres,
305 | max_det=args.max_det
306 | )
307 |
308 | yolo_engine.create_engine(
309 | engine_dir,
310 | min_shape=args.min_shape,
311 | opt_shape=args.opt_shape,
312 | max_shape=args.max_shape,
313 | fp16=args.fp16,
314 | int8=args.int8,
315 | imgs_dir=args.imgs_dir,
316 | n_iteration=args.n_iteration,
317 | cache_file=args.cache_file
318 | )
319 |
320 |
321 | if __name__ == '__main__':
322 | import argparse
323 |
324 | parser = argparse.ArgumentParser(description=__doc__)
325 | # path of the onnx model
326 | parser.add_argument('--onnx_dir', type=str, default='./models_onnx/yolov5s.onnx', help='onnx path')
327 | # where to save the exported engine
328 | parser.add_argument('--engine_dir', type=str, default=None, help='engine path')
329 | # minimum input shape
330 | parser.add_argument('--min_shape', nargs='+', type=int, default=[1, 3, 512, 512],
331 | help='input min shape [batch, channel, height, width]')
332 | # input shape the engine is optimized for
333 | parser.add_argument('--opt_shape', nargs='+', type=int, default=[1, 3, 512, 512],
334 | help='input opt shape [batch, channel, height, width]')
335 | # maximum input shape
336 | parser.add_argument('--max_shape', nargs='+', type=int, default=[1, 3, 512, 512],
337 | help='input max shape [batch, channel, height, width]')
338 | # whether to use fp16 quantization
339 | parser.add_argument('--fp16', type=bool, default=True, choices=[True, False],
340 | help='TensorRt FP16 half-precision export')
341 | # whether to use int8 quantization
342 | parser.add_argument('--int8', type=bool, default=False, choices=[True, False],
343 | help='TensorRt INT8 quantization')
344 | # directory of the int8 calibration dataset
345 | parser.add_argument('--imgs_dir', default='./calibration', help='Dataset for int8 calibration')
346 | # number of calibration iterations
347 | parser.add_argument('--n_iteration', type=int, default=512, help='Iteration for int8 calibration')
348 | # where to save the calibration cache
349 | parser.add_argument('--cache_file', default=None, help='Int8 cache path')
350 | # whether the model uses the yolov8 detection head
351 | parser.add_argument('--yolov8_head', type=bool, default=True, choices=[True, False], help='yolov8_head or not')
352 | # whether to add the NMS plugin
353 | parser.add_argument('--add_nms', type=bool, default=False, choices=[True, False], help='add efficientNMS')
354 | # only boxes whose score exceeds this confidence threshold are kept
355 | parser.add_argument('--conf_thres', type=float, default=0.25, help='confidence threshold')
356 | # IoU threshold used by non-maximum suppression
357 | parser.add_argument('--iou_thres', type=float, default=0.45, help='NMS IoU threshold')
358 | # limit on the number of output boxes
359 | parser.add_argument('--max_det', type=int, default=200, help='maximum detections per image')
360 |
361 | args = parser.parse_args()
362 | print(args)
363 |
364 | main(args)
365 |
366 |
--------------------------------------------------------------------------------
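After onnx2trt.py finishes, a quick sanity check is to deserialize the engine and list its bindings; when the model was exported with `--add_nms`, the four outputs should be `num_dets`, `det_boxes`, `det_scores` and `det_classes`. A minimal sketch, assuming the TensorRT 8.4 Python API pinned in requirements.txt and an illustrative engine path:

```python
import tensorrt as trt

ENGINE_PATH = './models_trt/yolov5s.engine'  # illustrative path to the exported engine

logger = trt.Logger(trt.Logger.ERROR)
trt.init_libnvinfer_plugins(logger, namespace="")  # required so EfficientNMS_TRT can be deserialized

with open(ENGINE_PATH, 'rb') as f, trt.Runtime(logger) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())

# Print every binding with its shape and dtype.
for i in range(engine.num_bindings):
    kind = 'input ' if engine.binding_is_input(i) else 'output'
    print(kind, engine.get_binding_name(i), engine.get_binding_shape(i), engine.get_binding_dtype(i))
```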
/requirements.txt:
--------------------------------------------------------------------------------
1 | PyYAML
2 | numpy>=1.21.0
3 | opencv-python>=4.1.1
4 | onnx>=1.10.2
5 | torch>=1.10.2+cu113
6 | torchvision>=0.11.3
7 |
8 |
9 | pycuda<2021.1 # old CUDA python API (not recommended), replaced by cuda-python
10 | nvidia-pyindex
11 | tensorrt == 8.4.3.1 # https://docs.nvidia.com/deeplearning/tensorrt/install-guide/index.html#downloading
12 | cuda-python
13 |
--------------------------------------------------------------------------------
/utils/calibrator.py:
--------------------------------------------------------------------------------
1 | import os
2 | import cv2 as cv
3 | import numpy as np
4 | import pycuda.autoinit
5 | import pycuda.driver as cuda
6 | from cuda import cudart
7 | import tensorrt as trt
8 |
9 | if cudart:
10 | cudart.cudaDeviceSynchronize()
11 |
12 | __all__ = [
13 | 'MyCalibrator',
14 | 'MyCalibrator_v2'
15 | ]
16 |
17 | def trans(img, size):
18 | crop_shape = min(img.shape[:2])
19 | img = img[:crop_shape - 1, :crop_shape - 1, :]
20 | img = cv.resize(img, (size[1], size[0]))  # cv.resize expects (width, height)
21 | img /= 255.0
22 | return img
23 |
24 |
25 | class MyCalibrator(trt.IInt8EntropyCalibrator2):
26 | """pycuda"""
27 | def __init__(self, calibrationpath, imgslist, nCalibration, inputShape, cacheFile):
28 | trt.IInt8EntropyCalibrator2.__init__(self)
29 | self.calibrationpath = calibrationpath
30 | self.imgslist = imgslist
31 | self.nCalibration = nCalibration
32 | self.shape = inputShape # (N,C,H,W)
33 | self.buffeSize = trt.volume(inputShape) * trt.float32.itemsize
34 | self.cacheFile = cacheFile
35 | self.dIn = cuda.mem_alloc(self.buffeSize)
36 | self.oneBatch = self.batchGenerator()
37 |
38 | print(int(self.dIn))
39 |
40 | # def __del__(self):
41 | # cudart.cudaFree(self.dIn)
42 |
43 | def batchGenerator(self):
44 | for i in range(self.nCalibration):
45 | print("> calibration %d" % i)
46 | subImageList = np.random.choice(self.imgslist, self.shape[0], replace=False)
47 | # self.imgslist = list(set(self.imgslist) - set(subImageList))
48 | yield np.ascontiguousarray(self.loadImages(subImageList))
49 |
50 | def loadImages(self, imageList):
51 | res = np.empty(self.shape, dtype=np.float32)
52 | for i in range(self.shape[0]):
53 | path = os.path.join(self.calibrationpath, imageList[i])
54 | img = cv.imread(path)
55 | img = cv.cvtColor(img, cv.COLOR_BGR2RGB).astype(np.float32)
56 | img = trans(img, self.shape[-2:]).transpose((2, 0, 1))
57 | res[i] = img
58 | return res
59 |
60 | def get_batch_size(self): # do NOT change name
61 | return self.shape[0]
62 |
63 | def get_batch(self, nameList=None, inputNodeName=None): # do NOT change name
64 | try:
65 | data = next(self.oneBatch)
66 | # cudart.cudaMemcpy(self.dIn, data.ctypes.data, self.buffeSize, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)
67 | cuda.memcpy_htod(self.dIn, data.ravel())
68 | return [int(self.dIn)]
69 | except StopIteration:
70 | return None
71 |
72 | def read_calibration_cache(self): # do NOT change name
73 | if self.cacheFile and os.path.exists(self.cacheFile):
74 | print("Succeeded finding cache file: %s" % (self.cacheFile))
75 | with open(self.cacheFile, "rb") as f:
76 | cache = f.read()
77 | return cache
78 | else:
79 | print("Failed finding int8 cache!")
80 | return
81 |
82 | def write_calibration_cache(self, cache): # do NOT change name
83 | with open(self.cacheFile, "wb") as f:
84 | f.write(cache)
85 | print("Succeed saving int8 cache!")
86 |
87 |
88 | class MyCalibrator_v2(trt.IInt8EntropyCalibrator2):
89 | """cuda-python"""
90 | def __init__(self, calibrationpath, imgslist, nCalibration, inputShape, cacheFile):
91 | trt.IInt8EntropyCalibrator2.__init__(self)
92 | self.calibrationpath = calibrationpath
93 | self.imgslist = imgslist
94 | self.nCalibration = nCalibration
95 | self.shape = inputShape # (N,C,H,W)
96 | self.buffeSize = trt.volume(inputShape) * trt.float32.itemsize
97 | self.cacheFile = cacheFile
98 | _, self.dIn = cudart.cudaMalloc(self.buffeSize)
99 | self.oneBatch = self.batchGenerator()
100 |
101 | print(int(self.dIn))
102 |
103 | def __del__(self):
104 | cudart.cudaFree(self.dIn)
105 |
106 | def batchGenerator(self):
107 | for i in range(self.nCalibration):
108 | print("> calibration %d" % i)
109 | subImageList = np.random.choice(self.imgslist, self.shape[0], replace=False)
110 | # self.imgslist = list(set(self.imgslist) - set(subImageList))
111 | yield np.ascontiguousarray(self.loadImages(subImageList))
112 |
113 | def loadImages(self, imageList):
114 | res = np.empty(self.shape, dtype=np.float32)
115 | for i in range(self.shape[0]):
116 | path = os.path.join(self.calibrationpath, imageList[i])
117 | img = cv.imread(path)
118 | img = cv.cvtColor(img, cv.COLOR_BGR2RGB).astype(np.float32)
119 | img = trans(img, self.shape[-2:]).transpose((2, 0, 1))
120 | res[i] = img
121 | return res
122 |
123 | def get_batch_size(self): # do NOT change name
124 | return self.shape[0]
125 |
126 | def get_batch(self, nameList=None, inputNodeName=None): # do NOT change name
127 | try:
128 | data = next(self.oneBatch)
129 | cudart.cudaMemcpy(self.dIn, data.ctypes.data, self.buffeSize, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)
130 | return [int(self.dIn)]
131 | except StopIteration:
132 | return None
133 |
134 | def read_calibration_cache(self): # do NOT change name
135 | if self.cacheFile and os.path.exists(self.cacheFile):
136 | print("Succeeded finding cache file: %s" % (self.cacheFile))
137 | with open(self.cacheFile, "rb") as f:
138 | cache = f.read()
139 | return cache
140 | else:
141 | print("Failed finding int8 cache!")
142 | return
143 |
144 | def write_calibration_cache(self, cache): # do NOT change name
145 | with open(self.cacheFile, "wb") as f:
146 | f.write(cache)
147 | print("Succeed saving int8 cache!")
148 |
--------------------------------------------------------------------------------
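During an INT8 build TensorRT drives the calibrator itself: it calls `get_batch_size()` once and then `get_batch()` repeatedly until `None` is returned. The loop below is a minimal sketch that exercises `MyCalibrator` outside of a build to verify the image pipeline, assuming a CUDA-capable GPU with pycuda installed and an illustrative folder of calibration images:

```python
import os
from utils.calibrator import MyCalibrator

imgs_dir = './calibration'  # illustrative folder of calibration images
calib = MyCalibrator(
    calibrationpath=imgs_dir,
    imgslist=os.listdir(imgs_dir),
    nCalibration=4,               # a few batches are enough to exercise the pipeline
    inputShape=[1, 3, 512, 512],  # must match the engine input shape
    cacheFile='./int8.cache',
)

print(calib.get_batch_size())
# get_batch() returns a list with one device pointer per input, or None once all batches are consumed.
while calib.get_batch() is not None:
    pass
```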
/utils/trt_infer.py:
--------------------------------------------------------------------------------
1 | #
2 | # SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 |
18 | import argparse
19 | import logging
20 | import os
21 | import sys
22 |
23 | import numpy as np
24 | import pycuda.autoinit
25 | import pycuda.driver as cuda
26 | import tensorrt as trt
27 |
28 | try:
29 | # Sometimes python does not understand FileNotFoundError
30 | FileNotFoundError
31 | except NameError:
32 | FileNotFoundError = IOError
33 |
34 | EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
35 | logging.basicConfig(level=logging.INFO)
36 | logging.getLogger("EngineBuilder").setLevel(logging.INFO)
37 | log = logging.getLogger("EngineBuilder")
38 |
39 | def GiB(val):
40 | return val * 1 << 30
41 |
42 |
43 | def add_help(description):
44 | parser = argparse.ArgumentParser(description=description, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
45 | args, _ = parser.parse_known_args()
46 |
47 |
48 | def find_sample_data(description="Runs a TensorRT Python sample", subfolder="", find_files=[], err_msg=""):
49 | '''
50 | Parses sample arguments.
51 |
52 | Args:
53 | description (str): Description of the sample.
54 | subfolder (str): The subfolder containing data relevant to this sample
55 | find_files (str): A list of filenames to find. Each filename will be replaced with an absolute path.
56 |
57 | Returns:
58 | str: Path of data directory.
59 | '''
60 |
61 | # Standard command-line arguments for all samples.
62 | kDEFAULT_DATA_ROOT = os.path.join(os.sep, "usr", "src", "tensorrt", "data")
63 | parser = argparse.ArgumentParser(description=description, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
64 | parser.add_argument("-d", "--datadir", help="Location of the TensorRT sample data directory, and any additional data directories.", action="append", default=[kDEFAULT_DATA_ROOT])
65 | args, _ = parser.parse_known_args()
66 |
67 | def get_data_path(data_dir):
68 | # If the subfolder exists, append it to the path, otherwise use the provided path as-is.
69 | data_path = os.path.join(data_dir, subfolder)
70 | if not os.path.exists(data_path):
71 | if data_dir != kDEFAULT_DATA_ROOT:
72 | print("WARNING: " + data_path + " does not exist. Trying " + data_dir + " instead.")
73 | data_path = data_dir
74 | # Make sure data directory exists.
75 | if not (os.path.exists(data_path)) and data_dir != kDEFAULT_DATA_ROOT:
76 | print("WARNING: {:} does not exist. Please provide the correct data path with the -d option.".format(data_path))
77 | return data_path
78 |
79 | data_paths = [get_data_path(data_dir) for data_dir in args.datadir]
80 | return data_paths, locate_files(data_paths, find_files, err_msg)
81 |
82 | def locate_files(data_paths, filenames, err_msg=""):
83 | """
84 | Locates the specified files in the specified data directories.
85 | If a file exists in multiple data directories, the first directory is used.
86 |
87 | Args:
88 | data_paths (List[str]): The data directories.
89 | filename (List[str]): The names of the files to find.
90 |
91 | Returns:
92 | List[str]: The absolute paths of the files.
93 |
94 | Raises:
95 | FileNotFoundError if a file could not be located.
96 | """
97 | found_files = [None] * len(filenames)
98 | for data_path in data_paths:
99 | # Find all requested files.
100 | for index, (found, filename) in enumerate(zip(found_files, filenames)):
101 | if not found:
102 | file_path = os.path.abspath(os.path.join(data_path, filename))
103 | if os.path.exists(file_path):
104 | found_files[index] = file_path
105 |
106 | # Check that all files were found
107 | for f, filename in zip(found_files, filenames):
108 | if not f or not os.path.exists(f):
109 | raise FileNotFoundError("Could not find {:}. Searched in data paths: {:}\n{:}".format(filename, data_paths, err_msg))
110 | return found_files
111 |
112 | def load_engine(engine_path):
113 | # TRT_LOGGER = trt.Logger(trt.Logger.WARNING) # INFO
114 | logger = trt.Logger(trt.Logger.ERROR)
115 | trt.init_libnvinfer_plugins(logger, '')
116 | with open(engine_path, 'rb') as f, trt.Runtime(logger) as runtime:
117 | return runtime.deserialize_cuda_engine(f.read())
118 |
119 | # Simple helper data class that's a little nicer to use than a 2-tuple.
120 | class HostDeviceMem(object):
121 | def __init__(self, host_mem, device_mem):
122 | self.host = host_mem
123 | self.device = device_mem
124 |
125 | def __str__(self):
126 | return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)
127 |
128 | def __repr__(self):
129 | return self.__str__()
130 |
131 |
132 | # Allocates all buffers required for an engine, i.e. host/device inputs/outputs.
133 | def allocate_buffers(engine):
134 | inputs = []
135 | outputs = []
136 | bindings = []
137 | stream = cuda.Stream()
138 | for binding in engine:
139 | size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
140 | dtype = trt.nptype(engine.get_binding_dtype(binding))
141 | # Allocate host and device buffers
142 | host_mem = cuda.pagelocked_empty(size, dtype)
143 | device_mem = cuda.mem_alloc(host_mem.nbytes)
144 | # Append the device buffer to device bindings.
145 | bindings.append(int(device_mem))
146 | # Append to the appropriate list.
147 | if engine.binding_is_input(binding):
148 | inputs.append(HostDeviceMem(host_mem, device_mem))
149 | else:
150 | outputs.append(HostDeviceMem(host_mem, device_mem))
151 | return inputs, outputs, bindings, stream
152 |
153 |
154 | def allocate_buffers_v2(context):
155 | inputs = []
156 | outputs = []
157 | bindings = []
158 | stream = cuda.Stream()
159 | for idx, binding in enumerate(context.engine):
160 | # size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
161 | size = trt.volume(context.get_binding_shape(idx))
162 | dtype = trt.nptype(context.engine.get_binding_dtype(idx))
163 | # Allocate host and device buffers
164 | host_mem = cuda.pagelocked_empty(size, dtype)
165 | device_mem = cuda.mem_alloc(host_mem.nbytes)
166 | # Append the device buffer to device bindings.
167 | bindings.append(int(device_mem))
168 | # Append to the appropriate list.
169 | if context.engine.binding_is_input(binding):
170 | inputs.append(HostDeviceMem(host_mem, device_mem))
171 | else:
172 | outputs.append(HostDeviceMem(host_mem, device_mem))
173 | return inputs, outputs, bindings, stream
174 |
175 |
176 | # This function is generalized for multiple inputs/outputs.
177 | # inputs and outputs are expected to be lists of HostDeviceMem objects.
178 | def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
179 | # Transfer input data to the GPU.
180 | [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
181 | # Run inference.
182 | context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
183 | # Transfer predictions back from the GPU.
184 | [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
185 | # Synchronize the stream
186 | stream.synchronize()
187 | # Return only the host outputs.
188 | return [out.host for out in outputs]
189 |
190 | # This function is generalized for multiple inputs/outputs for full dimension networks.
191 | # inputs and outputs are expected to be lists of HostDeviceMem objects.
192 | def do_inference_v2(context, bindings, inputs, outputs, stream):
193 | # Transfer input data to the GPU.
194 | [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
195 | # Run inference.
196 | context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
197 | # Transfer predictions back from the GPU.
198 | [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
199 | # Synchronize the stream
200 | stream.synchronize()
201 | # Return only the host outputs.
202 | return [out.host for out in outputs]
203 |
204 |
205 | class EngineBuilder:
206 | """
207 | Parses an ONNX graph and builds a TensorRT engine from it.
208 | """
209 |
210 | def __init__(self, verbose=False):
211 | """
212 | :param verbose: If enabled, a higher verbosity level will be set on the TensorRT logger.
213 | """
214 | self.trt_logger = trt.Logger(trt.Logger.INFO)
215 | if verbose:
216 | self.trt_logger.min_severity = trt.Logger.Severity.VERBOSE
217 |
218 | trt.init_libnvinfer_plugins(self.trt_logger, namespace="")
219 |
220 | self.builder = trt.Builder(self.trt_logger)
221 | self.config = self.builder.create_builder_config()
222 | self.config.max_workspace_size = 8 * (2 ** 30) # 8 GB
223 |
224 | self.batch_size = None
225 | self.network = None
226 | self.parser = None
227 |
228 | def create_network(self, onnx_path):
229 | """
230 | Parse the ONNX graph and create the corresponding TensorRT network definition.
231 | :param onnx_path: The path to the ONNX graph to load.
232 | """
233 | network_flags = (1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
234 |
235 | self.network = self.builder.create_network(network_flags)
236 | self.parser = trt.OnnxParser(self.network, self.trt_logger)
237 |
238 | onnx_path = os.path.realpath(onnx_path)
239 | with open(onnx_path, "rb") as f:
240 | if not self.parser.parse(f.read()):
241 | log.error("Failed to load ONNX file: {}".format(onnx_path))
242 | for error in range(self.parser.num_errors):
243 | log.error(self.parser.get_error(error))
244 | sys.exit(1)
245 |
246 | inputs = [self.network.get_input(i) for i in range(self.network.num_inputs)]
247 | outputs = [self.network.get_output(i) for i in range(self.network.num_outputs)]
248 |
249 | log.info("Network Description")
250 | for input in inputs:
251 | self.batch_size = input.shape[0]
252 | log.info("Input '{}' with shape {} and dtype {}".format(input.name, input.shape, input.dtype))
253 | for output in outputs:
254 | log.info("Output '{}' with shape {} and dtype {}".format(output.name, output.shape, output.dtype))
255 | assert self.batch_size > 0
256 | self.builder.max_batch_size = self.batch_size
257 |
258 | def create_engine(self, engine_path, precision, calib_input=None, calib_cache=None, calib_num_images=25000,
259 | calib_batch_size=8, calib_preprocessor=None):
260 | """
261 | Build the TensorRT engine and serialize it to disk.
262 | :param engine_path: The path where to serialize the engine to.
263 | :param precision: The datatype to use for the engine, either 'fp32', 'fp16' or 'int8'.
264 | :param calib_input: The path to a directory holding the calibration images.
265 | :param calib_cache: The path where to write the calibration cache to, or if it already exists, load it from.
266 | :param calib_num_images: The maximum number of images to use for calibration.
267 | :param calib_batch_size: The batch size to use for the calibration process.
268 | :param calib_preprocessor: The ImageBatcher preprocessor algorithm to use.
269 | """
270 | engine_path = os.path.realpath(engine_path)
271 | engine_dir = os.path.dirname(engine_path)
272 | os.makedirs(engine_dir, exist_ok=True)
273 | log.info("Building {} Engine in {}".format(precision, engine_path))
274 |
275 | inputs = [self.network.get_input(i) for i in range(self.network.num_inputs)]
276 |
277 | if precision == "fp16":
278 | if not self.builder.platform_has_fast_fp16:
279 | log.warning("FP16 is not supported natively on this platform/device")
280 | else:
281 | self.config.set_flag(trt.BuilderFlag.FP16)
282 |
283 | with self.builder.build_engine(self.network, self.config) as engine, open(engine_path, "wb") as f:
284 | log.info("Serializing engine to file: {:}".format(engine_path))
285 | f.write(engine.serialize())
286 |
287 |
288 | class EngineBuilder_v2:
289 | """
290 | Parses an ONNX graph and builds a TensorRT engine from it.
291 | """
292 |
293 | def __init__(self, verbose=False):
294 | """
295 | :param verbose: If enabled, a higher verbosity level will be set on the TensorRT logger.
296 | """
297 | self.trt_logger = trt.Logger(trt.Logger.INFO)
298 | if verbose:
299 | self.trt_logger.min_severity = trt.Logger.Severity.VERBOSE
300 |
301 | trt.init_libnvinfer_plugins(self.trt_logger, namespace="")
302 |
303 | self.builder = trt.Builder(self.trt_logger)
304 | self.config = self.builder.create_builder_config()
305 | self.config.max_workspace_size = 8 * (2 ** 30) # 8 GB
306 |
307 | self.batch_size = None
308 | self.network = None
309 | self.parser = None
310 |
311 | def create_network(self, onnx_path, get_inputs):
312 | """
313 | Parse the ONNX graph and create the corresponding TensorRT network definition.
314 | :param onnx_path: The path to the ONNX graph to load.
315 | """
316 | network_flags = (1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
317 |
318 | self.network = self.builder.create_network(network_flags)
319 | self.parser = trt.OnnxParser(self.network, self.trt_logger)
320 |
321 | onnx_path = os.path.realpath(onnx_path)
322 | with open(onnx_path, "rb") as f:
323 | if not self.parser.parse(f.read()):
324 | log.error("Failed to load ONNX file: {}".format(onnx_path))
325 | for error in range(self.parser.num_errors):
326 | log.error(self.parser.get_error(error))
327 | sys.exit(1)
328 |
329 | inputs = []
330 | for i, shape in enumerate(get_inputs):
331 | self.network.get_input(i).shape = shape
332 | inputs.append(self.network.get_input(i))  # keep the input tensor itself, not the raw shape, for the logging below
333 | # inputs = [self.network.get_input(i) for i in range(self.network.num_inputs)]
334 | outputs = [self.network.get_output(i) for i in range(self.network.num_outputs)]
335 |
336 | log.info("Network Description")
337 | for input in inputs:
338 | self.batch_size = input.shape[0]
339 | log.info("Input '{}' with shape {} and dtype {}".format(input.name, input.shape, input.dtype))
340 | for output in outputs:
341 | log.info("Output '{}' with shape {} and dtype {}".format(output.name, output.shape, output.dtype))
342 | assert self.batch_size > 0
343 | self.builder.max_batch_size = self.batch_size
344 |
345 | def create_engine(self, engine_path, precision, calib_input=None, calib_cache=None, calib_num_images=25000,
346 | calib_batch_size=8, calib_preprocessor=None):
347 | """
348 | Build the TensorRT engine and serialize it to disk.
349 | :param engine_path: The path where to serialize the engine to.
350 | :param precision: The datatype to use for the engine, either 'fp32', 'fp16' or 'int8'.
351 | :param calib_input: The path to a directory holding the calibration images.
352 | :param calib_cache: The path where to write the calibration cache to, or if it already exists, load it from.
353 | :param calib_num_images: The maximum number of images to use for calibration.
354 | :param calib_batch_size: The batch size to use for the calibration process.
355 | :param calib_preprocessor: The ImageBatcher preprocessor algorithm to use.
356 | """
357 | engine_path = os.path.realpath(engine_path)
358 | engine_dir = os.path.dirname(engine_path)
359 | os.makedirs(engine_dir, exist_ok=True)
360 | log.info("Building {} Engine in {}".format(precision, engine_path))
361 |
362 | inputs = [self.network.get_input(i) for i in range(self.network.num_inputs)]
363 |
364 | if precision == "fp16":
365 | if not self.builder.platform_has_fast_fp16:
366 | log.warning("FP16 is not supported natively on this platform/device")
367 | else:
368 | self.config.set_flag(trt.BuilderFlag.FP16)
369 |
370 | with self.builder.build_engine(self.network, self.config) as engine, open(engine_path, "wb") as f:
371 | log.info("Serializing engine to file: {:}".format(engine_path))
372 | f.write(engine.serialize())
--------------------------------------------------------------------------------
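For the explicit-batch engines produced by onnx2trt.py, the `*_v2` helpers above are the ones to use. A minimal single-image inference sketch, assuming illustrative engine/image paths and the `image_trans` preprocessing from utils/utils_detection.py:

```python
import cv2 as cv
from utils import trt_infer
from utils.utils_detection import image_trans

ENGINE_PATH = './models_trt/yolov5s.engine'  # illustrative paths
IMAGE_PATH = './sample.jpg'

engine = trt_infer.load_engine(ENGINE_PATH)
with engine.create_execution_context() as context:
    inputs, outputs, bindings, stream = trt_infer.allocate_buffers_v2(context)

    img = cv.imread(IMAGE_PATH)
    # Letterbox to 512x512, BGR->RGB, NCHW, scaled to [0, 1], then flattened into the host buffer.
    inputs[0].host = image_trans(img, (512, 512)).ravel()

    preds = trt_infer.do_inference_v2(
        context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)
    for out in preds:
        print(out.shape)
```

Unless the engine was exported with `--add_nms`, the flat output still has to be reshaped and passed through `non_max_suppression()` from utils/utils_detection.py.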
/utils/utils_detection.py:
--------------------------------------------------------------------------------
1 | import yaml
2 | import json
3 | import torch
4 | import cv2 as cv
5 | import numpy as np
6 | import torchvision
7 |
8 |
9 | def yaml_load(file='data.yaml'):
10 | # Single-line safe yaml loading
11 | with open(file, errors='ignore') as f:
12 | return yaml.safe_load(f)
13 |
14 |
15 | def json_load(file='data.json'):
16 | with open(file, "r") as f:
17 | return json.load(f)
18 |
19 |
20 | def xyxy2xywh(x):
21 | # Convert nx4 boxes from [x1, y1, x2, y2] to [x, y, w, h] where xy1=top-left, xy2=bottom-right
22 | y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
23 | y[:, 0] = (x[:, 0] + x[:, 2]) / 2 # x center
24 | y[:, 1] = (x[:, 1] + x[:, 3]) / 2 # y center
25 | y[:, 2] = x[:, 2] - x[:, 0] # width
26 | y[:, 3] = x[:, 3] - x[:, 1] # height
27 | return y
28 |
29 |
30 | def xywh2xyxy(x):
31 | # Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right
32 | y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
33 | y[:, 0] = x[:, 0] - x[:, 2] / 2 # top left x
34 | y[:, 1] = x[:, 1] - x[:, 3] / 2 # top left y
35 | y[:, 2] = x[:, 0] + x[:, 2] / 2 # bottom right x
36 | y[:, 3] = x[:, 1] + x[:, 3] / 2 # bottom right y
37 | return y
38 |
39 |
40 | def xywhn2xyxy(x, w=640, h=640, padw=0, padh=0):
41 | # Convert nx4 boxes from [x, y, w, h] normalized to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right
42 | y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
43 | y[:, 0] = w * (x[:, 0] - x[:, 2] / 2) + padw # top left x
44 | y[:, 1] = h * (x[:, 1] - x[:, 3] / 2) + padh # top left y
45 | y[:, 2] = w * (x[:, 0] + x[:, 2] / 2) + padw # bottom right x
46 | y[:, 3] = h * (x[:, 1] + x[:, 3] / 2) + padh # bottom right y
47 | return y
48 |
49 |
50 | def xyxy2xywhn(x, w=640, h=640, clip=False, eps=0.0):
51 | # Convert nx4 boxes from [x1, y1, x2, y2] to [x, y, w, h] normalized where xy1=top-left, xy2=bottom-right
52 | if clip:
53 | clip_boxes(x, (h - eps, w - eps)) # warning: inplace clip
54 | y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
55 | y[:, 0] = ((x[:, 0] + x[:, 2]) / 2) / w # x center
56 | y[:, 1] = ((x[:, 1] + x[:, 3]) / 2) / h # y center
57 | y[:, 2] = (x[:, 2] - x[:, 0]) / w # width
58 | y[:, 3] = (x[:, 3] - x[:, 1]) / h # height
59 | return y
60 |
61 |
62 | def xyn2xy(x, w=640, h=640, padw=0, padh=0):
63 | # Convert normalized segments into pixel segments, shape (n,2)
64 | y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
65 | y[:, 0] = w * x[:, 0] + padw # top left x
66 | y[:, 1] = h * x[:, 1] + padh # top left y
67 | return y
68 |
69 |
70 | def letterbox_image(image, return_padding=False):
71 | """
72 | Pad both sides of the shorter edge equally so that height and width end up the same.
73 | """
74 | h, w = image.shape[:2]
75 |
76 | if h > w:
77 | p = int((h - w) // 2)
78 | image = cv.copyMakeBorder(image, 0, 0, p, (h - w - p), cv.BORDER_CONSTANT, value=0)
79 | else:
80 | p = int((w - h) // 2)
81 | image = cv.copyMakeBorder(image, p, (w - h - p), 0, 0, cv.BORDER_CONSTANT, value=0)
82 |
83 | if return_padding:
84 | return image, p
85 | else:
86 | return image
87 |
88 | def image_trans(img, size):
89 | scale = min((size[0] / img.shape[0]), (size[1] / img.shape[1]), 1.1)
90 | new_size = (int(img.shape[1] * scale), int(img.shape[0] * scale))
91 | # img_new = cv.resize(img, new_size, interpolation=cv.INTER_NEAREST)
92 | img_new = cv.resize(img, new_size, interpolation=cv.INTER_LINEAR)
93 | top = round((size[0] - new_size[1]) * 0.5)
94 | bottom = (size[0] - new_size[1]) - top
95 | left = round((size[1] - new_size[0]) * 0.5)
96 | right = (size[1] - new_size[0]) - left
97 | img_new = cv.copyMakeBorder(img_new, top, bottom, left, right, cv.BORDER_CONSTANT, value=0)
98 | img_new = img_new.transpose((2, 0, 1))[::-1]
99 | img_new = np.expand_dims(img_new, 0)
100 | img_new = np.ascontiguousarray(img_new).astype(np.float32)
101 | img_new = img_new / 255.0
102 | return img_new
103 |
104 |
105 | def scale_bboxes(bboxes, img_ori_hw, img_det_hw):
106 | assert len(img_ori_hw) == len(img_det_hw)
107 |
108 | scale = max(img_ori_hw[0] / img_det_hw[0], img_ori_hw[1] / img_det_hw[1])
109 | bboxes[:, :4] = bboxes[:, :4] * scale
110 |
111 | h_bias = (max(img_ori_hw) - img_ori_hw[0]) / 2.0
112 | w_bias = (max(img_ori_hw) - img_ori_hw[1]) / 2.0
113 |
114 | bboxes[:, [0, 2]] -= w_bias
115 | bboxes[:, [1, 3]] -= h_bias
116 |
117 | clip_boxes(bboxes, img_ori_hw)
118 |
119 | return bboxes
120 |
121 |
122 | def scale_bboxes_v2(bboxes, img_ori_hw, img_det_hw, p):
123 |     assert len(img_ori_hw) == len(img_det_hw)
124 |
125 | scale = max(img_ori_hw[0] / img_det_hw[0], img_ori_hw[1] / img_det_hw[1])
126 | bboxes[:, :4] = bboxes[:, :4] * scale
127 | if img_ori_hw[0] > img_ori_hw[1]:
128 | bboxes[:, [0, 2]] -= p
129 | else:
130 | bboxes[:, [1, 3]] -= p
131 |
132 | clip_boxes(bboxes, img_ori_hw)
133 |
134 | return bboxes
135 |
136 |
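# A minimal round-trip sketch (illustrative only; `_letterbox_roundtrip_example` is hypothetical
# and relies on the cv/np aliases imported at the top of this file): letterbox a frame to a
# square, pretend the network ran at 512x512, then map a fake detection back with scale_bboxes_v2.
def _letterbox_roundtrip_example():
    frame = np.zeros((480, 640, 3), dtype=np.uint8)           # dummy 480x640 BGR frame
    square, p = letterbox_image(frame, return_padding=True)   # 640x640, padded by p on top/bottom
    net_in = cv.resize(square, (512, 512))                    # network input resolution
    det = np.array([[100., 120., 200., 240., 0.9, 0.]])       # fake xyxy+conf+cls at 512x512
    det = scale_bboxes_v2(det, frame.shape[:2], net_in.shape[:2], p)
    return det                                                # box now in 480x640 pixel coords
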
137 | def clip_boxes(boxes, shape):
138 | # Clip boxes (xyxy) to image shape (height, width)
139 | if isinstance(boxes, torch.Tensor): # faster individually
140 | boxes[:, 0].clamp_(0, shape[1]) # x1
141 | boxes[:, 1].clamp_(0, shape[0]) # y1
142 | boxes[:, 2].clamp_(0, shape[1]) # x2
143 | boxes[:, 3].clamp_(0, shape[0]) # y2
144 | else: # np.array (faster grouped)
145 | boxes[:, [0, 2]] = boxes[:, [0, 2]].clip(0, shape[1]) # x1, x2
146 | boxes[:, [1, 3]] = boxes[:, [1, 3]].clip(0, shape[0]) # y1, y2
147 |
148 |
149 |
150 | def box_area(box):
151 | # box = xyxy(4,n)
152 | return (box[2] - box[0]) * (box[3] - box[1])
153 |
154 |
155 | def box_iou(box1, box2, eps=1e-7):
156 | # inter(N,M) = (rb(N,M,2) - lt(N,M,2)).clamp(0).prod(2)
157 | (a1, a2), (b1, b2) = box1[:, None].chunk(2, 2), box2.chunk(2, 1)
158 | inter = (torch.min(a2, b2) - torch.max(a1, b1)).clamp(0).prod(2)
159 |
160 | # IoU = inter / (area1 + area2 - inter)
161 | return inter / (box_area(box1.T)[:, None] + box_area(box2.T) - inter + eps)
162 |
163 |
164 | def draw_boxes(img, boxes, scores, labels, catid_labels, textscale=1, color_dicts=None):
165 | boxes = tuple(boxes.astype('int'))
166 | if color_dicts is None:
167 |         color_dicts = {k: (0, 0, 255) for k in catid_labels.keys()}  # default: red for every class
168 |
169 | text_size, _ = cv.getTextSize(f'{catid_labels[labels]}:{scores:.2f}', fontFace=cv.FONT_HERSHEY_DUPLEX,
170 | fontScale=textscale, thickness=1)
171 | text_w, text_h = text_size
172 | img0 = cv.rectangle(img, boxes[:2], boxes[2:], thickness=2, lineType=cv.LINE_AA, color=color_dicts[labels])
173 | img0 = cv.rectangle(img0, boxes[:2], (boxes[0] + text_w + 1, boxes[1] + text_h + 2),
174 | thickness=-1, color=color_dicts[labels])
175 | img0 = cv.putText(img0, f'{catid_labels[labels]}:{scores:.2f}',
176 | (boxes[0], boxes[1] + text_h),
177 | fontFace=cv.FONT_HERSHEY_DUPLEX, fontScale=textscale, thickness=1,
178 | lineType=cv.LINE_AA,
179 | color=(255, 255, 255)
180 | )
181 | return img0
182 |
183 |
184 | def non_max_suppression(prediction,
185 | v8_head=False,
186 | conf_thres=0.25,
187 | iou_thres=0.45,
188 | agnostic=False,
189 | max_det=300):
190 | bs = prediction.shape[0] # batch size
191 | # Settings
192 | # min_wh = 2 # (pixels) minimum box width and height
193 | max_wh = 7680 # (pixels) maximum box width and height
194 | max_nms = 30000 # maximum number of boxes into torchvision.ops.nms()
195 | redundant = True # require redundant detections
196 | merge = False # use merge-NMS
197 | output = [np.zeros((0, 6), dtype=np.float32)] * bs
198 | if not v8_head:
199 | xc = prediction[..., 4] > conf_thres # candidates
200 | else:
201 | xc = prediction[..., 4:].max(2) > conf_thres # candidates
202 | for xi, x in enumerate(prediction): # image index, image inference
203 | # If none remain process next image
204 | if not x.shape[0]:
205 | continue
206 | # Apply constraints
207 | # x[((x[..., 2:4] < min_wh) | (x[..., 2:4] > max_wh)).any(1), 4] = 0 # width-height
208 | x = x[xc[xi]] # confidence
209 | if not v8_head:
210 | # Compute conf
211 | x[:, 5:] *= x[:, 4:5] # conf = obj_conf * cls_conf
212 | j = x[:, 5:].argmax(axis=1, keepdims=True)
213 | conf = x[:, 5:]
214 | else:
215 | j = x[:, 4:].argmax(axis=1, keepdims=True)
216 | conf = x[:, 4:]
217 | conf = conf[range(len(j)), j.ravel()].reshape(-1, 1)
218 | # Detections matrix nx6 (xywh, conf, cls)
219 | x = np.concatenate((x[:,:4], conf, j), 1)[conf.ravel() > conf_thres]
220 | # Check shape
221 | n = x.shape[0] # number of boxes
222 | if not n: # no boxes
223 | continue
224 | elif n > max_nms: # excess boxes
225 | x = x[x[:, 4].argsort()[::-1][:max_nms]] # sort by confidence
226 |
227 | # Batched NMS
228 | # c = x[:, 5:6] * (0 if agnostic else max_wh) # classes
229 | # boxes, scores = x[:, :4] + c, x[:, 4] # boxes (offset by class), scores
230 | # i = cv.dnn.NMSBoxes(boxes, scores, conf_thres, iou_thres)
231 | c = x[:, 5].ravel().astype("int32")
232 | i = cv.dnn.NMSBoxesBatched(x[:, :4], x[:, 4], c, conf_thres, iou_thres, None, 0)
233 | if i.shape[0] > max_det: # limit detections
234 | i = i[:max_det]
235 | if merge and (1 < n < 3E3): # Merge NMS (boxes merged using weighted mean)
236 | # update boxes as boxes(i,4) = weights(i,n) * boxes(n,4)
237 | iou = box_iou(x[:, :4][i], x[:, :4]) > iou_thres # iou matrix
238 | weights = iou * x[:, 4][None] # box weights
239 |             x[i, :4] = np.matmul(weights, x[:, :4]) / weights.sum(1, keepdims=True)  # merged boxes
240 | if redundant:
241 | i = i[iou.sum(1) > 1] # require redundancy
242 |
243 | output[xi] = xywh2xyxy(x[i])
244 | return output
245 |
246 |
247 | def non_max_suppression_torch(prediction,
248 | v8_head=False,
249 | conf_thres=0.25,
250 | iou_thres=0.45,
251 | agnostic=False,
252 | max_det=300):
253 | bs = prediction.shape[0] # batch size
254 | # Settings
255 | # min_wh = 2 # (pixels) minimum box width and height
256 | max_wh = 7680 # (pixels) maximum box width and height
257 | max_nms = 30000 # maximum number of boxes into torchvision.ops.nms()
258 | redundant = True # require redundant detections
259 | merge = False # use merge-NMS
260 | output = [torch.zeros((0, 6), device=prediction.device)] * bs
261 | if not v8_head:
262 | xc = prediction[..., 4] > conf_thres # candidates
263 | else:
264 | xc = prediction[..., 4:].max(2) > conf_thres # candidates
265 | for xi, x in enumerate(prediction): # image index, image inference
266 | # If none remain process next image
267 | if not x.shape[0]:
268 | continue
269 | # Apply constraints
270 | # x[((x[..., 2:4] < min_wh) | (x[..., 2:4] > max_wh)).any(1), 4] = 0 # width-height
271 | x = x[xc[xi]] # confidence
272 | if not v8_head:
273 | # Compute conf
274 | x[:, 5:] *= x[:, 4:5] # conf = obj_conf * cls_conf
275 | # Detections matrix nx6 (xywh, conf, cls)
276 | conf, j = x[:, 5:].max(1, keepdim=True)
277 | else:
278 | # Detections matrix nx6 (xywh, conf, cls)
279 | conf, j = x[:, 4:].max(1, keepdim=True)
280 |
281 | # Box (center x, center y, width, height) to (x1, y1, x2, y2)
282 | box = xywh2xyxy(x[:, :4])
283 | x = torch.cat((box, conf, j.float()), 1)[conf.view(-1) > conf_thres]
284 | # Check shape
285 | n = x.shape[0] # number of boxes
286 | if not n: # no boxes
287 | continue
288 | elif n > max_nms: # excess boxes
289 | x = x[x[:, 4].argsort(descending=True)[:max_nms]] # sort by confidence
290 |
291 | # Batched NMS
292 | c = x[:, 5:6] * (0 if agnostic else max_wh) # classes
293 | boxes, scores = x[:, :4] + c, x[:, 4] # boxes (offset by class), scores
294 | i = torchvision.ops.nms(boxes, scores, iou_thres) # NMS
295 | if i.shape[0] > max_det: # limit detections
296 | i = i[:max_det]
297 | if merge and (1 < n < 3E3): # Merge NMS (boxes merged using weighted mean)
298 | # update boxes as boxes(i,4) = weights(i,n) * boxes(n,4)
299 | iou = box_iou(boxes[i], boxes) > iou_thres # iou matrix
300 | weights = iou * scores[None] # box weights
301 | x[i, :4] = torch.mm(weights, x[:, :4]).float() / weights.sum(1, keepdim=True) # merged boxes
302 | if redundant:
303 | i = i[iou.sum(1) > 1] # require redundancy
304 |
305 | output[xi] = x[i]
306 | return output
307 |
308 |
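# A small sanity-check sketch (illustrative only; `_nms_torch_example` is hypothetical and relies
# on torch/torchvision already being imported at the top of this file, as the function above
# requires). Input layout is the YOLOv5-style (cx, cy, w, h, obj, cls...) head; two overlapping
# candidates for the same object collapse to one row.
def _nms_torch_example():
    pred = torch.zeros((1, 3, 85))                              # (batch, boxes, 4 + obj + 80 cls)
    pred[0, 0, :5] = torch.tensor([100., 100., 50., 50., 0.9])
    pred[0, 1, :5] = torch.tensor([102., 101., 52., 49., 0.8])
    pred[0, 2, :5] = torch.tensor([300., 300., 40., 40., 0.1])  # below conf_thres, dropped early
    pred[0, :, 5] = 1.0                                         # class-0 score
    out = non_max_suppression_torch(pred, conf_thres=0.25, iou_thres=0.45)[0]
    return out                                                  # single (1, 6) row: xyxy, conf, cls
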
309 | def yolox_postprocess(outputs, img_size, p6=False):
310 |
311 | grids = []
312 | expanded_strides = []
313 |
314 | if not p6:
315 | strides = [8, 16, 32]
316 | else:
317 | strides = [8, 16, 32, 64]
318 |
319 | hsizes = [img_size[0] // stride for stride in strides]
320 | wsizes = [img_size[1] // stride for stride in strides]
321 |
322 | for hsize, wsize, stride in zip(hsizes, wsizes, strides):
323 | xv, yv = np.meshgrid(np.arange(wsize), np.arange(hsize))
324 | grid = np.stack((xv, yv), 2).reshape(1, -1, 2)
325 | grids.append(grid)
326 | shape = grid.shape[:2]
327 | expanded_strides.append(np.full((*shape, 1), stride))
328 |
329 | grids = np.concatenate(grids, 1)
330 | expanded_strides = np.concatenate(expanded_strides, 1)
331 | outputs[..., :2] = (outputs[..., :2] + grids) * expanded_strides
332 | outputs[..., 2:4] = np.exp(outputs[..., 2:4]) * expanded_strides
333 |
334 | return outputs
335 |
336 |
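# A worked decode example (illustrative only; `_yolox_decode_example` is hypothetical): for a
# 64x64 input the raw YOLOX output has 8*8 + 4*4 + 2*2 = 84 rows; row 0 sits at grid (0, 0) of
# the stride-8 level, so an offset of (0.5, 0.5) decodes to center (4, 4) and exp(0) * 8 = 8
# gives the box size.
def _yolox_decode_example():
    raw = np.zeros((1, 84, 85), dtype=np.float32)
    raw[0, 0, :4] = [0.5, 0.5, 0.0, 0.0]
    dec = yolox_postprocess(raw, (64, 64))
    return dec[0, 0, :4]                                        # -> [4., 4., 8., 8.]
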
337 | class Colors:
338 | # Ultralytics color palette https://ultralytics.com/
339 | def __init__(self, id_and_obj):
340 | base_hexs = ('FF3838', 'FF9D97', 'FF701F', 'FFB21D', 'CFD231', '48F90A', '92CC17', '3DDB86', '1A9334', '00D4BB',
341 | '2C99A8', '00C2FF', '344593', '6473FF', '0018EC', '8438FF', '520085', 'CB38FF', 'FF95C8', 'FF37C7')
342 | n = len(id_and_obj) / len(base_hexs)
343 | if n > 1:
344 | n = int(n) + 1
345 | base_hexs *= n
346 |
347 | self.obj_id = tuple(id_and_obj.keys())
348 | self.hex = base_hexs[:len(self.obj_id)]
349 | self.id_and_hex = {k: v for k, v in zip(self.obj_id, self.hex)}
350 |
351 | def get_id_and_colors(self):
352 | id_and_colors = {k: self.hex2rgb(f'#{v}') for k, v in self.id_and_hex.items()}
353 | return id_and_colors
354 |
355 | def hex2rgb(self, h): # rgb order
356 | return tuple(int(h[1 + i:1 + i + 2], 16) for i in (0, 2, 4))
357 |
358 |
359 |
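# A short usage sketch (illustrative only; the three labels below are placeholders for the
# id -> name mapping normally loaded from labels_coco.yaml):
def _colors_example():
    catid_labels = {0: 'person', 1: 'bicycle', 2: 'car'}
    colors = Colors(catid_labels).get_id_and_colors()
    return colors                  # {0: (255, 56, 56), 1: (255, 157, 151), 2: (255, 112, 31)}
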
--------------------------------------------------------------------------------
/yolo_detect_v1.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | import torch
4 | import cv2 as cv
5 | import numpy as np
6 |
7 | from utils import trt_infer
8 | from utils.trt_infer import load_engine
9 | from utils.utils_detection import yaml_load, image_trans, scale_bboxes, non_max_suppression, Colors, draw_boxes, \
10 | non_max_suppression_torch
11 |
12 |
13 | class yolo_engine_det:
14 | def __init__(self, engine_dir, catid_labels):
15 | self.engine = load_engine(engine_dir)
16 | self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
17 | self.context = self.engine.create_execution_context()
18 | self.resize = self.engine.get_binding_shape(0)[2:]
19 | self.colors = self.get_colors_dict(catid_labels)
20 | self.labels = catid_labels
21 | self.v8_head = False
22 | self.nms = non_max_suppression
23 |
24 |         if self.engine.get_binding_shape(1)[-1] - len(catid_labels) == 4:  # v8-style head: 4 + num_classes, no objectness column
25 | self.v8_head = True
26 |
27 | # self.context.set_binding_shape(0, [1, 3, self.resize[0], self.resize[1]])
28 | self.inputs = None
29 | self.outputs = None
30 | self.bindings = None
31 | self.stream = None
32 |
33 | self.inputs, self.outputs, self.bindings, self.stream = trt_infer.allocate_buffers_v2(self.context)
34 |
35 | @staticmethod
36 | def get_colors_dict(catid_labels):
37 | color_dicts = Colors(catid_labels)
38 | return color_dicts.get_id_and_colors()
39 |
40 |
41 | def draw(self, frame, conf=0.25, iou=0.45, max_det=200):
42 | x = image_trans(frame, self.resize)
43 | np.copyto(self.inputs[0].host, x.ravel())
44 | # self.inputs[0].host = x.ravel()
45 | t1 = time.time()
46 | pred = trt_infer.do_inference_v2(
47 | self.context, bindings=self.bindings, inputs=self.inputs, outputs=self.outputs, stream=self.stream
48 | )
49 | pred = pred[0].reshape(self.context.get_binding_shape(1))
50 | pred = self.nms(pred, v8_head=self.v8_head, conf_thres=conf, iou_thres=iou, agnostic=False, max_det=max_det)[0]
51 | t2 = time.time()
52 |         fps = round(1.0 / (t2 - t1))
53 | times = round((t2 - t1) * 1000, 3)
54 | pred = scale_bboxes(pred, frame.shape[:2], self.resize)
55 | for i in pred:
56 | # pred: x1, y1, x2, y2, conf, labels
57 | frame = draw_boxes(frame, i[:4], i[4], i[5], self.labels, 0.7, self.colors)
58 | frame = cv.putText(frame, f'fps: {fps}', (10, 30), fontFace=cv.FONT_HERSHEY_SIMPLEX, fontScale=1, thickness=2,
59 | lineType=cv.LINE_AA, color=(255, 0, 255))
60 | return frame, times, pred
61 |
62 |
63 |
64 | def main(args):
65 | times = []
66 |     # detection class labels
67 | catid_labels = yaml_load(args.labels)
68 |     # video source
69 | vc = cv.VideoCapture(args.video_dir)
70 |     # load the TensorRT engine
71 | yolo_draw = yolo_engine_det(args.engine_dir, catid_labels)
72 |
73 |     # read each frame of the video in a loop
74 | while vc.isOpened():
75 | ret, frame = vc.read()
76 |
77 | if ret is True:
78 | frame, t, _ = yolo_draw.draw(frame, conf=args.conf_thres, iou=args.iou_thres, max_det=args.max_det)
79 | print(f'{t}ms')
80 | times.append(t)
81 | cv.imshow('video', frame)
82 |
83 | if cv.waitKey(30) & 0xFF == 27:
84 | break
85 | else:
86 | break
87 | print(np.mean(times))
88 | vc.release()
89 | cv.destroyAllWindows()
90 |
91 |
92 | if __name__ == "__main__":
93 | import argparse
94 |
95 | parser = argparse.ArgumentParser(description=__doc__)
96 |     # object class labels
97 | parser.add_argument('--labels', type=str, default='./labels_coco.yaml', help='obj labels')
98 |     # video path
99 | parser.add_argument('--video_dir', type=str, default='sample_1080p_h265.mp4',
100 | help='video path')
101 |     # engine model path
102 | parser.add_argument('--engine_dir', type=str, default='./models_trt/yolov5s.engine',
103 | help='engine path')
104 |     # only predicted boxes whose score exceeds the confidence threshold are kept
105 | parser.add_argument('--conf_thres', type=float, default=0.25, help='confidence threshold')
106 |     # IoU threshold used for non-maximum suppression
107 | parser.add_argument('--iou_thres', type=float, default=0.45, help='NMS IoU threshold')
108 |     # maximum number of detection boxes
109 | parser.add_argument('--max_det', type=int, default=200, help='maximum detections per image')
110 |
111 | args = parser.parse_args()
112 | print(args)
113 |
114 | main(args)
115 |
116 |
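# A hypothetical invocation (paths are placeholders; point them at your own engine and video):
#   python yolo_detect_v1.py --engine_dir ./models_trt/yolov5s.engine \
#       --video_dir sample_1080p_h265.mp4 --labels ./labels_coco.yaml \
#       --conf_thres 0.25 --iou_thres 0.45 --max_det 200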
--------------------------------------------------------------------------------
/yolo_detect_v2.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | import torch
4 | import cv2 as cv
5 | import numpy as np
6 |
7 | from utils import trt_infer
8 | from utils.trt_infer import load_engine
9 | from utils.utils_detection import yaml_load, image_trans, scale_bboxes, non_max_suppression, Colors, draw_boxes
10 |
11 |
12 | class yolo_engine_det:
13 | def __init__(self, engine_dir, catid_labels):
14 | self.engine = load_engine(engine_dir)
15 | self.context = self.engine.create_execution_context()
16 | self.resize = self.engine.get_binding_shape(0)[2:]
17 | self.colors = self.get_colors_dict(catid_labels)
18 | self.labels = catid_labels
19 |
20 | # self.context.set_binding_shape(0, [1, 3, self.resize[0], self.resize[1]])
21 | self.inputs = None
22 | self.outputs = None
23 | self.bindings = None
24 | self.stream = None
25 |
26 | self.inputs, self.outputs, self.bindings, self.stream = trt_infer.allocate_buffers_v2(self.context)
27 |
28 | @staticmethod
29 | def get_colors_dict(catid_labels):
30 | color_dicts = Colors(catid_labels)
31 | return color_dicts.get_id_and_colors()
32 |
33 |
34 | def draw(self, frame):
35 | x = image_trans(frame, self.resize)
36 | np.copyto(self.inputs[0].host, x.ravel())
37 | t1 = time.time()
38 | pred = trt_infer.do_inference_v2(
39 | self.context, bindings=self.bindings, inputs=self.inputs, outputs=self.outputs, stream=self.stream
40 | )
41 | t2 = time.time()
42 | fps = int(1.0 / (t2 - t1))
43 | times = round((t2 - t1) * 1000, 3)
44 | num_det, boxes, conf, labels = pred
45 | num_det = num_det[0]
46 | if num_det > 0:
47 | # conf = conf[:num_det]
48 | # labels = labels[:num_det]
49 | boxes = boxes[:num_det * 4].reshape(-1, 4)
50 | boxes = scale_bboxes(boxes, frame.shape[:2], self.resize)
51 | for i in range(num_det):
52 | frame = draw_boxes(frame, boxes[i], conf[i], labels[i], self.labels, 0.7, self.colors)
53 | frame = cv.putText(frame, f'fps: {fps}', (10, 30), fontFace=cv.FONT_HERSHEY_SIMPLEX, fontScale=1, thickness=2,
54 | lineType=cv.LINE_AA, color=(255, 0, 255))
55 | return frame, times
56 |
57 |
58 | def main(args):
59 | times = []
60 |     # detection class labels
61 | catid_labels = yaml_load(args.labels)
62 |     # video source
63 | vc = cv.VideoCapture(args.video_dir)
64 |     # load the TensorRT engine
65 | yolo_draw = yolo_engine_det(
66 | args.engine_dir, catid_labels
67 | )
68 |
69 |     # read each frame of the video in a loop
70 | while vc.isOpened():
71 | ret, frame = vc.read()
72 |
73 | if ret is True:
74 | frame, t = yolo_draw.draw(frame)
75 | print(f'{t}ms')
76 | times.append(t)
77 | cv.imshow('video', frame)
78 |
79 | if cv.waitKey(30) & 0xFF == 27:
80 | break
81 | else:
82 | break
83 | print(np.mean(times))
84 | vc.release()
85 | cv.destroyAllWindows()
86 |
87 |
88 | if __name__ == "__main__":
89 | import argparse
90 |
91 | parser = argparse.ArgumentParser(description=__doc__)
92 |     # object class labels
93 | parser.add_argument('--labels', type=str, default='./labels_coco.yaml', help='obj labels')
94 |     # video path
95 | parser.add_argument('--video_dir', type=str, default='sample_1080p_h265.mp4',
96 | help='video path')
97 |     # engine model path
98 | parser.add_argument('--engine_dir', type=str, default='./models_trt/yolov7_nms.engine',
99 | help='engine path')
100 |
101 |
102 | args = parser.parse_args()
103 | print(args)
104 |
105 | main(args)
106 |
107 |
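# Note (an assumption based on the four-way unpacking in draw() above and the default engine
# name yolov7_nms.engine): this script expects an engine exported with an NMS plugin such as
# EfficientNMS, whose outputs are [num_detections, boxes, scores, class_ids], so no Python-side
# NMS is performed here.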
--------------------------------------------------------------------------------
/yolox_detect.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | import torch
4 | import cv2 as cv
5 | import numpy as np
6 |
7 | from utils import trt_infer
8 | from utils.trt_infer import load_engine
9 | from utils.utils_detection import yaml_load, image_trans, scale_bboxes, non_max_suppression_torch, yolox_postprocess, \
10 | Colors, draw_boxes
11 |
12 |
13 | class yolox_engine_det:
14 | def __init__(self, engine_dir, catid_labels):
15 | self.engine = load_engine(engine_dir)
16 | self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
17 | self.context = self.engine.create_execution_context()
18 | self.resize = self.engine.get_binding_shape(0)[2:]
19 | self.colors = self.get_colors_dict(catid_labels)
20 | self.labels = catid_labels
21 | self.nms = non_max_suppression_torch
22 |
23 | # self.context.set_binding_shape(0, [1, 3, self.resize[0], self.resize[1]])
24 | self.inputs = None
25 | self.outputs = None
26 | self.bindings = None
27 | self.stream = None
28 |
29 | self.inputs, self.outputs, self.bindings, self.stream = trt_infer.allocate_buffers_v2(self.context)
30 |
31 | @staticmethod
32 | def get_colors_dict(catid_labels):
33 | color_dicts = Colors(catid_labels)
34 | return color_dicts.get_id_and_colors()
35 |
36 |
37 | def draw(self, frame, conf=0.25, iou=0.45, max_det=200):
38 | x = image_trans(frame, self.resize)
39 | np.copyto(self.inputs[0].host, x.ravel())
40 | t1 = time.time()
41 | pred = trt_infer.do_inference_v2(
42 | self.context, bindings=self.bindings, inputs=self.inputs, outputs=self.outputs, stream=self.stream
43 | )
44 | pred = pred[0].reshape(self.context.get_binding_shape(1))
45 | pred = yolox_postprocess(pred, self.resize, p6=False)
46 | pred = torch.from_numpy(pred).to(self.device)
47 | pred = self.nms(pred, False, conf_thres=conf, iou_thres=iou, agnostic=False, max_det=max_det)[0]
48 | t2 = time.time()
49 | fps = int(1.0 / (t2 - t1))
50 | pred = scale_bboxes(pred, frame.shape[:2], self.resize)
51 | pred = pred.cpu().numpy()
52 | for i in pred:
53 | # pred: x1, y1, x2, y2, conf, labels
54 | # bbox = tuple(i[:4].astype('int'))
55 | # frame = cv.rectangle(frame, bbox[:2], bbox[2:], thickness=2, lineType=cv.LINE_AA,
56 | # color=self.colors[i[-1]]
57 | # )
58 | # frame = cv.putText(frame, f'{self.labels[i[-1]]}:{i[-2]:.2f}', (bbox[0] + 5, bbox[1] + 30),
59 | # fontFace=cv.FONT_HERSHEY_DUPLEX, fontScale=1, thickness=1, lineType=cv.LINE_AA,
60 | # color = (210, 105, 30)
61 | # )
62 | frame = draw_boxes(frame, i[:4], i[4], i[5], self.labels, 0.7, self.colors)
63 | frame = cv.putText(frame, f'fps: {fps}', (10, 30), fontFace=cv.FONT_HERSHEY_SIMPLEX, fontScale=1, thickness=2,
64 | lineType=cv.LINE_AA, color=(255, 0, 255))
65 | return frame
66 |
67 |
68 | def main(args):
69 |     # detection class labels
70 | catid_labels = yaml_load(args.labels)['labels']
71 |     # video source
72 | vc = cv.VideoCapture(args.video_dir)
73 |     # load the TensorRT engine
74 | yolo_draw = yolox_engine_det(args.engine_dir, catid_labels)
75 |
76 |     # read each frame of the video in a loop
77 | while vc.isOpened():
78 | ret, frame = vc.read()
79 |
80 | if ret is True:
81 | frame = yolo_draw.draw(
82 | frame, conf=args.conf_thres, iou=args.iou_thres, max_det=args.max_det
83 | )
84 | cv.imshow('video', frame)
85 |
86 | if cv.waitKey(30) & 0xFF == 27:
87 | break
88 | else:
89 | break
90 |
91 | vc.release()
92 | cv.destroyAllWindows()
93 |
94 |
95 | if __name__ == "__main__":
96 | import argparse
97 |
98 | parser = argparse.ArgumentParser(description=__doc__)
99 |     # object class labels
100 | parser.add_argument('--labels', type=str, default='./labels_coco.yaml', help='obj labels')
101 |     # video path
102 | parser.add_argument('--video_dir', type=str, default='sample_1080p_h265.mp4',
103 | help='video path')
104 |     # engine model path
105 | parser.add_argument('--engine_dir', type=str, default='./models_trt/yolox_s.engine',
106 | help='engine path')
107 |     # only predicted boxes whose score exceeds the confidence threshold are kept
108 | parser.add_argument('--conf_thres', type=float, default=0.25, help='confidence threshold')
109 |     # IoU threshold used for non-maximum suppression
110 | parser.add_argument('--iou_thres', type=float, default=0.45, help='NMS IoU threshold')
111 |     # maximum number of detection boxes
112 | parser.add_argument('--max_det', type=int, default=200, help='maximum detections per image')
113 |
114 | args = parser.parse_args()
115 | print(args)
116 |
117 | main(args)
118 |
--------------------------------------------------------------------------------