├── .gitignore
├── CMakeLists.txt
├── LICENSE
├── README-en.md
├── README.md
├── assets
│   ├── bytetrack.png
│   └── effect.gif
├── bytetrack
│   ├── include
│   │   ├── BYTETracker.h
│   │   ├── STrack.h
│   │   ├── dataType.h
│   │   ├── kalmanFilter.h
│   │   ├── lapjv.h
│   │   └── logging.h
│   └── src
│       ├── BYTETracker.cpp
│       ├── STrack.cpp
│       ├── kalmanFilter.cpp
│       ├── lapjv.cpp
│       └── utils.cpp
├── main.cpp
├── tensorrtx-yolov8
│   ├── CMakeLists.txt
│   ├── README.md
│   ├── gen_wts.py
│   ├── images
│   │   ├── 10001.jpg
│   │   └── 10002.jpeg
│   ├── include
│   │   ├── block.h
│   │   ├── calibrator.h
│   │   ├── config.h
│   │   ├── cuda_utils.h
│   │   ├── logging.h
│   │   ├── macros.h
│   │   ├── model.h
│   │   ├── postprocess.h
│   │   ├── preprocess.h
│   │   ├── types.h
│   │   └── utils.h
│   ├── main.cpp
│   ├── plugin
│   │   ├── yololayer.cu
│   │   └── yololayer.h
│   ├── src
│   │   ├── block.cpp
│   │   ├── calibrator.cpp
│   │   ├── model.cpp
│   │   ├── postprocess.cpp
│   │   ├── postprocess.cu
│   │   └── preprocess.cu
│   └── yolov8_trt.py
├── videos
│   └── demo.mp4
└── yolo
    ├── CMakeLists.txt
    ├── README.md
    ├── include
    │   ├── config.h
    │   ├── macros.h
    │   ├── postprocess.h
    │   ├── preprocess.h
    │   ├── public.h
    │   ├── types.h
    │   ├── utils.h
    │   └── yolov8_lib.h
    ├── main.cpp
    ├── plugin
    │   ├── yololayer.cu
    │   └── yololayer.h
    └── src
        ├── postprocess.cpp
        ├── preprocess.cu
        └── yolov8_lib.cpp

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Prerequisites
*.d

# Compiled Object files
*.slo
*.lo
*.o
*.obj

# Precompiled Headers
*.gch
*.pch

# Compiled Dynamic libraries
*.so
*.dylib
*.dll

# Fortran module files
*.mod
*.smod

# Compiled Static libraries
*.lai
*.la
*.a
*.lib

# Executables
*.exe
*.out
*.app

--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
cmake_minimum_required(VERSION 3.10)

project(yolov8_bytetrack)

add_definitions(-std=c++11)
add_definitions(-DAPI_EXPORTS)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_BUILD_TYPE Release)

include_directories("/usr/include/eigen3")

# include and link dirs of cuda and tensorrt, you may need to adapt them if yours differ
# ============= cuda =============
find_package(CUDA REQUIRED)
include_directories(/usr/local/cuda/include)
link_directories(/usr/local/cuda/lib64)

# ============= tensorrt =============
if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
    message("Embed_platform on")
    include_directories(/usr/include/aarch64-linux-gnu)
    link_directories(/usr/lib/aarch64-linux-gnu)
else()
    message("Embed_platform off")
    include_directories(/usr/include/x86_64-linux-gnu)
    link_directories(/usr/lib/x86_64-linux-gnu)
endif()

# ============= opencv =============
find_package(OpenCV)
include_directories(${OpenCV_INCLUDE_DIRS})

# =========== bytetrack lib ===========
include_directories(${PROJECT_SOURCE_DIR}/bytetrack/include)
file(GLOB_RECURSE SRCS01 ${PROJECT_SOURCE_DIR}/bytetrack/src/*.cpp)
add_library(bytetrack SHARED ${SRCS01})
target_link_libraries(bytetrack cudart nvinfer ${OpenCV_LIBS})

# ============= yolov8 lib =============
include_directories(${PROJECT_SOURCE_DIR}/yolo/include)
include_directories(${PROJECT_SOURCE_DIR}/yolo/plugin)
file(GLOB_RECURSE SRCS02 ${PROJECT_SOURCE_DIR}/yolo/src/*.cpp ${PROJECT_SOURCE_DIR}/yolo/src/*.cu ${PROJECT_SOURCE_DIR}/yolo/plugin/*.cu)
cuda_add_library(yolo_infer SHARED ${SRCS02})
target_link_libraries(yolo_infer nvinfer cudart ${OpenCV_LIBS})

# ======== main execute file ========
add_executable(main ${PROJECT_SOURCE_DIR}/main.cpp)
target_link_libraries(main bytetrack yolo_infer)

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2023 emptysoal

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README-en.md:
--------------------------------------------------------------------------------
# TensorRT C++ API deployment of YOLOv8 + ByteTrack

- My other TensorRT project covering the YOLOv8 tasks: [YOLOv8 detection, key points, segmentation, tracking](https://github.com/emptysoal/TensorRT-YOLOv8)

## Introduction

- Based on `TensorRT-v8`, deploys `YOLOv8` + `ByteTrack`;

- Supports the `Jetson` series as well as `Linux x86_64` servers.

Main work I have done:

1. Referring to [tensorrtx](https://github.com/wang-xinyu/tensorrtx/tree/master/yolov8), convert the model `.pth` -> `.engine`, extract the inference part of the code, and encapsulate it into a C++ class that other projects can call easily (see the usage sketch after this list);
2. Replaced the preprocessing with my own CUDA preprocessing;
3. Removed the CUDA post-processing, because in my tests it was not significantly faster than CPU post-processing;
4. Greatly reduced the `conf_thres` hyperparameter of the post-processing NMS; this follows from the principle of `ByteTrack` tracking and is very important;
5. Compiled the `YOLOv8` inference into a dynamic link library to decouple the projects;
6. Referring to the official [ByteTrack TensorRT deploy](https://github.com/ifzhang/ByteTrack/tree/main/deploy/TensorRT/cpp), modified its interface to the `YOLO` detector;
7. Also compiled `ByteTrack` into a dynamic link library, decoupling the projects further;
8. Added category filtering: you can set the categories you want to track at line 8 of `main.cpp`.
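Roughly, `main.cpp` wires the two libraries together as sketched below. The detector-side names (`YoloDetecter`, `DetectResult`, shown commented out) are placeholders, not necessarily the real API exported by `yolo/include/yolov8_lib.h`; the tracker-side types (`BYTETracker`, `Object`, `STrack`) are the real ones from `bytetrack/include/BYTETracker.h`:

```cpp
#include <algorithm>
#include <opencv2/opencv.hpp>
#include "BYTETracker.h"

int main()
{
    std::vector<int> trackClasses{0};  // e.g. only track class 0 (person)

    cv::VideoCapture cap("../videos/demo.mp4");
    BYTETracker tracker(30, 30);  // frame rate, track buffer

    cv::Mat frame;
    while (cap.read(frame))
    {
        // 1. Run the TensorRT detector (hypothetical API):
        // std::vector<DetectResult> dets = detecter.inference(frame);

        // 2. Keep only the classes of interest and convert to ByteTrack's input type:
        std::vector<Object> objects;
        // for (auto& det : dets)
        //     if (std::count(trackClasses.begin(), trackClasses.end(), det.classId))
        //         objects.push_back(Object{det.rect, det.classId, det.conf});

        // 3. Update the tracker and draw the tracks it returns:
        std::vector<STrack> tracks = tracker.update(objects);
        for (auto& t : tracks)
        {
            cv::Scalar color = tracker.get_color(t.track_id);
            cv::rectangle(frame, cv::Rect(t.tlwh[0], t.tlwh[1], t.tlwh[2], t.tlwh[3]), color, 2);
            cv::putText(frame, std::to_string(t.track_id), cv::Point(t.tlwh[0], t.tlwh[1] - 5),
                        0, 0.6, color, 2);
        }
    }
    return 0;
}
```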
## Effect

![](./assets/effect.gif)

## Environment

1. Base requirements:

   - `TensorRT 8.0+`
   - `OpenCV 3.4.0+`

2. My running environment on `Jetson Nano` is as follows:

   - The flashed system image is `Jetpack 4.6.1`, whose stock environment is:

     | CUDA | cuDNN | TensorRT | OpenCV |
     | ---- | ----- | -------- | ------ |
     | 10.2 | 8.2   | 8.2.1    | 4.1.1  |

   - Install Eigen:

   ```bash
   apt install libeigen3-dev
   ```

## Model conversion

Goal: get the serialized TensorRT engine file, with suffix `.engine`.

- First get the model file in `wts` format, link: [yolov8s.wts](https://pan.baidu.com/s/16d_MqVlUxnjOhLxVyjQy8w), extraction code: gsqm
- Then follow these steps:

```bash
cd {TensorRT-YOLOv8-ByteTrack}/tensorrtx-yolov8/
mkdir build
cd build
cp {path/to/yolov8s.wts} .
cmake ..
make
./yolov8 -s yolov8s.wts yolov8s.engine s

cd ../../
mkdir yolo/engine
cp tensorrtx-yolov8/build/yolov8s.engine yolo/engine
```

## Run tracking

- Follow these steps:

```bash
mkdir build
cd build
cmake ..
make
./main ../videos/demo.mp4  # the path to your own video
```

Afterwards `result.mp4`, a video file showing the tracking result, will be in the `build` directory.

If you want the tracked video to play in real time, you can uncomment line 94 of `main.cpp`.

## Reference

- [tensorrtx](https://github.com/wang-xinyu/tensorrtx/tree/master/yolov8)

- [ByteTrack](https://github.com/ifzhang/ByteTrack)

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Deploying YOLOv8 + ByteTrack with the TensorRT C++ API

- My other project deploying the various YOLOv8 tasks with TensorRT: [YOLOv8 detection, key points, segmentation, tracking](https://github.com/emptysoal/TensorRT-YOLOv8)

## 1. Introduction

- Object tracking with `YOLOv8` + `ByteTrack`, deployed on top of `TensorRT-v8`;

- Can be deployed on `Jetson`-series embedded devices, or on `Linux x86_64` servers.

My main work:

1. Referring to the [tensorrtx](https://github.com/wang-xinyu/tensorrtx/tree/master/yolov8) project, convert the model `.pth` -> `.engine`, extract the **inference part of the code and encapsulate it as a C++ class** that other projects can call easily;
2. Replaced the preprocessing with my own CUDA preprocessing;
3. Dropped the CUDA post-processing, since tests showed it is not significantly faster than CPU post-processing;
4. **Greatly reduced the `conf_thres` hyperparameter** of the post-processing NMS; this follows from the principle behind `ByteTrack` and is **very important** (see the excerpt after this list);
5. Compiled the `YOLOv8` inference into a dynamic link library to decouple the projects;
6. Referring to the official [ByteTrack TensorRT deployment](https://github.com/ifzhang/ByteTrack/tree/main/deploy/TensorRT/cpp), modified its interface to the `YOLO` detector;
7. Also compiled `ByteTrack` into a dynamic link library, decoupling the projects further;
8. Added category filtering; you can set the categories you want to track at line 8 of `main.cpp`.
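Why item 4 matters: the detector's NMS confidence threshold must stay low, because `BYTETracker::update()` itself separates high- and low-score detections, and the low-score ones are exactly what the second association stage consumes. The relevant excerpt from `bytetrack/src/BYTETracker.cpp`:

```cpp
float score = objects[i].prob;

STrack strack(STrack::tlbr_to_tlwh(tlbr_), score);
if (score >= track_thresh)            // 0.5 by default: used in the first association
    detections.push_back(strack);
else
    detections_low.push_back(strack); // kept for the second association
```

If the detector already discards everything below, say, 0.5, `detections_low` stays empty and ByteTrack degenerates into a single-stage matcher.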
## 2. Effect

![](./assets/effect.gif)

## 3. Environment

1. Base requirements:

   - `TensorRT 8.0+`
   - `OpenCV 3.4.0+`

2. My running environment on `Jetson Nano`:

   - The flashed system image is `Jetpack 4.6.1`, whose stock environment is:

     | CUDA | cuDNN | TensorRT | OpenCV |
     | ---- | ----- | -------- | ------ |
     | 10.2 | 8.2   | 8.2.1    | 4.1.1  |

   There is plenty of material online about how to flash the image onto a `Jetson Nano`, so I will not repeat it here; just note that when downloading the `Jetpack` image you should choose version 4.6.1, which ships with TensorRT v8.

   - Install the `Eigen` library:

   ```bash
   apt install libeigen3-dev
   ```

3. On a server, make sure the base environment versions above are satisfied, then simply install the `Eigen` library.

Tip: whatever the device, remember to verify the library paths in the `CMakeLists.txt` files.

## 4. Model conversion

Goal: obtain the serialized `TensorRT` file, with suffix `.engine`.

- First get the model file in `wts` format, link: [yolov8s.wts](https://pan.baidu.com/s/16d_MqVlUxnjOhLxVyjQy8w), extraction code: gsqm

- Then execute the following steps:

```bash
cd {TensorRT-YOLOv8-ByteTrack}/tensorrtx-yolov8/
mkdir build
cd build
cp {path/to/yolov8s.wts} .
cmake ..
make
./yolov8 -s yolov8s.wts yolov8s.engine s

cd ../../
mkdir yolo/engine
cp tensorrtx-yolov8/build/yolov8s.engine yolo/engine
```

## 5. Running the project

- Compile and run the object-tracking code
- Proceed as follows:

```bash
mkdir build
cd build
cmake ..
make
./main ../videos/demo.mp4  # pass the path of your own video
```

Afterwards `result.mp4`, the video file with the tracking result, is produced in the `build` directory.

If you want the tracked video to play in real time, uncomment line 94 of `main.cpp`.

## 6. References

Mainly referenced the following projects:

- [tensorrtx](https://github.com/wang-xinyu/tensorrtx/tree/master/yolov8)

- [ByteTrack](https://github.com/ifzhang/ByteTrack)

--------------------------------------------------------------------------------
/assets/bytetrack.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/emptysoal/TensorRT-YOLOv8-ByteTrack/cec012e0672dd3d1f79a8dc29875be9c56feedb5/assets/bytetrack.png

--------------------------------------------------------------------------------
/assets/effect.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/emptysoal/TensorRT-YOLOv8-ByteTrack/cec012e0672dd3d1f79a8dc29875be9c56feedb5/assets/effect.gif

--------------------------------------------------------------------------------
/bytetrack/include/BYTETracker.h:
--------------------------------------------------------------------------------
#pragma once

#include "STrack.h"

struct Object
{
    cv::Rect_<float> rect;
    int label;
    float prob;
};

class BYTETracker
{
public:
    BYTETracker(int frame_rate = 30, int track_buffer = 30);
    ~BYTETracker();

    vector<STrack> update(const vector<Object>& objects);
    Scalar get_color(int idx);

private:
    vector<STrack*> joint_stracks(vector<STrack*> &tlista, vector<STrack> &tlistb);
    vector<STrack> joint_stracks(vector<STrack> &tlista, vector<STrack> &tlistb);

    vector<STrack> sub_stracks(vector<STrack> &tlista, vector<STrack> &tlistb);
    void remove_duplicate_stracks(vector<STrack> &resa, vector<STrack> &resb, vector<STrack> &stracksa, vector<STrack> &stracksb);

    void linear_assignment(vector<vector<float> > &cost_matrix, int cost_matrix_size, int cost_matrix_size_size, float thresh,
        vector<vector<int> > &matches, vector<int> &unmatched_a, vector<int> &unmatched_b);
    vector<vector<float> > iou_distance(vector<STrack*> &atracks, vector<STrack> &btracks, int &dist_size, int &dist_size_size);
    vector<vector<float> > iou_distance(vector<STrack> &atracks, vector<STrack> &btracks);
    vector<vector<float> > ious(vector<vector<float> > &atlbrs, vector<vector<float> > &btlbrs);

    double lapjv(const vector<vector<float> > &cost, vector<int> &rowsol, vector<int> &colsol,
        bool extend_cost = false, float cost_limit = LONG_MAX, bool return_cost = true);

private:

    float track_thresh;
    float high_thresh;
    float match_thresh;
    int frame_id;
    int max_time_lost;

    vector<STrack> tracked_stracks;
    vector<STrack> lost_stracks;
    vector<STrack> removed_stracks;
    byte_kalman::KalmanFilter kalman_filter;
};

--------------------------------------------------------------------------------
/bytetrack/include/STrack.h:
--------------------------------------------------------------------------------
#pragma once

#include <opencv2/opencv.hpp>
#include "kalmanFilter.h"

using namespace cv;
using namespace std;

enum TrackState { New = 0, Tracked, Lost, Removed };

class STrack
{
public:
    STrack(vector<float> tlwh_, float score);
    ~STrack();

    vector<float> static tlbr_to_tlwh(vector<float> &tlbr);
    void static multi_predict(vector<STrack*> &stracks, byte_kalman::KalmanFilter &kalman_filter);
    void static_tlwh();
    void static_tlbr();
    vector<float> tlwh_to_xyah(vector<float> tlwh_tmp);
    vector<float> to_xyah();
    void mark_lost();
    void mark_removed();
    int next_id();
    int end_frame();

    void activate(byte_kalman::KalmanFilter &kalman_filter, int frame_id);
    void re_activate(STrack &new_track, int frame_id, bool new_id = false);
    void update(STrack &new_track, int frame_id);

public:
    bool is_activated;
    int track_id;
    int state;

    vector<float> _tlwh;
    vector<float> tlwh;
    vector<float> tlbr;
    int frame_id;
    int tracklet_len;
    int start_frame;

    KAL_MEAN mean;
    KAL_COVA covariance;
    float score;

private:
    byte_kalman::KalmanFilter kalman_filter;
};
--------------------------------------------------------------------------------
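`STrack` keeps the same box in three layouts: `tlwh` (top-left corner plus size), `tlbr` (two corners) and `xyah` (center, aspect ratio, height), the last being what the Kalman filter tracks. A worked example of the conversions implemented in `bytetrack/src/STrack.cpp`:

```cpp
// One box, three layouts (numbers chosen for illustration):
std::vector<float> tlwh{200.f, 100.f, 100.f, 50.f};  // top-left x, y, width, height
// STrack::static_tlbr:  tlbr = {x1, y1, x1 + w, y1 + h}       = {200, 100, 300, 150}
// STrack::tlwh_to_xyah: xyah = {x1 + w/2, y1 + h/2, w/h, h}   = {250, 125, 2.0, 50}
```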
/bytetrack/include/dataType.h:
--------------------------------------------------------------------------------
#pragma once

#include <cstddef>
#include <vector>

#include <Eigen/Core>
#include <Eigen/Dense>
typedef Eigen::Matrix<float, 1, 4, Eigen::RowMajor> DETECTBOX;
typedef Eigen::Matrix<float, -1, 4, Eigen::RowMajor> DETECTBOXSS;
typedef Eigen::Matrix<float, 1, 128, Eigen::RowMajor> FEATURE;
typedef Eigen::Matrix<float, Eigen::Dynamic, 128, Eigen::RowMajor> FEATURESS;
//typedef std::vector<FEATURE> FEATURESS;

//Kalmanfilter
//typedef Eigen::Matrix<float, 8, 8, Eigen::RowMajor> KAL_FILTER;
typedef Eigen::Matrix<float, 1, 8, Eigen::RowMajor> KAL_MEAN;
typedef Eigen::Matrix<float, 8, 8, Eigen::RowMajor> KAL_COVA;
typedef Eigen::Matrix<float, 1, 4, Eigen::RowMajor> KAL_HMEAN;
typedef Eigen::Matrix<float, 4, 4, Eigen::RowMajor> KAL_HCOVA;
using KAL_DATA = std::pair<KAL_MEAN, KAL_COVA>;
using KAL_HDATA = std::pair<KAL_HMEAN, KAL_HCOVA>;

//main
using RESULT_DATA = std::pair<int, DETECTBOX>;

//tracker:
using TRACKER_DATA = std::pair<int, FEATURESS>;
using MATCH_DATA = std::pair<int, int>;
typedef struct t {
    std::vector<MATCH_DATA> matches;
    std::vector<int> unmatched_tracks;
    std::vector<int> unmatched_detections;
} TRACHER_MATCHD;

//linear_assignment:
typedef Eigen::Matrix<float, -1, -1, Eigen::RowMajor> DYNAMICM;
--------------------------------------------------------------------------------
/bytetrack/include/kalmanFilter.h:
--------------------------------------------------------------------------------
#pragma once

#include "dataType.h"

namespace byte_kalman
{
    class KalmanFilter
    {
    public:
        static const double chi2inv95[10];
        KalmanFilter();
        KAL_DATA initiate(const DETECTBOX& measurement);
        void predict(KAL_MEAN& mean, KAL_COVA& covariance);
        KAL_HDATA project(const KAL_MEAN& mean, const KAL_COVA& covariance);
        KAL_DATA update(const KAL_MEAN& mean,
            const KAL_COVA& covariance,
            const DETECTBOX& measurement);

        Eigen::Matrix<float, 1, -1> gating_distance(
            const KAL_MEAN& mean,
            const KAL_COVA& covariance,
            const std::vector<DETECTBOX>& measurements,
            bool only_position = false);

    private:
        Eigen::Matrix<float, 8, 8, Eigen::RowMajor> _motion_mat;
        Eigen::Matrix<float, 4, 8, Eigen::RowMajor> _update_mat;
        float _std_weight_position;
        float _std_weight_velocity;
    };
}
--------------------------------------------------------------------------------
/bytetrack/include/lapjv.h:
--------------------------------------------------------------------------------
#ifndef LAPJV_H
#define LAPJV_H

#define LARGE 1000000

#if !defined TRUE
#define TRUE 1
#endif
#if !defined FALSE
#define FALSE 0
#endif

#define NEW(x, t, n) if ((x = (t *)malloc(sizeof(t) * (n))) == 0) { return -1; }
#define FREE(x) if (x != 0) { free(x); x = 0; }
#define SWAP_INDICES(a, b) { int_t _temp_index = a; a = b; b = _temp_index; }

#if 0
#include <assert.h>
#define ASSERT(cond) assert(cond) 20 | #define PRINTF(fmt, ...) printf(fmt, ##__VA_ARGS__) 21 | #define PRINT_COST_ARRAY(a, n) \ 22 | while (1) { \ 23 | printf(#a" = ["); \ 24 | if ((n) > 0) { \ 25 | printf("%f", (a)[0]); \ 26 | for (uint_t j = 1; j < n; j++) { \ 27 | printf(", %f", (a)[j]); \ 28 | } \ 29 | } \ 30 | printf("]\n"); \ 31 | break; \ 32 | } 33 | #define PRINT_INDEX_ARRAY(a, n) \ 34 | while (1) { \ 35 | printf(#a" = ["); \ 36 | if ((n) > 0) { \ 37 | printf("%d", (a)[0]); \ 38 | for (uint_t j = 1; j < n; j++) { \ 39 | printf(", %d", (a)[j]); \ 40 | } \ 41 | } \ 42 | printf("]\n"); \ 43 | break; \ 44 | } 45 | #else 46 | #define ASSERT(cond) 47 | #define PRINTF(fmt, ...) 48 | #define PRINT_COST_ARRAY(a, n) 49 | #define PRINT_INDEX_ARRAY(a, n) 50 | #endif 51 | 52 | 53 | typedef signed int int_t; 54 | typedef unsigned int uint_t; 55 | typedef double cost_t; 56 | typedef char boolean; 57 | typedef enum fp_t { FP_1 = 1, FP_2 = 2, FP_DYNAMIC = 3 } fp_t; 58 | 59 | extern int_t lapjv_internal( 60 | const uint_t n, cost_t *cost[], 61 | int_t *x, int_t *y); 62 | 63 | #endif // LAPJV_H -------------------------------------------------------------------------------- /bytetrack/include/logging.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
*/

#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H

#include "NvInferRuntimeCommon.h"
#include <cassert>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>

using Severity = nvinfer1::ILogger::Severity;

class LogStreamConsumerBuffer : public std::stringbuf
{
public:
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream)
        , mPrefix(prefix)
        , mShouldLog(shouldLog)
    {
    }

    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput)
    {
    }

    ~LogStreamConsumerBuffer()
    {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the pointer to the beginning is not equal to the pointer to the current position,
        // call putOutput() to log the output to the stream
        if (pbase() != pptr())
        {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    virtual int sync()
    {
        putOutput();
        return 0;
    }

    void putOutput()
    {
        if (mShouldLog)
        {
            // prepend timestamp
            std::time_t timestamp = std::time(nullptr);
            tm* tm_local = std::localtime(&timestamp);
            std::cout << "[";
            std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/";
            std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] ";
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the buffer contents pre-appended by the appropriate prefix into the stream
            mOutput << mPrefix << str();
            // set the buffer to empty
            str("");
            // flush the stream
            mOutput.flush();
        }
    }

    void setShouldLog(bool shouldLog)
    {
        mShouldLog = shouldLog;
    }

private:
    std::ostream& mOutput;
    std::string mPrefix;
    bool mShouldLog;
};

//!
//! \class LogStreamConsumerBase
//! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer
//!
class LogStreamConsumerBase
{
public:
    LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mBuffer(stream, prefix, shouldLog)
    {
    }

protected:
    LogStreamConsumerBuffer mBuffer;
};

//!
//! \class LogStreamConsumer
//! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages.
//! Order of base classes is LogStreamConsumerBase and then std::ostream.
//! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field
//! in LogStreamConsumer and then the address of the buffer is passed to std::ostream.
//!
This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream. 125 | //! Please do not change the order of the parent classes. 126 | //! 127 | class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream 128 | { 129 | public: 130 | //! \brief Creates a LogStreamConsumer which logs messages with level severity. 131 | //! Reportable severity determines if the messages are severe enough to be logged. 132 | LogStreamConsumer(Severity reportableSeverity, Severity severity) 133 | : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity) 134 | , std::ostream(&mBuffer) // links the stream buffer with the stream 135 | , mShouldLog(severity <= reportableSeverity) 136 | , mSeverity(severity) 137 | { 138 | } 139 | 140 | LogStreamConsumer(LogStreamConsumer&& other) 141 | : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog) 142 | , std::ostream(&mBuffer) // links the stream buffer with the stream 143 | , mShouldLog(other.mShouldLog) 144 | , mSeverity(other.mSeverity) 145 | { 146 | } 147 | 148 | void setReportableSeverity(Severity reportableSeverity) 149 | { 150 | mShouldLog = mSeverity <= reportableSeverity; 151 | mBuffer.setShouldLog(mShouldLog); 152 | } 153 | 154 | private: 155 | static std::ostream& severityOstream(Severity severity) 156 | { 157 | return severity >= Severity::kINFO ? std::cout : std::cerr; 158 | } 159 | 160 | static std::string severityPrefix(Severity severity) 161 | { 162 | switch (severity) 163 | { 164 | case Severity::kINTERNAL_ERROR: return "[F] "; 165 | case Severity::kERROR: return "[E] "; 166 | case Severity::kWARNING: return "[W] "; 167 | case Severity::kINFO: return "[I] "; 168 | case Severity::kVERBOSE: return "[V] "; 169 | default: assert(0); return ""; 170 | } 171 | } 172 | 173 | bool mShouldLog; 174 | Severity mSeverity; 175 | }; 176 | 177 | //! \class Logger 178 | //! 179 | //! \brief Class which manages logging of TensorRT tools and samples 180 | //! 181 | //! \details This class provides a common interface for TensorRT tools and samples to log information to the console, 182 | //! and supports logging two types of messages: 183 | //! 184 | //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) 185 | //! - Test pass/fail messages 186 | //! 187 | //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is 188 | //! that the logic for controlling the verbosity and formatting of sample output is centralized in one location. 189 | //! 190 | //! In the future, this class could be extended to support dumping test results to a file in some standard format 191 | //! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run). 192 | //! 193 | //! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger 194 | //! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT 195 | //! library and messages coming from the sample. 196 | //! 197 | //! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the 198 | //! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger 199 | //! object. 
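//!
//! Example usage (a sketch; createInferBuilder is the standard TensorRT factory):
//!     Logger gLogger;
//!     auto builder = nvinfer1::createInferBuilder(gLogger.getTRTLogger());
//!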
200 | 201 | class Logger : public nvinfer1::ILogger 202 | { 203 | public: 204 | Logger(Severity severity = Severity::kWARNING) 205 | : mReportableSeverity(severity) 206 | { 207 | } 208 | 209 | //! 210 | //! \enum TestResult 211 | //! \brief Represents the state of a given test 212 | //! 213 | enum class TestResult 214 | { 215 | kRUNNING, //!< The test is running 216 | kPASSED, //!< The test passed 217 | kFAILED, //!< The test failed 218 | kWAIVED //!< The test was waived 219 | }; 220 | 221 | //! 222 | //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger 223 | //! \return The nvinfer1::ILogger associated with this Logger 224 | //! 225 | //! TODO Once all samples are updated to use this method to register the logger with TensorRT, 226 | //! we can eliminate the inheritance of Logger from ILogger 227 | //! 228 | nvinfer1::ILogger& getTRTLogger() 229 | { 230 | return *this; 231 | } 232 | 233 | //! 234 | //! \brief Implementation of the nvinfer1::ILogger::log() virtual method 235 | //! 236 | //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the 237 | //! inheritance from nvinfer1::ILogger 238 | //! 239 | void log(Severity severity, const char* msg) noexcept override 240 | { 241 | LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl; 242 | } 243 | 244 | //! 245 | //! \brief Method for controlling the verbosity of logging output 246 | //! 247 | //! \param severity The logger will only emit messages that have severity of this level or higher. 248 | //! 249 | void setReportableSeverity(Severity severity) 250 | { 251 | mReportableSeverity = severity; 252 | } 253 | 254 | //! 255 | //! \brief Opaque handle that holds logging information for a particular test 256 | //! 257 | //! This object is an opaque handle to information used by the Logger to print test results. 258 | //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used 259 | //! with Logger::reportTest{Start,End}(). 260 | //! 261 | class TestAtom 262 | { 263 | public: 264 | TestAtom(TestAtom&&) = default; 265 | 266 | private: 267 | friend class Logger; 268 | 269 | TestAtom(bool started, const std::string& name, const std::string& cmdline) 270 | : mStarted(started) 271 | , mName(name) 272 | , mCmdline(cmdline) 273 | { 274 | } 275 | 276 | bool mStarted; 277 | std::string mName; 278 | std::string mCmdline; 279 | }; 280 | 281 | //! 282 | //! \brief Define a test for logging 283 | //! 284 | //! \param[in] name The name of the test. This should be a string starting with 285 | //! "TensorRT" and containing dot-separated strings containing 286 | //! the characters [A-Za-z0-9_]. 287 | //! For example, "TensorRT.sample_googlenet" 288 | //! \param[in] cmdline The command line used to reproduce the test 289 | // 290 | //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). 291 | //! 292 | static TestAtom defineTest(const std::string& name, const std::string& cmdline) 293 | { 294 | return TestAtom(false, name, cmdline); 295 | } 296 | 297 | //! 298 | //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments 299 | //! as input 300 | //! 301 | //! \param[in] name The name of the test 302 | //! \param[in] argc The number of command-line arguments 303 | //! \param[in] argv The array of command-line arguments (given as C strings) 304 | //! 305 | //! 
\return a TestAtom that can be used in Logger::reportTest{Start,End}(). 306 | static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) 307 | { 308 | auto cmdline = genCmdlineString(argc, argv); 309 | return defineTest(name, cmdline); 310 | } 311 | 312 | //! 313 | //! \brief Report that a test has started. 314 | //! 315 | //! \pre reportTestStart() has not been called yet for the given testAtom 316 | //! 317 | //! \param[in] testAtom The handle to the test that has started 318 | //! 319 | static void reportTestStart(TestAtom& testAtom) 320 | { 321 | reportTestResult(testAtom, TestResult::kRUNNING); 322 | assert(!testAtom.mStarted); 323 | testAtom.mStarted = true; 324 | } 325 | 326 | //! 327 | //! \brief Report that a test has ended. 328 | //! 329 | //! \pre reportTestStart() has been called for the given testAtom 330 | //! 331 | //! \param[in] testAtom The handle to the test that has ended 332 | //! \param[in] result The result of the test. Should be one of TestResult::kPASSED, 333 | //! TestResult::kFAILED, TestResult::kWAIVED 334 | //! 335 | static void reportTestEnd(const TestAtom& testAtom, TestResult result) 336 | { 337 | assert(result != TestResult::kRUNNING); 338 | assert(testAtom.mStarted); 339 | reportTestResult(testAtom, result); 340 | } 341 | 342 | static int reportPass(const TestAtom& testAtom) 343 | { 344 | reportTestEnd(testAtom, TestResult::kPASSED); 345 | return EXIT_SUCCESS; 346 | } 347 | 348 | static int reportFail(const TestAtom& testAtom) 349 | { 350 | reportTestEnd(testAtom, TestResult::kFAILED); 351 | return EXIT_FAILURE; 352 | } 353 | 354 | static int reportWaive(const TestAtom& testAtom) 355 | { 356 | reportTestEnd(testAtom, TestResult::kWAIVED); 357 | return EXIT_SUCCESS; 358 | } 359 | 360 | static int reportTest(const TestAtom& testAtom, bool pass) 361 | { 362 | return pass ? reportPass(testAtom) : reportFail(testAtom); 363 | } 364 | 365 | Severity getReportableSeverity() const 366 | { 367 | return mReportableSeverity; 368 | } 369 | 370 | private: 371 | //! 372 | //! \brief returns an appropriate string for prefixing a log message with the given severity 373 | //! 374 | static const char* severityPrefix(Severity severity) 375 | { 376 | switch (severity) 377 | { 378 | case Severity::kINTERNAL_ERROR: return "[F] "; 379 | case Severity::kERROR: return "[E] "; 380 | case Severity::kWARNING: return "[W] "; 381 | case Severity::kINFO: return "[I] "; 382 | case Severity::kVERBOSE: return "[V] "; 383 | default: assert(0); return ""; 384 | } 385 | } 386 | 387 | //! 388 | //! \brief returns an appropriate string for prefixing a test result message with the given result 389 | //! 390 | static const char* testResultString(TestResult result) 391 | { 392 | switch (result) 393 | { 394 | case TestResult::kRUNNING: return "RUNNING"; 395 | case TestResult::kPASSED: return "PASSED"; 396 | case TestResult::kFAILED: return "FAILED"; 397 | case TestResult::kWAIVED: return "WAIVED"; 398 | default: assert(0); return ""; 399 | } 400 | } 401 | 402 | //! 403 | //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity 404 | //! 405 | static std::ostream& severityOstream(Severity severity) 406 | { 407 | return severity >= Severity::kINFO ? std::cout : std::cerr; 408 | } 409 | 410 | //! 411 | //! \brief method that implements logging test results 412 | //! 
    static void reportTestResult(const TestAtom& testAtom, TestResult result)
    {
        severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
                                         << testAtom.mCmdline << std::endl;
    }

    //!
    //! \brief generate a command line string from the given (argc, argv) values
    //!
    static std::string genCmdlineString(int argc, char const* const* argv)
    {
        std::stringstream ss;
        for (int i = 0; i < argc; i++)
        {
            if (i > 0)
                ss << " ";
            ss << argv[i];
        }
        return ss.str();
    }

    Severity mReportableSeverity;
};

namespace
{

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE
//!
//! Example usage:
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger)
{
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO
//!
//! Example usage:
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger)
{
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING
//!
//! Example usage:
//!
//!     LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger)
{
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR
//!
//! Example usage:
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger)
{
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR
//        ("fatal" severity)
//!
//! Example usage:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger)
{
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR);
}

} // anonymous namespace

#endif // TENSORRT_LOGGING_H

--------------------------------------------------------------------------------
/bytetrack/src/BYTETracker.cpp:
--------------------------------------------------------------------------------
#include "BYTETracker.h"
#include <fstream>

BYTETracker::BYTETracker(int frame_rate, int track_buffer)
{
    track_thresh = 0.5;
    high_thresh = 0.6;
    match_thresh = 0.8;

    frame_id = 0;
    max_time_lost = int(frame_rate / 30.0 * track_buffer);
    cout << "Init ByteTrack!" << endl;
}

BYTETracker::~BYTETracker()
{
}
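// Worked example of the constructor's arithmetic: with the defaults frame_rate = 30
// and track_buffer = 30, max_time_lost = int(30 / 30.0 * 30) = 30 frames, i.e. a lost
// track survives roughly one second of video before Step 5 of update() removes it.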
vector<STrack> BYTETracker::update(const vector<Object>& objects)
{

    ////////////////// Step 1: Get detections //////////////////
    this->frame_id++;
    vector<STrack> activated_stracks;
    vector<STrack> refind_stracks;
    vector<STrack> removed_stracks;
    vector<STrack> lost_stracks;
    vector<STrack> detections;
    vector<STrack> detections_low;

    vector<STrack> detections_cp;
    vector<STrack> tracked_stracks_swap;
    vector<STrack> resa, resb;
    vector<STrack> output_stracks;

    vector<STrack*> unconfirmed;
    vector<STrack*> tracked_stracks;
    vector<STrack*> strack_pool;
    vector<STrack*> r_tracked_stracks;

    if (objects.size() > 0)
    {
        for (int i = 0; i < objects.size(); i++)
        {
            vector<float> tlbr_;
            tlbr_.resize(4);
            tlbr_[0] = objects[i].rect.x;
            tlbr_[1] = objects[i].rect.y;
            tlbr_[2] = objects[i].rect.x + objects[i].rect.width;
            tlbr_[3] = objects[i].rect.y + objects[i].rect.height;

            float score = objects[i].prob;

            STrack strack(STrack::tlbr_to_tlwh(tlbr_), score);
            if (score >= track_thresh)
            {
                detections.push_back(strack);
            }
            else
            {
                detections_low.push_back(strack);
            }

        }
    }

    // Add newly detected tracklets to tracked_stracks
    for (int i = 0; i < this->tracked_stracks.size(); i++)
    {
        if (!this->tracked_stracks[i].is_activated)
            unconfirmed.push_back(&this->tracked_stracks[i]);
        else
            tracked_stracks.push_back(&this->tracked_stracks[i]);
    }

    ////////////////// Step 2: First association, with IoU //////////////////
    strack_pool = joint_stracks(tracked_stracks, this->lost_stracks);
    STrack::multi_predict(strack_pool, this->kalman_filter);

    vector<vector<float> > dists;
    int dist_size = 0, dist_size_size = 0;
    dists = iou_distance(strack_pool, detections, dist_size, dist_size_size);

    vector<vector<int> > matches;
    vector<int> u_track, u_detection;
    linear_assignment(dists, dist_size, dist_size_size, match_thresh, matches, u_track, u_detection);

    for (int i = 0; i < matches.size(); i++)
    {
        STrack *track = strack_pool[matches[i][0]];
        STrack *det = &detections[matches[i][1]];
        if (track->state == TrackState::Tracked)
        {
            track->update(*det, this->frame_id);
            activated_stracks.push_back(*track);
        }
        else
        {
            track->re_activate(*det, this->frame_id, false);
            refind_stracks.push_back(*track);
        }
    }

    ////////////////// Step 3: Second association, using low score dets //////////////////
    for (int i = 0; i < u_detection.size(); i++)
    {
        detections_cp.push_back(detections[u_detection[i]]);
    }
    detections.clear();
    detections.assign(detections_low.begin(), detections_low.end());

    for (int i = 0; i < u_track.size(); i++)
    {
        if (strack_pool[u_track[i]]->state == TrackState::Tracked)
        {
            r_tracked_stracks.push_back(strack_pool[u_track[i]]);
        }
    }

    dists.clear();
    dists = iou_distance(r_tracked_stracks, detections, dist_size, dist_size_size);

    matches.clear();
    u_track.clear();
    u_detection.clear();
    linear_assignment(dists, dist_size, dist_size_size, 0.5, matches, u_track, u_detection);

    for (int i = 0; i < matches.size(); i++)
    {
        STrack *track = r_tracked_stracks[matches[i][0]];
        STrack *det = &detections[matches[i][1]];
        if (track->state == TrackState::Tracked)
        {
            track->update(*det, this->frame_id);
            activated_stracks.push_back(*track);
        }
        else
        {
            track->re_activate(*det, this->frame_id, false);
            refind_stracks.push_back(*track);
        }
    }

    for (int i = 0; i < u_track.size(); i++)
    {
        STrack *track = r_tracked_stracks[u_track[i]];
        if (track->state != TrackState::Lost)
        {
            track->mark_lost();
            lost_stracks.push_back(*track);
        }
    }

    // Deal with unconfirmed tracks, usually tracks with only one beginning frame
    detections.clear();
    detections.assign(detections_cp.begin(), detections_cp.end());

    dists.clear();
    dists = iou_distance(unconfirmed, detections, dist_size, dist_size_size);

    matches.clear();
    vector<int> u_unconfirmed;
    u_detection.clear();
    linear_assignment(dists, dist_size, dist_size_size, 0.7, matches, u_unconfirmed, u_detection);

    for (int i = 0; i < matches.size(); i++)
    {
        unconfirmed[matches[i][0]]->update(detections[matches[i][1]], this->frame_id);
        activated_stracks.push_back(*unconfirmed[matches[i][0]]);
    }

    for (int i = 0; i < u_unconfirmed.size(); i++)
    {
        STrack *track = unconfirmed[u_unconfirmed[i]];
        track->mark_removed();
        removed_stracks.push_back(*track);
    }

    ////////////////// Step 4: Init new stracks //////////////////
    for (int i = 0; i < u_detection.size(); i++)
    {
        STrack *track = &detections[u_detection[i]];
        if (track->score < this->high_thresh)
            continue;
        track->activate(this->kalman_filter, this->frame_id);
        activated_stracks.push_back(*track);
    }

    ////////////////// Step 5: Update state //////////////////
    for (int i = 0; i < this->lost_stracks.size(); i++)
    {
        if (this->frame_id - this->lost_stracks[i].end_frame() > this->max_time_lost)
        {
            this->lost_stracks[i].mark_removed();
            removed_stracks.push_back(this->lost_stracks[i]);
        }
    }

    for (int i = 0; i < this->tracked_stracks.size(); i++)
    {
        if (this->tracked_stracks[i].state == TrackState::Tracked)
        {
            tracked_stracks_swap.push_back(this->tracked_stracks[i]);
        }
    }
    this->tracked_stracks.clear();
    this->tracked_stracks.assign(tracked_stracks_swap.begin(), tracked_stracks_swap.end());

    this->tracked_stracks = joint_stracks(this->tracked_stracks, activated_stracks);
    this->tracked_stracks = joint_stracks(this->tracked_stracks, refind_stracks);

    //std::cout << activated_stracks.size() << std::endl;

    this->lost_stracks = sub_stracks(this->lost_stracks, this->tracked_stracks);
    for (int i = 0; i < lost_stracks.size(); i++)
    {
        this->lost_stracks.push_back(lost_stracks[i]);
    }

    this->lost_stracks = sub_stracks(this->lost_stracks, this->removed_stracks);
    for (int i = 0; i < removed_stracks.size(); i++)
    {
        this->removed_stracks.push_back(removed_stracks[i]);
    }

    remove_duplicate_stracks(resa, resb, this->tracked_stracks, this->lost_stracks);

    this->tracked_stracks.clear();
    this->tracked_stracks.assign(resa.begin(), resa.end());
    this->lost_stracks.clear();
    this->lost_stracks.assign(resb.begin(), resb.end());

    for (int i = 0; i < this->tracked_stracks.size(); i++)
    {
        if (this->tracked_stracks[i].is_activated)
        {
            output_stracks.push_back(this->tracked_stracks[i]);
        }
    }
    return output_stracks;
}
--------------------------------------------------------------------------------
/bytetrack/src/STrack.cpp:
--------------------------------------------------------------------------------
#include "STrack.h"

STrack::STrack(vector<float> tlwh_, float score)
{
    _tlwh.resize(4);
    _tlwh.assign(tlwh_.begin(), tlwh_.end());

    is_activated = false;
    track_id = 0;
    state = TrackState::New;

    tlwh.resize(4);
    tlbr.resize(4);

    static_tlwh();
    static_tlbr();
    frame_id = 0;
    tracklet_len = 0;
    this->score = score;
    start_frame = 0;
}

STrack::~STrack()
{
}

void STrack::activate(byte_kalman::KalmanFilter &kalman_filter, int frame_id)
{
    this->kalman_filter = kalman_filter;
    this->track_id = this->next_id();

    vector<float> _tlwh_tmp(4);
    _tlwh_tmp[0] = this->_tlwh[0];
    _tlwh_tmp[1] = this->_tlwh[1];
    _tlwh_tmp[2] = this->_tlwh[2];
    _tlwh_tmp[3] = this->_tlwh[3];
    vector<float> xyah = tlwh_to_xyah(_tlwh_tmp);
    DETECTBOX xyah_box;
    xyah_box[0] = xyah[0];
    xyah_box[1] = xyah[1];
    xyah_box[2] = xyah[2];
    xyah_box[3] = xyah[3];
    auto mc = this->kalman_filter.initiate(xyah_box);
    this->mean = mc.first;
    this->covariance = mc.second;

    static_tlwh();
    static_tlbr();

    this->tracklet_len = 0;
    this->state = TrackState::Tracked;
    if (frame_id == 1)
    {
        this->is_activated = true;
    }
    //this->is_activated = true;
    this->frame_id = frame_id;
    this->start_frame = frame_id;
}

void STrack::re_activate(STrack &new_track, int frame_id, bool new_id)
{
    vector<float> xyah = tlwh_to_xyah(new_track.tlwh);
    DETECTBOX xyah_box;
    xyah_box[0] = xyah[0];
    xyah_box[1] = xyah[1];
    xyah_box[2] = xyah[2];
    xyah_box[3] = xyah[3];
    auto mc = this->kalman_filter.update(this->mean, this->covariance, xyah_box);
    this->mean = mc.first;
    this->covariance = mc.second;

    static_tlwh();
    static_tlbr();

    this->tracklet_len = 0;
    this->state = TrackState::Tracked;
    this->is_activated = true;
    this->frame_id = frame_id;
    this->score = new_track.score;
    if (new_id)
        this->track_id = next_id();
}

void STrack::update(STrack &new_track, int frame_id)
{
    this->frame_id = frame_id;
    this->tracklet_len++;

    vector<float> xyah = tlwh_to_xyah(new_track.tlwh);
    DETECTBOX xyah_box;
    xyah_box[0] = xyah[0];
    xyah_box[1] = xyah[1];
    xyah_box[2] = xyah[2];
    xyah_box[3] = xyah[3];

    auto mc = this->kalman_filter.update(this->mean, this->covariance, xyah_box);
    this->mean = mc.first;
    this->covariance = mc.second;

    static_tlwh();
    static_tlbr();

    this->state = TrackState::Tracked;
    this->is_activated = true;

    this->score = new_track.score;
}

void STrack::static_tlwh()
{
    if (this->state == TrackState::New)
    {
        tlwh[0] = _tlwh[0];
        tlwh[1] = _tlwh[1];
        tlwh[2] = _tlwh[2];
        tlwh[3] = _tlwh[3];
        return;
    }

    tlwh[0] = mean[0];
    tlwh[1] = mean[1];
    tlwh[2] = mean[2];
    tlwh[3] = mean[3];

    tlwh[2] *= tlwh[3];
    tlwh[0] -= tlwh[2] / 2;
    tlwh[1] -= tlwh[3] / 2;
}

void STrack::static_tlbr()
{
    tlbr.clear();
    tlbr.assign(tlwh.begin(), tlwh.end());
    tlbr[2] += tlbr[0];
    tlbr[3] += tlbr[1];
}

vector<float> STrack::tlwh_to_xyah(vector<float> tlwh_tmp)
{
    vector<float> tlwh_output = tlwh_tmp;
    tlwh_output[0] += tlwh_output[2] / 2;
    tlwh_output[1] += tlwh_output[3] / 2;
    tlwh_output[2] /= tlwh_output[3];
    return tlwh_output;
}

vector<float> STrack::to_xyah()
{
    return tlwh_to_xyah(tlwh);
}

vector<float> STrack::tlbr_to_tlwh(vector<float> &tlbr)
{
    tlbr[2] -= tlbr[0];
    tlbr[3] -= tlbr[1];
    return tlbr;
}

void STrack::mark_lost()
{
    state = TrackState::Lost;
}

void STrack::mark_removed()
{
    state = TrackState::Removed;
}

int STrack::next_id()
{
    static int _count = 0;
    _count++;
    return _count;
}

int STrack::end_frame()
{
    return this->frame_id;
}

void STrack::multi_predict(vector<STrack*> &stracks, byte_kalman::KalmanFilter &kalman_filter)
{
    for (int i = 0; i < stracks.size(); i++)
    {
        if (stracks[i]->state != TrackState::Tracked)
        {
            stracks[i]->mean[7] = 0;
        }
        kalman_filter.predict(stracks[i]->mean, stracks[i]->covariance);
        stracks[i]->static_tlwh();
        stracks[i]->static_tlbr();
    }
}
--------------------------------------------------------------------------------
/bytetrack/src/kalmanFilter.cpp:
--------------------------------------------------------------------------------
#include "kalmanFilter.h"
#include <Eigen/Cholesky>

namespace byte_kalman
{
    const double KalmanFilter::chi2inv95[10] = {
        0,
        3.8415,
        5.9915,
        7.8147,
        9.4877,
        11.070,
        12.592,
        14.067,
        15.507,
        16.919
    };
    KalmanFilter::KalmanFilter()
    {
        int ndim = 4;
        double dt = 1.;

        _motion_mat = Eigen::MatrixXf::Identity(8, 8);
        for (int i = 0; i < ndim; i++) {
            _motion_mat(i, ndim + i) = dt;
        }
        _update_mat = Eigen::MatrixXf::Identity(4, 8);

        this->_std_weight_position = 1. / 20;
        this->_std_weight_velocity = 1. / 160;
    }
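    // The state is 8-dimensional: the measured (cx, cy, a, h) from xyah plus their
    // velocities. With dt = 1, _motion_mat is the constant-velocity transition matrix
    // (identity plus the x_i += v_i coupling set in the loop above), and _update_mat
    // is the observation matrix projecting the state back onto (cx, cy, a, h).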
    KAL_DATA KalmanFilter::initiate(const DETECTBOX &measurement)
    {
        DETECTBOX mean_pos = measurement;
        DETECTBOX mean_vel;
        for (int i = 0; i < 4; i++) mean_vel(i) = 0;

        KAL_MEAN mean;
        for (int i = 0; i < 8; i++) {
            if (i < 4) mean(i) = mean_pos(i);
            else mean(i) = mean_vel(i - 4);
        }

        KAL_MEAN std;
        std(0) = 2 * _std_weight_position * measurement[3];
        std(1) = 2 * _std_weight_position * measurement[3];
        std(2) = 1e-2;
        std(3) = 2 * _std_weight_position * measurement[3];
        std(4) = 10 * _std_weight_velocity * measurement[3];
        std(5) = 10 * _std_weight_velocity * measurement[3];
        std(6) = 1e-5;
        std(7) = 10 * _std_weight_velocity * measurement[3];

        KAL_MEAN tmp = std.array().square();
        KAL_COVA var = tmp.asDiagonal();
        return std::make_pair(mean, var);
    }

    void KalmanFilter::predict(KAL_MEAN &mean, KAL_COVA &covariance)
    {
        //revise the data;
        DETECTBOX std_pos;
        std_pos << _std_weight_position * mean(3),
            _std_weight_position * mean(3),
            1e-2,
            _std_weight_position * mean(3);
        DETECTBOX std_vel;
        std_vel << _std_weight_velocity * mean(3),
            _std_weight_velocity * mean(3),
            1e-5,
            _std_weight_velocity * mean(3);
        KAL_MEAN tmp;
        tmp.block<1, 4>(0, 0) = std_pos;
        tmp.block<1, 4>(0, 4) = std_vel;
        tmp = tmp.array().square();
        KAL_COVA motion_cov = tmp.asDiagonal();
        KAL_MEAN mean1 = this->_motion_mat * mean.transpose();
        KAL_COVA covariance1 = this->_motion_mat * covariance * (_motion_mat.transpose());
        covariance1 += motion_cov;

        mean = mean1;
        covariance = covariance1;
    }

    KAL_HDATA KalmanFilter::project(const KAL_MEAN &mean, const KAL_COVA &covariance)
    {
        DETECTBOX std;
        std << _std_weight_position * mean(3), _std_weight_position * mean(3),
            1e-1, _std_weight_position * mean(3);
        KAL_HMEAN mean1 = _update_mat * mean.transpose();
        KAL_HCOVA covariance1 = _update_mat * covariance * (_update_mat.transpose());
        Eigen::Matrix<float, 4, 4> diag = std.asDiagonal();
        diag = diag.array().square().matrix();
        covariance1 += diag;
        // covariance1.diagonal() << diag;
        return std::make_pair(mean1, covariance1);
    }

    KAL_DATA
    KalmanFilter::update(
        const KAL_MEAN &mean,
        const KAL_COVA &covariance,
        const DETECTBOX &measurement)
    {
        KAL_HDATA pa = project(mean, covariance);
        KAL_HMEAN projected_mean = pa.first;
        KAL_HCOVA projected_cov = pa.second;

        //chol_factor, lower =
        //scipy.linalg.cho_factor(projected_cov, lower=True, check_finite=False)
        //kalmain_gain =
        //scipy.linalg.cho_solve((cho_factor, lower),
        //np.dot(covariance, self._upadte_mat.T).T,
        //check_finite=False).T
        Eigen::Matrix<float, 4, 8> B = (covariance * (_update_mat.transpose())).transpose();
        Eigen::Matrix<float, 8, 4> kalman_gain = (projected_cov.llt().solve(B)).transpose(); // eg.8x4
        Eigen::Matrix<float, 1, 4> innovation = measurement - projected_mean; //eg.1x4
        auto tmp = innovation * (kalman_gain.transpose());
        KAL_MEAN new_mean = (mean.array() + tmp.array()).matrix();
        KAL_COVA new_covariance = covariance - kalman_gain * projected_cov * (kalman_gain.transpose());
        return std::make_pair(new_mean, new_covariance);
    }
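    // The update above is the textbook Kalman correction, solved with Cholesky (llt)
    // instead of an explicit inverse:
    //   S  = H P H^T + R            (projected_cov, with R added inside project())
    //   K  = P H^T S^-1             (kalman_gain)
    //   x' = x + (z - H x) K^T      (new_mean; row-vector convention)
    //   P' = P - K S K^T            (new_covariance)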
    Eigen::Matrix<float, 1, -1>
    KalmanFilter::gating_distance(
        const KAL_MEAN &mean,
        const KAL_COVA &covariance,
        const std::vector<DETECTBOX> &measurements,
        bool only_position)
    {
        KAL_HDATA pa = this->project(mean, covariance);
        if (only_position) {
            printf("not implement!");
            exit(0);
        }
        KAL_HMEAN mean1 = pa.first;
        KAL_HCOVA covariance1 = pa.second;

        // Eigen::Matrix<float, -1, 4, Eigen::RowMajor> d(size, 4);
        DETECTBOXSS d(measurements.size(), 4);
        int pos = 0;
        for (DETECTBOX box : measurements) {
            d.row(pos++) = box - mean1;
        }
        Eigen::Matrix<float, -1, -1, Eigen::RowMajor> factor = covariance1.llt().matrixL();
        Eigen::Matrix<float, -1, -1> z = factor.triangularView<Eigen::Lower>().solve<Eigen::OnTheRight>(d).transpose();
        auto zz = ((z.array()) * (z.array())).matrix();
        auto square_maha = zz.colwise().sum();
        return square_maha;
    }
}
--------------------------------------------------------------------------------
/bytetrack/src/lapjv.cpp:
--------------------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "lapjv.h"

/** Column-reduction and reduction transfer for a dense cost matrix.
 */
int_t _ccrrt_dense(const uint_t n, cost_t *cost[],
                   int_t *free_rows, int_t *x, int_t *y, cost_t *v)
{
    int_t n_free_rows;
    boolean *unique;

    for (uint_t i = 0; i < n; i++) {
        x[i] = -1;
        v[i] = LARGE;
        y[i] = 0;
    }
    for (uint_t i = 0; i < n; i++) {
        for (uint_t j = 0; j < n; j++) {
            const cost_t c = cost[i][j];
            if (c < v[j]) {
                v[j] = c;
                y[j] = i;
            }
            PRINTF("i=%d, j=%d, c[i,j]=%f, v[j]=%f y[j]=%d\n", i, j, c, v[j], y[j]);
        }
    }
    PRINT_COST_ARRAY(v, n);
    PRINT_INDEX_ARRAY(y, n);
    NEW(unique, boolean, n);
    memset(unique, TRUE, n);
    {
        int_t j = n;
        do {
            j--;
            const int_t i = y[j];
            if (x[i] < 0) {
                x[i] = j;
            }
            else {
                unique[i] = FALSE;
                y[j] = -1;
            }
        } while (j > 0);
    }
    n_free_rows = 0;
    for (uint_t i = 0; i < n; i++) {
        if (x[i] < 0) {
            free_rows[n_free_rows++] = i;
        }
        else if (unique[i]) {
            const int_t j = x[i];
            cost_t min = LARGE;
            for (uint_t j2 = 0; j2 < n; j2++) {
                if (j2 == (uint_t)j) {
                    continue;
                }
                const cost_t c = cost[i][j2] - v[j2];
                if (c < min) {
                    min = c;
                }
            }
            PRINTF("v[%d] = %f - %f\n", j, v[j], min);
            v[j] -= min;
        }
    }
    FREE(unique);
    return n_free_rows;
}


/** Augmenting row reduction for a dense cost matrix.
75 | */ 76 | int_t _carr_dense( 77 | const uint_t n, cost_t *cost[], 78 | const uint_t n_free_rows, 79 | int_t *free_rows, int_t *x, int_t *y, cost_t *v) 80 | { 81 | uint_t current = 0; 82 | int_t new_free_rows = 0; 83 | uint_t rr_cnt = 0; 84 | PRINT_INDEX_ARRAY(x, n); 85 | PRINT_INDEX_ARRAY(y, n); 86 | PRINT_COST_ARRAY(v, n); 87 | PRINT_INDEX_ARRAY(free_rows, n_free_rows); 88 | while (current < n_free_rows) { 89 | int_t i0; 90 | int_t j1, j2; 91 | cost_t v1, v2, v1_new; 92 | boolean v1_lowers; 93 | 94 | rr_cnt++; 95 | PRINTF("current = %d rr_cnt = %d\n", current, rr_cnt); 96 | const int_t free_i = free_rows[current++]; 97 | j1 = 0; 98 | v1 = cost[free_i][0] - v[0]; 99 | j2 = -1; 100 | v2 = LARGE; 101 | for (uint_t j = 1; j < n; j++) { 102 | PRINTF("%d = %f %d = %f\n", j1, v1, j2, v2); 103 | const cost_t c = cost[free_i][j] - v[j]; 104 | if (c < v2) { 105 | if (c >= v1) { 106 | v2 = c; 107 | j2 = j; 108 | } 109 | else { 110 | v2 = v1; 111 | v1 = c; 112 | j2 = j1; 113 | j1 = j; 114 | } 115 | } 116 | } 117 | i0 = y[j1]; 118 | v1_new = v[j1] - (v2 - v1); 119 | v1_lowers = v1_new < v[j1]; 120 | PRINTF("%d %d 1=%d,%f 2=%d,%f v1'=%f(%d,%g) \n", free_i, i0, j1, v1, j2, v2, v1_new, v1_lowers, v[j1] - v1_new); 121 | if (rr_cnt < current * n) { 122 | if (v1_lowers) { 123 | v[j1] = v1_new; 124 | } 125 | else if (i0 >= 0 && j2 >= 0) { 126 | j1 = j2; 127 | i0 = y[j2]; 128 | } 129 | if (i0 >= 0) { 130 | if (v1_lowers) { 131 | free_rows[--current] = i0; 132 | } 133 | else { 134 | free_rows[new_free_rows++] = i0; 135 | } 136 | } 137 | } 138 | else { 139 | PRINTF("rr_cnt=%d >= %d (current=%d * n=%d)\n", rr_cnt, current * n, current, n); 140 | if (i0 >= 0) { 141 | free_rows[new_free_rows++] = i0; 142 | } 143 | } 144 | x[free_i] = j1; 145 | y[j1] = free_i; 146 | } 147 | return new_free_rows; 148 | } 149 | 150 | 151 | /** Find columns with minimum d[j] and put them on the SCAN list. 152 | */ 153 | uint_t _find_dense(const uint_t n, uint_t lo, cost_t *d, int_t *cols, int_t *y) 154 | { 155 | uint_t hi = lo + 1; 156 | cost_t mind = d[cols[lo]]; 157 | for (uint_t k = hi; k < n; k++) { 158 | int_t j = cols[k]; 159 | if (d[j] <= mind) { 160 | if (d[j] < mind) { 161 | hi = lo; 162 | mind = d[j]; 163 | } 164 | cols[k] = cols[hi]; 165 | cols[hi++] = j; 166 | } 167 | } 168 | return hi; 169 | } 170 | 171 | 172 | // Scan all columns in TODO starting from arbitrary column in SCAN 173 | // and try to decrease d of the TODO columns using the SCAN column. 174 | int_t _scan_dense(const uint_t n, cost_t *cost[], 175 | uint_t *plo, uint_t*phi, 176 | cost_t *d, int_t *cols, int_t *pred, 177 | int_t *y, cost_t *v) 178 | { 179 | uint_t lo = *plo; 180 | uint_t hi = *phi; 181 | cost_t h, cred_ij; 182 | 183 | while (lo != hi) { 184 | int_t j = cols[lo++]; 185 | const int_t i = y[j]; 186 | const cost_t mind = d[j]; 187 | h = cost[i][j] - v[j] - mind; 188 | PRINTF("i=%d j=%d h=%f\n", i, j, h); 189 | // For all columns in TODO 190 | for (uint_t k = hi; k < n; k++) { 191 | j = cols[k]; 192 | cred_ij = cost[i][j] - v[j] - h; 193 | if (cred_ij < d[j]) { 194 | d[j] = cred_ij; 195 | pred[j] = i; 196 | if (cred_ij == mind) { 197 | if (y[j] < 0) { 198 | return j; 199 | } 200 | cols[k] = cols[hi]; 201 | cols[hi++] = j; 202 | } 203 | } 204 | } 205 | } 206 | *plo = lo; 207 | *phi = hi; 208 | return -1; 209 | } 210 | 211 | 212 | /** Single iteration of modified Dijkstra shortest path algorithm as explained in the JV paper. 213 | * 214 | * This is a dense matrix version. 215 | * 216 | * \return The closest free column index. 
217 | */ 218 | int_t find_path_dense( 219 | const uint_t n, cost_t *cost[], 220 | const int_t start_i, 221 | int_t *y, cost_t *v, 222 | int_t *pred) 223 | { 224 | uint_t lo = 0, hi = 0; 225 | int_t final_j = -1; 226 | uint_t n_ready = 0; 227 | int_t *cols; 228 | cost_t *d; 229 | 230 | NEW(cols, int_t, n); 231 | NEW(d, cost_t, n); 232 | 233 | for (uint_t i = 0; i < n; i++) { 234 | cols[i] = i; 235 | pred[i] = start_i; 236 | d[i] = cost[start_i][i] - v[i]; 237 | } 238 | PRINT_COST_ARRAY(d, n); 239 | while (final_j == -1) { 240 | // No columns left on the SCAN list. 241 | if (lo == hi) { 242 | PRINTF("%d..%d -> find\n", lo, hi); 243 | n_ready = lo; 244 | hi = _find_dense(n, lo, d, cols, y); 245 | PRINTF("check %d..%d\n", lo, hi); 246 | PRINT_INDEX_ARRAY(cols, n); 247 | for (uint_t k = lo; k < hi; k++) { 248 | const int_t j = cols[k]; 249 | if (y[j] < 0) { 250 | final_j = j; 251 | } 252 | } 253 | } 254 | if (final_j == -1) { 255 | PRINTF("%d..%d -> scan\n", lo, hi); 256 | final_j = _scan_dense( 257 | n, cost, &lo, &hi, d, cols, pred, y, v); 258 | PRINT_COST_ARRAY(d, n); 259 | PRINT_INDEX_ARRAY(cols, n); 260 | PRINT_INDEX_ARRAY(pred, n); 261 | } 262 | } 263 | 264 | PRINTF("found final_j=%d\n", final_j); 265 | PRINT_INDEX_ARRAY(cols, n); 266 | { 267 | const cost_t mind = d[cols[lo]]; 268 | for (uint_t k = 0; k < n_ready; k++) { 269 | const int_t j = cols[k]; 270 | v[j] += d[j] - mind; 271 | } 272 | } 273 | 274 | FREE(cols); 275 | FREE(d); 276 | 277 | return final_j; 278 | } 279 | 280 | 281 | /** Augment for a dense cost matrix. 282 | */ 283 | int_t _ca_dense( 284 | const uint_t n, cost_t *cost[], 285 | const uint_t n_free_rows, 286 | int_t *free_rows, int_t *x, int_t *y, cost_t *v) 287 | { 288 | int_t *pred; 289 | 290 | NEW(pred, int_t, n); 291 | 292 | for (int_t *pfree_i = free_rows; pfree_i < free_rows + n_free_rows; pfree_i++) { 293 | int_t i = -1, j; 294 | uint_t k = 0; 295 | 296 | PRINTF("looking at free_i=%d\n", *pfree_i); 297 | j = find_path_dense(n, cost, *pfree_i, y, v, pred); 298 | ASSERT(j >= 0); 299 | ASSERT(j < n); 300 | while (i != *pfree_i) { 301 | PRINTF("augment %d\n", j); 302 | PRINT_INDEX_ARRAY(pred, n); 303 | i = pred[j]; 304 | PRINTF("y[%d]=%d -> %d\n", j, y[j], i); 305 | y[j] = i; 306 | PRINT_INDEX_ARRAY(x, n); 307 | SWAP_INDICES(j, x[i]); 308 | k++; 309 | if (k >= n) { 310 | ASSERT(FALSE); 311 | } 312 | } 313 | } 314 | FREE(pred); 315 | return 0; 316 | } 317 | 318 | 319 | /** Solve dense sparse LAP. 
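 * Pipeline: column reduction with reduction transfer (_ccrrt_dense), then up to two rounds of augmenting row reduction (_carr_dense), and finally shortest augmenting paths (_ca_dense) for any rows still free; returns 0 on success.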
320 | */ 321 | int lapjv_internal( 322 | const uint_t n, cost_t *cost[], 323 | int_t *x, int_t *y) 324 | { 325 | int ret; 326 | int_t *free_rows; 327 | cost_t *v; 328 | 329 | NEW(free_rows, int_t, n); 330 | NEW(v, cost_t, n); 331 | ret = _ccrrt_dense(n, cost, free_rows, x, y, v); 332 | int i = 0; 333 | while (ret > 0 && i < 2) { 334 | ret = _carr_dense(n, cost, ret, free_rows, x, y, v); 335 | i++; 336 | } 337 | if (ret > 0) { 338 | ret = _ca_dense(n, cost, ret, free_rows, x, y, v); 339 | } 340 | FREE(v); 341 | FREE(free_rows); 342 | return ret; 343 | } -------------------------------------------------------------------------------- /bytetrack/src/utils.cpp: -------------------------------------------------------------------------------- 1 | #include "BYTETracker.h" 2 | #include "lapjv.h" 3 | 4 | vector BYTETracker::joint_stracks(vector &tlista, vector &tlistb) 5 | { 6 | map exists; 7 | vector res; 8 | for (int i = 0; i < tlista.size(); i++) 9 | { 10 | exists.insert(pair(tlista[i]->track_id, 1)); 11 | res.push_back(tlista[i]); 12 | } 13 | for (int i = 0; i < tlistb.size(); i++) 14 | { 15 | int tid = tlistb[i].track_id; 16 | if (!exists[tid] || exists.count(tid) == 0) 17 | { 18 | exists[tid] = 1; 19 | res.push_back(&tlistb[i]); 20 | } 21 | } 22 | return res; 23 | } 24 | 25 | vector BYTETracker::joint_stracks(vector &tlista, vector &tlistb) 26 | { 27 | map exists; 28 | vector res; 29 | for (int i = 0; i < tlista.size(); i++) 30 | { 31 | exists.insert(pair(tlista[i].track_id, 1)); 32 | res.push_back(tlista[i]); 33 | } 34 | for (int i = 0; i < tlistb.size(); i++) 35 | { 36 | int tid = tlistb[i].track_id; 37 | if (!exists[tid] || exists.count(tid) == 0) 38 | { 39 | exists[tid] = 1; 40 | res.push_back(tlistb[i]); 41 | } 42 | } 43 | return res; 44 | } 45 | 46 | vector BYTETracker::sub_stracks(vector &tlista, vector &tlistb) 47 | { 48 | map stracks; 49 | for (int i = 0; i < tlista.size(); i++) 50 | { 51 | stracks.insert(pair(tlista[i].track_id, tlista[i])); 52 | } 53 | for (int i = 0; i < tlistb.size(); i++) 54 | { 55 | int tid = tlistb[i].track_id; 56 | if (stracks.count(tid) != 0) 57 | { 58 | stracks.erase(tid); 59 | } 60 | } 61 | 62 | vector res; 63 | std::map::iterator it; 64 | for (it = stracks.begin(); it != stracks.end(); ++it) 65 | { 66 | res.push_back(it->second); 67 | } 68 | 69 | return res; 70 | } 71 | 72 | void BYTETracker::remove_duplicate_stracks(vector &resa, vector &resb, vector &stracksa, vector &stracksb) 73 | { 74 | vector > pdist = iou_distance(stracksa, stracksb); 75 | vector > pairs; 76 | for (int i = 0; i < pdist.size(); i++) 77 | { 78 | for (int j = 0; j < pdist[i].size(); j++) 79 | { 80 | if (pdist[i][j] < 0.15) 81 | { 82 | pairs.push_back(pair(i, j)); 83 | } 84 | } 85 | } 86 | 87 | vector dupa, dupb; 88 | for (int i = 0; i < pairs.size(); i++) 89 | { 90 | int timep = stracksa[pairs[i].first].frame_id - stracksa[pairs[i].first].start_frame; 91 | int timeq = stracksb[pairs[i].second].frame_id - stracksb[pairs[i].second].start_frame; 92 | if (timep > timeq) 93 | dupb.push_back(pairs[i].second); 94 | else 95 | dupa.push_back(pairs[i].first); 96 | } 97 | 98 | for (int i = 0; i < stracksa.size(); i++) 99 | { 100 | vector::iterator iter = find(dupa.begin(), dupa.end(), i); 101 | if (iter == dupa.end()) 102 | { 103 | resa.push_back(stracksa[i]); 104 | } 105 | } 106 | 107 | for (int i = 0; i < stracksb.size(); i++) 108 | { 109 | vector::iterator iter = find(dupb.begin(), dupb.end(), i); 110 | if (iter == dupb.end()) 111 | { 112 | resb.push_back(stracksb[i]); 113 | } 114 | } 115 | } 
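// Illustrative note (not called from this file): the helpers above are set
// operations on track pools used by BYTETracker::update(). A sketch of a
// typical call, with hypothetical variable names:
//
//   vector<STrack> tracked_clean, lost_clean;
//   remove_duplicate_stracks(tracked_clean, lost_clean, tracked_stracks, lost_stracks);
//
// A pair counts as a duplicate when its IoU distance is below 0.15 (i.e. the
// boxes overlap with IoU > 0.85); the longer-lived track of the pair is kept.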
116 | 117 | void BYTETracker::linear_assignment(vector > &cost_matrix, int cost_matrix_size, int cost_matrix_size_size, float thresh, 118 | vector > &matches, vector &unmatched_a, vector &unmatched_b) 119 | { 120 | if (cost_matrix.size() == 0) 121 | { 122 | for (int i = 0; i < cost_matrix_size; i++) 123 | { 124 | unmatched_a.push_back(i); 125 | } 126 | for (int i = 0; i < cost_matrix_size_size; i++) 127 | { 128 | unmatched_b.push_back(i); 129 | } 130 | return; 131 | } 132 | 133 | vector rowsol; vector colsol; 134 | float c = lapjv(cost_matrix, rowsol, colsol, true, thresh); 135 | for (int i = 0; i < rowsol.size(); i++) 136 | { 137 | if (rowsol[i] >= 0) 138 | { 139 | vector match; 140 | match.push_back(i); 141 | match.push_back(rowsol[i]); 142 | matches.push_back(match); 143 | } 144 | else 145 | { 146 | unmatched_a.push_back(i); 147 | } 148 | } 149 | 150 | for (int i = 0; i < colsol.size(); i++) 151 | { 152 | if (colsol[i] < 0) 153 | { 154 | unmatched_b.push_back(i); 155 | } 156 | } 157 | } 158 | 159 | vector > BYTETracker::ious(vector > &atlbrs, vector > &btlbrs) 160 | { 161 | vector > ious; 162 | if (atlbrs.size()*btlbrs.size() == 0) 163 | return ious; 164 | 165 | ious.resize(atlbrs.size()); 166 | for (int i = 0; i < ious.size(); i++) 167 | { 168 | ious[i].resize(btlbrs.size()); 169 | } 170 | 171 | //bbox_ious 172 | for (int k = 0; k < btlbrs.size(); k++) 173 | { 174 | vector ious_tmp; 175 | float box_area = (btlbrs[k][2] - btlbrs[k][0] + 1)*(btlbrs[k][3] - btlbrs[k][1] + 1); 176 | for (int n = 0; n < atlbrs.size(); n++) 177 | { 178 | float iw = min(atlbrs[n][2], btlbrs[k][2]) - max(atlbrs[n][0], btlbrs[k][0]) + 1; 179 | if (iw > 0) 180 | { 181 | float ih = min(atlbrs[n][3], btlbrs[k][3]) - max(atlbrs[n][1], btlbrs[k][1]) + 1; 182 | if(ih > 0) 183 | { 184 | float ua = (atlbrs[n][2] - atlbrs[n][0] + 1)*(atlbrs[n][3] - atlbrs[n][1] + 1) + box_area - iw * ih; 185 | ious[n][k] = iw * ih / ua; 186 | } 187 | else 188 | { 189 | ious[n][k] = 0.0; 190 | } 191 | } 192 | else 193 | { 194 | ious[n][k] = 0.0; 195 | } 196 | } 197 | } 198 | 199 | return ious; 200 | } 201 | 202 | vector > BYTETracker::iou_distance(vector &atracks, vector &btracks, int &dist_size, int &dist_size_size) 203 | { 204 | vector > cost_matrix; 205 | if (atracks.size() * btracks.size() == 0) 206 | { 207 | dist_size = atracks.size(); 208 | dist_size_size = btracks.size(); 209 | return cost_matrix; 210 | } 211 | vector > atlbrs, btlbrs; 212 | for (int i = 0; i < atracks.size(); i++) 213 | { 214 | atlbrs.push_back(atracks[i]->tlbr); 215 | } 216 | for (int i = 0; i < btracks.size(); i++) 217 | { 218 | btlbrs.push_back(btracks[i].tlbr); 219 | } 220 | 221 | dist_size = atracks.size(); 222 | dist_size_size = btracks.size(); 223 | 224 | vector > _ious = ious(atlbrs, btlbrs); 225 | 226 | for (int i = 0; i < _ious.size();i++) 227 | { 228 | vector _iou; 229 | for (int j = 0; j < _ious[i].size(); j++) 230 | { 231 | _iou.push_back(1 - _ious[i][j]); 232 | } 233 | cost_matrix.push_back(_iou); 234 | } 235 | 236 | return cost_matrix; 237 | } 238 | 239 | vector > BYTETracker::iou_distance(vector &atracks, vector &btracks) 240 | { 241 | vector > atlbrs, btlbrs; 242 | for (int i = 0; i < atracks.size(); i++) 243 | { 244 | atlbrs.push_back(atracks[i].tlbr); 245 | } 246 | for (int i = 0; i < btracks.size(); i++) 247 | { 248 | btlbrs.push_back(btracks[i].tlbr); 249 | } 250 | 251 | vector > _ious = ious(atlbrs, btlbrs); 252 | vector > cost_matrix; 253 | for (int i = 0; i < _ious.size(); i++) 254 | { 255 | vector _iou; 256 | for (int j = 0; j < 
_ious[i].size(); j++) 257 | { 258 | _iou.push_back(1 - _ious[i][j]); 259 | } 260 | cost_matrix.push_back(_iou); 261 | } 262 | 263 | return cost_matrix; 264 | } 265 | 266 | double BYTETracker::lapjv(const vector > &cost, vector &rowsol, vector &colsol, 267 | bool extend_cost, float cost_limit, bool return_cost) 268 | { 269 | vector > cost_c; 270 | cost_c.assign(cost.begin(), cost.end()); 271 | 272 | vector > cost_c_extended; 273 | 274 | int n_rows = cost.size(); 275 | int n_cols = cost[0].size(); 276 | rowsol.resize(n_rows); 277 | colsol.resize(n_cols); 278 | 279 | int n = 0; 280 | if (n_rows == n_cols) 281 | { 282 | n = n_rows; 283 | } 284 | else 285 | { 286 | if (!extend_cost) 287 | { 288 | cout << "set extend_cost=True" << endl; 289 | system("pause"); 290 | exit(0); 291 | } 292 | } 293 | 294 | if (extend_cost || cost_limit < LONG_MAX) 295 | { 296 | n = n_rows + n_cols; 297 | cost_c_extended.resize(n); 298 | for (int i = 0; i < cost_c_extended.size(); i++) 299 | cost_c_extended[i].resize(n); 300 | 301 | if (cost_limit < LONG_MAX) 302 | { 303 | for (int i = 0; i < cost_c_extended.size(); i++) 304 | { 305 | for (int j = 0; j < cost_c_extended[i].size(); j++) 306 | { 307 | cost_c_extended[i][j] = cost_limit / 2.0; 308 | } 309 | } 310 | } 311 | else 312 | { 313 | float cost_max = -1; 314 | for (int i = 0; i < cost_c.size(); i++) 315 | { 316 | for (int j = 0; j < cost_c[i].size(); j++) 317 | { 318 | if (cost_c[i][j] > cost_max) 319 | cost_max = cost_c[i][j]; 320 | } 321 | } 322 | for (int i = 0; i < cost_c_extended.size(); i++) 323 | { 324 | for (int j = 0; j < cost_c_extended[i].size(); j++) 325 | { 326 | cost_c_extended[i][j] = cost_max + 1; 327 | } 328 | } 329 | } 330 | 331 | for (int i = n_rows; i < cost_c_extended.size(); i++) 332 | { 333 | for (int j = n_cols; j < cost_c_extended[i].size(); j++) 334 | { 335 | cost_c_extended[i][j] = 0; 336 | } 337 | } 338 | for (int i = 0; i < n_rows; i++) 339 | { 340 | for (int j = 0; j < n_cols; j++) 341 | { 342 | cost_c_extended[i][j] = cost_c[i][j]; 343 | } 344 | } 345 | 346 | cost_c.clear(); 347 | cost_c.assign(cost_c_extended.begin(), cost_c_extended.end()); 348 | } 349 | 350 | double **cost_ptr; 351 | cost_ptr = new double *[sizeof(double *) * n]; 352 | for (int i = 0; i < n; i++) 353 | cost_ptr[i] = new double[sizeof(double) * n]; 354 | 355 | for (int i = 0; i < n; i++) 356 | { 357 | for (int j = 0; j < n; j++) 358 | { 359 | cost_ptr[i][j] = cost_c[i][j]; 360 | } 361 | } 362 | 363 | int* x_c = new int[sizeof(int) * n]; 364 | int *y_c = new int[sizeof(int) * n]; 365 | 366 | int ret = lapjv_internal(n, cost_ptr, x_c, y_c); 367 | if (ret != 0) 368 | { 369 | cout << "Calculate Wrong!" 
<< endl; 370 | system("pause"); 371 | exit(0); 372 | } 373 | 374 | double opt = 0.0; 375 | 376 | if (n != n_rows) 377 | { 378 | for (int i = 0; i < n; i++) 379 | { 380 | if (x_c[i] >= n_cols) 381 | x_c[i] = -1; 382 | if (y_c[i] >= n_rows) 383 | y_c[i] = -1; 384 | } 385 | for (int i = 0; i < n_rows; i++) 386 | { 387 | rowsol[i] = x_c[i]; 388 | } 389 | for (int i = 0; i < n_cols; i++) 390 | { 391 | colsol[i] = y_c[i]; 392 | } 393 | 394 | if (return_cost) 395 | { 396 | for (int i = 0; i < rowsol.size(); i++) 397 | { 398 | if (rowsol[i] != -1) 399 | { 400 | //cout << i << "\t" << rowsol[i] << "\t" << cost_ptr[i][rowsol[i]] << endl; 401 | opt += cost_ptr[i][rowsol[i]]; 402 | } 403 | } 404 | } 405 | } 406 | else if (return_cost) 407 | { 408 | for (int i = 0; i < rowsol.size(); i++) 409 | { 410 | opt += cost_ptr[i][rowsol[i]]; 411 | } 412 | } 413 | 414 | for (int i = 0; i < n; i++) 415 | { 416 | delete[]cost_ptr[i]; 417 | } 418 | delete[]cost_ptr; 419 | delete[]x_c; 420 | delete[]y_c; 421 | 422 | return opt; 423 | } 424 | 425 | Scalar BYTETracker::get_color(int idx) 426 | { 427 | idx += 3; 428 | return Scalar(37 * idx % 255, 17 * idx % 255, 29 * idx % 255); 429 | } -------------------------------------------------------------------------------- /main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "yolov8_lib.h" 4 | #include "BYTETracker.h" 5 | 6 | 7 | // 需要跟踪的类别,可以根据自己需求调整,筛选自己想要跟踪的对象的种类(以下对应COCO数据集类别索引) 8 | std::vector trackClasses {0, 1, 2, 3, 5, 7}; // person, bicycle, car, motorcycle, bus, truck 9 | 10 | bool isTrackingClass(int class_id){ 11 | for (auto& c : trackClasses){ 12 | if (class_id == c) return true; 13 | } 14 | return false; 15 | } 16 | 17 | int run(char* videoPath) 18 | { 19 | // read video 20 | std::string input_video_path = std::string(videoPath); 21 | cv::VideoCapture cap(input_video_path); 22 | if ( !cap.isOpened() ) return 0; 23 | 24 | int img_w = cap.get(CAP_PROP_FRAME_WIDTH); 25 | int img_h = cap.get(CAP_PROP_FRAME_HEIGHT); 26 | int fps = cap.get(CAP_PROP_FPS); 27 | long nFrame = static_cast(cap.get(CAP_PROP_FRAME_COUNT)); 28 | cout << "Total frames: " << nFrame << endl; 29 | 30 | cv::VideoWriter writer("result.mp4", VideoWriter::fourcc('m', 'p', '4', 'v'), fps, Size(img_w, img_h)); 31 | 32 | // YOLOv8 predictor 33 | std::string trtFile = "../yolo/engine/yolov8s.engine"; 34 | YoloDetecter detecter(trtFile); 35 | 36 | // ByteTrack tracker 37 | BYTETracker tracker(fps, 30); 38 | 39 | cv::Mat img; 40 | int num_frames = 0; 41 | int total_ms = 0; 42 | while (true) 43 | { 44 | if(!cap.read(img)) break; 45 | num_frames ++; 46 | if (num_frames % 20 == 0) 47 | { 48 | cout << "Processing frame " << num_frames << " (" << num_frames * 1000000 / total_ms << " fps)" << endl; 49 | } 50 | if (img.empty()) break; 51 | 52 | auto start = std::chrono::system_clock::now(); 53 | 54 | // yolo inference 55 | std::vector res = detecter.inference(img); 56 | 57 | // yolo output format to bytetrack input format, and filter bbox by class id 58 | std::vector objects; 59 | for (long unsigned int j = 0; j < res.size(); j++) 60 | { 61 | cv::Rect r = res[j].tlwh; 62 | float conf = (float)res[j].conf; 63 | int class_id = (int)res[j].class_id; 64 | 65 | if (isTrackingClass(class_id)){ 66 | cv::Rect_ rect((float)r.x, (float)r.y, (float)r.width, (float)r.height); 67 | Object obj {rect, class_id, conf}; 68 | objects.push_back(obj); 69 | } 70 | } 71 | 72 | // track 73 | std::vector output_stracks = tracker.update(objects); 74 | 75 | 
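// Note: update() performs ByteTrack's two-stage association (high-score
// detections are matched first, then low-score detections rescue the
// remaining unmatched tracks) and returns only the currently activated
// tracks, each carrying a persistent track_id and a tlwh box, which the
// drawing code below relies on.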
auto end = std::chrono::system_clock::now(); 76 | total_ms = total_ms + std::chrono::duration_cast<std::chrono::microseconds>(end - start).count(); 77 | 78 | for (int i = 0; i < output_stracks.size(); i++) 79 | { 80 | std::vector<float> tlwh = output_stracks[i].tlwh; 81 | // bool vertical = tlwh[2] / tlwh[3] > 1.6; 82 | // if (tlwh[2] * tlwh[3] > 20 && !vertical) 83 | if (tlwh[2] * tlwh[3] > 20) 84 | { 85 | cv::Scalar s = tracker.get_color(output_stracks[i].track_id); 86 | cv::putText(img, cv::format("%d", output_stracks[i].track_id), cv::Point(tlwh[0], tlwh[1] - 5), 87 | 0, 0.6, cv::Scalar(0, 0, 255), 2, cv::LINE_AA); 88 | cv::rectangle(img, cv::Rect(tlwh[0], tlwh[1], tlwh[2], tlwh[3]), s, 2); 89 | } 90 | } 91 | cv::putText(img, cv::format("frame: %d fps: %d num: %ld", num_frames, num_frames * 1000000 / total_ms, output_stracks.size()), 92 | cv::Point(0, 30), 0, 0.6, cv::Scalar(0, 0, 255), 2, cv::LINE_AA); 93 | writer.write(img); 94 | 95 | // cv::imshow("img", img); 96 | int c = cv::waitKey(1); 97 | if (c == 27) break; // ESC to exit 98 | } 99 | 100 | cap.release(); 101 | std::cout << "FPS: " << num_frames * 1000000 / total_ms << std::endl; 102 | 103 | return 0; 104 | } 105 | 106 | 107 | int main(int argc, char *argv[]) 108 | { 109 | if (argc != 2) 110 | { 111 | std::cerr << "Arguments not right!" << std::endl; 112 | std::cerr << "Usage: ./main [video path]" << std::endl; 113 | std::cerr << "Example: ./main ./test_videos/demo.mp4" << std::endl; 114 | return -1; 115 | } 116 | 117 | return run(argv[1]); 118 | } 119 | -------------------------------------------------------------------------------- /tensorrtx-yolov8/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.10) 2 | 3 | project(yolov8) 4 | 5 | add_definitions(-std=c++11) 6 | add_definitions(-DAPI_EXPORTS) 7 | set(CMAKE_CXX_STANDARD 11) 8 | set(CMAKE_BUILD_TYPE Debug) 9 | 10 | set(CMAKE_CUDA_COMPILER /usr/local/cuda/bin/nvcc) 11 | enable_language(CUDA) 12 | 13 | include_directories(${PROJECT_SOURCE_DIR}/include) 14 | include_directories(${PROJECT_SOURCE_DIR}/plugin) 15 | 16 | # include and link dirs of cuda and tensorrt, you need to adapt them if yours are different 17 | # ============= cuda ============ 18 | find_package(CUDA REQUIRED) 19 | include_directories(/usr/local/cuda/include) 20 | link_directories(/usr/local/cuda/lib64) 21 | 22 | # ============= tensorrt ============ 23 | if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") 24 | message("Embed_platform on") 25 | include_directories(/usr/include/aarch64-linux-gnu) 26 | link_directories(/usr/lib/aarch64-linux-gnu) 27 | else() 28 | message("Embed_platform off") 29 | include_directories(/usr/include/x86_64-linux-gnu) 30 | link_directories(/usr/lib/x86_64-linux-gnu) 31 | endif() 32 | 33 | add_library(myplugins SHARED ${PROJECT_SOURCE_DIR}/plugin/yololayer.cu) 34 | target_link_libraries(myplugins nvinfer cudart) 35 | 36 | find_package(OpenCV) 37 | include_directories(${OpenCV_INCLUDE_DIRS}) 38 | 39 | 40 | file(GLOB_RECURSE SRCS ${PROJECT_SOURCE_DIR}/src/*.cpp ${PROJECT_SOURCE_DIR}/src/*.cu) 41 | add_executable(yolov8 ${PROJECT_SOURCE_DIR}/main.cpp ${SRCS}) 42 | 43 | target_link_libraries(yolov8 nvinfer) 44 | target_link_libraries(yolov8 cudart) 45 | target_link_libraries(yolov8 myplugins) 46 | target_link_libraries(yolov8 ${OpenCV_LIBS}) 47 | 48 | -------------------------------------------------------------------------------- /tensorrtx-yolov8/README.md: -------------------------------------------------------------------------------- 1 | # 
yolov8 2 | 3 | The PyTorch implementation is [ultralytics/yolov8](https://github.com/ultralytics/ultralytics/tree/main/ultralytics). 4 | 5 | The TensorRT code is derived from [xiaocao-tian/yolov8_tensorrt](https://github.com/xiaocao-tian/yolov8_tensorrt). 6 | 7 | 8 | ## Requirements 9 | 10 | - TensorRT 8.0+ 11 | - OpenCV 3.4.0+ 12 | 13 | ## Different versions of yolov8 14 | 15 | Currently, we support yolov8. 16 | 17 | - For yolov8, download the .pt file from [https://github.com/ultralytics/assets/releases](https://github.com/ultralytics/assets/releases), then follow the How to Run steps on this page. 18 | 19 | ## Config 20 | 21 | - Choose the model n/s/m/l/x from the command line arguments. 22 | - Check more configs in [include/config.h](./include/config.h) 23 | 24 | ## How to Run, yolov8n as example 25 | 26 | 1. Generate .wts from the PyTorch .pt, or download .wts from the model zoo 27 | 28 | ``` 29 | // download https://github.com/ultralytics/assets/releases/yolov8n.pt 30 | cp {tensorrtx}/yolov8/gen_wts.py {ultralytics}/ultralytics 31 | cd {ultralytics}/ultralytics 32 | python gen_wts.py 33 | // a file 'yolov8n.wts' will be generated. 34 | ``` 35 | 36 | 2. Build tensorrtx/yolov8 and run 37 | 38 | ``` 39 | cd {tensorrtx}/yolov8/ 40 | // update kNumClass in config.h if your model is trained on a custom dataset 41 | mkdir build 42 | cd build 43 | cp {ultralytics}/ultralytics/yolov8n.wts {tensorrtx}/yolov8/build 44 | cmake .. 45 | make 46 | sudo ./yolov8 -s [.wts] [.engine] [n/s/m/l/x] // serialize model to plan file 47 | sudo ./yolov8 -d [.engine] [image folder] [c/g] // deserialize and run inference, the images in [image folder] will be processed. 48 | // For example yolov8n 49 | sudo ./yolov8 -s yolov8n.wts yolov8n.engine n 50 | sudo ./yolov8 -d yolov8n.engine ../images c // cpu postprocess 51 | sudo ./yolov8 -d yolov8n.engine ../images g // gpu postprocess 52 | 53 | ``` 54 | 3. Check the generated images, e.g. _zidane.jpg and _bus.jpg 55 | 56 | 4. Optional: load and run the TensorRT model in Python 57 | 58 | ``` 59 | // install python-tensorrt, pycuda, etc. 60 | // ensure yolov8n.engine and libmyplugins.so have been built 61 | python yolov8_trt.py 62 | ``` 63 | 64 | ## INT8 Quantization 65 | 66 | 1. Prepare calibration images; you can randomly select around 1,000 images from your training set. For COCO, you can also download the calibration images `coco_calib` from [GoogleDrive](https://drive.google.com/drive/folders/1s7jE9DtOngZMzJC1uL307J2MiaGwdRSI?usp=sharing) or [BaiduPan](https://pan.baidu.com/s/1GOm_-JobpyLMAqZWCDUhKg), pwd: a9wh 67 | 68 | 2. Unzip it in yolov8/build 69 | 70 | 3. Set the macro `USE_INT8` in config.h and rebuild 71 | 72 | 4. Serialize the model and test, as shown below 73 | 74 | 
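The INT8 workflow reuses the serialize/test commands from step 2 of How to Run; calibration happens during serialization and the computed scales are cached in a calibration table so later builds can reuse them. A minimal sketch (the `yolov8n-int8.engine` name is just an example):

```
// with USE_INT8 enabled in config.h and the project rebuilt:
sudo ./yolov8 -s yolov8n.wts yolov8n-int8.engine n   // calibrates on the coco_calib images while serializing
sudo ./yolov8 -d yolov8n-int8.engine ../images c     // test the quantized engine
```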
77 | 78 | ## More Information 79 | 80 | See the readme in [home page.](https://github.com/wang-xinyu/tensorrtx) 81 | 82 | -------------------------------------------------------------------------------- /tensorrtx-yolov8/gen_wts.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import argparse 3 | import os 4 | import struct 5 | import torch 6 | 7 | pt_file = "./weights/yolov8s.pt" 8 | wts_file = "./weights/yolov8s.wts" 9 | 10 | # Initialize 11 | device = 'cpu' 12 | 13 | # Load model 14 | model = torch.load(pt_file, map_location=device)['model'].float() # load to FP32 15 | 16 | anchor_grid = model.model[-1].anchors * model.model[-1].stride[..., None, None] 17 | 18 | delattr(model.model[-1], 'anchors') 19 | 20 | model.to(device).eval() 21 | 22 | with open(wts_file, 'w') as f: 23 | f.write('{}\n'.format(len(model.state_dict().keys()))) 24 | for k, v in model.state_dict().items(): 25 | vr = v.reshape(-1).cpu().numpy() 26 | f.write('{} {} '.format(k, len(vr))) 27 | for vv in vr: 28 | f.write(' ') 29 | f.write(struct.pack('>f', float(vv)).hex()) 30 | f.write('\n') 31 | -------------------------------------------------------------------------------- /tensorrtx-yolov8/images/10001.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/emptysoal/TensorRT-YOLOv8-ByteTrack/cec012e0672dd3d1f79a8dc29875be9c56feedb5/tensorrtx-yolov8/images/10001.jpg -------------------------------------------------------------------------------- /tensorrtx-yolov8/images/10002.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/emptysoal/TensorRT-YOLOv8-ByteTrack/cec012e0672dd3d1f79a8dc29875be9c56feedb5/tensorrtx-yolov8/images/10002.jpeg -------------------------------------------------------------------------------- /tensorrtx-yolov8/include/block.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include "NvInfer.h" 6 | 7 | std::map loadWeights(const std::string file); 8 | 9 | nvinfer1::IElementWiseLayer* convBnSiLU(nvinfer1::INetworkDefinition* network, std::map weightMap, 10 | nvinfer1::ITensor& input, int ch, int k, int s, int p, std::string lname); 11 | 12 | nvinfer1::IElementWiseLayer* C2F(nvinfer1::INetworkDefinition* network, std::map weightMap, 13 | nvinfer1::ITensor& input, int c1, int c2, int n, bool shortcut, float e, std::string lname); 14 | 15 | nvinfer1::IElementWiseLayer* SPPF(nvinfer1::INetworkDefinition* network, std::map weightMap, 16 | nvinfer1::ITensor& input, int c1, int c2, int k, std::string lname); 17 | 18 | nvinfer1::IShuffleLayer* DFL(nvinfer1::INetworkDefinition* network, std::map weightMap, 19 | nvinfer1::ITensor& input, int ch, int grid, int k, int s, int p, std::string lname); 20 | 21 | nvinfer1::IPluginV2Layer* addYoLoLayer(nvinfer1::INetworkDefinition *network, std::vector dets); 22 | -------------------------------------------------------------------------------- /tensorrtx-yolov8/include/calibrator.h: -------------------------------------------------------------------------------- 1 | #ifndef ENTROPY_CALIBRATOR_H 2 | #define ENTROPY_CALIBRATOR_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "macros.h" 8 | 9 | //! \class Int8EntropyCalibrator2 10 | //! 11 | //! \brief Implements Entropy calibrator 2. 12 | //! CalibrationAlgoType is kENTROPY_CALIBRATION_2. 13 | //! 
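//!
//! Usage sketch (illustrative, not part of this header): during an INT8
//! engine build the calibrator is attached to the builder config, e.g.
//!
//!   Int8EntropyCalibrator2 calibrator(1, kInputW, kInputH, "./coco_calib/", "int8calib.table", kInputTensorName);
//!   config->setFlag(nvinfer1::BuilderFlag::kINT8);
//!   config->setInt8Calibrator(&calibrator);
//!
//! TensorRT then pulls preprocessed batches through getBatch() and caches the
//! computed scales via writeCalibrationCache(), so subsequent builds can skip
//! calibration by reading the cache back.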
14 | class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 15 | { 16 | public: 17 | Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, const char* input_blob_name, bool read_cache = true); 18 | virtual ~Int8EntropyCalibrator2(); 19 | int getBatchSize() const TRT_NOEXCEPT override; 20 | bool getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT override; 21 | const void* readCalibrationCache(size_t& length) TRT_NOEXCEPT override; 22 | void writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT override; 23 | 24 | private: 25 | int batchsize_; 26 | int input_w_; 27 | int input_h_; 28 | int img_idx_; 29 | std::string img_dir_; 30 | std::vector img_files_; 31 | size_t input_count_; 32 | std::string calib_table_name_; 33 | const char* input_blob_name_; 34 | bool read_cache_; 35 | void* device_input_; 36 | std::vector calib_cache_; 37 | }; 38 | 39 | #endif // ENTROPY_CALIBRATOR_H 40 | -------------------------------------------------------------------------------- /tensorrtx-yolov8/include/config.h: -------------------------------------------------------------------------------- 1 | #define USE_FP16 2 | //#define USE_INT8 3 | 4 | const static char *kInputTensorName = "images"; 5 | const static char *kOutputTensorName = "output"; 6 | const static int kNumClass = 80; 7 | const static int kBatchSize = 1; 8 | const static int kGpuId = 0; 9 | const static int kInputH = 640; 10 | const static int kInputW = 640; 11 | const static float kNmsThresh = 0.45f; 12 | const static float kConfThresh = 0.5f; 13 | const static int kMaxInputImageSize = 3000 * 3000; 14 | const static int kMaxNumOutputBbox = 1000; 15 | -------------------------------------------------------------------------------- /tensorrtx-yolov8/include/cuda_utils.h: -------------------------------------------------------------------------------- 1 | #ifndef TRTX_CUDA_UTILS_H_ 2 | #define TRTX_CUDA_UTILS_H_ 3 | 4 | #include 5 | 6 | #ifndef CUDA_CHECK 7 | #define CUDA_CHECK(callstr)\ 8 | {\ 9 | cudaError_t error_code = callstr;\ 10 | if (error_code != cudaSuccess) {\ 11 | std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__;\ 12 | assert(0);\ 13 | }\ 14 | } 15 | #endif // CUDA_CHECK 16 | 17 | #endif // TRTX_CUDA_UTILS_H_ 18 | 19 | -------------------------------------------------------------------------------- /tensorrtx-yolov8/include/logging.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #ifndef TENSORRT_LOGGING_H 18 | #define TENSORRT_LOGGING_H 19 | 20 | #include "NvInferRuntimeCommon.h" 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include "macros.h" 29 | 30 | using Severity = nvinfer1::ILogger::Severity; 31 | 32 | class LogStreamConsumerBuffer : public std::stringbuf 33 | { 34 | public: 35 | LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog) 36 | : mOutput(stream) 37 | , mPrefix(prefix) 38 | , mShouldLog(shouldLog) 39 | { 40 | } 41 | 42 | LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) 43 | : mOutput(other.mOutput) 44 | { 45 | } 46 | 47 | ~LogStreamConsumerBuffer() 48 | { 49 | // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence 50 | // std::streambuf::pptr() gives a pointer to the current position of the output sequence 51 | // if the pointer to the beginning is not equal to the pointer to the current position, 52 | // call putOutput() to log the output to the stream 53 | if (pbase() != pptr()) 54 | { 55 | putOutput(); 56 | } 57 | } 58 | 59 | // synchronizes the stream buffer and returns 0 on success 60 | // synchronizing the stream buffer consists of inserting the buffer contents into the stream, 61 | // resetting the buffer and flushing the stream 62 | virtual int sync() 63 | { 64 | putOutput(); 65 | return 0; 66 | } 67 | 68 | void putOutput() 69 | { 70 | if (mShouldLog) 71 | { 72 | // prepend timestamp 73 | std::time_t timestamp = std::time(nullptr); 74 | tm* tm_local = std::localtime(×tamp); 75 | std::cout << "["; 76 | std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/"; 77 | std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/"; 78 | std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-"; 79 | std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":"; 80 | std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":"; 81 | std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] "; 82 | // std::stringbuf::str() gets the string contents of the buffer 83 | // insert the buffer contents pre-appended by the appropriate prefix into the stream 84 | mOutput << mPrefix << str(); 85 | // set the buffer to empty 86 | str(""); 87 | // flush the stream 88 | mOutput.flush(); 89 | } 90 | } 91 | 92 | void setShouldLog(bool shouldLog) 93 | { 94 | mShouldLog = shouldLog; 95 | } 96 | 97 | private: 98 | std::ostream& mOutput; 99 | std::string mPrefix; 100 | bool mShouldLog; 101 | }; 102 | 103 | //! 104 | //! \class LogStreamConsumerBase 105 | //! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer 106 | //! 107 | class LogStreamConsumerBase 108 | { 109 | public: 110 | LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog) 111 | : mBuffer(stream, prefix, shouldLog) 112 | { 113 | } 114 | 115 | protected: 116 | LogStreamConsumerBuffer mBuffer; 117 | }; 118 | 119 | //! 120 | //! \class LogStreamConsumer 121 | //! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages. 122 | //! Order of base classes is LogStreamConsumerBase and then std::ostream. 123 | //! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field 124 | //! in LogStreamConsumer and then the address of the buffer is passed to std::ostream. 125 | //! 
This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream. 126 | //! Please do not change the order of the parent classes. 127 | //! 128 | class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream 129 | { 130 | public: 131 | //! \brief Creates a LogStreamConsumer which logs messages with level severity. 132 | //! Reportable severity determines if the messages are severe enough to be logged. 133 | LogStreamConsumer(Severity reportableSeverity, Severity severity) 134 | : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity) 135 | , std::ostream(&mBuffer) // links the stream buffer with the stream 136 | , mShouldLog(severity <= reportableSeverity) 137 | , mSeverity(severity) 138 | { 139 | } 140 | 141 | LogStreamConsumer(LogStreamConsumer&& other) 142 | : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog) 143 | , std::ostream(&mBuffer) // links the stream buffer with the stream 144 | , mShouldLog(other.mShouldLog) 145 | , mSeverity(other.mSeverity) 146 | { 147 | } 148 | 149 | void setReportableSeverity(Severity reportableSeverity) 150 | { 151 | mShouldLog = mSeverity <= reportableSeverity; 152 | mBuffer.setShouldLog(mShouldLog); 153 | } 154 | 155 | private: 156 | static std::ostream& severityOstream(Severity severity) 157 | { 158 | return severity >= Severity::kINFO ? std::cout : std::cerr; 159 | } 160 | 161 | static std::string severityPrefix(Severity severity) 162 | { 163 | switch (severity) 164 | { 165 | case Severity::kINTERNAL_ERROR: return "[F] "; 166 | case Severity::kERROR: return "[E] "; 167 | case Severity::kWARNING: return "[W] "; 168 | case Severity::kINFO: return "[I] "; 169 | case Severity::kVERBOSE: return "[V] "; 170 | default: assert(0); return ""; 171 | } 172 | } 173 | 174 | bool mShouldLog; 175 | Severity mSeverity; 176 | }; 177 | 178 | //! \class Logger 179 | //! 180 | //! \brief Class which manages logging of TensorRT tools and samples 181 | //! 182 | //! \details This class provides a common interface for TensorRT tools and samples to log information to the console, 183 | //! and supports logging two types of messages: 184 | //! 185 | //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) 186 | //! - Test pass/fail messages 187 | //! 188 | //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is 189 | //! that the logic for controlling the verbosity and formatting of sample output is centralized in one location. 190 | //! 191 | //! In the future, this class could be extended to support dumping test results to a file in some standard format 192 | //! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run). 193 | //! 194 | //! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger 195 | //! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT 196 | //! library and messages coming from the sample. 197 | //! 198 | //! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the 199 | //! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger 200 | //! object. 
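// Typical use in this project: a global `Logger gLogger;` is declared in
// main.cpp and passed to createInferBuilder()/createInferRuntime(), so
// TensorRT messages at or above the reportable severity (kWARNING by
// default below) are printed with the [TRT] prefix.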
201 | 202 | class Logger : public nvinfer1::ILogger 203 | { 204 | public: 205 | Logger(Severity severity = Severity::kWARNING) 206 | : mReportableSeverity(severity) 207 | { 208 | } 209 | 210 | //! 211 | //! \enum TestResult 212 | //! \brief Represents the state of a given test 213 | //! 214 | enum class TestResult 215 | { 216 | kRUNNING, //!< The test is running 217 | kPASSED, //!< The test passed 218 | kFAILED, //!< The test failed 219 | kWAIVED //!< The test was waived 220 | }; 221 | 222 | //! 223 | //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger 224 | //! \return The nvinfer1::ILogger associated with this Logger 225 | //! 226 | //! TODO Once all samples are updated to use this method to register the logger with TensorRT, 227 | //! we can eliminate the inheritance of Logger from ILogger 228 | //! 229 | nvinfer1::ILogger& getTRTLogger() 230 | { 231 | return *this; 232 | } 233 | 234 | //! 235 | //! \brief Implementation of the nvinfer1::ILogger::log() virtual method 236 | //! 237 | //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the 238 | //! inheritance from nvinfer1::ILogger 239 | //! 240 | void log(Severity severity, const char* msg) TRT_NOEXCEPT override 241 | { 242 | LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl; 243 | } 244 | 245 | //! 246 | //! \brief Method for controlling the verbosity of logging output 247 | //! 248 | //! \param severity The logger will only emit messages that have severity of this level or higher. 249 | //! 250 | void setReportableSeverity(Severity severity) 251 | { 252 | mReportableSeverity = severity; 253 | } 254 | 255 | //! 256 | //! \brief Opaque handle that holds logging information for a particular test 257 | //! 258 | //! This object is an opaque handle to information used by the Logger to print test results. 259 | //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used 260 | //! with Logger::reportTest{Start,End}(). 261 | //! 262 | class TestAtom 263 | { 264 | public: 265 | TestAtom(TestAtom&&) = default; 266 | 267 | private: 268 | friend class Logger; 269 | 270 | TestAtom(bool started, const std::string& name, const std::string& cmdline) 271 | : mStarted(started) 272 | , mName(name) 273 | , mCmdline(cmdline) 274 | { 275 | } 276 | 277 | bool mStarted; 278 | std::string mName; 279 | std::string mCmdline; 280 | }; 281 | 282 | //! 283 | //! \brief Define a test for logging 284 | //! 285 | //! \param[in] name The name of the test. This should be a string starting with 286 | //! "TensorRT" and containing dot-separated strings containing 287 | //! the characters [A-Za-z0-9_]. 288 | //! For example, "TensorRT.sample_googlenet" 289 | //! \param[in] cmdline The command line used to reproduce the test 290 | // 291 | //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). 292 | //! 293 | static TestAtom defineTest(const std::string& name, const std::string& cmdline) 294 | { 295 | return TestAtom(false, name, cmdline); 296 | } 297 | 298 | //! 299 | //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments 300 | //! as input 301 | //! 302 | //! \param[in] name The name of the test 303 | //! \param[in] argc The number of command-line arguments 304 | //! \param[in] argv The array of command-line arguments (given as C strings) 305 | //! 306 | //! 
\return a TestAtom that can be used in Logger::reportTest{Start,End}(). 307 | static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) 308 | { 309 | auto cmdline = genCmdlineString(argc, argv); 310 | return defineTest(name, cmdline); 311 | } 312 | 313 | //! 314 | //! \brief Report that a test has started. 315 | //! 316 | //! \pre reportTestStart() has not been called yet for the given testAtom 317 | //! 318 | //! \param[in] testAtom The handle to the test that has started 319 | //! 320 | static void reportTestStart(TestAtom& testAtom) 321 | { 322 | reportTestResult(testAtom, TestResult::kRUNNING); 323 | assert(!testAtom.mStarted); 324 | testAtom.mStarted = true; 325 | } 326 | 327 | //! 328 | //! \brief Report that a test has ended. 329 | //! 330 | //! \pre reportTestStart() has been called for the given testAtom 331 | //! 332 | //! \param[in] testAtom The handle to the test that has ended 333 | //! \param[in] result The result of the test. Should be one of TestResult::kPASSED, 334 | //! TestResult::kFAILED, TestResult::kWAIVED 335 | //! 336 | static void reportTestEnd(const TestAtom& testAtom, TestResult result) 337 | { 338 | assert(result != TestResult::kRUNNING); 339 | assert(testAtom.mStarted); 340 | reportTestResult(testAtom, result); 341 | } 342 | 343 | static int reportPass(const TestAtom& testAtom) 344 | { 345 | reportTestEnd(testAtom, TestResult::kPASSED); 346 | return EXIT_SUCCESS; 347 | } 348 | 349 | static int reportFail(const TestAtom& testAtom) 350 | { 351 | reportTestEnd(testAtom, TestResult::kFAILED); 352 | return EXIT_FAILURE; 353 | } 354 | 355 | static int reportWaive(const TestAtom& testAtom) 356 | { 357 | reportTestEnd(testAtom, TestResult::kWAIVED); 358 | return EXIT_SUCCESS; 359 | } 360 | 361 | static int reportTest(const TestAtom& testAtom, bool pass) 362 | { 363 | return pass ? reportPass(testAtom) : reportFail(testAtom); 364 | } 365 | 366 | Severity getReportableSeverity() const 367 | { 368 | return mReportableSeverity; 369 | } 370 | 371 | private: 372 | //! 373 | //! \brief returns an appropriate string for prefixing a log message with the given severity 374 | //! 375 | static const char* severityPrefix(Severity severity) 376 | { 377 | switch (severity) 378 | { 379 | case Severity::kINTERNAL_ERROR: return "[F] "; 380 | case Severity::kERROR: return "[E] "; 381 | case Severity::kWARNING: return "[W] "; 382 | case Severity::kINFO: return "[I] "; 383 | case Severity::kVERBOSE: return "[V] "; 384 | default: assert(0); return ""; 385 | } 386 | } 387 | 388 | //! 389 | //! \brief returns an appropriate string for prefixing a test result message with the given result 390 | //! 391 | static const char* testResultString(TestResult result) 392 | { 393 | switch (result) 394 | { 395 | case TestResult::kRUNNING: return "RUNNING"; 396 | case TestResult::kPASSED: return "PASSED"; 397 | case TestResult::kFAILED: return "FAILED"; 398 | case TestResult::kWAIVED: return "WAIVED"; 399 | default: assert(0); return ""; 400 | } 401 | } 402 | 403 | //! 404 | //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity 405 | //! 406 | static std::ostream& severityOstream(Severity severity) 407 | { 408 | return severity >= Severity::kINFO ? std::cout : std::cerr; 409 | } 410 | 411 | //! 412 | //! \brief method that implements logging test results 413 | //! 
414 | static void reportTestResult(const TestAtom& testAtom, TestResult result) 415 | { 416 | severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " 417 | << testAtom.mCmdline << std::endl; 418 | } 419 | 420 | //! 421 | //! \brief generate a command line string from the given (argc, argv) values 422 | //! 423 | static std::string genCmdlineString(int argc, char const* const* argv) 424 | { 425 | std::stringstream ss; 426 | for (int i = 0; i < argc; i++) 427 | { 428 | if (i > 0) 429 | ss << " "; 430 | ss << argv[i]; 431 | } 432 | return ss.str(); 433 | } 434 | 435 | Severity mReportableSeverity; 436 | }; 437 | 438 | namespace 439 | { 440 | 441 | //! 442 | //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE 443 | //! 444 | //! Example usage: 445 | //! 446 | //! LOG_VERBOSE(logger) << "hello world" << std::endl; 447 | //! 448 | inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) 449 | { 450 | return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE); 451 | } 452 | 453 | //! 454 | //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO 455 | //! 456 | //! Example usage: 457 | //! 458 | //! LOG_INFO(logger) << "hello world" << std::endl; 459 | //! 460 | inline LogStreamConsumer LOG_INFO(const Logger& logger) 461 | { 462 | return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO); 463 | } 464 | 465 | //! 466 | //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING 467 | //! 468 | //! Example usage: 469 | //! 470 | //! LOG_WARN(logger) << "hello world" << std::endl; 471 | //! 472 | inline LogStreamConsumer LOG_WARN(const Logger& logger) 473 | { 474 | return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING); 475 | } 476 | 477 | //! 478 | //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR 479 | //! 480 | //! Example usage: 481 | //! 482 | //! LOG_ERROR(logger) << "hello world" << std::endl; 483 | //! 484 | inline LogStreamConsumer LOG_ERROR(const Logger& logger) 485 | { 486 | return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR); 487 | } 488 | 489 | //! 490 | //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR 491 | // ("fatal" severity) 492 | //! 493 | //! Example usage: 494 | //! 495 | //! LOG_FATAL(logger) << "hello world" << std::endl; 496 | //! 
497 | inline LogStreamConsumer LOG_FATAL(const Logger& logger) 498 | { 499 | return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR); 500 | } 501 | 502 | } // anonymous namespace 503 | 504 | #endif // TENSORRT_LOGGING_H 505 | -------------------------------------------------------------------------------- /tensorrtx-yolov8/include/macros.h: -------------------------------------------------------------------------------- 1 | #ifndef __MACROS_H 2 | #define __MACROS_H 3 | 4 | #include "NvInfer.h" 5 | 6 | #ifdef API_EXPORTS 7 | #if defined(_MSC_VER) 8 | #define API __declspec(dllexport) 9 | #else 10 | #define API __attribute__((visibility("default"))) 11 | #endif 12 | #else 13 | 14 | #if defined(_MSC_VER) 15 | #define API __declspec(dllimport) 16 | #else 17 | #define API 18 | #endif 19 | #endif // API_EXPORTS 20 | 21 | #if NV_TENSORRT_MAJOR >= 8 22 | #define TRT_NOEXCEPT noexcept 23 | #define TRT_CONST_ENQUEUE const 24 | #else 25 | #define TRT_NOEXCEPT 26 | #define TRT_CONST_ENQUEUE 27 | #endif 28 | 29 | #endif // __MACROS_H 30 | -------------------------------------------------------------------------------- /tensorrtx-yolov8/include/model.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "NvInfer.h" 3 | #include 4 | #include 5 | 6 | nvinfer1::IHostMemory* buildEngineYolov8n(nvinfer1::IBuilder* builder, 7 | nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path); 8 | 9 | nvinfer1::IHostMemory* buildEngineYolov8s(nvinfer1::IBuilder* builder, 10 | nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path); 11 | 12 | nvinfer1::IHostMemory* buildEngineYolov8m(nvinfer1::IBuilder* builder, 13 | nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path); 14 | 15 | nvinfer1::IHostMemory* buildEngineYolov8l(nvinfer1::IBuilder* builder, 16 | nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path); 17 | 18 | nvinfer1::IHostMemory* buildEngineYolov8x(nvinfer1::IBuilder* builder, 19 | nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path); 20 | -------------------------------------------------------------------------------- /tensorrtx-yolov8/include/postprocess.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "types.h" 4 | #include "NvInfer.h" 5 | #include 6 | 7 | cv::Rect get_rect(cv::Mat& img, float bbox[4]); 8 | 9 | void nms(std::vector& res, float *output, float conf_thresh, float nms_thresh = 0.5); 10 | 11 | void batch_nms(std::vector>& batch_res, float *output, int batch_size, int output_size, float conf_thresh, float nms_thresh = 0.5); 12 | 13 | void draw_bbox(std::vector &img_batch, std::vector> &res_batch); 14 | 15 | void batch_process(std::vector> &res_batch, const float* decode_ptr_host, int batch_size, int bbox_element, const std::vector& img_batch); 16 | 17 | void process_decode_ptr_host(std::vector &res, const float* decode_ptr_host, int bbox_element, cv::Mat& img, int count); 18 | 19 | void cuda_decode(float* predict, int num_bboxes, float confidence_threshold,float* parray,int max_objects, cudaStream_t stream); 20 | 21 | void cuda_nms(float* parray, float nms_threshold, int max_objects, cudaStream_t stream); 22 | 23 | -------------------------------------------------------------------------------- /tensorrtx-yolov8/include/preprocess.h: 
-------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include "NvInfer.h" 5 | #include "types.h" 6 | #include 7 | 8 | 9 | void cuda_preprocess_init(int max_image_size); 10 | 11 | void cuda_preprocess_destroy(); 12 | 13 | void cuda_preprocess(uint8_t *src, int src_width, int src_height, float *dst, int dst_width, int dst_height, cudaStream_t stream); 14 | 15 | void cuda_batch_preprocess(std::vector &img_batch, float *dst, int dst_width, int dst_height, cudaStream_t stream); 16 | 17 | -------------------------------------------------------------------------------- /tensorrtx-yolov8/include/types.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "config.h" 3 | 4 | struct alignas(float) Detection { 5 | //center_x center_y w h 6 | float bbox[4]; 7 | float conf; // bbox_conf * cls_conf 8 | float class_id; 9 | }; 10 | 11 | struct AffineMatrix { 12 | float value[6]; 13 | }; 14 | 15 | const int bbox_element = sizeof(AffineMatrix) / sizeof(float)+1; // left, top, right, bottom, confidence, class, keepflag 16 | -------------------------------------------------------------------------------- /tensorrtx-yolov8/include/utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | 5 | static inline cv::Mat preprocess_img(cv::Mat& img, int input_w, int input_h) { 6 | int w, h, x, y; 7 | float r_w = input_w / (img.cols*1.0); 8 | float r_h = input_h / (img.rows*1.0); 9 | if (r_h > r_w) { 10 | w = input_w; 11 | h = r_w * img.rows; 12 | x = 0; 13 | y = (input_h - h) / 2; 14 | } else { 15 | w = r_h * img.cols; 16 | h = input_h; 17 | x = (input_w - w) / 2; 18 | y = 0; 19 | } 20 | cv::Mat re(h, w, CV_8UC3); 21 | cv::resize(img, re, re.size(), 0, 0, cv::INTER_LINEAR); 22 | cv::Mat out(input_h, input_w, CV_8UC3, cv::Scalar(128, 128, 128)); 23 | re.copyTo(out(cv::Rect(x, y, re.cols, re.rows))); 24 | return out; 25 | } 26 | 27 | static inline int read_files_in_dir(const char *p_dir_name, std::vector &file_names) { 28 | DIR *p_dir = opendir(p_dir_name); 29 | if (p_dir == nullptr) { 30 | return -1; 31 | } 32 | 33 | struct dirent* p_file = nullptr; 34 | while ((p_file = readdir(p_dir)) != nullptr) { 35 | if (strcmp(p_file->d_name, ".") != 0 && 36 | strcmp(p_file->d_name, "..") != 0) { 37 | //std::string cur_file_name(p_dir_name); 38 | //cur_file_name += "/"; 39 | //cur_file_name += p_file->d_name; 40 | std::string cur_file_name(p_file->d_name); 41 | file_names.push_back(cur_file_name); 42 | } 43 | } 44 | 45 | closedir(p_dir); 46 | return 0; 47 | } 48 | -------------------------------------------------------------------------------- /tensorrtx-yolov8/main.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include 5 | #include "model.h" 6 | #include "utils.h" 7 | #include "preprocess.h" 8 | #include "postprocess.h" 9 | #include "cuda_utils.h" 10 | #include "logging.h" 11 | 12 | Logger gLogger; 13 | using namespace nvinfer1; 14 | const int kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1; 15 | 16 | void serialize_engine(std::string &wts_name, std::string &engine_name, std::string &sub_type) { 17 | IBuilder *builder = createInferBuilder(gLogger); 18 | IBuilderConfig *config = builder->createBuilderConfig(); 19 | IHostMemory *serialized_engine = nullptr; 20 | 21 | if (sub_type == "n") { 22 | serialized_engine = 
buildEngineYolov8n(builder, config, DataType::kFLOAT, wts_name); 23 | } else if (sub_type == "s") { 24 | serialized_engine = buildEngineYolov8s(builder, config, DataType::kFLOAT, wts_name); 25 | } else if (sub_type == "m") { 26 | serialized_engine = buildEngineYolov8m(builder, config, DataType::kFLOAT, wts_name); 27 | } else if (sub_type == "l") { 28 | serialized_engine = buildEngineYolov8l(builder, config, DataType::kFLOAT, wts_name); 29 | } else if (sub_type == "x") { 30 | serialized_engine = buildEngineYolov8x(builder, config, DataType::kFLOAT, wts_name); 31 | } 32 | 33 | assert(serialized_engine); 34 | std::ofstream p(engine_name, std::ios::binary); 35 | if (!p) { 36 | std::cout << "could not open plan output file" << std::endl; 37 | assert(false); 38 | } 39 | p.write(reinterpret_cast(serialized_engine->data()), serialized_engine->size()); 40 | 41 | delete builder; 42 | delete config; 43 | delete serialized_engine; 44 | } 45 | 46 | 47 | void deserialize_engine(std::string &engine_name, IRuntime **runtime, ICudaEngine **engine, IExecutionContext **context) { 48 | std::ifstream file(engine_name, std::ios::binary); 49 | if (!file.good()) { 50 | std::cerr << "read " << engine_name << " error!" << std::endl; 51 | assert(false); 52 | } 53 | size_t size = 0; 54 | file.seekg(0, file.end); 55 | size = file.tellg(); 56 | file.seekg(0, file.beg); 57 | char *serialized_engine = new char[size]; 58 | assert(serialized_engine); 59 | file.read(serialized_engine, size); 60 | file.close(); 61 | 62 | *runtime = createInferRuntime(gLogger); 63 | assert(*runtime); 64 | *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size); 65 | assert(*engine); 66 | *context = (*engine)->createExecutionContext(); 67 | assert(*context); 68 | delete[] serialized_engine; 69 | } 70 | 71 | void prepare_buffer(ICudaEngine *engine, float **input_buffer_device, float **output_buffer_device, 72 | float **output_buffer_host, float **decode_ptr_host, float **decode_ptr_device, std::string cuda_post_process) { 73 | assert(engine->getNbBindings() == 2); 74 | // In order to bind the buffers, we need to know the names of the input and output tensors. 
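// For this engine, the lookups below resolve to buffers[0] = "images" (kInputTensorName) and buffers[1] = "output" (kOutputTensorName), as the assertions verify.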
75 | // Note that indices are guaranteed to be less than IEngine::getNbBindings() 76 | const int inputIndex = engine->getBindingIndex(kInputTensorName); 77 | const int outputIndex = engine->getBindingIndex(kOutputTensorName); 78 | assert(inputIndex == 0); 79 | assert(outputIndex == 1); 80 | // Create GPU buffers on device 81 | CUDA_CHECK(cudaMalloc((void **) input_buffer_device, kBatchSize * 3 * kInputH * kInputW * sizeof(float))); 82 | CUDA_CHECK(cudaMalloc((void **) output_buffer_device, kBatchSize * kOutputSize * sizeof(float))); 83 | if (cuda_post_process == "c") { 84 | *output_buffer_host = new float[kBatchSize * kOutputSize]; 85 | } else if (cuda_post_process == "g") { 86 | if (kBatchSize > 1) { 87 | std::cerr << "Do not yet support GPU post processing for multiple batches" << std::endl; 88 | exit(0); 89 | } 90 | // Allocate memory for decode_ptr_host and copy to device 91 | *decode_ptr_host = new float[1 + kMaxNumOutputBbox * bbox_element]; 92 | CUDA_CHECK(cudaMalloc((void **)decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element))); 93 | } 94 | } 95 | 96 | void infer(IExecutionContext &context, cudaStream_t &stream, void **buffers, float *output, int batchsize, float* decode_ptr_host, float* decode_ptr_device, int model_bboxes, std::string cuda_post_process) { 97 | // infer on the batch asynchronously, and DMA output back to host 98 | // auto start = std::chrono::system_clock::now(); 99 | context.enqueue(batchsize, buffers, stream, nullptr); 100 | if (cuda_post_process == "c") { 101 | CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost,stream)); 102 | // auto end = std::chrono::system_clock::now(); 103 | // std::cout << "inference time: " << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; 104 | } else if (cuda_post_process == "g") { 105 | CUDA_CHECK(cudaMemsetAsync(decode_ptr_device, 0, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), stream)); 106 | cuda_decode((float *)buffers[1], model_bboxes, kConfThresh, decode_ptr_device, kMaxNumOutputBbox, stream); 107 | cuda_nms(decode_ptr_device, kNmsThresh, kMaxNumOutputBbox, stream);//cuda nms 108 | CUDA_CHECK(cudaMemcpyAsync(decode_ptr_host, decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), cudaMemcpyDeviceToHost, stream)); 109 | // auto end = std::chrono::system_clock::now(); 110 | // std::cout << "inference and gpu postprocess time: " << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; 111 | } 112 | 113 | CUDA_CHECK(cudaStreamSynchronize(stream)); 114 | } 115 | 116 | 117 | bool parse_args(int argc, char **argv, std::string &wts, std::string &engine, std::string &img_dir, std::string &sub_type, std::string &cuda_post_process) { 118 | if (argc < 4) return false; 119 | if (std::string(argv[1]) == "-s" && argc == 5) { 120 | wts = std::string(argv[2]); 121 | engine = std::string(argv[3]); 122 | sub_type = std::string(argv[4]); 123 | } else if (std::string(argv[1]) == "-d" && argc == 5) { 124 | engine = std::string(argv[2]); 125 | img_dir = std::string(argv[3]); 126 | cuda_post_process = std::string(argv[4]); 127 | } else { 128 | return false; 129 | } 130 | return true; 131 | } 132 | 133 | int main(int argc, char **argv) { 134 | cudaSetDevice(kGpuId); 135 | std::string wts_name = ""; 136 | std::string engine_name = ""; 137 | std::string img_dir; 138 | std::string sub_type = ""; 139 | std::string cuda_post_process=""; 140 | int model_bboxes; 141 | 142 | if (!parse_args(argc, argv, wts_name, 
engine_name, img_dir, sub_type, cuda_post_process)) {
143 | std::cerr << "Arguments not right!" << std::endl;
144 | std::cerr << "./yolov8 -s [.wts] [.engine] [n/s/m/l/x]  // serialize model to plan file" << std::endl;
145 | std::cerr << "./yolov8 -d [.engine] ../samples [c/g]  // deserialize plan file and run inference" << std::endl;
146 | return -1;
147 | }
148 | 
149 | // Create a model using the API directly and serialize it to a file
150 | if (!wts_name.empty()) {
151 | serialize_engine(wts_name, engine_name, sub_type);
152 | return 0;
153 | }
154 | 
155 | // Deserialize the engine from file
156 | IRuntime *runtime = nullptr;
157 | ICudaEngine *engine = nullptr;
158 | IExecutionContext *context = nullptr;
159 | deserialize_engine(engine_name, &runtime, &engine, &context);
160 | cudaStream_t stream;
161 | CUDA_CHECK(cudaStreamCreate(&stream));
162 | cuda_preprocess_init(kMaxInputImageSize);
163 | auto out_dims = engine->getBindingDimensions(1);
164 | model_bboxes = out_dims.d[0];
165 | // Prepare cpu and gpu buffers
166 | float *device_buffers[2];
167 | float *output_buffer_host = nullptr;
168 | float *decode_ptr_host = nullptr;
169 | float *decode_ptr_device = nullptr;
170 | 
171 | // Read images from directory
172 | std::vector<std::string> file_names;
173 | if (read_files_in_dir(img_dir.c_str(), file_names) < 0) {
174 | std::cerr << "read_files_in_dir failed." << std::endl;
175 | return -1;
176 | }
177 | 
178 | prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &output_buffer_host, &decode_ptr_host, &decode_ptr_device, cuda_post_process);
179 | 
180 | // batch predict
181 | for (size_t i = 0; i < file_names.size(); i += kBatchSize) {
182 | // Get a batch of images
183 | std::vector<cv::Mat> img_batch;
184 | std::vector<std::string> img_name_batch;
185 | for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) {
186 | cv::Mat img = cv::imread(img_dir + "/" + file_names[j]);
187 | img_batch.push_back(img);
188 | img_name_batch.push_back(file_names[j]);
189 | }
190 | auto start = std::chrono::system_clock::now();
191 | // Preprocess
192 | cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream);
193 | // Run inference
194 | infer(*context, stream, (void **)device_buffers, output_buffer_host, kBatchSize, decode_ptr_host, decode_ptr_device, model_bboxes, cuda_post_process);
195 | std::vector<std::vector<Detection>> res_batch;
196 | if (cuda_post_process == "c") {
197 | // NMS
198 | batch_nms(res_batch, output_buffer_host, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh);
199 | } else if (cuda_post_process == "g") {
200 | // Process gpu decode and nms results
201 | batch_process(res_batch, decode_ptr_host, img_batch.size(), bbox_element, img_batch);
202 | }
203 | auto end = std::chrono::system_clock::now();
204 | std::cout << "inference and postprocess time: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
205 | // Draw bounding boxes
206 | draw_bbox(img_batch, res_batch);
207 | // Save images
208 | for (size_t j = 0; j < img_batch.size(); j++) {
209 | cv::imwrite("_" + img_name_batch[j], img_batch[j]);
210 | }
211 | }
212 | 
213 | // Release stream and buffers
214 | cudaStreamDestroy(stream);
215 | CUDA_CHECK(cudaFree(device_buffers[0]));
216 | CUDA_CHECK(cudaFree(device_buffers[1]));
217 | CUDA_CHECK(cudaFree(decode_ptr_device));
218 | delete[] decode_ptr_host;
219 | delete[] output_buffer_host;
220 | cuda_preprocess_destroy();
221 | // Destroy the engine
222 | delete context;
223 | delete engine;
224 | delete runtime;
225 | 
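// [Usage sketch, not part of the original source] The two modes accepted by
// parse_args() above are typically driven like this (paths are illustrative):
//
//     ./yolov8 -s yolov8s.wts yolov8s.engine s    // build and serialize the engine
//     ./yolov8 -d yolov8s.engine ../images c      // run inference, CPU postprocess
//     ./yolov8 -d yolov8s.engine ../images g      // run inference, GPU decode + NMS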
234 | 
235 | return 0;
236 | }
237 | 
238 | 
-------------------------------------------------------------------------------- /tensorrtx-yolov8/plugin/yololayer.cu: --------------------------------------------------------------------------------
1 | #include "yololayer.h"
2 | #include "types.h"
3 | #include 
4 | #include 
5 | #include "cuda_utils.h"
6 | #include 
7 | #include 
8 | 
9 | namespace Tn {
10 | template <typename T>
11 | void write(char*& buffer, const T& val) {
12 | *reinterpret_cast<T*>(buffer) = val;
13 | buffer += sizeof(T);
14 | }
15 | 
16 | template <typename T>
17 | void read(const char*& buffer, T& val) {
18 | val = *reinterpret_cast<const T*>(buffer);
19 | buffer += sizeof(T);
20 | }
21 | }  // namespace Tn
22 | 
23 | 
24 | namespace nvinfer1 {
25 | YoloLayerPlugin::YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut) {
26 | mClassCount = classCount;
27 | mYoloV8NetWidth = netWidth;
28 | mYoloV8netHeight = netHeight;
29 | mMaxOutObject = maxOut;
30 | }
31 | 
32 | YoloLayerPlugin::~YoloLayerPlugin() {}
33 | 
34 | YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) {
35 | using namespace Tn;
36 | const char* d = reinterpret_cast<const char*>(data), * a = d;
37 | read(d, mClassCount);
38 | read(d, mThreadCount);
39 | read(d, mYoloV8NetWidth);
40 | read(d, mYoloV8netHeight);
41 | read(d, mMaxOutObject);
42 | 
43 | assert(d == a + length);
44 | }
45 | 
46 | void YoloLayerPlugin::serialize(void* buffer) const TRT_NOEXCEPT {
47 | 
48 | using namespace Tn;
49 | char* d = static_cast<char*>(buffer), * a = d;
50 | write(d, mClassCount);
51 | write(d, mThreadCount);
52 | write(d, mYoloV8NetWidth);
53 | write(d, mYoloV8netHeight);
54 | write(d, mMaxOutObject);
55 | 
56 | assert(d == a + getSerializationSize());
57 | }
58 | 
59 | size_t YoloLayerPlugin::getSerializationSize() const TRT_NOEXCEPT {
60 | return sizeof(mClassCount) + sizeof(mThreadCount) + sizeof(mYoloV8netHeight) + sizeof(mYoloV8NetWidth) + sizeof(mMaxOutObject);
61 | }
62 | 
63 | int YoloLayerPlugin::initialize() TRT_NOEXCEPT {
64 | return 0;
65 | }
66 | 
67 | nvinfer1::Dims YoloLayerPlugin::getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) TRT_NOEXCEPT {
68 | int total_size = mMaxOutObject * sizeof(Detection) / sizeof(float);
69 | return nvinfer1::Dims3(total_size + 1, 1, 1);
70 | }
71 | 
72 | void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT {
73 | mPluginNamespace = pluginNamespace;
74 | }
75 | 
76 | const char* YoloLayerPlugin::getPluginNamespace() const TRT_NOEXCEPT {
77 | return mPluginNamespace;
78 | }
79 | 
80 | nvinfer1::DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT {
81 | return nvinfer1::DataType::kFLOAT;
82 | }
83 | 
84 | bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT {
85 | 
86 | return false;
87 | }
88 | 
89 | bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT {
90 | 
91 | return false;
92 | }
93 | 
94 | void YoloLayerPlugin::configurePlugin(nvinfer1::PluginTensorDesc const* in, int nbInput, nvinfer1::PluginTensorDesc const* out, int nbOutput) TRT_NOEXCEPT {};
95 | 
96 | void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext,
IGpuAllocator* gpuAllocator) TRT_NOEXCEPT {}; 97 | 98 | void YoloLayerPlugin::detachFromContext() TRT_NOEXCEPT {} 99 | 100 | const char* YoloLayerPlugin::getPluginType() const TRT_NOEXCEPT { 101 | 102 | return "YoloLayer_TRT"; 103 | } 104 | 105 | const char* YoloLayerPlugin::getPluginVersion() const TRT_NOEXCEPT { 106 | return "1"; 107 | } 108 | 109 | void YoloLayerPlugin::destroy() TRT_NOEXCEPT { 110 | 111 | delete this; 112 | } 113 | 114 | nvinfer1::IPluginV2IOExt* YoloLayerPlugin::clone() const TRT_NOEXCEPT { 115 | 116 | YoloLayerPlugin* p = new YoloLayerPlugin(mClassCount, mYoloV8NetWidth, mYoloV8netHeight, mMaxOutObject); 117 | p->setPluginNamespace(mPluginNamespace); 118 | return p; 119 | } 120 | 121 | int YoloLayerPlugin::enqueue(int batchSize, const void* TRT_CONST_ENQUEUE* inputs, void* const* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT { 122 | 123 | forwardGpu((const float* const*)inputs, (float*)outputs[0], stream, mYoloV8netHeight, mYoloV8NetWidth, batchSize); 124 | return 0; 125 | } 126 | 127 | 128 | __device__ float Logist(float data) { return 1.0f / (1.0f + expf(-data)); }; 129 | 130 | __global__ void CalDetection(const float* input, float* output, int numElements, int maxoutobject, 131 | const int grid_h, int grid_w, const int stride, int classes, int outputElem) { 132 | int idx = threadIdx.x + blockDim.x * blockIdx.x; 133 | if (idx >= numElements) return; 134 | 135 | int total_grid = grid_h * grid_w; 136 | int info_len = 4 + classes; 137 | int batchIdx = idx / total_grid; 138 | int elemIdx = idx % total_grid; 139 | const float* curInput = input + batchIdx * total_grid * info_len; 140 | int outputIdx = batchIdx * outputElem; 141 | 142 | int class_id = 0; 143 | float max_cls_prob = 0.0; 144 | for (int i = 4; i < info_len; i++) { 145 | float p = Logist(curInput[elemIdx + i * total_grid]); 146 | if (p > max_cls_prob) { 147 | max_cls_prob = p; 148 | class_id = i - 4; 149 | } 150 | } 151 | 152 | if (max_cls_prob < 0.1) return; 153 | 154 | int count = (int)atomicAdd(output + outputIdx, 1); 155 | if (count >= maxoutobject) return; 156 | char* data = (char*)(output + outputIdx) + sizeof(float) + count * sizeof(Detection); 157 | Detection* det = (Detection*)(data); 158 | 159 | int row = elemIdx / grid_w; 160 | int col = elemIdx % grid_w; 161 | 162 | det->conf = max_cls_prob; 163 | det->class_id = class_id; 164 | det->bbox[0] = (col + 0.5f - curInput[elemIdx + 0 * total_grid]) * stride; 165 | det->bbox[1] = (row + 0.5f - curInput[elemIdx + 1 * total_grid]) * stride; 166 | det->bbox[2] = (col + 0.5f + curInput[elemIdx + 2 * total_grid]) * stride; 167 | det->bbox[3] = (row + 0.5f + curInput[elemIdx + 3 * total_grid]) * stride; 168 | } 169 | 170 | void YoloLayerPlugin::forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int mYoloV8netHeight,int mYoloV8NetWidth, int batchSize) { 171 | int outputElem = 1 + mMaxOutObject * sizeof(Detection) / sizeof(float); 172 | cudaMemsetAsync(output, 0, sizeof(float), stream); 173 | for (int idx = 0; idx < batchSize; ++idx) { 174 | CUDA_CHECK(cudaMemsetAsync(output + idx * outputElem, 0, sizeof(float), stream)); 175 | } 176 | int numElem = 0; 177 | int grids[3][2] = { {mYoloV8netHeight / 8, mYoloV8NetWidth / 8}, {mYoloV8netHeight / 16, mYoloV8NetWidth / 16}, {mYoloV8netHeight / 32, mYoloV8NetWidth / 32} }; 178 | int strides[] = { 8, 16, 32 }; 179 | for (unsigned int i = 0; i < 3; i++) { 180 | int grid_h = grids[i][0]; 181 | int grid_w = grids[i][1]; 182 | int stride = strides[i]; 183 | numElem = grid_h * grid_w * 
batchSize;
184 | if (numElem < mThreadCount) mThreadCount = numElem;
185 | 
186 | CalDetection << <(numElem + mThreadCount - 1) / mThreadCount, mThreadCount, 0, stream >> >
187 | (inputs[i], output, numElem, mMaxOutObject, grid_h, grid_w, stride, mClassCount, outputElem);
188 | }
189 | }
190 | 
191 | PluginFieldCollection YoloPluginCreator::mFC{};
192 | std::vector<PluginField> YoloPluginCreator::mPluginAttributes;
193 | 
194 | YoloPluginCreator::YoloPluginCreator() {
195 | mPluginAttributes.clear();
196 | mFC.nbFields = mPluginAttributes.size();
197 | mFC.fields = mPluginAttributes.data();
198 | }
199 | 
200 | const char* YoloPluginCreator::getPluginName() const TRT_NOEXCEPT {
201 | return "YoloLayer_TRT";
202 | }
203 | 
204 | const char* YoloPluginCreator::getPluginVersion() const TRT_NOEXCEPT {
205 | return "1";
206 | }
207 | 
208 | const PluginFieldCollection* YoloPluginCreator::getFieldNames() TRT_NOEXCEPT {
209 | return &mFC;
210 | }
211 | 
212 | IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT {
213 | assert(fc->nbFields == 1);
214 | assert(strcmp(fc->fields[0].name, "netinfo") == 0);
215 | int* p_netinfo = (int*)(fc->fields[0].data);
216 | int class_count = p_netinfo[0];
217 | int input_w = p_netinfo[1];
218 | int input_h = p_netinfo[2];
219 | int max_output_object_count = p_netinfo[3];
220 | YoloLayerPlugin* obj = new YoloLayerPlugin(class_count, input_w, input_h, max_output_object_count);
221 | obj->setPluginNamespace(mNamespace.c_str());
222 | return obj;
223 | }
224 | 
225 | IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT {
226 | // This object will be deleted when the network is destroyed, which will
227 | // call YoloLayerPlugin::destroy()
228 | YoloLayerPlugin* obj = new YoloLayerPlugin(serialData, serialLength);
229 | obj->setPluginNamespace(mNamespace.c_str());
230 | return obj;
231 | }
232 | 
233 | } // namespace nvinfer1
234 | 
-------------------------------------------------------------------------------- /tensorrtx-yolov8/plugin/yololayer.h: --------------------------------------------------------------------------------
1 | #pragma once
2 | #include "macros.h"
3 | #include "NvInfer.h"
4 | #include <string>
5 | #include <vector>
6 | #include "macros.h"
7 | namespace nvinfer1 {
8 | class API YoloLayerPlugin : public IPluginV2IOExt {
9 | public:
10 | YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut);
11 | YoloLayerPlugin(const void* data, size_t length);
12 | ~YoloLayerPlugin();
13 | 
14 | int getNbOutputs() const TRT_NOEXCEPT override {
15 | return 1;
16 | }
17 | 
18 | nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) TRT_NOEXCEPT override;
19 | 
20 | int initialize() TRT_NOEXCEPT override;
21 | 
22 | virtual void terminate() TRT_NOEXCEPT override {}
23 | 
24 | virtual size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override { return 0; }
25 | 
26 | virtual int enqueue(int batchSize, const void* const* inputs, void* TRT_CONST_ENQUEUE* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT override;
27 | 
28 | virtual size_t getSerializationSize() const TRT_NOEXCEPT override;
29 | 
30 | virtual void serialize(void* buffer) const TRT_NOEXCEPT override;
31 | 
32 | bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const TRT_NOEXCEPT override {
33 | return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT;
34 | }
35 | 
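// [Editor's sketch, not part of the original source] The (data, length)
// constructor and serialize() walk the same five ints in the same order
// (mClassCount, mThreadCount, mYoloV8NetWidth, mYoloV8netHeight, mMaxOutObject),
// so a round trip through a raw buffer reproduces the plugin:
//
//     // illustrative only; both ends assert they consumed exactly n bytes
//     size_t n = plugin.getSerializationSize();
//     std::vector<char> buf(n);
//     plugin.serialize(buf.data());
//     YoloLayerPlugin copy(buf.data(), n);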
36 | 37 | const char* getPluginType() const TRT_NOEXCEPT override; 38 | 39 | const char* getPluginVersion() const TRT_NOEXCEPT override; 40 | 41 | void destroy() TRT_NOEXCEPT override; 42 | 43 | IPluginV2IOExt* clone() const TRT_NOEXCEPT override; 44 | 45 | void setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT override; 46 | 47 | const char* getPluginNamespace() const TRT_NOEXCEPT override; 48 | 49 | nvinfer1::DataType getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes, int32_t nbInputs) const TRT_NOEXCEPT; 50 | 51 | bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT override; 52 | 53 | bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override; 54 | 55 | void attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT override; 56 | 57 | void configurePlugin(PluginTensorDesc const* in, int32_t nbInput, PluginTensorDesc const* out, int32_t nbOutput) TRT_NOEXCEPT override; 58 | 59 | void detachFromContext() TRT_NOEXCEPT override; 60 | 61 | private: 62 | void forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int mYoloV8netHeight, int mYoloV8NetWidth, int batchSize); 63 | int mThreadCount = 256; 64 | const char* mPluginNamespace; 65 | int mClassCount; 66 | int mYoloV8NetWidth; 67 | int mYoloV8netHeight; 68 | int mMaxOutObject; 69 | }; 70 | 71 | class API YoloPluginCreator : public IPluginCreator { 72 | public: 73 | YoloPluginCreator(); 74 | ~YoloPluginCreator() override = default; 75 | 76 | const char* getPluginName() const TRT_NOEXCEPT override; 77 | 78 | const char* getPluginVersion() const TRT_NOEXCEPT override; 79 | 80 | const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override; 81 | 82 | nvinfer1::IPluginV2IOExt* createPlugin(const char* name, const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT override; 83 | 84 | nvinfer1::IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT override; 85 | 86 | void setPluginNamespace(const char* libNamespace) TRT_NOEXCEPT override { 87 | mNamespace = libNamespace; 88 | } 89 | 90 | const char* getPluginNamespace() const TRT_NOEXCEPT override { 91 | return mNamespace.c_str(); 92 | } 93 | 94 | private: 95 | std::string mNamespace; 96 | static PluginFieldCollection mFC; 97 | static std::vector mPluginAttributes; 98 | }; 99 | REGISTER_TENSORRT_PLUGIN(YoloPluginCreator); 100 | } // namespace nvinfer1 101 | 102 | -------------------------------------------------------------------------------- /tensorrtx-yolov8/src/block.cpp: -------------------------------------------------------------------------------- 1 | #include "block.h" 2 | #include "yololayer.h" 3 | #include "config.h" 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | std::map loadWeights(const std::string file){ 10 | std::cout << "Loading weights: " << file << std::endl; 11 | std::map WeightMap; 12 | 13 | std::ifstream input(file); 14 | assert(input.is_open() && "Unable to load weight file. 
please check if the .wts file path is right!!!!!!"); 15 | 16 | int32_t count; 17 | input>>count ; 18 | assert(count > 0 && "Invalid weight map file."); 19 | 20 | while(count--){ 21 | nvinfer1::Weights wt{nvinfer1::DataType::kFLOAT, nullptr, 0}; 22 | uint32_t size; 23 | 24 | std::string name; 25 | input >> name >> std::dec >> size; 26 | wt.type = nvinfer1::DataType::kFLOAT; 27 | 28 | uint32_t* val = reinterpret_cast(malloc(sizeof(val) * size)); 29 | for(uint32_t x = 0, y = size; x < y; x++){ 30 | input >> std::hex >> val[x]; 31 | } 32 | wt.values = val; 33 | wt.count = size; 34 | WeightMap[name] = wt; 35 | } 36 | return WeightMap; 37 | } 38 | 39 | 40 | static nvinfer1::IScaleLayer* addBatchNorm2d(nvinfer1::INetworkDefinition* network, std::map weightMap, 41 | nvinfer1::ITensor& input, std::string lname, float eps){ 42 | float* gamma = (float*)weightMap[lname + ".weight"].values; 43 | float* beta = (float*)weightMap[lname + ".bias"].values; 44 | float* mean = (float*)weightMap[lname + ".running_mean"].values; 45 | float* var = (float*)weightMap[lname + ".running_var"].values; 46 | int len = weightMap[lname + ".running_var"].count; 47 | 48 | float* scval = reinterpret_cast(malloc(sizeof(float) * len)); 49 | for(int i = 0; i < len; i++){ 50 | scval[i] = gamma[i] / sqrt(var[i] + eps); 51 | } 52 | nvinfer1::Weights scale{nvinfer1::DataType::kFLOAT, scval, len}; 53 | 54 | float* shval = reinterpret_cast(malloc(sizeof(float) * len)); 55 | for(int i = 0; i < len; i++){ 56 | shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps); 57 | } 58 | nvinfer1::Weights shift{nvinfer1::DataType::kFLOAT, shval, len}; 59 | 60 | float* pval = reinterpret_cast(malloc(sizeof(float) * len)); 61 | for (int i = 0; i < len; i++) { 62 | pval[i] = 1.0; 63 | } 64 | nvinfer1::Weights power{ nvinfer1::DataType::kFLOAT, pval, len }; 65 | weightMap[lname + ".scale"] = scale; 66 | weightMap[lname + ".shift"] = shift; 67 | weightMap[lname + ".power"] = power; 68 | nvinfer1::IScaleLayer* output = network->addScale(input, nvinfer1::ScaleMode::kCHANNEL, shift, scale, power); 69 | assert(output); 70 | return output; 71 | } 72 | 73 | 74 | nvinfer1::IElementWiseLayer* convBnSiLU(nvinfer1::INetworkDefinition* network, std::map weightMap, 75 | nvinfer1::ITensor& input, int ch, int k, int s, int p, std::string lname){ 76 | nvinfer1::Weights bias_empty{nvinfer1::DataType::kFLOAT, nullptr, 0}; 77 | nvinfer1::IConvolutionLayer* conv = network->addConvolutionNd(input, ch, nvinfer1::DimsHW{k, k}, weightMap[lname+".conv.weight"], bias_empty); 78 | assert(conv); 79 | conv->setStrideNd(nvinfer1::DimsHW{s, s}); 80 | conv->setPaddingNd(nvinfer1::DimsHW{p, p}); 81 | 82 | nvinfer1::IScaleLayer* bn = addBatchNorm2d(network, weightMap, *conv->getOutput(0), lname+".bn", 1e-5); 83 | 84 | nvinfer1::IActivationLayer* sigmoid = network->addActivation(*bn->getOutput(0), nvinfer1::ActivationType::kSIGMOID); 85 | nvinfer1::IElementWiseLayer* ew = network->addElementWise(*bn->getOutput(0), *sigmoid->getOutput(0), nvinfer1::ElementWiseOperation::kPROD); 86 | assert(ew); 87 | return ew; 88 | } 89 | 90 | 91 | nvinfer1::ILayer* bottleneck(nvinfer1::INetworkDefinition* network, std::map weightMap, 92 | nvinfer1::ITensor& input, int c1, int c2, bool shortcut, float e, std::string lname){ 93 | nvinfer1::IElementWiseLayer* conv1 = convBnSiLU(network, weightMap, input, c2, 3, 1, 1, lname+".cv1"); 94 | nvinfer1::IElementWiseLayer* conv2 = convBnSiLU(network, weightMap, *conv1->getOutput(0), c2, 3, 1, 1, lname+".cv2"); 95 | 96 | if(shortcut && c1 == c2){ 97 | 
nvinfer1::IElementWiseLayer* ew = network->addElementWise(input, *conv2->getOutput(0), nvinfer1::ElementWiseOperation::kSUM); 98 | return ew; 99 | } 100 | return conv2; 101 | } 102 | 103 | 104 | nvinfer1::IElementWiseLayer* C2F(nvinfer1::INetworkDefinition* network, std::map weightMap, 105 | nvinfer1::ITensor& input, int c1, int c2, int n, bool shortcut, float e, std::string lname){ 106 | int c_ = (float)c2 * e; 107 | 108 | nvinfer1::IElementWiseLayer* conv1 = convBnSiLU(network, weightMap, input, 2* c_, 1, 1, 0, lname+".cv1"); 109 | nvinfer1::Dims d = conv1->getOutput(0)->getDimensions(); 110 | 111 | nvinfer1::ISliceLayer* split1 = network->addSlice(*conv1->getOutput(0), nvinfer1::Dims3{0,0,0}, nvinfer1::Dims3{d.d[0]/2, d.d[1], d.d[2]}, nvinfer1::Dims3{1,1,1}); 112 | nvinfer1::ISliceLayer* split2 = network->addSlice(*conv1->getOutput(0), nvinfer1::Dims3{d.d[0]/2,0,0}, nvinfer1::Dims3{d.d[0]/2, d.d[1], d.d[2]}, nvinfer1::Dims3{1,1,1}); 113 | nvinfer1::ITensor* inputTensor0[] = {split1->getOutput(0), split2->getOutput(0)}; 114 | nvinfer1::IConcatenationLayer* cat = network->addConcatenation(inputTensor0, 2); 115 | nvinfer1::ITensor* y1 = split2->getOutput(0); 116 | for(int i = 0; i < n; i++){ 117 | auto* b = bottleneck(network, weightMap, *y1, c_, c_, shortcut, 1.0, lname+".m." + std::to_string(i)); 118 | y1 = b->getOutput(0); 119 | 120 | nvinfer1::ITensor* inputTensors[] = {cat->getOutput(0), b->getOutput(0)}; 121 | cat = network->addConcatenation(inputTensors, 2); 122 | } 123 | 124 | nvinfer1::IElementWiseLayer* conv2 = convBnSiLU(network, weightMap, *cat->getOutput(0), c2, 1, 1, 0, lname+".cv2"); 125 | 126 | return conv2; 127 | } 128 | 129 | 130 | nvinfer1::IElementWiseLayer* SPPF(nvinfer1::INetworkDefinition* network, std::map weightMap, 131 | nvinfer1::ITensor& input, int c1, int c2, int k, std::string lname){ 132 | int c_ = c1 / 2; 133 | 134 | nvinfer1::IElementWiseLayer* conv1 = convBnSiLU(network, weightMap, input, c_, 1, 1, 0, lname+".cv1"); 135 | 136 | nvinfer1::IPoolingLayer* pool1 = network->addPoolingNd(*conv1->getOutput(0), nvinfer1::PoolingType::kMAX, nvinfer1::DimsHW{k,k}); 137 | pool1->setStrideNd(nvinfer1::DimsHW{1, 1}); 138 | pool1->setPaddingNd(nvinfer1::DimsHW{ k / 2, k / 2 }); 139 | nvinfer1::IPoolingLayer* pool2 = network->addPoolingNd(*pool1->getOutput(0), nvinfer1::PoolingType::kMAX, nvinfer1::DimsHW{k,k}); 140 | pool2->setStrideNd(nvinfer1::DimsHW{1, 1}); 141 | pool2->setPaddingNd(nvinfer1::DimsHW{ k / 2, k / 2 }); 142 | nvinfer1::IPoolingLayer* pool3 = network->addPoolingNd(*pool2->getOutput(0), nvinfer1::PoolingType::kMAX, nvinfer1::DimsHW{k,k}); 143 | pool3->setStrideNd(nvinfer1::DimsHW{1, 1}); 144 | pool3->setPaddingNd(nvinfer1::DimsHW{ k / 2, k / 2 }); 145 | nvinfer1::ITensor* inputTensors[] = {conv1->getOutput(0), pool1->getOutput(0), pool2->getOutput(0), pool3->getOutput(0)}; 146 | nvinfer1::IConcatenationLayer* cat = network->addConcatenation(inputTensors, 4); 147 | nvinfer1::IElementWiseLayer* conv2 = convBnSiLU(network, weightMap, *cat->getOutput(0), c2, 1, 1, 0, lname+".cv2"); 148 | return conv2; 149 | } 150 | 151 | 152 | nvinfer1::IShuffleLayer* DFL(nvinfer1::INetworkDefinition* network, std::map weightMap, 153 | nvinfer1::ITensor& input, int ch, int grid, int k, int s, int p, std::string lname){ 154 | 155 | nvinfer1::IShuffleLayer* shuffle1 = network->addShuffle(input); 156 | shuffle1->setReshapeDimensions(nvinfer1::Dims3{4, 16, grid}); 157 | shuffle1->setSecondTranspose(nvinfer1::Permutation{1, 0, 2}); 158 | nvinfer1::ISoftMaxLayer* softmax = 
network->addSoftMax(*shuffle1->getOutput(0)); 159 | 160 | nvinfer1::Weights bias_empty{nvinfer1::DataType::kFLOAT, nullptr, 0}; 161 | nvinfer1::IConvolutionLayer* conv = network->addConvolutionNd(*softmax->getOutput(0), 1, nvinfer1::DimsHW{1, 1}, weightMap[lname], bias_empty); 162 | conv->setStrideNd(nvinfer1::DimsHW{s, s}); 163 | conv->setPaddingNd(nvinfer1::DimsHW{p, p}); 164 | 165 | nvinfer1::IShuffleLayer* shuffle2 = network->addShuffle(*conv->getOutput(0)); 166 | shuffle2->setReshapeDimensions(nvinfer1::Dims2{4, grid}); 167 | 168 | return shuffle2; 169 | } 170 | 171 | 172 | nvinfer1::IPluginV2Layer* addYoLoLayer(nvinfer1::INetworkDefinition *network, std::vector dets) { 173 | auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1"); 174 | 175 | nvinfer1::PluginField plugin_fields[1]; 176 | int netinfo[4] = {kNumClass, kInputW, kInputH, kMaxNumOutputBbox}; 177 | plugin_fields[0].data = netinfo; 178 | plugin_fields[0].length = 4; 179 | plugin_fields[0].name = "netinfo"; 180 | plugin_fields[0].type = nvinfer1::PluginFieldType::kFLOAT32; 181 | 182 | 183 | nvinfer1::PluginFieldCollection plugin_data; 184 | plugin_data.nbFields = 1; 185 | plugin_data.fields = plugin_fields; 186 | nvinfer1::IPluginV2 *plugin_obj = creator->createPlugin("yololayer", &plugin_data); 187 | std::vector input_tensors; 188 | for (auto det: dets) { 189 | input_tensors.push_back(det->getOutput(0)); 190 | } 191 | auto yolo = network->addPluginV2(&input_tensors[0], input_tensors.size(), *plugin_obj); 192 | return yolo; 193 | } 194 | -------------------------------------------------------------------------------- /tensorrtx-yolov8/src/calibrator.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "calibrator.h" 6 | #include "cuda_utils.h" 7 | #include "utils.h" 8 | 9 | Int8EntropyCalibrator2::Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, 10 | const char* input_blob_name, bool read_cache) 11 | : batchsize_(batchsize) 12 | , input_w_(input_w) 13 | , input_h_(input_h) 14 | , img_idx_(0) 15 | , img_dir_(img_dir) 16 | , calib_table_name_(calib_table_name) 17 | , input_blob_name_(input_blob_name) 18 | , read_cache_(read_cache) 19 | { 20 | input_count_ = 3 * input_w * input_h * batchsize; 21 | CUDA_CHECK(cudaMalloc(&device_input_, input_count_ * sizeof(float))); 22 | read_files_in_dir(img_dir, img_files_); 23 | } 24 | 25 | Int8EntropyCalibrator2::~Int8EntropyCalibrator2() 26 | { 27 | CUDA_CHECK(cudaFree(device_input_)); 28 | } 29 | 30 | int Int8EntropyCalibrator2::getBatchSize() const TRT_NOEXCEPT 31 | { 32 | return batchsize_; 33 | } 34 | 35 | bool Int8EntropyCalibrator2::getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT 36 | { 37 | if (img_idx_ + batchsize_ > (int)img_files_.size()) { 38 | return false; 39 | } 40 | 41 | std::vector input_imgs_; 42 | for (int i = img_idx_; i < img_idx_ + batchsize_; i++) { 43 | std::cout << img_files_[i] << " " << i << std::endl; 44 | cv::Mat temp = cv::imread(img_dir_ + img_files_[i]); 45 | if (temp.empty()){ 46 | std::cerr << "Fatal error: image cannot open!" 
<< std::endl; 47 | return false; 48 | } 49 | cv::Mat pr_img = preprocess_img(temp, input_w_, input_h_); 50 | input_imgs_.push_back(pr_img); 51 | } 52 | img_idx_ += batchsize_; 53 | cv::Mat blob = cv::dnn::blobFromImages(input_imgs_, 1.0 / 255.0, cv::Size(input_w_, input_h_), cv::Scalar(0, 0, 0), true, false); 54 | CUDA_CHECK(cudaMemcpy(device_input_, blob.ptr(0), input_count_ * sizeof(float), cudaMemcpyHostToDevice)); 55 | assert(!strcmp(names[0], input_blob_name_)); 56 | bindings[0] = device_input_; 57 | return true; 58 | } 59 | 60 | const void* Int8EntropyCalibrator2::readCalibrationCache(size_t& length) TRT_NOEXCEPT 61 | { 62 | std::cout << "reading calib cache: " << calib_table_name_ << std::endl; 63 | calib_cache_.clear(); 64 | std::ifstream input(calib_table_name_, std::ios::binary); 65 | input >> std::noskipws; 66 | if (read_cache_ && input.good()) 67 | { 68 | std::copy(std::istream_iterator(input), std::istream_iterator(), std::back_inserter(calib_cache_)); 69 | } 70 | length = calib_cache_.size(); 71 | return length ? calib_cache_.data() : nullptr; 72 | } 73 | 74 | void Int8EntropyCalibrator2::writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT 75 | { 76 | std::cout << "writing calib cache: " << calib_table_name_ << " size: " << length << std::endl; 77 | std::ofstream output(calib_table_name_, std::ios::binary); 78 | output.write(reinterpret_cast(cache), length); 79 | } 80 | 81 | -------------------------------------------------------------------------------- /tensorrtx-yolov8/src/postprocess.cpp: -------------------------------------------------------------------------------- 1 | #include "postprocess.h" 2 | 3 | 4 | cv::Rect get_rect(cv::Mat &img, float bbox[4]) { 5 | float l, r, t, b; 6 | float r_w = kInputW / (img.cols * 1.0); 7 | float r_h = kInputH / (img.rows * 1.0); 8 | 9 | if (r_h > r_w) { 10 | l = bbox[0]; 11 | r = bbox[2]; 12 | t = bbox[1] - (kInputH - r_w * img.rows) / 2; 13 | b = bbox[3] - (kInputH - r_w * img.rows) / 2; 14 | l = l / r_w; 15 | r = r / r_w; 16 | t = t / r_w; 17 | b = b / r_w; 18 | } else { 19 | l = bbox[0] - (kInputW - r_h * img.cols) / 2; 20 | r = bbox[2] - (kInputW - r_h * img.cols) / 2; 21 | t = bbox[1]; 22 | b = bbox[3]; 23 | l = l / r_h; 24 | r = r / r_h; 25 | t = t / r_h; 26 | b = b / r_h; 27 | } 28 | return cv::Rect(round(l), round(t), round(r - l), round(b - t)); 29 | } 30 | 31 | static float iou(float lbox[4], float rbox[4]) { 32 | float interBox[] = { 33 | (std::max)(lbox[0], rbox[0]), //left 34 | (std::min)(lbox[2], rbox[2]), //right 35 | (std::max)(lbox[1], rbox[1]), //top 36 | (std::min)(lbox[3], rbox[3]), //bottom 37 | }; 38 | 39 | if (interBox[2] > interBox[3] || interBox[0] > interBox[1]) 40 | return 0.0f; 41 | 42 | float interBoxS = (interBox[1] - interBox[0]) * (interBox[3] - interBox[2]); 43 | float unionBoxS = (lbox[2] - lbox[0]) * (lbox[3] - lbox[1]) + (rbox[2] - rbox[0]) * (rbox[3] - rbox[1]) - interBoxS; 44 | return interBoxS / unionBoxS; 45 | } 46 | 47 | static bool cmp(const Detection &a, const Detection &b) { 48 | return a.conf > b.conf; 49 | } 50 | 51 | void nms(std::vector &res, float *output, float conf_thresh, float nms_thresh) { 52 | int det_size = sizeof(Detection) / sizeof(float); 53 | std::map> m; 54 | 55 | for (int i = 0; i < output[0]; i++) { 56 | if (output[1 + det_size * i + 4] <= conf_thresh) continue; 57 | Detection det; 58 | memcpy(&det, &output[1 + det_size * i], det_size * sizeof(float)); 59 | if (m.count(det.class_id) == 0) m.emplace(det.class_id, std::vector()); 60 | 
m[det.class_id].push_back(det);
61 | }
62 | for (auto it = m.begin(); it != m.end(); it++) {
63 | auto &dets = it->second;
64 | std::sort(dets.begin(), dets.end(), cmp);
65 | for (size_t i = 0; i < dets.size(); ++i) {
66 | auto &item = dets[i];
67 | res.push_back(item);
68 | for (size_t n = i + 1; n < dets.size(); ++n) {
69 | if (iou(item.bbox, dets[n].bbox) > nms_thresh) {
70 | dets.erase(dets.begin() + n);
71 | --n;
72 | }
73 | }
74 | }
75 | }
76 | }
77 | 
78 | void batch_nms(std::vector<std::vector<Detection>> &res_batch, float *output, int batch_size, int output_size,
79 | float conf_thresh, float nms_thresh) {
80 | res_batch.resize(batch_size);
81 | for (int i = 0; i < batch_size; i++) {
82 | nms(res_batch[i], &output[i * output_size], conf_thresh, nms_thresh);
83 | }
84 | }
85 | 
86 | void process_decode_ptr_host(std::vector<Detection> &res, const float* decode_ptr_host, int bbox_element, cv::Mat& img, int count) {
87 | Detection det;
88 | for (int i = 0; i < count; i++) {
89 | int basic_pos = 1 + i * bbox_element;
90 | int keep_flag = decode_ptr_host[basic_pos + 6];
91 | if (keep_flag == 1) {
92 | det.bbox[0] = decode_ptr_host[basic_pos + 0];
93 | det.bbox[1] = decode_ptr_host[basic_pos + 1];
94 | det.bbox[2] = decode_ptr_host[basic_pos + 2];
95 | det.bbox[3] = decode_ptr_host[basic_pos + 3];
96 | det.conf = decode_ptr_host[basic_pos + 4];
97 | det.class_id = decode_ptr_host[basic_pos + 5];
98 | res.push_back(det);
99 | }
100 | }
101 | }
102 | 
103 | void batch_process(std::vector<std::vector<Detection>> &res_batch, const float* decode_ptr_host, int batch_size, int bbox_element, const std::vector<cv::Mat>& img_batch) {
104 | res_batch.resize(batch_size);
105 | int count = static_cast<int>(*decode_ptr_host);
106 | count = std::min(count, kMaxNumOutputBbox);
107 | for (int i = 0; i < batch_size; i++) {
108 | auto& img = const_cast<cv::Mat&>(img_batch[i]);
109 | process_decode_ptr_host(res_batch[i], &decode_ptr_host[i * count], bbox_element, img, count);
110 | }
111 | }
112 | 
113 | void draw_bbox(std::vector<cv::Mat> &img_batch, std::vector<std::vector<Detection>> &res_batch) {
114 | for (size_t i = 0; i < img_batch.size(); i++) {
115 | auto &res = res_batch[i];
116 | cv::Mat img = img_batch[i];
117 | for (size_t j = 0; j < res.size(); j++) {
118 | cv::Rect r = get_rect(img, res[j].bbox);
119 | cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2);
120 | cv::putText(img, std::to_string((int) res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN,
121 | 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2);
122 | }
123 | }
124 | }
125 | 
-------------------------------------------------------------------------------- /tensorrtx-yolov8/src/postprocess.cu: --------------------------------------------------------------------------------
1 | //
2 | // Created by lindsay on 23-7-17.
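// [Editor's layout note, not part of the original source] decode_kernel and
// nms_kernel below share one flat float buffer. Assuming bbox_element (defined
// in types.h, not shown here) is 7, matching the seven values written per box:
//
//     parray[0]                             -> running box count (atomicAdd target)
//     parray[1 + i*bbox_element + 0..3]     -> left, top, right, bottom
//     parray[1 + i*bbox_element + 4]        -> confidence
//     parray[1 + i*bbox_element + 5]        -> class label
//     parray[1 + i*bbox_element + 6]        -> keep flag (1 = keep, 0 = suppressed)
//
// process_decode_ptr_host() in postprocess.cpp reads this same layout on the host.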
3 | // 4 | #include "types.h" 5 | #include "postprocess.h" 6 | 7 | static __global__ void 8 | decode_kernel(float *predict, int num_bboxes, float confidence_threshold, float *parray, int max_objects) { 9 | 10 | float count = predict[0]; 11 | int position = (blockDim.x * blockIdx.x + threadIdx.x); 12 | if (position >= count) 13 | return; 14 | float *pitem = predict + 1 + position * 6; 15 | int index = atomicAdd(parray, 1); 16 | if (index >= max_objects) 17 | return; 18 | float confidence = pitem[4]; 19 | if (confidence < confidence_threshold) 20 | return; 21 | float left = pitem[0]; 22 | float top = pitem[1]; 23 | float right = pitem[2]; 24 | float bottom = pitem[3]; 25 | float label = pitem[5]; 26 | float *pout_item = parray + 1 + index * bbox_element; 27 | *pout_item++ = left; 28 | *pout_item++ = top; 29 | *pout_item++ = right; 30 | *pout_item++ = bottom; 31 | *pout_item++ = confidence; 32 | *pout_item++ = label; 33 | *pout_item++ = 1; // 1 = keep, 0 = ignore 34 | } 35 | 36 | static __device__ float 37 | box_iou(float aleft, float atop, float aright, float abottom, float bleft, float btop, float bright, float bbottom) { 38 | 39 | float cleft = max(aleft, bleft); 40 | float ctop = max(atop, btop); 41 | float cright = min(aright, bright); 42 | float cbottom = min(abottom, bbottom); 43 | 44 | float c_area = max(cright - cleft, 0.0f) * max(cbottom - ctop, 0.0f); 45 | if (c_area == 0.0f) 46 | return 0.0f; 47 | 48 | float a_area = max(0.0f, aright - aleft) * max(0.0f, abottom - atop); 49 | float b_area = max(0.0f, bright - bleft) * max(0.0f, bbottom - btop); 50 | return c_area / (a_area + b_area - c_area); 51 | } 52 | 53 | static __global__ void nms_kernel(float *bboxes, int max_objects, float threshold) { 54 | 55 | int position = (blockDim.x * blockIdx.x + threadIdx.x); 56 | int count = bboxes[0]; 57 | 58 | // float count = 0.0f; 59 | if (position >= count) 60 | return; 61 | 62 | float *pcurrent = bboxes + 1 + position * bbox_element; 63 | for (int i = 1; i < count; ++i) { 64 | float *pitem = bboxes + 1 + i * bbox_element; 65 | if (i == position || pcurrent[5] != pitem[5]) continue; 66 | 67 | if (pitem[4] >= pcurrent[4]) { 68 | if (pitem[4] == pcurrent[4] && i < position) 69 | continue; 70 | 71 | float iou = box_iou( 72 | pcurrent[0], pcurrent[1], pcurrent[2], pcurrent[3], 73 | pitem[0], pitem[1], pitem[2], pitem[3] 74 | ); 75 | 76 | if (iou > threshold) { 77 | pcurrent[6] = 0; 78 | return; 79 | } 80 | } 81 | } 82 | } 83 | 84 | void cuda_decode(float *predict, int num_bboxes, float confidence_threshold, float *parray, int max_objects, 85 | cudaStream_t stream) { 86 | int block = 256; 87 | int grid = ceil(num_bboxes / (float) block); 88 | decode_kernel << < 89 | grid, block, 0, stream >> > ((float *) predict, num_bboxes, confidence_threshold, parray, max_objects); 90 | 91 | } 92 | 93 | void cuda_nms(float *parray, float nms_threshold, int max_objects, cudaStream_t stream) { 94 | int block = max_objects < 256 ? 
max_objects : 256; 95 | int grid = ceil(max_objects / (float) block); 96 | nms_kernel << < grid, block, 0, stream >> > (parray, max_objects, nms_threshold); 97 | 98 | } 99 | -------------------------------------------------------------------------------- /tensorrtx-yolov8/src/preprocess.cu: -------------------------------------------------------------------------------- 1 | #include "preprocess.h" 2 | #include "cuda_utils.h" 3 | 4 | static uint8_t *img_buffer_host = nullptr; 5 | static uint8_t *img_buffer_device = nullptr; 6 | 7 | 8 | __global__ void 9 | warpaffine_kernel(uint8_t *src, int src_line_size, int src_width, int src_height, float *dst, int dst_width, 10 | int dst_height, uint8_t const_value_st, AffineMatrix d2s, int edge) { 11 | int position = blockDim.x * blockIdx.x + threadIdx.x; 12 | if (position >= edge) return; 13 | 14 | float m_x1 = d2s.value[0]; 15 | float m_y1 = d2s.value[1]; 16 | float m_z1 = d2s.value[2]; 17 | float m_x2 = d2s.value[3]; 18 | float m_y2 = d2s.value[4]; 19 | float m_z2 = d2s.value[5]; 20 | 21 | int dx = position % dst_width; 22 | int dy = position / dst_width; 23 | float src_x = m_x1 * dx + m_y1 * dy + m_z1 + 0.5f; 24 | float src_y = m_x2 * dx + m_y2 * dy + m_z2 + 0.5f; 25 | float c0, c1, c2; 26 | 27 | if (src_x <= -1 || src_x >= src_width || src_y <= -1 || src_y >= src_height) { 28 | // out of range 29 | c0 = const_value_st; 30 | c1 = const_value_st; 31 | c2 = const_value_st; 32 | } else { 33 | int y_low = floorf(src_y); 34 | int x_low = floorf(src_x); 35 | int y_high = y_low + 1; 36 | int x_high = x_low + 1; 37 | 38 | uint8_t const_value[] = {const_value_st, const_value_st, const_value_st}; 39 | float ly = src_y - y_low; 40 | float lx = src_x - x_low; 41 | float hy = 1 - ly; 42 | float hx = 1 - lx; 43 | float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; 44 | uint8_t *v1 = const_value; 45 | uint8_t *v2 = const_value; 46 | uint8_t *v3 = const_value; 47 | uint8_t *v4 = const_value; 48 | 49 | if (y_low >= 0) { 50 | if (x_low >= 0) 51 | v1 = src + y_low * src_line_size + x_low * 3; 52 | 53 | if (x_high < src_width) 54 | v2 = src + y_low * src_line_size + x_high * 3; 55 | } 56 | 57 | if (y_high < src_height) { 58 | if (x_low >= 0) 59 | v3 = src + y_high * src_line_size + x_low * 3; 60 | 61 | if (x_high < src_width) 62 | v4 = src + y_high * src_line_size + x_high * 3; 63 | } 64 | 65 | c0 = w1 * v1[0] + w2 * v2[0] + w3 * v3[0] + w4 * v4[0]; 66 | c1 = w1 * v1[1] + w2 * v2[1] + w3 * v3[1] + w4 * v4[1]; 67 | c2 = w1 * v1[2] + w2 * v2[2] + w3 * v3[2] + w4 * v4[2]; 68 | } 69 | 70 | // bgr to rgb 71 | float t = c2; 72 | c2 = c0; 73 | c0 = t; 74 | 75 | // normalization 76 | c0 = c0 / 255.0f; 77 | c1 = c1 / 255.0f; 78 | c2 = c2 / 255.0f; 79 | 80 | // rgbrgbrgb to rrrgggbbb 81 | int area = dst_width * dst_height; 82 | float *pdst_c0 = dst + dy * dst_width + dx; 83 | float *pdst_c1 = pdst_c0 + area; 84 | float *pdst_c2 = pdst_c1 + area; 85 | *pdst_c0 = c0; 86 | *pdst_c1 = c1; 87 | *pdst_c2 = c2; 88 | } 89 | 90 | 91 | 92 | 93 | void cuda_preprocess(uint8_t *src, int src_width, int src_height, float *dst, int dst_width, int dst_height, 94 | cudaStream_t stream) { 95 | int img_size = src_width * src_height * 3; 96 | // copy data to pinned memory 97 | memcpy(img_buffer_host, src, img_size); 98 | // copy data to device memory 99 | CUDA_CHECK(cudaMemcpyAsync(img_buffer_device, img_buffer_host, img_size, cudaMemcpyHostToDevice, stream)); 100 | 101 | AffineMatrix s2d, d2s; 102 | float scale = std::min(dst_height / (float) src_height, dst_width / (float) src_width); 
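// [Worked example, not part of the original source] For a 1920x1080 source and a
// 640x640 destination: scale = min(640/1080, 640/1920) = 1/3, so the source maps
// to a 640x360 letterboxed region and s2d becomes
//
//     x' = x/3           (s2d.value[0..2] = 1/3, 0, 0)
//     y' = y/3 + 140     (s2d.value[3..5] = 0, 1/3, 140)
//
// The kernel walks destination pixels, so it needs the inverse mapping; that is
// why m2x3_s2d is inverted into d2s below before being handed to the kernel.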
103 | 104 | s2d.value[0] = scale; 105 | s2d.value[1] = 0; 106 | s2d.value[2] = -scale * src_width * 0.5 + dst_width * 0.5; 107 | s2d.value[3] = 0; 108 | s2d.value[4] = scale; 109 | s2d.value[5] = -scale * src_height * 0.5 + dst_height * 0.5; 110 | cv::Mat m2x3_s2d(2, 3, CV_32F, s2d.value); 111 | cv::Mat m2x3_d2s(2, 3, CV_32F, d2s.value); 112 | cv::invertAffineTransform(m2x3_s2d, m2x3_d2s); 113 | 114 | memcpy(d2s.value, m2x3_d2s.ptr(0), sizeof(d2s.value)); 115 | 116 | int jobs = dst_height * dst_width; 117 | int threads = 256; 118 | int blocks = ceil(jobs / (float) threads); 119 | warpaffine_kernel<<>>( 120 | img_buffer_device, src_width * 3, src_width, 121 | src_height, dst, dst_width, 122 | dst_height, 128, d2s, jobs); 123 | } 124 | 125 | 126 | void cuda_batch_preprocess(std::vector &img_batch, 127 | float *dst, int dst_width, int dst_height, 128 | cudaStream_t stream) { 129 | int dst_size = dst_width * dst_height * 3; 130 | for (size_t i = 0; i < img_batch.size(); i++) { 131 | cuda_preprocess(img_batch[i].ptr(), img_batch[i].cols, img_batch[i].rows, &dst[dst_size * i], dst_width, 132 | dst_height, stream); 133 | CUDA_CHECK(cudaStreamSynchronize(stream)); 134 | } 135 | } 136 | 137 | 138 | 139 | 140 | 141 | void cuda_preprocess_init(int max_image_size) { 142 | // prepare input data in pinned memory 143 | CUDA_CHECK(cudaMallocHost((void **) &img_buffer_host, max_image_size * 3)); 144 | // prepare input data in device memory 145 | CUDA_CHECK(cudaMalloc((void **) &img_buffer_device, max_image_size * 3)); 146 | } 147 | 148 | void cuda_preprocess_destroy() { 149 | CUDA_CHECK(cudaFree(img_buffer_device)); 150 | CUDA_CHECK(cudaFreeHost(img_buffer_host)); 151 | } 152 | 153 | 154 | 155 | 156 | -------------------------------------------------------------------------------- /tensorrtx-yolov8/yolov8_trt.py: -------------------------------------------------------------------------------- 1 | """ 2 | An example that uses TensorRT's Python api to make inferences. 3 | """ 4 | import ctypes 5 | import os 6 | import shutil 7 | import random 8 | import sys 9 | import threading 10 | import time 11 | import cv2 12 | import numpy as np 13 | import pycuda.autoinit 14 | import pycuda.driver as cuda 15 | import tensorrt as trt 16 | 17 | CONF_THRESH = 0.5 18 | IOU_THRESHOLD = 0.4 19 | 20 | 21 | def get_img_path_batches(batch_size, img_dir): 22 | ret = [] 23 | batch = [] 24 | for root, dirs, files in os.walk(img_dir): 25 | for name in files: 26 | if len(batch) == batch_size: 27 | ret.append(batch) 28 | batch = [] 29 | batch.append(os.path.join(root, name)) 30 | if len(batch) > 0: 31 | ret.append(batch) 32 | return ret 33 | 34 | 35 | def plot_one_box(x, img, color=None, label=None, line_thickness=None): 36 | """ 37 | description: Plots one bounding box on image img, 38 | this function comes from YoLov8 project. 
39 | param: 40 | x: a box likes [x1,y1,x2,y2] 41 | img: a opencv image object 42 | color: color to draw rectangle, such as (0,255,0) 43 | label: str 44 | line_thickness: int 45 | return: 46 | no return 47 | 48 | """ 49 | tl = ( 50 | line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1 51 | ) # line/font thickness 52 | color = color or [random.randint(0, 255) for _ in range(3)] 53 | c1, c2 = (int(x[0]), int(x[1])), (int(x[2]), int(x[3])) 54 | cv2.rectangle(img, c1, c2, color, thickness=tl, lineType=cv2.LINE_AA) 55 | if label: 56 | tf = max(tl - 1, 1) # font thickness 57 | t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0] 58 | c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3 59 | cv2.rectangle(img, c1, c2, color, -1, cv2.LINE_AA) # filled 60 | cv2.putText( 61 | img, 62 | label, 63 | (c1[0], c1[1] - 2), 64 | 0, 65 | tl / 3, 66 | [225, 255, 255], 67 | thickness=tf, 68 | lineType=cv2.LINE_AA, 69 | ) 70 | 71 | 72 | class YoLov8TRT(object): 73 | """ 74 | description: A YOLOv8 class that warps TensorRT ops, preprocess and postprocess ops. 75 | """ 76 | 77 | def __init__(self, engine_file_path): 78 | # Create a Context on this device, 79 | self.ctx = cuda.Device(0).make_context() 80 | stream = cuda.Stream() 81 | TRT_LOGGER = trt.Logger(trt.Logger.INFO) 82 | runtime = trt.Runtime(TRT_LOGGER) 83 | 84 | # Deserialize the engine from file 85 | with open(engine_file_path, "rb") as f: 86 | engine = runtime.deserialize_cuda_engine(f.read()) 87 | context = engine.create_execution_context() 88 | 89 | host_inputs = [] 90 | cuda_inputs = [] 91 | host_outputs = [] 92 | cuda_outputs = [] 93 | bindings = [] 94 | 95 | for binding in engine: 96 | print('bingding:', binding, engine.get_binding_shape(binding)) 97 | size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size 98 | dtype = trt.nptype(engine.get_binding_dtype(binding)) 99 | # Allocate host and device buffers 100 | host_mem = cuda.pagelocked_empty(size, dtype) 101 | cuda_mem = cuda.mem_alloc(host_mem.nbytes) 102 | # Append the device buffer to device bindings. 103 | bindings.append(int(cuda_mem)) 104 | # Append to the appropriate list. 105 | if engine.binding_is_input(binding): 106 | self.input_w = engine.get_binding_shape(binding)[-1] 107 | self.input_h = engine.get_binding_shape(binding)[-2] 108 | host_inputs.append(host_mem) 109 | cuda_inputs.append(cuda_mem) 110 | else: 111 | host_outputs.append(host_mem) 112 | cuda_outputs.append(cuda_mem) 113 | 114 | # Store 115 | self.stream = stream 116 | self.context = context 117 | self.engine = engine 118 | self.host_inputs = host_inputs 119 | self.cuda_inputs = cuda_inputs 120 | self.host_outputs = host_outputs 121 | self.cuda_outputs = cuda_outputs 122 | self.bindings = bindings 123 | self.batch_size = engine.max_batch_size 124 | 125 | def infer(self, raw_image_generator): 126 | threading.Thread.__init__(self) 127 | # Make self the active context, pushing it on top of the context stack. 
128 | self.ctx.push() 129 | # Restore 130 | stream = self.stream 131 | context = self.context 132 | engine = self.engine 133 | host_inputs = self.host_inputs 134 | cuda_inputs = self.cuda_inputs 135 | host_outputs = self.host_outputs 136 | cuda_outputs = self.cuda_outputs 137 | bindings = self.bindings 138 | # Do image preprocess 139 | batch_image_raw = [] 140 | batch_origin_h = [] 141 | batch_origin_w = [] 142 | batch_input_image = np.empty(shape=[self.batch_size, 3, self.input_h, self.input_w]) 143 | for i, image_raw in enumerate(raw_image_generator): 144 | input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw) 145 | batch_image_raw.append(image_raw) 146 | batch_origin_h.append(origin_h) 147 | batch_origin_w.append(origin_w) 148 | np.copyto(batch_input_image[i], input_image) 149 | batch_input_image = np.ascontiguousarray(batch_input_image) 150 | 151 | # Copy input image to host buffer 152 | np.copyto(host_inputs[0], batch_input_image.ravel()) 153 | start = time.time() 154 | # Transfer input data to the GPU. 155 | cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream) 156 | # Run inference. 157 | context.execute_async(batch_size=self.batch_size, bindings=bindings, stream_handle=stream.handle) 158 | # Transfer predictions back from the GPU. 159 | cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream) 160 | # Synchronize the stream 161 | stream.synchronize() 162 | end = time.time() 163 | # Remove any context from the top of the context stack, deactivating it. 164 | self.ctx.pop() 165 | # Here we use the first row of output in that batch_size = 1 166 | output = host_outputs[0] 167 | # Do postprocess 168 | for i in range(self.batch_size): 169 | result_boxes, result_scores, result_classid = self.post_process( 170 | output[i * 6001: (i + 1) * 6001], batch_origin_h[i], batch_origin_w[i] 171 | ) 172 | # Draw rectangles and labels on the original image 173 | for j in range(len(result_boxes)): 174 | box = result_boxes[j] 175 | plot_one_box( 176 | box, 177 | batch_image_raw[i], 178 | label="{}:{:.2f}".format( 179 | categories[int(result_classid[j])], result_scores[j] 180 | ), 181 | ) 182 | return batch_image_raw, end - start 183 | 184 | def destroy(self): 185 | # Remove any context from the top of the context stack, deactivating it. 186 | self.ctx.pop() 187 | 188 | def get_raw_image(self, image_path_batch): 189 | """ 190 | description: Read an image from image path 191 | """ 192 | for img_path in image_path_batch: 193 | yield cv2.imread(img_path) 194 | 195 | def get_raw_image_zeros(self, image_path_batch=None): 196 | """ 197 | description: Ready data for warmup 198 | """ 199 | for _ in range(self.batch_size): 200 | yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8) 201 | 202 | def preprocess_image(self, raw_bgr_image): 203 | """ 204 | description: Convert BGR image to RGB, 205 | resize and pad it to target size, normalize to [0,1], 206 | transform to NCHW format. 
207 | param: 208 | input_image_path: str, image path 209 | return: 210 | image: the processed image 211 | image_raw: the original image 212 | h: original height 213 | w: original width 214 | """ 215 | image_raw = raw_bgr_image 216 | h, w, c = image_raw.shape 217 | image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB) 218 | # Calculate widht and height and paddings 219 | r_w = self.input_w / w 220 | r_h = self.input_h / h 221 | if r_h > r_w: 222 | tw = self.input_w 223 | th = int(r_w * h) 224 | tx1 = tx2 = 0 225 | ty1 = int((self.input_h - th) / 2) 226 | ty2 = self.input_h - th - ty1 227 | else: 228 | tw = int(r_h * w) 229 | th = self.input_h 230 | tx1 = int((self.input_w - tw) / 2) 231 | tx2 = self.input_w - tw - tx1 232 | ty1 = ty2 = 0 233 | # Resize the image with long side while maintaining ratio 234 | image = cv2.resize(image, (tw, th)) 235 | # Pad the short side with (128,128,128) 236 | image = cv2.copyMakeBorder( 237 | image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, None, (128, 128, 128) 238 | ) 239 | image = image.astype(np.float32) 240 | # Normalize to [0,1] 241 | image /= 255.0 242 | # HWC to CHW format: 243 | image = np.transpose(image, [2, 0, 1]) 244 | # CHW to NCHW format 245 | image = np.expand_dims(image, axis=0) 246 | # Convert the image to row-major order, also known as "C order": 247 | image = np.ascontiguousarray(image) 248 | return image, image_raw, h, w 249 | 250 | def xywh2xyxy(self, origin_h, origin_w, x): 251 | """ 252 | description: Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right 253 | param: 254 | origin_h: height of original image 255 | origin_w: width of original image 256 | x: A boxes numpy, each row is a box [center_x, center_y, w, h] 257 | return: 258 | y: A boxes numpy, each row is a box [x1, y1, x2, y2] 259 | """ 260 | y = np.zeros_like(x) 261 | r_w = self.input_w / origin_w 262 | r_h = self.input_h / origin_h 263 | if r_h > r_w: 264 | y[:, 0] = x[:, 0] 265 | y[:, 2] = x[:, 2] 266 | y[:, 1] = x[:, 1] - (self.input_h - r_w * origin_h) / 2 267 | y[:, 3] = x[:, 3] - (self.input_h - r_w * origin_h) / 2 268 | y /= r_w 269 | else: 270 | y[:, 0] = x[:, 0] - (self.input_w - r_h * origin_w) / 2 271 | y[:, 2] = x[:, 2] - (self.input_w - r_h * origin_w) / 2 272 | y[:, 1] = x[:, 1] 273 | y[:, 3] = x[:, 3] 274 | y /= r_h 275 | 276 | return y 277 | 278 | def post_process(self, output, origin_h, origin_w): 279 | """ 280 | description: postprocess the prediction 281 | param: 282 | output: A numpy likes [num_boxes,cx,cy,w,h,conf,cls_id, cx,cy,w,h,conf,cls_id, ...] 
283 | origin_h: height of original image 284 | origin_w: width of original image 285 | return: 286 | result_boxes: finally boxes, a boxes numpy, each row is a box [x1, y1, x2, y2] 287 | result_scores: finally scores, a numpy, each element is the score correspoing to box 288 | result_classid: finally classid, a numpy, each element is the classid correspoing to box 289 | """ 290 | # Get the num of boxes detected 291 | num = int(output[0]) 292 | # Reshape to a two dimentional ndarray 293 | pred = np.reshape(output[1:], (-1, 6))[:num, :] 294 | # Do nms 295 | boxes = self.non_max_suppression(pred, origin_h, origin_w, conf_thres=CONF_THRESH, nms_thres=IOU_THRESHOLD) 296 | result_boxes = boxes[:, :4] if len(boxes) else np.array([]) 297 | result_scores = boxes[:, 4] if len(boxes) else np.array([]) 298 | result_classid = boxes[:, 5] if len(boxes) else np.array([]) 299 | return result_boxes, result_scores, result_classid 300 | 301 | def bbox_iou(self, box1, box2, x1y1x2y2=True): 302 | """ 303 | description: compute the IoU of two bounding boxes 304 | param: 305 | box1: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h)) 306 | box2: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h)) 307 | x1y1x2y2: select the coordinate format 308 | return: 309 | iou: computed iou 310 | """ 311 | if not x1y1x2y2: 312 | # Transform from center and width to exact coordinates 313 | b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2 314 | b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2 315 | b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2 316 | b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2 317 | else: 318 | # Get the coordinates of bounding boxes 319 | b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3] 320 | b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3] 321 | 322 | # Get the coordinates of the intersection rectangle 323 | inter_rect_x1 = np.maximum(b1_x1, b2_x1) 324 | inter_rect_y1 = np.maximum(b1_y1, b2_y1) 325 | inter_rect_x2 = np.minimum(b1_x2, b2_x2) 326 | inter_rect_y2 = np.minimum(b1_y2, b2_y2) 327 | # Intersection area 328 | inter_area = np.clip(inter_rect_x2 - inter_rect_x1 + 1, 0, None) * \ 329 | np.clip(inter_rect_y2 - inter_rect_y1 + 1, 0, None) 330 | # Union Area 331 | b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1) 332 | b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1) 333 | 334 | iou = inter_area / (b1_area + b2_area - inter_area + 1e-16) 335 | 336 | return iou 337 | 338 | def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4): 339 | """ 340 | description: Removes detections with lower object confidence score than 'conf_thres' and performs 341 | Non-Maximum Suppression to further filter detections. 
342 | param: 343 | prediction: detections, (x1, y1, x2, y2, conf, cls_id) 344 | origin_h: original image height 345 | origin_w: original image width 346 | conf_thres: a confidence threshold to filter detections 347 | nms_thres: a iou threshold to filter detections 348 | return: 349 | boxes: output after nms with the shape (x1, y1, x2, y2, conf, cls_id) 350 | """ 351 | # Get the boxes that score > CONF_THRESH 352 | boxes = prediction[prediction[:, 4] >= conf_thres] 353 | # Trandform bbox from [center_x, center_y, w, h] to [x1, y1, x2, y2] 354 | boxes[:, :4] = self.xywh2xyxy(origin_h, origin_w, boxes[:, :4]) 355 | # clip the coordinates 356 | boxes[:, 0] = np.clip(boxes[:, 0], 0, origin_w - 1) 357 | boxes[:, 2] = np.clip(boxes[:, 2], 0, origin_w - 1) 358 | boxes[:, 1] = np.clip(boxes[:, 1], 0, origin_h - 1) 359 | boxes[:, 3] = np.clip(boxes[:, 3], 0, origin_h - 1) 360 | # Object confidence 361 | confs = boxes[:, 4] 362 | # Sort by the confs 363 | boxes = boxes[np.argsort(-confs)] 364 | # Perform non-maximum suppression 365 | keep_boxes = [] 366 | while boxes.shape[0]: 367 | large_overlap = self.bbox_iou(np.expand_dims(boxes[0, :4], 0), boxes[:, :4]) > nms_thres 368 | label_match = boxes[0, -1] == boxes[:, -1] 369 | # Indices of boxes with lower confidence scores, large IOUs and matching labels 370 | invalid = large_overlap & label_match 371 | keep_boxes += [boxes[0]] 372 | boxes = boxes[~invalid] 373 | boxes = np.stack(keep_boxes, 0) if len(keep_boxes) else np.array([]) 374 | return boxes 375 | 376 | 377 | class inferThread(threading.Thread): 378 | def __init__(self, yolov8_wrapper, image_path_batch): 379 | threading.Thread.__init__(self) 380 | self.yolov8_wrapper = yolov8_wrapper 381 | self.image_path_batch = image_path_batch 382 | 383 | def run(self): 384 | batch_image_raw, use_time = self.yolov8_wrapper.infer(self.yolov8_wrapper.get_raw_image(self.image_path_batch)) 385 | for i, img_path in enumerate(self.image_path_batch): 386 | parent, filename = os.path.split(img_path) 387 | save_name = os.path.join('output', filename) 388 | # Save image 389 | cv2.imwrite(save_name, batch_image_raw[i]) 390 | print('input->{}, time->{:.2f}ms, saving into output/'.format(self.image_path_batch, use_time * 1000)) 391 | 392 | 393 | class warmUpThread(threading.Thread): 394 | def __init__(self, yolov8_wrapper): 395 | threading.Thread.__init__(self) 396 | self.yolov8_wrapper = yolov8_wrapper 397 | 398 | def run(self): 399 | batch_image_raw, use_time = self.yolov8_wrapper.infer(self.yolov8_wrapper.get_raw_image_zeros()) 400 | print('warm_up->{}, time->{:.2f}ms'.format(batch_image_raw[0].shape, use_time * 1000)) 401 | 402 | 403 | if __name__ == "__main__": 404 | # load custom plugin and engine 405 | PLUGIN_LIBRARY = "build/libmyplugins.so" 406 | engine_file_path = "yolov8n.engine" 407 | 408 | if len(sys.argv) > 1: 409 | engine_file_path = sys.argv[1] 410 | if len(sys.argv) > 2: 411 | PLUGIN_LIBRARY = sys.argv[2] 412 | 413 | ctypes.CDLL(PLUGIN_LIBRARY) 414 | 415 | # load coco labels 416 | 417 | categories = ["person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", 418 | "traffic light", 419 | "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", 420 | "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", 421 | "frisbee", 422 | "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", 423 | "surfboard", 424 | "tennis racket", "bottle", "wine glass", "cup", "fork", 
"knife", "spoon", "bowl", "banana", "apple", 425 | "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", 426 | "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", 427 | "cell phone", 428 | "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", 429 | "teddy bear", 430 | "hair drier", "toothbrush"] 431 | 432 | if os.path.exists('output/'): 433 | shutil.rmtree('output/') 434 | os.makedirs('output/') 435 | # a YoLov8TRT instance 436 | yolov8_wrapper = YoLov8TRT(engine_file_path) 437 | try: 438 | print('batch size is', yolov8_wrapper.batch_size) 439 | 440 | image_dir = "samples/" 441 | image_path_batches = get_img_path_batches(yolov8_wrapper.batch_size, image_dir) 442 | 443 | for i in range(10): 444 | # create a new thread to do warm_up 445 | thread1 = warmUpThread(yolov8_wrapper) 446 | thread1.start() 447 | thread1.join() 448 | for batch in image_path_batches: 449 | # create a new thread to do inference 450 | thread1 = inferThread(yolov8_wrapper, batch) 451 | thread1.start() 452 | thread1.join() 453 | finally: 454 | # destroy the instance 455 | yolov8_wrapper.destroy() 456 | -------------------------------------------------------------------------------- /videos/demo.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/emptysoal/TensorRT-YOLOv8-ByteTrack/cec012e0672dd3d1f79a8dc29875be9c56feedb5/videos/demo.mp4 -------------------------------------------------------------------------------- /yolo/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.10) 2 | 3 | project(yolov8_trt_infer) 4 | 5 | add_definitions(-std=c++11) 6 | add_definitions(-DAPI_EXPORTS) 7 | set(CMAKE_CXX_STANDARD 11) 8 | set(CMAKE_BUILD_TYPE release) 9 | 10 | include_directories(${PROJECT_SOURCE_DIR}/include) 11 | include_directories(${PROJECT_SOURCE_DIR}/plugin) 12 | 13 | find_package(CUDA REQUIRED) 14 | 15 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different 16 | # ============= cuda ============ 17 | include_directories(/usr/local/cuda/include) 18 | link_directories(/usr/local/cuda/lib64) 19 | 20 | # ============= tensorrt ============ 21 | if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") 22 | message("Embed_platform on") 23 | include_directories(/usr/include/aarch64-linux-gnu) 24 | link_directories(/usr/lib/aarch64-linux-gnu) 25 | else() 26 | message("Embed_platform off") 27 | include_directories(/usr/include/x86_64-linux-gnu) 28 | link_directories(/usr/lib/x86_64-linux-gnu) 29 | endif() 30 | 31 | # ============ opencv =========== 32 | find_package(OpenCV) 33 | include_directories(${OpenCV_INCLUDE_DIRS}) 34 | 35 | # ====== yolo infer shared lib ====== 36 | file(GLOB_RECURSE SRCS ${PROJECT_SOURCE_DIR}/src/*.cpp ${PROJECT_SOURCE_DIR}/src/*.cu ${PROJECT_SOURCE_DIR}/plugin/*.cu) 37 | cuda_add_library(yolo_infer SHARED ${SRCS}) 38 | target_link_libraries(yolo_infer nvinfer cudart ${OpenCV_LIBS}) 39 | set(LIBRARY_OUTPUT_PATH ${PROJECT_SOURCE_DIR}/lib) 40 | 41 | # ======== main execute file ======== 42 | link_directories(${PROJECT_SOURCE_DIR}/lib) 43 | add_executable(main ${PROJECT_SOURCE_DIR}/main.cpp) 44 | target_link_libraries(main yolo_infer) 45 | set(EXECUTABLE_OUTPUT_PATH ${PROJECT_SOURCE_DIR}) 46 | -------------------------------------------------------------------------------- /yolo/README.md: 
--------------------------------------------------------------------------------
1 | # Encapsulated YOLOv8 TensorRT Inference
2 | 
3 | ## 1. Overview
4 | 
5 | - Runs `YOLOv8` inference on top of `TensorRT-v8`;
6 | 
7 | - Can be deployed on embedded `Jetson`-series devices as well as on `Linux x86_64` servers.
8 | 
9 | The main work done here:
10 | 
11 | 1. Following the [tensorrtx](https://github.com/wang-xinyu/tensorrtx/tree/master/yolov8) project for the model conversion (`.pth` -> `.engine`), the inference code was extracted and **wrapped in a C++ class** so that other projects can call it easily;
12 | 2. The preprocessing was replaced with my own CUDA implementation;
13 | 3. The CUDA postprocessing was removed, because in testing it was not noticeably faster than CPU postprocessing;
14 | 4. `YOLOv8` inference is built as a shared library, to decouple it from the rest of the project.
15 | 
16 | Highlights:
17 | 
18 | - To use `YOLOv8` inference in another project, the 3 lines of code below are all that is needed:
19 | 
20 | ```C++
21 | // load the model
22 | std::string trtFile = "./engine/yolov8s.engine";
23 | YoloDetecter detecter(trtFile);
24 | 
25 | // inference with TensorRT
26 | std::vector<DetectResult> res = detecter.inference(img);
27 | ```
28 | 
29 | ## 2. Environment
30 | 
31 | 1. Basic requirements:
32 | 
33 |    - `TensorRT 8.0+`
34 |    - `OpenCV 3.4.0+`
35 | 
36 | 2. My own setup on a `Jetson Nano`:
37 | 
38 |    - System image `Jetpack 4.6.1`, which ships with the following stack:
39 | 
40 | | CUDA | cuDNN | TensorRT | OpenCV |
41 | | ---- | ----- | -------- | ------ |
42 | | 10.2 | 8.2   | 8.2.1    | 4.1.1  |
43 | 
44 | There is plenty of material online on how to flash a `Jetson Nano` image, so it is not repeated here; just make sure to choose version 4.6.1 when downloading the `Jetpack` image, since that version ships TensorRT v8.
45 | 
46 | Tip: whatever the device, remember to verify the library paths in the `CMakeLists.txt` file.
47 | 
48 | ## 3. Model Conversion
49 | 
50 | Goal: convert a `YOLOv8` `.pth` detection model into a serialized `TensorRT` plan file with the `.engine` suffix.
51 | 
52 | Steps:
53 | 
54 | 1. Follow the [tensorrtx](https://github.com/wang-xinyu/tensorrtx/tree/master/yolov8) instructions, with two caveats from my own testing:
55 |    - When copying the `gen_wts.py` file, copying it into the top-level `ultralytics` directory of `YOLOv8` is enough; there is no need to install `YOLOv8`, nor to use the second-level directory its instructions describe;
56 |    - Remember to adjust the input and output paths in `gen_wts.py`.
57 | 
58 | 2. This produces the `yolov8s.engine` file (I used the YOLOv8-s model; other variants work as well).
59 | 
60 | 3. Create an `engine` directory in this project and place the converted model file in it.
61 | 
62 | ## 4. Running the Project
63 | 
64 | - Build and run
65 | - Proceed as follows:
66 | 
67 | ```bash
68 | mkdir build
69 | cd build
70 | cmake ..
71 | make
72 | cd ..
73 | ./main ./images  # pass the directory containing your images
74 | ```
75 | 
76 | 
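For reference, a minimal sketch of wiring the same detector into a video loop instead of a directory of images (the engine path is the one used above; `./videos/demo.mp4` is just this repo's sample clip, and any source `cv::VideoCapture` can open works):

```C++
#include <opencv2/opencv.hpp>
#include "yolov8_lib.h"

int main()
{
    // load the engine once, reuse it for every frame
    YoloDetecter detecter("./engine/yolov8s.engine");

    cv::VideoCapture cap("./videos/demo.mp4");
    cv::Mat frame;
    while (cap.read(frame))
    {
        std::vector<DetectResult> res = detecter.inference(frame);
        for (size_t i = 0; i < res.size(); ++i)
            cv::rectangle(frame, res[i].tlwh, cv::Scalar(255, 0, 255), 2);
        // display or encode the annotated frame here
    }
    return 0;
}
```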
--------------------------------------------------------------------------------
/yolo/include/config.h:
--------------------------------------------------------------------------------
1 | #define USE_FP16
2 | //#define USE_INT8
3 | 
4 | const static char *kInputTensorName = "images";
5 | const static char *kOutputTensorName = "output";
6 | const static int kNumClass = 80;
7 | const static int kBatchSize = 1;
8 | const static int kGpuId = 0;
9 | const static int kInputH = 640;
10 | const static int kInputW = 640;
11 | const static float kNmsThresh = 0.45f;
12 | const static float kConfThresh = 0.01f;
13 | const static int kMaxInputImageSize = 3000 * 3000;
14 | const static int kMaxNumOutputBbox = 1000;
15 | 
--------------------------------------------------------------------------------
/yolo/include/macros.h:
--------------------------------------------------------------------------------
1 | #ifndef __MACROS_H
2 | #define __MACROS_H
3 | 
4 | #include "NvInfer.h"
5 | 
6 | #ifdef API_EXPORTS
7 | #if defined(_MSC_VER)
8 | #define API __declspec(dllexport)
9 | #else
10 | #define API __attribute__((visibility("default")))
11 | #endif
12 | #else
13 | 
14 | #if defined(_MSC_VER)
15 | #define API __declspec(dllimport)
16 | #else
17 | #define API
18 | #endif
19 | #endif  // API_EXPORTS
20 | 
21 | #if NV_TENSORRT_MAJOR >= 8
22 | #define TRT_NOEXCEPT noexcept
23 | #define TRT_CONST_ENQUEUE const
24 | #else
25 | #define TRT_NOEXCEPT
26 | #define TRT_CONST_ENQUEUE
27 | #endif
28 | 
29 | #endif  // __MACROS_H
30 | 
--------------------------------------------------------------------------------
/yolo/include/postprocess.h:
--------------------------------------------------------------------------------
1 | #ifndef POSTPROCESS_H
2 | #define POSTPROCESS_H
3 | 
4 | #include "types.h"
5 | #include <opencv2/opencv.hpp>
6 | 
7 | cv::Rect get_rect(cv::Mat& img, float bbox[4]);
8 | 
9 | void nms(std::vector<Detection>& res, float *output, float conf_thresh, float nms_thresh = 0.5);
10 | 
11 | #endif  // POSTPROCESS_H
12 | 
--------------------------------------------------------------------------------
/yolo/include/preprocess.h:
--------------------------------------------------------------------------------
1 | #ifndef PREPROCESS_H
2 | #define PREPROCESS_H
3 | 
4 | #include <opencv2/opencv.hpp>
5 | #include <cuda_runtime.h>
6 | 
7 | void preprocess(const cv::Mat& srcImg, float* dstData, const int dstHeight, const int dstWidth);
8 | /*
9 |     srcImg: source image for inference
10 |     dstData: data after preprocess (resize / bgr to rgb / hwc to chw / normalize)
11 |     dstHeight: CNN input height
12 |     dstWidth: CNN input width
13 | */
14 | 
15 | #endif  // PREPROCESS_H
16 | 
--------------------------------------------------------------------------------
/yolo/include/public.h:
--------------------------------------------------------------------------------
1 | #ifndef PUBLIC_H
2 | #define PUBLIC_H
3 | 
4 | #include <algorithm>
5 | #include <cassert>
6 | #include <chrono>
7 | #include <cmath>
8 | #include <cstring>
9 | #include <fstream>
10 | #include <iostream>
11 | #include <map>
12 | #include <string>
13 | #include <vector>
14 | 
15 | #include <cuda_runtime.h>
16 | #include <NvInfer.h>
17 | #include <NvInferRuntime.h>
18 | #include <opencv2/opencv.hpp>
19 | 
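// usage: wrap any CUDA runtime call whose status should be checked, e.g.
//   CUDA_CHECK(cudaMalloc(&devPtr, nBytes));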
20 | #define CUDA_CHECK(call) check(call, __LINE__, __FILE__)
21 | 
22 | inline bool check(cudaError_t e, int iLine, const char *szFile)
23 | {
24 |     if (e != cudaSuccess)
25 |     {
26 |         std::cout << "CUDA runtime API error " << cudaGetErrorName(e) << " at line " << iLine << " in file " << szFile << std::endl;
27 |         return false;
28 |     }
29 |     return true;
30 | }
31 | 
32 | using namespace nvinfer1;
33 | 
34 | 
35 | class Logger : public ILogger
36 | {
37 | public:
38 |     Severity reportableSeverity;
39 | 
40 |     Logger(Severity severity = Severity::kINFO):
41 |         reportableSeverity(severity) {}
42 | 
43 |     void log(Severity severity, const char *msg) noexcept override
44 |     {
45 |         if (severity > reportableSeverity)
46 |         {
47 |             return;
48 |         }
49 |         switch (severity)
50 |         {
51 |         case Severity::kINTERNAL_ERROR:
52 |             std::cerr << "INTERNAL_ERROR: ";
53 |             break;
54 |         case Severity::kERROR:
55 |             std::cerr << "ERROR: ";
56 |             break;
57 |         case Severity::kWARNING:
58 |             std::cerr << "WARNING: ";
59 |             break;
60 |         case Severity::kINFO:
61 |             std::cerr << "INFO: ";
62 |             break;
63 |         default:
64 |             std::cerr << "VERBOSE: ";
65 |             break;
66 |         }
67 |         std::cerr << msg << std::endl;
68 |     }
69 | };
70 | 
71 | #endif  // PUBLIC_H
72 | 
--------------------------------------------------------------------------------
/yolo/include/types.h:
--------------------------------------------------------------------------------
1 | #ifndef TYPES_H
2 | #define TYPES_H
3 | 
4 | #include "config.h"
5 | 
6 | struct alignas(float) Detection {
7 |     // x1, y1, x2, y2: box corners in network-input coordinates, decoded in yololayer.cu
8 |     float bbox[4];
9 |     float conf;  // bbox_conf * cls_conf
10 |     float class_id;
11 | };
12 | 
13 | struct AffineMatrix {
14 |     float value[6];
15 | };
16 | 
17 | const int bbox_element = sizeof(AffineMatrix) / sizeof(float) + 1;  // left, top, right, bottom, confidence, class, keepflag
18 | 
19 | #endif  // TYPES_H
20 | 
--------------------------------------------------------------------------------
/yolo/include/utils.h:
--------------------------------------------------------------------------------
1 | #ifndef UTILS_H
2 | #define UTILS_H
3 | 
4 | #include <dirent.h>
5 | #include <opencv2/opencv.hpp>
6 | 
7 | static inline cv::Mat preprocess_img(cv::Mat& img, int input_w, int input_h) {
8 |     int w, h, x, y;
9 |     float r_w = input_w / (img.cols * 1.0);
10 |     float r_h = input_h / (img.rows * 1.0);
11 |     if (r_h > r_w) {
12 |         w = input_w;
13 |         h = r_w * img.rows;
14 |         x = 0;
15 |         y = (input_h - h) / 2;
16 |     } else {
17 |         w = r_h * img.cols;
18 |         h = input_h;
19 |         x = (input_w - w) / 2;
20 |         y = 0;
21 |     }
22 |     cv::Mat re(h, w, CV_8UC3);
23 |     cv::resize(img, re, re.size(), 0, 0, cv::INTER_LINEAR);
24 |     cv::Mat out(input_h, input_w, CV_8UC3, cv::Scalar(128, 128, 128));
25 |     re.copyTo(out(cv::Rect(x, y, re.cols, re.rows)));
26 |     return out;
27 | }
28 | 
29 | static inline int read_files_in_dir(const char *p_dir_name, std::vector<std::string> &file_names) {
30 |     DIR *p_dir = opendir(p_dir_name);
31 |     if (p_dir == nullptr) {
32 |         return -1;
33 |     }
34 | 
35 |     struct dirent* p_file = nullptr;
36 |     while ((p_file = readdir(p_dir)) != nullptr) {
37 |         if (strcmp(p_file->d_name, ".") != 0 &&
38 |             strcmp(p_file->d_name, "..") != 0) {
39 |             //std::string cur_file_name(p_dir_name);
40 |             //cur_file_name += "/";
41 |             //cur_file_name += p_file->d_name;
42 |             std::string cur_file_name(p_file->d_name);
43 |             file_names.push_back(cur_file_name);
44 |         }
45 |     }
46 | 
47 |     closedir(p_dir);
48 |     return 0;
49 | }
50 | 
51 | #endif  // UTILS_H
52 | 
--------------------------------------------------------------------------------
/yolo/include/yolov8_lib.h:
--------------------------------------------------------------------------------
1 | #ifndef YOLOV8_LIB
2 | #define YOLOV8_LIB
3 | 
4 | #include <opencv2/opencv.hpp>
5 | #include "public.h"
6 | #include "yololayer.h"
7 | 
8 | using namespace nvinfer1;
9 | 
10 | 
11 | struct DetectResult
12 | {
13 |     cv::Rect tlwh;  // top left width height
14 |     float conf;
15 |     int class_id;
16 | };
17 | 
18 | 
19 | class YoloDetecter
20 | {
21 | public:
22 |     YoloDetecter(const std::string trtFile);
23 |     ~YoloDetecter();
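    // run detection on one BGR image of any size; boxes are returned in the
    // original image's coordinate system as top-left x/y plus width/height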
24 |     std::vector<DetectResult> inference(cv::Mat& img);
25 | 
26 | private:
27 |     void deserialize_engine();
28 |     void inference();
29 | 
30 | private:
31 |     Logger gLogger;
32 |     std::string trtFile_;
33 | 
34 |     ICudaEngine * engine;
35 |     IRuntime * runtime;
36 |     IExecutionContext * context;
37 | 
38 |     cudaStream_t stream;
39 | 
40 |     int kOutputSize;
41 |     std::vector<int> vTensorSize;  // bytes of input and output
42 |     float * inputData;
43 |     float * outputData;
44 |     std::vector<void *> vBufferD;
45 | };
46 | 
47 | #endif  // YOLOV8_LIB
48 | 
--------------------------------------------------------------------------------
/yolo/main.cpp:
--------------------------------------------------------------------------------
1 | #include <iostream>
2 | #include "utils.h"
3 | #include "yolov8_lib.h"
4 | 
5 | 
6 | int run(char* imageDir)
7 | {
8 |     // get image file names for inference
9 |     std::vector<std::string> file_names;
10 |     if (read_files_in_dir(imageDir, file_names) < 0) {
11 |         std::cout << "read_files_in_dir failed." << std::endl;
12 |         return -1;
13 |     }
14 | 
15 |     // create detecter, and load engine plan
16 |     std::string trtFile = "./engine/yolov8s.engine";
17 |     YoloDetecter detecter(trtFile);
18 | 
19 |     // inference
20 |     for (long unsigned int i = 0; i < file_names.size(); i++)
21 |     {
22 |         std::string imagePath = std::string(imageDir) + "/" + file_names[i];
23 |         cv::Mat img = cv::imread(imagePath, cv::IMREAD_COLOR);
24 |         if (img.empty()) continue;
25 | 
26 |         std::vector<DetectResult> res = detecter.inference(img);
27 | 
28 |         // draw result on image
29 |         for (long unsigned int j = 0; j < res.size(); j++)
30 |         {
31 |             cv::Rect r = res[j].tlwh;
32 |             cv::rectangle(img, r, cv::Scalar(255, 0, 255), 2);
33 |             cv::putText(img, std::to_string(res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0, 0, 255), 2);
34 |         }
35 | 
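        // the annotated image is written to the current working directory, prefixed with "_"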
<< std::endl; 39 | } 40 | 41 | return 0; 42 | } 43 | 44 | int main(int argc, char *argv[]) 45 | { 46 | if (argc != 2) { 47 | printf("This program need 1 argument\n"); 48 | printf("Usage: ./main [image dir]\n"); 49 | printf("Example: ./main ./images\n"); 50 | return 1; 51 | } 52 | 53 | return run(argv[1]); 54 | } 55 | -------------------------------------------------------------------------------- /yolo/plugin/yololayer.cu: -------------------------------------------------------------------------------- 1 | #include "yololayer.h" 2 | #include "types.h" 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | namespace Tn { 9 | template 10 | void write(char*& buffer, const T& val) { 11 | *reinterpret_cast(buffer) = val; 12 | buffer += sizeof(T); 13 | } 14 | 15 | template 16 | void read(const char*& buffer, T& val) { 17 | val = *reinterpret_cast(buffer); 18 | buffer += sizeof(T); 19 | } 20 | } // namespace Tn 21 | 22 | 23 | namespace nvinfer1 { 24 | YoloLayerPlugin::YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut) { 25 | mClassCount = classCount; 26 | mYoloV8NetWidth = netWidth; 27 | mYoloV8netHeight = netHeight; 28 | mMaxOutObject = maxOut; 29 | } 30 | 31 | YoloLayerPlugin::~YoloLayerPlugin() {} 32 | 33 | YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) { 34 | using namespace Tn; 35 | const char* d = reinterpret_cast(data), * a = d; 36 | read(d, mClassCount); 37 | read(d, mThreadCount); 38 | read(d, mYoloV8NetWidth); 39 | read(d, mYoloV8netHeight); 40 | read(d, mMaxOutObject); 41 | 42 | assert(d == a + length); 43 | } 44 | 45 | void YoloLayerPlugin::serialize(void* buffer) const TRT_NOEXCEPT { 46 | 47 | using namespace Tn; 48 | char* d = static_cast(buffer), * a = d; 49 | write(d, mClassCount); 50 | write(d, mThreadCount); 51 | write(d, mYoloV8NetWidth); 52 | write(d, mYoloV8netHeight); 53 | write(d, mMaxOutObject); 54 | 55 | assert(d == a + getSerializationSize()); 56 | } 57 | 58 | size_t YoloLayerPlugin::getSerializationSize() const TRT_NOEXCEPT { 59 | return sizeof(mClassCount) + sizeof(mThreadCount) + sizeof(mYoloV8netHeight) + sizeof(mYoloV8NetWidth) + sizeof(mMaxOutObject); 60 | } 61 | 62 | int YoloLayerPlugin::initialize() TRT_NOEXCEPT { 63 | return 0; 64 | } 65 | 66 | nvinfer1::Dims YoloLayerPlugin::getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) TRT_NOEXCEPT { 67 | int total_size = mMaxOutObject * sizeof(Detection) / sizeof(float); 68 | return nvinfer1::Dims3(total_size + 1, 1, 1); 69 | } 70 | 71 | void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT { 72 | mPluginNamespace = pluginNamespace; 73 | } 74 | 75 | const char* YoloLayerPlugin::getPluginNamespace() const TRT_NOEXCEPT { 76 | return mPluginNamespace; 77 | } 78 | 79 | nvinfer1::DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT { 80 | return nvinfer1::DataType::kFLOAT; 81 | } 82 | 83 | bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT { 84 | 85 | return false; 86 | } 87 | 88 | bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT { 89 | 90 | return false; 91 | } 92 | 93 | void YoloLayerPlugin::configurePlugin(nvinfer1::PluginTensorDesc const* in, int nbInput, nvinfer1::PluginTensorDesc const* out, int nbOutput) TRT_NOEXCEPT {}; 94 | 95 | void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, 
    __global__ void CalDetection(const float* input, float* output, int numElements, int maxoutobject,
            const int grid_h, int grid_w, const int stride, int classes, int outputElem) {
        int idx = threadIdx.x + blockDim.x * blockIdx.x;
        if (idx >= numElements) return;

        int total_grid = grid_h * grid_w;
        int info_len = 4 + classes;
        int batchIdx = idx / total_grid;
        int elemIdx = idx % total_grid;
        const float* curInput = input + batchIdx * total_grid * info_len;
        int outputIdx = batchIdx * outputElem;

        int class_id = 0;
        float max_cls_prob = 0.0;
        for (int i = 4; i < info_len; i++) {
            float p = Logist(curInput[elemIdx + i * total_grid]);
            if (p > max_cls_prob) {
                max_cls_prob = p;
                class_id = i - 4;
            }
        }

        if (max_cls_prob < 0.1) return;

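        // Output layout per image: [count, Detection0, Detection1, ...]; the
        // first float holds the number of boxes written, bumped atomically.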
        int count = (int)atomicAdd(output + outputIdx, 1);
        if (count >= maxoutobject) return;
        char* data = (char*)(output + outputIdx) + sizeof(float) + count * sizeof(Detection);
        Detection* det = (Detection*)(data);

        int row = elemIdx / grid_w;
        int col = elemIdx % grid_w;

        det->conf = max_cls_prob;
        det->class_id = class_id;
        det->bbox[0] = (col + 0.5f - curInput[elemIdx + 0 * total_grid]) * stride;
        det->bbox[1] = (row + 0.5f - curInput[elemIdx + 1 * total_grid]) * stride;
        det->bbox[2] = (col + 0.5f + curInput[elemIdx + 2 * total_grid]) * stride;
        det->bbox[3] = (row + 0.5f + curInput[elemIdx + 3 * total_grid]) * stride;
    }

    void YoloLayerPlugin::forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int mYoloV8netHeight, int mYoloV8NetWidth, int batchSize) {
        int outputElem = 1 + mMaxOutObject * sizeof(Detection) / sizeof(float);
        cudaMemsetAsync(output, 0, sizeof(float), stream);
        for (int idx = 0; idx < batchSize; ++idx) {
            CUDA_CHECK(cudaMemsetAsync(output + idx * outputElem, 0, sizeof(float), stream));
        }
        int numElem = 0;
        int grids[3][2] = { {mYoloV8netHeight / 8, mYoloV8NetWidth / 8}, {mYoloV8netHeight / 16, mYoloV8NetWidth / 16}, {mYoloV8netHeight / 32, mYoloV8NetWidth / 32} };
        int strides[] = { 8, 16, 32 };
        for (unsigned int i = 0; i < 3; i++) {
            int grid_h = grids[i][0];
            int grid_w = grids[i][1];
            int stride = strides[i];
            numElem = grid_h * grid_w * batchSize;
            if (numElem < mThreadCount) mThreadCount = numElem;

            CalDetection<<<(numElem + mThreadCount - 1) / mThreadCount, mThreadCount, 0, stream>>>
                (inputs[i], output, numElem, mMaxOutObject, grid_h, grid_w, stride, mClassCount, outputElem);
        }
    }

    PluginFieldCollection YoloPluginCreator::mFC{};
    std::vector<PluginField> YoloPluginCreator::mPluginAttributes;

    YoloPluginCreator::YoloPluginCreator() {
        mPluginAttributes.clear();
        mFC.nbFields = mPluginAttributes.size();
        mFC.fields = mPluginAttributes.data();
    }

    const char* YoloPluginCreator::getPluginName() const TRT_NOEXCEPT {
        return "YoloLayer_TRT";
    }

    const char* YoloPluginCreator::getPluginVersion() const TRT_NOEXCEPT {
        return "1";
    }

    const PluginFieldCollection* YoloPluginCreator::getFieldNames() TRT_NOEXCEPT {
        return &mFC;
    }

    IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT {
        assert(fc->nbFields == 1);
        assert(strcmp(fc->fields[0].name, "netinfo") == 0);
        int* p_netinfo = (int*)(fc->fields[0].data);
        int class_count = p_netinfo[0];
        int input_w = p_netinfo[1];
        int input_h = p_netinfo[2];
        int max_output_object_count = p_netinfo[3];
        YoloLayerPlugin* obj = new YoloLayerPlugin(class_count, input_w, input_h, max_output_object_count);
        obj->setPluginNamespace(mNamespace.c_str());
        return obj;
    }

    IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT {
        // This object will be deleted when the network is destroyed, which will
        // call YoloLayerPlugin::destroy()
        YoloLayerPlugin* obj = new YoloLayerPlugin(serialData, serialLength);
        obj->setPluginNamespace(mNamespace.c_str());
        return obj;
    }

}  // namespace nvinfer1

--------------------------------------------------------------------------------
/yolo/plugin/yololayer.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include "macros.h"
3 | #include "NvInfer.h"
4 | #include <string>
5 | #include <vector>
6 | #include "macros.h"
7 | #include "public.h"
8 | namespace nvinfer1 {
9 | class API YoloLayerPlugin : public IPluginV2IOExt {
10 |    public:
11 |     YoloLayerPlugin(int classCount, int netWidth, int netHeight, int maxOut);
12 |     YoloLayerPlugin(const void* data, size_t length);
13 |     ~YoloLayerPlugin();
14 | 
15 |     int getNbOutputs() const TRT_NOEXCEPT override {
16 |         return 1;
17 |     }
18 | 
19 |     nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) TRT_NOEXCEPT override;
20 | 
21 |     int initialize() TRT_NOEXCEPT override;
22 | 
23 |     virtual void terminate() TRT_NOEXCEPT override {}
24 | 
25 |     virtual size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override { return 0; }
26 | 
27 |     virtual int enqueue(int batchSize, const void* const* inputs, void* TRT_CONST_ENQUEUE* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT override;
28 | 
29 |     virtual size_t getSerializationSize() const TRT_NOEXCEPT override;
30 | 
31 |     virtual void serialize(void* buffer) const TRT_NOEXCEPT override;
32 | 
33 |     bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const TRT_NOEXCEPT override {
34 |         return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT;
35 |     }
36 | 
37 | 
38 |     const char* getPluginType() const TRT_NOEXCEPT override;
39 | 
40 |     const char* getPluginVersion() const TRT_NOEXCEPT override;
41 | 
42 |     void destroy() TRT_NOEXCEPT override;
43 | 
44 |     IPluginV2IOExt* clone() const TRT_NOEXCEPT override;
45 | 
46 |     void setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT override;
47 | 
48 |     const char* getPluginNamespace() const TRT_NOEXCEPT override;
49 | 
50 |     nvinfer1::DataType getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes, int32_t nbInputs) const TRT_NOEXCEPT override;
51 | 
52 |     bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const TRT_NOEXCEPT override;
53 | 
54 |     bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override;
55 | 
56 |     void attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) TRT_NOEXCEPT override;
57 | 
58 |     void configurePlugin(PluginTensorDesc const* in, int32_t nbInput, PluginTensorDesc const* out, int32_t nbOutput) TRT_NOEXCEPT override;
59 | 
60 |     void detachFromContext() TRT_NOEXCEPT override;
61 | 
62 |    private:
63 |     void forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int mYoloV8netHeight, int mYoloV8NetWidth, int batchSize);
64 |     int mThreadCount = 256;
65 |     const char* mPluginNamespace;
66 |     int mClassCount;
67 |     int mYoloV8NetWidth;
68 |     int mYoloV8netHeight;
69 |     int mMaxOutObject;
70 | };
71 | 
72 | class API YoloPluginCreator : public IPluginCreator {
73 |    public:
74 |     YoloPluginCreator();
75 |     ~YoloPluginCreator() override = default;
76 | 
77 |     const char* getPluginName() const TRT_NOEXCEPT override;
78 | 
79 |     const char* getPluginVersion() const TRT_NOEXCEPT override;
80 | 
81 |     const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override;
82 | 
83 |     nvinfer1::IPluginV2IOExt* createPlugin(const char* name, const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT override;
84 | 
85 |     nvinfer1::IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT override;
86 | 
87 |     void setPluginNamespace(const char* libNamespace) TRT_NOEXCEPT override {
88 |         mNamespace = libNamespace;
89 |     }
90 | 
91 |     const char* getPluginNamespace() const TRT_NOEXCEPT override {
92 |         return mNamespace.c_str();
93 |     }
94 | 
95 |    private:
96 |     std::string mNamespace;
97 |     static PluginFieldCollection mFC;
98 |     static std::vector<PluginField> mPluginAttributes;
99 | };
100 | REGISTER_TENSORRT_PLUGIN(YoloPluginCreator);
101 | }  // namespace nvinfer1
102 | 
--------------------------------------------------------------------------------
/yolo/src/postprocess.cpp:
--------------------------------------------------------------------------------
1 | #include "postprocess.h"
2 | 
3 | 
4 | cv::Rect get_rect(cv::Mat &img, float bbox[4]) {
5 |     float l, r, t, b;
6 |     float r_w = kInputW / (img.cols * 1.0);
7 |     float r_h = kInputH / (img.rows * 1.0);
8 | 
9 |     if (r_h > r_w) {
10 |         l = bbox[0];
11 |         r = bbox[2];
12 |         t = bbox[1] - (kInputH - r_w * img.rows) / 2;
13 |         b = bbox[3] - (kInputH - r_w * img.rows) / 2;
14 |         l = l / r_w;
15 |         r = r / r_w;
16 |         t = t / r_w;
17 |         b = b / r_w;
18 |     } else {
19 |         l = bbox[0] - (kInputW - r_h * img.cols) / 2;
20 |         r = bbox[2] - (kInputW - r_h * img.cols) / 2;
21 |         t = bbox[1];
22 |         b = bbox[3];
23 |         l = l / r_h;
24 |         r = r / r_h;
25 |         t = t / r_h;
26 |         b = b / r_h;
27 |     }
28 |     return cv::Rect(round(l), round(t), round(r - l), round(b - t));
29 | }
30 | 
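// Axis-aligned IoU of two corner-format boxes [x1, y1, x2, y2]; interBox
// holds (left, right, top, bottom) of the intersection rectangle.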
static float iou(float lbox[4], float rbox[4]) {
    float interBox[] = {
        (std::max)(lbox[0], rbox[0]),  // left
        (std::min)(lbox[2], rbox[2]),  // right
        (std::max)(lbox[1], rbox[1]),  // top
        (std::min)(lbox[3], rbox[3]),  // bottom
    };

    if (interBox[2] > interBox[3] || interBox[0] > interBox[1])
        return 0.0f;

    float interBoxS = (interBox[1] - interBox[0]) * (interBox[3] - interBox[2]);
    float unionBoxS = (lbox[2] - lbox[0]) * (lbox[3] - lbox[1]) + (rbox[2] - rbox[0]) * (rbox[3] - rbox[1]) - interBoxS;
    return interBoxS / unionBoxS;
}

static bool cmp(const Detection &a, const Detection &b) {
    return a.conf > b.conf;
}

void nms(std::vector<Detection> &res, float *output, float conf_thresh, float nms_thresh) {
    int det_size = sizeof(Detection) / sizeof(float);
    std::map<float, std::vector<Detection>> m;

    for (int i = 0; i < output[0]; i++) {
        if (output[1 + det_size * i + 4] <= conf_thresh) continue;
        Detection det;
        memcpy(&det, &output[1 + det_size * i], det_size * sizeof(float));
        if (m.count(det.class_id) == 0) m.emplace(det.class_id, std::vector<Detection>());
        m[det.class_id].push_back(det);
    }
    for (auto it = m.begin(); it != m.end(); it++) {
        auto &dets = it->second;
        std::sort(dets.begin(), dets.end(), cmp);
        for (size_t m = 0; m < dets.size(); ++m) {
            auto &item = dets[m];
            res.push_back(item);
            for (size_t n = m + 1; n < dets.size(); ++n) {
                if (iou(item.bbox, dets[n].bbox) > nms_thresh) {
                    dets.erase(dets.begin() + n);
                    --n;
                }
            }
        }
    }
}

--------------------------------------------------------------------------------
/yolo/src/preprocess.cu:
--------------------------------------------------------------------------------
1 | #include "preprocess.h"
2 | 
3 | 
4 | __global__ void letterbox(const uchar* srcData, const int srcH, const int srcW, uchar* tgtData,
5 |     const int tgtH, const int tgtW, const int rszH, const int rszW, const int startY, const int startX)
6 | {
7 |     int ix = threadIdx.x + blockDim.x * blockIdx.x;
8 |     int iy = threadIdx.y + blockDim.y * blockIdx.y;
9 |     int idx = ix + iy * tgtW;
10 |     int idx3 = idx * 3;
11 | 
12 |     if ( ix >= tgtW || iy >= tgtH ) return;  // thread out of target range
13 |     // gray region on target image
14 |     if ( iy < startY || iy > (startY + rszH - 1) ) {
15 |         tgtData[idx3] = 128;
16 |         tgtData[idx3 + 1] = 128;
17 |         tgtData[idx3 + 2] = 128;
18 |         return;
19 |     }
20 |     if ( ix < startX || ix > (startX + rszW - 1) ){
21 |         tgtData[idx3] = 128;
22 |         tgtData[idx3 + 1] = 128;
23 |         tgtData[idx3 + 2] = 128;
24 |         return;
25 |     }
26 | 
27 |     float scaleY = (float)rszH / (float)srcH;
28 |     float scaleX = (float)rszW / (float)srcW;
29 | 
30 |     // (ix, iy) is the coordinate on the target image
31 |     // (beforeX, beforeY) is the corresponding coordinate on the source image
32 |     float beforeX = float(ix - startX + 0.5) / scaleX - 0.5;
33 |     float beforeY = float(iy - startY + 0.5) / scaleY - 0.5;
34 |     // the four neighboring source pixels around that coordinate:
35 |     // truncate to get the nearest top-left vertex
36 |     int topY = static_cast<int>(beforeY);
37 |     int bottomY = topY + 1;
38 |     int leftX = static_cast<int>(beforeX);
39 |     int rightX = leftX + 1;
40 |     // fractional parts of the source coordinate, used as bilinear weights
41 |     float u = beforeX - leftX;
42 |     float v = beforeY - topY;
43 | 
44 |     if (topY >= srcH - 1 && leftX >= srcW - 1)  // bottom-right corner
45 |     {
46 |         for (int k = 0; k < 3; k++)
47 |         {
48 |             tgtData[idx3 + k] = (1. - u) * (1. - v) * srcData[(leftX + topY * srcW) * 3 + k];
49 |         }
50 |     }
51 |     else if (topY >= srcH - 1)  // last row
52 |     {
53 |         for (int k = 0; k < 3; k++)
54 |         {
55 |             tgtData[idx3 + k]
56 |                 = (1. - u) * (1. - v) * srcData[(leftX + topY * srcW) * 3 + k]
57 |                 + (u) * (1. - v) * srcData[(rightX + topY * srcW) * 3 + k];
58 |         }
59 |     }
60 |     else if (leftX >= srcW - 1)  // last column
61 |     {
62 |         for (int k = 0; k < 3; k++)
63 |         {
64 |             tgtData[idx3 + k]
65 |                 = (1. - u) * (1. - v) * srcData[(leftX + topY * srcW) * 3 + k]
66 |                 + (1. - u) * (v) * srcData[(leftX + bottomY * srcW) * 3 + k];
67 |         }
68 |     }
69 |     else  // general case: not in the last row or last column
70 |     {
71 |         for (int k = 0; k < 3; k++)
72 |         {
73 |             tgtData[idx3 + k]
74 |                 = (1. - u) * (1. - v) * srcData[(leftX + topY * srcW) * 3 + k]
75 |                 + (u) * (1. - v) * srcData[(rightX + topY * srcW) * 3 + k]
76 |                 + (1. - u) * (v) * srcData[(leftX + bottomY * srcW) * 3 + k]
77 |                 + u * v * srcData[(rightX + bottomY * srcW) * 3 + k];
78 |         }
79 |     }
80 | }
81 | 
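// Convert interleaved BGR bytes to planar RGB floats in [0, 1]:
// plane 0 gets R, plane 1 gets G, plane 2 gets B (HWC -> CHW).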
82 | __global__ void process(const uchar* srcData, float* tgtData, const int h, const int w)
83 | {
84 |     int ix = threadIdx.x + blockIdx.x * blockDim.x;
85 |     int iy = threadIdx.y + blockIdx.y * blockDim.y;
86 |     int idx = ix + iy * w;
87 |     int idx3 = idx * 3;
88 | 
89 |     if (ix < w && iy < h)
90 |     {
91 |         tgtData[idx] = (float)srcData[idx3 + 2] / 255.0;  // R pixel
92 |         tgtData[idx + h * w] = (float)srcData[idx3 + 1] / 255.0;  // G pixel
93 |         tgtData[idx + h * w * 2] = (float)srcData[idx3] / 255.0;  // B pixel
94 |     }
95 | }
96 | 
97 | void preprocess(const cv::Mat& srcImg, float* dstData, const int dstHeight, const int dstWidth)
98 | {
99 |     int srcHeight = srcImg.rows;
100 |     int srcWidth = srcImg.cols;
101 |     int srcElements = srcHeight * srcWidth * 3;
102 |     int dstElements = dstHeight * dstWidth * 3;
103 | 
104 |     // target data on device
105 |     float* dstDevData;
106 |     cudaMalloc((void**)&dstDevData, sizeof(float) * dstElements);
107 |     // middle image data on device ( for bilinear resize )
108 |     uchar* midDevData;
109 |     cudaMalloc((void**)&midDevData, sizeof(uchar) * dstElements);
110 |     // source images data on device
111 |     uchar* srcDevData;
112 |     cudaMalloc((void**)&srcDevData, sizeof(uchar) * srcElements);
113 |     cudaMemcpy(srcDevData, srcImg.data, sizeof(uchar) * srcElements, cudaMemcpyHostToDevice);
114 | 
115 |     // calculate width and height after resize
116 |     int w, h, x, y;
117 |     float r_w = dstWidth / (srcWidth * 1.0);
118 |     float r_h = dstHeight / (srcHeight * 1.0);
119 |     if (r_h > r_w) {
120 |         w = dstWidth;
121 |         h = r_w * srcHeight;
122 |         x = 0;
123 |         y = (dstHeight - h) / 2;
124 |     }
125 |     else {
126 |         w = r_h * srcWidth;
127 |         h = dstHeight;
128 |         x = (dstWidth - w) / 2;
129 |         y = 0;
130 |     }
131 | 
132 |     dim3 blockSize(32, 32);
133 |     dim3 gridSize((dstWidth + blockSize.x - 1) / blockSize.x, (dstHeight + blockSize.y - 1) / blockSize.y);
134 | 
135 |     // letterbox and resize
136 |     letterbox<<<gridSize, blockSize>>>(srcDevData, srcHeight, srcWidth, midDevData, dstHeight, dstWidth, h, w, y, x);
137 |     cudaDeviceSynchronize();
138 |     // hwc to chw / bgr to rgb / normalize
139 |     process<<<gridSize, blockSize>>>(midDevData, dstDevData, dstHeight, dstWidth);
140 | 
141 |     cudaMemcpy(dstData, dstDevData, sizeof(float) * dstElements, cudaMemcpyDeviceToHost);
142 | 
143 |     cudaFree(srcDevData);
144 |     cudaFree(midDevData);
145 |     cudaFree(dstDevData);
146 | }
147 | 
--------------------------------------------------------------------------------
/yolo/src/yolov8_lib.cpp:
--------------------------------------------------------------------------------
1 | #include <fstream>
2 | #include <iostream>
3 | 
4 | #include "yolov8_lib.h"
5 | #include "preprocess.h"
6 | #include "postprocess.h"
7 | 
8 | using namespace nvinfer1;
9 | 
10 | 
11 | YoloDetecter::YoloDetecter(const std::string trtFile): trtFile_(trtFile)
12 | {
13 |     gLogger = Logger(ILogger::Severity::kERROR);
14 |     cudaSetDevice(kGpuId);
15 | 
16 |     // load engine
17 |     deserialize_engine();
18 | 
19 |     CUDA_CHECK(cudaStreamCreate(&stream));
20 | 
21 |     // bytes of input and output
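    // output tensor = 1 leading float (the valid box count, see nms()) followed
    // by up to kMaxNumOutputBbox raw Detection structs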
22 |     kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1;
23 |     vTensorSize.resize(2, 0);
24 |     vTensorSize[0] = 3 * kInputH * kInputW * sizeof(float);
25 |     vTensorSize[1] = kOutputSize * sizeof(float);
26 | 
27 |     // prepare input data and output data ---------------------------
28 |     inputData = new float[3 * kInputH * kInputW];
29 |     outputData = new float[kOutputSize];
30 | 
31 |     // prepare input and output space on device
32 |     vBufferD.resize(2, nullptr);
33 |     for (int i = 0; i < 2; i++)
34 |     {
35 |         CUDA_CHECK(cudaMalloc(&vBufferD[i], vTensorSize[i]));
36 |     }
37 | }
38 | 
39 | void YoloDetecter::deserialize_engine()
40 | {
41 |     std::ifstream file(trtFile_, std::ios::binary);
42 |     if (!file.good()){
43 |         std::cerr << "read " << trtFile_ << " error!" << std::endl;
44 |         assert(false);
45 |     }
46 |     size_t size = 0;
47 |     file.seekg(0, file.end);
48 |     size = file.tellg();
49 |     file.seekg(0, file.beg);
50 |     char* serialized_engine = new char[size];
51 |     assert(serialized_engine);
52 |     file.read(serialized_engine, size);
53 |     file.close();
54 | 
55 |     runtime = createInferRuntime(gLogger);
56 |     engine = runtime->deserializeCudaEngine(serialized_engine, size);
57 |     context = engine->createExecutionContext();
58 |     delete[] serialized_engine;
59 | }
60 | 
61 | YoloDetecter::~YoloDetecter()
62 | {
63 |     cudaStreamDestroy(stream);
64 | 
65 |     for (int i = 0; i < 2; ++i)
66 |     {
67 |         CUDA_CHECK(cudaFree(vBufferD[i]));
68 |     }
69 | 
70 |     delete context;
71 |     delete engine;
72 |     delete runtime;
73 | 
74 |     delete [] inputData;
75 |     delete [] outputData;
76 | }
77 | 
78 | void YoloDetecter::inference()
79 | {
80 |     CUDA_CHECK(cudaMemcpyAsync(vBufferD[0], (void *)inputData, vTensorSize[0], cudaMemcpyHostToDevice, stream));
81 |     context->enqueue(1, vBufferD.data(), stream, nullptr);
82 |     CUDA_CHECK(cudaMemcpyAsync((void *)outputData, vBufferD[1], vTensorSize[1], cudaMemcpyDeviceToHost, stream));
83 |     CUDA_CHECK(cudaStreamSynchronize(stream));
84 | }
85 | 
86 | std::vector<DetectResult> YoloDetecter::inference(cv::Mat& img)
87 | {
88 |     preprocess(img, inputData, kInputH, kInputW);  // put image data on inputData
89 | 
90 |     inference();
91 | 
92 |     std::vector<Detection> res;
93 |     nms(res, outputData, kConfThresh, kNmsThresh);
94 | 
95 |     std::vector<DetectResult> final_res;
96 |     for (size_t j = 0; j < res.size(); j++)
97 |     {
98 |         cv::Rect r = get_rect(img, res[j].bbox);
99 |         DetectResult single_res {r, res[j].conf, (int)res[j].class_id};
100 |         final_res.push_back(single_res);
101 |     }
102 | 
103 |     return final_res;
104 | }
105 | 
--------------------------------------------------------------------------------