├── .gitignore
├── CMakeLists.txt
├── README.md
├── configs
├── bdd100k.names
├── lightNet-BDD100K-1280x960-chPruning.cfg
├── lightNet-BDD100K-1280x960-chPruning.txt
├── lightNet-BDD100K-1280x960.cfg
├── lightNet-BDD100K-1280x960.txt
├── lightNet-BDD100K-chPruning-det-semaseg-1280x960.cfg
├── lightNet-BDD100K-chPruning-det-semaseg-1280x960.txt
├── lightNet-BDD100K-det-semaseg-1280x960.cfg
├── lightNet-BDD100K-det-semaseg-1280x960.txt
├── yolov8x-BDD100K-640x640-T4-lane4cls.cfg
└── yolov8x-BDD100K-640x640-T4-lane4cls.txt
├── extra
├── API.h
└── class_timer.hpp
├── launch
└── lightnet_trt.launch.xml
├── modules
├── calibrator.cpp
├── calibrator.h
├── chunk.cu
├── chunk.h
├── class_detector.cpp
├── class_detector.h
├── class_yolo_detector.hpp
├── colormap.hpp
├── detect.cu
├── detect.h
├── ds_image.cpp
├── ds_image.h
├── err.log
├── hardswish.cu
├── hardswish.h
├── kernel.cu
├── logging.h
├── mish.cu
├── mish.h
├── plugin_factory.cpp
├── plugin_factory.h
├── preprocess.cu
├── preprocess.h
├── profiler.cpp
├── profiler.h
├── trt_utils.cpp
├── trt_utils.h
├── yolo.cpp
├── yolo.h
├── yolo_config_parser.cpp
├── yolo_config_parser.h
├── yoloplugin_lib.cpp
├── yoloplugin_lib.h
├── yolov2.cpp
├── yolov2.h
├── yolov3.cpp
├── yolov3.h
├── yolov4.cpp
├── yolov4.h
├── yolov5.cpp
└── yolov5.h
├── package.xml
├── samples
├── parallel_executor.cpp
└── sample_detector.cpp
└── src
├── lightnet_trt_core.cpp
├── lightnet_trt_core.hpp
├── lightnet_trt_node.cpp
├── lightnet_trt_node.hpp
├── main.cpp
├── utils.cpp
└── utils.hpp
/.gitignore:
--------------------------------------------------------------------------------
1 | *.o
2 | *.a
3 | *.dSYM
4 | *.csv
5 | *.out
6 | *.png
7 | *.so
8 | *.exe
9 | *.dll
10 | *.lib
11 | *.dylib
12 | *.pyc
13 | *~
14 | mnist/
15 | data/
16 | caffe/
17 | grasp/
18 | images/
19 | opencv/
20 | convnet/
21 | decaf/
22 | submission/
23 | cfg/
24 | temp/
25 | build/darknet/*
26 | build_*/
27 | ninja/
28 | ninja.zip
29 | vcpkg_installed/
30 | !build/darknet/YoloWrapper.cs
31 | .fuse*
32 | *.weights
33 | *.db
34 | *.engine
35 | *calibration.table
36 | build/*
37 | build/*.cmake
38 | build/*.ninja
39 | build/*.txt
40 | build/*.json
41 | build/CMakeFiles/
42 | build/detect_cuda_compute_capabilities.cu
43 | build/.ninja_deps
44 | build/.ninja_log
45 | build/Makefile
46 | */vcpkg-manifest-install.log
47 | build.log
48 | __pycache__/
49 |
50 | # OS Generated #
51 | .DS_Store*
52 | ehthumbs.db
53 | Icon?
54 | Thumbs.db
55 | *.swp
56 |
57 | # IDE generated #
58 | .vs/
59 | .vscode/
60 |
61 | # Managed by CMake
62 | src/version.h
63 |
64 | # Build artifacts
65 | lib/
66 | share/
67 | include/darknet/
68 | uselib
69 | uselib_track
70 | darknet
71 | vcpkg/
72 |
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # cmake_minimum_required(VERSION 3.1)
2 | cmake_minimum_required(VERSION 3.14)
3 | project(lightnet_trt VERSION 1.0)
4 |
5 | enable_language(CUDA) # not supported on CMAKE 3.14
6 |
7 | find_package(OpenMP REQUIRED)
8 | set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr") # Added for CMAKE 3.14
9 | set(CMAKE_CXX_COMPILER "/usr/bin/g++")
10 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17 -O3 -Wno-unused-function -Wno-maybe-uninitialized -Wno-narrowing -Wno-misleading-indentation -Wno-unused-but-set-variable -Wno-unused-variable -Wno-return-type -Wno-deprecated-declarations -Wno-write-strings -Wno-reorder -Wno-sign-compare -Wno-unused-parameter -Wno-unused-value -Wno-uninitialized ${OpenMP_CXX_FLAGS} -fopenmp")
11 | #set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17 -O0 -g -Wno-write-strings")
12 | #set(CMAKE_CXX_FLAGS_DEBUG "-O0 -ggdb -g")
13 | set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-rpath -Wl,$ORIGIN")
14 |
15 | include_directories(/usr/local/cuda/include)
16 | find_package(CUDA REQUIRED)
17 |
18 | find_package(OpenCV REQUIRED)
19 |
20 | file(GLOB_RECURSE sources modules/*.hpp modules/*.cpp modules/*.h modules/*.cu extra/*.h module/preprocess.cu)
21 | add_library(detector SHARED ${sources})
22 |
23 | target_include_directories(detector PRIVATE extra/ modules/ ${OpenCV_INCLUDE_DIRS} ${CUDA_TOOLKIT_ROOT_DIR}/include)
24 | target_link_libraries(detector ${CUDA_LIBRARIES} nvinfer nvinfer_plugin nvparsers gflags "stdc++fs")
25 |
26 | add_executable(lightnet_trt samples/sample_detector.cpp ${sources})
27 |
28 | target_include_directories(lightnet_trt PRIVATE modules/ extra/)
29 | target_link_libraries(lightnet_trt ${CUDA_LIBRARIES} ${OpenCV_LIBS} nvinfer nvinfer_plugin nvparsers gflags boost_system boost_filesystem "stdc++fs" "stdc++")
30 |
31 |
32 | # ROS 2 package
33 | if(BUILD_ROS2_LIGHTNET_TRT)
34 | find_package(autoware_cmake REQUIRED)
35 | autoware_package()
36 |
37 | find_package(Eigen3 REQUIRED)
38 |
39 | # build nodelet
40 | # TODO
41 |
42 | # build node
43 | ament_auto_add_executable(lightnet_trt_node
44 | src/main.cpp
45 | src/lightnet_trt_node.cpp
46 | src/lightnet_trt_core.cpp
47 | src/utils.cpp
48 | )
49 | ament_target_dependencies(lightnet_trt_node)
50 | target_include_directories(lightnet_trt_node PRIVATE modules/ extra/)
51 | target_link_libraries(lightnet_trt_node detector)
52 | target_link_libraries(lightnet_trt_node ${CUDA_LIBRARIES} ${OpenCV_LIBS} Eigen3::Eigen nvinfer nvinfer_plugin nvparsers gflags boost_system boost_filesystem "stdc++fs" "stdc++")
53 |
54 | ament_auto_package(INSTALL_TO_SHARE
55 | launch
56 | configs
57 | )
58 | endif()
59 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # LightNet-TRT: High-Efficiency and Real-Time CNN Implementation on Edge AI
2 |
3 | LightNet-TRT is a CNN implementation optimized for edge AI devices that combines the advantages of LightNet [[1]](#references) and TensorRT [[2]](#references). LightNet is a lightweight and high-performance neural network framework designed for edge devices, while TensorRT is a high-performance deep learning inference engine developed by NVIDIA for optimizing and running deep learning models on GPUs. LightNet-TRT uses the Network Definition API provided by TensorRT to integrate LightNet into TensorRT, allowing it to run efficiently and in real-time on edge devices.
4 |
5 | 
6 |
7 |
8 | ## Key Improvements
9 |
10 | ### 2:4 Structured Sparsity
11 |
12 | LightNet-TRT utilizes 2:4 structured sparsity [[3]](#references) to further optimize the network. 2:4 structured sparsity means that two values must be zero in each contiguous block of four values, resulting in a 50% reduction in the number of weights. This technique allows the network to use fewer weights and computations while maintaining accuracy.
13 |
14 | 
15 |
16 | ### NVDLA Execution
17 |
18 | LightNet-TRT also supports the execution of the neural network on the NVIDIA Deep Learning Accelerator (NVDLA) [[4]](#references) , a free and open architecture that provides high performance and low power consumption for deep learning inference on edge devices. By using NVDLA, LightNet-TRT can further improve the efficiency and performance of the network on edge devices.
19 |
20 | 
21 |
22 |
23 | ### Multi-Precision Quantization
24 |
25 | In addition to post training quantization [[5]](#references), LightNet-TRT also supports multi-precision quantization, which allows the network to use different precision for weights and activations. By using mixed precision, LightNet-TRT can further reduce the memory usage and computational requirements of the network while still maintaining accuracy. By writing it in CFG, you can set the precision for each layer of your CNN.
26 |
27 | 
28 |
29 |
30 |
31 | ### Multitask Execution (Detection/Segmentation)
32 |
33 | LightNet-TRT also supports multitask execution, allowing the network to perform both object detection and segmentation tasks simultaneously. This enables the network to perform multiple tasks efficiently on edge devices, saving computational resources and power.
34 |
35 | [](https://www.youtube.com/watch?v=TmlW-b_t3sQ)
36 |
37 | ## Installation
38 |
39 | ### Requirements
40 |
41 | - CUDA 11.0 or higher
42 | - TensorRT 8.0 or higher
43 | - OpenCV 3.0 or higher
44 |
45 | ### Steps
46 |
47 | 1. Clone the repository.
48 |
49 | ```shell
50 | $ git clone https://github.com/daniel89710/lightNet-TRT.git
51 | $ cd lightNet-TRT
52 | ```
53 |
54 | 2. Install libraries.
55 |
56 | ```shell
57 | $ sudo apt update
58 | $ sudo apt install libgflags-dev
59 | $ sudo apt install libboost-all-dev
60 | ```
61 |
62 | 3. Compile the TensorRT implementation.
63 |
64 | ```shell
65 | $ mkdir build
66 | $ cmake ../
67 | $ make -j
68 | ```
69 |
70 | ## Model
71 | | Model | Resolutions | GFLOPS | Params | Precision | Sparsity | DNN time on RTX3080 | DNN time on Jetson Orin NX 16GB GPU | DNN time on Jetson Orin NX 16GB DLA| DNN time on Jetson Orin Nano 4GB GPU | cfg | weights |
72 | |---|---|---|---|---|---|---|---|---|---|---|---|
73 | | lightNet | 1280x960 | 58.01 | 9.0M | int8 | 49.8% | 1.30ms | 7.6ms | 14.2ms | 14.9ms | [github](https://github.com/daniel89710/lightNet/blob/master/cfg/lightNet-BDD100K-1280x960.cfg) |[GoogleDrive](https://drive.google.com/file/d/1qTBQ0BkIYqcyu1BwC54_Z9T1_b702HKf/view?usp=sharing) |
74 | | LightNet+Semseg | 1280x960 | 76.61 | 9.7M | int8 | 49.8% | 2.06ms | 15.3ms | 23.2ms | 26.2ms | [github](https://github.com/daniel89710/lightNet-TRT/blob/main/configs/lightNet-BDD100K-det-semaseg-1280x960.cfg) | [GoogleDrive](https://drive.google.com/file/d/1ttdVtlDiPun13EQCB4Nyls3Q8w5aXg1i/view?usp=sharing)|
75 | | lightNet pruning | 1280x960 | 35.03 | 4.3M | int8 | 49.8% | 1.21ms | 8.89ms | 11.68ms | 13.75ms | [github](https://github.com/daniel89710/lightNet-TRT/blob/main/configs/lightNet-BDD100K-1280x960-chPruning.cfg) |[GoogleDrive](https://drive.google.com/file/d/1OGZApPeNH7K08-89eJ8tGhzA9kkRFLii/view?usp=sharing) |
76 | | LightNet pruning +SemsegLight | 1280x960 | 44.35 | 4.9M | int8 | 49.8% | 1.80ms | 9.89ms | 15.26ms | 23.35ms | [github](https://github.com/daniel89710/lightNet-TRT/blob/main/configs/lightNet-BDD100K-chPruning-det-semaseg-1280x960.cfg) | [GoogleDrive](https://drive.google.com/file/d/1dytYnqS4h_5YK73tr6DOZTUYUEKZYsFe/view?usp=drive_link)|
77 |
78 | * "DNN time" refers to the time measured by IProfiler during the enqueueV2 operation, excluding pre-process and post-process times.
79 | * Orin NX has three independent AI processors, allowing lightNet to be parallelized across a GPU and two DLAs.
80 | * Orin Nano 4GB has only iGPU with 512 CUDA cores.
81 | ## Usage
82 |
83 | ### Converting a LightNet model to a TensorRT engine
84 |
85 | Build FP32 engine
86 | ```shell
87 | $ ./lightNet-TRT --flagfile ../configs/lightNet-BDD100K-det-semaseg-1280x960.txt --precision kFLOAT
88 | ```
89 |
90 | Build FP16(HALF) engine
91 | ```shell
92 | $ ./lightNet-TRT --flagfile ../configs/lightNet-BDD100K-det-semaseg-1280x960.txt --precision kHALF
93 | ```
94 |
95 | Build INT8 engine
96 | (You need to prepare a list for calibration in "configs/calibration_images.txt".)
97 | ```shell
98 | $ ./lightNet-TRT --flagfile ../configs/lightNet-BDD100K-det-semaseg-1280x960.txt --precision kINT8
99 | ```
100 |
101 | Build DLA engine (Supported by only Xavier and Orin)
102 | ```shell
103 | $ ./lightNet-TRT --flagfile ../configs/lightNet-BDD100K-det-semaseg-1280x960.txt --precision kINT8 --dla [0/1]
104 | ```
105 |
106 | ### Inference with the TensorRT engine
107 |
108 | Inference from images
109 | ```shell
110 | $ ./lightNet-TRT --flagfile ../configs/lightNet-BDD100K-det-semaseg-1280x960.txt --precision [kFLOAT/kHALF/kINT8] {--dla [0/1]} --d DIRECTORY
111 | ```
112 |
113 | Inference from images
114 | ```shell
115 | $ ./lightNet-TRT --flagfile ../configs/lightNet-BDD100K-det-semaseg-1280x960.txt --precision [kFLOAT/kHALF/kINT8] {--dla [0/1]} --d VIDEO
116 | ```
117 |
118 | ### Inference with ROS 2 node
119 |
120 | You can build as follows. Note that current implementation requires Autoware installed and sourced.
121 |
122 | ```bash
123 | mkdir lightnet_trt_ws/src -p
124 | cd lightnet_trt_ws/src
125 | git clone https://github.com/kminoda/lightNet-TRT-ROS2.git
126 | cd ..
127 | colcon build --symlink-install --cmake-args -DCMAKE_BUILD_TYPE=Release -DBUILD_ROS2_LIGHTNET_TRT=ON
128 | ```
129 |
130 | Then, you can run the node as follows.
131 |
132 | ```bash
133 | ros2 launch lightnet_trt lightnet_trt.launch.xml
134 | ```
135 |
136 | ## Implementation
137 |
138 | LightNet-TRT is built on the LightNet framework and integrates with TensorRT using the Network Definition API. The implementation is based on the following repositories:
139 |
140 | - LightNet: [https://github.com/daniel89710/lightNet](https://github.com/daniel89710/lightNet)
141 | - TensorRT: [https://github.com/NVIDIA/TensorRT](https://github.com/NVIDIA/TensorRT)
142 | - NVIDIA DeepStream SDK: [https://github.com/NVIDIA-AI-IOT/deepstream\_reference\_apps/tree/restructure](https://github.com/NVIDIA-AI-IOT/deepstream_reference_apps/tree/restructure)
143 | - YOLO-TensorRT: [https://github.com/enazoe/yolo-tensorrt](https://github.com/enazoe/yolo-tensorrt)
144 |
145 | ## Conclusion
146 |
147 | LightNet-TRT is a powerful and efficient implementation of CNNs using Edge AI. With its advanced features and integration with TensorRT, it is an excellent choice for real-time object detection and semantic segmentation applications on edge devices.
148 |
149 | # References
150 | [1]. [LightNet](https://github.com/daniel89710/lightNet)
151 | [2]. [TensorRT](https://developer.nvidia.com/tensorrt)
152 | [3]. [Accelerating Inference with Sparsity Using the NVIDIA Ampere Architecture and NVIDIA TensorRT](https://developer.nvidia.com/blog/accelerating-inference-with-sparsity-using-ampere-and-tensorrt/)
153 | [4]. [NVDLA](http://nvdla.org/)
154 | [5]. [Achieving FP32 Accuracy for INT8 Inference Using Quantization Aware Training with NVIDIA TensorRT](https://developer.nvidia.com/blog/achieving-fp32-accuracy-for-int8-inference-using-quantization-aware-training-with-tensorrt/)
155 |
156 |
157 |
--------------------------------------------------------------------------------
/configs/bdd100k.names:
--------------------------------------------------------------------------------
1 | car
2 | bus
3 | person
4 | bike
5 | truck
6 | motor
7 | train
8 | rider
9 | traffic_sign
10 | traffic_light
--------------------------------------------------------------------------------
/configs/lightNet-BDD100K-1280x960-chPruning.txt:
--------------------------------------------------------------------------------
1 | # MIT License
2 |
3 | # Copyright (c) 2018 NVIDIA CORPORATION. All rights reserved.
4 |
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 |
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 |
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
23 | # config_file_path : Darknet cfg file
24 | # wts_file_path : Darknet weights file
25 |
26 | --network_type=YOLOV3
27 | --config_file_path=../configs/lightNet-BDD100K-1280x960-chPruning.cfg
28 | --wts_file_path=../configs/lightNet-BDD100K-1280x960-chPruning.weights
29 |
30 | #Support kFLOAT/kHALF/kINT8
31 | #kINT8 requires 'calibration_images.txt'
32 | --precision=kFLOAT
33 |
34 | #Support DLA for Xavier/Orin series
35 | #--dla=[0/1]
36 | --mp=true
37 |
--------------------------------------------------------------------------------
/configs/lightNet-BDD100K-1280x960.txt:
--------------------------------------------------------------------------------
1 | # MIT License
2 |
3 | # Copyright (c) 2018 NVIDIA CORPORATION. All rights reserved.
4 |
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 |
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 |
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
23 | # config_file_path : Darknet cfg file
24 | # wts_file_path : Darknet weights file
25 |
26 | --network_type=YOLOV3
27 | --config_file_path=../configs/lightNet-BDD100K-1280x960.cfg
28 | --wts_file_path=../configs/lightNet-BDD100K-1280x960.weights
29 |
30 | #Support kFLOAT/kHALF/kINT8
31 | --precision=kFLOAT
32 |
33 | #Support DLA for Xavier/Orin series
34 | #--dla=[0/1]
35 | --mp=true
36 |
--------------------------------------------------------------------------------
/configs/lightNet-BDD100K-chPruning-det-semaseg-1280x960.txt:
--------------------------------------------------------------------------------
1 | # MIT License
2 |
3 | # Copyright (c) 2018 NVIDIA CORPORATION. All rights reserved.
4 |
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 |
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 |
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
23 | ### This is a sample config file for yolo plugin and the trt-yolo-app
24 | # Statements starting with '#' are ignored.
25 |
26 | # The following 4 config params are required for yolo plugin and trt-yolo-app
27 |
28 | # network_type : Type of network architecture. Choose from yolov2, yolov2-tiny, yolov3 and yolov3-tiny
29 | # config_file_path : Darknet cfg file
30 | # wts_file_path : Darknet weights file
31 | # labels_file_path : Text file with a list of object class labels
32 |
33 | --network_type=YOLOV3
34 | --config_file_path=../configs/lightNet-BDD100K-chPruning-det-semaseg-1280x960.cfg
35 | --wts_file_path=../configs/lightNet-BDD100K-chPruning-det-semaseg-1280x960.weights
36 |
37 | #Support kFLOAT/kHALF/kINT8
38 | --precision=kFLOAT
39 |
40 | #Support DLA for Xavier/Orin series
41 | #--dla=[0/1]
42 | #--mp=true
43 |
--------------------------------------------------------------------------------
/configs/lightNet-BDD100K-det-semaseg-1280x960.txt:
--------------------------------------------------------------------------------
1 | # MIT License
2 |
3 | # Copyright (c) 2018 NVIDIA CORPORATION. All rights reserved.
4 |
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 |
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 |
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
23 | ### This is a sample config file for yolo plugin and the trt-yolo-app
24 | # Statements starting with '#' are ignored.
25 |
26 | # The following 4 config params are required for yolo plugin and trt-yolo-app
27 |
28 | # network_type : Type of network architecture. Choose from yolov2, yolov2-tiny, yolov3 and yolov3-tiny
29 | # config_file_path : Darknet cfg file
30 | # wts_file_path : Darknet weights file
31 | # labels_file_path : Text file with a list of object class labels
32 |
33 | --network_type=YOLOV3
34 | --config_file_path=../configs/lightNet-BDD100K-det-semaseg-1280x960.cfg
35 | --wts_file_path=../configs/lightNet-BDD100K-det-semaseg-1280x960.weights
36 |
37 | #Support kFLOAT/kHALF/kINT8
38 | --precision=kFLOAT
39 |
40 | #Support DLA for Xavier/Orin series
41 | #--dla=[0/1]
42 | #--mp=true
43 |
--------------------------------------------------------------------------------
/configs/yolov8x-BDD100K-640x640-T4-lane4cls.txt:
--------------------------------------------------------------------------------
1 | # MIT License
2 |
3 | # Copyright (c) 2018 NVIDIA CORPORATION. All rights reserved.
4 |
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 |
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 |
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
23 | ### This is a sample config file for yolo plugin and the trt-yolo-app
24 | # Statements starting with '#' are ignored.
25 |
26 | # The following 4 config params are required for yolo plugin and trt-yolo-app
27 |
28 | # network_type : Type of network architecture. Choose from yolov2, yolov2-tiny, yolov3 and yolov3-tiny
29 | # config_file_path : Darknet cfg file
30 | # wts_file_path : Darknet weights file
31 | # labels_file_path : Text file with a list of object class labels
32 |
33 | --network_type=YOLOV8X
34 | --config_file_path=../configs/yolov8x-BDD100K-640x640-T4-lane4cls.cfg
35 | --wts_file_path=../weights/yolov8x-BDD100K-640x640-T4-lane4cls-scratch-pseudo-finetune_last.weights
36 |
37 | #Support kFLOAT/kHALF/kINT8
38 | --precision=kFLOAT
39 |
40 | #Support DLA for Xavier/Orin series
41 | #--dla=[0/1]
42 | #--mp=true
43 |
--------------------------------------------------------------------------------
/extra/API.h:
--------------------------------------------------------------------------------
1 | #ifdef API_EXPORTS
2 |
3 | #if defined(_MSC_VER)
4 | #define API __declspec(dllexport)
5 | #else
6 | #define API __attribute__((visibility("default")))
7 | #endif
8 |
9 | #else
10 |
11 | #if defined(_MSC_VER)
12 | #define API __declspec(dllimport)
13 | #else
14 | #define API
15 | #endif
16 |
17 | #endif
--------------------------------------------------------------------------------
/extra/class_timer.hpp:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include
4 | #include
5 | #include
6 |
7 | class Timer
8 | {
9 | public:
10 | Timer() : beg_(clock_::now())
11 | {}
12 | void reset()
13 | {
14 | beg_ = clock_::now();
15 | }
16 |
17 | double elapsed() const
18 | {
19 | return std::chrono::duration_cast(clock_::now() - beg_).count();
20 | }
21 |
22 | void out(std::string message = "")
23 | {
24 | double t = elapsed();
25 | std::cout << message << " elasped time:" << t << "ms" << std::endl;
26 | reset();
27 | }
28 |
29 | double get_duration()const
30 | {
31 | return elapsed();
32 | }
33 | private:
34 | using clock_ = std::chrono::high_resolution_clock;
35 | using second_ = std::chrono::duration;
36 | std::chrono::time_point beg_;
37 | };
38 |
--------------------------------------------------------------------------------
/launch/lightnet_trt.launch.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
--------------------------------------------------------------------------------
/modules/calibrator.cpp:
--------------------------------------------------------------------------------
1 | /**
2 | MIT License
3 |
4 | Copyright (c) 2018 NVIDIA CORPORATION. All rights reserved.
5 |
6 | Permission is hereby granted, free of charge, to any person obtaining a copy
7 | of this software and associated documentation files (the "Software"), to deal
8 | in the Software without restriction, including without limitation the rights
9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | copies of the Software, and to permit persons to whom the Software is
11 | furnished to do so, subject to the following conditions:
12 |
13 | The above copyright notice and this permission notice shall be included in all
14 | copies or substantial portions of the Software.
15 |
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | SOFTWARE.
23 | *
24 | */
25 |
26 | #include "calibrator.h"
27 | #include
28 | #include
29 | #include
30 | #include
31 | #include
32 | Int8EntropyCalibrator::Int8EntropyCalibrator(const uint32_t& batchSize, const std::string& calibImages,
33 | const std::string& calibImagesPath,
34 | const std::string& calibTableFilePath,
35 | const uint64_t& inputSize, const uint32_t& inputH,
36 | const uint32_t& inputW, const std::string& inputBlobName,
37 | const std::string &s_net_type_) :
38 | m_BatchSize(batchSize),
39 | m_InputH(inputH),
40 | m_InputW(inputW),
41 | m_InputSize(inputSize),
42 | m_InputCount(batchSize * inputSize),
43 | m_InputBlobName(inputBlobName),
44 | m_CalibTableFilePath(calibTableFilePath),
45 | m_ImageIndex(0),
46 | _s_net_type(s_net_type_)
47 | {
48 | if (!fileExists(m_CalibTableFilePath, false))
49 | {
50 | m_ImageList = loadImageList(calibImages, calibImagesPath);
51 | m_ImageList.resize(static_cast(m_ImageList.size() / m_BatchSize) * m_BatchSize);
52 | std::random_shuffle(m_ImageList.begin(), m_ImageList.end(),
53 | [](int i) { return rand() % i; });
54 | }
55 |
56 | NV_CUDA_CHECK(cudaMalloc(&m_DeviceInput, m_InputCount * sizeof(float)));
57 | }
58 |
59 | Int8EntropyCalibrator::~Int8EntropyCalibrator() { NV_CUDA_CHECK(cudaFree(m_DeviceInput)); }
60 |
61 | bool Int8EntropyCalibrator::getBatch(void* bindings[], const char* names[], int nbBindings)noexcept
62 | {
63 | if (m_ImageIndex + m_BatchSize >= m_ImageList.size()) return false;
64 |
65 | // Load next batch
66 | std::vector dsImages(m_BatchSize);
67 | for (uint32_t j = m_ImageIndex; j < m_ImageIndex + m_BatchSize; ++j)
68 | {
69 | dsImages.at(j - m_ImageIndex) = DsImage(m_ImageList.at(j), _s_net_type, m_InputH, m_InputW);
70 | }
71 | m_ImageIndex += m_BatchSize;
72 |
73 | cv::Mat trtInput = blobFromDsImages(dsImages, m_InputH, m_InputW);
74 |
75 | NV_CUDA_CHECK(cudaMemcpy(m_DeviceInput, trtInput.ptr(0), m_InputCount * sizeof(float),
76 | cudaMemcpyHostToDevice));
77 | assert(!strcmp(names[0], m_InputBlobName.c_str()));
78 | bindings[0] = m_DeviceInput;
79 | return true;
80 | }
81 |
82 | const void* Int8EntropyCalibrator::readCalibrationCache(size_t& length) noexcept
83 | {
84 | void* output;
85 | m_CalibrationCache.clear();
86 | assert(!m_CalibTableFilePath.empty());
87 | std::ifstream input(m_CalibTableFilePath, std::ios::binary | std::ios::in);
88 | input >> std::noskipws;
89 | if (m_ReadCache && input.good())
90 | std::copy(std::istream_iterator(input), std::istream_iterator(),
91 | std::back_inserter(m_CalibrationCache));
92 |
93 | length = m_CalibrationCache.size();
94 | if (length)
95 | {
96 | std::cout << "Using cached calibration table to build the engine" << std::endl;
97 | output = &m_CalibrationCache[0];
98 | }
99 |
100 | else
101 | {
102 | std::cout << "New calibration table will be created to build the engine" << std::endl;
103 | output = nullptr;
104 | }
105 |
106 | return output;
107 | }
108 |
109 | void Int8EntropyCalibrator::writeCalibrationCache(const void* cache, size_t length) noexcept
110 | {
111 | assert(!m_CalibTableFilePath.empty());
112 | std::ofstream output(m_CalibTableFilePath, std::ios::binary);
113 | output.write(reinterpret_cast(cache), length);
114 | output.close();
115 | }
--------------------------------------------------------------------------------
/modules/calibrator.h:
--------------------------------------------------------------------------------
1 | /**
2 | MIT License
3 |
4 | Copyright (c) 2018 NVIDIA CORPORATION. All rights reserved.
5 |
6 | Permission is hereby granted, free of charge, to any person obtaining a copy
7 | of this software and associated documentation files (the "Software"), to deal
8 | in the Software without restriction, including without limitation the rights
9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | copies of the Software, and to permit persons to whom the Software is
11 | furnished to do so, subject to the following conditions:
12 |
13 | The above copyright notice and this permission notice shall be included in all
14 | copies or substantial portions of the Software.
15 |
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | SOFTWARE.
23 | *
24 | */
25 | #ifndef _CALIBRATOR_H_
26 | #define _CALIBRATOR_H_
27 |
28 | #include "NvInfer.h"
29 | #include "ds_image.h"
30 | #include "trt_utils.h"
31 |
32 | class Int8EntropyCalibrator : public nvinfer1::IInt8EntropyCalibrator2
33 | {
34 | public:
35 | Int8EntropyCalibrator(const uint32_t& batchSize, const std::string& calibImages,
36 | const std::string& calibImagesPath, const std::string& calibTableFilePath,
37 | const uint64_t& inputSize, const uint32_t& inputH, const uint32_t& inputW,
38 | const std::string& inputBlobName,const std::string &s_net_type_);
39 | virtual ~Int8EntropyCalibrator();
40 |
41 | int getBatchSize() const noexcept override { return m_BatchSize; }
42 | bool getBatch(void* bindings[], const char* names[], int nbBindings) noexcept override;
43 | const void* readCalibrationCache(size_t& length) noexcept override;
44 | void writeCalibrationCache(const void* cache, size_t length) noexcept override;
45 |
46 | private:
47 | const uint32_t m_BatchSize;
48 | const uint32_t m_InputH;
49 | const uint32_t m_InputW;
50 | const uint64_t m_InputSize;
51 | const uint64_t m_InputCount;
52 | const std::string m_InputBlobName;
53 | const std::string _s_net_type;
54 | const std::string m_CalibTableFilePath{nullptr};
55 | uint32_t m_ImageIndex;
56 | bool m_ReadCache{true};
57 | void* m_DeviceInput{nullptr};
58 | std::vector m_ImageList;
59 | std::vector m_CalibrationCache;
60 | };
61 |
62 | #endif
--------------------------------------------------------------------------------
/modules/chunk.cu:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include
5 | #include "chunk.h"
6 | #include
7 | #define NV_CUDA_CHECK(status) \
8 | { \
9 | if (status != 0) \
10 | { \
11 | std::cout << "Cuda failure: " << cudaGetErrorString(status) << " in file " << __FILE__ \
12 | << " at line " << __LINE__ << std::endl; \
13 | abort(); \
14 | } \
15 | }
16 |
17 |
18 | namespace nvinfer1
19 | {
20 | Chunk::Chunk()
21 | {
22 |
23 | }
24 | Chunk::Chunk(const void* buffer, size_t size)
25 | {
26 | assert(size == sizeof(_n_size_split));
27 | _n_size_split = *reinterpret_cast(buffer);
28 | }
29 | Chunk::~Chunk()
30 | {
31 |
32 | }
33 | int Chunk::getNbOutputs() const noexcept
34 | {
35 | return 2;
36 | }
37 |
38 | Dims Chunk::getOutputDimensions(int index, const Dims* inputs, int nbInputDims)noexcept
39 | {
40 | assert(nbInputDims == 1);
41 | assert(index == 0 || index == 1);
42 | return Dims3(inputs[0].d[0] / 2, inputs[0].d[1], inputs[0].d[2]);
43 | }
44 |
45 | int Chunk::initialize() noexcept
46 | {
47 | return 0;
48 | }
49 |
50 | void Chunk::terminate() noexcept
51 | {
52 | }
53 |
54 | size_t Chunk::getWorkspaceSize(int maxBatchSize) const noexcept
55 | {
56 | return 0;
57 | }
58 |
59 | /*int Chunk::enqueue(int batchSize,
60 | const void* const* inputs,
61 | void** outputs,
62 | void* workspace,
63 | cudaStream_t stream)noexcept*/
64 |
65 | int Chunk::enqueue(int batchSize,
66 | const void* const* inputs,
67 | void* const* outputs,
68 | void* workspace,
69 | cudaStream_t stream) noexcept
70 | {
71 | //batch
72 | for (int b = 0; b < batchSize; ++b)
73 | {
74 | printf("Chunk %d, %d\n", _n_size_split, _n_size_split/sizeof(float));
75 | /*
76 | NV_CUDA_CHECK(cudaMemcpyAsync((char*)outputs[0] + b * _n_size_split, (char*)inputs[0] + b * 2 * _n_size_split + _n_size_split/sizeof(float), _n_size_split, cudaMemcpyDeviceToDevice, stream));
77 | NV_CUDA_CHECK(cudaMemcpyAsync((char*)outputs[1] + b * _n_size_split, (char*)inputs[0] + b * 2 * _n_size_split + _n_size_split/sizeof(float), _n_size_split, cudaMemcpyDeviceToDevice, stream));
78 | */
79 | NV_CUDA_CHECK(cudaMemcpy((char*)outputs[0] + b * _n_size_split, (char*)inputs[0] + b * 2 * _n_size_split + _n_size_split/sizeof(float), _n_size_split, cudaMemcpyDeviceToDevice));
80 | NV_CUDA_CHECK(cudaMemcpy((char*)outputs[1] + b * _n_size_split, (char*)inputs[0] + b * 2 * _n_size_split + _n_size_split/sizeof(float), _n_size_split, cudaMemcpyDeviceToDevice));
81 | }
82 | // NV_CUDA_CHECK(cudaMemcpy(outputs[0], inputs[0], _n_size_split, cudaMemcpyDeviceToDevice));
83 | // NV_CUDA_CHECK(cudaMemcpy(outputs[1], (void*)((char*)inputs[0] + _n_size_split), _n_size_split, cudaMemcpyDeviceToDevice));
84 | return 0;
85 | }
86 |
87 | size_t Chunk::getSerializationSize() const noexcept
88 | {
89 | return sizeof(_n_size_split);
90 | }
91 |
92 | void Chunk::serialize(void *buffer)const noexcept
93 | {
94 | *reinterpret_cast(buffer) = _n_size_split;
95 | }
96 |
97 | const char* Chunk::getPluginType()const noexcept
98 | {
99 | return "CHUNK_TRT";
100 | }
101 | const char* Chunk::getPluginVersion() const noexcept
102 | {
103 | return "1.0";
104 | }
105 |
106 | void Chunk::destroy() noexcept
107 | {
108 | delete this;
109 | }
110 |
111 | void Chunk::setPluginNamespace(const char* pluginNamespace) noexcept
112 | {
113 | _s_plugin_namespace = pluginNamespace;
114 | }
115 |
116 | const char* Chunk::getPluginNamespace() const noexcept
117 | {
118 | return _s_plugin_namespace.c_str();
119 | }
120 |
121 | DataType Chunk::getOutputDataType(int index,
122 | const nvinfer1::DataType* inputTypes,
123 | int nbInputs) const noexcept
124 | {
125 | assert(index == 0 || index == 1);
126 | return DataType::kFLOAT;
127 | }
128 |
129 | bool Chunk::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const noexcept
130 | {
131 | return false;
132 | }
133 |
134 | bool Chunk::canBroadcastInputAcrossBatch(int inputIndex) const noexcept
135 | {
136 | return false;
137 | }
138 |
139 | void Chunk::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) {}
140 |
141 | void Chunk::configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput)
142 | {
143 | _n_size_split = in->dims.d[0] / 2 * in->dims.d[1] * in->dims.d[2] *sizeof(float);
144 | printf("Chunk in configuring %d, %d\n", _n_size_split, _n_size_split/sizeof(float));
145 | }
146 | void Chunk::detachFromContext() {}
147 |
148 | bool Chunk::supportsFormat(DataType type, PluginFormat format) const noexcept
149 | {
150 | return (type == DataType::kFLOAT && format == PluginFormat::kLINEAR);
151 | }
152 |
153 | void Chunk::configureWithFormat(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, DataType type, PluginFormat format, int maxBatchSize) noexcept
154 | {
155 |
156 | }
157 |
158 | // Clone the plugin
159 | IPluginV2* Chunk::clone() const noexcept
160 | {
161 | Chunk *p = new Chunk();
162 | p->_n_size_split = _n_size_split;
163 | p->setPluginNamespace(_s_plugin_namespace.c_str());
164 | return p;
165 | }
166 |
167 | //----------------------------
168 | PluginFieldCollection ChunkPluginCreator::_fc{};
169 | std::vector ChunkPluginCreator::_vec_plugin_attributes;
170 |
171 | ChunkPluginCreator::ChunkPluginCreator()
172 | {
173 | _vec_plugin_attributes.clear();
174 | _fc.nbFields = _vec_plugin_attributes.size();
175 | _fc.fields = _vec_plugin_attributes.data();
176 | }
177 |
178 | const char* ChunkPluginCreator::getPluginName() const noexcept
179 | {
180 | return "CHUNK_TRT";
181 | }
182 |
183 | const char* ChunkPluginCreator::getPluginVersion() const noexcept
184 | {
185 | return "1.0";
186 | }
187 |
188 | const PluginFieldCollection* ChunkPluginCreator::getFieldNames()noexcept
189 | {
190 | return &_fc;
191 | }
192 |
193 | IPluginV2* ChunkPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc)noexcept
194 | {
195 | Chunk* obj = new Chunk();
196 | obj->setPluginNamespace(_s_name_space.c_str());
197 | return obj;
198 | }
199 |
200 | IPluginV2* ChunkPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength)noexcept
201 | {
202 | Chunk* obj = new Chunk(serialData,serialLength);
203 | obj->setPluginNamespace(_s_name_space.c_str());
204 | return obj;
205 | }
206 |
207 | void ChunkPluginCreator::setPluginNamespace(const char* libNamespace)noexcept
208 | {
209 | _s_name_space = libNamespace;
210 | }
211 |
212 | const char* ChunkPluginCreator::getPluginNamespace() const noexcept
213 | {
214 | return _s_name_space.c_str();
215 | }
216 | }//namespace nvinfer1
217 |
--------------------------------------------------------------------------------
/modules/chunk.h:
--------------------------------------------------------------------------------
1 | #ifndef CHUNK_H_
2 | #define CHUNK_H_
3 |
4 | //#include "NvInfer.h"
5 | //#include "NvInferPlugin.h"
6 | //#include "NvInferRuntimeCommon.h"
7 | //#include
8 | //#include
9 | //#include
10 | //#include
11 | //#include
12 | //#include
13 | //#include
14 |
15 | #include
16 | #include
17 | #include "NvInfer.h"
18 |
19 | namespace nvinfer1
20 | {
21 | class Chunk : public IPluginV2
22 | {
23 | public:
24 | Chunk();
25 | Chunk(const void* buffer, size_t length);
26 | ~Chunk();
27 | int getNbOutputs()const noexcept override;
28 | Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims)noexcept override;
29 | int initialize()noexcept override;
30 | void terminate()noexcept override;
31 | size_t getWorkspaceSize(int maxBatchSize) const noexcept override;
32 | // int enqueue(int batchSize, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream)noexcept override;
33 |
34 | int enqueue(int batchSize, const void* const* inputs, void* const* outputs, void* workspace,
35 | cudaStream_t stream) noexcept override;
36 |
37 | size_t getSerializationSize() const noexcept override;
38 | void serialize(void* buffer) const noexcept override;
39 | const char* getPluginType() const noexcept override;
40 | const char* getPluginVersion() const noexcept override;
41 | void destroy()noexcept override;
42 | void setPluginNamespace(const char* pluginNamespace)noexcept override;
43 | const char* getPluginNamespace() const noexcept override;
44 | DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const noexcept;
45 | bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const noexcept;
46 | bool canBroadcastInputAcrossBatch(int inputIndex) const noexcept;
47 | void attachToContext(
48 | cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator);
49 | void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput);
50 | void detachFromContext();
51 | bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const
52 | {
53 | return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT;
54 | }
55 | IPluginV2* clone() const noexcept override;
56 | bool supportsFormat(DataType type, PluginFormat format) const noexcept override;
57 | void configureWithFormat(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, DataType type, PluginFormat format, int maxBatchSize) noexcept override;
58 |
59 | private:
60 | std::string _s_plugin_namespace;
61 | int _n_size_split;
62 | };
63 |
64 | class ChunkPluginCreator : public IPluginCreator
65 | {
66 | public:
67 | ChunkPluginCreator();
68 | ~ChunkPluginCreator() override = default;
69 | const char* getPluginName()const noexcept override;
70 | const char* getPluginVersion() const noexcept override;
71 | const PluginFieldCollection* getFieldNames()noexcept override;
72 | IPluginV2* createPlugin(const char* name, const PluginFieldCollection* fc)noexcept override;
73 | IPluginV2* deserializePlugin(const char* name, const void* serialData, size_t serialLength)noexcept override;
74 | void setPluginNamespace(const char* libNamespace)noexcept override;
75 | const char* getPluginNamespace() const noexcept override;
76 | private:
77 | std::string _s_name_space;
78 | static PluginFieldCollection _fc;
79 | static std::vector _vec_plugin_attributes;
80 | };
81 |
82 | }//nampespace nvinfer1
83 |
84 |
85 | #endif
86 |
87 |
88 |
--------------------------------------------------------------------------------
/modules/class_detector.cpp:
--------------------------------------------------------------------------------
1 | #include "class_detector.h"
2 | #include "class_yolo_detector.hpp"
3 |
4 | class Detector::Impl
5 | {
6 | public:
7 | Impl() {}
8 |
9 | ~Impl(){}
10 |
11 | YoloDectector _detector;
12 | };
13 |
14 | Detector::Detector()
15 | {
16 | _impl = new Impl();
17 | }
18 |
19 | Detector::~Detector()
20 | {
21 | if (_impl)
22 | {
23 | delete _impl;
24 | _impl = nullptr;
25 | }
26 | }
27 |
28 | void Detector::init(const Config &config)
29 | {
30 | _impl->_detector.init(config);
31 | }
32 |
33 | void Detector::dump_profiling(void)
34 | {
35 | _impl->_detector.dump_profiling();
36 | }
37 |
38 | void Detector::detect(const std::vector &mat_image, std::vector &vec_batch_result, const bool cuda)
39 | {
40 | _impl->_detector.detect(mat_image, vec_batch_result, cuda);
41 | }
42 |
43 | void Detector::save_image(cv::Mat &img, const std::string &dir, const std::string &name)
44 | {
45 | _impl->_detector.save_image(img, dir, name);
46 | }
47 |
48 | void Detector::segment(const std::vector &mat_image, std::string filename)
49 | {
50 | _impl->_detector.segment(mat_image, filename);
51 | }
52 |
53 | void Detector::regress(const std::vector &mat_image, std::vector &vec_batch_result, std::string filename)
54 | {
55 | _impl->_detector.regress(mat_image, vec_batch_result, filename);
56 | }
57 |
58 | void Detector::draw_BBox(cv::Mat &img,Result result)
59 | {
60 | _impl->_detector.draw_BBox(img, result);
61 | }
62 |
--------------------------------------------------------------------------------
/modules/class_detector.h:
--------------------------------------------------------------------------------
1 | #ifndef CLASS_DETECTOR_H_
2 | #define CLASS_DETECTOR_H_
3 |
4 | #include "API.h"
5 | #include
6 | #include
7 |
8 | struct Result
9 | {
10 | int id = -1;
11 | float prob = 0.f;
12 | cv::Rect rect;
13 | };
14 |
15 | using BatchResult = std::vector;
16 |
17 | enum ModelType
18 | {
19 | YOLOV3,
20 | YOLOV4,
21 | YOLOV5
22 | };
23 |
24 | enum Precision
25 | {
26 | INT8 = 0,
27 | FP16,
28 | FP32
29 | };
30 |
31 | struct Config
32 | {
33 | std::string file_model_cfg = "configs/yolov4.cfg";
34 |
35 | std::string file_model_weights = "configs/yolov4.weights";
36 |
37 | float detect_thresh = 0.9;
38 |
39 | ModelType net_type = YOLOV4;
40 |
41 | Precision inference_precison = FP32;
42 |
43 | int gpu_id = 0;
44 |
45 | std::string calibration_image_list_file_txt = "configs/calibration_images.txt";
46 | int batch = 0;
47 | int width = 0;
48 | int height = 0;
49 | int dla = -1;
50 | };
51 |
52 | class API Detector
53 | {
54 | public:
55 | explicit Detector();
56 |
57 | ~Detector();
58 |
59 | void init(const Config &config);
60 | void save_image(cv::Mat &img, const std::string &dir, const std::string &name);
61 | void segment(const std::vector &mat_image, std::string filename);
62 | void regress(const std::vector &mat_image, std::vector &vec_batch_result, std::string filename);
63 | void detect(const std::vector &mat_image, std::vector &vec_batch_result, const bool cuda);
64 | void dump_profiling(void);
65 | void draw_BBox(cv::Mat &img, Result result);
66 | private:
67 |
68 | Detector(const Detector &);
69 | const Detector &operator =(const Detector &);
70 | class Impl;
71 | Impl *_impl;
72 | };
73 |
74 | #endif // !CLASS_QH_DETECTOR_H_
75 |
--------------------------------------------------------------------------------
/modules/colormap.hpp:
--------------------------------------------------------------------------------
1 | #define MAX_DISTANCE 120
2 | const unsigned char jet_colormap[MAX_DISTANCE][3] = {
3 | {0, 0, 127},
4 | {0, 0, 137},
5 | {0, 0, 146},
6 | {0, 0, 156},
7 | {0, 0, 166},
8 | {0, 0, 176},
9 | {0, 0, 185},
10 | {0, 0, 195},
11 | {0, 0, 205},
12 | {0, 0, 215},
13 | {0, 0, 224},
14 | {0, 0, 234},
15 | {0, 0, 244},
16 | {0, 0, 254},
17 | {0, 0, 255},
18 | {0, 1, 255},
19 | {0, 9, 255},
20 | {0, 18, 255},
21 | {0, 26, 255},
22 | {0, 35, 255},
23 | {0, 43, 255},
24 | {0, 52, 255},
25 | {0, 61, 255},
26 | {0, 69, 255},
27 | {0, 78, 255},
28 | {0, 86, 255},
29 | {0, 95, 255},
30 | {0, 103, 255},
31 | {0, 112, 255},
32 | {0, 121, 255},
33 | {0, 129, 255},
34 | {0, 138, 255},
35 | {0, 146, 255},
36 | {0, 155, 255},
37 | {0, 163, 255},
38 | {0, 172, 255},
39 | {0, 181, 255},
40 | {0, 189, 255},
41 | {0, 198, 255},
42 | {0, 206, 255},
43 | {0, 215, 255},
44 | {0, 223, 251},
45 | {2, 232, 244},
46 | {9, 241, 237},
47 | {16, 249, 230},
48 | {23, 255, 223},
49 | {30, 255, 216},
50 | {36, 255, 209},
51 | {43, 255, 202},
52 | {50, 255, 195},
53 | {57, 255, 189},
54 | {64, 255, 182},
55 | {71, 255, 175},
56 | {78, 255, 168},
57 | {85, 255, 161},
58 | {92, 255, 154},
59 | {99, 255, 147},
60 | {106, 255, 140},
61 | {113, 255, 133},
62 | {119, 255, 126},
63 | {126, 255, 119},
64 | {133, 255, 113},
65 | {140, 255, 106},
66 | {147, 255, 99},
67 | {154, 255, 92},
68 | {161, 255, 85},
69 | {168, 255, 78},
70 | {175, 255, 71},
71 | {182, 255, 64},
72 | {189, 255, 57},
73 | {195, 255, 50},
74 | {202, 255, 43},
75 | {209, 255, 36},
76 | {216, 255, 30},
77 | {223, 255, 23},
78 | {230, 255, 16},
79 | {237, 255, 9},
80 | {244, 248, 2},
81 | {251, 240, 0},
82 | {255, 232, 0},
83 | {255, 224, 0},
84 | {255, 216, 0},
85 | {255, 208, 0},
86 | {255, 200, 0},
87 | {255, 192, 0},
88 | {255, 184, 0},
89 | {255, 176, 0},
90 | {255, 168, 0},
91 | {255, 161, 0},
92 | {255, 153, 0},
93 | {255, 145, 0},
94 | {255, 137, 0},
95 | {255, 129, 0},
96 | {255, 121, 0},
97 | {255, 113, 0},
98 | {255, 105, 0},
99 | {255, 97, 0},
100 | {255, 89, 0},
101 | {255, 81, 0},
102 | {255, 73, 0},
103 | {255, 65, 0},
104 | {255, 57, 0},
105 | {255, 49, 0},
106 | {255, 41, 0},
107 | {255, 34, 0},
108 | {255, 26, 0},
109 | {254, 18, 0},
110 | {244, 10, 0},
111 | {234, 2, 0},
112 | {224, 0, 0},
113 | {215, 0, 0},
114 | {205, 0, 0},
115 | {195, 0, 0},
116 | {185, 0, 0},
117 | {176, 0, 0},
118 | {166, 0, 0},
119 | {156, 0, 0},
120 | {146, 0, 0},
121 | {137, 0, 0},
122 | {127, 0, 0}
123 | };
124 |
125 | unsigned char magma_colormap[MAX_DISTANCE][3] = {
126 | {251, 252, 191},
127 | {251, 249, 187},
128 | {252, 245, 183},
129 | {252, 241, 179},
130 | {252, 238, 176},
131 | {252, 234, 172},
132 | {252, 230, 168},
133 | {253, 225, 163},
134 | {253, 221, 159},
135 | {253, 218, 156},
136 | {253, 214, 152},
137 | {253, 210, 149},
138 | {253, 207, 146},
139 | {254, 203, 142},
140 | {254, 198, 137},
141 | {254, 194, 134},
142 | {254, 190, 131},
143 | {254, 187, 128},
144 | {254, 183, 125},
145 | {254, 179, 123},
146 | {254, 174, 118},
147 | {254, 170, 116},
148 | {254, 166, 113},
149 | {253, 162, 111},
150 | {253, 159, 108},
151 | {253, 155, 106},
152 | {253, 151, 104},
153 | {252, 146, 101},
154 | {252, 142, 99},
155 | {251, 138, 98},
156 | {251, 134, 96},
157 | {250, 130, 95},
158 | {250, 127, 94},
159 | {249, 123, 93},
160 | {248, 117, 92},
161 | {247, 113, 91},
162 | {246, 110, 91},
163 | {245, 106, 91},
164 | {243, 103, 91},
165 | {242, 99, 92},
166 | {239, 94, 93},
167 | {238, 91, 94},
168 | {236, 88, 95},
169 | {234, 85, 96},
170 | {231, 82, 98},
171 | {229, 80, 99},
172 | {226, 77, 101},
173 | {222, 74, 103},
174 | {220, 72, 105},
175 | {217, 70, 106},
176 | {214, 68, 108},
177 | {211, 66, 109},
178 | {208, 65, 111},
179 | {203, 62, 113},
180 | {200, 61, 114},
181 | {197, 60, 116},
182 | {194, 58, 117},
183 | {190, 57, 118},
184 | {187, 56, 119},
185 | {184, 55, 120},
186 | {179, 53, 122},
187 | {176, 52, 123},
188 | {172, 51, 123},
189 | {169, 50, 124},
190 | {166, 49, 125},
191 | {163, 48, 126},
192 | {159, 47, 126},
193 | {154, 45, 127},
194 | {151, 44, 127},
195 | {148, 43, 128},
196 | {145, 42, 128},
197 | {141, 41, 128},
198 | {138, 40, 129},
199 | {133, 38, 129},
200 | {130, 37, 129},
201 | {127, 36, 129},
202 | {124, 35, 129},
203 | {121, 34, 129},
204 | {118, 33, 129},
205 | {115, 31, 129},
206 | {110, 30, 129},
207 | {107, 28, 128},
208 | {104, 27, 128},
209 | {101, 26, 128},
210 | {97, 24, 127},
211 | {94, 23, 127},
212 | {90, 21, 126},
213 | {87, 20, 125},
214 | {83, 19, 124},
215 | {80, 18, 123},
216 | {77, 17, 122},
217 | {74, 16, 121},
218 | {71, 15, 119},
219 | {66, 15, 116},
220 | {62, 15, 114},
221 | {59, 15, 111},
222 | {55, 15, 108},
223 | {52, 16, 104},
224 | {48, 16, 101},
225 | {45, 16, 96},
226 | {40, 17, 89},
227 | {37, 17, 85},
228 | {34, 17, 80},
229 | {31, 17, 75},
230 | {28, 16, 70},
231 | {26, 16, 65},
232 | {22, 14, 58},
233 | {20, 13, 53},
234 | {17, 12, 49},
235 | {15, 11, 44},
236 | {13, 10, 40},
237 | {11, 8, 36},
238 | {9, 7, 31},
239 | {6, 5, 25},
240 | {4, 4, 21},
241 | {3, 3, 17},
242 | {2, 2, 13},
243 | {1, 1, 9},
244 | {0, 0, 6},
245 | {0, 0, 3},
246 | };
247 |
--------------------------------------------------------------------------------
/modules/detect.cu:
--------------------------------------------------------------------------------
1 | //sys
2 | #include
3 | #include
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include
9 | #include
10 | //my
11 | #include "detect.h"
12 |
13 | #define NV_CUDA_CHECK(status) \
14 | { \
15 | if (status != 0) \
16 | { \
17 | std::cout << "Cuda failure: " << cudaGetErrorString(status) << " in file " << __FILE__ \
18 | << " at line " << __LINE__ << std::endl; \
19 | abort(); \
20 | } \
21 | }
22 |
23 | namespace nvinfer1
24 | {
25 | Detect::Detect()
26 | {
27 | }
28 |
29 | Detect::Detect(const void* data, size_t length)
30 | {
31 | const char *d = reinterpret_cast(data), *a = d;
32 | read(d,_n_anchor);
33 | read(d,_n_classes);
34 | read(d,_n_grid_h);
35 | read(d, _n_grid_w);
36 | read(d, _n_output_size);
37 | //printf("anchor:%d,classes:%d,gh:%d,gw:%d,size:%d\n", _n_anchor, _n_classes, _n_grid_h, _n_grid_w, _n_output_size);
38 | assert(d == a + length);
39 | }
40 |
41 | Detect::Detect(const uint32_t n_anchor_, const uint32_t n_classes_,
42 | const uint32_t n_grid_h_, const uint32_t n_grid_w_/*,
43 | const uint32_t &n_stride_h_, const uint32_t &n_stride_w_*/):
44 | _n_anchor(n_anchor_),
45 | _n_classes(n_classes_),
46 | _n_grid_h(n_grid_h_),
47 | _n_grid_w(n_grid_w_)
48 | {
49 | _n_output_size = (5 + _n_classes)*_n_anchor*_n_grid_h*_n_grid_w;
50 | }
51 | Detect::~Detect()
52 | {}
53 |
54 | inline __device__ float sigmoidGPU(const float& x) { return 1.0f / (1.0f + __expf(-x)); }
55 |
56 | __global__ void gpu_detect_layer(const float *input_,
57 | float* output_,
58 | const uint32_t n_grid_h_,
59 | const uint32_t n_grid_w_,
60 | const uint32_t n_classes_,
61 | const uint32_t n_anchor_)
62 | {
63 | uint32_t x_id = blockIdx.x * blockDim.x + threadIdx.x;
64 | uint32_t y_id = blockIdx.y * blockDim.y + threadIdx.y;
65 | uint32_t z_id = blockIdx.z * blockDim.z + threadIdx.z;
66 |
67 | if ((x_id >= n_grid_w_) || (y_id >= n_grid_h_) || (z_id >= n_anchor_))
68 | {
69 | return;
70 | }
71 | // printf("grid_h:%d,grid_w:%d,class:%d,anchor:%d\n", n_grid_h_, n_grid_w_, n_classes_, n_anchor_);
72 | const int numGridCells = n_grid_h_ * n_grid_w_;
73 | const int bbindex = y_id * n_grid_w_ + x_id;
74 |
75 | output_[bbindex + numGridCells * (z_id * (5 + n_classes_) + 0)]
76 | = 2.f * sigmoidGPU(input_[bbindex + numGridCells * (z_id * (5 + n_classes_) + 0)])-0.5f;
77 |
78 | output_[bbindex + numGridCells * (z_id * (5 + n_classes_) + 1)]
79 | = 2.f * sigmoidGPU(input_[bbindex + numGridCells * (z_id * (5 + n_classes_) + 1)])-0.5f;
80 |
81 | float w = 2.f * sigmoidGPU(input_[bbindex + numGridCells * (z_id * (5 + n_classes_) + 2)]);
82 | output_[bbindex + numGridCells * (z_id * (5 + n_classes_) + 2)]
83 | = w*w;
84 |
85 | float h = 2.f* sigmoidGPU(input_[bbindex + numGridCells * (z_id * (5 + n_classes_) + 3)]);
86 | output_[bbindex + numGridCells * (z_id * (5 + n_classes_) + 3)]
87 | = h*h;
88 |
89 | output_[bbindex + numGridCells * (z_id * (5 + n_classes_) + 4)]
90 | = sigmoidGPU(input_[bbindex + numGridCells * (z_id * (5 + n_classes_) + 4)]);
91 | for (uint32_t i = 0; i < n_classes_; ++i)
92 | {
93 | output_[bbindex + numGridCells * (z_id * (5 + n_classes_) + (5 + i))]
94 | = sigmoidGPU(input_[bbindex + numGridCells * (z_id * (5 + n_classes_) + (5 + i))]);
95 | }
96 | }
97 |
98 | cudaError_t cuda_detect_layer(const void* input_,
99 | void* output_,
100 | const uint32_t& batch_size_,
101 | const uint32_t& grid_h_,
102 | const uint32_t& grid_w_,
103 | const uint32_t& n_classes_,
104 | const uint32_t& n_anchor_,
105 | uint64_t n_output_size_,
106 | cudaStream_t stream_)
107 | {
108 | dim3 threads_per_block(16, 16, 4);
109 | dim3 number_of_blocks((grid_w_ / threads_per_block.x) + 1,
110 | (grid_h_ / threads_per_block.y) + 1,
111 | (n_anchor_ / threads_per_block.z) + 1);
112 | for (int batch = 0; batch < batch_size_; ++batch)
113 | {
114 | gpu_detect_layer << > >(
115 | reinterpret_cast(input_) + (batch * n_output_size_),
116 | reinterpret_cast(output_) + (batch * n_output_size_),
117 | grid_h_,
118 | grid_w_,
119 | n_classes_,
120 | n_anchor_);
121 | }
122 | return cudaGetLastError();
123 | }
124 |
125 | int Detect::enqueue(int batchSize,
126 | const void* const* inputs,
127 | void* const* outputs,
128 | void* workspace,
129 | cudaStream_t stream) noexcept
130 | {
131 | NV_CUDA_CHECK(cuda_detect_layer(inputs[0], outputs[0], batchSize, _n_grid_h, _n_grid_w, _n_classes, _n_anchor, _n_output_size, stream));
132 | return 0;
133 | }
134 |
135 |
136 | bool Detect::supportsFormat(DataType type, PluginFormat format) const noexcept
137 | {
138 | return (type == DataType::kFLOAT && format == PluginFormat::kLINEAR);
139 | }
140 |
141 | void Detect::configureWithFormat(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, DataType type, PluginFormat format, int maxBatchSize) noexcept
142 | {
143 |
144 | }
145 |
146 | size_t Detect::getSerializationSize() const noexcept
147 | {
148 | return sizeof(_n_anchor) + sizeof(_n_classes) + sizeof(_n_grid_h) + sizeof(_n_grid_w)
149 | + sizeof(_n_output_size);
150 | }
151 |
152 | void Detect::serialize(void *buffer) const noexcept
153 | {
154 | char *d = static_cast(buffer), *a = d;
155 | write(d,_n_anchor);
156 | write(d, _n_classes);
157 | write(d, _n_grid_h);
158 | write(d, _n_grid_w);
159 | write(d, _n_output_size);
160 | assert(d == a + getSerializationSize());
161 | }
162 |
163 | void Detect::configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput)
164 | {
165 |
166 | }
167 | IPluginV2* Detect::clone() const noexcept
168 | {
169 | Detect *p = new Detect(_n_anchor,_n_classes,_n_grid_h,_n_grid_w);
170 | p->setPluginNamespace(_s_plugin_namespace.c_str());
171 | return p;
172 | }
173 |
174 |
175 | //
176 | PluginFieldCollection DetectPluginCreator::_fc{};
177 | std::vector DetectPluginCreator::_vec_plugin_attributes;
178 |
179 | DetectPluginCreator::DetectPluginCreator()
180 | {
181 | _vec_plugin_attributes.clear();
182 | _fc.nbFields = _vec_plugin_attributes.size();
183 | _fc.fields = _vec_plugin_attributes.data();
184 | }
185 |
186 | const char* DetectPluginCreator::getPluginName() const noexcept
187 | {
188 | return "DETECT_TRT";
189 | }
190 |
191 | const char* DetectPluginCreator::getPluginVersion() const noexcept
192 | {
193 | return "1.0";
194 | }
195 |
196 | const PluginFieldCollection* DetectPluginCreator::getFieldNames() noexcept
197 | {
198 | return &_fc;
199 | }
200 |
201 | IPluginV2* DetectPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) noexcept
202 | {
203 | Detect* obj = new Detect();
204 | obj->setPluginNamespace(_s_name_space.c_str());
205 | return obj;
206 | }
207 |
208 | IPluginV2* DetectPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) noexcept
209 | {
210 | Detect* obj = new Detect(serialData, serialLength);
211 | obj->setPluginNamespace(_s_name_space.c_str());
212 | return obj;
213 | }
214 |
215 | void DetectPluginCreator::setPluginNamespace(const char* libNamespace) noexcept
216 | {
217 | _s_name_space = libNamespace;
218 | }
219 |
220 | const char* DetectPluginCreator::getPluginNamespace() const noexcept
221 | {
222 | return _s_name_space.c_str();
223 | }
224 | }//end namespace nvinfer1
225 |
--------------------------------------------------------------------------------
/modules/detect.h:
--------------------------------------------------------------------------------
1 | #ifndef _DETECT_H_
2 | #define _DETECT_H_
3 |
4 | #include
5 | #include
6 | #include "NvInfer.h"
7 |
8 | namespace nvinfer1
9 | {
10 | template
11 | void write(char*& buffer, const T& val)
12 | {
13 | *reinterpret_cast(buffer) = val;
14 | buffer += sizeof(T);
15 | }
16 |
17 | template
18 | void read(const char*& buffer, T& val)
19 | {
20 | val = *reinterpret_cast(buffer);
21 | buffer += sizeof(T);
22 | }
23 |
24 | class Detect :public IPluginV2
25 | {
26 | public:
27 | Detect();
28 | Detect(const void* data, size_t length);
29 | Detect(const uint32_t n_anchor_, const uint32_t _n_classes_,
30 | const uint32_t n_grid_h_, const uint32_t n_grid_w_/*,
31 | const uint32_t &n_stride_h_, const uint32_t &n_stride_w_*/);
32 | ~Detect();
33 | int getNbOutputs()const noexcept override
34 | {
35 | return 1;
36 | }
37 | Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) noexcept override
38 | {
39 | return inputs[0];
40 | }
41 | int initialize() noexcept override
42 | {
43 | return 0;
44 | }
45 | void terminate() noexcept override
46 | {
47 | }
48 | size_t getWorkspaceSize(int maxBatchSize) const noexcept override
49 | {
50 | return 0;
51 | }
52 | int enqueue(int batchSize, const void* const* inputs, void* const* outputs, void* workspace,
53 | cudaStream_t stream) noexcept override;
54 |
55 | bool supportsFormat(DataType type, PluginFormat format) const noexcept override;
56 | void configureWithFormat(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, DataType type, PluginFormat format, int maxBatchSize) noexcept override;
57 |
58 | size_t getSerializationSize() const noexcept override;
59 | void serialize(void* buffer) const noexcept override;
60 | const char* getPluginType() const noexcept override
61 | {
62 | return "DETECT_TRT";
63 | }
64 | const char* getPluginVersion() const noexcept override
65 | {
66 | return "1.0";
67 | }
68 | void destroy() noexcept override
69 | {
70 | delete this;
71 | }
72 | void setPluginNamespace(const char* pluginNamespace) noexcept override
73 | {
74 | _s_plugin_namespace = pluginNamespace;
75 | }
76 | const char* getPluginNamespace() const noexcept override
77 | {
78 | return _s_plugin_namespace.c_str();
79 | }
80 | DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const noexcept
81 | {
82 | return DataType::kFLOAT;
83 | }
84 | bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const noexcept
85 | {
86 | return false;
87 | }
88 | bool canBroadcastInputAcrossBatch(int inputIndex) const noexcept
89 | {
90 | return false;
91 | }
92 | void attachToContext(
93 | cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator)
94 | {}
95 | void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) ;
96 | void detachFromContext()
97 | {}
98 | bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const noexcept
99 | {
100 | return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT;
101 | }
102 | IPluginV2* clone() const noexcept override;
103 | private:
104 |
105 | uint32_t _n_anchor;
106 | uint32_t _n_classes;
107 | uint32_t _n_grid_h;
108 | uint32_t _n_grid_w;
109 | //uint32_t _n_stride_h;
110 | // uint32_t _n_stride_w;
111 | uint64_t _n_output_size;
112 | std::string _s_plugin_namespace;
113 | }; //end detect
114 |
115 | class DetectPluginCreator : public IPluginCreator
116 | {
117 | public:
118 | DetectPluginCreator();
119 | ~DetectPluginCreator() override = default;
120 | const char* getPluginName()const noexcept override;
121 | const char* getPluginVersion() const noexcept override;
122 | const PluginFieldCollection* getFieldNames() noexcept override;
123 | IPluginV2* createPlugin(const char* name, const PluginFieldCollection* fc) noexcept override;
124 | IPluginV2* deserializePlugin(const char* name, const void* serialData, size_t serialLength) noexcept override;
125 | void setPluginNamespace(const char* libNamespace) noexcept override;
126 | const char* getPluginNamespace() const noexcept override;
127 | private:
128 | std::string _s_name_space;
129 | static PluginFieldCollection _fc;
130 | static std::vector _vec_plugin_attributes;
131 | };//end detect creator
132 |
133 | }//end namespace nvinfer1
134 |
135 |
136 |
137 | #endif
138 |
--------------------------------------------------------------------------------
/modules/ds_image.cpp:
--------------------------------------------------------------------------------
1 | /**
2 | MIT License
3 |
4 | Copyright (c) 2018 NVIDIA CORPORATION. All rights reserved.
5 |
6 | Permission is hereby granted, free of charge, to any person obtaining a copy
7 | of this software and associated documentation files (the "Software"), to deal
8 | in the Software without restriction, including without limitation the rights
9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | copies of the Software, and to permit persons to whom the Software is
11 | furnished to do so, subject to the following conditions:
12 |
13 | The above copyright notice and this permission notice shall be included in all
14 | copies or substantial portions of the Software.
15 |
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | SOFTWARE.
23 | *
24 | */
25 | #include "ds_image.h"
26 | #include
27 |
28 | DsImage::DsImage() :
29 | m_Height(0),
30 | m_Width(0),
31 | m_XOffset(0),
32 | m_YOffset(0),
33 | m_ScalingFactor(0.0),
34 | m_RNG(cv::RNG(unsigned(std::time(0)))),
35 | m_ImageName()
36 | {
37 | }
38 |
39 | DsImage::DsImage(const cv::Mat& mat_image_, const std::string &s_net_type_, const int& inputH, const int& inputW) :
40 | m_Height(0),
41 | m_Width(0),
42 | m_XOffset(0),
43 | m_YOffset(0),
44 | m_ScalingFactor(0.0),
45 | m_RNG(cv::RNG(unsigned(std::time(0)))),
46 | m_ImageName()
47 | {
48 | m_OrigImage = mat_image_;
49 | m_Height = m_OrigImage.rows;
50 | m_Width = m_OrigImage.cols;
51 | if (!m_OrigImage.data || m_OrigImage.cols <= 0 || m_OrigImage.rows <= 0)
52 | {
53 | std::cout << "empty image !"<< std::endl;
54 | assert(0);
55 | }
56 | if (m_OrigImage.channels() != 3)
57 | {
58 | std::cout << "Non RGB images are not supported "<< std::endl;
59 | assert(0);
60 | }
61 | if ("yolov5" == s_net_type_)
62 | {
63 | // resize the DsImage with scale
64 | float r = std::min(static_cast(inputH) / static_cast(m_Height), static_cast(inputW) / static_cast(m_Width));
65 | int resizeH = (std::round(m_Height*r));
66 | int resizeW = (std::round(m_Width*r));
67 |
68 | // Additional checks for images with non even dims
69 | if ((inputW - resizeW) % 2) resizeW--;
70 | if ((inputH - resizeH) % 2) resizeH--;
71 | assert((inputW - resizeW) % 2 == 0);
72 | assert((inputH - resizeH) % 2 == 0);
73 |
74 | m_XOffset = (inputW - resizeW) / 2;
75 | m_YOffset = (inputH - resizeH) / 2;
76 |
77 | assert(2 * m_XOffset + resizeW == inputW);
78 | assert(2 * m_YOffset + resizeH == inputH);
79 |
80 | // resizing
81 | cv::resize(m_OrigImage, m_LetterboxImage, cv::Size(resizeW, resizeH), 0, 0, cv::INTER_CUBIC);
82 | // letterboxing
83 | cv::copyMakeBorder(m_LetterboxImage, m_LetterboxImage, m_YOffset, m_YOffset, m_XOffset,
84 | m_XOffset, cv::BORDER_CONSTANT, cv::Scalar(128, 128, 128));
85 | // cv::imwrite("letter.jpg", m_LetterboxImage);
86 | // converting to RGB
87 | //cv::cvtColor(m_LetterboxImage, m_LetterboxImage, cv::COLOR_BGR2RGB);
88 | }
89 | else
90 | {
91 | cv::resize(m_OrigImage, m_LetterboxImage, cv::Size(inputW, inputH), 0, 0, cv::INTER_CUBIC);
92 | // converting to RGB
93 | //cv::cvtColor(m_LetterboxImage, m_LetterboxImage, cv::COLOR_BGR2RGB);
94 | }
95 |
96 | }
97 | DsImage::DsImage(const std::string& path, const std::string &s_net_type_, const int& inputH, const int& inputW) :
98 | m_Height(0),
99 | m_Width(0),
100 | m_XOffset(0),
101 | m_YOffset(0),
102 | m_ScalingFactor(0.0),
103 | m_RNG(cv::RNG(unsigned(std::time(0)))),
104 | m_ImageName()
105 | {
106 | m_ImageName = std::experimental::filesystem::path(path).stem().string();
107 | m_OrigImage = cv::imread(path, cv::IMREAD_UNCHANGED);
108 | m_Height = m_OrigImage.rows;
109 | m_Width = m_OrigImage.cols;
110 | if (!m_OrigImage.data || m_OrigImage.cols <= 0 || m_OrigImage.rows <= 0)
111 | {
112 | std::cout << "Unable to open image : " << path << std::endl;
113 | assert(0);
114 | }
115 |
116 | if (m_OrigImage.channels() != 3)
117 | {
118 | std::cout << "Non RGB images are not supported : " << path << std::endl;
119 | assert(0);
120 | }
121 |
122 | if ("yolov5" == s_net_type_)
123 | {
124 | // resize the DsImage with scale
125 | float dim = std::max(m_Height, m_Width);
126 | int resizeH = ((m_Height / dim) * inputH);
127 | int resizeW = ((m_Width / dim) * inputW);
128 | m_ScalingFactor = static_cast(resizeH) / static_cast(m_Height);
129 | float m_ScalingFactorw = static_cast(resizeW) / static_cast(m_Width);
130 |
131 | // Additional checks for images with non even dims
132 | if ((inputW - resizeW) % 2) resizeW--;
133 | if ((inputH - resizeH) % 2) resizeH--;
134 | assert((inputW - resizeW) % 2 == 0);
135 | assert((inputH - resizeH) % 2 == 0);
136 |
137 | m_XOffset = (inputW - resizeW) / 2;
138 | m_YOffset = (inputH - resizeH) / 2;
139 |
140 | assert(2 * m_XOffset + resizeW == inputW);
141 | assert(2 * m_YOffset + resizeH == inputH);
142 |
143 | // resizing
144 | cv::resize(m_OrigImage, m_LetterboxImage, cv::Size(resizeW, resizeH), 0, 0, cv::INTER_CUBIC);
145 | // letterboxing
146 | cv::copyMakeBorder(m_LetterboxImage, m_LetterboxImage, m_YOffset, m_YOffset, m_XOffset,
147 | m_XOffset, cv::BORDER_CONSTANT, cv::Scalar(128, 128, 128));
148 | // cv::imwrite("letter.jpg", m_LetterboxImage);
149 | // converting to RGB
150 | // cv::cvtColor(m_LetterboxImage, m_LetterboxImage, cv::COLOR_BGR2RGB);
151 | }
152 | else
153 | {
154 | cv::resize(m_OrigImage, m_LetterboxImage, cv::Size(inputW, inputH), 0, 0, cv::INTER_CUBIC);
155 | // converting to RGB
156 | // cv::cvtColor(m_LetterboxImage, m_LetterboxImage, cv::COLOR_BGR2RGB);
157 | }
158 | }
159 |
160 | void DsImage::letterbox(const int& inputH, const int& inputW)
161 | {
162 | //m_OrigImage.copyTo(m_MarkedImage);
163 | m_Height = m_OrigImage.rows;
164 | m_Width = m_OrigImage.cols;
165 |
166 | // resize the DsImage with scale
167 | float dim = std::max(m_Height, m_Width);
168 | int resizeH = ((m_Height / dim) * inputH);
169 | int resizeW = ((m_Width / dim) * inputW);
170 | m_ScalingFactor = static_cast(resizeH) / static_cast(m_Height);
171 | float m_ScalingFactorw = static_cast(resizeW) / static_cast(m_Width);
172 |
173 | // Additional checks for images with non even dims
174 | if ((inputW - resizeW) % 2) resizeW--;
175 | if ((inputH - resizeH) % 2) resizeH--;
176 | assert((inputW - resizeW) % 2 == 0);
177 | assert((inputH - resizeH) % 2 == 0);
178 |
179 | m_XOffset = (inputW - resizeW) / 2;
180 | m_YOffset = (inputH - resizeH) / 2;
181 |
182 | assert(2 * m_XOffset + resizeW == inputW);
183 | assert(2 * m_YOffset + resizeH == inputH);
184 |
185 | // resizing
186 | cv::resize(m_OrigImage, m_LetterboxImage, cv::Size(resizeW, resizeH), 0, 0, cv::INTER_CUBIC);
187 | // letterboxing
188 | cv::copyMakeBorder(m_LetterboxImage, m_LetterboxImage, m_YOffset, m_YOffset, m_XOffset,
189 | m_XOffset, cv::BORDER_CONSTANT, cv::Scalar(128, 128, 128));
190 | // cv::imwrite("letter.jpg", m_LetterboxImage);
191 | // converting to RGB
192 | cv::cvtColor(m_LetterboxImage, m_LetterboxImage, cv::COLOR_BGR2RGB);
193 | }
194 |
195 |
196 | void DsImage::addBBox(BBoxInfo box, const std::string& labelName)
197 | {
198 | m_Bboxes.push_back(box);
199 | const int x = box.box.x1;
200 | const int y = box.box.y1;
201 | const int w = box.box.x2 - box.box.x1;
202 | const int h = box.box.y2 - box.box.y1;
203 | const cv::Scalar color
204 | = cv::Scalar(m_RNG.uniform(0, 255), m_RNG.uniform(0, 255), m_RNG.uniform(0, 255));
205 |
206 | cv::rectangle(m_MarkedImage, cv::Rect(x, y, w, h), color, 1);
207 | const cv::Size tsize
208 | = cv::getTextSize(labelName, cv::FONT_HERSHEY_COMPLEX_SMALL, 0.5, 1, nullptr);
209 | cv::rectangle(m_MarkedImage, cv::Rect(x, y, tsize.width + 3, tsize.height + 4), color, -1);
210 | cv::putText(m_MarkedImage, labelName.c_str(), cv::Point(x, y + tsize.height),
211 | cv::FONT_HERSHEY_COMPLEX_SMALL, 0.5, cv::Scalar(255, 255, 255), 1);
212 | }
213 |
214 | void DsImage::showImage() const
215 | {
216 | cv::namedWindow(m_ImageName);
217 | cv::imshow(m_ImageName.c_str(), m_MarkedImage);
218 | cv::waitKey(0);
219 | }
220 |
221 | void DsImage::saveImageJPEG(const std::string& dirPath) const
222 | {
223 | cv::imwrite(dirPath + m_ImageName + ".jpeg", m_MarkedImage);
224 | }
225 | std::string DsImage::exportJson() const
226 | {
227 | if (m_Bboxes.empty()) return "";
228 | std::stringstream json;
229 | json.precision(2);
230 | json << std::fixed;
231 | for (uint32_t i = 0; i < m_Bboxes.size(); ++i)
232 | {
233 | json << "\n{\n";
234 | json << " \"image_id\" : " << std::stoi(m_ImageName) << ",\n";
235 | json << " \"category_id\" : " << m_Bboxes.at(i).classId << ",\n";
236 | json << " \"bbox\" : ";
237 | json << "[" << m_Bboxes.at(i).box.x1 << ", " << m_Bboxes.at(i).box.y1 << ", ";
238 | json << m_Bboxes.at(i).box.x2 - m_Bboxes.at(i).box.x1 << ", "
239 | << m_Bboxes.at(i).box.y2 - m_Bboxes.at(i).box.y1 << "],\n";
240 | json << " \"score\" : " << m_Bboxes.at(i).prob << "\n";
241 | if (i != m_Bboxes.size() - 1)
242 | json << "},";
243 | else
244 | json << "}";
245 | }
246 | return json.str();
247 | }
--------------------------------------------------------------------------------
/modules/ds_image.h:
--------------------------------------------------------------------------------
1 | /**
2 | MIT License
3 |
4 | Copyright (c) 2018 NVIDIA CORPORATION. All rights reserved.
5 |
6 | Permission is hereby granted, free of charge, to any person obtaining a copy
7 | of this software and associated documentation files (the "Software"), to deal
8 | in the Software without restriction, including without limitation the rights
9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | copies of the Software, and to permit persons to whom the Software is
11 | furnished to do so, subject to the following conditions:
12 |
13 | The above copyright notice and this permission notice shall be included in all
14 | copies or substantial portions of the Software.
15 |
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | SOFTWARE.
23 | *
24 | */
25 | #ifndef __IMAGE_H__
26 | #define __IMAGE_H__
27 |
28 | #include "trt_utils.h"
29 |
30 | struct BBoxInfo;
31 |
32 | class DsImage
33 | {
34 | public:
35 | DsImage();
36 | DsImage(const std::string& path, const std::string &s_net_type_, const int& inputH, const int& inputW);
37 | DsImage(const cv::Mat& mat_image_, const std::string &s_net_type_, const int& inputH, const int& inputW);
38 | int getImageHeight() const { return m_Height; }
39 | int getImageWidth() const { return m_Width; }
40 | cv::Mat getLetterBoxedImage() const { return m_LetterboxImage; }
41 | cv::Mat getOriginalImage() const { return m_OrigImage; }
42 | std::string getImageName() const { return m_ImageName; }
43 | void addBBox(BBoxInfo box, const std::string& labelName);
44 | void showImage() const;
45 | void saveImageJPEG(const std::string& dirPath) const;
46 | std::string exportJson() const;
47 | void letterbox(const int& inputH, const int& inputW);
48 | private:
49 | int m_Height;
50 | int m_Width;
51 | int m_XOffset;
52 | int m_YOffset;
53 | float m_ScalingFactor;
54 | std::string m_ImagePath;
55 | cv::RNG m_RNG;
56 | std::string m_ImageName;
57 | std::vector m_Bboxes;
58 |
59 | // unaltered original Image
60 | cv::Mat m_OrigImage;
61 | // letterboxed Image given to the network as input
62 | cv::Mat m_LetterboxImage;
63 | // final image marked with the bounding boxes
64 | cv::Mat m_MarkedImage;
65 | };
66 |
67 | #endif
68 |
--------------------------------------------------------------------------------
/modules/err.log:
--------------------------------------------------------------------------------
1 | make: *** ターゲットが指定されておらず, makefile も見つかりません. 中止.
2 |
--------------------------------------------------------------------------------
/modules/hardswish.cu:
--------------------------------------------------------------------------------
1 | //sys
2 | #include
3 | #include
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include
9 | #include
10 | //my
11 | #include "hardswish.h"
12 |
13 | #define NV_CUDA_CHECK(status) \
14 | { \
15 | if (status != 0) \
16 | { \
17 | std::cout << "Cuda failure: " << cudaGetErrorString(status) << " in file " << __FILE__ \
18 | << " at line " << __LINE__ << std::endl; \
19 | abort(); \
20 | } \
21 | }
22 |
23 | namespace nvinfer1
24 | {
25 | Hardswish::Hardswish()
26 | {
27 | cudaDeviceProp prop;
28 | cudaGetDeviceProperties(&prop, 0);
29 | _n_max_thread_pre_block = prop.maxThreadsPerBlock;
30 | // printf("Hardswish():%d\n", _n_max_thread_pre_block);
31 | }
32 |
33 | Hardswish::Hardswish(const void* data, size_t length)
34 | {
35 | const char *d = reinterpret_cast(data), *a = d;
36 | r(d, _n_max_thread_pre_block);
37 | r(d, _n_output_size);
38 | // printf("r:threads:%d,size:%d\n", _n_max_thread_pre_block, _n_output_size);
39 | assert(d == a + length);
40 | }
41 |
42 | Hardswish::~Hardswish()
43 | {}
44 |
45 | __global__ void kernel_hardswish(const float *input_, float *output_, int n_data_size_)
46 | {
47 | int i = threadIdx.x + blockIdx.x * blockDim.x;
48 | if (i >= n_data_size_)return;
49 | if (input_[i] >= 3.0f)
50 | {
51 | output_[i] = input_[i];
52 | }
53 | else if (input_[i] <= -3.0f)
54 | {
55 | output_[i] = 0.0f;
56 | }
57 | else
58 | {
59 | output_[i] = input_[i] * (input_[i] + 3.0f) / 6.0f;
60 | }
61 | }
62 |
63 | cudaError_t cuda_hardswish_layer(const void* input_,
64 | void* output_,
65 | const int n_batch_size_,
66 | const int n_output_size_,
67 | const int threads_,
68 | cudaStream_t stream_)
69 | {
70 | int n_data_size = n_batch_size_ * n_output_size_;
71 | // printf("cuda_hardswish_layer:%d,size:%d\n", n_batch_size_, n_output_size_);
72 | kernel_hardswish << <(n_data_size + threads_ -1)/threads_, threads_, 0, stream_ >> >(
73 | reinterpret_cast(input_),
74 | reinterpret_cast(output_),
75 | n_data_size);
76 | return cudaGetLastError();
77 | }
78 |
79 | int Hardswish::enqueue(int batchSize,
80 | const void* const* inputs,
81 | void* const* outputs,
82 | void* workspace,
83 | cudaStream_t stream) noexcept
84 | {
85 | // printf("batch_size:%d,output_size:%d,threads:%d\n", batchSize, _n_output_size, _n_max_thread_pre_block);
86 | NV_CUDA_CHECK(cuda_hardswish_layer(inputs[0], outputs[0], batchSize, _n_output_size , _n_max_thread_pre_block, stream));
87 | return 0;
88 | }
89 |
90 | size_t Hardswish::getSerializationSize() const noexcept
91 | {
92 | return sizeof(_n_max_thread_pre_block) +sizeof(_n_output_size);
93 | }
94 |
95 | void Hardswish::serialize(void *buffer) const noexcept
96 | {
97 | char *d = static_cast(buffer), *a = d;
98 | w(d, _n_max_thread_pre_block);
99 | w(d, _n_output_size);
100 | // printf("serialize:%d,%d\n", _n_max_thread_pre_block, _n_output_size);
101 | assert(d == a + getSerializationSize());
102 | }
103 |
104 |
105 | bool Hardswish::supportsFormat(DataType type, PluginFormat format) const noexcept
106 | {
107 | return (type == DataType::kFLOAT && format == PluginFormat::kLINEAR);
108 | }
109 |
110 | void Hardswish::configureWithFormat(const Dims* inputDims, int nbInputs,
111 | const Dims* outputDims, int nbOutputs, DataType type, PluginFormat format, int maxBatchSize) noexcept
112 | {
113 |
114 | }
115 |
116 |
117 | void Hardswish::configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) noexcept
118 | {
119 |
120 | _n_output_size = in->dims.d[0] * in->dims.d[1] * in->dims.d[2];
121 | // printf("configurePlugin:%d,%d,%d\n", in->dims.d[0], in->dims.d[1], in->dims.d[2]);
122 | }
123 | IPluginV2* Hardswish::clone() const noexcept
124 | {
125 | Hardswish *p = new Hardswish();
126 | p->setPluginNamespace(_s_plugin_namespace.c_str());
127 | p->_n_max_thread_pre_block = _n_max_thread_pre_block;
128 | p->_n_output_size = _n_output_size;
129 | // printf("clone:%d,%d\n", _n_max_thread_pre_block, _n_output_size);
130 | return p;
131 | }
132 |
133 |
134 | //
135 | PluginFieldCollection HardswishPluginCreator::_fc{};
136 | std::vector HardswishPluginCreator::_vec_plugin_attributes;
137 |
138 | HardswishPluginCreator::HardswishPluginCreator()
139 | {
140 | _vec_plugin_attributes.clear();
141 | _fc.nbFields = _vec_plugin_attributes.size();
142 | _fc.fields = _vec_plugin_attributes.data();
143 | }
144 |
145 | const char* HardswishPluginCreator::getPluginName() const noexcept
146 | {
147 | return "HARDSWISH_TRT";
148 | }
149 |
150 | const char* HardswishPluginCreator::getPluginVersion() const noexcept
151 | {
152 | return "1.0";
153 | }
154 |
155 | const PluginFieldCollection* HardswishPluginCreator::getFieldNames() noexcept
156 | {
157 | return &_fc;
158 | }
159 |
160 | IPluginV2* HardswishPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) noexcept
161 | {
162 | Hardswish* obj = new Hardswish();
163 | obj->setPluginNamespace(_s_name_space.c_str());
164 | return obj;
165 | }
166 |
167 | IPluginV2* HardswishPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) noexcept
168 | {
169 | Hardswish* obj = new Hardswish(serialData, serialLength);
170 | obj->setPluginNamespace(_s_name_space.c_str());
171 | return obj;
172 | }
173 |
174 | void HardswishPluginCreator::setPluginNamespace(const char* libNamespace) noexcept
175 | {
176 | _s_name_space = libNamespace;
177 | }
178 |
179 | const char* HardswishPluginCreator::getPluginNamespace() const noexcept
180 | {
181 | return _s_name_space.c_str();
182 | }
183 | }//end namespace nvinfer1
184 |
--------------------------------------------------------------------------------
/modules/hardswish.h:
--------------------------------------------------------------------------------
1 |
2 | #ifndef _HARDSWISH_H_
3 | #define _HARDSWISH_H_
4 |
5 | #include
6 | #include
7 | #include "NvInfer.h"
8 |
9 | namespace nvinfer1
10 | {
11 | template
12 | void w(char*& buffer, const T& val)
13 | {
14 | *reinterpret_cast(buffer) = val;
15 | buffer += sizeof(T);
16 | }
17 |
18 | template
19 | void r(const char*& buffer, T& val)
20 | {
21 | val = *reinterpret_cast(buffer);
22 | buffer += sizeof(T);
23 | }
24 |
25 | class Hardswish :public IPluginV2
26 | {
27 | public:
28 | Hardswish();
29 | Hardswish(const void* data, size_t length);
30 | ~Hardswish();
31 | int getNbOutputs()const noexcept override
32 | {
33 | return 1;
34 | }
35 | Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) noexcept override
36 | {
37 | return inputs[0];
38 | }
39 | int initialize() noexcept override
40 | {
41 | return 0;
42 | }
43 | void terminate() noexcept override
44 | {
45 | }
46 | size_t getWorkspaceSize(int maxBatchSize) const noexcept override
47 | {
48 | return 0;
49 | }
50 |
51 | bool supportsFormat(DataType type, PluginFormat format) const noexcept override;
52 | void configureWithFormat(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, DataType type, PluginFormat format, int maxBatchSize) noexcept override;
53 |
54 | int enqueue(int batchSize, const void* const* inputs, void* const* outputs, void* workspace,
55 | cudaStream_t stream) noexcept override;
56 |
57 | size_t getSerializationSize() const noexcept override;
58 | void serialize(void* buffer) const noexcept override;
59 | const char* getPluginType() const noexcept override
60 | {
61 | return "HARDSWISH_TRT";
62 | }
63 | const char* getPluginVersion() const noexcept override
64 | {
65 | return "1.0";
66 | }
67 | void destroy() noexcept override
68 | {
69 | delete this;
70 | }
71 | void setPluginNamespace(const char* pluginNamespace) noexcept override
72 | {
73 | _s_plugin_namespace = pluginNamespace;
74 | }
75 | const char* getPluginNamespace() const noexcept override
76 | {
77 | return _s_plugin_namespace.c_str();
78 | }
79 | DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const noexcept
80 | {
81 | return DataType::kFLOAT;
82 | }
83 | bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const noexcept
84 | {
85 | return false;
86 | }
87 | bool canBroadcastInputAcrossBatch(int inputIndex) const noexcept
88 | {
89 | return false;
90 | }
91 | void attachToContext(
92 | cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) noexcept
93 | {}
94 | void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) noexcept;
95 | void detachFromContext() noexcept
96 | {}
97 | bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const noexcept
98 | {
99 | return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT;
100 | }
101 | IPluginV2* clone() const noexcept override;
102 | private:
103 |
104 | uint32_t _n_max_thread_pre_block;
105 | uint32_t _n_output_size;
106 | std::string _s_plugin_namespace;
107 | }; //end detect
108 |
109 | class HardswishPluginCreator : public IPluginCreator
110 | {
111 | public:
112 | HardswishPluginCreator();
113 | ~HardswishPluginCreator() override = default;
114 | const char* getPluginName()const noexcept override;
115 | const char* getPluginVersion() const noexcept override;
116 | const PluginFieldCollection* getFieldNames() noexcept override;
117 | IPluginV2* createPlugin(const char* name, const PluginFieldCollection* fc) noexcept override;
118 | IPluginV2* deserializePlugin(const char* name, const void* serialData, size_t serialLength) noexcept override;
119 | void setPluginNamespace(const char* libNamespace) noexcept override;
120 | const char* getPluginNamespace() const noexcept override;
121 | private:
122 | std::string _s_name_space;
123 | static PluginFieldCollection _fc;
124 | static std::vector _vec_plugin_attributes;
125 | };//end detect creator
126 |
127 | }//end namespace nvinfer1
128 |
129 |
130 |
131 | #endif
--------------------------------------------------------------------------------
/modules/kernel.cu:
--------------------------------------------------------------------------------
1 |
2 | #include
3 | #include
4 | #include
5 | #include
6 | #include
7 |
8 | inline __device__ float sigmoidGPU(const float& x) { return 1.0f / (1.0f + __expf(-x)); }
9 |
10 | __global__ void gpuYoloLayerV3(const float* input, float* output, const uint32_t grid_h_,
11 | const uint32_t grid_w_, const uint32_t numOutputClasses,
12 | const uint32_t numBBoxes)
13 | {
14 | uint32_t x_id = blockIdx.x * blockDim.x + threadIdx.x;
15 | uint32_t y_id = blockIdx.y * blockDim.y + threadIdx.y;
16 | uint32_t z_id = blockIdx.z * blockDim.z + threadIdx.z;
17 |
18 | if ((x_id >= grid_w_) || (y_id >= grid_h_) || (z_id >= numBBoxes))
19 | {
20 | return;
21 | }
22 |
23 | const int numGridCells = grid_h_ * grid_w_;
24 | const int bbindex = y_id * grid_w_ + x_id;
25 |
26 | output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 0)]
27 | = sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 0)]);
28 |
29 | output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 1)]
30 | = sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 1)]);
31 |
32 | output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 2)]
33 | = __expf(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 2)]);
34 |
35 | output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 3)]
36 | = __expf(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 3)]);
37 |
38 | output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]
39 | = sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]);
40 |
41 | for (uint32_t i = 0; i < numOutputClasses; ++i)
42 | {
43 | output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + (5 + i))]
44 | = sigmoidGPU(input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + (5 + i))]);
45 | }
46 | }
47 |
48 |
49 | //Scaled YOLOv4 format
50 | //https://alexeyab84.medium.com/scaled-yolo-v4-is-the-best-neural-network-for-object-detection-on-ms-coco-dataset-39dfa22fa982
51 | __global__ void gpuYoloV4Layer(const float* input, float* output, const uint32_t grid_h_,
52 | const uint32_t grid_w_, const uint32_t numOutputClasses,
53 | const uint32_t numBBoxes, float scale_x_y)
54 | {
55 | uint32_t x_id = blockIdx.x * blockDim.x + threadIdx.x;
56 | uint32_t y_id = blockIdx.y * blockDim.y + threadIdx.y;
57 | uint32_t z_id = blockIdx.z * blockDim.z + threadIdx.z;
58 |
59 | if ((x_id >= grid_w_) || (y_id >= grid_h_) || (z_id >= numBBoxes))
60 | {
61 | return;
62 | }
63 |
64 | const int numGridCells = grid_h_ * grid_w_;
65 | const int bbindex = y_id * grid_w_ + x_id;
66 | //bx
67 | scale_x_y = 1.0;
68 | output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 0)]
69 | = scale_x_y * (input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 0)]) - 0.5 * (scale_x_y -1);
70 | //by
71 | output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 1)]
72 | = scale_x_y * (input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 1)]) - 0.5 * (scale_x_y -1);
73 | //bw
74 | output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 2)]
75 | = (input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 2)]) * (input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 2)]) * 4;
76 | //bh
77 | output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 3)]
78 | = (input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 3)]) * (input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 3)]) * 4;
79 |
80 | output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]
81 | = (input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + 4)]);
82 |
83 | for (uint32_t i = 0; i < numOutputClasses; ++i)
84 | {
85 | output[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + (5 + i))]
86 | = (input[bbindex + numGridCells * (z_id * (5 + numOutputClasses) + (5 + i))]);
87 | }
88 | }
89 |
90 |
91 | cudaError_t cudaYoloLayerV3(const void* input, void* output, const uint32_t& batchSize,
92 | const uint32_t& n_grid_h_,const uint32_t& n_grid_w_,
93 | const uint32_t& numOutputClasses, const uint32_t& numBBoxes,
94 | uint64_t outputSize, const uint32_t &new_coords, cudaStream_t stream)
95 | {
96 | dim3 threads_per_block(16, 16, 4);
97 | dim3 number_of_blocks((n_grid_w_ / threads_per_block.x) + 1,
98 | (n_grid_h_ / threads_per_block.y) + 1,
99 | (numBBoxes / threads_per_block.z) + 1);
100 | // printf("YOLOv3 (V4) \n");
101 | for (int batch = 0; batch < batchSize; ++batch)
102 | {
103 | if (new_coords) {
104 | gpuYoloV4Layer<<>>(
105 | reinterpret_cast(input) + (batch * outputSize),
106 | reinterpret_cast(output) + (batch * outputSize), n_grid_h_, n_grid_w_, numOutputClasses,
107 | numBBoxes, 2.0);
108 | } else {
109 | gpuYoloLayerV3<<>>(
110 | reinterpret_cast(input) + (batch * outputSize),
111 | reinterpret_cast(output) + (batch * outputSize), n_grid_h_, n_grid_w_, numOutputClasses,
112 | numBBoxes);
113 | }
114 |
115 | }
116 | return cudaGetLastError();
117 | }
118 |
119 |
120 | cudaError_t cudaYoloV4Layer(const void* input, void* output, const uint32_t& batchSize,
121 | const uint32_t& n_grid_h_,const uint32_t& n_grid_w_,
122 | const uint32_t& numOutputClasses, const uint32_t& numBBoxes,
123 | uint64_t outputSize, const uint32_t &new_coords, const float scale_x_y, cudaStream_t stream)
124 | {
125 | dim3 threads_per_block(16, 16, 4);
126 | dim3 number_of_blocks((n_grid_w_ / threads_per_block.x) + 1,
127 | (n_grid_h_ / threads_per_block.y) + 1,
128 | (numBBoxes / threads_per_block.z) + 1);
129 | // printf("Scaled YOLOv4 \n");
130 | for (int batch = 0; batch < batchSize; ++batch)
131 | {
132 | gpuYoloV4Layer<<>>(
133 | reinterpret_cast(input) + (batch * outputSize),
134 | reinterpret_cast(output) + (batch * outputSize), n_grid_h_, n_grid_w_, numOutputClasses,
135 | numBBoxes, scale_x_y);
136 | }
137 | return cudaGetLastError();
138 | }
139 |
--------------------------------------------------------------------------------
/modules/mish.cu:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include
5 | #include "mish.h"
6 |
7 | namespace nvinfer1
8 | {
9 | MishPlugin::MishPlugin()
10 | {
11 | }
12 |
13 | MishPlugin::~MishPlugin()
14 | {
15 | }
16 |
17 | // create the plugin at runtime from a byte stream
18 | MishPlugin::MishPlugin(const void* data, size_t length)
19 | {
20 | assert(length == sizeof(input_size_));
21 | input_size_ = *reinterpret_cast(data);
22 | }
23 |
24 | void MishPlugin::serialize(void* buffer) const noexcept
25 | {
26 | *reinterpret_cast(buffer) = input_size_;
27 | }
28 |
29 | size_t MishPlugin::getSerializationSize() const noexcept
30 | {
31 | return sizeof(input_size_);
32 | }
33 |
34 | int MishPlugin::initialize()noexcept
35 | {
36 | return 0;
37 | }
38 |
39 | bool MishPlugin::supportsFormat(DataType type, PluginFormat format) const noexcept
40 | {
41 | return (type == DataType::kFLOAT && format == PluginFormat::kLINEAR);
42 | }
43 |
44 | void MishPlugin::configureWithFormat(const Dims* inputDims, int nbInputs,
45 | const Dims* outputDims, int nbOutputs, DataType type, PluginFormat format, int maxBatchSize) noexcept
46 | {
47 |
48 | }
49 |
50 | Dims MishPlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims)noexcept
51 | {
52 | assert(nbInputDims == 1);
53 | assert(index == 0);
54 | input_size_ = inputs[0].d[0] * inputs[0].d[1] * inputs[0].d[2];
55 | // Output dimensions
56 | return Dims3(inputs[0].d[0], inputs[0].d[1], inputs[0].d[2]);
57 | }
58 |
59 | // Set plugin namespace
60 | void MishPlugin::setPluginNamespace(const char* pluginNamespace)noexcept
61 | {
62 | mPluginNamespace = pluginNamespace;
63 | }
64 |
65 | const char* MishPlugin::getPluginNamespace() const noexcept
66 | {
67 | return mPluginNamespace;
68 | }
69 |
70 | // Return the DataType of the plugin output at the requested index
71 | DataType MishPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const noexcept
72 | {
73 | return DataType::kFLOAT;
74 | }
75 |
76 | // Return true if output tensor is broadcast across a batch.
77 | bool MishPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const noexcept
78 | {
79 | return false;
80 | }
81 |
82 | // Return true if plugin can use input that is broadcast across batch without replication.
83 | bool MishPlugin::canBroadcastInputAcrossBatch(int inputIndex) const noexcept
84 | {
85 | return false;
86 | }
87 |
88 | void MishPlugin::configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput)noexcept
89 | {
90 | }
91 |
92 | // Attach the plugin object to an execution context and grant the plugin the access to some context resource.
93 | void MishPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator)noexcept
94 | {
95 | }
96 |
97 | // Detach the plugin object from its execution context.
98 | void MishPlugin::detachFromContext()noexcept {}
99 |
100 | const char* MishPlugin::getPluginType() const noexcept
101 | {
102 | return "Mish_TRT";
103 | }
104 |
105 | const char* MishPlugin::getPluginVersion() const noexcept
106 | {
107 | return "1";
108 | }
109 |
110 | void MishPlugin::destroy()noexcept
111 | {
112 | delete this;
113 | }
114 |
115 | // Clone the plugin
116 | IPluginV2* MishPlugin::clone() const noexcept
117 | {
118 | MishPlugin *p = new MishPlugin();
119 | p->input_size_ = input_size_;
120 | p->setPluginNamespace(mPluginNamespace);
121 | return p;
122 | }
123 |
124 | __device__ float tanh_activate_kernel(float x){return (2/(1 + expf(-2*x)) - 1);}
125 |
126 | __device__ float softplus_kernel(float x, float threshold = 20)
127 | {
128 | if (x > threshold) return x; // too large
129 | else if (x < -threshold) return expf(x); // too small
130 | return logf(expf(x) + 1);
131 | }
132 |
133 | __global__ void mish_kernel(const float *input, float *output, int num_elem)
134 | {
135 |
136 | int idx = threadIdx.x + blockDim.x * blockIdx.x;
137 | if (idx >= num_elem) return;
138 |
139 | //float t = exp(input[idx]);
140 | //if (input[idx] > 20.0) {
141 | // t *= t;
142 | // output[idx] = (t - 1.0) / (t + 1.0);
143 | //} else {
144 | // float tt = t * t;
145 | // output[idx] = (tt + 2.0 * t) / (tt + 2.0 * t + 2.0);
146 | //}
147 | //output[idx] *= input[idx];
148 | output[idx] = input[idx] * tanh_activate_kernel(softplus_kernel(input[idx]));
149 | }
150 |
151 | void MishPlugin::forwardGpu(const float *const * inputs, float* output, cudaStream_t stream, int batchSize)
152 | {
153 | int block_size = thread_count_;
154 | int grid_size = (input_size_ * batchSize + block_size - 1) / block_size;
155 | mish_kernel<<>>(inputs[0], output, input_size_ * batchSize);
156 | }
157 |
158 | int MishPlugin::enqueue(int batchSize,
159 | const void* const* inputs,
160 | void* const* outputs,
161 | void* workspace,
162 | cudaStream_t stream) noexcept
163 | {
164 | //assert(batchSize == 1);
165 | //GPU
166 | //CUDA_CHECK(cudaStreamSynchronize(stream));
167 | forwardGpu((const float *const *)inputs, (float*)outputs[0], stream, batchSize);
168 | return 0;
169 | }
170 |
171 | PluginFieldCollection MishPluginCreator::mFC{};
172 | std::vector MishPluginCreator::mPluginAttributes;
173 |
174 | MishPluginCreator::MishPluginCreator()
175 | {
176 | mPluginAttributes.clear();
177 |
178 | mFC.nbFields = mPluginAttributes.size();
179 | mFC.fields = mPluginAttributes.data();
180 | }
181 |
182 | const char* MishPluginCreator::getPluginName() const noexcept
183 | {
184 | return "Mish_TRT";
185 | }
186 |
187 | const char* MishPluginCreator::getPluginVersion() const noexcept
188 | {
189 | return "1";
190 | }
191 |
192 | const PluginFieldCollection* MishPluginCreator::getFieldNames()noexcept
193 | {
194 | return &mFC;
195 | }
196 |
197 | IPluginV2* MishPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc)noexcept
198 | {
199 | MishPlugin* obj = new MishPlugin();
200 | obj->setPluginNamespace(mNamespace.c_str());
201 | return obj;
202 | }
203 |
204 | IPluginV2* MishPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength)noexcept
205 | {
206 | // This object will be deleted when the network is destroyed, which will
207 | // call MishPlugin::destroy()
208 | MishPlugin* obj = new MishPlugin(serialData, serialLength);
209 | obj->setPluginNamespace(mNamespace.c_str());
210 | return obj;
211 | }
212 |
213 |
214 | void MishPluginCreator::setPluginNamespace(const char* libNamespace)noexcept
215 | {
216 | mNamespace = libNamespace;
217 | }
218 |
219 | const char* MishPluginCreator::getPluginNamespace() const noexcept
220 | {
221 | return mNamespace.c_str();
222 | }
223 |
224 |
225 | }
226 |
227 |
--------------------------------------------------------------------------------
/modules/mish.h:
--------------------------------------------------------------------------------
1 | #ifndef _MISH_PLUGIN_H
2 | #define _MISH_PLUGIN_H
3 |
4 | #include
5 | #include
6 | #include "NvInfer.h"
7 |
8 |
9 | //https://github.com/wang-xinyu/tensorrtx
10 | namespace nvinfer1
11 | {
12 | class MishPlugin: public IPluginV2
13 | {
14 | public:
15 | explicit MishPlugin();
16 | MishPlugin(const void* data, size_t length);
17 |
18 | ~MishPlugin();
19 |
20 | int getNbOutputs() const noexcept override
21 | {
22 | return 1;
23 | }
24 |
25 | Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) noexcept override;
26 |
27 | int initialize() noexcept override;
28 |
29 | virtual void terminate() noexcept override {}
30 |
31 | virtual size_t getWorkspaceSize(int maxBatchSize) const noexcept override { return 0;}
32 |
33 | // virtual int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream);
34 | int enqueue(int batchSize, const void* const* inputs, void* const* outputs, void* workspace,
35 | cudaStream_t stream) noexcept override;
36 | bool supportsFormat(DataType type, PluginFormat format) const noexcept override;
37 | void configureWithFormat(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, DataType type, PluginFormat format, int maxBatchSize) noexcept override;
38 |
39 | virtual size_t getSerializationSize() const noexcept override;
40 |
41 | virtual void serialize(void* buffer) const noexcept override;
42 |
43 | bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const noexcept {
44 | return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT;
45 | }
46 |
47 | const char* getPluginType() const noexcept override;
48 |
49 | const char* getPluginVersion() const noexcept override;
50 |
51 | void destroy() noexcept override;
52 |
53 | IPluginV2* clone() const noexcept override;
54 |
55 | void setPluginNamespace(const char* pluginNamespace) noexcept override;
56 |
57 | const char* getPluginNamespace() const noexcept override;
58 |
59 | DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const noexcept;
60 |
61 | bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const noexcept;
62 |
63 | bool canBroadcastInputAcrossBatch(int inputIndex) const noexcept;
64 |
65 | void attachToContext(
66 | cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator)noexcept;
67 |
68 | void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput)noexcept;
69 |
70 | void detachFromContext()noexcept;
71 |
72 | int input_size_;
73 | private:
74 | void forwardGpu(const float *const * inputs, float* output, cudaStream_t stream, int batchSize = 1);
75 | int thread_count_ = 256;
76 | const char* mPluginNamespace;
77 | };
78 |
79 | class MishPluginCreator : public IPluginCreator
80 | {
81 | public:
82 | MishPluginCreator();
83 |
84 | ~MishPluginCreator() override = default;
85 |
86 | const char* getPluginName() const noexcept override;
87 |
88 | const char* getPluginVersion() const noexcept override;
89 |
90 | const PluginFieldCollection* getFieldNames() noexcept override;
91 |
92 | IPluginV2* createPlugin(const char* name, const PluginFieldCollection* fc) noexcept override;
93 |
94 | IPluginV2* deserializePlugin(const char* name, const void* serialData, size_t serialLength) noexcept override;
95 |
96 | void setPluginNamespace(const char* libNamespace) noexcept override;
97 |
98 | const char* getPluginNamespace() const noexcept override;
99 |
100 | private:
101 | std::string mNamespace;
102 | static PluginFieldCollection mFC;
103 | static std::vector mPluginAttributes;
104 | };
105 | }
106 | #endif
107 |
--------------------------------------------------------------------------------
/modules/plugin_factory.cpp:
--------------------------------------------------------------------------------
1 |
2 |
3 | #include "plugin_factory.h"
4 | #include "trt_utils.h"
5 |
6 |
7 | /******* Yolo Layer V3 *******/
8 | /*****************************/
9 | namespace nvinfer1
10 | {
11 | YoloLayer::YoloLayer()
12 | {}
13 |
14 | YoloLayer::YoloLayer(const void* data, size_t length)
15 | {
16 | const char *d = static_cast(data), *a = d;
17 | re(d, m_NumBoxes);
18 | re(d, m_NumClasses);
19 | re(d, _n_grid_h);
20 | re(d, _n_grid_w);
21 | re(d, m_OutputSize);
22 | re(d, m_new_coords);
23 | assert(d = a + length);
24 | }
25 | void YoloLayer::serialize(void* buffer)const noexcept
26 | {
27 | char *d = static_cast(buffer), *a = d;
28 | wr(d, m_NumBoxes);
29 | wr(d, m_NumClasses);
30 | wr(d, _n_grid_h);
31 | wr(d, _n_grid_w);
32 | wr(d, m_OutputSize);
33 | wr(d, m_new_coords);
34 | assert(d == a + getSerializationSize());
35 | printf("Serialize V3\n");
36 | }
37 |
38 | bool YoloLayer::supportsFormat(DataType type, PluginFormat format) const noexcept
39 | {
40 | return (type == DataType::kFLOAT && format == PluginFormat::kLINEAR);
41 | }
42 |
43 | void YoloLayer::configureWithFormat(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, DataType type, PluginFormat format, int maxBatchSize) noexcept
44 | {
45 |
46 | }
47 |
48 | IPluginV2* YoloLayer::clone() const noexcept
49 | {
50 | YoloLayer *p = new YoloLayer(m_NumBoxes,m_NumClasses,_n_grid_h,_n_grid_w, m_new_coords);
51 | p->setPluginNamespace(_s_plugin_namespace.c_str());
52 | return p;
53 | }
54 |
55 | YoloLayer::YoloLayer(const uint32_t& numBoxes, const uint32_t& numClasses, const uint32_t& grid_h_, const uint32_t &grid_w_, const uint32_t& new_coords) :
56 | m_NumBoxes(numBoxes),
57 | m_NumClasses(numClasses),
58 | _n_grid_h(grid_h_),
59 | _n_grid_w(grid_w_),
60 | m_new_coords(new_coords)
61 | {
62 | assert(m_NumBoxes > 0);
63 | assert(m_NumClasses > 0);
64 | assert(_n_grid_h > 0);
65 | assert(_n_grid_w > 0);
66 | m_OutputSize = _n_grid_h * _n_grid_w * (m_NumBoxes * (4 + 1 + m_NumClasses));
67 | }
68 |
69 | int YoloLayer::getNbOutputs() const noexcept { return 1; }
70 |
71 | nvinfer1::Dims YoloLayer::getOutputDimensions(int index, const nvinfer1::Dims* inputs,
72 | int nbInputDims) noexcept
73 | {
74 | assert(index == 0);
75 | assert(nbInputDims == 1);
76 | return inputs[0];
77 | }
78 |
79 | //void YoloLayerV3::configure(const nvinfer1::Dims* inputDims, int nbInputs,
80 | // const nvinfer1::Dims* outputDims, int nbOutputs, int maxBatchSize) noexcept
81 | //{
82 | // assert(nbInputs == 1);
83 | // assert(inputDims != nullptr);
84 | //}
85 |
86 | int YoloLayer::initialize() noexcept { return 0; }
87 |
88 | void YoloLayer::terminate() noexcept {}
89 |
90 | size_t YoloLayer::getWorkspaceSize(int maxBatchSize) const noexcept
91 | {
92 | return 0;
93 | }
94 |
95 | int YoloLayer::enqueue(int batchSize,
96 | const void* const* inputs,
97 | void* const* outputs,
98 | void* workspace,
99 | cudaStream_t stream) noexcept
100 | {
101 | NV_CUDA_CHECK(cudaYoloLayerV3(inputs[0], outputs[0], batchSize, _n_grid_h, _n_grid_w, m_NumClasses,
102 | m_NumBoxes, m_OutputSize, m_new_coords, stream));
103 | return 0;
104 | }
105 |
106 | size_t YoloLayer::getSerializationSize()const noexcept
107 | {
108 | return sizeof(m_NumBoxes) + sizeof(m_NumClasses) + sizeof(_n_grid_w) + sizeof(_n_grid_h) + sizeof(m_OutputSize) + sizeof(m_new_coords);
109 | }
110 |
111 |
112 |
113 |
114 | PluginFieldCollection YoloLayerPluginCreator::mFC{};
115 | std::vector YoloLayerPluginCreator::mPluginAttributes;
116 |
117 | YoloLayerPluginCreator::YoloLayerPluginCreator()
118 | {
119 | mPluginAttributes.clear();
120 |
121 | mFC.nbFields = mPluginAttributes.size();
122 | mFC.fields = mPluginAttributes.data();
123 |
124 | }
125 |
126 | const char* YoloLayerPluginCreator::getPluginName() const noexcept
127 | {
128 | return "YOLO_TRT";
129 | }
130 |
131 | const char* YoloLayerPluginCreator::getPluginVersion() const noexcept
132 | {
133 | return "1.0";
134 | }
135 |
136 | const PluginFieldCollection* YoloLayerPluginCreator::getFieldNames()noexcept
137 | {
138 | return &mFC;
139 | }
140 |
141 | IPluginV2* YoloLayerPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc)noexcept
142 | {
143 |
144 | YoloLayer* obj = new YoloLayer();
145 | obj->setPluginNamespace(mNamespace.c_str());
146 | return obj;
147 | }
148 |
149 | IPluginV2* YoloLayerPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength)noexcept
150 | {
151 | // This object will be deleted when the network is destroyed, which will
152 | // call MishPlugin::destroy()
153 | YoloLayer* obj = new YoloLayer(serialData, serialLength);
154 | obj->setPluginNamespace(mNamespace.c_str());
155 | return obj;
156 | }
157 |
158 |
159 | void YoloLayerPluginCreator::setPluginNamespace(const char* libNamespace)noexcept
160 | {
161 | mNamespace = libNamespace;
162 | }
163 |
164 | const char* YoloLayerPluginCreator::getPluginNamespace() const noexcept
165 | {
166 | return mNamespace.c_str();
167 | }
168 | }
169 |
170 | namespace nvinfer1
171 | {
172 | //Scaled YOLOv4
173 | YoloV4Layer::YoloV4Layer()
174 | {}
175 |
176 | YoloV4Layer::YoloV4Layer(const void* data, size_t length)
177 | {
178 | const char *d = static_cast(data), *a = d;
179 | re(d, m_NumBoxes);
180 | re(d, m_NumClasses);
181 | re(d, _n_grid_h);
182 | re(d, _n_grid_w);
183 | re(d, m_OutputSize);
184 | re(d, m_new_coords);
185 | re(d, m_scale_x_y);
186 | assert(d = a + length);
187 | }
188 | void YoloV4Layer::serialize(void* buffer)const noexcept
189 | {
190 | char *d = static_cast(buffer), *a = d;
191 | wr(d, m_NumBoxes);
192 | wr(d, m_NumClasses);
193 | wr(d, _n_grid_h);
194 | wr(d, _n_grid_w);
195 | wr(d, m_OutputSize);
196 | wr(d, m_new_coords);
197 | wr(d, m_scale_x_y);
198 | assert(d == a + getSerializationSize());
199 | }
200 |
201 | bool YoloV4Layer::supportsFormat(DataType type, PluginFormat format) const noexcept
202 | {
203 | return (type == DataType::kFLOAT && format == PluginFormat::kLINEAR);
204 | }
205 |
206 | void YoloV4Layer::configureWithFormat(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, DataType type, PluginFormat format, int maxBatchSize) noexcept
207 | {
208 |
209 | }
210 |
211 | IPluginV2* YoloV4Layer::clone() const noexcept
212 | {
213 | YoloV4Layer *p = new YoloV4Layer(m_NumBoxes,m_NumClasses,_n_grid_h,_n_grid_w, m_new_coords, m_scale_x_y);
214 | p->setPluginNamespace(_s_plugin_namespace.c_str());
215 | return p;
216 | }
217 |
218 | YoloV4Layer::YoloV4Layer(const uint32_t& numBoxes, const uint32_t& numClasses, const uint32_t& grid_h_, const uint32_t &grid_w_, const uint32_t &new_coords, const float& scale_x_y) :
219 | m_NumBoxes(numBoxes),
220 | m_NumClasses(numClasses),
221 | _n_grid_h(grid_h_),
222 | _n_grid_w(grid_w_),
223 | m_scale_x_y(scale_x_y),
224 | m_new_coords(new_coords)
225 | {
226 | assert(m_NumBoxes > 0);
227 | assert(m_NumClasses > 0);
228 | assert(_n_grid_h > 0);
229 | assert(_n_grid_w > 0);
230 | assert(m_scale_x_y > 0);
231 | m_OutputSize = _n_grid_h * _n_grid_w * (m_NumBoxes * (4 + 1 + m_NumClasses));
232 | }
233 |
234 | int YoloV4Layer::getNbOutputs() const noexcept { return 1; }
235 |
236 | nvinfer1::Dims YoloV4Layer::getOutputDimensions(int index, const nvinfer1::Dims* inputs,
237 | int nbInputDims) noexcept
238 | {
239 | assert(index == 0);
240 | assert(nbInputDims == 1);
241 | return inputs[0];
242 | }
243 |
244 | //void YoloV4LayerV3::configure(const nvinfer1::Dims* inputDims, int nbInputs,
245 | // const nvinfer1::Dims* outputDims, int nbOutputs, int maxBatchSize) noexcept
246 | //{
247 | // assert(nbInputs == 1);
248 | // assert(inputDims != nullptr);
249 | //}
250 |
251 | int YoloV4Layer::initialize() noexcept { return 0; }
252 |
253 | void YoloV4Layer::terminate() noexcept {}
254 |
255 | size_t YoloV4Layer::getWorkspaceSize(int maxBatchSize) const noexcept
256 | {
257 | return 0;
258 | }
259 |
260 | int YoloV4Layer::enqueue(int batchSize,
261 | const void* const* inputs,
262 | void* const* outputs,
263 | void* workspace,
264 | cudaStream_t stream) noexcept
265 | {
266 | NV_CUDA_CHECK(cudaYoloV4Layer(inputs[0], outputs[0], batchSize, _n_grid_h, _n_grid_w, m_NumClasses,
267 | m_NumBoxes, m_OutputSize, m_new_coords, m_scale_x_y, stream));
268 | return 0;
269 | }
270 |
271 | size_t YoloV4Layer::getSerializationSize()const noexcept
272 | {
273 | printf("Serialize V4\n");
274 | return sizeof(m_NumBoxes) + sizeof(m_NumClasses) + sizeof(_n_grid_w) + sizeof(_n_grid_h) + sizeof(m_OutputSize) + sizeof(m_scale_x_y) + sizeof(m_new_coords);
275 | }
276 |
277 |
278 |
279 |
280 | PluginFieldCollection YoloV4LayerPluginCreator::mFC{};
281 | std::vector YoloV4LayerPluginCreator::mPluginAttributes;
282 |
283 | YoloV4LayerPluginCreator::YoloV4LayerPluginCreator()
284 | {
285 | mPluginAttributes.clear();
286 |
287 | mFC.nbFields = mPluginAttributes.size();
288 | mFC.fields = mPluginAttributes.data();
289 |
290 | }
291 |
292 | const char* YoloV4LayerPluginCreator::getPluginName() const noexcept
293 | {
294 | return "YOLOV4_TRT";
295 | }
296 |
297 | const char* YoloV4LayerPluginCreator::getPluginVersion() const noexcept
298 | {
299 | return "1.0";
300 | }
301 |
302 | const PluginFieldCollection* YoloV4LayerPluginCreator::getFieldNames()noexcept
303 | {
304 | return &mFC;
305 | }
306 |
307 | IPluginV2* YoloV4LayerPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc)noexcept
308 | {
309 |
310 | YoloV4Layer* obj = new YoloV4Layer();
311 | obj->setPluginNamespace(mNamespace.c_str());
312 | return obj;
313 | }
314 |
315 | IPluginV2* YoloV4LayerPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength)noexcept
316 | {
317 | // This object will be deleted when the network is destroyed, which will
318 | // call MishPlugin::destroy()
319 | YoloV4Layer* obj = new YoloV4Layer(serialData, serialLength);
320 | obj->setPluginNamespace(mNamespace.c_str());
321 | return obj;
322 | }
323 |
324 |
325 | void YoloV4LayerPluginCreator::setPluginNamespace(const char* libNamespace)noexcept
326 | {
327 | mNamespace = libNamespace;
328 | }
329 |
330 | const char* YoloV4LayerPluginCreator::getPluginNamespace() const noexcept
331 | {
332 | return mNamespace.c_str();
333 | }
334 |
335 |
336 | }
337 |
--------------------------------------------------------------------------------
/modules/plugin_factory.h:
--------------------------------------------------------------------------------
1 | /**
2 | MIT License
3 |
4 | Copyright (c) 2018 NVIDIA CORPORATION. All rights reserved.
5 |
6 | Permission is hereby granted, free of charge, to any person obtaining a copy
7 | of this software and associated documentation files (the "Software"), to deal
8 | in the Software without restriction, including without limitation the rights
9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | copies of the Software, and to permit persons to whom the Software is
11 | furnished to do so, subject to the following conditions:
12 |
13 | The above copyright notice and this permission notice shall be included in all
14 | copies or substantial portions of the Software.
15 |
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | SOFTWARE.
23 | *
24 | */
25 |
26 | #ifndef __PLUGIN_LAYER_H__
27 | #define __PLUGIN_LAYER_H__
28 |
29 | #include
30 | #include
31 | #include
32 | #include
33 | #include
34 | #include
35 | #include "NvInfer.h"
36 |
37 | #define NV_CUDA_CHECK(status) \
38 | { \
39 | if (status != 0) \
40 | { \
41 | std::cout << "Cuda failure: " << cudaGetErrorString(status) << " in file " << __FILE__ \
42 | << " at line " << __LINE__ << std::endl; \
43 | abort(); \
44 | } \
45 | }
46 |
47 | // Forward declaration of cuda kernels
48 | cudaError_t cudaYoloLayerV3(const void* input, void* output, const uint32_t& batchSize,
49 | const uint32_t& n_grid_h_, const uint32_t& n_grid_w_,
50 | const uint32_t& numOutputClasses, const uint32_t& numBBoxes,
51 | uint64_t outputSize, const uint32_t& m_new_coords, cudaStream_t stream);
52 |
53 | cudaError_t cudaYoloV4Layer(const void* input, void* output, const uint32_t& batchSize,
54 | const uint32_t& n_grid_h_,const uint32_t& n_grid_w_,
55 | const uint32_t& numOutputClasses, const uint32_t& numBBoxes,
56 | uint64_t outputSize, const uint32_t& m_new_coords, const float scale_x_y, cudaStream_t stream);
57 | //class PluginFactory : public nvinfer1::IPluginFactory
58 | //{
59 | //
60 | //public:
61 | // PluginFactory();
62 | // nvinfer1::IPlugin* createPlugin(const char* layerName, const void* serialData,
63 | // size_t serialLength);
64 | // bool isPlugin(const char* name);
65 | // void destroy();
66 | //
67 | //private:
68 | // static const int m_MaxLeakyLayers = 72;
69 | // static const int m_ReorgStride = 2;
70 | // static constexpr float m_LeakyNegSlope = 0.1f;
71 | // static const int m_NumBoxes = 5;
72 | // static const int m_NumCoords = 4;
73 | // static const int m_NumClasses = 80;
74 | // static const int m_MaxYoloLayers = 3;
75 | // int m_LeakyReLUCount = 0;
76 | // int m_YoloLayerCount = 0;
77 | // nvinfer1::plugin::RegionParameters m_RegionParameters{m_NumBoxes, m_NumCoords, m_NumClasses,
78 | // nullptr};
79 | //
80 | // struct INvPluginDeleter
81 | // {
82 | // void operator()(nvinfer1::plugin::INvPlugin* ptr)
83 | // {
84 | // if (ptr)
85 | // {
86 | // ptr->destroy();
87 | // }
88 | // }
89 | // };
90 | // struct IPluginDeleter
91 | // {
92 | // void operator()(nvinfer1::IPlugin* ptr)
93 | // {
94 | // if (ptr)
95 | // {
96 | // ptr->terminate();
97 | // }
98 | // }
99 | // };
100 | // typedef std::unique_ptr unique_ptr_INvPlugin;
101 | // typedef std::unique_ptr unique_ptr_IPlugin;
102 | //
103 | // unique_ptr_INvPlugin m_ReorgLayer;
104 | // unique_ptr_INvPlugin m_RegionLayer;
105 | // unique_ptr_INvPlugin m_LeakyReLULayers[m_MaxLeakyLayers];
106 | // unique_ptr_IPlugin m_YoloLayers[m_MaxYoloLayers];
107 | //};
108 | namespace nvinfer1
109 | {
110 | template
111 | void wr(char*& buffer, const T& val)
112 | {
113 | *reinterpret_cast(buffer) = val;
114 | buffer += sizeof(T);
115 | }
116 |
117 | template
118 | void re(const char*& buffer, T& val)
119 | {
120 | val = *reinterpret_cast(buffer);
121 | buffer += sizeof(T);
122 | }
123 |
124 | class YoloLayer : public IPluginV2
125 | {
126 | public:
127 | explicit YoloLayer();
128 | YoloLayer(const void* data, size_t length);
129 | YoloLayer(const uint32_t& numBoxes, const uint32_t& numClasses, const uint32_t& grid_h_, const uint32_t &grid_w_, const uint32_t &new_coords);
130 | int getNbOutputs() const noexcept override;
131 | nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs,
132 | int nbInputDims)noexcept override;
133 | /*void configure(const nvinfer1::Dims* inputDims, int nbInputs, const nvinfer1::Dims* outputDims,
134 | int nbOutputs, int maxBatchSize)noexcept override;*/
135 |
136 | /*void configure(const nvinfer1::Dims* inputDims, int nbInputs,
137 | const nvinfer1::Dims* outputDims, int nbOutputs, int maxBatchSize)noexcept override;*/
138 |
139 | int initialize()noexcept override;
140 | void terminate()noexcept override;
141 | size_t getWorkspaceSize(int maxBatchSize) const noexcept override;
142 |
143 | int enqueue(int batchSize, const void* const* inputs, void* const* outputs, void* workspace,
144 | cudaStream_t stream) noexcept override;
145 |
146 | size_t getSerializationSize() const noexcept override;
147 | void serialize(void* buffer) const noexcept override;
148 |
149 | const char* getPluginType() const noexcept override
150 | {
151 | return "YOLO_TRT";
152 | }
153 | bool supportsFormat(DataType type, PluginFormat format) const noexcept override;
154 | void configureWithFormat(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, DataType type, PluginFormat format, int maxBatchSize) noexcept override;
155 |
156 | const char* getPluginVersion() const noexcept override
157 | {
158 | return "1.0";
159 | }
160 |
161 | void setPluginNamespace(const char* pluginNamespace) noexcept override
162 | {
163 | _s_plugin_namespace = pluginNamespace;
164 | }
165 | const char* getPluginNamespace() const noexcept override
166 | {
167 | return _s_plugin_namespace.c_str();
168 | }
169 | void destroy() noexcept override
170 | {
171 | delete this;
172 | }
173 | IPluginV2* clone() const noexcept override;
174 | private:
175 |
176 | std::string _s_plugin_namespace;
177 | uint32_t m_NumBoxes;
178 | uint32_t m_NumClasses;
179 | uint32_t m_GridSize;
180 | uint64_t m_OutputSize;
181 | uint32_t _n_grid_h;
182 | uint32_t _n_grid_w;
183 | uint32_t m_new_coords;
184 | };
185 |
186 |
187 |
188 | class YoloLayerPluginCreator : public IPluginCreator
189 | {
190 | public:
191 | YoloLayerPluginCreator();
192 |
193 | ~YoloLayerPluginCreator() override = default;
194 |
195 | const char* getPluginName() const noexcept override;
196 |
197 | const char* getPluginVersion() const noexcept override;
198 |
199 | const PluginFieldCollection* getFieldNames() noexcept override;
200 |
201 | IPluginV2* createPlugin(const char* name, const PluginFieldCollection* fc) noexcept override;
202 |
203 | IPluginV2* deserializePlugin(const char* name, const void* serialData, size_t serialLength) noexcept override;
204 |
205 | void setPluginNamespace(const char* libNamespace) noexcept override;
206 |
207 | const char* getPluginNamespace() const noexcept override;
208 |
209 | private:
210 | std::string mNamespace;
211 | static PluginFieldCollection mFC;
212 | static std::vector mPluginAttributes;
213 | };
214 |
215 |
216 | //Scaled YOLOv4
217 | class YoloV4Layer : public IPluginV2
218 | {
219 | public:
220 | explicit YoloV4Layer();
221 | YoloV4Layer(const void* data, size_t length);
222 | YoloV4Layer(const uint32_t& numBoxes, const uint32_t& numClasses, const uint32_t& grid_h_, const uint32_t &grid_w_, const uint32_t &new_coords, const float& scale_x_y);
223 | int getNbOutputs() const noexcept override;
224 | nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs,
225 | int nbInputDims)noexcept override;
226 | /*void configure(const nvinfer1::Dims* inputDims, int nbInputs, const nvinfer1::Dims* outputDims,
227 | int nbOutputs, int maxBatchSize)noexcept override;*/
228 |
229 | /*void configure(const nvinfer1::Dims* inputDims, int nbInputs,
230 | const nvinfer1::Dims* outputDims, int nbOutputs, int maxBatchSize)noexcept override;*/
231 |
232 | int initialize()noexcept override;
233 | void terminate()noexcept override;
234 | size_t getWorkspaceSize(int maxBatchSize) const noexcept override;
235 |
236 | int enqueue(int batchSize, const void* const* inputs, void* const* outputs, void* workspace,
237 | cudaStream_t stream) noexcept override;
238 |
239 | size_t getSerializationSize() const noexcept override;
240 | void serialize(void* buffer) const noexcept override;
241 |
242 | const char* getPluginType() const noexcept override
243 | {
244 | return "YOLO_TRT";
245 | }
246 | bool supportsFormat(DataType type, PluginFormat format) const noexcept override;
247 | void configureWithFormat(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, DataType type, PluginFormat format, int maxBatchSize) noexcept override;
248 |
249 | const char* getPluginVersion() const noexcept override
250 | {
251 | return "1.0";
252 | }
253 |
254 | void setPluginNamespace(const char* pluginNamespace) noexcept override
255 | {
256 | _s_plugin_namespace = pluginNamespace;
257 | }
258 | const char* getPluginNamespace() const noexcept override
259 | {
260 | return _s_plugin_namespace.c_str();
261 | }
262 | void destroy() noexcept override
263 | {
264 | delete this;
265 | }
266 | IPluginV2* clone() const noexcept override;
267 | private:
268 |
269 | std::string _s_plugin_namespace;
270 | uint32_t m_NumBoxes;
271 | uint32_t m_NumClasses;
272 | uint32_t m_GridSize;
273 | uint64_t m_OutputSize;
274 | uint32_t _n_grid_h;
275 | uint32_t _n_grid_w;
276 | uint32_t m_new_coords;
277 | float m_scale_x_y;
278 |
279 | };
280 |
281 |
282 |
283 | class YoloV4LayerPluginCreator : public IPluginCreator
284 | {
285 | public:
286 | YoloV4LayerPluginCreator();
287 |
288 | ~YoloV4LayerPluginCreator() override = default;
289 |
290 | const char* getPluginName() const noexcept override;
291 |
292 | const char* getPluginVersion() const noexcept override;
293 |
294 | const PluginFieldCollection* getFieldNames() noexcept override;
295 |
296 | IPluginV2* createPlugin(const char* name, const PluginFieldCollection* fc) noexcept override;
297 |
298 | IPluginV2* deserializePlugin(const char* name, const void* serialData, size_t serialLength) noexcept override;
299 |
300 | void setPluginNamespace(const char* libNamespace) noexcept override;
301 |
302 | const char* getPluginNamespace() const noexcept override;
303 |
304 | private:
305 | std::string mNamespace;
306 | static PluginFieldCollection mFC;
307 | static std::vector mPluginAttributes;
308 | };
309 | }
310 | #endif // __PLUGIN_LAYER_H__
311 |
--------------------------------------------------------------------------------
/modules/preprocess.cu:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include "preprocess.h"
5 |
6 | #define BLOCK 512
7 |
8 | dim3 cuda_gridsize(size_t n){
9 | size_t k = (n-1) / BLOCK + 1;
10 | size_t x = k;
11 | size_t y = 1;
12 | if(x > 65535){
13 | x = ceil(sqrt(k));
14 | y = (n-1)/(x*BLOCK) + 1;
15 | }
16 | dim3 d;
17 | d.x = x;
18 | d.y = y;
19 | d.z = 1;
20 | return d;
21 | }
22 |
23 |
24 | __global__ void SimpleblobFromImageKernel(int N, float* dst_img, unsigned char* src_img,
25 | int dst_h, int dst_w, int src_h, int src_w,
26 | float stride_h, float stride_w, float norm)
27 | {
28 | int index = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
29 |
30 | if (index >= N) return;
31 | int chan = 3;
32 | int w = index % dst_w;
33 | int h = index / dst_w;
34 | float centroid_h, centroid_w;
35 | int c;
36 | centroid_h = stride_h * (float)(h + 0.5);
37 | centroid_w = stride_w * (float)(w + 0.5);
38 | int src_h_idx = lroundf(centroid_h)-1;
39 | int src_w_idx = lroundf(centroid_w)-1;
40 | if (src_h_idx<0){src_h_idx=0;}
41 | if (src_w_idx<0){src_w_idx=0;}
42 | index = chan * src_w_idx + chan* src_w * src_h_idx;
43 |
44 | for (c = 0; c < chan; c++) {
45 | int dst_index = w + (dst_w*h) + (dst_w*dst_h*c);
46 | dst_img[dst_index] = (float)src_img[index+c]*norm;
47 | }
48 | }
49 |
50 | void blobFromImageGpu(float *dst, unsigned char*src, int d_w, int d_h, int d_c,
51 | int s_w, int s_h, int s_c, float norm, cudaStream_t stream)
52 | {
53 | int N = d_w * d_h;
54 | float stride_h = (float)s_h / (float)d_h;
55 | float stride_w = (float)s_w / (float)d_w;
56 |
57 | SimpleblobFromImageKernel<<>>(N, dst, src,
58 | d_h, d_w,
59 | s_h, s_w,
60 | stride_h, stride_w, norm);
61 |
62 | }
63 |
64 |
--------------------------------------------------------------------------------
/modules/preprocess.h:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include
5 | #include
6 |
7 |
8 | void blobFromImageGpu(float *dst, unsigned char*src, int d_w, int d_h, int d_c,
9 | int s_w, int s_h, int s_c, float norm, cudaStream_t stream);
10 |
--------------------------------------------------------------------------------
/modules/profiler.cpp:
--------------------------------------------------------------------------------
1 | //#include "profiler.h"
2 | #include "profiler.h"
3 | #include
4 |
5 | using namespace nvinfer1;
6 |
7 |
8 | SimpleProfiler::SimpleProfiler(std::string name,
9 | const std::vector& src_profilers) :
10 | m_name(name)
11 | {
12 | float total_time = 0.0;
13 | m_index = 0;
14 | for (const auto& src_profiler : src_profilers) {
15 | for (const auto& rec : src_profiler.m_profile) {
16 | auto it = m_profile.find(rec.first);
17 |
18 | if (it == m_profile.end()) {
19 | m_profile.insert(rec);
20 | } else {
21 | it->second.time += rec.second.time;
22 | it->second.count += rec.second.count;
23 | total_time += rec.second.time;
24 | }
25 | }
26 | }
27 | }
28 |
29 | void SimpleProfiler::reportLayerTime(const char* layerName, float ms) noexcept
30 | {
31 |
32 | m_profile[layerName].count++;
33 | m_profile[layerName].time += ms;
34 | if (m_profile[layerName].min_time == -1.0) {
35 | m_profile[layerName].min_time = ms;
36 | m_profile[layerName].index = m_index;
37 | m_index++;
38 | } else if (m_profile[layerName].min_time > ms) {
39 | m_profile[layerName].min_time = ms;
40 | }
41 | }
42 |
43 | void SimpleProfiler::setProfDict(nvinfer1::ILayer *layer) noexcept
44 | {
45 | std::string name = layer->getName();
46 | auto t = layer->getType();
47 | m_layer_dict[name];
48 | m_layer_dict[name].type = layer->getType();
49 | if (layer->getType() == nvinfer1::LayerType::kCONVOLUTION) {
50 | nvinfer1::IConvolutionLayer* conv = (nvinfer1::IConvolutionLayer*)layer;
51 | nvinfer1::ITensor* in = layer->getInput(0);
52 | nvinfer1::Dims dim_in = in->getDimensions();
53 | nvinfer1::ITensor* out = layer->getOutput(0);
54 | nvinfer1::Dims dim_out = out->getDimensions();
55 | nvinfer1::Dims k_dims = conv->getKernelSizeNd();
56 | nvinfer1::Dims s_dims = conv->getStrideNd();
57 | int groups = conv->getNbGroups();
58 | int stride = s_dims.d[0];
59 | int kernel = k_dims.d[0];
60 | m_layer_dict[name].in_c = dim_in.d[0];
61 | m_layer_dict[name].out_c = dim_out.d[0];
62 | m_layer_dict[name].w= dim_in.d[2];
63 | m_layer_dict[name].h = dim_in.d[1];
64 | m_layer_dict[name].k = kernel;;
65 | m_layer_dict[name].stride = stride;
66 | m_layer_dict[name].groups = groups;
67 |
68 | }
69 | }
70 |
71 | LayerInfo *SimpleProfiler::getILayerFromName(std::string name) noexcept
72 | {
73 | LayerInfo *linfo = NULL;
74 | /*
75 | if (m_layer_dict.find(name) != m_layer_dict.end()) {
76 | linfo = &m_layer_dict[name];
77 | }
78 | */
79 | for (auto it = m_layer_dict.begin(); it != m_layer_dict.end(); it++) {
80 | std::string key = it->first;
81 | if (!name.find(key)) {
82 | linfo = &m_layer_dict[key];
83 | }
84 | }
85 | return linfo;
86 | }
87 |
88 | std::ostream& operator<<(std::ostream& out, SimpleProfiler& value)
89 | {
90 | out << "========== " << value.m_name << " profile ==========" << std::endl;
91 | float totalTime = 0;
92 | std::string layerNameStr = "Operation";
93 |
94 | int maxLayerNameLength = static_cast(layerNameStr.size());
95 | for (const auto& elem : value.m_profile)
96 | {
97 | totalTime += elem.second.time;
98 | maxLayerNameLength = std::max(maxLayerNameLength, static_cast(elem.first.size()));
99 | }
100 |
101 | auto old_settings = out.flags();
102 | auto old_precision = out.precision();
103 | // Output header
104 | {
105 | out << std::setw(maxLayerNameLength) << layerNameStr << " ";
106 | out << std::setw(12) << "Runtime, "
107 | << "%"
108 | << " ";
109 | out << std::setw(12) << "Invocations"
110 | << " ";
111 | out << std::setw(12) << "Runtime[ms]" ;
112 | out << std::setw(12) << "Min Runtime[ms]" << std::endl;
113 | }
114 | int index = value.m_index;
115 | for (int i = 0; i < index; i++) {
116 | for (const auto& elem : value.m_profile){
117 | if (elem.second.index == i) {
118 | out << i << ":";
119 | out << std::setw(maxLayerNameLength) << elem.first << ",";
120 | out << std::setw(12) << std::fixed << std::setprecision(1) << (elem.second.time * 100.0F / totalTime) << "%"
121 | << ",";
122 | out << std::setw(12) << elem.second.count << ",";
123 | out << std::setw(12) << std::fixed << std::setprecision(2) << elem.second.time << ", ";
124 | out << std::setw(12) << std::fixed << std::setprecision(2) << elem.second.min_time << std::endl;
125 | }
126 | }
127 | }
128 | out.flags(old_settings);
129 | out.precision(old_precision);
130 | out << "========== " << value.m_name << " total runtime = " << totalTime << " ms ==========" << std::endl;
131 | out << "Conv Profile" << std::endl;
132 | out << "index, w, h, in_c, out_c, k, stride, groups, mintime[us], name" << std::endl;
133 |
134 | for (int i = 0; i < index; i++) {
135 | for (const auto& elem : value.m_profile){
136 | if (elem.second.index == i) {
137 | LayerInfo* linfo = value.getILayerFromName(elem.first);
138 | if (linfo) {
139 | if (linfo->type == nvinfer1::LayerType::kCONVOLUTION) {
140 | out << std::setw(4) << std::fixed << i << ", ";
141 | out << std::setw(4) << std::fixed << linfo->w << ", ";
142 | out << std::setw(4) << std::fixed << linfo->h << ", ";
143 | out << std::setw(4) << std::fixed << linfo->in_c << ", ";
144 | out << std::setw(4) << std::fixed << linfo->out_c << ", ";
145 | out << std::setw(4) << std::fixed << linfo->k << ", ";
146 | out << std::setw(4) << std::fixed << linfo->stride << ", ";
147 | out << std::setw(4) << std::fixed << linfo->groups << ", ";
148 |
149 |
150 |
151 | out << std::setw(10) << std::setprecision(4) << elem.second.min_time *1000 << ", ";
152 | out << std::setw(6) << elem.first << "," << std::endl;
153 | }
154 | }
155 | }
156 | }
157 | }
158 |
159 | std::vector chan{8, 16, 24, 32, 48, 64, 80, 96, 128, 160, 192, 256, 320, 384};
160 | //k=1 conv
161 | int kernel = 1;
162 | out << "###Conv Kernel(1)" <k == kernel && in_c == linfo->in_c && out_c == linfo->out_c) {
177 | if (linfo->type == nvinfer1::LayerType::kCONVOLUTION) {
178 | out << std::setw(10) << std::setprecision(4) << elem.second.min_time *1000;
179 | flg = true;
180 | break;
181 | }
182 | }
183 | }
184 | }
185 | }
186 | out << ", ";
187 | }
188 | out << std::endl;
189 | }
190 | kernel = 3;
191 | out << "###Conv Kernel(" << kernel << ")" <k == kernel && in_c == linfo->in_c && out_c == linfo->out_c) {
206 | if (linfo->type == nvinfer1::LayerType::kCONVOLUTION) {
207 | out << std::setw(10) << std::setprecision(4) << elem.second.min_time *1000;
208 | flg = true;
209 | break;
210 | }
211 | }
212 | }
213 | }
214 | }
215 | out << ", ";
216 | }
217 | out << std::endl;
218 | }
219 |
220 | kernel = 5;
221 | out << "###Conv Kernel(" << kernel << ")" <k == kernel && in_c == linfo->in_c && out_c == linfo->out_c) {
236 | if (linfo->type == nvinfer1::LayerType::kCONVOLUTION) {
237 | out << std::setw(10) << std::setprecision(4) << elem.second.min_time *1000;
238 | flg = true;
239 | break;
240 | }
241 | }
242 | }
243 | }
244 | }
245 | out << ", ";
246 | }
247 | out << std::endl;
248 | }
249 | kernel = 7;
250 | out << "###Conv Kernel(" << kernel << ")" <k == kernel && in_c == linfo->in_c && out_c == linfo->out_c) {
265 | if (linfo->type == nvinfer1::LayerType::kCONVOLUTION) {
266 | out << std::setw(10) << std::setprecision(4) << elem.second.min_time *1000;
267 | flg = true;
268 | break;
269 | }
270 | }
271 | }
272 | }
273 | }
274 | out << ", ";
275 | }
276 | out << std::endl;
277 | }
278 | return out;
279 | }
280 |
281 |
282 |
--------------------------------------------------------------------------------
/modules/profiler.h:
--------------------------------------------------------------------------------
1 | #ifndef YOLO_PROFILER_H
2 | #define YOLO_PROFILER_H
3 |
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include