├── .clang-format ├── .cmake-format.yaml ├── .github ├── ISSUE_TEMPLATE │ └── tensorrtx-issue-template.md ├── stale.yml └── workflows │ └── pre-commit.yml ├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── alexnet ├── CMakeLists.txt ├── README.md ├── alex.cpp ├── alexnet.py └── logging.h ├── arcface ├── CMakeLists.txt ├── README.md ├── arcface-mobilefacenet.cpp ├── arcface-r100.cpp ├── arcface-r50.cpp ├── gen_wts.py ├── logging.h ├── macros.h ├── prelu.cu └── prelu.h ├── centernet ├── README.md ├── centernet.py ├── dcnv2Plugin │ ├── CMakeLists.txt │ ├── dcn_v2_im2col_cuda.cu │ ├── dcn_v2_im2col_cuda.h │ ├── dcnv2Plugin.cpp │ └── dcnv2Plugin.h └── sample │ ├── common.py │ └── test.py ├── crnn ├── CMakeLists.txt ├── README.md ├── crnn.cpp ├── genwts.py └── logging.h ├── csrnet ├── CMakeLists.txt ├── README.md ├── config.h ├── csrnet.cpp ├── gen_wts.py ├── logging.h └── macros.h ├── dbnet ├── CMakeLists.txt ├── README.md ├── clipper │ ├── CMakeLists.txt │ ├── clipper.cpp │ └── clipper.hpp ├── common.hpp ├── dbnet.cpp ├── logging.h └── utils.h ├── densenet ├── CMakeLists.txt ├── README.md ├── densenet121.cpp ├── densenet121.py └── logging.h ├── detr ├── CMakeLists.txt ├── README.md ├── backbone.hpp ├── calibrator.hpp ├── common.hpp ├── detr.cpp ├── gen_wts.py ├── logging.h └── macros.h ├── docker ├── .env ├── README.md ├── tensorrtx-docker-compose.yml └── x86_64.dockerfile ├── efficient_ad ├── CMakeLists.txt ├── README.md ├── datas │ └── models │ │ └── gen_wts.py ├── efficientAD_det.cpp └── src │ ├── config.h │ ├── cuda_utils.h │ ├── logging.h │ ├── macros.h │ ├── model.cpp │ ├── model.h │ ├── postprocess.h │ └── utils.h ├── efficientnet ├── CMakeLists.txt ├── README.md ├── efficientnet.cpp ├── gen_wts.py ├── logging.h └── utils.hpp ├── ghostnet ├── README.md ├── ghostnetv1 │ ├── CMakeLists.txt │ ├── gen_wts.py │ ├── ghostnetv1.cpp │ └── logging.h └── ghostnetv2 │ ├── CMakeLists.txt │ ├── gen_wts.py │ ├── ghostnetv2.cpp │ └── logging.h ├── googlenet ├── CMakeLists.txt ├── README.md ├── googlenet.cpp └── logging.h ├── hrnet ├── hrnet-image-classification │ ├── CMakeLists.txt │ ├── README.md │ ├── common.hpp │ ├── demo.py │ ├── hrnet.cpp │ └── logging.h └── hrnet-semantic-segmentation │ ├── CMakeLists.txt │ ├── README.md │ ├── common.hpp │ ├── gen_wts.py │ ├── hrnet.cpp │ ├── hrnet_ocr.cpp │ ├── hrnet_trt.py │ └── logging.h ├── ibnnet ├── CMakeLists.txt ├── InferenceEngine.cpp ├── InferenceEngine.h ├── README.md ├── gen_wts.py ├── holder.h ├── ibnnet.cpp ├── ibnnet.h ├── layers.cpp ├── layers.h ├── logging.h ├── main.cpp ├── utils.cpp └── utils.h ├── inception ├── inceptionv3 │ ├── CMakeLists.txt │ ├── README.md │ ├── inception_v3.cpp │ └── logging.h └── inceptionv4 │ ├── CMakeLists.txt │ ├── README.md │ ├── inception_v4.cpp │ ├── inception_v4.h │ ├── layers_api.cpp │ ├── layers_api.h │ ├── logging.h │ ├── main.cpp │ ├── utils.cpp │ └── utils.h ├── lenet ├── CMakeLists.txt ├── README.md ├── lenet.cpp ├── lenet.py ├── lenet_tripy.py ├── logging.h └── macros.h ├── lprnet ├── 1.jpg ├── CMakeLists.txt ├── LPRnet.cpp ├── README.md ├── genwts.py └── logging.h ├── mlp ├── CMakeLists.txt ├── README.md ├── logging.h ├── mlp.cpp ├── mlp.py └── mlp.wts ├── mnasnet ├── CMakeLists.txt ├── README.md ├── logging.h └── mnasnet.cpp ├── mobilenet ├── mobilenetv2 │ ├── CMakeLists.txt │ ├── README.md │ ├── logging.h │ ├── mobilenet_v2.cpp │ └── mobilenet_v2.py └── mobilenetv3 │ ├── CMakeLists.txt │ ├── README.md │ ├── logging.h │ ├── mobilenet_v3.cpp │ └── mobilenet_v3.py ├── psenet ├── 
CMakeLists.txt ├── README.md ├── gen_tf_wts.py ├── layers.cpp ├── layers.h ├── main.cpp ├── psenet.cpp ├── psenet.h ├── test.jpg ├── utils.cpp └── utils.h ├── rcnn ├── BatchedNms.cu ├── BatchedNmsPlugin.h ├── CMakeLists.txt ├── MaskRcnnInference.cu ├── MaskRcnnInferencePlugin.h ├── PredictorDecode.cu ├── PredictorDecodePlugin.h ├── README.md ├── RoiAlign.cu ├── RoiAlignPlugin.h ├── RpnDecode.cu ├── RpnDecodePlugin.h ├── RpnNms.cu ├── RpnNmsPlugin.h ├── backbone.hpp ├── calibrator.hpp ├── common.hpp ├── cuda_utils.h ├── gen_wts.py ├── logging.h ├── macros.h └── rcnn.cpp ├── real-esrgan ├── general-x4v3 │ ├── CMakeLists.txt │ ├── README.md │ ├── cmake │ │ └── FindTensorRT.cmake │ ├── gen_wts.py │ ├── main.cpp │ └── src │ │ ├── include │ │ ├── config │ │ │ └── config.hpp │ │ ├── cuda_utils.h │ │ ├── logging │ │ │ └── logging.h │ │ ├── pixel_shuffle │ │ │ └── pixel_shuffle.hpp │ │ └── preprocess │ │ │ └── preprocess.hpp │ │ └── pixel_shuffle │ │ ├── pixel_shuffle.cpp │ │ └── pixel_shuffle.cu └── x4plus │ ├── CMakeLists.txt │ ├── README.md │ ├── common.hpp │ ├── cuda_utils.h │ ├── gen_wts.py │ ├── logging.h │ ├── macros.h │ ├── postprocess.cu │ ├── postprocess.hpp │ ├── preprocess.cu │ ├── preprocess.hpp │ ├── real-esrgan.cpp │ └── utils.h ├── refinedet ├── CMakeLists.txt ├── README.md ├── calibrator.cpp ├── calibrator.h ├── configure.h ├── gen_wts_refinedet.py ├── logging.h ├── refinedet.cpp └── utils.h ├── repvgg ├── CMakeLists.txt ├── README.md ├── gen_wts.py ├── logging.h └── repvgg.cpp ├── resnet ├── CMakeLists.txt ├── README.md ├── logging.h ├── resnet18.cpp ├── resnet34.cpp ├── resnet50.cpp ├── resnet50.py ├── resnext50_32x4d.cpp ├── wide_resnet50.py └── wideresnet50.cpp ├── retinaface ├── CMakeLists.txt ├── README.md ├── calibrator.cpp ├── calibrator.h ├── common.hpp ├── decode.cu ├── decode.h ├── logging.h ├── macros.h ├── retina_mnet.cpp ├── retina_r50.cpp └── retinaface_trt.py ├── retinafaceAntiCov ├── CMakeLists.txt ├── README.md ├── decode.cu ├── decode.h ├── gen_wts.py ├── logging.h ├── macros.h └── retinafaceAntiCov.cpp ├── scaled-yolov4 ├── CMakeLists.txt ├── README.md ├── common.hpp ├── gen_wts.py ├── logging.h ├── mish.cu ├── mish.h ├── utils.h ├── yololayer.cu ├── yololayer.h └── yolov4_csp.cpp ├── senet ├── CMakeLists.txt ├── README.md ├── logging.h └── se_resnet50.cpp ├── shufflenetv2 ├── CMakeLists.txt ├── README.md ├── logging.h └── shufflenet_v2.cpp ├── squeezenet ├── CMakeLists.txt ├── README.md ├── logging.h └── squeezenet.cpp ├── superpoint ├── CMakeLists.txt ├── README.md ├── gen_wts.py ├── logging.h ├── supernet.cpp ├── utils.cpp └── utils.h ├── swin-transformer └── semantic-segmentation │ ├── CMakeLists.txt │ ├── README.md │ ├── UpsampleKernel.cu │ ├── UpsamplePlugin.cpp │ ├── UpsamplePlugin.h │ ├── UpsmapleKernel.h │ ├── common.hpp │ ├── fillmask.cu │ ├── fillmask.h │ ├── gelu.cu │ ├── gelu.h │ ├── gen_wts.py │ ├── include │ └── dirent.h │ ├── layerNorm.cu │ ├── layerNorm.h │ ├── logging.h │ ├── main.cpp │ ├── myhpp.h │ ├── trainsform.cpp │ └── utilsn.h ├── tsm ├── CMakeLists.txt ├── README.md ├── demo.sh ├── gen_wts.py ├── logging.h ├── mmaction2_tsm_r50_config.py ├── test_shift.py ├── tsm_r50.cpp └── tsm_r50.py ├── tutorials ├── check_fp16_int8_support.md ├── contribution.md ├── faq.md ├── from_pytorch_to_trt_stepbystep_hrnet.md ├── getting_started.md ├── install.md ├── measure_performance.md ├── migrating_from_tensorrt_4_to_7.md ├── multi_GPU_processing.md └── run_on_windows.md ├── ufld ├── CMakeLists.txt ├── README.md ├── common.hpp ├── gen_wts.py ├── 
lane_det.cpp ├── logging.h ├── macros.h └── pth2onnx.py ├── unet ├── CMakeLists.txt ├── README.md ├── common.hpp ├── gen_wts.py ├── logging.h ├── macros.h └── unet.cpp ├── vgg ├── CMakeLists.txt ├── README.md ├── logging.h └── vgg11.cpp ├── yolo11 ├── CMakeLists.txt ├── gen_wts.py ├── include │ ├── block.h │ ├── calibrator.h │ ├── config.h │ ├── cuda_utils.h │ ├── logging.h │ ├── macros.h │ ├── model.h │ ├── postprocess.h │ ├── preprocess.h │ ├── types.h │ └── utils.h ├── plugin │ ├── yololayer.cu │ └── yololayer.h ├── readme.md ├── src │ ├── block.cpp │ ├── calibrator.cpp │ ├── model.cpp │ ├── postprocess.cpp │ ├── postprocess.cu │ └── preprocess.cu ├── yolo11_cls.cpp ├── yolo11_cls_trt.py ├── yolo11_det.cpp ├── yolo11_det_trt.py ├── yolo11_obb.cpp ├── yolo11_obb_trt.py ├── yolo11_pose.cpp ├── yolo11_pose_trt.py ├── yolo11_seg.cpp └── yolo11_seg_trt.py ├── yolo11_tripy ├── .gitignore ├── README.md ├── classify.py ├── compile_classifier.py ├── constants.py ├── model │ ├── block.py │ └── model.py └── requirements.txt ├── yolop ├── CMakeLists.txt ├── README.md ├── common.hpp ├── cuda_utils.h ├── gen_wts.py ├── logging.h ├── macros.h ├── utils.h ├── yololayer.cu ├── yololayer.h ├── yolop.cpp ├── yolop.hpp └── yolop_trt.py ├── yolov10 ├── CMakeLists.txt ├── README.md ├── gen_wts.py ├── include │ ├── block.h │ ├── calibrator.h │ ├── config.h │ ├── cuda_utils.h │ ├── logging.h │ ├── macros.h │ ├── model.h │ ├── postprocess.h │ ├── preprocess.h │ ├── types.h │ └── utils.h ├── plugin │ ├── yololayer.cu │ └── yololayer.h ├── src │ ├── block.cpp │ ├── calibrator.cpp │ ├── model.cpp │ ├── postprocess.cpp │ └── preprocess.cu ├── yolov10_det.cpp └── yolov10_det_trt.py ├── yolov12 ├── CMakeLists.txt ├── gen_wts.py ├── include │ ├── block.h │ ├── config.h │ ├── cuda_utils.h │ ├── logging.h │ ├── macros.h │ ├── model.h │ ├── postprocess.h │ ├── preprocess.h │ ├── types.h │ └── utils.h ├── plugin │ ├── yololayer.cu │ └── yololayer.h ├── readme.md ├── src │ ├── block.cpp │ ├── model.cpp │ ├── postprocess.cpp │ ├── postprocess.cu │ └── preprocess.cu └── yolo12_det.cpp ├── yolov3-spp ├── CMakeLists.txt ├── README.md ├── Utils.h ├── gen_wts.py ├── logging.h ├── samples │ ├── bus.jpg │ └── zidane.jpg ├── yololayer.cu ├── yololayer.h └── yolov3-spp.cpp ├── yolov3-tiny ├── CMakeLists.txt ├── README.md ├── gen_wts.py ├── logging.h ├── macros.h ├── utils.h ├── yololayer.cu ├── yololayer.h └── yolov3-tiny.cpp ├── yolov3 ├── CMakeLists.txt ├── README.md ├── calibrator.cpp ├── calibrator.h ├── gen_wts.py ├── logging.h ├── macros.h ├── utils.h ├── yololayer.cu ├── yololayer.h ├── yolov3.cpp └── yolov3_trt.py ├── yolov4 ├── CMakeLists.txt ├── README.md ├── gen_wts.py ├── logging.h ├── mish.cu ├── mish.h ├── utils.h ├── yololayer.cu ├── yololayer.h └── yolov4.cpp ├── yolov5 ├── CMakeLists.txt ├── README.md ├── gen_wts.py ├── images ├── plugin │ ├── yololayer.cu │ └── yololayer.h ├── src │ ├── calibrator.cpp │ ├── calibrator.h │ ├── config.h │ ├── cuda_utils.h │ ├── logging.h │ ├── macros.h │ ├── model.cpp │ ├── model.h │ ├── postprocess.cpp │ ├── postprocess.h │ ├── preprocess.cu │ ├── preprocess.h │ ├── types.h │ └── utils.h ├── yolov5_cls.cpp ├── yolov5_cls_trt.py ├── yolov5_det.cpp ├── yolov5_det_cuda_python.py ├── yolov5_det_trt.py ├── yolov5_seg.cpp └── yolov5_seg_trt.py ├── yolov7 ├── CMakeLists.txt ├── README.md ├── gen_wts.py ├── images ├── include │ ├── block.h │ ├── calibrator.h │ ├── config.h │ ├── cuda_utils.h │ ├── logging.h │ ├── macros.h │ ├── model.h │ ├── postprocess.h │ ├── preprocess.h │ ├── types.h │ 
└── utils.h ├── main.cpp ├── plugin │ ├── yololayer.cu │ └── yololayer.h ├── src │ ├── block.cpp │ ├── calibrator.cpp │ ├── model.cpp │ ├── postprocess.cpp │ └── preprocess.cu └── yolov7_trt.py ├── yolov8 ├── CMakeLists.txt ├── README.md ├── gen_wts.py ├── include │ ├── block.h │ ├── calibrator.h │ ├── config.h │ ├── cuda_utils.h │ ├── logging.h │ ├── macros.h │ ├── model.h │ ├── postprocess.h │ ├── preprocess.h │ ├── types.h │ └── utils.h ├── plugin │ ├── yololayer.cu │ └── yololayer.h ├── src │ ├── block.cpp │ ├── calibrator.cpp │ ├── model.cpp │ ├── postprocess.cpp │ ├── postprocess.cu │ └── preprocess.cu ├── yolov8_5u_det.cpp ├── yolov8_5u_det_trt.py ├── yolov8_cls.cpp ├── yolov8_cls_trt.py ├── yolov8_det.cpp ├── yolov8_det_trt.py ├── yolov8_obb.cpp ├── yolov8_obb_trt.py ├── yolov8_pose.cpp ├── yolov8_pose_trt.py ├── yolov8_seg.cpp └── yolov8_seg_trt.py └── yolov9 ├── CMakeLists.txt ├── README.md ├── demo.cpp ├── gen_wts.py ├── images ├── include ├── block.h ├── calibrator.h ├── config.h ├── cuda_utils.h ├── logging.h ├── macros.h ├── model.h ├── postprocess.h ├── preprocess.h ├── types.h └── utils.h ├── plugin ├── yololayer.cu └── yololayer.h ├── src ├── block.cpp ├── calibrator.cpp ├── model.cpp ├── postprocess.cpp ├── postprocess.cu └── preprocess.cu ├── windows └── dirent.h └── yolov9_trt.py /.github/ISSUE_TEMPLATE/tensorrtx-issue-template.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: tensorrtx issue template 3 | about: To understand your issue better 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | ## Env 11 | 12 | - GPU, e.g. V100, RTX2080, TX2, Xavier NX, Nano, etc. 13 | - OS, e.g. Ubuntu16.04, Win10, etc. 14 | - Cuda version 15 | - TensorRT version 16 | 17 | ## About this repo 18 | 19 | - which branch/tag/commit are you using? 20 | - which model? yolov5, retinaface? 21 | 22 | ## Your problem 23 | 24 | - what is your command? e.g. `sudo ./yolov5 -s` 25 | - what's your output? 26 | - what output do you expect? 27 | -------------------------------------------------------------------------------- /.github/stale.yml: -------------------------------------------------------------------------------- 1 | # Number of days of inactivity before an issue becomes stale 2 | daysUntilStale: 60 3 | # Number of days of inactivity before a stale issue is closed 4 | daysUntilClose: 7 5 | # Issues with these labels will never be considered stale 6 | exemptLabels: 7 | - pinned 8 | - security 9 | # Label to use when marking an issue as stale 10 | staleLabel: wontfix 11 | # Comment to post when marking an issue as stale. Set to `false` to disable 12 | markComment: > 13 | This issue has been automatically marked as stale because it has not had 14 | recent activity. It will be closed if no further activity occurs. Thank you 15 | for your contributions. 16 | # Comment to post when closing a stale issue. 
Set to `false` to disable 17 | closeComment: false 18 | -------------------------------------------------------------------------------- /.github/workflows/pre-commit.yml: -------------------------------------------------------------------------------- 1 | name: pre-commit 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - master 7 | - trt10 8 | 9 | jobs: 10 | pre-commit: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v3 14 | with: 15 | # grab the history of the PR 16 | fetch-depth: 0 17 | - uses: actions/setup-python@v3 18 | - uses: pre-commit/action@v3.0.1 19 | with: 20 | extra_args: --from-ref ${{ github.event.pull_request.base.sha }} --to-ref ${{ github.event.pull_request.head.sha }} 21 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build 2 | *.wts 3 | *.engine 4 | *.tpymodel 5 | */*.ppm 6 | *idea* 7 | 8 | .vscode/* 9 | !.vscode/settings.json 10 | !.vscode/tasks.json 11 | !.vscode/launch.json 12 | !.vscode/extensions.json 13 | !.vscode/*.code-snippets 14 | 15 | # Local History for Visual Studio Code 16 | .history/ 17 | 18 | # Built Visual Studio Code Extensions 19 | *.vsix 20 | 21 | .vscode/* 22 | !.vscode/settings.json 23 | !.vscode/tasks.json 24 | !.vscode/launch.json 25 | !.vscode/extensions.json 26 | !.vscode/*.code-snippets 27 | 28 | # Local History for Visual Studio Code 29 | .history/ 30 | 31 | # Built Visual Studio Code Extensions 32 | *.vsix 33 | 34 | # Prerequisites 35 | *.d 36 | 37 | # Compiled Object files 38 | *.slo 39 | *.lo 40 | *.o 41 | *.obj 42 | 43 | # Precompiled Headers 44 | *.gch 45 | *.pch 46 | 47 | # Compiled Dynamic libraries 48 | *.so 49 | *.dylib 50 | *.dll 51 | 52 | # Fortran module files 53 | *.mod 54 | *.smod 55 | 56 | # Compiled Static libraries 57 | *.lai 58 | *.la 59 | *.a 60 | *.lib 61 | 62 | # Executables 63 | *.exe 64 | *.out 65 | *.app 66 | 67 | CMakeLists.txt.user 68 | CMakeCache.txt 69 | CMakeFiles 70 | CMakeScripts 71 | Testing 72 | Makefile 73 | cmake_install.cmake 74 | install_manifest.txt 75 | compile_commands.json 76 | CTestTestfile.cmake 77 | _deps 78 | CMakeUserPresets.json 79 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.5.0 4 | hooks: 5 | - id: check-merge-conflict 6 | - id: check-symlinks 7 | - id: end-of-file-fixer 8 | - id: trailing-whitespace 9 | - id: check-added-large-files 10 | - repo: https://github.com/pre-commit/mirrors-clang-format 11 | rev: v14.0.6 12 | hooks: 13 | - id: clang-format 14 | types_or: [c++, c, cuda] 15 | - repo: https://github.com/PyCQA/flake8 16 | rev: 7.0.0 17 | hooks: 18 | - id: flake8 19 | args: [--max-line-length=120] 20 | - repo: https://github.com/cheshirekow/cmake-format-precommit 21 | rev: v0.6.13 22 | hooks: 23 | - id: cmake-format 24 | additional_dependencies: [pyyaml] 25 | args: [--in-place, -c, .cmake-format.yaml] 26 | types: [file] 27 | files: (\.cmake|CMakeLists.txt)(.in)?$ 28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019-2020 Wang Xinyu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated 
documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /alexnet/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.6) 2 | 3 | project(alexnet) 4 | 5 | add_definitions(-std=c++11) 6 | 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 8 | set(CMAKE_CXX_STANDARD 11) 9 | set(CMAKE_BUILD_TYPE Debug) 10 | 11 | include_directories(${PROJECT_SOURCE_DIR}/include) 12 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different 13 | # cuda 14 | include_directories(/usr/local/cuda/include) 15 | link_directories(/usr/local/cuda/lib64) 16 | # tensorrt 17 | include_directories(/usr/include/x86_64-linux-gnu/) 18 | link_directories(/usr/lib/x86_64-linux-gnu/) 19 | 20 | add_executable(alexnet ${PROJECT_SOURCE_DIR}/alex.cpp) 21 | target_link_libraries(alexnet nvinfer) 22 | target_link_libraries(alexnet cudart) 23 | 24 | add_definitions(-O2 -pthread) 25 | 26 | -------------------------------------------------------------------------------- /alexnet/README.md: -------------------------------------------------------------------------------- 1 | # alexnet 2 | 3 | AlexNet model architecture from the "One weird trick..." `_ paper. 4 | 5 | For the details, you can refer to [pytorchx/alexnet](https://github.com/wang-xinyu/pytorchx/tree/master/alexnet) 6 | 7 | This alexnet is just several `conv-relu-pool` blocks followed by several `fc-relu`, nothing special. All layers can be implemented by tensorrt api, including `addConvolution`, `addActivation`, `addPooling`, `addFullyConnected`. 8 | 9 | ``` 10 | // 1. generate alexnet.wts from [pytorchx/alexnet](https://github.com/wang-xinyu/pytorchx/tree/master/alexnet) 11 | 12 | // 2. put alexnet.wts into tensorrtx/alexnet 13 | 14 | // 3. build and run 15 | 16 | cd tensorrtx/alexnet 17 | 18 | mkdir build 19 | 20 | cd build 21 | 22 | cmake .. 23 | 24 | make 25 | 26 | sudo ./alexnet -s // serialize model to plan file i.e. 'alexnet.engine' 27 | 28 | sudo ./alexnet -d // deserialize plan file and run inference 29 | 30 | // 4. 
see if the output is same as pytorchx/alexnet 31 | ``` 32 | 33 | 34 | -------------------------------------------------------------------------------- /arcface/gen_wts.py: -------------------------------------------------------------------------------- 1 | import struct 2 | import sys 3 | import argparse 4 | import face_model 5 | import cv2 6 | import numpy as np 7 | 8 | parser = argparse.ArgumentParser(description='face model test') 9 | # general 10 | parser.add_argument('--image-size', default='112,112', help='') 11 | parser.add_argument('--model', default='model-r100-ii/model,0', help='path to load model.') 12 | parser.add_argument('--ga-model', default='', help='path to load model.') 13 | parser.add_argument('--gpu', default=0, type=int, help='gpu id') 14 | parser.add_argument('--det', default=0, type=int, help='mtcnn option, 1 means using R+O, 0 means detect from begining') 15 | parser.add_argument('--flip', default=0, type=int, help='whether do lr flip aug') 16 | parser.add_argument('--threshold', default=1.24, type=float, help='ver dist threshold') 17 | args = parser.parse_args() 18 | 19 | model = face_model.FaceModel(args) 20 | 21 | f = open('arcface-r100.wts', 'w') 22 | f.write('{}\n'.format(len(model.model.get_params()[0].keys()) + len(model.model.get_params()[1].keys()))) 23 | for k, v in model.model.get_params()[0].items(): 24 | vr = v.reshape(-1).asnumpy() 25 | f.write('{} {} '.format(k, len(vr))) 26 | for vv in vr: 27 | f.write(' ') 28 | f.write(struct.pack('>f',float(vv)).hex()) 29 | f.write('\n') 30 | for k, v in model.model.get_params()[1].items(): 31 | vr = v.reshape(-1).asnumpy() 32 | f.write('{} {} '.format(k, len(vr))) 33 | for vv in vr: 34 | f.write(' ') 35 | f.write(struct.pack('>f',float(vv)).hex()) 36 | f.write('\n') 37 | 38 | -------------------------------------------------------------------------------- /arcface/macros.h: -------------------------------------------------------------------------------- 1 | #ifndef __MACROS_H 2 | #define __MACROS_H 3 | 4 | #if NV_TENSORRT_MAJOR >= 8 5 | #define TRT_NOEXCEPT noexcept 6 | #define TRT_CONST_ENQUEUE const 7 | #else 8 | #define TRT_NOEXCEPT 9 | #define TRT_CONST_ENQUEUE 10 | #endif 11 | 12 | #endif // __MACROS_H -------------------------------------------------------------------------------- /centernet/README.md: -------------------------------------------------------------------------------- 1 | # CenterNet 2 | 3 | This is the trt implementation of detection model [ctdet_coco_dla_2x](https://drive.google.com/open?id=1pl_-ael8wERdUREEnaIfqOV_VF2bEVRT) from [xingyizhou/CenterNet](https://github.com/xingyizhou/CenterNet) official work. 4 | 5 | ## How to Run 6 | 7 | 1. Follow [NVIDIA/TensorRT](https://github.com/NVIDIA/TensorRT) tutorial to build TensorRT7 8 | 9 | 2. Copy folder `dcnv2Plugin` to `TensorRT/plugin` and edit `InferPlugin.cpp` and `CMakeLists.txt` 10 | 11 | 3. Rebuild to install custom plugin 12 | 13 | 4. Use `tensorrt-7.2.3.4-cp36-none-linux_x86_64.whl` in TensorRT OSS to update your python-tensorrt 14 | 15 | 5. 
Run `python centernet.py -m ${PTH_PATH} -s` to create trt engine 16 | 17 | ## Sample 18 | 19 | ``` 20 | // Download ctdet_coco_dla_2x.pth and transfer it into trt engine first 21 | // Download the test img from https://raw.githubusercontent.com/tensorflow/models/master/research/deeplab/g3doc/img/image2.jpg or choose your own one 22 | cd sample 23 | python test.py ${ENGINE_PATH} ${IMG_PATH} 24 | ``` 25 | ![trt_out](https://user-images.githubusercontent.com/47047345/119128637-7a878900-ba68-11eb-91ff-5dcc10f01b77.jpg) 26 | 27 | ## TODO 28 | 29 | Integrate the post process with trt engine to make it more easier to use. -------------------------------------------------------------------------------- /centernet/dcnv2Plugin/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | file(GLOB SRCS *.cpp) 17 | set(PLUGIN_SOURCES ${PLUGIN_SOURCES} ${SRCS}) 18 | set(PLUGIN_SOURCES ${PLUGIN_SOURCES} PARENT_SCOPE) 19 | file(GLOB CU_SRCS *.cu) 20 | set(PLUGIN_CU_SOURCES ${PLUGIN_CU_SOURCES} ${CU_SRCS}) 21 | set(PLUGIN_CU_SOURCES ${PLUGIN_CU_SOURCES} PARENT_SCOPE) -------------------------------------------------------------------------------- /crnn/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.6) 2 | 3 | project(crnn) 4 | 5 | add_definitions(-std=c++11) 6 | 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 8 | set(CMAKE_CXX_STANDARD 11) 9 | set(CMAKE_BUILD_TYPE Debug) 10 | 11 | find_package(CUDA REQUIRED) 12 | 13 | include_directories(${PROJECT_SOURCE_DIR}/include) 14 | if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") 15 | message("embed_platform on") 16 | include_directories(/usr/local/cuda/targets/aarch64-linux/include) 17 | link_directories(/usr/local/cuda/targets/aarch64-linux/lib) 18 | else() 19 | message("embed_platform off") 20 | include_directories(/usr/local/cuda/include) 21 | link_directories(/usr/local/cuda/lib64) 22 | endif() 23 | 24 | find_package(OpenCV) 25 | include_directories(${OpenCV_INCLUDE_DIRS}) 26 | 27 | add_executable(crnn ${PROJECT_SOURCE_DIR}/crnn.cpp) 28 | target_link_libraries(crnn nvinfer) 29 | target_link_libraries(crnn cudart) 30 | target_link_libraries(crnn ${OpenCV_LIBS}) 31 | 32 | add_definitions(-O2 -pthread) 33 | 34 | -------------------------------------------------------------------------------- /crnn/README.md: -------------------------------------------------------------------------------- 1 | # crnn 2 | 3 | The Pytorch implementation is [meijieru/crnn.pytorch](https://github.com/meijieru/crnn.pytorch). 4 | 5 | ## How to Run 6 | 7 | ``` 8 | 1. 
generate crnn.wts from pytorch 9 | 10 | git clone https://github.com/wang-xinyu/tensorrtx.git 11 | git clone https://github.com/meijieru/crnn.pytorch.git 12 | // download its weights 'crnn.pth' 13 | // copy tensorrtx/crnn/genwts.py into crnn.pytorch/ 14 | // go to crnn.pytorch/ 15 | python genwts.py 16 | // a file 'crnn.wts' will be generated. 17 | 18 | 2. build tensorrtx/crnn and run 19 | 20 | // put crnn.wts into tensorrtx/crnn 21 | // go to tensorrtx/crnn 22 | mkdir build 23 | cd build 24 | cmake .. 25 | make 26 | sudo ./crnn -s // serialize model to plan file i.e. 'crnn.engine' 27 | // copy crnn.pytorch/data/demo.png here 28 | sudo ./crnn -d // deserialize plan file and run inference 29 | 30 | 3. check the output as follows: 31 | 32 | raw: a-----v--a-i-l-a-bb-l-e--- 33 | sim: available 34 | 35 | ``` 36 | 37 | ## More Information 38 | 39 | See the readme in [home page.](https://github.com/wang-xinyu/tensorrtx) 40 | 41 | ## Acknowledgment 42 | 43 | Thanks for the donation for this crnn tensorrt implementation from @雍. 44 | 45 | -------------------------------------------------------------------------------- /crnn/genwts.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Variable 3 | import utils 4 | import models.crnn as crnn 5 | import struct 6 | 7 | model_path = './data/crnn.pth' 8 | 9 | model = crnn.CRNN(32, 1, 37, 256) 10 | if torch.cuda.is_available(): 11 | model = model.cuda() 12 | print('loading pretrained model from %s' % model_path) 13 | model.load_state_dict(torch.load(model_path)) 14 | 15 | image = torch.ones(1, 1, 32, 100) 16 | if torch.cuda.is_available(): 17 | image = image.cuda() 18 | 19 | model.eval() 20 | print(model) 21 | print('image shape ', image.shape) 22 | preds = model(image) 23 | 24 | f = open("crnn.wts", 'w') 25 | f.write("{}\n".format(len(model.state_dict().keys()))) 26 | for k,v in model.state_dict().items(): 27 | print('key: ', k) 28 | print('value: ', v.shape) 29 | vr = v.reshape(-1).cpu().numpy() 30 | f.write("{} {}".format(k, len(vr))) 31 | for vv in vr: 32 | f.write(" ") 33 | f.write(struct.pack(">f", float(vv)).hex()) 34 | f.write("\n") 35 | 36 | -------------------------------------------------------------------------------- /csrnet/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.10) 2 | 3 | project(csrnet) 4 | 5 | add_definitions(-std=c++11) 6 | add_definitions(-DAPI_EXPORTS) 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 8 | set(CMAKE_CXX_STANDARD 11) 9 | set(CMAKE_BUILD_TYPE Debug) 10 | 11 | # cuda 12 | include_directories(/usr/local/cuda/targets/x86_64-linux/include ) 13 | link_directories(/usr/local/cuda/targets/x86_64-linux/lib) 14 | 15 | # tensorrt 16 | include_directories(/usr/include/x86_64-linux-gnu/) 17 | link_directories(/usr/lib/x86_64-linux-gnu/) 18 | 19 | # opencv 20 | find_package(OpenCV) 21 | include_directories(${OpenCV_INCLUDE_DIRS}) 22 | 23 | include_directories(${PROJECT_SOURCE_DIR}/) 24 | 25 | add_executable(csrnet csrnet.cpp) 26 | target_link_libraries(csrnet nvinfer cudart ${OpenCV_LIBS}) -------------------------------------------------------------------------------- /csrnet/README.md: -------------------------------------------------------------------------------- 1 | # csrnet 2 | 3 | The Pytorch implementation is [leeyeehoo/CSRNet-pytorch](https://github.com/leeyeehoo/CSRNet-pytorch). 4 | 5 | This repo is a TensorRT implementation of CSRNet. 
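Like the other samples in this repository, the PyTorch checkpoint is first exported to a plain-text `.wts` file, and the network is then rebuilt layer by layer with the TensorRT API (see the how-to below). In the `.wts` layout produced by the `gen_wts.py` scripts in this repo, the first line holds the number of tensors, and each following line is `<name> <num_floats>` followed by that many space-separated 8-hex-digit words, each one float written via `struct.pack('>f', v).hex()`. A minimal, self-contained reader sketch (not the exact `loadWeights` helper used in `csrnet.cpp`):

```cpp
// Illustrative sketch of a .wts reader matching the gen_wts.py writers in this repo.
#include <cstdint>
#include <cstring>
#include <fstream>
#include <map>
#include <string>
#include <vector>

std::map<std::string, std::vector<float>> load_wts(const std::string& path) {
    std::map<std::string, std::vector<float>> weights;
    std::ifstream input(path);
    int32_t count = 0;
    input >> count;  // first line: number of weight tensors
    while (count-- > 0 && input.good()) {
        std::string name;
        uint32_t size = 0;
        input >> name >> std::dec >> size;  // "<name> <num_floats>"
        std::vector<float> values(size);
        for (uint32_t i = 0; i < size; ++i) {
            uint32_t bits = 0;
            input >> std::hex >> bits;  // one float as 8 hex digits
            std::memcpy(&values[i], &bits, sizeof(float));  // reinterpret the bit pattern
        }
        weights[name] = std::move(values);
    }
    return weights;
}
```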
6 | 7 | paper: [CSRNet: Dilated Convolutional Neural Networks for Understanding the Highly Congested Scenes](https://arxiv.org/abs/1802.10062) 8 | 9 | Dev environment: 10 | - Ubuntu 22.04 11 | - TensorRT 8.6 12 | - OpenCV 4.5.4 13 | - CMake 3.24 14 | - GPU Driver 535.113.01 15 | - CUDA 12.2 16 | - RTX3080 17 | 18 | 19 | # how to run 20 | 21 | ```bash 22 | 1. generate csrnet wts 23 | git clone https://github.com/leeyeehoo/CSRNet-pytorch.git 24 | git clone https://github.com/wang-xinyu/tensorrtx.git 25 | // copy gen_wts.py to CSRNet-pytorch 26 | // generate wts file 27 | python gen_wts.py 28 | // csrnet.wts will be generated in CSRNet-pytorch 29 | 30 | 2. build csrnet.engine 31 | // mv CSRNet-pytorch/csrnet.wts to tensorrtx/csrnet 32 | mv CSRNet-pytorch/csrnet.wts tensorrtx/csrnet 33 | // build 34 | mkdir build && cd build 35 | cmake .. 36 | make 37 | sudo ./csrnet -s ./csrnet.wts 38 | 39 | Loading weights: ./csrnet.wts 40 | build engine successfully : ./csrnet.engine 41 | 42 | // download images https://github.com/wang-xinyu/tensorrtx/assets/46584679/46bc4def-e573-44ae-996d-5d68927c78ff and copy to images 43 | sudo ./csrnet -d ./images 44 | 45 | // output e.g. 46 | // enqueueV2 time: 0.0323869s 47 | // detect time:44ms 48 | // people num :22.9101 write_path: ../images/data.jpg 49 | ``` 50 | 51 | 52 | # result 53 | 54 | inference people num: 22.9101 55 | 56 |
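This number is not a detection count: CSRNet regresses a per-pixel density map, and the crowd estimate is simply the sum over the engine's output buffer, which is why the result is fractional. A minimal sketch of that final reduction (variable names are illustrative, not the exact ones in `csrnet.cpp`):

```cpp
// Illustrative sketch: sum the density map copied back from the output
// binding (kOutputTensorName in config.h) to get the image-level count.
#include <numeric>
#include <vector>

float count_people(const std::vector<float>& density_map) {
    // Each cell holds the expected number of people in that region, so
    // accumulating the whole map yields the count for the image.
    return std::accumulate(density_map.begin(), density_map.end(), 0.0f);
}
```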

57 | 58 |

59 | -------------------------------------------------------------------------------- /csrnet/config.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | const static char *kInputTensorName = "data"; 4 | const static char *kOutputTensorName = "prob"; 5 | const static char *kEngineFile = "./csrnet.engine"; 6 | 7 | const static int kBatchSize = 1; 8 | 9 | const static int MAX_INPUT_SIZE = 1440; // 32x 10 | const static int MIN_INPUT_SIZE = 608; 11 | const static int OPT_INPUT_W = 1152; 12 | const static int OPT_INPUT_H = 640; 13 | 14 | constexpr static int kMaxInputImageSize = MAX_INPUT_SIZE * MAX_INPUT_SIZE * 3; 15 | constexpr static int kMaxOutputProbSize = 16 | (MAX_INPUT_SIZE * MAX_INPUT_SIZE) >> 6; -------------------------------------------------------------------------------- /csrnet/gen_wts.py: -------------------------------------------------------------------------------- 1 | from torch.nn.modules import module 2 | from model import CSRNet 3 | import torch 4 | import os 5 | import struct 6 | 7 | 8 | save_path = os.path.join(os.path.dirname( 9 | __file__), "output", os.path.basename(__file__).split('.')[0]) 10 | os.makedirs(save_path, exist_ok=True) 11 | wts_file = os.path.join(save_path, "csrnet.wts") 12 | 13 | 14 | # load model 15 | model_path = "partBmodel_best.pth.tar" 16 | model = CSRNet() 17 | checkpoint = torch.load(model_path) 18 | model.load_state_dict(checkpoint['state_dict']) 19 | 20 | 21 | # save to wts 22 | print(f'Writing into {wts_file}') 23 | with open(wts_file, 'w') as f: 24 | f.write('{}\n'.format(len(model.state_dict().keys()))) 25 | for k, v in model.state_dict().items(): 26 | vr = v.reshape(-1).cpu().numpy() 27 | f.write('{} {} '.format(k, len(vr))) 28 | for vv in vr: 29 | f.write(' ') 30 | f.write(struct.pack('>f', float(vv)).hex()) 31 | f.write('\n') -------------------------------------------------------------------------------- /csrnet/macros.h: -------------------------------------------------------------------------------- 1 | #ifndef __MACROS_H 2 | #define __MACROS_H 3 | 4 | #if NV_TENSORRT_MAJOR >= 8 5 | #define TRT_NOEXCEPT noexcept 6 | #define TRT_CONST_ENQUEUE const 7 | #else 8 | #define TRT_NOEXCEPT 9 | #define TRT_CONST_ENQUEUE 10 | #endif 11 | 12 | #endif // __MACROS_H 13 | -------------------------------------------------------------------------------- /dbnet/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.6) 2 | 3 | project(dbnet) 4 | 5 | add_definitions(-std=c++11) 6 | 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 8 | set(CMAKE_CXX_STANDARD 11) 9 | set(CMAKE_BUILD_TYPE Debug) 10 | 11 | find_package(CUDA REQUIRED) 12 | 13 | include_directories(${PROJECT_SOURCE_DIR}/include) 14 | 15 | # cuda 16 | include_directories(/usr/local/cuda/include) 17 | link_directories(/usr/local/cuda/lib64) 18 | # tensorrt 19 | include_directories(/usr/include/x86_64-linux-gnu/) 20 | link_directories(/usr/lib/x86_64-linux-gnu/) 21 | 22 | find_package(OpenCV) 23 | include_directories(${OpenCV_INCLUDE_DIRS}) 24 | 25 | aux_source_directory(. 
DIRSRCS) 26 | 27 | # clipper 28 | include_directories(./ ./clipper) 29 | add_subdirectory(clipper) 30 | 31 | add_executable(dbnet ${DIRSRCS}) 32 | target_link_libraries(dbnet clipper) 33 | target_link_libraries(dbnet nvinfer) 34 | target_link_libraries(dbnet cudart) 35 | target_link_libraries(dbnet ${OpenCV_LIBS}) 36 | 37 | add_definitions(-O2 -pthread) 38 | 39 | -------------------------------------------------------------------------------- /dbnet/clipper/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.6) 2 | 3 | aux_source_directory(. DIR_CLIPPER_SRCS) 4 | add_library(clipper ${DIR_CLIPPER_SRCS}) -------------------------------------------------------------------------------- /densenet/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.10) 2 | 3 | # set the project name 4 | project(densenet) 5 | 6 | add_definitions(-std=c++11) 7 | 8 | # get main project dir to include common files 9 | get_filename_component(MAIN_DIR ../ ABSOLUTE) 10 | 11 | # When enabled the static version of the 12 | # CUDA runtime library will be used in CUDA_LIBRARIES 13 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 14 | 15 | # specify the C++ standard 16 | set(CMAKE_CXX_STANDARD 11) 17 | set(CMAKE_CXX_STANDARD_REQUIRED True) 18 | set(CMAKE_BUILD_TYPE Debug) 19 | 20 | # include 21 | 22 | # include and link cuda 23 | include_directories(/usr/local/cuda/include) 24 | link_directories(/usr/local/cuda/lib64) 25 | 26 | # include and link tensorrt 27 | include_directories(/usr/include/x86_64-linux-gnu) 28 | link_directories(/usr/lib/x86_64-linux-gnu) 29 | 30 | # add the executable 31 | add_executable(densenet ${PROJECT_SOURCE_DIR}/densenet121.cpp) 32 | 33 | target_link_libraries(densenet nvinfer) 34 | target_link_libraries(densenet cudart) 35 | 36 | add_definitions(-O2 -pthread) -------------------------------------------------------------------------------- /densenet/README.md: -------------------------------------------------------------------------------- 1 | # Densenet121 2 | 3 | The Pytorch implementation is [makaveli10/densenet](https://github.com/makaveli10/torchtrtz/tree/main/densenet). Model from torchvision. 4 | The tensorrt implemenation is taken from [makaveli10/cpptensorrtz](https://github.com/makaveli10/cpptensorrtz/). 5 | 6 | ## How to Run 7 | 8 | 1. generate densenet121.wts from pytorch 9 | 10 | ``` 11 | git clone https://github.com/wang-xinyu/tensorrtx.git 12 | git clone https://github.com/makaveli10/torchtrtz.git 13 | 14 | // go to torchtrtz/densenet 15 | // Enter these two commands to create densenet121.wts 16 | python models.py 17 | python gen_trtwts.py 18 | ``` 19 | 20 | 2. build densenet and run 21 | 22 | ``` 23 | // put densenet121.wts into tensorrtx/densenet 24 | // go to tensorrtx/densenet 25 | mkdir build 26 | cd build 27 | cmake .. 28 | make 29 | sudo ./densenet -s // serialize model to file i.e. 'densenet.engine' 30 | sudo ./densenet -d // deserialize model and run inference 31 | ``` 32 | 33 | 3. 
Verify output from [torch impl](https://github.com/makaveli10/torchtrtz/blob/main/densenet/README.md) 34 | 35 | TensorRT output[:5]: 36 | ``` 37 | [-0.587389, -0.329202, -1.83404, -1.89935, -0.928404] 38 | ``` 39 | 40 | -------------------------------------------------------------------------------- /detr/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.6) 2 | 3 | project(detr) 4 | 5 | add_definitions(-std=c++11) 6 | 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 8 | set(CMAKE_CXX_STANDARD 11) 9 | set(CMAKE_BUILD_TYPE Debug) 10 | 11 | find_package(CUDA REQUIRED) 12 | 13 | include_directories(${PROJECT_SOURCE_DIR}/include) 14 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different 15 | # cuda 16 | include_directories(/usr/local/cuda/include) 17 | link_directories(/usr/local/cuda/lib64) 18 | # tensorrt 19 | include_directories(/data/app/TensorRT-8.4.3.1/include) 20 | link_directories(/data/app/TensorRT-8.4.3.1/lib) 21 | 22 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED") 23 | 24 | find_package(OpenCV) 25 | include_directories(${OpenCV_INCLUDE_DIRS}) 26 | 27 | add_executable(detr ${PROJECT_SOURCE_DIR}/detr.cpp) 28 | target_link_libraries(detr nvinfer) 29 | target_link_libraries(detr cudart) 30 | target_link_libraries(detr ${OpenCV_LIBS}) 31 | 32 | add_definitions(-O2 -pthread) 33 | 34 | -------------------------------------------------------------------------------- /detr/macros.h: -------------------------------------------------------------------------------- 1 | #ifndef __MACROS_H 2 | #define __MACROS_H 3 | 4 | #include "NvInfer.h" 5 | 6 | #ifdef API_EXPORTS 7 | #if defined(_MSC_VER) 8 | #define API __declspec(dllexport) 9 | #else 10 | #define API __attribute__((visibility("default"))) 11 | #endif 12 | #else 13 | 14 | #if defined(_MSC_VER) 15 | #define API __declspec(dllimport)a 16 | #else 17 | #define API 18 | #endif 19 | #endif // API_EXPORTS 20 | 21 | #if NV_TENSORRT_MAJOR >= 8 22 | #define TRT_NOEXCEPT noexcept 23 | #define TRT_CONST_ENQUEUE const 24 | #else 25 | #define TRT_NOEXCEPT 26 | #define TRT_CONST_ENQUEUE 27 | #endif 28 | 29 | #endif // __MACROS_H 30 | -------------------------------------------------------------------------------- /docker/.env: -------------------------------------------------------------------------------- 1 | COMPOSE_PROJECT_NAME=tensorrtx 2 | HOME=$HOME 3 | EUID=$(id -u) 4 | 5 | ## (optional) a local mount point path 6 | DATA_DIR="" 7 | -------------------------------------------------------------------------------- /docker/tensorrtx-docker-compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | tensorrt: 3 | image: tensortx:1.0.0 4 | container_name: tensortx 5 | environment: 6 | - NVIDIA_VISIBLE_DEVICES=all 7 | build: 8 | dockerfile: x86_64.dockerfile 9 | cap_add: 10 | - CAP_SYS_ADMIN 11 | security_opt: 12 | - seccomp:unconfined 13 | privileged: true 14 | stdin_open: true 15 | tty: true 16 | shm_size: '8gb' 17 | ulimits: 18 | memlock: 19 | soft: -1 20 | hard: -1 21 | devices: 22 | - /dev:/dev:rw 23 | volumes: 24 | #### user #### 25 | - ${HOME}:/workspace/localhome:rw 26 | #### custom #### 27 | - mount:/mnt:rw 28 | deploy: 29 | restart_policy: 30 | condition: on-failure 31 | max_attempts: 1 32 | delay: 5s 33 | resources: 34 | reservations: 35 | devices: 36 | - driver: nvidia 37 | capabilities: [gpu] 38 | count: all 39 | 40 | 
volumes: 41 | mount: 42 | driver: local 43 | driver_opts: 44 | type: none 45 | o: bind 46 | device: ${DATA_DIR} 47 | -------------------------------------------------------------------------------- /docker/x86_64.dockerfile: -------------------------------------------------------------------------------- 1 | ARG TAG=24.01-py3 2 | 3 | FROM nvcr.io/nvidia/tensorrt:${TAG} AS tensorrtx 4 | 5 | ENV DEBIAN_FRONTEND noninteractive 6 | 7 | # basic tools 8 | RUN apt update && apt-get install -y --fix-missing --no-install-recommends \ 9 | sudo wget curl git ca-certificates ninja-build tzdata pkg-config \ 10 | gdb libglib2.0-dev libmount-dev \ 11 | && rm -rf /var/lib/apt/lists/* 12 | RUN pip install --no-cache-dir yapf isort cmake-format pre-commit 13 | 14 | ## override older cmake 15 | RUN find /usr/local/share -type d -name "cmake-*" -exec rm -rf {} + \ 16 | && curl -fsSL "https://github.com/Kitware/CMake/releases/download/v3.29.0/cmake-3.29.0-linux-x86_64.sh" \ 17 | -o cmake.sh && bash cmake.sh --skip-license --exclude-subdir --prefix=/usr/local && rm cmake.sh 18 | 19 | RUN apt update && apt-get install -y \ 20 | libopencv-dev \ 21 | && rm -rf /var/lib/apt/lists/* 22 | 23 | ## a template to build opencv and opencv_contrib from source 24 | # RUN git clone -b 4.x https://github.com/opencv/opencv_contrib.git \ 25 | # && git clone -b 4.x https://github.com/opencv/opencv.git opencv \ 26 | # && cmake -S opencv -B opencv/build -G Ninja \ 27 | # -DBUILD_LIST=core,calib3d,imgproc,imgcodecs,highgui \ 28 | # -DOPENCV_EXTRA_MODULES_PATH="/workspace/opencv_contrib/modules" \ 29 | # -DCMAKE_BUILD_TYPE=RELEASE \ 30 | # -DCMAKE_INSTALL_PREFIX=/usr/local \ 31 | # -DENABLE_FAST_MATH=ON \ 32 | # -DOPENCV_GENERATE_PKGCONFIG=ON \ 33 | # -DBUILD_opencv_python2=OFF \ 34 | # -DBUILD_opencv_python3=OFF \ 35 | # -DBUILD_JAVA=OFF \ 36 | # -DBUILD_DOCS=OFF \ 37 | # -DBUILD_PERF_TESTS=OFF \ 38 | # -DBUILD_TESTS=OFF \ 39 | # && ninja -C opencv/build install 40 | -------------------------------------------------------------------------------- /efficient_ad/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.12) 2 | project(EfficientAD-M) 3 | 4 | add_definitions(-w) 5 | add_definitions(-D API_EXPORTS) 6 | set(CMAKE_CXX_STANDARD 11) 7 | set(CMAKE_BUILD_TYPE "Debug") 8 | set(CMAKE_CUDA_ARCHITECTURES 61 75 86 89) 9 | set(THREADS_PREFER_PTHREAD_FLAG ON) 10 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /od") 11 | 12 | ### nvcc 13 | set(CMAKE_CUDA_COMPILER "D:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.8/bin/nvcc.exe") 14 | enable_language(CUDA) 15 | ### cuda 16 | include_directories("D:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.8/include") 17 | link_directories("D:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.8/lib/x64") 18 | ### tensorrt 19 | set(TRT_DIR "D:/Program Files/NVIDIA GPU Computing Toolkit/TensorRT-8.5.3.1/") 20 | include_directories(${TRT_DIR}/include) 21 | link_directories(${TRT_DIR}/lib) 22 | ### opencv 23 | set(OpenCV_DIR "E:/OpenCV/OpenCV_4.6.0/opencv/build") 24 | find_package(OpenCV) 25 | include_directories(${OpenCV_INCLUDE_DIRS}) 26 | ### dirent 27 | include_directories("E:/SDK/dirent-1.24/include") 28 | 29 | include_directories(${PROJECT_SOURCE_DIR}/src/) 30 | file(GLOB_RECURSE SRCS ${PROJECT_SOURCE_DIR}/src/*.cpp ${PROJECT_SOURCE_DIR}/src/*.cu) 31 | 32 | add_executable(efficientAD_det "./efficientAD_det.cpp" ${SRCS}) 33 | target_link_libraries(efficientAD_det nvinfer 34 | cudart 35 | nvinfer_plugin 36 | 
${OpenCV_LIBS} 37 | ) 38 | -------------------------------------------------------------------------------- /efficient_ad/README.md: -------------------------------------------------------------------------------- 1 | # EfficientAd 2 | 3 | EfficientAd: Accurate Visual Anomaly Detection at Millisecond-Level Latencies. 4 | 5 | The Pytorch implementation is [openvinotoolkit/anomalib](https://github.com/openvinotoolkit/anomalib). 6 | 7 |
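EfficientAD predicts a per-pixel anomaly map; the sample visualizes it (as in the result images below) by color-mapping the map and blending it over the input image, see `src/postprocess.h` further down. A minimal sketch of that overlay, assuming the anomaly map has already been normalized to an 8-bit single-channel image at the input resolution:

```cpp
// Illustrative overlay sketch: pseudo-color the single-channel anomaly map,
// then alpha-blend it onto the original image to form the heat map.
#include <opencv2/opencv.hpp>

cv::Mat overlayAnomaly(const cv::Mat& originImg, const cv::Mat& anomalyGray8U) {
    cv::Mat colorMap;
    cv::applyColorMap(anomalyGray8U, colorMap, cv::COLORMAP_JET);  // gray -> JET pseudo-color
    cv::Mat heatMap;
    cv::addWeighted(originImg, 0.5, colorMap, 0.5, 0.0, heatMap);  // 50/50 blend with the input
    return heatMap;
}
```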

8 | 9 |

10 | 11 | # Test Environment 12 | 13 | GTX3080 / Windows10 22H2 / cuda11.8 / cudnn8.9.7 / TensorRT8.5.3 / OpenCV4.6 14 | 15 | # How to Run 16 | 17 | 1. training to generate weight files (`efficientAD_[category].pt`) 18 | 19 | ``` 20 | // Please refer to Anomalib's tutorial for details: 21 | // https://github.com/openvinotoolkit/anomalib?tab=readme-ov-file#-training 22 | ``` 23 | 24 | 2. generate `.wts` from pytorch with `.pt` 25 | 26 | ``` 27 | cd ./datas/models/ 28 | // copy your `.pt` file to the current directory. 29 | python gen_wts.py 30 | // a file `efficientAD_[category].wts` will be generated. 31 | ``` 32 | 33 | 3. build and run 34 | 35 | ``` 36 | mkdir build 37 | cd build 38 | cmake .. 39 | make 40 | sudo ./EfficientAD-M -s [.wts] // serialize model to plan file 41 | sudo ./EfficientAD-M -d [.engine] [image folder] // deserialize and run inference, the images in [image folder] will be processed 42 | ``` 43 | 44 | # Latency 45 | 46 | average cost of doInference(in `efficientad_detect.cpp`) from second time with batch=1 under the windows environment above 47 | 48 | | | FP32 | 49 | | :-----------: | :--: | 50 | | EfficientAD-M | 12ms | 51 | -------------------------------------------------------------------------------- /efficient_ad/datas/models/gen_wts.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import struct 3 | import sys 4 | 5 | # Initialize 6 | pt_file = sys.argv[1] 7 | device = torch.device('cuda') 8 | # Load model 9 | model = torch.load(pt_file, map_location=torch.device('cpu'))['model'].float() # load to FP32 10 | model.to(device).eval() 11 | 12 | with open(pt_file.split('.')[0] + '.wts', 'w') as f: 13 | f.write('{}\n'.format(len(model.state_dict().keys()))) 14 | for k, v in model.state_dict().items(): 15 | vr = v.reshape(-1).cpu().numpy() 16 | f.write('{} {} '.format(k, len(vr))) 17 | for vv in vr: 18 | f.write(' ') 19 | f.write(struct.pack('>f', float(vv)).hex()) 20 | f.write('\n') 21 | -------------------------------------------------------------------------------- /efficient_ad/src/config.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | /* -------------------------------------------------------- 4 | * These configs are related to tensorrt model, if these are changed, 5 | * please re-compile and re-serialize the tensorrt model. 6 | * --------------------------------------------------------*/ 7 | 8 | // For INT8, you need prepare the calibration dataset, please refer to 9 | #define USE_FP32 // set USE_INT8 or USE_FP16 or USE_FP32 10 | 11 | // These are used to define input/output tensor names, 12 | // you can set them to whatever you want. 13 | const static char* kInputTensorName = "data"; 14 | const static char* kOutputTensorName = "prob"; 15 | 16 | constexpr static int kBatchSize = 1; 17 | 18 | // input width and height must by divisible by 32 19 | constexpr static int kInputH = 256; 20 | constexpr static int kInputW = 256; 21 | 22 | /* -------------------------------------------------------- 23 | * These configs are NOT related to tensorrt model, if these are changed, 24 | * please re-compile, but no need to re-serialize the tensorrt model. 
25 | * --------------------------------------------------------*/ 26 | 27 | // default GPU_id 28 | const static int kGpuId = 0; 29 | 30 | // If your image size is larger than 4096 * 3112, please increase this value 31 | const static int kMaxInputImageSize = 4096 * 3112; 32 | -------------------------------------------------------------------------------- /efficient_ad/src/cuda_utils.h: -------------------------------------------------------------------------------- 1 | #ifndef TRTX_CUDA_UTILS_H_ 2 | #define TRTX_CUDA_UTILS_H_ 3 | 4 | #include <cuda_runtime_api.h> 5 | 6 | #ifndef CUDA_CHECK 7 | #define CUDA_CHECK(callstr) \ 8 | { \ 9 | cudaError_t error_code = callstr; \ 10 | if (error_code != cudaSuccess) { \ 11 | std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \ 12 | assert(0); \ 13 | } \ 14 | } 15 | #endif // CUDA_CHECK 16 | 17 | #endif // TRTX_CUDA_UTILS_H_ 18 | -------------------------------------------------------------------------------- /efficient_ad/src/macros.h: -------------------------------------------------------------------------------- 1 | #ifndef __MACROS_H 2 | #define __MACROS_H 3 | 4 | #include <NvInfer.h> 5 | 6 | #ifdef API_EXPORTS 7 | #if defined(_MSC_VER) 8 | #define API __declspec(dllexport) 9 | #else 10 | #define API __attribute__((visibility("default"))) 11 | #endif 12 | #else 13 | 14 | #if defined(_MSC_VER) 15 | #define API __declspec(dllimport) 16 | #else 17 | #define API 18 | #endif 19 | #endif // API_EXPORTS 20 | 21 | #if NV_TENSORRT_MAJOR >= 8 22 | #define TRT_NOEXCEPT noexcept 23 | #define TRT_CONST_ENQUEUE const 24 | #else 25 | #define TRT_NOEXCEPT 26 | #define TRT_CONST_ENQUEUE 27 | #endif 28 | 29 | #endif // __MACROS_H 30 | -------------------------------------------------------------------------------- /efficient_ad/src/model.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <NvInfer.h> 4 | 5 | #include <string> 6 | 7 | nvinfer1::ICudaEngine* build_efficientAD_engine(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, 8 | nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, float& gd, 9 | float& gw, std::string& wts_name); 10 | -------------------------------------------------------------------------------- /efficient_ad/src/postprocess.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <opencv2/opencv.hpp> 4 | 5 | void genHeatMap(cv::Mat originImg, cv::Mat& anomalyGrayMap, cv::Mat& HeatMap) { 6 | cv::Mat colorMap; 7 | cv::applyColorMap(anomalyGrayMap, colorMap, cv::COLORMAP_JET); 8 | cv::addWeighted(originImg, 0.5, colorMap, 0.5, 0, HeatMap); 9 | } 10 | -------------------------------------------------------------------------------- /efficient_ad/src/utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <dirent.h> 4 | #include <cstring> 5 | #include <fstream> 6 | #include <iostream> 7 | #include <sstream> 8 | #include <string> 9 | #include <vector> 10 | 11 | static inline int read_files_in_dir(const char* p_dir_name, std::vector<std::string>& file_names) { 12 | DIR* p_dir = opendir(p_dir_name); 13 | if (p_dir == nullptr) { 14 | return -1; 15 | } 16 | 17 | struct dirent* p_file = nullptr; 18 | while ((p_file = readdir(p_dir)) != nullptr) { 19 | if (strcmp(p_file->d_name, ".") != 0 && strcmp(p_file->d_name, "..") != 0) { 20 | //std::string cur_file_name(p_dir_name); 21 | //cur_file_name += "/"; 22 | //cur_file_name += p_file->d_name; 23 | std::string cur_file_name(p_file->d_name); 24 | file_names.push_back(cur_file_name); 25 | } 26 | } 27 | 28 | closedir(p_dir); 29 |
return 0; 30 | } 31 | -------------------------------------------------------------------------------- /efficientnet/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.6) 2 | 3 | project(efficientnet) 4 | 5 | add_definitions(-std=c++11) 6 | 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 8 | set(CMAKE_CXX_STANDARD 11) 9 | set(CMAKE_BUILD_TYPE Debug) 10 | 11 | find_package(CUDA REQUIRED) 12 | 13 | include_directories(${PROJECT_SOURCE_DIR}/include) 14 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different 15 | # cuda 16 | include_directories(/usr/local/cuda/include) 17 | link_directories(/usr/local/cuda/lib64) 18 | # tensorrt 19 | include_directories(/usr/include/x86_64-linux-gnu/) 20 | link_directories(/usr/lib/x86_64-linux-gnu/) 21 | 22 | add_executable(efficientnet ${PROJECT_SOURCE_DIR}/efficientnet.cpp) 23 | target_link_libraries(efficientnet nvinfer) 24 | target_link_libraries(efficientnet cudart) 25 | 26 | add_definitions(-O2 -pthread) 27 | 28 | -------------------------------------------------------------------------------- /efficientnet/README.md: -------------------------------------------------------------------------------- 1 | # EfficientNet 2 | 3 | A TensorRT implementation of EfficientNet. 4 | For the Pytorch implementation, you can refer to [EfficientNet-PyTorch](https://github.com/lukemelas/EfficientNet-PyTorch) 5 | 6 | ## How to run 7 | 8 | 1. install `efficientnet_pytorch` 9 | ``` 10 | pip install efficientnet_pytorch 11 | ``` 12 | 13 | 2. gennerate `.wts` file 14 | ``` 15 | python gen_wts.py 16 | ``` 17 | 18 | 3. build 19 | 20 | ``` 21 | mkdir build 22 | cd build 23 | cmake .. 24 | make 25 | ``` 26 | 4. serialize model to engine 27 | ``` 28 | ./efficientnet -s [.wts] [.engine] [b0 b1 b2 b3 ... b7] // serialize model to engine file 29 | ``` 30 | such as 31 | ``` 32 | ./efficientnet -s ../efficientnet-b3.wts efficientnet-b3.engine b3 33 | ``` 34 | 5. deserialize and do infer 35 | ``` 36 | ./efficientnet -d [.engine] [b0 b1 b2 b3 ... b7] // deserialize engine file and run inference 37 | ``` 38 | such as 39 | ``` 40 | ./efficientnet -d efficientnet-b3.engine b3 41 | ``` 42 | 6. 
see if the output is same as pytorch side 43 | 44 | 45 | For more models, please refer to [tensorrtx](https://github.com/wang-xinyu/tensorrtx) 46 | -------------------------------------------------------------------------------- /efficientnet/gen_wts.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import struct 3 | from efficientnet_pytorch import EfficientNet 4 | model = EfficientNet.from_pretrained('efficientnet-b3') 5 | 6 | model.eval() 7 | f = open('efficientnet-b3.wts', 'w') 8 | f.write('{}\n'.format(len(model.state_dict().keys()))) 9 | for k, v in model.state_dict().items(): 10 | vr = v.reshape(-1).cpu().numpy() 11 | f.write('{} {} '.format(k, len(vr))) 12 | for vv in vr: 13 | f.write(' ') 14 | f.write(struct.pack('>f',float(vv)).hex()) 15 | f.write('\n') 16 | f.close() 17 | -------------------------------------------------------------------------------- /ghostnet/ghostnetv1/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.6) 2 | 3 | project(ghostnetv1) 4 | 5 | add_definitions(-std=c++11) 6 | 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 8 | set(CMAKE_CXX_STANDARD 11) 9 | set(CMAKE_BUILD_TYPE Debug) 10 | 11 | include_directories(${PROJECT_SOURCE_DIR}/include) 12 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different 13 | # cuda 14 | include_directories(/usr/local/cuda/include) 15 | link_directories(/usr/local/cuda/lib64) 16 | # tensorrt 17 | include_directories(/usr/include/x86_64-linux-gnu/) 18 | link_directories(/usr/lib/x86_64-linux-gnu/) 19 | 20 | add_executable(ghostnetv1 ${PROJECT_SOURCE_DIR}/ghostnetv1.cpp) 21 | target_link_libraries(ghostnetv1 nvinfer) 22 | target_link_libraries(ghostnetv1 cudart) 23 | 24 | add_definitions(-O2 -pthread) 25 | -------------------------------------------------------------------------------- /ghostnet/ghostnetv2/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.6) 2 | 3 | project(ghostnetv2) 4 | 5 | add_definitions(-std=c++11) 6 | 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 8 | set(CMAKE_CXX_STANDARD 11) 9 | set(CMAKE_BUILD_TYPE Debug) 10 | 11 | include_directories(${PROJECT_SOURCE_DIR}/include) 12 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different 13 | # cuda 14 | include_directories(/usr/local/cuda/include) 15 | link_directories(/usr/local/cuda/lib64) 16 | # tensorrt 17 | include_directories(/usr/include/x86_64-linux-gnu/) 18 | link_directories(/usr/lib/x86_64-linux-gnu/) 19 | 20 | add_executable(ghostnetv2 ${PROJECT_SOURCE_DIR}/ghostnetv2.cpp) 21 | target_link_libraries(ghostnetv2 nvinfer) 22 | target_link_libraries(ghostnetv2 cudart) 23 | 24 | add_definitions(-O2 -pthread) 25 | -------------------------------------------------------------------------------- /googlenet/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.6) 2 | 3 | project(googlenet) 4 | 5 | add_definitions(-std=c++11) 6 | 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 8 | set(CMAKE_CXX_STANDARD 11) 9 | set(CMAKE_BUILD_TYPE Debug) 10 | 11 | include_directories(${PROJECT_SOURCE_DIR}/include) 12 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different 13 | # cuda 14 | include_directories(/usr/local/cuda/include) 15 | link_directories(/usr/local/cuda/lib64) 16 | # 
tensorrt 17 | include_directories(/usr/include/x86_64-linux-gnu/) 18 | link_directories(/usr/lib/x86_64-linux-gnu/) 19 | 20 | add_executable(googlenet ${PROJECT_SOURCE_DIR}/googlenet.cpp) 21 | target_link_libraries(googlenet nvinfer) 22 | target_link_libraries(googlenet cudart) 23 | 24 | add_definitions(-O2 -pthread) 25 | 26 | -------------------------------------------------------------------------------- /googlenet/README.md: -------------------------------------------------------------------------------- 1 | # googlenet 2 | 3 | GoogLeNet (Inception v1) model architecture from "Going Deeper with Convolutions". 4 | 5 | For the details, you can refer to [pytorchx/googlenet](https://github.com/wang-xinyu/pytorchx/tree/master/googlenet) 6 | 7 | The following tricks are used in this googlenet: 8 | 9 | - MaxPool2d with ceil_mode=True, which is not supported in TensorRT 4; we use a padding layer before the maxpool to solve this problem. 10 | - Batchnorm layer, implemented by scale layer. 11 | 12 | ``` 13 | // 1. generate googlenet.wts from [pytorchx/googlenet](https://github.com/wang-xinyu/pytorchx/tree/master/googlenet) 14 | 15 | // 2. put googlenet.wts into tensorrtx/googlenet 16 | 17 | // 3. build and run 18 | 19 | cd tensorrtx/googlenet 20 | 21 | mkdir build 22 | 23 | cd build 24 | 25 | cmake .. 26 | 27 | make 28 | 29 | sudo ./googlenet -s // serialize model to plan file i.e. 'googlenet.engine' 30 | 31 | sudo ./googlenet -d // deserialize plan file and run inference 32 | 33 | // 4. see if the output is same as pytorchx/googlenet 34 | ``` 35 | 36 | 37 | -------------------------------------------------------------------------------- /hrnet/hrnet-image-classification/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.6) 2 | 3 | project(hrnet) 4 | 5 | add_definitions(-std=c++11) 6 | 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 8 | set(CMAKE_CXX_STANDARD 11) 9 | set(CMAKE_BUILD_TYPE Debug) 10 | 11 | find_package(CUDA REQUIRED) 12 | include_directories(/usr/local/cuda/include) 13 | link_directories(/usr/local/cuda/lib64) 14 | 15 | find_package(OpenCV) 16 | include_directories(${OpenCV_INCLUDE_DIRS}) 17 | 18 | add_executable(hrnet ${PROJECT_SOURCE_DIR}/hrnet.cpp) 19 | target_link_libraries(hrnet nvinfer) 20 | target_link_libraries(hrnet cudart) 21 | target_link_libraries(hrnet ${OpenCV_LIBS}) 22 | 23 | add_definitions(-O2 -pthread) 24 | 25 | -------------------------------------------------------------------------------- /hrnet/hrnet-image-classification/README.md: -------------------------------------------------------------------------------- 1 | # HRNet 2 | 3 | The Pytorch implementation is [HRNet-Image-Classification](https://github.com/HRNet/HRNet-Image-Classification). The implemented model is **HRNet-W18-C-Small-v2**. 4 | 5 | 6 | ## How to Run 7 | 8 | * 1. generate .wts 9 | 10 | Download the code and model from [HRNet-Image-Classification](https://github.com/HRNet/HRNet-Image-Classification) and configure your environment. 11 | 12 | Put `demo.py` in the `YOUR_ROOT_DIR\HRNet-Image-Classification\tools` folder, set `savewts` in `main()` to `True`, and run it; the .wts file will be generated. 13 | 14 | * 2. cmake and make 15 | 16 | ``` 17 | mkdir build 18 | cd build 19 | cmake .. 20 | make 21 | sudo ./hrnet -s // serialize model to plan file i.e. 'hrnet.engine' 22 | sudo ./hrnet -d ../samples // deserialize plan file and run inference, the images in samples will be processed.
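The `-s` / `-d` options used throughout these READMEs map to a small amount of TensorRT C++ API code. As a rough sketch of the deserialize-and-infer side (function name, file handling and the missing error checks are illustrative here, not the exact code of any one sample):

```cpp
// Minimal sketch: load a serialized engine ("plan") file and create an execution context.
#include <fstream>
#include <iterator>
#include <string>
#include <vector>
#include "NvInfer.h"

nvinfer1::IExecutionContext* loadEngine(const std::string& planFile, nvinfer1::ILogger& logger) {
    std::ifstream file(planFile, std::ios::binary);
    std::vector<char> blob((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
    nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(logger);
    nvinfer1::ICudaEngine* engine = runtime->deserializeCudaEngine(blob.data(), blob.size());
    return engine->createExecutionContext();  // then copy inputs to GPU, enqueue, copy outputs back
}
```

Ownership and destroy() calls are omitted above; each sample wraps this pattern in its own `-d` code path.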
23 | ``` 24 | 25 | ## Result 26 | 27 | The test img: 28 | 29 | ![](https://user-images.githubusercontent.com/20653176/93732833-ac103200-fc05-11ea-88ff-6f59f316a377.JPEG) 30 | 31 | Pytorch Result: 32 | 33 | ![image-20200921115119593](https://user-images.githubusercontent.com/20653176/93731787-225e6580-fc01-11ea-9578-393079cd1873.png) 34 | 35 | TRT Result: 36 | 37 | ![image-20200921114959069](https://user-images.githubusercontent.com/20653176/93731788-238f9280-fc01-11ea-954f-2debc20e102a.png) 38 | -------------------------------------------------------------------------------- /hrnet/hrnet-image-classification/hrnet.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wang-xinyu/tensorrtx/c6bcebd812907a6d9eb66ecc909eb17b51d40fac/hrnet/hrnet-image-classification/hrnet.cpp -------------------------------------------------------------------------------- /hrnet/hrnet-semantic-segmentation/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.6) 2 | 3 | project(hrnetseg) 4 | 5 | add_definitions(-std=c++11) 6 | 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 8 | set(CMAKE_CXX_STANDARD 11) 9 | set(CMAKE_BUILD_TYPE Debug) 10 | 11 | find_package(CUDA REQUIRED) 12 | 13 | include_directories(${PROJECT_SOURCE_DIR}/include) 14 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different 15 | # cuda 16 | include_directories(/usr/local/cuda/include) 17 | link_directories(/usr/local/cuda/lib64) 18 | # tensorrt 19 | include_directories(/usr/include/x86_64-linux-gnu/) 20 | link_directories(/usr/lib/x86_64-linux-gnu/) 21 | 22 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED") 23 | 24 | find_package(OpenCV) 25 | include_directories(${OpenCV_INCLUDE_DIRS}) 26 | 27 | add_executable(hrnet ${PROJECT_SOURCE_DIR}/hrnet.cpp) 28 | target_link_libraries(hrnet nvinfer) 29 | target_link_libraries(hrnet cudart) 30 | target_link_libraries(hrnet ${OpenCV_LIBS}) 31 | 32 | 33 | add_executable(hrnet_ocr ${PROJECT_SOURCE_DIR}/hrnet_ocr.cpp) 34 | target_link_libraries(hrnet_ocr nvinfer) 35 | target_link_libraries(hrnet_ocr cudart) 36 | target_link_libraries(hrnet_ocr ${OpenCV_LIBS}) 37 | 38 | 39 | add_definitions(-O2 -pthread) 40 | 41 | -------------------------------------------------------------------------------- /ibnnet/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.6) 2 | 3 | project(IBNNet) 4 | 5 | add_definitions(-std=c++11) 6 | 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 8 | set(CMAKE_CXX_STANDARD 11) 9 | set(CMAKE_BUILD_TYPE Debug) 10 | 11 | find_package(CUDA REQUIRED) 12 | 13 | include_directories(${PROJECT_SOURCE_DIR}/include) 14 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different 15 | # cuda 16 | include_directories(/usr/local/cuda/include) 17 | link_directories(/usr/local/cuda/lib64) 18 | # tensorrt 19 | include_directories(/usr/include/x86_64-linux-gnu/) 20 | link_directories(/usr/lib/x86_64-linux-gnu/) 21 | 22 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -pthread -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED") 23 | 24 | find_package(OpenCV) 25 | include_directories(${OpenCV_INCLUDE_DIRS}) 26 | 27 | file(GLOB SOURCE_FILES "*.h" "*.cpp") 28 | 29 | add_executable(ibnnet ${SOURCE_FILES}) 30 | target_link_libraries(ibnnet nvinfer) 31 | 
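Several READMEs in this repo (googlenet above, inception, mnasnet, mobilenet) note that the batchnorm layer is "implemented by scale layer", and helpers such as `addBatchNorm2d` are declared in the ibnnet and psenet `layers.h` files further down. A hedged sketch of how such a helper folds BatchNorm parameters into a TensorRT `IScaleLayer` (the PyTorch-style weight key names and the memory handling are illustrative, not the repo's exact code):

```cpp
// BatchNorm2d as an IScaleLayer: per-channel y = (x * scale + shift) ^ power.
// Assumes: using namespace nvinfer1; weightMap was filled by loadWeights().
#include <cmath>
#include <map>
#include <string>

IScaleLayer* addBatchNorm2d(INetworkDefinition* network, std::map<std::string, Weights>& weightMap,
                            ITensor& input, const std::string& lname, float eps) {
    const float* gamma = static_cast<const float*>(weightMap[lname + ".weight"].values);
    const float* beta  = static_cast<const float*>(weightMap[lname + ".bias"].values);
    const float* mean  = static_cast<const float*>(weightMap[lname + ".running_mean"].values);
    const float* var   = static_cast<const float*>(weightMap[lname + ".running_var"].values);
    int64_t len = weightMap[lname + ".running_var"].count;

    float* scval = new float[len];
    float* shval = new float[len];
    float* pval  = new float[len];
    for (int64_t i = 0; i < len; i++) {
        scval[i] = gamma[i] / sqrtf(var[i] + eps);   // scale = gamma / sqrt(var + eps)
        shval[i] = beta[i] - mean[i] * scval[i];     // shift = beta - mean * scale
        pval[i] = 1.0f;                              // power = 1
    }
    Weights scale{DataType::kFLOAT, scval, len};
    Weights shift{DataType::kFLOAT, shval, len};
    Weights power{DataType::kFLOAT, pval, len};
    return network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
}
```

The real helpers usually stash the allocated buffers in the weight map so they stay alive until the engine is built; that bookkeeping is left out here.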
target_link_libraries(ibnnet cudart) 32 | target_link_libraries(ibnnet ${OpenCV_LIBS}) 33 | 34 | add_definitions(-O2 -pthread) 35 | 36 | -------------------------------------------------------------------------------- /ibnnet/README.md: -------------------------------------------------------------------------------- 1 | # IBN-Net 2 | 3 | An implementation of IBN-Net, proposed in ["Two at Once: Enhancing Learning and Generalization Capacities via IBN-Net"](https://arxiv.org/abs/1807.09441), ECCV2018 by Xingang Pan, Ping Luo, Jianping Shi, Xiaoou Tang. 4 | 5 | For the Pytorch implementation, you can refer to [IBN-Net](https://github.com/XingangPan/IBN-Net) 6 | 7 | ## Features 8 | - InstanceNorm2d 9 | - bottleneck_ibn 10 | - Resnet50-IBNA 11 | - Resnet50-IBNB 12 | - Multi-thread inference 13 | 14 | ## How to Run 15 | 16 | * 1. generate .wts 17 | 18 | // for ibn-a 19 | ``` 20 | python gen_wts.py a 21 | ``` 22 | a file 'resnet50-ibna.wts' will be generated. 23 | 24 | // for ibn-b 25 | ``` 26 | python gen_wts.py b 27 | ``` 28 | a file 'resnet50-ibnb.wts' will be generated. 29 | * 2. cmake and make 30 | 31 | ``` 32 | mkdir build 33 | cd build 34 | cmake .. 35 | make 36 | ``` 37 | * 3. build engine and run classification 38 | 39 | // put resnet50-ibna.wts/resnet50-ibnb.wts into tensorrtx/ibnnet 40 | 41 | // go to tensorrtx/ibnnet 42 | ``` 43 | ./ibnnet -s // serialize model to plan file 44 | ./ibnnet -d // deserialize plan file and run inference 45 | ``` 46 | -------------------------------------------------------------------------------- /ibnnet/gen_wts.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os 3 | import sys 4 | import struct 5 | 6 | 7 | assert sys.argv[1] == "a" or sys.argv[1] == "b" 8 | model_name = "resnet50_ibn_" + sys.argv[1] 9 | 10 | net = torch.hub.load('XingangPan/IBN-Net', model_name, pretrained=True).to('cuda:0').eval() 11 | 12 | #verify 13 | #input = torch.ones(1, 3, 224, 224).to('cuda:0') 14 | #pixel_mean = torch.tensor([0.485, 0.456, 0.406]).view(1, -1, 1, 1).to('cuda:0') 15 | #pixel_std = torch.tensor([0.229, 0.224, 0.225]).view(1, -1, 1, 1).to('cuda:0') 16 | #input.sub_(pixel_mean).div_(pixel_std) 17 | #out = net(input) 18 | #print(out) 19 | 20 | f = open(model_name + ".wts", 'w') 21 | f.write("{}\n".format(len(net.state_dict().keys()))) 22 | for k,v in net.state_dict().items(): 23 | vr = v.reshape(-1).cpu().numpy() 24 | f.write("{} {}".format(k, len(vr))) 25 | for vv in vr: 26 | f.write(" ") 27 | f.write(struct.pack(">f", float(vv)).hex()) 28 | f.write("\n") 29 | 30 | 31 | -------------------------------------------------------------------------------- /ibnnet/holder.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | template 4 | class TensorRTHolder { 5 | T* holder; 6 | public: 7 | explicit TensorRTHolder(T* holder_) : holder(holder_) {} 8 | ~TensorRTHolder() { 9 | if (holder) 10 | holder->destroy(); 11 | } 12 | TensorRTHolder(const TensorRTHolder&) = delete; 13 | TensorRTHolder& operator=(const TensorRTHolder&) = delete; 14 | TensorRTHolder(TensorRTHolder && rhs) noexcept{ 15 | holder = rhs.holder; 16 | rhs.holder = nullptr; 17 | } 18 | TensorRTHolder& operator=(TensorRTHolder&& rhs) noexcept { 19 | if (this == &rhs) { 20 | return *this; 21 | } 22 | if (holder) holder->destroy(); 23 | holder = rhs.holder; 24 | rhs.holder = nullptr; 25 | return *this; 26 | } 27 | T* operator->() { 28 | return holder; 29 | } 30 | T* get() { return holder; } 31 | 
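// Usage sketch for this RAII holder (illustrative, not part of the original header):
//   auto builder = make_holder(createInferBuilder(gLogger));
//   auto network = make_holder(builder->createNetworkV2(0U));
// destroy() is then called automatically when each holder goes out of scope.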
explicit operator bool() { return holder != nullptr; } 32 | T& operator*() noexcept { return *holder; } 33 | }; 34 | 35 | template 36 | TensorRTHolder make_holder(T* holder) { 37 | return TensorRTHolder(holder); 38 | } 39 | 40 | template 41 | using TensorRTNonHolder = T*; -------------------------------------------------------------------------------- /ibnnet/ibnnet.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "utils.h" 4 | #include "holder.h" 5 | #include "layers.h" 6 | #include "InferenceEngine.h" 7 | #include 8 | #include 9 | #include 10 | #include 11 | extern Logger gLogger; 12 | using namespace trtxapi; 13 | 14 | namespace trt { 15 | 16 | enum IBN { 17 | A, // resnet50-ibna, 18 | B, // resnet50-ibnb, 19 | NONE // resnet50 20 | }; 21 | 22 | class IBNNet { 23 | public: 24 | IBNNet(trt::EngineConfig &enginecfg, const IBN ibn); 25 | ~IBNNet() {}; 26 | 27 | bool serializeEngine(); /* create & serializeEngine */ 28 | bool deserializeEngine(); 29 | bool inference(std::vector &input); /* support batch inference */ 30 | 31 | float* getOutput(); 32 | int getDeviceID(); /* cuda deviceid */ 33 | 34 | private: 35 | ICudaEngine *createEngine(IBuilder *builder, IBuilderConfig *config); 36 | void preprocessing(const cv::Mat& img, float* const data, const std::size_t stride); 37 | 38 | private: 39 | trt::EngineConfig _engineCfg; 40 | std::unique_ptr _inferEngine{nullptr}; 41 | std::string _ibn; 42 | DataType _dt{DataType::kFLOAT}; 43 | }; 44 | 45 | } -------------------------------------------------------------------------------- /ibnnet/layers.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include "NvInfer.h" 7 | #include "cuda_runtime_api.h" 8 | using namespace nvinfer1; 9 | 10 | namespace trtxapi { 11 | 12 | ITensor* MeanStd(INetworkDefinition *network, 13 | std::map& weightMap, 14 | ITensor* input, 15 | const std::string lname, 16 | const float* mean, 17 | const float* std, 18 | const bool div255); 19 | 20 | IScaleLayer* addBatchNorm2d(INetworkDefinition *network, 21 | std::map& weightMap, 22 | ITensor& input, 23 | const std::string lname, 24 | const float eps); 25 | 26 | IScaleLayer* addInstanceNorm2d(INetworkDefinition *network, 27 | std::map& weightMap, 28 | ITensor& input, 29 | const std::string lname, 30 | const float eps); 31 | 32 | IConcatenationLayer* addIBN(INetworkDefinition *network, 33 | std::map& weightMap, 34 | ITensor& input, 35 | const std::string lname); 36 | 37 | IActivationLayer* bottleneck_ibn(INetworkDefinition *network, 38 | std::map& weightMap, 39 | ITensor& input, 40 | const int inch, 41 | const int outch, 42 | const int stride, 43 | const std::string lname, 44 | const std::string ibn); 45 | 46 | } -------------------------------------------------------------------------------- /ibnnet/utils.cpp: -------------------------------------------------------------------------------- 1 | #include "utils.h" 2 | 3 | // Load weights from files shared with TensorRT samples. 
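// These .wts files are the ones written by the gen_wts.py scripts in this repo:
// the first line is the number of tensors, then one line per tensor of the form
//   <name> <count> <hex> <hex> ...
// e.g. (illustrative) "conv1.bias 2 3f800000 40000000", where each hex word is a
// big-endian IEEE-754 float32.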
4 | // TensorRT weight files have a simple space delimited format: 5 | // [type] [size] 6 | std::map loadWeights(const std::string file) { 7 | std::cout << "Loading weights: " << file << std::endl; 8 | std::map weightMap; 9 | 10 | // Open weights file 11 | std::ifstream input(file); 12 | assert(input.is_open() && "Unable to load weight file."); 13 | 14 | // Read number of weight blobs 15 | int32_t count; 16 | input >> count; 17 | assert(count > 0 && "Invalid weight map file."); 18 | 19 | while (count--) { 20 | Weights wt{DataType::kFLOAT, nullptr, 0}; 21 | uint32_t size; 22 | 23 | // Read name and type of blob 24 | std::string name; 25 | input >> name >> std::dec >> size; 26 | wt.type = DataType::kFLOAT; 27 | 28 | // Load blob 29 | uint32_t* val = reinterpret_cast(malloc(sizeof(val) * size)); 30 | for (uint32_t x = 0, y = size; x < y; ++x) { 31 | input >> std::hex >> val[x]; 32 | } 33 | wt.values = val; 34 | wt.count = size; 35 | weightMap[name] = wt; 36 | } 37 | 38 | return weightMap; 39 | } 40 | -------------------------------------------------------------------------------- /ibnnet/utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include "NvInfer.h" 5 | #include "cuda_runtime_api.h" 6 | #include "assert.h" 7 | #include 8 | #include 9 | #include 10 | 11 | using namespace nvinfer1; 12 | 13 | #define CHECK(status) \ 14 | do \ 15 | { \ 16 | auto ret = (status); \ 17 | if (ret != 0) \ 18 | { \ 19 | std::cout << "Cuda failure: " << ret; \ 20 | abort(); \ 21 | } \ 22 | } while (0) 23 | 24 | template 25 | std::unique_ptr make_unique(Args&&... args) { 26 | return std::unique_ptr(new T(std::forward(args)...)); 27 | } 28 | 29 | std::map loadWeights(const std::string file); 30 | 31 | -------------------------------------------------------------------------------- /inception/inceptionv3/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.6) 2 | 3 | project(inception) 4 | 5 | add_definitions(-std=c++11) 6 | 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 8 | set(CMAKE_CXX_STANDARD 11) 9 | set(CMAKE_BUILD_TYPE Debug) 10 | 11 | include_directories(${PROJECT_SOURCE_DIR}/include) 12 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different 13 | # cuda 14 | include_directories(/usr/local/cuda/include) 15 | link_directories(/usr/local/cuda/lib64) 16 | # tensorrt 17 | include_directories(/usr/include/x86_64-linux-gnu/) 18 | link_directories(/usr/lib/x86_64-linux-gnu/) 19 | 20 | add_executable(inception ${PROJECT_SOURCE_DIR}/inception_v3.cpp) 21 | target_link_libraries(inception nvinfer) 22 | target_link_libraries(inception cudart) 23 | 24 | add_definitions(-O2 -pthread) 25 | 26 | -------------------------------------------------------------------------------- /inception/inceptionv3/README.md: -------------------------------------------------------------------------------- 1 | # Inception v3 2 | 3 | Inception v3 model architecture from "Rethinking the Inception Architecture for Computer Vision" . 4 | 5 | For the details, you can refer to [pytorchx/inception](https://github.com/wang-xinyu/pytorchx/tree/master/inception) 6 | 7 | Following tricks are used in this inception: 8 | 9 | - For pooling layer with padding, we need pay attention to see if padding is included or excluded while calculating average number. Pytorch includes padding while doing avgPool by default, but Tensorrt doesn't. 
So for pooling layer with padding, we need `setAverageCountExcludesPadding(false)` in tensorrt. 10 | - Batchnorm layer, implemented by scale layer. 11 | 12 | ``` 13 | // 1. generate inception.wts from [pytorchx/inception](https://github.com/wang-xinyu/pytorchx/tree/master/inception) 14 | 15 | // 2. put inception.wts into tensorrtx/inception 16 | 17 | // 3. build and run 18 | 19 | cd tensorrtx/inception 20 | 21 | mkdir build 22 | 23 | cd build 24 | 25 | cmake .. 26 | 27 | make 28 | 29 | sudo ./inception -s // serialize model to plan file i.e. 'inception.engine' 30 | 31 | sudo ./inception -d // deserialize plan file and run inference 32 | 33 | // 4. see if the output is same as pytorchx/inception 34 | ``` 35 | 36 | 37 | -------------------------------------------------------------------------------- /inception/inceptionv4/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.6) 2 | 3 | project(InceptionV4) 4 | 5 | add_definitions(-std=c++11) 6 | 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 8 | set(CMAKE_CXX_STANDARD 11) 9 | set(CMAKE_BUILD_TYPE Debug) 10 | 11 | find_package(CUDA REQUIRED) 12 | 13 | include_directories(${PROJECT_SOURCE_DIR}/include) 14 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different 15 | # cuda 16 | include_directories(/usr/local/cuda/include) 17 | link_directories(/usr/local/cuda/lib64) 18 | # tensorrt 19 | include_directories(/usr/include/x86_64-linux-gnu/) 20 | link_directories(/usr/lib/x86_64-linux-gnu/) 21 | 22 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -pthread -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED") 23 | 24 | find_package(OpenCV) 25 | include_directories(${OpenCV_INCLUDE_DIRS}) 26 | 27 | file(GLOB SOURCE_FILES "*.h" "*.cpp") 28 | 29 | add_executable(inceptionv4 ${SOURCE_FILES}) 30 | target_link_libraries(inceptionv4 nvinfer) 31 | target_link_libraries(inceptionv4 cudart) 32 | target_link_libraries(inceptionv4 ${OpenCV_LIBS}) 33 | 34 | add_definitions(-O2 -pthread) 35 | 36 | -------------------------------------------------------------------------------- /inception/inceptionv4/README.md: -------------------------------------------------------------------------------- 1 | # Inception v4 2 | 3 | Inception v4 model architecture from "Inception-v4, Inception-ResNet and the Impact of Residual Connections on Learning" . 4 | 5 | For the details, you can refer to [rwightman/pytorch-image-models](https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/inception_v4.py) 6 | 7 | Following tricks are used in this inception: 8 | 9 | - For pooling layer with padding, we need pay attention to see if padding is included or excluded while calculating average number. Pytorch includes padding while doing avgPool by default, but Tensorrt doesn't. So for pooling layer with padding, we need `setAverageCountExcludesPadding(false)` in tensorrt. 10 | - Batchnorm layer, implemented by scale layer. 11 | 12 | ``` 13 | // 1. generate inception.wts from [BlueMirrors/torchtrtz](https://github.com/BlueMirrors/torchtrtz/blob/main/generate_weights.py) 14 | 15 | // 2. put inception.wts into tensorrtx/inceptionV4 16 | 17 | // 3. build and run 18 | 19 | cd tensorrtx/inception/inceptionV4 20 | 21 | mkdir build 22 | 23 | cd build 24 | 25 | cmake .. 26 | 27 | make 28 | 29 | sudo ./inceptionV4 -s // serialize model to plan file i.e. 'inceptionV4.engine' 30 | 31 | sudo ./inceptionV4 -d // deserialize plan file and run inference 32 | 33 | // 4. 
see if the output is same as rwightman/pytorch-image-models/inceptionv4 34 | ``` 35 | 36 | 37 | -------------------------------------------------------------------------------- /inception/inceptionv4/utils.cpp: -------------------------------------------------------------------------------- 1 | # include "utils.h" 2 | 3 | 4 | // Load weights from files. 5 | // TensorRT weight files have a simple space delimited format: 6 | // [type] [size] 7 | std::map loadWeights(const std::string file) { 8 | std::cout << "Loading weights: " << file << std::endl; 9 | std::map weightMap; 10 | 11 | // Open weights file 12 | std::ifstream input(file); 13 | assert(input.is_open() && "Unable to load weight file."); 14 | 15 | // Read number of weight blobs 16 | int32_t count; 17 | input >> count; 18 | assert(count > 0 && "Invalid weight map file."); 19 | 20 | while (count--) 21 | { 22 | Weights wt{DataType::kFLOAT, nullptr, 0}; 23 | uint32_t size; 24 | 25 | // Read name and type of blob 26 | std::string name; 27 | input >> name >> std::dec >> size; 28 | wt.type = DataType::kFLOAT; 29 | 30 | // Load blob 31 | uint32_t* val = reinterpret_cast(malloc(sizeof(val) * size)); 32 | for (uint32_t x = 0, y = size; x < y; ++x) 33 | { 34 | input >> std::hex >> val[x]; 35 | } 36 | wt.values = val; 37 | 38 | wt.count = size; 39 | weightMap[name] = wt; 40 | } 41 | 42 | return weightMap; 43 | } -------------------------------------------------------------------------------- /inception/inceptionv4/utils.h: -------------------------------------------------------------------------------- 1 | # ifndef TRTX_UTILS_H 2 | # define TRTX_UTILS_H 3 | 4 | #include 5 | #include "NvInfer.h" 6 | #include "cuda_runtime_api.h" 7 | #include "assert.h" 8 | #include 9 | #include 10 | #include 11 | 12 | #ifndef CUDA_CHECK 13 | #define CUDA_CHECK(callstr)\ 14 | {\ 15 | cudaError_t error_code = callstr;\ 16 | if (error_code != cudaSuccess) {\ 17 | std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__;\ 18 | assert(0);\ 19 | }\ 20 | } 21 | #endif // CUDA_CHECK 22 | 23 | using namespace nvinfer1; 24 | 25 | std::map loadWeights(const std::string input); 26 | 27 | #endif // TRTX_UTILS_H -------------------------------------------------------------------------------- /lenet/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.6) 2 | 3 | project(lenet) 4 | 5 | add_definitions(-std=c++11) 6 | 7 | set(TARGET_NAME "lenet") 8 | 9 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 10 | set(CMAKE_CXX_STANDARD 11) 11 | set(CMAKE_BUILD_TYPE Debug) 12 | 13 | include_directories(${PROJECT_SOURCE_DIR}/include) 14 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different 15 | # cuda 16 | include_directories(/usr/local/cuda/include) 17 | link_directories(/usr/local/cuda/lib64) 18 | # tensorrt 19 | include_directories(/usr/include/x86_64-linux-gnu/) 20 | link_directories(/usr/lib/x86_64-linux-gnu/) 21 | 22 | FILE(GLOB SRC_FILES ${PROJECT_SOURCE_DIR}/lenet.cpp ${PROJECT_SOURCE_DIR}/include/*.h) 23 | 24 | add_executable(${TARGET_NAME} ${SRC_FILES}) 25 | target_link_libraries(${TARGET_NAME} nvinfer) 26 | target_link_libraries(${TARGET_NAME} cudart) 27 | 28 | add_definitions(-O2 -pthread) 29 | 30 | -------------------------------------------------------------------------------- /lenet/macros.h: -------------------------------------------------------------------------------- 1 | #ifndef __MACROS_H 2 | #define __MACROS_H 3 | 4 | #if 
NV_TENSORRT_MAJOR >= 8 5 | #define TRT_NOEXCEPT noexcept 6 | #define TRT_CONST_ENQUEUE const 7 | #else 8 | #define TRT_NOEXCEPT 9 | #define TRT_CONST_ENQUEUE 10 | #endif 11 | 12 | #endif // __MACROS_H 13 | -------------------------------------------------------------------------------- /lprnet/1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wang-xinyu/tensorrtx/c6bcebd812907a6d9eb66ecc909eb17b51d40fac/lprnet/1.jpg -------------------------------------------------------------------------------- /lprnet/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.6) 2 | 3 | project(LPRnet) 4 | 5 | add_definitions(-std=c++11) 6 | 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 8 | set(CMAKE_CXX_STANDARD 11) 9 | set(CMAKE_BUILD_TYPE Debug) 10 | 11 | find_package(CUDA REQUIRED) 12 | 13 | include_directories(${PROJECT_SOURCE_DIR}/include) 14 | if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") 15 | message("embed_platform on") 16 | include_directories(/usr/local/cuda/targets/aarch64-linux/include) 17 | link_directories(/usr/local/cuda/targets/aarch64-linux/lib) 18 | else() 19 | message("embed_platform off") 20 | include_directories(/usr/local/cuda/include) 21 | link_directories(/usr/local/cuda/lib64) 22 | # tensorrt 23 | include_directories(/usr/local/TensorRT-7.0.0.11/include) 24 | link_directories(/usr/local/TensorRT-7.0.0.11/lib) 25 | endif() 26 | 27 | find_package(OpenCV) 28 | include_directories(${OpenCV_INCLUDE_DIRS}) 29 | 30 | add_executable(LPRnet ${PROJECT_SOURCE_DIR}/LPRnet.cpp) 31 | target_link_libraries(LPRnet nvinfer) 32 | target_link_libraries(LPRnet cudart) 33 | target_link_libraries(LPRnet ${OpenCV_LIBS}) 34 | 35 | add_definitions(-O2 -pthread) -------------------------------------------------------------------------------- /lprnet/README.md: -------------------------------------------------------------------------------- 1 | # LPRNet 2 | 3 | The Pytorch implementation is [xuexingyu24/License_Plate_Detection_Pytorch](https://github.com/xuexingyu24/License_Plate_Detection_Pytorch). 4 | 5 | ## How to Run 6 | 7 | 1. generate LPRnet.wts from pytorch 8 | 9 | ``` 10 | git clone https://github.com/wang-xinyu/tensorrtx.git 11 | git clone https://github.com/xuexingyu24/License_Plate_Detection_Pytorch.git 12 | 13 | // copy tensorrtx/LPRnet/genwts.py to License_Plate_Detection_Pytorch/ 14 | // go to License_Plate_Detection_Pytorch/ 15 | python genwts.py 16 | // a file 'LPRnet.wts' will be generated. 17 | ``` 18 | 19 | 2. build LPRnet and run 20 | 21 | ``` 22 | // put LPRnet.wts into tensorrtx/LPRnet 23 | // go to tensorrtx/LPRnet 24 | mkdir build 25 | cd build 26 | cmake .. 27 | make 28 | sudo ./LPRnet -s // serialize model to file i.e. 
'LPRnet.engine' 29 | sudo ./LPRnet -d // deserialize model and run inference 30 | ``` 31 | 32 | -------------------------------------------------------------------------------- /lprnet/genwts.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Variable 3 | 4 | from LPRNet.model import LPRNET 5 | import struct 6 | 7 | model_path = './weights/Final_LPRNet_model.pth' 8 | CHARS = ['京', '沪', '津', '渝', '冀', '晋', '蒙', '辽', '吉', '黑', 9 | '苏', '浙', '皖', '闽', '赣', '鲁', '豫', '鄂', '湘', '粤', 10 | '桂', '琼', '川', '贵', '云', '藏', '陕', '甘', '青', '宁', 11 | '新', 12 | '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 13 | 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'J', 'K', 14 | 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 15 | 'W', 'X', 'Y', 'Z', 'I', 'O', '-' 16 | ] 17 | model = LPRNET.LPRNet(class_num=len(CHARS), dropout_rate=0) 18 | if torch.cuda.is_available(): 19 | model = model.cuda() 20 | print('loading pretrained model from %s' % model_path) 21 | model.load_state_dict(torch.load(model_path)) 22 | 23 | image = torch.ones(1, 3, 24, 94) 24 | if torch.cuda.is_available(): 25 | image = image.cuda() 26 | 27 | model.eval() 28 | print(model) 29 | print('image shape ', image.shape) 30 | preds = model(image) 31 | 32 | f = open("LPRNet.wts", 'w') 33 | f.write("{}\n".format(len(model.state_dict().keys()))) 34 | for k, v in model.state_dict().items(): 35 | print('key: ', k) 36 | print('value: ', v.shape) 37 | vr = v.reshape(-1).cpu().numpy() 38 | f.write("{} {}".format(k, len(vr))) 39 | for vv in vr: 40 | f.write(" ") 41 | f.write(struct.pack(">f", float(vv)).hex()) 42 | f.write("\n") 43 | -------------------------------------------------------------------------------- /mlp/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.14) # change the version, if asked by compiler 2 | project(mlp) 3 | 4 | set(CMAKE_CXX_STANDARD 14) 5 | 6 | # include and link dirs of tensorrt, you need adapt them if yours are different 7 | include_directories(/usr/include/x86_64-linux-gnu/) 8 | link_directories(/usr/lib/x86_64-linux-gnu/) 9 | 10 | # include and link dirs of cuda for inference 11 | include_directories(/usr/local/cuda/include) 12 | link_directories(/usr/local/cuda/lib64) 13 | 14 | # create link for executable files 15 | add_executable(mlp mlp.cpp) 16 | 17 | # perform linking with nvinfer libraries 18 | target_link_libraries(mlp nvinfer) 19 | 20 | # link with cuda libraries for Inference 21 | target_link_libraries(mlp cudart) 22 | 23 | add_definitions(-O2 -pthread) 24 | 25 | -------------------------------------------------------------------------------- /mlp/README.md: -------------------------------------------------------------------------------- 1 | # MLP 2 | 3 | MLP is the most basic net in this tensorrtx project for starters. You can learn the basic procedures of building 4 | TensorRT app from the provided APIs. The process of building a TensorRT engine explained in the chart below. 5 | 6 | ![TensorRT Image](https://user-images.githubusercontent.com/33795294/148565279-795b12da-5243-4e7e-881b-263eb7658683.jpg) 7 | 8 | ## Helper Files 9 | 10 | `logging.h` : A logger file for using NVIDIA TRT API (mostly same for all models) 11 | 12 | `mlp.wts` : Converted weight file (simple file, you can open and check it) 13 | 14 | ## TensorRT C++ API 15 | 16 | ``` 17 | // 1. 
generate mlp.wts from https://github.com/wang-xinyu/pytorchx/tree/master/mlp -- or use the given .wts file 18 | 19 | // 2. put mlp.wts into tensorrtx/mlp (if using the generated weights) 20 | 21 | // 3. build and run 22 | 23 | cd tensorrtx/mlp 24 | 25 | mkdir build 26 | 27 | cd build 28 | 29 | cmake .. 30 | 31 | make 32 | 33 | sudo ./mlp -s // serialize model to plan file i.e. 'mlp.engine' 34 | 35 | sudo ./mlp -d // deserialize plan file and run inference 36 | ``` 37 | 38 | ## TensorRT Python API 39 | 40 | ``` 41 | # 1. Generate mlp.wts from https://github.com/wang-xinyu/pytorchx/tree/master/mlp -- or use the given .wts file 42 | 43 | # 2. Put mlp.wts into tensorrtx/mlp (if using the generated weights) 44 | 45 | # 3. Install Python dependencies (tensorrt/pycuda/numpy) 46 | 47 | # 4. Run 48 | 49 | cd tensorrtx/mlp 50 | 51 | python mlp.py -s # serialize model to plan file, i.e. 'mlp.engine' 52 | 53 | python mlp.py -d # deserialize plan file and run inference 54 | ``` 55 | 56 | ## Note 57 | It also supports the latest CUDA-11.4 and TensorRT-8.2.x 58 | -------------------------------------------------------------------------------- /mlp/mlp.wts: -------------------------------------------------------------------------------- 1 | 2 2 | linear.weight 1 3fff7e32 3 | linear.bias 1 3c138a5a 4 | -------------------------------------------------------------------------------- /mnasnet/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.6) 2 | 3 | project(mnasnet) 4 | 5 | add_definitions(-std=c++11) 6 | 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 8 | set(CMAKE_CXX_STANDARD 11) 9 | set(CMAKE_BUILD_TYPE Debug) 10 | 11 | include_directories(${PROJECT_SOURCE_DIR}/include) 12 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different 13 | # cuda 14 | include_directories(/usr/local/cuda/include) 15 | link_directories(/usr/local/cuda/lib64) 16 | # tensorrt 17 | include_directories(/usr/include/x86_64-linux-gnu/) 18 | link_directories(/usr/lib/x86_64-linux-gnu/) 19 | 20 | add_executable(mnasnet ${PROJECT_SOURCE_DIR}/mnasnet.cpp) 21 | target_link_libraries(mnasnet nvinfer) 22 | target_link_libraries(mnasnet cudart) 23 | 24 | add_definitions(-O2 -pthread) 25 | 26 | -------------------------------------------------------------------------------- /mnasnet/README.md: -------------------------------------------------------------------------------- 1 | # mnasnet 2 | 3 | MNASNet with depth multiplier of 0.5 from 4 | "MnasNet: Platform-Aware Neural Architecture Search for Mobile" 5 | 6 | For the Pytorch implementation, you can refer to [pytorchx/mnasnet](https://github.com/wang-xinyu/pytorchx/tree/master/mnasnet) 7 | 8 | Following tricks are used in this mnasnet, nothing special, group conv and batchnorm are used. 9 | 10 | - Batchnorm layer, implemented by scale layer. 11 | 12 | ``` 13 | // 1. generate mnasnet.wts from [pytorchx/mnasnet](https://github.com/wang-xinyu/pytorchx/tree/master/mnasnet) 14 | 15 | // 2. put mnasnet.wts into tensorrtx/mnasnet 16 | 17 | // 3. build and run 18 | 19 | cd tensorrtx/mnasnet 20 | 21 | mkdir build 22 | 23 | cd build 24 | 25 | cmake .. 26 | 27 | make 28 | 29 | sudo ./mnasnet -s // serialize model to plan file i.e. 'mnasnet.engine' 30 | 31 | sudo ./mnasnet -d // deserialize plan file and run inference 32 | 33 | // 4. 
see if the output is same as pytorchx/mnasnet 34 | ``` 35 | 36 | 37 | -------------------------------------------------------------------------------- /mobilenet/mobilenetv2/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.6) 2 | 3 | project(mobilenet) 4 | 5 | add_definitions(-std=c++11) 6 | 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 8 | set(CMAKE_CXX_STANDARD 11) 9 | set(CMAKE_BUILD_TYPE Debug) 10 | 11 | include_directories(${PROJECT_SOURCE_DIR}/include) 12 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different 13 | # cuda 14 | include_directories(/usr/local/cuda/include) 15 | link_directories(/usr/local/cuda/lib64) 16 | # tensorrt 17 | include_directories(/usr/include/x86_64-linux-gnu/) 18 | link_directories(/usr/lib/x86_64-linux-gnu/) 19 | 20 | add_executable(mobilenet ${PROJECT_SOURCE_DIR}/mobilenet_v2.cpp) 21 | target_link_libraries(mobilenet nvinfer) 22 | target_link_libraries(mobilenet cudart) 23 | 24 | add_definitions(-O2 -pthread) 25 | -------------------------------------------------------------------------------- /mobilenet/mobilenetv2/README.md: -------------------------------------------------------------------------------- 1 | # mobilenet v2 2 | 3 | MobileNetV2 architecture from 4 | "MobileNetV2: Inverted Residuals and Linear Bottlenecks" . 5 | 6 | For the Pytorch implementation, you can refer to [pytorchx/mobilenet](https://github.com/wang-xinyu/pytorchx/tree/master/mobilenet) 7 | 8 | Following tricks are used in this mobilenet, 9 | 10 | - Relu6 is used in mobilenet v2. We use `Relu6(x) = Relu(x) - Relu(x-6)` in tensorrt. 11 | - Batchnorm layer, implemented by scale layer. 12 | 13 | ``` 14 | // 1. generate mobilenet.wts from [pytorchx/mobilenet](https://github.com/wang-xinyu/pytorchx/tree/master/mobilenet) 15 | 16 | // 2. put mobilenet.wts into tensorrtx/mobilenet 17 | 18 | // 3. build and run 19 | 20 | cd tensorrtx/mobilenet/mobilenetv2 21 | 22 | mkdir build 23 | 24 | cd build 25 | 26 | cmake .. 27 | 28 | make 29 | 30 | sudo ./mobilenet -s // serialize model to plan file i.e. 'mobilenet.engine' 31 | 32 | sudo ./mobilenet -d // deserialize plan file and run inference 33 | 34 | // 4. see if the output is same as pytorchx/mobilenet 35 | ``` 36 | 37 | ### TensorRT Python API 38 | 39 | ``` 40 | # 1. generate mobilenetv2.wts from [pytorchx/mobilenet](https://github.com/wang-xinyu/pytorchx/tree/master/mobilenet) 41 | 42 | # 2. put mobilenetv2.wts into tensorrtx/mobilenet/mobilenetv2 43 | 44 | # 3. install Python dependencies (tensorrt/pycuda/numpy) 45 | 46 | cd tensorrtx/mobilenet/mobilenetv2 47 | 48 | python mobilenet_v2.py -s // serialize model to plan file i.e. 'mobilenetv2.engine' 49 | python mobilenet_v2.py -d // deserialize plan file and run inference 50 | 51 | # 4. 
see if the output is same as pytorchx/mobilenet 52 | ``` 53 | -------------------------------------------------------------------------------- /mobilenet/mobilenetv3/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.6) 2 | 3 | project(mobilenetv3) 4 | 5 | add_definitions(-std=c++11) 6 | 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 8 | set(CMAKE_CXX_STANDARD 11) 9 | set(CMAKE_BUILD_TYPE Debug) 10 | 11 | find_package(CUDA REQUIRED) 12 | 13 | include_directories(${PROJECT_SOURCE_DIR}/include) 14 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different 15 | # cuda 16 | include_directories(/usr/local/cuda/include) 17 | link_directories(/usr/local/cuda/lib64) 18 | # tensorrt 19 | include_directories(/usr/include/x86_64-linux-gnu/) 20 | link_directories(/usr/lib/x86_64-linux-gnu/) 21 | 22 | add_executable(mobilenetv3 ${PROJECT_SOURCE_DIR}/mobilenet_v3.cpp) 23 | target_link_libraries(mobilenetv3 nvinfer) 24 | target_link_libraries(mobilenetv3 cudart) 25 | 26 | add_definitions(-O2 -pthread) 27 | -------------------------------------------------------------------------------- /mobilenet/mobilenetv3/README.md: -------------------------------------------------------------------------------- 1 | # mobilenet v3 2 | 3 | MobileNetV3 architecture from 4 | "Searching for MobileNetV3" . 5 | 6 | For the Pytorch implementation, you can refer to [mobilenetv3.pytorch](https://github.com/chufei1995/mobilenetv3.pytorch) 7 | 8 | ## Run 9 | 10 | 1. generate mbv3_small.wts/mbv3_large.wts from pytorch implementation 11 | 12 | 2. put mbv3_small.wts/mbv3_large.wts into tensorrtx/mobilenet/mobilenetv3 13 | 14 | 3. build and run 15 | 16 | ``` 17 | cd tensorrtx/mobilenet/mobilenetv3 18 | mkdir build 19 | cd build 20 | cmake .. 21 | make 22 | sudo ./mobilenetv3 -s small(or large) // serialize model to plan file i.e. 'mobilenetv3_small.engine' 23 | sudo ./mobilenetv3 -d small(or large) // deserialize plan file and run inference 24 | ``` 25 | 26 | 4. see if the output is same as pytorch side 27 | 28 | ### TensorRT Python API 29 | 30 | ``` 31 | # 1. generate mobilenetv3.wts from [mobilenetv3.pytorch](https://github.com/chufei1995/mobilenetv3.pytorch) 32 | 33 | # 2. put mobilenetv3.wts into tensorrtx/mobilenet/mobilenetv3 34 | 35 | # 3. install Python dependencies (tensorrt/pycuda/numpy) 36 | 37 | cd tensorrtx/mobilenet/mobilenetv3 38 | 39 | python mobilenet_v2.py -s small(or large) // serialize model to plan file i.e. 
'mobilenetv2.engine' 40 | python mobilenet_v2.py -d small(or large) // deserialize plan file and run inference 41 | 42 | ``` 43 | -------------------------------------------------------------------------------- /psenet/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.6) 2 | 3 | project(PSENet) 4 | 5 | add_definitions(-std=c++11) 6 | 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 8 | set(CMAKE_CXX_STANDARD 11) 9 | set(CMAKE_BUILD_TYPE Debug) 10 | 11 | find_package(CUDA REQUIRED) 12 | 13 | include_directories(${PROJECT_SOURCE_DIR}/include) 14 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different 15 | # cuda 16 | include_directories(/usr/local/cuda/include) 17 | link_directories(/usr/local/cuda/lib64) 18 | # tensorrt 19 | include_directories(/usr/include/x86_64-linux-gnu/) 20 | link_directories(/usr/lib/x86_64-linux-gnu/) 21 | 22 | 23 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED") 24 | 25 | 26 | 27 | find_package(OpenCV) 28 | include_directories(${OpenCV_INCLUDE_DIRS}) 29 | 30 | file(GLOB SOURCE_FILES "*.h" "*.cpp") 31 | 32 | add_executable(psenet ${SOURCE_FILES}) 33 | target_link_libraries(psenet nvinfer) 34 | target_link_libraries(psenet cudart) 35 | target_link_libraries(psenet ${OpenCV_LIBS}) 36 | 37 | add_definitions(-O2 -pthread) 38 | 39 | -------------------------------------------------------------------------------- /psenet/README.md: -------------------------------------------------------------------------------- 1 | # PSENet 2 | 3 | **preprocessing + inference + postprocessing = 30ms** with fp32 on Tesla P40. 4 | The original Tensorflow implementation is [tensorflow_PSENet](https://github.com/liuheng92/tensorflow_PSENet). A TensorRT Python api implementation is [TensorRT-Python-PSENet](https://github.com/upczww/TensorRT-Python-PSENet). 5 | 6 | ## Key Features 7 | - Generating `.wts` from `Tensorflow`. 8 | - Dynamic batch and dynamic shape input. 9 | - Object-Oriented Programming. 10 | - Practice with C++ 11. 11 | 12 | 13 |
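The "dynamic batch and dynamic shape input" feature listed above is built on TensorRT optimization profiles. A minimal, hedged sketch of how a dynamic-shape input is typically declared and profiled with the C++ API (the tensor name, dimension ranges and binding index below are placeholders, not necessarily the exact values this PSENet implementation uses):

```cpp
// Build-time sketch for a dynamic-shape input (batch, 3, H, W).
// Assumes: builder, config, network (created with the kEXPLICIT_BATCH flag) and,
// at inference time, context / resize_h / resize_w already exist.
using namespace nvinfer1;

ITensor* input = network->addInput("input", DataType::kFLOAT, Dims4{-1, 3, -1, -1});

IOptimizationProfile* profile = builder->createOptimizationProfile();
profile->setDimensions("input", OptProfileSelector::kMIN, Dims4{1, 3, 64, 64});
profile->setDimensions("input", OptProfileSelector::kOPT, Dims4{1, 3, 640, 640});
profile->setDimensions("input", OptProfileSelector::kMAX, Dims4{1, 3, 1216, 1216});
config->addOptimizationProfile(profile);

// Inference time: fix the actual shape before enqueueing.
context->setBindingDimensions(0, Dims4{1, 3, resize_h, resize_w});
```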

14 | 15 |

16 | 17 | ## How to Run 18 | 19 | * 1. generate .wts 20 | 21 | Download pretrained model from https://github.com/liuheng92/tensorflow_PSENet 22 | and put `model.ckpt.*` to `model` dir. Add a file `model/checkpoint` with content 23 | ``` 24 | model_checkpoint_path: "model.ckpt" 25 | all_model_checkpoint_paths: "model.ckpt" 26 | ``` 27 | Then run 28 | 29 | ``` 30 | python gen_tf_wts.py 31 | ``` 32 | which will gengerate a `psenet.wts`. 33 | * 2. cmake and make 34 | 35 | ``` 36 | mkdir build 37 | cd build 38 | cmake .. 39 | make 40 | ``` 41 | * 3. build engine and run detection 42 | ``` 43 | cp ../psenet.wts ./ 44 | cp ../test.jpg ./ 45 | ./psenet -s // serialize model to plan file 46 | ./psenet -d // deserialize plan file and run inference 47 | ``` 48 | 49 | ## Known Issues 50 | None 51 | 52 | ## Todo 53 | 54 | * use `ExponentialMovingAverage` weight. 55 | -------------------------------------------------------------------------------- /psenet/gen_tf_wts.py: -------------------------------------------------------------------------------- 1 | from sys import prefix 2 | import tensorflow as tf 3 | from tensorflow.python import pywrap_tensorflow 4 | import numpy as np 5 | import struct 6 | 7 | model_dir = "model" 8 | 9 | ckpt = tf.train.get_checkpoint_state(model_dir) 10 | ckpt_path = ckpt.model_checkpoint_path 11 | 12 | reader = pywrap_tensorflow.NewCheckpointReader(ckpt_path) 13 | param_dict = reader.get_variable_to_shape_map() 14 | 15 | 16 | f = open(r"psenet.wts", "w") 17 | keys = param_dict.keys() 18 | f.write("{}\n".format(len(keys))) 19 | 20 | for key in keys: 21 | weight = reader.get_tensor(key) 22 | print(key, weight.shape) 23 | if len(weight.shape) == 4: 24 | weight = np.transpose(weight, (3, 2, 0, 1)) 25 | print(weight.shape) 26 | weight = np.reshape(weight, -1) 27 | f.write("{} {} ".format(key, len(weight))) 28 | for w in weight: 29 | f.write(" ") 30 | f.write(struct.pack(">f", float(w)).hex()) 31 | f.write("\n") -------------------------------------------------------------------------------- /psenet/layers.h: -------------------------------------------------------------------------------- 1 | #ifndef TENSORRTX_LAYERS_H 2 | #define TENSORRTX_LAYERS_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include "NvInfer.h" 9 | #include "cuda_runtime_api.h" 10 | using namespace nvinfer1; 11 | 12 | IScaleLayer *addBatchNorm2d(INetworkDefinition *network, std::map &weightMap, ITensor &input, std::string lname, float eps); 13 | 14 | IActivationLayer *bottleneck(INetworkDefinition *network, std::map &weightMap, ITensor &input, int ch, int stride, std::string lname, int branch_type); 15 | 16 | IActivationLayer *addConvRelu(INetworkDefinition *network, std::map &weightMap, ITensor &input, int outch, int kernel, int stride, std::string lname); 17 | 18 | #endif 19 | -------------------------------------------------------------------------------- /psenet/main.cpp: -------------------------------------------------------------------------------- 1 | #include "psenet.h" 2 | 3 | int main(int argc, char** argv) 4 | { 5 | PSENet psenet(1200, 640, 0.90, 6, 4); 6 | 7 | if (argc == 2 && std::string(argv[1]) == "-s") 8 | { 9 | std::cout << "Serializling Engine" << std::endl; 10 | psenet.serializeEngine(); 11 | return 0; 12 | } 13 | else if (argc == 2 && std::string(argv[1]) == "-d") 14 | { 15 | psenet.init(); 16 | std::vector files; 17 | for (int i = 0; i < 10; i++) 18 | files.emplace_back("test.jpg"); 19 | for (auto file : files) 20 | { 21 | std::cout << "Detect " << file << std::endl; 22 | 
psenet.detect(file); 23 | } 24 | 25 | return 0; 26 | } 27 | else 28 | { 29 | std::cerr << "arguments not right!" << std::endl; 30 | std::cerr << "./psenet -s // serialize model to plan file" << std::endl; 31 | std::cerr << "./psenet -d // deserialize plan file and run inference" << std::endl; 32 | return -1; 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /psenet/psenet.h: -------------------------------------------------------------------------------- 1 | #ifndef TENSORRTX_PSENET_H 2 | #define TENSORRTX_PSENET_H 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "utils.h" 8 | #include "layers.h" 9 | class PSENet 10 | { 11 | public: 12 | PSENet(int max_side_len, int min_side_len, float threshold, int num_kernel, int stride); 13 | ~PSENet(); 14 | 15 | ICudaEngine* createEngine(IBuilder* builder, IBuilderConfig* config); 16 | void serializeEngine(); 17 | void deserializeEngine(); 18 | void init(); 19 | void inferenceOnce(IExecutionContext& context, float* input, float* output, int input_h, int input_w); 20 | void detect(std::string image_path); 21 | float* preProcess(cv::Mat image, int& resize_h, int& resize_w, float& ratio_h, float& ratio_w); 22 | std::vector postProcess(float* origin_output, int resize_h, int resize_w); 23 | 24 | private: 25 | Logger gLogger; 26 | std::shared_ptr mRuntime; 27 | std::shared_ptr mCudaEngine; 28 | std::shared_ptr mContext; 29 | DataType dt = DataType::kFLOAT; 30 | const char* input_name_ = "input"; 31 | const char* output_name_ = "output"; 32 | int max_side_len_ = 1024; 33 | int min_side_len_ = 640; 34 | float post_threshold_ = 0.9; 35 | int num_kernels_ = 6; 36 | int stride_ = 4; 37 | }; 38 | 39 | #endif // TENSORRTX_PSENET_H 40 | -------------------------------------------------------------------------------- /psenet/test.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wang-xinyu/tensorrtx/c6bcebd812907a6d9eb66ecc909eb17b51d40fac/psenet/test.jpg -------------------------------------------------------------------------------- /rcnn/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.1) 2 | 3 | project(rcnn) 4 | 5 | add_definitions(-std=c++14) 6 | 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 8 | set(CMAKE_CXX_STANDARD 14) 9 | set(CMAKE_BUILD_TYPE Debug) 10 | 11 | set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};--extended-lambda) 12 | 13 | find_package(CUDA REQUIRED) 14 | 15 | include_directories(${PROJECT_SOURCE_DIR}/include) 16 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different 17 | # cuda 18 | include_directories(/usr/local/cuda/include) 19 | link_directories(/usr/local/cuda/lib64) 20 | # tensorrt 21 | include_directories(/home/jushi/TensorRT-8.2.1.6/include) 22 | link_directories(/home/jushi/TensorRT-8.2.1.6/lib) 23 | 24 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED") 25 | 26 | cuda_add_library(myplugins SHARED ${PROJECT_SOURCE_DIR}/BatchedNms.cu ${PROJECT_SOURCE_DIR}/PredictorDecode.cu ${PROJECT_SOURCE_DIR}/RoiAlign.cu ${PROJECT_SOURCE_DIR}/RpnDecode.cu ${PROJECT_SOURCE_DIR}/RpnNms.cu ${PROJECT_SOURCE_DIR}/MaskRcnnInference.cu) 27 | target_link_libraries(myplugins nvinfer cudart) 28 | 29 | find_package(OpenCV) 30 | include_directories(${OpenCV_INCLUDE_DIRS}) 31 | 32 | add_executable(rcnn ${PROJECT_SOURCE_DIR}/rcnn.cpp) 33 | 
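The rcnn `cuda_utils.h` a little further down defines `get_size_aligned` and `get_next_ptr`, which slice one workspace allocation into 256-byte-aligned scratch buffers for the CUDA plugins built above. A hedged usage sketch (the buffer names and element counts are made up for illustration):

```cpp
// Inside a plugin's enqueue(): carve the TensorRT-provided workspace into typed buffers.
void* ws = workspace;          // raw pointer handed in by TensorRT
size_t remaining = ws_bytes;   // total size previously reported via getWorkspaceSize()
float* scores  = get_next_ptr<float>(max_detections, ws, remaining);
int*   indices = get_next_ptr<int>(max_detections, ws, remaining);
// each call advances ws and shrinks remaining, and throws if the workspace is too small
```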
target_link_libraries(rcnn nvinfer) 34 | target_link_libraries(rcnn cudart) 35 | target_link_libraries(rcnn myplugins) 36 | target_link_libraries(rcnn ${OpenCV_LIBS}) 37 | 38 | add_definitions(-O2 -pthread) 39 | 40 | -------------------------------------------------------------------------------- /rcnn/cuda_utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #define CUDA_ALIGN 256 8 | 9 | template 10 | inline size_t get_size_aligned(size_t num_elem) { 11 | size_t size = num_elem * sizeof(T); 12 | size_t extra_align = 0; 13 | if (size % CUDA_ALIGN != 0) { 14 | extra_align = CUDA_ALIGN - size % CUDA_ALIGN; 15 | } 16 | return size + extra_align; 17 | } 18 | 19 | template 20 | inline T *get_next_ptr(size_t num_elem, void *&workspace, size_t &workspace_size) { 21 | size_t size = get_size_aligned(num_elem); 22 | if (size > workspace_size) { 23 | throw std::runtime_error("Workspace is too small!"); 24 | } 25 | workspace_size -= size; 26 | T *ptr = reinterpret_cast(workspace); 27 | workspace = reinterpret_cast(reinterpret_cast(workspace) + size); 28 | return ptr; 29 | } 30 | 31 | #ifndef CUDA_CHECK 32 | #define CUDA_CHECK(callstr)\ 33 | {\ 34 | cudaError_t error_code = callstr;\ 35 | if (error_code != cudaSuccess) {\ 36 | std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__;\ 37 | assert(0);\ 38 | }\ 39 | } 40 | #endif // CUDA_CHECK 41 | -------------------------------------------------------------------------------- /rcnn/macros.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #if CUDA_VERSION >=11000 7 | #define CUDA_11 8 | #endif 9 | 10 | #if NV_TENSORRT_MAJOR >= 8 11 | #define TRT_NOEXCEPT noexcept 12 | #define TRT_CONST_ENQUEUE const 13 | #else 14 | #define TRT_NOEXCEPT 15 | #define TRT_CONST_ENQUEUE 16 | #endif 17 | -------------------------------------------------------------------------------- /real-esrgan/general-x4v3/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.16) 2 | project(real-esrgan) 3 | 4 | set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/") 5 | 6 | add_definitions(-std=c++17) 7 | add_definitions(-DAPI_EXPORTS) 8 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 9 | #set(CMAKE_CXX_STANDARD 11) 10 | set(CMAKE_BUILD_TYPE Debug) 11 | 12 | #find_package(CUDA REQUIRED) 13 | 14 | INCLUDE_DIRECTORIES(${PROJECT_SOURCE_DIR}/src/include) 15 | 16 | # cuda 17 | FIND_PACKAGE(CUDA REQUIRED) 18 | #INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS}) 19 | include_directories(/usr/local/cuda/include) 20 | link_directories(/usr/local/cuda/lib64) 21 | 22 | # <------------------------TensorRT Related-------------------------> 23 | include_directories(YOUR_TENSORRT_INCLUDE_DIR) # TensorRT-8.6.1.6/include 24 | link_directories(YOUR_TENSORRT_LIB_DIR) # TensorRT-8.6.1.6/lib 25 | 26 | # <------------------------OpenCV Related-------------------------> 27 | # opencv 28 | FIND_PACKAGE(OpenCV REQUIRED) 29 | INCLUDE_DIRECTORIES(${OpenCV_INCLUDE_DIRS}) 30 | 31 | set(CMAKE_CXX_STANDARD 17) 32 | 33 | add_executable(${PROJECT_NAME} main.cpp) 34 | 35 | cuda_add_library(myplugins SHARED ${PROJECT_SOURCE_DIR}/src/pixel_shuffle/pixel_shuffle.cu) 36 | target_link_libraries(myplugins nvinfer cudart) 37 | 38 | 39 | TARGET_LINK_LIBRARIES(${PROJECT_NAME} nvinfer) 40 | TARGET_LINK_LIBRARIES(${PROJECT_NAME} 
cudart) 41 | TARGET_LINK_LIBRARIES(${PROJECT_NAME} ${OpenCV_LIBS}) 42 | TARGET_LINK_LIBRARIES(${PROJECT_NAME} myplugins) 43 | -------------------------------------------------------------------------------- /real-esrgan/general-x4v3/README.md: -------------------------------------------------------------------------------- 1 | # Real-ESRGAN realesr-general-x4v3 model 2 | 3 | ## How to Run 4 | 0. Replace YOUR_TENSORRT_INCLUDE_DIR and YOUR_TENSORRT_LIB_DIR in CMakeLists.txt with your TensorRT include and lib directories. 5 | 1. generate .wts from pytorch with .pt 6 | ``` 7 | git clone https://github.com/xinntao/Real-ESRGAN.git 8 | cd Real-ESRGAN 9 | 10 | # Install basicsr - https://github.com/xinntao/BasicSR 11 | # We use BasicSR for both training and inference 12 | pip install basicsr 13 | # facexlib and gfpgan are for face enhancement 14 | pip install facexlib 15 | pip install gfpgan 16 | pip install -r requirements.txt 17 | python setup.py develop 18 | ``` 19 | download realesr-general-x4v3.pth (and realesr-general-wdn-x4v3.pth if needed) from 20 | https://github.com/xinntao/Real-ESRGAN/releases 21 | 22 | ``` 23 | cp {tensorrtx}/real-esrgan-general-x4v3/gen_wts.py {xinntao}/Real-ESRGAN 24 | cd {xinntao}/Real-ESRGAN 25 | python gen_wts.py 26 | // a file 'real-esrgan.wts' will be generated. 27 | ``` 28 | 29 | **Be aware that if you need both realesr-general-x4v3.pth and realesr-general-wdn-x4v3.pth, please write a Python script to average all weights of realesr-general-x4v3.pth and realesr-general-wdn-x4v3.pth (from {xinntao}/Real-ESRGAN), then save it as a .pth file, and use this new file to generate a .wts file.** 30 | 31 | 2. build tensorrtx/real-esrgan-general-x4v3 and run 32 | 33 | ``` 34 | cd {tensorrtx}/real-esrgan-general-x4v3/ 35 | mkdir build 36 | cd build 37 | cp {xinntao}/Real-ESRGAN/real-esrgan.wts {tensorrtx}/real-esrgan/weights/ 38 | cmake .. 
39 | make 40 | ./real-esrgan your_images_dir 41 | ``` 42 | -------------------------------------------------------------------------------- /real-esrgan/general-x4v3/src/include/config/config.hpp: -------------------------------------------------------------------------------- 1 | #ifndef REAL_ESRGAN_TRT_CONFIG_HPP 2 | #define REAL_ESRGAN_TRT_CONFIG_HPP 3 | 4 | #include 5 | 6 | //std::string INPUT_BLOB_NAME = "input"; 7 | //std::string OUTPUT_BLOB_NAME = "output"; 8 | 9 | const char* INPUT_BLOB_NAME = "input_0"; 10 | const char* OUTPUT_BLOB_NAME = "output_0"; 11 | 12 | const bool USE_FP16 = false; 13 | 14 | static const int BATCH_SIZE = 1; 15 | static const int INPUT_C = 3; 16 | static const int INPUT_H = 450; 17 | static const int INPUT_W = 300; 18 | static const int OUT_SCALE = 4; 19 | //static const int OUTPUT_SIZE = INPUT_C * INPUT_H * OUT_SCALE * INPUT_W * OUT_SCALE; 20 | static const int OUTPUT_SIZE = BATCH_SIZE * 48 * 450 * 300; 21 | //INPUT_C * INPUT_H * OUT_SCALE * INPUT_W * OUT_SCALE; 22 | #endif //REAL_ESRGAN_TRT_CONFIG_HPP 23 | -------------------------------------------------------------------------------- /real-esrgan/general-x4v3/src/include/cuda_utils.h: -------------------------------------------------------------------------------- 1 | #ifndef TRTX_CUDA_UTILS_H_ 2 | #define TRTX_CUDA_UTILS_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #ifndef CUDA_CHECK 11 | #define CUDA_CHECK(callstr) \ 12 | { \ 13 | cudaError_t error_code = callstr; \ 14 | if (error_code != cudaSuccess) { \ 15 | std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \ 16 | assert(0); \ 17 | } \ 18 | } 19 | #endif // CUDA_CHECK 20 | 21 | #endif // TRTX_CUDA_UTILS_H_ 22 | -------------------------------------------------------------------------------- /real-esrgan/general-x4v3/src/include/preprocess/preprocess.hpp: -------------------------------------------------------------------------------- 1 | #ifndef REAL_ESRGAN_TRT_PREPROCESS_HPP 2 | #define REAL_ESRGAN_TRT_PREPROCESS_HPP 3 | 4 | struct PreprocessStruct { 5 | int N; 6 | int C; 7 | int H; 8 | int W; 9 | }; 10 | 11 | #endif //REAL_ESRGAN_TRT_PREPROCESS_HPP 12 | -------------------------------------------------------------------------------- /real-esrgan/x4plus/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.6) 2 | 3 | project(real-esrgan) 4 | 5 | add_definitions(-std=c++11) 6 | add_definitions(-DAPI_EXPORTS) 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 8 | set(CMAKE_CXX_STANDARD 11) 9 | set(CMAKE_BUILD_TYPE Debug) 10 | 11 | find_package(CUDA REQUIRED) 12 | 13 | if(WIN32) 14 | enable_language(CUDA) 15 | endif(WIN32) 16 | 17 | include_directories(${PROJECT_SOURCE_DIR}/include) 18 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different 19 | # cuda 20 | include_directories(/usr/local/cuda/include) 21 | link_directories(/usr/local/cuda/lib64) 22 | # tensorrt 23 | include_directories(/usr/include/x86_64-linux-gnu/) 24 | link_directories(/usr/lib/x86_64-linux-gnu/) 25 | 26 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -g -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED") 27 | cuda_add_library(myplugins SHARED preprocess.cu postprocess.cu) 28 | target_link_libraries(myplugins nvinfer cudart) 29 | 30 | find_package(OpenCV) 31 | include_directories(${OpenCV_INCLUDE_DIRS}) 32 | 33 | cuda_add_executable(real-esrgan real-esrgan.cpp) 34 | 35 | 
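In the general-x4v3 `config.hpp` above, the hard-coded `OUTPUT_SIZE = BATCH_SIZE * 48 * 450 * 300` and the commented-out `INPUT_C * INPUT_H * OUT_SCALE * INPUT_W * OUT_SCALE` describe the same element count (with BATCH_SIZE = 1): the network's final convolution emits 48 channels at the input resolution, which the pixel-shuffle plugin in this folder rearranges into 3 channels at 4x the resolution, so 48 * 450 * 300 = 3 * (450 * 4) * (300 * 4) = 6,480,000. A tiny illustrative compile-time check:

```cpp
static_assert(48 * 450 * 300 == 3 * (450 * 4) * (300 * 4),
              "48-channel low-res buffer holds exactly the 3-channel 4x upscaled image");
```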
target_link_libraries(real-esrgan nvinfer) 36 | target_link_libraries(real-esrgan cudart) 37 | target_link_libraries(real-esrgan myplugins) 38 | target_link_libraries(real-esrgan ${OpenCV_LIBS}) 39 | 40 | if(UNIX) 41 | add_definitions(-O2 -pthread) 42 | endif(UNIX) 43 | 44 | 45 | -------------------------------------------------------------------------------- /real-esrgan/x4plus/cuda_utils.h: -------------------------------------------------------------------------------- 1 | #ifndef TRTX_CUDA_UTILS_H_ 2 | #define TRTX_CUDA_UTILS_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #ifndef CUDA_CHECK 11 | #define CUDA_CHECK(callstr)\ 12 | {\ 13 | cudaError_t error_code = callstr;\ 14 | if (error_code != cudaSuccess) {\ 15 | std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__;\ 16 | assert(0);\ 17 | }\ 18 | } 19 | #endif // CUDA_CHECK 20 | 21 | #endif // TRTX_CUDA_UTILS_H_ 22 | 23 | -------------------------------------------------------------------------------- /real-esrgan/x4plus/macros.h: -------------------------------------------------------------------------------- 1 | #ifndef __MACROS_H 2 | #define __MACROS_H 3 | 4 | #ifdef API_EXPORTS 5 | #if defined(_MSC_VER) 6 | #define API __declspec(dllexport) 7 | #else 8 | #define API __attribute__((visibility("default"))) 9 | #endif 10 | #else 11 | 12 | #if defined(_MSC_VER) 13 | #define API __declspec(dllimport) 14 | #else 15 | #define API 16 | #endif 17 | #endif // API_EXPORTS 18 | 19 | #if NV_TENSORRT_MAJOR >= 8 20 | #define TRT_NOEXCEPT noexcept 21 | #define TRT_CONST_ENQUEUE const 22 | #else 23 | #define TRT_NOEXCEPT 24 | #define TRT_CONST_ENQUEUE 25 | #endif 26 | 27 | #endif // __MACROS_H 28 | -------------------------------------------------------------------------------- /real-esrgan/x4plus/utils.h: -------------------------------------------------------------------------------- 1 | #ifndef TRTX_REAL_ESRGAN_UTILS_H_ 2 | #define TRTX_REAL_ESRGAN_UTILS_H_ 3 | 4 | #include 5 | #include 6 | 7 | static inline int read_files_in_dir(const char *p_dir_name, std::vector &file_names) { 8 | DIR *p_dir = opendir(p_dir_name); 9 | if (p_dir == nullptr) { 10 | return -1; 11 | } 12 | 13 | struct dirent* p_file = nullptr; 14 | while ((p_file = readdir(p_dir)) != nullptr) { 15 | if (strcmp(p_file->d_name, ".") != 0 && 16 | strcmp(p_file->d_name, "..") != 0) { 17 | //std::string cur_file_name(p_dir_name); 18 | //cur_file_name += "/"; 19 | //cur_file_name += p_file->d_name; 20 | std::string cur_file_name(p_file->d_name); 21 | file_names.push_back(cur_file_name); 22 | } 23 | } 24 | 25 | closedir(p_dir); 26 | return 0; 27 | } 28 | 29 | #endif // TRTX_REAL_ESRGAN_UTILS_H_ 30 | 31 | -------------------------------------------------------------------------------- /refinedet/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.6) 2 | 3 | project(refinedet) 4 | 5 | add_definitions(-std=c++11) 6 | 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 8 | set(CMAKE_CXX_STANDARD 11) 9 | set(CMAKE_BUILD_TYPE Debug) 10 | 11 | # tensorrt 12 | include_directories(/data_2/tensorrt/TensorRT-7.0.0.11/include/) #include_directories(/usr/include/x86_64-linux-gnu/) 13 | link_directories(/data_2/tensorrt/TensorRT-7.0.0.11/lib/) #link_directories(/usr/lib/x86_64-linux-gnu/) 14 | 15 | 16 | find_package(CUDA REQUIRED) 17 | 18 | include_directories(${PROJECT_SOURCE_DIR}/include) 19 | # include and link dirs of cuda and tensorrt, you need adapt them if 
yours are different 20 | # cuda 21 | include_directories(/usr/local/cuda/include) 22 | link_directories(/usr/local/cuda/lib64) 23 | 24 | #find_package(OpenCV) 25 | #include_directories(OpenCV_INCLUDE_DIRS) 26 | 27 | include_directories(/home/software_install/opencv3.4.6/include) 28 | link_directories(/home/software_install/opencv3.4.6/lib) 29 | 30 | 31 | set(CMAKE_PREFIX_PATH "/data_1/torch1.1.0") ###torch1.1.0 32 | find_package(Torch REQUIRED) 33 | 34 | include_directories(/data_1/torch1.1.0/include) 35 | link_directories(/data_1/torch1.1.0/lib) 36 | 37 | 38 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED") 39 | 40 | 41 | add_executable(refinedet ${PROJECT_SOURCE_DIR}/calibrator.cpp ${PROJECT_SOURCE_DIR}/refinedet.cpp) 42 | target_link_libraries(refinedet nvinfer) 43 | target_link_libraries(refinedet cudart) 44 | target_link_libraries(refinedet "${TORCH_LIBRARIES}") 45 | target_link_libraries(refinedet opencv_calib3d opencv_core opencv_dnn opencv_imgproc opencv_highgui opencv_imgcodecs caffe2) 46 | 47 | add_definitions(-O2 -pthread) 48 | 49 | -------------------------------------------------------------------------------- /refinedet/README.md: -------------------------------------------------------------------------------- 1 | # RefineDet 2 | 3 | For the Pytorch implementation, you can refer to [luuuyi/RefineDet.PyTorch](https://github.com/luuuyi/RefineDet.PyTorch) 4 | 5 | ## How to run 6 | 7 | ``` 8 | 1. generate wts file. from pytorch 9 | python gen_wts_refinedet.py 10 | // a file 'refinedet.wts' will be generated. 11 | 12 | 2. build tensorrtx/RefineDet and run or Using clion to open a project(recommend) 13 | Configuration file in configure.h 14 | You need configure your own paths and modes(SERIALIZE or INFER) 15 | Detailed information reference configure.h 16 | mkdir build 17 | cd build 18 | cmake .. 19 | make 20 | ``` 21 | 22 | ## dependence 23 | 24 | ``` 25 | TensorRT7.0.0.11 26 | OpenCV >= 3.4 27 | libtorch >=1.1.0 28 | ``` 29 | 30 | ## feature 31 | 32 | 1.tensorrt Multi output 33 | 2.L2norm 34 | 3.Postprocessing with libtorch 35 | 36 | ## More Information 37 | 38 | See the readme in [home page.](https://github.com/wang-xinyu/tensorrtx) 39 | [tensorrt tutorials](https://github.com/wang-xinyu/tensorrtx/tree/master/tutorials) 40 | For more detailed guidance, see [yhl blog](https://www.cnblogs.com/yanghailin/p/14525128.html) 41 | 42 | -------------------------------------------------------------------------------- /refinedet/calibrator.h: -------------------------------------------------------------------------------- 1 | #ifndef ENTROPY_CALIBRATOR_H 2 | #define ENTROPY_CALIBRATOR_H 3 | 4 | #include "NvInfer.h" 5 | #include 6 | #include 7 | 8 | //! \class Int8EntropyCalibrator2 9 | //! 10 | //! \brief Implements Entropy calibrator 2. 11 | //! CalibrationAlgoType is kENTROPY_CALIBRATION_2. 12 | //! 
13 | class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 14 | { 15 | public: 16 | Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, const char* input_blob_name, bool read_cache = true); 17 | 18 | virtual ~Int8EntropyCalibrator2(); 19 | int getBatchSize() const override; 20 | bool getBatch(void* bindings[], const char* names[], int nbBindings) override; 21 | const void* readCalibrationCache(size_t& length) override; 22 | void writeCalibrationCache(const void* cache, size_t length) override; 23 | 24 | private: 25 | int batchsize_; 26 | int input_w_; 27 | int input_h_; 28 | int img_idx_; 29 | std::string img_dir_; 30 | std::vector img_files_; 31 | size_t input_count_; 32 | std::string calib_table_name_; 33 | const char* input_blob_name_; 34 | bool read_cache_; 35 | void* device_input_; 36 | std::vector calib_cache_; 37 | }; 38 | 39 | #endif // ENTROPY_CALIBRATOR_H 40 | -------------------------------------------------------------------------------- /refinedet/gen_wts_refinedet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import struct 4 | from models.refinedet import build_refinedet 5 | 6 | 7 | 8 | num_classes = 25 9 | path_model = "/data_2/project_2021/pytorch_refinedet/2021/20210308.pth" 10 | path_save_wts = "./refinedet0312.wts" 11 | input_size = 320 12 | 13 | net = build_refinedet('test', input_size, num_classes) # initialize net 14 | net.load_state_dict(torch.load(path_model)) 15 | net.eval() 16 | 17 | 18 | f = open(path_save_wts, 'w') 19 | f.write('{}\n'.format(len(net.state_dict().keys()))) 20 | for k, v in net.state_dict().items(): 21 | vr = v.reshape(-1).cpu().numpy() 22 | f.write('{} {} '.format(k, len(vr))) 23 | for vv in vr: 24 | f.write(' ') 25 | f.write(struct.pack('>f',float(vv)).hex()) 26 | f.write('\n') 27 | 28 | print("success generate wts!") -------------------------------------------------------------------------------- /repvgg/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.6) 2 | 3 | project(repvgg) 4 | 5 | add_definitions(-std=c++11) 6 | 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 8 | set(CMAKE_CXX_STANDARD 11) 9 | set(CMAKE_BUILD_TYPE Debug) 10 | 11 | include_directories(${PROJECT_SOURCE_DIR}/include) 12 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different 13 | # cuda 14 | include_directories(/usr/local/cuda/include) 15 | link_directories(/usr/local/cuda/lib64) 16 | # tensorrt 17 | include_directories(/usr/include/x86_64-linux-gnu/) 18 | link_directories(/usr/lib/x86_64-linux-gnu/) 19 | 20 | add_executable(repvgg ${PROJECT_SOURCE_DIR}/repvgg.cpp) 21 | target_link_libraries(repvgg nvinfer) 22 | target_link_libraries(repvgg cudart) 23 | 24 | 25 | add_definitions(-O2 -pthread) 26 | 27 | -------------------------------------------------------------------------------- /repvgg/README.md: -------------------------------------------------------------------------------- 1 | # RepVGG 2 | 3 | RepVGG models from 4 | "RepVGG: Making VGG-style ConvNets Great Again" 5 | 6 | For the Pytorch implementation, you can refer to [DingXiaoH/RepVGG](https://github.com/DingXiaoH/RepVGG) 7 | 8 | # How to run 9 | 10 | 1. generate wts file. 
11 | 12 | ``` 13 | git clone https://github.com/DingXiaoH/RepVGG.git 14 | cd ReoVGG 15 | ``` 16 | 17 | You may convert a trained model into the inference-time structure with 18 | 19 | ``` 20 | python convert.py [weights file of the training-time model to load] [path to save] -a [model name] 21 | ``` 22 | 23 | For example, 24 | 25 | ``` 26 | python convert.py RepVGG-B2-train.pth RepVGG-B2-deploy.pth -a RepVGG-B2 27 | ``` 28 | 29 | Then copy `gen_wts.py` to `RepVGG` and generate .wts file, for example 30 | 31 | ``` 32 | python gen_wts.py -w RepVGG-B2-deploy.pth -s RepVGG-B2.wts 33 | ``` 34 | 35 | 2. build and run 36 | ``` 37 | cd tensorrtx/repvgg 38 | 39 | mkdir build 40 | 41 | cd build 42 | 43 | cmake .. 44 | 45 | make 46 | 47 | sudo ./repvgg -s RepVGG-B2 // serialize model to plan file i.e. 'RepVGG-B2.engine' 48 | sudo ./repvgg -d RepVGG-B2 // deserialize plan file and run inference 49 | ``` 50 | 51 | -------------------------------------------------------------------------------- /repvgg/gen_wts.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import struct 3 | 4 | import torch 5 | 6 | 7 | def main(args): 8 | # Load model 9 | state_dict = torch.load(args.weight) 10 | with open(args.save_path, "w") as f: 11 | f.write("{}\n".format(len(state_dict.keys()))) 12 | for k, v in state_dict.items(): 13 | vr = v.reshape(-1).cpu().numpy() 14 | f.write("{} {} ".format(k, len(vr))) 15 | for vv in vr: 16 | f.write(" ") 17 | f.write(struct.pack(">f", float(vv)).hex()) 18 | f.write("\n") 19 | 20 | 21 | if __name__ == "__main__": 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument( 24 | "-w", 25 | "--weight", 26 | type=str, 27 | required=True, 28 | help="RepVGG model weight path", 29 | ) 30 | parser.add_argument( 31 | "-s", 32 | "--save_path", 33 | type=str, 34 | required=True, 35 | help="generated wts path", 36 | ) 37 | args = parser.parse_args() 38 | main(args) -------------------------------------------------------------------------------- /repvgg/logging.h: -------------------------------------------------------------------------------- 1 | #ifndef TENSORRT_LOGGING_H 2 | #define TENSORRT_LOGGING_H 3 | 4 | #include "NvInferRuntimeCommon.h" 5 | #include 6 | #include 7 | 8 | // Logger for TensorRT info/warning/errors 9 | class Logger : public nvinfer1::ILogger 10 | { 11 | public: 12 | Logger() : Logger(Severity::kINFO) {} 13 | 14 | Logger(Severity severity) : reportableSeverity(severity) {} 15 | 16 | void log(Severity severity, const char *msg) override 17 | { 18 | // suppress messages with severity enum value greater than the reportable 19 | if (severity > reportableSeverity) 20 | return; 21 | 22 | switch (severity) 23 | { 24 | case Severity::kINTERNAL_ERROR: 25 | std::cerr << "INTERNAL_ERROR: "; 26 | break; 27 | case Severity::kERROR: 28 | std::cerr << "ERROR: "; 29 | break; 30 | case Severity::kWARNING: 31 | std::cerr << "WARNING: "; 32 | break; 33 | case Severity::kINFO: 34 | std::cerr << "INFO: "; 35 | break; 36 | default: 37 | std::cerr << "UNKNOWN: "; 38 | break; 39 | } 40 | std::cerr << msg << std::endl; 41 | } 42 | 43 | Severity reportableSeverity{Severity::kWARNING}; 44 | }; 45 | 46 | #endif // TENSORRT_LOGGING_H 47 | -------------------------------------------------------------------------------- /resnet/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.6) 2 | 3 | project(resnet) 4 | 5 | add_definitions(-std=c++11) 6 | 7 | 
option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 8 | set(CMAKE_CXX_STANDARD 11) 9 | set(CMAKE_BUILD_TYPE Debug) 10 | 11 | include_directories(${PROJECT_SOURCE_DIR}/include) 12 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different 13 | # cuda 14 | include_directories(/usr/local/cuda/include) 15 | link_directories(/usr/local/cuda/lib64) 16 | # tensorrt 17 | include_directories(/usr/include/x86_64-linux-gnu/) 18 | link_directories(/usr/lib/x86_64-linux-gnu/) 19 | 20 | add_executable(resnet18 ${PROJECT_SOURCE_DIR}/resnet18.cpp) 21 | target_link_libraries(resnet18 nvinfer) 22 | target_link_libraries(resnet18 cudart) 23 | 24 | add_executable(resnet34 ${PROJECT_SOURCE_DIR}/resnet34.cpp) 25 | target_link_libraries(resnet34 nvinfer) 26 | target_link_libraries(resnet34 cudart) 27 | 28 | add_executable(resnet50 ${PROJECT_SOURCE_DIR}/resnet50.cpp) 29 | target_link_libraries(resnet50 nvinfer) 30 | target_link_libraries(resnet50 cudart) 31 | 32 | add_executable(resnext50 ${PROJECT_SOURCE_DIR}/resnext50_32x4d.cpp) 33 | target_link_libraries(resnext50 nvinfer) 34 | target_link_libraries(resnext50 cudart) 35 | 36 | add_executable(wideresnet50 ${PROJECT_SOURCE_DIR}/wideresnet50.cpp) 37 | target_link_libraries(wideresnet50 nvinfer) 38 | target_link_libraries(wideresnet50 cudart) 39 | 40 | add_definitions(-O2 -pthread) 41 | 42 | -------------------------------------------------------------------------------- /retinaface/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.6) 2 | 3 | project(retinaface) 4 | 5 | add_definitions(-std=c++11) 6 | 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 8 | set(CMAKE_CXX_STANDARD 11) 9 | set(CMAKE_BUILD_TYPE Debug) 10 | 11 | find_package(CUDA REQUIRED) 12 | 13 | if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") 14 | message("embed_platform on") 15 | include_directories(/usr/local/cuda/targets/aarch64-linux/include) 16 | link_directories(/usr/local/cuda/targets/aarch64-linux/lib) 17 | else() 18 | message("embed_platform off") 19 | include_directories(/usr/local/cuda/include) 20 | link_directories(/usr/local/cuda/lib64) 21 | endif() 22 | 23 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED") 24 | 25 | cuda_add_library(decodeplugin SHARED ${PROJECT_SOURCE_DIR}/decode.cu) 26 | target_link_libraries(decodeplugin nvinfer cudart) 27 | 28 | find_package(OpenCV) 29 | include_directories(${OpenCV_INCLUDE_DIRS}) 30 | 31 | add_executable(retina_r50 ${PROJECT_SOURCE_DIR}/calibrator.cpp ${PROJECT_SOURCE_DIR}/retina_r50.cpp) 32 | target_link_libraries(retina_r50 nvinfer) 33 | target_link_libraries(retina_r50 cudart) 34 | target_link_libraries(retina_r50 decodeplugin) 35 | target_link_libraries(retina_r50 ${OpenCV_LIBRARIES}) 36 | 37 | add_executable(retina_mnet ${PROJECT_SOURCE_DIR}/calibrator.cpp ${PROJECT_SOURCE_DIR}/retina_mnet.cpp) 38 | target_link_libraries(retina_mnet nvinfer) 39 | target_link_libraries(retina_mnet cudart) 40 | target_link_libraries(retina_mnet decodeplugin) 41 | target_link_libraries(retina_mnet ${OpenCV_LIBRARIES}) 42 | 43 | add_definitions(-O2 -pthread) 44 | 45 | -------------------------------------------------------------------------------- /retinaface/calibrator.h: -------------------------------------------------------------------------------- 1 | #ifndef ENTROPY_CALIBRATOR_H 2 | #define ENTROPY_CALIBRATOR_H 3 | 4 | #include "NvInfer.h" 5 | #include 6 | #include 7 | #include "macros.h" 8 | 9 | //! 
\class Int8EntropyCalibrator2 10 | //! 11 | //! \brief Implements Entropy calibrator 2. 12 | //! CalibrationAlgoType is kENTROPY_CALIBRATION_2. 13 | //! 14 | class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 15 | { 16 | public: 17 | Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, const char* input_blob_name, bool read_cache = true); 18 | 19 | virtual ~Int8EntropyCalibrator2(); 20 | int getBatchSize() const TRT_NOEXCEPT override; 21 | bool getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT override; 22 | const void* readCalibrationCache(size_t& length) TRT_NOEXCEPT override; 23 | void writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT override; 24 | 25 | private: 26 | int batchsize_; 27 | int input_w_; 28 | int input_h_; 29 | int img_idx_; 30 | std::string img_dir_; 31 | std::vector img_files_; 32 | size_t input_count_; 33 | std::string calib_table_name_; 34 | const char* input_blob_name_; 35 | bool read_cache_; 36 | void* device_input_; 37 | std::vector calib_cache_; 38 | }; 39 | 40 | #endif // ENTROPY_CALIBRATOR_H 41 | -------------------------------------------------------------------------------- /retinaface/macros.h: -------------------------------------------------------------------------------- 1 | #ifndef __MACROS_H 2 | #define __MACROS_H 3 | 4 | #if NV_TENSORRT_MAJOR >= 8 5 | #define TRT_NOEXCEPT noexcept 6 | #define TRT_CONST_ENQUEUE const 7 | #else 8 | #define TRT_NOEXCEPT 9 | #define TRT_CONST_ENQUEUE 10 | #endif 11 | 12 | #endif // __MACROS_H 13 | -------------------------------------------------------------------------------- /retinafaceAntiCov/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.6) 2 | 3 | project(retinafaceAntiCov) 4 | 5 | add_definitions(-std=c++11) 6 | 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 8 | set(CMAKE_CXX_STANDARD 11) 9 | set(CMAKE_BUILD_TYPE Debug) 10 | 11 | find_package(CUDA REQUIRED) 12 | 13 | 14 | include_directories(${PROJECT_SOURCE_DIR}/include) 15 | if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") 16 | message("embed_platform on") 17 | include_directories(/usr/local/cuda/targets/aarch64-linux/include) 18 | link_directories(/usr/local/cuda/targets/aarch64-linux/lib) 19 | else() 20 | message("embed_platform off") 21 | # cuda 22 | include_directories(/usr/local/cuda/include) 23 | link_directories(/usr/local/cuda/lib64) 24 | 25 | # tensorrt 26 | include_directories(/home/lindsay/TensorRT-8.6.1.6/include) 27 | link_directories(/home/lindsay/TensorRT-8.6.1.6/lib) 28 | # include_directories(/home/lindsay/TensorRT-7.2.3.4/include) 29 | # link_directories(/home/lindsay/TensorRT-7.2.3.4/lib) 30 | 31 | 32 | endif() 33 | 34 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED") 35 | 36 | cuda_add_library(myplugins SHARED ${PROJECT_SOURCE_DIR}/decode.cu) 37 | 38 | find_package(OpenCV) 39 | include_directories(${OpenCV_INCLUDE_DIRS}) 40 | 41 | add_executable(retinafaceAntiCov ${PROJECT_SOURCE_DIR}/retinafaceAntiCov.cpp) 42 | target_link_libraries(retinafaceAntiCov nvinfer) 43 | target_link_libraries(retinafaceAntiCov cudart) 44 | target_link_libraries(retinafaceAntiCov myplugins) 45 | target_link_libraries(retinafaceAntiCov ${OpenCV_LIBS}) 46 | 47 | add_definitions(-O2 -pthread) 48 | 49 | -------------------------------------------------------------------------------- 
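(Editor's note) The `Int8EntropyCalibrator2` declared in retinaface/calibrator.h above is only the header; the actual wiring lives in the retina_r50.cpp / retina_mnet.cpp sources. A hedged sketch of how such a calibrator is typically attached to a TensorRT builder config — `builder`, `network`, `INPUT_W`, `INPUT_H` and the folder/table/blob names below are placeholders, not code copied from those files:

```
// Hedged sketch: enable INT8 and register the entropy calibrator declared above.
nvinfer1::IBuilderConfig* config = builder->createBuilderConfig();
config->setFlag(nvinfer1::BuilderFlag::kINT8);
auto* calibrator = new Int8EntropyCalibrator2(1, INPUT_W, INPUT_H, "./calib_images/",
                                              "int8calib.table", "data" /* input blob name */);
config->setInt8Calibrator(calibrator);
// Build as usual; TensorRT repeatedly calls getBatch() on the calibrator during calibration.
nvinfer1::ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
```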
/retinafaceAntiCov/gen_wts.py: -------------------------------------------------------------------------------- 1 | import struct 2 | from retinaface_cov import RetinaFaceCoV 3 | 4 | gpuid = 0 5 | model = RetinaFaceCoV('./cov2/mnet_cov2', 0, gpuid, 'net3l') 6 | 7 | f = open('retinafaceAntiCov.wts', 'w') 8 | f.write('{}\n'.format(len(model.model.get_params()[0].keys()) + len(model.model.get_params()[1].keys()))) 9 | for k, v in model.model.get_params()[0].items(): 10 | vr = v.reshape(-1).asnumpy() 11 | f.write('{} {} '.format(k, len(vr))) 12 | for vv in vr: 13 | f.write(' ') 14 | f.write(struct.pack('>f',float(vv)).hex()) 15 | f.write('\n') 16 | for k, v in model.model.get_params()[1].items(): 17 | vr = v.reshape(-1).asnumpy() 18 | f.write('{} {} '.format(k, len(vr))) 19 | for vv in vr: 20 | f.write(' ') 21 | f.write(struct.pack('>f',float(vv)).hex()) 22 | f.write('\n') 23 | 24 | -------------------------------------------------------------------------------- /retinafaceAntiCov/macros.h: -------------------------------------------------------------------------------- 1 | #ifndef __MACROS_H 2 | #define __MACROS_H 3 | 4 | #if NV_TENSORRT_MAJOR >= 8 5 | #define TRT_NOEXCEPT noexcept 6 | #define TRT_CONST_ENQUEUE const 7 | #else 8 | #define TRT_NOEXCEPT 9 | #define TRT_CONST_ENQUEUE 10 | #endif 11 | 12 | #endif // __MACROS_H -------------------------------------------------------------------------------- /scaled-yolov4/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.6) 2 | 3 | project(yolov4) 4 | 5 | add_definitions(-std=c++11) 6 | 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 8 | set(CMAKE_CXX_STANDARD 11) 9 | set(CMAKE_BUILD_TYPE Debug) 10 | 11 | find_package(CUDA REQUIRED) 12 | 13 | include_directories(${PROJECT_SOURCE_DIR}/include) 14 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different 15 | # cuda 16 | include_directories(/usr/local/cuda/include) 17 | link_directories(/usr/local/cuda/lib64) 18 | # tensorrt 19 | include_directories(/usr/include/x86_64-linux-gnu/) 20 | link_directories(/usr/lib/x86_64-linux-gnu/) 21 | 22 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED") 23 | 24 | cuda_add_library(myplugins SHARED ${PROJECT_SOURCE_DIR}/yololayer.cu ${PROJECT_SOURCE_DIR}/mish.cu) 25 | target_link_libraries(myplugins nvinfer cudart) 26 | 27 | find_package(OpenCV) 28 | include_directories(${OpenCV_INCLUDE_DIRS}) 29 | 30 | add_executable(yolov4csp ${PROJECT_SOURCE_DIR}/yolov4_csp.cpp) 31 | target_link_libraries(yolov4csp nvinfer) 32 | target_link_libraries(yolov4csp cudart) 33 | target_link_libraries(yolov4csp myplugins) 34 | target_link_libraries(yolov4csp ${OpenCV_LIBS}) 35 | 36 | add_definitions(-O2 -pthread) 37 | 38 | -------------------------------------------------------------------------------- /scaled-yolov4/gen_wts.py: -------------------------------------------------------------------------------- 1 | import struct 2 | import sys 3 | from models.models import * 4 | from utils import * 5 | 6 | model = Darknet('models/yolov4-csp.cfg', (512, 512)) 7 | weights = sys.argv[1] 8 | device = torch_utils.select_device('0') 9 | if weights.endswith('.pt'): # pytorch format 10 | model.load_state_dict(torch.load(weights, map_location=device)['model']) 11 | else: # darknet format 12 | load_darknet_weights(model, weights) 13 | 14 | with open('yolov4_csp.wts', 'w') as f: 15 | 
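    # (editor's note) The loop below emits the plain-text .wts format used across tensorrtx:
    # the first line is the number of tensors, then one line per tensor consisting of
    # '<name> <element count>' followed by every float32 value packed big-endian and written as hex.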
f.write('{}\n'.format(len(model.state_dict().keys()))) 16 | for k, v in model.state_dict().items(): 17 | vr = v.reshape(-1).cpu().numpy() 18 | f.write('{} {} '.format(k, len(vr))) 19 | for vv in vr: 20 | f.write(' ') 21 | f.write(struct.pack('>f',float(vv)).hex()) 22 | f.write('\n') 23 | 24 | -------------------------------------------------------------------------------- /scaled-yolov4/utils.h: -------------------------------------------------------------------------------- 1 | #ifndef __TRT_UTILS_H_ 2 | #define __TRT_UTILS_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #ifndef CUDA_CHECK 10 | 11 | #define CUDA_CHECK(callstr) \ 12 | { \ 13 | cudaError_t error_code = callstr; \ 14 | if (error_code != cudaSuccess) { \ 15 | std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \ 16 | assert(0); \ 17 | } \ 18 | } 19 | 20 | #endif 21 | 22 | namespace Tn 23 | { 24 | template 25 | void write(char*& buffer, const T& val) 26 | { 27 | *reinterpret_cast(buffer) = val; 28 | buffer += sizeof(T); 29 | } 30 | 31 | template 32 | void read(const char*& buffer, T& val) 33 | { 34 | val = *reinterpret_cast(buffer); 35 | buffer += sizeof(T); 36 | } 37 | } 38 | 39 | #endif -------------------------------------------------------------------------------- /senet/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.6) 2 | 3 | project(senet) 4 | 5 | add_definitions(-std=c++11) 6 | 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 8 | set(CMAKE_CXX_STANDARD 11) 9 | set(CMAKE_BUILD_TYPE Debug) 10 | 11 | include_directories(${PROJECT_SOURCE_DIR}/include) 12 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different 13 | # cuda 14 | include_directories(/usr/local/cuda/include) 15 | link_directories(/usr/local/cuda/lib64) 16 | # tensorrt 17 | include_directories(/usr/include/x86_64-linux-gnu/) 18 | link_directories(/usr/lib/x86_64-linux-gnu/) 19 | 20 | add_executable(se_resnet ${PROJECT_SOURCE_DIR}/se_resnet50.cpp) 21 | target_link_libraries(se_resnet nvinfer) 22 | target_link_libraries(se_resnet cudart) 23 | 24 | add_definitions(-O2 -pthread) 25 | 26 | -------------------------------------------------------------------------------- /senet/README.md: -------------------------------------------------------------------------------- 1 | # SENet 2 | 3 | An implementation of SENet, proposed in Squeeze-and-Excitation Networks by Jie Hu, Li Shen, Samuel Albanie, Gang Sun, Enhua Wu 4 | 5 | [https://arxiv.org/abs/1709.01507](https://arxiv.org/abs/1709.01507) 6 | 7 | For the Pytorch implementation, you can refer to [wang-xinyu/senet.pytorch](https://github.com/wang-xinyu/senet.pytorch), which is forked from [moskomule/senet.pytorch](https://github.com/moskomule/senet.pytorch). 8 | 9 | 10 | ``` 11 | // 1. generate se_resnet50.wts from [wang-xinyu/senet.pytorch](https://github.com/wang-xinyu/senet.pytorch) 12 | 13 | // 2. put se_resnet50.wts into tensorrtx/senet 14 | 15 | // 3. build and run 16 | 17 | cd tensorrtx/senet 18 | 19 | mkdir build 20 | 21 | cd build 22 | 23 | cmake .. 24 | 25 | make 26 | 27 | sudo ./se_resnet -s // serialize model to plan file i.e. 'se_resnet50.engine' 28 | 29 | sudo ./se_resnet -d // deserialize plan file and run inference 30 | 31 | // 4. 
see if the output is same as [wang-xinyu/senet.pytorch] 32 | ``` 33 | 34 | -------------------------------------------------------------------------------- /shufflenetv2/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.6) 2 | 3 | project(shufflenet) 4 | 5 | add_definitions(-std=c++11) 6 | 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 8 | set(CMAKE_CXX_STANDARD 11) 9 | set(CMAKE_BUILD_TYPE Debug) 10 | 11 | include_directories(${PROJECT_SOURCE_DIR}/include) 12 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different 13 | # cuda 14 | include_directories(/usr/local/cuda/include) 15 | link_directories(/usr/local/cuda/lib64) 16 | # tensorrt 17 | include_directories(/usr/include/x86_64-linux-gnu/) 18 | link_directories(/usr/lib/x86_64-linux-gnu/) 19 | 20 | add_executable(shufflenet ${PROJECT_SOURCE_DIR}/shufflenet_v2.cpp) 21 | target_link_libraries(shufflenet nvinfer) 22 | target_link_libraries(shufflenet cudart) 23 | 24 | add_definitions(-O2 -pthread) 25 | 26 | -------------------------------------------------------------------------------- /shufflenetv2/README.md: -------------------------------------------------------------------------------- 1 | # shufflenet v2 2 | 3 | ShuffleNetV2 with 0.5x output channels, as described in 4 | "ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design" 5 | 6 | 7 | For the Pytorch implementation, you can refer to [pytorchx/shufflenet](https://github.com/wang-xinyu/pytorchx/tree/master/shufflenet) 8 | 9 | Following tricks are used in this shufflenet, 10 | 11 | - `torch.chunk` is used in shufflenet v2. We implemented the 'chunk(2, dim=C)' by tensorrt plugin. Which is the simplest plugin in this tensorrtx project. You can learn the basic procedures of build tensorrt plugin. 12 | - shuffle layer is used, the `channel_shuffle()` in pytorchx/shufflenet can be implemented by two shuffle layers in tensorrt. 13 | - Batchnorm layer, implemented by scale layer. 14 | 15 | ``` 16 | // 1. generate shufflenet.wts from [pytorchx/shufflenet](https://github.com/wang-xinyu/pytorchx/tree/master/shufflenet) 17 | 18 | // 2. put shufflenet.wts into tensorrtx/shufflenet 19 | 20 | // 3. build and run 21 | 22 | cd tensorrtx/shufflenet 23 | 24 | mkdir build 25 | 26 | cd build 27 | 28 | cmake .. 29 | 30 | make 31 | 32 | sudo ./shufflenet -s // serialize model to plan file i.e. 'shufflenet.engine' 33 | sudo ./shufflenet -d // deserialize plan file and run inference 34 | 35 | // 4. 
see if the output is same as pytorchx/shufflenet 36 | ``` 37 | 38 | 39 | -------------------------------------------------------------------------------- /squeezenet/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.6) 2 | 3 | project(squeezenet) 4 | 5 | add_definitions(-std=c++11) 6 | 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 8 | set(CMAKE_CXX_STANDARD 11) 9 | set(CMAKE_BUILD_TYPE Debug) 10 | 11 | include_directories(${PROJECT_SOURCE_DIR}/include) 12 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different 13 | # cuda 14 | include_directories(/usr/local/cuda/include) 15 | link_directories(/usr/local/cuda/lib64) 16 | # tensorrt 17 | include_directories(/usr/include/x86_64-linux-gnu/) 18 | link_directories(/usr/lib/x86_64-linux-gnu/) 19 | 20 | add_executable(squeezenet ${PROJECT_SOURCE_DIR}/squeezenet.cpp) 21 | target_link_libraries(squeezenet nvinfer) 22 | target_link_libraries(squeezenet cudart) 23 | 24 | add_definitions(-O2 -pthread) 25 | 26 | -------------------------------------------------------------------------------- /squeezenet/README.md: -------------------------------------------------------------------------------- 1 | # squeezenet v1.1 2 | 3 | SqueezeNet 1.1 model from the official SqueezeNet repo 4 | 5 | 6 | SqueezeNet 1.1 has 2.4x less computation and slightly fewer parameters 7 | than SqueezeNet 1.0, without sacrificing accuracy. 8 | 9 | For the Pytorch implementation, you can refer to [pytorchx/squeezenet](https://github.com/wang-xinyu/pytorchx/tree/master/squeezenet) 10 | 11 | ``` 12 | // 1. generate squeezenet.wts from [pytorchx/squeezenet](https://github.com/wang-xinyu/pytorchx/tree/master/squeezenet) 13 | 14 | // 2. put squeezenet.wts into tensorrtx/squeezenet 15 | 16 | // 3. build and run 17 | 18 | cd tensorrtx/squeezenet 19 | 20 | mkdir build 21 | 22 | cd build 23 | 24 | cmake .. 25 | 26 | make 27 | 28 | sudo ./squeezenet -s // serialize model to plan file i.e. 'squeezenet.engine' 29 | sudo ./squeezenet -d // deserialize plan file and run inference 30 | 31 | // 4. 
see if the output is same as pytorchx/squeezenet 32 | ``` 33 | 34 | -------------------------------------------------------------------------------- /superpoint/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.6) 2 | 3 | project(SuperPointNet) 4 | 5 | add_definitions(-std=c++11) 6 | 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 8 | set(CMAKE_CXX_STANDARD 11) 9 | set(CMAKE_BUILD_TYPE Debug) 10 | 11 | find_package(CUDA REQUIRED) 12 | 13 | include_directories(${PROJECT_SOURCE_DIR}/include) 14 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different 15 | # cuda 16 | include_directories(/usr/local/cuda/include) 17 | link_directories(/usr/local/cuda/lib64) 18 | # tensorrt 19 | include_directories(/usr/include/x86_64-linux-gnu/) 20 | link_directories(/usr/lib/x86_64-linux-gnu/) 21 | 22 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -pthread -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED") 23 | 24 | find_package(OpenCV) 25 | include_directories(${OpenCV_INCLUDE_DIRS}) 26 | 27 | add_executable(supernet ${PROJECT_SOURCE_DIR}/supernet.cpp ${PROJECT_SOURCE_DIR}/utils.cpp) 28 | target_link_libraries(supernet nvinfer) 29 | target_link_libraries(supernet cudart) 30 | target_link_libraries(supernet ${OpenCV_LIBS}) 31 | 32 | add_definitions(-O2 -pthread) -------------------------------------------------------------------------------- /superpoint/gen_wts.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import struct 3 | from model import SuperPointNet 4 | 5 | model_name = "superpoint_v1" 6 | 7 | net = SuperPointNet() 8 | net.load_state_dict(torch.load("superpoint_v1.pth")) 9 | net = net.cuda() 10 | net.eval() 11 | 12 | f = open(model_name + ".wts", "w") 13 | f.write("{}\n".format(len(net.state_dict().keys()))) 14 | for k, v in net.state_dict().items(): 15 | vr = v.reshape(-1).cpu().numpy() 16 | f.write("{} {}".format(k, len(vr))) 17 | for vv in vr: 18 | f.write(" ") 19 | f.write(struct.pack(">f", float(vv)).hex()) 20 | f.write("\n") -------------------------------------------------------------------------------- /superpoint/utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include "NvInfer.h" 5 | #include "cuda_runtime_api.h" 6 | #include "assert.h" 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | 14 | using namespace nvinfer1; 15 | 16 | #define CHECK(status) \ 17 | do \ 18 | { \ 19 | auto ret = (status); \ 20 | if (ret != 0) \ 21 | { \ 22 | std::cout << "Cuda failure: " << ret; \ 23 | abort(); \ 24 | } \ 25 | } while (0) 26 | 27 | 28 | int read_files_in_dir(const char *p_dir_name, std::vector &file_names); 29 | std::map loadWeights(const std::string file); 30 | void tokenize(const std::string &str, std::vector &tokens, const std::string &delimiters = ","); -------------------------------------------------------------------------------- /swin-transformer/semantic-segmentation/CMakeLists.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wang-xinyu/tensorrtx/c6bcebd812907a6d9eb66ecc909eb17b51d40fac/swin-transformer/semantic-segmentation/CMakeLists.txt -------------------------------------------------------------------------------- /swin-transformer/semantic-segmentation/README.md: 
-------------------------------------------------------------------------------- 1 | # Swin Transformer - Semantic Segmentation 2 | 3 | The Pytorch implementation is [microsoft/Swin-Transformer](https://github.com/microsoft/Swin-Transformer.git). 4 | 5 | Only Swin-T is supported; PRs for other backbones are welcome. 6 | 7 | ## Authors 8 | 9 | 10 | 11 | 12 | ## How to Run 13 | 14 | 1. generate .wts from pytorch with .pt, or download .wts from model zoo 15 | 16 | ``` 17 | git clone https://github.com/microsoft/Swin-Transformer.git 18 | git clone https://github.com/wang-xinyu/tensorrtx.git 19 | 20 | python gen_wts.py Swin-Transform.pt 21 | // a file 'Swin-Transform.wts' will be generated. 22 | ``` 23 | 24 | 2. build tensorrtx/swin-transformer and run 25 | 26 | ``` 27 | cd {tensorrtx}/swin-transformer/semantic-segmentation/ 28 | mkdir build 29 | cd build 30 | cp {microsoft}/Swin-Transformer/Swin-Transform.wts {tensorrtx}/swin-transformer/semantic-segmentation/build 31 | cmake .. 32 | make 33 | sudo ./swintransformer -s [.wts] [.engine] // serialize model to plan file 34 | sudo ./swintransformer -d [.engine] [image folder] // deserialize and run inference, the images in [image folder] will be processed. 35 | 36 | ``` 37 | 38 | ## More Information 39 | 40 | See the readme in [home page.](https://github.com/wang-xinyu/tensorrtx) 41 | 42 | -------------------------------------------------------------------------------- /swin-transformer/semantic-segmentation/UpsmapleKernel.h: -------------------------------------------------------------------------------- 1 | #ifndef UPSAMPLE_KERNEL_H 2 | #define UPSAMPLE_KERNEL_H 3 | 4 | #include 5 | #include "NvInfer.h" 6 | 7 | int UpsampleInference( 8 | cudaStream_t stream, 9 | int n, 10 | int input_b, 11 | int input_c, 12 | int input_h, 13 | int input_w, 14 | float scale_h, 15 | float scale_w, 16 | const void* inputs, 17 | void* outputs); 18 | 19 | 20 | #endif 21 | -------------------------------------------------------------------------------- /swin-transformer/semantic-segmentation/gen_wts.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import struct 3 | import sys 4 | 5 | # Initialize 6 | pt_file = sys.argv[1] 7 | # Load model 8 | model = torch.load(pt_file, map_location=torch.device('cpu'))['model'].float() # load to FP32 9 | model.to(torch.device('cpu')).eval() 10 | 11 | with open(pt_file.split('.')[0] + '.wts', 'w') as f: 12 | f.write('{}\n'.format(len(model.state_dict().keys()))) 13 | for k, v in model.state_dict().items(): 14 | vr = v.reshape(-1).cpu().numpy() 15 | f.write('{} {} '.format(k, len(vr))) 16 | for vv in vr: 17 | f.write(' ') 18 | f.write(struct.pack('>f',float(vv)).hex()) 19 | f.write('\n') 20 | -------------------------------------------------------------------------------- /swin-transformer/semantic-segmentation/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | 4 | using namespace std; 5 | 6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /swin-transformer/semantic-segmentation/myhpp.h: -------------------------------------------------------------------------------- 1 | #ifndef MYHPP_H 2 | #define MYHPP_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #define _USE_MATH_DEFINES 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | //#include
#include 24 | #include 25 | //#include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | 34 | 35 | 36 | #endif // MYHPP_H 37 | -------------------------------------------------------------------------------- /tsm/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.6) 2 | 3 | project(TSM) 4 | 5 | add_definitions(-std=c++11) 6 | 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 8 | set(CMAKE_CXX_STANDARD 11) 9 | set(CMAKE_BUILD_TYPE Debug) 10 | 11 | include_directories(${PROJECT_SOURCE_DIR}/include) 12 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different 13 | # cuda 14 | include_directories(/usr/local/cuda/include) 15 | link_directories(/usr/local/cuda/lib64) 16 | 17 | # tensorrt 18 | include_directories(/home/ubuntu/TensorRT/include/) 19 | link_directories(/home/ubuntu/TensorRT/lib/) 20 | 21 | add_executable(tsm_r50 ${PROJECT_SOURCE_DIR}/tsm_r50.cpp) 22 | target_link_libraries(tsm_r50 nvinfer) 23 | target_link_libraries(tsm_r50 cudart) 24 | 25 | add_definitions(-O2 -pthread) 26 | -------------------------------------------------------------------------------- /tsm/mmaction2_tsm_r50_config.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | model = dict( 3 | type='Recognizer2D', 4 | backbone=dict( 5 | type='ResNetTSM', 6 | pretrained='torchvision://resnet50', 7 | depth=50, 8 | norm_eval=False, 9 | shift_div=8), 10 | cls_head=dict( 11 | type='TSMHead', 12 | num_classes=400, 13 | in_channels=2048, 14 | spatial_type='avg', 15 | consensus=dict(type='AvgConsensus', dim=1), 16 | dropout_ratio=0.5, 17 | init_std=0.001, 18 | is_shift=True), 19 | # model training and testing settings 20 | train_cfg=None, 21 | test_cfg=dict(average_clips='prob')) 22 | -------------------------------------------------------------------------------- /tutorials/check_fp16_int8_support.md: -------------------------------------------------------------------------------- 1 | # Check if Your GPU Supports FP16/INT8 2 | 3 | ## 1. check your GPU Compute Capability 4 | 5 | visit https://developer.nvidia.com/cuda-gpus#compute and check your GPU compute capability. 6 | 7 | For example, GTX1080 is 6.1, Tesla T4 is 7.5. 8 | 9 | ## 2. check the hardware-precision-matrix 10 | 11 | visit https://docs.nvidia.com/deeplearning/tensorrt/support-matrix/index.html#hardware-precision-matrix and check the matrix. 12 | 13 | For example, compute capability 6.1 supports FP32 and INT8. 7.5 supports FP32, FP16, INT8, FP16 tensor core, etc. 14 | 15 | -------------------------------------------------------------------------------- /tutorials/contribution.md: -------------------------------------------------------------------------------- 1 | # How to make contribution 2 | 3 | 1. Fork this repo to your github account 4 | 5 | 2. Clone your fork 6 | 7 | 3. Create a feature branch 8 | 9 | 4. Make changes, including but not limited to create new model, bug fix, documentation, tutorials, etc. 10 | 11 | 5. Pre-commit check and push, we use clang-format to do coding style checking, and the coding style is following google c++ coding style with 4-space. 
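To check a single file before staging it, clang-format can also be invoked directly against the repository's .clang-format (a hedged example; the exact flags depend on your clang-format version):

```
clang-format -i --style=file path/to/changed_file.cpp
```

The full pre-commit flow is: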
12 | 13 | ``` 14 | pip install pre-commit 15 | pip install clang-format 16 | 17 | cd tensorrtx/ 18 | git add [files-to-commit] 19 | pre-commit run 20 | 21 | # fix pre-commit errors, then git add files-to-commit again 22 | git add [files-to-commit] 23 | 24 | git commit -m "describe your commit" 25 | 26 | git push origin [feature-branch] 27 | ``` 28 | 29 | 6. Submit a pull-request on github web UI to master branch of wang-xinyu/tensorrtx. 30 | -------------------------------------------------------------------------------- /tutorials/migrating_from_tensorrt_4_to_7.md: -------------------------------------------------------------------------------- 1 | # Migrating from TensorRT 4 to 7 2 | 3 | The following APIs are deprecated and replaced in TensorRT 7. 4 | 5 | - `DimsCHW`, replaced by `Dims3` 6 | - `addConvolution()`, replaced by `addConvolutionNd()` 7 | - `addPooling()`, replaced by `addPoolingNd()` 8 | - `addDeconvolution()`, replaced by `addDeconvolutionNd()` 9 | - `createNetwork()`, replaced by `createNetworkV2()` 10 | - `buildCudaEngine()`, replaced by `buildEngineWithConfig()` 11 | - `createPReLUPlugin()`, replaced by `addActivation()` with `ActivationType::kLEAKY_RELU` 12 | - `IPlugin` and `IPluginExt` class, replaced by `IPluginV2IOExt` or `IPluginV2DynamicExt` 13 | - Use the new `Logger` class defined in logging.h 14 | -------------------------------------------------------------------------------- /ufld/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.6) 2 | 3 | project(lane_det) 4 | 5 | add_definitions(-std=c++11) 6 | 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 8 | set(CMAKE_CXX_STANDARD 11) 9 | set(CMAKE_BUILD_TYPE Debug) 10 | 11 | # cuda directory 12 | include_directories(/usr/local/cuda/include/) 13 | link_directories(/usr/local/cuda/lib64/) 14 | 15 | # tensorrt 16 | #include_directories(/workspace/TensorRT-7.2.3.4/include/) 17 | #link_directories(/workspace/TensorRT-7.2.3.4/lib/) 18 | 19 | 20 | find_package(OpenCV) 21 | include_directories(${OpenCV_INCLUDE_DIRS}) 22 | 23 | add_executable(lane_det ${PROJECT_SOURCE_DIR}/lane_det.cpp) 24 | target_link_libraries(lane_det nvinfer) 25 | target_link_libraries(lane_det cudart) 26 | target_link_libraries(lane_det ${OpenCV_LIBS}) 27 | 28 | add_definitions(-O2 -pthread) 29 | 30 | -------------------------------------------------------------------------------- /ufld/README.md: -------------------------------------------------------------------------------- 1 | # Ultra-Fast-Lane-Detection(UFLD) 2 | 3 | The Pytorch implementation is [Ultra-Fast-Lane-Detection](https://github.com/cfzd/Ultra-Fast-Lane-Detection). 4 | 5 | ## How to Run 6 | ``` 7 | 1. generate lane.wts and lane.onnx from pytorch with tusimple_18.pth 8 | 9 | git clone https://github.com/wang-xinyu/tensorrtx.git 10 | git clone https://github.com/cfzd/Ultra-Fast-Lane-Detection.git 11 | // download its weights 'tusimple_18.pth' 12 | // copy tensorrtx/ufld/gen_wts.py into Ultra-Fast-Lane-Detection/ 13 | // ensure the file name is tusimple_18.pth and lane.wts in gen_wts.py 14 | // go to Ultra-Fast-Lane-Detection 15 | python gen_wts.py 16 | // a file 'lane.wts' will be generated. 17 | // then ( not necessary ) 18 | python pth2onnx.py 19 | //a file 'lane.onnx' will be generated. 20 | 21 | 2. build tensorrtx/ufld and run 22 | 23 | mkdir build 24 | cd build 25 | cmake .. 26 | make 27 | sudo ./lane_det -s // serialize model to plan file i.e. 
'lane.engine' 28 | sudo ./lane_det -d PATH_TO_YOUR_IMAGE_FOLDER // deserialize plan file and run inference, the images will be processed. 29 | 30 | ``` 31 | 32 | ## More Information 33 | 1. The preprocess and postprocess differ from the original repo: preprocess converts NHWC to NCHW in a different way, and postprocess simply shows the result with OpenCV instead of saving it. 34 | 2. If you run into bugs when doing inference with a larger batch size, just modify the code in preprocess or postprocess; it is not complicated. 35 | 3. Some results are stored in the resluts folder. 36 | -------------------------------------------------------------------------------- /ufld/gen_wts.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import struct 3 | #import models.crnn as crnn 4 | from model.model import parsingNet 5 | 6 | # Initialize 7 | model = parsingNet(pretrained = False, backbone='18', cls_dim = (101, 56, 4), use_aux=False) 8 | device = 'cpu' 9 | # Load model 10 | state_dict = torch.load('tusimple_18.pth', map_location='cpu')['model'] 11 | model.to(device).eval() 12 | 13 | f = open('lane.wts', 'w') 14 | f.write('{}\n'.format(len(state_dict.keys()))) 15 | for k, v in state_dict.items(): 16 | vr = v.reshape(-1).cpu().numpy() 17 | f.write('{} {} '.format(k, len(vr))) 18 | for vv in vr: 19 | f.write(' ') 20 | f.write(struct.pack('>f',float(vv)).hex()) 21 | f.write('\n') 22 | -------------------------------------------------------------------------------- /ufld/macros.h: -------------------------------------------------------------------------------- 1 | #ifndef __MACROS_H 2 | #define __MACROS_H 3 | 4 | #ifdef API_EXPORTS 5 | #if defined(_MSC_VER) 6 | #define API __declspec(dllexport) 7 | #else 8 | #define API __attribute__((visibility("default"))) 9 | #endif 10 | #else 11 | 12 | #if defined(_MSC_VER) 13 | #define API __declspec(dllimport) 14 | #else 15 | #define API 16 | #endif 17 | #endif // API_EXPORTS 18 | 19 | #if NV_TENSORRT_MAJOR >= 8 20 | #define TRT_NOEXCEPT noexcept 21 | #define TRT_CONST_ENQUEUE const 22 | #else 23 | #define TRT_NOEXCEPT 24 | #define TRT_CONST_ENQUEUE 25 | #endif 26 | 27 | #endif // __MACROS_H 28 | -------------------------------------------------------------------------------- /ufld/pth2onnx.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torch.optim as optim 5 | from torchvision import datasets, transforms 6 | import torch.onnx as torch_onnx 7 | from model.model import parsingNet 8 | 9 | MODELPATH = "tusimple_18.pth" 10 | 11 | net = parsingNet(pretrained = False, backbone='18', cls_dim = (101, 56, 4), use_aux=False).cuda() 12 | 13 | state_dict = torch.load(MODELPATH, map_location='cpu')['model'] 14 | net.load_state_dict(state_dict) 15 | net.train(False) 16 | 17 | x = torch.randn(1, 3, 288, 800).cuda() 18 | 19 | torch_onnx.export(net, x, "lane.onnx", verbose=True, input_names=["input"], output_names=["output"],opset_version=11) 20 | -------------------------------------------------------------------------------- /unet/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.6) 2 | 3 | project(unet) 4 | 5 | add_definitions(-std=c++11) 6 | 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 8 | set(CMAKE_CXX_STANDARD 11) 9 | set(CMAKE_BUILD_TYPE Debug) 10 | 11 | # cuda directory 12 | include_directories(/usr/local/cuda/include/) 13 |
link_directories(/usr/local/cuda/lib64/) 14 | 15 | # tensorrt 16 | include_directories(/workspace/TensorRT-7.2.3.4/include/) 17 | link_directories(/workspace/TensorRT-7.2.3.4/lib/) 18 | 19 | # opencv library 20 | find_package(OpenCV) 21 | include_directories(${OpenCV_INCLUDE_DIRS}) 22 | 23 | # link library and add exec file 24 | add_executable(unet ${PROJECT_SOURCE_DIR}/unet.cpp) 25 | target_link_libraries(unet nvinfer) 26 | target_link_libraries(unet cudart) 27 | target_link_libraries(unet ${OpenCV_LIBS}) 28 | 29 | add_definitions(-O2 -pthread) 30 | 31 | -------------------------------------------------------------------------------- /unet/gen_wts.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | import struct 4 | 5 | def main(): 6 | device = torch.device('cpu') 7 | state_dict = torch.load(sys.argv[1], map_location=device) 8 | 9 | f = open("unet.wts", 'w') 10 | f.write("{}\n".format(len(state_dict.keys()))) 11 | for k, v in state_dict.items(): 12 | print('key: ', k) 13 | print('value: ', v.shape) 14 | vr = v.reshape(-1).cpu().numpy() 15 | f.write("{} {}".format(k, len(vr))) 16 | for vv in vr: 17 | f.write(" ") 18 | f.write(struct.pack(">f", float(vv)).hex()) 19 | f.write("\n") 20 | f.close() 21 | 22 | if __name__ == '__main__': 23 | main() 24 | 25 | -------------------------------------------------------------------------------- /unet/macros.h: -------------------------------------------------------------------------------- 1 | #ifndef __MACROS_H 2 | #define __MACROS_H 3 | 4 | #ifdef API_EXPORTS 5 | #if defined(_MSC_VER) 6 | #define API __declspec(dllexport) 7 | #else 8 | #define API __attribute__((visibility("default"))) 9 | #endif 10 | #else 11 | 12 | #if defined(_MSC_VER) 13 | #define API __declspec(dllimport) 14 | #else 15 | #define API 16 | #endif 17 | #endif // API_EXPORTS 18 | 19 | #if NV_TENSORRT_MAJOR >= 8 20 | #define TRT_NOEXCEPT noexcept 21 | #define TRT_CONST_ENQUEUE const 22 | #else 23 | #define TRT_NOEXCEPT 24 | #define TRT_CONST_ENQUEUE 25 | #endif 26 | 27 | #endif // __MACROS_H 28 | -------------------------------------------------------------------------------- /vgg/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.6) 2 | 3 | project(vgg) 4 | 5 | add_definitions(-std=c++11) 6 | 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 8 | set(CMAKE_CXX_STANDARD 11) 9 | set(CMAKE_BUILD_TYPE Debug) 10 | 11 | include_directories(${PROJECT_SOURCE_DIR}/include) 12 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different 13 | # cuda 14 | include_directories(/usr/local/cuda/include) 15 | link_directories(/usr/local/cuda/lib64) 16 | # tensorrt 17 | include_directories(/usr/include/x86_64-linux-gnu/) 18 | link_directories(/usr/lib/x86_64-linux-gnu/) 19 | 20 | add_executable(vgg ${PROJECT_SOURCE_DIR}/vgg11.cpp) 21 | target_link_libraries(vgg nvinfer) 22 | target_link_libraries(vgg cudart) 23 | 24 | add_definitions(-O2 -pthread) 25 | 26 | -------------------------------------------------------------------------------- /vgg/README.md: -------------------------------------------------------------------------------- 1 | # vgg 2 | 3 | VGG 11-layer model (configuration "A") from 4 | "Very Deep Convolutional Networks For Large-Scale Image Recognition" 5 | 6 | For the Pytorch implementation, you can refer to [pytorchx/vgg](https://github.com/wang-xinyu/pytorchx/tree/master/vgg) 7 | 8 | VGG's architecture is simple, 
just some conv, relu, maxpool, and fc layers. 9 | 10 | ``` 11 | // 1. generate vgg.wts from [pytorchx/vgg](https://github.com/wang-xinyu/pytorchx/tree/master/vgg) 12 | 13 | // 2. put vgg.wts into tensorrtx/vgg 14 | 15 | // 3. build and run 16 | 17 | cd tensorrtx/vgg 18 | 19 | mkdir build 20 | 21 | cd build 22 | 23 | cmake .. 24 | 25 | make 26 | 27 | sudo ./vgg -s // serialize model to plan file i.e. 'vgg.engine' 28 | sudo ./vgg -d // deserialize plan file and run inference 29 | 30 | // 4. see if the output is same as pytorchx/vgg 31 | ``` 32 | 33 | 34 | -------------------------------------------------------------------------------- /yolo11/include/calibrator.h: -------------------------------------------------------------------------------- 1 | #ifndef ENTROPY_CALIBRATOR_H 2 | #define ENTROPY_CALIBRATOR_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "macros.h" 8 | 9 | //! \class Int8EntropyCalibrator2 10 | //! 11 | //! \brief Implements Entropy calibrator 2. 12 | //! CalibrationAlgoType is kENTROPY_CALIBRATION_2. 13 | //! 14 | class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 { 15 | public: 16 | Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, 17 | const char* input_blob_name, bool read_cache = true); 18 | virtual ~Int8EntropyCalibrator2(); 19 | int getBatchSize() const TRT_NOEXCEPT override; 20 | bool getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT override; 21 | const void* readCalibrationCache(size_t& length) TRT_NOEXCEPT override; 22 | void writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT override; 23 | 24 | private: 25 | int batchsize_; 26 | int input_w_; 27 | int input_h_; 28 | int img_idx_; 29 | std::string img_dir_; 30 | std::vector img_files_; 31 | size_t input_count_; 32 | std::string calib_table_name_; 33 | const char* input_blob_name_; 34 | bool read_cache_; 35 | void* device_input_; 36 | std::vector calib_cache_; 37 | }; 38 | 39 | #endif // ENTROPY_CALIBRATOR_H 40 | -------------------------------------------------------------------------------- /yolo11/include/config.h: -------------------------------------------------------------------------------- 1 | #define USE_FP16 2 | // #define USE_FP32 3 | // #define USE_INT8 4 | 5 | const static char* kInputTensorName = "images"; 6 | const static char* kOutputTensorName = "output"; 7 | const static char* kProtoTensorName = "proto"; 8 | const static int kNumClass = 80; 9 | const static int kPoseNumClass = 1; 10 | const static int kNumberOfPoints = 17; // number of keypoints total 11 | // obb model's number of classes 12 | constexpr static int kObbNumClass = 15; 13 | const static int kObbNe = 1; // number of extra parameters 14 | const static int kBatchSize = 1; 15 | const static int kGpuId = 0; 16 | const static int kInputH = 640; 17 | const static int kInputW = 640; 18 | const static int kObbInputH = 1024; 19 | const static int kObbInputW = 1024; 20 | const static float kNmsThresh = 0.45f; 21 | const static float kConfThresh = 0.5f; 22 | const static float kConfThreshKeypoints = 0.5f; // keypoints confidence 23 | const static int kMaxInputImageSize = 3000 * 3000; 24 | const static int kMaxNumOutputBbox = 1000; 25 | //Quantization input image folder path 26 | const static char* kInputQuantizationFolder = "./coco_calib"; 27 | 28 | // Classfication model's number of classes 29 | constexpr static int kClsNumClass = 1000; 30 | // Classfication model's input shape 31 | constexpr static int 
kClsInputH = 224; 32 | constexpr static int kClsInputW = 224; 33 | -------------------------------------------------------------------------------- /yolo11/include/cuda_utils.h: -------------------------------------------------------------------------------- 1 | #ifndef TRTX_CUDA_UTILS_H_ 2 | #define TRTX_CUDA_UTILS_H_ 3 | 4 | #include 5 | 6 | #ifndef CUDA_CHECK 7 | #define CUDA_CHECK(callstr) \ 8 | { \ 9 | cudaError_t error_code = callstr; \ 10 | if (error_code != cudaSuccess) { \ 11 | std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \ 12 | assert(0); \ 13 | } \ 14 | } 15 | #endif // CUDA_CHECK 16 | 17 | #endif // TRTX_CUDA_UTILS_H_ 18 | -------------------------------------------------------------------------------- /yolo11/include/macros.h: -------------------------------------------------------------------------------- 1 | #ifndef __MACROS_H 2 | #define __MACROS_H 3 | 4 | #include "NvInfer.h" 5 | 6 | #ifdef API_EXPORTS 7 | #if defined(_MSC_VER) 8 | #define API __declspec(dllexport) 9 | #else 10 | #define API __attribute__((visibility("default"))) 11 | #endif 12 | #else 13 | 14 | #if defined(_MSC_VER) 15 | #define API __declspec(dllimport) 16 | #else 17 | #define API 18 | #endif 19 | #endif // API_EXPORTS 20 | 21 | #if NV_TENSORRT_MAJOR >= 8 22 | #define TRT_NOEXCEPT noexcept 23 | #define TRT_CONST_ENQUEUE const 24 | #else 25 | #define TRT_NOEXCEPT 26 | #define TRT_CONST_ENQUEUE 27 | #endif 28 | 29 | #endif // __MACROS_H 30 | -------------------------------------------------------------------------------- /yolo11/include/model.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include "NvInfer.h" 6 | 7 | nvinfer1::IHostMemory* buildEngineYolo11Cls(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, 8 | nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, 9 | std::string& type, int max_channels); 10 | 11 | nvinfer1::IHostMemory* buildEngineYolo11Det(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, 12 | nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, 13 | int& max_channels, std::string& type); 14 | 15 | nvinfer1::IHostMemory* buildEngineYolo11Seg(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, 16 | nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, 17 | int& max_channels, std::string& type); 18 | 19 | nvinfer1::IHostMemory* buildEngineYolo11Pose(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, 20 | nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, 21 | int& max_channels, std::string& type); 22 | 23 | nvinfer1::IHostMemory* buildEngineYolo11Obb(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, 24 | nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, 25 | int& max_channels, std::string& type); 26 | -------------------------------------------------------------------------------- /yolo11/include/preprocess.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include "NvInfer.h" 6 | #include "types.h" 7 | 8 | void cuda_preprocess_init(int max_image_size); 9 | 10 | void cuda_preprocess_destroy(); 11 | 12 | void cuda_preprocess(uint8_t* src, int src_width, int src_height, float* dst, int dst_width, int dst_height, 13 | cudaStream_t stream); 14 | 15 | void cuda_batch_preprocess(std::vector& img_batch, float* dst, int 
dst_width, int dst_height, 16 | cudaStream_t stream); 17 | -------------------------------------------------------------------------------- /yolo11/include/types.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "config.h" 3 | 4 | struct alignas(float) Detection { 5 | //center_x center_y w h 6 | float bbox[4]; 7 | float conf; // bbox_conf * cls_conf 8 | float class_id; 9 | float mask[32]; 10 | float keypoints[kNumberOfPoints * 3]; // 17*3 keypoints 11 | float angle; // obb angle 12 | }; 13 | 14 | struct AffineMatrix { 15 | float value[6]; 16 | }; 17 | 18 | const int bbox_element = 19 | sizeof(AffineMatrix) / sizeof(float) + 1; // left, top, right, bottom, confidence, class, keepflag 20 | -------------------------------------------------------------------------------- /yolo11_tripy/.gitignore: -------------------------------------------------------------------------------- 1 | imagenet_classes.txt 2 | *.JPEG 3 | *.pt 4 | -------------------------------------------------------------------------------- /yolo11_tripy/README.md: -------------------------------------------------------------------------------- 1 | # YOLO11 Tripy 2 | 3 | This example implements a YOLO11 classifier model using [Tripy](https://nvidia.github.io/TensorRT-Incubator/). 4 | 5 | ## Running The Example 6 | 7 | Run the following commands from the [`yolo11_tripy`](./) directory: 8 | 9 | 1. Install Dependencies: 10 | 11 | ```bash 12 | python3 -m pip install -r requirements.txt 13 | ``` 14 | 15 | 2. Download ImageNet classes file: 16 | 17 | ```bash 18 | wget https://raw.githubusercontent.com/joannzhang00/ImageNet-dataset-classes-labels/main/imagenet_classes.txt 19 | ``` 20 | 21 | 3. [*Optional*] Download some images: 22 | 23 | ```bash 24 | wget https://raw.githubusercontent.com/EliSchwartz/imagenet-sample-images/master/n01558993_robin.JPEG 25 | wget https://raw.githubusercontent.com/EliSchwartz/imagenet-sample-images/master/n04389033_tank.JPEG 26 | ``` 27 | 28 | You can skip this step if you already have images you'd like to classify. 29 | 30 | 3. Build the model: 31 | 32 | ```bash 33 | python3 compile_classifier.py 34 | ``` 35 | 36 | You can configure various aspects of the model when you compile. 37 | Run `python3 compile_classifier.py -h` for details. 38 | 39 | 4. Run inference: 40 | 41 | ```bash 42 | python3 classify.py n01558993_robin.JPEG n04389033_tank.JPEG 43 | ``` 44 | 45 | The `classify.py` script allows you to pass one or more image file paths on the command line. 46 | The images are batched and classified in a single forward pass. 
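Since the script accepts one or more paths, a single image works too; for example (a usage sketch, assuming the optional robin image from step 3 was downloaded):

```bash
python3 classify.py n01558993_robin.JPEG
```

`constants.py` (below) defines the expected input shape of 3x224x224, which the preprocessing applies before the forward pass.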
47 | -------------------------------------------------------------------------------- /yolo11_tripy/constants.py: -------------------------------------------------------------------------------- 1 | IMAGE_C = 3 2 | IMAGE_H = 224 3 | IMAGE_W = 224 4 | -------------------------------------------------------------------------------- /yolo11_tripy/requirements.txt: -------------------------------------------------------------------------------- 1 | -f https://nvidia.github.io/TensorRT-Incubator/packages.html 2 | nvtripy>=0.1.1 3 | opencv-python-headless 4 | numpy 5 | torch 6 | -------------------------------------------------------------------------------- /yolop/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.6) 2 | 3 | project(yolop) 4 | 5 | add_definitions(-std=c++11) 6 | 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 8 | set(CMAKE_CXX_STANDARD 11) 9 | set(CMAKE_BUILD_TYPE Release) 10 | 11 | find_package(CUDA REQUIRED) 12 | 13 | include_directories(${PROJECT_SOURCE_DIR}/include) 14 | 15 | find_package(OpenCV REQUIRED) 16 | include_directories(${OpenCV_INCLUDE_DIRS}) 17 | 18 | # cuda 19 | include_directories(/usr/local/cuda-10.2/include) 20 | link_directories(/usr/local/cuda-10.2/lib64) 21 | # tensorrt 22 | include_directories(/usr/include/aarch64-linux-gnu/) 23 | link_directories(/usr/lib/aarch64-linux-gnu/) 24 | 25 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED") 26 | 27 | # to generate plugins 28 | cuda_add_library(myplugins SHARED ${PROJECT_SOURCE_DIR}/yololayer.cu) 29 | target_link_libraries(myplugins nvinfer cudart) 30 | 31 | # to generate trt and test image dir 32 | add_executable(yolop ${PROJECT_SOURCE_DIR}/yolop.cpp) 33 | target_link_libraries(yolop nvinfer cudart myplugins ${OpenCV_LIBS}) 34 | add_definitions(-O3 -pthread) 35 | 36 | -------------------------------------------------------------------------------- /yolop/cuda_utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | #ifndef CUDA_CHECK 5 | #define CUDA_CHECK(callstr)\ 6 | {\ 7 | cudaError_t error_code = callstr;\ 8 | if (error_code != cudaSuccess) {\ 9 | std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__;\ 10 | assert(0);\ 11 | }\ 12 | } 13 | #endif // CUDA_CHECK 14 | 15 | -------------------------------------------------------------------------------- /yolop/gen_wts.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import torch 3 | import struct 4 | 5 | # TODO: YOLOP_BASE_DIR is the root of YOLOP 6 | print("[WARN] Please download/clone YOLOP, then set YOLOP_BASE_DIR to the root of YOLOP") 7 | 8 | #YOLOP_BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 9 | YOLOP_BASE_DIR = "/home/user/jetson/tmp/YOLOP" 10 | 11 | sys.path.append(YOLOP_BASE_DIR) 12 | from lib.models import get_net 13 | from lib.config import cfg 14 | 15 | 16 | # Initialize 17 | device = torch.device('cpu') 18 | # Load model 19 | model = get_net(cfg) 20 | checkpoint = torch.load(YOLOP_BASE_DIR + '/weights/End-to-end.pth', map_location=device) 21 | model.load_state_dict(checkpoint['state_dict']) 22 | # load to FP32 23 | model.float() 24 | model.to(device).eval() 25 | 26 | f = open('yolop.wts', 'w') 27 | f.write('{}\n'.format(len(model.state_dict().keys()))) 28 | for k, v in model.state_dict().items(): 29 | vr = 
v.reshape(-1).cpu().numpy() 30 | f.write('{} {} '.format(k, len(vr))) 31 | for vv in vr: 32 | f.write(' ') 33 | f.write(struct.pack('>f',float(vv)).hex()) 34 | f.write('\n') 35 | 36 | f.close() 37 | 38 | print("save as yolop.wts") -------------------------------------------------------------------------------- /yolop/logging.h: -------------------------------------------------------------------------------- 1 | // create by ausk(jinlj) 2022/10/25 2 | #pragma once 3 | 4 | #include "NvInferRuntimeCommon.h" 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "macros.h" 13 | 14 | #if NV_TENSORRT_MAJOR >= 8 15 | #define TRT_NOEXCEPT noexcept 16 | #else 17 | #define TRT_NOEXCEPT 18 | #endif 19 | 20 | using Severity = nvinfer1::ILogger::Severity; 21 | 22 | class Logger : public nvinfer1::ILogger 23 | { 24 | public: 25 | void log(Severity severity, const char* msg) TRT_NOEXCEPT override 26 | { 27 | if (severity < Severity::kINFO) { 28 | std::cout << msg << std::endl; 29 | } 30 | } 31 | }; 32 | -------------------------------------------------------------------------------- /yolop/macros.h: -------------------------------------------------------------------------------- 1 | #ifndef __MACROS_H 2 | #define __MACROS_H 3 | 4 | #if NV_TENSORRT_MAJOR >= 8 5 | #define TRT_NOEXCEPT noexcept 6 | #define TRT_CONST_ENQUEUE const 7 | #else 8 | #define TRT_NOEXCEPT 9 | #define TRT_CONST_ENQUEUE 10 | #endif 11 | 12 | #endif // __MACROS_H -------------------------------------------------------------------------------- /yolov10/include/calibrator.h: -------------------------------------------------------------------------------- 1 | #ifndef ENTROPY_CALIBRATOR_H 2 | #define ENTROPY_CALIBRATOR_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "macros.h" 8 | 9 | //! \class Int8EntropyCalibrator2 10 | //! 11 | //! \brief Implements Entropy calibrator 2. 12 | //! CalibrationAlgoType is kENTROPY_CALIBRATION_2. 13 | //! 
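//! Editorial note on intended usage (the definitions live in the corresponding .cpp):
//! the calibrator is constructed with the calibration image directory, the cache-table
//! path and the input tensor name, and is typically handed to TensorRT via
//! IBuilderConfig::setInt8Calibrator(). During an INT8 engine build, getBatch() is
//! expected to copy the next preprocessed batch into device_input_ and expose it through
//! bindings, while read/writeCalibrationCache() let later builds reuse the stored scale
//! table (controlled by read_cache) instead of re-reading the images.
//!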
14 | class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 { 15 | public: 16 | Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, 17 | const char* input_blob_name, bool read_cache = true); 18 | virtual ~Int8EntropyCalibrator2(); 19 | int getBatchSize() const TRT_NOEXCEPT override; 20 | bool getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT override; 21 | const void* readCalibrationCache(size_t& length) TRT_NOEXCEPT override; 22 | void writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT override; 23 | 24 | private: 25 | int batchsize_; 26 | int input_w_; 27 | int input_h_; 28 | int img_idx_; 29 | std::string img_dir_; 30 | std::vector img_files_; 31 | size_t input_count_; 32 | std::string calib_table_name_; 33 | const char* input_blob_name_; 34 | bool read_cache_; 35 | void* device_input_; 36 | std::vector calib_cache_; 37 | }; 38 | 39 | #endif // ENTROPY_CALIBRATOR_H 40 | -------------------------------------------------------------------------------- /yolov10/include/config.h: -------------------------------------------------------------------------------- 1 | //#define USE_FP32 2 | #define USE_FP16 3 | // #define USE_INT8 4 | 5 | const static char* kInputTensorName = "images"; 6 | const static char* kOutputTensorName = "output"; 7 | const static int kNumClass = 80; 8 | const static int kBatchSize = 1; 9 | const static int kGpuId = 0; 10 | const static int kInputH = 640; 11 | const static int kInputW = 640; 12 | const static float kConfThresh = 0.5f; 13 | const static int kMaxInputImageSize = 3000 * 3000; 14 | const static int kMaxNumOutputBbox = 1000; 15 | //Quantization input image folder path 16 | const static char* kInputQuantizationFolder = "./coco_calib"; 17 | -------------------------------------------------------------------------------- /yolov10/include/cuda_utils.h: -------------------------------------------------------------------------------- 1 | #ifndef TRTX_CUDA_UTILS_H_ 2 | #define TRTX_CUDA_UTILS_H_ 3 | 4 | #include 5 | 6 | #ifndef CUDA_CHECK 7 | #define CUDA_CHECK(callstr) \ 8 | { \ 9 | cudaError_t error_code = callstr; \ 10 | if (error_code != cudaSuccess) { \ 11 | std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \ 12 | assert(0); \ 13 | } \ 14 | } 15 | #endif // CUDA_CHECK 16 | 17 | #endif // TRTX_CUDA_UTILS_H_ 18 | -------------------------------------------------------------------------------- /yolov10/include/macros.h: -------------------------------------------------------------------------------- 1 | #ifndef __MACROS_H 2 | #define __MACROS_H 3 | 4 | #include "NvInfer.h" 5 | 6 | #ifdef API_EXPORTS 7 | #if defined(_MSC_VER) 8 | #define API __declspec(dllexport) 9 | #else 10 | #define API __attribute__((visibility("default"))) 11 | #endif 12 | #else 13 | 14 | #if defined(_MSC_VER) 15 | #define API __declspec(dllimport) 16 | #else 17 | #define API 18 | #endif 19 | #endif // API_EXPORTS 20 | 21 | #if NV_TENSORRT_MAJOR >= 8 22 | #define TRT_NOEXCEPT noexcept 23 | #define TRT_CONST_ENQUEUE const 24 | #else 25 | #define TRT_NOEXCEPT 26 | #define TRT_CONST_ENQUEUE 27 | #endif 28 | 29 | #endif // __MACROS_H 30 | -------------------------------------------------------------------------------- /yolov10/include/model.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include "NvInfer.h" 6 | 7 | nvinfer1::IHostMemory* 
buildEngineYolov10DetN(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, 8 | nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, 9 | int& max_channels); 10 | 11 | nvinfer1::IHostMemory* buildEngineYolov10DetS(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, 12 | nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, 13 | int& max_channels); 14 | 15 | nvinfer1::IHostMemory* buildEngineYolov10DetM(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, 16 | nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, 17 | int& max_channels); 18 | 19 | nvinfer1::IHostMemory* buildEngineYolov10DetBL(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, 20 | nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, 21 | int& max_channels); 22 | 23 | nvinfer1::IHostMemory* buildEngineYolov10DetX(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, 24 | nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, 25 | int& max_channels); 26 | -------------------------------------------------------------------------------- /yolov10/include/postprocess.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include "NvInfer.h" 5 | #include "types.h" 6 | 7 | cv::Rect get_rect(cv::Mat& img, float bbox[4]); 8 | 9 | void draw_bbox(std::vector& img_batch, std::vector>& res_batch); 10 | 11 | void batch_topk(std::vector>& res_batch, float* output, int batch_size, int output_size, 12 | float conf_thresh, int topk = 300); 13 | -------------------------------------------------------------------------------- /yolov10/include/preprocess.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include "NvInfer.h" 6 | #include "types.h" 7 | 8 | void cuda_preprocess_init(int max_image_size); 9 | 10 | void cuda_preprocess_destroy(); 11 | 12 | void cuda_preprocess(uint8_t* src, int src_width, int src_height, float* dst, int dst_width, int dst_height, 13 | cudaStream_t stream); 14 | 15 | void cuda_batch_preprocess(std::vector& img_batch, float* dst, int dst_width, int dst_height, 16 | cudaStream_t stream); 17 | -------------------------------------------------------------------------------- /yolov10/include/types.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "config.h" 3 | 4 | struct alignas(float) Detection { 5 | //center_x center_y w h 6 | float bbox[4]; 7 | float conf; // bbox_conf * cls_conf 8 | float class_id; 9 | }; 10 | 11 | struct AffineMatrix { 12 | float value[6]; 13 | }; 14 | 15 | const int bbox_element = 16 | sizeof(Detection) / sizeof(float) + 1; // left, top, right, bottom, confidence, class, keepflag 17 | -------------------------------------------------------------------------------- /yolov12/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.10) 2 | 3 | project(yolov12) 4 | 5 | add_definitions(-std=c++11) 6 | add_definitions(-DAPI_EXPORTS) 7 | set(CMAKE_CXX_STANDARD 11) 8 | set(CMAKE_BUILD_TYPE Debug) 9 | 10 | set(CMAKE_CUDA_COMPILER /usr/local/cuda/bin/nvcc) 11 | enable_language(CUDA) 12 | 13 | include_directories(${PROJECT_SOURCE_DIR}/include) 14 | include_directories(${PROJECT_SOURCE_DIR}/plugin) 15 | 16 | # include and link dirs of cuda and tensorrt, you need adapt them 
if yours are different 17 | if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") 18 | message("embed_platform on") 19 | include_directories(/usr/local/cuda/targets/aarch64-linux/include) 20 | link_directories(/usr/local/cuda/targets/aarch64-linux/lib) 21 | else() 22 | message("embed_platform off") 23 | 24 | # cuda 25 | include_directories(/usr/local/cuda/include) 26 | link_directories(/usr/local/cuda/lib64) 27 | 28 | # tensorrt 29 | include_directories(/workspace/shared/TensorRT-8.6.1.6/include) 30 | link_directories(/workspace/shared/TensorRT-8.6.1.6/lib) 31 | endif() 32 | 33 | add_library(myplugins SHARED ${PROJECT_SOURCE_DIR}/plugin/yololayer.cu) 34 | target_link_libraries(myplugins nvinfer cudart) 35 | 36 | find_package(OpenCV) 37 | include_directories(${OpenCV_INCLUDE_DIRS}) 38 | 39 | file(GLOB_RECURSE SRCS ${PROJECT_SOURCE_DIR}/src/*.cpp ${PROJECT_SOURCE_DIR}/src/*.cu) 40 | 41 | add_executable(yolo12_det ${PROJECT_SOURCE_DIR}/yolo12_det.cpp ${SRCS}) 42 | target_link_libraries(yolo12_det nvinfer) 43 | target_link_libraries(yolo12_det cudart) 44 | target_link_libraries(yolo12_det myplugins) 45 | target_link_libraries(yolo12_det ${OpenCV_LIBS}) 46 | -------------------------------------------------------------------------------- /yolov12/include/config.h: -------------------------------------------------------------------------------- 1 | #define USE_FP16 2 | // #define USE_FP32 3 | // #define USE_INT8 4 | 5 | const static char* kInputTensorName = "images"; 6 | const static char* kOutputTensorName = "output"; 7 | const static char* kProtoTensorName = "proto"; 8 | const static int kNumClass = 80; 9 | const static int kPoseNumClass = 1; 10 | const static int kNumberOfPoints = 17; // number of keypoints total 11 | // obb model's number of classes 12 | constexpr static int kObbNumClass = 15; 13 | const static int kObbNe = 1; // number of extra parameters 14 | const static int kBatchSize = 1; 15 | const static int kGpuId = 0; 16 | const static int kInputH = 640; 17 | const static int kInputW = 640; 18 | const static int kObbInputH = 1024; 19 | const static int kObbInputW = 1024; 20 | const static float kNmsThresh = 0.45f; 21 | const static float kConfThresh = 0.5f; 22 | const static float kConfThreshKeypoints = 0.5f; // keypoints confidence 23 | const static int kMaxInputImageSize = 3000 * 3000; 24 | const static int kMaxNumOutputBbox = 1000; 25 | //Quantization input image folder path 26 | const static char* kInputQuantizationFolder = "./coco_calib"; 27 | 28 | // Classfication model's number of classes 29 | constexpr static int kClsNumClass = 1000; 30 | // Classfication model's input shape 31 | constexpr static int kClsInputH = 224; 32 | constexpr static int kClsInputW = 224; 33 | -------------------------------------------------------------------------------- /yolov12/include/cuda_utils.h: -------------------------------------------------------------------------------- 1 | #ifndef TRTX_CUDA_UTILS_H_ 2 | #define TRTX_CUDA_UTILS_H_ 3 | 4 | #include 5 | 6 | #ifndef CUDA_CHECK 7 | #define CUDA_CHECK(callstr) \ 8 | { \ 9 | cudaError_t error_code = callstr; \ 10 | if (error_code != cudaSuccess) { \ 11 | std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \ 12 | assert(0); \ 13 | } \ 14 | } 15 | #endif // CUDA_CHECK 16 | 17 | #endif // TRTX_CUDA_UTILS_H_ 18 | -------------------------------------------------------------------------------- /yolov12/include/macros.h: -------------------------------------------------------------------------------- 1 | #ifndef __MACROS_H 2 | 
#define __MACROS_H 3 | 4 | #include "NvInfer.h" 5 | 6 | #ifdef API_EXPORTS 7 | #if defined(_MSC_VER) 8 | #define API __declspec(dllexport) 9 | #else 10 | #define API __attribute__((visibility("default"))) 11 | #endif 12 | #else 13 | 14 | #if defined(_MSC_VER) 15 | #define API __declspec(dllimport) 16 | #else 17 | #define API 18 | #endif 19 | #endif // API_EXPORTS 20 | 21 | #if NV_TENSORRT_MAJOR >= 8 22 | #define TRT_NOEXCEPT noexcept 23 | #define TRT_CONST_ENQUEUE const 24 | #else 25 | #define TRT_NOEXCEPT 26 | #define TRT_CONST_ENQUEUE 27 | #endif 28 | 29 | #endif // __MACROS_H 30 | -------------------------------------------------------------------------------- /yolov12/include/model.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include "NvInfer.h" 6 | 7 | nvinfer1::IHostMemory* buildEngineYolo12Det(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, 8 | nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw, 9 | int& max_channels, std::string& type); 10 | -------------------------------------------------------------------------------- /yolov12/include/preprocess.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include "NvInfer.h" 6 | #include "types.h" 7 | 8 | void cuda_preprocess_init(int max_image_size); 9 | 10 | void cuda_preprocess_destroy(); 11 | 12 | void cuda_preprocess(uint8_t* src, int src_width, int src_height, float* dst, int dst_width, int dst_height, 13 | cudaStream_t stream); 14 | 15 | void cuda_batch_preprocess(std::vector& img_batch, float* dst, int dst_width, int dst_height, 16 | cudaStream_t stream); 17 | -------------------------------------------------------------------------------- /yolov12/include/types.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "config.h" 3 | 4 | struct alignas(float) Detection { 5 | //center_x center_y w h 6 | float bbox[4]; 7 | float conf; // bbox_conf * cls_conf 8 | float class_id; 9 | float mask[32]; 10 | float keypoints[kNumberOfPoints * 3]; // 17*3 keypoints 11 | float angle; // obb angle 12 | }; 13 | 14 | struct AffineMatrix { 15 | float value[6]; 16 | }; 17 | 18 | const int bbox_element = 19 | sizeof(AffineMatrix) / sizeof(float) + 1; // left, top, right, bottom, confidence, class, keepflag 20 | -------------------------------------------------------------------------------- /yolov3-spp/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.6) 2 | 3 | project(yolov3-spp) 4 | 5 | add_definitions(-std=c++11) 6 | 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 8 | set(CMAKE_CXX_STANDARD 11) 9 | set(CMAKE_BUILD_TYPE Debug) 10 | 11 | find_package(CUDA REQUIRED) 12 | 13 | include_directories(${PROJECT_SOURCE_DIR}/include) 14 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different 15 | # cuda 16 | include_directories(/usr/local/cuda/include) 17 | link_directories(/usr/local/cuda/lib64) 18 | # tensorrt 19 | include_directories(/usr/include/x86_64-linux-gnu/) 20 | link_directories(/usr/lib/x86_64-linux-gnu/) 21 | 22 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED") 23 | 24 | cuda_add_library(yololayer SHARED ${PROJECT_SOURCE_DIR}/yololayer.cu) 25 | target_link_libraries(yololayer nvinfer cudart) 26 | 27 | 
find_package(OpenCV) 28 | include_directories(${OpenCV_INCLUDE_DIRS}) 29 | 30 | add_executable(yolov3-spp ${PROJECT_SOURCE_DIR}/yolov3-spp.cpp) 31 | target_link_libraries(yolov3-spp nvinfer) 32 | target_link_libraries(yolov3-spp cudart) 33 | target_link_libraries(yolov3-spp yololayer) 34 | target_link_libraries(yolov3-spp ${OpenCV_LIBS}) 35 | 36 | add_definitions(-O2 -pthread) 37 | 38 | -------------------------------------------------------------------------------- /yolov3-spp/gen_wts.py: -------------------------------------------------------------------------------- 1 | import struct 2 | import sys 3 | from models import * 4 | from utils.utils import * 5 | 6 | model = Darknet('cfg/yolov3-spp.cfg', (416, 416)) 7 | weights = sys.argv[1] 8 | dev = '0' 9 | device = torch_utils.select_device(dev) 10 | model.load_state_dict(torch.load(weights, map_location=device)['model']) 11 | 12 | 13 | with open('yolov3-spp_ultralytics68.wts', 'w') as f: 14 | f.write('{}\n'.format(len(model.state_dict().keys()))) 15 | for k, v in model.state_dict().items(): 16 | vr = v.reshape(-1).cpu().numpy() 17 | f.write('{} {} '.format(k, len(vr))) 18 | for vv in vr: 19 | f.write(' ') 20 | f.write(struct.pack('>f',float(vv)).hex()) 21 | f.write('\n') 22 | 23 | -------------------------------------------------------------------------------- /yolov3-spp/samples/bus.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wang-xinyu/tensorrtx/c6bcebd812907a6d9eb66ecc909eb17b51d40fac/yolov3-spp/samples/bus.jpg -------------------------------------------------------------------------------- /yolov3-spp/samples/zidane.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wang-xinyu/tensorrtx/c6bcebd812907a6d9eb66ecc909eb17b51d40fac/yolov3-spp/samples/zidane.jpg -------------------------------------------------------------------------------- /yolov3-tiny/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.6) 2 | 3 | project(yolov3-tiny) 4 | 5 | add_definitions(-std=c++11) 6 | 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 8 | set(CMAKE_CXX_STANDARD 11) 9 | set(CMAKE_BUILD_TYPE Debug) 10 | 11 | find_package(CUDA REQUIRED) 12 | 13 | include_directories(${PROJECT_SOURCE_DIR}/include) 14 | if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") 15 | message("embed_platform on") 16 | include_directories(/usr/local/cuda/targets/aarch64-linux/include) 17 | link_directories(/usr/local/cuda/targets/aarch64-linux/lib) 18 | else() 19 | message("embed_platform off") 20 | include_directories(/usr/local/cuda/include) 21 | link_directories(/usr/local/cuda/lib64) 22 | endif() 23 | 24 | 25 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED") 26 | 27 | #cuda_add_library(leaky ${PROJECT_SOURCE_DIR}/leaky.cu) 28 | cuda_add_library(yololayer SHARED ${PROJECT_SOURCE_DIR}/yololayer.cu) 29 | target_link_libraries(yololayer nvinfer cudart) 30 | 31 | find_package(OpenCV) 32 | include_directories(${OpenCV_INCLUDE_DIRS}) 33 | 34 | add_executable(yolov3-tiny ${PROJECT_SOURCE_DIR}/yolov3-tiny.cpp) 35 | target_link_libraries(yolov3-tiny nvinfer) 36 | target_link_libraries(yolov3-tiny cudart) 37 | target_link_libraries(yolov3-tiny yololayer) 38 | target_link_libraries(yolov3-tiny ${OpenCV_LIBS}) 39 | 40 | add_definitions(-O2 -pthread) 41 | 42 | 
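The CMakeLists.txt above builds the yololayer plugin and the yolov3-tiny executable. A typical out-of-source build is sketched below; this is an editorial usage example, not a tracked file, and it assumes CUDA, TensorRT and OpenCV are installed at the paths referenced above and that the .wts weights have been generated with gen_wts.py:

```bash
cd tensorrtx/yolov3-tiny
mkdir build && cd build
cmake ..
make
# The built binary follows the repo's usual -s (serialize engine) / -d (run inference)
# convention shown for the vgg example near the top of this document; see each
# subproject's README for the exact arguments.
```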
-------------------------------------------------------------------------------- /yolov3-tiny/gen_wts.py: -------------------------------------------------------------------------------- 1 | import struct 2 | import sys 3 | from models import * 4 | from utils.utils import * 5 | 6 | model = Darknet('cfg/yolov3-tiny.cfg', (608, 608)) 7 | weights = sys.argv[1] 8 | device = torch_utils.select_device('0') 9 | if weights.endswith('.pt'): # pytorch format 10 | model.load_state_dict(torch.load(weights, map_location=device)['model']) 11 | else: # darknet format 12 | load_darknet_weights(model, weights) 13 | model = model.eval() 14 | 15 | with open('yolov3-tiny.wts', 'w') as f: 16 | f.write('{}\n'.format(len(model.state_dict().keys()))) 17 | for k, v in model.state_dict().items(): 18 | vr = v.reshape(-1).cpu().numpy() 19 | f.write('{} {} '.format(k, len(vr))) 20 | for vv in vr: 21 | f.write(' ') 22 | f.write(struct.pack('>f',float(vv)).hex()) 23 | f.write('\n') 24 | 25 | -------------------------------------------------------------------------------- /yolov3-tiny/macros.h: -------------------------------------------------------------------------------- 1 | #ifndef __MACROS_H 2 | #define __MACROS_H 3 | 4 | #if NV_TENSORRT_MAJOR >= 8 5 | #define TRT_NOEXCEPT noexcept 6 | #define TRT_CONST_ENQUEUE const 7 | #else 8 | #define TRT_NOEXCEPT 9 | #define TRT_CONST_ENQUEUE 10 | #endif 11 | 12 | #endif // __MACROS_H -------------------------------------------------------------------------------- /yolov3/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.6) 2 | 3 | project(yolov3) 4 | 5 | add_definitions(-std=c++11) 6 | 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 8 | set(CMAKE_CXX_STANDARD 11) 9 | set(CMAKE_BUILD_TYPE Debug) 10 | 11 | find_package(CUDA REQUIRED) 12 | 13 | include_directories(${PROJECT_SOURCE_DIR}/include) 14 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different 15 | # cuda 16 | include_directories(/usr/local/cuda/include) 17 | link_directories(/usr/local/cuda/lib64) 18 | # tensorrt 19 | include_directories(/usr/include/x86_64-linux-gnu/) 20 | link_directories(/usr/lib/x86_64-linux-gnu/) 21 | 22 | find_package(OpenCV) 23 | include_directories(${OpenCV_INCLUDE_DIRS}) 24 | 25 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED") 26 | 27 | #cuda_add_library(leaky ${PROJECT_SOURCE_DIR}/leaky.cu) 28 | cuda_add_library(yololayer SHARED ${PROJECT_SOURCE_DIR}/yololayer.cu) 29 | target_link_libraries(yololayer nvinfer cudart ${OpenCV_LIBS}) 30 | 31 | add_executable(yolov3 ${PROJECT_SOURCE_DIR}/calibrator.cpp ${PROJECT_SOURCE_DIR}/yolov3.cpp) 32 | target_link_libraries(yolov3 nvinfer) 33 | target_link_libraries(yolov3 cudart) 34 | target_link_libraries(yolov3 yololayer) 35 | target_link_libraries(yolov3 ${OpenCV_LIBS}) 36 | 37 | add_definitions(-O2 -pthread) 38 | 39 | -------------------------------------------------------------------------------- /yolov3/calibrator.h: -------------------------------------------------------------------------------- 1 | #ifndef ENTROPY_CALIBRATOR_H 2 | #define ENTROPY_CALIBRATOR_H 3 | 4 | #include "NvInfer.h" 5 | #include 6 | #include 7 | #include "macros.h" 8 | 9 | //! \class Int8EntropyCalibrator2 10 | //! 11 | //! \brief Implements Entropy calibrator 2. 12 | //! CalibrationAlgoType is kENTROPY_CALIBRATION_2. 13 | //! 
14 | class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 15 | { 16 | public: 17 | Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, const char* input_blob_name, bool read_cache = true); 18 | 19 | virtual ~Int8EntropyCalibrator2(); 20 | int getBatchSize() const TRT_NOEXCEPT override; 21 | bool getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT override; 22 | const void* readCalibrationCache(size_t& length) TRT_NOEXCEPT override; 23 | void writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT override; 24 | 25 | private: 26 | int batchsize_; 27 | int input_w_; 28 | int input_h_; 29 | int img_idx_; 30 | std::string img_dir_; 31 | std::vector img_files_; 32 | size_t input_count_; 33 | std::string calib_table_name_; 34 | const char* input_blob_name_; 35 | bool read_cache_; 36 | void* device_input_; 37 | std::vector calib_cache_; 38 | }; 39 | 40 | #endif // ENTROPY_CALIBRATOR_H 41 | -------------------------------------------------------------------------------- /yolov3/gen_wts.py: -------------------------------------------------------------------------------- 1 | import struct 2 | import sys 3 | from models import * 4 | from utils.utils import * 5 | 6 | model = Darknet('cfg/yolov3.cfg', (608, 608)) 7 | weights = sys.argv[1] 8 | device = torch_utils.select_device('0') 9 | if weights.endswith('.pt'): # pytorch format 10 | model.load_state_dict(torch.load(weights, map_location=device)['model']) 11 | else: # darknet format 12 | load_darknet_weights(model, weights) 13 | model = model.eval() 14 | 15 | with open('yolov3.wts', 'w') as f: 16 | f.write('{}\n'.format(len(model.state_dict().keys()))) 17 | for k, v in model.state_dict().items(): 18 | vr = v.reshape(-1).cpu().numpy() 19 | f.write('{} {} '.format(k, len(vr))) 20 | for vv in vr: 21 | f.write(' ') 22 | f.write(struct.pack('>f',float(vv)).hex()) 23 | f.write('\n') 24 | 25 | -------------------------------------------------------------------------------- /yolov3/macros.h: -------------------------------------------------------------------------------- 1 | #ifndef __MACROS_H 2 | #define __MACROS_H 3 | 4 | #if NV_TENSORRT_MAJOR >= 8 5 | #define TRT_NOEXCEPT noexcept 6 | #define TRT_CONST_ENQUEUE const 7 | #else 8 | #define TRT_NOEXCEPT 9 | #define TRT_CONST_ENQUEUE 10 | #endif 11 | 12 | #endif // __MACROS_H 13 | -------------------------------------------------------------------------------- /yolov4/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.6) 2 | 3 | project(yolov4) 4 | 5 | add_definitions(-std=c++11) 6 | 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 8 | set(CMAKE_CXX_STANDARD 11) 9 | set(CMAKE_BUILD_TYPE Debug) 10 | 11 | find_package(CUDA REQUIRED) 12 | 13 | include_directories(${PROJECT_SOURCE_DIR}/include) 14 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different 15 | # cuda 16 | include_directories(/usr/local/cuda/include) 17 | link_directories(/usr/local/cuda/lib64) 18 | # tensorrt 19 | include_directories(/usr/include/x86_64-linux-gnu/) 20 | link_directories(/usr/lib/x86_64-linux-gnu/) 21 | 22 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED") 23 | 24 | cuda_add_library(myplugins SHARED ${PROJECT_SOURCE_DIR}/yololayer.cu ${PROJECT_SOURCE_DIR}/mish.cu) 25 | target_link_libraries(myplugins nvinfer cudart) 26 | 27 | find_package(OpenCV) 
28 | include_directories(${OpenCV_INCLUDE_DIRS}) 29 | 30 | add_executable(yolov4 ${PROJECT_SOURCE_DIR}/yolov4.cpp) 31 | target_link_libraries(yolov4 nvinfer) 32 | target_link_libraries(yolov4 cudart) 33 | target_link_libraries(yolov4 myplugins) 34 | target_link_libraries(yolov4 ${OpenCV_LIBS}) 35 | 36 | add_definitions(-O2 -pthread) 37 | 38 | -------------------------------------------------------------------------------- /yolov4/gen_wts.py: -------------------------------------------------------------------------------- 1 | import struct 2 | import sys 3 | from models import * 4 | from utils.utils import * 5 | 6 | model = Darknet('cfg/yolov4.cfg', (608, 608)) 7 | weights = sys.argv[1] 8 | device = torch_utils.select_device('0') 9 | if weights.endswith('.pt'): # pytorch format 10 | model.load_state_dict(torch.load(weights, map_location=device)['model']) 11 | else: # darknet format 12 | load_darknet_weights(model, weights) 13 | 14 | with open('yolov4.wts', 'w') as f: 15 | f.write('{}\n'.format(len(model.state_dict().keys()))) 16 | for k, v in model.state_dict().items(): 17 | vr = v.reshape(-1).cpu().numpy() 18 | f.write('{} {} '.format(k, len(vr))) 19 | for vv in vr: 20 | f.write(' ') 21 | f.write(struct.pack('>f',float(vv)).hex()) 22 | f.write('\n') 23 | 24 | -------------------------------------------------------------------------------- /yolov4/utils.h: -------------------------------------------------------------------------------- 1 | #ifndef __TRT_UTILS_H_ 2 | #define __TRT_UTILS_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #ifndef CUDA_CHECK 10 | 11 | #define CUDA_CHECK(callstr) \ 12 | { \ 13 | cudaError_t error_code = callstr; \ 14 | if (error_code != cudaSuccess) { \ 15 | std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \ 16 | assert(0); \ 17 | } \ 18 | } 19 | 20 | #endif 21 | 22 | namespace Tn 23 | { 24 | template 25 | void write(char*& buffer, const T& val) 26 | { 27 | *reinterpret_cast(buffer) = val; 28 | buffer += sizeof(T); 29 | } 30 | 31 | template 32 | void read(const char*& buffer, T& val) 33 | { 34 | val = *reinterpret_cast(buffer); 35 | buffer += sizeof(T); 36 | } 37 | } 38 | 39 | #endif 40 | -------------------------------------------------------------------------------- /yolov5/images: -------------------------------------------------------------------------------- 1 | ../yolov3-spp/samples -------------------------------------------------------------------------------- /yolov5/src/calibrator.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "macros.h" 4 | #include 5 | #include 6 | #include 7 | 8 | cv::Mat preprocess_img(cv::Mat& img, int input_w, int input_h); 9 | 10 | //! \class Int8EntropyCalibrator2 11 | //! 12 | //! \brief Implements Entropy calibrator 2. 13 | //! CalibrationAlgoType is kENTROPY_CALIBRATION_2. 14 | //! 
15 | class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 { 16 | public: 17 | Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, const char* input_blob_name, bool read_cache = true); 18 | 19 | virtual ~Int8EntropyCalibrator2(); 20 | int getBatchSize() const TRT_NOEXCEPT override; 21 | bool getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT override; 22 | const void* readCalibrationCache(size_t& length) TRT_NOEXCEPT override; 23 | void writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT override; 24 | 25 | private: 26 | int batchsize_; 27 | int input_w_; 28 | int input_h_; 29 | int img_idx_; 30 | std::string img_dir_; 31 | std::vector img_files_; 32 | size_t input_count_; 33 | std::string calib_table_name_; 34 | const char* input_blob_name_; 35 | bool read_cache_; 36 | void* device_input_; 37 | std::vector calib_cache_; 38 | }; 39 | 40 | -------------------------------------------------------------------------------- /yolov5/src/cuda_utils.h: -------------------------------------------------------------------------------- 1 | #ifndef TRTX_CUDA_UTILS_H_ 2 | #define TRTX_CUDA_UTILS_H_ 3 | 4 | #include 5 | 6 | #ifndef CUDA_CHECK 7 | #define CUDA_CHECK(callstr)\ 8 | {\ 9 | cudaError_t error_code = callstr;\ 10 | if (error_code != cudaSuccess) {\ 11 | std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__;\ 12 | assert(0);\ 13 | }\ 14 | } 15 | #endif // CUDA_CHECK 16 | 17 | #endif // TRTX_CUDA_UTILS_H_ 18 | 19 | -------------------------------------------------------------------------------- /yolov5/src/macros.h: -------------------------------------------------------------------------------- 1 | #ifndef __MACROS_H 2 | #define __MACROS_H 3 | 4 | #include 5 | 6 | #ifdef API_EXPORTS 7 | #if defined(_MSC_VER) 8 | #define API __declspec(dllexport) 9 | #else 10 | #define API __attribute__((visibility("default"))) 11 | #endif 12 | #else 13 | 14 | #if defined(_MSC_VER) 15 | #define API __declspec(dllimport) 16 | #else 17 | #define API 18 | #endif 19 | #endif // API_EXPORTS 20 | 21 | #if NV_TENSORRT_MAJOR >= 8 22 | #define TRT_NOEXCEPT noexcept 23 | #define TRT_CONST_ENQUEUE const 24 | #else 25 | #define TRT_NOEXCEPT 26 | #define TRT_CONST_ENQUEUE 27 | #endif 28 | 29 | #endif // __MACROS_H 30 | -------------------------------------------------------------------------------- /yolov5/src/model.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | nvinfer1::ICudaEngine* build_det_engine(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, 7 | nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, 8 | float& gd, float& gw, std::string& wts_name); 9 | 10 | nvinfer1::ICudaEngine* build_det_p6_engine(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, 11 | nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, 12 | float& gd, float& gw, std::string& wts_name); 13 | 14 | nvinfer1::ICudaEngine* build_cls_engine(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, float& gd, float& gw, std::string& wts_name); 15 | 16 | nvinfer1::ICudaEngine* build_seg_engine(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, float& gd, float& gw, std::string& wts_name); 17 | -------------------------------------------------------------------------------- 
/yolov5/src/postprocess.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "types.h" 4 | #include 5 | 6 | cv::Rect get_rect(cv::Mat& img, float bbox[4]); 7 | 8 | void nms(std::vector& res, float *output, float conf_thresh, float nms_thresh = 0.5); 9 | 10 | void batch_nms(std::vector>& batch_res, float *output, int batch_size, int output_size, float conf_thresh, float nms_thresh = 0.5); 11 | 12 | void draw_bbox(std::vector& img_batch, std::vector>& res_batch); 13 | 14 | std::vector process_mask(const float* proto, int proto_size, std::vector& dets); 15 | 16 | void draw_mask_bbox(cv::Mat& img, std::vector& dets, std::vector& masks, std::unordered_map& labels_map); 17 | -------------------------------------------------------------------------------- /yolov5/src/preprocess.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | void cuda_preprocess_init(int max_image_size); 8 | void cuda_preprocess_destroy(); 9 | void cuda_preprocess(uint8_t* src, int src_width, int src_height, 10 | float* dst, int dst_width, int dst_height, 11 | cudaStream_t stream); 12 | void cuda_batch_preprocess(std::vector& img_batch, 13 | float* dst, int dst_width, int dst_height, 14 | cudaStream_t stream); 15 | 16 | -------------------------------------------------------------------------------- /yolov5/src/types.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "config.h" 4 | 5 | struct YoloKernel { 6 | int width; 7 | int height; 8 | float anchors[kNumAnchor * 2]; 9 | }; 10 | 11 | struct alignas(float) Detection { 12 | float bbox[4]; // center_x center_y w h 13 | float conf; // bbox_conf * cls_conf 14 | float class_id; 15 | float mask[32]; 16 | }; 17 | 18 | -------------------------------------------------------------------------------- /yolov7/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.10) 2 | 3 | project(yolov7) 4 | 5 | add_definitions(-std=c++11) 6 | add_definitions(-DAPI_EXPORTS) 7 | set(CMAKE_CXX_STANDARD 11) 8 | set(CMAKE_BUILD_TYPE Debug) 9 | 10 | set(CMAKE_CUDA_COMPILER /usr/local/cuda/bin/nvcc) 11 | enable_language(CUDA) 12 | 13 | include_directories(${PROJECT_SOURCE_DIR}/include) 14 | include_directories(${PROJECT_SOURCE_DIR}/plugin) 15 | 16 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different 17 | if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") 18 | message("embed_platform on") 19 | include_directories(/usr/local/cuda/targets/aarch64-linux/include) 20 | link_directories(/usr/local/cuda/targets/aarch64-linux/lib) 21 | else() 22 | message("embed_platform off") 23 | # cuda 24 | include_directories(/usr/local/cuda/include) 25 | link_directories(/usr/local/cuda/lib64) 26 | 27 | # tensorrt 28 | include_directories(/home/nvidia/TensorRT-8.2.5.1/include) 29 | link_directories(/home/nvidia/TensorRT-8.2.5.1/lib) 30 | endif() 31 | 32 | add_library(myplugins SHARED ${PROJECT_SOURCE_DIR}/plugin/yololayer.cu) 33 | target_link_libraries(myplugins nvinfer cudart) 34 | 35 | find_package(OpenCV) 36 | include_directories(${OpenCV_INCLUDE_DIRS}) 37 | 38 | file(GLOB_RECURSE SRCS ${PROJECT_SOURCE_DIR}/src/*.cpp ${PROJECT_SOURCE_DIR}/src/*.cu) 39 | add_executable(yolov7 main.cpp ${SRCS}) 40 | 41 | target_link_libraries(yolov7 nvinfer) 42 | target_link_libraries(yolov7 cudart) 43 
| target_link_libraries(yolov7 myplugins) 44 | target_link_libraries(yolov7 ${OpenCV_LIBS}) 45 | 46 | -------------------------------------------------------------------------------- /yolov7/images: -------------------------------------------------------------------------------- 1 | ../yolov3-spp/samples -------------------------------------------------------------------------------- /yolov7/include/block.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "NvInfer.h" 4 | #include 5 | #include 6 | #include 7 | 8 | std::map loadWeights(const std::string file); 9 | 10 | nvinfer1::IElementWiseLayer* convBnSilu(nvinfer1::INetworkDefinition* network, std::map& weightMap, nvinfer1::ITensor& input, int c2, int k, int s, int p, std::string lname); 11 | 12 | nvinfer1::ILayer* ReOrg(nvinfer1::INetworkDefinition* network, std::map& weightMap, nvinfer1::ITensor& input, int inch); 13 | 14 | nvinfer1::ILayer* DownC(nvinfer1::INetworkDefinition* network, std::map& weightMap, nvinfer1::ITensor& input, int c1, int c2, const std::string& lname); 15 | 16 | nvinfer1::IElementWiseLayer* SPPCSPC(nvinfer1::INetworkDefinition* network, std::map& weightMap, nvinfer1::ITensor& input, int c2, const std::string& lname); 17 | 18 | nvinfer1::IElementWiseLayer* RepConv(nvinfer1::INetworkDefinition* network, std::map& weightMap, nvinfer1::ITensor& input, int c2, int k, int s, const std::string& lname); 19 | 20 | nvinfer1::IActivationLayer* convBlockLeakRelu(nvinfer1::INetworkDefinition* network, std::map& weightMap, nvinfer1::ITensor& input, int outch, int ksize, int s, int p, std::string lname); 21 | 22 | nvinfer1::IPluginV2Layer* addYoLoLayer(nvinfer1::INetworkDefinition *network, std::map& weightMap, std::string lname, std::vector dets); 23 | 24 | -------------------------------------------------------------------------------- /yolov7/include/calibrator.h: -------------------------------------------------------------------------------- 1 | #ifndef ENTROPY_CALIBRATOR_H 2 | #define ENTROPY_CALIBRATOR_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "macros.h" 8 | 9 | //! \class Int8EntropyCalibrator2 10 | //! 11 | //! \brief Implements Entropy calibrator 2. 12 | //! CalibrationAlgoType is kENTROPY_CALIBRATION_2. 13 | //! 
14 | class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 15 | { 16 | public: 17 | Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, const char* input_blob_name, bool read_cache = true); 18 | 19 | virtual ~Int8EntropyCalibrator2(); 20 | int getBatchSize() const TRT_NOEXCEPT override; 21 | bool getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT override; 22 | const void* readCalibrationCache(size_t& length) TRT_NOEXCEPT override; 23 | void writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT override; 24 | 25 | private: 26 | int batchsize_; 27 | int input_w_; 28 | int input_h_; 29 | int img_idx_; 30 | std::string img_dir_; 31 | std::vector img_files_; 32 | size_t input_count_; 33 | std::string calib_table_name_; 34 | const char* input_blob_name_; 35 | bool read_cache_; 36 | void* device_input_; 37 | std::vector calib_cache_; 38 | }; 39 | 40 | #endif // ENTROPY_CALIBRATOR_H 41 | -------------------------------------------------------------------------------- /yolov7/include/cuda_utils.h: -------------------------------------------------------------------------------- 1 | #ifndef TRTX_CUDA_UTILS_H_ 2 | #define TRTX_CUDA_UTILS_H_ 3 | 4 | #include 5 | 6 | #ifndef CUDA_CHECK 7 | #define CUDA_CHECK(callstr)\ 8 | {\ 9 | cudaError_t error_code = callstr;\ 10 | if (error_code != cudaSuccess) {\ 11 | std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__;\ 12 | assert(0);\ 13 | }\ 14 | } 15 | #endif // CUDA_CHECK 16 | 17 | #endif // TRTX_CUDA_UTILS_H_ 18 | 19 | -------------------------------------------------------------------------------- /yolov7/include/macros.h: -------------------------------------------------------------------------------- 1 | #ifndef __MACROS_H 2 | #define __MACROS_H 3 | 4 | #include "NvInfer.h" 5 | 6 | #ifdef API_EXPORTS 7 | #if defined(_MSC_VER) 8 | #define API __declspec(dllexport) 9 | #else 10 | #define API __attribute__((visibility("default"))) 11 | #endif 12 | #else 13 | 14 | #if defined(_MSC_VER) 15 | #define API __declspec(dllimport) 16 | #else 17 | #define API 18 | #endif 19 | #endif // API_EXPORTS 20 | 21 | #if NV_TENSORRT_MAJOR >= 8 22 | #define TRT_NOEXCEPT noexcept 23 | #define TRT_CONST_ENQUEUE const 24 | #else 25 | #define TRT_NOEXCEPT 26 | #define TRT_CONST_ENQUEUE 27 | #endif 28 | 29 | #endif // __MACROS_H 30 | -------------------------------------------------------------------------------- /yolov7/include/model.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "NvInfer.h" 4 | #include 5 | 6 | nvinfer1::IHostMemory* build_engine_yolov7e6e(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path); 7 | nvinfer1::IHostMemory* build_engine_yolov7d6(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path); 8 | nvinfer1::IHostMemory* build_engine_yolov7e6(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path); 9 | nvinfer1::IHostMemory* build_engine_yolov7w6(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path); 10 | nvinfer1::IHostMemory* build_engine_yolov7x(unsigned int maxBatchSize, nvinfer1::IBuilder* 
builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path); 11 | nvinfer1::IHostMemory* build_engine_yolov7(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path); 12 | nvinfer1::IHostMemory* build_engine_yolov7_tiny(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, std::string& wts_name); 13 | -------------------------------------------------------------------------------- /yolov7/include/postprocess.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "types.h" 4 | #include 5 | 6 | cv::Rect get_rect(cv::Mat& img, float bbox[4]); 7 | 8 | void nms(std::vector& res, float *output, float conf_thresh, float nms_thresh = 0.5); 9 | 10 | void batch_nms(std::vector>& batch_res, float *output, int batch_size, int output_size, float conf_thresh, float nms_thresh = 0.5); 11 | 12 | void draw_bbox(std::vector& img_batch, std::vector>& res_batch); 13 | 14 | -------------------------------------------------------------------------------- /yolov7/include/preprocess.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | void cuda_preprocess_init(int max_image_size); 9 | void cuda_preprocess_destroy(); 10 | void cuda_preprocess(uint8_t* src, int src_width, int src_height, 11 | float* dst, int dst_width, int dst_height, 12 | cudaStream_t stream); 13 | void cuda_batch_preprocess(std::vector& img_batch, 14 | float* dst, int dst_width, int dst_height, 15 | cudaStream_t stream); 16 | 17 | -------------------------------------------------------------------------------- /yolov7/include/types.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "config.h" 4 | 5 | struct YoloKernel { 6 | int width; 7 | int height; 8 | float anchors[kNumAnchor * 2]; 9 | }; 10 | 11 | struct alignas(float) Detection { 12 | //center_x center_y w h 13 | float bbox[4]; 14 | float conf; // bbox_conf * cls_conf 15 | float class_id; 16 | }; 17 | 18 | -------------------------------------------------------------------------------- /yolov7/include/utils.h: -------------------------------------------------------------------------------- 1 | #ifndef TRTX_YOLOV7_UTILS_H_ 2 | #define TRTX_YOLOV7_UTILS_H_ 3 | 4 | #include 5 | #include 6 | 7 | static inline cv::Mat preprocess_img(cv::Mat& img, int input_w, int input_h) { 8 | int w, h, x, y; 9 | float r_w = input_w / (img.cols*1.0); 10 | float r_h = input_h / (img.rows*1.0); 11 | if (r_h > r_w) { 12 | w = input_w; 13 | h = r_w * img.rows; 14 | x = 0; 15 | y = (input_h - h) / 2; 16 | } else { 17 | w = r_h * img.cols; 18 | h = input_h; 19 | x = (input_w - w) / 2; 20 | y = 0; 21 | } 22 | cv::Mat re(h, w, CV_8UC3); 23 | cv::resize(img, re, re.size(), 0, 0, cv::INTER_LINEAR); 24 | cv::Mat out(input_h, input_w, CV_8UC3, cv::Scalar(128, 128, 128)); 25 | re.copyTo(out(cv::Rect(x, y, re.cols, re.rows))); 26 | return out; 27 | } 28 | 29 | static inline int read_files_in_dir(const char *p_dir_name, std::vector &file_names) { 30 | DIR *p_dir = opendir(p_dir_name); 31 | if (p_dir == nullptr) { 32 | return -1; 33 | } 34 | 35 | struct dirent* p_file = nullptr; 36 | while ((p_file = readdir(p_dir)) != nullptr) { 37 | if (strcmp(p_file->d_name, ".") != 0 && 38 | strcmp(p_file->d_name, "..") != 0) { 
39 | //std::string cur_file_name(p_dir_name); 40 | //cur_file_name += "/"; 41 | //cur_file_name += p_file->d_name; 42 | std::string cur_file_name(p_file->d_name); 43 | file_names.push_back(cur_file_name); 44 | } 45 | } 46 | 47 | closedir(p_dir); 48 | return 0; 49 | } 50 | 51 | #endif // TRTX_YOLOV7_UTILS_H_ 52 | 53 | -------------------------------------------------------------------------------- /yolov8/include/calibrator.h: -------------------------------------------------------------------------------- 1 | #ifndef ENTROPY_CALIBRATOR_H 2 | #define ENTROPY_CALIBRATOR_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "macros.h" 8 | 9 | //! \class Int8EntropyCalibrator2 10 | //! 11 | //! \brief Implements Entropy calibrator 2. 12 | //! CalibrationAlgoType is kENTROPY_CALIBRATION_2. 13 | //! 14 | class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 { 15 | public: 16 | Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, 17 | const char* input_blob_name, bool read_cache = true); 18 | virtual ~Int8EntropyCalibrator2(); 19 | int getBatchSize() const TRT_NOEXCEPT override; 20 | bool getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT override; 21 | const void* readCalibrationCache(size_t& length) TRT_NOEXCEPT override; 22 | void writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT override; 23 | 24 | private: 25 | int batchsize_; 26 | int input_w_; 27 | int input_h_; 28 | int img_idx_; 29 | std::string img_dir_; 30 | std::vector img_files_; 31 | size_t input_count_; 32 | std::string calib_table_name_; 33 | const char* input_blob_name_; 34 | bool read_cache_; 35 | void* device_input_; 36 | std::vector calib_cache_; 37 | }; 38 | 39 | #endif // ENTROPY_CALIBRATOR_H 40 | -------------------------------------------------------------------------------- /yolov8/include/config.h: -------------------------------------------------------------------------------- 1 | #define USE_FP16 2 | //#define USE_FP32 3 | //#define USE_INT8 4 | 5 | const static char* kInputTensorName = "images"; 6 | const static char* kOutputTensorName = "output"; 7 | const static int kNumClass = 80; 8 | const static int kBatchSize = 1; 9 | const static int kGpuId = 0; 10 | const static int kInputH = 640; 11 | const static int kInputW = 640; 12 | const static float kNmsThresh = 0.45f; 13 | const static float kConfThresh = 0.5f; 14 | const static float kConfThreshKeypoints = 0.5f; // keypoints confidence 15 | const static int kMaxInputImageSize = 3000 * 3000; 16 | const static int kMaxNumOutputBbox = 1000; 17 | //Quantization input image folder path 18 | const static char* kInputQuantizationFolder = "./coco_calib"; 19 | 20 | // Classfication model's number of classes 21 | constexpr static int kClsNumClass = 1000; 22 | // Classfication model's input shape 23 | constexpr static int kClsInputH = 224; 24 | constexpr static int kClsInputW = 224; 25 | 26 | // pose model's number of classes 27 | constexpr static int kPoseNumClass = 1; 28 | const static int kNumberOfPoints = 17; // number of keypoints total 29 | 30 | // obb model's number of classes 31 | constexpr static int kObbNumClass = 15; 32 | -------------------------------------------------------------------------------- /yolov8/include/cuda_utils.h: -------------------------------------------------------------------------------- 1 | #ifndef TRTX_CUDA_UTILS_H_ 2 | #define TRTX_CUDA_UTILS_H_ 3 | 4 | #include 5 | 6 | #ifndef CUDA_CHECK 7 | #define 
CUDA_CHECK(callstr) \ 8 | { \ 9 | cudaError_t error_code = callstr; \ 10 | if (error_code != cudaSuccess) { \ 11 | std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \ 12 | assert(0); \ 13 | } \ 14 | } 15 | #endif // CUDA_CHECK 16 | 17 | #endif // TRTX_CUDA_UTILS_H_ 18 | -------------------------------------------------------------------------------- /yolov8/include/macros.h: -------------------------------------------------------------------------------- 1 | #ifndef __MACROS_H 2 | #define __MACROS_H 3 | 4 | #include "NvInfer.h" 5 | 6 | #ifdef API_EXPORTS 7 | #if defined(_MSC_VER) 8 | #define API __declspec(dllexport) 9 | #else 10 | #define API __attribute__((visibility("default"))) 11 | #endif 12 | #else 13 | 14 | #if defined(_MSC_VER) 15 | #define API __declspec(dllimport) 16 | #else 17 | #define API 18 | #endif 19 | #endif // API_EXPORTS 20 | 21 | #if NV_TENSORRT_MAJOR >= 8 22 | #define TRT_NOEXCEPT noexcept 23 | #define TRT_CONST_ENQUEUE const 24 | #else 25 | #define TRT_NOEXCEPT 26 | #define TRT_CONST_ENQUEUE 27 | #endif 28 | 29 | #endif // __MACROS_H 30 | -------------------------------------------------------------------------------- /yolov8/include/preprocess.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include "NvInfer.h" 6 | #include "types.h" 7 | 8 | void cuda_preprocess_init(int max_image_size); 9 | 10 | void cuda_preprocess_destroy(); 11 | 12 | void cuda_preprocess(uint8_t* src, int src_width, int src_height, float* dst, int dst_width, int dst_height, 13 | cudaStream_t stream); 14 | 15 | void cuda_batch_preprocess(std::vector& img_batch, float* dst, int dst_width, int dst_height, 16 | cudaStream_t stream); 17 | -------------------------------------------------------------------------------- /yolov8/include/types.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "config.h" 3 | 4 | struct alignas(float) Detection { 5 | //center_x center_y w h 6 | float bbox[4]; 7 | float conf; // bbox_conf * cls_conf 8 | float class_id; 9 | float mask[32]; 10 | float keypoints[kNumberOfPoints * 3]; // keypoints array with dynamic size based on kNumberOfPoints 11 | float angle; // obb angle 12 | }; 13 | 14 | struct AffineMatrix { 15 | float value[6]; 16 | }; 17 | 18 | const int bbox_element = 19 | sizeof(AffineMatrix) / sizeof(float) + 1; // left, top, right, bottom, confidence, class, keepflag 20 | -------------------------------------------------------------------------------- /yolov9/images: -------------------------------------------------------------------------------- 1 | ../yolov3-spp/samples -------------------------------------------------------------------------------- /yolov9/include/calibrator.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "macros.h" 4 | #include 5 | #include 6 | 7 | //! \class Int8EntropyCalibrator2 8 | //! 9 | //! \brief Implements Entropy calibrator 2. 10 | //! CalibrationAlgoType is kENTROPY_CALIBRATION_2. 11 | //! 
--------------------------------------------------------------------------------
/yolov9/images:
--------------------------------------------------------------------------------
../yolov3-spp/samples

--------------------------------------------------------------------------------
/yolov9/include/calibrator.h:
--------------------------------------------------------------------------------
#pragma once

#include "macros.h"
#include <string>
#include <vector>

//! \class Int8EntropyCalibrator2
//!
//! \brief Implements Entropy calibrator 2.
//!  CalibrationAlgoType is kENTROPY_CALIBRATION_2.
//!
class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 {
public:
    Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name,
                           const char* input_blob_name, bool read_cache = true);

    virtual ~Int8EntropyCalibrator2();
    int getBatchSize() const TRT_NOEXCEPT override;
    bool getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT override;
    const void* readCalibrationCache(size_t& length) TRT_NOEXCEPT override;
    void writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT override;

private:
    int batchsize_;
    int input_w_;
    int input_h_;
    int img_idx_;
    std::string img_dir_;
    std::vector<std::string> img_files_;
    size_t input_count_;
    std::string calib_table_name_;
    const char* input_blob_name_;
    bool read_cache_;
    void* device_input_;
    std::vector<char> calib_cache_;
};

--------------------------------------------------------------------------------
/yolov9/include/cuda_utils.h:
--------------------------------------------------------------------------------
#ifndef TRTX_CUDA_UTILS_H_
#define TRTX_CUDA_UTILS_H_

#include <cuda_runtime_api.h>
#include <cassert>
#include <iostream>

#ifndef CUDA_CHECK
#define CUDA_CHECK(callstr)\
    {\
        cudaError_t error_code = callstr;\
        if (error_code != cudaSuccess) {\
            std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__;\
            assert(0);\
        }\
    }
#endif  // CUDA_CHECK

#endif  // TRTX_CUDA_UTILS_H_

--------------------------------------------------------------------------------
/yolov9/include/macros.h:
--------------------------------------------------------------------------------
#ifndef __MACROS_H
#define __MACROS_H

#include <NvInfer.h>

#ifdef API_EXPORTS
#if defined(_MSC_VER)
#define API __declspec(dllexport)
#else
#define API __attribute__((visibility("default")))
#endif
#else

#if defined(_MSC_VER)
#define API __declspec(dllimport)
#else
#define API
#endif
#endif  // API_EXPORTS

#if NV_TENSORRT_MAJOR >= 8
#define TRT_NOEXCEPT noexcept
#define TRT_CONST_ENQUEUE const
#else
#define TRT_NOEXCEPT
#define TRT_CONST_ENQUEUE
#endif

#endif  // __MACROS_H

--------------------------------------------------------------------------------
/yolov9/include/postprocess.h:
--------------------------------------------------------------------------------
#pragma once

#include "types.h"
#include <cuda_runtime.h>
#include <opencv2/opencv.hpp>

cv::Rect get_rect(cv::Mat& img, float bbox[4]);

void nms(std::vector<Detection>& res, float* output, float conf_thresh, float nms_thresh = 0.5);

void batch_nms(std::vector<std::vector<Detection>>& batch_res, float* output, int batch_size, int output_size,
               float conf_thresh, float nms_thresh = 0.5);

void draw_bbox(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch);

std::vector<cv::Mat> process_mask(const float* proto, int proto_size, std::vector<Detection>& dets);

void draw_mask_bbox(cv::Mat& img, std::vector<Detection>& dets, std::vector<cv::Mat>& masks,
                    std::unordered_map<int, std::string>& labels_map);

// cuda NMS
void cuda_decode(float* predict, int num_bboxes, float confidence_threshold, float* parray, int max_objects,
                 cudaStream_t stream);
void cuda_nms(float* parray, float nms_threshold, int max_objects, cudaStream_t stream);
void batch_process(std::vector<std::vector<Detection>>& res_batch, const float* decode_ptr_host, int batch_size,
                   int bbox_element, const std::vector<cv::Mat>& img_batch);
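The header above exposes both a CPU-side decode path (nms / batch_nms / draw_bbox) and a GPU path (cuda_decode / cuda_nms / batch_process). The following is a minimal sketch of the CPU-side path only, not the project's exact code: it assumes `host_output` holds the raw network output copied back to the host with `output_size` floats per image, and that kConfThresh / kNmsThresh are defined in this model's config.h as they are for yolov8.

// Hedged sketch: CPU-side confidence filtering, NMS, and box drawing using the
// declarations above.
#include <vector>
#include "config.h"
#include "postprocess.h"

void decode_on_cpu(std::vector<cv::Mat>& img_batch, float* host_output, int output_size) {
    std::vector<std::vector<Detection>> res_batch;
    // Confidence filtering plus class-wise NMS for every image in the batch.
    batch_nms(res_batch, host_output, static_cast<int>(img_batch.size()), output_size, kConfThresh, kNmsThresh);
    // get_rect() (used inside draw_bbox) maps each (cx, cy, w, h) box back to
    // original image coordinates before drawing.
    draw_bbox(img_batch, res_batch);
}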
--------------------------------------------------------------------------------
/yolov9/include/preprocess.h:
--------------------------------------------------------------------------------
#pragma once

#include <cuda_runtime.h>
#include <cstdint>
#include <opencv2/opencv.hpp>

void cuda_preprocess_init(int max_image_size);
void cuda_preprocess_destroy();
void cuda_preprocess(uint8_t* src, int src_width, int src_height,
                     float* dst, int dst_width, int dst_height,
                     cudaStream_t stream);
void cuda_batch_preprocess(std::vector<cv::Mat>& img_batch,
                           float* dst, int dst_width, int dst_height,
                           cudaStream_t stream);

--------------------------------------------------------------------------------
/yolov9/include/types.h:
--------------------------------------------------------------------------------
#pragma once

#include "config.h"

struct YoloKernel {
    int width;
    int height;
    float anchors[kNumAnchor * 2];
};

struct alignas(float) Detection {
    float bbox[4];  // center_x center_y w h
    float conf;  // bbox_conf * cls_conf
    float class_id;
    float mask[32];
};

const int bbox_element = 7;  // center_x, center_y, w, h, conf, cls, obj

--------------------------------------------------------------------------------