├── .clang-format
├── .cmake-format.yaml
├── .github
    ├── ISSUE_TEMPLATE
    │   └── tensorrtx-issue-template.md
    ├── stale.yml
    └── workflows
    │   └── pre-commit.yml
├── .gitignore
├── .pre-commit-config.yaml
├── LICENSE
├── README.md
├── alexnet
    ├── CMakeLists.txt
    ├── README.md
    ├── alex.cpp
    ├── alexnet.py
    └── logging.h
├── arcface
    ├── CMakeLists.txt
    ├── README.md
    ├── arcface-mobilefacenet.cpp
    ├── arcface-r100.cpp
    ├── arcface-r50.cpp
    ├── gen_wts.py
    ├── logging.h
    ├── macros.h
    ├── prelu.cu
    └── prelu.h
├── centernet
    ├── README.md
    ├── centernet.py
    ├── dcnv2Plugin
    │   ├── CMakeLists.txt
    │   ├── dcn_v2_im2col_cuda.cu
    │   ├── dcn_v2_im2col_cuda.h
    │   ├── dcnv2Plugin.cpp
    │   └── dcnv2Plugin.h
    └── sample
    │   ├── common.py
    │   └── test.py
├── crnn
    ├── CMakeLists.txt
    ├── README.md
    ├── crnn.cpp
    ├── genwts.py
    └── logging.h
├── csrnet
    ├── CMakeLists.txt
    ├── README.md
    ├── config.h
    ├── csrnet.cpp
    ├── gen_wts.py
    ├── logging.h
    └── macros.h
├── dbnet
    ├── CMakeLists.txt
    ├── README.md
    ├── clipper
    │   ├── CMakeLists.txt
    │   ├── clipper.cpp
    │   └── clipper.hpp
    ├── common.hpp
    ├── dbnet.cpp
    ├── logging.h
    └── utils.h
├── densenet
    ├── CMakeLists.txt
    ├── README.md
    ├── densenet121.cpp
    ├── densenet121.py
    └── logging.h
├── detr
    ├── CMakeLists.txt
    ├── README.md
    ├── backbone.hpp
    ├── calibrator.hpp
    ├── common.hpp
    ├── detr.cpp
    ├── gen_wts.py
    ├── logging.h
    └── macros.h
├── docker
    ├── .env
    ├── README.md
    ├── tensorrtx-docker-compose.yml
    └── x86_64.dockerfile
├── efficient_ad
    ├── CMakeLists.txt
    ├── README.md
    ├── datas
    │   └── models
    │   │   └── gen_wts.py
    ├── efficientAD_det.cpp
    └── src
    │   ├── config.h
    │   ├── cuda_utils.h
    │   ├── logging.h
    │   ├── macros.h
    │   ├── model.cpp
    │   ├── model.h
    │   ├── postprocess.h
    │   └── utils.h
├── efficientnet
    ├── CMakeLists.txt
    ├── README.md
    ├── efficientnet.cpp
    ├── gen_wts.py
    ├── logging.h
    └── utils.hpp
├── ghostnet
    ├── README.md
    ├── ghostnetv1
    │   ├── CMakeLists.txt
    │   ├── gen_wts.py
    │   ├── ghostnetv1.cpp
    │   └── logging.h
    └── ghostnetv2
    │   ├── CMakeLists.txt
    │   ├── gen_wts.py
    │   ├── ghostnetv2.cpp
    │   └── logging.h
├── googlenet
    ├── CMakeLists.txt
    ├── README.md
    ├── googlenet.cpp
    └── logging.h
├── hrnet
    ├── hrnet-image-classification
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   ├── common.hpp
    │   ├── demo.py
    │   ├── hrnet.cpp
    │   └── logging.h
    └── hrnet-semantic-segmentation
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   ├── common.hpp
    │   ├── gen_wts.py
    │   ├── hrnet.cpp
    │   ├── hrnet_ocr.cpp
    │   ├── hrnet_trt.py
    │   └── logging.h
├── ibnnet
    ├── CMakeLists.txt
    ├── InferenceEngine.cpp
    ├── InferenceEngine.h
    ├── README.md
    ├── gen_wts.py
    ├── holder.h
    ├── ibnnet.cpp
    ├── ibnnet.h
    ├── layers.cpp
    ├── layers.h
    ├── logging.h
    ├── main.cpp
    ├── utils.cpp
    └── utils.h
├── inception
    ├── inceptionv3
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   ├── inception_v3.cpp
    │   └── logging.h
    └── inceptionv4
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   ├── inception_v4.cpp
    │   ├── inception_v4.h
    │   ├── layers_api.cpp
    │   ├── layers_api.h
    │   ├── logging.h
    │   ├── main.cpp
    │   ├── utils.cpp
    │   └── utils.h
├── lenet
    ├── CMakeLists.txt
    ├── README.md
    ├── lenet.cpp
    ├── lenet.py
    ├── lenet_tripy.py
    ├── logging.h
    └── macros.h
├── lprnet
    ├── 1.jpg
    ├── CMakeLists.txt
    ├── LPRnet.cpp
    ├── README.md
    ├── genwts.py
    └── logging.h
├── mlp
    ├── CMakeLists.txt
    ├── README.md
    ├── logging.h
    ├── mlp.cpp
    ├── mlp.py
    └── mlp.wts
├── mnasnet
    ├── CMakeLists.txt
    ├── README.md
    ├── logging.h
    └── mnasnet.cpp
├── mobilenet
    ├── mobilenetv2
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   ├── logging.h
    │   ├── mobilenet_v2.cpp
    │   └── mobilenet_v2.py
    └── mobilenetv3
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   ├── logging.h
    │   ├── mobilenet_v3.cpp
    │   └── mobilenet_v3.py
├── psenet
    ├── CMakeLists.txt
    ├── README.md
    ├── gen_tf_wts.py
    ├── layers.cpp
    ├── layers.h
    ├── main.cpp
    ├── psenet.cpp
    ├── psenet.h
    ├── test.jpg
    ├── utils.cpp
    └── utils.h
├── rcnn
    ├── BatchedNms.cu
    ├── BatchedNmsPlugin.h
    ├── CMakeLists.txt
    ├── MaskRcnnInference.cu
    ├── MaskRcnnInferencePlugin.h
    ├── PredictorDecode.cu
    ├── PredictorDecodePlugin.h
    ├── README.md
    ├── RoiAlign.cu
    ├── RoiAlignPlugin.h
    ├── RpnDecode.cu
    ├── RpnDecodePlugin.h
    ├── RpnNms.cu
    ├── RpnNmsPlugin.h
    ├── backbone.hpp
    ├── calibrator.hpp
    ├── common.hpp
    ├── cuda_utils.h
    ├── gen_wts.py
    ├── logging.h
    ├── macros.h
    └── rcnn.cpp
├── real-esrgan
    ├── general-x4v3
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   ├── cmake
    │   │   └── FindTensorRT.cmake
    │   ├── gen_wts.py
    │   ├── main.cpp
    │   └── src
    │   │   ├── include
    │   │       ├── config
    │   │       │   └── config.hpp
    │   │       ├── cuda_utils.h
    │   │       ├── logging
    │   │       │   └── logging.h
    │   │       ├── pixel_shuffle
    │   │       │   └── pixel_shuffle.hpp
    │   │       └── preprocess
    │   │       │   └── preprocess.hpp
    │   │   └── pixel_shuffle
    │   │       ├── pixel_shuffle.cpp
    │   │       └── pixel_shuffle.cu
    └── x4plus
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   ├── common.hpp
    │   ├── cuda_utils.h
    │   ├── gen_wts.py
    │   ├── logging.h
    │   ├── macros.h
    │   ├── postprocess.cu
    │   ├── postprocess.hpp
    │   ├── preprocess.cu
    │   ├── preprocess.hpp
    │   ├── real-esrgan.cpp
    │   └── utils.h
├── refinedet
    ├── CMakeLists.txt
    ├── README.md
    ├── calibrator.cpp
    ├── calibrator.h
    ├── configure.h
    ├── gen_wts_refinedet.py
    ├── logging.h
    ├── refinedet.cpp
    └── utils.h
├── repvgg
    ├── CMakeLists.txt
    ├── README.md
    ├── gen_wts.py
    ├── logging.h
    └── repvgg.cpp
├── resnet
    ├── CMakeLists.txt
    ├── README.md
    ├── logging.h
    ├── resnet18.cpp
    ├── resnet34.cpp
    ├── resnet50.cpp
    ├── resnet50.py
    ├── resnext50_32x4d.cpp
    ├── wide_resnet50.py
    └── wideresnet50.cpp
├── retinaface
    ├── CMakeLists.txt
    ├── README.md
    ├── calibrator.cpp
    ├── calibrator.h
    ├── common.hpp
    ├── decode.cu
    ├── decode.h
    ├── logging.h
    ├── macros.h
    ├── retina_mnet.cpp
    ├── retina_r50.cpp
    └── retinaface_trt.py
├── retinafaceAntiCov
    ├── CMakeLists.txt
    ├── README.md
    ├── decode.cu
    ├── decode.h
    ├── gen_wts.py
    ├── logging.h
    ├── macros.h
    └── retinafaceAntiCov.cpp
├── scaled-yolov4
    ├── CMakeLists.txt
    ├── README.md
    ├── common.hpp
    ├── gen_wts.py
    ├── logging.h
    ├── mish.cu
    ├── mish.h
    ├── utils.h
    ├── yololayer.cu
    ├── yololayer.h
    └── yolov4_csp.cpp
├── senet
    ├── CMakeLists.txt
    ├── README.md
    ├── logging.h
    └── se_resnet50.cpp
├── shufflenetv2
    ├── CMakeLists.txt
    ├── README.md
    ├── logging.h
    └── shufflenet_v2.cpp
├── squeezenet
    ├── CMakeLists.txt
    ├── README.md
    ├── logging.h
    └── squeezenet.cpp
├── superpoint
    ├── CMakeLists.txt
    ├── README.md
    ├── gen_wts.py
    ├── logging.h
    ├── supernet.cpp
    ├── utils.cpp
    └── utils.h
├── swin-transformer
    └── semantic-segmentation
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   ├── UpsampleKernel.cu
    │   ├── UpsamplePlugin.cpp
    │   ├── UpsamplePlugin.h
    │   ├── UpsmapleKernel.h
    │   ├── common.hpp
    │   ├── fillmask.cu
    │   ├── fillmask.h
    │   ├── gelu.cu
    │   ├── gelu.h
    │   ├── gen_wts.py
    │   ├── include
    │       └── dirent.h
    │   ├── layerNorm.cu
    │   ├── layerNorm.h
    │   ├── logging.h
    │   ├── main.cpp
    │   ├── myhpp.h
    │   ├── trainsform.cpp
    │   └── utilsn.h
├── tsm
    ├── CMakeLists.txt
    ├── README.md
    ├── demo.sh
    ├── gen_wts.py
    ├── logging.h
    ├── mmaction2_tsm_r50_config.py
    ├── test_shift.py
    ├── tsm_r50.cpp
    └── tsm_r50.py
├── tutorials
    ├── check_fp16_int8_support.md
    ├── contribution.md
    ├── faq.md
    ├── from_pytorch_to_trt_stepbystep_hrnet.md
    ├── getting_started.md
    ├── install.md
    ├── measure_performance.md
    ├── migrating_from_tensorrt_4_to_7.md
    ├── multi_GPU_processing.md
    └── run_on_windows.md
├── ufld
    ├── CMakeLists.txt
    ├── README.md
    ├── common.hpp
    ├── gen_wts.py
    ├── lane_det.cpp
    ├── logging.h
    ├── macros.h
    └── pth2onnx.py
├── unet
    ├── CMakeLists.txt
    ├── README.md
    ├── common.hpp
    ├── gen_wts.py
    ├── logging.h
    ├── macros.h
    └── unet.cpp
├── vgg
    ├── CMakeLists.txt
    ├── README.md
    ├── logging.h
    └── vgg11.cpp
├── yolo11
    ├── CMakeLists.txt
    ├── gen_wts.py
    ├── include
    │   ├── block.h
    │   ├── calibrator.h
    │   ├── config.h
    │   ├── cuda_utils.h
    │   ├── logging.h
    │   ├── macros.h
    │   ├── model.h
    │   ├── postprocess.h
    │   ├── preprocess.h
    │   ├── types.h
    │   └── utils.h
    ├── plugin
    │   ├── yololayer.cu
    │   └── yololayer.h
    ├── readme.md
    ├── src
    │   ├── block.cpp
    │   ├── calibrator.cpp
    │   ├── model.cpp
    │   ├── postprocess.cpp
    │   ├── postprocess.cu
    │   └── preprocess.cu
    ├── yolo11_cls.cpp
    ├── yolo11_cls_trt.py
    ├── yolo11_det.cpp
    ├── yolo11_det_trt.py
    ├── yolo11_obb.cpp
    ├── yolo11_obb_trt.py
    ├── yolo11_pose.cpp
    ├── yolo11_pose_trt.py
    ├── yolo11_seg.cpp
    └── yolo11_seg_trt.py
├── yolo11_tripy
    ├── .gitignore
    ├── README.md
    ├── classify.py
    ├── compile_classifier.py
    ├── constants.py
    ├── model
    │   ├── block.py
    │   └── model.py
    └── requirements.txt
├── yolop
    ├── CMakeLists.txt
    ├── README.md
    ├── common.hpp
    ├── cuda_utils.h
    ├── gen_wts.py
    ├── logging.h
    ├── macros.h
    ├── utils.h
    ├── yololayer.cu
    ├── yololayer.h
    ├── yolop.cpp
    ├── yolop.hpp
    └── yolop_trt.py
├── yolov10
    ├── CMakeLists.txt
    ├── README.md
    ├── gen_wts.py
    ├── include
    │   ├── block.h
    │   ├── calibrator.h
    │   ├── config.h
    │   ├── cuda_utils.h
    │   ├── logging.h
    │   ├── macros.h
    │   ├── model.h
    │   ├── postprocess.h
    │   ├── preprocess.h
    │   ├── types.h
    │   └── utils.h
    ├── plugin
    │   ├── yololayer.cu
    │   └── yololayer.h
    ├── src
    │   ├── block.cpp
    │   ├── calibrator.cpp
    │   ├── model.cpp
    │   ├── postprocess.cpp
    │   └── preprocess.cu
    ├── yolov10_det.cpp
    └── yolov10_det_trt.py
├── yolov12
    ├── CMakeLists.txt
    ├── gen_wts.py
    ├── include
    │   ├── block.h
    │   ├── config.h
    │   ├── cuda_utils.h
    │   ├── logging.h
    │   ├── macros.h
    │   ├── model.h
    │   ├── postprocess.h
    │   ├── preprocess.h
    │   ├── types.h
    │   └── utils.h
    ├── plugin
    │   ├── yololayer.cu
    │   └── yololayer.h
    ├── readme.md
    ├── src
    │   ├── block.cpp
    │   ├── model.cpp
    │   ├── postprocess.cpp
    │   ├── postprocess.cu
    │   └── preprocess.cu
    └── yolo12_det.cpp
├── yolov3-spp
    ├── CMakeLists.txt
    ├── README.md
    ├── Utils.h
    ├── gen_wts.py
    ├── logging.h
    ├── samples
    │   ├── bus.jpg
    │   └── zidane.jpg
    ├── yololayer.cu
    ├── yololayer.h
    └── yolov3-spp.cpp
├── yolov3-tiny
    ├── CMakeLists.txt
    ├── README.md
    ├── gen_wts.py
    ├── logging.h
    ├── macros.h
    ├── utils.h
    ├── yololayer.cu
    ├── yololayer.h
    └── yolov3-tiny.cpp
├── yolov3
    ├── CMakeLists.txt
    ├── README.md
    ├── calibrator.cpp
    ├── calibrator.h
    ├── gen_wts.py
    ├── logging.h
    ├── macros.h
    ├── utils.h
    ├── yololayer.cu
    ├── yololayer.h
    ├── yolov3.cpp
    └── yolov3_trt.py
├── yolov4
    ├── CMakeLists.txt
    ├── README.md
    ├── gen_wts.py
    ├── logging.h
    ├── mish.cu
    ├── mish.h
    ├── utils.h
    ├── yololayer.cu
    ├── yololayer.h
    └── yolov4.cpp
├── yolov5
    ├── CMakeLists.txt
    ├── README.md
    ├── gen_wts.py
    ├── images
    ├── plugin
    │   ├── yololayer.cu
    │   └── yololayer.h
    ├── src
    │   ├── calibrator.cpp
    │   ├── calibrator.h
    │   ├── config.h
    │   ├── cuda_utils.h
    │   ├── logging.h
    │   ├── macros.h
    │   ├── model.cpp
    │   ├── model.h
    │   ├── postprocess.cpp
    │   ├── postprocess.h
    │   ├── preprocess.cu
    │   ├── preprocess.h
    │   ├── types.h
    │   └── utils.h
    ├── yolov5_cls.cpp
    ├── yolov5_cls_trt.py
    ├── yolov5_det.cpp
    ├── yolov5_det_cuda_python.py
    ├── yolov5_det_trt.py
    ├── yolov5_seg.cpp
    └── yolov5_seg_trt.py
├── yolov7
    ├── CMakeLists.txt
    ├── README.md
    ├── gen_wts.py
    ├── images
    ├── include
    │   ├── block.h
    │   ├── calibrator.h
    │   ├── config.h
    │   ├── cuda_utils.h
    │   ├── logging.h
    │   ├── macros.h
    │   ├── model.h
    │   ├── postprocess.h
    │   ├── preprocess.h
    │   ├── types.h
    │   └── utils.h
    ├── main.cpp
    ├── plugin
    │   ├── yololayer.cu
    │   └── yololayer.h
    ├── src
    │   ├── block.cpp
    │   ├── calibrator.cpp
    │   ├── model.cpp
    │   ├── postprocess.cpp
    │   └── preprocess.cu
    └── yolov7_trt.py
├── yolov8
    ├── CMakeLists.txt
    ├── README.md
    ├── gen_wts.py
    ├── include
    │   ├── block.h
    │   ├── calibrator.h
    │   ├── config.h
    │   ├── cuda_utils.h
    │   ├── logging.h
    │   ├── macros.h
    │   ├── model.h
    │   ├── postprocess.h
    │   ├── preprocess.h
    │   ├── types.h
    │   └── utils.h
    ├── plugin
    │   ├── yololayer.cu
    │   └── yololayer.h
    ├── src
    │   ├── block.cpp
    │   ├── calibrator.cpp
    │   ├── model.cpp
    │   ├── postprocess.cpp
    │   ├── postprocess.cu
    │   └── preprocess.cu
    ├── yolov8_5u_det.cpp
    ├── yolov8_5u_det_trt.py
    ├── yolov8_cls.cpp
    ├── yolov8_cls_trt.py
    ├── yolov8_det.cpp
    ├── yolov8_det_trt.py
    ├── yolov8_obb.cpp
    ├── yolov8_obb_trt.py
    ├── yolov8_pose.cpp
    ├── yolov8_pose_trt.py
    ├── yolov8_seg.cpp
    └── yolov8_seg_trt.py
└── yolov9
    ├── CMakeLists.txt
    ├── README.md
    ├── demo.cpp
    ├── gen_wts.py
    ├── images
    ├── include
        ├── block.h
        ├── calibrator.h
        ├── config.h
        ├── cuda_utils.h
        ├── logging.h
        ├── macros.h
        ├── model.h
        ├── postprocess.h
        ├── preprocess.h
        ├── types.h
        └── utils.h
    ├── plugin
        ├── yololayer.cu
        └── yololayer.h
    ├── src
        ├── block.cpp
        ├── calibrator.cpp
        ├── model.cpp
        ├── postprocess.cpp
        ├── postprocess.cu
        └── preprocess.cu
    ├── windows
        └── dirent.h
    └── yolov9_trt.py


/.github/ISSUE_TEMPLATE/tensorrtx-issue-template.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: tensorrtx issue template
 3 | about: To understand your issue better
 4 | title: ''
 5 | labels: ''
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | ## Env
11 | 
12 | - GPU, e.g. V100, RTX2080, TX2, Xavier NX, Nano, etc.
13 | - OS, e.g. Ubuntu16.04, Win10, etc.
14 | - Cuda version
15 | - TensorRT version
16 | 
17 | ## About this repo
18 | 
19 | - which branch/tag/commit are you using?
20 | - which model? yolov5, retinaface?
21 | 
22 | ## Your problem
23 | 
24 | - what is your command? e.g. `sudo ./yolov5 -s`
25 | - what's your output?
26 | - what output do you expect?
27 | 


--------------------------------------------------------------------------------
/.github/stale.yml:
--------------------------------------------------------------------------------
 1 | # Number of days of inactivity before an issue becomes stale
 2 | daysUntilStale: 60
 3 | # Number of days of inactivity before a stale issue is closed
 4 | daysUntilClose: 7
 5 | # Issues with these labels will never be considered stale
 6 | exemptLabels:
 7 |   - pinned
 8 |   - security
 9 | # Label to use when marking an issue as stale
10 | staleLabel: wontfix
11 | # Comment to post when marking an issue as stale. Set to `false` to disable
12 | markComment: >
13 |   This issue has been automatically marked as stale because it has not had
14 |   recent activity. It will be closed if no further activity occurs. Thank you
15 |   for your contributions.
16 | # Comment to post when closing a stale issue. Set to `false` to disable
17 | closeComment: false
18 | 


--------------------------------------------------------------------------------
/.github/workflows/pre-commit.yml:
--------------------------------------------------------------------------------
 1 | name: pre-commit
 2 | 
 3 | on:
 4 |   pull_request:
 5 |     branches:
 6 |       - master
 7 |       - trt10
 8 | 
 9 | jobs:
10 |   pre-commit:
11 |     runs-on: ubuntu-latest
12 |     steps:
13 |     - uses: actions/checkout@v3
14 |       with:
15 |         # grab the history of the PR
16 |         fetch-depth: 0
17 |     - uses: actions/setup-python@v3
18 |     - uses: pre-commit/action@v3.0.1
19 |       with:
20 |         extra_args: --from-ref ${{ github.event.pull_request.base.sha }} --to-ref ${{ github.event.pull_request.head.sha }}
21 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | build
 2 | *.wts
 3 | *.engine
 4 | *.tpymodel
 5 | */*.ppm
 6 | *idea*
 7 | 
 8 | .vscode/*
 9 | !.vscode/settings.json
10 | !.vscode/tasks.json
11 | !.vscode/launch.json
12 | !.vscode/extensions.json
13 | !.vscode/*.code-snippets
14 | 
15 | # Local History for Visual Studio Code
16 | .history/
17 | 
18 | # Built Visual Studio Code Extensions
19 | *.vsix
20 | 
21 | .vscode/*
22 | !.vscode/settings.json
23 | !.vscode/tasks.json
24 | !.vscode/launch.json
25 | !.vscode/extensions.json
26 | !.vscode/*.code-snippets
27 | 
28 | # Local History for Visual Studio Code
29 | .history/
30 | 
31 | # Built Visual Studio Code Extensions
32 | *.vsix
33 | 
34 | # Prerequisites
35 | *.d
36 | 
37 | # Compiled Object files
38 | *.slo
39 | *.lo
40 | *.o
41 | *.obj
42 | 
43 | # Precompiled Headers
44 | *.gch
45 | *.pch
46 | 
47 | # Compiled Dynamic libraries
48 | *.so
49 | *.dylib
50 | *.dll
51 | 
52 | # Fortran module files
53 | *.mod
54 | *.smod
55 | 
56 | # Compiled Static libraries
57 | *.lai
58 | *.la
59 | *.a
60 | *.lib
61 | 
62 | # Executables
63 | *.exe
64 | *.out
65 | *.app
66 | 
67 | CMakeLists.txt.user
68 | CMakeCache.txt
69 | CMakeFiles
70 | CMakeScripts
71 | Testing
72 | Makefile
73 | cmake_install.cmake
74 | install_manifest.txt
75 | compile_commands.json
76 | CTestTestfile.cmake
77 | _deps
78 | CMakeUserPresets.json
79 | 


--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | repos:
 2 | -   repo: https://github.com/pre-commit/pre-commit-hooks
 3 |     rev: v4.5.0
 4 |     hooks:
 5 |     -   id: check-merge-conflict
 6 |     -   id: check-symlinks
 7 |     -   id: end-of-file-fixer
 8 |     -   id: trailing-whitespace
 9 |     -   id: check-added-large-files
10 | -   repo: https://github.com/pre-commit/mirrors-clang-format
11 |     rev: v14.0.6
12 |     hooks:
13 |     -   id: clang-format
14 |         types_or: [c++, c, cuda]
15 | -   repo: https://github.com/PyCQA/flake8
16 |     rev: 7.0.0
17 |     hooks:
18 |     -   id: flake8
19 |         args: [--max-line-length=120]
20 | - repo: https://github.com/cheshirekow/cmake-format-precommit
21 |   rev: v0.6.13
22 |   hooks:
23 |   - id: cmake-format
24 |     additional_dependencies: [pyyaml]
25 |     args: [--in-place, -c, .cmake-format.yaml]
26 |     types: [file]
27 |     files: (\.cmake|CMakeLists.txt)(.in)?$
28 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019-2020 Wang Xinyu
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/alexnet/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 2.6)
 2 | 
 3 | project(alexnet)
 4 | 
 5 | add_definitions(-std=c++11)
 6 | 
 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
 8 | set(CMAKE_CXX_STANDARD 11)
 9 | set(CMAKE_BUILD_TYPE Debug)
10 | 
11 | include_directories(${PROJECT_SOURCE_DIR}/include)
12 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different
13 | # cuda
14 | include_directories(/usr/local/cuda/include)
15 | link_directories(/usr/local/cuda/lib64)
16 | # tensorrt
17 | include_directories(/usr/include/x86_64-linux-gnu/)
18 | link_directories(/usr/lib/x86_64-linux-gnu/)
19 | 
20 | add_executable(alexnet ${PROJECT_SOURCE_DIR}/alex.cpp)
21 | target_link_libraries(alexnet nvinfer)
22 | target_link_libraries(alexnet cudart)
23 | 
24 | add_definitions(-O2 -pthread)
25 | 
26 | 


--------------------------------------------------------------------------------
/alexnet/README.md:
--------------------------------------------------------------------------------
 1 | # alexnet
 2 | 
 3 | AlexNet model architecture from the "One weird trick..." <https://arxiv.org/abs/1404.5997>`_ paper.
 4 | 
 5 | For the details, you can refer to [pytorchx/alexnet](https://github.com/wang-xinyu/pytorchx/tree/master/alexnet)
 6 | 
 7 | This alexnet is just several `conv-relu-pool` blocks followed by several `fc-relu`, nothing special. All layers can be implemented by tensorrt api, including `addConvolution`, `addActivation`, `addPooling`, `addFullyConnected`.
 8 | 
 9 | ```
10 | // 1. generate alexnet.wts from [pytorchx/alexnet](https://github.com/wang-xinyu/pytorchx/tree/master/alexnet)
11 | 
12 | // 2. put alexnet.wts into tensorrtx/alexnet
13 | 
14 | // 3. build and run
15 | 
16 | cd tensorrtx/alexnet
17 | 
18 | mkdir build
19 | 
20 | cd build
21 | 
22 | cmake ..
23 | 
24 | make
25 | 
26 | sudo ./alexnet -s   // serialize model to plan file i.e. 'alexnet.engine'
27 | 
28 | sudo ./alexnet -d   // deserialize plan file and run inference
29 | 
30 | // 4. see if the output is same as pytorchx/alexnet
31 | ```
32 | 
33 | 
34 | 


--------------------------------------------------------------------------------
/arcface/gen_wts.py:
--------------------------------------------------------------------------------
 1 | import struct
 2 | import sys
 3 | import argparse
 4 | import face_model
 5 | import cv2
 6 | import numpy as np
 7 | 
 8 | parser = argparse.ArgumentParser(description='face model test')
 9 | # general
10 | parser.add_argument('--image-size', default='112,112', help='')
11 | parser.add_argument('--model', default='model-r100-ii/model,0', help='path to load model.')
12 | parser.add_argument('--ga-model', default='', help='path to load model.')
13 | parser.add_argument('--gpu', default=0, type=int, help='gpu id')
14 | parser.add_argument('--det', default=0, type=int, help='mtcnn option, 1 means using R+O, 0 means detect from begining')
15 | parser.add_argument('--flip', default=0, type=int, help='whether do lr flip aug')
16 | parser.add_argument('--threshold', default=1.24, type=float, help='ver dist threshold')
17 | args = parser.parse_args()
18 | 
19 | model = face_model.FaceModel(args)
20 | 
21 | f = open('arcface-r100.wts', 'w')
22 | f.write('{}\n'.format(len(model.model.get_params()[0].keys()) + len(model.model.get_params()[1].keys())))
23 | for k, v in model.model.get_params()[0].items():
24 |     vr = v.reshape(-1).asnumpy()
25 |     f.write('{} {} '.format(k, len(vr)))
26 |     for vv in vr:
27 |         f.write(' ')
28 |         f.write(struct.pack('>f',float(vv)).hex())
29 |     f.write('\n')
30 | for k, v in model.model.get_params()[1].items():
31 |     vr = v.reshape(-1).asnumpy()
32 |     f.write('{} {} '.format(k, len(vr)))
33 |     for vv in vr:
34 |         f.write(' ')
35 |         f.write(struct.pack('>f',float(vv)).hex())
36 |     f.write('\n')
37 | 
38 | 


--------------------------------------------------------------------------------
/arcface/macros.h:
--------------------------------------------------------------------------------
 1 | #ifndef __MACROS_H
 2 | #define __MACROS_H
 3 | 
 4 | #if NV_TENSORRT_MAJOR >= 8
 5 | #define TRT_NOEXCEPT noexcept
 6 | #define TRT_CONST_ENQUEUE const
 7 | #else
 8 | #define TRT_NOEXCEPT
 9 | #define TRT_CONST_ENQUEUE
10 | #endif
11 | 
12 | #endif  // __MACROS_H


--------------------------------------------------------------------------------
/centernet/README.md:
--------------------------------------------------------------------------------
 1 | # CenterNet
 2 | 
 3 | This is the trt implementation of detection model [ctdet_coco_dla_2x](https://drive.google.com/open?id=1pl_-ael8wERdUREEnaIfqOV_VF2bEVRT) from [xingyizhou/CenterNet](https://github.com/xingyizhou/CenterNet) official work. 
 4 | 
 5 | ## How to Run
 6 | 
 7 | 1. Follow [NVIDIA/TensorRT](https://github.com/NVIDIA/TensorRT) tutorial to build TensorRT7
 8 | 
 9 | 2. Copy folder `dcnv2Plugin` to `TensorRT/plugin` and edit `InferPlugin.cpp` and `CMakeLists.txt`
10 | 
11 | 3. Rebuild to install custom plugin
12 | 
13 | 4. Use `tensorrt-7.2.3.4-cp36-none-linux_x86_64.whl` in TensorRT OSS to update your python-tensorrt
14 | 
15 | 5. Run `python centernet.py -m ${PTH_PATH} -s` to create trt engine 
16 | 
17 | ## Sample
18 | 
19 | ```
20 | // Download ctdet_coco_dla_2x.pth and transfer it into trt engine first
21 | // Download the test img from https://raw.githubusercontent.com/tensorflow/models/master/research/deeplab/g3doc/img/image2.jpg or choose your own one
22 | cd sample
23 | python test.py ${ENGINE_PATH} ${IMG_PATH}
24 | ```
25 | ![trt_out](https://user-images.githubusercontent.com/47047345/119128637-7a878900-ba68-11eb-91ff-5dcc10f01b77.jpg)
26 | 
27 | ## TODO
28 | 
29 | Integrate the post process with trt engine to make it more easier to use.


--------------------------------------------------------------------------------
/centernet/dcnv2Plugin/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | file(GLOB SRCS *.cpp)
17 | set(PLUGIN_SOURCES ${PLUGIN_SOURCES} ${SRCS})
18 | set(PLUGIN_SOURCES ${PLUGIN_SOURCES} PARENT_SCOPE)
19 | file(GLOB CU_SRCS *.cu)
20 | set(PLUGIN_CU_SOURCES ${PLUGIN_CU_SOURCES} ${CU_SRCS})
21 | set(PLUGIN_CU_SOURCES ${PLUGIN_CU_SOURCES} PARENT_SCOPE)


--------------------------------------------------------------------------------
/crnn/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 2.6)
 2 | 
 3 | project(crnn)
 4 | 
 5 | add_definitions(-std=c++11)
 6 | 
 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
 8 | set(CMAKE_CXX_STANDARD 11)
 9 | set(CMAKE_BUILD_TYPE Debug)
10 | 
11 | find_package(CUDA REQUIRED)
12 | 
13 | include_directories(${PROJECT_SOURCE_DIR}/include)
14 | if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
15 |     message("embed_platform on")
16 |     include_directories(/usr/local/cuda/targets/aarch64-linux/include)
17 |     link_directories(/usr/local/cuda/targets/aarch64-linux/lib)
18 | else()
19 |     message("embed_platform off")
20 |     include_directories(/usr/local/cuda/include)
21 |     link_directories(/usr/local/cuda/lib64)
22 | endif()
23 | 
24 | find_package(OpenCV)
25 | include_directories(${OpenCV_INCLUDE_DIRS})
26 | 
27 | add_executable(crnn ${PROJECT_SOURCE_DIR}/crnn.cpp)
28 | target_link_libraries(crnn nvinfer)
29 | target_link_libraries(crnn cudart)
30 | target_link_libraries(crnn ${OpenCV_LIBS})
31 | 
32 | add_definitions(-O2 -pthread)
33 | 
34 | 


--------------------------------------------------------------------------------
/crnn/README.md:
--------------------------------------------------------------------------------
 1 | # crnn
 2 | 
 3 | The Pytorch implementation is [meijieru/crnn.pytorch](https://github.com/meijieru/crnn.pytorch).
 4 | 
 5 | ## How to Run
 6 | 
 7 | ```
 8 | 1. generate crnn.wts from pytorch
 9 | 
10 | git clone https://github.com/wang-xinyu/tensorrtx.git
11 | git clone https://github.com/meijieru/crnn.pytorch.git
12 | // download its weights 'crnn.pth'
13 | // copy tensorrtx/crnn/genwts.py into crnn.pytorch/
14 | // go to crnn.pytorch/
15 | python genwts.py
16 | // a file 'crnn.wts' will be generated.
17 | 
18 | 2. build tensorrtx/crnn and run
19 | 
20 | // put crnn.wts into tensorrtx/crnn
21 | // go to tensorrtx/crnn
22 | mkdir build
23 | cd build
24 | cmake ..
25 | make
26 | sudo ./crnn -s  // serialize model to plan file i.e. 'crnn.engine'
27 | // copy crnn.pytorch/data/demo.png here
28 | sudo ./crnn -d  // deserialize plan file and run inference
29 | 
30 | 3. check the output as follows:
31 | 
32 | raw: a-----v--a-i-l-a-bb-l-e---
33 | sim: available
34 | 
35 | ```
36 | 
37 | ## More Information
38 | 
39 | See the readme in [home page.](https://github.com/wang-xinyu/tensorrtx)
40 | 
41 | ## Acknowledgment
42 | 
43 | Thanks for the donation for this crnn tensorrt implementation from @雍.
44 | 
45 | 


--------------------------------------------------------------------------------
/crnn/genwts.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from torch.autograd import Variable
 3 | import utils
 4 | import models.crnn as crnn
 5 | import struct
 6 | 
 7 | model_path = './data/crnn.pth'
 8 | 
 9 | model = crnn.CRNN(32, 1, 37, 256)
10 | if torch.cuda.is_available():
11 |     model = model.cuda()
12 | print('loading pretrained model from %s' % model_path)
13 | model.load_state_dict(torch.load(model_path))
14 | 
15 | image = torch.ones(1, 1, 32, 100)
16 | if torch.cuda.is_available():
17 |     image = image.cuda()
18 | 
19 | model.eval()
20 | print(model)
21 | print('image shape ', image.shape)
22 | preds = model(image)
23 | 
24 | f = open("crnn.wts", 'w')
25 | f.write("{}\n".format(len(model.state_dict().keys())))
26 | for k,v in model.state_dict().items():
27 |     print('key: ', k)
28 |     print('value: ', v.shape)
29 |     vr = v.reshape(-1).cpu().numpy()
30 |     f.write("{} {}".format(k, len(vr)))
31 |     for vv in vr:
32 |         f.write(" ")
33 |         f.write(struct.pack(">f", float(vv)).hex())
34 |     f.write("\n")
35 | 
36 | 


--------------------------------------------------------------------------------
/csrnet/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.10)
 2 | 
 3 | project(csrnet)
 4 | 
 5 | add_definitions(-std=c++11)
 6 | add_definitions(-DAPI_EXPORTS)
 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
 8 | set(CMAKE_CXX_STANDARD 11)
 9 | set(CMAKE_BUILD_TYPE Debug)
10 | 
11 | # cuda
12 | include_directories(/usr/local/cuda/targets/x86_64-linux/include )
13 | link_directories(/usr/local/cuda/targets/x86_64-linux/lib)
14 | 
15 | # tensorrt
16 | include_directories(/usr/include/x86_64-linux-gnu/)
17 | link_directories(/usr/lib/x86_64-linux-gnu/)
18 | 
19 | # opencv
20 | find_package(OpenCV)
21 | include_directories(${OpenCV_INCLUDE_DIRS})
22 | 
23 | include_directories(${PROJECT_SOURCE_DIR}/)
24 | 
25 | add_executable(csrnet csrnet.cpp)
26 | target_link_libraries(csrnet nvinfer cudart ${OpenCV_LIBS})


--------------------------------------------------------------------------------
/csrnet/README.md:
--------------------------------------------------------------------------------
 1 | # csrnet
 2 | 
 3 | The Pytorch implementation is [leeyeehoo/CSRNet-pytorch](https://github.com/leeyeehoo/CSRNet-pytorch).
 4 | 
 5 | This repo is a TensorRT implementation of CSRNet.
 6 | 
 7 | paper : [CSRNet: Dilated Convolutional Neural Networks for Understanding the Highly Congested Scenes](https://arxiv.org/abs/1802.10062)
 8 | 
 9 | Dev environment:
10 | - Ubuntu 22.04
11 | - TensorRT 8.6
12 | - OpenCV 4.5.4
13 | - CMake 3.24
14 | - GPU Driver 535.113.01
15 | - CUDA 12.2
16 | - RTX3080
17 | 
18 | 
19 | # how to run
20 | 
21 | ```bash
22 | 1. generate csrnet engine
23 | git clone https://github.com/leeyeehoo/CSRNet-pytorch.git
24 | git clone https://github.com/wang-xinyu/tensorrtx.git
25 | // copy gen_wts.py to CSRNet-pytorch
26 | // generate wts file
27 | python gen_wts.py
28 | // csrnet wts will be generated in CSRNet-pytorch
29 | 
30 | 2. build csrnet.engine
31 | // mv CSRNet-pytorch/csrnet.engine to tensorrtx/csrnet
32 | mv CSRNet-pytorch/csrnet.wts tensorrtx/csrnet
33 | // build
34 | mkdir build
35 | cmake ..
36 | make
37 | sudo ./csrnet -s  ./csrnet.wts
38 | 
39 | Loading weights: ./csrnet.wts
40 | build engine successfully : ./csrnet.engine
41 | 
42 | // download images https://github.com/wang-xinyu/tensorrtx/assets/46584679/46bc4def-e573-44ae-996d-5d68927c78ff and copy to images
43 | sudo ./csrnet -d  ./images
44 | 
45 | // output e.g
46 | // enqueueV2 time: 0.0323869s
47 | // detect time:44ms
48 | // people num :22.9101 write_path: ../images/data.jpg
49 | ```
50 | 
51 | 
52 | # result 
53 | 
54 | inference people num: 22.9101
55 | 
56 | <p align="center">
57 | <img src= https://raw.githubusercontent.com/wang-xinyu/tensorrtx/dbf857d25f77bf64113fc99a745ccf4973bdd44e/Density_Plot.jpg>
58 | </p>
59 | 


--------------------------------------------------------------------------------
/csrnet/config.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | const static char *kInputTensorName = "data";
 4 | const static char *kOutputTensorName = "prob";
 5 | const static char *kEngineFile = "./csrnet.engine";
 6 | 
 7 | const static int kBatchSize = 1;
 8 | 
 9 | const static int MAX_INPUT_SIZE = 1440; // 32x
10 | const static int MIN_INPUT_SIZE = 608;
11 | const static int OPT_INPUT_W = 1152;
12 | const static int OPT_INPUT_H = 640;
13 | 
14 | constexpr static int kMaxInputImageSize = MAX_INPUT_SIZE * MAX_INPUT_SIZE * 3;
15 | constexpr static int kMaxOutputProbSize =
16 |     (MAX_INPUT_SIZE * MAX_INPUT_SIZE) >> 6;


--------------------------------------------------------------------------------
/csrnet/gen_wts.py:
--------------------------------------------------------------------------------
 1 | from torch.nn.modules import module
 2 | from model import CSRNet
 3 | import torch
 4 | import os
 5 | import struct
 6 | 
 7 | 
 8 | save_path = os.path.join(os.path.dirname(
 9 |     __file__), "output", os.path.basename(__file__).split('.')[0])
10 | os.makedirs(save_path, exist_ok=True)
11 | wts_file = os.path.join(save_path, "csrnet.wts")
12 | 
13 | 
14 | # load model
15 | model_path = "partBmodel_best.pth.tar"
16 | model = CSRNet()
17 | checkpoint = torch.load(model_path)
18 | model.load_state_dict(checkpoint['state_dict'])
19 | 
20 | 
21 | # save to wts
22 | print(f'Writing into {wts_file}')
23 | with open(wts_file, 'w') as f:
24 |     f.write('{}\n'.format(len(model.state_dict().keys())))
25 |     for k, v in model.state_dict().items():
26 |         vr = v.reshape(-1).cpu().numpy()
27 |         f.write('{} {} '.format(k, len(vr)))
28 |         for vv in vr:
29 |             f.write(' ')
30 |             f.write(struct.pack('>f', float(vv)).hex())
31 |         f.write('\n')


--------------------------------------------------------------------------------
/csrnet/macros.h:
--------------------------------------------------------------------------------
 1 | #ifndef __MACROS_H
 2 | #define __MACROS_H
 3 | 
 4 | #if NV_TENSORRT_MAJOR >= 8
 5 | #define TRT_NOEXCEPT noexcept
 6 | #define TRT_CONST_ENQUEUE const
 7 | #else
 8 | #define TRT_NOEXCEPT
 9 | #define TRT_CONST_ENQUEUE
10 | #endif
11 | 
12 | #endif  // __MACROS_H
13 | 


--------------------------------------------------------------------------------
/dbnet/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 2.6)
 2 | 
 3 | project(dbnet)
 4 | 
 5 | add_definitions(-std=c++11)
 6 | 
 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
 8 | set(CMAKE_CXX_STANDARD 11)
 9 | set(CMAKE_BUILD_TYPE Debug)
10 | 
11 | find_package(CUDA REQUIRED)
12 | 
13 | include_directories(${PROJECT_SOURCE_DIR}/include)
14 | 
15 | # cuda
16 | include_directories(/usr/local/cuda/include)
17 | link_directories(/usr/local/cuda/lib64)
18 | # tensorrt
19 | include_directories(/usr/include/x86_64-linux-gnu/)
20 | link_directories(/usr/lib/x86_64-linux-gnu/)
21 | 
22 | find_package(OpenCV)
23 | include_directories(${OpenCV_INCLUDE_DIRS})
24 | 
25 | aux_source_directory(. DIRSRCS)
26 | 
27 | # clipper
28 | include_directories(./ ./clipper)
29 | add_subdirectory(clipper)
30 | 
31 | add_executable(dbnet ${DIRSRCS})
32 | target_link_libraries(dbnet clipper)
33 | target_link_libraries(dbnet nvinfer)
34 | target_link_libraries(dbnet cudart)
35 | target_link_libraries(dbnet ${OpenCV_LIBS})
36 | 
37 | add_definitions(-O2 -pthread)
38 | 
39 | 


--------------------------------------------------------------------------------
/dbnet/clipper/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | cmake_minimum_required(VERSION 2.6)
2 | 
3 | aux_source_directory(. DIR_CLIPPER_SRCS)
4 | add_library(clipper ${DIR_CLIPPER_SRCS})


--------------------------------------------------------------------------------
/densenet/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.10)
 2 | 
 3 | # set the project name
 4 | project(densenet)
 5 | 
 6 | add_definitions(-std=c++11)
 7 | 
 8 | # get main project dir to include common files
 9 | get_filename_component(MAIN_DIR ../ ABSOLUTE)
10 | 
11 | # When enabled the static version of the 
12 | # CUDA runtime library will be used in CUDA_LIBRARIES
13 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
14 | 
15 | # specify the C++ standard
16 | set(CMAKE_CXX_STANDARD 11)
17 | set(CMAKE_CXX_STANDARD_REQUIRED True)
18 | set(CMAKE_BUILD_TYPE Debug)
19 | 
20 | # include
21 | 
22 | # include and link cuda
23 | include_directories(/usr/local/cuda/include)
24 | link_directories(/usr/local/cuda/lib64)
25 | 
26 | # include and link tensorrt
27 | include_directories(/usr/include/x86_64-linux-gnu)
28 | link_directories(/usr/lib/x86_64-linux-gnu)
29 | 
30 | # add the executable
31 | add_executable(densenet ${PROJECT_SOURCE_DIR}/densenet121.cpp)
32 | 
33 | target_link_libraries(densenet nvinfer)
34 | target_link_libraries(densenet cudart)
35 | 
36 | add_definitions(-O2 -pthread)


--------------------------------------------------------------------------------
/densenet/README.md:
--------------------------------------------------------------------------------
 1 | # Densenet121
 2 | 
 3 | The Pytorch implementation is [makaveli10/densenet](https://github.com/makaveli10/torchtrtz/tree/main/densenet). Model from torchvision.
 4 | The tensorrt implemenation is taken from [makaveli10/cpptensorrtz](https://github.com/makaveli10/cpptensorrtz/).
 5 | 
 6 | ## How to Run
 7 | 
 8 | 1. generate densenet121.wts from pytorch
 9 | 
10 | ```
11 | git clone https://github.com/wang-xinyu/tensorrtx.git
12 | git clone https://github.com/makaveli10/torchtrtz.git
13 | 
14 | // go to torchtrtz/densenet
15 | // Enter these two commands to create densenet121.wts
16 | python models.py
17 | python gen_trtwts.py
18 | ```
19 | 
20 | 2. build densenet and run
21 | 
22 | ```
23 | // put densenet121.wts into tensorrtx/densenet
24 | // go to tensorrtx/densenet
25 | mkdir build
26 | cd build
27 | cmake ..
28 | make
29 | sudo ./densenet -s  // serialize model to file i.e. 'densenet.engine'
30 | sudo ./densenet -d  // deserialize model and run inference
31 | ```
32 | 
33 | 3. Verify output from [torch impl](https://github.com/makaveli10/torchtrtz/blob/main/densenet/README.md)
34 | 
35 | TensorRT output[:5]:
36 | ```
37 |     [-0.587389, -0.329202, -1.83404, -1.89935, -0.928404]
38 | ```
39 | 
40 | 


--------------------------------------------------------------------------------
/detr/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 2.6)
 2 | 
 3 | project(detr)
 4 | 
 5 | add_definitions(-std=c++11)
 6 | 
 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
 8 | set(CMAKE_CXX_STANDARD 11)
 9 | set(CMAKE_BUILD_TYPE Debug)
10 | 
11 | find_package(CUDA REQUIRED)
12 | 
13 | include_directories(${PROJECT_SOURCE_DIR}/include)
14 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different
15 | # cuda
16 | include_directories(/usr/local/cuda/include)
17 | link_directories(/usr/local/cuda/lib64)
18 | # tensorrt
19 | include_directories(/data/app/TensorRT-8.4.3.1/include)
20 | link_directories(/data/app/TensorRT-8.4.3.1/lib)
21 | 
22 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")
23 | 
24 | find_package(OpenCV)
25 | include_directories(${OpenCV_INCLUDE_DIRS})
26 | 
27 | add_executable(detr ${PROJECT_SOURCE_DIR}/detr.cpp)
28 | target_link_libraries(detr nvinfer)
29 | target_link_libraries(detr cudart)
30 | target_link_libraries(detr ${OpenCV_LIBS})
31 | 
32 | add_definitions(-O2 -pthread)
33 | 
34 | 


--------------------------------------------------------------------------------
/detr/macros.h:
--------------------------------------------------------------------------------
 1 | #ifndef __MACROS_H
 2 | #define __MACROS_H
 3 | 
 4 | #include "NvInfer.h"
 5 | 
 6 | #ifdef API_EXPORTS
 7 | #if defined(_MSC_VER)
 8 | #define API __declspec(dllexport)
 9 | #else
10 | #define API __attribute__((visibility("default")))
11 | #endif
12 | #else
13 | 
14 | #if defined(_MSC_VER)
15 | #define API __declspec(dllimport)a
16 | #else
17 | #define API
18 | #endif
19 | #endif  // API_EXPORTS
20 | 
21 | #if NV_TENSORRT_MAJOR >= 8
22 | #define TRT_NOEXCEPT noexcept
23 | #define TRT_CONST_ENQUEUE const
24 | #else
25 | #define TRT_NOEXCEPT
26 | #define TRT_CONST_ENQUEUE
27 | #endif
28 | 
29 | #endif  // __MACROS_H
30 | 


--------------------------------------------------------------------------------
/docker/.env:
--------------------------------------------------------------------------------
1 | COMPOSE_PROJECT_NAME=tensorrtx
2 | HOME=$HOME
3 | EUID=$(id -u)
4 | 
5 | ## (optional) a local mount point path
6 | DATA_DIR=""
7 | 


--------------------------------------------------------------------------------
/docker/tensorrtx-docker-compose.yml:
--------------------------------------------------------------------------------
 1 | services:
 2 |   tensorrt:
 3 |     image: tensortx:1.0.0
 4 |     container_name: tensortx
 5 |     environment:
 6 |       - NVIDIA_VISIBLE_DEVICES=all
 7 |     build:
 8 |       dockerfile: x86_64.dockerfile
 9 |     cap_add:
10 |       - CAP_SYS_ADMIN
11 |     security_opt:
12 |       - seccomp:unconfined
13 |     privileged: true
14 |     stdin_open: true
15 |     tty: true
16 |     shm_size: '8gb'
17 |     ulimits:
18 |       memlock:
19 |         soft: -1
20 |         hard: -1
21 |     devices:
22 |       - /dev:/dev:rw
23 |     volumes:
24 |       #### user ####
25 |       - ${HOME}:/workspace/localhome:rw
26 |       #### custom ####
27 |       - mount:/mnt:rw
28 |     deploy:
29 |       restart_policy:
30 |         condition: on-failure
31 |         max_attempts: 1
32 |         delay: 5s
33 |       resources:
34 |         reservations:
35 |           devices:
36 |             - driver: nvidia
37 |               capabilities: [gpu]
38 |               count: all
39 | 
40 | volumes:
41 |   mount:
42 |     driver: local
43 |     driver_opts:
44 |       type: none
45 |       o: bind
46 |       device: ${DATA_DIR}
47 | 


--------------------------------------------------------------------------------
/docker/x86_64.dockerfile:
--------------------------------------------------------------------------------
 1 | ARG TAG=24.01-py3
 2 | 
 3 | FROM nvcr.io/nvidia/tensorrt:${TAG} AS tensorrtx
 4 | 
 5 | ENV DEBIAN_FRONTEND noninteractive
 6 | 
 7 | # basic tools
 8 | RUN apt update && apt-get install -y --fix-missing --no-install-recommends \
 9 | sudo wget curl git ca-certificates ninja-build tzdata pkg-config \
10 | gdb libglib2.0-dev libmount-dev \
11 | && rm -rf /var/lib/apt/lists/*
12 | RUN pip install --no-cache-dir yapf isort cmake-format pre-commit
13 | 
14 | ## override older cmake
15 | RUN find /usr/local/share -type d -name "cmake-*" -exec rm -rf {} + \
16 | && curl -fsSL "https://github.com/Kitware/CMake/releases/download/v3.29.0/cmake-3.29.0-linux-x86_64.sh" \
17 | -o cmake.sh && bash cmake.sh --skip-license --exclude-subdir --prefix=/usr/local && rm cmake.sh
18 | 
19 | RUN apt update && apt-get install -y \
20 | libopencv-dev \
21 | && rm -rf /var/lib/apt/lists/*
22 | 
23 | ## a template to build opencv and opencv_contrib from source
24 | # RUN git clone -b 4.x https://github.com/opencv/opencv_contrib.git \
25 | # && git clone -b 4.x https://github.com/opencv/opencv.git opencv \
26 | # && cmake -S opencv -B opencv/build -G Ninja \
27 | # -DBUILD_LIST=core,calib3d,imgproc,imgcodecs,highgui \
28 | # -DOPENCV_EXTRA_MODULES_PATH="/workspace/opencv_contrib/modules" \
29 | # -DCMAKE_BUILD_TYPE=RELEASE \
30 | # -DCMAKE_INSTALL_PREFIX=/usr/local \
31 | # -DENABLE_FAST_MATH=ON \
32 | # -DOPENCV_GENERATE_PKGCONFIG=ON \
33 | # -DBUILD_opencv_python2=OFF \
34 | # -DBUILD_opencv_python3=OFF \
35 | # -DBUILD_JAVA=OFF \
36 | # -DBUILD_DOCS=OFF \
37 | # -DBUILD_PERF_TESTS=OFF \
38 | # -DBUILD_TESTS=OFF \
39 | # && ninja -C opencv/build install
40 | 


--------------------------------------------------------------------------------
/efficient_ad/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.12)
 2 | project(EfficientAD-M)
 3 | 
 4 | add_definitions(-w)
 5 | add_definitions(-D API_EXPORTS)
 6 | set(CMAKE_CXX_STANDARD 11)
 7 | set(CMAKE_BUILD_TYPE "Debug")
 8 | set(CMAKE_CUDA_ARCHITECTURES 61 75 86 89)
 9 | set(THREADS_PREFER_PTHREAD_FLAG ON)
10 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /od")
11 | 
12 | ### nvcc
13 | set(CMAKE_CUDA_COMPILER "D:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.8/bin/nvcc.exe")
14 | enable_language(CUDA)
15 | ### cuda
16 | include_directories("D:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.8/include")
17 | link_directories("D:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.8/lib/x64")
18 | ### tensorrt
19 | set(TRT_DIR "D:/Program Files/NVIDIA GPU Computing Toolkit/TensorRT-8.5.3.1/")
20 | include_directories(${TRT_DIR}/include)
21 | link_directories(${TRT_DIR}/lib)
22 | ### opencv
23 | set(OpenCV_DIR "E:/OpenCV/OpenCV_4.6.0/opencv/build")
24 | find_package(OpenCV)
25 | include_directories(${OpenCV_INCLUDE_DIRS})
26 | ### dirent
27 | include_directories("E:/SDK/dirent-1.24/include")
28 | 
29 | include_directories(${PROJECT_SOURCE_DIR}/src/)
30 | file(GLOB_RECURSE SRCS ${PROJECT_SOURCE_DIR}/src/*.cpp ${PROJECT_SOURCE_DIR}/src/*.cu)
31 | 
32 | add_executable(efficientAD_det "./efficientAD_det.cpp" ${SRCS})
33 | target_link_libraries(efficientAD_det nvinfer
34 |                                       cudart
35 |                                       nvinfer_plugin
36 |                                       ${OpenCV_LIBS}
37 |                                       )
38 | 


--------------------------------------------------------------------------------
/efficient_ad/README.md:
--------------------------------------------------------------------------------
 1 | # EfficientAd
 2 | 
 3 | EfficientAd: Accurate Visual Anomaly Detection at Millisecond-Level Latencies.
 4 | 
 5 | The Pytorch implementation is [openvinotoolkit/anomalib](https://github.com/openvinotoolkit/anomalib).
 6 | 
 7 | <p align="center">
 8 | <img src="https://github.com/wang-xinyu/tensorrtx/assets/15235574/061c90a7-fe59-48e0-a8d0-6bddc4296cf1">
 9 | </p>
10 | 
11 | # Test Environment
12 | 
13 | GTX3080 / Windows10 22H2 / cuda11.8 / cudnn8.9.7 / TensorRT8.5.3 / OpenCV4.6
14 | 
15 | # How to Run
16 | 
17 | 1. training to generate weight files (`efficientAD_[category].pt`)
18 | 
19 |    ```
20 |    // Please refer to Anomalib's tutorial for details:
21 |    // https://github.com/openvinotoolkit/anomalib?tab=readme-ov-file#-training
22 |    ```
23 | 
24 | 2. generate `.wts` from pytorch with `.pt`
25 | 
26 |    ```
27 |    cd ./datas/models/
28 |    // copy your `.pt` file to the current directory.
29 |    python gen_wts.py
30 |    // a file `efficientAD_[category].wts` will be generated.
31 |    ```
32 | 
33 | 3. build and run
34 | 
35 |    ```
36 |    mkdir build
37 |    cd build
38 |    cmake ..
39 |    make
40 |    sudo ./EfficientAD-M -s [.wts] // serialize model to plan file
41 |    sudo ./EfficientAD-M -d [.engine] [image folder] // deserialize and run inference, the images in [image folder] will be processed
42 |    ```
43 | 
44 | # Latency
45 | 
46 | average cost of doInference(in `efficientad_detect.cpp`) from second time with batch=1 under the windows environment above
47 | 
48 | |               | FP32 |
49 | | :-----------: | :--: |
50 | | EfficientAD-M | 12ms |
51 | 


--------------------------------------------------------------------------------
/efficient_ad/datas/models/gen_wts.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import struct
 3 | import sys
 4 | 
 5 | # Initialize
 6 | pt_file = sys.argv[1]
 7 | device = torch.device('cuda')
 8 | # Load model
 9 | model = torch.load(pt_file, map_location=torch.device('cpu'))['model'].float()  # load to FP32
10 | model.to(device).eval()
11 | 
12 | with open(pt_file.split('.')[0] + '.wts', 'w') as f:
13 |     f.write('{}\n'.format(len(model.state_dict().keys())))
14 |     for k, v in model.state_dict().items():
15 |         vr = v.reshape(-1).cpu().numpy()
16 |         f.write('{} {} '.format(k, len(vr)))
17 |         for vv in vr:
18 |             f.write(' ')
19 |             f.write(struct.pack('>f', float(vv)).hex())
20 |         f.write('\n')
21 | 


--------------------------------------------------------------------------------
/efficient_ad/src/config.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | /* --------------------------------------------------------
 4 |  * These configs are related to tensorrt model, if these are changed,
 5 |  * please re-compile and re-serialize the tensorrt model.
 6 |  * --------------------------------------------------------*/
 7 | 
 8 | // For INT8, you need prepare the calibration dataset, please refer to
 9 | #define USE_FP32  // set USE_INT8 or USE_FP16 or USE_FP32
10 | 
11 | // These are used to define input/output tensor names,
12 | // you can set them to whatever you want.
13 | const static char* kInputTensorName = "data";
14 | const static char* kOutputTensorName = "prob";
15 | 
16 | constexpr static int kBatchSize = 1;
17 | 
18 | // input width and height must by divisible by 32
19 | constexpr static int kInputH = 256;
20 | constexpr static int kInputW = 256;
21 | 
22 | /* --------------------------------------------------------
23 |  * These configs are NOT related to tensorrt model, if these are changed,
24 |  * please re-compile, but no need to re-serialize the tensorrt model.
25 |  * --------------------------------------------------------*/
26 | 
27 | // default GPU_id
28 | const static int kGpuId = 0;
29 | 
30 | // If your image size is larger than 4096 * 3112, please increase this value
31 | const static int kMaxInputImageSize = 4096 * 3112;
32 | 


--------------------------------------------------------------------------------
/efficient_ad/src/cuda_utils.h:
--------------------------------------------------------------------------------
 1 | #ifndef TRTX_CUDA_UTILS_H_
 2 | #define TRTX_CUDA_UTILS_H_
 3 | 
 4 | #include <cuda_runtime_api.h>
 5 | 
 6 | #ifndef CUDA_CHECK
 7 | #define CUDA_CHECK(callstr)                                                                    \
 8 |     {                                                                                          \
 9 |         cudaError_t error_code = callstr;                                                      \
10 |         if (error_code != cudaSuccess) {                                                       \
11 |             std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \
12 |             assert(0);                                                                         \
13 |         }                                                                                      \
14 |     }
15 | #endif  // CUDA_CHECK
16 | 
17 | #endif  // TRTX_CUDA_UTILS_H_
18 | 


--------------------------------------------------------------------------------
/efficient_ad/src/macros.h:
--------------------------------------------------------------------------------
 1 | #ifndef __MACROS_H
 2 | #define __MACROS_H
 3 | 
 4 | #include <NvInfer.h>
 5 | 
 6 | #ifdef API_EXPORTS
 7 | #if defined(_MSC_VER)
 8 | #define API __declspec(dllexport)
 9 | #else
10 | #define API __attribute__((visibility("default")))
11 | #endif
12 | #else
13 | 
14 | #if defined(_MSC_VER)
15 | #define API __declspec(dllimport)
16 | #else
17 | #define API
18 | #endif
19 | #endif  // API_EXPORTS
20 | 
21 | #if NV_TENSORRT_MAJOR >= 8
22 | #define TRT_NOEXCEPT noexcept
23 | #define TRT_CONST_ENQUEUE const
24 | #else
25 | #define TRT_NOEXCEPT
26 | #define TRT_CONST_ENQUEUE
27 | #endif
28 | 
29 | #endif  // __MACROS_H
30 | 


--------------------------------------------------------------------------------
/efficient_ad/src/model.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <NvInfer.h>
 4 | 
 5 | #include <string>
 6 | 
 7 | nvinfer1::ICudaEngine* build_efficientAD_engine(unsigned int maxBatchSize, nvinfer1::IBuilder* builder,
 8 |                                                 nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, float& gd,
 9 |                                                 float& gw, std::string& wts_name);
10 | 


--------------------------------------------------------------------------------
/efficient_ad/src/postprocess.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <opencv2/opencv.hpp>
 4 | 
 5 | void genHeatMap(cv::Mat originImg, cv::Mat& anomalyGrayMap, cv::Mat& HeatMap) {
 6 |     cv::Mat colorMap;
 7 |     cv::applyColorMap(colorMap, anomalyGrayMap, cv::COLORMAP_JET);
 8 |     cv::addWeighted(originImg, 0.5, colorMap, 0.5, 0, HeatMap);
 9 | }
10 | 


--------------------------------------------------------------------------------
/efficient_ad/src/utils.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <dirent.h>
 4 | #include <cstring>
 5 | #include <fstream>
 6 | #include <sstream>
 7 | #include <string>
 8 | #include <unordered_map>
 9 | #include <vector>
10 | 
11 | static inline int read_files_in_dir(const char* p_dir_name, std::vector<std::string>& file_names) {
12 |     DIR* p_dir = opendir(p_dir_name);
13 |     if (p_dir == nullptr) {
14 |         return -1;
15 |     }
16 | 
17 |     struct dirent* p_file = nullptr;
18 |     while ((p_file = readdir(p_dir)) != nullptr) {
19 |         if (strcmp(p_file->d_name, ".") != 0 && strcmp(p_file->d_name, "..") != 0) {
20 |             //std::string cur_file_name(p_dir_name);
21 |             //cur_file_name += "/";
22 |             //cur_file_name += p_file->d_name;
23 |             std::string cur_file_name(p_file->d_name);
24 |             file_names.push_back(cur_file_name);
25 |         }
26 |     }
27 | 
28 |     closedir(p_dir);
29 |     return 0;
30 | }
31 | 


--------------------------------------------------------------------------------
/efficientnet/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 2.6)
 2 | 
 3 | project(efficientnet)
 4 | 
 5 | add_definitions(-std=c++11)
 6 | 
 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
 8 | set(CMAKE_CXX_STANDARD 11)
 9 | set(CMAKE_BUILD_TYPE Debug)
10 | 
11 | find_package(CUDA REQUIRED)
12 | 
13 | include_directories(${PROJECT_SOURCE_DIR}/include)
14 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different
15 | # cuda
16 | include_directories(/usr/local/cuda/include)
17 | link_directories(/usr/local/cuda/lib64)
18 | # tensorrt
19 | include_directories(/usr/include/x86_64-linux-gnu/)
20 | link_directories(/usr/lib/x86_64-linux-gnu/)
21 | 
22 | add_executable(efficientnet  ${PROJECT_SOURCE_DIR}/efficientnet.cpp)
23 | target_link_libraries(efficientnet nvinfer)
24 | target_link_libraries(efficientnet cudart)
25 | 
26 | add_definitions(-O2 -pthread)
27 | 
28 | 


--------------------------------------------------------------------------------
/efficientnet/README.md:
--------------------------------------------------------------------------------
 1 | # EfficientNet
 2 | 
 3 | A TensorRT implementation of EfficientNet.
 4 | For the Pytorch implementation, you can refer to [EfficientNet-PyTorch](https://github.com/lukemelas/EfficientNet-PyTorch)
 5 | 
 6 | ## How to run
 7 | 
 8 | 1. install `efficientnet_pytorch`
 9 | ```
10 | pip install efficientnet_pytorch
11 | ```
12 | 
13 | 2. gennerate `.wts` file
14 | ```
15 | python gen_wts.py
16 | ```
17 | 
18 | 3. build
19 | 
20 | ```
21 | mkdir build
22 | cd build
23 | cmake ..
24 | make
25 | ```
26 | 4. serialize model to engine
27 | ```
28 | ./efficientnet -s [.wts] [.engine] [b0 b1 b2 b3 ... b7]  // serialize model to engine file
29 | ```
30 | such as
31 | ```
32 | ./efficientnet -s ../efficientnet-b3.wts efficientnet-b3.engine b3
33 | ```
34 | 5. deserialize and do infer
35 | ```
36 | ./efficientnet -d [.engine] [b0 b1 b2 b3 ... b7]   // deserialize engine file and run inference
37 | ```
38 | such as 
39 | ```
40 | ./efficientnet -d efficientnet-b3.engine b3
41 | ```
42 | 6. see if the output is same as pytorch side
43 | 
44 | 
45 | For more models, please refer to [tensorrtx](https://github.com/wang-xinyu/tensorrtx)
46 | 


--------------------------------------------------------------------------------
/efficientnet/gen_wts.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import struct
 3 | from efficientnet_pytorch import EfficientNet
 4 | model = EfficientNet.from_pretrained('efficientnet-b3')
 5 | 
 6 | model.eval()
 7 | f = open('efficientnet-b3.wts', 'w')
 8 | f.write('{}\n'.format(len(model.state_dict().keys())))
 9 | for k, v in model.state_dict().items():
10 |     vr = v.reshape(-1).cpu().numpy()
11 |     f.write('{} {} '.format(k, len(vr)))
12 |     for vv in vr:
13 |         f.write(' ')
14 |         f.write(struct.pack('>f',float(vv)).hex())
15 |     f.write('\n')
16 | f.close()
17 | 


--------------------------------------------------------------------------------
/ghostnet/ghostnetv1/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 2.6)
 2 | 
 3 | project(ghostnetv1)
 4 | 
 5 | add_definitions(-std=c++11)
 6 | 
 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
 8 | set(CMAKE_CXX_STANDARD 11)
 9 | set(CMAKE_BUILD_TYPE Debug)
10 | 
11 | include_directories(${PROJECT_SOURCE_DIR}/include)
12 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different
13 | # cuda
14 | include_directories(/usr/local/cuda/include)
15 | link_directories(/usr/local/cuda/lib64)
16 | # tensorrt
17 | include_directories(/usr/include/x86_64-linux-gnu/)
18 | link_directories(/usr/lib/x86_64-linux-gnu/)
19 | 
20 | add_executable(ghostnetv1 ${PROJECT_SOURCE_DIR}/ghostnetv1.cpp)
21 | target_link_libraries(ghostnetv1 nvinfer)
22 | target_link_libraries(ghostnetv1 cudart)
23 | 
24 | add_definitions(-O2 -pthread)
25 | 


--------------------------------------------------------------------------------
/ghostnet/ghostnetv2/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 2.6)
 2 | 
 3 | project(ghostnetv2)
 4 | 
 5 | add_definitions(-std=c++11)
 6 | 
 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
 8 | set(CMAKE_CXX_STANDARD 11)
 9 | set(CMAKE_BUILD_TYPE Debug)
10 | 
11 | include_directories(${PROJECT_SOURCE_DIR}/include)
12 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different
13 | # cuda
14 | include_directories(/usr/local/cuda/include)
15 | link_directories(/usr/local/cuda/lib64)
16 | # tensorrt
17 | include_directories(/usr/include/x86_64-linux-gnu/)
18 | link_directories(/usr/lib/x86_64-linux-gnu/)
19 | 
20 | add_executable(ghostnetv2 ${PROJECT_SOURCE_DIR}/ghostnetv2.cpp)
21 | target_link_libraries(ghostnetv2 nvinfer)
22 | target_link_libraries(ghostnetv2 cudart)
23 | 
24 | add_definitions(-O2 -pthread)
25 | 


--------------------------------------------------------------------------------
/googlenet/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 2.6)
 2 | 
 3 | project(googlenet)
 4 | 
 5 | add_definitions(-std=c++11)
 6 | 
 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
 8 | set(CMAKE_CXX_STANDARD 11)
 9 | set(CMAKE_BUILD_TYPE Debug)
10 | 
11 | include_directories(${PROJECT_SOURCE_DIR}/include)
12 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different
13 | # cuda
14 | include_directories(/usr/local/cuda/include)
15 | link_directories(/usr/local/cuda/lib64)
16 | # tensorrt
17 | include_directories(/usr/include/x86_64-linux-gnu/)
18 | link_directories(/usr/lib/x86_64-linux-gnu/)
19 | 
20 | add_executable(googlenet ${PROJECT_SOURCE_DIR}/googlenet.cpp)
21 | target_link_libraries(googlenet nvinfer)
22 | target_link_libraries(googlenet cudart)
23 | 
24 | add_definitions(-O2 -pthread)
25 | 
26 | 


--------------------------------------------------------------------------------
/googlenet/README.md:
--------------------------------------------------------------------------------
 1 | # googlenet
 2 | 
 3 | GoogLeNet (Inception v1) model architecture from "Going Deeper with Convolutions" <http://arxiv.org/abs/1409.4842>`_.
 4 | 
 5 | For the details, you can refer to [pytorchx/googlenet](https://github.com/wang-xinyu/pytorchx/tree/master/googlenet)
 6 | 
 7 | Following tricks used in this googlenet:
 8 | 
 9 | - MaxPool2d(ceil_mode=True), ceilmode=True, which is not supported in Tensorrt4, we use a padding layer before maxpool to solve this problem.
10 | - Batchnorm layer, implemented by scale layer.
11 | 
12 | ```
13 | // 1. generate googlenet.wts from [pytorchx/googlenet](https://github.com/wang-xinyu/pytorchx/tree/master/googlenet)
14 | 
15 | // 2. put googlenet.wts into tensorrtx/googlenet
16 | 
17 | // 3. build and run
18 | 
19 | cd tensorrtx/googlenet
20 | 
21 | mkdir build
22 | 
23 | cd build
24 | 
25 | cmake ..
26 | 
27 | make
28 | 
29 | sudo ./googlenet -s   // serialize model to plan file i.e. 'googlenet.engine'
30 | 
31 | sudo ./googlenet -d   // deserialize plan file and run inference
32 | 
33 | // 4. see if the output is same as pytorchx/googlenet
34 | ```
35 | 
36 | 
37 | 


--------------------------------------------------------------------------------
/hrnet/hrnet-image-classification/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 2.6)
 2 | 
 3 | project(hrnet)
 4 | 
 5 | add_definitions(-std=c++11)
 6 | 
 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
 8 | set(CMAKE_CXX_STANDARD 11)
 9 | set(CMAKE_BUILD_TYPE Debug)
10 | 
11 | find_package(CUDA REQUIRED)
12 | include_directories(/usr/local/cuda/include)
13 | link_directories(/usr/local/cuda/lib64)
14 | 
15 | find_package(OpenCV)
16 | include_directories(${OpenCV_INCLUDE_DIRS})
17 | 
18 | add_executable(hrnet ${PROJECT_SOURCE_DIR}/hrnet.cpp)
19 | target_link_libraries(hrnet nvinfer)
20 | target_link_libraries(hrnet cudart)
21 | target_link_libraries(hrnet ${OpenCV_LIBS})
22 | 
23 | add_definitions(-O2 -pthread)
24 | 
25 | 


--------------------------------------------------------------------------------
/hrnet/hrnet-image-classification/README.md:
--------------------------------------------------------------------------------
 1 | # HRNet
 2 | 
 3 | The Pytorch implementation is [HRNet-Image-Classification](https://github.com/HRNet/HRNet-Image-Classification).  The implemented model is **HRNet-W18-C-Small-v2** 
 4 | 
 5 | 
 6 | ## How to Run
 7 | 
 8 | * 1. generate .wts
 9 | 
10 |   Download code and model from [HRNet-Image-Classification](https://github.com/HRNet/HRNet-Image-Classification) and config your environments.
11 | 
12 |   Put `demo.py`  in the `YOUR_ROOT_DIR\HRNet-Image-Classification\tools `  folder, set `savewts in  main()` as `True`, and run, the .wts will be generated.
13 | 
14 | * 2. cmake and make
15 | 
16 |   ```
17 |   mkdir build
18 |   cd build
19 |   cmake ..
20 |   make
21 |   sudo ./hrnet -s             // serialize model to plan file i.e. 'hrnet.engine'
22 |   sudo ./hrnet -d  ../samples // deserialize plan file and run inference, the images in samples will be processed.
23 |   ```
24 | 
25 | ## Result
26 | 
27 | The test img:
28 | 
29 | ![](https://user-images.githubusercontent.com/20653176/93732833-ac103200-fc05-11ea-88ff-6f59f316a377.JPEG)
30 | 
31 | Pytorch Result:
32 | 
33 | ![image-20200921115119593](https://user-images.githubusercontent.com/20653176/93731787-225e6580-fc01-11ea-9578-393079cd1873.png)
34 | 
35 | TRT Result:
36 | 
37 | ![image-20200921114959069](https://user-images.githubusercontent.com/20653176/93731788-238f9280-fc01-11ea-954f-2debc20e102a.png)
38 | 


--------------------------------------------------------------------------------
/hrnet/hrnet-image-classification/hrnet.cpp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wang-xinyu/tensorrtx/c6bcebd812907a6d9eb66ecc909eb17b51d40fac/hrnet/hrnet-image-classification/hrnet.cpp


--------------------------------------------------------------------------------
/hrnet/hrnet-semantic-segmentation/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 2.6)
 2 | 
 3 | project(hrnetseg)
 4 | 
 5 | add_definitions(-std=c++11)
 6 | 
 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
 8 | set(CMAKE_CXX_STANDARD 11)
 9 | set(CMAKE_BUILD_TYPE Debug)
10 | 
11 | find_package(CUDA REQUIRED)
12 | 
13 | include_directories(${PROJECT_SOURCE_DIR}/include)
14 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different
15 | # cuda
16 | include_directories(/usr/local/cuda/include)
17 | link_directories(/usr/local/cuda/lib64)
18 | # tensorrt
19 | include_directories(/usr/include/x86_64-linux-gnu/)
20 | link_directories(/usr/lib/x86_64-linux-gnu/)
21 | 
22 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")
23 | 
24 | find_package(OpenCV)
25 | include_directories(${OpenCV_INCLUDE_DIRS})
26 | 
27 | add_executable(hrnet ${PROJECT_SOURCE_DIR}/hrnet.cpp)
28 | target_link_libraries(hrnet nvinfer)
29 | target_link_libraries(hrnet cudart)
30 | target_link_libraries(hrnet ${OpenCV_LIBS})
31 | 
32 | 
33 | add_executable(hrnet_ocr ${PROJECT_SOURCE_DIR}/hrnet_ocr.cpp)
34 | target_link_libraries(hrnet_ocr nvinfer)
35 | target_link_libraries(hrnet_ocr cudart)
36 | target_link_libraries(hrnet_ocr ${OpenCV_LIBS})
37 | 
38 | 
39 | add_definitions(-O2 -pthread)
40 | 
41 | 


--------------------------------------------------------------------------------
/ibnnet/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 2.6)
 2 | 
 3 | project(IBNNet)
 4 | 
 5 | add_definitions(-std=c++11)
 6 | 
 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
 8 | set(CMAKE_CXX_STANDARD 11)
 9 | set(CMAKE_BUILD_TYPE Debug)
10 | 
11 | find_package(CUDA REQUIRED)
12 | 
13 | include_directories(${PROJECT_SOURCE_DIR}/include)
14 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different
15 | # cuda
16 | include_directories(/usr/local/cuda/include)
17 | link_directories(/usr/local/cuda/lib64)
18 | # tensorrt
19 | include_directories(/usr/include/x86_64-linux-gnu/)
20 | link_directories(/usr/lib/x86_64-linux-gnu/)
21 | 
22 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -pthread -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")
23 | 
24 | find_package(OpenCV)
25 | include_directories(${OpenCV_INCLUDE_DIRS})
26 | 
27 | file(GLOB SOURCE_FILES "*.h" "*.cpp")
28 | 
29 | add_executable(ibnnet ${SOURCE_FILES})
30 | target_link_libraries(ibnnet nvinfer)
31 | target_link_libraries(ibnnet cudart)
32 | target_link_libraries(ibnnet ${OpenCV_LIBS})
33 | 
34 | add_definitions(-O2 -pthread)
35 | 
36 | 


--------------------------------------------------------------------------------
/ibnnet/README.md:
--------------------------------------------------------------------------------
 1 | # IBN-Net
 2 | 
 3 | An implementation of IBN-Net, proposed in ["Two at Once: Enhancing Learning and Generalization Capacities via IBN-Net"](https://arxiv.org/abs/1807.09441), ECCV2018 by Xingang Pan, Ping Luo, Jianping Shi, Xiaoou Tang. 
 4 | 
 5 | For the Pytorch implementation, you can refer to [IBN-Net](https://github.com/XingangPan/IBN-Net)
 6 | 
 7 | ## Features
 8 | - InstanceNorm2d
 9 | - bottleneck_ibn
10 | - Resnet50-IBNA
11 | - Resnet50-IBNB
12 | - Multi-thread inference
13 | 
14 | ## How to Run
15 | 
16 | * 1. generate .wts
17 | 
18 |   // for ibn-a
19 |   ```
20 |   python gen_wts.py a
21 |   ```
22 |   a file 'resnet50-ibna.wts' will be generated.
23 | 
24 |   // for ibn-b
25 |   ```
26 |   python gen_wts.py b
27 |   ```
28 |   a file 'resnet50-ibnb.wts' will be generated.
29 | * 2. cmake and make
30 | 
31 |   ```
32 |   mkdir build
33 |   cd build
34 |   cmake ..
35 |   make
36 |   ```
37 | * 3. build engine and run classification
38 | 
39 |   // put resnet50-ibna.wts/resnet50-ibnb.wts into tensorrtx/ibnnet
40 |   
41 |   // go to tensorrtx/ibnnet
42 |   ```
43 |   ./ibnnet -s  // serialize model to plan file
44 |   ./ibnnet -d  // deserialize plan file and run inference
45 |   ```
46 |   


--------------------------------------------------------------------------------
/ibnnet/gen_wts.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import os
 3 | import sys
 4 | import struct
 5 | 
 6 | 
 7 | assert sys.argv[1] == "a" or sys.argv[1] == "b"
 8 | model_name = "resnet50_ibn_" + sys.argv[1]
 9 | 
10 | net = torch.hub.load('XingangPan/IBN-Net', model_name, pretrained=True).to('cuda:0').eval()
11 | 
12 | #verify
13 | #input = torch.ones(1, 3, 224, 224).to('cuda:0')
14 | #pixel_mean = torch.tensor([0.485, 0.456, 0.406]).view(1, -1, 1, 1).to('cuda:0')
15 | #pixel_std = torch.tensor([0.229, 0.224, 0.225]).view(1, -1, 1, 1).to('cuda:0')
16 | #input.sub_(pixel_mean).div_(pixel_std)
17 | #out = net(input)
18 | #print(out)
19 | 
20 | f = open(model_name + ".wts", 'w')
21 | f.write("{}\n".format(len(net.state_dict().keys())))
22 | for k,v in net.state_dict().items():
23 |     vr = v.reshape(-1).cpu().numpy()
24 |     f.write("{} {}".format(k, len(vr)))
25 |     for vv in vr:
26 |         f.write(" ")
27 |         f.write(struct.pack(">f", float(vv)).hex())
28 |     f.write("\n")
29 | 
30 | 
31 | 


--------------------------------------------------------------------------------
/ibnnet/holder.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | template <typename T>
 4 | class TensorRTHolder {
 5 |     T* holder;
 6 | public:
 7 |     explicit TensorRTHolder(T* holder_) : holder(holder_) {}
 8 |     ~TensorRTHolder() {
 9 |         if (holder)
10 |             holder->destroy();
11 |     }
12 |     TensorRTHolder(const TensorRTHolder&) = delete;
13 |     TensorRTHolder& operator=(const TensorRTHolder&) = delete;
14 |     TensorRTHolder(TensorRTHolder && rhs) noexcept{
15 |         holder = rhs.holder;
16 |         rhs.holder = nullptr;
17 |     }
18 |     TensorRTHolder& operator=(TensorRTHolder&& rhs) noexcept {
19 |         if (this == &rhs) {
20 |             return *this;
21 |         }
22 |         if (holder) holder->destroy();
23 |         holder = rhs.holder;
24 |         rhs.holder = nullptr;
25 |         return *this;
26 |     }
27 |     T* operator->() {
28 |         return holder;
29 |     }
30 |     T* get() { return holder; }
31 |     explicit operator bool() { return holder != nullptr; }
32 |     T& operator*() noexcept { return *holder; }
33 | };
34 | 
35 | template <typename T>
36 | TensorRTHolder<T> make_holder(T* holder) {
37 |     return TensorRTHolder<T>(holder);
38 | }
39 | 
40 | template <typename T>
41 | using TensorRTNonHolder = T*;


--------------------------------------------------------------------------------
/ibnnet/ibnnet.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "utils.h"
 4 | #include "holder.h"
 5 | #include "layers.h"
 6 | #include "InferenceEngine.h"
 7 | #include <memory>
 8 | #include <vector>
 9 | #include <chrono>
10 | #include <opencv2/opencv.hpp>
11 | extern Logger gLogger;
12 | using namespace trtxapi;
13 | 
14 | namespace trt {
15 | 
16 |     enum IBN {
17 |         A, // resnet50-ibna,
18 |         B, // resnet50-ibnb,
19 |         NONE // resnet50
20 |     };
21 | 
22 |     class IBNNet {
23 |     public:
24 |         IBNNet(trt::EngineConfig &enginecfg, const IBN ibn);
25 |         ~IBNNet() {};
26 | 
27 |         bool serializeEngine(); /* create & serializeEngine */ 
28 |         bool deserializeEngine();
29 |         bool inference(std::vector<cv::Mat> &input); /* support batch inference */
30 | 
31 |         float* getOutput(); 
32 |         int getDeviceID(); /* cuda deviceid */ 
33 | 
34 |     private:
35 |         ICudaEngine *createEngine(IBuilder *builder, IBuilderConfig *config);
36 |         void preprocessing(const cv::Mat& img, float* const data, const std::size_t stride);
37 | 
38 |     private:
39 |         trt::EngineConfig _engineCfg;
40 |         std::unique_ptr<trt::InferenceEngine> _inferEngine{nullptr};
41 |         std::string _ibn;
42 |         DataType _dt{DataType::kFLOAT};
43 |     };
44 | 
45 | }


--------------------------------------------------------------------------------
/ibnnet/layers.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <map>
 4 | #include <math.h>
 5 | #include <assert.h>
 6 | #include "NvInfer.h"
 7 | #include "cuda_runtime_api.h"
 8 | using namespace nvinfer1;
 9 | 
10 | namespace trtxapi {
11 | 
12 |     ITensor* MeanStd(INetworkDefinition *network, 
13 |         std::map<std::string, Weights>& weightMap, 
14 |         ITensor* input, 
15 |         const std::string lname,
16 |         const float* mean, 
17 |         const float* std, 
18 |         const bool div255);
19 | 
20 |     IScaleLayer* addBatchNorm2d(INetworkDefinition *network, 
21 |         std::map<std::string, Weights>& weightMap, 
22 |         ITensor& input, 
23 |         const std::string lname, 
24 |         const float eps);
25 | 
26 |     IScaleLayer* addInstanceNorm2d(INetworkDefinition *network, 
27 |         std::map<std::string, Weights>& weightMap, 
28 |         ITensor& input, 
29 |         const std::string lname, 
30 |         const float eps);
31 | 
32 |     IConcatenationLayer* addIBN(INetworkDefinition *network, 
33 |         std::map<std::string, Weights>& weightMap, 
34 |         ITensor& input, 
35 |         const std::string lname);
36 | 
37 |     IActivationLayer* bottleneck_ibn(INetworkDefinition *network, 
38 |         std::map<std::string, Weights>& weightMap, 
39 |         ITensor& input, 
40 |         const int inch, 
41 |         const int outch,
42 |         const int stride, 
43 |         const std::string lname, 
44 |         const std::string ibn);
45 | 
46 | }


--------------------------------------------------------------------------------
/ibnnet/utils.cpp:
--------------------------------------------------------------------------------
 1 | #include "utils.h"
 2 | 
 3 | // Load weights from files shared with TensorRT samples.
 4 | // TensorRT weight files have a simple space delimited format:
 5 | // [type] [size] <data x size in hex>
 6 | std::map<std::string, Weights> loadWeights(const std::string file) {
 7 |     std::cout << "Loading weights: " << file << std::endl;
 8 |     std::map<std::string, Weights> weightMap;
 9 | 
10 |     // Open weights file
11 |     std::ifstream input(file);
12 |     assert(input.is_open() && "Unable to load weight file.");
13 | 
14 |     // Read number of weight blobs
15 |     int32_t count;
16 |     input >> count;
17 |     assert(count > 0 && "Invalid weight map file.");
18 | 
19 |     while (count--) {
20 |         Weights wt{DataType::kFLOAT, nullptr, 0};
21 |         uint32_t size;
22 | 
23 |         // Read name and type of blob
24 |         std::string name;
25 |         input >> name >> std::dec >> size;
26 |         wt.type = DataType::kFLOAT;
27 | 
28 |         // Load blob
29 |         uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(val) * size));
30 |         for (uint32_t x = 0, y = size; x < y; ++x) {
31 |             input >> std::hex >> val[x];
32 |         }
33 |         wt.values = val;
34 |         wt.count = size;
35 |         weightMap[name] = wt;
36 |     }
37 | 
38 |     return weightMap;
39 | }
40 | 


--------------------------------------------------------------------------------
/ibnnet/utils.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <map>
 4 | #include "NvInfer.h"
 5 | #include "cuda_runtime_api.h"
 6 | #include "assert.h"
 7 | #include <fstream>
 8 | #include <iostream>
 9 | #include <memory>
10 | 
11 | using namespace nvinfer1;
12 | 
13 | #define CHECK(status)                             \
14 |     do                                            \
15 |     {                                             \
16 |         auto ret = (status);                      \
17 |         if (ret != 0)                             \
18 |         {                                         \
19 |             std::cout << "Cuda failure: " << ret; \
20 |             abort();                              \
21 |         }                                         \
22 |     } while (0)
23 | 
24 | template<typename T, typename... Args>
25 | std::unique_ptr<T> make_unique(Args&&... args) {
26 |     return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
27 | }
28 | 
29 | std::map<std::string, Weights> loadWeights(const std::string file);
30 | 
31 | 


--------------------------------------------------------------------------------
/inception/inceptionv3/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 2.6)
 2 | 
 3 | project(inception)
 4 | 
 5 | add_definitions(-std=c++11)
 6 | 
 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
 8 | set(CMAKE_CXX_STANDARD 11)
 9 | set(CMAKE_BUILD_TYPE Debug)
10 | 
11 | include_directories(${PROJECT_SOURCE_DIR}/include)
12 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different
13 | # cuda
14 | include_directories(/usr/local/cuda/include)
15 | link_directories(/usr/local/cuda/lib64)
16 | # tensorrt
17 | include_directories(/usr/include/x86_64-linux-gnu/)
18 | link_directories(/usr/lib/x86_64-linux-gnu/)
19 | 
20 | add_executable(inception ${PROJECT_SOURCE_DIR}/inception_v3.cpp)
21 | target_link_libraries(inception nvinfer)
22 | target_link_libraries(inception cudart)
23 | 
24 | add_definitions(-O2 -pthread)
25 | 
26 | 


--------------------------------------------------------------------------------
/inception/inceptionv3/README.md:
--------------------------------------------------------------------------------
 1 | # Inception v3
 2 | 
 3 | Inception v3 model architecture from "Rethinking the Inception Architecture for Computer Vision" <http://arxiv.org/abs/1512.00567>.
 4 | 
 5 | For the details, you can refer to [pytorchx/inception](https://github.com/wang-xinyu/pytorchx/tree/master/inception)
 6 | 
 7 | Following tricks are used in this inception:
 8 | 
 9 | - For pooling layer with padding, we need pay attention to see if padding is included or excluded while calculating average number. Pytorch includes padding while doing avgPool by default, but Tensorrt doesn't. So for pooling layer with padding, we need `setAverageCountExcludesPadding(false)` in tensorrt.
10 | - Batchnorm layer, implemented by scale layer.
11 | 
12 | ```
13 | // 1. generate inception.wts from [pytorchx/inception](https://github.com/wang-xinyu/pytorchx/tree/master/inception)
14 | 
15 | // 2. put inception.wts into tensorrtx/inception
16 | 
17 | // 3. build and run
18 | 
19 | cd tensorrtx/inception
20 | 
21 | mkdir build
22 | 
23 | cd build
24 | 
25 | cmake ..
26 | 
27 | make
28 | 
29 | sudo ./inception -s   // serialize model to plan file i.e. 'inception.engine'
30 | 
31 | sudo ./inception -d   // deserialize plan file and run inference
32 | 
33 | // 4. see if the output is same as pytorchx/inception
34 | ```
35 | 
36 | 
37 | 


--------------------------------------------------------------------------------
/inception/inceptionv4/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 2.6)
 2 | 
 3 | project(InceptionV4)
 4 | 
 5 | add_definitions(-std=c++11)
 6 | 
 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
 8 | set(CMAKE_CXX_STANDARD 11)
 9 | set(CMAKE_BUILD_TYPE Debug)
10 | 
11 | find_package(CUDA REQUIRED)
12 | 
13 | include_directories(${PROJECT_SOURCE_DIR}/include)
14 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different
15 | # cuda
16 | include_directories(/usr/local/cuda/include)
17 | link_directories(/usr/local/cuda/lib64)
18 | # tensorrt
19 | include_directories(/usr/include/x86_64-linux-gnu/)
20 | link_directories(/usr/lib/x86_64-linux-gnu/)
21 | 
22 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -pthread -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")
23 | 
24 | find_package(OpenCV)
25 | include_directories(${OpenCV_INCLUDE_DIRS})
26 | 
27 | file(GLOB SOURCE_FILES "*.h" "*.cpp")
28 | 
29 | add_executable(inceptionv4 ${SOURCE_FILES})
30 | target_link_libraries(inceptionv4 nvinfer)
31 | target_link_libraries(inceptionv4 cudart)
32 | target_link_libraries(inceptionv4 ${OpenCV_LIBS})
33 | 
34 | add_definitions(-O2 -pthread)
35 | 
36 | 


--------------------------------------------------------------------------------
/inception/inceptionv4/README.md:
--------------------------------------------------------------------------------
 1 | # Inception v4
 2 | 
 3 | Inception v4 model architecture from "Inception-v4, Inception-ResNet and the Impact of Residual Connections on Learning" <https://arxiv.org/abs/1602.07261v2>.
 4 | 
 5 | For the details, you can refer to [rwightman/pytorch-image-models](https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/inception_v4.py)
 6 | 
 7 | Following tricks are used in this inception:
 8 | 
 9 | - For pooling layer with padding, we need pay attention to see if padding is included or excluded while calculating average number. Pytorch includes padding while doing avgPool by default, but Tensorrt doesn't. So for pooling layer with padding, we need `setAverageCountExcludesPadding(false)` in tensorrt.
10 | - Batchnorm layer, implemented by scale layer.
11 | 
12 | ```
13 | // 1. generate inception.wts from [BlueMirrors/torchtrtz](https://github.com/BlueMirrors/torchtrtz/blob/main/generate_weights.py)
14 | 
15 | // 2. put inception.wts into tensorrtx/inceptionV4
16 | 
17 | // 3. build and run
18 | 
19 | cd tensorrtx/inception/inceptionV4
20 | 
21 | mkdir build
22 | 
23 | cd build
24 | 
25 | cmake ..
26 | 
27 | make
28 | 
29 | sudo ./inceptionV4 -s   // serialize model to plan file i.e. 'inceptionV4.engine'
30 | 
31 | sudo ./inceptionV4 -d   // deserialize plan file and run inference
32 | 
33 | // 4. see if the output is same as rwightman/pytorch-image-models/inceptionv4
34 | ```
35 | 
36 | 
37 | 


--------------------------------------------------------------------------------
/inception/inceptionv4/utils.cpp:
--------------------------------------------------------------------------------
 1 | # include "utils.h"
 2 | 
 3 | 
 4 | // Load weights from files.
 5 | // TensorRT weight files have a simple space delimited format:
 6 | // [type] [size] <data x size in hex>
 7 | std::map<std::string, Weights> loadWeights(const std::string file) {
 8 |     std::cout << "Loading weights: " << file << std::endl;
 9 |     std::map<std::string, Weights> weightMap;
10 | 
11 |     // Open weights file
12 |     std::ifstream input(file);
13 |     assert(input.is_open() && "Unable to load weight file.");
14 | 
15 |     // Read number of weight blobs
16 |     int32_t count;
17 |     input >> count;
18 |     assert(count > 0 && "Invalid weight map file.");
19 | 
20 |     while (count--)
21 |     {
22 |         Weights wt{DataType::kFLOAT, nullptr, 0};
23 |         uint32_t size;
24 | 
25 |         // Read name and type of blob
26 |         std::string name;
27 |         input >> name >> std::dec >> size;
28 |         wt.type = DataType::kFLOAT;
29 | 
30 |         // Load blob
31 |         uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(val) * size));
32 |         for (uint32_t x = 0, y = size; x < y; ++x)
33 |         {
34 |             input >> std::hex >> val[x];
35 |         }
36 |         wt.values = val;
37 |         
38 |         wt.count = size;
39 |         weightMap[name] = wt;
40 |     }
41 | 
42 |     return weightMap;
43 | } 


--------------------------------------------------------------------------------
/inception/inceptionv4/utils.h:
--------------------------------------------------------------------------------
 1 | # ifndef TRTX_UTILS_H
 2 | # define TRTX_UTILS_H
 3 | 
 4 | #include <map>
 5 | #include "NvInfer.h"
 6 | #include "cuda_runtime_api.h"
 7 | #include "assert.h"
 8 | #include <fstream>
 9 | #include <iostream>
10 | #include <memory>
11 | 
12 | #ifndef CUDA_CHECK
13 | #define CUDA_CHECK(callstr)\
14 |     {\
15 |         cudaError_t error_code = callstr;\
16 |         if (error_code != cudaSuccess) {\
17 |             std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__;\
18 |             assert(0);\
19 |         }\
20 |     }
21 | #endif  // CUDA_CHECK
22 | 
23 | using namespace nvinfer1;
24 | 
25 | std::map<std::string, Weights> loadWeights(const std::string input);
26 | 
27 | #endif // TRTX_UTILS_H


--------------------------------------------------------------------------------
/lenet/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 2.6)
 2 | 
 3 | project(lenet)
 4 | 
 5 | add_definitions(-std=c++11)
 6 | 
 7 | set(TARGET_NAME "lenet")
 8 | 
 9 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
10 | set(CMAKE_CXX_STANDARD 11)
11 | set(CMAKE_BUILD_TYPE Debug)
12 | 
13 | include_directories(${PROJECT_SOURCE_DIR}/include)
14 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different
15 | # cuda
16 | include_directories(/usr/local/cuda/include)
17 | link_directories(/usr/local/cuda/lib64)
18 | # tensorrt
19 | include_directories(/usr/include/x86_64-linux-gnu/)
20 | link_directories(/usr/lib/x86_64-linux-gnu/)
21 | 
22 | FILE(GLOB SRC_FILES ${PROJECT_SOURCE_DIR}/lenet.cpp ${PROJECT_SOURCE_DIR}/include/*.h)
23 | 
24 | add_executable(${TARGET_NAME} ${SRC_FILES})
25 | target_link_libraries(${TARGET_NAME} nvinfer)
26 | target_link_libraries(${TARGET_NAME} cudart)
27 | 
28 | add_definitions(-O2 -pthread)
29 | 
30 | 


--------------------------------------------------------------------------------
/lenet/macros.h:
--------------------------------------------------------------------------------
 1 | #ifndef __MACROS_H
 2 | #define __MACROS_H
 3 | 
 4 | #if NV_TENSORRT_MAJOR >= 8
 5 | #define TRT_NOEXCEPT noexcept
 6 | #define TRT_CONST_ENQUEUE const
 7 | #else
 8 | #define TRT_NOEXCEPT
 9 | #define TRT_CONST_ENQUEUE
10 | #endif
11 | 
12 | #endif  // __MACROS_H
13 | 


--------------------------------------------------------------------------------
/lprnet/1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wang-xinyu/tensorrtx/c6bcebd812907a6d9eb66ecc909eb17b51d40fac/lprnet/1.jpg


--------------------------------------------------------------------------------
/lprnet/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 2.6)
 2 | 
 3 | project(LPRnet)
 4 | 
 5 | add_definitions(-std=c++11)
 6 | 
 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
 8 | set(CMAKE_CXX_STANDARD 11)
 9 | set(CMAKE_BUILD_TYPE Debug)
10 | 
11 | find_package(CUDA REQUIRED)
12 | 
13 | include_directories(${PROJECT_SOURCE_DIR}/include)
14 | if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
15 |     message("embed_platform on")
16 |     include_directories(/usr/local/cuda/targets/aarch64-linux/include)
17 |     link_directories(/usr/local/cuda/targets/aarch64-linux/lib)
18 | else()
19 |     message("embed_platform off")
20 |     include_directories(/usr/local/cuda/include)
21 |     link_directories(/usr/local/cuda/lib64)
22 |     # tensorrt
23 |     include_directories(/usr/local/TensorRT-7.0.0.11/include)
24 |     link_directories(/usr/local/TensorRT-7.0.0.11/lib)
25 | endif()
26 | 
27 | find_package(OpenCV)
28 | include_directories(OpenCV_INCLUDE_DIRS)
29 | 
30 | add_executable(LPRnet ${PROJECT_SOURCE_DIR}/LPRnet.cpp)
31 | target_link_libraries(LPRnet nvinfer)
32 | target_link_libraries(LPRnet cudart)
33 | target_link_libraries(LPRnet ${OpenCV_LIBS})
34 | 
35 | add_definitions(-O2 -pthread)


--------------------------------------------------------------------------------
/lprnet/README.md:
--------------------------------------------------------------------------------
 1 | # LPRNet
 2 | 
 3 | The Pytorch implementation is [xuexingyu24/License_Plate_Detection_Pytorch](https://github.com/xuexingyu24/License_Plate_Detection_Pytorch).
 4 | 
 5 | ## How to Run
 6 | 
 7 | 1. generate LPRnet.wts from pytorch
 8 | 
 9 | ```
10 | git clone https://github.com/wang-xinyu/tensorrtx.git
11 | git clone https://github.com/xuexingyu24/License_Plate_Detection_Pytorch.git
12 | 
13 | // copy tensorrtx/LRPnet/gen_wts.py to License_Plate_Detection_Pytorch/
14 | // go to License_Plate_Detection_Pytorch/
15 | python genwts.py
16 | // a file 'LPRnet.wts' will be generated.
17 | ```
18 | 
19 | 2. build LPRnet and run
20 | 
21 | ```
22 | // put LPRnet.wts into tensorrtx/LPRnet
23 | // go to tensorrtx/LPRnet
24 | mkdir build
25 | cd build
26 | cmake ..
27 | make
28 | sudo ./LPRnet -s  // serialize model to file i.e. 'LPRnet.engine'
29 | sudo ./LPRnet -d  // deserialize model and run inference
30 | ```
31 | 
32 | 


--------------------------------------------------------------------------------
/lprnet/genwts.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from torch.autograd import Variable
 3 | 
 4 | from LPRNet.model import LPRNET
 5 | import struct
 6 | 
 7 | model_path = './weights/Final_LPRNet_model.pth'
 8 | CHARS = ['京', '沪', '津', '渝', '冀', '晋', '蒙', '辽', '吉', '黑',
 9 |          '苏', '浙', '皖', '闽', '赣', '鲁', '豫', '鄂', '湘', '粤',
10 |          '桂', '琼', '川', '贵', '云', '藏', '陕', '甘', '青', '宁',
11 |          '新',
12 |          '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
13 |          'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'J', 'K',
14 |          'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
15 |          'W', 'X', 'Y', 'Z', 'I', 'O', '-'
16 |          ]
17 | model = LPRNET.LPRNet(class_num=len(CHARS), dropout_rate=0)
18 | if torch.cuda.is_available():
19 |     model = model.cuda()
20 | print('loading pretrained model from %s' % model_path)
21 | model.load_state_dict(torch.load(model_path))
22 | 
23 | image = torch.ones(1, 3, 24, 94)
24 | if torch.cuda.is_available():
25 |     image = image.cuda()
26 | 
27 | model.eval()
28 | print(model)
29 | print('image shape ', image.shape)
30 | preds = model(image)
31 | 
32 | f = open("LPRNet.wts", 'w')
33 | f.write("{}\n".format(len(model.state_dict().keys())))
34 | for k, v in model.state_dict().items():
35 |     print('key: ', k)
36 |     print('value: ', v.shape)
37 |     vr = v.reshape(-1).cpu().numpy()
38 |     f.write("{} {}".format(k, len(vr)))
39 |     for vv in vr:
40 |         f.write(" ")
41 |         f.write(struct.pack(">f", float(vv)).hex())
42 |     f.write("\n")
43 | 


--------------------------------------------------------------------------------
/mlp/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.14)  # change the version, if asked by compiler
 2 | project(mlp)
 3 | 
 4 | set(CMAKE_CXX_STANDARD 14)
 5 | 
 6 | # include and link dirs of tensorrt, you need adapt them if yours are different
 7 | include_directories(/usr/include/x86_64-linux-gnu/)
 8 | link_directories(/usr/lib/x86_64-linux-gnu/)
 9 | 
10 | # include and link dirs of cuda for inference
11 | include_directories(/usr/local/cuda/include)
12 | link_directories(/usr/local/cuda/lib64)
13 | 
14 | # create link for executable files
15 | add_executable(mlp mlp.cpp)
16 | 
17 | # perform linking with nvinfer libraries
18 | target_link_libraries(mlp nvinfer)
19 | 
20 | # link with cuda libraries for Inference
21 | target_link_libraries(mlp cudart)
22 | 
23 | add_definitions(-O2 -pthread)
24 | 
25 | 


--------------------------------------------------------------------------------
/mlp/README.md:
--------------------------------------------------------------------------------
 1 | # MLP
 2 | 
 3 | MLP is the most basic net in this tensorrtx project for starters. You can learn the basic procedures of building
 4 | TensorRT app from the provided APIs. The process of building a TensorRT engine explained in the chart below.
 5 | 
 6 | ![TensorRT Image](https://user-images.githubusercontent.com/33795294/148565279-795b12da-5243-4e7e-881b-263eb7658683.jpg)
 7 | 
 8 | ## Helper Files
 9 | 
10 | `logging.h` : A logger file for using NVIDIA TRT API (mostly same for all models)
11 | 
12 | `mlp.wts` : Converted weight file (simple file, you can open and check it)
13 | 
14 | ## TensorRT C++ API
15 | 
16 | ```
17 | // 1. generate mlp.wts from https://github.com/wang-xinyu/pytorchx/tree/master/mlp -- or use the given .wts file
18 | 
19 | // 2. put mlp.wts into tensorrtx/mlp (if using the generated weights)
20 | 
21 | // 3. build and run
22 | 
23 |     cd tensorrtx/mlp
24 | 
25 |     mkdir build
26 | 
27 |     cd build
28 | 
29 |     cmake ..
30 | 
31 |     make
32 | 
33 |     sudo ./mlp -s   // serialize model to plan file i.e. 'mlp.engine'
34 | 
35 |     sudo ./mlp -d   // deserialize plan file and run inference
36 | ```
37 | 
38 | ## TensorRT Python API
39 | 
40 | ```
41 | # 1. Generate mlp.wts from https://github.com/wang-xinyu/pytorchx/tree/master/mlp -- or use the given .wts file
42 | 
43 | # 2. Put mlp.wts into tensorrtx/mlp (if using the generated weights)
44 | 
45 | # 3. Install Python dependencies (tensorrt/pycuda/numpy)
46 | 
47 | # 4. Run 
48 |     
49 |     cd tensorrtx/mlp
50 |     
51 |     python mlp.py -s   # serialize model to plan file, i.e. 'mlp.engine'
52 |     
53 |     python mlp.py -d   # deserialize plan file and run inference
54 | ```
55 | 
56 | ## Note
57 | It also supports the latest CUDA-11.4 and TensorRT-8.2.x
58 | 


--------------------------------------------------------------------------------
/mlp/mlp.wts:
--------------------------------------------------------------------------------
1 | 2
2 | linear.weight 1 3fff7e32
3 | linear.bias 1 3c138a5a
4 | 


--------------------------------------------------------------------------------
/mnasnet/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 2.6)
 2 | 
 3 | project(mnasnet)
 4 | 
 5 | add_definitions(-std=c++11)
 6 | 
 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
 8 | set(CMAKE_CXX_STANDARD 11)
 9 | set(CMAKE_BUILD_TYPE Debug)
10 | 
11 | include_directories(${PROJECT_SOURCE_DIR}/include)
12 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different
13 | # cuda
14 | include_directories(/usr/local/cuda/include)
15 | link_directories(/usr/local/cuda/lib64)
16 | # tensorrt
17 | include_directories(/usr/include/x86_64-linux-gnu/)
18 | link_directories(/usr/lib/x86_64-linux-gnu/)
19 | 
20 | add_executable(mnasnet ${PROJECT_SOURCE_DIR}/mnasnet.cpp)
21 | target_link_libraries(mnasnet nvinfer)
22 | target_link_libraries(mnasnet cudart)
23 | 
24 | add_definitions(-O2 -pthread)
25 | 
26 | 


--------------------------------------------------------------------------------
/mnasnet/README.md:
--------------------------------------------------------------------------------
 1 | # mnasnet
 2 | 
 3 | MNASNet with depth multiplier of 0.5 from 
 4 | "MnasNet: Platform-Aware Neural Architecture Search for Mobile"   <https://arxiv.org/pdf/1807.11626.pdf>
 5 | 
 6 | For the Pytorch implementation, you can refer to [pytorchx/mnasnet](https://github.com/wang-xinyu/pytorchx/tree/master/mnasnet)
 7 | 
 8 | Following tricks are used in this mnasnet, nothing special, group conv and batchnorm are used.
 9 | 
10 | - Batchnorm layer, implemented by scale layer.
11 | 
12 | ```
13 | // 1. generate mnasnet.wts from [pytorchx/mnasnet](https://github.com/wang-xinyu/pytorchx/tree/master/mnasnet)
14 | 
15 | // 2. put mnasnet.wts into tensorrtx/mnasnet
16 | 
17 | // 3. build and run
18 | 
19 | cd tensorrtx/mnasnet
20 | 
21 | mkdir build
22 | 
23 | cd build
24 | 
25 | cmake ..
26 | 
27 | make
28 | 
29 | sudo ./mnasnet -s   // serialize model to plan file i.e. 'mnasnet.engine'
30 | 
31 | sudo ./mnasnet -d   // deserialize plan file and run inference
32 | 
33 | // 4. see if the output is same as pytorchx/mnasnet
34 | ```
35 | 
36 | 
37 | 


--------------------------------------------------------------------------------
/mobilenet/mobilenetv2/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 2.6)
 2 | 
 3 | project(mobilenet)
 4 | 
 5 | add_definitions(-std=c++11)
 6 | 
 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
 8 | set(CMAKE_CXX_STANDARD 11)
 9 | set(CMAKE_BUILD_TYPE Debug)
10 | 
11 | include_directories(${PROJECT_SOURCE_DIR}/include)
12 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different
13 | # cuda
14 | include_directories(/usr/local/cuda/include)
15 | link_directories(/usr/local/cuda/lib64)
16 | # tensorrt
17 | include_directories(/usr/include/x86_64-linux-gnu/)
18 | link_directories(/usr/lib/x86_64-linux-gnu/)
19 | 
20 | add_executable(mobilenet ${PROJECT_SOURCE_DIR}/mobilenet_v2.cpp)
21 | target_link_libraries(mobilenet nvinfer)
22 | target_link_libraries(mobilenet cudart)
23 | 
24 | add_definitions(-O2 -pthread)
25 | 


--------------------------------------------------------------------------------
/mobilenet/mobilenetv2/README.md:
--------------------------------------------------------------------------------
 1 | # mobilenet v2
 2 | 
 3 | MobileNetV2 architecture from
 4 |      "MobileNetV2: Inverted Residuals and Linear Bottlenecks" <https://arxiv.org/abs/1801.04381>.
 5 | 
 6 | For the Pytorch implementation, you can refer to [pytorchx/mobilenet](https://github.com/wang-xinyu/pytorchx/tree/master/mobilenet)
 7 | 
 8 | Following tricks are used in this mobilenet,
 9 | 
10 | - Relu6 is used in mobilenet v2. We use `Relu6(x) = Relu(x) - Relu(x-6)` in tensorrt.
11 | - Batchnorm layer, implemented by scale layer.
12 | 
13 | ```
14 | // 1. generate mobilenet.wts from [pytorchx/mobilenet](https://github.com/wang-xinyu/pytorchx/tree/master/mobilenet)
15 | 
16 | // 2. put mobilenet.wts into tensorrtx/mobilenet
17 | 
18 | // 3. build and run
19 | 
20 | cd tensorrtx/mobilenet/mobilenetv2
21 | 
22 | mkdir build
23 | 
24 | cd build
25 | 
26 | cmake ..
27 | 
28 | make
29 | 
30 | sudo ./mobilenet -s   // serialize model to plan file i.e. 'mobilenet.engine'
31 | 
32 | sudo ./mobilenet -d   // deserialize plan file and run inference
33 | 
34 | // 4. see if the output is same as pytorchx/mobilenet
35 | ```
36 | 
37 | ### TensorRT Python API
38 | 
39 | ```
40 | # 1. generate mobilenetv2.wts from [pytorchx/mobilenet](https://github.com/wang-xinyu/pytorchx/tree/master/mobilenet)
41 | 
42 | # 2. put mobilenetv2.wts into tensorrtx/mobilenet/mobilenetv2
43 | 
44 | # 3. install Python dependencies (tensorrt/pycuda/numpy)
45 | 
46 | cd tensorrtx/mobilenet/mobilenetv2
47 | 
48 | python mobilenet_v2.py -s   // serialize model to plan file i.e. 'mobilenetv2.engine'
49 | python mobilenet_v2.py -d   // deserialize plan file and run inference
50 | 
51 | # 4. see if the output is same as pytorchx/mobilenet
52 | ```
53 | 


--------------------------------------------------------------------------------
/mobilenet/mobilenetv3/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 2.6)
 2 | 
 3 | project(mobilenetv3)
 4 | 
 5 | add_definitions(-std=c++11)
 6 | 
 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
 8 | set(CMAKE_CXX_STANDARD 11)
 9 | set(CMAKE_BUILD_TYPE Debug)
10 | 
11 | find_package(CUDA REQUIRED)
12 | 
13 | include_directories(${PROJECT_SOURCE_DIR}/include)
14 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different
15 | # cuda
16 | include_directories(/usr/local/cuda/include)
17 | link_directories(/usr/local/cuda/lib64)
18 | # tensorrt
19 | include_directories(/usr/include/x86_64-linux-gnu/)
20 | link_directories(/usr/lib/x86_64-linux-gnu/)
21 | 
22 | add_executable(mobilenetv3  ${PROJECT_SOURCE_DIR}/mobilenet_v3.cpp)
23 | target_link_libraries(mobilenetv3 nvinfer)
24 | target_link_libraries(mobilenetv3 cudart)
25 | 
26 | add_definitions(-O2 -pthread)
27 | 


--------------------------------------------------------------------------------
/mobilenet/mobilenetv3/README.md:
--------------------------------------------------------------------------------
 1 | # mobilenet v3
 2 | 
 3 | MobileNetV3 architecture from
 4 |      "Searching for MobileNetV3" <https://arxiv.org/abs/1905.02244?context=cs>.
 5 | 
 6 | For the Pytorch implementation, you can refer to [mobilenetv3.pytorch](https://github.com/chufei1995/mobilenetv3.pytorch)
 7 | 
 8 | ## Run
 9 | 
10 | 1. generate mbv3_small.wts/mbv3_large.wts from pytorch implementation
11 | 
12 | 2. put mbv3_small.wts/mbv3_large.wts into tensorrtx/mobilenet/mobilenetv3
13 | 
14 | 3. build and run
15 | 
16 | ```
17 | cd tensorrtx/mobilenet/mobilenetv3
18 | mkdir build
19 | cd build
20 | cmake ..
21 | make
22 | sudo ./mobilenetv3 -s small(or large) // serialize model to plan file i.e. 'mobilenetv3_small.engine'
23 | sudo ./mobilenetv3 -d small(or large)  // deserialize plan file and run inference
24 | ```
25 | 
26 | 4. see if the output is same as pytorch side
27 | 
28 | ### TensorRT Python API
29 | 
30 | ```
31 | # 1. generate mobilenetv3.wts from [mobilenetv3.pytorch](https://github.com/chufei1995/mobilenetv3.pytorch)
32 | 
33 | # 2. put mobilenetv3.wts into tensorrtx/mobilenet/mobilenetv3
34 | 
35 | # 3. install Python dependencies (tensorrt/pycuda/numpy)
36 | 
37 | cd tensorrtx/mobilenet/mobilenetv3
38 | 
39 | python mobilenet_v2.py -s small(or large)  // serialize model to plan file i.e. 'mobilenetv2.engine'
40 | python mobilenet_v2.py -d small(or large)  // deserialize plan file and run inference
41 | 
42 | ```
43 | 


--------------------------------------------------------------------------------
/psenet/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 2.6)
 2 | 
 3 | project(PSENet)
 4 | 
 5 | add_definitions(-std=c++11)
 6 | 
 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
 8 | set(CMAKE_CXX_STANDARD 11)
 9 | set(CMAKE_BUILD_TYPE Debug)
10 | 
11 | find_package(CUDA REQUIRED)
12 | 
13 | include_directories(${PROJECT_SOURCE_DIR}/include)
14 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different
15 | # cuda
16 | include_directories(/usr/local/cuda/include)
17 | link_directories(/usr/local/cuda/lib64)
18 | # tensorrt
19 | include_directories(/usr/include/x86_64-linux-gnu/)
20 | link_directories(/usr/lib/x86_64-linux-gnu/)
21 | 
22 | 
23 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")
24 | 
25 | 
26 | 
27 | find_package(OpenCV)
28 | include_directories(${OpenCV_INCLUDE_DIRS})
29 | 
30 | file(GLOB SOURCE_FILES "*.h" "*.cpp")
31 | 
32 | add_executable(psenet ${SOURCE_FILES})
33 | target_link_libraries(psenet nvinfer)
34 | target_link_libraries(psenet cudart)
35 | target_link_libraries(psenet ${OpenCV_LIBS})
36 | 
37 | add_definitions(-O2 -pthread)
38 | 
39 | 


--------------------------------------------------------------------------------
/psenet/README.md:
--------------------------------------------------------------------------------
 1 | # PSENet
 2 | 
 3 | **preprocessing + inference + postprocessing = 30ms** with fp32 on Tesla P40. 
 4 | The original Tensorflow implementation is [tensorflow_PSENet](https://github.com/liuheng92/tensorflow_PSENet). A TensorRT Python api implementation is [TensorRT-Python-PSENet](https://github.com/upczww/TensorRT-Python-PSENet).
 5 | 
 6 | ## Key Features
 7 | - Generating `.wts` from `Tensorflow`.
 8 | - Dynamic batch and dynamic shape input.
 9 | - Object-Oriented Programming.
10 | - Practice with C++ 11.
11 | 
12 | 
13 | <p align="center">
14 | <img src="https://user-images.githubusercontent.com/15235574/105487078-821d6800-5cea-11eb-87dc-e3317a941763.jpeg">
15 | </p>
16 | 
17 | ## How to Run
18 | 
19 | * 1. generate .wts
20 | 
21 |   Download pretrained model from https://github.com/liuheng92/tensorflow_PSENet
22 |   and put `model.ckpt.*` to `model` dir. Add a file `model/checkpoint` with content
23 |     ```
24 |     model_checkpoint_path: "model.ckpt"
25 |     all_model_checkpoint_paths: "model.ckpt"
26 |     ```
27 |     Then run
28 | 
29 |     ```
30 |     python gen_tf_wts.py
31 |     ```
32 |     which will gengerate a `psenet.wts`.
33 | * 2. cmake and make
34 | 
35 |   ```
36 |   mkdir build
37 |   cd build
38 |   cmake ..
39 |   make
40 |   ```
41 | * 3. build engine and run detection
42 |   ```
43 |   cp ../psenet.wts ./
44 |   cp ../test.jpg ./
45 |   ./psenet -s  // serialize model to plan file
46 |   ./psenet -d  // deserialize plan file and run inference
47 |   ```
48 | 
49 | ## Known Issues
50 | None
51 | 
52 | ## Todo
53 | 
54 | * use `ExponentialMovingAverage` weight.
55 | 


--------------------------------------------------------------------------------
/psenet/gen_tf_wts.py:
--------------------------------------------------------------------------------
 1 | from sys import prefix
 2 | import tensorflow as tf
 3 | from tensorflow.python import pywrap_tensorflow
 4 | import numpy as np
 5 | import struct
 6 | 
 7 | model_dir = "model"
 8 | 
 9 | ckpt = tf.train.get_checkpoint_state(model_dir)
10 | ckpt_path = ckpt.model_checkpoint_path
11 | 
12 | reader = pywrap_tensorflow.NewCheckpointReader(ckpt_path)
13 | param_dict = reader.get_variable_to_shape_map()
14 | 
15 | 
16 | f = open(r"psenet.wts", "w")
17 | keys = param_dict.keys()
18 | f.write("{}\n".format(len(keys)))
19 | 
20 | for key in keys:
21 |     weight = reader.get_tensor(key)
22 |     print(key, weight.shape)
23 |     if len(weight.shape) == 4:
24 |         weight = np.transpose(weight, (3, 2, 0, 1))
25 |         print(weight.shape)
26 |     weight = np.reshape(weight, -1)
27 |     f.write("{} {} ".format(key, len(weight)))
28 |     for w in weight:
29 |         f.write(" ")
30 |         f.write(struct.pack(">f", float(w)).hex())
31 |     f.write("\n")


--------------------------------------------------------------------------------
/psenet/layers.h:
--------------------------------------------------------------------------------
 1 | #ifndef TENSORRTX_LAYERS_H
 2 | #define TENSORRTX_LAYERS_H
 3 | 
 4 | #include <map>
 5 | #include <math.h>
 6 | #include <assert.h>
 7 | 
 8 | #include "NvInfer.h"
 9 | #include "cuda_runtime_api.h"
10 | using namespace nvinfer1;
11 | 
12 | IScaleLayer *addBatchNorm2d(INetworkDefinition *network, std::map<std::string, Weights> &weightMap, ITensor &input, std::string lname, float eps);
13 | 
14 | IActivationLayer *bottleneck(INetworkDefinition *network, std::map<std::string, Weights> &weightMap, ITensor &input, int ch, int stride, std::string lname, int branch_type);
15 | 
16 | IActivationLayer *addConvRelu(INetworkDefinition *network, std::map<std::string, Weights> &weightMap, ITensor &input, int outch, int kernel, int stride, std::string lname);
17 | 
18 | #endif
19 | 


--------------------------------------------------------------------------------
/psenet/main.cpp:
--------------------------------------------------------------------------------
 1 | #include "psenet.h"
 2 | 
 3 | int main(int argc, char** argv)
 4 | {
 5 |     PSENet psenet(1200, 640, 0.90, 6, 4);
 6 | 
 7 |     if (argc == 2 && std::string(argv[1]) == "-s")
 8 |     {
 9 |         std::cout << "Serializling Engine" << std::endl;
10 |         psenet.serializeEngine();
11 |         return 0;
12 |     }
13 |     else if (argc == 2 && std::string(argv[1]) == "-d")
14 |     {
15 |         psenet.init();
16 |         std::vector<std::string> files;
17 |         for (int i = 0; i < 10; i++)
18 |             files.emplace_back("test.jpg");
19 |         for (auto file : files)
20 |         {
21 |             std::cout << "Detect " << file << std::endl;
22 |             psenet.detect(file);
23 |         }
24 | 
25 |         return 0;
26 |     }
27 |     else
28 |     {
29 |         std::cerr << "arguments not right!" << std::endl;
30 |         std::cerr << "./psenet -s  // serialize model to plan file" << std::endl;
31 |         std::cerr << "./psenet -d  // deserialize plan file and run inference" << std::endl;
32 |         return -1;
33 |     }
34 | }
35 | 


--------------------------------------------------------------------------------
/psenet/psenet.h:
--------------------------------------------------------------------------------
 1 | #ifndef TENSORRTX_PSENET_H
 2 | #define TENSORRTX_PSENET_H
 3 | #include <memory>
 4 | #include <vector>
 5 | #include <chrono>
 6 | #include <opencv2/opencv.hpp>
 7 | #include "utils.h"
 8 | #include "layers.h"
 9 | class PSENet
10 | {
11 | public:
12 | 	PSENet(int max_side_len, int min_side_len, float threshold, int num_kernel, int stride);
13 | 	~PSENet();
14 | 
15 | 	ICudaEngine* createEngine(IBuilder* builder, IBuilderConfig* config);
16 | 	void serializeEngine();
17 | 	void deserializeEngine();
18 | 	void init();
19 | 	void inferenceOnce(IExecutionContext& context, float* input, float* output, int input_h, int input_w);
20 | 	void detect(std::string image_path);
21 | 	float* preProcess(cv::Mat image, int& resize_h, int& resize_w, float& ratio_h, float& ratio_w);
22 | 	std::vector<cv::RotatedRect> postProcess(float* origin_output, int resize_h, int resize_w);
23 | 
24 | private:
25 | 	Logger gLogger;
26 | 	std::shared_ptr<nvinfer1::IRuntime> mRuntime;
27 | 	std::shared_ptr<nvinfer1::ICudaEngine> mCudaEngine;
28 | 	std::shared_ptr<nvinfer1::IExecutionContext> mContext;
29 | 	DataType dt = DataType::kFLOAT;
30 | 	const char* input_name_ = "input";
31 | 	const char* output_name_ = "output";
32 | 	int max_side_len_ = 1024;
33 | 	int min_side_len_ = 640;
34 | 	float post_threshold_ = 0.9;
35 | 	int num_kernels_ = 6;
36 | 	int stride_ = 4;
37 | };
38 | 
39 | #endif // TENSORRTX_PSENET_H
40 | 


--------------------------------------------------------------------------------
/psenet/test.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wang-xinyu/tensorrtx/c6bcebd812907a6d9eb66ecc909eb17b51d40fac/psenet/test.jpg


--------------------------------------------------------------------------------
/rcnn/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.1)
 2 | 
 3 | project(rcnn)
 4 | 
 5 | add_definitions(-std=c++14)
 6 | 
 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
 8 | set(CMAKE_CXX_STANDARD 14)
 9 | set(CMAKE_BUILD_TYPE Debug)
10 | 
11 | set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};--extended-lambda)
12 | 
13 | find_package(CUDA REQUIRED)
14 | 
15 | include_directories(${PROJECT_SOURCE_DIR}/include)
16 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different
17 | # cuda
18 | include_directories(/usr/local/cuda/include)
19 | link_directories(/usr/local/cuda/lib64)
20 | # tensorrt
21 | include_directories(/home/jushi/TensorRT-8.2.1.6/include)
22 | link_directories(/home/jushi/TensorRT-8.2.1.6/lib)
23 | 
24 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")
25 | 
26 | cuda_add_library(myplugins SHARED ${PROJECT_SOURCE_DIR}/BatchedNms.cu ${PROJECT_SOURCE_DIR}/PredictorDecode.cu ${PROJECT_SOURCE_DIR}/RoiAlign.cu ${PROJECT_SOURCE_DIR}/RpnDecode.cu ${PROJECT_SOURCE_DIR}/RpnNms.cu ${PROJECT_SOURCE_DIR}/MaskRcnnInference.cu)
27 | target_link_libraries(myplugins nvinfer cudart)
28 | 
29 | find_package(OpenCV)
30 | include_directories(${OpenCV_INCLUDE_DIRS})
31 | 
32 | add_executable(rcnn ${PROJECT_SOURCE_DIR}/rcnn.cpp)
33 | target_link_libraries(rcnn nvinfer)
34 | target_link_libraries(rcnn cudart)
35 | target_link_libraries(rcnn myplugins)
36 | target_link_libraries(rcnn ${OpenCV_LIBS})
37 | 
38 | add_definitions(-O2 -pthread)
39 | 
40 | 


--------------------------------------------------------------------------------
/rcnn/cuda_utils.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <cuda_runtime_api.h>
 4 | #include <stdexcept>
 5 | #include <cstdint>
 6 | 
 7 | #define CUDA_ALIGN 256
 8 | 
 9 | template <typename T>
10 | inline size_t get_size_aligned(size_t num_elem) {
11 |     size_t size = num_elem * sizeof(T);
12 |     size_t extra_align = 0;
13 |     if (size % CUDA_ALIGN != 0) {
14 |         extra_align = CUDA_ALIGN - size % CUDA_ALIGN;
15 |     }
16 |     return size + extra_align;
17 | }
18 | 
19 | template <typename T>
20 | inline T *get_next_ptr(size_t num_elem, void *&workspace, size_t &workspace_size) {
21 |     size_t size = get_size_aligned<T>(num_elem);
22 |     if (size > workspace_size) {
23 |         throw std::runtime_error("Workspace is too small!");
24 |     }
25 |     workspace_size -= size;
26 |     T *ptr = reinterpret_cast<T *>(workspace);
27 |     workspace = reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(workspace) + size);
28 |     return ptr;
29 | }
30 | 
31 | #ifndef CUDA_CHECK
32 | #define CUDA_CHECK(callstr)\
33 |     {\
34 |         cudaError_t error_code = callstr;\
35 |         if (error_code != cudaSuccess) {\
36 |             std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__;\
37 |             assert(0);\
38 |         }\
39 |     }
40 | #endif  // CUDA_CHECK
41 | 


--------------------------------------------------------------------------------
/rcnn/macros.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <NvInfer.h>
 4 | #include <cuda.h>
 5 | 
 6 | #if CUDA_VERSION >=11000
 7 | #define CUDA_11
 8 | #endif
 9 | 
10 | #if NV_TENSORRT_MAJOR >= 8
11 | #define TRT_NOEXCEPT noexcept
12 | #define TRT_CONST_ENQUEUE const
13 | #else
14 | #define TRT_NOEXCEPT
15 | #define TRT_CONST_ENQUEUE
16 | #endif
17 | 


--------------------------------------------------------------------------------
/real-esrgan/general-x4v3/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.16)
 2 | project(real-esrgan)
 3 | 
 4 | set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/")
 5 | 
 6 | add_definitions(-std=c++17)
 7 | add_definitions(-DAPI_EXPORTS)
 8 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
 9 | #set(CMAKE_CXX_STANDARD 11)
10 | set(CMAKE_BUILD_TYPE Debug)
11 | 
12 | #find_package(CUDA REQUIRED)
13 | 
14 | INCLUDE_DIRECTORIES(${PROJECT_SOURCE_DIR}/src/include)
15 | 
16 | # cuda
17 | FIND_PACKAGE(CUDA REQUIRED)
18 | #INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS})
19 | include_directories(/usr/local/cuda/include)
20 | link_directories(/usr/local/cuda/lib64)
21 | 
22 | # <------------------------TensorRT Related------------------------->
23 | include_directories(YOUR_TENSORRT_INCLUDE_DIR) # TensorRT-8.6.1.6/include
24 | link_directories(YOUR_TENSORRT_LIB_DIR) # TensorRT-8.6.1.6/lib
25 | 
26 | # <------------------------OpenCV Related------------------------->
27 | # opencv
28 | FIND_PACKAGE(OpenCV REQUIRED)
29 | INCLUDE_DIRECTORIES(${OpenCV_INCLUDE_DIRS})
30 | 
31 | set(CMAKE_CXX_STANDARD 17)
32 | 
33 | add_executable(${PROJECT_NAME} main.cpp)
34 | 
35 | cuda_add_library(myplugins SHARED ${PROJECT_SOURCE_DIR}/src/pixel_shuffle/pixel_shuffle.cu)
36 | target_link_libraries(myplugins nvinfer cudart)
37 | 
38 | 
39 | TARGET_LINK_LIBRARIES(${PROJECT_NAME} nvinfer)
40 | TARGET_LINK_LIBRARIES(${PROJECT_NAME} cudart)
41 | TARGET_LINK_LIBRARIES(${PROJECT_NAME} ${OpenCV_LIBS})
42 | TARGET_LINK_LIBRARIES(${PROJECT_NAME} myplugins)
43 | 


--------------------------------------------------------------------------------
/real-esrgan/general-x4v3/README.md:
--------------------------------------------------------------------------------
 1 | # Real-ESRGAN realesr-general-x4v3 model
 2 | 
 3 | ## How to Run
 4 | 0. Replace YOUR_TENSORRT_INCLUDE_DIR and YOUR_TENSORRT_LIB_DIR in CMakeLists.txt with your TensorRT include and lib directories.
 5 | 1. generate .wts from pytorch with .pt
 6 | ```
 7 | git clone https://github.com/xinntao/Real-ESRGAN.git
 8 | cd Real-ESRGAN
 9 | 
10 | # Install basicsr - https://github.com/xinntao/BasicSR
11 | # We use BasicSR for both training and inference
12 | pip install basicsr
13 | # facexlib and gfpgan are for face enhancement
14 | pip install facexlib
15 | pip install gfpgan
16 | pip install -r requirements.txt
17 | python setup.py develop
18 | ```
19 | download realesr-general-x4v3.pth (and realesr-general-wdn-x4v3.pth if needed) from
20 | https://github.com/xinntao/Real-ESRGAN/releases
21 | 
22 | ```
23 | cp {tensorrtx}/real-esrgan-general-x4v3/gen_wts.py {xinntao}/Real-ESRGAN
24 | cd {xinntao}/Real-ESRGAN
25 | python gen_wts.py
26 | // a file 'real-esrgan.wts' will be generated.
27 | ```
28 | 
29 | **Be aware that if you need both realesr-general-x4v3.pth and realesr-general-wdn-x4v3.pth, please write a Python script to average all weights of realesr-general-x4v3.pth and realesr-general-wdn-x4v3.pth (from {xinntao}/Real-ESRGAN), then save it as a .pth file, and use this new file to generate a .wts file.**
30 | 
31 | 2. build tensorrtx/real-esrgan-general-x4v3 and run
32 | 
33 | ```
34 | cd {tensorrtx}/real-esrgan-general-x4v3/
35 | mkdir build
36 | cd build
37 | cp {xinntao}/Real-ESRGAN/real-esrgan.wts {tensorrtx}/real-esrgan/weights/
38 | cmake ..
39 | make
40 | ./real-esrgan your_images_dir
41 | ```
42 | 


--------------------------------------------------------------------------------
/real-esrgan/general-x4v3/src/include/config/config.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef REAL_ESRGAN_TRT_CONFIG_HPP
 2 | #define REAL_ESRGAN_TRT_CONFIG_HPP
 3 | 
 4 | #include <string>
 5 | 
 6 | //std::string INPUT_BLOB_NAME = "input";
 7 | //std::string OUTPUT_BLOB_NAME = "output";
 8 | 
 9 | const char* INPUT_BLOB_NAME = "input_0";
10 | const char* OUTPUT_BLOB_NAME = "output_0";
11 | 
12 | const bool USE_FP16 = false;
13 | 
14 | static const int BATCH_SIZE = 1;
15 | static const int INPUT_C = 3;
16 | static const int INPUT_H = 450;
17 | static const int INPUT_W = 300;
18 | static const int OUT_SCALE = 4;
19 | //static const int OUTPUT_SIZE = INPUT_C * INPUT_H * OUT_SCALE * INPUT_W * OUT_SCALE;
20 | static const int OUTPUT_SIZE = BATCH_SIZE * 48 * 450 * 300;
21 | //INPUT_C * INPUT_H * OUT_SCALE * INPUT_W * OUT_SCALE;
22 | #endif  //REAL_ESRGAN_TRT_CONFIG_HPP
23 | 


--------------------------------------------------------------------------------
/real-esrgan/general-x4v3/src/include/cuda_utils.h:
--------------------------------------------------------------------------------
 1 | #ifndef TRTX_CUDA_UTILS_H_
 2 | #define TRTX_CUDA_UTILS_H_
 3 | 
 4 | #include <cuda_runtime_api.h>
 5 | #include <stdint.h>
 6 | #include <cstdio>
 7 | #include <iostream>
 8 | #include <vector>
 9 | 
10 | #ifndef CUDA_CHECK
11 | #define CUDA_CHECK(callstr)                                                                    \
12 |     {                                                                                          \
13 |         cudaError_t error_code = callstr;                                                      \
14 |         if (error_code != cudaSuccess) {                                                       \
15 |             std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \
16 |             assert(0);                                                                         \
17 |         }                                                                                      \
18 |     }
19 | #endif  // CUDA_CHECK
20 | 
21 | #endif  // TRTX_CUDA_UTILS_H_
22 | 


--------------------------------------------------------------------------------
/real-esrgan/general-x4v3/src/include/preprocess/preprocess.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef REAL_ESRGAN_TRT_PREPROCESS_HPP
 2 | #define REAL_ESRGAN_TRT_PREPROCESS_HPP
 3 | 
 4 | struct PreprocessStruct {
 5 |     int N;
 6 |     int C;
 7 |     int H;
 8 |     int W;
 9 | };
10 | 
11 | #endif  //REAL_ESRGAN_TRT_PREPROCESS_HPP
12 | 


--------------------------------------------------------------------------------
/real-esrgan/x4plus/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 2.6)
 2 | 
 3 | project(real-esrgan)
 4 | 
 5 | add_definitions(-std=c++11)
 6 | add_definitions(-DAPI_EXPORTS)
 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
 8 | set(CMAKE_CXX_STANDARD 11)
 9 | set(CMAKE_BUILD_TYPE Debug)
10 | 
11 | find_package(CUDA REQUIRED)
12 | 
13 | if(WIN32)
14 | enable_language(CUDA)
15 | endif(WIN32)
16 | 
17 | include_directories(${PROJECT_SOURCE_DIR}/include)
18 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different
19 | # cuda
20 | include_directories(/usr/local/cuda/include)
21 | link_directories(/usr/local/cuda/lib64)
22 | # tensorrt
23 | include_directories(/usr/include/x86_64-linux-gnu/)
24 | link_directories(/usr/lib/x86_64-linux-gnu/)
25 | 
26 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -g -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")
27 | cuda_add_library(myplugins SHARED preprocess.cu postprocess.cu)
28 | target_link_libraries(myplugins nvinfer cudart)
29 | 
30 | find_package(OpenCV)
31 | include_directories(${OpenCV_INCLUDE_DIRS})
32 | 
33 | cuda_add_executable(real-esrgan real-esrgan.cpp)
34 | 
35 | target_link_libraries(real-esrgan nvinfer)
36 | target_link_libraries(real-esrgan cudart)
37 | target_link_libraries(real-esrgan myplugins)
38 | target_link_libraries(real-esrgan ${OpenCV_LIBS})
39 | 
40 | if(UNIX)
41 | add_definitions(-O2 -pthread)
42 | endif(UNIX)
43 | 
44 | 
45 | 


--------------------------------------------------------------------------------
/real-esrgan/x4plus/cuda_utils.h:
--------------------------------------------------------------------------------
 1 | #ifndef TRTX_CUDA_UTILS_H_
 2 | #define TRTX_CUDA_UTILS_H_
 3 | 
 4 | #include <cuda_runtime_api.h>
 5 | #include <stdint.h>
 6 | #include <cstdio>
 7 | #include <vector>
 8 | #include <iostream>
 9 | 
10 | #ifndef CUDA_CHECK
11 | #define CUDA_CHECK(callstr)\
12 |     {\
13 |         cudaError_t error_code = callstr;\
14 |         if (error_code != cudaSuccess) {\
15 |             std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__;\
16 |             assert(0);\
17 |         }\
18 |     } 
19 | #endif  // CUDA_CHECK
20 | 
21 | #endif  // TRTX_CUDA_UTILS_H_
22 | 
23 | 


--------------------------------------------------------------------------------
/real-esrgan/x4plus/macros.h:
--------------------------------------------------------------------------------
 1 | #ifndef __MACROS_H
 2 | #define __MACROS_H
 3 | 
 4 | #ifdef API_EXPORTS
 5 | #if defined(_MSC_VER)
 6 | #define API __declspec(dllexport)
 7 | #else
 8 | #define API __attribute__((visibility("default")))
 9 | #endif
10 | #else
11 | 
12 | #if defined(_MSC_VER)
13 | #define API __declspec(dllimport)
14 | #else
15 | #define API
16 | #endif
17 | #endif  // API_EXPORTS
18 | 
19 | #if NV_TENSORRT_MAJOR >= 8
20 | #define TRT_NOEXCEPT noexcept
21 | #define TRT_CONST_ENQUEUE const
22 | #else
23 | #define TRT_NOEXCEPT
24 | #define TRT_CONST_ENQUEUE
25 | #endif
26 | 
27 | #endif  // __MACROS_H
28 | 


--------------------------------------------------------------------------------
/real-esrgan/x4plus/utils.h:
--------------------------------------------------------------------------------
 1 | #ifndef TRTX_REAL_ESRGAN_UTILS_H_
 2 | #define TRTX_REAL_ESRGAN_UTILS_H_
 3 | 
 4 | #include <dirent.h>
 5 | #include <opencv2/opencv.hpp>
 6 | 
 7 | static inline int read_files_in_dir(const char *p_dir_name, std::vector<std::string> &file_names) {
 8 |     DIR *p_dir = opendir(p_dir_name);
 9 |     if (p_dir == nullptr) {
10 |         return -1;
11 |     }
12 | 
13 |     struct dirent* p_file = nullptr;
14 |     while ((p_file = readdir(p_dir)) != nullptr) {
15 |         if (strcmp(p_file->d_name, ".") != 0 &&
16 |             strcmp(p_file->d_name, "..") != 0) {
17 |             //std::string cur_file_name(p_dir_name);
18 |             //cur_file_name += "/";
19 |             //cur_file_name += p_file->d_name;
20 |             std::string cur_file_name(p_file->d_name);
21 |             file_names.push_back(cur_file_name);
22 |         }
23 |     }
24 | 
25 |     closedir(p_dir);
26 |     return 0;
27 | }
28 | 
29 | #endif  // TRTX_REAL_ESRGAN_UTILS_H_
30 | 
31 | 


--------------------------------------------------------------------------------
/refinedet/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 2.6)
 2 | 
 3 | project(refinedet)
 4 | 
 5 | add_definitions(-std=c++11)
 6 | 
 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
 8 | set(CMAKE_CXX_STANDARD 11)
 9 | set(CMAKE_BUILD_TYPE Debug)
10 | 
11 | # tensorrt
12 | include_directories(/data_2/tensorrt/TensorRT-7.0.0.11/include/) #include_directories(/usr/include/x86_64-linux-gnu/)
13 | link_directories(/data_2/tensorrt/TensorRT-7.0.0.11/lib/) #link_directories(/usr/lib/x86_64-linux-gnu/)
14 | 
15 | 
16 | find_package(CUDA REQUIRED)
17 | 
18 | include_directories(${PROJECT_SOURCE_DIR}/include)
19 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different
20 | # cuda
21 | include_directories(/usr/local/cuda/include)
22 | link_directories(/usr/local/cuda/lib64)
23 | 
24 | #find_package(OpenCV)
25 | #include_directories(OpenCV_INCLUDE_DIRS)
26 | 
27 | include_directories(/home/software_install/opencv3.4.6/include)
28 | link_directories(/home/software_install/opencv3.4.6/lib)
29 | 
30 | 
31 | set(CMAKE_PREFIX_PATH "/data_1/torch1.1.0") ###torch1.1.0
32 | find_package(Torch REQUIRED)
33 | 
34 | include_directories(/data_1/torch1.1.0/include)
35 | link_directories(/data_1/torch1.1.0/lib)
36 | 
37 | 
38 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")
39 | 
40 | 
41 | add_executable(refinedet ${PROJECT_SOURCE_DIR}/calibrator.cpp ${PROJECT_SOURCE_DIR}/refinedet.cpp)
42 | target_link_libraries(refinedet nvinfer)
43 | target_link_libraries(refinedet cudart)
44 | target_link_libraries(refinedet "${TORCH_LIBRARIES}")
45 | target_link_libraries(refinedet opencv_calib3d opencv_core opencv_dnn opencv_imgproc opencv_highgui opencv_imgcodecs caffe2)
46 | 
47 | add_definitions(-O2 -pthread)
48 | 
49 | 


--------------------------------------------------------------------------------
/refinedet/README.md:
--------------------------------------------------------------------------------
 1 | # RefineDet
 2 | 
 3 | For the Pytorch implementation, you can refer to [luuuyi/RefineDet.PyTorch](https://github.com/luuuyi/RefineDet.PyTorch)
 4 | 
 5 | ## How to run
 6 | 
 7 | ```
 8 | 1. generate wts file. from pytorch
 9 | python gen_wts_refinedet.py
10 | // a file 'refinedet.wts' will be generated.
11 | 
12 | 2. build tensorrtx/RefineDet and run or Using clion to open a project(recommend)
13 | Configuration file in configure.h
14 | You need configure your own paths and modes(SERIALIZE or INFER)
15 | Detailed information reference configure.h
16 | mkdir build
17 | cd build
18 | cmake ..
19 | make
20 | ```
21 | 
22 | ## dependence
23 | 
24 | ```
25 | TensorRT7.0.0.11 
26 | OpenCV >= 3.4
27 | libtorch >=1.1.0
28 | ```
29 | 
30 | ## feature
31 | 
32 | 1.tensorrt Multi output  
33 | 2.L2norm  
34 | 3.Postprocessing with libtorch
35 | 
36 | ## More Information
37 | 
38 | See the readme in [home page.](https://github.com/wang-xinyu/tensorrtx)  
39 | [tensorrt tutorials](https://github.com/wang-xinyu/tensorrtx/tree/master/tutorials)  
40 | For more detailed guidance, see [yhl blog](https://www.cnblogs.com/yanghailin/p/14525128.html)
41 | 
42 | 


--------------------------------------------------------------------------------
/refinedet/calibrator.h:
--------------------------------------------------------------------------------
 1 | #ifndef ENTROPY_CALIBRATOR_H
 2 | #define ENTROPY_CALIBRATOR_H
 3 | 
 4 | #include "NvInfer.h"
 5 | #include <string>
 6 | #include <vector>
 7 | 
 8 | //! \class Int8EntropyCalibrator2
 9 | //!
10 | //! \brief Implements Entropy calibrator 2.
11 | //!  CalibrationAlgoType is kENTROPY_CALIBRATION_2.
12 | //!
13 | class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2
14 | {
15 | public:
16 |     Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, const char* input_blob_name, bool read_cache = true);
17 | 
18 |     virtual ~Int8EntropyCalibrator2();
19 |     int getBatchSize() const override;
20 |     bool getBatch(void* bindings[], const char* names[], int nbBindings) override;
21 |     const void* readCalibrationCache(size_t& length) override;
22 |     void writeCalibrationCache(const void* cache, size_t length) override;
23 | 
24 | private:
25 |     int batchsize_;
26 |     int input_w_;
27 |     int input_h_;
28 |     int img_idx_;
29 |     std::string img_dir_;
30 |     std::vector<std::string> img_files_;
31 |     size_t input_count_;
32 |     std::string calib_table_name_;
33 |     const char* input_blob_name_;
34 |     bool read_cache_;
35 |     void* device_input_;
36 |     std::vector<char> calib_cache_;
37 | };
38 | 
39 | #endif // ENTROPY_CALIBRATOR_H
40 | 


--------------------------------------------------------------------------------
/refinedet/gen_wts_refinedet.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn as nn
 3 | import struct
 4 | from models.refinedet import build_refinedet
 5 | 
 6 | 
 7 | 
 8 | num_classes = 25
 9 | path_model = "/data_2/project_2021/pytorch_refinedet/2021/20210308.pth"
10 | path_save_wts = "./refinedet0312.wts"
11 | input_size = 320
12 | 
13 | net = build_refinedet('test', input_size, num_classes)  # initialize net
14 | net.load_state_dict(torch.load(path_model))
15 | net.eval()
16 | 
17 | 
18 | f = open(path_save_wts, 'w')
19 | f.write('{}\n'.format(len(net.state_dict().keys())))
20 | for k, v in net.state_dict().items():
21 |     vr = v.reshape(-1).cpu().numpy()
22 |     f.write('{} {} '.format(k, len(vr)))
23 |     for vv in vr:
24 |         f.write(' ')
25 |         f.write(struct.pack('>f',float(vv)).hex())
26 |     f.write('\n')
27 | 
28 | print("success generate wts!")


--------------------------------------------------------------------------------
/repvgg/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 2.6)
 2 | 
 3 | project(repvgg)
 4 | 
 5 | add_definitions(-std=c++11)
 6 | 
 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
 8 | set(CMAKE_CXX_STANDARD 11)
 9 | set(CMAKE_BUILD_TYPE Debug)
10 | 
11 | include_directories(${PROJECT_SOURCE_DIR}/include)
12 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different
13 | # cuda
14 | include_directories(/usr/local/cuda/include)
15 | link_directories(/usr/local/cuda/lib64)
16 | # tensorrt
17 | include_directories(/usr/include/x86_64-linux-gnu/)
18 | link_directories(/usr/lib/x86_64-linux-gnu/)
19 | 
20 | add_executable(repvgg ${PROJECT_SOURCE_DIR}/repvgg.cpp)
21 | target_link_libraries(repvgg nvinfer)
22 | target_link_libraries(repvgg cudart)
23 | 
24 | 
25 | add_definitions(-O2 -pthread)
26 | 
27 | 


--------------------------------------------------------------------------------
/repvgg/README.md:
--------------------------------------------------------------------------------
 1 | # RepVGG
 2 | 
 3 | RepVGG models from
 4 | "RepVGG: Making VGG-style ConvNets Great Again" <https://arxiv.org/pdf/2101.03697.pdf>
 5 | 
 6 | For the Pytorch implementation, you can refer to [DingXiaoH/RepVGG](https://github.com/DingXiaoH/RepVGG)
 7 | 
 8 | # How to run
 9 | 
10 | 1. generate wts file.
11 | 
12 | ```
13 | git clone https://github.com/DingXiaoH/RepVGG.git
14 | cd ReoVGG
15 | ```
16 | 
17 | You may convert a trained model into the inference-time structure with
18 | 
19 | ```
20 | python convert.py [weights file of the training-time model to load] [path to save] -a [model name]
21 | ```
22 | 
23 | For example,
24 | 
25 | ```
26 | python convert.py RepVGG-B2-train.pth RepVGG-B2-deploy.pth -a RepVGG-B2
27 | ```
28 | 
29 | Then copy `gen_wts.py` to `RepVGG` and generate .wts file, for example
30 | 
31 | ```
32 | python gen_wts.py -w RepVGG-B2-deploy.pth -s RepVGG-B2.wts
33 | ```
34 | 
35 | 2. build and run
36 | ```
37 | cd tensorrtx/repvgg
38 | 
39 | mkdir build
40 | 
41 | cd build
42 | 
43 | cmake ..
44 | 
45 | make
46 | 
47 | sudo ./repvgg -s RepVGG-B2  // serialize model to plan file i.e. 'RepVGG-B2.engine'
48 | sudo ./repvgg -d RepVGG-B2  // deserialize plan file and run inference
49 | ```
50 | 
51 | 


--------------------------------------------------------------------------------
/repvgg/gen_wts.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import struct
 3 | 
 4 | import torch
 5 | 
 6 | 
 7 | def main(args):
 8 |     # Load model
 9 |     state_dict = torch.load(args.weight)
10 |     with open(args.save_path, "w") as f:
11 |         f.write("{}\n".format(len(state_dict.keys())))
12 |         for k, v in state_dict.items():
13 |             vr = v.reshape(-1).cpu().numpy()
14 |             f.write("{} {} ".format(k, len(vr)))
15 |             for vv in vr:
16 |                 f.write(" ")
17 |                 f.write(struct.pack(">f", float(vv)).hex())
18 |             f.write("\n")
19 | 
20 | 
21 | if __name__ == "__main__":
22 |     parser = argparse.ArgumentParser()
23 |     parser.add_argument(
24 |         "-w",
25 |         "--weight",
26 |         type=str,
27 |         required=True,
28 |         help="RepVGG model weight path",
29 |     )
30 |     parser.add_argument(
31 |         "-s",
32 |         "--save_path",
33 |         type=str,
34 |         required=True,
35 |         help="generated wts path",
36 |     )
37 |     args = parser.parse_args()
38 |     main(args)


--------------------------------------------------------------------------------
/repvgg/logging.h:
--------------------------------------------------------------------------------
 1 | #ifndef TENSORRT_LOGGING_H
 2 | #define TENSORRT_LOGGING_H
 3 | 
 4 | #include "NvInferRuntimeCommon.h"
 5 | #include <cassert>
 6 | #include <iostream>
 7 | 
 8 | // Logger for TensorRT info/warning/errors
 9 | class Logger : public nvinfer1::ILogger
10 | {
11 | public:
12 |     Logger() : Logger(Severity::kINFO) {}
13 | 
14 |     Logger(Severity severity) : reportableSeverity(severity) {}
15 | 
16 |     void log(Severity severity, const char *msg) override
17 |     {
18 |         // suppress messages with severity enum value greater than the reportable
19 |         if (severity > reportableSeverity)
20 |             return;
21 | 
22 |         switch (severity)
23 |         {
24 |         case Severity::kINTERNAL_ERROR:
25 |             std::cerr << "INTERNAL_ERROR: ";
26 |             break;
27 |         case Severity::kERROR:
28 |             std::cerr << "ERROR: ";
29 |             break;
30 |         case Severity::kWARNING:
31 |             std::cerr << "WARNING: ";
32 |             break;
33 |         case Severity::kINFO:
34 |             std::cerr << "INFO: ";
35 |             break;
36 |         default:
37 |             std::cerr << "UNKNOWN: ";
38 |             break;
39 |         }
40 |         std::cerr << msg << std::endl;
41 |     }
42 | 
43 |     Severity reportableSeverity{Severity::kWARNING};
44 | };
45 | 
46 | #endif // TENSORRT_LOGGING_H
47 | 


--------------------------------------------------------------------------------
/resnet/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 2.6)
 2 | 
 3 | project(resnet)
 4 | 
 5 | add_definitions(-std=c++11)
 6 | 
 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
 8 | set(CMAKE_CXX_STANDARD 11)
 9 | set(CMAKE_BUILD_TYPE Debug)
10 | 
11 | include_directories(${PROJECT_SOURCE_DIR}/include)
12 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different
13 | # cuda
14 | include_directories(/usr/local/cuda/include)
15 | link_directories(/usr/local/cuda/lib64)
16 | # tensorrt
17 | include_directories(/usr/include/x86_64-linux-gnu/)
18 | link_directories(/usr/lib/x86_64-linux-gnu/)
19 | 
20 | add_executable(resnet18 ${PROJECT_SOURCE_DIR}/resnet18.cpp)
21 | target_link_libraries(resnet18 nvinfer)
22 | target_link_libraries(resnet18 cudart)
23 | 
24 | add_executable(resnet34 ${PROJECT_SOURCE_DIR}/resnet34.cpp)
25 | target_link_libraries(resnet34 nvinfer)
26 | target_link_libraries(resnet34 cudart)
27 | 
28 | add_executable(resnet50 ${PROJECT_SOURCE_DIR}/resnet50.cpp)
29 | target_link_libraries(resnet50 nvinfer)
30 | target_link_libraries(resnet50 cudart)
31 | 
32 | add_executable(resnext50 ${PROJECT_SOURCE_DIR}/resnext50_32x4d.cpp)
33 | target_link_libraries(resnext50 nvinfer)
34 | target_link_libraries(resnext50 cudart)
35 | 
36 | add_executable(wideresnet50 ${PROJECT_SOURCE_DIR}/wideresnet50.cpp)
37 | target_link_libraries(wideresnet50 nvinfer)
38 | target_link_libraries(wideresnet50 cudart)
39 | 
40 | add_definitions(-O2 -pthread)
41 | 
42 | 


--------------------------------------------------------------------------------
/retinaface/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 2.6)
 2 | 
 3 | project(retinaface)
 4 | 
 5 | add_definitions(-std=c++11)
 6 | 
 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
 8 | set(CMAKE_CXX_STANDARD 11)
 9 | set(CMAKE_BUILD_TYPE Debug)
10 | 
11 | find_package(CUDA REQUIRED)
12 | 
13 | if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
14 |     message("embed_platform on")
15 |     include_directories(/usr/local/cuda/targets/aarch64-linux/include)
16 |     link_directories(/usr/local/cuda/targets/aarch64-linux/lib)
17 | else()
18 |     message("embed_platform off")
19 |     include_directories(/usr/local/cuda/include)
20 |     link_directories(/usr/local/cuda/lib64)
21 | endif()
22 | 
23 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")
24 | 
25 | cuda_add_library(decodeplugin SHARED ${PROJECT_SOURCE_DIR}/decode.cu)
26 | target_link_libraries(decodeplugin nvinfer cudart)
27 | 
28 | find_package(OpenCV)
29 | include_directories(${OpenCV_INCLUDE_DIRS})
30 | 
31 | add_executable(retina_r50 ${PROJECT_SOURCE_DIR}/calibrator.cpp ${PROJECT_SOURCE_DIR}/retina_r50.cpp)
32 | target_link_libraries(retina_r50 nvinfer)
33 | target_link_libraries(retina_r50 cudart)
34 | target_link_libraries(retina_r50 decodeplugin)
35 | target_link_libraries(retina_r50 ${OpenCV_LIBRARIES})
36 | 
37 | add_executable(retina_mnet ${PROJECT_SOURCE_DIR}/calibrator.cpp ${PROJECT_SOURCE_DIR}/retina_mnet.cpp)
38 | target_link_libraries(retina_mnet nvinfer)
39 | target_link_libraries(retina_mnet cudart)
40 | target_link_libraries(retina_mnet decodeplugin)
41 | target_link_libraries(retina_mnet ${OpenCV_LIBRARIES})
42 | 
43 | add_definitions(-O2 -pthread)
44 | 
45 | 


--------------------------------------------------------------------------------
/retinaface/calibrator.h:
--------------------------------------------------------------------------------
 1 | #ifndef ENTROPY_CALIBRATOR_H
 2 | #define ENTROPY_CALIBRATOR_H
 3 | 
 4 | #include "NvInfer.h"
 5 | #include <string>
 6 | #include <vector>
 7 | #include "macros.h"
 8 | 
 9 | //! \class Int8EntropyCalibrator2
10 | //!
11 | //! \brief Implements Entropy calibrator 2.
12 | //!  CalibrationAlgoType is kENTROPY_CALIBRATION_2.
13 | //!
14 | class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2
15 | {
16 | public:
17 |     Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, const char* input_blob_name, bool read_cache = true);
18 | 
19 |     virtual ~Int8EntropyCalibrator2();
20 |     int getBatchSize() const TRT_NOEXCEPT override;
21 |     bool getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT override;
22 |     const void* readCalibrationCache(size_t& length) TRT_NOEXCEPT override;
23 |     void writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT override;
24 | 
25 | private:
26 |     int batchsize_;
27 |     int input_w_;
28 |     int input_h_;
29 |     int img_idx_;
30 |     std::string img_dir_;
31 |     std::vector<std::string> img_files_;
32 |     size_t input_count_;
33 |     std::string calib_table_name_;
34 |     const char* input_blob_name_;
35 |     bool read_cache_;
36 |     void* device_input_;
37 |     std::vector<char> calib_cache_;
38 | };
39 | 
40 | #endif // ENTROPY_CALIBRATOR_H
41 | 


--------------------------------------------------------------------------------
/retinaface/macros.h:
--------------------------------------------------------------------------------
 1 | #ifndef __MACROS_H
 2 | #define __MACROS_H
 3 | 
 4 | #if NV_TENSORRT_MAJOR >= 8
 5 | #define TRT_NOEXCEPT noexcept
 6 | #define TRT_CONST_ENQUEUE const
 7 | #else
 8 | #define TRT_NOEXCEPT
 9 | #define TRT_CONST_ENQUEUE
10 | #endif
11 | 
12 | #endif  // __MACROS_H
13 | 


--------------------------------------------------------------------------------
/retinafaceAntiCov/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 2.6)
 2 | 
 3 | project(retinafaceAntiCov)
 4 | 
 5 | add_definitions(-std=c++11)
 6 | 
 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
 8 | set(CMAKE_CXX_STANDARD 11)
 9 | set(CMAKE_BUILD_TYPE Debug)
10 | 
11 | find_package(CUDA REQUIRED)
12 | 
13 | 
14 | include_directories(${PROJECT_SOURCE_DIR}/include)
15 | if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
16 |     message("embed_platform on")
17 |     include_directories(/usr/local/cuda/targets/aarch64-linux/include)
18 |     link_directories(/usr/local/cuda/targets/aarch64-linux/lib)
19 | else()
20 |     message("embed_platform off")
21 |     # cuda
22 |     include_directories(/usr/local/cuda/include)
23 |     link_directories(/usr/local/cuda/lib64)
24 | 
25 |     # tensorrt
26 |     include_directories(/home/lindsay/TensorRT-8.6.1.6/include)
27 |     link_directories(/home/lindsay/TensorRT-8.6.1.6/lib)
28 |     #  include_directories(/home/lindsay/TensorRT-7.2.3.4/include)
29 |     #  link_directories(/home/lindsay/TensorRT-7.2.3.4/lib)
30 | 
31 | 
32 | endif()
33 | 
34 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")
35 | 
36 | cuda_add_library(myplugins SHARED ${PROJECT_SOURCE_DIR}/decode.cu)
37 | 
38 | find_package(OpenCV)
39 | include_directories(${OpenCV_INCLUDE_DIRS})
40 | 
41 | add_executable(retinafaceAntiCov ${PROJECT_SOURCE_DIR}/retinafaceAntiCov.cpp)
42 | target_link_libraries(retinafaceAntiCov nvinfer)
43 | target_link_libraries(retinafaceAntiCov cudart)
44 | target_link_libraries(retinafaceAntiCov myplugins)
45 | target_link_libraries(retinafaceAntiCov ${OpenCV_LIBS})
46 | 
47 | add_definitions(-O2 -pthread)
48 | 
49 | 


--------------------------------------------------------------------------------
/retinafaceAntiCov/gen_wts.py:
--------------------------------------------------------------------------------
 1 | import struct
 2 | from retinaface_cov import RetinaFaceCoV
 3 | 
 4 | gpuid = 0
 5 | model = RetinaFaceCoV('./cov2/mnet_cov2', 0, gpuid, 'net3l')
 6 | 
 7 | f = open('retinafaceAntiCov.wts', 'w')
 8 | f.write('{}\n'.format(len(model.model.get_params()[0].keys()) + len(model.model.get_params()[1].keys())))
 9 | for k, v in model.model.get_params()[0].items():
10 |     vr = v.reshape(-1).asnumpy()
11 |     f.write('{} {} '.format(k, len(vr)))
12 |     for vv in vr:
13 |         f.write(' ')
14 |         f.write(struct.pack('>f',float(vv)).hex())
15 |     f.write('\n')
16 | for k, v in model.model.get_params()[1].items():
17 |     vr = v.reshape(-1).asnumpy()
18 |     f.write('{} {} '.format(k, len(vr)))
19 |     for vv in vr:
20 |         f.write(' ')
21 |         f.write(struct.pack('>f',float(vv)).hex())
22 |     f.write('\n')
23 | 
24 | 


--------------------------------------------------------------------------------
/retinafaceAntiCov/macros.h:
--------------------------------------------------------------------------------
 1 | #ifndef __MACROS_H
 2 | #define __MACROS_H
 3 | 
 4 | #if NV_TENSORRT_MAJOR >= 8
 5 | #define TRT_NOEXCEPT noexcept
 6 | #define TRT_CONST_ENQUEUE const
 7 | #else
 8 | #define TRT_NOEXCEPT
 9 | #define TRT_CONST_ENQUEUE
10 | #endif
11 | 
12 | #endif  // __MACROS_H


--------------------------------------------------------------------------------
/scaled-yolov4/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 2.6)
 2 | 
 3 | project(yolov4)
 4 | 
 5 | add_definitions(-std=c++11)
 6 | 
 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
 8 | set(CMAKE_CXX_STANDARD 11)
 9 | set(CMAKE_BUILD_TYPE Debug)
10 | 
11 | find_package(CUDA REQUIRED)
12 | 
13 | include_directories(${PROJECT_SOURCE_DIR}/include)
14 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different
15 | # cuda
16 | include_directories(/usr/local/cuda/include)
17 | link_directories(/usr/local/cuda/lib64)
18 | # tensorrt
19 | include_directories(/usr/include/x86_64-linux-gnu/)
20 | link_directories(/usr/lib/x86_64-linux-gnu/)
21 | 
22 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")
23 | 
24 | cuda_add_library(myplugins SHARED ${PROJECT_SOURCE_DIR}/yololayer.cu ${PROJECT_SOURCE_DIR}/mish.cu)
25 | target_link_libraries(myplugins nvinfer cudart)
26 | 
27 | find_package(OpenCV)
28 | include_directories(${OpenCV_INCLUDE_DIRS})
29 | 
30 | add_executable(yolov4csp ${PROJECT_SOURCE_DIR}/yolov4_csp.cpp)
31 | target_link_libraries(yolov4csp nvinfer)
32 | target_link_libraries(yolov4csp cudart)
33 | target_link_libraries(yolov4csp myplugins)
34 | target_link_libraries(yolov4csp ${OpenCV_LIBS})
35 | 
36 | add_definitions(-O2 -pthread)
37 | 
38 | 


--------------------------------------------------------------------------------
/scaled-yolov4/gen_wts.py:
--------------------------------------------------------------------------------
 1 | import struct
 2 | import sys
 3 | from models.models import *
 4 | from utils import *
 5 | 
 6 | model = Darknet('models/yolov4-csp.cfg', (512, 512))
 7 | weights = sys.argv[1]
 8 | device = torch_utils.select_device('0')
 9 | if weights.endswith('.pt'):  # pytorch format
10 |     model.load_state_dict(torch.load(weights, map_location=device)['model'])
11 | else:  # darknet format
12 |     load_darknet_weights(model, weights)
13 | 
14 | with open('yolov4_csp.wts', 'w') as f:
15 |     f.write('{}\n'.format(len(model.state_dict().keys())))
16 |     for k, v in model.state_dict().items():
17 |         vr = v.reshape(-1).cpu().numpy()
18 |         f.write('{} {} '.format(k, len(vr)))
19 |         for vv in vr:
20 |             f.write(' ')
21 |             f.write(struct.pack('>f',float(vv)).hex())
22 |         f.write('\n')
23 | 
24 | 


--------------------------------------------------------------------------------
/scaled-yolov4/utils.h:
--------------------------------------------------------------------------------
 1 | #ifndef __TRT_UTILS_H_
 2 | #define __TRT_UTILS_H_
 3 | 
 4 | #include <iostream>
 5 | #include <vector>
 6 | #include <algorithm>
 7 | #include <cudnn.h>
 8 | 
 9 | #ifndef CUDA_CHECK
10 | 
11 | #define CUDA_CHECK(callstr)                                                                    \
12 |     {                                                                                          \
13 |         cudaError_t error_code = callstr;                                                      \
14 |         if (error_code != cudaSuccess) {                                                       \
15 |             std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \
16 |             assert(0);                                                                         \
17 |         }                                                                                      \
18 |     }
19 | 
20 | #endif
21 | 
22 | namespace Tn
23 | {
24 |     template<typename T> 
25 |     void write(char*& buffer, const T& val)
26 |     {
27 |         *reinterpret_cast<T*>(buffer) = val;
28 |         buffer += sizeof(T);
29 |     }
30 | 
31 |     template<typename T> 
32 |     void read(const char*& buffer, T& val)
33 |     {
34 |         val = *reinterpret_cast<const T*>(buffer);
35 |         buffer += sizeof(T);
36 |     }
37 | }
38 | 
39 | #endif


--------------------------------------------------------------------------------
/senet/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 2.6)
 2 | 
 3 | project(senet)
 4 | 
 5 | add_definitions(-std=c++11)
 6 | 
 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
 8 | set(CMAKE_CXX_STANDARD 11)
 9 | set(CMAKE_BUILD_TYPE Debug)
10 | 
11 | include_directories(${PROJECT_SOURCE_DIR}/include)
12 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different
13 | # cuda
14 | include_directories(/usr/local/cuda/include)
15 | link_directories(/usr/local/cuda/lib64)
16 | # tensorrt
17 | include_directories(/usr/include/x86_64-linux-gnu/)
18 | link_directories(/usr/lib/x86_64-linux-gnu/)
19 | 
20 | add_executable(se_resnet ${PROJECT_SOURCE_DIR}/se_resnet50.cpp)
21 | target_link_libraries(se_resnet nvinfer)
22 | target_link_libraries(se_resnet cudart)
23 | 
24 | add_definitions(-O2 -pthread)
25 | 
26 | 


--------------------------------------------------------------------------------
/senet/README.md:
--------------------------------------------------------------------------------
 1 | # SENet
 2 | 
 3 | An implementation of SENet, proposed in Squeeze-and-Excitation Networks by Jie Hu, Li Shen, Samuel Albanie, Gang Sun, Enhua Wu
 4 | 
 5 | [https://arxiv.org/abs/1709.01507](https://arxiv.org/abs/1709.01507)
 6 | 
 7 | For the Pytorch implementation, you can refer to [wang-xinyu/senet.pytorch](https://github.com/wang-xinyu/senet.pytorch), which is forked from [moskomule/senet.pytorch](https://github.com/moskomule/senet.pytorch).
 8 | 
 9 | 
10 | ```
11 | // 1. generate se_resnet50.wts from [wang-xinyu/senet.pytorch](https://github.com/wang-xinyu/senet.pytorch)
12 | 
13 | // 2. put se_resnet50.wts into tensorrtx/senet
14 | 
15 | // 3. build and run
16 | 
17 | cd tensorrtx/senet
18 | 
19 | mkdir build
20 | 
21 | cd build
22 | 
23 | cmake ..
24 | 
25 | make
26 | 
27 | sudo ./se_resnet -s   // serialize model to plan file i.e. 'se_resnet50.engine'
28 | 
29 | sudo ./se_resnet -d   // deserialize plan file and run inference
30 | 
31 | // 4. see if the output is same as [wang-xinyu/senet.pytorch]
32 | ```
33 | 
34 | 


--------------------------------------------------------------------------------
/shufflenetv2/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 2.6)
 2 | 
 3 | project(shufflenet)
 4 | 
 5 | add_definitions(-std=c++11)
 6 | 
 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
 8 | set(CMAKE_CXX_STANDARD 11)
 9 | set(CMAKE_BUILD_TYPE Debug)
10 | 
11 | include_directories(${PROJECT_SOURCE_DIR}/include)
12 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different
13 | # cuda
14 | include_directories(/usr/local/cuda/include)
15 | link_directories(/usr/local/cuda/lib64)
16 | # tensorrt
17 | include_directories(/usr/include/x86_64-linux-gnu/)
18 | link_directories(/usr/lib/x86_64-linux-gnu/)
19 | 
20 | add_executable(shufflenet ${PROJECT_SOURCE_DIR}/shufflenet_v2.cpp)
21 | target_link_libraries(shufflenet nvinfer)
22 | target_link_libraries(shufflenet cudart)
23 | 
24 | add_definitions(-O2 -pthread)
25 | 
26 | 


--------------------------------------------------------------------------------
/shufflenetv2/README.md:
--------------------------------------------------------------------------------
 1 | # shufflenet v2
 2 | 
 3 | ShuffleNetV2 with 0.5x output channels, as described in
 4 |  "ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design"
 5 |             <https://arxiv.org/abs/1807.11164>
 6 | 
 7 | For the Pytorch implementation, you can refer to [pytorchx/shufflenet](https://github.com/wang-xinyu/pytorchx/tree/master/shufflenet)
 8 | 
 9 | Following tricks are used in this shufflenet,
10 | 
11 | - `torch.chunk` is used in shufflenet v2. We implemented the 'chunk(2, dim=C)' by tensorrt plugin. Which is the simplest plugin in this tensorrtx project. You can learn the basic procedures of build tensorrt plugin.
12 | - shuffle layer is used, the `channel_shuffle()` in pytorchx/shufflenet can be implemented by two shuffle layers in tensorrt.
13 | - Batchnorm layer, implemented by scale layer.
14 | 
15 | ```
16 | // 1. generate shufflenet.wts from [pytorchx/shufflenet](https://github.com/wang-xinyu/pytorchx/tree/master/shufflenet)
17 | 
18 | // 2. put shufflenet.wts into tensorrtx/shufflenet
19 | 
20 | // 3. build and run
21 | 
22 | cd tensorrtx/shufflenet
23 | 
24 | mkdir build
25 | 
26 | cd build
27 | 
28 | cmake ..
29 | 
30 | make
31 | 
32 | sudo ./shufflenet -s   // serialize model to plan file i.e. 'shufflenet.engine'
33 | sudo ./shufflenet -d   // deserialize plan file and run inference
34 | 
35 | // 4. see if the output is same as pytorchx/shufflenet
36 | ```
37 | 
38 | 
39 | 


--------------------------------------------------------------------------------
/squeezenet/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 2.6)
 2 | 
 3 | project(squeezenet)
 4 | 
 5 | add_definitions(-std=c++11)
 6 | 
 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
 8 | set(CMAKE_CXX_STANDARD 11)
 9 | set(CMAKE_BUILD_TYPE Debug)
10 | 
11 | include_directories(${PROJECT_SOURCE_DIR}/include)
12 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different
13 | # cuda
14 | include_directories(/usr/local/cuda/include)
15 | link_directories(/usr/local/cuda/lib64)
16 | # tensorrt
17 | include_directories(/usr/include/x86_64-linux-gnu/)
18 | link_directories(/usr/lib/x86_64-linux-gnu/)
19 | 
20 | add_executable(squeezenet ${PROJECT_SOURCE_DIR}/squeezenet.cpp)
21 | target_link_libraries(squeezenet nvinfer)
22 | target_link_libraries(squeezenet cudart)
23 | 
24 | add_definitions(-O2 -pthread)
25 | 
26 | 


--------------------------------------------------------------------------------
/squeezenet/README.md:
--------------------------------------------------------------------------------
 1 | # squeezenet v1.1
 2 | 
 3 | SqueezeNet 1.1 model from the official SqueezeNet repo
 4 |     <https://github.com/DeepScale/SqueezeNet/tree/master/SqueezeNet_v1.1>
 5 | 
 6 | SqueezeNet 1.1 has 2.4x less computation and slightly fewer parameters
 7 |     than SqueezeNet 1.0, without sacrificing accuracy.
 8 | 
 9 | For the Pytorch implementation, you can refer to [pytorchx/squeezenet](https://github.com/wang-xinyu/pytorchx/tree/master/squeezenet)
10 | 
11 | ```
12 | // 1. generate squeezenet.wts from [pytorchx/squeezenet](https://github.com/wang-xinyu/pytorchx/tree/master/squeezenet)
13 | 
14 | // 2. put squeezenet.wts into tensorrtx/squeezenet
15 | 
16 | // 3. build and run
17 | 
18 | cd tensorrtx/squeezenet
19 | 
20 | mkdir build
21 | 
22 | cd build
23 | 
24 | cmake ..
25 | 
26 | make
27 | 
28 | sudo ./squeezenet -s   // serialize model to plan file i.e. 'squeezenet.engine'
29 | sudo ./squeezenet -d   // deserialize plan file and run inference
30 | 
31 | // 4. see if the output is same as pytorchx/squeezenet
32 | ```
33 | 
34 | 


--------------------------------------------------------------------------------
/superpoint/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 2.6)
 2 | 
 3 | project(SuperPointNet)
 4 | 
 5 | add_definitions(-std=c++11)
 6 | 
 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
 8 | set(CMAKE_CXX_STANDARD 11)
 9 | set(CMAKE_BUILD_TYPE Debug)
10 | 
11 | find_package(CUDA REQUIRED)
12 | 
13 | include_directories(${PROJECT_SOURCE_DIR}/include)
14 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different
15 | # cuda
16 | include_directories(/usr/local/cuda/include)
17 | link_directories(/usr/local/cuda/lib64)
18 | # tensorrt
19 | include_directories(/usr/include/x86_64-linux-gnu/)
20 | link_directories(/usr/lib/x86_64-linux-gnu/)
21 | 
22 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -pthread -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")
23 | 
24 | find_package(OpenCV)
25 | include_directories(${OpenCV_INCLUDE_DIRS})
26 | 
27 | add_executable(supernet ${PROJECT_SOURCE_DIR}/supernet.cpp ${PROJECT_SOURCE_DIR}/utils.cpp)
28 | target_link_libraries(supernet nvinfer)
29 | target_link_libraries(supernet cudart)
30 | target_link_libraries(supernet ${OpenCV_LIBS})
31 | 
32 | add_definitions(-O2 -pthread)


--------------------------------------------------------------------------------
/superpoint/gen_wts.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import struct
 3 | from model import SuperPointNet
 4 | 
 5 | model_name = "superpoint_v1"
 6 | 
 7 | net = SuperPointNet()
 8 | net.load_state_dict(torch.load("superpoint_v1.pth"))
 9 | net = net.cuda()
10 | net.eval()
11 | 
12 | f = open(model_name + ".wts", "w")
13 | f.write("{}\n".format(len(net.state_dict().keys())))
14 | for k, v in net.state_dict().items():
15 |     vr = v.reshape(-1).cpu().numpy()
16 |     f.write("{} {}".format(k, len(vr)))
17 |     for vv in vr:
18 |         f.write(" ")
19 |         f.write(struct.pack(">f", float(vv)).hex())
20 |     f.write("\n")


--------------------------------------------------------------------------------
/superpoint/utils.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <map>
 4 | #include "NvInfer.h"
 5 | #include "cuda_runtime_api.h"
 6 | #include "assert.h"
 7 | #include <fstream>
 8 | #include <iostream>
 9 | #include <memory>
10 | #include <vector>
11 | #include <opencv2/opencv.hpp>
12 | 
13 | 
14 | using namespace nvinfer1;
15 | 
16 | #define CHECK(status)                             \
17 |     do                                            \
18 |     {                                             \
19 |         auto ret = (status);                      \
20 |         if (ret != 0)                             \
21 |         {                                         \
22 |             std::cout << "Cuda failure: " << ret; \
23 |             abort();                              \
24 |         }                                         \
25 |     } while (0)
26 | 
27 | 
28 | int read_files_in_dir(const char *p_dir_name, std::vector<std::string> &file_names);
29 | std::map<std::string, Weights> loadWeights(const std::string file);
30 | void tokenize(const std::string &str, std::vector<std::string> &tokens, const std::string &delimiters = ",");


--------------------------------------------------------------------------------
/swin-transformer/semantic-segmentation/CMakeLists.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wang-xinyu/tensorrtx/c6bcebd812907a6d9eb66ecc909eb17b51d40fac/swin-transformer/semantic-segmentation/CMakeLists.txt


--------------------------------------------------------------------------------
/swin-transformer/semantic-segmentation/README.md:
--------------------------------------------------------------------------------
 1 | # Swin Transform - Semantic Segmentation
 2 | 
 3 | The Pytorch implementation is [microsoft/Swin-Transformer](https://github.com/microsoft/Swin-Transformer.git).
 4 | 
 5 | Only support Swin-T, welcome the PR for other backbones.
 6 | 
 7 | ## Authors
 8 | 
 9 | <a href="https://github.com/wdhao"><img src="https://avatars.githubusercontent.com/u/58798355?v=4?s=48" width="40px;" alt=""/></a> 
10 | <a href="https://github.com/wang-xinyu"><img src="https://avatars.githubusercontent.com/u/15235574?s=48&v=4" width="40px;" alt=""/></a> 
11 | 
12 | ## How to Run
13 | 
14 | 1. generate .wts from pytorch with .pt, or download .wts from model zoo
15 | 
16 | ```
17 | git clone https://github.com/microsoft/Swin-Transformer.git
18 | git clone https://github.com/wang-xinyu/tensorrtx.git
19 | 
20 | python gen_wts.py Swin-Transform.pt
21 | // a file 'Swin-Transform.wts' will be generated.
22 | ```
23 | 
24 | 2. build tensorrtx/swin-transform and run
25 | 
26 | ```
27 | cd {tensorrtx}/swin-transform/semantic-segmentation/
28 | mkdir build
29 | cd build
30 | cp {microsoft}/Swin-Transformer/Swin-Transform.wts {tensorrtx}/swin-transformer/semantic-segmentation/build
31 | cmake ..
32 | make
33 | sudo ./swintransformer -s [.wts] [.engine]   // serialize model to plan file
34 | sudo ./swintransformer -d [.engine] [image folder]  // deserialize and run inference, the images in [image folder] will be processed.
35 | 
36 | ```
37 | 
38 | ## More Information
39 | 
40 | See the readme in [home page.](https://github.com/wang-xinyu/tensorrtx)
41 | 
42 | 


--------------------------------------------------------------------------------
/swin-transformer/semantic-segmentation/UpsmapleKernel.h:
--------------------------------------------------------------------------------
 1 | #ifndef UPSAMPLE_KERNEL_H
 2 | #define UPSAMPLE_KERNEL_H
 3 | 
 4 | #include <iostream>
 5 | #include "NvInfer.h"
 6 | 
 7 | int UpsampleInference(
 8 |     cudaStream_t stream,
 9 |     int n,
10 |     int input_b,
11 |     int input_c,
12 |     int input_h,
13 |     int input_w,
14 |     float scale_h,
15 |     float scale_w,
16 |     const void* inputs,
17 |     void* outputs);
18 | 
19 | 
20 | #endif
21 | 


--------------------------------------------------------------------------------
/swin-transformer/semantic-segmentation/gen_wts.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import struct
 3 | import sys
 4 | 
 5 | # Initialize
 6 | pt_file = sys.argv[1]
 7 | # Load model
 8 | model = torch.load(pt_file, map_location=torch.device('cpu'))['model'].float()  # load to FP32
 9 | model.to(device).eval()
10 | 
11 | with open(pt_file.split('.')[0] + '.wts', 'w') as f:
12 |     f.write('{}\n'.format(len(model.state_dict().keys())))
13 |     for k, v in model.state_dict().items():
14 |         vr = v.reshape(-1).cpu().numpy()
15 |         f.write('{} {} '.format(k, len(vr)))
16 |         for vv in vr:
17 |             f.write(' ')
18 |             f.write(struct.pack('>f',float(vv)).hex())
19 |         f.write('\n')
20 | 


--------------------------------------------------------------------------------
/swin-transformer/semantic-segmentation/main.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | 
 3 | 
 4 | using namespace std;
 5 | 
 6 | 
 7 | 
 8 | 
 9 | 
10 | 


--------------------------------------------------------------------------------
/swin-transformer/semantic-segmentation/myhpp.h:
--------------------------------------------------------------------------------
 1 | #ifndef MYHPP_H
 2 | #define MYHPP_H
 3 | 
 4 | #include <assert.h>
 5 | #include <iostream>
 6 | #include<vector>
 7 | #include<map>
 8 | #define _USE_MATH_DEFINES
 9 | #include <math.h>
10 | #include <cmath>
11 | #include<string>
12 | #include<fstream>
13 | #include<streambuf>
14 | #include<ctime>
15 | #include<chrono>
16 | #include<iomanip>
17 | #include<cuda_runtime.h>
18 | #include<opencv2/core/core.hpp>
19 | #include<opencv2/imgproc/imgproc.hpp>
20 | #include<opencv2/imgcodecs/imgcodecs.hpp>
21 | #include<opencv2/dnn/dnn.hpp>
22 | //#include <opencv2/highgui/highgui.hpp>
23 | #include<stdio.h>
24 | #include<cuda.h>
25 | //#include <cudnn.h>
26 | #include <cublas_v2.h>
27 | #include<driver_types.h>
28 | #include<NvInfer.h>
29 | #include<NvInferPlugin.h>
30 | #include<NvOnnxParser.h>
31 | #include<NvOnnxConfig.h>
32 | #include<cstdint>
33 | 
34 | 
35 | 
36 | #endif // MYHPP_H
37 | 


--------------------------------------------------------------------------------
/tsm/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 2.6)
 2 | 
 3 | project(TSM)
 4 | 
 5 | add_definitions(-std=c++11)
 6 | 
 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
 8 | set(CMAKE_CXX_STANDARD 11)
 9 | set(CMAKE_BUILD_TYPE Debug)
10 | 
11 | include_directories(${PROJECT_SOURCE_DIR}/include)
12 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different
13 | # cuda
14 | include_directories(/usr/local/cuda/include)
15 | link_directories(/usr/local/cuda/lib64)
16 | 
17 | # tensorrt
18 | include_directories(/home/ubuntu/TensorRT/include/)
19 | link_directories(/home/ubuntu/TensorRT/lib/)
20 | 
21 | add_executable(tsm_r50 ${PROJECT_SOURCE_DIR}/tsm_r50.cpp)
22 | target_link_libraries(tsm_r50 nvinfer)
23 | target_link_libraries(tsm_r50 cudart)
24 | 
25 | add_definitions(-O2 -pthread)
26 | 


--------------------------------------------------------------------------------
/tsm/mmaction2_tsm_r50_config.py:
--------------------------------------------------------------------------------
 1 | # model settings
 2 | model = dict(
 3 |     type='Recognizer2D',
 4 |     backbone=dict(
 5 |         type='ResNetTSM',
 6 |         pretrained='torchvision://resnet50',
 7 |         depth=50,
 8 |         norm_eval=False,
 9 |         shift_div=8),
10 |     cls_head=dict(
11 |         type='TSMHead',
12 |         num_classes=400,
13 |         in_channels=2048,
14 |         spatial_type='avg',
15 |         consensus=dict(type='AvgConsensus', dim=1),
16 |         dropout_ratio=0.5,
17 |         init_std=0.001,
18 |         is_shift=True),
19 |     # model training and testing settings
20 |     train_cfg=None,
21 |     test_cfg=dict(average_clips='prob'))
22 | 


--------------------------------------------------------------------------------
/tutorials/check_fp16_int8_support.md:
--------------------------------------------------------------------------------
 1 | # Check if Your GPU Supports FP16/INT8
 2 | 
 3 | ## 1. check your GPU Compute Capability
 4 | 
 5 | visit https://developer.nvidia.com/cuda-gpus#compute and check your GPU compute capability.
 6 | 
 7 | For example, GTX1080 is 6.1, Tesla T4 is 7.5.
 8 | 
 9 | ## 2. check the hardware-precision-matrix
10 | 
11 | visit https://docs.nvidia.com/deeplearning/tensorrt/support-matrix/index.html#hardware-precision-matrix and check the matrix.
12 | 
13 | For example, compute capability 6.1 supports FP32 and INT8. 7.5 supports FP32, FP16, INT8, FP16 tensor core, etc.
14 | 
15 | 


--------------------------------------------------------------------------------
/tutorials/contribution.md:
--------------------------------------------------------------------------------
 1 | # How to make contribution
 2 | 
 3 | 1. Fork this repo to your github account
 4 | 
 5 | 2. Clone your fork
 6 | 
 7 | 3. Create a feature branch
 8 | 
 9 | 4. Make changes, including but not limited to create new model, bug fix, documentation, tutorials, etc.
10 | 
11 | 5. Pre-commit check and push, we use clang-format to do coding style checking, and the coding style is following google c++ coding style with 4-space.
12 | 
13 | ```
14 | pip install pre-commit
15 | pip install clang-format
16 | 
17 | cd tensorrtx/
18 | git add [files-to-commit]
19 | pre-commit run
20 | 
21 | # fix pre-commit errors, then git add files-to-commit again
22 | git add [files-to-commit]
23 | 
24 | git commit -m "describe your commit"
25 | 
26 | git push origin [feature-branch]
27 | ```
28 | 
29 | 6. Submit a pull-request on github web UI to master branch of wang-xinyu/tensorrtx.
30 | 


--------------------------------------------------------------------------------
/tutorials/migrating_from_tensorrt_4_to_7.md:
--------------------------------------------------------------------------------
 1 | # Migrating from TensorRT 4 to 7
 2 | 
 3 | The following APIs are deprecated and replaced in TensorRT 7.
 4 | 
 5 | - `DimsCHW`, replaced by `Dims3`
 6 | - `addConvolution()`, replaced by `addConvolutionNd()`
 7 | - `addPooling()`, replaced by `addPoolingNd()`
 8 | - `addDeconvolution()`, replaced by `addDeconvolutionNd()`
 9 | - `createNetwork()`, replaced by `createNetworkV2()`
10 | - `buildCudaEngine()`, replaced by `buildEngineWithConfig()`
11 | - `createPReLUPlugin()`, replaced by `addActivation()` with `ActivationType::kLEAKY_RELU`
12 | - `IPlugin` and `IPluginExt` class, replaced by `IPluginV2IOExt` or `IPluginV2DynamicExt`
13 | - Use the new `Logger` class defined in logging.h
14 | 


--------------------------------------------------------------------------------
/ufld/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 2.6)
 2 | 
 3 | project(lane_det)
 4 | 
 5 | add_definitions(-std=c++11)
 6 | 
 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
 8 | set(CMAKE_CXX_STANDARD 11)
 9 | set(CMAKE_BUILD_TYPE Debug)
10 | 
11 | # cuda directory
12 | include_directories(/usr/local/cuda/include/)
13 | link_directories(/usr/local/cuda/lib64/)
14 | 
15 | # tensorrt
16 | #include_directories(/workspace/TensorRT-7.2.3.4/include/)
17 | #link_directories(/workspace/TensorRT-7.2.3.4/lib/)
18 | 
19 | 
20 | find_package(OpenCV)
21 | include_directories(${OpenCV_INCLUDE_DIRS})
22 | 
23 | add_executable(lane_det ${PROJECT_SOURCE_DIR}/lane_det.cpp)
24 | target_link_libraries(lane_det nvinfer)
25 | target_link_libraries(lane_det cudart)
26 | target_link_libraries(lane_det ${OpenCV_LIBS})
27 | 
28 | add_definitions(-O2 -pthread)
29 | 
30 | 


--------------------------------------------------------------------------------
/ufld/README.md:
--------------------------------------------------------------------------------
 1 | # Ultra-Fast-Lane-Detection(UFLD)
 2 | 
 3 | The Pytorch implementation is [Ultra-Fast-Lane-Detection](https://github.com/cfzd/Ultra-Fast-Lane-Detection).
 4 | 
 5 | ## How to Run
 6 | ```
 7 | 1. generate lane.wts and lane.onnx from pytorch with tusimple_18.pth
 8 | 
 9 | git clone https://github.com/wang-xinyu/tensorrtx.git
10 | git clone https://github.com/cfzd/Ultra-Fast-Lane-Detection.git
11 | // download its weights 'tusimple_18.pth'
12 | // copy tensorrtx/ufld/gen_wts.py into Ultra-Fast-Lane-Detection/
13 | // ensure the file name is tusimple_18.pth and lane.wts in gen_wts.py
14 | // go to Ultra-Fast-Lane-Detection
15 | python gen_wts.py
16 | // a file 'lane.wts' will be generated.
17 | // then ( not necessary )
18 | python pth2onnx.py
19 | //a file 'lane.onnx' will be generated.
20 | 
21 | 2. build tensorrtx/ufld and run
22 | 
23 | mkdir build
24 | cd build
25 | cmake ..
26 | make
27 | sudo ./lane_det -s          // serialize model to plan file i.e. 'lane.engine'
28 | sudo ./lane_det -d  PATH_TO_YOUR_IMAGE_FOLDER // deserialize plan file and run inference, the images will be processed.
29 | 
30 | ```
31 | 
32 | ## More Information
33 | 1. Changed the preprocess and postprocess in tensorrtx, give a different way to convert NHWC to NCHW in preprocess and just show the result using opencv rather than saving the result in postprocess.
34 | 2. If there are some bugs where you inference with multi batch_size, just modify the code in preprocess or postprocess, it's not complicated.
35 | 3. Some results are stored in resluts folder.
36 | 


--------------------------------------------------------------------------------
/ufld/gen_wts.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import struct
 3 | #import models.crnn as crnn
 4 | from model.model import parsingNet
 5 | 
 6 | # Initialize
 7 | model = parsingNet(pretrained = False, backbone='18', cls_dim = (101, 56, 4), use_aux=False)
 8 | device = 'cpu'
 9 | # Load model
10 | state_dict = torch.load('tusimple_18.pth', map_location='cpu')['model']
11 | model.to(device).eval()
12 | 
13 | f = open('lane.wts', 'w')
14 | f.write('{}\n'.format(len(state_dict.keys())))
15 | for k, v in state_dict.items():
16 |     vr = v.reshape(-1).cpu().numpy()
17 |     f.write('{} {} '.format(k, len(vr)))
18 |     for vv in vr:
19 |         f.write(' ')
20 |         f.write(struct.pack('>f',float(vv)).hex())
21 |     f.write('\n')
22 | 


--------------------------------------------------------------------------------
/ufld/macros.h:
--------------------------------------------------------------------------------
 1 | #ifndef __MACROS_H
 2 | #define __MACROS_H
 3 | 
 4 | #ifdef API_EXPORTS
 5 | #if defined(_MSC_VER)
 6 | #define API __declspec(dllexport)
 7 | #else
 8 | #define API __attribute__((visibility("default")))
 9 | #endif
10 | #else
11 | 
12 | #if defined(_MSC_VER)
13 | #define API __declspec(dllimport)
14 | #else
15 | #define API
16 | #endif
17 | #endif  // API_EXPORTS
18 | 
19 | #if NV_TENSORRT_MAJOR >= 8
20 | #define TRT_NOEXCEPT noexcept
21 | #define TRT_CONST_ENQUEUE const
22 | #else
23 | #define TRT_NOEXCEPT
24 | #define TRT_CONST_ENQUEUE
25 | #endif
26 | 
27 | #endif  // __MACROS_H
28 | 


--------------------------------------------------------------------------------
/ufld/pth2onnx.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn as nn
 3 | import torch.nn.functional as F
 4 | import torch.optim as optim
 5 | from torchvision import datasets, transforms
 6 | import torch.onnx as torch_onnx
 7 | from model.model import parsingNet
 8 | 
 9 | MODELPATH = "tusimple_18.pth"
10 | 
11 | net = parsingNet(pretrained = False, backbone='18', cls_dim = (101, 56, 4), use_aux=False).cuda()
12 | 
13 | state_dict = torch.load(MODELPATH, map_location='cpu')['model']
14 | 
15 | net.train(False)
16 | 
17 | x = torch.randn(1, 3, 288, 800).cuda()
18 | 
19 | torch_onnx.export(net, x, "lane.onnx", verbose=True, input_names=["input"], output_names=["output"],opset_version=11)
20 | 


--------------------------------------------------------------------------------
/unet/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 2.6)
 2 | 
 3 | project(unet)
 4 | 
 5 | add_definitions(-std=c++11)
 6 | 
 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
 8 | set(CMAKE_CXX_STANDARD 11)
 9 | set(CMAKE_BUILD_TYPE Debug)
10 | 
11 | # cuda directory
12 | include_directories(/usr/local/cuda/include/)
13 | link_directories(/usr/local/cuda/lib64/)
14 | 
15 | # tensorrt
16 | include_directories(/workspace/TensorRT-7.2.3.4/include/)
17 | link_directories(/workspace/TensorRT-7.2.3.4/lib/)
18 | 
19 | # opencv library
20 | find_package(OpenCV)
21 | include_directories(${OpenCV_INCLUDE_DIRS})
22 | 
23 | # link library and add exec file
24 | add_executable(unet ${PROJECT_SOURCE_DIR}/unet.cpp)
25 | target_link_libraries(unet nvinfer)
26 | target_link_libraries(unet cudart)
27 | target_link_libraries(unet ${OpenCV_LIBS})
28 | 
29 | add_definitions(-O2 -pthread)
30 | 
31 | 


--------------------------------------------------------------------------------
/unet/gen_wts.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import sys
 3 | import struct
 4 | 
 5 | def main():
 6 |   device = torch.device('cpu')
 7 |   state_dict = torch.load(sys.argv[1], map_location=device)
 8 | 
 9 |   f = open("unet.wts", 'w')
10 |   f.write("{}\n".format(len(state_dict.keys())))
11 |   for k, v in state_dict.items():
12 |     print('key: ', k)
13 |     print('value: ', v.shape)
14 |     vr = v.reshape(-1).cpu().numpy()
15 |     f.write("{} {}".format(k, len(vr)))
16 |     for vv in vr:
17 |       f.write(" ")
18 |       f.write(struct.pack(">f", float(vv)).hex())
19 |     f.write("\n")
20 |   f.close()
21 | 
22 | if __name__ == '__main__':
23 |   main()
24 | 
25 | 


--------------------------------------------------------------------------------
/unet/macros.h:
--------------------------------------------------------------------------------
 1 | #ifndef __MACROS_H
 2 | #define __MACROS_H
 3 | 
 4 | #ifdef API_EXPORTS
 5 | #if defined(_MSC_VER)
 6 | #define API __declspec(dllexport)
 7 | #else
 8 | #define API __attribute__((visibility("default")))
 9 | #endif
10 | #else
11 | 
12 | #if defined(_MSC_VER)
13 | #define API __declspec(dllimport)
14 | #else
15 | #define API
16 | #endif
17 | #endif  // API_EXPORTS
18 | 
19 | #if NV_TENSORRT_MAJOR >= 8
20 | #define TRT_NOEXCEPT noexcept
21 | #define TRT_CONST_ENQUEUE const
22 | #else
23 | #define TRT_NOEXCEPT
24 | #define TRT_CONST_ENQUEUE
25 | #endif
26 | 
27 | #endif  // __MACROS_H
28 | 


--------------------------------------------------------------------------------
/vgg/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 2.6)
 2 | 
 3 | project(vgg)
 4 | 
 5 | add_definitions(-std=c++11)
 6 | 
 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
 8 | set(CMAKE_CXX_STANDARD 11)
 9 | set(CMAKE_BUILD_TYPE Debug)
10 | 
11 | include_directories(${PROJECT_SOURCE_DIR}/include)
12 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different
13 | # cuda
14 | include_directories(/usr/local/cuda/include)
15 | link_directories(/usr/local/cuda/lib64)
16 | # tensorrt
17 | include_directories(/usr/include/x86_64-linux-gnu/)
18 | link_directories(/usr/lib/x86_64-linux-gnu/)
19 | 
20 | add_executable(vgg ${PROJECT_SOURCE_DIR}/vgg11.cpp)
21 | target_link_libraries(vgg nvinfer)
22 | target_link_libraries(vgg cudart)
23 | 
24 | add_definitions(-O2 -pthread)
25 | 
26 | 


--------------------------------------------------------------------------------
/vgg/README.md:
--------------------------------------------------------------------------------
 1 | # vgg
 2 | 
 3 | VGG 11-layer model (configuration "A") from
 4 |     "Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>
 5 | 
 6 | For the Pytorch implementation, you can refer to [pytorchx/vgg](https://github.com/wang-xinyu/pytorchx/tree/master/vgg)
 7 | 
 8 | VGG's architecture is simple, just some conv, relu, maxpool, and fc layers.
 9 | 
10 | ```
11 | // 1. generate vgg.wts from [pytorchx/vgg](https://github.com/wang-xinyu/pytorchx/tree/master/vgg)
12 | 
13 | // 2. put vgg.wts into tensorrtx/vgg
14 | 
15 | // 3. build and run
16 | 
17 | cd tensorrtx/vgg
18 | 
19 | mkdir build
20 | 
21 | cd build
22 | 
23 | cmake ..
24 | 
25 | make
26 | 
27 | sudo ./vgg -s   // serialize model to plan file i.e. 'vgg.engine'
28 | sudo ./vgg -d   // deserialize plan file and run inference
29 | 
30 | // 4. see if the output is same as pytorchx/vgg
31 | ```
32 | 
33 | 
34 | 


--------------------------------------------------------------------------------
/yolo11/include/calibrator.h:
--------------------------------------------------------------------------------
 1 | #ifndef ENTROPY_CALIBRATOR_H
 2 | #define ENTROPY_CALIBRATOR_H
 3 | 
 4 | #include <NvInfer.h>
 5 | #include <string>
 6 | #include <vector>
 7 | #include "macros.h"
 8 | 
 9 | //! \class Int8EntropyCalibrator2
10 | //!
11 | //! \brief Implements Entropy calibrator 2.
12 | //!  CalibrationAlgoType is kENTROPY_CALIBRATION_2.
13 | //!
14 | class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 {
15 |    public:
16 |     Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name,
17 |                            const char* input_blob_name, bool read_cache = true);
18 |     virtual ~Int8EntropyCalibrator2();
19 |     int getBatchSize() const TRT_NOEXCEPT override;
20 |     bool getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT override;
21 |     const void* readCalibrationCache(size_t& length) TRT_NOEXCEPT override;
22 |     void writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT override;
23 | 
24 |    private:
25 |     int batchsize_;
26 |     int input_w_;
27 |     int input_h_;
28 |     int img_idx_;
29 |     std::string img_dir_;
30 |     std::vector<std::string> img_files_;
31 |     size_t input_count_;
32 |     std::string calib_table_name_;
33 |     const char* input_blob_name_;
34 |     bool read_cache_;
35 |     void* device_input_;
36 |     std::vector<char> calib_cache_;
37 | };
38 | 
39 | #endif  // ENTROPY_CALIBRATOR_H
40 | 


--------------------------------------------------------------------------------
/yolo11/include/config.h:
--------------------------------------------------------------------------------
 1 | #define USE_FP16
 2 | // #define USE_FP32
 3 | // #define USE_INT8
 4 | 
 5 | const static char* kInputTensorName = "images";
 6 | const static char* kOutputTensorName = "output";
 7 | const static char* kProtoTensorName = "proto";
 8 | const static int kNumClass = 80;
 9 | const static int kPoseNumClass = 1;
10 | const static int kNumberOfPoints = 17;  // number of keypoints total
11 | // obb model's number of classes
12 | constexpr static int kObbNumClass = 15;
13 | const static int kObbNe = 1;  // number of extra parameters
14 | const static int kBatchSize = 1;
15 | const static int kGpuId = 0;
16 | const static int kInputH = 640;
17 | const static int kInputW = 640;
18 | const static int kObbInputH = 1024;
19 | const static int kObbInputW = 1024;
20 | const static float kNmsThresh = 0.45f;
21 | const static float kConfThresh = 0.5f;
22 | const static float kConfThreshKeypoints = 0.5f;  // keypoints confidence
23 | const static int kMaxInputImageSize = 3000 * 3000;
24 | const static int kMaxNumOutputBbox = 1000;
25 | //Quantization input image folder path
26 | const static char* kInputQuantizationFolder = "./coco_calib";
27 | 
28 | // Classfication model's number of classes
29 | constexpr static int kClsNumClass = 1000;
30 | // Classfication model's input shape
31 | constexpr static int kClsInputH = 224;
32 | constexpr static int kClsInputW = 224;
33 | 


--------------------------------------------------------------------------------
/yolo11/include/cuda_utils.h:
--------------------------------------------------------------------------------
 1 | #ifndef TRTX_CUDA_UTILS_H_
 2 | #define TRTX_CUDA_UTILS_H_
 3 | 
 4 | #include <cuda_runtime_api.h>
 5 | 
 6 | #ifndef CUDA_CHECK
 7 | #define CUDA_CHECK(callstr)                                                                    \
 8 |     {                                                                                          \
 9 |         cudaError_t error_code = callstr;                                                      \
10 |         if (error_code != cudaSuccess) {                                                       \
11 |             std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \
12 |             assert(0);                                                                         \
13 |         }                                                                                      \
14 |     }
15 | #endif  // CUDA_CHECK
16 | 
17 | #endif  // TRTX_CUDA_UTILS_H_
18 | 


--------------------------------------------------------------------------------
/yolo11/include/macros.h:
--------------------------------------------------------------------------------
 1 | #ifndef __MACROS_H
 2 | #define __MACROS_H
 3 | 
 4 | #include "NvInfer.h"
 5 | 
 6 | #ifdef API_EXPORTS
 7 | #if defined(_MSC_VER)
 8 | #define API __declspec(dllexport)
 9 | #else
10 | #define API __attribute__((visibility("default")))
11 | #endif
12 | #else
13 | 
14 | #if defined(_MSC_VER)
15 | #define API __declspec(dllimport)
16 | #else
17 | #define API
18 | #endif
19 | #endif  // API_EXPORTS
20 | 
21 | #if NV_TENSORRT_MAJOR >= 8
22 | #define TRT_NOEXCEPT noexcept
23 | #define TRT_CONST_ENQUEUE const
24 | #else
25 | #define TRT_NOEXCEPT
26 | #define TRT_CONST_ENQUEUE
27 | #endif
28 | 
29 | #endif  // __MACROS_H
30 | 


--------------------------------------------------------------------------------
/yolo11/include/model.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <assert.h>
 4 | #include <string>
 5 | #include "NvInfer.h"
 6 | 
 7 | nvinfer1::IHostMemory* buildEngineYolo11Cls(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
 8 |                                             nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
 9 |                                             std::string& type, int max_channels);
10 | 
11 | nvinfer1::IHostMemory* buildEngineYolo11Det(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
12 |                                             nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
13 |                                             int& max_channels, std::string& type);
14 | 
15 | nvinfer1::IHostMemory* buildEngineYolo11Seg(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
16 |                                             nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
17 |                                             int& max_channels, std::string& type);
18 | 
19 | nvinfer1::IHostMemory* buildEngineYolo11Pose(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
20 |                                              nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
21 |                                              int& max_channels, std::string& type);
22 | 
23 | nvinfer1::IHostMemory* buildEngineYolo11Obb(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
24 |                                             nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
25 |                                             int& max_channels, std::string& type);
26 | 


--------------------------------------------------------------------------------
/yolo11/include/preprocess.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <map>
 4 | #include <opencv2/opencv.hpp>
 5 | #include "NvInfer.h"
 6 | #include "types.h"
 7 | 
 8 | void cuda_preprocess_init(int max_image_size);
 9 | 
10 | void cuda_preprocess_destroy();
11 | 
12 | void cuda_preprocess(uint8_t* src, int src_width, int src_height, float* dst, int dst_width, int dst_height,
13 |                      cudaStream_t stream);
14 | 
15 | void cuda_batch_preprocess(std::vector<cv::Mat>& img_batch, float* dst, int dst_width, int dst_height,
16 |                            cudaStream_t stream);
17 | 


--------------------------------------------------------------------------------
/yolo11/include/types.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include "config.h"
 3 | 
 4 | struct alignas(float) Detection {
 5 |     //center_x center_y w h
 6 |     float bbox[4];
 7 |     float conf;  // bbox_conf * cls_conf
 8 |     float class_id;
 9 |     float mask[32];
10 |     float keypoints[kNumberOfPoints * 3];  // 17*3 keypoints
11 |     float angle;                           // obb angle
12 | };
13 | 
14 | struct AffineMatrix {
15 |     float value[6];
16 | };
17 | 
18 | const int bbox_element =
19 |         sizeof(AffineMatrix) / sizeof(float) + 1;  // left, top, right, bottom, confidence, class, keepflag
20 | 


--------------------------------------------------------------------------------
/yolo11_tripy/.gitignore:
--------------------------------------------------------------------------------
1 | imagenet_classes.txt
2 | *.JPEG
3 | *.pt
4 | 


--------------------------------------------------------------------------------
/yolo11_tripy/README.md:
--------------------------------------------------------------------------------
 1 | # YOLO11 Tripy
 2 | 
 3 | This example implements a YOLO11 classifier model using [Tripy](https://nvidia.github.io/TensorRT-Incubator/).
 4 | 
 5 | ## Running The Example
 6 | 
 7 | Run the following commands from the [`yolo11_tripy`](./) directory:
 8 | 
 9 | 1. Install Dependencies:
10 | 
11 |     ```bash
12 |     python3 -m pip install -r requirements.txt
13 |     ```
14 | 
15 | 2. Download ImageNet classes file:
16 | 
17 |     ```bash
18 |     wget https://raw.githubusercontent.com/joannzhang00/ImageNet-dataset-classes-labels/main/imagenet_classes.txt
19 |     ```
20 | 
21 | 3. [*Optional*] Download some images:
22 | 
23 |     ```bash
24 |     wget https://raw.githubusercontent.com/EliSchwartz/imagenet-sample-images/master/n01558993_robin.JPEG
25 |     wget https://raw.githubusercontent.com/EliSchwartz/imagenet-sample-images/master/n04389033_tank.JPEG
26 |     ```
27 | 
28 |     You can skip this step if you already have images you'd like to classify.
29 | 
30 | 3. Build the model:
31 | 
32 |     ```bash
33 |     python3 compile_classifier.py
34 |     ```
35 | 
36 |     You can configure various aspects of the model when you compile.
37 |     Run `python3 compile_classifier.py -h` for details.
38 | 
39 | 4. Run inference:
40 | 
41 |     ```bash
42 |     python3 classify.py n01558993_robin.JPEG n04389033_tank.JPEG
43 |     ```
44 | 
45 |     The `classify.py` script allows you to pass one or more image file paths on the command line.
46 |     The images are batched and classified in a single forward pass.
47 | 


--------------------------------------------------------------------------------
/yolo11_tripy/constants.py:
--------------------------------------------------------------------------------
1 | IMAGE_C = 3
2 | IMAGE_H = 224
3 | IMAGE_W = 224
4 | 


--------------------------------------------------------------------------------
/yolo11_tripy/requirements.txt:
--------------------------------------------------------------------------------
1 | -f https://nvidia.github.io/TensorRT-Incubator/packages.html
2 | nvtripy>=0.1.1
3 | opencv-python-headless
4 | numpy
5 | torch
6 | 


--------------------------------------------------------------------------------
/yolop/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 2.6)
 2 | 
 3 | project(yolop)
 4 | 
 5 | add_definitions(-std=c++11)
 6 | 
 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
 8 | set(CMAKE_CXX_STANDARD 11)
 9 | set(CMAKE_BUILD_TYPE Release)
10 | 
11 | find_package(CUDA  REQUIRED)
12 | 
13 | include_directories(${PROJECT_SOURCE_DIR}/include)
14 | 
15 | find_package(OpenCV REQUIRED)
16 | include_directories(${OpenCV_INCLUDE_DIRS})
17 | 
18 | # cuda
19 | include_directories(/usr/local/cuda-10.2/include)
20 | link_directories(/usr/local/cuda-10.2/lib64)
21 | # tensorrt
22 | include_directories(/usr/include/aarch64-linux-gnu/)
23 | link_directories(/usr/lib/aarch64-linux-gnu/)
24 | 
25 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")
26 | 
27 | # to generate plugins
28 | cuda_add_library(myplugins SHARED ${PROJECT_SOURCE_DIR}/yololayer.cu)
29 | target_link_libraries(myplugins nvinfer cudart)
30 | 
31 | # to generate trt and test image dir
32 | add_executable(yolop ${PROJECT_SOURCE_DIR}/yolop.cpp)
33 | target_link_libraries(yolop nvinfer cudart myplugins ${OpenCV_LIBS})
34 | add_definitions(-O3 -pthread)
35 | 
36 | 


--------------------------------------------------------------------------------
/yolop/cuda_utils.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include <cuda_runtime_api.h>
 3 | 
 4 | #ifndef CUDA_CHECK
 5 | #define CUDA_CHECK(callstr)\
 6 |     {\
 7 |         cudaError_t error_code = callstr;\
 8 |         if (error_code != cudaSuccess) {\
 9 |             std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__;\
10 |             assert(0);\
11 |         }\
12 |     }
13 | #endif  // CUDA_CHECK
14 | 
15 | 


--------------------------------------------------------------------------------
/yolop/gen_wts.py:
--------------------------------------------------------------------------------
 1 | import os, sys
 2 | import torch
 3 | import struct
 4 | 
 5 | # TODO: YOLOP_BASE_DIR is the root of YOLOP
 6 | print("[WARN] Please download/clone YOLOP, then set YOLOP_BASE_DIR to the root of YOLOP")
 7 | 
 8 | #YOLOP_BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 9 | YOLOP_BASE_DIR = "/home/user/jetson/tmp/YOLOP"
10 | 
11 | sys.path.append(YOLOP_BASE_DIR)
12 | from lib.models import get_net
13 | from lib.config import cfg
14 | 
15 | 
16 | # Initialize
17 | device = torch.device('cpu')
18 | # Load model
19 | model = get_net(cfg)
20 | checkpoint = torch.load(YOLOP_BASE_DIR + '/weights/End-to-end.pth', map_location=device)
21 | model.load_state_dict(checkpoint['state_dict'])
22 | # load to FP32
23 | model.float()
24 | model.to(device).eval()
25 | 
26 | f = open('yolop.wts', 'w')
27 | f.write('{}\n'.format(len(model.state_dict().keys())))
28 | for k, v in model.state_dict().items():
29 |     vr = v.reshape(-1).cpu().numpy()
30 |     f.write('{} {} '.format(k, len(vr)))
31 |     for vv in vr:
32 |         f.write(' ')
33 |         f.write(struct.pack('>f',float(vv)).hex())
34 |     f.write('\n')
35 | 
36 | f.close()
37 | 
38 | print("save as yolop.wts")


--------------------------------------------------------------------------------
/yolop/logging.h:
--------------------------------------------------------------------------------
 1 | // create by ausk(jinlj) 2022/10/25
 2 | #pragma once
 3 | 
 4 | #include "NvInferRuntimeCommon.h"
 5 | #include <cassert>
 6 | #include <ctime>
 7 | #include <iomanip>
 8 | #include <iostream>
 9 | #include <ostream>
10 | #include <sstream>
11 | #include <string>
12 | #include "macros.h"
13 | 
14 | #if NV_TENSORRT_MAJOR >= 8
15 | #define TRT_NOEXCEPT noexcept
16 | #else
17 | #define TRT_NOEXCEPT
18 | #endif
19 | 
20 | using Severity = nvinfer1::ILogger::Severity;
21 | 
22 | class Logger : public nvinfer1::ILogger
23 | {
24 | public:
25 |     void log(Severity severity, const char* msg) TRT_NOEXCEPT override
26 |     {
27 |         if (severity < Severity::kINFO) {
28 |             std::cout << msg << std::endl;
29 |         }
30 |     }
31 | };
32 | 


--------------------------------------------------------------------------------
/yolop/macros.h:
--------------------------------------------------------------------------------
 1 | #ifndef __MACROS_H
 2 | #define __MACROS_H
 3 | 
 4 | #if NV_TENSORRT_MAJOR >= 8
 5 | #define TRT_NOEXCEPT noexcept
 6 | #define TRT_CONST_ENQUEUE const
 7 | #else
 8 | #define TRT_NOEXCEPT
 9 | #define TRT_CONST_ENQUEUE
10 | #endif
11 | 
12 | #endif  // __MACROS_H


--------------------------------------------------------------------------------
/yolov10/include/calibrator.h:
--------------------------------------------------------------------------------
 1 | #ifndef ENTROPY_CALIBRATOR_H
 2 | #define ENTROPY_CALIBRATOR_H
 3 | 
 4 | #include <NvInfer.h>
 5 | #include <string>
 6 | #include <vector>
 7 | #include "macros.h"
 8 | 
 9 | //! \class Int8EntropyCalibrator2
10 | //!
11 | //! \brief Implements Entropy calibrator 2.
12 | //!  CalibrationAlgoType is kENTROPY_CALIBRATION_2.
13 | //!
14 | class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 {
15 |    public:
16 |     Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name,
17 |                            const char* input_blob_name, bool read_cache = true);
18 |     virtual ~Int8EntropyCalibrator2();
19 |     int getBatchSize() const TRT_NOEXCEPT override;
20 |     bool getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT override;
21 |     const void* readCalibrationCache(size_t& length) TRT_NOEXCEPT override;
22 |     void writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT override;
23 | 
24 |    private:
25 |     int batchsize_;
26 |     int input_w_;
27 |     int input_h_;
28 |     int img_idx_;
29 |     std::string img_dir_;
30 |     std::vector<std::string> img_files_;
31 |     size_t input_count_;
32 |     std::string calib_table_name_;
33 |     const char* input_blob_name_;
34 |     bool read_cache_;
35 |     void* device_input_;
36 |     std::vector<char> calib_cache_;
37 | };
38 | 
39 | #endif  // ENTROPY_CALIBRATOR_H
40 | 


--------------------------------------------------------------------------------
/yolov10/include/config.h:
--------------------------------------------------------------------------------
 1 | //#define USE_FP32
 2 | #define USE_FP16
 3 | // #define USE_INT8
 4 | 
 5 | const static char* kInputTensorName = "images";
 6 | const static char* kOutputTensorName = "output";
 7 | const static int kNumClass = 80;
 8 | const static int kBatchSize = 1;
 9 | const static int kGpuId = 0;
10 | const static int kInputH = 640;
11 | const static int kInputW = 640;
12 | const static float kConfThresh = 0.5f;
13 | const static int kMaxInputImageSize = 3000 * 3000;
14 | const static int kMaxNumOutputBbox = 1000;
15 | //Quantization input image folder path
16 | const static char* kInputQuantizationFolder = "./coco_calib";
17 | 


--------------------------------------------------------------------------------
/yolov10/include/cuda_utils.h:
--------------------------------------------------------------------------------
 1 | #ifndef TRTX_CUDA_UTILS_H_
 2 | #define TRTX_CUDA_UTILS_H_
 3 | 
 4 | #include <cuda_runtime_api.h>
 5 | 
 6 | #ifndef CUDA_CHECK
 7 | #define CUDA_CHECK(callstr)                                                                    \
 8 |     {                                                                                          \
 9 |         cudaError_t error_code = callstr;                                                      \
10 |         if (error_code != cudaSuccess) {                                                       \
11 |             std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \
12 |             assert(0);                                                                         \
13 |         }                                                                                      \
14 |     }
15 | #endif  // CUDA_CHECK
16 | 
17 | #endif  // TRTX_CUDA_UTILS_H_
18 | 


--------------------------------------------------------------------------------
/yolov10/include/macros.h:
--------------------------------------------------------------------------------
 1 | #ifndef __MACROS_H
 2 | #define __MACROS_H
 3 | 
 4 | #include "NvInfer.h"
 5 | 
 6 | #ifdef API_EXPORTS
 7 | #if defined(_MSC_VER)
 8 | #define API __declspec(dllexport)
 9 | #else
10 | #define API __attribute__((visibility("default")))
11 | #endif
12 | #else
13 | 
14 | #if defined(_MSC_VER)
15 | #define API __declspec(dllimport)
16 | #else
17 | #define API
18 | #endif
19 | #endif  // API_EXPORTS
20 | 
21 | #if NV_TENSORRT_MAJOR >= 8
22 | #define TRT_NOEXCEPT noexcept
23 | #define TRT_CONST_ENQUEUE const
24 | #else
25 | #define TRT_NOEXCEPT
26 | #define TRT_CONST_ENQUEUE
27 | #endif
28 | 
29 | #endif  // __MACROS_H
30 | 


--------------------------------------------------------------------------------
/yolov10/include/model.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <assert.h>
 4 | #include <string>
 5 | #include "NvInfer.h"
 6 | 
 7 | nvinfer1::IHostMemory* buildEngineYolov10DetN(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
 8 |                                               nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
 9 |                                               int& max_channels);
10 | 
11 | nvinfer1::IHostMemory* buildEngineYolov10DetS(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
12 |                                               nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
13 |                                               int& max_channels);
14 | 
15 | nvinfer1::IHostMemory* buildEngineYolov10DetM(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
16 |                                               nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
17 |                                               int& max_channels);
18 | 
19 | nvinfer1::IHostMemory* buildEngineYolov10DetBL(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
20 |                                                nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
21 |                                                int& max_channels);
22 | 
23 | nvinfer1::IHostMemory* buildEngineYolov10DetX(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
24 |                                               nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
25 |                                               int& max_channels);
26 | 


--------------------------------------------------------------------------------
/yolov10/include/postprocess.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <opencv2/opencv.hpp>
 4 | #include "NvInfer.h"
 5 | #include "types.h"
 6 | 
 7 | cv::Rect get_rect(cv::Mat& img, float bbox[4]);
 8 | 
 9 | void draw_bbox(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch);
10 | 
11 | void batch_topk(std::vector<std::vector<Detection>>& res_batch, float* output, int batch_size, int output_size,
12 |                 float conf_thresh, int topk = 300);
13 | 


--------------------------------------------------------------------------------
/yolov10/include/preprocess.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <map>
 4 | #include <opencv2/opencv.hpp>
 5 | #include "NvInfer.h"
 6 | #include "types.h"
 7 | 
 8 | void cuda_preprocess_init(int max_image_size);
 9 | 
10 | void cuda_preprocess_destroy();
11 | 
12 | void cuda_preprocess(uint8_t* src, int src_width, int src_height, float* dst, int dst_width, int dst_height,
13 |                      cudaStream_t stream);
14 | 
15 | void cuda_batch_preprocess(std::vector<cv::Mat>& img_batch, float* dst, int dst_width, int dst_height,
16 |                            cudaStream_t stream);
17 | 


--------------------------------------------------------------------------------
/yolov10/include/types.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include "config.h"
 3 | 
 4 | struct alignas(float) Detection {
 5 |     //center_x center_y w h
 6 |     float bbox[4];
 7 |     float conf;  // bbox_conf * cls_conf
 8 |     float class_id;
 9 | };
10 | 
11 | struct AffineMatrix {
12 |     float value[6];
13 | };
14 | 
15 | const int bbox_element =
16 |         sizeof(Detection) / sizeof(float) + 1;  // left, top, right, bottom, confidence, class, keepflag
17 | 


--------------------------------------------------------------------------------
/yolov12/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.10)
 2 | 
 3 | project(yolov12)
 4 | 
 5 | add_definitions(-std=c++11)
 6 | add_definitions(-DAPI_EXPORTS)
 7 | set(CMAKE_CXX_STANDARD 11)
 8 | set(CMAKE_BUILD_TYPE Debug)
 9 | 
10 | set(CMAKE_CUDA_COMPILER /usr/local/cuda/bin/nvcc)
11 | enable_language(CUDA)
12 | 
13 | include_directories(${PROJECT_SOURCE_DIR}/include)
14 | include_directories(${PROJECT_SOURCE_DIR}/plugin)
15 | 
16 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different
17 | if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
18 |   message("embed_platform on")
19 |   include_directories(/usr/local/cuda/targets/aarch64-linux/include)
20 |   link_directories(/usr/local/cuda/targets/aarch64-linux/lib)
21 | else()
22 |   message("embed_platform off")
23 | 
24 |   # cuda
25 |   include_directories(/usr/local/cuda/include)
26 |   link_directories(/usr/local/cuda/lib64)
27 | 
28 |   # tensorrt
29 |   include_directories(/workspace/shared/TensorRT-8.6.1.6/include)
30 |   link_directories(/workspace/shared/TensorRT-8.6.1.6/lib)
31 | endif()
32 | 
33 | add_library(myplugins SHARED ${PROJECT_SOURCE_DIR}/plugin/yololayer.cu)
34 | target_link_libraries(myplugins nvinfer cudart)
35 | 
36 | find_package(OpenCV)
37 | include_directories(${OpenCV_INCLUDE_DIRS})
38 | 
39 | file(GLOB_RECURSE SRCS ${PROJECT_SOURCE_DIR}/src/*.cpp ${PROJECT_SOURCE_DIR}/src/*.cu)
40 | 
41 | add_executable(yolo12_det ${PROJECT_SOURCE_DIR}/yolo12_det.cpp ${SRCS})
42 | target_link_libraries(yolo12_det nvinfer)
43 | target_link_libraries(yolo12_det cudart)
44 | target_link_libraries(yolo12_det myplugins)
45 | target_link_libraries(yolo12_det ${OpenCV_LIBS})
46 | 


--------------------------------------------------------------------------------
/yolov12/include/config.h:
--------------------------------------------------------------------------------
 1 | #define USE_FP16
 2 | // #define USE_FP32
 3 | // #define USE_INT8
 4 | 
 5 | const static char* kInputTensorName = "images";
 6 | const static char* kOutputTensorName = "output";
 7 | const static char* kProtoTensorName = "proto";
 8 | const static int kNumClass = 80;
 9 | const static int kPoseNumClass = 1;
10 | const static int kNumberOfPoints = 17;  // number of keypoints total
11 | // obb model's number of classes
12 | constexpr static int kObbNumClass = 15;
13 | const static int kObbNe = 1;  // number of extra parameters
14 | const static int kBatchSize = 1;
15 | const static int kGpuId = 0;
16 | const static int kInputH = 640;
17 | const static int kInputW = 640;
18 | const static int kObbInputH = 1024;
19 | const static int kObbInputW = 1024;
20 | const static float kNmsThresh = 0.45f;
21 | const static float kConfThresh = 0.5f;
22 | const static float kConfThreshKeypoints = 0.5f;  // keypoints confidence
23 | const static int kMaxInputImageSize = 3000 * 3000;
24 | const static int kMaxNumOutputBbox = 1000;
25 | //Quantization input image folder path
26 | const static char* kInputQuantizationFolder = "./coco_calib";
27 | 
28 | // Classfication model's number of classes
29 | constexpr static int kClsNumClass = 1000;
30 | // Classfication model's input shape
31 | constexpr static int kClsInputH = 224;
32 | constexpr static int kClsInputW = 224;
33 | 


--------------------------------------------------------------------------------
/yolov12/include/cuda_utils.h:
--------------------------------------------------------------------------------
 1 | #ifndef TRTX_CUDA_UTILS_H_
 2 | #define TRTX_CUDA_UTILS_H_
 3 | 
 4 | #include <cuda_runtime_api.h>
 5 | 
 6 | #ifndef CUDA_CHECK
 7 | #define CUDA_CHECK(callstr)                                                                    \
 8 |     {                                                                                          \
 9 |         cudaError_t error_code = callstr;                                                      \
10 |         if (error_code != cudaSuccess) {                                                       \
11 |             std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \
12 |             assert(0);                                                                         \
13 |         }                                                                                      \
14 |     }
15 | #endif  // CUDA_CHECK
16 | 
17 | #endif  // TRTX_CUDA_UTILS_H_
18 | 


--------------------------------------------------------------------------------
/yolov12/include/macros.h:
--------------------------------------------------------------------------------
 1 | #ifndef __MACROS_H
 2 | #define __MACROS_H
 3 | 
 4 | #include "NvInfer.h"
 5 | 
 6 | #ifdef API_EXPORTS
 7 | #if defined(_MSC_VER)
 8 | #define API __declspec(dllexport)
 9 | #else
10 | #define API __attribute__((visibility("default")))
11 | #endif
12 | #else
13 | 
14 | #if defined(_MSC_VER)
15 | #define API __declspec(dllimport)
16 | #else
17 | #define API
18 | #endif
19 | #endif  // API_EXPORTS
20 | 
21 | #if NV_TENSORRT_MAJOR >= 8
22 | #define TRT_NOEXCEPT noexcept
23 | #define TRT_CONST_ENQUEUE const
24 | #else
25 | #define TRT_NOEXCEPT
26 | #define TRT_CONST_ENQUEUE
27 | #endif
28 | 
29 | #endif  // __MACROS_H
30 | 


--------------------------------------------------------------------------------
/yolov12/include/model.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <assert.h>
 4 | #include <string>
 5 | #include "NvInfer.h"
 6 | 
 7 | nvinfer1::IHostMemory* buildEngineYolo12Det(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
 8 |                                             nvinfer1::DataType dt, const std::string& wts_path, float& gd, float& gw,
 9 |                                             int& max_channels, std::string& type);
10 | 


--------------------------------------------------------------------------------
/yolov12/include/preprocess.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <map>
 4 | #include <opencv2/opencv.hpp>
 5 | #include "NvInfer.h"
 6 | #include "types.h"
 7 | 
 8 | void cuda_preprocess_init(int max_image_size);
 9 | 
10 | void cuda_preprocess_destroy();
11 | 
12 | void cuda_preprocess(uint8_t* src, int src_width, int src_height, float* dst, int dst_width, int dst_height,
13 |                      cudaStream_t stream);
14 | 
15 | void cuda_batch_preprocess(std::vector<cv::Mat>& img_batch, float* dst, int dst_width, int dst_height,
16 |                            cudaStream_t stream);
17 | 


--------------------------------------------------------------------------------
/yolov12/include/types.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include "config.h"
 3 | 
 4 | struct alignas(float) Detection {
 5 |     //center_x center_y w h
 6 |     float bbox[4];
 7 |     float conf;  // bbox_conf * cls_conf
 8 |     float class_id;
 9 |     float mask[32];
10 |     float keypoints[kNumberOfPoints * 3];  // 17*3 keypoints
11 |     float angle;                           // obb angle
12 | };
13 | 
14 | struct AffineMatrix {
15 |     float value[6];
16 | };
17 | 
18 | const int bbox_element =
19 |         sizeof(AffineMatrix) / sizeof(float) + 1;  // left, top, right, bottom, confidence, class, keepflag
20 | 


--------------------------------------------------------------------------------
/yolov3-spp/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 2.6)
 2 | 
 3 | project(yolov3-spp)
 4 | 
 5 | add_definitions(-std=c++11)
 6 | 
 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
 8 | set(CMAKE_CXX_STANDARD 11)
 9 | set(CMAKE_BUILD_TYPE Debug)
10 | 
11 | find_package(CUDA REQUIRED)
12 | 
13 | include_directories(${PROJECT_SOURCE_DIR}/include)
14 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different
15 | # cuda
16 | include_directories(/usr/local/cuda/include)
17 | link_directories(/usr/local/cuda/lib64)
18 | # tensorrt
19 | include_directories(/usr/include/x86_64-linux-gnu/)
20 | link_directories(/usr/lib/x86_64-linux-gnu/)
21 | 
22 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")
23 | 
24 | cuda_add_library(yololayer SHARED ${PROJECT_SOURCE_DIR}/yololayer.cu)
25 | target_link_libraries(yololayer nvinfer cudart)
26 | 
27 | find_package(OpenCV)
28 | include_directories(${OpenCV_INCLUDE_DIRS})
29 | 
30 | add_executable(yolov3-spp ${PROJECT_SOURCE_DIR}/yolov3-spp.cpp)
31 | target_link_libraries(yolov3-spp nvinfer)
32 | target_link_libraries(yolov3-spp cudart)
33 | target_link_libraries(yolov3-spp yololayer)
34 | target_link_libraries(yolov3-spp ${OpenCV_LIBS})
35 | 
36 | add_definitions(-O2 -pthread)
37 | 
38 | 


--------------------------------------------------------------------------------
/yolov3-spp/gen_wts.py:
--------------------------------------------------------------------------------
 1 | import struct
 2 | import sys
 3 | from models import *
 4 | from utils.utils import *
 5 | 
 6 | model = Darknet('cfg/yolov3-spp.cfg', (416, 416))
 7 | weights = sys.argv[1]
 8 | dev = '0'
 9 | device = torch_utils.select_device(dev)
10 | model.load_state_dict(torch.load(weights, map_location=device)['model'])
11 | 
12 | 
13 | with open('yolov3-spp_ultralytics68.wts', 'w') as f:
14 |     f.write('{}\n'.format(len(model.state_dict().keys())))
15 |     for k, v in model.state_dict().items():
16 |         vr = v.reshape(-1).cpu().numpy()
17 |         f.write('{} {} '.format(k, len(vr)))
18 |         for vv in vr:
19 |             f.write(' ')
20 |             f.write(struct.pack('>f',float(vv)).hex())
21 |         f.write('\n')
22 | 
23 | 


--------------------------------------------------------------------------------
/yolov3-spp/samples/bus.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wang-xinyu/tensorrtx/c6bcebd812907a6d9eb66ecc909eb17b51d40fac/yolov3-spp/samples/bus.jpg


--------------------------------------------------------------------------------
/yolov3-spp/samples/zidane.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wang-xinyu/tensorrtx/c6bcebd812907a6d9eb66ecc909eb17b51d40fac/yolov3-spp/samples/zidane.jpg


--------------------------------------------------------------------------------
/yolov3-tiny/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 2.6)
 2 | 
 3 | project(yolov3-tiny)
 4 | 
 5 | add_definitions(-std=c++11)
 6 | 
 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
 8 | set(CMAKE_CXX_STANDARD 11)
 9 | set(CMAKE_BUILD_TYPE Debug)
10 | 
11 | find_package(CUDA REQUIRED)
12 | 
13 | include_directories(${PROJECT_SOURCE_DIR}/include)
14 | if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
15 |     message("embed_platform on")
16 |     include_directories(/usr/local/cuda/targets/aarch64-linux/include)
17 |     link_directories(/usr/local/cuda/targets/aarch64-linux/lib)
18 | else()
19 |     message("embed_platform off")
20 |     include_directories(/usr/local/cuda/include)
21 |     link_directories(/usr/local/cuda/lib64)
22 | endif()
23 | 
24 | 
25 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")
26 | 
27 | #cuda_add_library(leaky ${PROJECT_SOURCE_DIR}/leaky.cu)
28 | cuda_add_library(yololayer SHARED ${PROJECT_SOURCE_DIR}/yololayer.cu)
29 | target_link_libraries(yololayer nvinfer cudart)
30 | 
31 | find_package(OpenCV)
32 | include_directories(${OpenCV_INCLUDE_DIRS})
33 | 
34 | add_executable(yolov3-tiny ${PROJECT_SOURCE_DIR}/yolov3-tiny.cpp)
35 | target_link_libraries(yolov3-tiny nvinfer)
36 | target_link_libraries(yolov3-tiny cudart)
37 | target_link_libraries(yolov3-tiny yololayer)
38 | target_link_libraries(yolov3-tiny ${OpenCV_LIBS})
39 | 
40 | add_definitions(-O2 -pthread)
41 | 
42 | 


--------------------------------------------------------------------------------
/yolov3-tiny/gen_wts.py:
--------------------------------------------------------------------------------
 1 | import struct
 2 | import sys
 3 | from models import *
 4 | from utils.utils import *
 5 | 
 6 | model = Darknet('cfg/yolov3-tiny.cfg', (608, 608))
 7 | weights = sys.argv[1]
 8 | device = torch_utils.select_device('0')
 9 | if weights.endswith('.pt'):  # pytorch format
10 |     model.load_state_dict(torch.load(weights, map_location=device)['model'])
11 | else:  # darknet format
12 |     load_darknet_weights(model, weights)
13 | model = model.eval()
14 | 
15 | with open('yolov3-tiny.wts', 'w') as f:
16 |     f.write('{}\n'.format(len(model.state_dict().keys())))
17 |     for k, v in model.state_dict().items():
18 |         vr = v.reshape(-1).cpu().numpy()
19 |         f.write('{} {} '.format(k, len(vr)))
20 |         for vv in vr:
21 |             f.write(' ')
22 |             f.write(struct.pack('>f',float(vv)).hex())
23 |         f.write('\n')
24 | 
25 | 


--------------------------------------------------------------------------------
/yolov3-tiny/macros.h:
--------------------------------------------------------------------------------
 1 | #ifndef __MACROS_H
 2 | #define __MACROS_H
 3 | 
 4 | #if NV_TENSORRT_MAJOR >= 8
 5 | #define TRT_NOEXCEPT noexcept
 6 | #define TRT_CONST_ENQUEUE const
 7 | #else
 8 | #define TRT_NOEXCEPT
 9 | #define TRT_CONST_ENQUEUE
10 | #endif
11 | 
12 | #endif  // __MACROS_H


--------------------------------------------------------------------------------
/yolov3/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 2.6)
 2 | 
 3 | project(yolov3)
 4 | 
 5 | add_definitions(-std=c++11)
 6 | 
 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
 8 | set(CMAKE_CXX_STANDARD 11)
 9 | set(CMAKE_BUILD_TYPE Debug)
10 | 
11 | find_package(CUDA REQUIRED)
12 | 
13 | include_directories(${PROJECT_SOURCE_DIR}/include)
14 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different
15 | # cuda
16 | include_directories(/usr/local/cuda/include)
17 | link_directories(/usr/local/cuda/lib64)
18 | # tensorrt
19 | include_directories(/usr/include/x86_64-linux-gnu/)
20 | link_directories(/usr/lib/x86_64-linux-gnu/)
21 | 
22 | find_package(OpenCV)
23 | include_directories(${OpenCV_INCLUDE_DIRS})
24 | 
25 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")
26 | 
27 | #cuda_add_library(leaky ${PROJECT_SOURCE_DIR}/leaky.cu)
28 | cuda_add_library(yololayer SHARED ${PROJECT_SOURCE_DIR}/yololayer.cu)
29 | target_link_libraries(yololayer nvinfer cudart ${OpenCV_LIBS})
30 | 
31 | add_executable(yolov3 ${PROJECT_SOURCE_DIR}/calibrator.cpp ${PROJECT_SOURCE_DIR}/yolov3.cpp)
32 | target_link_libraries(yolov3 nvinfer)
33 | target_link_libraries(yolov3 cudart)
34 | target_link_libraries(yolov3 yololayer)
35 | target_link_libraries(yolov3 ${OpenCV_LIBS})
36 | 
37 | add_definitions(-O2 -pthread)
38 | 
39 | 


--------------------------------------------------------------------------------
/yolov3/calibrator.h:
--------------------------------------------------------------------------------
 1 | #ifndef ENTROPY_CALIBRATOR_H
 2 | #define ENTROPY_CALIBRATOR_H
 3 | 
 4 | #include "NvInfer.h"
 5 | #include <string>
 6 | #include <vector>
 7 | #include "macros.h"
 8 | 
 9 | //! \class Int8EntropyCalibrator2
10 | //!
11 | //! \brief Implements Entropy calibrator 2.
12 | //!  CalibrationAlgoType is kENTROPY_CALIBRATION_2.
13 | //!
14 | class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2
15 | {
16 | public:
17 |     Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, const char* input_blob_name, bool read_cache = true);
18 | 
19 |     virtual ~Int8EntropyCalibrator2();
20 |     int getBatchSize() const TRT_NOEXCEPT override;
21 |     bool getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT override;
22 |     const void* readCalibrationCache(size_t& length) TRT_NOEXCEPT override;
23 |     void writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT override;
24 |     
25 | private:
26 |     int batchsize_;
27 |     int input_w_;
28 |     int input_h_;
29 |     int img_idx_;
30 |     std::string img_dir_;
31 |     std::vector<std::string> img_files_;
32 |     size_t input_count_;
33 |     std::string calib_table_name_;
34 |     const char* input_blob_name_;
35 |     bool read_cache_;
36 |     void* device_input_;
37 |     std::vector<char> calib_cache_;
38 | };
39 | 
40 | #endif // ENTROPY_CALIBRATOR_H
41 | 


--------------------------------------------------------------------------------
/yolov3/gen_wts.py:
--------------------------------------------------------------------------------
 1 | import struct
 2 | import sys
 3 | from models import *
 4 | from utils.utils import *
 5 | 
 6 | model = Darknet('cfg/yolov3.cfg', (608, 608))
 7 | weights = sys.argv[1]
 8 | device = torch_utils.select_device('0')
 9 | if weights.endswith('.pt'):  # pytorch format
10 |     model.load_state_dict(torch.load(weights, map_location=device)['model'])
11 | else:  # darknet format
12 |     load_darknet_weights(model, weights)
13 | model = model.eval()
14 | 
15 | with open('yolov3.wts', 'w') as f:
16 |     f.write('{}\n'.format(len(model.state_dict().keys())))
17 |     for k, v in model.state_dict().items():
18 |         vr = v.reshape(-1).cpu().numpy()
19 |         f.write('{} {} '.format(k, len(vr)))
20 |         for vv in vr:
21 |             f.write(' ')
22 |             f.write(struct.pack('>f',float(vv)).hex())
23 |         f.write('\n')
24 | 
25 | 


--------------------------------------------------------------------------------
/yolov3/macros.h:
--------------------------------------------------------------------------------
 1 | #ifndef __MACROS_H
 2 | #define __MACROS_H
 3 | 
 4 | #if NV_TENSORRT_MAJOR >= 8
 5 | #define TRT_NOEXCEPT noexcept
 6 | #define TRT_CONST_ENQUEUE const
 7 | #else
 8 | #define TRT_NOEXCEPT
 9 | #define TRT_CONST_ENQUEUE
10 | #endif
11 | 
12 | #endif  // __MACROS_H
13 | 


--------------------------------------------------------------------------------
/yolov4/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 2.6)
 2 | 
 3 | project(yolov4)
 4 | 
 5 | add_definitions(-std=c++11)
 6 | 
 7 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
 8 | set(CMAKE_CXX_STANDARD 11)
 9 | set(CMAKE_BUILD_TYPE Debug)
10 | 
11 | find_package(CUDA REQUIRED)
12 | 
13 | include_directories(${PROJECT_SOURCE_DIR}/include)
14 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different
15 | # cuda
16 | include_directories(/usr/local/cuda/include)
17 | link_directories(/usr/local/cuda/lib64)
18 | # tensorrt
19 | include_directories(/usr/include/x86_64-linux-gnu/)
20 | link_directories(/usr/lib/x86_64-linux-gnu/)
21 | 
22 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")
23 | 
24 | cuda_add_library(myplugins SHARED ${PROJECT_SOURCE_DIR}/yololayer.cu ${PROJECT_SOURCE_DIR}/mish.cu)
25 | target_link_libraries(myplugins nvinfer cudart)
26 | 
27 | find_package(OpenCV)
28 | include_directories(${OpenCV_INCLUDE_DIRS})
29 | 
30 | add_executable(yolov4 ${PROJECT_SOURCE_DIR}/yolov4.cpp)
31 | target_link_libraries(yolov4 nvinfer)
32 | target_link_libraries(yolov4 cudart)
33 | target_link_libraries(yolov4 myplugins)
34 | target_link_libraries(yolov4 ${OpenCV_LIBS})
35 | 
36 | add_definitions(-O2 -pthread)
37 | 
38 | 


--------------------------------------------------------------------------------
/yolov4/gen_wts.py:
--------------------------------------------------------------------------------
 1 | import struct
 2 | import sys
 3 | from models import *
 4 | from utils.utils import *
 5 | 
 6 | model = Darknet('cfg/yolov4.cfg', (608, 608))
 7 | weights = sys.argv[1]
 8 | device = torch_utils.select_device('0')
 9 | if weights.endswith('.pt'):  # pytorch format
10 |     model.load_state_dict(torch.load(weights, map_location=device)['model'])
11 | else:  # darknet format
12 |     load_darknet_weights(model, weights)
13 | 
14 | with open('yolov4.wts', 'w') as f:
15 |     f.write('{}\n'.format(len(model.state_dict().keys())))
16 |     for k, v in model.state_dict().items():
17 |         vr = v.reshape(-1).cpu().numpy()
18 |         f.write('{} {} '.format(k, len(vr)))
19 |         for vv in vr:
20 |             f.write(' ')
21 |             f.write(struct.pack('>f',float(vv)).hex())
22 |         f.write('\n')
23 | 
24 | 


--------------------------------------------------------------------------------
/yolov4/utils.h:
--------------------------------------------------------------------------------
 1 | #ifndef __TRT_UTILS_H_
 2 | #define __TRT_UTILS_H_
 3 | 
 4 | #include <iostream>
 5 | #include <vector>
 6 | #include <algorithm>
 7 | #include <cudnn.h>
 8 | 
 9 | #ifndef CUDA_CHECK
10 | 
11 | #define CUDA_CHECK(callstr)                                                                    \
12 |     {                                                                                          \
13 |         cudaError_t error_code = callstr;                                                      \
14 |         if (error_code != cudaSuccess) {                                                       \
15 |             std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \
16 |             assert(0);                                                                         \
17 |         }                                                                                      \
18 |     }
19 | 
20 | #endif
21 | 
22 | namespace Tn
23 | {
24 |     template<typename T> 
25 |     void write(char*& buffer, const T& val)
26 |     {
27 |         *reinterpret_cast<T*>(buffer) = val;
28 |         buffer += sizeof(T);
29 |     }
30 | 
31 |     template<typename T> 
32 |     void read(const char*& buffer, T& val)
33 |     {
34 |         val = *reinterpret_cast<const T*>(buffer);
35 |         buffer += sizeof(T);
36 |     }
37 | }
38 | 
39 | #endif
40 | 


--------------------------------------------------------------------------------
/yolov5/images:
--------------------------------------------------------------------------------
1 | ../yolov3-spp/samples


--------------------------------------------------------------------------------
/yolov5/src/calibrator.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "macros.h"
 4 | #include <string>
 5 | #include <vector>
 6 | #include <opencv2/opencv.hpp>
 7 | 
 8 | cv::Mat preprocess_img(cv::Mat& img, int input_w, int input_h);
 9 | 
10 | //! \class Int8EntropyCalibrator2
11 | //!
12 | //! \brief Implements Entropy calibrator 2.
13 | //!  CalibrationAlgoType is kENTROPY_CALIBRATION_2.
14 | //!
15 | class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 {
16 |  public:
17 |   Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, const char* input_blob_name, bool read_cache = true);
18 | 
19 |   virtual ~Int8EntropyCalibrator2();
20 |   int getBatchSize() const TRT_NOEXCEPT override;
21 |   bool getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT override;
22 |   const void* readCalibrationCache(size_t& length) TRT_NOEXCEPT override;
23 |   void writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT override;
24 | 
25 |  private:
26 |   int batchsize_;
27 |   int input_w_;
28 |   int input_h_;
29 |   int img_idx_;
30 |   std::string img_dir_;
31 |   std::vector<std::string> img_files_;
32 |   size_t input_count_;
33 |   std::string calib_table_name_;
34 |   const char* input_blob_name_;
35 |   bool read_cache_;
36 |   void* device_input_;
37 |   std::vector<char> calib_cache_;
38 | };
39 | 
40 | 


--------------------------------------------------------------------------------
/yolov5/src/cuda_utils.h:
--------------------------------------------------------------------------------
 1 | #ifndef TRTX_CUDA_UTILS_H_
 2 | #define TRTX_CUDA_UTILS_H_
 3 | 
 4 | #include <cuda_runtime_api.h>
 5 | 
 6 | #ifndef CUDA_CHECK
 7 | #define CUDA_CHECK(callstr)\
 8 |     {\
 9 |         cudaError_t error_code = callstr;\
10 |         if (error_code != cudaSuccess) {\
11 |             std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__;\
12 |             assert(0);\
13 |         }\
14 |     }
15 | #endif  // CUDA_CHECK
16 | 
17 | #endif  // TRTX_CUDA_UTILS_H_
18 | 
19 | 


--------------------------------------------------------------------------------
/yolov5/src/macros.h:
--------------------------------------------------------------------------------
 1 | #ifndef __MACROS_H
 2 | #define __MACROS_H
 3 | 
 4 | #include <NvInfer.h>
 5 | 
 6 | #ifdef API_EXPORTS
 7 | #if defined(_MSC_VER)
 8 | #define API __declspec(dllexport)
 9 | #else
10 | #define API __attribute__((visibility("default")))
11 | #endif
12 | #else
13 | 
14 | #if defined(_MSC_VER)
15 | #define API __declspec(dllimport)
16 | #else
17 | #define API
18 | #endif
19 | #endif  // API_EXPORTS
20 | 
21 | #if NV_TENSORRT_MAJOR >= 8
22 | #define TRT_NOEXCEPT noexcept
23 | #define TRT_CONST_ENQUEUE const
24 | #else
25 | #define TRT_NOEXCEPT
26 | #define TRT_CONST_ENQUEUE
27 | #endif
28 | 
29 | #endif  // __MACROS_H
30 | 


--------------------------------------------------------------------------------
/yolov5/src/model.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <NvInfer.h>
 4 | #include <string>
 5 | 
 6 | nvinfer1::ICudaEngine* build_det_engine(unsigned int maxBatchSize, nvinfer1::IBuilder* builder,
 7 |                                         nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt,
 8 |                                         float& gd, float& gw, std::string& wts_name);
 9 | 
10 | nvinfer1::ICudaEngine* build_det_p6_engine(unsigned int maxBatchSize, nvinfer1::IBuilder* builder,
11 |                                            nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt,
12 |                                            float& gd, float& gw, std::string& wts_name);
13 | 
14 | nvinfer1::ICudaEngine* build_cls_engine(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, float& gd, float& gw, std::string& wts_name);
15 | 
16 | nvinfer1::ICudaEngine* build_seg_engine(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, float& gd, float& gw, std::string& wts_name);
17 | 


--------------------------------------------------------------------------------
/yolov5/src/postprocess.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "types.h"
 4 | #include <opencv2/opencv.hpp>
 5 | 
 6 | cv::Rect get_rect(cv::Mat& img, float bbox[4]);
 7 | 
 8 | void nms(std::vector<Detection>& res, float *output, float conf_thresh, float nms_thresh = 0.5);
 9 | 
10 | void batch_nms(std::vector<std::vector<Detection>>& batch_res, float *output, int batch_size, int output_size, float conf_thresh, float nms_thresh = 0.5);
11 | 
12 | void draw_bbox(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch);
13 | 
14 | std::vector<cv::Mat> process_mask(const float* proto, int proto_size, std::vector<Detection>& dets);
15 | 
16 | void draw_mask_bbox(cv::Mat& img, std::vector<Detection>& dets, std::vector<cv::Mat>& masks, std::unordered_map<int, std::string>& labels_map);
17 | 


--------------------------------------------------------------------------------
/yolov5/src/preprocess.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <cuda_runtime.h>
 4 | #include <cstdint>
 5 | #include <opencv2/opencv.hpp>
 6 | 
 7 | void cuda_preprocess_init(int max_image_size);
 8 | void cuda_preprocess_destroy();
 9 | void cuda_preprocess(uint8_t* src, int src_width, int src_height,
10 |                      float* dst, int dst_width, int dst_height,
11 |                      cudaStream_t stream);
12 | void cuda_batch_preprocess(std::vector<cv::Mat>& img_batch,
13 |                            float* dst, int dst_width, int dst_height,
14 |                            cudaStream_t stream);
15 | 
16 | 


--------------------------------------------------------------------------------
/yolov5/src/types.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "config.h"
 4 | 
 5 | struct YoloKernel {
 6 |   int width;
 7 |   int height;
 8 |   float anchors[kNumAnchor * 2];
 9 | };
10 | 
11 | struct alignas(float) Detection {
12 |   float bbox[4];  // center_x center_y w h
13 |   float conf;  // bbox_conf * cls_conf
14 |   float class_id;
15 |   float mask[32];
16 | };
17 | 
18 | 


--------------------------------------------------------------------------------
/yolov7/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.10)
 2 | 
 3 | project(yolov7)
 4 | 
 5 | add_definitions(-std=c++11)
 6 | add_definitions(-DAPI_EXPORTS)
 7 | set(CMAKE_CXX_STANDARD 11)
 8 | set(CMAKE_BUILD_TYPE Debug)
 9 | 
10 | set(CMAKE_CUDA_COMPILER /usr/local/cuda/bin/nvcc)
11 | enable_language(CUDA)
12 | 
13 | include_directories(${PROJECT_SOURCE_DIR}/include)
14 | include_directories(${PROJECT_SOURCE_DIR}/plugin)
15 | 
16 | # include and link dirs of cuda and tensorrt, you need adapt them if yours are different
17 | if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
18 |   message("embed_platform on")
19 |   include_directories(/usr/local/cuda/targets/aarch64-linux/include)
20 |   link_directories(/usr/local/cuda/targets/aarch64-linux/lib)
21 | else()
22 |   message("embed_platform off")
23 |   # cuda
24 |   include_directories(/usr/local/cuda/include)
25 |   link_directories(/usr/local/cuda/lib64)
26 | 
27 |   # tensorrt
28 |   include_directories(/home/nvidia/TensorRT-8.2.5.1/include)
29 |   link_directories(/home/nvidia/TensorRT-8.2.5.1/lib)
30 | endif()
31 | 
32 | add_library(myplugins SHARED ${PROJECT_SOURCE_DIR}/plugin/yololayer.cu)
33 | target_link_libraries(myplugins nvinfer cudart)
34 | 
35 | find_package(OpenCV)
36 | include_directories(${OpenCV_INCLUDE_DIRS})
37 | 
38 | file(GLOB_RECURSE SRCS ${PROJECT_SOURCE_DIR}/src/*.cpp ${PROJECT_SOURCE_DIR}/src/*.cu)
39 | add_executable(yolov7 main.cpp ${SRCS})
40 | 
41 | target_link_libraries(yolov7 nvinfer)
42 | target_link_libraries(yolov7 cudart)
43 | target_link_libraries(yolov7 myplugins)
44 | target_link_libraries(yolov7 ${OpenCV_LIBS})
45 | 
46 | 


--------------------------------------------------------------------------------
/yolov7/images:
--------------------------------------------------------------------------------
1 | ../yolov3-spp/samples


--------------------------------------------------------------------------------
/yolov7/include/block.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "NvInfer.h"
 4 | #include <string>
 5 | #include <vector>
 6 | #include <map>
 7 | 
 8 | std::map<std::string, nvinfer1::Weights> loadWeights(const std::string file);
 9 | 
10 | nvinfer1::IElementWiseLayer* convBnSilu(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights>& weightMap, nvinfer1::ITensor& input, int c2, int k, int s, int p, std::string lname);
11 | 
12 | nvinfer1::ILayer* ReOrg(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights>& weightMap, nvinfer1::ITensor& input, int inch);
13 | 
14 | nvinfer1::ILayer* DownC(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights>& weightMap, nvinfer1::ITensor& input, int c1, int c2, const std::string& lname);
15 | 
16 | nvinfer1::IElementWiseLayer* SPPCSPC(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights>& weightMap, nvinfer1::ITensor& input, int c2, const std::string& lname);
17 | 
18 | nvinfer1::IElementWiseLayer* RepConv(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights>& weightMap, nvinfer1::ITensor& input, int c2, int k, int s, const std::string& lname);
19 | 
20 | nvinfer1::IActivationLayer* convBlockLeakRelu(nvinfer1::INetworkDefinition* network, std::map<std::string, nvinfer1::Weights>& weightMap, nvinfer1::ITensor& input, int outch, int ksize, int s, int p, std::string lname);
21 | 
22 | nvinfer1::IPluginV2Layer* addYoLoLayer(nvinfer1::INetworkDefinition *network, std::map<std::string, nvinfer1::Weights>& weightMap, std::string lname, std::vector<nvinfer1::IConvolutionLayer*> dets);
23 | 
24 | 


--------------------------------------------------------------------------------
/yolov7/include/calibrator.h:
--------------------------------------------------------------------------------
 1 | #ifndef ENTROPY_CALIBRATOR_H
 2 | #define ENTROPY_CALIBRATOR_H
 3 | 
 4 | #include <NvInfer.h>
 5 | #include <string>
 6 | #include <vector>
 7 | #include "macros.h"
 8 | 
 9 | //! \class Int8EntropyCalibrator2
10 | //!
11 | //! \brief Implements Entropy calibrator 2.
12 | //!  CalibrationAlgoType is kENTROPY_CALIBRATION_2.
13 | //!
14 | class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2
15 | {
16 | public:
17 |     Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, const char* input_blob_name, bool read_cache = true);
18 | 
19 |     virtual ~Int8EntropyCalibrator2();
20 |     int getBatchSize() const TRT_NOEXCEPT override;
21 |     bool getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT override;
22 |     const void* readCalibrationCache(size_t& length) TRT_NOEXCEPT override;
23 |     void writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT override;
24 | 
25 | private:
26 |     int batchsize_;
27 |     int input_w_;
28 |     int input_h_;
29 |     int img_idx_;
30 |     std::string img_dir_;
31 |     std::vector<std::string> img_files_;
32 |     size_t input_count_;
33 |     std::string calib_table_name_;
34 |     const char* input_blob_name_;
35 |     bool read_cache_;
36 |     void* device_input_;
37 |     std::vector<char> calib_cache_;
38 | };
39 | 
40 | #endif // ENTROPY_CALIBRATOR_H
41 | 


--------------------------------------------------------------------------------
/yolov7/include/cuda_utils.h:
--------------------------------------------------------------------------------
 1 | #ifndef TRTX_CUDA_UTILS_H_
 2 | #define TRTX_CUDA_UTILS_H_
 3 | 
 4 | #include <cuda_runtime_api.h>
 5 | 
 6 | #ifndef CUDA_CHECK
 7 | #define CUDA_CHECK(callstr)\
 8 |     {\
 9 |         cudaError_t error_code = callstr;\
10 |         if (error_code != cudaSuccess) {\
11 |             std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__;\
12 |             assert(0);\
13 |         }\
14 |     }
15 | #endif  // CUDA_CHECK
16 | 
17 | #endif  // TRTX_CUDA_UTILS_H_
18 | 
19 | 


--------------------------------------------------------------------------------
/yolov7/include/macros.h:
--------------------------------------------------------------------------------
 1 | #ifndef __MACROS_H
 2 | #define __MACROS_H
 3 | 
 4 | #include "NvInfer.h"
 5 | 
 6 | #ifdef API_EXPORTS
 7 | #if defined(_MSC_VER)
 8 | #define API __declspec(dllexport)
 9 | #else
10 | #define API __attribute__((visibility("default")))
11 | #endif
12 | #else
13 | 
14 | #if defined(_MSC_VER)
15 | #define API __declspec(dllimport)
16 | #else
17 | #define API
18 | #endif
19 | #endif  // API_EXPORTS
20 | 
21 | #if NV_TENSORRT_MAJOR >= 8
22 | #define TRT_NOEXCEPT noexcept
23 | #define TRT_CONST_ENQUEUE const
24 | #else
25 | #define TRT_NOEXCEPT
26 | #define TRT_CONST_ENQUEUE
27 | #endif
28 | 
29 | #endif  // __MACROS_H
30 | 


--------------------------------------------------------------------------------
/yolov7/include/model.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "NvInfer.h"
 4 | #include <string>
 5 | 
 6 | nvinfer1::IHostMemory* build_engine_yolov7e6e(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path);
 7 | nvinfer1::IHostMemory* build_engine_yolov7d6(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path);
 8 | nvinfer1::IHostMemory* build_engine_yolov7e6(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path);
 9 | nvinfer1::IHostMemory* build_engine_yolov7w6(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path);
10 | nvinfer1::IHostMemory* build_engine_yolov7x(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path);
11 | nvinfer1::IHostMemory* build_engine_yolov7(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, const std::string& wts_path);
12 | nvinfer1::IHostMemory* build_engine_yolov7_tiny(unsigned int maxBatchSize, nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, nvinfer1::DataType dt, std::string& wts_name);
13 | 


--------------------------------------------------------------------------------
/yolov7/include/postprocess.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "types.h"
 4 | #include <opencv2/opencv.hpp>
 5 | 
 6 | cv::Rect get_rect(cv::Mat& img, float bbox[4]);
 7 | 
 8 | void nms(std::vector<Detection>& res, float *output, float conf_thresh, float nms_thresh = 0.5);
 9 | 
10 | void batch_nms(std::vector<std::vector<Detection>>& batch_res, float *output, int batch_size, int output_size, float conf_thresh, float nms_thresh = 0.5);
11 | 
12 | void draw_bbox(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch);
13 | 
14 | 


--------------------------------------------------------------------------------
/yolov7/include/preprocess.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <cuda_runtime.h>
 4 | #include <cstdint>
 5 | #include <opencv2/opencv.hpp>
 6 | #include <iostream>
 7 | 
 8 | void cuda_preprocess_init(int max_image_size);
 9 | void cuda_preprocess_destroy();
10 | void cuda_preprocess(uint8_t* src, int src_width, int src_height,
11 |                      float* dst, int dst_width, int dst_height,
12 |                      cudaStream_t stream);
13 | void cuda_batch_preprocess(std::vector<cv::Mat>& img_batch,
14 |                            float* dst, int dst_width, int dst_height,
15 |                            cudaStream_t stream);
16 | 
17 | 


--------------------------------------------------------------------------------
/yolov7/include/types.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "config.h"
 4 | 
 5 | struct YoloKernel {
 6 |   int width;
 7 |   int height;
 8 |   float anchors[kNumAnchor * 2];
 9 | };
10 | 
11 | struct alignas(float) Detection {
12 |   //center_x center_y w h
13 |   float bbox[4];
14 |   float conf;  // bbox_conf * cls_conf
15 |   float class_id;
16 | };
17 | 
18 | 


--------------------------------------------------------------------------------
/yolov7/include/utils.h:
--------------------------------------------------------------------------------
 1 | #ifndef TRTX_YOLOV7_UTILS_H_
 2 | #define TRTX_YOLOV7_UTILS_H_
 3 | 
 4 | #include <dirent.h>
 5 | #include <opencv2/opencv.hpp>
 6 | 
 7 | static inline cv::Mat preprocess_img(cv::Mat& img, int input_w, int input_h) {
 8 |     int w, h, x, y;
 9 |     float r_w = input_w / (img.cols*1.0);
10 |     float r_h = input_h / (img.rows*1.0);
11 |     if (r_h > r_w) {
12 |         w = input_w;
13 |         h = r_w * img.rows;
14 |         x = 0;
15 |         y = (input_h - h) / 2;
16 |     } else {
17 |         w = r_h * img.cols;
18 |         h = input_h;
19 |         x = (input_w - w) / 2;
20 |         y = 0;
21 |     }
22 |     cv::Mat re(h, w, CV_8UC3);
23 |     cv::resize(img, re, re.size(), 0, 0, cv::INTER_LINEAR);
24 |     cv::Mat out(input_h, input_w, CV_8UC3, cv::Scalar(128, 128, 128));
25 |     re.copyTo(out(cv::Rect(x, y, re.cols, re.rows)));
26 |     return out;
27 | }
28 | 
29 | static inline int read_files_in_dir(const char *p_dir_name, std::vector<std::string> &file_names) {
30 |     DIR *p_dir = opendir(p_dir_name);
31 |     if (p_dir == nullptr) {
32 |         return -1;
33 |     }
34 | 
35 |     struct dirent* p_file = nullptr;
36 |     while ((p_file = readdir(p_dir)) != nullptr) {
37 |         if (strcmp(p_file->d_name, ".") != 0 &&
38 |             strcmp(p_file->d_name, "..") != 0) {
39 |             //std::string cur_file_name(p_dir_name);
40 |             //cur_file_name += "/";
41 |             //cur_file_name += p_file->d_name;
42 |             std::string cur_file_name(p_file->d_name);
43 |             file_names.push_back(cur_file_name);
44 |         }
45 |     }
46 | 
47 |     closedir(p_dir);
48 |     return 0;
49 | }
50 | 
51 | #endif  // TRTX_YOLOV7_UTILS_H_
52 | 
53 | 


--------------------------------------------------------------------------------
/yolov8/include/calibrator.h:
--------------------------------------------------------------------------------
 1 | #ifndef ENTROPY_CALIBRATOR_H
 2 | #define ENTROPY_CALIBRATOR_H
 3 | 
 4 | #include <NvInfer.h>
 5 | #include <string>
 6 | #include <vector>
 7 | #include "macros.h"
 8 | 
 9 | //! \class Int8EntropyCalibrator2
10 | //!
11 | //! \brief Implements Entropy calibrator 2.
12 | //!  CalibrationAlgoType is kENTROPY_CALIBRATION_2.
13 | //!
14 | class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 {
15 |    public:
16 |     Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name,
17 |                            const char* input_blob_name, bool read_cache = true);
18 |     virtual ~Int8EntropyCalibrator2();
19 |     int getBatchSize() const TRT_NOEXCEPT override;
20 |     bool getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT override;
21 |     const void* readCalibrationCache(size_t& length) TRT_NOEXCEPT override;
22 |     void writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT override;
23 | 
24 |    private:
25 |     int batchsize_;
26 |     int input_w_;
27 |     int input_h_;
28 |     int img_idx_;
29 |     std::string img_dir_;
30 |     std::vector<std::string> img_files_;
31 |     size_t input_count_;
32 |     std::string calib_table_name_;
33 |     const char* input_blob_name_;
34 |     bool read_cache_;
35 |     void* device_input_;
36 |     std::vector<char> calib_cache_;
37 | };
38 | 
39 | #endif  // ENTROPY_CALIBRATOR_H
40 | 


--------------------------------------------------------------------------------
/yolov8/include/config.h:
--------------------------------------------------------------------------------
 1 | #define USE_FP16
 2 | //#define USE_FP32
 3 | //#define USE_INT8
 4 | 
 5 | const static char* kInputTensorName = "images";
 6 | const static char* kOutputTensorName = "output";
 7 | const static int kNumClass = 80;
 8 | const static int kBatchSize = 1;
 9 | const static int kGpuId = 0;
10 | const static int kInputH = 640;
11 | const static int kInputW = 640;
12 | const static float kNmsThresh = 0.45f;
13 | const static float kConfThresh = 0.5f;
14 | const static float kConfThreshKeypoints = 0.5f;  // keypoints confidence
15 | const static int kMaxInputImageSize = 3000 * 3000;
16 | const static int kMaxNumOutputBbox = 1000;
17 | //Quantization input image folder path
18 | const static char* kInputQuantizationFolder = "./coco_calib";
19 | 
20 | // Classfication model's number of classes
21 | constexpr static int kClsNumClass = 1000;
22 | // Classfication model's input shape
23 | constexpr static int kClsInputH = 224;
24 | constexpr static int kClsInputW = 224;
25 | 
26 | // pose model's number of classes
27 | constexpr static int kPoseNumClass = 1;
28 | const static int kNumberOfPoints = 17;  // number of keypoints total
29 | 
30 | // obb model's number of classes
31 | constexpr static int kObbNumClass = 15;
32 | 


--------------------------------------------------------------------------------
/yolov8/include/cuda_utils.h:
--------------------------------------------------------------------------------
 1 | #ifndef TRTX_CUDA_UTILS_H_
 2 | #define TRTX_CUDA_UTILS_H_
 3 | 
 4 | #include <cuda_runtime_api.h>
 5 | 
 6 | #ifndef CUDA_CHECK
 7 | #define CUDA_CHECK(callstr)                                                                    \
 8 |     {                                                                                          \
 9 |         cudaError_t error_code = callstr;                                                      \
10 |         if (error_code != cudaSuccess) {                                                       \
11 |             std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \
12 |             assert(0);                                                                         \
13 |         }                                                                                      \
14 |     }
15 | #endif  // CUDA_CHECK
16 | 
17 | #endif  // TRTX_CUDA_UTILS_H_
18 | 


--------------------------------------------------------------------------------
/yolov8/include/macros.h:
--------------------------------------------------------------------------------
 1 | #ifndef __MACROS_H
 2 | #define __MACROS_H
 3 | 
 4 | #include "NvInfer.h"
 5 | 
 6 | #ifdef API_EXPORTS
 7 | #if defined(_MSC_VER)
 8 | #define API __declspec(dllexport)
 9 | #else
10 | #define API __attribute__((visibility("default")))
11 | #endif
12 | #else
13 | 
14 | #if defined(_MSC_VER)
15 | #define API __declspec(dllimport)
16 | #else
17 | #define API
18 | #endif
19 | #endif  // API_EXPORTS
20 | 
21 | #if NV_TENSORRT_MAJOR >= 8
22 | #define TRT_NOEXCEPT noexcept
23 | #define TRT_CONST_ENQUEUE const
24 | #else
25 | #define TRT_NOEXCEPT
26 | #define TRT_CONST_ENQUEUE
27 | #endif
28 | 
29 | #endif  // __MACROS_H
30 | 


--------------------------------------------------------------------------------
/yolov8/include/preprocess.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <map>
 4 | #include <opencv2/opencv.hpp>
 5 | #include "NvInfer.h"
 6 | #include "types.h"
 7 | 
 8 | void cuda_preprocess_init(int max_image_size);
 9 | 
10 | void cuda_preprocess_destroy();
11 | 
12 | void cuda_preprocess(uint8_t* src, int src_width, int src_height, float* dst, int dst_width, int dst_height,
13 |                      cudaStream_t stream);
14 | 
15 | void cuda_batch_preprocess(std::vector<cv::Mat>& img_batch, float* dst, int dst_width, int dst_height,
16 |                            cudaStream_t stream);
17 | 


--------------------------------------------------------------------------------
/yolov8/include/types.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include "config.h"
 3 | 
 4 | struct alignas(float) Detection {
 5 |     //center_x center_y w h
 6 |     float bbox[4];
 7 |     float conf;  // bbox_conf * cls_conf
 8 |     float class_id;
 9 |     float mask[32];
10 |     float keypoints[kNumberOfPoints * 3];  // keypoints array with dynamic size based on kNumberOfPoints
11 |     float angle;                           // obb angle
12 | };
13 | 
14 | struct AffineMatrix {
15 |     float value[6];
16 | };
17 | 
18 | const int bbox_element =
19 |         sizeof(AffineMatrix) / sizeof(float) + 1;  // left, top, right, bottom, confidence, class, keepflag
20 | 


--------------------------------------------------------------------------------
/yolov9/images:
--------------------------------------------------------------------------------
1 | ../yolov3-spp/samples


--------------------------------------------------------------------------------
/yolov9/include/calibrator.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "macros.h"
 4 | #include <string>
 5 | #include <vector>
 6 | 
 7 | //! \class Int8EntropyCalibrator2
 8 | //!
 9 | //! \brief Implements Entropy calibrator 2.
10 | //!  CalibrationAlgoType is kENTROPY_CALIBRATION_2.
11 | //!
12 | class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 {
13 | public:
14 |     Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, const char* input_blob_name, bool read_cache = true);
15 | 
16 |     virtual ~Int8EntropyCalibrator2();
17 |     int getBatchSize() const TRT_NOEXCEPT override;
18 |     bool getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT override;
19 |     const void* readCalibrationCache(size_t& length) TRT_NOEXCEPT override;
20 |     void writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT override;
21 | 
22 | private:
23 |     int batchsize_;
24 |     int input_w_;
25 |     int input_h_;
26 |     int img_idx_;
27 |     std::string img_dir_;
28 |     std::vector<std::string> img_files_;
29 |     size_t input_count_;
30 |     std::string calib_table_name_;
31 |     const char* input_blob_name_;
32 |     bool read_cache_;
33 |     void* device_input_;
34 |     std::vector<char> calib_cache_;
35 | };
36 | 
37 | 


--------------------------------------------------------------------------------
/yolov9/include/cuda_utils.h:
--------------------------------------------------------------------------------
 1 | #ifndef TRTX_CUDA_UTILS_H_
 2 | #define TRTX_CUDA_UTILS_H_
 3 | 
 4 | #include <cuda_runtime_api.h>
 5 | 
 6 | #ifndef CUDA_CHECK
 7 | #define CUDA_CHECK(callstr)\
 8 |     {\
 9 |         cudaError_t error_code = callstr;\
10 |         if (error_code != cudaSuccess) {\
11 |             std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__;\
12 |             assert(0);\
13 |         }\
14 |     }
15 | #endif  // CUDA_CHECK
16 | 
17 | #endif  // TRTX_CUDA_UTILS_H_
18 | 
19 | 


--------------------------------------------------------------------------------
/yolov9/include/macros.h:
--------------------------------------------------------------------------------
 1 | #ifndef __MACROS_H
 2 | #define __MACROS_H
 3 | 
 4 | #include <NvInfer.h>
 5 | 
 6 | #ifdef API_EXPORTS
 7 | #if defined(_MSC_VER)
 8 | #define API __declspec(dllexport)
 9 | #else
10 | #define API __attribute__((visibility("default")))
11 | #endif
12 | #else
13 | 
14 | #if defined(_MSC_VER)
15 | #define API __declspec(dllimport)
16 | #else
17 | #define API
18 | #endif
19 | #endif  // API_EXPORTS
20 | 
21 | #if NV_TENSORRT_MAJOR >= 8
22 | #define TRT_NOEXCEPT noexcept
23 | #define TRT_CONST_ENQUEUE const
24 | #else
25 | #define TRT_NOEXCEPT
26 | #define TRT_CONST_ENQUEUE
27 | #endif
28 | 
29 | #endif  // __MACROS_H
30 | 


--------------------------------------------------------------------------------
/yolov9/include/postprocess.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "types.h"
 4 | #include <opencv2/opencv.hpp>
 5 | #include <cuda_runtime.h>
 6 | cv::Rect get_rect(cv::Mat& img, float bbox[4]);
 7 | 
 8 | void nms(std::vector<Detection>& res, float *output, float conf_thresh, float nms_thresh = 0.5);
 9 | 
10 | void batch_nms(std::vector<std::vector<Detection>>& batch_res, float *output, int batch_size, int output_size, float conf_thresh, float nms_thresh = 0.5);
11 | 
12 | void draw_bbox(std::vector<cv::Mat>& img_batch, std::vector<std::vector<Detection>>& res_batch);
13 | 
14 | std::vector<cv::Mat> process_mask(const float* proto, int proto_size, std::vector<Detection>& dets);
15 | 
16 | void draw_mask_bbox(cv::Mat& img, std::vector<Detection>& dets, std::vector<cv::Mat>& masks, std::unordered_map<int, std::string>& labels_map);
17 | // cuda NMS
18 | void cuda_decode(float* predict, int num_bboxes, float confidence_threshold,float* parray,int max_objects, cudaStream_t stream);
19 | void cuda_nms(float* parray, float nms_threshold, int max_objects, cudaStream_t stream);
20 | void batch_process(std::vector<std::vector<Detection>> &res_batch, const float* decode_ptr_host, int batch_size, int bbox_element, const std::vector<cv::Mat>& img_batch);


--------------------------------------------------------------------------------
/yolov9/include/preprocess.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <cuda_runtime.h>
 4 | #include <cstdint>
 5 | #include <opencv2/opencv.hpp>
 6 | 
 7 | void cuda_preprocess_init(int max_image_size);
 8 | void cuda_preprocess_destroy();
 9 | void cuda_preprocess(uint8_t* src, int src_width, int src_height,
10 |                      float* dst, int dst_width, int dst_height,
11 |                      cudaStream_t stream);
12 | void cuda_batch_preprocess(std::vector<cv::Mat>& img_batch,
13 |                            float* dst, int dst_width, int dst_height,
14 |                            cudaStream_t stream);
15 | 
16 | 


--------------------------------------------------------------------------------
/yolov9/include/types.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "config.h"
 4 | 
 5 | struct YoloKernel {
 6 |     int width;
 7 |     int height;
 8 |     float anchors[kNumAnchor * 2];
 9 | };
10 | 
11 | struct alignas(float) Detection {
12 |     float bbox[4];  // center_x center_y w h
13 |     float conf;  // bbox_conf * cls_conf
14 |     float class_id;
15 |     float mask[32];
16 | };
17 | const int bbox_element = 7; // center_x, center_y, w, h, conf, cls, obj
18 | 


--------------------------------------------------------------------------------