├── .github └── FUNDING.yml ├── config_infer_primary_rfdetr_seg.txt ├── config_infer_primary_yoloV7_seg.txt ├── config_infer_primary_yolo11_seg.txt ├── config_infer_primary_yoloV5_seg.txt ├── config_infer_primary_yoloV7_mask.txt ├── config_infer_primary_yoloV8_seg.txt ├── labels.txt ├── deepstream_app_config.txt ├── LICENSE.md ├── nvdsinfer_custom_impl_Yolo_seg ├── trt_plugins │ ├── roiAlignPlugin │ │ ├── roiAlignKernel.h │ │ ├── roiAlignPlugin.h │ │ ├── roiAlignKernel.cu │ │ └── roiAlignPlugin.cpp │ ├── efficientNMSPlugin │ │ ├── efficientNMSInference.h │ │ ├── efficientNMSParameters.h │ │ ├── efficientNMSPlugin.h │ │ ├── efficientNMSInference.cuh │ │ ├── efficientNMSPlugin.cpp │ │ └── efficientNMSInference.cu │ ├── common.cpp │ └── common.h ├── Makefile └── nvdsparseseg_Yolo.cpp ├── README.md ├── docs ├── RFDETR_Seg.md ├── YOLO11_Seg.md ├── YOLOv8_Seg.md ├── YOLOv5_Seg.md ├── YOLOv7_Seg.md └── YOLOv7_Mask.md └── utils ├── export_yoloV5_seg.py ├── export_yoloV7_seg.py ├── export_yoloV7_mask.py ├── export_yolo11_seg.py ├── export_yoloV8_seg.py └── export_rfdetr_seg.py /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | #github: [marcoslucianops] 2 | custom: ['https://www.buymeacoffee.com/marcoslucianops'] 3 | -------------------------------------------------------------------------------- /config_infer_primary_rfdetr_seg.txt: -------------------------------------------------------------------------------- 1 | [property] 2 | gpu-id=0 3 | net-scale-factor=0.0039215697906911373 4 | model-color-format=0 5 | onnx-file=rf-detr-seg-preview.onnx 6 | model-engine-file=rf-detr-seg-preview.onnx_b1_gpu0_fp32.engine 7 | #int8-calib-file=calib.table 8 | labelfile-path=labels.txt 9 | batch-size=1 10 | network-mode=0 11 | num-detected-classes=91 12 | interval=0 13 | gie-unique-id=1 14 | process-mode=1 15 | network-type=3 16 | cluster-mode=4 17 | maintain-aspect-ratio=0 18 | scaling-filter=1 19 | scaling-compute-hw=0 20 | 
force-implicit-batch-dim=0 21 | #workspace-size=2000 22 | parse-bbox-instance-mask-func-name=NvDsInferParseYoloSeg 23 | custom-lib-path=nvdsinfer_custom_impl_Yolo_seg/libnvdsinfer_custom_impl_Yolo_seg.so 24 | output-instance-mask=1 25 | segmentation-threshold=0.5 26 | 27 | [class-attrs-all] 28 | pre-cluster-threshold=0.25 29 | -------------------------------------------------------------------------------- /config_infer_primary_yoloV7_seg.txt: -------------------------------------------------------------------------------- 1 | [property] 2 | gpu-id=0 3 | net-scale-factor=0.0039215697906911373 4 | model-color-format=0 5 | onnx-file=yolov7-seg.onnx 6 | model-engine-file=yolov7-seg.onnx_b1_gpu0_fp32.engine 7 | #int8-calib-file=calib.table 8 | labelfile-path=labels.txt 9 | batch-size=1 10 | network-mode=0 11 | num-detected-classes=80 12 | interval=0 13 | gie-unique-id=1 14 | process-mode=1 15 | network-type=3 16 | cluster-mode=4 17 | maintain-aspect-ratio=1 18 | symmetric-padding=1 19 | scaling-filter=1 20 | scaling-compute-hw=0 21 | force-implicit-batch-dim=0 22 | #workspace-size=2000 23 | parse-bbox-instance-mask-func-name=NvDsInferParseYoloSeg 24 | custom-lib-path=nvdsinfer_custom_impl_Yolo_seg/libnvdsinfer_custom_impl_Yolo_seg.so 25 | output-instance-mask=1 26 | segmentation-threshold=0.5 27 | 28 | [class-attrs-all] 29 | pre-cluster-threshold=0.25 30 | -------------------------------------------------------------------------------- /config_infer_primary_yolo11_seg.txt: -------------------------------------------------------------------------------- 1 | [property] 2 | gpu-id=0 3 | net-scale-factor=0.0039215697906911373 4 | model-color-format=0 5 | onnx-file=yolo11s-seg.onnx 6 | model-engine-file=yolo11s-seg.onnx_b1_gpu0_fp32.engine 7 | #int8-calib-file=calib.table 8 | labelfile-path=labels.txt 9 | batch-size=1 10 | network-mode=0 11 | num-detected-classes=80 12 | interval=0 13 | gie-unique-id=1 14 | process-mode=1 15 | network-type=3 16 | cluster-mode=4 17 | 
maintain-aspect-ratio=1 18 | symmetric-padding=1 19 | scaling-filter=1 20 | scaling-compute-hw=0 21 | force-implicit-batch-dim=0 22 | #workspace-size=2000 23 | parse-bbox-instance-mask-func-name=NvDsInferParseYoloSeg 24 | custom-lib-path=nvdsinfer_custom_impl_Yolo_seg/libnvdsinfer_custom_impl_Yolo_seg.so 25 | output-instance-mask=1 26 | segmentation-threshold=0.5 27 | 28 | [class-attrs-all] 29 | pre-cluster-threshold=0.25 30 | -------------------------------------------------------------------------------- /config_infer_primary_yoloV5_seg.txt: -------------------------------------------------------------------------------- 1 | [property] 2 | gpu-id=0 3 | net-scale-factor=0.0039215697906911373 4 | model-color-format=0 5 | onnx-file=yolov5s-seg.onnx 6 | model-engine-file=yolov5s-seg.onnx_b1_gpu0_fp32.engine 7 | #int8-calib-file=calib.table 8 | labelfile-path=labels.txt 9 | batch-size=1 10 | network-mode=0 11 | num-detected-classes=80 12 | interval=0 13 | gie-unique-id=1 14 | process-mode=1 15 | network-type=3 16 | cluster-mode=4 17 | maintain-aspect-ratio=1 18 | symmetric-padding=1 19 | scaling-filter=1 20 | scaling-compute-hw=0 21 | force-implicit-batch-dim=0 22 | #workspace-size=2000 23 | parse-bbox-instance-mask-func-name=NvDsInferParseYoloSeg 24 | custom-lib-path=nvdsinfer_custom_impl_Yolo_seg/libnvdsinfer_custom_impl_Yolo_seg.so 25 | output-instance-mask=1 26 | segmentation-threshold=0.5 27 | 28 | [class-attrs-all] 29 | pre-cluster-threshold=0.25 30 | -------------------------------------------------------------------------------- /config_infer_primary_yoloV7_mask.txt: -------------------------------------------------------------------------------- 1 | [property] 2 | gpu-id=0 3 | net-scale-factor=0.0039215697906911373 4 | model-color-format=0 5 | onnx-file=yolov7-mask.onnx 6 | model-engine-file=yolov7-mask.onnx_b1_gpu0_fp32.engine 7 | #int8-calib-file=calib.table 8 | labelfile-path=labels.txt 9 | batch-size=1 10 | network-mode=0 11 | num-detected-classes=80 12 | 
interval=0 13 | gie-unique-id=1 14 | process-mode=1 15 | network-type=3 16 | cluster-mode=4 17 | maintain-aspect-ratio=1 18 | symmetric-padding=1 19 | scaling-filter=1 20 | scaling-compute-hw=0 21 | force-implicit-batch-dim=0 22 | #workspace-size=2000 23 | parse-bbox-instance-mask-func-name=NvDsInferParseYoloSeg 24 | custom-lib-path=nvdsinfer_custom_impl_Yolo_seg/libnvdsinfer_custom_impl_Yolo_seg.so 25 | output-instance-mask=1 26 | segmentation-threshold=0.5 27 | 28 | [class-attrs-all] 29 | pre-cluster-threshold=0.25 30 | -------------------------------------------------------------------------------- /config_infer_primary_yoloV8_seg.txt: -------------------------------------------------------------------------------- 1 | [property] 2 | gpu-id=0 3 | net-scale-factor=0.0039215697906911373 4 | model-color-format=0 5 | onnx-file=yolov8s-seg.onnx 6 | model-engine-file=yolov8s-seg.onnx_b1_gpu0_fp32.engine 7 | #int8-calib-file=calib.table 8 | labelfile-path=labels.txt 9 | batch-size=1 10 | network-mode=0 11 | num-detected-classes=80 12 | interval=0 13 | gie-unique-id=1 14 | process-mode=1 15 | network-type=3 16 | cluster-mode=4 17 | maintain-aspect-ratio=1 18 | symmetric-padding=1 19 | scaling-filter=1 20 | scaling-compute-hw=0 21 | force-implicit-batch-dim=0 22 | #workspace-size=2000 23 | parse-bbox-instance-mask-func-name=NvDsInferParseYoloSeg 24 | custom-lib-path=nvdsinfer_custom_impl_Yolo_seg/libnvdsinfer_custom_impl_Yolo_seg.so 25 | output-instance-mask=1 26 | segmentation-threshold=0.5 27 | 28 | [class-attrs-all] 29 | pre-cluster-threshold=0.25 30 | -------------------------------------------------------------------------------- /labels.txt: -------------------------------------------------------------------------------- 1 | person 2 | bicycle 3 | car 4 | motorbike 5 | aeroplane 6 | bus 7 | train 8 | truck 9 | boat 10 | traffic light 11 | fire hydrant 12 | stop sign 13 | parking meter 14 | bench 15 | bird 16 | cat 17 | dog 18 | horse 19 | sheep 20 | cow 21 | 
elephant 22 | bear 23 | zebra 24 | giraffe 25 | backpack 26 | umbrella 27 | handbag 28 | tie 29 | suitcase 30 | frisbee 31 | skis 32 | snowboard 33 | sports ball 34 | kite 35 | baseball bat 36 | baseball glove 37 | skateboard 38 | surfboard 39 | tennis racket 40 | bottle 41 | wine glass 42 | cup 43 | fork 44 | knife 45 | spoon 46 | bowl 47 | banana 48 | apple 49 | sandwich 50 | orange 51 | broccoli 52 | carrot 53 | hot dog 54 | pizza 55 | donut 56 | cake 57 | chair 58 | sofa 59 | pottedplant 60 | bed 61 | diningtable 62 | toilet 63 | tvmonitor 64 | laptop 65 | mouse 66 | remote 67 | keyboard 68 | cell phone 69 | microwave 70 | oven 71 | toaster 72 | sink 73 | refrigerator 74 | book 75 | clock 76 | vase 77 | scissors 78 | teddy bear 79 | hair drier 80 | toothbrush 81 | -------------------------------------------------------------------------------- /deepstream_app_config.txt: -------------------------------------------------------------------------------- 1 | [application] 2 | enable-perf-measurement=1 3 | perf-measurement-interval-sec=5 4 | 5 | [tiled-display] 6 | enable=1 7 | rows=1 8 | columns=1 9 | width=1280 10 | height=720 11 | gpu-id=0 12 | nvbuf-memory-type=0 13 | 14 | [source0] 15 | enable=1 16 | type=3 17 | uri=file:///opt/nvidia/deepstream/deepstream/samples/streams/sample_1080p_h264.mp4 18 | num-sources=1 19 | gpu-id=0 20 | cudadec-memtype=0 21 | 22 | [sink0] 23 | enable=1 24 | type=2 25 | sync=0 26 | gpu-id=0 27 | nvbuf-memory-type=0 28 | 29 | [osd] 30 | enable=1 31 | display-mask=1 32 | gpu-id=0 33 | border-width=5 34 | text-size=15 35 | text-color=1;1;1;1; 36 | text-bg-color=0.3;0.3;0.3;1 37 | font=Serif 38 | show-clock=0 39 | clock-x-offset=800 40 | clock-y-offset=820 41 | clock-text-size=12 42 | clock-color=1;0;0;0 43 | nvbuf-memory-type=0 44 | 45 | [streammux] 46 | gpu-id=0 47 | live-source=0 48 | batch-size=1 49 | batched-push-timeout=40000 50 | width=1920 51 | height=1080 52 | enable-padding=0 53 | nvbuf-memory-type=0 54 | 55 | [primary-gie] 56 | 
enable=1 57 | gpu-id=0 58 | gie-unique-id=1 59 | nvbuf-memory-type=0 60 | config-file=config_infer_primary_yoloV8_seg.txt 61 | 62 | [tests] 63 | file-loop=0 64 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023-2025, Marcos Luciano Piropo Santos. 4 | Copyright (c) 2017-2025, NVIDIA CORPORATION. All rights reserved. 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | -------------------------------------------------------------------------------- /nvdsinfer_custom_impl_Yolo_seg/trt_plugins/roiAlignPlugin/roiAlignKernel.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
3 | * SPDX-License-Identifier: Apache-2.0 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | #ifndef TRT_ROIALIGN_KERNEL_H 18 | #define TRT_ROIALIGN_KERNEL_H 19 | 20 | #include 21 | #include 22 | 23 | template 24 | cudaError_t RoiAlignImpl(cudaStream_t stream, int32_t const maxThreadsPerBlock, T const* bottomData, 25 | T const spatialScale, int32_t const numRois, int32_t const channels, int32_t const height, int32_t const width, 26 | int32_t const pooledHeight, int32_t const pooledWidth, int32_t const samplingRatio, T const* bottomRois, T* topData, 27 | int32_t const isModeAvg, int32_t const* batchIndicesPtr, int32_t const aligned); 28 | 29 | #endif // TRT_ROIALIGN_KERNEL_H 30 | -------------------------------------------------------------------------------- /nvdsinfer_custom_impl_Yolo_seg/trt_plugins/efficientNMSPlugin/efficientNMSInference.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: Apache-2.0 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 
7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | #ifndef TRT_EFFICIENT_NMS_INFERENCE_H 19 | #define TRT_EFFICIENT_NMS_INFERENCE_H 20 | 21 | #include "../common.h" 22 | 23 | #include "efficientNMSParameters.h" 24 | 25 | size_t EfficientNMSWorkspaceSize( 26 | int32_t batchSize, int32_t numScoreElements, int32_t numClasses, nvinfer1::DataType datatype); 27 | 28 | pluginStatus_t EfficientNMSInference(nvinfer1::plugin::EfficientNMSParameters param, void const* boxesInput, 29 | void const* scoresInput, void const* anchorsInput, void* numDetectionsOutput, void* nmsBoxesOutput, 30 | void* nmsScoresOutput, void* nmsClassesOutput, void* nmsIndicesOutput, void* workspace, cudaStream_t stream); 31 | 32 | #endif 33 | -------------------------------------------------------------------------------- /nvdsinfer_custom_impl_Yolo_seg/trt_plugins/efficientNMSPlugin/efficientNMSParameters.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: Apache-2.0 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 
7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | #ifndef TRT_EFFICIENT_NMS_PARAMETERS_H 19 | #define TRT_EFFICIENT_NMS_PARAMETERS_H 20 | 21 | #include "../common.h" 22 | 23 | namespace nvinfer1 24 | { 25 | namespace plugin 26 | { 27 | 28 | struct EfficientNMSParameters 29 | { 30 | // Related to NMS Options 31 | float iouThreshold = 0.5F; 32 | float scoreThreshold = 0.5F; 33 | int32_t numOutputBoxes = 100; 34 | int32_t numOutputBoxesPerClass = -1; 35 | bool padOutputBoxesPerClass = false; 36 | int32_t backgroundClass = -1; 37 | bool scoreSigmoid = false; 38 | bool clipBoxes = false; 39 | int32_t boxCoding = 0; 40 | bool classAgnostic = false; 41 | 42 | // Related to NMS Internals 43 | int32_t numSelectedBoxes = 4096; 44 | int32_t scoreBits = -1; 45 | 46 | // Related to Tensor Configuration 47 | // (These are set by the various plugin configuration methods, no need to define them during plugin creation.) 
48 | int32_t batchSize = -1; 49 | int32_t numClasses = 1; 50 | int32_t numBoxElements = -1; 51 | int32_t numScoreElements = -1; 52 | int32_t numAnchors = -1; 53 | bool shareLocation = true; 54 | bool shareAnchors = true; 55 | bool boxDecoder = false; 56 | nvinfer1::DataType datatype = nvinfer1::DataType::kFLOAT; 57 | }; 58 | 59 | } // namespace plugin 60 | } // namespace nvinfer1 61 | 62 | #endif 63 | -------------------------------------------------------------------------------- /nvdsinfer_custom_impl_Yolo_seg/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_VER?= 2 | ifeq ($(CUDA_VER),) 3 | $(error "CUDA_VER is not set") 4 | endif 5 | 6 | CUDA_VER_MAJOR:= $(word 1,$(subst ., ,$(CUDA_VER))) 7 | CUDA_VER_MINOR:= $(word 2,$(subst ., ,$(CUDA_VER))) 8 | CUDA_VER_NUM:= $(CUDA_VER_MAJOR)$(CUDA_VER_MINOR) 9 | 10 | CUDA_ARCH:= 53 60 61 62 70 72 75 11 | ifeq ($(shell expr $(CUDA_VER_NUM) \>= 110),1) 12 | CUDA_ARCH+= 80 13 | endif 14 | ifeq ($(shell expr $(CUDA_VER_NUM) \>= 111),1) 15 | CUDA_ARCH+= 86 16 | endif 17 | ifeq ($(shell expr $(CUDA_VER_NUM) \>= 118),1) 18 | CUDA_ARCH+= 87 89 19 | endif 20 | ifeq ($(shell expr $(CUDA_VER_NUM) \>= 120),1) 21 | CUDA_ARCH+= 90 22 | endif 23 | ifeq ($(shell expr $(CUDA_VER_NUM) \>= 128),1) 24 | CUDA_ARCH+= 100 103 110 120 25 | endif 26 | 27 | GENCODE_FLAGS:= $(foreach a,$(CUDA_ARCH),-gencode arch=compute_$(a),code=sm_$(a) \ 28 | -gencode arch=compute_$(a),code=compute_$(a)) 29 | 30 | SM?= 31 | ifneq ($(SM),) 32 | GENCODE_FLAGS:= -gencode arch=compute_$(SM),code=sm_$(SM) -gencode arch=compute_$(SM),code=compute_$(SM) 33 | endif 34 | 35 | CXX:= g++ 36 | NVCC:=/usr/local/cuda-$(CUDA_VER)/bin/nvcc 37 | 38 | TARGET_LIB:= libnvdsinfer_custom_impl_Yolo_seg.so 39 | 40 | CFLAGS:= -Wall -std=c++17 -shared -fPIC -Wno-error=deprecated-declarations 41 | 42 | CFLAGS+= -I/opt/nvidia/deepstream/deepstream/sources/includes -I/usr/local/cuda-$(CUDA_VER)/include 43 | CUFLAGS:= 
-I/opt/nvidia/deepstream/deepstream/sources/includes -I/usr/local/cuda-$(CUDA_VER)/include 44 | 45 | LIBS+= -lnvinfer -L/usr/local/cuda-$(CUDA_VER)/lib64 -lcudart -lstdc++fs 46 | 47 | LFLAGS:= -shared -Wl,--start-group $(LIBS) -Wl,--end-group 48 | 49 | SRCS:= $(wildcard *.cpp) 50 | SRCS+= $(wildcard trt_plugins/*.cpp) 51 | SRCS+= $(wildcard trt_plugins/efficientNMSPlugin/*.cpp) 52 | SRCS+= $(wildcard trt_plugins/efficientNMSPlugin/*.cu) 53 | SRCS+= $(wildcard trt_plugins/roiAlignPlugin/*.cpp) 54 | SRCS+= $(wildcard trt_plugins/roiAlignPlugin/*.cu) 55 | 56 | INCS:= $(wildcard *.h) 57 | INCS+= $(wildcard trt_plugins/*.h) 58 | INCS+= $(wildcard trt_plugins/efficientNMSPlugin/*.h) 59 | INCS+= $(wildcard trt_plugins/efficientNMSPlugin/*.cuh) 60 | INCS+= $(wildcard trt_plugins/roiAlignPlugin/*.h) 61 | INCS+= $(wildcard trt_plugins/roiAlignPlugin/*.cuh) 62 | 63 | OBJS:= $(addsuffix .o, $(basename $(SRCS))) 64 | 65 | all: $(TARGET_LIB) 66 | 67 | %.o: %.cpp Makefile 68 | $(CXX) -c -o $@ $(CFLAGS) $< 69 | 70 | %.o: %.cu $(INCS) Makefile 71 | $(NVCC) -c -o $@ $(GENCODE_FLAGS) --compiler-options '-fPIC' $(CUFLAGS) $< 72 | 73 | $(TARGET_LIB) : $(OBJS) 74 | $(CXX) -o $@ $(OBJS) $(LFLAGS) 75 | 76 | clean: 77 | rm -rf $(OBJS) $(TARGET_LIB) 78 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DeepStream-Yolo-Seg 2 | 3 | NVIDIA DeepStream SDK 8.0 / 7.1 / 7.0 / 6.4 / 6.3 / 6.2 / 6.1.1 / 6.1 / 6.0.1 / 6.0 application for YOLO-Seg models 4 | 5 | -------------------------------------------------------------------------------------------------- 6 | ### YOLO object detection models and other infos: https://github.com/marcoslucianops/DeepStream-Yolo 7 | -------------------------------------------------------------------------------------------------- 8 | ### Important: Please export the ONNX model with the new export file, generate the TensorRT engine again 
with the updated files, and use the new config_infer_primary file according to your model 9 | -------------------------------------------------------------------------------------------------- 10 | 11 | ### Getting started 12 | 13 | * [Supported models](#supported-models) 14 | * [Instructions](#basic-usage) 15 | * [YOLOv5-Seg usage](docs/YOLOv5_Seg.md) 16 | * [YOLOv7-Seg usage](docs/YOLOv7_Seg.md) 17 | * [YOLOv7-Mask usage](docs/YOLOv7_Mask.md) 18 | * [YOLOv8-Seg usage](docs/YOLOv8_Seg.md) 19 | * [YOLO11-Seg usage](docs/YOLO11_Seg.md) 20 | * [RF-DETR-Seg usage](docs/RFDETR_Seg.md) 21 | * [NMS configuration](#nms-configuration) 22 | * [Detection threshold configuration](#detection-threshold-configuration) 23 | 24 | ## 25 | 26 | ### Supported models 27 | 28 | * [RF-DETR-Seg](https://github.com/roboflow/rf-detr) 29 | * [YOLO11-Seg](https://github.com/ultralytics/ultralytics) 30 | * [YOLOv8-Seg](https://github.com/ultralytics/ultralytics) 31 | * [YOLOv7-Mask](https://github.com/WongKinYiu/yolov7/tree/mask) 32 | * [YOLOv7-Seg](https://github.com/WongKinYiu/yolov7/tree/u7/seg) 33 | * [YOLOv5-Seg](https://github.com/ultralytics/yolov5) 34 | 35 | ## 36 | 37 | ### Instructions 38 | 39 | #### 1. Download the DeepStream-Yolo-Seg repo 40 | 41 | ``` 42 | git clone https://github.com/marcoslucianops/DeepStream-Yolo-Seg.git 43 | cd DeepStream-Yolo-Seg 44 | ``` 45 | 46 | #### 2. Compile the libs 47 | 48 | 2.1. 
Set the `CUDA_VER` according to your DeepStream version 49 | 50 | ``` 51 | export CUDA_VER=XY.Z 52 | ``` 53 | 54 | * x86 platform 55 | 56 | ``` 57 | DeepStream 8.0 = 12.8 58 | DeepStream 7.1 = 12.6 59 | DeepStream 7.0 / 6.4 = 12.2 60 | DeepStream 6.3 = 12.1 61 | DeepStream 6.2 = 11.8 62 | DeepStream 6.1.1 = 11.7 63 | DeepStream 6.1 = 11.6 64 | DeepStream 6.0.1 / 6.0 = 11.4 65 | ``` 66 | 67 | * Jetson platform 68 | 69 | ``` 70 | DeepStream 8.0 = 13.0 71 | DeepStream 7.1 = 12.6 72 | DeepStream 7.0 / 6.4 = 12.2 73 | DeepStream 6.3 / 6.2 / 6.1.1 / 6.1 = 11.4 74 | DeepStream 6.0.1 / 6.0 = 10.2 75 | ``` 76 | 77 | 2.2. Make the libs 78 | 79 | ``` 80 | make -C nvdsinfer_custom_impl_Yolo_seg clean && make -C nvdsinfer_custom_impl_Yolo_seg 81 | ``` 82 | 83 | #### 3. Run 84 | 85 | ``` 86 | deepstream-app -c deepstream_app_config.txt 87 | ``` 88 | 89 | **NOTE**: The TensorRT engine file may take a very long time to generate (sometimes more than 10 minutes). 90 | 91 | ## 92 | 93 | ### NMS configuration 94 | 95 | For now, the NMS is configured in the ONNX exporter file. 96 | 97 | **NOTE**: Make sure to set `cluster-mode=4` in the config_infer file. 98 | 99 | ## 100 | 101 | ### Detection threshold configuration 102 | 103 | The minimum detection confidence threshold is configured in the ONNX exporter file. The `pre-cluster-threshold` should be >= the value used in the ONNX model. 
104 | 105 | ``` 106 | [class-attrs-all] 107 | pre-cluster-threshold=0.25 108 | ``` 109 | 110 | ## 111 | 112 | My projects: https://www.youtube.com/MarcosLucianoTV 113 | -------------------------------------------------------------------------------- /nvdsinfer_custom_impl_Yolo_seg/nvdsparseseg_Yolo.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "nvdsinfer_custom_impl.h" 7 | 8 | extern "C" bool 9 | NvDsInferParseYoloSeg(std::vector const& outputLayersInfo, NvDsInferNetworkInfo const& networkInfo, 10 | NvDsInferParseDetectionParams const& detectionParams, std::vector& objectList); 11 | 12 | static float 13 | clamp(float val, float minVal, float maxVal) 14 | { 15 | assert(minVal <= maxVal); 16 | return std::min(maxVal, std::max(minVal, val)); 17 | } 18 | 19 | static void 20 | addSegProposal(const float* output, size_t channelsSize, uint netW, uint netH, size_t n, NvDsInferInstanceMaskInfo& b) 21 | { 22 | size_t maskSize = channelsSize - 6; 23 | b.mask = new float[maskSize]; 24 | b.mask_width = netW / 4; 25 | b.mask_height = netH / 4; 26 | b.mask_size = sizeof(float) * maskSize; 27 | std::memcpy(b.mask, output + n * channelsSize + 6, sizeof(float) * maskSize); 28 | } 29 | 30 | static void 31 | addBBoxProposal(float x1, float y1, float x2, float y2, uint netW, uint netH, int maxIndex, float maxProb, 32 | NvDsInferInstanceMaskInfo& b) 33 | { 34 | x1 = clamp(x1, 0, netW); 35 | y1 = clamp(y1, 0, netH); 36 | x2 = clamp(x2, 0, netW); 37 | y2 = clamp(y2, 0, netH); 38 | 39 | b.left = x1; 40 | b.width = clamp(x2 - x1, 0, netW); 41 | b.top = y1; 42 | b.height = clamp(y2 - y1, 0, netH); 43 | 44 | if (b.width < 1 || b.height < 1) { 45 | return; 46 | } 47 | 48 | b.detectionConfidence = maxProb; 49 | b.classId = maxIndex; 50 | } 51 | 52 | static std::vector 53 | decodeTensorYoloSeg(const float* output, size_t outputSize, size_t channelsSize, uint netW, uint netH, 54 | const 
std::vector& preclusterThreshold) 55 | { 56 | std::vector objects; 57 | 58 | for (size_t n = 0; n < outputSize; ++n) { 59 | float maxProb = output[n * channelsSize + 4]; 60 | int maxIndex = (int) output[n * channelsSize + 5]; 61 | 62 | if (maxProb < preclusterThreshold[maxIndex]) { 63 | continue; 64 | } 65 | 66 | float x1 = output[n * channelsSize + 0]; 67 | float y1 = output[n * channelsSize + 1]; 68 | float x2 = output[n * channelsSize + 2]; 69 | float y2 = output[n * channelsSize + 3]; 70 | 71 | NvDsInferInstanceMaskInfo b; 72 | 73 | addBBoxProposal(x1, y1, x2, y2, netW, netH, maxIndex, maxProb, b); 74 | addSegProposal(output, channelsSize, netW, netH, n, b); 75 | 76 | objects.push_back(b); 77 | } 78 | 79 | return objects; 80 | } 81 | 82 | static bool 83 | NvDsInferParseCustomYoloSeg(std::vector const& outputLayersInfo, 84 | NvDsInferNetworkInfo const& networkInfo, NvDsInferParseDetectionParams const& detectionParams, 85 | std::vector& objectList) 86 | { 87 | if (outputLayersInfo.empty()) { 88 | std::cerr << "ERROR - Could not find output layer" << std::endl; 89 | return false; 90 | } 91 | 92 | const NvDsInferLayerInfo& output = outputLayersInfo[0]; 93 | 94 | size_t outputSize = output.inferDims.d[0]; 95 | size_t channelsSize = output.inferDims.d[1]; 96 | 97 | std::vector objects = decodeTensorYoloSeg((const float*) (output.buffer), outputSize, 98 | channelsSize, networkInfo.width, networkInfo.height, detectionParams.perClassPreclusterThreshold); 99 | 100 | objectList = objects; 101 | 102 | return true; 103 | } 104 | 105 | extern "C" bool 106 | NvDsInferParseYoloSeg(std::vector const& outputLayersInfo, NvDsInferNetworkInfo const& networkInfo, 107 | NvDsInferParseDetectionParams const& detectionParams, std::vector& objectList) 108 | { 109 | return NvDsInferParseCustomYoloSeg(outputLayersInfo, networkInfo, detectionParams, objectList); 110 | } 111 | 112 | CHECK_CUSTOM_INSTANCE_MASK_PARSE_FUNC_PROTOTYPE(NvDsInferParseYoloSeg); 113 | 
-------------------------------------------------------------------------------- /nvdsinfer_custom_impl_Yolo_seg/trt_plugins/common.cpp: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | 3 | namespace nvinfer1 4 | { 5 | 6 | namespace plugin 7 | { 8 | 9 | ILogger* gLogger{}; 10 | 11 | template 12 | int32_t LogStream::Buf::sync() 13 | { 14 | std::string s = str(); 15 | while (!s.empty() && s.back() == '\n') 16 | { 17 | s.pop_back(); 18 | } 19 | if (gLogger != nullptr) 20 | { 21 | gLogger->log(tSeverity, s.c_str()); 22 | } 23 | str(""); 24 | return 0; 25 | } 26 | 27 | LogStream gLogError; 28 | LogStream gLogWarning; 29 | 30 | void caughtError(std::exception const& e) 31 | { 32 | gLogError << e.what() << std::endl; 33 | } 34 | 35 | void throwCudaError(char const* file, char const* function, int32_t line, int32_t status, char const* msg) 36 | { 37 | CudaError error(file, function, line, status, msg); 38 | error.log(gLogError); 39 | // NOLINTNEXTLINE(misc-throw-by-value-catch-by-reference) 40 | throw error; 41 | } 42 | 43 | void throwPluginError(char const* file, char const* function, int32_t line, int32_t status, char const* msg) 44 | { 45 | PluginError error(file, function, line, status, msg); 46 | reportValidationFailure(msg, file, line); 47 | // NOLINTNEXTLINE(misc-throw-by-value-catch-by-reference) 48 | throw error; 49 | } 50 | 51 | void reportValidationFailure(char const* msg, char const* file, int32_t line) 52 | { 53 | std::ostringstream stream; 54 | stream << "Validation failed: " << msg << "\n" << file << ':' << line << "\n"; 55 | #ifdef COMPILE_VFC_PLUGIN 56 | ILogger* logger = getPluginLogger(); 57 | if (logger != nullptr) 58 | { 59 | logger->log(nvinfer1::ILogger::Severity::kINTERNAL_ERROR, stream.str().c_str()); 60 | } 61 | #else 62 | getLogger()->log(nvinfer1::ILogger::Severity::kINTERNAL_ERROR, stream.str().c_str()); 63 | #endif 64 | } 65 | 66 | void reportAssertion(char const* msg, char const* 
file, int32_t line) 67 | { 68 | std::ostringstream stream; 69 | stream << "Assertion failed: " << msg << "\n" 70 | << file << ':' << line << "\n" 71 | << "Aborting..." 72 | << "\n"; 73 | #ifdef COMPILE_VFC_PLUGIN 74 | ILogger* logger = getPluginLogger(); 75 | if (logger != nullptr) 76 | { 77 | logger->log(nvinfer1::ILogger::Severity::kINTERNAL_ERROR, stream.str().c_str()); 78 | } 79 | #else 80 | getLogger()->log(nvinfer1::ILogger::Severity::kINTERNAL_ERROR, stream.str().c_str()); 81 | #endif 82 | PLUGIN_CUASSERT(cudaDeviceReset()); 83 | exit(EXIT_FAILURE); 84 | } 85 | 86 | void TRTException::log(std::ostream& logStream) const 87 | { 88 | logStream << file << " (" << line << ") - " << name << " Error in " << function << ": " << status; 89 | if (message != nullptr) 90 | { 91 | logStream << " (" << message << ")"; 92 | } 93 | logStream << std::endl; 94 | } 95 | 96 | void validateRequiredAttributesExist(std::set requiredFieldNames, PluginFieldCollection const* fc) 97 | { 98 | for (int32_t i = 0; i < fc->nbFields; i++) 99 | { 100 | requiredFieldNames.erase(fc->fields[i].name); 101 | } 102 | if (!requiredFieldNames.empty()) 103 | { 104 | std::stringstream msg{}; 105 | msg << "PluginFieldCollection missing required fields: {"; 106 | char const* separator = ""; 107 | for (auto const& field : requiredFieldNames) 108 | { 109 | msg << separator << field; 110 | separator = ", "; 111 | } 112 | msg << "}"; 113 | std::string msg_str = msg.str(); 114 | PLUGIN_ERROR(msg_str.c_str()); 115 | } 116 | } 117 | 118 | size_t dataTypeSize(const DataType dtype) 119 | { 120 | switch (dtype) 121 | { 122 | case DataType::kINT8: return sizeof(char); 123 | case DataType::kHALF: return sizeof(short); 124 | case DataType::kFLOAT: return sizeof(float); 125 | default: PLUGIN_FAIL("Unsupported data type"); 126 | return 0; 127 | } 128 | } 129 | 130 | } // namespace plugin 131 | } // namespace nvinfer1 132 | -------------------------------------------------------------------------------- 
/docs/RFDETR_Seg.md: -------------------------------------------------------------------------------- 1 | # RF-DETR-Seg usage 2 | 3 | **NOTE**: The yaml file is not required. 4 | 5 | * [Convert model](#convert-model) 6 | * [Compile the lib](#compile-the-lib) 7 | * [Edit the config_infer_primary_rfdetr_seg file](#edit-the-config_infer_primary_rfdetr_seg-file) 8 | 9 | ## 10 | 11 | ### Convert model 12 | 13 | #### 1. Download the RF-DETR repo and install the requirements 14 | 15 | ``` 16 | git clone https://github.com/ultralytics/ultralytics.git 17 | cd ultralytics 18 | pip3 install -e . 19 | pip3 install onnx onnxslim onnxruntime 20 | ``` 21 | 22 | **NOTE**: It is recommended to use Python virtualenv. 23 | 24 | #### 2. Copy conversor 25 | 26 | Copy the `export_rfdetr_seg.py` file from `DeepStream-Yolo-Seg/utils` directory to the `ultralytics` folder. 27 | 28 | #### 3. Download the model 29 | 30 | Download the `pt` file from [RF-DETR](https://github.com/roboflow/rf-detr) releases (example for RF-DETR-Seg-Preview) 31 | 32 | ``` 33 | wget https://storage.googleapis.com/rfdetr/rf-detr-seg-preview.pt 34 | ``` 35 | 36 | **NOTE**: You can use your custom model. 37 | 38 | #### 4. Convert model 39 | 40 | Generate the ONNX model file (example for RF-DETR-Seg-Preview) 41 | 42 | ``` 43 | python3 export_rfdetr_seg.py -w rf-detr-seg-preview.pt --dynamic 44 | ``` 45 | 46 | **NOTE**: Minimum detection confidence threshold (example for conf-threshold = 0.25) 47 | 48 | The minimum detection confidence threshold is configured in the ONNX exporter file. The `pre-cluster-threshold` should be >= the value used in the ONNX model. 
49 | 50 | ``` 51 | --conf-threshold 0.25 52 | ``` 53 | 54 | **NOTE**: NMS IoU threshold (example for iou-threshold = 0.45) 55 | 56 | ``` 57 | --iou-threshold 0.45 58 | ``` 59 | 60 | **NOTE**: Maximum number of output detections (example for max-detections = 300) 61 | 62 | ``` 63 | --max-detections 300 64 | ``` 65 | 66 | **NOTE**: To change the inference size (defaut: 640) 67 | 68 | ``` 69 | -s SIZE 70 | --size SIZE 71 | -s HEIGHT WIDTH 72 | --size HEIGHT WIDTH 73 | ``` 74 | 75 | Example for 1280 76 | 77 | ``` 78 | -s 1280 79 | ``` 80 | 81 | or 82 | 83 | ``` 84 | -s 1280 1280 85 | ``` 86 | 87 | **NOTE**: To simplify the ONNX model 88 | 89 | ``` 90 | --simplify 91 | ``` 92 | 93 | **NOTE**: To use dynamic batch-size (DeepStream >= 6.1) 94 | 95 | ``` 96 | --dynamic 97 | ``` 98 | 99 | **NOTE**: To use static batch-size (example for batch-size = 4) 100 | 101 | ``` 102 | --batch 4 103 | ``` 104 | 105 | #### 5. Copy generated files 106 | 107 | Copy the generated ONNX model file and labels.txt file (if generated) to the `DeepStream-Yolo-Seg` folder. 108 | 109 | ## 110 | 111 | ### Compile the lib 112 | 113 | 1. Open the `DeepStream-Yolo-Seg` folder and compile the lib 114 | 115 | 2. Set the `CUDA_VER` according to your DeepStream version 116 | 117 | ``` 118 | export CUDA_VER=XY.Z 119 | ``` 120 | 121 | * x86 platform 122 | 123 | ``` 124 | DeepStream 8.0 = 12.8 125 | DeepStream 7.1 = 12.6 126 | DeepStream 7.0 / 6.4 = 12.2 127 | DeepStream 6.3 = 12.1 128 | DeepStream 6.2 = 11.8 129 | DeepStream 6.1.1 = 11.7 130 | DeepStream 6.1 = 11.6 131 | DeepStream 6.0.1 / 6.0 = 11.4 132 | ``` 133 | 134 | * Jetson platform 135 | 136 | ``` 137 | DeepStream 8.0 = 13.0 138 | DeepStream 7.1 = 12.6 139 | DeepStream 7.0 / 6.4 = 12.2 140 | DeepStream 6.3 / 6.2 / 6.1.1 / 6.1 = 11.4 141 | DeepStream 6.0.1 / 6.0 = 10.2 142 | ``` 143 | 144 | 3. 
Make the lib 145 | 146 | ``` 147 | make -C nvdsinfer_custom_impl_Yolo_seg clean && make -C nvdsinfer_custom_impl_Yolo_seg 148 | ``` 149 | 150 | ## 151 | 152 | ### Edit the config_infer_primary_rfdetr_seg file 153 | 154 | Edit the `config_infer_primary_rfdetr_seg.txt` file according to your model (example for RF-DETR-Seg-Preview) 155 | 156 | ``` 157 | [property] 158 | ... 159 | onnx-file=rf-detr-seg-preview.onnx 160 | ... 161 | num-detected-classes=91 162 | ... 163 | parse-bbox-instance-mask-func-name=NvDsInferParseYoloSeg 164 | ... 165 | ``` 166 | 167 | **NOTE**: To output the masks, use 168 | 169 | ``` 170 | [property] 171 | ... 172 | output-instance-mask=1 173 | segmentation-threshold=0.5 174 | ... 175 | ``` 176 | 177 | **NOTE**: The **RF-DETR-Seg** does not resize the input with padding. To get better accuracy, use 178 | 179 | ``` 180 | [property] 181 | ... 182 | maintain-aspect-ratio=0 183 | ... 184 | ``` 185 | -------------------------------------------------------------------------------- /docs/YOLO11_Seg.md: -------------------------------------------------------------------------------- 1 | # YOLO11-Seg usage 2 | 3 | **NOTE**: The yaml file is not required. 4 | 5 | * [Convert model](#convert-model) 6 | * [Compile the lib](#compile-the-lib) 7 | * [Edit the config_infer_primary_yolo11_seg file](#edit-the-config_infer_primary_yolo11_seg-file) 8 | 9 | ## 10 | 11 | ### Convert model 12 | 13 | #### 1. Download the YOLO11 repo and install the requirements 14 | 15 | ``` 16 | git clone https://github.com/ultralytics/ultralytics.git 17 | cd ultralytics 18 | pip3 install -e . 19 | pip3 install onnx onnxslim onnxruntime 20 | ``` 21 | 22 | **NOTE**: It is recommended to use Python virtualenv. 23 | 24 | #### 2. Copy conversor 25 | 26 | Copy the `export_yolo11_seg.py` file from `DeepStream-Yolo-Seg/utils` directory to the `ultralytics` folder. 27 | 28 | #### 3.
Download the model 29 | 30 | Download the `pt` file from [YOLO11](https://github.com/ultralytics/assets/releases/) releases (example for YOLO11s-Seg) 31 | 32 | ``` 33 | wget https://github.com/ultralytics/assets/releases/download/v8.3.0/yolo11s-seg.pt 34 | ``` 35 | 36 | **NOTE**: You can use your custom model. 37 | 38 | #### 4. Convert model 39 | 40 | Generate the ONNX model file (example for YOLO11s-Seg) 41 | 42 | ``` 43 | python3 export_yolo11_seg.py -w yolo11s-seg.pt --dynamic 44 | ``` 45 | 46 | **NOTE**: Minimum detection confidence threshold (example for conf-threshold = 0.25) 47 | 48 | The minimum detection confidence threshold is configured in the ONNX exporter file. The `pre-cluster-threshold` should be >= the value used in the ONNX model. 49 | 50 | ``` 51 | --conf-threshold 0.25 52 | ``` 53 | 54 | **NOTE**: NMS IoU threshold (example for iou-threshold = 0.45) 55 | 56 | ``` 57 | --iou-threshold 0.45 58 | ``` 59 | 60 | **NOTE**: Maximum number of output detections (example for max-detections = 300) 61 | 62 | ``` 63 | --max-detections 300 64 | ``` 65 | 66 | **NOTE**: To change the inference size (defaut: 640) 67 | 68 | ``` 69 | -s SIZE 70 | --size SIZE 71 | -s HEIGHT WIDTH 72 | --size HEIGHT WIDTH 73 | ``` 74 | 75 | Example for 1280 76 | 77 | ``` 78 | -s 1280 79 | ``` 80 | 81 | or 82 | 83 | ``` 84 | -s 1280 1280 85 | ``` 86 | 87 | **NOTE**: To simplify the ONNX model 88 | 89 | ``` 90 | --simplify 91 | ``` 92 | 93 | **NOTE**: To use dynamic batch-size (DeepStream >= 6.1) 94 | 95 | ``` 96 | --dynamic 97 | ``` 98 | 99 | **NOTE**: To use static batch-size (example for batch-size = 4) 100 | 101 | ``` 102 | --batch 4 103 | ``` 104 | 105 | #### 5. Copy generated files 106 | 107 | Copy the generated ONNX model file and labels.txt file (if generated) to the `DeepStream-Yolo-Seg` folder. 108 | 109 | ## 110 | 111 | ### Compile the lib 112 | 113 | 1. Open the `DeepStream-Yolo-Seg` folder and compile the lib 114 | 115 | 2. 
Set the `CUDA_VER` according to your DeepStream version 116 | 117 | ``` 118 | export CUDA_VER=XY.Z 119 | ``` 120 | 121 | * x86 platform 122 | 123 | ``` 124 | DeepStream 8.0 = 12.8 125 | DeepStream 7.1 = 12.6 126 | DeepStream 7.0 / 6.4 = 12.2 127 | DeepStream 6.3 = 12.1 128 | DeepStream 6.2 = 11.8 129 | DeepStream 6.1.1 = 11.7 130 | DeepStream 6.1 = 11.6 131 | DeepStream 6.0.1 / 6.0 = 11.4 132 | ``` 133 | 134 | * Jetson platform 135 | 136 | ``` 137 | DeepStream 8.0 = 13.0 138 | DeepStream 7.1 = 12.6 139 | DeepStream 7.0 / 6.4 = 12.2 140 | DeepStream 6.3 / 6.2 / 6.1.1 / 6.1 = 11.4 141 | DeepStream 6.0.1 / 6.0 = 10.2 142 | ``` 143 | 144 | 3. Make the lib 145 | 146 | ``` 147 | make -C nvdsinfer_custom_impl_Yolo_seg clean && make -C nvdsinfer_custom_impl_Yolo_seg 148 | ``` 149 | 150 | ## 151 | 152 | ### Edit the config_infer_primary_yolo11_seg file 153 | 154 | Edit the `config_infer_primary_yolo11_seg.txt` file according to your model (example for YOLO11s-Seg) 155 | 156 | ``` 157 | [property] 158 | ... 159 | onnx-file=yolo11s-seg.onnx 160 | ... 161 | num-detected-classes=80 162 | ... 163 | parse-bbox-func-name=NvDsInferParseYoloSeg 164 | ... 165 | ``` 166 | 167 | **NOTE**: To output the masks, use 168 | 169 | ``` 170 | [property] 171 | ... 172 | output-instance-mask=1 173 | segmentation-threshold=0.5 174 | ... 175 | ``` 176 | 177 | **NOTE**: The **YOLO11-Seg** resizes the input with center padding. To get better accuracy, use 178 | 179 | ``` 180 | [property] 181 | ... 182 | maintain-aspect-ratio=1 183 | symmetric-padding=1 184 | ... 185 | ``` 186 | -------------------------------------------------------------------------------- /docs/YOLOv8_Seg.md: -------------------------------------------------------------------------------- 1 | # YOLOv8-Seg usage 2 | 3 | **NOTE**: The yaml file is not required. 
4 | 5 | * [Convert model](#convert-model) 6 | * [Compile the lib](#compile-the-lib) 7 | * [Edit the config_infer_primary_yoloV8_seg file](#edit-the-config_infer_primary_yolov8_seg-file) 8 | 9 | ## 10 | 11 | ### Convert model 12 | 13 | #### 1. Download the YOLOv8 repo and install the requirements 14 | 15 | ``` 16 | git clone https://github.com/ultralytics/ultralytics.git 17 | cd ultralytics 18 | pip3 install -e . 19 | pip3 install onnx onnxslim onnxruntime 20 | ``` 21 | 22 | **NOTE**: It is recommended to use Python virtualenv. 23 | 24 | #### 2. Copy conversor 25 | 26 | Copy the `export_yoloV8_seg.py` file from `DeepStream-Yolo-Seg/utils` directory to the `ultralytics` folder. 27 | 28 | #### 3. Download the model 29 | 30 | Download the `pt` file from [YOLOv8](https://github.com/ultralytics/assets/releases/) releases (example for YOLOv8s-Seg) 31 | 32 | ``` 33 | wget https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov8s-seg.pt 34 | ``` 35 | 36 | **NOTE**: You can use your custom model. 37 | 38 | #### 4. Convert model 39 | 40 | Generate the ONNX model file (example for YOLOv8s-Seg) 41 | 42 | ``` 43 | python3 export_yoloV8_seg.py -w yolov8s-seg.pt --dynamic 44 | ``` 45 | 46 | **NOTE**: Minimum detection confidence threshold (example for conf-threshold = 0.25) 47 | 48 | The minimum detection confidence threshold is configured in the ONNX exporter file. The `pre-cluster-threshold` should be >= the value used in the ONNX model. 
49 | 50 | ``` 51 | --conf-threshold 0.25 52 | ``` 53 | 54 | **NOTE**: NMS IoU threshold (example for iou-threshold = 0.45) 55 | 56 | ``` 57 | --iou-threshold 0.45 58 | ``` 59 | 60 | **NOTE**: Maximum number of output detections (example for max-detections = 300) 61 | 62 | ``` 63 | --max-detections 300 64 | ``` 65 | 66 | **NOTE**: To change the inference size (defaut: 640) 67 | 68 | ``` 69 | -s SIZE 70 | --size SIZE 71 | -s HEIGHT WIDTH 72 | --size HEIGHT WIDTH 73 | ``` 74 | 75 | Example for 1280 76 | 77 | ``` 78 | -s 1280 79 | ``` 80 | 81 | or 82 | 83 | ``` 84 | -s 1280 1280 85 | ``` 86 | 87 | **NOTE**: To simplify the ONNX model 88 | 89 | ``` 90 | --simplify 91 | ``` 92 | 93 | **NOTE**: To use dynamic batch-size (DeepStream >= 6.1) 94 | 95 | ``` 96 | --dynamic 97 | ``` 98 | 99 | **NOTE**: To use static batch-size (example for batch-size = 4) 100 | 101 | ``` 102 | --batch 4 103 | ``` 104 | 105 | #### 5. Copy generated files 106 | 107 | Copy the generated ONNX model file and labels.txt file (if generated) to the `DeepStream-Yolo-Seg` folder. 108 | 109 | ## 110 | 111 | ### Compile the lib 112 | 113 | 1. Open the `DeepStream-Yolo-Seg` folder and compile the lib 114 | 115 | 2. Set the `CUDA_VER` according to your DeepStream version 116 | 117 | ``` 118 | export CUDA_VER=XY.Z 119 | ``` 120 | 121 | * x86 platform 122 | 123 | ``` 124 | DeepStream 8.0 = 12.8 125 | DeepStream 7.1 = 12.6 126 | DeepStream 7.0 / 6.4 = 12.2 127 | DeepStream 6.3 = 12.1 128 | DeepStream 6.2 = 11.8 129 | DeepStream 6.1.1 = 11.7 130 | DeepStream 6.1 = 11.6 131 | DeepStream 6.0.1 / 6.0 = 11.4 132 | ``` 133 | 134 | * Jetson platform 135 | 136 | ``` 137 | DeepStream 8.0 = 13.0 138 | DeepStream 7.1 = 12.6 139 | DeepStream 7.0 / 6.4 = 12.2 140 | DeepStream 6.3 / 6.2 / 6.1.1 / 6.1 = 11.4 141 | DeepStream 6.0.1 / 6.0 = 10.2 142 | ``` 143 | 144 | 3. 
Make the lib 145 | 146 | ``` 147 | make -C nvdsinfer_custom_impl_Yolo_seg clean && make -C nvdsinfer_custom_impl_Yolo_seg 148 | ``` 149 | 150 | ## 151 | 152 | ### Edit the config_infer_primary_yoloV8_seg file 153 | 154 | Edit the `config_infer_primary_yoloV8_seg.txt` file according to your model (example for YOLOv8s-Seg) 155 | 156 | ``` 157 | [property] 158 | ... 159 | onnx-file=yolov8s-seg.onnx 160 | ... 161 | num-detected-classes=80 162 | ... 163 | parse-bbox-func-name=NvDsInferParseYoloSeg 164 | ... 165 | ``` 166 | 167 | **NOTE**: To output the masks, use 168 | 169 | ``` 170 | [property] 171 | ... 172 | output-instance-mask=1 173 | segmentation-threshold=0.5 174 | ... 175 | ``` 176 | 177 | **NOTE**: The **YOLOv8-Seg** resizes the input with center padding. To get better accuracy, use 178 | 179 | ``` 180 | [property] 181 | ... 182 | maintain-aspect-ratio=1 183 | symmetric-padding=1 184 | ... 185 | ``` 186 | -------------------------------------------------------------------------------- /nvdsinfer_custom_impl_Yolo_seg/trt_plugins/efficientNMSPlugin/efficientNMSPlugin.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: Apache-2.0 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | #ifndef TRT_EFFICIENT_NMS_PLUGIN_H 18 | #define TRT_EFFICIENT_NMS_PLUGIN_H 19 | 20 | #include 21 | 22 | #include "../common.h" 23 | #include "efficientNMSParameters.h" 24 | 25 | namespace nvinfer1 26 | { 27 | namespace plugin 28 | { 29 | 30 | class EfficientNMSPlugin : public IPluginV2DynamicExt 31 | { 32 | public: 33 | explicit EfficientNMSPlugin(EfficientNMSParameters param); 34 | EfficientNMSPlugin(void const* data, size_t length); 35 | ~EfficientNMSPlugin() override = default; 36 | 37 | // IPluginV2 methods 38 | char const* getPluginType() const noexcept override; 39 | char const* getPluginVersion() const noexcept override; 40 | int32_t getNbOutputs() const noexcept override; 41 | int32_t initialize() noexcept override; 42 | void terminate() noexcept override; 43 | size_t getSerializationSize() const noexcept override; 44 | void serialize(void* buffer) const noexcept override; 45 | void destroy() noexcept override; 46 | void setPluginNamespace(char const* libNamespace) noexcept override; 47 | char const* getPluginNamespace() const noexcept override; 48 | 49 | // IPluginV2Ext methods 50 | nvinfer1::DataType getOutputDataType( 51 | int32_t index, nvinfer1::DataType const* inputType, int32_t nbInputs) const noexcept override; 52 | 53 | // IPluginV2DynamicExt methods 54 | IPluginV2DynamicExt* clone() const noexcept override; 55 | DimsExprs getOutputDimensions( 56 | int32_t outputIndex, DimsExprs const* inputs, int32_t nbInputs, IExprBuilder& exprBuilder) noexcept override; 57 | bool supportsFormatCombination( 58 | int32_t pos, PluginTensorDesc const* inOut, int32_t nbInputs, int32_t nbOutputs) noexcept override; 59 | void configurePlugin(DynamicPluginTensorDesc const* in, int32_t nbInputs, DynamicPluginTensorDesc const* out, 60 | int32_t nbOutputs) noexcept override; 61 | size_t getWorkspaceSize(PluginTensorDesc const* inputs, int32_t nbInputs, PluginTensorDesc const* outputs, 62 | int32_t nbOutputs) const noexcept override; 63 | int32_t 
enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* outputDesc, void const* const* inputs, 64 | void* const* outputs, void* workspace, cudaStream_t stream) noexcept override; 65 | 66 | protected: 67 | EfficientNMSParameters mParam{}; 68 | bool initialized{false}; 69 | std::string mNamespace; 70 | 71 | private: 72 | void deserialize(int8_t const* data, size_t length); 73 | }; 74 | 75 | // Standard NMS Plugin Operation 76 | class EfficientNMSPluginCreator : public nvinfer1::pluginInternal::BaseCreator 77 | { 78 | public: 79 | EfficientNMSPluginCreator(); 80 | ~EfficientNMSPluginCreator() override = default; 81 | 82 | char const* getPluginName() const noexcept override; 83 | char const* getPluginVersion() const noexcept override; 84 | PluginFieldCollection const* getFieldNames() noexcept override; 85 | 86 | IPluginV2DynamicExt* createPlugin(char const* name, PluginFieldCollection const* fc) noexcept override; 87 | IPluginV2DynamicExt* deserializePlugin( 88 | char const* name, void const* serialData, size_t serialLength) noexcept override; 89 | 90 | protected: 91 | PluginFieldCollection mFC; 92 | EfficientNMSParameters mParam; 93 | std::vector mPluginAttributes; 94 | std::string mPluginName; 95 | }; 96 | 97 | } // namespace plugin 98 | } // namespace nvinfer1 99 | 100 | #endif // TRT_EFFICIENT_NMS_PLUGIN_H 101 | -------------------------------------------------------------------------------- /docs/YOLOv5_Seg.md: -------------------------------------------------------------------------------- 1 | # YOLOv5-Seg usage 2 | 3 | **NOTE**: The yaml file is not required. 4 | 5 | * [Convert model](#convert-model) 6 | * [Compile the lib](#compile-the-lib) 7 | * [Edit the config_infer_primary_yoloV5_seg file](#edit-the-config_infer_primary_yolov5_seg-file) 8 | 9 | ## 10 | 11 | ### Convert model 12 | 13 | #### 1. 
Download the YOLOv5 repo and install the requirements 14 | 15 | ``` 16 | git clone https://github.com/ultralytics/yolov5.git 17 | cd yolov5 18 | pip3 install -r requirements.txt 19 | pip3 install onnx onnxslim onnxruntime 20 | ``` 21 | 22 | **NOTE**: It is recommended to use Python virtualenv. 23 | 24 | #### 2. Copy conversor 25 | 26 | Copy the `export_yoloV5_seg.py` file from `DeepStream-Yolo-Seg/utils` directory to the `yolov5` folder. 27 | 28 | #### 3. Download the model 29 | 30 | Download the `pt` file from [YOLOv5](https://github.com/ultralytics/yolov5/releases/) releases (example for YOLOv5s-Seg 7.0) 31 | 32 | ``` 33 | wget https://github.com/ultralytics/yolov5/releases/download/v7.0/yolov5s-seg.pt 34 | ``` 35 | 36 | **NOTE**: You can use your custom model. 37 | 38 | #### 4. Convert model 39 | 40 | Generate the ONNX model file (example for YOLOv5s-Seg) 41 | 42 | ``` 43 | python3 export_yoloV5_seg.py -w yolov5s-seg.pt --dynamic 44 | ``` 45 | 46 | **NOTE**: Minimum detection confidence threshold (example for conf-threshold = 0.25) 47 | 48 | The minimum detection confidence threshold is configured in the ONNX exporter file. The `pre-cluster-threshold` should be >= the value used in the ONNX model. 
49 | 50 | ``` 51 | --conf-threshold 0.25 52 | ``` 53 | 54 | **NOTE**: NMS IoU threshold (example for iou-threshold = 0.45) 55 | 56 | ``` 57 | --iou-threshold 0.45 58 | ``` 59 | 60 | **NOTE**: Maximum number of output detections (example for max-detections = 300) 61 | 62 | ``` 63 | --max-detections 300 64 | ``` 65 | 66 | **NOTE**: To convert a P6 model 67 | 68 | ``` 69 | --p6 70 | ``` 71 | 72 | **NOTE**: To change the inference size (defaut: 640 / 1280 for `--p6` models) 73 | 74 | ``` 75 | -s SIZE 76 | --size SIZE 77 | -s HEIGHT WIDTH 78 | --size HEIGHT WIDTH 79 | ``` 80 | 81 | Example for 1280 82 | 83 | ``` 84 | -s 1280 85 | ``` 86 | 87 | or 88 | 89 | ``` 90 | -s 1280 1280 91 | ``` 92 | 93 | **NOTE**: To simplify the ONNX model 94 | 95 | ``` 96 | --simplify 97 | ``` 98 | 99 | **NOTE**: To use dynamic batch-size (DeepStream >= 6.1) 100 | 101 | ``` 102 | --dynamic 103 | ``` 104 | 105 | **NOTE**: To use static batch-size (example for batch-size = 4) 106 | 107 | ``` 108 | --batch 4 109 | ``` 110 | 111 | #### 5. Copy generated files 112 | 113 | Copy the generated ONNX model file and labels.txt file (if generated) to the `DeepStream-Yolo-Seg` folder. 114 | 115 | ## 116 | 117 | ### Compile the lib 118 | 119 | 1. Open the `DeepStream-Yolo-Seg` folder and compile the lib 120 | 121 | 2. Set the `CUDA_VER` according to your DeepStream version 122 | 123 | ``` 124 | export CUDA_VER=XY.Z 125 | ``` 126 | 127 | * x86 platform 128 | 129 | ``` 130 | DeepStream 8.0 = 12.8 131 | DeepStream 7.1 = 12.6 132 | DeepStream 7.0 / 6.4 = 12.2 133 | DeepStream 6.3 = 12.1 134 | DeepStream 6.2 = 11.8 135 | DeepStream 6.1.1 = 11.7 136 | DeepStream 6.1 = 11.6 137 | DeepStream 6.0.1 / 6.0 = 11.4 138 | ``` 139 | 140 | * Jetson platform 141 | 142 | ``` 143 | DeepStream 8.0 = 13.0 144 | DeepStream 7.1 = 12.6 145 | DeepStream 7.0 / 6.4 = 12.2 146 | DeepStream 6.3 / 6.2 / 6.1.1 / 6.1 = 11.4 147 | DeepStream 6.0.1 / 6.0 = 10.2 148 | ``` 149 | 150 | 3. 
Make the lib 151 | 152 | ``` 153 | make -C nvdsinfer_custom_impl_Yolo_seg clean && make -C nvdsinfer_custom_impl_Yolo_seg 154 | ``` 155 | 156 | ## 157 | 158 | ### Edit the config_infer_primary_yoloV5_seg file 159 | 160 | Edit the `config_infer_primary_yoloV5_seg.txt` file according to your model (example for YOLOv5s-Seg) 161 | 162 | ``` 163 | [property] 164 | ... 165 | onnx-file=yolov5s-seg.onnx 166 | ... 167 | num-detected-classes=80 168 | ... 169 | parse-bbox-func-name=NvDsInferParseYoloSeg 170 | ... 171 | ``` 172 | 173 | **NOTE**: To output the masks, use 174 | 175 | ``` 176 | [property] 177 | ... 178 | output-instance-mask=1 179 | segmentation-threshold=0.5 180 | ... 181 | ``` 182 | 183 | **NOTE**: The **YOLOv5-Seg** resizes the input with center padding. To get better accuracy, use 184 | 185 | ``` 186 | [property] 187 | ... 188 | maintain-aspect-ratio=1 189 | symmetric-padding=1 190 | ... 191 | ``` 192 | -------------------------------------------------------------------------------- /docs/YOLOv7_Seg.md: -------------------------------------------------------------------------------- 1 | # YOLOv7-Seg usage 2 | 3 | **NOTE**: The yaml file is not required. 4 | 5 | * [Convert model](#convert-model) 6 | * [Compile the lib](#compile-the-lib) 7 | * [Edit the config_infer_primary_yoloV7_seg file](#edit-the-config_infer_primary_yolov7_seg-file) 8 | 9 | ## 10 | 11 | ### Convert model 12 | 13 | #### 1. Download the YOLOv7 repo and install the requirements 14 | 15 | ``` 16 | git clone -b u7 https://github.com/WongKinYiu/yolov7 17 | cd yolov7/seg 18 | pip3 install -r requirements.txt 19 | pip3 install onnx onnxslim onnxruntime 20 | ``` 21 | 22 | **NOTE**: It is recommended to use Python virtualenv. 23 | 24 | #### 2. Copy conversor 25 | 26 | Copy the `export_yoloV7_seg.py` file from `DeepStream-Yolo-Seg/utils` directory to the `yolov7/seg` folder. 27 | 28 | #### 3. 
Download the model 29 | 30 | Download the `pt` file from [YOLOv7](https://github.com/WongKinYiu/yolov7/releases/) releases (example for YOLOv7-Seg) 31 | 32 | ``` 33 | wget https://github.com/WongKinYiu/yolov7/releases/download/v0.1/yolov7-seg.pt 34 | ``` 35 | 36 | **NOTE**: You can use your custom model. 37 | 38 | #### 4. Convert model 39 | 40 | Generate the ONNX model file (example for YOLOv7-Seg) 41 | 42 | ``` 43 | python3 export_yoloV7_seg.py -w yolov7-seg.pt --dynamic 44 | ``` 45 | 46 | **NOTE**: Minimum detection confidence threshold (example for conf-threshold = 0.25) 47 | 48 | The minimum detection confidence threshold is configured in the ONNX exporter file. The `pre-cluster-threshold` should be >= the value used in the ONNX model. 49 | 50 | ``` 51 | --conf-threshold 0.25 52 | ``` 53 | 54 | **NOTE**: NMS IoU threshold (example for iou-threshold = 0.45) 55 | 56 | ``` 57 | --iou-threshold 0.45 58 | ``` 59 | 60 | **NOTE**: Maximum number of output detections (example for max-detections = 300) 61 | 62 | ``` 63 | --max-detections 300 64 | ``` 65 | 66 | **NOTE**: To convert a P6 model 67 | 68 | ``` 69 | --p6 70 | ``` 71 | 72 | **NOTE**: To change the inference size (defaut: 640 / 1280 for `--p6` models) 73 | 74 | ``` 75 | -s SIZE 76 | --size SIZE 77 | -s HEIGHT WIDTH 78 | --size HEIGHT WIDTH 79 | ``` 80 | 81 | Example for 1280 82 | 83 | ``` 84 | -s 1280 85 | ``` 86 | 87 | or 88 | 89 | ``` 90 | -s 1280 1280 91 | ``` 92 | 93 | **NOTE**: To simplify the ONNX model 94 | 95 | ``` 96 | --simplify 97 | ``` 98 | 99 | **NOTE**: To use dynamic batch-size (DeepStream >= 6.1) 100 | 101 | ``` 102 | --dynamic 103 | ``` 104 | 105 | **NOTE**: To use static batch-size (example for batch-size = 4) 106 | 107 | ``` 108 | --batch 4 109 | ``` 110 | 111 | #### 5. Copy generated files 112 | 113 | Copy the generated ONNX model file and labels.txt file (if generated) to the `DeepStream-Yolo-Seg` folder. 114 | 115 | ## 116 | 117 | ### Compile the lib 118 | 119 | 1. 
Open the `DeepStream-Yolo-Seg` folder and compile the lib 120 | 121 | 2. Set the `CUDA_VER` according to your DeepStream version 122 | 123 | ``` 124 | export CUDA_VER=XY.Z 125 | ``` 126 | 127 | * x86 platform 128 | 129 | ``` 130 | DeepStream 8.0 = 12.8 131 | DeepStream 7.1 = 12.6 132 | DeepStream 7.0 / 6.4 = 12.2 133 | DeepStream 6.3 = 12.1 134 | DeepStream 6.2 = 11.8 135 | DeepStream 6.1.1 = 11.7 136 | DeepStream 6.1 = 11.6 137 | DeepStream 6.0.1 / 6.0 = 11.4 138 | ``` 139 | 140 | * Jetson platform 141 | 142 | ``` 143 | DeepStream 8.0 = 13.0 144 | DeepStream 7.1 = 12.6 145 | DeepStream 7.0 / 6.4 = 12.2 146 | DeepStream 6.3 / 6.2 / 6.1.1 / 6.1 = 11.4 147 | DeepStream 6.0.1 / 6.0 = 10.2 148 | ``` 149 | 150 | 3. Make the lib 151 | 152 | ``` 153 | make -C nvdsinfer_custom_impl_Yolo_seg clean && make -C nvdsinfer_custom_impl_Yolo_seg 154 | ``` 155 | 156 | ## 157 | 158 | ### Edit the config_infer_primary_yoloV7_seg file 159 | 160 | Edit the `config_infer_primary_yoloV7_seg.txt` file according to your model (example for YOLOv7-Seg) 161 | 162 | ``` 163 | [property] 164 | ... 165 | onnx-file=yolov7-seg.onnx 166 | ... 167 | num-detected-classes=80 168 | ... 169 | parse-bbox-func-name=NvDsInferParseYoloSeg 170 | ... 171 | ``` 172 | 173 | **NOTE**: To output the masks, use 174 | 175 | ``` 176 | [property] 177 | ... 178 | output-instance-mask=1 179 | segmentation-threshold=0.5 180 | ... 181 | ``` 182 | 183 | **NOTE**: The **YOLOv7-Seg** resizes the input with center padding. To get better accuracy, use 184 | 185 | ``` 186 | [property] 187 | ... 188 | maintain-aspect-ratio=1 189 | symmetric-padding=1 190 | ... 191 | ``` 192 | -------------------------------------------------------------------------------- /docs/YOLOv7_Mask.md: -------------------------------------------------------------------------------- 1 | # YOLOv7-Mask usage 2 | 3 | **NOTE**: The yaml file is not required. 
4 | 5 | * [Convert model](#convert-model) 6 | * [Compile the lib](#compile-the-lib) 7 | * [Edit the config_infer_primary_yoloV7_mask file](#edit-the-config_infer_primary_yolov7_mask-file) 8 | 9 | ## 10 | 11 | ### Convert model 12 | 13 | #### 1. Download the YOLOv7 repo and install the requirements 14 | 15 | ``` 16 | git clone -b mask https://github.com/WongKinYiu/yolov7 17 | cd yolov7 18 | pip3 install -r requirements.txt 19 | git clone https://github.com/facebookresearch/detectron2.git 20 | pip3 install -e detectron2 21 | pip3 install onnx onnxslim onnxruntime 22 | ``` 23 | 24 | **NOTE**: It is recommended to use Python virtualenv. 25 | 26 | #### 2. Copy conversor 27 | 28 | Copy the `export_yoloV7_mask.py` file from `DeepStream-Yolo-Mask/utils` directory to the `yolov7` folder. 29 | 30 | #### 3. Download the model 31 | 32 | Download the `pt` file from [YOLOv7](https://github.com/WongKinYiu/yolov7/releases/) releases (example for YOLOv7-Mask) 33 | 34 | ``` 35 | wget https://github.com/WongKinYiu/yolov7/releases/download/v0.1/yolov7-mask.pt 36 | ``` 37 | 38 | **NOTE**: You can use your custom model. 39 | 40 | #### 4. Convert model 41 | 42 | Generate the ONNX model file (example for YOLOv7-Mask) 43 | 44 | ``` 45 | python3 export_yoloV7_mask.py -w yolov7-mask.pt --dynamic 46 | ``` 47 | 48 | **NOTE**: Minimum detection confidence threshold (example for conf-threshold = 0.25) 49 | 50 | The minimum detection confidence threshold is configured in the ONNX exporter file. The `pre-cluster-threshold` should be >= the value used in the ONNX model. 
51 | 52 | ``` 53 | --conf-threshold 0.25 54 | ``` 55 | 56 | **NOTE**: NMS IoU threshold (example for iou-threshold = 0.45) 57 | 58 | ``` 59 | --iou-threshold 0.45 60 | ``` 61 | 62 | **NOTE**: Maximum number of output detections (example for max-detections = 300) 63 | 64 | ``` 65 | --max-detections 300 66 | ``` 67 | 68 | **NOTE**: To convert a P6 model 69 | 70 | ``` 71 | --p6 72 | ``` 73 | 74 | **NOTE**: To change the inference size (defaut: 640 / 1280 for `--p6` models) 75 | 76 | ``` 77 | -s SIZE 78 | --size SIZE 79 | -s HEIGHT WIDTH 80 | --size HEIGHT WIDTH 81 | ``` 82 | 83 | Example for 1280 84 | 85 | ``` 86 | -s 1280 87 | ``` 88 | 89 | or 90 | 91 | ``` 92 | -s 1280 1280 93 | ``` 94 | 95 | **NOTE**: To simplify the ONNX model 96 | 97 | ``` 98 | --simplify 99 | ``` 100 | 101 | **NOTE**: To use dynamic batch-size (DeepStream >= 6.1) 102 | 103 | ``` 104 | --dynamic 105 | ``` 106 | 107 | **NOTE**: To use static batch-size (example for batch-size = 4) 108 | 109 | ``` 110 | --batch 4 111 | ``` 112 | 113 | #### 5. Copy generated files 114 | 115 | Copy the generated ONNX model file and labels.txt file (if generated) to the `DeepStream-Yolo-Mask` folder. 116 | 117 | ## 118 | 119 | ### Compile the lib 120 | 121 | 1. Open the `DeepStream-Yolo-Mask` folder and compile the lib 122 | 123 | 2. Set the `CUDA_VER` according to your DeepStream version 124 | 125 | ``` 126 | export CUDA_VER=XY.Z 127 | ``` 128 | 129 | * x86 platform 130 | 131 | ``` 132 | DeepStream 8.0 = 12.8 133 | DeepStream 7.1 = 12.6 134 | DeepStream 7.0 / 6.4 = 12.2 135 | DeepStream 6.3 = 12.1 136 | DeepStream 6.2 = 11.8 137 | DeepStream 6.1.1 = 11.7 138 | DeepStream 6.1 = 11.6 139 | DeepStream 6.0.1 / 6.0 = 11.4 140 | ``` 141 | 142 | * Jetson platform 143 | 144 | ``` 145 | DeepStream 8.0 = 13.0 146 | DeepStream 7.1 = 12.6 147 | DeepStream 7.0 / 6.4 = 12.2 148 | DeepStream 6.3 / 6.2 / 6.1.1 / 6.1 = 11.4 149 | DeepStream 6.0.1 / 6.0 = 10.2 150 | ``` 151 | 152 | 3. 
Make the lib 153 | 154 | ``` 155 | make -C nvdsinfer_custom_impl_Yolo_mask clean && make -C nvdsinfer_custom_impl_Yolo_mask 156 | ``` 157 | 158 | ## 159 | 160 | ### Edit the config_infer_primary_yoloV7_mask file 161 | 162 | Edit the `config_infer_primary_yoloV7_mask.txt` file according to your model (example for YOLOv7-Mask) 163 | 164 | ``` 165 | [property] 166 | ... 167 | onnx-file=yolov7-mask.onnx 168 | ... 169 | num-detected-classes=80 170 | ... 171 | parse-bbox-func-name=NvDsInferParseYoloSeg 172 | ... 173 | ``` 174 | 175 | **NOTE**: To output the masks, use 176 | 177 | ``` 178 | [property] 179 | ... 180 | output-instance-mask=1 181 | segmentation-threshold=0.5 182 | ... 183 | ``` 184 | 185 | **NOTE**: The **YOLOv7-Mask** resizes the input with center padding. To get better accuracy, use 186 | 187 | ``` 188 | [property] 189 | ... 190 | maintain-aspect-ratio=1 191 | symmetric-padding=1 192 | ... 193 | ``` 194 | -------------------------------------------------------------------------------- /nvdsinfer_custom_impl_Yolo_seg/trt_plugins/roiAlignPlugin/roiAlignPlugin.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: Apache-2.0 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
namespace nvinfer1
{
namespace plugin
{

// TensorRT dynamic-shape plugin implementing ROIAlign for the exported ONNX
// graphs (custom "ROIAlignX_TRT" op). Declaration only: kernels live in
// roiAlignKernel.cu, method bodies in roiAlignPlugin.cpp.
class ROIAlign : public IPluginV2DynamicExt
{
public:
    // Construct from plugin attribute values (createPlugin path).
    ROIAlign(int32_t outputHeight, int32_t outputWidth, int32_t samplingRatio, int32_t mode, float spatialScale,
        int32_t aligned);
    // Reconstruct from a serialized engine blob (deserializePlugin path).
    ROIAlign(void const* data, size_t length);
    ROIAlign() = default;
    ~ROIAlign() override = default;

    // IPluginV2 methods
    char const* getPluginType() const noexcept override;
    char const* getPluginVersion() const noexcept override;
    int32_t getNbOutputs() const noexcept override;
    int32_t initialize() noexcept override;
    void terminate() noexcept override;
    size_t getSerializationSize() const noexcept override;
    void serialize(void* buffer) const noexcept override;
    void destroy() noexcept override;
    void setPluginNamespace(char const* libNamespace) noexcept override;
    char const* getPluginNamespace() const noexcept override;
    // NOTE(review): the three setters below look like leftovers copied from the
    // EfficientNMS plugin -- confirm they are actually defined/used for ROIAlign.
    void setClipParam(bool clip) noexcept;
    void setScoreBits(int32_t scoreBits) noexcept;
    void setCaffeSemantics(bool caffeSemantics) noexcept;

    // IPluginV2Ext methods
    nvinfer1::DataType getOutputDataType(
        int32_t index, nvinfer1::DataType const* inputType, int32_t nbInputs) const noexcept override;

    // IPluginV2DynamicExt methods
    IPluginV2DynamicExt* clone() const noexcept override;
    DimsExprs getOutputDimensions(
        int32_t outputIndex, DimsExprs const* inputs, int32_t nbInputs, IExprBuilder& exprBuilder) noexcept override;
    bool supportsFormatCombination(
        int32_t pos, PluginTensorDesc const* inOut, int32_t nbInputs, int32_t nbOutputs) noexcept override;
    void configurePlugin(DynamicPluginTensorDesc const* in, int32_t nbInputs, DynamicPluginTensorDesc const* out,
        int32_t nbOutputs) noexcept override;
    size_t getWorkspaceSize(PluginTensorDesc const* inputs, int32_t nbInputs, PluginTensorDesc const* outputs,
        int32_t nbOutputs) const noexcept override;
    int32_t enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* outputDesc, void const* const* inputs,
        void* const* outputs, void* workspace, cudaStream_t stream) noexcept override;

private:
    // Validation helpers; contracts defined in roiAlignPlugin.cpp.
    void checkValidInputs(nvinfer1::DynamicPluginTensorDesc const* inputs, int32_t nbInputDims);
    void validateAttributes(int32_t outputHeight, int32_t outputWidth, int32_t samplingRatio, int32_t mode,
        float spatialScale, int32_t aligned);

    // Plugin attributes (serialized into the engine).
    int32_t mOutputHeight{};
    int32_t mOutputWidth{};
    int32_t mSamplingRatio{};
    float mSpatialScale{};
    int32_t mMode{};    // pooling mode -- presumably avg/max per ONNX RoiAlign; confirm against the kernel
    int32_t mAligned{}; // presumably the half-pixel coordinate-offset flag; confirm against the kernel

    // Input shapes cached for enqueue (set where the .cpp configures the plugin).
    int32_t mROICount{};
    int32_t mFeatureLength{}; // number of channels
    int32_t mHeight{};
    int32_t mWidth{};

    int32_t mMaxThreadsPerBlock{}; // presumably queried from the device; confirm in .cpp

    std::string mNameSpace{};
};

// Factory registered with TensorRT's plugin registry; creates/deserializes ROIAlign.
class ROIAlignPluginCreator : public nvinfer1::pluginInternal::BaseCreator
{
public:
    ROIAlignPluginCreator();

    ~ROIAlignPluginCreator() override = default;

    char const* getPluginName() const noexcept override;

    char const* getPluginVersion() const noexcept override;

    PluginFieldCollection const* getFieldNames() noexcept override;

    IPluginV2DynamicExt* createPlugin(char const* name, nvinfer1::PluginFieldCollection const* fc) noexcept override;

    IPluginV2DynamicExt* deserializePlugin(
        char const* name, void const* serialData, size_t serialLength) noexcept override;

private:
    PluginFieldCollection mFC;
    std::vector<PluginField> mPluginAttributes;
};

} // namespace plugin
} // namespace nvinfer1
-------------------------------------------------------------------------------- /nvdsinfer_custom_impl_Yolo_seg/trt_plugins/efficientNMSPlugin/efficientNMSInference.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: Apache-2.0 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
// FP32 Intrinsics
//
// Thin wrappers over CUDA math intrinsics so the NMS device code can be written
// generically over float/__half via overload resolution. The *_rn intrinsics
// round to nearest even; __expf/__frcp_rn are the fast approximate forms.

float __device__ __inline__ exp_mp(const float a)
{
    return __expf(a);
}
float __device__ __inline__ sigmoid_mp(const float a)
{
    // 1 / (1 + e^{-a}) composed from reciprocal/add/exp intrinsics
    return __frcp_rn(__fadd_rn(1.f, __expf(-a)));
}
float __device__ __inline__ add_mp(const float a, const float b)
{
    return __fadd_rn(a, b);
}
float __device__ __inline__ sub_mp(const float a, const float b)
{
    return __fsub_rn(a, b);
}
float __device__ __inline__ mul_mp(const float a, const float b)
{
    return __fmul_rn(a, b);
}
bool __device__ __inline__ gt_mp(const float a, const float b)
{
    return a > b;
}
bool __device__ __inline__ lt_mp(const float a, const float b)
{
    return a < b;
}
bool __device__ __inline__ lte_mp(const float a, const float b)
{
    return a <= b;
}
bool __device__ __inline__ gte_mp(const float a, const float b)
{
    return a >= b;
}
lte_mp(const __half a, const __half b) 95 | { 96 | return __hle(a, b); 97 | } 98 | bool __device__ __inline__ gte_mp(const __half a, const __half b) 99 | { 100 | return __hge(a, b); 101 | } 102 | 103 | #else 104 | 105 | // FP16 Fallbacks on older architectures that lack support 106 | 107 | __half __device__ __inline__ exp_mp(const __half a) 108 | { 109 | return __float2half(exp_mp(__half2float(a))); 110 | } 111 | __half __device__ __inline__ sigmoid_mp(const __half a) 112 | { 113 | return __float2half(sigmoid_mp(__half2float(a))); 114 | } 115 | __half __device__ __inline__ add_mp(const __half a, const __half b) 116 | { 117 | return __float2half(add_mp(__half2float(a), __half2float(b))); 118 | } 119 | __half __device__ __inline__ sub_mp(const __half a, const __half b) 120 | { 121 | return __float2half(sub_mp(__half2float(a), __half2float(b))); 122 | } 123 | __half __device__ __inline__ mul_mp(const __half a, const __half b) 124 | { 125 | return __float2half(mul_mp(__half2float(a), __half2float(b))); 126 | } 127 | bool __device__ __inline__ gt_mp(const __half a, const __half b) 128 | { 129 | return __float2half(gt_mp(__half2float(a), __half2float(b))); 130 | } 131 | bool __device__ __inline__ lt_mp(const __half a, const __half b) 132 | { 133 | return __float2half(lt_mp(__half2float(a), __half2float(b))); 134 | } 135 | bool __device__ __inline__ lte_mp(const __half a, const __half b) 136 | { 137 | return __float2half(lte_mp(__half2float(a), __half2float(b))); 138 | } 139 | bool __device__ __inline__ gte_mp(const __half a, const __half b) 140 | { 141 | return __float2half(gte_mp(__half2float(a), __half2float(b))); 142 | } 143 | 144 | #endif 145 | 146 | template 147 | struct __align__(4 * sizeof(T)) BoxCorner; 148 | 149 | template 150 | struct __align__(4 * sizeof(T)) BoxCenterSize; 151 | 152 | template 153 | struct __align__(4 * sizeof(T)) BoxCorner 154 | { 155 | // For NMS/IOU purposes, YXYX coding is identical to XYXY 156 | T y1, x1, y2, x2; 157 | 158 | __device__ 
    // Ensure y1 <= y2 and x1 <= x2 using the arithmetic three-step swap
    // (a, b) -> (a-b, a+b) -> (b, a). NOTE(review): for floating-point types
    // this swap is subject to rounding, unlike a plain exchange -- confirm the
    // tolerance is acceptable here.
    __device__ void reorder()
    {
        if (gt_mp(y1, y2))
        {
            // Swap values, so y1 < y2
            y1 = sub_mp(y1, y2);
            y2 = add_mp(y1, y2);
            y1 = sub_mp(y2, y1);
        }
        if (gt_mp(x1, x2))
        {
            // Swap values, so x1 < x2
            x1 = sub_mp(x1, x2);
            x2 = add_mp(x1, x2);
            x1 = sub_mp(x2, x1);
        }
    }

    // Clamp every coordinate into [low, high].
    __device__ BoxCorner clip(T low, T high) const
    {
        return {lt_mp(y1, low) ? low : (gt_mp(y1, high) ? high : y1),
            lt_mp(x1, low) ? low : (gt_mp(x1, high) ? high : x1), lt_mp(y2, low) ? low : (gt_mp(y2, high) ? high : y2),
            lt_mp(x2, low) ? low : (gt_mp(x2, high) ? high : x2)};
    }

    // Add per-coordinate offsets onto an anchor box (corner-coded deltas).
    __device__ BoxCorner decode(BoxCorner anchor) const
    {
        return {add_mp(y1, anchor.y1), add_mp(x1, anchor.x1), add_mp(y2, anchor.y2), add_mp(x2, anchor.x2)};
    }

    // Box area; boxes with non-positive extent count as zero.
    __device__ float area() const
    {
        T w = sub_mp(x2, x1);
        T h = sub_mp(y2, y1);
        if (lte_mp(h, (T) 0))
        {
            return 0;
        }
        if (lte_mp(w, (T) 0))
        {
            return 0;
        }
        return (float) h * (float) w;
    }

    // Convert corner coding (y1,x1,y2,x2) to center/size coding (y,x,h,w).
    __device__ operator BoxCenterSize<T>() const
    {
        T w = sub_mp(x2, x1);
        T h = sub_mp(y2, y1);
        return BoxCenterSize<T>{add_mp(y1, mul_mp((T) 0.5, h)), add_mp(x1, mul_mp((T) 0.5, w)), h, w};
    }

    // Component-wise intersection of two corner boxes (result may be degenerate;
    // area() reports 0 for such boxes).
    __device__ static BoxCorner intersect(BoxCorner a, BoxCorner b)
    {
        return {gt_mp(a.y1, b.y1) ? a.y1 : b.y1, gt_mp(a.x1, b.x1) ? a.x1 : b.x1, lt_mp(a.y2, b.y2) ? a.y2 : b.y2,
            lt_mp(a.x2, b.x2) ? a.x2 : b.x2};
    }
a.x2 : b.x2}; 214 | } 215 | }; 216 | 217 | template 218 | struct __align__(4 * sizeof(T)) BoxCenterSize 219 | { 220 | // For NMS/IOU purposes, YXHW coding is identical to XYWH 221 | T y, x, h, w; 222 | 223 | __device__ void reorder() {} 224 | 225 | __device__ BoxCenterSize clip(T low, T high) const 226 | { 227 | return BoxCenterSize(BoxCorner(*this).clip(low, high)); 228 | } 229 | 230 | __device__ BoxCenterSize decode(BoxCenterSize anchor) const 231 | { 232 | return {add_mp(mul_mp(y, anchor.h), anchor.y), add_mp(mul_mp(x, anchor.w), anchor.x), 233 | mul_mp(anchor.h, exp_mp(h)), mul_mp(anchor.w, exp_mp(w))}; 234 | } 235 | 236 | __device__ float area() const 237 | { 238 | if (h <= (T) 0) 239 | { 240 | return 0; 241 | } 242 | if (w <= (T) 0) 243 | { 244 | return 0; 245 | } 246 | return (float) h * (float) w; 247 | } 248 | 249 | __device__ operator BoxCorner() const 250 | { 251 | T h2 = mul_mp(h, (T) 0.5); 252 | T w2 = mul_mp(w, (T) 0.5); 253 | return BoxCorner{sub_mp(y, h2), sub_mp(x, w2), add_mp(y, h2), add_mp(x, w2)}; 254 | } 255 | __device__ static BoxCenterSize intersect(BoxCenterSize a, BoxCenterSize b) 256 | { 257 | return BoxCenterSize(BoxCorner::intersect(BoxCorner(a), BoxCorner(b))); 258 | } 259 | }; 260 | 261 | #endif 262 | -------------------------------------------------------------------------------- /utils/export_yoloV5_seg.py: -------------------------------------------------------------------------------- 1 | import os 2 | import onnx 3 | import torch 4 | import torch.nn as nn 5 | 6 | from models.experimental import attempt_load 7 | from models.yolo import Detect 8 | 9 | 10 | class RoiAlign(torch.autograd.Function): 11 | @staticmethod 12 | def forward( 13 | self, 14 | X, 15 | rois, 16 | batch_indices, 17 | coordinate_transformation_mode, 18 | mode, 19 | output_height, 20 | output_width, 21 | sampling_ratio, 22 | spatial_scale 23 | ): 24 | C = X.shape[1] 25 | num_rois = rois.shape[0] 26 | return torch.randn([num_rois, C, output_height, output_width], 
class NMS(torch.autograd.Function):
    """Tracing stand-in for the TensorRT ``EfficientNMSX_TRT`` plugin.

    During ONNX tracing ``forward`` only needs to yield tensors with the right
    shapes and dtypes (contents are irrelevant), while ``symbolic`` emits the
    actual plugin node into the exported graph.
    """

    @staticmethod
    def forward(self, boxes, scores, score_threshold, iou_threshold, max_output_boxes):
        # Dummy outputs shaped like the plugin's five results.
        n_batch = scores.shape[0]
        n_classes = scores.shape[-1]
        per_image = (n_batch, max_output_boxes)
        counts = torch.randint(0, max_output_boxes, (n_batch, 1), dtype=torch.int32)
        out_boxes = torch.randn(n_batch, max_output_boxes, 4)
        out_scores = torch.randn(*per_image)
        out_classes = torch.randint(0, n_classes, per_image, dtype=torch.int32)
        out_indices = torch.randint(0, max_output_boxes, per_image, dtype=torch.int32)
        return counts, out_boxes, out_scores, out_classes, out_indices

    @staticmethod
    def symbolic(g, boxes, scores, score_threshold, iou_threshold, max_output_boxes):
        # Attribute suffixes follow torch.onnx conventions: _f = float, _i = int.
        nms_attrs = {
            "score_threshold_f": score_threshold,
            "iou_threshold_f": iou_threshold,
            "max_output_boxes_i": max_output_boxes,
            "background_class_i": -1,
            "score_activation_i": 0,
            "class_agnostic_i": 0,
            "box_coding_i": 0,
        }
        return g.op("TRT::EfficientNMSX_TRT", boxes, scores, outputs=5, **nms_attrs)
    def forward(self, x):
        # Decode traced YOLOv5-Seg outputs into one DeepStream-friendly tensor:
        # per image, max_detections rows of [x1, y1, x2, y2, score, class, mask...].
        # x[0]: raw predictions (batch, anchors, 4 box + 1 objectness + nc class
        #       scores + mask coefficients) -- layout grounded by the slices below.
        # x[1]: prototype masks; shape unpacked below as (batch, num_protos, H, W).
        preds = x[0]
        boxes = preds[:, :, :4]
        # (cx, cy, w, h) -> (x1, y1, x2, y2) expressed as one matmul so it traces
        # into a single ONNX node.
        convert_matrix = torch.tensor(
            [[1, 0, 1, 0], [0, 1, 0, 1], [-0.5, 0, 0.5, 0], [0, -0.5, 0, 0.5]], dtype=boxes.dtype, device=boxes.device
        )
        boxes @= convert_matrix
        objectness = preds[:, :, 4:5]
        scores = preds[:, :, 5:self.nc+5]
        # Final confidence = objectness * per-class probability.
        scores *= objectness
        masks = preds[:, :, self.nc+5:]
        protos = x[1]

        # Traced into the EfficientNMSX_TRT plugin node (see NMS.symbolic).
        num_detections, detection_boxes, detection_scores, detection_classes, detections_indices = NMS.apply(
            boxes, scores, self.conf_threshold, self.iou_threshold, self.max_detections
        )

        batch_size, num_protos, h_protos, w_protos = protos.shape

        total_detections = batch_size * self.max_detections

        # Flatten (batch, detection) into parallel index vectors so the kept
        # boxes/mask-coefficients can be gathered with a single advanced index.
        batch_index = torch.ones_like(detections_indices) * torch.arange(
            batch_size, device=boxes.device, dtype=torch.int32
        ).unsqueeze(1)
        batch_index = batch_index.view(total_detections).to(torch.int32)
        box_index = detections_indices.view(total_detections).to(torch.int32)

        selected_boxes = boxes[batch_index, box_index]
        selected_masks = masks[batch_index, box_index]

        # Traced into the ROIAlignX_TRT plugin node (see RoiAlign.symbolic).
        # spatial_scale=0.25 assumes the prototype grid is 1/4 of the box
        # coordinate resolution -- TODO confirm against the model's proto stride.
        pooled_proto = RoiAlign.apply(protos, selected_boxes, batch_index, 1, 1, int(h_protos), int(w_protos), 0, 0.25)

        # Mask logits = coefficients (1 x num_protos) @ pooled prototypes
        # (num_protos x H*W), then sigmoid to probabilities.
        masks_protos = torch.matmul(
            selected_masks.unsqueeze(1), pooled_proto.view(total_detections, num_protos, h_protos * w_protos)
        )
        masks_protos = masks_protos.sigmoid().view(batch_size, self.max_detections, h_protos * w_protos)

        return torch.cat(
            [detection_boxes, detection_scores.unsqueeze(-1), detection_classes.unsqueeze(-1), masks_protos], dim=-1
        )
def suppress_warnings():
    """Silence the noisy warning categories emitted while tracing/exporting."""
    import warnings
    for category in (
        torch.jit.TracerWarning,
        UserWarning,
        DeprecationWarning,
        FutureWarning,
        ResourceWarning,
    ):
        warnings.filterwarnings("ignore", category=category)
def parse_args():
    """Build and validate the CLI arguments for the YOLOv5-Seg conversion."""
    import argparse
    ap = argparse.ArgumentParser(description="DeepStream YOLOv5-Seg conversion")
    ap.add_argument("-w", "--weights", required=True, type=str, help="Input weights (.pt) file path (required)")
    ap.add_argument("-s", "--size", nargs="+", type=int, default=[640], help="Inference size [H,W] (default [640])")
    ap.add_argument("--p6", action="store_true", help="P6 model")
    ap.add_argument("--opset", type=int, default=17, help="ONNX opset version")
    ap.add_argument("--simplify", action="store_true", help="ONNX simplify model")
    ap.add_argument("--dynamic", action="store_true", help="Dynamic batch-size")
    ap.add_argument("--batch", type=int, default=1, help="Static batch-size")
    ap.add_argument(
        "--conf-threshold", type=float, default=0.25, help="Minimum detection confidence threshold (default 0.25)"
    )
    ap.add_argument("--iou-threshold", type=float, default=0.45, help="NMS IoU threshold (default 0.45)")
    ap.add_argument(
        "--max-detections", type=int, default=100, help="Maximum number of output detections (default 100)"
    )
    parsed = ap.parse_args()
    # Weights must exist; dynamic and static batch sizing are mutually exclusive.
    if not os.path.isfile(parsed.weights):
        raise SystemExit("Invalid weights file")
    if parsed.dynamic and parsed.batch > 1:
        raise SystemExit("Cannot set dynamic batch-size and static batch-size at same time")
    return parsed
class RoiAlign(torch.autograd.Function):
    """Tracing stand-in for the TensorRT ``ROIAlignX_TRT`` plugin.

    ``forward`` only needs to produce a tensor of the correct shape/dtype while
    tracing; ``symbolic`` writes the actual plugin node into the ONNX graph.
    """

    @staticmethod
    def forward(
        self,
        X,
        rois,
        batch_indices,
        coordinate_transformation_mode,
        mode,
        output_height,
        output_width,
        sampling_ratio,
        spatial_scale
    ):
        # Output is (num_rois, channels, output_height, output_width); contents
        # are irrelevant during tracing.
        n_channels = X.shape[1]
        n_rois = rois.shape[0]
        return torch.randn([n_rois, n_channels, output_height, output_width], device=rois.device, dtype=rois.dtype)

    @staticmethod
    def symbolic(
        g,
        X,
        rois,
        batch_indices,
        coordinate_transformation_mode,
        mode,
        output_height,
        output_width,
        sampling_ratio,
        spatial_scale
    ):
        # Attribute suffixes follow torch.onnx conventions: _i = int, _f = float.
        roi_attrs = {
            "coordinate_transformation_mode_i": coordinate_transformation_mode,
            "mode_i": mode,
            "output_height_i": output_height,
            "output_width_i": output_width,
            "sampling_ratio_i": sampling_ratio,
            "spatial_scale_f": spatial_scale,
        }
        return g.op("TRT::ROIAlignX_TRT", X, rois, batch_indices, **roi_attrs)
return g.op( 70 | "TRT::EfficientNMSX_TRT", 71 | boxes, 72 | scores, 73 | score_threshold_f=score_threshold, 74 | iou_threshold_f=iou_threshold, 75 | max_output_boxes_i=max_output_boxes, 76 | background_class_i=-1, 77 | score_activation_i=0, 78 | class_agnostic_i=0, 79 | box_coding_i=0, 80 | outputs=5 81 | ) 82 | 83 | 84 | class DeepStreamOutput(nn.Module): 85 | def __init__(self, nc, conf_threshold, iou_threshold, max_detections): 86 | super().__init__() 87 | self.nc = nc 88 | self.conf_threshold = conf_threshold 89 | self.iou_threshold = iou_threshold 90 | self.max_detections = max_detections 91 | 92 | def forward(self, x): 93 | preds = x[0] 94 | boxes = preds[:, :, :4] 95 | convert_matrix = torch.tensor( 96 | [[1, 0, 1, 0], [0, 1, 0, 1], [-0.5, 0, 0.5, 0], [0, -0.5, 0, 0.5]], dtype=boxes.dtype, device=boxes.device 97 | ) 98 | boxes @= convert_matrix 99 | objectness = preds[:, :, 4:5] 100 | scores = preds[:, :, 5:self.nc+5] 101 | scores *= objectness 102 | masks = preds[:, :, self.nc+5:] 103 | protos = x[1][1] 104 | 105 | num_detections, detection_boxes, detection_scores, detection_classes, detections_indices = NMS.apply( 106 | boxes, scores, self.conf_threshold, self.iou_threshold, self.max_detections 107 | ) 108 | 109 | batch_size, num_protos, h_protos, w_protos = protos.shape 110 | 111 | total_detections = batch_size * self.max_detections 112 | 113 | batch_index = torch.ones_like(detections_indices) * torch.arange( 114 | batch_size, device=boxes.device, dtype=torch.int32 115 | ).unsqueeze(1) 116 | batch_index = batch_index.view(total_detections).to(torch.int32) 117 | box_index = detections_indices.view(total_detections).to(torch.int32) 118 | 119 | selected_boxes = boxes[batch_index, box_index] 120 | selected_masks = masks[batch_index, box_index] 121 | 122 | pooled_proto = RoiAlign.apply(protos, selected_boxes, batch_index, 1, 1, int(h_protos), int(w_protos), 0, 0.25) 123 | 124 | masks_protos = torch.matmul( 125 | selected_masks.unsqueeze(1), 
def yolov7_seg_export(weights, device, inplace=True, fuse=True):
    """Load a YOLOv7-Seg checkpoint and switch every Detect head to export mode.

    Returns the model in eval() state with in-place ops and dynamic shapes
    disabled on the detection heads so the trace stays ONNX-friendly.
    """
    model = attempt_load(weights, device=device, inplace=inplace, fuse=fuse)
    model.eval()
    detect_heads = (module for _, module in model.named_modules() if isinstance(module, Detect))
    for head in detect_heads:
        head.inplace = False
        head.dynamic = False
        head.export = True
    return model
dynamic_axes = { 183 | "input": { 184 | 0: "batch" 185 | }, 186 | "output": { 187 | 0: "batch" 188 | } 189 | } 190 | 191 | print("Exporting the model to ONNX") 192 | torch.onnx.export( 193 | model, 194 | onnx_input_im, 195 | onnx_output_file, 196 | verbose=False, 197 | opset_version=args.opset, 198 | do_constant_folding=True, 199 | input_names=["input"], 200 | output_names=["output"], 201 | dynamic_axes=dynamic_axes if args.dynamic else None 202 | ) 203 | 204 | if args.simplify: 205 | print("Simplifying the ONNX model") 206 | import onnxslim 207 | model_onnx = onnx.load(onnx_output_file) 208 | model_onnx = onnxslim.slim(model_onnx) 209 | onnx.save(model_onnx, onnx_output_file) 210 | 211 | print(f"Done: {onnx_output_file}\n") 212 | 213 | 214 | def parse_args(): 215 | import argparse 216 | parser = argparse.ArgumentParser(description="DeepStream YOLOv7-Seg conversion") 217 | parser.add_argument("-w", "--weights", required=True, type=str, help="Input weights (.pt) file path (required)") 218 | parser.add_argument("-s", "--size", nargs="+", type=int, default=[640], help="Inference size [H,W] (default [640])") 219 | parser.add_argument("--p6", action="store_true", help="P6 model") 220 | parser.add_argument("--opset", type=int, default=17, help="ONNX opset version") 221 | parser.add_argument("--simplify", action="store_true", help="ONNX simplify model") 222 | parser.add_argument("--dynamic", action="store_true", help="Dynamic batch-size") 223 | parser.add_argument("--batch", type=int, default=1, help="Static batch-size") 224 | parser.add_argument( 225 | "--conf-threshold", type=float, default=0.25, help="Minimum detection confidence threshold (default 0.25)" 226 | ) 227 | parser.add_argument("--iou-threshold", type=float, default=0.45, help="NMS IoU threshold (default 0.45)") 228 | parser.add_argument( 229 | "--max-detections", type=int, default=100, help="Maximum number of output detections (default 100)" 230 | ) 231 | args = parser.parse_args() 232 | if not 
os.path.isfile(args.weights): 233 | raise SystemExit("Invalid weights file") 234 | if args.dynamic and args.batch > 1: 235 | raise SystemExit("Cannot set dynamic batch-size and static batch-size at same time") 236 | return args 237 | 238 | 239 | if __name__ == "__main__": 240 | args = parse_args() 241 | main(args) 242 | -------------------------------------------------------------------------------- /utils/export_yoloV7_mask.py: -------------------------------------------------------------------------------- 1 | import os 2 | import yaml 3 | import onnx 4 | import torch 5 | import torch.nn as nn 6 | 7 | from utils.general import merge_bases 8 | 9 | 10 | class RoiAlign(torch.autograd.Function): 11 | @staticmethod 12 | def forward( 13 | self, 14 | X, 15 | rois, 16 | batch_indices, 17 | coordinate_transformation_mode, 18 | mode, 19 | output_height, 20 | output_width, 21 | sampling_ratio, 22 | spatial_scale 23 | ): 24 | C = X.shape[1] 25 | num_rois = rois.shape[0] 26 | return torch.randn([num_rois, C, output_height, output_width], device=rois.device, dtype=rois.dtype) 27 | 28 | @staticmethod 29 | def symbolic( 30 | g, 31 | X, 32 | rois, 33 | batch_indices, 34 | coordinate_transformation_mode, 35 | mode, 36 | output_height, 37 | output_width, 38 | sampling_ratio, 39 | spatial_scale 40 | ): 41 | return g.op( 42 | "TRT::ROIAlignX_TRT", 43 | X, 44 | rois, 45 | batch_indices, 46 | coordinate_transformation_mode_i=coordinate_transformation_mode, 47 | mode_i=mode, 48 | output_height_i=output_height, 49 | output_width_i=output_width, 50 | sampling_ratio_i=sampling_ratio, 51 | spatial_scale_f=spatial_scale 52 | ) 53 | 54 | 55 | class NMS(torch.autograd.Function): 56 | @staticmethod 57 | def forward(self, boxes, scores, score_threshold, iou_threshold, max_output_boxes): 58 | batch_size = scores.shape[0] 59 | num_classes = scores.shape[-1] 60 | num_detections = torch.randint(0, max_output_boxes, (batch_size, 1), dtype=torch.int32) 61 | detection_boxes = torch.randn(batch_size, 
max_output_boxes, 4) 62 | detection_scores = torch.randn(batch_size, max_output_boxes) 63 | detection_classes = torch.randint(0, num_classes, (batch_size, max_output_boxes), dtype=torch.int32) 64 | detections_indices = torch.randint(0, max_output_boxes, (batch_size, max_output_boxes), dtype=torch.int32) 65 | return num_detections, detection_boxes, detection_scores, detection_classes, detections_indices 66 | 67 | @staticmethod 68 | def symbolic(g, boxes, scores, score_threshold, iou_threshold, max_output_boxes): 69 | return g.op( 70 | "TRT::EfficientNMSX_TRT", 71 | boxes, 72 | scores, 73 | score_threshold_f=score_threshold, 74 | iou_threshold_f=iou_threshold, 75 | max_output_boxes_i=max_output_boxes, 76 | background_class_i=-1, 77 | score_activation_i=0, 78 | class_agnostic_i=0, 79 | box_coding_i=0, 80 | outputs=5 81 | ) 82 | 83 | 84 | class DeepStreamOutput(nn.Module): 85 | def __init__(self, nc, conf_threshold, iou_threshold, max_detections, attn_resolution, num_base): 86 | super().__init__() 87 | self.nc = nc 88 | self.conf_threshold = conf_threshold 89 | self.iou_threshold = iou_threshold 90 | self.max_detections = max_detections 91 | self.attn_resolution = attn_resolution 92 | self.num_base = num_base 93 | 94 | def forward(self, x): 95 | preds = x["test"] 96 | boxes = preds[:, :, :4] 97 | convert_matrix = torch.tensor( 98 | [[1, 0, 1, 0], [0, 1, 0, 1], [-0.5, 0, 0.5, 0], [0, -0.5, 0, 0.5]], dtype=boxes.dtype, device=boxes.device 99 | ) 100 | boxes @= convert_matrix 101 | objectness = preds[:, :, 4:5] 102 | scores = preds[:, :, 5:self.nc+5] 103 | scores *= objectness 104 | attn = x["attn"] 105 | bases = torch.cat([x["bases"], x["sem"]], dim=1) 106 | 107 | num_detections, detection_boxes, detection_scores, detection_classes, detections_indices = NMS.apply( 108 | boxes, scores, self.conf_threshold, self.iou_threshold, self.max_detections 109 | ) 110 | 111 | batch_size, num_protos, h_protos, w_protos = bases.shape 112 | 113 | total_detections = batch_size * 
def yolov7_mask_export(weights, device):
    """Load a YOLOv7-Mask checkpoint for ONNX export.

    Args:
        weights: path to the .pt checkpoint (the full pickled model under "model").
        device: torch.device to place the model on (main() uses CPU).

    Returns:
        (model, attn_resolution, num_base) where the last two come from
        data/hyp.scratch.mask.yaml.
    """
    # map_location keeps GPU-saved checkpoints loadable on CPU-only hosts
    # (main() always exports on torch.device("cpu"); without it torch.load
    # tries to restore tensors onto the device they were saved from).
    # NOTE: torch.load unpickles arbitrary objects -- only use trusted checkpoints.
    ckpt = torch.load(weights, map_location=device)
    model = ckpt["model"]
    model = model.float().to(device)
    model.eval()
    with open("data/hyp.scratch.mask.yaml") as f:
        hyp = yaml.load(f, Loader=yaml.FullLoader)
    return model, hyp["attn_resolution"], hyp["num_base"]
labels.txt file") 166 | with open("labels.txt", "w", encoding="utf-8") as f: 167 | for name in model.names: 168 | f.write(f"{name}\n") 169 | 170 | model = nn.Sequential( 171 | model, DeepStreamOutput( 172 | len(model.names), args.conf_threshold, args.iou_threshold, args.max_detections, attn_resolution, num_base 173 | ) 174 | ) 175 | 176 | img_size = args.size * 2 if len(args.size) == 1 else args.size 177 | 178 | if img_size == [640, 640] and args.p6: 179 | img_size = [1280] * 2 180 | 181 | onnx_input_im = torch.zeros(args.batch, 3, *img_size).to(device) 182 | onnx_output_file = args.weights.rsplit(".", 1)[0] + ".onnx" 183 | 184 | dynamic_axes = { 185 | "input": { 186 | 0: "batch" 187 | }, 188 | "output": { 189 | 0: "batch" 190 | } 191 | } 192 | 193 | print("Exporting the model to ONNX") 194 | torch.onnx.export( 195 | model, 196 | onnx_input_im, 197 | onnx_output_file, 198 | verbose=False, 199 | opset_version=args.opset, 200 | do_constant_folding=True, 201 | input_names=["input"], 202 | output_names=["output"], 203 | dynamic_axes=dynamic_axes if args.dynamic else None 204 | ) 205 | 206 | if args.simplify: 207 | print("Simplifying the ONNX model") 208 | import onnxslim 209 | model_onnx = onnx.load(onnx_output_file) 210 | model_onnx = onnxslim.slim(model_onnx) 211 | onnx.save(model_onnx, onnx_output_file) 212 | 213 | print(f"Done: {onnx_output_file}\n") 214 | 215 | 216 | def parse_args(): 217 | import argparse 218 | parser = argparse.ArgumentParser(description="DeepStream YOLOv7-Mask conversion") 219 | parser.add_argument("-w", "--weights", required=True, type=str, help="Input weights (.pt) file path (required)") 220 | parser.add_argument("-s", "--size", nargs="+", type=int, default=[640], help="Inference size [H,W] (default [640])") 221 | parser.add_argument("--p6", action="store_true", help="P6 model") 222 | parser.add_argument("--opset", type=int, default=17, help="ONNX opset version") 223 | parser.add_argument("--simplify", action="store_true", help="ONNX simplify 
model") 224 | parser.add_argument("--dynamic", action="store_true", help="Dynamic batch-size") 225 | parser.add_argument("--batch", type=int, default=1, help="Static batch-size") 226 | parser.add_argument( 227 | "--conf-threshold", type=float, default=0.25, help="Minimum detection confidence threshold (default 0.25)" 228 | ) 229 | parser.add_argument("--iou-threshold", type=float, default=0.45, help="NMS IoU threshold (default 0.45)") 230 | parser.add_argument( 231 | "--max-detections", type=int, default=100, help="Maximum number of output detections (default 100)" 232 | ) 233 | args = parser.parse_args() 234 | if not os.path.isfile(args.weights): 235 | raise SystemExit("Invalid weights file") 236 | if args.dynamic and args.batch > 1: 237 | raise SystemExit("Cannot set dynamic batch-size and static batch-size at same time") 238 | return args 239 | 240 | 241 | if __name__ == "__main__": 242 | args = parse_args() 243 | main(args) 244 | -------------------------------------------------------------------------------- /utils/export_yolo11_seg.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import onnx 4 | import torch 5 | import torch.nn as nn 6 | from copy import deepcopy 7 | 8 | from ultralytics import YOLO 9 | from ultralytics.nn.modules import C2f, Detect, RTDETRDecoder 10 | import ultralytics.utils 11 | import ultralytics.models.yolo 12 | import ultralytics.utils.tal as _m 13 | 14 | sys.modules["ultralytics.yolo"] = ultralytics.models.yolo 15 | sys.modules["ultralytics.yolo.utils"] = ultralytics.utils 16 | 17 | 18 | def _dist2bbox(distance, anchor_points, xywh=False, dim=-1): 19 | lt, rb = distance.chunk(2, dim) 20 | x1y1 = anchor_points - lt 21 | x2y2 = anchor_points + rb 22 | return torch.cat([x1y1, x2y2], dim) 23 | 24 | 25 | _m.dist2bbox.__code__ = _dist2bbox.__code__ 26 | 27 | 28 | class RoiAlign(torch.autograd.Function): 29 | @staticmethod 30 | def forward( 31 | self, 32 | X, 33 | rois, 34 | 
batch_indices, 35 | coordinate_transformation_mode, 36 | mode, 37 | output_height, 38 | output_width, 39 | sampling_ratio, 40 | spatial_scale 41 | ): 42 | C = X.shape[1] 43 | num_rois = rois.shape[0] 44 | return torch.randn([num_rois, C, output_height, output_width], device=rois.device, dtype=rois.dtype) 45 | 46 | @staticmethod 47 | def symbolic( 48 | g, 49 | X, 50 | rois, 51 | batch_indices, 52 | coordinate_transformation_mode, 53 | mode, 54 | output_height, 55 | output_width, 56 | sampling_ratio, 57 | spatial_scale 58 | ): 59 | return g.op( 60 | "TRT::ROIAlignX_TRT", 61 | X, 62 | rois, 63 | batch_indices, 64 | coordinate_transformation_mode_i=coordinate_transformation_mode, 65 | mode_i=mode, 66 | output_height_i=output_height, 67 | output_width_i=output_width, 68 | sampling_ratio_i=sampling_ratio, 69 | spatial_scale_f=spatial_scale 70 | ) 71 | 72 | 73 | class NMS(torch.autograd.Function): 74 | @staticmethod 75 | def forward(self, boxes, scores, score_threshold, iou_threshold, max_output_boxes): 76 | batch_size = scores.shape[0] 77 | num_classes = scores.shape[-1] 78 | num_detections = torch.randint(0, max_output_boxes, (batch_size, 1), dtype=torch.int32) 79 | detection_boxes = torch.randn(batch_size, max_output_boxes, 4) 80 | detection_scores = torch.randn(batch_size, max_output_boxes) 81 | detection_classes = torch.randint(0, num_classes, (batch_size, max_output_boxes), dtype=torch.int32) 82 | detections_indices = torch.randint(0, max_output_boxes, (batch_size, max_output_boxes), dtype=torch.int32) 83 | return num_detections, detection_boxes, detection_scores, detection_classes, detections_indices 84 | 85 | @staticmethod 86 | def symbolic(g, boxes, scores, score_threshold, iou_threshold, max_output_boxes): 87 | return g.op( 88 | "TRT::EfficientNMSX_TRT", 89 | boxes, 90 | scores, 91 | score_threshold_f=score_threshold, 92 | iou_threshold_f=iou_threshold, 93 | max_output_boxes_i=max_output_boxes, 94 | background_class_i=-1, 95 | score_activation_i=0, 96 | 
class DeepStreamOutput(nn.Module):
    # Post-processing head appended to the YOLO11-Seg network for DeepStream:
    # splits the raw head output, runs plugin-based NMS, gathers the surviving
    # detections' mask coefficients and combines them with ROI-pooled mask
    # prototypes into per-detection mask logits.
    def __init__(self, nc, conf_threshold, iou_threshold, max_detections):
        super().__init__()
        self.nc = nc  # number of classes
        self.conf_threshold = conf_threshold
        self.iou_threshold = iou_threshold
        self.max_detections = max_detections

    def forward(self, x):
        # x[0]: raw predictions [B, 4 + nc + num_protos, anchors]; transpose to
        # anchors-major so the channel slices below are on the last axis.
        # x[1]: mask prototype maps [B, num_protos, Hp, Wp].
        preds = x[0].transpose(1, 2)
        boxes = preds[:, :, :4]
        scores = preds[:, :, 4:self.nc+4]
        masks = preds[:, :, self.nc+4:]  # per-anchor mask coefficients
        protos = x[1]

        # EfficientNMSX_TRT also returns the indices of the kept boxes so the
        # matching mask coefficients can be gathered below.
        num_detections, detection_boxes, detection_scores, detection_classes, detections_indices = NMS.apply(
            boxes, scores, self.conf_threshold, self.iou_threshold, self.max_detections
        )

        batch_size, num_protos, h_protos, w_protos = protos.shape

        total_detections = batch_size * self.max_detections

        # Build a flat (batch, detection) index pair per kept box for gathering.
        batch_index = torch.ones_like(detections_indices) * torch.arange(
            batch_size, device=boxes.device, dtype=torch.int32
        ).unsqueeze(1)
        batch_index = batch_index.view(total_detections).to(torch.int32)
        box_index = detections_indices.view(total_detections).to(torch.int32)

        selected_boxes = boxes[batch_index, box_index]
        selected_masks = masks[batch_index, box_index]

        # ROIAlignX_TRT pools the prototype maps inside each detection box
        # (full prototype resolution, spatial_scale 0.25).
        pooled_proto = RoiAlign.apply(protos, selected_boxes, batch_index, 1, 1, int(h_protos), int(w_protos), 0, 0.25)

        # Per-detection linear combination of prototypes, then sigmoid to get
        # mask probabilities flattened to h_protos * w_protos values.
        masks_protos = torch.matmul(
            selected_masks.unsqueeze(1), pooled_proto.view(total_detections, num_protos, h_protos * w_protos)
        )
        masks_protos = masks_protos.sigmoid().view(batch_size, self.max_detections, h_protos * w_protos)

        # Output per detection: [box(4), score, class, mask probabilities...].
        return torch.cat(
            [detection_boxes, detection_scores.unsqueeze(-1), detection_classes.unsqueeze(-1), masks_protos], dim=-1
        )
def suppress_warnings():
    """Silence the noisy warning categories emitted during ONNX export/tracing."""
    import warnings
    for category in (
        torch.jit.TracerWarning,
        UserWarning,
        DeprecationWarning,
        FutureWarning,
        ResourceWarning,
    ):
        warnings.filterwarnings("ignore", category=category)
output_names=["output"], 218 | dynamic_axes=dynamic_axes if args.dynamic else None 219 | ) 220 | 221 | if args.simplify: 222 | print("Simplifying the ONNX model") 223 | import onnxslim 224 | model_onnx = onnx.load(onnx_output_file) 225 | model_onnx = onnxslim.slim(model_onnx) 226 | onnx.save(model_onnx, onnx_output_file) 227 | 228 | print(f"Done: {onnx_output_file}\n") 229 | 230 | 231 | def parse_args(): 232 | import argparse 233 | parser = argparse.ArgumentParser(description="DeepStream YOLO11-Seg conversion") 234 | parser.add_argument("-w", "--weights", required=True, type=str, help="Input weights (.pt) file path (required)") 235 | parser.add_argument("-s", "--size", nargs="+", type=int, default=[640], help="Inference size [H,W] (default [640])") 236 | parser.add_argument("--opset", type=int, default=17, help="ONNX opset version") 237 | parser.add_argument("--simplify", action="store_true", help="ONNX simplify model") 238 | parser.add_argument("--dynamic", action="store_true", help="Dynamic batch-size") 239 | parser.add_argument("--batch", type=int, default=1, help="Static batch-size") 240 | parser.add_argument( 241 | "--conf-threshold", type=float, default=0.25, help="Minimum detection confidence threshold (default 0.25)" 242 | ) 243 | parser.add_argument("--iou-threshold", type=float, default=0.45, help="NMS IoU threshold (default 0.45)") 244 | parser.add_argument( 245 | "--max-detections", type=int, default=100, help="Maximum number of output detections (default 100)" 246 | ) 247 | args = parser.parse_args() 248 | if not os.path.isfile(args.weights): 249 | raise SystemExit("Invalid weights file") 250 | if args.dynamic and args.batch > 1: 251 | raise SystemExit("Cannot set dynamic batch-size and static batch-size at same time") 252 | return args 253 | 254 | 255 | if __name__ == "__main__": 256 | args = parse_args() 257 | main(args) 258 | -------------------------------------------------------------------------------- /utils/export_yoloV8_seg.py: 
class RoiAlign(torch.autograd.Function):
    # ONNX-export shim for the TensorRT ROIAlignX_TRT plugin. forward() only
    # returns a correctly-shaped placeholder so tracing can proceed; the real
    # ROI-align runs inside the TensorRT plugin at inference time.
    @staticmethod
    def forward(
        self,  # autograd context (conventionally named ctx)
        X,
        rois,
        batch_indices,
        coordinate_transformation_mode,
        mode,
        output_height,
        output_width,
        sampling_ratio,
        spatial_scale
    ):
        C = X.shape[1]
        num_rois = rois.shape[0]
        # Placeholder with the pooled shape [num_rois, C, H_out, W_out].
        return torch.randn([num_rois, C, output_height, output_width], device=rois.device, dtype=rois.dtype)

    @staticmethod
    def symbolic(
        g,
        X,
        rois,
        batch_indices,
        coordinate_transformation_mode,
        mode,
        output_height,
        output_width,
        sampling_ratio,
        spatial_scale
    ):
        # Emit the TensorRT plugin node; attribute names follow the plugin schema.
        return g.op(
            "TRT::ROIAlignX_TRT",
            X,
            rois,
            batch_indices,
            coordinate_transformation_mode_i=coordinate_transformation_mode,
            mode_i=mode,
            output_height_i=output_height,
            output_width_i=output_width,
            sampling_ratio_i=sampling_ratio,
            spatial_scale_f=spatial_scale
        )
class DeepStreamOutput(nn.Module):
    # Post-processing head appended to the YOLOv8-Seg network for DeepStream:
    # splits the raw head output, runs plugin-based NMS, gathers the surviving
    # detections' mask coefficients and combines them with ROI-pooled mask
    # prototypes into per-detection mask logits.
    def __init__(self, nc, conf_threshold, iou_threshold, max_detections):
        super().__init__()
        self.nc = nc  # number of classes
        self.conf_threshold = conf_threshold
        self.iou_threshold = iou_threshold
        self.max_detections = max_detections

    def forward(self, x):
        # x[0]: raw predictions [B, 4 + nc + num_protos, anchors]; transpose to
        # anchors-major so the channel slices below are on the last axis.
        # x[1]: mask prototype maps [B, num_protos, Hp, Wp].
        preds = x[0].transpose(1, 2)
        boxes = preds[:, :, :4]
        scores = preds[:, :, 4:self.nc+4]
        masks = preds[:, :, self.nc+4:]  # per-anchor mask coefficients
        protos = x[1]

        # EfficientNMSX_TRT also returns the indices of the kept boxes so the
        # matching mask coefficients can be gathered below.
        num_detections, detection_boxes, detection_scores, detection_classes, detections_indices = NMS.apply(
            boxes, scores, self.conf_threshold, self.iou_threshold, self.max_detections
        )

        batch_size, num_protos, h_protos, w_protos = protos.shape

        total_detections = batch_size * self.max_detections

        # Build a flat (batch, detection) index pair per kept box for gathering.
        batch_index = torch.ones_like(detections_indices) * torch.arange(
            batch_size, device=boxes.device, dtype=torch.int32
        ).unsqueeze(1)
        batch_index = batch_index.view(total_detections).to(torch.int32)
        box_index = detections_indices.view(total_detections).to(torch.int32)

        selected_boxes = boxes[batch_index, box_index]
        selected_masks = masks[batch_index, box_index]

        # ROIAlignX_TRT pools the prototype maps inside each detection box
        # (full prototype resolution, spatial_scale 0.25).
        pooled_proto = RoiAlign.apply(protos, selected_boxes, batch_index, 1, 1, int(h_protos), int(w_protos), 0, 0.25)

        # Per-detection linear combination of prototypes, then sigmoid to get
        # mask probabilities flattened to h_protos * w_protos values.
        masks_protos = torch.matmul(
            selected_masks.unsqueeze(1), pooled_proto.view(total_detections, num_protos, h_protos * w_protos)
        )
        masks_protos = masks_protos.sigmoid().view(batch_size, self.max_detections, h_protos * w_protos)

        # Output per detection: [box(4), score, class, mask probabilities...].
        return torch.cat(
            [detection_boxes, detection_scores.unsqueeze(-1), detection_classes.unsqueeze(-1), masks_protos], dim=-1
        )
def parse_args():
    """Parse and validate CLI arguments for the YOLOv8-Seg ONNX export.

    Returns:
        argparse.Namespace with weights/size/opset/simplify/dynamic/batch/
        conf_threshold/iou_threshold/max_detections.

    Raises:
        SystemExit: on a missing weights file, an invalid --size/--batch value,
            or when --dynamic is combined with --batch > 1.
    """
    import argparse
    parser = argparse.ArgumentParser(description="DeepStream YOLOv8-Seg conversion")
    parser.add_argument("-w", "--weights", required=True, type=str, help="Input weights (.pt) file path (required)")
    parser.add_argument("-s", "--size", nargs="+", type=int, default=[640], help="Inference size [H,W] (default [640])")
    parser.add_argument("--opset", type=int, default=17, help="ONNX opset version")
    parser.add_argument("--simplify", action="store_true", help="ONNX simplify model")
    parser.add_argument("--dynamic", action="store_true", help="Dynamic batch-size")
    parser.add_argument("--batch", type=int, default=1, help="Static batch-size")
    parser.add_argument(
        "--conf-threshold", type=float, default=0.25, help="Minimum detection confidence threshold (default 0.25)"
    )
    parser.add_argument("--iou-threshold", type=float, default=0.45, help="NMS IoU threshold (default 0.45)")
    parser.add_argument(
        "--max-detections", type=int, default=100, help="Maximum number of output detections (default 100)"
    )
    args = parser.parse_args()
    if not os.path.isfile(args.weights):
        raise SystemExit("Invalid weights file")
    # main() expands a single value to [H, H]; anything beyond two values would
    # silently build a malformed input tensor, so reject it here.
    if len(args.size) not in (1, 2) or any(s <= 0 for s in args.size):
        raise SystemExit("Invalid size: expected one or two positive integers")
    if args.batch < 1:
        raise SystemExit("Invalid batch-size: must be >= 1")
    if args.dynamic and args.batch > 1:
        raise SystemExit("Cannot set dynamic batch-size and static batch-size at same time")
    return args
//! Common base for plugin creators: stores and exposes the plugin namespace
//! so concrete creators only need to implement the creation methods.
class BaseCreator : public IPluginCreator
{
public:
    //! Record the namespace under which plugins from this creator register.
    void setPluginNamespace(char const* libNamespace) noexcept override
    {
        mNamespace = libNamespace;
    }

    //! Return the namespace previously set (empty string by default).
    char const* getPluginNamespace() const noexcept override
    {
        return mNamespace.c_str();
    }

protected:
    std::string mNamespace;  //!< Owned copy of the last namespace set.
};
//! Base exception for TensorRT plugin failures; records where the failure
//! happened (file/function/line), a status code, a message and an error-
//! category name used by subclasses (e.g. "Cuda", "Plugin").
class TRTException : public std::exception
{
public:
    //! NOTE(review): the char pointers are stored without copying — callers
    //! appear to pass string literals / static storage; confirm any dynamic
    //! message outlives the exception.
    TRTException(char const* fl, char const* fn, int32_t ln, int32_t st, char const* msg, char const* nm)
        : file(fl)
        , function(fn)
        , line(ln)
        , status(st)
        , message(msg)
        , name(nm)
    {
    }
    //! Write a human-readable description of the failure to logStream.
    virtual void log(std::ostream& logStream) const;
    //! Replace the stored message pointer (no copy is made).
    void setMessage(char const* msg)
    {
        message = msg;
    }

protected:
    char const* file{nullptr};      //!< Source file where the error was raised.
    char const* function{nullptr};  //!< Function where the error was raised.
    int32_t line{0};                //!< Source line number.
    int32_t status{0};              //!< Status/error code at the failure site.
    char const* message{nullptr};   //!< Error description.
    char const* name{nullptr};      //!< Error-category name set by subclasses.
};
//! Serialize a trivially-copyable value into a byte buffer and advance the
//! cursor. Used by plugins to implement serialize(); must mirror read().
//! (Template parameter lists reconstructed — stripped in the extracted copy.)
template <typename Type, typename BufferType>
void write(BufferType*& buffer, Type const& val)
{
    static_assert(sizeof(BufferType) == 1, "BufferType must be a 1 byte type.");
    std::memcpy(buffer, &val, sizeof(Type));
    buffer += sizeof(Type);  // advance the caller's cursor past the value
}

//! Deserialize a value from a byte buffer and advance the cursor.
//! Counterpart of write(); used in plugin deserialization constructors.
template <typename OutType, typename BufferType>
OutType read(BufferType const*& buffer)
{
    static_assert(sizeof(BufferType) == 1, "BufferType must be a 1 byte type.");
    OutType val{};
    // memcpy avoids alignment/aliasing issues of a direct pointer cast.
    std::memcpy(&val, static_cast<void const*>(buffer), sizeof(OutType));
    buffer += sizeof(OutType);
    return val;
}
// max() for __half operands. SM80+ (Ampere) exposes a native half-precision
// __hmax; older architectures fall back to float math and convert back.
__device__ half floatMax(half a, half b)
{
#if __CUDA_ARCH__ >= 800
    return __hmax(a, b);
#else
    return __float2half(max(__half2float(a), __half2float(b)));
#endif
}

// float overload so the templated kernel code can call floatMax() uniformly
// regardless of the instantiated element type.
__device__ float floatMax(float a, float b)
{
    return max(a, b);
}
// ROI Align forward kernel (ported from PyTorch/ONNX Runtime). One thread
// computes one output element (n, c, ph, pw): it samples a grid of points
// inside the ROI bin via bilinear interpolation and reduces them with either
// average or max pooling.
// (static_cast target types reconstructed — stripped in the extracted copy.)
template <typename T>
__global__ void RoIAlignForward(int32_t const nthreads, T const* bottomData, T const spatialScale, int32_t const channels,
    int32_t const height, int32_t const width, int32_t const pooledHeight, int32_t const pooledWidth, int32_t const samplingRatio,
    T const* bottomRois, T* topData, int32_t const isModeAvg, int32_t const* batchIndicesPtr,
    int32_t const aligned)
{
    // Grid-stride loop over all output elements (index += blockDim.x * gridDim.x).
    for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x)
    {
        // (n, c, ph, pw) is an element in the pooled output
        int32_t pw = index % pooledWidth;
        int32_t ph = (index / pooledWidth) % pooledHeight;
        int32_t c = (index / pooledWidth / pooledHeight) % channels;
        int32_t n = index / pooledWidth / pooledHeight / channels;

        // ROI n: 4 coordinates plus a separate batch index per ROI.
        T const* offsetBottomRois = bottomRois + n * 4;
        auto const roiBatchInd = batchIndicesPtr[n];

        bool continuousCoordinate = aligned;
        // Do not using rounding; this implementation detail is critical
        T roiOffset = static_cast<T>(continuousCoordinate ? 0.5 : 0);
        T roiStartW = offsetBottomRois[0] * spatialScale - roiOffset;
        T roiStartH = offsetBottomRois[1] * spatialScale - roiOffset;
        T roiEndW = offsetBottomRois[2] * spatialScale - roiOffset;
        T roiEndH = offsetBottomRois[3] * spatialScale - roiOffset;

        T roiWidth = roiEndW - roiStartW;
        T roiHeight = roiEndH - roiStartH;
        if (!continuousCoordinate)
        { // backward compatibility
            // Force malformed ROIs to be 1x1
            roiWidth = floatMax(roiWidth, static_cast<T>(1.));
            roiHeight = floatMax(roiHeight, static_cast<T>(1.));
        }
        T binSizeH = static_cast<T>(roiHeight) / static_cast<T>(pooledHeight);
        T binSizeW = static_cast<T>(roiWidth) / static_cast<T>(pooledWidth);

        // Start of the (batch, channel) feature plane this thread reads from.
        T const* offsetBottomData = bottomData + static_cast<size_t>((roiBatchInd * channels + c) * height * width);

        // We use roiBinGrid to sample the grid and mimic integral
        int32_t roiBinGridH;
        if (samplingRatio > 0)
        {
            roiBinGridH = samplingRatio;
        }
        else
        {
            // Adaptive sampling: roughly one sample per input pixel in the bin.
            roiBinGridH = ceilf(roiHeight / static_cast<T>(pooledHeight));
        }

        int32_t roiBinGridW;
        if (samplingRatio > 0)
        {
            roiBinGridW = samplingRatio;
        }
        else
        {
            roiBinGridW = ceilf(roiWidth / static_cast<T>(pooledWidth));
        }
        // We do average (integral) pooling inside a bin
        T const count = roiBinGridH * roiBinGridW; // e.g. = 4

        // Precomputed offsets/steps for the sample positions inside this bin.
        T const yOff = roiStartH + static_cast<T>(ph) * binSizeH;
        T const yFac = binSizeH / static_cast<T>(roiBinGridH);

        T const xOff = roiStartW + static_cast<T>(pw) * binSizeW;
        T const xFac = binSizeW / static_cast<T>(roiBinGridW);

        T outputVal = 0.;
        bool maxFlag = false; // max mode: becomes true after the first sample seeds outputVal
        for (int32_t iy = 0; iy < roiBinGridH; iy++) // e.g., iy = 0, 1
        {
            T const y = yOff + static_cast<T>(iy + .5F) * yFac; // e.g., 0.5, 1.5
            for (int32_t ix = 0; ix < roiBinGridW; ix++)
            {
                T const x = xOff + static_cast<T>(ix + .5F) * xFac;

                T val = bilinearInterpolate(offsetBottomData, height, width, y, x, isModeAvg, index);

                if (isModeAvg)
                {
                    outputVal += val;
                }
                else
                {
                    if (!maxFlag)
                    {
                        outputVal = val;
                        maxFlag = true;
                    }
                    else
                    {
                        outputVal = floatMax(outputVal, val);
                    }
                }
            }
        }
        if (isModeAvg)
        {
            // Average over the number of sampled points, not the bin area.
            outputVal = outputVal / count;
        }

        topData[index] = outputVal;
    }
}
PLUGIN_ASSERT(isModeAvg == 0 || isModeAvg == 1); 245 | PLUGIN_ASSERT(static_cast(spatialScale) > 0.0F); 246 | PLUGIN_ASSERT(aligned == 0 || aligned == 1); 247 | 248 | int32_t const outputSize = numRois * channels * pooledHeight * pooledWidth; 249 | 250 | int32_t blocksPerGrid = static_cast(ceil(static_cast(outputSize) 251 | / maxThreadsPerBlock)); 252 | 253 | RoIAlignForward<<>>(outputSize,// nthreads 254 | bottomData, // bottomData 255 | spatialScale, // spatialScale 256 | channels, // channels 257 | height, // height 258 | width, // width 259 | pooledHeight, // pooledHeight 260 | pooledWidth, // pooledWidth 261 | samplingRatio, // samplingRatio 262 | bottomRois, // bottomRois 263 | topData, // topData 264 | isModeAvg, // isModeAvg 265 | batchIndicesPtr, // batchIndicesPtr 266 | aligned); 267 | 268 | return cudaGetLastError(); 269 | } 270 | 271 | #define SPECIALIZED_IMPL(T) \ 272 | template cudaError_t RoiAlignImpl(cudaStream_t stream, int32_t const maxThreadsPerBlock, T const* bottomData, \ 273 | T const spatialScale, int32_t const numRois, int32_t const channels, int32_t const height, \ 274 | int32_t const width, int32_t const pooledHeight, int32_t const pooledWidth, int32_t const samplingRatio, \ 275 | T const* bottomRois, T* topData, int32_t const isModeAvg, int32_t const* batchIndicesPtr, \ 276 | int32_t const aligned); 277 | 278 | SPECIALIZED_IMPL(float) 279 | SPECIALIZED_IMPL(half) 280 | -------------------------------------------------------------------------------- /utils/export_rfdetr_seg.py: -------------------------------------------------------------------------------- 1 | import os 2 | import onnx 3 | import torch 4 | import torch.nn as nn 5 | from copy import deepcopy 6 | 7 | from rfdetr import RFDETRSegPreview 8 | import rfdetr.models.backbone.projector as _m1 9 | import rfdetr.models.segmentation_head as _m2 10 | import rfdetr.models.ops.modules.ms_deform_attn as _m3 11 | 12 | 13 | def LayerNorm_forward(self, x): 14 | x = x.permute(0, 2, 3, 1) 15 
| x = F.layer_norm(x, (int(x.size(3)),), self.weight, self.bias, self.eps) 16 | x = x.permute(0, 3, 1, 2) 17 | return x 18 | 19 | _m1.LayerNorm.forward.__code__ = LayerNorm_forward.__code__ 20 | 21 | 22 | def SegmentationHead_forward_export(self, spatial_features, query_features, image_size, skip_blocks=False): 23 | assert len(query_features) == 1, "at export time, segmentation head expects exactly one query feature" 24 | 25 | target_size = (image_size[0] // self.downsample_ratio, image_size[1] // self.downsample_ratio) 26 | spatial_features = F.interpolate(spatial_features, size=target_size, mode="bilinear", align_corners=False) 27 | 28 | if not skip_blocks: 29 | for block in self.blocks: 30 | spatial_features = block(spatial_features) 31 | 32 | spatial_features_proj = self.spatial_features_proj(spatial_features) 33 | 34 | qf = self.query_features_proj(self.query_features_block(query_features[0])) 35 | 36 | return [[spatial_features_proj, qf, self.bias]] 37 | 38 | 39 | _m2.SegmentationHead.forward_export.__code__ = SegmentationHead_forward_export.__code__ 40 | 41 | 42 | def MSDeformAttn_forward( 43 | self, 44 | query, 45 | reference_points, 46 | input_flatten, 47 | input_spatial_shapes, 48 | input_level_start_index, 49 | input_padding_mask=None 50 | ): 51 | class MultiscaleDeformableAttnPlugin(torch.autograd.Function): 52 | @staticmethod 53 | def forward(self, value, spatial_shapes, level_start_index, sampling_locations, attention_weights): 54 | value = value.permute(0, 2, 3, 1) 55 | N, Lq, M, L, P, n = sampling_locations.shape 56 | attention_weights = attention_weights.view(N, Lq, M, L * P) 57 | return ms_deform_attn_core_pytorch(value, spatial_shapes, sampling_locations, attention_weights) 58 | 59 | @staticmethod 60 | def symbolic(g, value, spatial_shapes, level_start_index, sampling_locations, attention_weights): 61 | return g.op( 62 | "TRT::MultiscaleDeformableAttnPlugin_TRT", 63 | value, 64 | spatial_shapes, 65 | level_start_index, 66 | sampling_locations, 67 
| attention_weights 68 | ) 69 | 70 | N, Len_q, _ = query.shape 71 | N, Len_in, _ = input_flatten.shape 72 | assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in 73 | 74 | value = self.value_proj(input_flatten) 75 | if input_padding_mask is not None: 76 | value = value.masked_fill(input_padding_mask[..., None], float(0)) 77 | 78 | sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2) 79 | attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points) 80 | 81 | if reference_points.shape[-1] == 2: 82 | offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1) 83 | sampling_locations = reference_points[:, :, None, :, None, :] \ 84 | + sampling_offsets / offset_normalizer[None, None, None, :, None, :] 85 | elif reference_points.shape[-1] == 4: 86 | sampling_locations = reference_points[:, :, None, :, None, :2] \ 87 | + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 88 | else: 89 | raise ValueError(f"Last dim of reference_points must be 2 or 4, but get {reference_points.shape[-1]} instead.") 90 | 91 | attention_weights = F.softmax(attention_weights, -1) 92 | 93 | value = value.transpose(1, 2).contiguous().view(N, self.n_heads, self.d_model // self.n_heads, Len_in) 94 | 95 | value = value.permute(0, 3, 1, 2) 96 | 97 | L, P = sampling_locations.shape[3:5] 98 | 99 | attention_weights = attention_weights.view(N, Len_q, self.n_heads, L, P) 100 | 101 | output = MultiscaleDeformableAttnPlugin.apply( 102 | value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights 103 | ) 104 | 105 | output = output.view(N, Len_q, self.d_model) 106 | 107 | output = self.output_proj(output) 108 | return output 109 | 110 | 111 | _m3.MSDeformAttn.forward.__code__ = MSDeformAttn_forward.__code__ 112 | 113 | 114 | class RoiAlign(torch.autograd.Function): 115 | 
@staticmethod 116 | def forward( 117 | self, 118 | X, 119 | rois, 120 | batch_indices, 121 | coordinate_transformation_mode, 122 | mode, 123 | output_height, 124 | output_width, 125 | sampling_ratio, 126 | spatial_scale 127 | ): 128 | C = X.shape[1] 129 | num_rois = rois.shape[0] 130 | return torch.randn([num_rois, C, output_height, output_width], device=rois.device, dtype=rois.dtype) 131 | 132 | @staticmethod 133 | def symbolic( 134 | g, 135 | X, 136 | rois, 137 | batch_indices, 138 | coordinate_transformation_mode, 139 | mode, 140 | output_height, 141 | output_width, 142 | sampling_ratio, 143 | spatial_scale 144 | ): 145 | return g.op( 146 | "TRT::ROIAlignX_TRT", 147 | X, 148 | rois, 149 | batch_indices, 150 | coordinate_transformation_mode_i=coordinate_transformation_mode, 151 | mode_i=mode, 152 | output_height_i=output_height, 153 | output_width_i=output_width, 154 | sampling_ratio_i=sampling_ratio, 155 | spatial_scale_f=spatial_scale 156 | ) 157 | 158 | 159 | class DeepStreamOutput(nn.Module): 160 | def __init__(self, img_size, max_detections): 161 | super().__init__() 162 | self.img_size = img_size 163 | self.max_detections = max_detections 164 | 165 | def forward(self, x): 166 | boxes = x[0] 167 | convert_matrix = torch.tensor( 168 | [[1, 0, 1, 0], [0, 1, 0, 1], [-0.5, 0, 0.5, 0], [0, -0.5, 0, 0.5]], dtype=boxes.dtype, device=boxes.device 169 | ) 170 | boxes @= convert_matrix 171 | boxes *= torch.as_tensor([[*self.img_size]]).flip(1).tile([1, 2]).unsqueeze(1) 172 | scores = x[1].sigmoid() 173 | protos, masks, mask_bias = x[2] 174 | 175 | num_classes = scores.shape[2] 176 | batch_size, num_protos, h_protos, w_protos = protos.shape 177 | 178 | topk_values, topk_indexes = torch.topk(scores.view(batch_size, -1), self.max_detections, dim=1, sorted=False) 179 | 180 | scores = topk_values.unsqueeze(-1) 181 | 182 | topk_boxes = topk_indexes // num_classes 183 | labels = topk_indexes % num_classes 184 | 185 | topk_boxes = topk_boxes.unsqueeze(-1) 186 | labels = 
labels.unsqueeze(-1) 187 | 188 | boxes = torch.gather(boxes, 1, topk_boxes.repeat(1, 1, 4)) 189 | masks = torch.gather(masks, 1, topk_boxes.repeat(1, 1, num_protos)) 190 | 191 | total_detections = batch_size * self.max_detections 192 | 193 | batch_index = torch.ones( 194 | [batch_size, self.max_detections], device=boxes.device, dtype=torch.int32 195 | ) * torch.arange(batch_size, device=boxes.device, dtype=torch.int32).unsqueeze(1) 196 | batch_index = batch_index.view(total_detections) 197 | 198 | selected_boxes = boxes.view(total_detections, 4) 199 | selected_masks = masks.view(total_detections, -1) 200 | 201 | pooled_proto = RoiAlign.apply(protos, selected_boxes, batch_index, 1, 1, int(h_protos), int(w_protos), 0, 0.25) 202 | 203 | masks_protos = torch.matmul( 204 | selected_masks.unsqueeze(1), pooled_proto.view(total_detections, num_protos, h_protos * w_protos) 205 | ) 206 | masks_protos = masks_protos.view(batch_size, self.max_detections, h_protos * w_protos) + mask_bias 207 | 208 | return torch.cat([boxes, scores, labels.to(boxes.dtype), masks_protos], dim=-1) 209 | 210 | 211 | def rfdetr_seg_export(model_name, weights, nc, img_size, max_detections, device): 212 | if model_name == "rfdetr-seg-preview": 213 | model = RFDETRSegPreview(pretrain_weights=weights, resolution=img_size[0], num_classes=nc, device=device.type) 214 | else: 215 | raise NotImplementedError("Model not supported") 216 | class_names = model.class_names 217 | model = deepcopy(model.model.model) 218 | model.to(device) 219 | model.eval() 220 | if hasattr(model, "export"): 221 | model.export() 222 | if max_detections > model.num_queries: 223 | raise ValueError( 224 | f"The `max_detections={max_detections}` is higher than the model `num_queries={model.num_queries}`") 225 | return model, class_names 226 | 227 | 228 | def suppress_warnings(): 229 | import warnings 230 | warnings.filterwarnings("ignore", category=torch.jit.TracerWarning) 231 | warnings.filterwarnings("ignore", category=UserWarning) 
232 | warnings.filterwarnings("ignore", category=DeprecationWarning) 233 | warnings.filterwarnings("ignore", category=FutureWarning) 234 | warnings.filterwarnings("ignore", category=ResourceWarning) 235 | 236 | 237 | def main(args): 238 | suppress_warnings() 239 | 240 | print(f"\nStarting: {args.weights}") 241 | 242 | print("Opening RF-DETR-Seg model") 243 | 244 | device = torch.device("cpu") 245 | model, class_names = rfdetr_seg_export( 246 | args.model, args.weights, args.classes, args.size, args.max_detections, device 247 | ) 248 | 249 | if len(class_names.keys()) > 0: 250 | print("Creating labels.txt file") 251 | with open("labels.txt", "w", encoding="utf-8") as f: 252 | f.write("background\n") 253 | for i in range(1, args.classes + 1): 254 | if i in class_names: 255 | f.write(f"{class_names[i]}\n") 256 | else: 257 | f.write("empty\n") 258 | 259 | 260 | img_size = args.size * 2 if len(args.size) == 1 else args.size 261 | 262 | model = nn.Sequential( 263 | model, DeepStreamOutput(img_size, args.max_detections) 264 | ) 265 | 266 | onnx_input_im = torch.zeros(args.batch, 3, *img_size).to(device) 267 | onnx_output_file = args.weights.rsplit(".", 1)[0] + ".onnx" 268 | 269 | dynamic_axes = { 270 | "input": { 271 | 0: "batch" 272 | }, 273 | "output": { 274 | 0: "batch" 275 | } 276 | } 277 | 278 | print("Exporting the model to ONNX") 279 | torch.onnx.export( 280 | model, 281 | onnx_input_im, 282 | onnx_output_file, 283 | verbose=False, 284 | opset_version=args.opset, 285 | do_constant_folding=True, 286 | input_names=["input"], 287 | output_names=["output"], 288 | dynamic_axes=dynamic_axes if args.dynamic else None 289 | ) 290 | 291 | if args.simplify: 292 | print("Simplifying the ONNX model") 293 | import onnxslim 294 | model_onnx = onnx.load(onnx_output_file) 295 | model_onnx = onnxslim.slim(model_onnx) 296 | onnx.save(model_onnx, onnx_output_file) 297 | 298 | print(f"Done: {onnx_output_file}\n") 299 | 300 | 301 | def parse_args(): 302 | import argparse 303 | parser = 
argparse.ArgumentParser(description="DeepStream RF-DETR-Seg conversion") 304 | parser.add_argument("-m", "--model", required=True, help="Model name (required)") 305 | parser.add_argument("-w", "--weights", required=True, type=str, help="Input weights (.pt) file path (required)") 306 | parser.add_argument("-n", "--classes", type=int, default=91, help="Number of trained classes (default 91)") 307 | parser.add_argument("-s", "--size", nargs="+", type=int, default=[432], help="Inference size [H,W] (default [432])") 308 | parser.add_argument("--opset", type=int, default=17, help="ONNX opset version") 309 | parser.add_argument("--simplify", action="store_true", help="ONNX simplify model") 310 | parser.add_argument("--dynamic", action="store_true", help="Dynamic batch-size") 311 | parser.add_argument("--batch", type=int, default=1, help="Static batch-size") 312 | parser.add_argument( 313 | "--max-detections", type=int, default=100, help="Maximum number of output detections (default 100)" 314 | ) 315 | args = parser.parse_args() 316 | if not os.path.isfile(args.weights): 317 | raise SystemExit("Invalid weights file") 318 | if len(args.size) > 1 and args.size[0] != args.size[1]: 319 | raise SystemExit("RF-DETR model requires square resolution (width = height)") 320 | if args.dynamic and args.batch > 1: 321 | raise SystemExit("Cannot set dynamic batch-size and static batch-size at same time") 322 | return args 323 | 324 | 325 | if __name__ == "__main__": 326 | args = parse_args() 327 | main(args) 328 | -------------------------------------------------------------------------------- /nvdsinfer_custom_impl_Yolo_seg/trt_plugins/roiAlignPlugin/roiAlignPlugin.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
3 | * SPDX-License-Identifier: Apache-2.0 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | #include "roiAlignPlugin.h" 18 | #include "roiAlignKernel.h" 19 | #include 20 | #include 21 | 22 | using namespace nvinfer1; 23 | using namespace plugin; 24 | using nvinfer1::plugin::ROIAlign; 25 | using nvinfer1::plugin::ROIAlignPluginCreator; 26 | 27 | namespace 28 | { 29 | char const* kROIALIGN_PLUGIN_VERSION{"1"}; 30 | char const* kROIALIGN_PLUGIN_NAME{"ROIAlignX_TRT"}; 31 | size_t constexpr kSERIALIZATION_SIZE{sizeof(int32_t) * 5 + sizeof(float) + sizeof(int32_t) * 4}; 32 | } // namespace 33 | 34 | ROIAlignPluginCreator::ROIAlignPluginCreator() 35 | { 36 | static std::mutex sMutex; 37 | std::lock_guard guard(sMutex); 38 | mPluginAttributes.clear(); 39 | mPluginAttributes.emplace_back(PluginField("coordinate_transformation_mode", nullptr, PluginFieldType::kINT32, 1)); 40 | mPluginAttributes.emplace_back(PluginField("mode", nullptr, PluginFieldType::kINT32, 1)); 41 | mPluginAttributes.emplace_back(PluginField("output_height", nullptr, PluginFieldType::kINT32, 1)); 42 | mPluginAttributes.emplace_back(PluginField("output_width", nullptr, PluginFieldType::kINT32, 1)); 43 | mPluginAttributes.emplace_back(PluginField("sampling_ratio", nullptr, PluginFieldType::kINT32, 1)); 44 | mPluginAttributes.emplace_back(PluginField("spatial_scale", nullptr, PluginFieldType::kFLOAT32, 1)); 45 | 46 | mFC.nbFields = mPluginAttributes.size(); 
47 | mFC.fields = mPluginAttributes.data(); 48 | } 49 | 50 | char const* ROIAlignPluginCreator::getPluginName() const noexcept 51 | { 52 | return kROIALIGN_PLUGIN_NAME; 53 | } 54 | 55 | char const* ROIAlignPluginCreator::getPluginVersion() const noexcept 56 | { 57 | return kROIALIGN_PLUGIN_VERSION; 58 | } 59 | 60 | PluginFieldCollection const* ROIAlignPluginCreator::getFieldNames() noexcept 61 | { 62 | return &mFC; 63 | } 64 | 65 | IPluginV2DynamicExt* ROIAlignPluginCreator::createPlugin(char const* name, PluginFieldCollection const* fc) noexcept 66 | { 67 | try 68 | { 69 | PLUGIN_VALIDATE(fc != nullptr); 70 | PluginField const* fields = fc->fields; 71 | 72 | // default values 73 | int32_t outputHeight = 1; 74 | int32_t outputWidth = 1; 75 | int32_t samplingRatio = 0; 76 | int32_t mode = 1; 77 | int32_t aligned = 1; 78 | float spatialScale = 1.0F; 79 | 80 | for (int32_t i = 0; i < fc->nbFields; ++i) 81 | { 82 | char const* attrName = fields[i].name; 83 | if (!strcmp(attrName, "output_height")) 84 | { 85 | PLUGIN_VALIDATE(fields[i].type == PluginFieldType::kINT32); 86 | outputHeight = static_cast(*(static_cast(fields[i].data))); 87 | } 88 | else if (!strcmp(attrName, "output_width")) 89 | { 90 | PLUGIN_VALIDATE(fields[i].type == PluginFieldType::kINT32); 91 | outputWidth = static_cast(*(static_cast(fields[i].data))); 92 | } 93 | else if (!strcmp(attrName, "sampling_ratio")) 94 | { 95 | PLUGIN_VALIDATE(fields[i].type == PluginFieldType::kINT32); 96 | samplingRatio = static_cast(*(static_cast(fields[i].data))); 97 | } 98 | else if (!strcmp(attrName, "mode")) 99 | { 100 | PLUGIN_VALIDATE(fields[i].type == PluginFieldType::kINT32); 101 | mode = static_cast(*(static_cast(fields[i].data))); 102 | } 103 | else if (!strcmp(attrName, "spatial_scale")) 104 | { 105 | PLUGIN_VALIDATE(fields[i].type == PluginFieldType::kFLOAT32); 106 | spatialScale = static_cast(*(static_cast(fields[i].data))); 107 | } 108 | else if (!strcmp(attrName, "coordinate_transformation_mode")) 109 | { 
110 | PLUGIN_VALIDATE(fields[i].type == PluginFieldType::kINT32); 111 | aligned = static_cast(*(static_cast(fields[i].data))); 112 | } 113 | } 114 | return new ROIAlign(outputHeight, outputWidth, samplingRatio, mode, spatialScale, aligned); 115 | } 116 | catch (std::exception const& e) 117 | { 118 | caughtError(e); 119 | } 120 | return nullptr; 121 | } 122 | 123 | IPluginV2DynamicExt* ROIAlignPluginCreator::deserializePlugin( 124 | char const* name, void const* data, size_t length) noexcept 125 | { 126 | try 127 | { 128 | PLUGIN_VALIDATE(data != nullptr); 129 | return new ROIAlign(data, length); 130 | } 131 | catch (std::exception const& e) 132 | { 133 | caughtError(e); 134 | } 135 | return nullptr; 136 | } 137 | 138 | int32_t ROIAlign::getNbOutputs() const noexcept 139 | { 140 | return 1; 141 | } 142 | 143 | int32_t ROIAlign::initialize() noexcept 144 | { 145 | int32_t device; 146 | PLUGIN_CHECK_CUDA(cudaGetDevice(&device)); 147 | cudaDeviceProp props; 148 | PLUGIN_CHECK_CUDA(cudaGetDeviceProperties(&props, device)); 149 | 150 | mMaxThreadsPerBlock = props.maxThreadsPerBlock; 151 | 152 | return 0; 153 | } 154 | 155 | void ROIAlign::terminate() noexcept {} 156 | 157 | void ROIAlign::destroy() noexcept 158 | { 159 | delete this; 160 | } 161 | 162 | size_t ROIAlign::getWorkspaceSize( 163 | PluginTensorDesc const* inputs, int32_t nbInputs, PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept 164 | { 165 | return 0; 166 | } 167 | 168 | bool ROIAlign::supportsFormatCombination( 169 | int32_t pos, PluginTensorDesc const* inOut, int32_t nbInputs, int32_t nbOutputs) noexcept 170 | { 171 | PLUGIN_ASSERT(inOut != nullptr); 172 | PLUGIN_ASSERT(pos >= 0 && pos <= 3); 173 | PLUGIN_ASSERT(nbInputs == 3); 174 | PLUGIN_ASSERT(nbOutputs == 1); 175 | 176 | PluginTensorDesc const& desc = inOut[pos]; 177 | if (desc.format != TensorFormat::kLINEAR) 178 | { 179 | return false; 180 | } 181 | 182 | // first input should be float16 or float32 183 | if (pos == 0) 184 | { 185 | 
return (inOut[pos].type == nvinfer1::DataType::kFLOAT || inOut[pos].type == nvinfer1::DataType::kHALF); 186 | } 187 | 188 | // batch_indices always has to be int32 189 | if (pos == 2) 190 | { 191 | return (inOut[pos].type == nvinfer1::DataType::kINT32); 192 | } 193 | 194 | // rois and the output should have the same type as the first input 195 | return (inOut[pos].type == inOut[0].type); 196 | } 197 | 198 | char const* ROIAlign::getPluginType() const noexcept 199 | { 200 | return kROIALIGN_PLUGIN_NAME; 201 | } 202 | 203 | char const* ROIAlign::getPluginVersion() const noexcept 204 | { 205 | return kROIALIGN_PLUGIN_VERSION; 206 | } 207 | 208 | IPluginV2DynamicExt* ROIAlign::clone() const noexcept 209 | { 210 | try 211 | { 212 | auto plugin = new ROIAlign(*this); 213 | plugin->setPluginNamespace(mNameSpace.c_str()); 214 | return plugin; 215 | } 216 | catch (std::exception const& e) 217 | { 218 | caughtError(e); 219 | } 220 | return nullptr; 221 | } 222 | 223 | void ROIAlign::setPluginNamespace(char const* libNamespace) noexcept 224 | { 225 | try 226 | { 227 | PLUGIN_ASSERT(libNamespace != nullptr); 228 | mNameSpace = libNamespace; 229 | } 230 | catch (std::exception const& e) 231 | { 232 | gLogError << e.what() << std::endl; 233 | } 234 | } 235 | 236 | char const* ROIAlign::getPluginNamespace() const noexcept 237 | { 238 | return mNameSpace.c_str(); 239 | } 240 | 241 | void ROIAlign::checkValidInputs(nvinfer1::DynamicPluginTensorDesc const* inputs, int32_t nbInputDims) 242 | { 243 | PLUGIN_ASSERT(inputs != nullptr); 244 | PLUGIN_ASSERT(nbInputDims == 3); 245 | 246 | nvinfer1::Dims rois = inputs[1].desc.dims; 247 | nvinfer1::Dims batchIndices = inputs[2].desc.dims; 248 | 249 | PLUGIN_ASSERT(rois.nbDims == 2); 250 | PLUGIN_ASSERT(rois.d[1] == 4); 251 | 252 | PLUGIN_ASSERT(batchIndices.nbDims == 1); 253 | // Check batch_indices matches rois in length 254 | PLUGIN_ASSERT(rois.d[0] == batchIndices.d[0]); 255 | } 256 | 257 | void ROIAlign::validateAttributes( 258 | int32_t 
outputHeight, int32_t outputWidth, int32_t samplingRatio, int32_t mode, float spatialScale, int32_t aligned) 259 | { 260 | PLUGIN_VALIDATE(outputHeight > 0); 261 | PLUGIN_VALIDATE(outputWidth > 0); 262 | PLUGIN_VALIDATE(samplingRatio >= 0); 263 | PLUGIN_VALIDATE(mode == 0 || mode == 1); 264 | PLUGIN_VALIDATE(spatialScale > 0.0F); 265 | PLUGIN_VALIDATE(aligned == 0 || aligned == 1); 266 | } 267 | 268 | DimsExprs ROIAlign::getOutputDimensions( 269 | int32_t outputIndex, DimsExprs const* inputs, int32_t nbInputs, IExprBuilder& exprBuilder) noexcept 270 | { 271 | PLUGIN_ASSERT(inputs != nullptr); 272 | PLUGIN_ASSERT(nbInputs == 3); 273 | PLUGIN_ASSERT(outputIndex == 0); // there is only one output 274 | 275 | nvinfer1::DimsExprs result; 276 | result.nbDims = 4; 277 | 278 | // mROICount 279 | result.d[0] = inputs[1].d[0]; 280 | // mFeatureLength 281 | result.d[1] = inputs[0].d[1]; 282 | // height 283 | auto const* height = exprBuilder.constant(mOutputHeight); 284 | PLUGIN_ASSERT(height != nullptr); 285 | result.d[2] = height; 286 | // width 287 | auto const* width = exprBuilder.constant(mOutputWidth); 288 | PLUGIN_ASSERT(width != nullptr); 289 | result.d[3] = width; 290 | 291 | return result; 292 | } 293 | 294 | int32_t ROIAlign::enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* /* outputDesc */, 295 | void const* const* inputs, void* const* outputs, void* /* workspace */, cudaStream_t stream) noexcept 296 | { 297 | PLUGIN_VALIDATE(inputDesc != nullptr && inputs != nullptr && outputs != nullptr); 298 | 299 | // No-op pass-through for empty ROIs 300 | if (mROICount == 0) 301 | { 302 | return 0; 303 | } 304 | 305 | auto type = inputDesc[0].type; 306 | 307 | PLUGIN_ASSERT(type == nvinfer1::DataType::kHALF || type == nvinfer1::DataType::kFLOAT); 308 | 309 | switch (type) 310 | { 311 | case nvinfer1::DataType::kFLOAT: 312 | { 313 | auto bottomData = static_cast(inputs[0]); 314 | auto bottomRois = static_cast(inputs[1]); 315 | auto batchIndicesPtr = 
static_cast(inputs[2]); 316 | auto topData = static_cast(outputs[0]); 317 | 318 | return RoiAlignImpl(stream, mMaxThreadsPerBlock, bottomData, mSpatialScale, mROICount, mFeatureLength, 319 | mHeight, mWidth, mOutputHeight, mOutputWidth, mSamplingRatio, bottomRois, topData, mMode, batchIndicesPtr, 320 | mAligned); 321 | } 322 | break; 323 | case nvinfer1::DataType::kHALF: 324 | { 325 | auto bottomData = static_cast<__half const*>(inputs[0]); 326 | auto bottomRois = static_cast<__half const*>(inputs[1]); 327 | auto batchIndicesPtr = static_cast(inputs[2]); 328 | auto topData = static_cast<__half*>(outputs[0]); 329 | 330 | return RoiAlignImpl<__half>(stream, mMaxThreadsPerBlock, bottomData, mSpatialScale, mROICount, mFeatureLength, 331 | mHeight, mWidth, mOutputHeight, mOutputWidth, mSamplingRatio, bottomRois, topData, mMode, batchIndicesPtr, 332 | mAligned); 333 | } 334 | break; 335 | default: return -1; 336 | } 337 | 338 | return 0; 339 | } 340 | 341 | size_t ROIAlign::getSerializationSize() const noexcept 342 | { 343 | return kSERIALIZATION_SIZE; 344 | } 345 | 346 | void ROIAlign::serialize(void* buffer) const noexcept 347 | { 348 | PLUGIN_VALIDATE(buffer != nullptr); 349 | char* d = static_cast(buffer); 350 | char* a = d; 351 | write(d, mAligned); // int32_t 352 | write(d, mMode); // int32_t 353 | write(d, mOutputHeight); // int32_t 354 | write(d, mOutputWidth); // int32_t 355 | write(d, mSamplingRatio); // int32_t 356 | write(d, mSpatialScale); // float 357 | 358 | write(d, mROICount); // int32_t 359 | write(d, mFeatureLength); // int32_t 360 | write(d, mHeight); // int32_t 361 | write(d, mWidth); // int32_t 362 | PLUGIN_ASSERT(d == a + getSerializationSize()); 363 | } 364 | 365 | ROIAlign::ROIAlign( 366 | int32_t outputHeight, int32_t outputWidth, int32_t samplingRatio, int32_t mode, float spatialScale, int32_t aligned) 367 | : mOutputHeight(outputHeight) 368 | , mOutputWidth(outputWidth) 369 | , mSamplingRatio(samplingRatio) 370 | , mSpatialScale(spatialScale) 
371 | , mMode(mode) 372 | , mAligned(aligned) 373 | { 374 | validateAttributes(mOutputHeight, mOutputWidth, mSamplingRatio, mMode, mSpatialScale, mAligned); 375 | } 376 | 377 | ROIAlign::ROIAlign(void const* data, size_t length) 378 | { 379 | PLUGIN_VALIDATE(data != nullptr); 380 | PLUGIN_VALIDATE(length == kSERIALIZATION_SIZE); 381 | 382 | char const* d = static_cast(data); 383 | char const* a = d; 384 | 385 | mAligned = read(d); 386 | mMode = read(d); 387 | mOutputHeight = read(d); 388 | mOutputWidth = read(d); 389 | mSamplingRatio = read(d); 390 | mSpatialScale = read(d); 391 | 392 | mROICount = read(d); 393 | mFeatureLength = read(d); 394 | mHeight = read(d); 395 | mWidth = read(d); 396 | 397 | PLUGIN_VALIDATE(d == a + length); 398 | validateAttributes(mOutputHeight, mOutputWidth, mSamplingRatio, mMode, mSpatialScale, mAligned); 399 | } 400 | 401 | DataType ROIAlign::getOutputDataType( 402 | int32_t index, nvinfer1::DataType const* inputTypes, int32_t nbInputs) const noexcept 403 | { 404 | PLUGIN_ASSERT(inputTypes != nullptr); 405 | PLUGIN_ASSERT(nbInputs == 3); 406 | PLUGIN_ASSERT(index == 0); 407 | return inputTypes[0]; 408 | } 409 | 410 | void ROIAlign::configurePlugin( 411 | DynamicPluginTensorDesc const* in, int32_t nbInputs, DynamicPluginTensorDesc const* out, int32_t nbOutputs) noexcept 412 | { 413 | PLUGIN_ASSERT(in != nullptr); 414 | PLUGIN_ASSERT(out != nullptr); 415 | PLUGIN_ASSERT(nbOutputs == 1); 416 | PLUGIN_ASSERT(nbInputs == 3); 417 | 418 | checkValidInputs(in, nbInputs); 419 | 420 | mFeatureLength = in[0].desc.dims.d[1]; 421 | mHeight = in[0].desc.dims.d[2]; 422 | mWidth = in[0].desc.dims.d[3]; 423 | 424 | mROICount = in[1].desc.dims.d[0]; 425 | } 426 | 427 | REGISTER_TENSORRT_PLUGIN(ROIAlignPluginCreator); 428 | -------------------------------------------------------------------------------- /nvdsinfer_custom_impl_Yolo_seg/trt_plugins/efficientNMSPlugin/efficientNMSPlugin.cpp: 
-------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: Apache-2.0 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | #include "efficientNMSPlugin.h" 19 | #include "efficientNMSInference.h" 20 | 21 | using namespace nvinfer1; 22 | using nvinfer1::plugin::EfficientNMSPlugin; 23 | using nvinfer1::plugin::EfficientNMSParameters; 24 | using nvinfer1::plugin::EfficientNMSPluginCreator; 25 | 26 | namespace 27 | { 28 | char const* const kEFFICIENT_NMS_PLUGIN_VERSION{"1"}; 29 | char const* const kEFFICIENT_NMS_PLUGIN_NAME{"EfficientNMSX_TRT"}; 30 | } // namespace 31 | 32 | EfficientNMSPlugin::EfficientNMSPlugin(EfficientNMSParameters param) 33 | : mParam(std::move(param)) 34 | { 35 | } 36 | 37 | EfficientNMSPlugin::EfficientNMSPlugin(void const* data, size_t length) 38 | { 39 | deserialize(static_cast(data), length); 40 | } 41 | 42 | void EfficientNMSPlugin::deserialize(int8_t const* data, size_t length) 43 | { 44 | auto const* d{data}; 45 | mParam = read(d); 46 | PLUGIN_VALIDATE(d == data + length); 47 | } 48 | 49 | char const* EfficientNMSPlugin::getPluginType() const noexcept 50 | { 51 | return kEFFICIENT_NMS_PLUGIN_NAME; 52 | } 53 | 54 | char const* EfficientNMSPlugin::getPluginVersion() const noexcept 55 | { 56 | return 
kEFFICIENT_NMS_PLUGIN_VERSION; 57 | } 58 | 59 | int32_t EfficientNMSPlugin::getNbOutputs() const noexcept 60 | { 61 | // Standard Plugin Implementation 62 | return 5; 63 | } 64 | 65 | int32_t EfficientNMSPlugin::initialize() noexcept 66 | { 67 | if (!initialized) 68 | { 69 | int32_t device; 70 | CSC(cudaGetDevice(&device), STATUS_FAILURE); 71 | struct cudaDeviceProp properties; 72 | CSC(cudaGetDeviceProperties(&properties, device), STATUS_FAILURE); 73 | if (properties.regsPerBlock >= 65536) 74 | { 75 | // Most Devices 76 | mParam.numSelectedBoxes = 5000; 77 | } 78 | else 79 | { 80 | // Jetson TX1/TX2 81 | mParam.numSelectedBoxes = 2000; 82 | } 83 | initialized = true; 84 | } 85 | return STATUS_SUCCESS; 86 | } 87 | 88 | void EfficientNMSPlugin::terminate() noexcept {} 89 | 90 | size_t EfficientNMSPlugin::getSerializationSize() const noexcept 91 | { 92 | return sizeof(EfficientNMSParameters); 93 | } 94 | 95 | void EfficientNMSPlugin::serialize(void* buffer) const noexcept 96 | { 97 | char *d = reinterpret_cast(buffer), *a = d; 98 | write(d, mParam); 99 | PLUGIN_ASSERT(d == a + getSerializationSize()); 100 | } 101 | 102 | void EfficientNMSPlugin::destroy() noexcept 103 | { 104 | delete this; 105 | } 106 | 107 | void EfficientNMSPlugin::setPluginNamespace(char const* pluginNamespace) noexcept 108 | { 109 | try 110 | { 111 | mNamespace = pluginNamespace; 112 | } 113 | catch (std::exception const& e) 114 | { 115 | caughtError(e); 116 | } 117 | } 118 | 119 | char const* EfficientNMSPlugin::getPluginNamespace() const noexcept 120 | { 121 | return mNamespace.c_str(); 122 | } 123 | 124 | nvinfer1::DataType EfficientNMSPlugin::getOutputDataType( 125 | int32_t index, nvinfer1::DataType const* inputTypes, int32_t nbInputs) const noexcept 126 | { 127 | // On standard NMS, num_detections and detection_classes use integer outputs 128 | if (index == 0 || index == 3 || index == 4) 129 | { 130 | return nvinfer1::DataType::kINT32; 131 | } 132 | // All others should use the same 
datatype as the input 133 | return inputTypes[0]; 134 | } 135 | 136 | IPluginV2DynamicExt* EfficientNMSPlugin::clone() const noexcept 137 | { 138 | try 139 | { 140 | auto* plugin = new EfficientNMSPlugin(mParam); 141 | plugin->setPluginNamespace(mNamespace.c_str()); 142 | return plugin; 143 | } 144 | catch (std::exception const& e) 145 | { 146 | caughtError(e); 147 | } 148 | return nullptr; 149 | } 150 | 151 | DimsExprs EfficientNMSPlugin::getOutputDimensions( 152 | int32_t outputIndex, DimsExprs const* inputs, int32_t nbInputs, IExprBuilder& exprBuilder) noexcept 153 | { 154 | try 155 | { 156 | DimsExprs out_dim; 157 | 158 | // When pad per class is set, the output size may need to be reduced: 159 | // i.e.: outputBoxes = min(outputBoxes, outputBoxesPerClass * numClasses) 160 | // As the number of classes may not be static, numOutputBoxes must be a dynamic 161 | // expression. The corresponding parameter can not be set at this time, so the 162 | // value will be calculated again in configurePlugin() and the param overwritten. 
163 | IDimensionExpr const* numOutputBoxes = exprBuilder.constant(mParam.numOutputBoxes); 164 | if (mParam.padOutputBoxesPerClass && mParam.numOutputBoxesPerClass > 0) 165 | { 166 | IDimensionExpr const* numOutputBoxesPerClass = exprBuilder.constant(mParam.numOutputBoxesPerClass); 167 | IDimensionExpr const* numClasses = inputs[1].d[2]; 168 | numOutputBoxes = exprBuilder.operation(DimensionOperation::kMIN, *numOutputBoxes, 169 | *exprBuilder.operation(DimensionOperation::kPROD, *numOutputBoxesPerClass, *numClasses)); 170 | } 171 | 172 | // Standard NMS 173 | PLUGIN_ASSERT(outputIndex >= 0 && outputIndex <= 4); 174 | 175 | // num_detections 176 | if (outputIndex == 0) 177 | { 178 | out_dim.nbDims = 2; 179 | out_dim.d[0] = inputs[0].d[0]; 180 | out_dim.d[1] = exprBuilder.constant(1); 181 | } 182 | // detection_boxes 183 | else if (outputIndex == 1) 184 | { 185 | out_dim.nbDims = 3; 186 | out_dim.d[0] = inputs[0].d[0]; 187 | out_dim.d[1] = numOutputBoxes; 188 | out_dim.d[2] = exprBuilder.constant(4); 189 | } 190 | // detection_scores: outputIndex == 2 191 | // detection_classes: outputIndex == 3 192 | else if (outputIndex == 2 || outputIndex == 3 || outputIndex == 4) 193 | { 194 | out_dim.nbDims = 2; 195 | out_dim.d[0] = inputs[0].d[0]; 196 | out_dim.d[1] = numOutputBoxes; 197 | } 198 | 199 | return out_dim; 200 | } 201 | catch (std::exception const& e) 202 | { 203 | caughtError(e); 204 | } 205 | return DimsExprs{}; 206 | } 207 | 208 | bool EfficientNMSPlugin::supportsFormatCombination( 209 | int32_t pos, PluginTensorDesc const* inOut, int32_t nbInputs, int32_t nbOutputs) noexcept 210 | { 211 | if (inOut[pos].format != PluginFormat::kLINEAR) 212 | { 213 | return false; 214 | } 215 | 216 | PLUGIN_ASSERT(nbInputs == 2 || nbInputs == 3); 217 | PLUGIN_ASSERT(nbOutputs == 5); 218 | if (nbInputs == 2) 219 | { 220 | PLUGIN_ASSERT(0 <= pos && pos <= 6); 221 | } 222 | if (nbInputs == 3) 223 | { 224 | PLUGIN_ASSERT(0 <= pos && pos <= 7); 225 | } 226 | 227 | // num_detections 
and detection_classes output: int32_t 228 | int32_t const posOut = pos - nbInputs; 229 | if (posOut == 0 || posOut == 3 || posOut == 4) 230 | { 231 | return inOut[pos].type == DataType::kINT32 && inOut[pos].format == PluginFormat::kLINEAR; 232 | } 233 | 234 | // all other inputs/outputs: fp32 or fp16 235 | return (inOut[pos].type == DataType::kHALF || inOut[pos].type == DataType::kFLOAT) 236 | && (inOut[0].type == inOut[pos].type); 237 | } 238 | 239 | void EfficientNMSPlugin::configurePlugin( 240 | DynamicPluginTensorDesc const* in, int32_t nbInputs, DynamicPluginTensorDesc const* out, int32_t nbOutputs) noexcept 241 | { 242 | try 243 | { 244 | // Accepts two or three inputs 245 | // If two inputs: [0] boxes, [1] scores 246 | // If three inputs: [0] boxes, [1] scores, [2] anchors 247 | PLUGIN_ASSERT(nbInputs == 2 || nbInputs == 3); 248 | PLUGIN_ASSERT(nbOutputs == 5); 249 | 250 | mParam.datatype = in[0].desc.type; 251 | 252 | // Shape of scores input should be 253 | // [batch_size, num_boxes, num_classes] or [batch_size, num_boxes, num_classes, 1] 254 | PLUGIN_ASSERT(in[1].desc.dims.nbDims == 3 || (in[1].desc.dims.nbDims == 4 && in[1].desc.dims.d[3] == 1)); 255 | mParam.numScoreElements = in[1].desc.dims.d[1] * in[1].desc.dims.d[2]; 256 | mParam.numClasses = in[1].desc.dims.d[2]; 257 | 258 | // When pad per class is set, the total output boxes size may need to be reduced. 259 | // This operation is also done in getOutputDimension(), but for dynamic shapes, the 260 | // numOutputBoxes param can't be set until the number of classes is fully known here. 
261 | if (mParam.padOutputBoxesPerClass && mParam.numOutputBoxesPerClass > 0) 262 | { 263 | if (mParam.numOutputBoxesPerClass * mParam.numClasses < mParam.numOutputBoxes) 264 | { 265 | mParam.numOutputBoxes = mParam.numOutputBoxesPerClass * mParam.numClasses; 266 | } 267 | } 268 | 269 | // Shape of boxes input should be 270 | // [batch_size, num_boxes, 4] or [batch_size, num_boxes, 1, 4] or [batch_size, num_boxes, num_classes, 4] 271 | PLUGIN_ASSERT(in[0].desc.dims.nbDims == 3 || in[0].desc.dims.nbDims == 4); 272 | if (in[0].desc.dims.nbDims == 3) 273 | { 274 | PLUGIN_ASSERT(in[0].desc.dims.d[2] == 4); 275 | mParam.shareLocation = true; 276 | mParam.numBoxElements = in[0].desc.dims.d[1] * in[0].desc.dims.d[2]; 277 | } 278 | else 279 | { 280 | mParam.shareLocation = (in[0].desc.dims.d[2] == 1); 281 | PLUGIN_ASSERT(in[0].desc.dims.d[2] == mParam.numClasses || mParam.shareLocation); 282 | PLUGIN_ASSERT(in[0].desc.dims.d[3] == 4); 283 | mParam.numBoxElements = in[0].desc.dims.d[1] * in[0].desc.dims.d[2] * in[0].desc.dims.d[3]; 284 | } 285 | mParam.numAnchors = in[0].desc.dims.d[1]; 286 | 287 | if (nbInputs == 2) 288 | { 289 | // Only two inputs are used, disable the fused box decoder 290 | mParam.boxDecoder = false; 291 | } 292 | if (nbInputs == 3) 293 | { 294 | // All three inputs are used, enable the box decoder 295 | // Shape of anchors input should be 296 | // Constant shape: [1, numAnchors, 4] or [batch_size, numAnchors, 4] 297 | PLUGIN_ASSERT(in[2].desc.dims.nbDims == 3); 298 | mParam.boxDecoder = true; 299 | mParam.shareAnchors = (in[2].desc.dims.d[0] == 1); 300 | } 301 | } 302 | catch (std::exception const& e) 303 | { 304 | caughtError(e); 305 | } 306 | } 307 | 308 | size_t EfficientNMSPlugin::getWorkspaceSize( 309 | PluginTensorDesc const* inputs, int32_t nbInputs, PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept 310 | { 311 | int32_t batchSize = inputs[1].dims.d[0]; 312 | int32_t numScoreElements = inputs[1].dims.d[1] * inputs[1].dims.d[2]; 
313 | int32_t numClasses = inputs[1].dims.d[2]; 314 | return EfficientNMSWorkspaceSize(batchSize, numScoreElements, numClasses, mParam.datatype); 315 | } 316 | 317 | int32_t EfficientNMSPlugin::enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* /* outputDesc */, 318 | void const* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) noexcept 319 | { 320 | try 321 | { 322 | PLUGIN_VALIDATE(inputDesc != nullptr && inputs != nullptr && outputs != nullptr && workspace != nullptr); 323 | 324 | mParam.batchSize = inputDesc[0].dims.d[0]; 325 | 326 | // Standard NMS Operation 327 | void const* const boxesInput = inputs[0]; 328 | void const* const scoresInput = inputs[1]; 329 | void const* const anchorsInput = mParam.boxDecoder ? inputs[2] : nullptr; 330 | 331 | void* numDetectionsOutput = outputs[0]; 332 | void* nmsBoxesOutput = outputs[1]; 333 | void* nmsScoresOutput = outputs[2]; 334 | void* nmsClassesOutput = outputs[3]; 335 | void* nmsIndicesOutput = outputs[4]; 336 | 337 | return EfficientNMSInference(mParam, boxesInput, scoresInput, anchorsInput, numDetectionsOutput, nmsBoxesOutput, 338 | nmsScoresOutput, nmsClassesOutput, nmsIndicesOutput, workspace, stream); 339 | } 340 | catch (std::exception const& e) 341 | { 342 | caughtError(e); 343 | } 344 | return -1; 345 | } 346 | 347 | // Standard NMS Plugin Operation 348 | 349 | EfficientNMSPluginCreator::EfficientNMSPluginCreator() 350 | : mParam{} 351 | { 352 | mPluginAttributes.clear(); 353 | mPluginAttributes.emplace_back(PluginField("score_threshold", nullptr, PluginFieldType::kFLOAT32, 1)); 354 | mPluginAttributes.emplace_back(PluginField("iou_threshold", nullptr, PluginFieldType::kFLOAT32, 1)); 355 | mPluginAttributes.emplace_back(PluginField("max_output_boxes", nullptr, PluginFieldType::kINT32, 1)); 356 | mPluginAttributes.emplace_back(PluginField("background_class", nullptr, PluginFieldType::kINT32, 1)); 357 | mPluginAttributes.emplace_back(PluginField("score_activation", 
nullptr, PluginFieldType::kINT32, 1)); 358 | mPluginAttributes.emplace_back(PluginField("class_agnostic", nullptr, PluginFieldType::kINT32, 1)); 359 | mPluginAttributes.emplace_back(PluginField("box_coding", nullptr, PluginFieldType::kINT32, 1)); 360 | mFC.nbFields = mPluginAttributes.size(); 361 | mFC.fields = mPluginAttributes.data(); 362 | } 363 | 364 | char const* EfficientNMSPluginCreator::getPluginName() const noexcept 365 | { 366 | return kEFFICIENT_NMS_PLUGIN_NAME; 367 | } 368 | 369 | char const* EfficientNMSPluginCreator::getPluginVersion() const noexcept 370 | { 371 | return kEFFICIENT_NMS_PLUGIN_VERSION; 372 | } 373 | 374 | PluginFieldCollection const* EfficientNMSPluginCreator::getFieldNames() noexcept 375 | { 376 | return &mFC; 377 | } 378 | 379 | IPluginV2DynamicExt* EfficientNMSPluginCreator::createPlugin(char const* name, PluginFieldCollection const* fc) noexcept 380 | { 381 | try 382 | { 383 | PLUGIN_VALIDATE(fc != nullptr); 384 | PluginField const* fields = fc->fields; 385 | PLUGIN_VALIDATE(fields != nullptr); 386 | plugin::validateRequiredAttributesExist({"score_threshold", "iou_threshold", "max_output_boxes", 387 | "background_class", "score_activation", "box_coding"}, 388 | fc); 389 | for (int32_t i{0}; i < fc->nbFields; ++i) 390 | { 391 | char const* attrName = fields[i].name; 392 | if (!strcmp(attrName, "score_threshold")) 393 | { 394 | PLUGIN_VALIDATE(fields[i].type == PluginFieldType::kFLOAT32); 395 | auto const scoreThreshold = *(static_cast(fields[i].data)); 396 | PLUGIN_VALIDATE(scoreThreshold >= 0.0F); 397 | mParam.scoreThreshold = scoreThreshold; 398 | } 399 | if (!strcmp(attrName, "iou_threshold")) 400 | { 401 | PLUGIN_VALIDATE(fields[i].type == PluginFieldType::kFLOAT32); 402 | auto const iouThreshold = *(static_cast(fields[i].data)); 403 | PLUGIN_VALIDATE(iouThreshold > 0.0F); 404 | mParam.iouThreshold = iouThreshold; 405 | } 406 | if (!strcmp(attrName, "max_output_boxes")) 407 | { 408 | PLUGIN_VALIDATE(fields[i].type == 
PluginFieldType::kINT32); 409 | auto const numOutputBoxes = *(static_cast(fields[i].data)); 410 | PLUGIN_VALIDATE(numOutputBoxes > 0); 411 | mParam.numOutputBoxes = numOutputBoxes; 412 | } 413 | if (!strcmp(attrName, "background_class")) 414 | { 415 | PLUGIN_VALIDATE(fields[i].type == PluginFieldType::kINT32); 416 | mParam.backgroundClass = *(static_cast(fields[i].data)); 417 | } 418 | if (!strcmp(attrName, "score_activation")) 419 | { 420 | auto const scoreSigmoid = *(static_cast(fields[i].data)); 421 | PLUGIN_VALIDATE(scoreSigmoid == 0 || scoreSigmoid == 1); 422 | mParam.scoreSigmoid = static_cast(scoreSigmoid); 423 | } 424 | if (!strcmp(attrName, "class_agnostic")) 425 | { 426 | auto const classAgnostic = *(static_cast(fields[i].data)); 427 | PLUGIN_VALIDATE(classAgnostic == 0 || classAgnostic == 1); 428 | mParam.classAgnostic = static_cast(classAgnostic); 429 | } 430 | if (!strcmp(attrName, "box_coding")) 431 | { 432 | PLUGIN_VALIDATE(fields[i].type == PluginFieldType::kINT32); 433 | auto const boxCoding = *(static_cast(fields[i].data)); 434 | PLUGIN_VALIDATE(boxCoding == 0 || boxCoding == 1); 435 | mParam.boxCoding = boxCoding; 436 | } 437 | } 438 | 439 | auto* plugin = new EfficientNMSPlugin(mParam); 440 | plugin->setPluginNamespace(mNamespace.c_str()); 441 | return plugin; 442 | } 443 | catch (std::exception const& e) 444 | { 445 | caughtError(e); 446 | } 447 | return nullptr; 448 | } 449 | 450 | IPluginV2DynamicExt* EfficientNMSPluginCreator::deserializePlugin( 451 | char const* name, void const* serialData, size_t serialLength) noexcept 452 | { 453 | try 454 | { 455 | // This object will be deleted when the network is destroyed, which will 456 | // call EfficientNMSPlugin::destroy() 457 | auto* plugin = new EfficientNMSPlugin(serialData, serialLength); 458 | plugin->setPluginNamespace(mNamespace.c_str()); 459 | return plugin; 460 | } 461 | catch (std::exception const& e) 462 | { 463 | caughtError(e); 464 | } 465 | return nullptr; 466 | } 467 | 468 | 
REGISTER_TENSORRT_PLUGIN(EfficientNMSPluginCreator); 469 | -------------------------------------------------------------------------------- /nvdsinfer_custom_impl_Yolo_seg/trt_plugins/efficientNMSPlugin/efficientNMSInference.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: Apache-2.0 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | #include "../common.h" 19 | #include "cub/cub.cuh" 20 | #include "cuda_runtime_api.h" 21 | 22 | #include "efficientNMSInference.cuh" 23 | #include "efficientNMSInference.h" 24 | 25 | #define NMS_TILES 5 26 | 27 | using namespace nvinfer1; 28 | using namespace nvinfer1::plugin; 29 | 30 | template 31 | __device__ float IOU(EfficientNMSParameters param, BoxCorner box1, BoxCorner box2) 32 | { 33 | // Regardless of the selected box coding, IOU is always performed in BoxCorner coding. 34 | // The boxes are copied so that they can be reordered without affecting the originals. 
35 | BoxCorner b1 = box1; 36 | BoxCorner b2 = box2; 37 | b1.reorder(); 38 | b2.reorder(); 39 | float intersectArea = BoxCorner::intersect(b1, b2).area(); 40 | if (intersectArea <= 0.f) 41 | { 42 | return 0.f; 43 | } 44 | float unionArea = b1.area() + b2.area() - intersectArea; 45 | if (unionArea <= 0.f) 46 | { 47 | return 0.f; 48 | } 49 | return intersectArea / unionArea; 50 | } 51 | 52 | template 53 | __device__ BoxCorner DecodeBoxes(EfficientNMSParameters param, int boxIdx, int anchorIdx, 54 | const Tb* __restrict__ boxesInput, const Tb* __restrict__ anchorsInput) 55 | { 56 | // The inputs will be in the selected coding format, as well as the decoding function. But the decoded box 57 | // will always be returned as BoxCorner. 58 | Tb box = boxesInput[boxIdx]; 59 | if (!param.boxDecoder) 60 | { 61 | return BoxCorner(box); 62 | } 63 | Tb anchor = anchorsInput[anchorIdx]; 64 | box.reorder(); 65 | anchor.reorder(); 66 | return BoxCorner(box.decode(anchor)); 67 | } 68 | 69 | template 70 | __device__ void MapNMSData(EfficientNMSParameters param, int idx, int imageIdx, const Tb* __restrict__ boxesInput, 71 | const Tb* __restrict__ anchorsInput, const int* __restrict__ topClassData, const int* __restrict__ topAnchorsData, 72 | const int* __restrict__ topNumData, const T* __restrict__ sortedScoresData, const int* __restrict__ sortedIndexData, 73 | T& scoreMap, int& classMap, BoxCorner& boxMap, int& boxIdxMap) 74 | { 75 | // idx: Holds the NMS box index, within the current batch. 76 | // idxSort: Holds the batched NMS box index, which indexes the (filtered, but sorted) score buffer. 77 | // scoreMap: Holds the score that corresponds to the indexed box being processed by NMS. 78 | if (idx >= topNumData[imageIdx]) 79 | { 80 | return; 81 | } 82 | int idxSort = imageIdx * param.numScoreElements + idx; 83 | scoreMap = sortedScoresData[idxSort]; 84 | 85 | // idxMap: Holds the re-mapped index, which indexes the (filtered, but unsorted) buffers. 
86 | // classMap: Holds the class that corresponds to the idx'th sorted score being processed by NMS. 87 | // anchorMap: Holds the anchor that corresponds to the idx'th sorted score being processed by NMS. 88 | int idxMap = imageIdx * param.numScoreElements + sortedIndexData[idxSort]; 89 | classMap = topClassData[idxMap]; 90 | int anchorMap = topAnchorsData[idxMap]; 91 | 92 | // boxIdxMap: Holds the re-re-mapped index, which indexes the (unfiltered, and unsorted) boxes input buffer. 93 | boxIdxMap = -1; 94 | if (param.shareLocation) // Shape of boxesInput: [batchSize, numAnchors, 1, 4] 95 | { 96 | boxIdxMap = imageIdx * param.numAnchors + anchorMap; 97 | } 98 | else // Shape of boxesInput: [batchSize, numAnchors, numClasses, 4] 99 | { 100 | int batchOffset = imageIdx * param.numAnchors * param.numClasses; 101 | int anchorOffset = anchorMap * param.numClasses; 102 | boxIdxMap = batchOffset + anchorOffset + classMap; 103 | } 104 | // anchorIdxMap: Holds the re-re-mapped index, which indexes the (unfiltered, and unsorted) anchors input buffer. 105 | int anchorIdxMap = -1; 106 | if (param.shareAnchors) // Shape of anchorsInput: [1, numAnchors, 4] 107 | { 108 | anchorIdxMap = anchorMap; 109 | } 110 | else // Shape of anchorsInput: [batchSize, numAnchors, 4] 111 | { 112 | anchorIdxMap = imageIdx * param.numAnchors + anchorMap; 113 | } 114 | // boxMap: Holds the box that corresponds to the idx'th sorted score being processed by NMS. 
115 | boxMap = DecodeBoxes(param, boxIdxMap, anchorIdxMap, boxesInput, anchorsInput); 116 | } 117 | 118 | template 119 | __device__ void WriteNMSResult(EfficientNMSParameters param, int* __restrict__ numDetectionsOutput, 120 | T* __restrict__ nmsScoresOutput, int* __restrict__ nmsClassesOutput, BoxCorner* __restrict__ nmsBoxesOutput, 121 | int* __restrict__ nmsIndicesOutput, T threadScore, int threadClass, BoxCorner threadBox, int boxIdxMap, 122 | int imageIdx, unsigned int resultsCounter) 123 | { 124 | int outputIdx = imageIdx * param.numOutputBoxes + resultsCounter - 1; 125 | if (param.scoreSigmoid) 126 | { 127 | nmsScoresOutput[outputIdx] = sigmoid_mp(threadScore); 128 | } 129 | else if (param.scoreBits > 0) 130 | { 131 | nmsScoresOutput[outputIdx] = add_mp(threadScore, (T) -1); 132 | } 133 | else 134 | { 135 | nmsScoresOutput[outputIdx] = threadScore; 136 | } 137 | nmsClassesOutput[outputIdx] = threadClass; 138 | if (param.clipBoxes) 139 | { 140 | nmsBoxesOutput[outputIdx] = threadBox.clip((T) 0, (T) 1); 141 | } 142 | else 143 | { 144 | nmsBoxesOutput[outputIdx] = threadBox; 145 | } 146 | nmsIndicesOutput[outputIdx] = boxIdxMap % param.numAnchors; 147 | numDetectionsOutput[imageIdx] = resultsCounter; 148 | 149 | } 150 | 151 | template 152 | __global__ void EfficientNMS(EfficientNMSParameters param, const int* topNumData, int* outputIndexData, 153 | int* outputClassData, const int* sortedIndexData, const T* __restrict__ sortedScoresData, 154 | const int* __restrict__ topClassData, const int* __restrict__ topAnchorsData, const Tb* __restrict__ boxesInput, 155 | const Tb* __restrict__ anchorsInput, int* __restrict__ numDetectionsOutput, T* __restrict__ nmsScoresOutput, 156 | int* __restrict__ nmsClassesOutput, int* __restrict__ nmsIndicesOutput, BoxCorner* __restrict__ nmsBoxesOutput) 157 | { 158 | unsigned int thread = threadIdx.x; 159 | unsigned int imageIdx = blockIdx.y; 160 | unsigned int tileSize = blockDim.x; 161 | if (imageIdx >= param.batchSize) 162 | { 
163 | return; 164 | } 165 | 166 | int numSelectedBoxes = min(topNumData[imageIdx], param.numSelectedBoxes); 167 | int numTiles = (numSelectedBoxes + tileSize - 1) / tileSize; 168 | if (thread >= numSelectedBoxes) 169 | { 170 | return; 171 | } 172 | 173 | __shared__ int blockState; 174 | __shared__ unsigned int resultsCounter; 175 | if (thread == 0) 176 | { 177 | blockState = 0; 178 | resultsCounter = 0; 179 | } 180 | 181 | int threadState[NMS_TILES]; 182 | unsigned int boxIdx[NMS_TILES]; 183 | T threadScore[NMS_TILES]; 184 | int threadClass[NMS_TILES]; 185 | BoxCorner threadBox[NMS_TILES]; 186 | int boxIdxMap[NMS_TILES]; 187 | for (int tile = 0; tile < numTiles; tile++) 188 | { 189 | threadState[tile] = 0; 190 | boxIdx[tile] = thread + tile * blockDim.x; 191 | MapNMSData(param, boxIdx[tile], imageIdx, boxesInput, anchorsInput, topClassData, topAnchorsData, 192 | topNumData, sortedScoresData, sortedIndexData, threadScore[tile], threadClass[tile], threadBox[tile], 193 | boxIdxMap[tile]); 194 | } 195 | 196 | // Iterate through all boxes to NMS against. 197 | for (int i = 0; i < numSelectedBoxes; i++) 198 | { 199 | int tile = i / tileSize; 200 | 201 | if (boxIdx[tile] == i) 202 | { 203 | // Iteration lead thread, figure out what the other threads should do, 204 | // this will be signaled via the blockState shared variable. 205 | if (threadState[tile] == -1) 206 | { 207 | // Thread already dead, this box was already dropped in a previous iteration, 208 | // because it had a large IOU overlap with another lead thread previously, so 209 | // it would never be kept anyway, therefore it can safely be skip all IOU operations 210 | // in this iteration. 211 | blockState = -1; // -1 => Signal all threads to skip iteration 212 | } 213 | else if (threadState[tile] == 0) 214 | { 215 | // As this box will be kept, this is a good place to find what index in the results buffer it 216 | // should have, as this allows to perform an early loop exit if there are enough results. 
217 | if (resultsCounter >= param.numOutputBoxes) 218 | { 219 | blockState = -2; // -2 => Signal all threads to do an early loop exit. 220 | } 221 | else 222 | { 223 | // Thread is still alive, because it has not had a large enough IOU overlap with 224 | // any other kept box previously. Therefore, this box will be kept for sure. However, 225 | // we need to check against all other subsequent boxes from this position onward, 226 | // to see how those other boxes will behave in future iterations. 227 | blockState = 1; // +1 => Signal all (higher index) threads to calculate IOU against this box 228 | threadState[tile] = 1; // +1 => Mark this box's thread to be kept and written out to results 229 | 230 | // If the numOutputBoxesPerClass check is enabled, write the result only if the limit for this 231 | // class on this image has not been reached yet. Other than (possibly) skipping the write, this 232 | // won't affect anything else in the NMS threading. 233 | bool write = true; 234 | if (param.numOutputBoxesPerClass >= 0) 235 | { 236 | int classCounterIdx = imageIdx * param.numClasses + threadClass[tile]; 237 | write = (outputClassData[classCounterIdx] < param.numOutputBoxesPerClass); 238 | outputClassData[classCounterIdx]++; 239 | } 240 | if (write) 241 | { 242 | // This branch is visited by one thread per iteration, so it's safe to do non-atomic increments. 243 | resultsCounter++; 244 | 245 | WriteNMSResult(param, numDetectionsOutput, nmsScoresOutput, nmsClassesOutput, nmsBoxesOutput, 246 | nmsIndicesOutput, threadScore[tile], threadClass[tile], threadBox[tile], boxIdxMap[tile], 247 | imageIdx, resultsCounter); 248 | } 249 | } 250 | } 251 | else 252 | { 253 | // This state should never be reached, but just in case... 254 | blockState = 0; // 0 => Signal all threads to not do any updates, nothing happens. 255 | } 256 | } 257 | 258 | __syncthreads(); 259 | 260 | if (blockState == -2) 261 | { 262 | // This is the signal to exit from the loop. 
263 | return; 264 | } 265 | 266 | if (blockState == -1) 267 | { 268 | // This is the signal for all threads to just skip this iteration, as no IOU's need to be checked. 269 | continue; 270 | } 271 | 272 | // Grab a box and class to test the current box against. The test box corresponds to iteration i, 273 | // therefore it will have a lower index than the current thread box, and will therefore have a higher score 274 | // than the current box because it's located "before" in the sorted score list. 275 | T testScore; 276 | int testClass; 277 | BoxCorner testBox; 278 | int testBoxIdxMap; 279 | MapNMSData(param, i, imageIdx, boxesInput, anchorsInput, topClassData, topAnchorsData, topNumData, 280 | sortedScoresData, sortedIndexData, testScore, testClass, testBox, testBoxIdxMap); 281 | 282 | for (int tile = 0; tile < numTiles; tile++) 283 | { 284 | bool ignoreClass = true; 285 | if (!param.classAgnostic) 286 | { 287 | ignoreClass = threadClass[tile] == testClass; 288 | } 289 | 290 | // IOU 291 | if (boxIdx[tile] > i && // Make sure two different boxes are being tested, and that it's a higher index; 292 | boxIdx[tile] < numSelectedBoxes && // Make sure the box is within numSelectedBoxes; 293 | blockState == 1 && // Signal that allows IOU checks to be performed; 294 | threadState[tile] == 0 && // Make sure this box hasn't been either dropped or kept already; 295 | ignoreClass && // Compare only boxes of matching classes when classAgnostic is false; 296 | lte_mp(threadScore[tile], testScore) && // Make sure the sorting order of scores is as expected; 297 | IOU(param, threadBox[tile], testBox) >= param.iouThreshold) // And... IOU overlap. 298 | { 299 | // Current box overlaps with the box tested in this iteration, this box will be skipped. 300 | threadState[tile] = -1; // -1 => Mark this box's thread to be dropped. 
301 | } 302 | } 303 | } 304 | } 305 | 306 | template 307 | cudaError_t EfficientNMSLauncher(EfficientNMSParameters& param, int* topNumData, int* outputIndexData, 308 | int* outputClassData, int* sortedIndexData, T* sortedScoresData, int* topClassData, int* topAnchorsData, 309 | const void* boxesInput, const void* anchorsInput, int* numDetectionsOutput, T* nmsScoresOutput, 310 | int* nmsClassesOutput, int* nmsIndicesOutput, void* nmsBoxesOutput, cudaStream_t stream) 311 | { 312 | unsigned int tileSize = param.numSelectedBoxes / NMS_TILES; 313 | if (param.numSelectedBoxes <= 512) 314 | { 315 | tileSize = 512; 316 | } 317 | if (param.numSelectedBoxes <= 256) 318 | { 319 | tileSize = 256; 320 | } 321 | 322 | const dim3 blockSize = {tileSize, 1, 1}; 323 | const dim3 gridSize = {1, (unsigned int) param.batchSize, 1}; 324 | 325 | if (param.boxCoding == 0) 326 | { 327 | EfficientNMS><<>>(param, topNumData, outputIndexData, 328 | outputClassData, sortedIndexData, sortedScoresData, topClassData, topAnchorsData, 329 | (BoxCorner*) boxesInput, (BoxCorner*) anchorsInput, numDetectionsOutput, nmsScoresOutput, 330 | nmsClassesOutput, nmsIndicesOutput, (BoxCorner*) nmsBoxesOutput); 331 | } 332 | else if (param.boxCoding == 1) 333 | { 334 | // Note that nmsBoxesOutput is always coded as BoxCorner, regardless of the input coding type. 
335 | EfficientNMS><<>>(param, topNumData, outputIndexData, 336 | outputClassData, sortedIndexData, sortedScoresData, topClassData, topAnchorsData, 337 | (BoxCenterSize*) boxesInput, (BoxCenterSize*) anchorsInput, numDetectionsOutput, nmsScoresOutput, 338 | nmsClassesOutput, nmsIndicesOutput, (BoxCorner*) nmsBoxesOutput); 339 | } 340 | 341 | return cudaGetLastError(); 342 | } 343 | 344 | __global__ void EfficientNMSFilterSegments(EfficientNMSParameters param, const int* __restrict__ topNumData, 345 | int* __restrict__ topOffsetsStartData, int* __restrict__ topOffsetsEndData) 346 | { 347 | int imageIdx = threadIdx.x; 348 | if (imageIdx > param.batchSize) 349 | { 350 | return; 351 | } 352 | topOffsetsStartData[imageIdx] = imageIdx * param.numScoreElements; 353 | topOffsetsEndData[imageIdx] = imageIdx * param.numScoreElements + topNumData[imageIdx]; 354 | } 355 | 356 | template 357 | __global__ void EfficientNMSFilter(EfficientNMSParameters param, const T* __restrict__ scoresInput, 358 | int* __restrict__ topNumData, int* __restrict__ topIndexData, int* __restrict__ topAnchorsData, 359 | T* __restrict__ topScoresData, int* __restrict__ topClassData) 360 | { 361 | int elementIdx = blockDim.x * blockIdx.x + threadIdx.x; 362 | int imageIdx = blockDim.y * blockIdx.y + threadIdx.y; 363 | 364 | // Boundary Conditions 365 | if (elementIdx >= param.numScoreElements || imageIdx >= param.batchSize) 366 | { 367 | return; 368 | } 369 | 370 | // Shape of scoresInput: [batchSize, numAnchors, numClasses] 371 | int scoresInputIdx = imageIdx * param.numScoreElements + elementIdx; 372 | 373 | // For each class, check its corresponding score if it crosses the threshold, and if so select this anchor, 374 | // and keep track of the maximum score and the corresponding (argmax) class id 375 | T score = scoresInput[scoresInputIdx]; 376 | if (gte_mp(score, (T) param.scoreThreshold)) 377 | { 378 | // Unpack the class and anchor index from the element index 379 | int classIdx = elementIdx % 
param.numClasses; 380 | int anchorIdx = elementIdx / param.numClasses; 381 | 382 | // If this is a background class, ignore it. 383 | if (classIdx == param.backgroundClass) 384 | { 385 | return; 386 | } 387 | 388 | // Use an atomic to find an open slot where to write the selected anchor data. 389 | if (topNumData[imageIdx] >= param.numScoreElements) 390 | { 391 | return; 392 | } 393 | int selectedIdx = atomicAdd((unsigned int*) &topNumData[imageIdx], 1); 394 | if (selectedIdx >= param.numScoreElements) 395 | { 396 | topNumData[imageIdx] = param.numScoreElements; 397 | return; 398 | } 399 | 400 | // Shape of topScoresData / topClassData: [batchSize, numScoreElements] 401 | int topIdx = imageIdx * param.numScoreElements + selectedIdx; 402 | 403 | if (param.scoreBits > 0) 404 | { 405 | score = add_mp(score, (T) 1); 406 | if (gt_mp(score, (T) (2.f - 1.f / 1024.f))) 407 | { 408 | // Ensure the incremented score fits in the mantissa without changing the exponent 409 | score = (2.f - 1.f / 1024.f); 410 | } 411 | } 412 | 413 | topIndexData[topIdx] = selectedIdx; 414 | topAnchorsData[topIdx] = anchorIdx; 415 | topScoresData[topIdx] = score; 416 | topClassData[topIdx] = classIdx; 417 | } 418 | } 419 | 420 | template 421 | __global__ void EfficientNMSDenseIndex(EfficientNMSParameters param, int* __restrict__ topNumData, 422 | int* __restrict__ topIndexData, int* __restrict__ topAnchorsData, int* __restrict__ topOffsetsStartData, 423 | int* __restrict__ topOffsetsEndData, T* __restrict__ topScoresData, int* __restrict__ topClassData) 424 | { 425 | int elementIdx = blockDim.x * blockIdx.x + threadIdx.x; 426 | int imageIdx = blockDim.y * blockIdx.y + threadIdx.y; 427 | 428 | if (elementIdx >= param.numScoreElements || imageIdx >= param.batchSize) 429 | { 430 | return; 431 | } 432 | 433 | int dataIdx = imageIdx * param.numScoreElements + elementIdx; 434 | int anchorIdx = elementIdx / param.numClasses; 435 | int classIdx = elementIdx % param.numClasses; 436 | if (param.scoreBits > 
0) 437 | { 438 | T score = topScoresData[dataIdx]; 439 | if (lt_mp(score, (T) param.scoreThreshold)) 440 | { 441 | score = (T) 1; 442 | } 443 | else if (classIdx == param.backgroundClass) 444 | { 445 | score = (T) 1; 446 | } 447 | else 448 | { 449 | score = add_mp(score, (T) 1); 450 | if (gt_mp(score, (T) (2.f - 1.f / 1024.f))) 451 | { 452 | // Ensure the incremented score fits in the mantissa without changing the exponent 453 | score = (2.f - 1.f / 1024.f); 454 | } 455 | } 456 | topScoresData[dataIdx] = score; 457 | } 458 | else 459 | { 460 | T score = topScoresData[dataIdx]; 461 | if (lt_mp(score, (T) param.scoreThreshold)) 462 | { 463 | topScoresData[dataIdx] = -(1 << 15); 464 | } 465 | else if (classIdx == param.backgroundClass) 466 | { 467 | topScoresData[dataIdx] = -(1 << 15); 468 | } 469 | } 470 | 471 | topIndexData[dataIdx] = elementIdx; 472 | topAnchorsData[dataIdx] = anchorIdx; 473 | topClassData[dataIdx] = classIdx; 474 | 475 | if (elementIdx == 0) 476 | { 477 | // Saturate counters 478 | topNumData[imageIdx] = param.numScoreElements; 479 | topOffsetsStartData[imageIdx] = imageIdx * param.numScoreElements; 480 | topOffsetsEndData[imageIdx] = (imageIdx + 1) * param.numScoreElements; 481 | } 482 | } 483 | 484 | template 485 | cudaError_t EfficientNMSFilterLauncher(EfficientNMSParameters& param, const T* scoresInput, int* topNumData, 486 | int* topIndexData, int* topAnchorsData, int* topOffsetsStartData, int* topOffsetsEndData, T* topScoresData, 487 | int* topClassData, cudaStream_t stream) 488 | { 489 | const unsigned int elementsPerBlock = 512; 490 | const unsigned int imagesPerBlock = 1; 491 | const unsigned int elementBlocks = (param.numScoreElements + elementsPerBlock - 1) / elementsPerBlock; 492 | const unsigned int imageBlocks = (param.batchSize + imagesPerBlock - 1) / imagesPerBlock; 493 | const dim3 blockSize = {elementsPerBlock, imagesPerBlock, 1}; 494 | const dim3 gridSize = {elementBlocks, imageBlocks, 1}; 495 | 496 | float kernelSelectThreshold 
= 0.007f; 497 | if (param.scoreSigmoid) 498 | { 499 | // Inverse Sigmoid 500 | if (param.scoreThreshold <= 0.f) 501 | { 502 | param.scoreThreshold = -(1 << 15); 503 | } 504 | else 505 | { 506 | param.scoreThreshold = logf(param.scoreThreshold / (1.f - param.scoreThreshold)); 507 | } 508 | kernelSelectThreshold = logf(kernelSelectThreshold / (1.f - kernelSelectThreshold)); 509 | // Disable Score Bits Optimization 510 | param.scoreBits = -1; 511 | } 512 | 513 | if (param.scoreThreshold < kernelSelectThreshold) 514 | { 515 | // A full copy of the buffer is necessary because sorting will scramble the input data otherwise. 516 | PLUGIN_CHECK_CUDA(cudaMemcpyAsync(topScoresData, scoresInput, 517 | param.batchSize * param.numScoreElements * sizeof(T), cudaMemcpyDeviceToDevice, stream)); 518 | 519 | EfficientNMSDenseIndex<<>>(param, topNumData, topIndexData, topAnchorsData, 520 | topOffsetsStartData, topOffsetsEndData, topScoresData, topClassData); 521 | } 522 | else 523 | { 524 | EfficientNMSFilter<<>>( 525 | param, scoresInput, topNumData, topIndexData, topAnchorsData, topScoresData, topClassData); 526 | 527 | EfficientNMSFilterSegments<<<1, param.batchSize, 0, stream>>>( 528 | param, topNumData, topOffsetsStartData, topOffsetsEndData); 529 | } 530 | 531 | return cudaGetLastError(); 532 | } 533 | 534 | template 535 | size_t EfficientNMSSortWorkspaceSize(int batchSize, int numScoreElements) 536 | { 537 | size_t sortedWorkspaceSize = 0; 538 | cub::DoubleBuffer keysDB(nullptr, nullptr); 539 | cub::DoubleBuffer valuesDB(nullptr, nullptr); 540 | cub::DeviceSegmentedRadixSort::SortPairsDescending(nullptr, sortedWorkspaceSize, keysDB, valuesDB, 541 | numScoreElements, batchSize, (const int*) nullptr, (const int*) nullptr); 542 | return sortedWorkspaceSize; 543 | } 544 | 545 | size_t EfficientNMSWorkspaceSize(int batchSize, int numScoreElements, int numClasses, DataType datatype) 546 | { 547 | size_t total = 0; 548 | const size_t align = 256; 549 | // Counters 550 | // 3 for 
Filtering 551 | // 1 for Output Indexing 552 | // C for Max per Class Limiting 553 | size_t size = (3 + 1 + numClasses) * batchSize * sizeof(int); 554 | total += size + (size % align ? align - (size % align) : 0); 555 | // Int Buffers 556 | for (int i = 0; i < 4; i++) 557 | { 558 | size = batchSize * numScoreElements * sizeof(int); 559 | total += size + (size % align ? align - (size % align) : 0); 560 | } 561 | // Float Buffers 562 | for (int i = 0; i < 2; i++) 563 | { 564 | size = batchSize * numScoreElements * dataTypeSize(datatype); 565 | total += size + (size % align ? align - (size % align) : 0); 566 | } 567 | // Sort Workspace 568 | if (datatype == DataType::kHALF) 569 | { 570 | size = EfficientNMSSortWorkspaceSize<__half>(batchSize, numScoreElements); 571 | total += size + (size % align ? align - (size % align) : 0); 572 | } 573 | else if (datatype == DataType::kFLOAT) 574 | { 575 | size = EfficientNMSSortWorkspaceSize(batchSize, numScoreElements); 576 | total += size + (size % align ? align - (size % align) : 0); 577 | } 578 | 579 | return total; 580 | } 581 | 582 | template 583 | T* EfficientNMSWorkspace(void* workspace, size_t& offset, size_t elements) 584 | { 585 | T* buffer = (T*) ((size_t) workspace + offset); 586 | size_t align = 256; 587 | size_t size = elements * sizeof(T); 588 | size_t sizeAligned = size + (size % align ? 
align - (size % align) : 0); 589 | offset += sizeAligned; 590 | return buffer; 591 | } 592 | 593 | template 594 | pluginStatus_t EfficientNMSDispatch(EfficientNMSParameters param, const void* boxesInput, const void* scoresInput, 595 | const void* anchorsInput, void* numDetectionsOutput, void* nmsBoxesOutput, void* nmsScoresOutput, 596 | void* nmsClassesOutput, void* nmsIndicesOutput, void* workspace, cudaStream_t stream) 597 | { 598 | // Clear Outputs (not all elements will get overwritten by the kernels, so safer to clear everything out) 599 | 600 | CSC(cudaMemsetAsync(numDetectionsOutput, 0x00, param.batchSize * sizeof(int), stream), STATUS_FAILURE); 601 | CSC(cudaMemsetAsync(nmsScoresOutput, 0x00, param.batchSize * param.numOutputBoxes * sizeof(T), stream), STATUS_FAILURE); 602 | CSC(cudaMemsetAsync(nmsBoxesOutput, 0x00, param.batchSize * param.numOutputBoxes * 4 * sizeof(T), stream), STATUS_FAILURE); 603 | CSC(cudaMemsetAsync(nmsClassesOutput, 0x00, param.batchSize * param.numOutputBoxes * sizeof(int), stream), STATUS_FAILURE); 604 | CSC(cudaMemsetAsync(nmsIndicesOutput, 0xFF, param.batchSize * param.numOutputBoxes * sizeof(int), stream), STATUS_FAILURE); 605 | 606 | // Empty Inputs 607 | if (param.numScoreElements < 1) 608 | { 609 | return STATUS_SUCCESS; 610 | } 611 | 612 | // Counters Workspace 613 | size_t workspaceOffset = 0; 614 | int countersTotalSize = (3 + 1 + param.numClasses) * param.batchSize; 615 | int* topNumData = EfficientNMSWorkspace(workspace, workspaceOffset, countersTotalSize); 616 | int* topOffsetsStartData = topNumData + param.batchSize; 617 | int* topOffsetsEndData = topNumData + 2 * param.batchSize; 618 | int* outputIndexData = topNumData + 3 * param.batchSize; 619 | int* outputClassData = topNumData + 4 * param.batchSize; 620 | CSC(cudaMemsetAsync(topNumData, 0x00, countersTotalSize * sizeof(int), stream), STATUS_FAILURE); 621 | cudaError_t status = cudaGetLastError(); 622 | CSC(status, STATUS_FAILURE); 623 | 624 | // Other Buffers 
Workspace 625 | int* topIndexData 626 | = EfficientNMSWorkspace(workspace, workspaceOffset, param.batchSize * param.numScoreElements); 627 | int* topClassData 628 | = EfficientNMSWorkspace(workspace, workspaceOffset, param.batchSize * param.numScoreElements); 629 | int* topAnchorsData 630 | = EfficientNMSWorkspace(workspace, workspaceOffset, param.batchSize * param.numScoreElements); 631 | int* sortedIndexData 632 | = EfficientNMSWorkspace(workspace, workspaceOffset, param.batchSize * param.numScoreElements); 633 | T* topScoresData = EfficientNMSWorkspace(workspace, workspaceOffset, param.batchSize * param.numScoreElements); 634 | T* sortedScoresData 635 | = EfficientNMSWorkspace(workspace, workspaceOffset, param.batchSize * param.numScoreElements); 636 | size_t sortedWorkspaceSize = EfficientNMSSortWorkspaceSize(param.batchSize, param.numScoreElements); 637 | char* sortedWorkspaceData = EfficientNMSWorkspace(workspace, workspaceOffset, sortedWorkspaceSize); 638 | cub::DoubleBuffer scoresDB(topScoresData, sortedScoresData); 639 | cub::DoubleBuffer indexDB(topIndexData, sortedIndexData); 640 | 641 | // Kernels 642 | status = EfficientNMSFilterLauncher(param, (T*) scoresInput, topNumData, topIndexData, topAnchorsData, 643 | topOffsetsStartData, topOffsetsEndData, topScoresData, topClassData, stream); 644 | CSC(status, STATUS_FAILURE); 645 | 646 | status = cub::DeviceSegmentedRadixSort::SortPairsDescending(sortedWorkspaceData, sortedWorkspaceSize, scoresDB, 647 | indexDB, param.batchSize * param.numScoreElements, param.batchSize, topOffsetsStartData, topOffsetsEndData, 648 | param.scoreBits > 0 ? (10 - param.scoreBits) : 0, param.scoreBits > 0 ? 
10 : sizeof(T) * 8, stream); 649 | CSC(status, STATUS_FAILURE); 650 | 651 | status = EfficientNMSLauncher(param, topNumData, outputIndexData, outputClassData, indexDB.Current(), 652 | scoresDB.Current(), topClassData, topAnchorsData, boxesInput, anchorsInput, (int*) numDetectionsOutput, 653 | (T*) nmsScoresOutput, (int*) nmsClassesOutput, (int*) nmsIndicesOutput, nmsBoxesOutput, stream); 654 | CSC(status, STATUS_FAILURE); 655 | 656 | return STATUS_SUCCESS; 657 | } 658 | 659 | pluginStatus_t EfficientNMSInference(EfficientNMSParameters param, const void* boxesInput, const void* scoresInput, 660 | const void* anchorsInput, void* numDetectionsOutput, void* nmsBoxesOutput, void* nmsScoresOutput, 661 | void* nmsClassesOutput, void* nmsIndicesOutput, void* workspace, cudaStream_t stream) 662 | { 663 | if (param.datatype == DataType::kFLOAT) 664 | { 665 | param.scoreBits = -1; 666 | return EfficientNMSDispatch(param, boxesInput, scoresInput, anchorsInput, numDetectionsOutput, 667 | nmsBoxesOutput, nmsScoresOutput, nmsClassesOutput, nmsIndicesOutput, workspace, stream); 668 | } 669 | else if (param.datatype == DataType::kHALF) 670 | { 671 | if (param.scoreBits <= 0 || param.scoreBits > 10) 672 | { 673 | param.scoreBits = -1; 674 | } 675 | return EfficientNMSDispatch<__half>(param, boxesInput, scoresInput, anchorsInput, numDetectionsOutput, 676 | nmsBoxesOutput, nmsScoresOutput, nmsClassesOutput, nmsIndicesOutput, workspace, stream); 677 | } 678 | else 679 | { 680 | return STATUS_NOT_SUPPORTED; 681 | } 682 | } 683 | --------------------------------------------------------------------------------