├── .github └── FUNDING.yml ├── config_infer_primary_rfdetr_seg.txt ├── config_infer_primary_yoloV7_seg.txt ├── config_infer_primary_yolo11_seg.txt ├── config_infer_primary_yoloV5_seg.txt ├── config_infer_primary_yoloV7_mask.txt ├── config_infer_primary_yoloV8_seg.txt ├── labels.txt ├── deepstream_app_config.txt ├── LICENSE.md ├── nvdsinfer_custom_impl_Yolo_seg ├── trt_plugins │ ├── roiAlignPlugin │ │ ├── roiAlignKernel.h │ │ ├── roiAlignPlugin.h │ │ ├── roiAlignKernel.cu │ │ └── roiAlignPlugin.cpp │ ├── efficientNMSPlugin │ │ ├── efficientNMSInference.h │ │ ├── efficientNMSParameters.h │ │ ├── efficientNMSPlugin.h │ │ ├── efficientNMSInference.cuh │ │ ├── efficientNMSPlugin.cpp │ │ └── efficientNMSInference.cu │ ├── common.cpp │ └── common.h ├── Makefile └── nvdsparseseg_Yolo.cpp ├── README.md ├── docs ├── RFDETR_Seg.md ├── YOLO11_Seg.md ├── YOLOv8_Seg.md ├── YOLOv5_Seg.md ├── YOLOv7_Seg.md └── YOLOv7_Mask.md └── utils ├── export_yoloV5_seg.py ├── export_yoloV7_seg.py ├── export_yoloV7_mask.py ├── export_yolo11_seg.py ├── export_yoloV8_seg.py └── export_rfdetr_seg.py /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | #github: [marcoslucianops] 2 | custom: ['https://www.buymeacoffee.com/marcoslucianops'] 3 | -------------------------------------------------------------------------------- /config_infer_primary_rfdetr_seg.txt: -------------------------------------------------------------------------------- 1 | [property] 2 | gpu-id=0 3 | net-scale-factor=0.0039215697906911373 4 | model-color-format=0 5 | onnx-file=rf-detr-seg-preview.onnx 6 | model-engine-file=rf-detr-seg-preview.onnx_b1_gpu0_fp32.engine 7 | #int8-calib-file=calib.table 8 | labelfile-path=labels.txt 9 | batch-size=1 10 | network-mode=0 11 | num-detected-classes=91 12 | interval=0 13 | gie-unique-id=1 14 | process-mode=1 15 | network-type=3 16 | cluster-mode=4 17 | maintain-aspect-ratio=0 18 | scaling-filter=1 19 | scaling-compute-hw=0 20 | 
force-implicit-batch-dim=0 21 | #workspace-size=2000 22 | parse-bbox-instance-mask-func-name=NvDsInferParseYoloSeg 23 | custom-lib-path=nvdsinfer_custom_impl_Yolo_seg/libnvdsinfer_custom_impl_Yolo_seg.so 24 | output-instance-mask=1 25 | segmentation-threshold=0.5 26 | 27 | [class-attrs-all] 28 | pre-cluster-threshold=0.25 29 | -------------------------------------------------------------------------------- /config_infer_primary_yoloV7_seg.txt: -------------------------------------------------------------------------------- 1 | [property] 2 | gpu-id=0 3 | net-scale-factor=0.0039215697906911373 4 | model-color-format=0 5 | onnx-file=yolov7-seg.onnx 6 | model-engine-file=yolov7-seg.onnx_b1_gpu0_fp32.engine 7 | #int8-calib-file=calib.table 8 | labelfile-path=labels.txt 9 | batch-size=1 10 | network-mode=0 11 | num-detected-classes=80 12 | interval=0 13 | gie-unique-id=1 14 | process-mode=1 15 | network-type=3 16 | cluster-mode=4 17 | maintain-aspect-ratio=1 18 | symmetric-padding=1 19 | scaling-filter=1 20 | scaling-compute-hw=0 21 | force-implicit-batch-dim=0 22 | #workspace-size=2000 23 | parse-bbox-instance-mask-func-name=NvDsInferParseYoloSeg 24 | custom-lib-path=nvdsinfer_custom_impl_Yolo_seg/libnvdsinfer_custom_impl_Yolo_seg.so 25 | output-instance-mask=1 26 | segmentation-threshold=0.5 27 | 28 | [class-attrs-all] 29 | pre-cluster-threshold=0.25 30 | -------------------------------------------------------------------------------- /config_infer_primary_yolo11_seg.txt: -------------------------------------------------------------------------------- 1 | [property] 2 | gpu-id=0 3 | net-scale-factor=0.0039215697906911373 4 | model-color-format=0 5 | onnx-file=yolo11s-seg.onnx 6 | model-engine-file=yolo11s-seg.onnx_b1_gpu0_fp32.engine 7 | #int8-calib-file=calib.table 8 | labelfile-path=labels.txt 9 | batch-size=1 10 | network-mode=0 11 | num-detected-classes=80 12 | interval=0 13 | gie-unique-id=1 14 | process-mode=1 15 | network-type=3 16 | cluster-mode=4 17 | 
maintain-aspect-ratio=1 18 | symmetric-padding=1 19 | scaling-filter=1 20 | scaling-compute-hw=0 21 | force-implicit-batch-dim=0 22 | #workspace-size=2000 23 | parse-bbox-instance-mask-func-name=NvDsInferParseYoloSeg 24 | custom-lib-path=nvdsinfer_custom_impl_Yolo_seg/libnvdsinfer_custom_impl_Yolo_seg.so 25 | output-instance-mask=1 26 | segmentation-threshold=0.5 27 | 28 | [class-attrs-all] 29 | pre-cluster-threshold=0.25 30 | -------------------------------------------------------------------------------- /config_infer_primary_yoloV5_seg.txt: -------------------------------------------------------------------------------- 1 | [property] 2 | gpu-id=0 3 | net-scale-factor=0.0039215697906911373 4 | model-color-format=0 5 | onnx-file=yolov5s-seg.onnx 6 | model-engine-file=yolov5s-seg.onnx_b1_gpu0_fp32.engine 7 | #int8-calib-file=calib.table 8 | labelfile-path=labels.txt 9 | batch-size=1 10 | network-mode=0 11 | num-detected-classes=80 12 | interval=0 13 | gie-unique-id=1 14 | process-mode=1 15 | network-type=3 16 | cluster-mode=4 17 | maintain-aspect-ratio=1 18 | symmetric-padding=1 19 | scaling-filter=1 20 | scaling-compute-hw=0 21 | force-implicit-batch-dim=0 22 | #workspace-size=2000 23 | parse-bbox-instance-mask-func-name=NvDsInferParseYoloSeg 24 | custom-lib-path=nvdsinfer_custom_impl_Yolo_seg/libnvdsinfer_custom_impl_Yolo_seg.so 25 | output-instance-mask=1 26 | segmentation-threshold=0.5 27 | 28 | [class-attrs-all] 29 | pre-cluster-threshold=0.25 30 | -------------------------------------------------------------------------------- /config_infer_primary_yoloV7_mask.txt: -------------------------------------------------------------------------------- 1 | [property] 2 | gpu-id=0 3 | net-scale-factor=0.0039215697906911373 4 | model-color-format=0 5 | onnx-file=yolov7-mask.onnx 6 | model-engine-file=yolov7-mask.onnx_b1_gpu0_fp32.engine 7 | #int8-calib-file=calib.table 8 | labelfile-path=labels.txt 9 | batch-size=1 10 | network-mode=0 11 | num-detected-classes=80 12 | 
interval=0 13 | gie-unique-id=1 14 | process-mode=1 15 | network-type=3 16 | cluster-mode=4 17 | maintain-aspect-ratio=1 18 | symmetric-padding=1 19 | scaling-filter=1 20 | scaling-compute-hw=0 21 | force-implicit-batch-dim=0 22 | #workspace-size=2000 23 | parse-bbox-instance-mask-func-name=NvDsInferParseYoloSeg 24 | custom-lib-path=nvdsinfer_custom_impl_Yolo_seg/libnvdsinfer_custom_impl_Yolo_seg.so 25 | output-instance-mask=1 26 | segmentation-threshold=0.5 27 | 28 | [class-attrs-all] 29 | pre-cluster-threshold=0.25 30 | -------------------------------------------------------------------------------- /config_infer_primary_yoloV8_seg.txt: -------------------------------------------------------------------------------- 1 | [property] 2 | gpu-id=0 3 | net-scale-factor=0.0039215697906911373 4 | model-color-format=0 5 | onnx-file=yolov8s-seg.onnx 6 | model-engine-file=yolov8s-seg.onnx_b1_gpu0_fp32.engine 7 | #int8-calib-file=calib.table 8 | labelfile-path=labels.txt 9 | batch-size=1 10 | network-mode=0 11 | num-detected-classes=80 12 | interval=0 13 | gie-unique-id=1 14 | process-mode=1 15 | network-type=3 16 | cluster-mode=4 17 | maintain-aspect-ratio=1 18 | symmetric-padding=1 19 | scaling-filter=1 20 | scaling-compute-hw=0 21 | force-implicit-batch-dim=0 22 | #workspace-size=2000 23 | parse-bbox-instance-mask-func-name=NvDsInferParseYoloSeg 24 | custom-lib-path=nvdsinfer_custom_impl_Yolo_seg/libnvdsinfer_custom_impl_Yolo_seg.so 25 | output-instance-mask=1 26 | segmentation-threshold=0.5 27 | 28 | [class-attrs-all] 29 | pre-cluster-threshold=0.25 30 | -------------------------------------------------------------------------------- /labels.txt: -------------------------------------------------------------------------------- 1 | person 2 | bicycle 3 | car 4 | motorbike 5 | aeroplane 6 | bus 7 | train 8 | truck 9 | boat 10 | traffic light 11 | fire hydrant 12 | stop sign 13 | parking meter 14 | bench 15 | bird 16 | cat 17 | dog 18 | horse 19 | sheep 20 | cow 21 | 
elephant 22 | bear 23 | zebra 24 | giraffe 25 | backpack 26 | umbrella 27 | handbag 28 | tie 29 | suitcase 30 | frisbee 31 | skis 32 | snowboard 33 | sports ball 34 | kite 35 | baseball bat 36 | baseball glove 37 | skateboard 38 | surfboard 39 | tennis racket 40 | bottle 41 | wine glass 42 | cup 43 | fork 44 | knife 45 | spoon 46 | bowl 47 | banana 48 | apple 49 | sandwich 50 | orange 51 | broccoli 52 | carrot 53 | hot dog 54 | pizza 55 | donut 56 | cake 57 | chair 58 | sofa 59 | pottedplant 60 | bed 61 | diningtable 62 | toilet 63 | tvmonitor 64 | laptop 65 | mouse 66 | remote 67 | keyboard 68 | cell phone 69 | microwave 70 | oven 71 | toaster 72 | sink 73 | refrigerator 74 | book 75 | clock 76 | vase 77 | scissors 78 | teddy bear 79 | hair drier 80 | toothbrush 81 | -------------------------------------------------------------------------------- /deepstream_app_config.txt: -------------------------------------------------------------------------------- 1 | [application] 2 | enable-perf-measurement=1 3 | perf-measurement-interval-sec=5 4 | 5 | [tiled-display] 6 | enable=1 7 | rows=1 8 | columns=1 9 | width=1280 10 | height=720 11 | gpu-id=0 12 | nvbuf-memory-type=0 13 | 14 | [source0] 15 | enable=1 16 | type=3 17 | uri=file:///opt/nvidia/deepstream/deepstream/samples/streams/sample_1080p_h264.mp4 18 | num-sources=1 19 | gpu-id=0 20 | cudadec-memtype=0 21 | 22 | [sink0] 23 | enable=1 24 | type=2 25 | sync=0 26 | gpu-id=0 27 | nvbuf-memory-type=0 28 | 29 | [osd] 30 | enable=1 31 | display-mask=1 32 | gpu-id=0 33 | border-width=5 34 | text-size=15 35 | text-color=1;1;1;1; 36 | text-bg-color=0.3;0.3;0.3;1 37 | font=Serif 38 | show-clock=0 39 | clock-x-offset=800 40 | clock-y-offset=820 41 | clock-text-size=12 42 | clock-color=1;0;0;0 43 | nvbuf-memory-type=0 44 | 45 | [streammux] 46 | gpu-id=0 47 | live-source=0 48 | batch-size=1 49 | batched-push-timeout=40000 50 | width=1920 51 | height=1080 52 | enable-padding=0 53 | nvbuf-memory-type=0 54 | 55 | [primary-gie] 56 | 
enable=1 57 | gpu-id=0 58 | gie-unique-id=1 59 | nvbuf-memory-type=0 60 | config-file=config_infer_primary_yoloV8_seg.txt 61 | 62 | [tests] 63 | file-loop=0 64 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023-2025, Marcos Luciano Piropo Santos. 4 | Copyright (c) 2017-2025, NVIDIA CORPORATION. All rights reserved. 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | -------------------------------------------------------------------------------- /nvdsinfer_custom_impl_Yolo_seg/trt_plugins/roiAlignPlugin/roiAlignKernel.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
3 | * SPDX-License-Identifier: Apache-2.0 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | #ifndef TRT_ROIALIGN_KERNEL_H 18 | #define TRT_ROIALIGN_KERNEL_H 19 | 20 | #include 21 | #include 22 | 23 | template 24 | cudaError_t RoiAlignImpl(cudaStream_t stream, int32_t const maxThreadsPerBlock, T const* bottomData, 25 | T const spatialScale, int32_t const numRois, int32_t const channels, int32_t const height, int32_t const width, 26 | int32_t const pooledHeight, int32_t const pooledWidth, int32_t const samplingRatio, T const* bottomRois, T* topData, 27 | int32_t const isModeAvg, int32_t const* batchIndicesPtr, int32_t const aligned); 28 | 29 | #endif // TRT_ROIALIGN_KERNEL_H 30 | -------------------------------------------------------------------------------- /nvdsinfer_custom_impl_Yolo_seg/trt_plugins/efficientNMSPlugin/efficientNMSInference.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: Apache-2.0 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 
7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | #ifndef TRT_EFFICIENT_NMS_INFERENCE_H 19 | #define TRT_EFFICIENT_NMS_INFERENCE_H 20 | 21 | #include "../common.h" 22 | 23 | #include "efficientNMSParameters.h" 24 | 25 | size_t EfficientNMSWorkspaceSize( 26 | int32_t batchSize, int32_t numScoreElements, int32_t numClasses, nvinfer1::DataType datatype); 27 | 28 | pluginStatus_t EfficientNMSInference(nvinfer1::plugin::EfficientNMSParameters param, void const* boxesInput, 29 | void const* scoresInput, void const* anchorsInput, void* numDetectionsOutput, void* nmsBoxesOutput, 30 | void* nmsScoresOutput, void* nmsClassesOutput, void* nmsIndicesOutput, void* workspace, cudaStream_t stream); 31 | 32 | #endif 33 | -------------------------------------------------------------------------------- /nvdsinfer_custom_impl_Yolo_seg/trt_plugins/efficientNMSPlugin/efficientNMSParameters.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: Apache-2.0 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 
7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | #ifndef TRT_EFFICIENT_NMS_PARAMETERS_H 19 | #define TRT_EFFICIENT_NMS_PARAMETERS_H 20 | 21 | #include "../common.h" 22 | 23 | namespace nvinfer1 24 | { 25 | namespace plugin 26 | { 27 | 28 | struct EfficientNMSParameters 29 | { 30 | // Related to NMS Options 31 | float iouThreshold = 0.5F; 32 | float scoreThreshold = 0.5F; 33 | int32_t numOutputBoxes = 100; 34 | int32_t numOutputBoxesPerClass = -1; 35 | bool padOutputBoxesPerClass = false; 36 | int32_t backgroundClass = -1; 37 | bool scoreSigmoid = false; 38 | bool clipBoxes = false; 39 | int32_t boxCoding = 0; 40 | bool classAgnostic = false; 41 | 42 | // Related to NMS Internals 43 | int32_t numSelectedBoxes = 4096; 44 | int32_t scoreBits = -1; 45 | 46 | // Related to Tensor Configuration 47 | // (These are set by the various plugin configuration methods, no need to define them during plugin creation.) 
48 | int32_t batchSize = -1; 49 | int32_t numClasses = 1; 50 | int32_t numBoxElements = -1; 51 | int32_t numScoreElements = -1; 52 | int32_t numAnchors = -1; 53 | bool shareLocation = true; 54 | bool shareAnchors = true; 55 | bool boxDecoder = false; 56 | nvinfer1::DataType datatype = nvinfer1::DataType::kFLOAT; 57 | }; 58 | 59 | } // namespace plugin 60 | } // namespace nvinfer1 61 | 62 | #endif 63 | -------------------------------------------------------------------------------- /nvdsinfer_custom_impl_Yolo_seg/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_VER?= 2 | ifeq ($(CUDA_VER),) 3 | $(error "CUDA_VER is not set") 4 | endif 5 | 6 | CUDA_VER_MAJOR:= $(word 1,$(subst ., ,$(CUDA_VER))) 7 | CUDA_VER_MINOR:= $(word 2,$(subst ., ,$(CUDA_VER))) 8 | CUDA_VER_NUM:= $(CUDA_VER_MAJOR)$(CUDA_VER_MINOR) 9 | 10 | CUDA_ARCH:= 53 60 61 62 70 72 75 11 | ifeq ($(shell expr $(CUDA_VER_NUM) \>= 110),1) 12 | CUDA_ARCH+= 80 13 | endif 14 | ifeq ($(shell expr $(CUDA_VER_NUM) \>= 111),1) 15 | CUDA_ARCH+= 86 16 | endif 17 | ifeq ($(shell expr $(CUDA_VER_NUM) \>= 118),1) 18 | CUDA_ARCH+= 87 89 19 | endif 20 | ifeq ($(shell expr $(CUDA_VER_NUM) \>= 120),1) 21 | CUDA_ARCH+= 90 22 | endif 23 | ifeq ($(shell expr $(CUDA_VER_NUM) \>= 128),1) 24 | CUDA_ARCH+= 100 103 110 120 25 | endif 26 | 27 | GENCODE_FLAGS:= $(foreach a,$(CUDA_ARCH),-gencode arch=compute_$(a),code=sm_$(a) \ 28 | -gencode arch=compute_$(a),code=compute_$(a)) 29 | 30 | SM?= 31 | ifneq ($(SM),) 32 | GENCODE_FLAGS:= -gencode arch=compute_$(SM),code=sm_$(SM) -gencode arch=compute_$(SM),code=compute_$(SM) 33 | endif 34 | 35 | CXX:= g++ 36 | NVCC:=/usr/local/cuda-$(CUDA_VER)/bin/nvcc 37 | 38 | TARGET_LIB:= libnvdsinfer_custom_impl_Yolo_seg.so 39 | 40 | CFLAGS:= -Wall -std=c++17 -shared -fPIC -Wno-error=deprecated-declarations 41 | 42 | CFLAGS+= -I/opt/nvidia/deepstream/deepstream/sources/includes -I/usr/local/cuda-$(CUDA_VER)/include 43 | CUFLAGS:= 
-I/opt/nvidia/deepstream/deepstream/sources/includes -I/usr/local/cuda-$(CUDA_VER)/include 44 | 45 | LIBS+= -lnvinfer -L/usr/local/cuda-$(CUDA_VER)/lib64 -lcudart -lstdc++fs 46 | 47 | LFLAGS:= -shared -Wl,--start-group $(LIBS) -Wl,--end-group 48 | 49 | SRCS:= $(wildcard *.cpp) 50 | SRCS+= $(wildcard trt_plugins/*.cpp) 51 | SRCS+= $(wildcard trt_plugins/efficientNMSPlugin/*.cpp) 52 | SRCS+= $(wildcard trt_plugins/efficientNMSPlugin/*.cu) 53 | SRCS+= $(wildcard trt_plugins/roiAlignPlugin/*.cpp) 54 | SRCS+= $(wildcard trt_plugins/roiAlignPlugin/*.cu) 55 | 56 | INCS:= $(wildcard *.h) 57 | INCS+= $(wildcard trt_plugins/*.h) 58 | INCS+= $(wildcard trt_plugins/efficientNMSPlugin/*.h) 59 | INCS+= $(wildcard trt_plugins/efficientNMSPlugin/*.cuh) 60 | INCS+= $(wildcard trt_plugins/roiAlignPlugin/*.h) 61 | INCS+= $(wildcard trt_plugins/roiAlignPlugin/*.cuh) 62 | 63 | OBJS:= $(addsuffix .o, $(basename $(SRCS))) 64 | 65 | all: $(TARGET_LIB) 66 | 67 | %.o: %.cpp Makefile 68 | $(CXX) -c -o $@ $(CFLAGS) $< 69 | 70 | %.o: %.cu $(INCS) Makefile 71 | $(NVCC) -c -o $@ $(GENCODE_FLAGS) --compiler-options '-fPIC' $(CUFLAGS) $< 72 | 73 | $(TARGET_LIB) : $(OBJS) 74 | $(CXX) -o $@ $(OBJS) $(LFLAGS) 75 | 76 | clean: 77 | rm -rf $(OBJS) $(TARGET_LIB) 78 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DeepStream-Yolo-Seg 2 | 3 | NVIDIA DeepStream SDK 8.0 / 7.1 / 7.0 / 6.4 / 6.3 / 6.2 / 6.1.1 / 6.1 / 6.0.1 / 6.0 application for YOLO-Seg models 4 | 5 | -------------------------------------------------------------------------------------------------- 6 | ### YOLO object detection models and other infos: https://github.com/marcoslucianops/DeepStream-Yolo 7 | -------------------------------------------------------------------------------------------------- 8 | ### Important: Please export the ONNX model with the new export file, generate the TensorRT engine again 
with the updated files, and use the new config_infer_primary file according to your model 9 | -------------------------------------------------------------------------------------------------- 10 | 11 | ### Getting started 12 | 13 | * [Supported models](#supported-models) 14 | * [Instructions](#basic-usage) 15 | * [YOLOv5-Seg usage](docs/YOLOv5_Seg.md) 16 | * [YOLOv7-Seg usage](docs/YOLOv7_Seg.md) 17 | * [YOLOv7-Mask usage](docs/YOLOv7_Mask.md) 18 | * [YOLOv8-Seg usage](docs/YOLOv8_Seg.md) 19 | * [YOLO11-Seg usage](docs/YOLO11_Seg.md) 20 | * [RF-DETR-Seg usage](docs/RFDETR_Seg.md) 21 | * [NMS configuration](#nms-configuration) 22 | * [Detection threshold configuration](#detection-threshold-configuration) 23 | 24 | ## 25 | 26 | ### Supported models 27 | 28 | * [RF-DETR-Seg](https://github.com/roboflow/rf-detr) 29 | * [YOLO11-Seg](https://github.com/ultralytics/ultralytics) 30 | * [YOLOv8-Seg](https://github.com/ultralytics/ultralytics) 31 | * [YOLOv7-Mask](https://github.com/WongKinYiu/yolov7/tree/mask) 32 | * [YOLOv7-Seg](https://github.com/WongKinYiu/yolov7/tree/u7/seg) 33 | * [YOLOv5-Seg](https://github.com/ultralytics/yolov5) 34 | 35 | ## 36 | 37 | ### Instructions 38 | 39 | #### 1. Download the DeepStream-Yolo-Seg repo 40 | 41 | ``` 42 | git clone https://github.com/marcoslucianops/DeepStream-Yolo-Seg.git 43 | cd DeepStream-Yolo-Seg 44 | ``` 45 | 46 | #### 2. Compile the libs 47 | 48 | 2.1. 
Set the `CUDA_VER` according to your DeepStream version 49 | 50 | ``` 51 | export CUDA_VER=XY.Z 52 | ``` 53 | 54 | * x86 platform 55 | 56 | ``` 57 | DeepStream 8.0 = 12.8 58 | DeepStream 7.1 = 12.6 59 | DeepStream 7.0 / 6.4 = 12.2 60 | DeepStream 6.3 = 12.1 61 | DeepStream 6.2 = 11.8 62 | DeepStream 6.1.1 = 11.7 63 | DeepStream 6.1 = 11.6 64 | DeepStream 6.0.1 / 6.0 = 11.4 65 | ``` 66 | 67 | * Jetson platform 68 | 69 | ``` 70 | DeepStream 8.0 = 13.0 71 | DeepStream 7.1 = 12.6 72 | DeepStream 7.0 / 6.4 = 12.2 73 | DeepStream 6.3 / 6.2 / 6.1.1 / 6.1 = 11.4 74 | DeepStream 6.0.1 / 6.0 = 10.2 75 | ``` 76 | 77 | 2.2. Make the libs 78 | 79 | ``` 80 | make -C nvdsinfer_custom_impl_Yolo_seg clean && make -C nvdsinfer_custom_impl_Yolo_seg 81 | ``` 82 | 83 | #### 3. Run 84 | 85 | ``` 86 | deepstream-app -c deepstream_app_config.txt 87 | ``` 88 | 89 | **NOTE**: The TensorRT engine file may take a very long time to generate (sometimes more than 10 minutes). 90 | 91 | ## 92 | 93 | ### NMS configuration 94 | 95 | For now, the NMS is configured in the ONNX exporter file. 96 | 97 | **NOTE**: Make sure to set `cluster-mode=4` in the config_infer file. 98 | 99 | ## 100 | 101 | ### Detection threshold configuration 102 | 103 | The minimum detection confidence threshold is configured in the ONNX exporter file. The `pre-cluster-threshold` should be >= the value used in the ONNX model. 
104 | 105 | ``` 106 | [class-attrs-all] 107 | pre-cluster-threshold=0.25 108 | ``` 109 | 110 | ## 111 | 112 | My projects: https://www.youtube.com/MarcosLucianoTV 113 | -------------------------------------------------------------------------------- /nvdsinfer_custom_impl_Yolo_seg/nvdsparseseg_Yolo.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "nvdsinfer_custom_impl.h" 7 | 8 | extern "C" bool 9 | NvDsInferParseYoloSeg(std::vector const& outputLayersInfo, NvDsInferNetworkInfo const& networkInfo, 10 | NvDsInferParseDetectionParams const& detectionParams, std::vector& objectList); 11 | 12 | static float 13 | clamp(float val, float minVal, float maxVal) 14 | { 15 | assert(minVal <= maxVal); 16 | return std::min(maxVal, std::max(minVal, val)); 17 | } 18 | 19 | static void 20 | addSegProposal(const float* output, size_t channelsSize, uint netW, uint netH, size_t n, NvDsInferInstanceMaskInfo& b) 21 | { 22 | size_t maskSize = channelsSize - 6; 23 | b.mask = new float[maskSize]; 24 | b.mask_width = netW / 4; 25 | b.mask_height = netH / 4; 26 | b.mask_size = sizeof(float) * maskSize; 27 | std::memcpy(b.mask, output + n * channelsSize + 6, sizeof(float) * maskSize); 28 | } 29 | 30 | static void 31 | addBBoxProposal(float x1, float y1, float x2, float y2, uint netW, uint netH, int maxIndex, float maxProb, 32 | NvDsInferInstanceMaskInfo& b) 33 | { 34 | x1 = clamp(x1, 0, netW); 35 | y1 = clamp(y1, 0, netH); 36 | x2 = clamp(x2, 0, netW); 37 | y2 = clamp(y2, 0, netH); 38 | 39 | b.left = x1; 40 | b.width = clamp(x2 - x1, 0, netW); 41 | b.top = y1; 42 | b.height = clamp(y2 - y1, 0, netH); 43 | 44 | if (b.width < 1 || b.height < 1) { 45 | return; 46 | } 47 | 48 | b.detectionConfidence = maxProb; 49 | b.classId = maxIndex; 50 | } 51 | 52 | static std::vector 53 | decodeTensorYoloSeg(const float* output, size_t outputSize, size_t channelsSize, uint netW, uint netH, 54 | const 
std::vector& preclusterThreshold) 55 | { 56 | std::vector objects; 57 | 58 | for (size_t n = 0; n < outputSize; ++n) { 59 | float maxProb = output[n * channelsSize + 4]; 60 | int maxIndex = (int) output[n * channelsSize + 5]; 61 | 62 | if (maxProb < preclusterThreshold[maxIndex]) { 63 | continue; 64 | } 65 | 66 | float x1 = output[n * channelsSize + 0]; 67 | float y1 = output[n * channelsSize + 1]; 68 | float x2 = output[n * channelsSize + 2]; 69 | float y2 = output[n * channelsSize + 3]; 70 | 71 | NvDsInferInstanceMaskInfo b; 72 | 73 | addBBoxProposal(x1, y1, x2, y2, netW, netH, maxIndex, maxProb, b); 74 | addSegProposal(output, channelsSize, netW, netH, n, b); 75 | 76 | objects.push_back(b); 77 | } 78 | 79 | return objects; 80 | } 81 | 82 | static bool 83 | NvDsInferParseCustomYoloSeg(std::vector const& outputLayersInfo, 84 | NvDsInferNetworkInfo const& networkInfo, NvDsInferParseDetectionParams const& detectionParams, 85 | std::vector& objectList) 86 | { 87 | if (outputLayersInfo.empty()) { 88 | std::cerr << "ERROR - Could not find output layer" << std::endl; 89 | return false; 90 | } 91 | 92 | const NvDsInferLayerInfo& output = outputLayersInfo[0]; 93 | 94 | size_t outputSize = output.inferDims.d[0]; 95 | size_t channelsSize = output.inferDims.d[1]; 96 | 97 | std::vector objects = decodeTensorYoloSeg((const float*) (output.buffer), outputSize, 98 | channelsSize, networkInfo.width, networkInfo.height, detectionParams.perClassPreclusterThreshold); 99 | 100 | objectList = objects; 101 | 102 | return true; 103 | } 104 | 105 | extern "C" bool 106 | NvDsInferParseYoloSeg(std::vector const& outputLayersInfo, NvDsInferNetworkInfo const& networkInfo, 107 | NvDsInferParseDetectionParams const& detectionParams, std::vector& objectList) 108 | { 109 | return NvDsInferParseCustomYoloSeg(outputLayersInfo, networkInfo, detectionParams, objectList); 110 | } 111 | 112 | CHECK_CUSTOM_INSTANCE_MASK_PARSE_FUNC_PROTOTYPE(NvDsInferParseYoloSeg); 113 | 
-------------------------------------------------------------------------------- /nvdsinfer_custom_impl_Yolo_seg/trt_plugins/common.cpp: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | 3 | namespace nvinfer1 4 | { 5 | 6 | namespace plugin 7 | { 8 | 9 | ILogger* gLogger{}; 10 | 11 | template 12 | int32_t LogStream::Buf::sync() 13 | { 14 | std::string s = str(); 15 | while (!s.empty() && s.back() == '\n') 16 | { 17 | s.pop_back(); 18 | } 19 | if (gLogger != nullptr) 20 | { 21 | gLogger->log(tSeverity, s.c_str()); 22 | } 23 | str(""); 24 | return 0; 25 | } 26 | 27 | LogStream gLogError; 28 | LogStream gLogWarning; 29 | 30 | void caughtError(std::exception const& e) 31 | { 32 | gLogError << e.what() << std::endl; 33 | } 34 | 35 | void throwCudaError(char const* file, char const* function, int32_t line, int32_t status, char const* msg) 36 | { 37 | CudaError error(file, function, line, status, msg); 38 | error.log(gLogError); 39 | // NOLINTNEXTLINE(misc-throw-by-value-catch-by-reference) 40 | throw error; 41 | } 42 | 43 | void throwPluginError(char const* file, char const* function, int32_t line, int32_t status, char const* msg) 44 | { 45 | PluginError error(file, function, line, status, msg); 46 | reportValidationFailure(msg, file, line); 47 | // NOLINTNEXTLINE(misc-throw-by-value-catch-by-reference) 48 | throw error; 49 | } 50 | 51 | void reportValidationFailure(char const* msg, char const* file, int32_t line) 52 | { 53 | std::ostringstream stream; 54 | stream << "Validation failed: " << msg << "\n" << file << ':' << line << "\n"; 55 | #ifdef COMPILE_VFC_PLUGIN 56 | ILogger* logger = getPluginLogger(); 57 | if (logger != nullptr) 58 | { 59 | logger->log(nvinfer1::ILogger::Severity::kINTERNAL_ERROR, stream.str().c_str()); 60 | } 61 | #else 62 | getLogger()->log(nvinfer1::ILogger::Severity::kINTERNAL_ERROR, stream.str().c_str()); 63 | #endif 64 | } 65 | 66 | void reportAssertion(char const* msg, char const* 
file, int32_t line) 67 | { 68 | std::ostringstream stream; 69 | stream << "Assertion failed: " << msg << "\n" 70 | << file << ':' << line << "\n" 71 | << "Aborting..." 72 | << "\n"; 73 | #ifdef COMPILE_VFC_PLUGIN 74 | ILogger* logger = getPluginLogger(); 75 | if (logger != nullptr) 76 | { 77 | logger->log(nvinfer1::ILogger::Severity::kINTERNAL_ERROR, stream.str().c_str()); 78 | } 79 | #else 80 | getLogger()->log(nvinfer1::ILogger::Severity::kINTERNAL_ERROR, stream.str().c_str()); 81 | #endif 82 | PLUGIN_CUASSERT(cudaDeviceReset()); 83 | exit(EXIT_FAILURE); 84 | } 85 | 86 | void TRTException::log(std::ostream& logStream) const 87 | { 88 | logStream << file << " (" << line << ") - " << name << " Error in " << function << ": " << status; 89 | if (message != nullptr) 90 | { 91 | logStream << " (" << message << ")"; 92 | } 93 | logStream << std::endl; 94 | } 95 | 96 | void validateRequiredAttributesExist(std::set requiredFieldNames, PluginFieldCollection const* fc) 97 | { 98 | for (int32_t i = 0; i < fc->nbFields; i++) 99 | { 100 | requiredFieldNames.erase(fc->fields[i].name); 101 | } 102 | if (!requiredFieldNames.empty()) 103 | { 104 | std::stringstream msg{}; 105 | msg << "PluginFieldCollection missing required fields: {"; 106 | char const* separator = ""; 107 | for (auto const& field : requiredFieldNames) 108 | { 109 | msg << separator << field; 110 | separator = ", "; 111 | } 112 | msg << "}"; 113 | std::string msg_str = msg.str(); 114 | PLUGIN_ERROR(msg_str.c_str()); 115 | } 116 | } 117 | 118 | size_t dataTypeSize(const DataType dtype) 119 | { 120 | switch (dtype) 121 | { 122 | case DataType::kINT8: return sizeof(char); 123 | case DataType::kHALF: return sizeof(short); 124 | case DataType::kFLOAT: return sizeof(float); 125 | default: PLUGIN_FAIL("Unsupported data type"); 126 | return 0; 127 | } 128 | } 129 | 130 | } // namespace plugin 131 | } // namespace nvinfer1 132 | -------------------------------------------------------------------------------- 
/docs/RFDETR_Seg.md: -------------------------------------------------------------------------------- 1 | # RF-DETR-Seg usage 2 | 3 | **NOTE**: The yaml file is not required. 4 | 5 | * [Convert model](#convert-model) 6 | * [Compile the lib](#compile-the-lib) 7 | * [Edit the config_infer_primary_rfdetr_seg file](#edit-the-config_infer_primary_rfdetr_seg-file) 8 | 9 | ## 10 | 11 | ### Convert model 12 | 13 | #### 1. Download the RF-DETR repo and install the requirements 14 | 15 | ``` 16 | git clone https://github.com/ultralytics/ultralytics.git 17 | cd ultralytics 18 | pip3 install -e . 19 | pip3 install onnx onnxslim onnxruntime 20 | ``` 21 | 22 | **NOTE**: It is recommended to use Python virtualenv. 23 | 24 | #### 2. Copy conversor 25 | 26 | Copy the `export_rfdetr_seg.py` file from `DeepStream-Yolo-Seg/utils` directory to the `ultralytics` folder. 27 | 28 | #### 3. Download the model 29 | 30 | Download the `pt` file from [RF-DETR](https://github.com/roboflow/rf-detr) releases (example for RF-DETR-Seg-Preview) 31 | 32 | ``` 33 | wget https://storage.googleapis.com/rfdetr/rf-detr-seg-preview.pt 34 | ``` 35 | 36 | **NOTE**: You can use your custom model. 37 | 38 | #### 4. Convert model 39 | 40 | Generate the ONNX model file (example for RF-DETR-Seg-Preview) 41 | 42 | ``` 43 | python3 export_rfdetr_seg.py -w rf-detr-seg-preview.pt --dynamic 44 | ``` 45 | 46 | **NOTE**: Minimum detection confidence threshold (example for conf-threshold = 0.25) 47 | 48 | The minimum detection confidence threshold is configured in the ONNX exporter file. The `pre-cluster-threshold` should be >= the value used in the ONNX model. 
49 | 50 | ``` 51 | --conf-threshold 0.25 52 | ``` 53 | 54 | **NOTE**: NMS IoU threshold (example for iou-threshold = 0.45) 55 | 56 | ``` 57 | --iou-threshold 0.45 58 | ``` 59 | 60 | **NOTE**: Maximum number of output detections (example for max-detections = 300) 61 | 62 | ``` 63 | --max-detections 300 64 | ``` 65 | 66 | **NOTE**: To change the inference size (defaut: 640) 67 | 68 | ``` 69 | -s SIZE 70 | --size SIZE 71 | -s HEIGHT WIDTH 72 | --size HEIGHT WIDTH 73 | ``` 74 | 75 | Example for 1280 76 | 77 | ``` 78 | -s 1280 79 | ``` 80 | 81 | or 82 | 83 | ``` 84 | -s 1280 1280 85 | ``` 86 | 87 | **NOTE**: To simplify the ONNX model 88 | 89 | ``` 90 | --simplify 91 | ``` 92 | 93 | **NOTE**: To use dynamic batch-size (DeepStream >= 6.1) 94 | 95 | ``` 96 | --dynamic 97 | ``` 98 | 99 | **NOTE**: To use static batch-size (example for batch-size = 4) 100 | 101 | ``` 102 | --batch 4 103 | ``` 104 | 105 | #### 5. Copy generated files 106 | 107 | Copy the generated ONNX model file and labels.txt file (if generated) to the `DeepStream-Yolo-Seg` folder. 108 | 109 | ## 110 | 111 | ### Compile the lib 112 | 113 | 1. Open the `DeepStream-Yolo-Seg` folder and compile the lib 114 | 115 | 2. Set the `CUDA_VER` according to your DeepStream version 116 | 117 | ``` 118 | export CUDA_VER=XY.Z 119 | ``` 120 | 121 | * x86 platform 122 | 123 | ``` 124 | DeepStream 8.0 = 12.8 125 | DeepStream 7.1 = 12.6 126 | DeepStream 7.0 / 6.4 = 12.2 127 | DeepStream 6.3 = 12.1 128 | DeepStream 6.2 = 11.8 129 | DeepStream 6.1.1 = 11.7 130 | DeepStream 6.1 = 11.6 131 | DeepStream 6.0.1 / 6.0 = 11.4 132 | ``` 133 | 134 | * Jetson platform 135 | 136 | ``` 137 | DeepStream 8.0 = 13.0 138 | DeepStream 7.1 = 12.6 139 | DeepStream 7.0 / 6.4 = 12.2 140 | DeepStream 6.3 / 6.2 / 6.1.1 / 6.1 = 11.4 141 | DeepStream 6.0.1 / 6.0 = 10.2 142 | ``` 143 | 144 | 3. 
Make the lib 145 | 146 | ``` 147 | make -C nvdsinfer_custom_impl_Yolo_seg clean && make -C nvdsinfer_custom_impl_Yolo_seg 148 | ``` 149 | 150 | ## 151 | 152 | ### Edit the config_infer_primary_rfdetr_seg file 153 | 154 | Edit the `config_infer_primary_rfdetr_seg.txt` file according to your model (example for RF-DETR-Seg-Preview) 155 | 156 | ``` 157 | [property] 158 | ... 159 | onnx-file=rf-detr-seg-preview.onnx 160 | ... 161 | num-detected-classes=91 162 | ... 163 | parse-bbox-instance-mask-func-name=NvDsInferParseYoloSeg 164 | ... 165 | ``` 166 | 167 | **NOTE**: To output the masks, use 168 | 169 | ``` 170 | [property] 171 | ... 172 | output-instance-mask=1 173 | segmentation-threshold=0.5 174 | ... 175 | ``` 176 | 177 | **NOTE**: The **RF-DETR-Seg** does not resize the input with padding. To get better accuracy, use 178 | 179 | ``` 180 | [property] 181 | ... 182 | maintain-aspect-ratio=0 183 | ... 184 | ``` 185 | -------------------------------------------------------------------------------- /docs/YOLO11_Seg.md: -------------------------------------------------------------------------------- 1 | # YOLO11-Seg usage 2 | 3 | **NOTE**: The yaml file is not required. 4 | 5 | * [Convert model](#convert-model) 6 | * [Compile the lib](#compile-the-lib) 7 | * [Edit the config_infer_primary_yolo11_seg file](#edit-the-config_infer_primary_yolo11_seg-file) 8 | 9 | ## 10 | 11 | ### Convert model 12 | 13 | #### 1. Download the YOLO11 repo and install the requirements 14 | 15 | ``` 16 | git clone https://github.com/ultralytics/ultralytics.git 17 | cd ultralytics 18 | pip3 install -e . 19 | pip3 install onnx onnxslim onnxruntime 20 | ``` 21 | 22 | **NOTE**: It is recommended to use Python virtualenv. 23 | 24 | #### 2. Copy conversor 25 | 26 | Copy the `export_yolo11_seg.py` file from `DeepStream-Yolo-Seg/utils` directory to the `ultralytics` folder. 27 | 28 | #### 3.
Download the model 29 | 30 | Download the `pt` file from [YOLO11](https://github.com/ultralytics/assets/releases/) releases (example for YOLO11s-Seg) 31 | 32 | ``` 33 | wget https://github.com/ultralytics/assets/releases/download/v8.3.0/yolo11s-seg.pt 34 | ``` 35 | 36 | **NOTE**: You can use your custom model. 37 | 38 | #### 4. Convert model 39 | 40 | Generate the ONNX model file (example for YOLO11s-Seg) 41 | 42 | ``` 43 | python3 export_yolo11_seg.py -w yolo11s-seg.pt --dynamic 44 | ``` 45 | 46 | **NOTE**: Minimum detection confidence threshold (example for conf-threshold = 0.25) 47 | 48 | The minimum detection confidence threshold is configured in the ONNX exporter file. The `pre-cluster-threshold` should be >= the value used in the ONNX model. 49 | 50 | ``` 51 | --conf-threshold 0.25 52 | ``` 53 | 54 | **NOTE**: NMS IoU threshold (example for iou-threshold = 0.45) 55 | 56 | ``` 57 | --iou-threshold 0.45 58 | ``` 59 | 60 | **NOTE**: Maximum number of output detections (example for max-detections = 300) 61 | 62 | ``` 63 | --max-detections 300 64 | ``` 65 | 66 | **NOTE**: To change the inference size (defaut: 640) 67 | 68 | ``` 69 | -s SIZE 70 | --size SIZE 71 | -s HEIGHT WIDTH 72 | --size HEIGHT WIDTH 73 | ``` 74 | 75 | Example for 1280 76 | 77 | ``` 78 | -s 1280 79 | ``` 80 | 81 | or 82 | 83 | ``` 84 | -s 1280 1280 85 | ``` 86 | 87 | **NOTE**: To simplify the ONNX model 88 | 89 | ``` 90 | --simplify 91 | ``` 92 | 93 | **NOTE**: To use dynamic batch-size (DeepStream >= 6.1) 94 | 95 | ``` 96 | --dynamic 97 | ``` 98 | 99 | **NOTE**: To use static batch-size (example for batch-size = 4) 100 | 101 | ``` 102 | --batch 4 103 | ``` 104 | 105 | #### 5. Copy generated files 106 | 107 | Copy the generated ONNX model file and labels.txt file (if generated) to the `DeepStream-Yolo-Seg` folder. 108 | 109 | ## 110 | 111 | ### Compile the lib 112 | 113 | 1. Open the `DeepStream-Yolo-Seg` folder and compile the lib 114 | 115 | 2. 
Set the `CUDA_VER` according to your DeepStream version 116 | 117 | ``` 118 | export CUDA_VER=XY.Z 119 | ``` 120 | 121 | * x86 platform 122 | 123 | ``` 124 | DeepStream 8.0 = 12.8 125 | DeepStream 7.1 = 12.6 126 | DeepStream 7.0 / 6.4 = 12.2 127 | DeepStream 6.3 = 12.1 128 | DeepStream 6.2 = 11.8 129 | DeepStream 6.1.1 = 11.7 130 | DeepStream 6.1 = 11.6 131 | DeepStream 6.0.1 / 6.0 = 11.4 132 | ``` 133 | 134 | * Jetson platform 135 | 136 | ``` 137 | DeepStream 8.0 = 13.0 138 | DeepStream 7.1 = 12.6 139 | DeepStream 7.0 / 6.4 = 12.2 140 | DeepStream 6.3 / 6.2 / 6.1.1 / 6.1 = 11.4 141 | DeepStream 6.0.1 / 6.0 = 10.2 142 | ``` 143 | 144 | 3. Make the lib 145 | 146 | ``` 147 | make -C nvdsinfer_custom_impl_Yolo_seg clean && make -C nvdsinfer_custom_impl_Yolo_seg 148 | ``` 149 | 150 | ## 151 | 152 | ### Edit the config_infer_primary_yolo11_seg file 153 | 154 | Edit the `config_infer_primary_yolo11_seg.txt` file according to your model (example for YOLO11s-Seg) 155 | 156 | ``` 157 | [property] 158 | ... 159 | onnx-file=yolo11s-seg.onnx 160 | ... 161 | num-detected-classes=80 162 | ... 163 | parse-bbox-func-name=NvDsInferParseYoloSeg 164 | ... 165 | ``` 166 | 167 | **NOTE**: To output the masks, use 168 | 169 | ``` 170 | [property] 171 | ... 172 | output-instance-mask=1 173 | segmentation-threshold=0.5 174 | ... 175 | ``` 176 | 177 | **NOTE**: The **YOLO11-Seg** resizes the input with center padding. To get better accuracy, use 178 | 179 | ``` 180 | [property] 181 | ... 182 | maintain-aspect-ratio=1 183 | symmetric-padding=1 184 | ... 185 | ``` 186 | -------------------------------------------------------------------------------- /docs/YOLOv8_Seg.md: -------------------------------------------------------------------------------- 1 | # YOLOv8-Seg usage 2 | 3 | **NOTE**: The yaml file is not required. 
4 | 5 | * [Convert model](#convert-model) 6 | * [Compile the lib](#compile-the-lib) 7 | * [Edit the config_infer_primary_yoloV8_seg file](#edit-the-config_infer_primary_yolov8_seg-file) 8 | 9 | ## 10 | 11 | ### Convert model 12 | 13 | #### 1. Download the YOLOv8 repo and install the requirements 14 | 15 | ``` 16 | git clone https://github.com/ultralytics/ultralytics.git 17 | cd ultralytics 18 | pip3 install -e . 19 | pip3 install onnx onnxslim onnxruntime 20 | ``` 21 | 22 | **NOTE**: It is recommended to use Python virtualenv. 23 | 24 | #### 2. Copy conversor 25 | 26 | Copy the `export_yoloV8_seg.py` file from `DeepStream-Yolo-Seg/utils` directory to the `ultralytics` folder. 27 | 28 | #### 3. Download the model 29 | 30 | Download the `pt` file from [YOLOv8](https://github.com/ultralytics/assets/releases/) releases (example for YOLOv8s-Seg) 31 | 32 | ``` 33 | wget https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov8s-seg.pt 34 | ``` 35 | 36 | **NOTE**: You can use your custom model. 37 | 38 | #### 4. Convert model 39 | 40 | Generate the ONNX model file (example for YOLOv8s-Seg) 41 | 42 | ``` 43 | python3 export_yoloV8_seg.py -w yolov8s-seg.pt --dynamic 44 | ``` 45 | 46 | **NOTE**: Minimum detection confidence threshold (example for conf-threshold = 0.25) 47 | 48 | The minimum detection confidence threshold is configured in the ONNX exporter file. The `pre-cluster-threshold` should be >= the value used in the ONNX model. 
49 | 50 | ``` 51 | --conf-threshold 0.25 52 | ``` 53 | 54 | **NOTE**: NMS IoU threshold (example for iou-threshold = 0.45) 55 | 56 | ``` 57 | --iou-threshold 0.45 58 | ``` 59 | 60 | **NOTE**: Maximum number of output detections (example for max-detections = 300) 61 | 62 | ``` 63 | --max-detections 300 64 | ``` 65 | 66 | **NOTE**: To change the inference size (defaut: 640) 67 | 68 | ``` 69 | -s SIZE 70 | --size SIZE 71 | -s HEIGHT WIDTH 72 | --size HEIGHT WIDTH 73 | ``` 74 | 75 | Example for 1280 76 | 77 | ``` 78 | -s 1280 79 | ``` 80 | 81 | or 82 | 83 | ``` 84 | -s 1280 1280 85 | ``` 86 | 87 | **NOTE**: To simplify the ONNX model 88 | 89 | ``` 90 | --simplify 91 | ``` 92 | 93 | **NOTE**: To use dynamic batch-size (DeepStream >= 6.1) 94 | 95 | ``` 96 | --dynamic 97 | ``` 98 | 99 | **NOTE**: To use static batch-size (example for batch-size = 4) 100 | 101 | ``` 102 | --batch 4 103 | ``` 104 | 105 | #### 5. Copy generated files 106 | 107 | Copy the generated ONNX model file and labels.txt file (if generated) to the `DeepStream-Yolo-Seg` folder. 108 | 109 | ## 110 | 111 | ### Compile the lib 112 | 113 | 1. Open the `DeepStream-Yolo-Seg` folder and compile the lib 114 | 115 | 2. Set the `CUDA_VER` according to your DeepStream version 116 | 117 | ``` 118 | export CUDA_VER=XY.Z 119 | ``` 120 | 121 | * x86 platform 122 | 123 | ``` 124 | DeepStream 8.0 = 12.8 125 | DeepStream 7.1 = 12.6 126 | DeepStream 7.0 / 6.4 = 12.2 127 | DeepStream 6.3 = 12.1 128 | DeepStream 6.2 = 11.8 129 | DeepStream 6.1.1 = 11.7 130 | DeepStream 6.1 = 11.6 131 | DeepStream 6.0.1 / 6.0 = 11.4 132 | ``` 133 | 134 | * Jetson platform 135 | 136 | ``` 137 | DeepStream 8.0 = 13.0 138 | DeepStream 7.1 = 12.6 139 | DeepStream 7.0 / 6.4 = 12.2 140 | DeepStream 6.3 / 6.2 / 6.1.1 / 6.1 = 11.4 141 | DeepStream 6.0.1 / 6.0 = 10.2 142 | ``` 143 | 144 | 3. 
Make the lib 145 | 146 | ``` 147 | make -C nvdsinfer_custom_impl_Yolo_seg clean && make -C nvdsinfer_custom_impl_Yolo_seg 148 | ``` 149 | 150 | ## 151 | 152 | ### Edit the config_infer_primary_yoloV8_seg file 153 | 154 | Edit the `config_infer_primary_yoloV8_seg.txt` file according to your model (example for YOLOv8s-Seg) 155 | 156 | ``` 157 | [property] 158 | ... 159 | onnx-file=yolov8s-seg.onnx 160 | ... 161 | num-detected-classes=80 162 | ... 163 | parse-bbox-func-name=NvDsInferParseYoloSeg 164 | ... 165 | ``` 166 | 167 | **NOTE**: To output the masks, use 168 | 169 | ``` 170 | [property] 171 | ... 172 | output-instance-mask=1 173 | segmentation-threshold=0.5 174 | ... 175 | ``` 176 | 177 | **NOTE**: The **YOLOv8-Seg** resizes the input with center padding. To get better accuracy, use 178 | 179 | ``` 180 | [property] 181 | ... 182 | maintain-aspect-ratio=1 183 | symmetric-padding=1 184 | ... 185 | ``` 186 | -------------------------------------------------------------------------------- /nvdsinfer_custom_impl_Yolo_seg/trt_plugins/efficientNMSPlugin/efficientNMSPlugin.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: Apache-2.0 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | #ifndef TRT_EFFICIENT_NMS_PLUGIN_H 18 | #define TRT_EFFICIENT_NMS_PLUGIN_H 19 | 20 | #include 21 | 22 | #include "../common.h" 23 | #include "efficientNMSParameters.h" 24 | 25 | namespace nvinfer1 26 | { 27 | namespace plugin 28 | { 29 | 30 | class EfficientNMSPlugin : public IPluginV2DynamicExt 31 | { 32 | public: 33 | explicit EfficientNMSPlugin(EfficientNMSParameters param); 34 | EfficientNMSPlugin(void const* data, size_t length); 35 | ~EfficientNMSPlugin() override = default; 36 | 37 | // IPluginV2 methods 38 | char const* getPluginType() const noexcept override; 39 | char const* getPluginVersion() const noexcept override; 40 | int32_t getNbOutputs() const noexcept override; 41 | int32_t initialize() noexcept override; 42 | void terminate() noexcept override; 43 | size_t getSerializationSize() const noexcept override; 44 | void serialize(void* buffer) const noexcept override; 45 | void destroy() noexcept override; 46 | void setPluginNamespace(char const* libNamespace) noexcept override; 47 | char const* getPluginNamespace() const noexcept override; 48 | 49 | // IPluginV2Ext methods 50 | nvinfer1::DataType getOutputDataType( 51 | int32_t index, nvinfer1::DataType const* inputType, int32_t nbInputs) const noexcept override; 52 | 53 | // IPluginV2DynamicExt methods 54 | IPluginV2DynamicExt* clone() const noexcept override; 55 | DimsExprs getOutputDimensions( 56 | int32_t outputIndex, DimsExprs const* inputs, int32_t nbInputs, IExprBuilder& exprBuilder) noexcept override; 57 | bool supportsFormatCombination( 58 | int32_t pos, PluginTensorDesc const* inOut, int32_t nbInputs, int32_t nbOutputs) noexcept override; 59 | void configurePlugin(DynamicPluginTensorDesc const* in, int32_t nbInputs, DynamicPluginTensorDesc const* out, 60 | int32_t nbOutputs) noexcept override; 61 | size_t getWorkspaceSize(PluginTensorDesc const* inputs, int32_t nbInputs, PluginTensorDesc const* outputs, 62 | int32_t nbOutputs) const noexcept override; 63 | int32_t 
enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* outputDesc, void const* const* inputs, 64 | void* const* outputs, void* workspace, cudaStream_t stream) noexcept override; 65 | 66 | protected: 67 | EfficientNMSParameters mParam{}; 68 | bool initialized{false}; 69 | std::string mNamespace; 70 | 71 | private: 72 | void deserialize(int8_t const* data, size_t length); 73 | }; 74 | 75 | // Standard NMS Plugin Operation 76 | class EfficientNMSPluginCreator : public nvinfer1::pluginInternal::BaseCreator 77 | { 78 | public: 79 | EfficientNMSPluginCreator(); 80 | ~EfficientNMSPluginCreator() override = default; 81 | 82 | char const* getPluginName() const noexcept override; 83 | char const* getPluginVersion() const noexcept override; 84 | PluginFieldCollection const* getFieldNames() noexcept override; 85 | 86 | IPluginV2DynamicExt* createPlugin(char const* name, PluginFieldCollection const* fc) noexcept override; 87 | IPluginV2DynamicExt* deserializePlugin( 88 | char const* name, void const* serialData, size_t serialLength) noexcept override; 89 | 90 | protected: 91 | PluginFieldCollection mFC; 92 | EfficientNMSParameters mParam; 93 | std::vector mPluginAttributes; 94 | std::string mPluginName; 95 | }; 96 | 97 | } // namespace plugin 98 | } // namespace nvinfer1 99 | 100 | #endif // TRT_EFFICIENT_NMS_PLUGIN_H 101 | -------------------------------------------------------------------------------- /docs/YOLOv5_Seg.md: -------------------------------------------------------------------------------- 1 | # YOLOv5-Seg usage 2 | 3 | **NOTE**: The yaml file is not required. 4 | 5 | * [Convert model](#convert-model) 6 | * [Compile the lib](#compile-the-lib) 7 | * [Edit the config_infer_primary_yoloV5_seg file](#edit-the-config_infer_primary_yolov5_seg-file) 8 | 9 | ## 10 | 11 | ### Convert model 12 | 13 | #### 1. 
Download the YOLOv5 repo and install the requirements 14 | 15 | ``` 16 | git clone https://github.com/ultralytics/yolov5.git 17 | cd yolov5 18 | pip3 install -r requirements.txt 19 | pip3 install onnx onnxslim onnxruntime 20 | ``` 21 | 22 | **NOTE**: It is recommended to use Python virtualenv. 23 | 24 | #### 2. Copy conversor 25 | 26 | Copy the `export_yoloV5_seg.py` file from `DeepStream-Yolo-Seg/utils` directory to the `yolov5` folder. 27 | 28 | #### 3. Download the model 29 | 30 | Download the `pt` file from [YOLOv5](https://github.com/ultralytics/yolov5/releases/) releases (example for YOLOv5s-Seg 7.0) 31 | 32 | ``` 33 | wget https://github.com/ultralytics/yolov5/releases/download/v7.0/yolov5s-seg.pt 34 | ``` 35 | 36 | **NOTE**: You can use your custom model. 37 | 38 | #### 4. Convert model 39 | 40 | Generate the ONNX model file (example for YOLOv5s-Seg) 41 | 42 | ``` 43 | python3 export_yoloV5_seg.py -w yolov5s-seg.pt --dynamic 44 | ``` 45 | 46 | **NOTE**: Minimum detection confidence threshold (example for conf-threshold = 0.25) 47 | 48 | The minimum detection confidence threshold is configured in the ONNX exporter file. The `pre-cluster-threshold` should be >= the value used in the ONNX model. 
49 | 50 | ``` 51 | --conf-threshold 0.25 52 | ``` 53 | 54 | **NOTE**: NMS IoU threshold (example for iou-threshold = 0.45) 55 | 56 | ``` 57 | --iou-threshold 0.45 58 | ``` 59 | 60 | **NOTE**: Maximum number of output detections (example for max-detections = 300) 61 | 62 | ``` 63 | --max-detections 300 64 | ``` 65 | 66 | **NOTE**: To convert a P6 model 67 | 68 | ``` 69 | --p6 70 | ``` 71 | 72 | **NOTE**: To change the inference size (defaut: 640 / 1280 for `--p6` models) 73 | 74 | ``` 75 | -s SIZE 76 | --size SIZE 77 | -s HEIGHT WIDTH 78 | --size HEIGHT WIDTH 79 | ``` 80 | 81 | Example for 1280 82 | 83 | ``` 84 | -s 1280 85 | ``` 86 | 87 | or 88 | 89 | ``` 90 | -s 1280 1280 91 | ``` 92 | 93 | **NOTE**: To simplify the ONNX model 94 | 95 | ``` 96 | --simplify 97 | ``` 98 | 99 | **NOTE**: To use dynamic batch-size (DeepStream >= 6.1) 100 | 101 | ``` 102 | --dynamic 103 | ``` 104 | 105 | **NOTE**: To use static batch-size (example for batch-size = 4) 106 | 107 | ``` 108 | --batch 4 109 | ``` 110 | 111 | #### 5. Copy generated files 112 | 113 | Copy the generated ONNX model file and labels.txt file (if generated) to the `DeepStream-Yolo-Seg` folder. 114 | 115 | ## 116 | 117 | ### Compile the lib 118 | 119 | 1. Open the `DeepStream-Yolo-Seg` folder and compile the lib 120 | 121 | 2. Set the `CUDA_VER` according to your DeepStream version 122 | 123 | ``` 124 | export CUDA_VER=XY.Z 125 | ``` 126 | 127 | * x86 platform 128 | 129 | ``` 130 | DeepStream 8.0 = 12.8 131 | DeepStream 7.1 = 12.6 132 | DeepStream 7.0 / 6.4 = 12.2 133 | DeepStream 6.3 = 12.1 134 | DeepStream 6.2 = 11.8 135 | DeepStream 6.1.1 = 11.7 136 | DeepStream 6.1 = 11.6 137 | DeepStream 6.0.1 / 6.0 = 11.4 138 | ``` 139 | 140 | * Jetson platform 141 | 142 | ``` 143 | DeepStream 8.0 = 13.0 144 | DeepStream 7.1 = 12.6 145 | DeepStream 7.0 / 6.4 = 12.2 146 | DeepStream 6.3 / 6.2 / 6.1.1 / 6.1 = 11.4 147 | DeepStream 6.0.1 / 6.0 = 10.2 148 | ``` 149 | 150 | 3. 
Make the lib 151 | 152 | ``` 153 | make -C nvdsinfer_custom_impl_Yolo_seg clean && make -C nvdsinfer_custom_impl_Yolo_seg 154 | ``` 155 | 156 | ## 157 | 158 | ### Edit the config_infer_primary_yoloV5_seg file 159 | 160 | Edit the `config_infer_primary_yoloV5_seg.txt` file according to your model (example for YOLOv5s-Seg) 161 | 162 | ``` 163 | [property] 164 | ... 165 | onnx-file=yolov5s-seg.onnx 166 | ... 167 | num-detected-classes=80 168 | ... 169 | parse-bbox-func-name=NvDsInferParseYoloSeg 170 | ... 171 | ``` 172 | 173 | **NOTE**: To output the masks, use 174 | 175 | ``` 176 | [property] 177 | ... 178 | output-instance-mask=1 179 | segmentation-threshold=0.5 180 | ... 181 | ``` 182 | 183 | **NOTE**: The **YOLOv5-Seg** resizes the input with center padding. To get better accuracy, use 184 | 185 | ``` 186 | [property] 187 | ... 188 | maintain-aspect-ratio=1 189 | symmetric-padding=1 190 | ... 191 | ``` 192 | -------------------------------------------------------------------------------- /docs/YOLOv7_Seg.md: -------------------------------------------------------------------------------- 1 | # YOLOv7-Seg usage 2 | 3 | **NOTE**: The yaml file is not required. 4 | 5 | * [Convert model](#convert-model) 6 | * [Compile the lib](#compile-the-lib) 7 | * [Edit the config_infer_primary_yoloV7_seg file](#edit-the-config_infer_primary_yolov7_seg-file) 8 | 9 | ## 10 | 11 | ### Convert model 12 | 13 | #### 1. Download the YOLOv7 repo and install the requirements 14 | 15 | ``` 16 | git clone -b u7 https://github.com/WongKinYiu/yolov7 17 | cd yolov7/seg 18 | pip3 install -r requirements.txt 19 | pip3 install onnx onnxslim onnxruntime 20 | ``` 21 | 22 | **NOTE**: It is recommended to use Python virtualenv. 23 | 24 | #### 2. Copy conversor 25 | 26 | Copy the `export_yoloV7_seg.py` file from `DeepStream-Yolo-Seg/utils` directory to the `yolov7/seg` folder. 27 | 28 | #### 3. 
Download the model 29 | 30 | Download the `pt` file from [YOLOv7](https://github.com/WongKinYiu/yolov7/releases/) releases (example for YOLOv7-Seg) 31 | 32 | ``` 33 | wget https://github.com/WongKinYiu/yolov7/releases/download/v0.1/yolov7-seg.pt 34 | ``` 35 | 36 | **NOTE**: You can use your custom model. 37 | 38 | #### 4. Convert model 39 | 40 | Generate the ONNX model file (example for YOLOv7-Seg) 41 | 42 | ``` 43 | python3 export_yoloV7_seg.py -w yolov7-seg.pt --dynamic 44 | ``` 45 | 46 | **NOTE**: Minimum detection confidence threshold (example for conf-threshold = 0.25) 47 | 48 | The minimum detection confidence threshold is configured in the ONNX exporter file. The `pre-cluster-threshold` should be >= the value used in the ONNX model. 49 | 50 | ``` 51 | --conf-threshold 0.25 52 | ``` 53 | 54 | **NOTE**: NMS IoU threshold (example for iou-threshold = 0.45) 55 | 56 | ``` 57 | --iou-threshold 0.45 58 | ``` 59 | 60 | **NOTE**: Maximum number of output detections (example for max-detections = 300) 61 | 62 | ``` 63 | --max-detections 300 64 | ``` 65 | 66 | **NOTE**: To convert a P6 model 67 | 68 | ``` 69 | --p6 70 | ``` 71 | 72 | **NOTE**: To change the inference size (defaut: 640 / 1280 for `--p6` models) 73 | 74 | ``` 75 | -s SIZE 76 | --size SIZE 77 | -s HEIGHT WIDTH 78 | --size HEIGHT WIDTH 79 | ``` 80 | 81 | Example for 1280 82 | 83 | ``` 84 | -s 1280 85 | ``` 86 | 87 | or 88 | 89 | ``` 90 | -s 1280 1280 91 | ``` 92 | 93 | **NOTE**: To simplify the ONNX model 94 | 95 | ``` 96 | --simplify 97 | ``` 98 | 99 | **NOTE**: To use dynamic batch-size (DeepStream >= 6.1) 100 | 101 | ``` 102 | --dynamic 103 | ``` 104 | 105 | **NOTE**: To use static batch-size (example for batch-size = 4) 106 | 107 | ``` 108 | --batch 4 109 | ``` 110 | 111 | #### 5. Copy generated files 112 | 113 | Copy the generated ONNX model file and labels.txt file (if generated) to the `DeepStream-Yolo-Seg` folder. 114 | 115 | ## 116 | 117 | ### Compile the lib 118 | 119 | 1. 
Open the `DeepStream-Yolo-Seg` folder and compile the lib 120 | 121 | 2. Set the `CUDA_VER` according to your DeepStream version 122 | 123 | ``` 124 | export CUDA_VER=XY.Z 125 | ``` 126 | 127 | * x86 platform 128 | 129 | ``` 130 | DeepStream 8.0 = 12.8 131 | DeepStream 7.1 = 12.6 132 | DeepStream 7.0 / 6.4 = 12.2 133 | DeepStream 6.3 = 12.1 134 | DeepStream 6.2 = 11.8 135 | DeepStream 6.1.1 = 11.7 136 | DeepStream 6.1 = 11.6 137 | DeepStream 6.0.1 / 6.0 = 11.4 138 | ``` 139 | 140 | * Jetson platform 141 | 142 | ``` 143 | DeepStream 8.0 = 13.0 144 | DeepStream 7.1 = 12.6 145 | DeepStream 7.0 / 6.4 = 12.2 146 | DeepStream 6.3 / 6.2 / 6.1.1 / 6.1 = 11.4 147 | DeepStream 6.0.1 / 6.0 = 10.2 148 | ``` 149 | 150 | 3. Make the lib 151 | 152 | ``` 153 | make -C nvdsinfer_custom_impl_Yolo_seg clean && make -C nvdsinfer_custom_impl_Yolo_seg 154 | ``` 155 | 156 | ## 157 | 158 | ### Edit the config_infer_primary_yoloV7_seg file 159 | 160 | Edit the `config_infer_primary_yoloV7_seg.txt` file according to your model (example for YOLOv7-Seg) 161 | 162 | ``` 163 | [property] 164 | ... 165 | onnx-file=yolov7-seg.onnx 166 | ... 167 | num-detected-classes=80 168 | ... 169 | parse-bbox-func-name=NvDsInferParseYoloSeg 170 | ... 171 | ``` 172 | 173 | **NOTE**: To output the masks, use 174 | 175 | ``` 176 | [property] 177 | ... 178 | output-instance-mask=1 179 | segmentation-threshold=0.5 180 | ... 181 | ``` 182 | 183 | **NOTE**: The **YOLOv7-Seg** resizes the input with center padding. To get better accuracy, use 184 | 185 | ``` 186 | [property] 187 | ... 188 | maintain-aspect-ratio=1 189 | symmetric-padding=1 190 | ... 191 | ``` 192 | -------------------------------------------------------------------------------- /docs/YOLOv7_Mask.md: -------------------------------------------------------------------------------- 1 | # YOLOv7-Mask usage 2 | 3 | **NOTE**: The yaml file is not required. 
4 | 5 | * [Convert model](#convert-model) 6 | * [Compile the lib](#compile-the-lib) 7 | * [Edit the config_infer_primary_yoloV7_mask file](#edit-the-config_infer_primary_yolov7_mask-file) 8 | 9 | ## 10 | 11 | ### Convert model 12 | 13 | #### 1. Download the YOLOv7 repo and install the requirements 14 | 15 | ``` 16 | git clone -b mask https://github.com/WongKinYiu/yolov7 17 | cd yolov7 18 | pip3 install -r requirements.txt 19 | git clone https://github.com/facebookresearch/detectron2.git 20 | pip3 install -e detectron2 21 | pip3 install onnx onnxslim onnxruntime 22 | ``` 23 | 24 | **NOTE**: It is recommended to use Python virtualenv. 25 | 26 | #### 2. Copy conversor 27 | 28 | Copy the `export_yoloV7_mask.py` file from `DeepStream-Yolo-Mask/utils` directory to the `yolov7` folder. 29 | 30 | #### 3. Download the model 31 | 32 | Download the `pt` file from [YOLOv7](https://github.com/WongKinYiu/yolov7/releases/) releases (example for YOLOv7-Mask) 33 | 34 | ``` 35 | wget https://github.com/WongKinYiu/yolov7/releases/download/v0.1/yolov7-mask.pt 36 | ``` 37 | 38 | **NOTE**: You can use your custom model. 39 | 40 | #### 4. Convert model 41 | 42 | Generate the ONNX model file (example for YOLOv7-Mask) 43 | 44 | ``` 45 | python3 export_yoloV7_mask.py -w yolov7-mask.pt --dynamic 46 | ``` 47 | 48 | **NOTE**: Minimum detection confidence threshold (example for conf-threshold = 0.25) 49 | 50 | The minimum detection confidence threshold is configured in the ONNX exporter file. The `pre-cluster-threshold` should be >= the value used in the ONNX model. 
51 | 52 | ``` 53 | --conf-threshold 0.25 54 | ``` 55 | 56 | **NOTE**: NMS IoU threshold (example for iou-threshold = 0.45) 57 | 58 | ``` 59 | --iou-threshold 0.45 60 | ``` 61 | 62 | **NOTE**: Maximum number of output detections (example for max-detections = 300) 63 | 64 | ``` 65 | --max-detections 300 66 | ``` 67 | 68 | **NOTE**: To convert a P6 model 69 | 70 | ``` 71 | --p6 72 | ``` 73 | 74 | **NOTE**: To change the inference size (defaut: 640 / 1280 for `--p6` models) 75 | 76 | ``` 77 | -s SIZE 78 | --size SIZE 79 | -s HEIGHT WIDTH 80 | --size HEIGHT WIDTH 81 | ``` 82 | 83 | Example for 1280 84 | 85 | ``` 86 | -s 1280 87 | ``` 88 | 89 | or 90 | 91 | ``` 92 | -s 1280 1280 93 | ``` 94 | 95 | **NOTE**: To simplify the ONNX model 96 | 97 | ``` 98 | --simplify 99 | ``` 100 | 101 | **NOTE**: To use dynamic batch-size (DeepStream >= 6.1) 102 | 103 | ``` 104 | --dynamic 105 | ``` 106 | 107 | **NOTE**: To use static batch-size (example for batch-size = 4) 108 | 109 | ``` 110 | --batch 4 111 | ``` 112 | 113 | #### 5. Copy generated files 114 | 115 | Copy the generated ONNX model file and labels.txt file (if generated) to the `DeepStream-Yolo-Mask` folder. 116 | 117 | ## 118 | 119 | ### Compile the lib 120 | 121 | 1. Open the `DeepStream-Yolo-Mask` folder and compile the lib 122 | 123 | 2. Set the `CUDA_VER` according to your DeepStream version 124 | 125 | ``` 126 | export CUDA_VER=XY.Z 127 | ``` 128 | 129 | * x86 platform 130 | 131 | ``` 132 | DeepStream 8.0 = 12.8 133 | DeepStream 7.1 = 12.6 134 | DeepStream 7.0 / 6.4 = 12.2 135 | DeepStream 6.3 = 12.1 136 | DeepStream 6.2 = 11.8 137 | DeepStream 6.1.1 = 11.7 138 | DeepStream 6.1 = 11.6 139 | DeepStream 6.0.1 / 6.0 = 11.4 140 | ``` 141 | 142 | * Jetson platform 143 | 144 | ``` 145 | DeepStream 8.0 = 13.0 146 | DeepStream 7.1 = 12.6 147 | DeepStream 7.0 / 6.4 = 12.2 148 | DeepStream 6.3 / 6.2 / 6.1.1 / 6.1 = 11.4 149 | DeepStream 6.0.1 / 6.0 = 10.2 150 | ``` 151 | 152 | 3. 
Make the lib 153 | 154 | ``` 155 | make -C nvdsinfer_custom_impl_Yolo_mask clean && make -C nvdsinfer_custom_impl_Yolo_mask 156 | ``` 157 | 158 | ## 159 | 160 | ### Edit the config_infer_primary_yoloV7_mask file 161 | 162 | Edit the `config_infer_primary_yoloV7_mask.txt` file according to your model (example for YOLOv7-Mask) 163 | 164 | ``` 165 | [property] 166 | ... 167 | onnx-file=yolov7-mask.onnx 168 | ... 169 | num-detected-classes=80 170 | ... 171 | parse-bbox-func-name=NvDsInferParseYoloSeg 172 | ... 173 | ``` 174 | 175 | **NOTE**: To output the masks, use 176 | 177 | ``` 178 | [property] 179 | ... 180 | output-instance-mask=1 181 | segmentation-threshold=0.5 182 | ... 183 | ``` 184 | 185 | **NOTE**: The **YOLOv7-Mask** resizes the input with center padding. To get better accuracy, use 186 | 187 | ``` 188 | [property] 189 | ... 190 | maintain-aspect-ratio=1 191 | symmetric-padding=1 192 | ... 193 | ``` 194 | -------------------------------------------------------------------------------- /nvdsinfer_custom_impl_Yolo_seg/trt_plugins/roiAlignPlugin/roiAlignPlugin.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: Apache-2.0 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
namespace nvinfer1
{
namespace plugin
{

// TensorRT dynamic-shape plugin implementing ROIAlign for the exported ONNX
// graphs (custom "ROIAlignX_TRT" op). Declaration only: kernels live in
// roiAlignKernel.cu, method bodies in roiAlignPlugin.cpp.
class ROIAlign : public IPluginV2DynamicExt
{
public:
    // Construct from plugin attribute values (createPlugin path).
    ROIAlign(int32_t outputHeight, int32_t outputWidth, int32_t samplingRatio, int32_t mode, float spatialScale,
        int32_t aligned);
    // Reconstruct from a serialized engine blob (deserializePlugin path).
    ROIAlign(void const* data, size_t length);
    ROIAlign() = default;
    ~ROIAlign() override = default;

    // IPluginV2 methods
    char const* getPluginType() const noexcept override;
    char const* getPluginVersion() const noexcept override;
    int32_t getNbOutputs() const noexcept override;
    int32_t initialize() noexcept override;
    void terminate() noexcept override;
    size_t getSerializationSize() const noexcept override;
    void serialize(void* buffer) const noexcept override;
    void destroy() noexcept override;
    void setPluginNamespace(char const* libNamespace) noexcept override;
    char const* getPluginNamespace() const noexcept override;
    // NOTE(review): the three setters below look like leftovers copied from the
    // EfficientNMS plugin -- confirm they are actually defined/used for ROIAlign.
    void setClipParam(bool clip) noexcept;
    void setScoreBits(int32_t scoreBits) noexcept;
    void setCaffeSemantics(bool caffeSemantics) noexcept;

    // IPluginV2Ext methods
    nvinfer1::DataType getOutputDataType(
        int32_t index, nvinfer1::DataType const* inputType, int32_t nbInputs) const noexcept override;

    // IPluginV2DynamicExt methods
    IPluginV2DynamicExt* clone() const noexcept override;
    DimsExprs getOutputDimensions(
        int32_t outputIndex, DimsExprs const* inputs, int32_t nbInputs, IExprBuilder& exprBuilder) noexcept override;
    bool supportsFormatCombination(
        int32_t pos, PluginTensorDesc const* inOut, int32_t nbInputs, int32_t nbOutputs) noexcept override;
    void configurePlugin(DynamicPluginTensorDesc const* in, int32_t nbInputs, DynamicPluginTensorDesc const* out,
        int32_t nbOutputs) noexcept override;
    size_t getWorkspaceSize(PluginTensorDesc const* inputs, int32_t nbInputs, PluginTensorDesc const* outputs,
        int32_t nbOutputs) const noexcept override;
    int32_t enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* outputDesc, void const* const* inputs,
        void* const* outputs, void* workspace, cudaStream_t stream) noexcept override;

private:
    // Validation helpers; contracts defined in roiAlignPlugin.cpp.
    void checkValidInputs(nvinfer1::DynamicPluginTensorDesc const* inputs, int32_t nbInputDims);
    void validateAttributes(int32_t outputHeight, int32_t outputWidth, int32_t samplingRatio, int32_t mode,
        float spatialScale, int32_t aligned);

    // Plugin attributes (serialized into the engine).
    int32_t mOutputHeight{};
    int32_t mOutputWidth{};
    int32_t mSamplingRatio{};
    float mSpatialScale{};
    int32_t mMode{};    // pooling mode -- presumably avg/max per ONNX RoiAlign; confirm against the kernel
    int32_t mAligned{}; // presumably the half-pixel coordinate-offset flag; confirm against the kernel

    // Input shapes cached for enqueue (set where the .cpp configures the plugin).
    int32_t mROICount{};
    int32_t mFeatureLength{}; // number of channels
    int32_t mHeight{};
    int32_t mWidth{};

    int32_t mMaxThreadsPerBlock{}; // presumably queried from the device; confirm in .cpp

    std::string mNameSpace{};
};

// Factory registered with TensorRT's plugin registry; creates/deserializes ROIAlign.
class ROIAlignPluginCreator : public nvinfer1::pluginInternal::BaseCreator
{
public:
    ROIAlignPluginCreator();

    ~ROIAlignPluginCreator() override = default;

    char const* getPluginName() const noexcept override;

    char const* getPluginVersion() const noexcept override;

    PluginFieldCollection const* getFieldNames() noexcept override;

    IPluginV2DynamicExt* createPlugin(char const* name, nvinfer1::PluginFieldCollection const* fc) noexcept override;

    IPluginV2DynamicExt* deserializePlugin(
        char const* name, void const* serialData, size_t serialLength) noexcept override;

private:
    PluginFieldCollection mFC;
    std::vector<PluginField> mPluginAttributes;
};

} // namespace plugin
} // namespace nvinfer1
-------------------------------------------------------------------------------- /nvdsinfer_custom_impl_Yolo_seg/trt_plugins/efficientNMSPlugin/efficientNMSInference.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: Apache-2.0 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
// FP32 Intrinsics
//
// Thin wrappers over CUDA math intrinsics so the NMS device code can be written
// generically over float/__half via overload resolution. The *_rn intrinsics
// round to nearest even; __expf/__frcp_rn are the fast approximate forms.

float __device__ __inline__ exp_mp(const float a)
{
    return __expf(a);
}
float __device__ __inline__ sigmoid_mp(const float a)
{
    // 1 / (1 + e^{-a}) composed from reciprocal/add/exp intrinsics
    return __frcp_rn(__fadd_rn(1.f, __expf(-a)));
}
float __device__ __inline__ add_mp(const float a, const float b)
{
    return __fadd_rn(a, b);
}
float __device__ __inline__ sub_mp(const float a, const float b)
{
    return __fsub_rn(a, b);
}
float __device__ __inline__ mul_mp(const float a, const float b)
{
    return __fmul_rn(a, b);
}
bool __device__ __inline__ gt_mp(const float a, const float b)
{
    return a > b;
}
bool __device__ __inline__ lt_mp(const float a, const float b)
{
    return a < b;
}
bool __device__ __inline__ lte_mp(const float a, const float b)
{
    return a <= b;
}
bool __device__ __inline__ gte_mp(const float a, const float b)
{
    return a >= b;
}
lte_mp(const __half a, const __half b) 95 | { 96 | return __hle(a, b); 97 | } 98 | bool __device__ __inline__ gte_mp(const __half a, const __half b) 99 | { 100 | return __hge(a, b); 101 | } 102 | 103 | #else 104 | 105 | // FP16 Fallbacks on older architectures that lack support 106 | 107 | __half __device__ __inline__ exp_mp(const __half a) 108 | { 109 | return __float2half(exp_mp(__half2float(a))); 110 | } 111 | __half __device__ __inline__ sigmoid_mp(const __half a) 112 | { 113 | return __float2half(sigmoid_mp(__half2float(a))); 114 | } 115 | __half __device__ __inline__ add_mp(const __half a, const __half b) 116 | { 117 | return __float2half(add_mp(__half2float(a), __half2float(b))); 118 | } 119 | __half __device__ __inline__ sub_mp(const __half a, const __half b) 120 | { 121 | return __float2half(sub_mp(__half2float(a), __half2float(b))); 122 | } 123 | __half __device__ __inline__ mul_mp(const __half a, const __half b) 124 | { 125 | return __float2half(mul_mp(__half2float(a), __half2float(b))); 126 | } 127 | bool __device__ __inline__ gt_mp(const __half a, const __half b) 128 | { 129 | return __float2half(gt_mp(__half2float(a), __half2float(b))); 130 | } 131 | bool __device__ __inline__ lt_mp(const __half a, const __half b) 132 | { 133 | return __float2half(lt_mp(__half2float(a), __half2float(b))); 134 | } 135 | bool __device__ __inline__ lte_mp(const __half a, const __half b) 136 | { 137 | return __float2half(lte_mp(__half2float(a), __half2float(b))); 138 | } 139 | bool __device__ __inline__ gte_mp(const __half a, const __half b) 140 | { 141 | return __float2half(gte_mp(__half2float(a), __half2float(b))); 142 | } 143 | 144 | #endif 145 | 146 | template 147 | struct __align__(4 * sizeof(T)) BoxCorner; 148 | 149 | template 150 | struct __align__(4 * sizeof(T)) BoxCenterSize; 151 | 152 | template 153 | struct __align__(4 * sizeof(T)) BoxCorner 154 | { 155 | // For NMS/IOU purposes, YXYX coding is identical to XYXY 156 | T y1, x1, y2, x2; 157 | 158 | __device__ 
    // Ensure y1 <= y2 and x1 <= x2 using the arithmetic three-step swap
    // (a, b) -> (a-b, a+b) -> (b, a). NOTE(review): for floating-point types
    // this swap is subject to rounding, unlike a plain exchange -- confirm the
    // tolerance is acceptable here.
    __device__ void reorder()
    {
        if (gt_mp(y1, y2))
        {
            // Swap values, so y1 < y2
            y1 = sub_mp(y1, y2);
            y2 = add_mp(y1, y2);
            y1 = sub_mp(y2, y1);
        }
        if (gt_mp(x1, x2))
        {
            // Swap values, so x1 < x2
            x1 = sub_mp(x1, x2);
            x2 = add_mp(x1, x2);
            x1 = sub_mp(x2, x1);
        }
    }

    // Clamp every coordinate into [low, high].
    __device__ BoxCorner clip(T low, T high) const
    {
        return {lt_mp(y1, low) ? low : (gt_mp(y1, high) ? high : y1),
            lt_mp(x1, low) ? low : (gt_mp(x1, high) ? high : x1), lt_mp(y2, low) ? low : (gt_mp(y2, high) ? high : y2),
            lt_mp(x2, low) ? low : (gt_mp(x2, high) ? high : x2)};
    }

    // Add per-coordinate offsets onto an anchor box (corner-coded deltas).
    __device__ BoxCorner decode(BoxCorner anchor) const
    {
        return {add_mp(y1, anchor.y1), add_mp(x1, anchor.x1), add_mp(y2, anchor.y2), add_mp(x2, anchor.x2)};
    }

    // Box area; boxes with non-positive extent count as zero.
    __device__ float area() const
    {
        T w = sub_mp(x2, x1);
        T h = sub_mp(y2, y1);
        if (lte_mp(h, (T) 0))
        {
            return 0;
        }
        if (lte_mp(w, (T) 0))
        {
            return 0;
        }
        return (float) h * (float) w;
    }

    // Convert corner coding (y1,x1,y2,x2) to center/size coding (y,x,h,w).
    __device__ operator BoxCenterSize<T>() const
    {
        T w = sub_mp(x2, x1);
        T h = sub_mp(y2, y1);
        return BoxCenterSize<T>{add_mp(y1, mul_mp((T) 0.5, h)), add_mp(x1, mul_mp((T) 0.5, w)), h, w};
    }

    // Component-wise intersection of two corner boxes (result may be degenerate;
    // area() reports 0 for such boxes).
    __device__ static BoxCorner intersect(BoxCorner a, BoxCorner b)
    {
        return {gt_mp(a.y1, b.y1) ? a.y1 : b.y1, gt_mp(a.x1, b.x1) ? a.x1 : b.x1, lt_mp(a.y2, b.y2) ? a.y2 : b.y2,
            lt_mp(a.x2, b.x2) ? a.x2 : b.x2};
    }
a.x2 : b.x2}; 214 | } 215 | }; 216 | 217 | template 218 | struct __align__(4 * sizeof(T)) BoxCenterSize 219 | { 220 | // For NMS/IOU purposes, YXHW coding is identical to XYWH 221 | T y, x, h, w; 222 | 223 | __device__ void reorder() {} 224 | 225 | __device__ BoxCenterSize clip(T low, T high) const 226 | { 227 | return BoxCenterSize(BoxCorner(*this).clip(low, high)); 228 | } 229 | 230 | __device__ BoxCenterSize decode(BoxCenterSize anchor) const 231 | { 232 | return {add_mp(mul_mp(y, anchor.h), anchor.y), add_mp(mul_mp(x, anchor.w), anchor.x), 233 | mul_mp(anchor.h, exp_mp(h)), mul_mp(anchor.w, exp_mp(w))}; 234 | } 235 | 236 | __device__ float area() const 237 | { 238 | if (h <= (T) 0) 239 | { 240 | return 0; 241 | } 242 | if (w <= (T) 0) 243 | { 244 | return 0; 245 | } 246 | return (float) h * (float) w; 247 | } 248 | 249 | __device__ operator BoxCorner() const 250 | { 251 | T h2 = mul_mp(h, (T) 0.5); 252 | T w2 = mul_mp(w, (T) 0.5); 253 | return BoxCorner{sub_mp(y, h2), sub_mp(x, w2), add_mp(y, h2), add_mp(x, w2)}; 254 | } 255 | __device__ static BoxCenterSize intersect(BoxCenterSize a, BoxCenterSize b) 256 | { 257 | return BoxCenterSize(BoxCorner::intersect(BoxCorner(a), BoxCorner(b))); 258 | } 259 | }; 260 | 261 | #endif 262 | -------------------------------------------------------------------------------- /utils/export_yoloV5_seg.py: -------------------------------------------------------------------------------- 1 | import os 2 | import onnx 3 | import torch 4 | import torch.nn as nn 5 | 6 | from models.experimental import attempt_load 7 | from models.yolo import Detect 8 | 9 | 10 | class RoiAlign(torch.autograd.Function): 11 | @staticmethod 12 | def forward( 13 | self, 14 | X, 15 | rois, 16 | batch_indices, 17 | coordinate_transformation_mode, 18 | mode, 19 | output_height, 20 | output_width, 21 | sampling_ratio, 22 | spatial_scale 23 | ): 24 | C = X.shape[1] 25 | num_rois = rois.shape[0] 26 | return torch.randn([num_rois, C, output_height, output_width], 
class NMS(torch.autograd.Function):
    """Tracing stand-in for the TensorRT ``EfficientNMSX_TRT`` plugin.

    During ONNX tracing ``forward`` only needs to yield tensors with the right
    shapes and dtypes (contents are irrelevant), while ``symbolic`` emits the
    actual plugin node into the exported graph.
    """

    @staticmethod
    def forward(self, boxes, scores, score_threshold, iou_threshold, max_output_boxes):
        # Dummy outputs shaped like the plugin's five results.
        n_batch = scores.shape[0]
        n_classes = scores.shape[-1]
        per_image = (n_batch, max_output_boxes)
        counts = torch.randint(0, max_output_boxes, (n_batch, 1), dtype=torch.int32)
        out_boxes = torch.randn(n_batch, max_output_boxes, 4)
        out_scores = torch.randn(*per_image)
        out_classes = torch.randint(0, n_classes, per_image, dtype=torch.int32)
        out_indices = torch.randint(0, max_output_boxes, per_image, dtype=torch.int32)
        return counts, out_boxes, out_scores, out_classes, out_indices

    @staticmethod
    def symbolic(g, boxes, scores, score_threshold, iou_threshold, max_output_boxes):
        # Attribute suffixes follow torch.onnx conventions: _f = float, _i = int.
        nms_attrs = {
            "score_threshold_f": score_threshold,
            "iou_threshold_f": iou_threshold,
            "max_output_boxes_i": max_output_boxes,
            "background_class_i": -1,
            "score_activation_i": 0,
            "class_agnostic_i": 0,
            "box_coding_i": 0,
        }
        return g.op("TRT::EfficientNMSX_TRT", boxes, scores, outputs=5, **nms_attrs)
    def forward(self, x):
        # Decode traced YOLOv5-Seg outputs into one DeepStream-friendly tensor:
        # per image, max_detections rows of [x1, y1, x2, y2, score, class, mask...].
        # x[0]: raw predictions (batch, anchors, 4 box + 1 objectness + nc class
        #       scores + mask coefficients) -- layout grounded by the slices below.
        # x[1]: prototype masks; shape unpacked below as (batch, num_protos, H, W).
        preds = x[0]
        boxes = preds[:, :, :4]
        # (cx, cy, w, h) -> (x1, y1, x2, y2) expressed as one matmul so it traces
        # into a single ONNX node.
        convert_matrix = torch.tensor(
            [[1, 0, 1, 0], [0, 1, 0, 1], [-0.5, 0, 0.5, 0], [0, -0.5, 0, 0.5]], dtype=boxes.dtype, device=boxes.device
        )
        boxes @= convert_matrix
        objectness = preds[:, :, 4:5]
        scores = preds[:, :, 5:self.nc+5]
        # Final confidence = objectness * per-class probability.
        scores *= objectness
        masks = preds[:, :, self.nc+5:]
        protos = x[1]

        # Traced into the EfficientNMSX_TRT plugin node (see NMS.symbolic).
        num_detections, detection_boxes, detection_scores, detection_classes, detections_indices = NMS.apply(
            boxes, scores, self.conf_threshold, self.iou_threshold, self.max_detections
        )

        batch_size, num_protos, h_protos, w_protos = protos.shape

        total_detections = batch_size * self.max_detections

        # Flatten (batch, detection) into parallel index vectors so the kept
        # boxes/mask-coefficients can be gathered with a single advanced index.
        batch_index = torch.ones_like(detections_indices) * torch.arange(
            batch_size, device=boxes.device, dtype=torch.int32
        ).unsqueeze(1)
        batch_index = batch_index.view(total_detections).to(torch.int32)
        box_index = detections_indices.view(total_detections).to(torch.int32)

        selected_boxes = boxes[batch_index, box_index]
        selected_masks = masks[batch_index, box_index]

        # Traced into the ROIAlignX_TRT plugin node (see RoiAlign.symbolic).
        # spatial_scale=0.25 assumes the prototype grid is 1/4 of the box
        # coordinate resolution -- TODO confirm against the model's proto stride.
        pooled_proto = RoiAlign.apply(protos, selected_boxes, batch_index, 1, 1, int(h_protos), int(w_protos), 0, 0.25)

        # Mask logits = coefficients (1 x num_protos) @ pooled prototypes
        # (num_protos x H*W), then sigmoid to probabilities.
        masks_protos = torch.matmul(
            selected_masks.unsqueeze(1), pooled_proto.view(total_detections, num_protos, h_protos * w_protos)
        )
        masks_protos = masks_protos.sigmoid().view(batch_size, self.max_detections, h_protos * w_protos)

        return torch.cat(
            [detection_boxes, detection_scores.unsqueeze(-1), detection_classes.unsqueeze(-1), masks_protos], dim=-1
        )
def suppress_warnings():
    """Silence the noisy warning categories emitted while tracing/exporting."""
    import warnings
    for category in (
        torch.jit.TracerWarning,
        UserWarning,
        DeprecationWarning,
        FutureWarning,
        ResourceWarning,
    ):
        warnings.filterwarnings("ignore", category=category)
def parse_args():
    """Build and validate the CLI arguments for the YOLOv5-Seg conversion."""
    import argparse
    ap = argparse.ArgumentParser(description="DeepStream YOLOv5-Seg conversion")
    ap.add_argument("-w", "--weights", required=True, type=str, help="Input weights (.pt) file path (required)")
    ap.add_argument("-s", "--size", nargs="+", type=int, default=[640], help="Inference size [H,W] (default [640])")
    ap.add_argument("--p6", action="store_true", help="P6 model")
    ap.add_argument("--opset", type=int, default=17, help="ONNX opset version")
    ap.add_argument("--simplify", action="store_true", help="ONNX simplify model")
    ap.add_argument("--dynamic", action="store_true", help="Dynamic batch-size")
    ap.add_argument("--batch", type=int, default=1, help="Static batch-size")
    ap.add_argument(
        "--conf-threshold", type=float, default=0.25, help="Minimum detection confidence threshold (default 0.25)"
    )
    ap.add_argument("--iou-threshold", type=float, default=0.45, help="NMS IoU threshold (default 0.45)")
    ap.add_argument(
        "--max-detections", type=int, default=100, help="Maximum number of output detections (default 100)"
    )
    parsed = ap.parse_args()
    # Weights must exist; dynamic and static batch sizing are mutually exclusive.
    if not os.path.isfile(parsed.weights):
        raise SystemExit("Invalid weights file")
    if parsed.dynamic and parsed.batch > 1:
        raise SystemExit("Cannot set dynamic batch-size and static batch-size at same time")
    return parsed
class RoiAlign(torch.autograd.Function):
    """Tracing stand-in for the TensorRT ``ROIAlignX_TRT`` plugin.

    ``forward`` only needs to produce a tensor of the correct shape/dtype while
    tracing; ``symbolic`` writes the actual plugin node into the ONNX graph.
    """

    @staticmethod
    def forward(
        self,
        X,
        rois,
        batch_indices,
        coordinate_transformation_mode,
        mode,
        output_height,
        output_width,
        sampling_ratio,
        spatial_scale
    ):
        # Output is (num_rois, channels, output_height, output_width); contents
        # are irrelevant during tracing.
        n_channels = X.shape[1]
        n_rois = rois.shape[0]
        return torch.randn([n_rois, n_channels, output_height, output_width], device=rois.device, dtype=rois.dtype)

    @staticmethod
    def symbolic(
        g,
        X,
        rois,
        batch_indices,
        coordinate_transformation_mode,
        mode,
        output_height,
        output_width,
        sampling_ratio,
        spatial_scale
    ):
        # Attribute suffixes follow torch.onnx conventions: _i = int, _f = float.
        roi_attrs = {
            "coordinate_transformation_mode_i": coordinate_transformation_mode,
            "mode_i": mode,
            "output_height_i": output_height,
            "output_width_i": output_width,
            "sampling_ratio_i": sampling_ratio,
            "spatial_scale_f": spatial_scale,
        }
        return g.op("TRT::ROIAlignX_TRT", X, rois, batch_indices, **roi_attrs)
return g.op( 70 | "TRT::EfficientNMSX_TRT", 71 | boxes, 72 | scores, 73 | score_threshold_f=score_threshold, 74 | iou_threshold_f=iou_threshold, 75 | max_output_boxes_i=max_output_boxes, 76 | background_class_i=-1, 77 | score_activation_i=0, 78 | class_agnostic_i=0, 79 | box_coding_i=0, 80 | outputs=5 81 | ) 82 | 83 | 84 | class DeepStreamOutput(nn.Module): 85 | def __init__(self, nc, conf_threshold, iou_threshold, max_detections): 86 | super().__init__() 87 | self.nc = nc 88 | self.conf_threshold = conf_threshold 89 | self.iou_threshold = iou_threshold 90 | self.max_detections = max_detections 91 | 92 | def forward(self, x): 93 | preds = x[0] 94 | boxes = preds[:, :, :4] 95 | convert_matrix = torch.tensor( 96 | [[1, 0, 1, 0], [0, 1, 0, 1], [-0.5, 0, 0.5, 0], [0, -0.5, 0, 0.5]], dtype=boxes.dtype, device=boxes.device 97 | ) 98 | boxes @= convert_matrix 99 | objectness = preds[:, :, 4:5] 100 | scores = preds[:, :, 5:self.nc+5] 101 | scores *= objectness 102 | masks = preds[:, :, self.nc+5:] 103 | protos = x[1][1] 104 | 105 | num_detections, detection_boxes, detection_scores, detection_classes, detections_indices = NMS.apply( 106 | boxes, scores, self.conf_threshold, self.iou_threshold, self.max_detections 107 | ) 108 | 109 | batch_size, num_protos, h_protos, w_protos = protos.shape 110 | 111 | total_detections = batch_size * self.max_detections 112 | 113 | batch_index = torch.ones_like(detections_indices) * torch.arange( 114 | batch_size, device=boxes.device, dtype=torch.int32 115 | ).unsqueeze(1) 116 | batch_index = batch_index.view(total_detections).to(torch.int32) 117 | box_index = detections_indices.view(total_detections).to(torch.int32) 118 | 119 | selected_boxes = boxes[batch_index, box_index] 120 | selected_masks = masks[batch_index, box_index] 121 | 122 | pooled_proto = RoiAlign.apply(protos, selected_boxes, batch_index, 1, 1, int(h_protos), int(w_protos), 0, 0.25) 123 | 124 | masks_protos = torch.matmul( 125 | selected_masks.unsqueeze(1), 
def yolov7_seg_export(weights, device, inplace=True, fuse=True):
    """Load a YOLOv7-Seg checkpoint and switch every Detect head to export mode.

    Returns the model in eval() state with in-place ops and dynamic shapes
    disabled on the detection heads so the trace stays ONNX-friendly.
    """
    model = attempt_load(weights, device=device, inplace=inplace, fuse=fuse)
    model.eval()
    detect_heads = (module for _, module in model.named_modules() if isinstance(module, Detect))
    for head in detect_heads:
        head.inplace = False
        head.dynamic = False
        head.export = True
    return model
dynamic_axes = { 183 | "input": { 184 | 0: "batch" 185 | }, 186 | "output": { 187 | 0: "batch" 188 | } 189 | } 190 | 191 | print("Exporting the model to ONNX") 192 | torch.onnx.export( 193 | model, 194 | onnx_input_im, 195 | onnx_output_file, 196 | verbose=False, 197 | opset_version=args.opset, 198 | do_constant_folding=True, 199 | input_names=["input"], 200 | output_names=["output"], 201 | dynamic_axes=dynamic_axes if args.dynamic else None 202 | ) 203 | 204 | if args.simplify: 205 | print("Simplifying the ONNX model") 206 | import onnxslim 207 | model_onnx = onnx.load(onnx_output_file) 208 | model_onnx = onnxslim.slim(model_onnx) 209 | onnx.save(model_onnx, onnx_output_file) 210 | 211 | print(f"Done: {onnx_output_file}\n") 212 | 213 | 214 | def parse_args(): 215 | import argparse 216 | parser = argparse.ArgumentParser(description="DeepStream YOLOv7-Seg conversion") 217 | parser.add_argument("-w", "--weights", required=True, type=str, help="Input weights (.pt) file path (required)") 218 | parser.add_argument("-s", "--size", nargs="+", type=int, default=[640], help="Inference size [H,W] (default [640])") 219 | parser.add_argument("--p6", action="store_true", help="P6 model") 220 | parser.add_argument("--opset", type=int, default=17, help="ONNX opset version") 221 | parser.add_argument("--simplify", action="store_true", help="ONNX simplify model") 222 | parser.add_argument("--dynamic", action="store_true", help="Dynamic batch-size") 223 | parser.add_argument("--batch", type=int, default=1, help="Static batch-size") 224 | parser.add_argument( 225 | "--conf-threshold", type=float, default=0.25, help="Minimum detection confidence threshold (default 0.25)" 226 | ) 227 | parser.add_argument("--iou-threshold", type=float, default=0.45, help="NMS IoU threshold (default 0.45)") 228 | parser.add_argument( 229 | "--max-detections", type=int, default=100, help="Maximum number of output detections (default 100)" 230 | ) 231 | args = parser.parse_args() 232 | if not 
os.path.isfile(args.weights): 233 | raise SystemExit("Invalid weights file") 234 | if args.dynamic and args.batch > 1: 235 | raise SystemExit("Cannot set dynamic batch-size and static batch-size at same time") 236 | return args 237 | 238 | 239 | if __name__ == "__main__": 240 | args = parse_args() 241 | main(args) 242 | -------------------------------------------------------------------------------- /utils/export_yoloV7_mask.py: -------------------------------------------------------------------------------- 1 | import os 2 | import yaml 3 | import onnx 4 | import torch 5 | import torch.nn as nn 6 | 7 | from utils.general import merge_bases 8 | 9 | 10 | class RoiAlign(torch.autograd.Function): 11 | @staticmethod 12 | def forward( 13 | self, 14 | X, 15 | rois, 16 | batch_indices, 17 | coordinate_transformation_mode, 18 | mode, 19 | output_height, 20 | output_width, 21 | sampling_ratio, 22 | spatial_scale 23 | ): 24 | C = X.shape[1] 25 | num_rois = rois.shape[0] 26 | return torch.randn([num_rois, C, output_height, output_width], device=rois.device, dtype=rois.dtype) 27 | 28 | @staticmethod 29 | def symbolic( 30 | g, 31 | X, 32 | rois, 33 | batch_indices, 34 | coordinate_transformation_mode, 35 | mode, 36 | output_height, 37 | output_width, 38 | sampling_ratio, 39 | spatial_scale 40 | ): 41 | return g.op( 42 | "TRT::ROIAlignX_TRT", 43 | X, 44 | rois, 45 | batch_indices, 46 | coordinate_transformation_mode_i=coordinate_transformation_mode, 47 | mode_i=mode, 48 | output_height_i=output_height, 49 | output_width_i=output_width, 50 | sampling_ratio_i=sampling_ratio, 51 | spatial_scale_f=spatial_scale 52 | ) 53 | 54 | 55 | class NMS(torch.autograd.Function): 56 | @staticmethod 57 | def forward(self, boxes, scores, score_threshold, iou_threshold, max_output_boxes): 58 | batch_size = scores.shape[0] 59 | num_classes = scores.shape[-1] 60 | num_detections = torch.randint(0, max_output_boxes, (batch_size, 1), dtype=torch.int32) 61 | detection_boxes = torch.randn(batch_size, 
max_output_boxes, 4) 62 | detection_scores = torch.randn(batch_size, max_output_boxes) 63 | detection_classes = torch.randint(0, num_classes, (batch_size, max_output_boxes), dtype=torch.int32) 64 | detections_indices = torch.randint(0, max_output_boxes, (batch_size, max_output_boxes), dtype=torch.int32) 65 | return num_detections, detection_boxes, detection_scores, detection_classes, detections_indices 66 | 67 | @staticmethod 68 | def symbolic(g, boxes, scores, score_threshold, iou_threshold, max_output_boxes): 69 | return g.op( 70 | "TRT::EfficientNMSX_TRT", 71 | boxes, 72 | scores, 73 | score_threshold_f=score_threshold, 74 | iou_threshold_f=iou_threshold, 75 | max_output_boxes_i=max_output_boxes, 76 | background_class_i=-1, 77 | score_activation_i=0, 78 | class_agnostic_i=0, 79 | box_coding_i=0, 80 | outputs=5 81 | ) 82 | 83 | 84 | class DeepStreamOutput(nn.Module): 85 | def __init__(self, nc, conf_threshold, iou_threshold, max_detections, attn_resolution, num_base): 86 | super().__init__() 87 | self.nc = nc 88 | self.conf_threshold = conf_threshold 89 | self.iou_threshold = iou_threshold 90 | self.max_detections = max_detections 91 | self.attn_resolution = attn_resolution 92 | self.num_base = num_base 93 | 94 | def forward(self, x): 95 | preds = x["test"] 96 | boxes = preds[:, :, :4] 97 | convert_matrix = torch.tensor( 98 | [[1, 0, 1, 0], [0, 1, 0, 1], [-0.5, 0, 0.5, 0], [0, -0.5, 0, 0.5]], dtype=boxes.dtype, device=boxes.device 99 | ) 100 | boxes @= convert_matrix 101 | objectness = preds[:, :, 4:5] 102 | scores = preds[:, :, 5:self.nc+5] 103 | scores *= objectness 104 | attn = x["attn"] 105 | bases = torch.cat([x["bases"], x["sem"]], dim=1) 106 | 107 | num_detections, detection_boxes, detection_scores, detection_classes, detections_indices = NMS.apply( 108 | boxes, scores, self.conf_threshold, self.iou_threshold, self.max_detections 109 | ) 110 | 111 | batch_size, num_protos, h_protos, w_protos = bases.shape 112 | 113 | total_detections = batch_size * 
def yolov7_mask_export(weights, device):
    """Load a YOLOv7-Mask checkpoint for ONNX export.

    Args:
        weights: path to the .pt checkpoint (the full pickled model under "model").
        device: torch.device to place the model on (main() uses CPU).

    Returns:
        (model, attn_resolution, num_base) where the last two come from
        data/hyp.scratch.mask.yaml.
    """
    # map_location keeps GPU-saved checkpoints loadable on CPU-only hosts
    # (main() always exports on torch.device("cpu"); without it torch.load
    # tries to restore tensors onto the device they were saved from).
    # NOTE: torch.load unpickles arbitrary objects -- only use trusted checkpoints.
    ckpt = torch.load(weights, map_location=device)
    model = ckpt["model"]
    model = model.float().to(device)
    model.eval()
    with open("data/hyp.scratch.mask.yaml") as f:
        hyp = yaml.load(f, Loader=yaml.FullLoader)
    return model, hyp["attn_resolution"], hyp["num_base"]
labels.txt file") 166 | with open("labels.txt", "w", encoding="utf-8") as f: 167 | for name in model.names: 168 | f.write(f"{name}\n") 169 | 170 | model = nn.Sequential( 171 | model, DeepStreamOutput( 172 | len(model.names), args.conf_threshold, args.iou_threshold, args.max_detections, attn_resolution, num_base 173 | ) 174 | ) 175 | 176 | img_size = args.size * 2 if len(args.size) == 1 else args.size 177 | 178 | if img_size == [640, 640] and args.p6: 179 | img_size = [1280] * 2 180 | 181 | onnx_input_im = torch.zeros(args.batch, 3, *img_size).to(device) 182 | onnx_output_file = args.weights.rsplit(".", 1)[0] + ".onnx" 183 | 184 | dynamic_axes = { 185 | "input": { 186 | 0: "batch" 187 | }, 188 | "output": { 189 | 0: "batch" 190 | } 191 | } 192 | 193 | print("Exporting the model to ONNX") 194 | torch.onnx.export( 195 | model, 196 | onnx_input_im, 197 | onnx_output_file, 198 | verbose=False, 199 | opset_version=args.opset, 200 | do_constant_folding=True, 201 | input_names=["input"], 202 | output_names=["output"], 203 | dynamic_axes=dynamic_axes if args.dynamic else None 204 | ) 205 | 206 | if args.simplify: 207 | print("Simplifying the ONNX model") 208 | import onnxslim 209 | model_onnx = onnx.load(onnx_output_file) 210 | model_onnx = onnxslim.slim(model_onnx) 211 | onnx.save(model_onnx, onnx_output_file) 212 | 213 | print(f"Done: {onnx_output_file}\n") 214 | 215 | 216 | def parse_args(): 217 | import argparse 218 | parser = argparse.ArgumentParser(description="DeepStream YOLOv7-Mask conversion") 219 | parser.add_argument("-w", "--weights", required=True, type=str, help="Input weights (.pt) file path (required)") 220 | parser.add_argument("-s", "--size", nargs="+", type=int, default=[640], help="Inference size [H,W] (default [640])") 221 | parser.add_argument("--p6", action="store_true", help="P6 model") 222 | parser.add_argument("--opset", type=int, default=17, help="ONNX opset version") 223 | parser.add_argument("--simplify", action="store_true", help="ONNX simplify 
model") 224 | parser.add_argument("--dynamic", action="store_true", help="Dynamic batch-size") 225 | parser.add_argument("--batch", type=int, default=1, help="Static batch-size") 226 | parser.add_argument( 227 | "--conf-threshold", type=float, default=0.25, help="Minimum detection confidence threshold (default 0.25)" 228 | ) 229 | parser.add_argument("--iou-threshold", type=float, default=0.45, help="NMS IoU threshold (default 0.45)") 230 | parser.add_argument( 231 | "--max-detections", type=int, default=100, help="Maximum number of output detections (default 100)" 232 | ) 233 | args = parser.parse_args() 234 | if not os.path.isfile(args.weights): 235 | raise SystemExit("Invalid weights file") 236 | if args.dynamic and args.batch > 1: 237 | raise SystemExit("Cannot set dynamic batch-size and static batch-size at same time") 238 | return args 239 | 240 | 241 | if __name__ == "__main__": 242 | args = parse_args() 243 | main(args) 244 | -------------------------------------------------------------------------------- /utils/export_yolo11_seg.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import onnx 4 | import torch 5 | import torch.nn as nn 6 | from copy import deepcopy 7 | 8 | from ultralytics import YOLO 9 | from ultralytics.nn.modules import C2f, Detect, RTDETRDecoder 10 | import ultralytics.utils 11 | import ultralytics.models.yolo 12 | import ultralytics.utils.tal as _m 13 | 14 | sys.modules["ultralytics.yolo"] = ultralytics.models.yolo 15 | sys.modules["ultralytics.yolo.utils"] = ultralytics.utils 16 | 17 | 18 | def _dist2bbox(distance, anchor_points, xywh=False, dim=-1): 19 | lt, rb = distance.chunk(2, dim) 20 | x1y1 = anchor_points - lt 21 | x2y2 = anchor_points + rb 22 | return torch.cat([x1y1, x2y2], dim) 23 | 24 | 25 | _m.dist2bbox.__code__ = _dist2bbox.__code__ 26 | 27 | 28 | class RoiAlign(torch.autograd.Function): 29 | @staticmethod 30 | def forward( 31 | self, 32 | X, 33 | rois, 34 | 
batch_indices, 35 | coordinate_transformation_mode, 36 | mode, 37 | output_height, 38 | output_width, 39 | sampling_ratio, 40 | spatial_scale 41 | ): 42 | C = X.shape[1] 43 | num_rois = rois.shape[0] 44 | return torch.randn([num_rois, C, output_height, output_width], device=rois.device, dtype=rois.dtype) 45 | 46 | @staticmethod 47 | def symbolic( 48 | g, 49 | X, 50 | rois, 51 | batch_indices, 52 | coordinate_transformation_mode, 53 | mode, 54 | output_height, 55 | output_width, 56 | sampling_ratio, 57 | spatial_scale 58 | ): 59 | return g.op( 60 | "TRT::ROIAlignX_TRT", 61 | X, 62 | rois, 63 | batch_indices, 64 | coordinate_transformation_mode_i=coordinate_transformation_mode, 65 | mode_i=mode, 66 | output_height_i=output_height, 67 | output_width_i=output_width, 68 | sampling_ratio_i=sampling_ratio, 69 | spatial_scale_f=spatial_scale 70 | ) 71 | 72 | 73 | class NMS(torch.autograd.Function): 74 | @staticmethod 75 | def forward(self, boxes, scores, score_threshold, iou_threshold, max_output_boxes): 76 | batch_size = scores.shape[0] 77 | num_classes = scores.shape[-1] 78 | num_detections = torch.randint(0, max_output_boxes, (batch_size, 1), dtype=torch.int32) 79 | detection_boxes = torch.randn(batch_size, max_output_boxes, 4) 80 | detection_scores = torch.randn(batch_size, max_output_boxes) 81 | detection_classes = torch.randint(0, num_classes, (batch_size, max_output_boxes), dtype=torch.int32) 82 | detections_indices = torch.randint(0, max_output_boxes, (batch_size, max_output_boxes), dtype=torch.int32) 83 | return num_detections, detection_boxes, detection_scores, detection_classes, detections_indices 84 | 85 | @staticmethod 86 | def symbolic(g, boxes, scores, score_threshold, iou_threshold, max_output_boxes): 87 | return g.op( 88 | "TRT::EfficientNMSX_TRT", 89 | boxes, 90 | scores, 91 | score_threshold_f=score_threshold, 92 | iou_threshold_f=iou_threshold, 93 | max_output_boxes_i=max_output_boxes, 94 | background_class_i=-1, 95 | score_activation_i=0, 96 | 
class DeepStreamOutput(nn.Module):
    # Post-processing head appended to the YOLO11-Seg network for DeepStream:
    # splits the raw head output, runs plugin-based NMS, gathers the surviving
    # detections' mask coefficients and combines them with ROI-pooled mask
    # prototypes into per-detection mask logits.
    def __init__(self, nc, conf_threshold, iou_threshold, max_detections):
        super().__init__()
        self.nc = nc  # number of classes
        self.conf_threshold = conf_threshold
        self.iou_threshold = iou_threshold
        self.max_detections = max_detections

    def forward(self, x):
        # x[0]: raw predictions [B, 4 + nc + num_protos, anchors]; transpose to
        # anchors-major so the channel slices below are on the last axis.
        # x[1]: mask prototype maps [B, num_protos, Hp, Wp].
        preds = x[0].transpose(1, 2)
        boxes = preds[:, :, :4]
        scores = preds[:, :, 4:self.nc+4]
        masks = preds[:, :, self.nc+4:]  # per-anchor mask coefficients
        protos = x[1]

        # EfficientNMSX_TRT also returns the indices of the kept boxes so the
        # matching mask coefficients can be gathered below.
        num_detections, detection_boxes, detection_scores, detection_classes, detections_indices = NMS.apply(
            boxes, scores, self.conf_threshold, self.iou_threshold, self.max_detections
        )

        batch_size, num_protos, h_protos, w_protos = protos.shape

        total_detections = batch_size * self.max_detections

        # Build a flat (batch, detection) index pair per kept box for gathering.
        batch_index = torch.ones_like(detections_indices) * torch.arange(
            batch_size, device=boxes.device, dtype=torch.int32
        ).unsqueeze(1)
        batch_index = batch_index.view(total_detections).to(torch.int32)
        box_index = detections_indices.view(total_detections).to(torch.int32)

        selected_boxes = boxes[batch_index, box_index]
        selected_masks = masks[batch_index, box_index]

        # ROIAlignX_TRT pools the prototype maps inside each detection box
        # (full prototype resolution, spatial_scale 0.25).
        pooled_proto = RoiAlign.apply(protos, selected_boxes, batch_index, 1, 1, int(h_protos), int(w_protos), 0, 0.25)

        # Per-detection linear combination of prototypes, then sigmoid to get
        # mask probabilities flattened to h_protos * w_protos values.
        masks_protos = torch.matmul(
            selected_masks.unsqueeze(1), pooled_proto.view(total_detections, num_protos, h_protos * w_protos)
        )
        masks_protos = masks_protos.sigmoid().view(batch_size, self.max_detections, h_protos * w_protos)

        # Output per detection: [box(4), score, class, mask probabilities...].
        return torch.cat(
            [detection_boxes, detection_scores.unsqueeze(-1), detection_classes.unsqueeze(-1), masks_protos], dim=-1
        )
def suppress_warnings():
    """Silence the noisy warning categories emitted during ONNX export/tracing."""
    import warnings
    for category in (
        torch.jit.TracerWarning,
        UserWarning,
        DeprecationWarning,
        FutureWarning,
        ResourceWarning,
    ):
        warnings.filterwarnings("ignore", category=category)
output_names=["output"], 218 | dynamic_axes=dynamic_axes if args.dynamic else None 219 | ) 220 | 221 | if args.simplify: 222 | print("Simplifying the ONNX model") 223 | import onnxslim 224 | model_onnx = onnx.load(onnx_output_file) 225 | model_onnx = onnxslim.slim(model_onnx) 226 | onnx.save(model_onnx, onnx_output_file) 227 | 228 | print(f"Done: {onnx_output_file}\n") 229 | 230 | 231 | def parse_args(): 232 | import argparse 233 | parser = argparse.ArgumentParser(description="DeepStream YOLO11-Seg conversion") 234 | parser.add_argument("-w", "--weights", required=True, type=str, help="Input weights (.pt) file path (required)") 235 | parser.add_argument("-s", "--size", nargs="+", type=int, default=[640], help="Inference size [H,W] (default [640])") 236 | parser.add_argument("--opset", type=int, default=17, help="ONNX opset version") 237 | parser.add_argument("--simplify", action="store_true", help="ONNX simplify model") 238 | parser.add_argument("--dynamic", action="store_true", help="Dynamic batch-size") 239 | parser.add_argument("--batch", type=int, default=1, help="Static batch-size") 240 | parser.add_argument( 241 | "--conf-threshold", type=float, default=0.25, help="Minimum detection confidence threshold (default 0.25)" 242 | ) 243 | parser.add_argument("--iou-threshold", type=float, default=0.45, help="NMS IoU threshold (default 0.45)") 244 | parser.add_argument( 245 | "--max-detections", type=int, default=100, help="Maximum number of output detections (default 100)" 246 | ) 247 | args = parser.parse_args() 248 | if not os.path.isfile(args.weights): 249 | raise SystemExit("Invalid weights file") 250 | if args.dynamic and args.batch > 1: 251 | raise SystemExit("Cannot set dynamic batch-size and static batch-size at same time") 252 | return args 253 | 254 | 255 | if __name__ == "__main__": 256 | args = parse_args() 257 | main(args) 258 | -------------------------------------------------------------------------------- /utils/export_yoloV8_seg.py: 
class RoiAlign(torch.autograd.Function):
    # ONNX-export shim for the TensorRT ROIAlignX_TRT plugin. forward() only
    # returns a correctly-shaped placeholder so tracing can proceed; the real
    # ROI-align runs inside the TensorRT plugin at inference time.
    @staticmethod
    def forward(
        self,  # autograd context (conventionally named ctx)
        X,
        rois,
        batch_indices,
        coordinate_transformation_mode,
        mode,
        output_height,
        output_width,
        sampling_ratio,
        spatial_scale
    ):
        C = X.shape[1]
        num_rois = rois.shape[0]
        # Placeholder with the pooled shape [num_rois, C, H_out, W_out].
        return torch.randn([num_rois, C, output_height, output_width], device=rois.device, dtype=rois.dtype)

    @staticmethod
    def symbolic(
        g,
        X,
        rois,
        batch_indices,
        coordinate_transformation_mode,
        mode,
        output_height,
        output_width,
        sampling_ratio,
        spatial_scale
    ):
        # Emit the TensorRT plugin node; attribute names follow the plugin schema.
        return g.op(
            "TRT::ROIAlignX_TRT",
            X,
            rois,
            batch_indices,
            coordinate_transformation_mode_i=coordinate_transformation_mode,
            mode_i=mode,
            output_height_i=output_height,
            output_width_i=output_width,
            sampling_ratio_i=sampling_ratio,
            spatial_scale_f=spatial_scale
        )
class DeepStreamOutput(nn.Module):
    # Post-processing head appended to the YOLOv8-Seg network for DeepStream:
    # splits the raw head output, runs plugin-based NMS, gathers the surviving
    # detections' mask coefficients and combines them with ROI-pooled mask
    # prototypes into per-detection mask logits.
    def __init__(self, nc, conf_threshold, iou_threshold, max_detections):
        super().__init__()
        self.nc = nc  # number of classes
        self.conf_threshold = conf_threshold
        self.iou_threshold = iou_threshold
        self.max_detections = max_detections

    def forward(self, x):
        # x[0]: raw predictions [B, 4 + nc + num_protos, anchors]; transpose to
        # anchors-major so the channel slices below are on the last axis.
        # x[1]: mask prototype maps [B, num_protos, Hp, Wp].
        preds = x[0].transpose(1, 2)
        boxes = preds[:, :, :4]
        scores = preds[:, :, 4:self.nc+4]
        masks = preds[:, :, self.nc+4:]  # per-anchor mask coefficients
        protos = x[1]

        # EfficientNMSX_TRT also returns the indices of the kept boxes so the
        # matching mask coefficients can be gathered below.
        num_detections, detection_boxes, detection_scores, detection_classes, detections_indices = NMS.apply(
            boxes, scores, self.conf_threshold, self.iou_threshold, self.max_detections
        )

        batch_size, num_protos, h_protos, w_protos = protos.shape

        total_detections = batch_size * self.max_detections

        # Build a flat (batch, detection) index pair per kept box for gathering.
        batch_index = torch.ones_like(detections_indices) * torch.arange(
            batch_size, device=boxes.device, dtype=torch.int32
        ).unsqueeze(1)
        batch_index = batch_index.view(total_detections).to(torch.int32)
        box_index = detections_indices.view(total_detections).to(torch.int32)

        selected_boxes = boxes[batch_index, box_index]
        selected_masks = masks[batch_index, box_index]

        # ROIAlignX_TRT pools the prototype maps inside each detection box
        # (full prototype resolution, spatial_scale 0.25).
        pooled_proto = RoiAlign.apply(protos, selected_boxes, batch_index, 1, 1, int(h_protos), int(w_protos), 0, 0.25)

        # Per-detection linear combination of prototypes, then sigmoid to get
        # mask probabilities flattened to h_protos * w_protos values.
        masks_protos = torch.matmul(
            selected_masks.unsqueeze(1), pooled_proto.view(total_detections, num_protos, h_protos * w_protos)
        )
        masks_protos = masks_protos.sigmoid().view(batch_size, self.max_detections, h_protos * w_protos)

        # Output per detection: [box(4), score, class, mask probabilities...].
        return torch.cat(
            [detection_boxes, detection_scores.unsqueeze(-1), detection_classes.unsqueeze(-1), masks_protos], dim=-1
        )
def parse_args():
    """Parse and validate CLI arguments for the YOLOv8-Seg ONNX export.

    Returns:
        argparse.Namespace with weights/size/opset/simplify/dynamic/batch/
        conf_threshold/iou_threshold/max_detections.

    Raises:
        SystemExit: on a missing weights file, an invalid --size/--batch value,
            or when --dynamic is combined with --batch > 1.
    """
    import argparse
    parser = argparse.ArgumentParser(description="DeepStream YOLOv8-Seg conversion")
    parser.add_argument("-w", "--weights", required=True, type=str, help="Input weights (.pt) file path (required)")
    parser.add_argument("-s", "--size", nargs="+", type=int, default=[640], help="Inference size [H,W] (default [640])")
    parser.add_argument("--opset", type=int, default=17, help="ONNX opset version")
    parser.add_argument("--simplify", action="store_true", help="ONNX simplify model")
    parser.add_argument("--dynamic", action="store_true", help="Dynamic batch-size")
    parser.add_argument("--batch", type=int, default=1, help="Static batch-size")
    parser.add_argument(
        "--conf-threshold", type=float, default=0.25, help="Minimum detection confidence threshold (default 0.25)"
    )
    parser.add_argument("--iou-threshold", type=float, default=0.45, help="NMS IoU threshold (default 0.45)")
    parser.add_argument(
        "--max-detections", type=int, default=100, help="Maximum number of output detections (default 100)"
    )
    args = parser.parse_args()
    if not os.path.isfile(args.weights):
        raise SystemExit("Invalid weights file")
    # main() expands a single value to [H, H]; anything beyond two values would
    # silently build a malformed input tensor, so reject it here.
    if len(args.size) not in (1, 2) or any(s <= 0 for s in args.size):
        raise SystemExit("Invalid size: expected one or two positive integers")
    if args.batch < 1:
        raise SystemExit("Invalid batch-size: must be >= 1")
    if args.dynamic and args.batch > 1:
        raise SystemExit("Cannot set dynamic batch-size and static batch-size at same time")
    return args
//! Common base for plugin creators: stores and exposes the plugin namespace
//! so concrete creators only need to implement the creation methods.
class BaseCreator : public IPluginCreator
{
public:
    //! Record the namespace under which plugins from this creator register.
    void setPluginNamespace(char const* libNamespace) noexcept override
    {
        mNamespace = libNamespace;
    }

    //! Return the namespace previously set (empty string by default).
    char const* getPluginNamespace() const noexcept override
    {
        return mNamespace.c_str();
    }

protected:
    std::string mNamespace;  //!< Owned copy of the last namespace set.
};
//! Base exception for TensorRT plugin failures; records where the failure
//! happened (file/function/line), a status code, a message and an error-
//! category name used by subclasses (e.g. "Cuda", "Plugin").
class TRTException : public std::exception
{
public:
    //! NOTE(review): the char pointers are stored without copying — callers
    //! appear to pass string literals / static storage; confirm any dynamic
    //! message outlives the exception.
    TRTException(char const* fl, char const* fn, int32_t ln, int32_t st, char const* msg, char const* nm)
        : file(fl)
        , function(fn)
        , line(ln)
        , status(st)
        , message(msg)
        , name(nm)
    {
    }
    //! Write a human-readable description of the failure to logStream.
    virtual void log(std::ostream& logStream) const;
    //! Replace the stored message pointer (no copy is made).
    void setMessage(char const* msg)
    {
        message = msg;
    }

protected:
    char const* file{nullptr};      //!< Source file where the error was raised.
    char const* function{nullptr};  //!< Function where the error was raised.
    int32_t line{0};                //!< Source line number.
    int32_t status{0};              //!< Status/error code at the failure site.
    char const* message{nullptr};   //!< Error description.
    char const* name{nullptr};      //!< Error-category name set by subclasses.
};
//! Serialize a trivially-copyable value into a byte buffer and advance the
//! cursor. Used by plugins to implement serialize(); must mirror read().
//! (Template parameter lists reconstructed — stripped in the extracted copy.)
template <typename Type, typename BufferType>
void write(BufferType*& buffer, Type const& val)
{
    static_assert(sizeof(BufferType) == 1, "BufferType must be a 1 byte type.");
    std::memcpy(buffer, &val, sizeof(Type));
    buffer += sizeof(Type);  // advance the caller's cursor past the value
}

//! Deserialize a value from a byte buffer and advance the cursor.
//! Counterpart of write(); used in plugin deserialization constructors.
template <typename OutType, typename BufferType>
OutType read(BufferType const*& buffer)
{
    static_assert(sizeof(BufferType) == 1, "BufferType must be a 1 byte type.");
    OutType val{};
    // memcpy avoids alignment/aliasing issues of a direct pointer cast.
    std::memcpy(&val, static_cast<void const*>(buffer), sizeof(OutType));
    buffer += sizeof(OutType);
    return val;
}
// max() for __half operands. SM80+ (Ampere) exposes a native half-precision
// __hmax; older architectures fall back to float math and convert back.
__device__ half floatMax(half a, half b)
{
#if __CUDA_ARCH__ >= 800
    return __hmax(a, b);
#else
    return __float2half(max(__half2float(a), __half2float(b)));
#endif
}

// float overload so the templated kernel code can call floatMax() uniformly
// regardless of the instantiated element type.
__device__ float floatMax(float a, float b)
{
    return max(a, b);
}
// ROI Align forward kernel (ported from PyTorch/ONNX Runtime). One thread
// computes one output element (n, c, ph, pw): it samples a grid of points
// inside the ROI bin via bilinear interpolation and reduces them with either
// average or max pooling.
// (static_cast target types reconstructed — stripped in the extracted copy.)
template <typename T>
__global__ void RoIAlignForward(int32_t const nthreads, T const* bottomData, T const spatialScale, int32_t const channels,
    int32_t const height, int32_t const width, int32_t const pooledHeight, int32_t const pooledWidth, int32_t const samplingRatio,
    T const* bottomRois, T* topData, int32_t const isModeAvg, int32_t const* batchIndicesPtr,
    int32_t const aligned)
{
    // Grid-stride loop over all output elements (index += blockDim.x * gridDim.x).
    for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x)
    {
        // (n, c, ph, pw) is an element in the pooled output
        int32_t pw = index % pooledWidth;
        int32_t ph = (index / pooledWidth) % pooledHeight;
        int32_t c = (index / pooledWidth / pooledHeight) % channels;
        int32_t n = index / pooledWidth / pooledHeight / channels;

        // ROI n: 4 coordinates plus a separate batch index per ROI.
        T const* offsetBottomRois = bottomRois + n * 4;
        auto const roiBatchInd = batchIndicesPtr[n];

        bool continuousCoordinate = aligned;
        // Do not using rounding; this implementation detail is critical
        T roiOffset = static_cast<T>(continuousCoordinate ? 0.5 : 0);
        T roiStartW = offsetBottomRois[0] * spatialScale - roiOffset;
        T roiStartH = offsetBottomRois[1] * spatialScale - roiOffset;
        T roiEndW = offsetBottomRois[2] * spatialScale - roiOffset;
        T roiEndH = offsetBottomRois[3] * spatialScale - roiOffset;

        T roiWidth = roiEndW - roiStartW;
        T roiHeight = roiEndH - roiStartH;
        if (!continuousCoordinate)
        { // backward compatibility
            // Force malformed ROIs to be 1x1
            roiWidth = floatMax(roiWidth, static_cast<T>(1.));
            roiHeight = floatMax(roiHeight, static_cast<T>(1.));
        }
        T binSizeH = static_cast<T>(roiHeight) / static_cast<T>(pooledHeight);
        T binSizeW = static_cast<T>(roiWidth) / static_cast<T>(pooledWidth);

        // Start of the (batch, channel) feature plane this thread reads from.
        T const* offsetBottomData = bottomData + static_cast<size_t>((roiBatchInd * channels + c) * height * width);

        // We use roiBinGrid to sample the grid and mimic integral
        int32_t roiBinGridH;
        if (samplingRatio > 0)
        {
            roiBinGridH = samplingRatio;
        }
        else
        {
            // Adaptive sampling: roughly one sample per input pixel in the bin.
            roiBinGridH = ceilf(roiHeight / static_cast<T>(pooledHeight));
        }

        int32_t roiBinGridW;
        if (samplingRatio > 0)
        {
            roiBinGridW = samplingRatio;
        }
        else
        {
            roiBinGridW = ceilf(roiWidth / static_cast<T>(pooledWidth));
        }
        // We do average (integral) pooling inside a bin
        T const count = roiBinGridH * roiBinGridW; // e.g. = 4

        // Precomputed offsets/steps for the sample positions inside this bin.
        T const yOff = roiStartH + static_cast<T>(ph) * binSizeH;
        T const yFac = binSizeH / static_cast<T>(roiBinGridH);

        T const xOff = roiStartW + static_cast<T>(pw) * binSizeW;
        T const xFac = binSizeW / static_cast<T>(roiBinGridW);

        T outputVal = 0.;
        bool maxFlag = false; // max mode: becomes true after the first sample seeds outputVal
        for (int32_t iy = 0; iy < roiBinGridH; iy++) // e.g., iy = 0, 1
        {
            T const y = yOff + static_cast<T>(iy + .5F) * yFac; // e.g., 0.5, 1.5
            for (int32_t ix = 0; ix < roiBinGridW; ix++)
            {
                T const x = xOff + static_cast<T>(ix + .5F) * xFac;

                T val = bilinearInterpolate(offsetBottomData, height, width, y, x, isModeAvg, index);

                if (isModeAvg)
                {
                    outputVal += val;
                }
                else
                {
                    if (!maxFlag)
                    {
                        outputVal = val;
                        maxFlag = true;
                    }
                    else
                    {
                        outputVal = floatMax(outputVal, val);
                    }
                }
            }
        }
        if (isModeAvg)
        {
            // Average over the number of sampled points, not the bin area.
            outputVal = outputVal / count;
        }

        topData[index] = outputVal;
    }
}
PLUGIN_ASSERT(isModeAvg == 0 || isModeAvg == 1); 245 | PLUGIN_ASSERT(static_cast(spatialScale) > 0.0F); 246 | PLUGIN_ASSERT(aligned == 0 || aligned == 1); 247 | 248 | int32_t const outputSize = numRois * channels * pooledHeight * pooledWidth; 249 | 250 | int32_t blocksPerGrid = static_cast(ceil(static_cast(outputSize) 251 | / maxThreadsPerBlock)); 252 | 253 | RoIAlignForward<<>>(outputSize,// nthreads 254 | bottomData, // bottomData 255 | spatialScale, // spatialScale 256 | channels, // channels 257 | height, // height 258 | width, // width 259 | pooledHeight, // pooledHeight 260 | pooledWidth, // pooledWidth 261 | samplingRatio, // samplingRatio 262 | bottomRois, // bottomRois 263 | topData, // topData 264 | isModeAvg, // isModeAvg 265 | batchIndicesPtr, // batchIndicesPtr 266 | aligned); 267 | 268 | return cudaGetLastError(); 269 | } 270 | 271 | #define SPECIALIZED_IMPL(T) \ 272 | template cudaError_t RoiAlignImpl(cudaStream_t stream, int32_t const maxThreadsPerBlock, T const* bottomData, \ 273 | T const spatialScale, int32_t const numRois, int32_t const channels, int32_t const height, \ 274 | int32_t const width, int32_t const pooledHeight, int32_t const pooledWidth, int32_t const samplingRatio, \ 275 | T const* bottomRois, T* topData, int32_t const isModeAvg, int32_t const* batchIndicesPtr, \ 276 | int32_t const aligned); 277 | 278 | SPECIALIZED_IMPL(float) 279 | SPECIALIZED_IMPL(half) 280 | -------------------------------------------------------------------------------- /utils/export_rfdetr_seg.py: -------------------------------------------------------------------------------- 1 | import os 2 | import onnx 3 | import torch 4 | import torch.nn as nn 5 | from copy import deepcopy 6 | 7 | from rfdetr import RFDETRSegPreview 8 | import rfdetr.models.backbone.projector as _m1 9 | import rfdetr.models.segmentation_head as _m2 10 | import rfdetr.models.ops.modules.ms_deform_attn as _m3 11 | 12 | 13 | def LayerNorm_forward(self, x): 14 | x = x.permute(0, 2, 3, 1) 15 
| x = F.layer_norm(x, (int(x.size(3)),), self.weight, self.bias, self.eps) 16 | x = x.permute(0, 3, 1, 2) 17 | return x 18 | 19 | _m1.LayerNorm.forward.__code__ = LayerNorm_forward.__code__ 20 | 21 | 22 | def SegmentationHead_forward_export(self, spatial_features, query_features, image_size, skip_blocks=False): 23 | assert len(query_features) == 1, "at export time, segmentation head expects exactly one query feature" 24 | 25 | target_size = (image_size[0] // self.downsample_ratio, image_size[1] // self.downsample_ratio) 26 | spatial_features = F.interpolate(spatial_features, size=target_size, mode="bilinear", align_corners=False) 27 | 28 | if not skip_blocks: 29 | for block in self.blocks: 30 | spatial_features = block(spatial_features) 31 | 32 | spatial_features_proj = self.spatial_features_proj(spatial_features) 33 | 34 | qf = self.query_features_proj(self.query_features_block(query_features[0])) 35 | 36 | return [[spatial_features_proj, qf, self.bias]] 37 | 38 | 39 | _m2.SegmentationHead.forward_export.__code__ = SegmentationHead_forward_export.__code__ 40 | 41 | 42 | def MSDeformAttn_forward( 43 | self, 44 | query, 45 | reference_points, 46 | input_flatten, 47 | input_spatial_shapes, 48 | input_level_start_index, 49 | input_padding_mask=None 50 | ): 51 | class MultiscaleDeformableAttnPlugin(torch.autograd.Function): 52 | @staticmethod 53 | def forward(self, value, spatial_shapes, level_start_index, sampling_locations, attention_weights): 54 | value = value.permute(0, 2, 3, 1) 55 | N, Lq, M, L, P, n = sampling_locations.shape 56 | attention_weights = attention_weights.view(N, Lq, M, L * P) 57 | return ms_deform_attn_core_pytorch(value, spatial_shapes, sampling_locations, attention_weights) 58 | 59 | @staticmethod 60 | def symbolic(g, value, spatial_shapes, level_start_index, sampling_locations, attention_weights): 61 | return g.op( 62 | "TRT::MultiscaleDeformableAttnPlugin_TRT", 63 | value, 64 | spatial_shapes, 65 | level_start_index, 66 | sampling_locations, 67 
| attention_weights 68 | ) 69 | 70 | N, Len_q, _ = query.shape 71 | N, Len_in, _ = input_flatten.shape 72 | assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in 73 | 74 | value = self.value_proj(input_flatten) 75 | if input_padding_mask is not None: 76 | value = value.masked_fill(input_padding_mask[..., None], float(0)) 77 | 78 | sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2) 79 | attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points) 80 | 81 | if reference_points.shape[-1] == 2: 82 | offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1) 83 | sampling_locations = reference_points[:, :, None, :, None, :] \ 84 | + sampling_offsets / offset_normalizer[None, None, None, :, None, :] 85 | elif reference_points.shape[-1] == 4: 86 | sampling_locations = reference_points[:, :, None, :, None, :2] \ 87 | + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 88 | else: 89 | raise ValueError(f"Last dim of reference_points must be 2 or 4, but get {reference_points.shape[-1]} instead.") 90 | 91 | attention_weights = F.softmax(attention_weights, -1) 92 | 93 | value = value.transpose(1, 2).contiguous().view(N, self.n_heads, self.d_model // self.n_heads, Len_in) 94 | 95 | value = value.permute(0, 3, 1, 2) 96 | 97 | L, P = sampling_locations.shape[3:5] 98 | 99 | attention_weights = attention_weights.view(N, Len_q, self.n_heads, L, P) 100 | 101 | output = MultiscaleDeformableAttnPlugin.apply( 102 | value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights 103 | ) 104 | 105 | output = output.view(N, Len_q, self.d_model) 106 | 107 | output = self.output_proj(output) 108 | return output 109 | 110 | 111 | _m3.MSDeformAttn.forward.__code__ = MSDeformAttn_forward.__code__ 112 | 113 | 114 | class RoiAlign(torch.autograd.Function): 115 | 
@staticmethod 116 | def forward( 117 | self, 118 | X, 119 | rois, 120 | batch_indices, 121 | coordinate_transformation_mode, 122 | mode, 123 | output_height, 124 | output_width, 125 | sampling_ratio, 126 | spatial_scale 127 | ): 128 | C = X.shape[1] 129 | num_rois = rois.shape[0] 130 | return torch.randn([num_rois, C, output_height, output_width], device=rois.device, dtype=rois.dtype) 131 | 132 | @staticmethod 133 | def symbolic( 134 | g, 135 | X, 136 | rois, 137 | batch_indices, 138 | coordinate_transformation_mode, 139 | mode, 140 | output_height, 141 | output_width, 142 | sampling_ratio, 143 | spatial_scale 144 | ): 145 | return g.op( 146 | "TRT::ROIAlignX_TRT", 147 | X, 148 | rois, 149 | batch_indices, 150 | coordinate_transformation_mode_i=coordinate_transformation_mode, 151 | mode_i=mode, 152 | output_height_i=output_height, 153 | output_width_i=output_width, 154 | sampling_ratio_i=sampling_ratio, 155 | spatial_scale_f=spatial_scale 156 | ) 157 | 158 | 159 | class DeepStreamOutput(nn.Module): 160 | def __init__(self, img_size, max_detections): 161 | super().__init__() 162 | self.img_size = img_size 163 | self.max_detections = max_detections 164 | 165 | def forward(self, x): 166 | boxes = x[0] 167 | convert_matrix = torch.tensor( 168 | [[1, 0, 1, 0], [0, 1, 0, 1], [-0.5, 0, 0.5, 0], [0, -0.5, 0, 0.5]], dtype=boxes.dtype, device=boxes.device 169 | ) 170 | boxes @= convert_matrix 171 | boxes *= torch.as_tensor([[*self.img_size]]).flip(1).tile([1, 2]).unsqueeze(1) 172 | scores = x[1].sigmoid() 173 | protos, masks, mask_bias = x[2] 174 | 175 | num_classes = scores.shape[2] 176 | batch_size, num_protos, h_protos, w_protos = protos.shape 177 | 178 | topk_values, topk_indexes = torch.topk(scores.view(batch_size, -1), self.max_detections, dim=1, sorted=False) 179 | 180 | scores = topk_values.unsqueeze(-1) 181 | 182 | topk_boxes = topk_indexes // num_classes 183 | labels = topk_indexes % num_classes 184 | 185 | topk_boxes = topk_boxes.unsqueeze(-1) 186 | labels = 
labels.unsqueeze(-1) 187 | 188 | boxes = torch.gather(boxes, 1, topk_boxes.repeat(1, 1, 4)) 189 | masks = torch.gather(masks, 1, topk_boxes.repeat(1, 1, num_protos)) 190 | 191 | total_detections = batch_size * self.max_detections 192 | 193 | batch_index = torch.ones( 194 | [batch_size, self.max_detections], device=boxes.device, dtype=torch.int32 195 | ) * torch.arange(batch_size, device=boxes.device, dtype=torch.int32).unsqueeze(1) 196 | batch_index = batch_index.view(total_detections) 197 | 198 | selected_boxes = boxes.view(total_detections, 4) 199 | selected_masks = masks.view(total_detections, -1) 200 | 201 | pooled_proto = RoiAlign.apply(protos, selected_boxes, batch_index, 1, 1, int(h_protos), int(w_protos), 0, 0.25) 202 | 203 | masks_protos = torch.matmul( 204 | selected_masks.unsqueeze(1), pooled_proto.view(total_detections, num_protos, h_protos * w_protos) 205 | ) 206 | masks_protos = masks_protos.view(batch_size, self.max_detections, h_protos * w_protos) + mask_bias 207 | 208 | return torch.cat([boxes, scores, labels.to(boxes.dtype), masks_protos], dim=-1) 209 | 210 | 211 | def rfdetr_seg_export(model_name, weights, nc, img_size, max_detections, device): 212 | if model_name == "rfdetr-seg-preview": 213 | model = RFDETRSegPreview(pretrain_weights=weights, resolution=img_size[0], num_classes=nc, device=device.type) 214 | else: 215 | raise NotImplementedError("Model not supported") 216 | class_names = model.class_names 217 | model = deepcopy(model.model.model) 218 | model.to(device) 219 | model.eval() 220 | if hasattr(model, "export"): 221 | model.export() 222 | if max_detections > model.num_queries: 223 | raise ValueError( 224 | f"The `max_detections={max_detections}` is higher than the model `num_queries={model.num_queries}`") 225 | return model, class_names 226 | 227 | 228 | def suppress_warnings(): 229 | import warnings 230 | warnings.filterwarnings("ignore", category=torch.jit.TracerWarning) 231 | warnings.filterwarnings("ignore", category=UserWarning) 
232 | warnings.filterwarnings("ignore", category=DeprecationWarning) 233 | warnings.filterwarnings("ignore", category=FutureWarning) 234 | warnings.filterwarnings("ignore", category=ResourceWarning) 235 | 236 | 237 | def main(args): 238 | suppress_warnings() 239 | 240 | print(f"\nStarting: {args.weights}") 241 | 242 | print("Opening RF-DETR-Seg model") 243 | 244 | device = torch.device("cpu") 245 | model, class_names = rfdetr_seg_export( 246 | args.model, args.weights, args.classes, args.size, args.max_detections, device 247 | ) 248 | 249 | if len(class_names.keys()) > 0: 250 | print("Creating labels.txt file") 251 | with open("labels.txt", "w", encoding="utf-8") as f: 252 | f.write("background\n") 253 | for i in range(1, args.classes + 1): 254 | if i in class_names: 255 | f.write(f"{class_names[i]}\n") 256 | else: 257 | f.write("empty\n") 258 | 259 | 260 | img_size = args.size * 2 if len(args.size) == 1 else args.size 261 | 262 | model = nn.Sequential( 263 | model, DeepStreamOutput(img_size, args.max_detections) 264 | ) 265 | 266 | onnx_input_im = torch.zeros(args.batch, 3, *img_size).to(device) 267 | onnx_output_file = args.weights.rsplit(".", 1)[0] + ".onnx" 268 | 269 | dynamic_axes = { 270 | "input": { 271 | 0: "batch" 272 | }, 273 | "output": { 274 | 0: "batch" 275 | } 276 | } 277 | 278 | print("Exporting the model to ONNX") 279 | torch.onnx.export( 280 | model, 281 | onnx_input_im, 282 | onnx_output_file, 283 | verbose=False, 284 | opset_version=args.opset, 285 | do_constant_folding=True, 286 | input_names=["input"], 287 | output_names=["output"], 288 | dynamic_axes=dynamic_axes if args.dynamic else None 289 | ) 290 | 291 | if args.simplify: 292 | print("Simplifying the ONNX model") 293 | import onnxslim 294 | model_onnx = onnx.load(onnx_output_file) 295 | model_onnx = onnxslim.slim(model_onnx) 296 | onnx.save(model_onnx, onnx_output_file) 297 | 298 | print(f"Done: {onnx_output_file}\n") 299 | 300 | 301 | def parse_args(): 302 | import argparse 303 | parser = 
argparse.ArgumentParser(description="DeepStream RF-DETR-Seg conversion") 304 | parser.add_argument("-m", "--model", required=True, help="Model name (required)") 305 | parser.add_argument("-w", "--weights", required=True, type=str, help="Input weights (.pt) file path (required)") 306 | parser.add_argument("-n", "--classes", type=int, default=91, help="Number of trained classes (default 91)") 307 | parser.add_argument("-s", "--size", nargs="+", type=int, default=[432], help="Inference size [H,W] (default [432])") 308 | parser.add_argument("--opset", type=int, default=17, help="ONNX opset version") 309 | parser.add_argument("--simplify", action="store_true", help="ONNX simplify model") 310 | parser.add_argument("--dynamic", action="store_true", help="Dynamic batch-size") 311 | parser.add_argument("--batch", type=int, default=1, help="Static batch-size") 312 | parser.add_argument( 313 | "--max-detections", type=int, default=100, help="Maximum number of output detections (default 100)" 314 | ) 315 | args = parser.parse_args() 316 | if not os.path.isfile(args.weights): 317 | raise SystemExit("Invalid weights file") 318 | if len(args.size) > 1 and args.size[0] != args.size[1]: 319 | raise SystemExit("RF-DETR model requires square resolution (width = height)") 320 | if args.dynamic and args.batch > 1: 321 | raise SystemExit("Cannot set dynamic batch-size and static batch-size at same time") 322 | return args 323 | 324 | 325 | if __name__ == "__main__": 326 | args = parse_args() 327 | main(args) 328 | -------------------------------------------------------------------------------- /nvdsinfer_custom_impl_Yolo_seg/trt_plugins/roiAlignPlugin/roiAlignPlugin.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
3 | * SPDX-License-Identifier: Apache-2.0 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | #include "roiAlignPlugin.h" 18 | #include "roiAlignKernel.h" 19 | #include 20 | #include 21 | 22 | using namespace nvinfer1; 23 | using namespace plugin; 24 | using nvinfer1::plugin::ROIAlign; 25 | using nvinfer1::plugin::ROIAlignPluginCreator; 26 | 27 | namespace 28 | { 29 | char const* kROIALIGN_PLUGIN_VERSION{"1"}; 30 | char const* kROIALIGN_PLUGIN_NAME{"ROIAlignX_TRT"}; 31 | size_t constexpr kSERIALIZATION_SIZE{sizeof(int32_t) * 5 + sizeof(float) + sizeof(int32_t) * 4}; 32 | } // namespace 33 | 34 | ROIAlignPluginCreator::ROIAlignPluginCreator() 35 | { 36 | static std::mutex sMutex; 37 | std::lock_guard guard(sMutex); 38 | mPluginAttributes.clear(); 39 | mPluginAttributes.emplace_back(PluginField("coordinate_transformation_mode", nullptr, PluginFieldType::kINT32, 1)); 40 | mPluginAttributes.emplace_back(PluginField("mode", nullptr, PluginFieldType::kINT32, 1)); 41 | mPluginAttributes.emplace_back(PluginField("output_height", nullptr, PluginFieldType::kINT32, 1)); 42 | mPluginAttributes.emplace_back(PluginField("output_width", nullptr, PluginFieldType::kINT32, 1)); 43 | mPluginAttributes.emplace_back(PluginField("sampling_ratio", nullptr, PluginFieldType::kINT32, 1)); 44 | mPluginAttributes.emplace_back(PluginField("spatial_scale", nullptr, PluginFieldType::kFLOAT32, 1)); 45 | 46 | mFC.nbFields = mPluginAttributes.size(); 
47 | mFC.fields = mPluginAttributes.data(); 48 | } 49 | 50 | char const* ROIAlignPluginCreator::getPluginName() const noexcept 51 | { 52 | return kROIALIGN_PLUGIN_NAME; 53 | } 54 | 55 | char const* ROIAlignPluginCreator::getPluginVersion() const noexcept 56 | { 57 | return kROIALIGN_PLUGIN_VERSION; 58 | } 59 | 60 | PluginFieldCollection const* ROIAlignPluginCreator::getFieldNames() noexcept 61 | { 62 | return &mFC; 63 | } 64 | 65 | IPluginV2DynamicExt* ROIAlignPluginCreator::createPlugin(char const* name, PluginFieldCollection const* fc) noexcept 66 | { 67 | try 68 | { 69 | PLUGIN_VALIDATE(fc != nullptr); 70 | PluginField const* fields = fc->fields; 71 | 72 | // default values 73 | int32_t outputHeight = 1; 74 | int32_t outputWidth = 1; 75 | int32_t samplingRatio = 0; 76 | int32_t mode = 1; 77 | int32_t aligned = 1; 78 | float spatialScale = 1.0F; 79 | 80 | for (int32_t i = 0; i < fc->nbFields; ++i) 81 | { 82 | char const* attrName = fields[i].name; 83 | if (!strcmp(attrName, "output_height")) 84 | { 85 | PLUGIN_VALIDATE(fields[i].type == PluginFieldType::kINT32); 86 | outputHeight = static_cast(*(static_cast(fields[i].data))); 87 | } 88 | else if (!strcmp(attrName, "output_width")) 89 | { 90 | PLUGIN_VALIDATE(fields[i].type == PluginFieldType::kINT32); 91 | outputWidth = static_cast(*(static_cast(fields[i].data))); 92 | } 93 | else if (!strcmp(attrName, "sampling_ratio")) 94 | { 95 | PLUGIN_VALIDATE(fields[i].type == PluginFieldType::kINT32); 96 | samplingRatio = static_cast(*(static_cast(fields[i].data))); 97 | } 98 | else if (!strcmp(attrName, "mode")) 99 | { 100 | PLUGIN_VALIDATE(fields[i].type == PluginFieldType::kINT32); 101 | mode = static_cast(*(static_cast(fields[i].data))); 102 | } 103 | else if (!strcmp(attrName, "spatial_scale")) 104 | { 105 | PLUGIN_VALIDATE(fields[i].type == PluginFieldType::kFLOAT32); 106 | spatialScale = static_cast(*(static_cast(fields[i].data))); 107 | } 108 | else if (!strcmp(attrName, "coordinate_transformation_mode")) 109 | { 
110 | PLUGIN_VALIDATE(fields[i].type == PluginFieldType::kINT32); 111 | aligned = static_cast(*(static_cast(fields[i].data))); 112 | } 113 | } 114 | return new ROIAlign(outputHeight, outputWidth, samplingRatio, mode, spatialScale, aligned); 115 | } 116 | catch (std::exception const& e) 117 | { 118 | caughtError(e); 119 | } 120 | return nullptr; 121 | } 122 | 123 | IPluginV2DynamicExt* ROIAlignPluginCreator::deserializePlugin( 124 | char const* name, void const* data, size_t length) noexcept 125 | { 126 | try 127 | { 128 | PLUGIN_VALIDATE(data != nullptr); 129 | return new ROIAlign(data, length); 130 | } 131 | catch (std::exception const& e) 132 | { 133 | caughtError(e); 134 | } 135 | return nullptr; 136 | } 137 | 138 | int32_t ROIAlign::getNbOutputs() const noexcept 139 | { 140 | return 1; 141 | } 142 | 143 | int32_t ROIAlign::initialize() noexcept 144 | { 145 | int32_t device; 146 | PLUGIN_CHECK_CUDA(cudaGetDevice(&device)); 147 | cudaDeviceProp props; 148 | PLUGIN_CHECK_CUDA(cudaGetDeviceProperties(&props, device)); 149 | 150 | mMaxThreadsPerBlock = props.maxThreadsPerBlock; 151 | 152 | return 0; 153 | } 154 | 155 | void ROIAlign::terminate() noexcept {} 156 | 157 | void ROIAlign::destroy() noexcept 158 | { 159 | delete this; 160 | } 161 | 162 | size_t ROIAlign::getWorkspaceSize( 163 | PluginTensorDesc const* inputs, int32_t nbInputs, PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept 164 | { 165 | return 0; 166 | } 167 | 168 | bool ROIAlign::supportsFormatCombination( 169 | int32_t pos, PluginTensorDesc const* inOut, int32_t nbInputs, int32_t nbOutputs) noexcept 170 | { 171 | PLUGIN_ASSERT(inOut != nullptr); 172 | PLUGIN_ASSERT(pos >= 0 && pos <= 3); 173 | PLUGIN_ASSERT(nbInputs == 3); 174 | PLUGIN_ASSERT(nbOutputs == 1); 175 | 176 | PluginTensorDesc const& desc = inOut[pos]; 177 | if (desc.format != TensorFormat::kLINEAR) 178 | { 179 | return false; 180 | } 181 | 182 | // first input should be float16 or float32 183 | if (pos == 0) 184 | { 185 | 
return (inOut[pos].type == nvinfer1::DataType::kFLOAT || inOut[pos].type == nvinfer1::DataType::kHALF); 186 | } 187 | 188 | // batch_indices always has to be int32 189 | if (pos == 2) 190 | { 191 | return (inOut[pos].type == nvinfer1::DataType::kINT32); 192 | } 193 | 194 | // rois and the output should have the same type as the first input 195 | return (inOut[pos].type == inOut[0].type); 196 | } 197 | 198 | char const* ROIAlign::getPluginType() const noexcept 199 | { 200 | return kROIALIGN_PLUGIN_NAME; 201 | } 202 | 203 | char const* ROIAlign::getPluginVersion() const noexcept 204 | { 205 | return kROIALIGN_PLUGIN_VERSION; 206 | } 207 | 208 | IPluginV2DynamicExt* ROIAlign::clone() const noexcept 209 | { 210 | try 211 | { 212 | auto plugin = new ROIAlign(*this); 213 | plugin->setPluginNamespace(mNameSpace.c_str()); 214 | return plugin; 215 | } 216 | catch (std::exception const& e) 217 | { 218 | caughtError(e); 219 | } 220 | return nullptr; 221 | } 222 | 223 | void ROIAlign::setPluginNamespace(char const* libNamespace) noexcept 224 | { 225 | try 226 | { 227 | PLUGIN_ASSERT(libNamespace != nullptr); 228 | mNameSpace = libNamespace; 229 | } 230 | catch (std::exception const& e) 231 | { 232 | gLogError << e.what() << std::endl; 233 | } 234 | } 235 | 236 | char const* ROIAlign::getPluginNamespace() const noexcept 237 | { 238 | return mNameSpace.c_str(); 239 | } 240 | 241 | void ROIAlign::checkValidInputs(nvinfer1::DynamicPluginTensorDesc const* inputs, int32_t nbInputDims) 242 | { 243 | PLUGIN_ASSERT(inputs != nullptr); 244 | PLUGIN_ASSERT(nbInputDims == 3); 245 | 246 | nvinfer1::Dims rois = inputs[1].desc.dims; 247 | nvinfer1::Dims batchIndices = inputs[2].desc.dims; 248 | 249 | PLUGIN_ASSERT(rois.nbDims == 2); 250 | PLUGIN_ASSERT(rois.d[1] == 4); 251 | 252 | PLUGIN_ASSERT(batchIndices.nbDims == 1); 253 | // Check batch_indices matches rois in length 254 | PLUGIN_ASSERT(rois.d[0] == batchIndices.d[0]); 255 | } 256 | 257 | void ROIAlign::validateAttributes( 258 | int32_t 
outputHeight, int32_t outputWidth, int32_t samplingRatio, int32_t mode, float spatialScale, int32_t aligned) 259 | { 260 | PLUGIN_VALIDATE(outputHeight > 0); 261 | PLUGIN_VALIDATE(outputWidth > 0); 262 | PLUGIN_VALIDATE(samplingRatio >= 0); 263 | PLUGIN_VALIDATE(mode == 0 || mode == 1); 264 | PLUGIN_VALIDATE(spatialScale > 0.0F); 265 | PLUGIN_VALIDATE(aligned == 0 || aligned == 1); 266 | } 267 | 268 | DimsExprs ROIAlign::getOutputDimensions( 269 | int32_t outputIndex, DimsExprs const* inputs, int32_t nbInputs, IExprBuilder& exprBuilder) noexcept 270 | { 271 | PLUGIN_ASSERT(inputs != nullptr); 272 | PLUGIN_ASSERT(nbInputs == 3); 273 | PLUGIN_ASSERT(outputIndex == 0); // there is only one output 274 | 275 | nvinfer1::DimsExprs result; 276 | result.nbDims = 4; 277 | 278 | // mROICount 279 | result.d[0] = inputs[1].d[0]; 280 | // mFeatureLength 281 | result.d[1] = inputs[0].d[1]; 282 | // height 283 | auto const* height = exprBuilder.constant(mOutputHeight); 284 | PLUGIN_ASSERT(height != nullptr); 285 | result.d[2] = height; 286 | // width 287 | auto const* width = exprBuilder.constant(mOutputWidth); 288 | PLUGIN_ASSERT(width != nullptr); 289 | result.d[3] = width; 290 | 291 | return result; 292 | } 293 | 294 | int32_t ROIAlign::enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* /* outputDesc */, 295 | void const* const* inputs, void* const* outputs, void* /* workspace */, cudaStream_t stream) noexcept 296 | { 297 | PLUGIN_VALIDATE(inputDesc != nullptr && inputs != nullptr && outputs != nullptr); 298 | 299 | // No-op pass-through for empty ROIs 300 | if (mROICount == 0) 301 | { 302 | return 0; 303 | } 304 | 305 | auto type = inputDesc[0].type; 306 | 307 | PLUGIN_ASSERT(type == nvinfer1::DataType::kHALF || type == nvinfer1::DataType::kFLOAT); 308 | 309 | switch (type) 310 | { 311 | case nvinfer1::DataType::kFLOAT: 312 | { 313 | auto bottomData = static_cast(inputs[0]); 314 | auto bottomRois = static_cast(inputs[1]); 315 | auto batchIndicesPtr = 
static_cast(inputs[2]); 316 | auto topData = static_cast(outputs[0]); 317 | 318 | return RoiAlignImpl(stream, mMaxThreadsPerBlock, bottomData, mSpatialScale, mROICount, mFeatureLength, 319 | mHeight, mWidth, mOutputHeight, mOutputWidth, mSamplingRatio, bottomRois, topData, mMode, batchIndicesPtr, 320 | mAligned); 321 | } 322 | break; 323 | case nvinfer1::DataType::kHALF: 324 | { 325 | auto bottomData = static_cast<__half const*>(inputs[0]); 326 | auto bottomRois = static_cast<__half const*>(inputs[1]); 327 | auto batchIndicesPtr = static_cast(inputs[2]); 328 | auto topData = static_cast<__half*>(outputs[0]); 329 | 330 | return RoiAlignImpl<__half>(stream, mMaxThreadsPerBlock, bottomData, mSpatialScale, mROICount, mFeatureLength, 331 | mHeight, mWidth, mOutputHeight, mOutputWidth, mSamplingRatio, bottomRois, topData, mMode, batchIndicesPtr, 332 | mAligned); 333 | } 334 | break; 335 | default: return -1; 336 | } 337 | 338 | return 0; 339 | } 340 | 341 | size_t ROIAlign::getSerializationSize() const noexcept 342 | { 343 | return kSERIALIZATION_SIZE; 344 | } 345 | 346 | void ROIAlign::serialize(void* buffer) const noexcept 347 | { 348 | PLUGIN_VALIDATE(buffer != nullptr); 349 | char* d = static_cast(buffer); 350 | char* a = d; 351 | write(d, mAligned); // int32_t 352 | write(d, mMode); // int32_t 353 | write(d, mOutputHeight); // int32_t 354 | write(d, mOutputWidth); // int32_t 355 | write(d, mSamplingRatio); // int32_t 356 | write(d, mSpatialScale); // float 357 | 358 | write(d, mROICount); // int32_t 359 | write(d, mFeatureLength); // int32_t 360 | write(d, mHeight); // int32_t 361 | write(d, mWidth); // int32_t 362 | PLUGIN_ASSERT(d == a + getSerializationSize()); 363 | } 364 | 365 | ROIAlign::ROIAlign( 366 | int32_t outputHeight, int32_t outputWidth, int32_t samplingRatio, int32_t mode, float spatialScale, int32_t aligned) 367 | : mOutputHeight(outputHeight) 368 | , mOutputWidth(outputWidth) 369 | , mSamplingRatio(samplingRatio) 370 | , mSpatialScale(spatialScale) 
371 | , mMode(mode) 372 | , mAligned(aligned) 373 | { 374 | validateAttributes(mOutputHeight, mOutputWidth, mSamplingRatio, mMode, mSpatialScale, mAligned); 375 | } 376 | 377 | ROIAlign::ROIAlign(void const* data, size_t length) 378 | { 379 | PLUGIN_VALIDATE(data != nullptr); 380 | PLUGIN_VALIDATE(length == kSERIALIZATION_SIZE); 381 | 382 | char const* d = static_cast(data); 383 | char const* a = d; 384 | 385 | mAligned = read(d); 386 | mMode = read(d); 387 | mOutputHeight = read(d); 388 | mOutputWidth = read(d); 389 | mSamplingRatio = read(d); 390 | mSpatialScale = read(d); 391 | 392 | mROICount = read(d); 393 | mFeatureLength = read(d); 394 | mHeight = read(d); 395 | mWidth = read(d); 396 | 397 | PLUGIN_VALIDATE(d == a + length); 398 | validateAttributes(mOutputHeight, mOutputWidth, mSamplingRatio, mMode, mSpatialScale, mAligned); 399 | } 400 | 401 | DataType ROIAlign::getOutputDataType( 402 | int32_t index, nvinfer1::DataType const* inputTypes, int32_t nbInputs) const noexcept 403 | { 404 | PLUGIN_ASSERT(inputTypes != nullptr); 405 | PLUGIN_ASSERT(nbInputs == 3); 406 | PLUGIN_ASSERT(index == 0); 407 | return inputTypes[0]; 408 | } 409 | 410 | void ROIAlign::configurePlugin( 411 | DynamicPluginTensorDesc const* in, int32_t nbInputs, DynamicPluginTensorDesc const* out, int32_t nbOutputs) noexcept 412 | { 413 | PLUGIN_ASSERT(in != nullptr); 414 | PLUGIN_ASSERT(out != nullptr); 415 | PLUGIN_ASSERT(nbOutputs == 1); 416 | PLUGIN_ASSERT(nbInputs == 3); 417 | 418 | checkValidInputs(in, nbInputs); 419 | 420 | mFeatureLength = in[0].desc.dims.d[1]; 421 | mHeight = in[0].desc.dims.d[2]; 422 | mWidth = in[0].desc.dims.d[3]; 423 | 424 | mROICount = in[1].desc.dims.d[0]; 425 | } 426 | 427 | REGISTER_TENSORRT_PLUGIN(ROIAlignPluginCreator); 428 | -------------------------------------------------------------------------------- /nvdsinfer_custom_impl_Yolo_seg/trt_plugins/efficientNMSPlugin/efficientNMSPlugin.cpp: 
-------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: Apache-2.0 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | #include "efficientNMSPlugin.h" 19 | #include "efficientNMSInference.h" 20 | 21 | using namespace nvinfer1; 22 | using nvinfer1::plugin::EfficientNMSPlugin; 23 | using nvinfer1::plugin::EfficientNMSParameters; 24 | using nvinfer1::plugin::EfficientNMSPluginCreator; 25 | 26 | namespace 27 | { 28 | char const* const kEFFICIENT_NMS_PLUGIN_VERSION{"1"}; 29 | char const* const kEFFICIENT_NMS_PLUGIN_NAME{"EfficientNMSX_TRT"}; 30 | } // namespace 31 | 32 | EfficientNMSPlugin::EfficientNMSPlugin(EfficientNMSParameters param) 33 | : mParam(std::move(param)) 34 | { 35 | } 36 | 37 | EfficientNMSPlugin::EfficientNMSPlugin(void const* data, size_t length) 38 | { 39 | deserialize(static_cast(data), length); 40 | } 41 | 42 | void EfficientNMSPlugin::deserialize(int8_t const* data, size_t length) 43 | { 44 | auto const* d{data}; 45 | mParam = read(d); 46 | PLUGIN_VALIDATE(d == data + length); 47 | } 48 | 49 | char const* EfficientNMSPlugin::getPluginType() const noexcept 50 | { 51 | return kEFFICIENT_NMS_PLUGIN_NAME; 52 | } 53 | 54 | char const* EfficientNMSPlugin::getPluginVersion() const noexcept 55 | { 56 | return 
kEFFICIENT_NMS_PLUGIN_VERSION; 57 | } 58 | 59 | int32_t EfficientNMSPlugin::getNbOutputs() const noexcept 60 | { 61 | // Standard Plugin Implementation 62 | return 5; 63 | } 64 | 65 | int32_t EfficientNMSPlugin::initialize() noexcept 66 | { 67 | if (!initialized) 68 | { 69 | int32_t device; 70 | CSC(cudaGetDevice(&device), STATUS_FAILURE); 71 | struct cudaDeviceProp properties; 72 | CSC(cudaGetDeviceProperties(&properties, device), STATUS_FAILURE); 73 | if (properties.regsPerBlock >= 65536) 74 | { 75 | // Most Devices 76 | mParam.numSelectedBoxes = 5000; 77 | } 78 | else 79 | { 80 | // Jetson TX1/TX2 81 | mParam.numSelectedBoxes = 2000; 82 | } 83 | initialized = true; 84 | } 85 | return STATUS_SUCCESS; 86 | } 87 | 88 | void EfficientNMSPlugin::terminate() noexcept {} 89 | 90 | size_t EfficientNMSPlugin::getSerializationSize() const noexcept 91 | { 92 | return sizeof(EfficientNMSParameters); 93 | } 94 | 95 | void EfficientNMSPlugin::serialize(void* buffer) const noexcept 96 | { 97 | char *d = reinterpret_cast(buffer), *a = d; 98 | write(d, mParam); 99 | PLUGIN_ASSERT(d == a + getSerializationSize()); 100 | } 101 | 102 | void EfficientNMSPlugin::destroy() noexcept 103 | { 104 | delete this; 105 | } 106 | 107 | void EfficientNMSPlugin::setPluginNamespace(char const* pluginNamespace) noexcept 108 | { 109 | try 110 | { 111 | mNamespace = pluginNamespace; 112 | } 113 | catch (std::exception const& e) 114 | { 115 | caughtError(e); 116 | } 117 | } 118 | 119 | char const* EfficientNMSPlugin::getPluginNamespace() const noexcept 120 | { 121 | return mNamespace.c_str(); 122 | } 123 | 124 | nvinfer1::DataType EfficientNMSPlugin::getOutputDataType( 125 | int32_t index, nvinfer1::DataType const* inputTypes, int32_t nbInputs) const noexcept 126 | { 127 | // On standard NMS, num_detections and detection_classes use integer outputs 128 | if (index == 0 || index == 3 || index == 4) 129 | { 130 | return nvinfer1::DataType::kINT32; 131 | } 132 | // All others should use the same 
datatype as the input 133 | return inputTypes[0]; 134 | } 135 | 136 | IPluginV2DynamicExt* EfficientNMSPlugin::clone() const noexcept 137 | { 138 | try 139 | { 140 | auto* plugin = new EfficientNMSPlugin(mParam); 141 | plugin->setPluginNamespace(mNamespace.c_str()); 142 | return plugin; 143 | } 144 | catch (std::exception const& e) 145 | { 146 | caughtError(e); 147 | } 148 | return nullptr; 149 | } 150 | 151 | DimsExprs EfficientNMSPlugin::getOutputDimensions( 152 | int32_t outputIndex, DimsExprs const* inputs, int32_t nbInputs, IExprBuilder& exprBuilder) noexcept 153 | { 154 | try 155 | { 156 | DimsExprs out_dim; 157 | 158 | // When pad per class is set, the output size may need to be reduced: 159 | // i.e.: outputBoxes = min(outputBoxes, outputBoxesPerClass * numClasses) 160 | // As the number of classes may not be static, numOutputBoxes must be a dynamic 161 | // expression. The corresponding parameter can not be set at this time, so the 162 | // value will be calculated again in configurePlugin() and the param overwritten. 
163 | IDimensionExpr const* numOutputBoxes = exprBuilder.constant(mParam.numOutputBoxes); 164 | if (mParam.padOutputBoxesPerClass && mParam.numOutputBoxesPerClass > 0) 165 | { 166 | IDimensionExpr const* numOutputBoxesPerClass = exprBuilder.constant(mParam.numOutputBoxesPerClass); 167 | IDimensionExpr const* numClasses = inputs[1].d[2]; 168 | numOutputBoxes = exprBuilder.operation(DimensionOperation::kMIN, *numOutputBoxes, 169 | *exprBuilder.operation(DimensionOperation::kPROD, *numOutputBoxesPerClass, *numClasses)); 170 | } 171 | 172 | // Standard NMS 173 | PLUGIN_ASSERT(outputIndex >= 0 && outputIndex <= 4); 174 | 175 | // num_detections 176 | if (outputIndex == 0) 177 | { 178 | out_dim.nbDims = 2; 179 | out_dim.d[0] = inputs[0].d[0]; 180 | out_dim.d[1] = exprBuilder.constant(1); 181 | } 182 | // detection_boxes 183 | else if (outputIndex == 1) 184 | { 185 | out_dim.nbDims = 3; 186 | out_dim.d[0] = inputs[0].d[0]; 187 | out_dim.d[1] = numOutputBoxes; 188 | out_dim.d[2] = exprBuilder.constant(4); 189 | } 190 | // detection_scores: outputIndex == 2 191 | // detection_classes: outputIndex == 3 192 | else if (outputIndex == 2 || outputIndex == 3 || outputIndex == 4) 193 | { 194 | out_dim.nbDims = 2; 195 | out_dim.d[0] = inputs[0].d[0]; 196 | out_dim.d[1] = numOutputBoxes; 197 | } 198 | 199 | return out_dim; 200 | } 201 | catch (std::exception const& e) 202 | { 203 | caughtError(e); 204 | } 205 | return DimsExprs{}; 206 | } 207 | 208 | bool EfficientNMSPlugin::supportsFormatCombination( 209 | int32_t pos, PluginTensorDesc const* inOut, int32_t nbInputs, int32_t nbOutputs) noexcept 210 | { 211 | if (inOut[pos].format != PluginFormat::kLINEAR) 212 | { 213 | return false; 214 | } 215 | 216 | PLUGIN_ASSERT(nbInputs == 2 || nbInputs == 3); 217 | PLUGIN_ASSERT(nbOutputs == 5); 218 | if (nbInputs == 2) 219 | { 220 | PLUGIN_ASSERT(0 <= pos && pos <= 6); 221 | } 222 | if (nbInputs == 3) 223 | { 224 | PLUGIN_ASSERT(0 <= pos && pos <= 7); 225 | } 226 | 227 | // num_detections 
and detection_classes output: int32_t 228 | int32_t const posOut = pos - nbInputs; 229 | if (posOut == 0 || posOut == 3 || posOut == 4) 230 | { 231 | return inOut[pos].type == DataType::kINT32 && inOut[pos].format == PluginFormat::kLINEAR; 232 | } 233 | 234 | // all other inputs/outputs: fp32 or fp16 235 | return (inOut[pos].type == DataType::kHALF || inOut[pos].type == DataType::kFLOAT) 236 | && (inOut[0].type == inOut[pos].type); 237 | } 238 | 239 | void EfficientNMSPlugin::configurePlugin( 240 | DynamicPluginTensorDesc const* in, int32_t nbInputs, DynamicPluginTensorDesc const* out, int32_t nbOutputs) noexcept 241 | { 242 | try 243 | { 244 | // Accepts two or three inputs 245 | // If two inputs: [0] boxes, [1] scores 246 | // If three inputs: [0] boxes, [1] scores, [2] anchors 247 | PLUGIN_ASSERT(nbInputs == 2 || nbInputs == 3); 248 | PLUGIN_ASSERT(nbOutputs == 5); 249 | 250 | mParam.datatype = in[0].desc.type; 251 | 252 | // Shape of scores input should be 253 | // [batch_size, num_boxes, num_classes] or [batch_size, num_boxes, num_classes, 1] 254 | PLUGIN_ASSERT(in[1].desc.dims.nbDims == 3 || (in[1].desc.dims.nbDims == 4 && in[1].desc.dims.d[3] == 1)); 255 | mParam.numScoreElements = in[1].desc.dims.d[1] * in[1].desc.dims.d[2]; 256 | mParam.numClasses = in[1].desc.dims.d[2]; 257 | 258 | // When pad per class is set, the total output boxes size may need to be reduced. 259 | // This operation is also done in getOutputDimension(), but for dynamic shapes, the 260 | // numOutputBoxes param can't be set until the number of classes is fully known here. 
261 | if (mParam.padOutputBoxesPerClass && mParam.numOutputBoxesPerClass > 0) 262 | { 263 | if (mParam.numOutputBoxesPerClass * mParam.numClasses < mParam.numOutputBoxes) 264 | { 265 | mParam.numOutputBoxes = mParam.numOutputBoxesPerClass * mParam.numClasses; 266 | } 267 | } 268 | 269 | // Shape of boxes input should be 270 | // [batch_size, num_boxes, 4] or [batch_size, num_boxes, 1, 4] or [batch_size, num_boxes, num_classes, 4] 271 | PLUGIN_ASSERT(in[0].desc.dims.nbDims == 3 || in[0].desc.dims.nbDims == 4); 272 | if (in[0].desc.dims.nbDims == 3) 273 | { 274 | PLUGIN_ASSERT(in[0].desc.dims.d[2] == 4); 275 | mParam.shareLocation = true; 276 | mParam.numBoxElements = in[0].desc.dims.d[1] * in[0].desc.dims.d[2]; 277 | } 278 | else 279 | { 280 | mParam.shareLocation = (in[0].desc.dims.d[2] == 1); 281 | PLUGIN_ASSERT(in[0].desc.dims.d[2] == mParam.numClasses || mParam.shareLocation); 282 | PLUGIN_ASSERT(in[0].desc.dims.d[3] == 4); 283 | mParam.numBoxElements = in[0].desc.dims.d[1] * in[0].desc.dims.d[2] * in[0].desc.dims.d[3]; 284 | } 285 | mParam.numAnchors = in[0].desc.dims.d[1]; 286 | 287 | if (nbInputs == 2) 288 | { 289 | // Only two inputs are used, disable the fused box decoder 290 | mParam.boxDecoder = false; 291 | } 292 | if (nbInputs == 3) 293 | { 294 | // All three inputs are used, enable the box decoder 295 | // Shape of anchors input should be 296 | // Constant shape: [1, numAnchors, 4] or [batch_size, numAnchors, 4] 297 | PLUGIN_ASSERT(in[2].desc.dims.nbDims == 3); 298 | mParam.boxDecoder = true; 299 | mParam.shareAnchors = (in[2].desc.dims.d[0] == 1); 300 | } 301 | } 302 | catch (std::exception const& e) 303 | { 304 | caughtError(e); 305 | } 306 | } 307 | 308 | size_t EfficientNMSPlugin::getWorkspaceSize( 309 | PluginTensorDesc const* inputs, int32_t nbInputs, PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept 310 | { 311 | int32_t batchSize = inputs[1].dims.d[0]; 312 | int32_t numScoreElements = inputs[1].dims.d[1] * inputs[1].dims.d[2]; 
313 | int32_t numClasses = inputs[1].dims.d[2]; 314 | return EfficientNMSWorkspaceSize(batchSize, numScoreElements, numClasses, mParam.datatype); 315 | } 316 | 317 | int32_t EfficientNMSPlugin::enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* /* outputDesc */, 318 | void const* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) noexcept 319 | { 320 | try 321 | { 322 | PLUGIN_VALIDATE(inputDesc != nullptr && inputs != nullptr && outputs != nullptr && workspace != nullptr); 323 | 324 | mParam.batchSize = inputDesc[0].dims.d[0]; 325 | 326 | // Standard NMS Operation 327 | void const* const boxesInput = inputs[0]; 328 | void const* const scoresInput = inputs[1]; 329 | void const* const anchorsInput = mParam.boxDecoder ? inputs[2] : nullptr; 330 | 331 | void* numDetectionsOutput = outputs[0]; 332 | void* nmsBoxesOutput = outputs[1]; 333 | void* nmsScoresOutput = outputs[2]; 334 | void* nmsClassesOutput = outputs[3]; 335 | void* nmsIndicesOutput = outputs[4]; 336 | 337 | return EfficientNMSInference(mParam, boxesInput, scoresInput, anchorsInput, numDetectionsOutput, nmsBoxesOutput, 338 | nmsScoresOutput, nmsClassesOutput, nmsIndicesOutput, workspace, stream); 339 | } 340 | catch (std::exception const& e) 341 | { 342 | caughtError(e); 343 | } 344 | return -1; 345 | } 346 | 347 | // Standard NMS Plugin Operation 348 | 349 | EfficientNMSPluginCreator::EfficientNMSPluginCreator() 350 | : mParam{} 351 | { 352 | mPluginAttributes.clear(); 353 | mPluginAttributes.emplace_back(PluginField("score_threshold", nullptr, PluginFieldType::kFLOAT32, 1)); 354 | mPluginAttributes.emplace_back(PluginField("iou_threshold", nullptr, PluginFieldType::kFLOAT32, 1)); 355 | mPluginAttributes.emplace_back(PluginField("max_output_boxes", nullptr, PluginFieldType::kINT32, 1)); 356 | mPluginAttributes.emplace_back(PluginField("background_class", nullptr, PluginFieldType::kINT32, 1)); 357 | mPluginAttributes.emplace_back(PluginField("score_activation", 
nullptr, PluginFieldType::kINT32, 1)); 358 | mPluginAttributes.emplace_back(PluginField("class_agnostic", nullptr, PluginFieldType::kINT32, 1)); 359 | mPluginAttributes.emplace_back(PluginField("box_coding", nullptr, PluginFieldType::kINT32, 1)); 360 | mFC.nbFields = mPluginAttributes.size(); 361 | mFC.fields = mPluginAttributes.data(); 362 | } 363 | 364 | char const* EfficientNMSPluginCreator::getPluginName() const noexcept 365 | { 366 | return kEFFICIENT_NMS_PLUGIN_NAME; 367 | } 368 | 369 | char const* EfficientNMSPluginCreator::getPluginVersion() const noexcept 370 | { 371 | return kEFFICIENT_NMS_PLUGIN_VERSION; 372 | } 373 | 374 | PluginFieldCollection const* EfficientNMSPluginCreator::getFieldNames() noexcept 375 | { 376 | return &mFC; 377 | } 378 | 379 | IPluginV2DynamicExt* EfficientNMSPluginCreator::createPlugin(char const* name, PluginFieldCollection const* fc) noexcept 380 | { 381 | try 382 | { 383 | PLUGIN_VALIDATE(fc != nullptr); 384 | PluginField const* fields = fc->fields; 385 | PLUGIN_VALIDATE(fields != nullptr); 386 | plugin::validateRequiredAttributesExist({"score_threshold", "iou_threshold", "max_output_boxes", 387 | "background_class", "score_activation", "box_coding"}, 388 | fc); 389 | for (int32_t i{0}; i < fc->nbFields; ++i) 390 | { 391 | char const* attrName = fields[i].name; 392 | if (!strcmp(attrName, "score_threshold")) 393 | { 394 | PLUGIN_VALIDATE(fields[i].type == PluginFieldType::kFLOAT32); 395 | auto const scoreThreshold = *(static_cast(fields[i].data)); 396 | PLUGIN_VALIDATE(scoreThreshold >= 0.0F); 397 | mParam.scoreThreshold = scoreThreshold; 398 | } 399 | if (!strcmp(attrName, "iou_threshold")) 400 | { 401 | PLUGIN_VALIDATE(fields[i].type == PluginFieldType::kFLOAT32); 402 | auto const iouThreshold = *(static_cast(fields[i].data)); 403 | PLUGIN_VALIDATE(iouThreshold > 0.0F); 404 | mParam.iouThreshold = iouThreshold; 405 | } 406 | if (!strcmp(attrName, "max_output_boxes")) 407 | { 408 | PLUGIN_VALIDATE(fields[i].type == 
PluginFieldType::kINT32); 409 | auto const numOutputBoxes = *(static_cast(fields[i].data)); 410 | PLUGIN_VALIDATE(numOutputBoxes > 0); 411 | mParam.numOutputBoxes = numOutputBoxes; 412 | } 413 | if (!strcmp(attrName, "background_class")) 414 | { 415 | PLUGIN_VALIDATE(fields[i].type == PluginFieldType::kINT32); 416 | mParam.backgroundClass = *(static_cast(fields[i].data)); 417 | } 418 | if (!strcmp(attrName, "score_activation")) 419 | { 420 | auto const scoreSigmoid = *(static_cast(fields[i].data)); 421 | PLUGIN_VALIDATE(scoreSigmoid == 0 || scoreSigmoid == 1); 422 | mParam.scoreSigmoid = static_cast(scoreSigmoid); 423 | } 424 | if (!strcmp(attrName, "class_agnostic")) 425 | { 426 | auto const classAgnostic = *(static_cast(fields[i].data)); 427 | PLUGIN_VALIDATE(classAgnostic == 0 || classAgnostic == 1); 428 | mParam.classAgnostic = static_cast(classAgnostic); 429 | } 430 | if (!strcmp(attrName, "box_coding")) 431 | { 432 | PLUGIN_VALIDATE(fields[i].type == PluginFieldType::kINT32); 433 | auto const boxCoding = *(static_cast(fields[i].data)); 434 | PLUGIN_VALIDATE(boxCoding == 0 || boxCoding == 1); 435 | mParam.boxCoding = boxCoding; 436 | } 437 | } 438 | 439 | auto* plugin = new EfficientNMSPlugin(mParam); 440 | plugin->setPluginNamespace(mNamespace.c_str()); 441 | return plugin; 442 | } 443 | catch (std::exception const& e) 444 | { 445 | caughtError(e); 446 | } 447 | return nullptr; 448 | } 449 | 450 | IPluginV2DynamicExt* EfficientNMSPluginCreator::deserializePlugin( 451 | char const* name, void const* serialData, size_t serialLength) noexcept 452 | { 453 | try 454 | { 455 | // This object will be deleted when the network is destroyed, which will 456 | // call EfficientNMSPlugin::destroy() 457 | auto* plugin = new EfficientNMSPlugin(serialData, serialLength); 458 | plugin->setPluginNamespace(mNamespace.c_str()); 459 | return plugin; 460 | } 461 | catch (std::exception const& e) 462 | { 463 | caughtError(e); 464 | } 465 | return nullptr; 466 | } 467 | 468 | 
REGISTER_TENSORRT_PLUGIN(EfficientNMSPluginCreator); 469 | -------------------------------------------------------------------------------- /nvdsinfer_custom_impl_Yolo_seg/trt_plugins/efficientNMSPlugin/efficientNMSInference.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: Apache-2.0 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | #include "../common.h" 19 | #include "cub/cub.cuh" 20 | #include "cuda_runtime_api.h" 21 | 22 | #include "efficientNMSInference.cuh" 23 | #include "efficientNMSInference.h" 24 | 25 | #define NMS_TILES 5 26 | 27 | using namespace nvinfer1; 28 | using namespace nvinfer1::plugin; 29 | 30 | template 31 | __device__ float IOU(EfficientNMSParameters param, BoxCorner box1, BoxCorner box2) 32 | { 33 | // Regardless of the selected box coding, IOU is always performed in BoxCorner coding. 34 | // The boxes are copied so that they can be reordered without affecting the originals. 
35 | BoxCorner b1 = box1; 36 | BoxCorner b2 = box2; 37 | b1.reorder(); 38 | b2.reorder(); 39 | float intersectArea = BoxCorner::intersect(b1, b2).area(); 40 | if (intersectArea <= 0.f) 41 | { 42 | return 0.f; 43 | } 44 | float unionArea = b1.area() + b2.area() - intersectArea; 45 | if (unionArea <= 0.f) 46 | { 47 | return 0.f; 48 | } 49 | return intersectArea / unionArea; 50 | } 51 | 52 | template 53 | __device__ BoxCorner DecodeBoxes(EfficientNMSParameters param, int boxIdx, int anchorIdx, 54 | const Tb* __restrict__ boxesInput, const Tb* __restrict__ anchorsInput) 55 | { 56 | // The inputs will be in the selected coding format, as well as the decoding function. But the decoded box 57 | // will always be returned as BoxCorner. 58 | Tb box = boxesInput[boxIdx]; 59 | if (!param.boxDecoder) 60 | { 61 | return BoxCorner(box); 62 | } 63 | Tb anchor = anchorsInput[anchorIdx]; 64 | box.reorder(); 65 | anchor.reorder(); 66 | return BoxCorner(box.decode(anchor)); 67 | } 68 | 69 | template 70 | __device__ void MapNMSData(EfficientNMSParameters param, int idx, int imageIdx, const Tb* __restrict__ boxesInput, 71 | const Tb* __restrict__ anchorsInput, const int* __restrict__ topClassData, const int* __restrict__ topAnchorsData, 72 | const int* __restrict__ topNumData, const T* __restrict__ sortedScoresData, const int* __restrict__ sortedIndexData, 73 | T& scoreMap, int& classMap, BoxCorner& boxMap, int& boxIdxMap) 74 | { 75 | // idx: Holds the NMS box index, within the current batch. 76 | // idxSort: Holds the batched NMS box index, which indexes the (filtered, but sorted) score buffer. 77 | // scoreMap: Holds the score that corresponds to the indexed box being processed by NMS. 78 | if (idx >= topNumData[imageIdx]) 79 | { 80 | return; 81 | } 82 | int idxSort = imageIdx * param.numScoreElements + idx; 83 | scoreMap = sortedScoresData[idxSort]; 84 | 85 | // idxMap: Holds the re-mapped index, which indexes the (filtered, but unsorted) buffers. 
86 | // classMap: Holds the class that corresponds to the idx'th sorted score being processed by NMS. 87 | // anchorMap: Holds the anchor that corresponds to the idx'th sorted score being processed by NMS. 88 | int idxMap = imageIdx * param.numScoreElements + sortedIndexData[idxSort]; 89 | classMap = topClassData[idxMap]; 90 | int anchorMap = topAnchorsData[idxMap]; 91 | 92 | // boxIdxMap: Holds the re-re-mapped index, which indexes the (unfiltered, and unsorted) boxes input buffer. 93 | boxIdxMap = -1; 94 | if (param.shareLocation) // Shape of boxesInput: [batchSize, numAnchors, 1, 4] 95 | { 96 | boxIdxMap = imageIdx * param.numAnchors + anchorMap; 97 | } 98 | else // Shape of boxesInput: [batchSize, numAnchors, numClasses, 4] 99 | { 100 | int batchOffset = imageIdx * param.numAnchors * param.numClasses; 101 | int anchorOffset = anchorMap * param.numClasses; 102 | boxIdxMap = batchOffset + anchorOffset + classMap; 103 | } 104 | // anchorIdxMap: Holds the re-re-mapped index, which indexes the (unfiltered, and unsorted) anchors input buffer. 105 | int anchorIdxMap = -1; 106 | if (param.shareAnchors) // Shape of anchorsInput: [1, numAnchors, 4] 107 | { 108 | anchorIdxMap = anchorMap; 109 | } 110 | else // Shape of anchorsInput: [batchSize, numAnchors, 4] 111 | { 112 | anchorIdxMap = imageIdx * param.numAnchors + anchorMap; 113 | } 114 | // boxMap: Holds the box that corresponds to the idx'th sorted score being processed by NMS. 
115 | boxMap = DecodeBoxes(param, boxIdxMap, anchorIdxMap, boxesInput, anchorsInput); 116 | } 117 | 118 | template 119 | __device__ void WriteNMSResult(EfficientNMSParameters param, int* __restrict__ numDetectionsOutput, 120 | T* __restrict__ nmsScoresOutput, int* __restrict__ nmsClassesOutput, BoxCorner* __restrict__ nmsBoxesOutput, 121 | int* __restrict__ nmsIndicesOutput, T threadScore, int threadClass, BoxCorner threadBox, int boxIdxMap, 122 | int imageIdx, unsigned int resultsCounter) 123 | { 124 | int outputIdx = imageIdx * param.numOutputBoxes + resultsCounter - 1; 125 | if (param.scoreSigmoid) 126 | { 127 | nmsScoresOutput[outputIdx] = sigmoid_mp(threadScore); 128 | } 129 | else if (param.scoreBits > 0) 130 | { 131 | nmsScoresOutput[outputIdx] = add_mp(threadScore, (T) -1); 132 | } 133 | else 134 | { 135 | nmsScoresOutput[outputIdx] = threadScore; 136 | } 137 | nmsClassesOutput[outputIdx] = threadClass; 138 | if (param.clipBoxes) 139 | { 140 | nmsBoxesOutput[outputIdx] = threadBox.clip((T) 0, (T) 1); 141 | } 142 | else 143 | { 144 | nmsBoxesOutput[outputIdx] = threadBox; 145 | } 146 | nmsIndicesOutput[outputIdx] = boxIdxMap % param.numAnchors; 147 | numDetectionsOutput[imageIdx] = resultsCounter; 148 | 149 | } 150 | 151 | template 152 | __global__ void EfficientNMS(EfficientNMSParameters param, const int* topNumData, int* outputIndexData, 153 | int* outputClassData, const int* sortedIndexData, const T* __restrict__ sortedScoresData, 154 | const int* __restrict__ topClassData, const int* __restrict__ topAnchorsData, const Tb* __restrict__ boxesInput, 155 | const Tb* __restrict__ anchorsInput, int* __restrict__ numDetectionsOutput, T* __restrict__ nmsScoresOutput, 156 | int* __restrict__ nmsClassesOutput, int* __restrict__ nmsIndicesOutput, BoxCorner* __restrict__ nmsBoxesOutput) 157 | { 158 | unsigned int thread = threadIdx.x; 159 | unsigned int imageIdx = blockIdx.y; 160 | unsigned int tileSize = blockDim.x; 161 | if (imageIdx >= param.batchSize) 162 | { 
163 | return; 164 | } 165 | 166 | int numSelectedBoxes = min(topNumData[imageIdx], param.numSelectedBoxes); 167 | int numTiles = (numSelectedBoxes + tileSize - 1) / tileSize; 168 | if (thread >= numSelectedBoxes) 169 | { 170 | return; 171 | } 172 | 173 | __shared__ int blockState; 174 | __shared__ unsigned int resultsCounter; 175 | if (thread == 0) 176 | { 177 | blockState = 0; 178 | resultsCounter = 0; 179 | } 180 | 181 | int threadState[NMS_TILES]; 182 | unsigned int boxIdx[NMS_TILES]; 183 | T threadScore[NMS_TILES]; 184 | int threadClass[NMS_TILES]; 185 | BoxCorner threadBox[NMS_TILES]; 186 | int boxIdxMap[NMS_TILES]; 187 | for (int tile = 0; tile < numTiles; tile++) 188 | { 189 | threadState[tile] = 0; 190 | boxIdx[tile] = thread + tile * blockDim.x; 191 | MapNMSData(param, boxIdx[tile], imageIdx, boxesInput, anchorsInput, topClassData, topAnchorsData, 192 | topNumData, sortedScoresData, sortedIndexData, threadScore[tile], threadClass[tile], threadBox[tile], 193 | boxIdxMap[tile]); 194 | } 195 | 196 | // Iterate through all boxes to NMS against. 197 | for (int i = 0; i < numSelectedBoxes; i++) 198 | { 199 | int tile = i / tileSize; 200 | 201 | if (boxIdx[tile] == i) 202 | { 203 | // Iteration lead thread, figure out what the other threads should do, 204 | // this will be signaled via the blockState shared variable. 205 | if (threadState[tile] == -1) 206 | { 207 | // Thread already dead, this box was already dropped in a previous iteration, 208 | // because it had a large IOU overlap with another lead thread previously, so 209 | // it would never be kept anyway, therefore it can safely be skip all IOU operations 210 | // in this iteration. 211 | blockState = -1; // -1 => Signal all threads to skip iteration 212 | } 213 | else if (threadState[tile] == 0) 214 | { 215 | // As this box will be kept, this is a good place to find what index in the results buffer it 216 | // should have, as this allows to perform an early loop exit if there are enough results. 
217 | if (resultsCounter >= param.numOutputBoxes) 218 | { 219 | blockState = -2; // -2 => Signal all threads to do an early loop exit. 220 | } 221 | else 222 | { 223 | // Thread is still alive, because it has not had a large enough IOU overlap with 224 | // any other kept box previously. Therefore, this box will be kept for sure. However, 225 | // we need to check against all other subsequent boxes from this position onward, 226 | // to see how those other boxes will behave in future iterations. 227 | blockState = 1; // +1 => Signal all (higher index) threads to calculate IOU against this box 228 | threadState[tile] = 1; // +1 => Mark this box's thread to be kept and written out to results 229 | 230 | // If the numOutputBoxesPerClass check is enabled, write the result only if the limit for this 231 | // class on this image has not been reached yet. Other than (possibly) skipping the write, this 232 | // won't affect anything else in the NMS threading. 233 | bool write = true; 234 | if (param.numOutputBoxesPerClass >= 0) 235 | { 236 | int classCounterIdx = imageIdx * param.numClasses + threadClass[tile]; 237 | write = (outputClassData[classCounterIdx] < param.numOutputBoxesPerClass); 238 | outputClassData[classCounterIdx]++; 239 | } 240 | if (write) 241 | { 242 | // This branch is visited by one thread per iteration, so it's safe to do non-atomic increments. 243 | resultsCounter++; 244 | 245 | WriteNMSResult(param, numDetectionsOutput, nmsScoresOutput, nmsClassesOutput, nmsBoxesOutput, 246 | nmsIndicesOutput, threadScore[tile], threadClass[tile], threadBox[tile], boxIdxMap[tile], 247 | imageIdx, resultsCounter); 248 | } 249 | } 250 | } 251 | else 252 | { 253 | // This state should never be reached, but just in case... 254 | blockState = 0; // 0 => Signal all threads to not do any updates, nothing happens. 255 | } 256 | } 257 | 258 | __syncthreads(); 259 | 260 | if (blockState == -2) 261 | { 262 | // This is the signal to exit from the loop. 
263 | return; 264 | } 265 | 266 | if (blockState == -1) 267 | { 268 | // This is the signal for all threads to just skip this iteration, as no IOU's need to be checked. 269 | continue; 270 | } 271 | 272 | // Grab a box and class to test the current box against. The test box corresponds to iteration i, 273 | // therefore it will have a lower index than the current thread box, and will therefore have a higher score 274 | // than the current box because it's located "before" in the sorted score list. 275 | T testScore; 276 | int testClass; 277 | BoxCorner testBox; 278 | int testBoxIdxMap; 279 | MapNMSData(param, i, imageIdx, boxesInput, anchorsInput, topClassData, topAnchorsData, topNumData, 280 | sortedScoresData, sortedIndexData, testScore, testClass, testBox, testBoxIdxMap); 281 | 282 | for (int tile = 0; tile < numTiles; tile++) 283 | { 284 | bool ignoreClass = true; 285 | if (!param.classAgnostic) 286 | { 287 | ignoreClass = threadClass[tile] == testClass; 288 | } 289 | 290 | // IOU 291 | if (boxIdx[tile] > i && // Make sure two different boxes are being tested, and that it's a higher index; 292 | boxIdx[tile] < numSelectedBoxes && // Make sure the box is within numSelectedBoxes; 293 | blockState == 1 && // Signal that allows IOU checks to be performed; 294 | threadState[tile] == 0 && // Make sure this box hasn't been either dropped or kept already; 295 | ignoreClass && // Compare only boxes of matching classes when classAgnostic is false; 296 | lte_mp(threadScore[tile], testScore) && // Make sure the sorting order of scores is as expected; 297 | IOU(param, threadBox[tile], testBox) >= param.iouThreshold) // And... IOU overlap. 298 | { 299 | // Current box overlaps with the box tested in this iteration, this box will be skipped. 300 | threadState[tile] = -1; // -1 => Mark this box's thread to be dropped. 
301 | } 302 | } 303 | } 304 | } 305 | 306 | template 307 | cudaError_t EfficientNMSLauncher(EfficientNMSParameters& param, int* topNumData, int* outputIndexData, 308 | int* outputClassData, int* sortedIndexData, T* sortedScoresData, int* topClassData, int* topAnchorsData, 309 | const void* boxesInput, const void* anchorsInput, int* numDetectionsOutput, T* nmsScoresOutput, 310 | int* nmsClassesOutput, int* nmsIndicesOutput, void* nmsBoxesOutput, cudaStream_t stream) 311 | { 312 | unsigned int tileSize = param.numSelectedBoxes / NMS_TILES; 313 | if (param.numSelectedBoxes <= 512) 314 | { 315 | tileSize = 512; 316 | } 317 | if (param.numSelectedBoxes <= 256) 318 | { 319 | tileSize = 256; 320 | } 321 | 322 | const dim3 blockSize = {tileSize, 1, 1}; 323 | const dim3 gridSize = {1, (unsigned int) param.batchSize, 1}; 324 | 325 | if (param.boxCoding == 0) 326 | { 327 | EfficientNMS><<>>(param, topNumData, outputIndexData, 328 | outputClassData, sortedIndexData, sortedScoresData, topClassData, topAnchorsData, 329 | (BoxCorner*) boxesInput, (BoxCorner*) anchorsInput, numDetectionsOutput, nmsScoresOutput, 330 | nmsClassesOutput, nmsIndicesOutput, (BoxCorner*) nmsBoxesOutput); 331 | } 332 | else if (param.boxCoding == 1) 333 | { 334 | // Note that nmsBoxesOutput is always coded as BoxCorner, regardless of the input coding type. 
335 | EfficientNMS><<>>(param, topNumData, outputIndexData, 336 | outputClassData, sortedIndexData, sortedScoresData, topClassData, topAnchorsData, 337 | (BoxCenterSize*) boxesInput, (BoxCenterSize*) anchorsInput, numDetectionsOutput, nmsScoresOutput, 338 | nmsClassesOutput, nmsIndicesOutput, (BoxCorner*) nmsBoxesOutput); 339 | } 340 | 341 | return cudaGetLastError(); 342 | } 343 | 344 | __global__ void EfficientNMSFilterSegments(EfficientNMSParameters param, const int* __restrict__ topNumData, 345 | int* __restrict__ topOffsetsStartData, int* __restrict__ topOffsetsEndData) 346 | { 347 | int imageIdx = threadIdx.x; 348 | if (imageIdx > param.batchSize) 349 | { 350 | return; 351 | } 352 | topOffsetsStartData[imageIdx] = imageIdx * param.numScoreElements; 353 | topOffsetsEndData[imageIdx] = imageIdx * param.numScoreElements + topNumData[imageIdx]; 354 | } 355 | 356 | template 357 | __global__ void EfficientNMSFilter(EfficientNMSParameters param, const T* __restrict__ scoresInput, 358 | int* __restrict__ topNumData, int* __restrict__ topIndexData, int* __restrict__ topAnchorsData, 359 | T* __restrict__ topScoresData, int* __restrict__ topClassData) 360 | { 361 | int elementIdx = blockDim.x * blockIdx.x + threadIdx.x; 362 | int imageIdx = blockDim.y * blockIdx.y + threadIdx.y; 363 | 364 | // Boundary Conditions 365 | if (elementIdx >= param.numScoreElements || imageIdx >= param.batchSize) 366 | { 367 | return; 368 | } 369 | 370 | // Shape of scoresInput: [batchSize, numAnchors, numClasses] 371 | int scoresInputIdx = imageIdx * param.numScoreElements + elementIdx; 372 | 373 | // For each class, check its corresponding score if it crosses the threshold, and if so select this anchor, 374 | // and keep track of the maximum score and the corresponding (argmax) class id 375 | T score = scoresInput[scoresInputIdx]; 376 | if (gte_mp(score, (T) param.scoreThreshold)) 377 | { 378 | // Unpack the class and anchor index from the element index 379 | int classIdx = elementIdx % 
param.numClasses; 380 | int anchorIdx = elementIdx / param.numClasses; 381 | 382 | // If this is a background class, ignore it. 383 | if (classIdx == param.backgroundClass) 384 | { 385 | return; 386 | } 387 | 388 | // Use an atomic to find an open slot where to write the selected anchor data. 389 | if (topNumData[imageIdx] >= param.numScoreElements) 390 | { 391 | return; 392 | } 393 | int selectedIdx = atomicAdd((unsigned int*) &topNumData[imageIdx], 1); 394 | if (selectedIdx >= param.numScoreElements) 395 | { 396 | topNumData[imageIdx] = param.numScoreElements; 397 | return; 398 | } 399 | 400 | // Shape of topScoresData / topClassData: [batchSize, numScoreElements] 401 | int topIdx = imageIdx * param.numScoreElements + selectedIdx; 402 | 403 | if (param.scoreBits > 0) 404 | { 405 | score = add_mp(score, (T) 1); 406 | if (gt_mp(score, (T) (2.f - 1.f / 1024.f))) 407 | { 408 | // Ensure the incremented score fits in the mantissa without changing the exponent 409 | score = (2.f - 1.f / 1024.f); 410 | } 411 | } 412 | 413 | topIndexData[topIdx] = selectedIdx; 414 | topAnchorsData[topIdx] = anchorIdx; 415 | topScoresData[topIdx] = score; 416 | topClassData[topIdx] = classIdx; 417 | } 418 | } 419 | 420 | template 421 | __global__ void EfficientNMSDenseIndex(EfficientNMSParameters param, int* __restrict__ topNumData, 422 | int* __restrict__ topIndexData, int* __restrict__ topAnchorsData, int* __restrict__ topOffsetsStartData, 423 | int* __restrict__ topOffsetsEndData, T* __restrict__ topScoresData, int* __restrict__ topClassData) 424 | { 425 | int elementIdx = blockDim.x * blockIdx.x + threadIdx.x; 426 | int imageIdx = blockDim.y * blockIdx.y + threadIdx.y; 427 | 428 | if (elementIdx >= param.numScoreElements || imageIdx >= param.batchSize) 429 | { 430 | return; 431 | } 432 | 433 | int dataIdx = imageIdx * param.numScoreElements + elementIdx; 434 | int anchorIdx = elementIdx / param.numClasses; 435 | int classIdx = elementIdx % param.numClasses; 436 | if (param.scoreBits > 
0) 437 | { 438 | T score = topScoresData[dataIdx]; 439 | if (lt_mp(score, (T) param.scoreThreshold)) 440 | { 441 | score = (T) 1; 442 | } 443 | else if (classIdx == param.backgroundClass) 444 | { 445 | score = (T) 1; 446 | } 447 | else 448 | { 449 | score = add_mp(score, (T) 1); 450 | if (gt_mp(score, (T) (2.f - 1.f / 1024.f))) 451 | { 452 | // Ensure the incremented score fits in the mantissa without changing the exponent 453 | score = (2.f - 1.f / 1024.f); 454 | } 455 | } 456 | topScoresData[dataIdx] = score; 457 | } 458 | else 459 | { 460 | T score = topScoresData[dataIdx]; 461 | if (lt_mp(score, (T) param.scoreThreshold)) 462 | { 463 | topScoresData[dataIdx] = -(1 << 15); 464 | } 465 | else if (classIdx == param.backgroundClass) 466 | { 467 | topScoresData[dataIdx] = -(1 << 15); 468 | } 469 | } 470 | 471 | topIndexData[dataIdx] = elementIdx; 472 | topAnchorsData[dataIdx] = anchorIdx; 473 | topClassData[dataIdx] = classIdx; 474 | 475 | if (elementIdx == 0) 476 | { 477 | // Saturate counters 478 | topNumData[imageIdx] = param.numScoreElements; 479 | topOffsetsStartData[imageIdx] = imageIdx * param.numScoreElements; 480 | topOffsetsEndData[imageIdx] = (imageIdx + 1) * param.numScoreElements; 481 | } 482 | } 483 | 484 | template 485 | cudaError_t EfficientNMSFilterLauncher(EfficientNMSParameters& param, const T* scoresInput, int* topNumData, 486 | int* topIndexData, int* topAnchorsData, int* topOffsetsStartData, int* topOffsetsEndData, T* topScoresData, 487 | int* topClassData, cudaStream_t stream) 488 | { 489 | const unsigned int elementsPerBlock = 512; 490 | const unsigned int imagesPerBlock = 1; 491 | const unsigned int elementBlocks = (param.numScoreElements + elementsPerBlock - 1) / elementsPerBlock; 492 | const unsigned int imageBlocks = (param.batchSize + imagesPerBlock - 1) / imagesPerBlock; 493 | const dim3 blockSize = {elementsPerBlock, imagesPerBlock, 1}; 494 | const dim3 gridSize = {elementBlocks, imageBlocks, 1}; 495 | 496 | float kernelSelectThreshold 
= 0.007f; 497 | if (param.scoreSigmoid) 498 | { 499 | // Inverse Sigmoid 500 | if (param.scoreThreshold <= 0.f) 501 | { 502 | param.scoreThreshold = -(1 << 15); 503 | } 504 | else 505 | { 506 | param.scoreThreshold = logf(param.scoreThreshold / (1.f - param.scoreThreshold)); 507 | } 508 | kernelSelectThreshold = logf(kernelSelectThreshold / (1.f - kernelSelectThreshold)); 509 | // Disable Score Bits Optimization 510 | param.scoreBits = -1; 511 | } 512 | 513 | if (param.scoreThreshold < kernelSelectThreshold) 514 | { 515 | // A full copy of the buffer is necessary because sorting will scramble the input data otherwise. 516 | PLUGIN_CHECK_CUDA(cudaMemcpyAsync(topScoresData, scoresInput, 517 | param.batchSize * param.numScoreElements * sizeof(T), cudaMemcpyDeviceToDevice, stream)); 518 | 519 | EfficientNMSDenseIndex<<>>(param, topNumData, topIndexData, topAnchorsData, 520 | topOffsetsStartData, topOffsetsEndData, topScoresData, topClassData); 521 | } 522 | else 523 | { 524 | EfficientNMSFilter<<>>( 525 | param, scoresInput, topNumData, topIndexData, topAnchorsData, topScoresData, topClassData); 526 | 527 | EfficientNMSFilterSegments<<<1, param.batchSize, 0, stream>>>( 528 | param, topNumData, topOffsetsStartData, topOffsetsEndData); 529 | } 530 | 531 | return cudaGetLastError(); 532 | } 533 | 534 | template 535 | size_t EfficientNMSSortWorkspaceSize(int batchSize, int numScoreElements) 536 | { 537 | size_t sortedWorkspaceSize = 0; 538 | cub::DoubleBuffer keysDB(nullptr, nullptr); 539 | cub::DoubleBuffer valuesDB(nullptr, nullptr); 540 | cub::DeviceSegmentedRadixSort::SortPairsDescending(nullptr, sortedWorkspaceSize, keysDB, valuesDB, 541 | numScoreElements, batchSize, (const int*) nullptr, (const int*) nullptr); 542 | return sortedWorkspaceSize; 543 | } 544 | 545 | size_t EfficientNMSWorkspaceSize(int batchSize, int numScoreElements, int numClasses, DataType datatype) 546 | { 547 | size_t total = 0; 548 | const size_t align = 256; 549 | // Counters 550 | // 3 for 
Filtering 551 | // 1 for Output Indexing 552 | // C for Max per Class Limiting 553 | size_t size = (3 + 1 + numClasses) * batchSize * sizeof(int); 554 | total += size + (size % align ? align - (size % align) : 0); 555 | // Int Buffers 556 | for (int i = 0; i < 4; i++) 557 | { 558 | size = batchSize * numScoreElements * sizeof(int); 559 | total += size + (size % align ? align - (size % align) : 0); 560 | } 561 | // Float Buffers 562 | for (int i = 0; i < 2; i++) 563 | { 564 | size = batchSize * numScoreElements * dataTypeSize(datatype); 565 | total += size + (size % align ? align - (size % align) : 0); 566 | } 567 | // Sort Workspace 568 | if (datatype == DataType::kHALF) 569 | { 570 | size = EfficientNMSSortWorkspaceSize<__half>(batchSize, numScoreElements); 571 | total += size + (size % align ? align - (size % align) : 0); 572 | } 573 | else if (datatype == DataType::kFLOAT) 574 | { 575 | size = EfficientNMSSortWorkspaceSize(batchSize, numScoreElements); 576 | total += size + (size % align ? align - (size % align) : 0); 577 | } 578 | 579 | return total; 580 | } 581 | 582 | template 583 | T* EfficientNMSWorkspace(void* workspace, size_t& offset, size_t elements) 584 | { 585 | T* buffer = (T*) ((size_t) workspace + offset); 586 | size_t align = 256; 587 | size_t size = elements * sizeof(T); 588 | size_t sizeAligned = size + (size % align ? 
align - (size % align) : 0); 589 | offset += sizeAligned; 590 | return buffer; 591 | } 592 | 593 | template 594 | pluginStatus_t EfficientNMSDispatch(EfficientNMSParameters param, const void* boxesInput, const void* scoresInput, 595 | const void* anchorsInput, void* numDetectionsOutput, void* nmsBoxesOutput, void* nmsScoresOutput, 596 | void* nmsClassesOutput, void* nmsIndicesOutput, void* workspace, cudaStream_t stream) 597 | { 598 | // Clear Outputs (not all elements will get overwritten by the kernels, so safer to clear everything out) 599 | 600 | CSC(cudaMemsetAsync(numDetectionsOutput, 0x00, param.batchSize * sizeof(int), stream), STATUS_FAILURE); 601 | CSC(cudaMemsetAsync(nmsScoresOutput, 0x00, param.batchSize * param.numOutputBoxes * sizeof(T), stream), STATUS_FAILURE); 602 | CSC(cudaMemsetAsync(nmsBoxesOutput, 0x00, param.batchSize * param.numOutputBoxes * 4 * sizeof(T), stream), STATUS_FAILURE); 603 | CSC(cudaMemsetAsync(nmsClassesOutput, 0x00, param.batchSize * param.numOutputBoxes * sizeof(int), stream), STATUS_FAILURE); 604 | CSC(cudaMemsetAsync(nmsIndicesOutput, 0xFF, param.batchSize * param.numOutputBoxes * sizeof(int), stream), STATUS_FAILURE); 605 | 606 | // Empty Inputs 607 | if (param.numScoreElements < 1) 608 | { 609 | return STATUS_SUCCESS; 610 | } 611 | 612 | // Counters Workspace 613 | size_t workspaceOffset = 0; 614 | int countersTotalSize = (3 + 1 + param.numClasses) * param.batchSize; 615 | int* topNumData = EfficientNMSWorkspace(workspace, workspaceOffset, countersTotalSize); 616 | int* topOffsetsStartData = topNumData + param.batchSize; 617 | int* topOffsetsEndData = topNumData + 2 * param.batchSize; 618 | int* outputIndexData = topNumData + 3 * param.batchSize; 619 | int* outputClassData = topNumData + 4 * param.batchSize; 620 | CSC(cudaMemsetAsync(topNumData, 0x00, countersTotalSize * sizeof(int), stream), STATUS_FAILURE); 621 | cudaError_t status = cudaGetLastError(); 622 | CSC(status, STATUS_FAILURE); 623 | 624 | // Other Buffers 
Workspace 625 | int* topIndexData 626 | = EfficientNMSWorkspace(workspace, workspaceOffset, param.batchSize * param.numScoreElements); 627 | int* topClassData 628 | = EfficientNMSWorkspace(workspace, workspaceOffset, param.batchSize * param.numScoreElements); 629 | int* topAnchorsData 630 | = EfficientNMSWorkspace(workspace, workspaceOffset, param.batchSize * param.numScoreElements); 631 | int* sortedIndexData 632 | = EfficientNMSWorkspace(workspace, workspaceOffset, param.batchSize * param.numScoreElements); 633 | T* topScoresData = EfficientNMSWorkspace(workspace, workspaceOffset, param.batchSize * param.numScoreElements); 634 | T* sortedScoresData 635 | = EfficientNMSWorkspace(workspace, workspaceOffset, param.batchSize * param.numScoreElements); 636 | size_t sortedWorkspaceSize = EfficientNMSSortWorkspaceSize(param.batchSize, param.numScoreElements); 637 | char* sortedWorkspaceData = EfficientNMSWorkspace(workspace, workspaceOffset, sortedWorkspaceSize); 638 | cub::DoubleBuffer scoresDB(topScoresData, sortedScoresData); 639 | cub::DoubleBuffer indexDB(topIndexData, sortedIndexData); 640 | 641 | // Kernels 642 | status = EfficientNMSFilterLauncher(param, (T*) scoresInput, topNumData, topIndexData, topAnchorsData, 643 | topOffsetsStartData, topOffsetsEndData, topScoresData, topClassData, stream); 644 | CSC(status, STATUS_FAILURE); 645 | 646 | status = cub::DeviceSegmentedRadixSort::SortPairsDescending(sortedWorkspaceData, sortedWorkspaceSize, scoresDB, 647 | indexDB, param.batchSize * param.numScoreElements, param.batchSize, topOffsetsStartData, topOffsetsEndData, 648 | param.scoreBits > 0 ? (10 - param.scoreBits) : 0, param.scoreBits > 0 ? 
10 : sizeof(T) * 8, stream); 649 | CSC(status, STATUS_FAILURE); 650 | 651 | status = EfficientNMSLauncher(param, topNumData, outputIndexData, outputClassData, indexDB.Current(), 652 | scoresDB.Current(), topClassData, topAnchorsData, boxesInput, anchorsInput, (int*) numDetectionsOutput, 653 | (T*) nmsScoresOutput, (int*) nmsClassesOutput, (int*) nmsIndicesOutput, nmsBoxesOutput, stream); 654 | CSC(status, STATUS_FAILURE); 655 | 656 | return STATUS_SUCCESS; 657 | } 658 | 659 | pluginStatus_t EfficientNMSInference(EfficientNMSParameters param, const void* boxesInput, const void* scoresInput, 660 | const void* anchorsInput, void* numDetectionsOutput, void* nmsBoxesOutput, void* nmsScoresOutput, 661 | void* nmsClassesOutput, void* nmsIndicesOutput, void* workspace, cudaStream_t stream) 662 | { 663 | if (param.datatype == DataType::kFLOAT) 664 | { 665 | param.scoreBits = -1; 666 | return EfficientNMSDispatch(param, boxesInput, scoresInput, anchorsInput, numDetectionsOutput, 667 | nmsBoxesOutput, nmsScoresOutput, nmsClassesOutput, nmsIndicesOutput, workspace, stream); 668 | } 669 | else if (param.datatype == DataType::kHALF) 670 | { 671 | if (param.scoreBits <= 0 || param.scoreBits > 10) 672 | { 673 | param.scoreBits = -1; 674 | } 675 | return EfficientNMSDispatch<__half>(param, boxesInput, scoresInput, anchorsInput, numDetectionsOutput, 676 | nmsBoxesOutput, nmsScoresOutput, nmsClassesOutput, nmsIndicesOutput, workspace, stream); 677 | } 678 | else 679 | { 680 | return STATUS_NOT_SUPPORTED; 681 | } 682 | } 683 | --------------------------------------------------------------------------------