├── Deeplearning
├── ComputerVision
│ ├── README.md
│ └── docs
│ │ ├── gated_convolution.md
│ │ ├── multihead_attn.md
│ │ └── resblock.md
└── NLP
│ └── README.md
├── Deploy
├── Deepstream
│ ├── FAQ.md
│ ├── README.md
│ ├── sample-ALPR
│ │ ├── README.md
│ │ ├── config_deepstream.txt
│ │ ├── config_lpd.txt
│ │ ├── config_lpr.txt
│ │ ├── config_tracker.txt
│ │ ├── config_vehicletype.txt
│ │ ├── config_yolov4.txt
│ │ ├── dict.txt
│ │ ├── fig
│ │ │ ├── lpr_pipeline.png
│ │ │ ├── lpr_result1.png
│ │ │ └── lpr_result2.png
│ │ ├── labels.txt
│ │ ├── nvdsinfer_custom_impl_Yolo
│ │ │ ├── Makefile
│ │ │ └── nvdsparsebbox_Yolo.cpp
│ │ └── weights
│ │ │ ├── README.md
│ │ │ ├── license-plate-detection
│ │ │ │ └── labels.txt
│ │ │ ├── license-plate-recognition
│ │ │ │ └── labels.txt
│ │ │ └── vehicletypenet
│ │ │ │ └── labels.txt
│ ├── sample-scrfd
│ │ ├── README.md
│ │ ├── config_scrfd.txt
│ │ ├── nvdsinfer_custom_impl_Yolo
│ │ │ ├── CMakeLists.txt
│ │ │ ├── Makefile
│ │ │ ├── README.md
│ │ │ ├── batchedNMSCustomInference.cu
│ │ │ ├── batchedNMSCustomPlugin.cpp
│ │ │ ├── batchedNMSCustomPlugin.h
│ │ │ ├── batchedNMSCustomPlugin.o
│ │ │ ├── cmake
│ │ │ │ └── set_ifndef.cmake
│ │ │ ├── common
│ │ │ │ ├── ErrorRecorder.h
│ │ │ │ ├── bboxUtils.h
│ │ │ │ ├── checkMacrosPlugin.cpp
│ │ │ │ ├── checkMacrosPlugin.h
│ │ │ │ ├── common.cuh
│ │ │ │ ├── cub_helper.h
│ │ │ │ ├── cudaDriverWrapper.cpp
│ │ │ │ ├── cudaDriverWrapper.h
│ │ │ │ ├── half.h
│ │ │ │ ├── kernel.cpp
│ │ │ │ ├── kernel.h
│ │ │ │ ├── kernels
│ │ │ │ │ ├── CMakeLists.txt
│ │ │ │ │ ├── allClassNMS.cu
│ │ │ │ │ ├── common.cu
│ │ │ │ │ ├── decodeBBoxes.cu
│ │ │ │ │ ├── nmsLayer.cu
│ │ │ │ │ ├── permuteData.cu
│ │ │ │ │ ├── reducedMathPlugin.h
│ │ │ │ │ ├── sortScoresPerClass.cu
│ │ │ │ │ └── sortScoresPerImage.cu
│ │ │ │ ├── logger.cpp
│ │ │ │ ├── logger.h
│ │ │ │ ├── logging.h
│ │ │ │ ├── nmsHelper.cpp
│ │ │ │ ├── nmsUtils.h
│ │ │ │ ├── plugin.h
│ │ │ │ ├── reducedMathPlugin.cpp
│ │ │ │ └── serialize.hpp
│ │ │ ├── gatherNMSCustomOutputs.cu
│ │ │ ├── gatherNMSCustomOutputs.h
│ │ │ ├── nvdsparsebbox_Yolo.cpp
│ │ │ └── nvdsparsebbox_Yolo.o
│ │ ├── parser_scrfd.py
│ │ └── run_scrfd.py
│ └── sample-yolov4
│ │ ├── config_deepstream.txt
│ │ ├── config_tracker.txt
│ │ ├── config_yolov4.txt
│ │ ├── exec_backends
│ │ │ ├── __pycache__
│ │ │ │ └── trt_backend.cpython-36.pyc
│ │ │ └── trt_backend.py
│ │ ├── labels.txt
│ │ ├── nvdsinfer_custom_impl_Yolo
│ │ │ ├── Makefile
│ │ │ └── nvdsparsebbox_Yolo.cpp
│ │ ├── run_yolov4.py
│ │ ├── test_images
│ │ │ └── test.png
│ │ ├── test_onnx.py
│ │ └── tools
│ │ │ └── add_nms_plugins.py
├── NVIDIA
│ ├── README.md
│ ├── docs
│ │ ├── multi_instance_gpu.md
│ │ └── nvidia_video_sdk.md
│ └── fig
│ │ ├── gpu-mig-overview.jpg
│ │ ├── mig_bert.png
│ │ └── support_nvenc_nvdec.png
├── README.md
├── Transfer-Learning-Toolkit
│ ├── README.md
│ ├── docs
│ │ ├── detectnet_v2.md
│ │ └── yolov4.md
│ └── fig
│ │ ├── detectnet_v2-inference.jpg
│ │ ├── nvidia-retrain-qat.png
│ │ └── yolov4-inference.png
└── Triton-inference-server
│ ├── README.md
│ ├── docs
│ │ ├── backend.md
│ │ ├── install.md
│ │ ├── model_batching.md
│ │ ├── model_configuration.md
│ │ ├── model_ensemble.md
│ │ ├── model_instance.md
│ │ ├── model_management.md
│ │ ├── optimization_pytorch.md
│ │ ├── perf_analyzer.md
│ │ ├── triton_kaldi.md
│ │ ├── triton_onnx.md
│ │ ├── triton_pytorch.md
│ │ └── triton_tensorrt.md
│ ├── fig
│ │ ├── multi_model_exec.png
│ │ ├── multi_model_parallel_exec.png
│ │ ├── multi_model_serial_exec.png
│ │ ├── wav2vec_general_perf_onnx.jpg
│ │ ├── wav2vec_general_perf_tensorrt.jpg
│ │ └── wav2vec_general_start.jpg
│ └── src
│ │ ├── sample_grpc.py
│ │ └── sample_load_unload.py
├── Framework
├── ONNX
│ └── README.md
├── Pytorch
│ ├── README.md
│ └── docs
│ │ └── build_from_source.md
├── TensorRT
│ ├── README.md
│ ├── docs
│ │ └── tutorial.md
│ └── fig
│ │ └── sample_netron_scrfd.png
└── Tensorflow
│ └── README.md
├── Linux
├── README.md
└── docs
│ └── build_opencv.md
└── README.md
/Deeplearning/ComputerVision/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deeplearning/ComputerVision/README.md
--------------------------------------------------------------------------------
/Deeplearning/ComputerVision/docs/gated_convolution.md:
--------------------------------------------------------------------------------
1 | ## Gated Convolution
2 |
3 | ### 1. Explanation
4 | ### 2. PyTorch Implementation
5 | ```
6 | import torch
7 | import torch.nn as nn
8 | import torch.nn.functional as F
9 |
10 | class GatedConv2dWithActivation(torch.nn.Module):
11 | """
12 |     Gated Convolution layer with activation (default activation: LeakyReLU)
13 | Params: same as conv2d
14 | Input: The feature from last layer "I"
15 | Output:\phi(f(I))*\sigmoid(g(I))
16 | """
17 |
18 | def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True,batch_norm=True, activation=torch.nn.LeakyReLU(0.2, inplace=True)):
19 | super(GatedConv2dWithActivation, self).__init__()
20 | self.batch_norm = batch_norm
21 | self.activation = activation
22 | self.conv2d = torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, groups, bias)
23 | self.mask_conv2d = torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, groups, bias)
24 | self.batch_norm2d = torch.nn.BatchNorm2d(out_channels)
25 | self.sigmoid = torch.nn.Sigmoid()
26 |
27 | for m in self.modules():
28 | if isinstance(m, nn.Conv2d):
29 | nn.init.kaiming_normal_(m.weight)
30 | def gated(self, mask):
31 | return self.sigmoid(mask)
32 | def forward(self, input):
33 | x = self.conv2d(input)
34 | mask = self.mask_conv2d(input)
35 | if self.activation is not None:
36 | x = self.activation(x) * self.gated(mask)
37 | else:
38 | x = x * self.gated(mask)
39 | if self.batch_norm:
40 | return self.batch_norm2d(x)
41 | else:
42 | return x
43 |
44 | class GatedDeConv2dWithActivation(torch.nn.Module):
45 | """
46 |     Gated Deconvolution layer with activation (default activation: LeakyReLU)
47 | resize + conv
48 | Params: same as conv2d
49 | Input: The feature from last layer "I"
50 | Output:\phi(f(I))*\sigmoid(g(I))
51 | """
52 | def __init__(self, scale_factor, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True, batch_norm=True,activation=torch.nn.LeakyReLU(0.2, inplace=True)):
53 | super(GatedDeConv2dWithActivation, self).__init__()
54 | self.conv2d = GatedConv2dWithActivation(in_channels, out_channels, kernel_size, stride, padding, dilation, groups, bias, batch_norm, activation)
55 | self.scale_factor = scale_factor
56 |
57 | def forward(self, input):
58 | #print(input.size())
59 |         x = F.interpolate(input, scale_factor=self.scale_factor)
60 | return self.conv2d(x)
61 |
62 | class SNGatedConv2dWithActivation(torch.nn.Module):
63 | """
64 |     Gated Convolution with spectral normalization
65 | """
66 | def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True, batch_norm=True, activation=torch.nn.LeakyReLU(0.2, inplace=True)):
67 | super(SNGatedConv2dWithActivation, self).__init__()
68 | self.conv2d = torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, groups, bias)
69 | self.mask_conv2d = torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, groups, bias)
70 | self.activation = activation
71 | self.batch_norm = batch_norm
72 | self.batch_norm2d = torch.nn.BatchNorm2d(out_channels)
73 | self.sigmoid = torch.nn.Sigmoid()
74 | self.conv2d = torch.nn.utils.spectral_norm(self.conv2d)
75 | self.mask_conv2d = torch.nn.utils.spectral_norm(self.mask_conv2d)
76 | for m in self.modules():
77 | if isinstance(m, nn.Conv2d):
78 | nn.init.kaiming_normal_(m.weight)
79 |
80 | def gated(self, mask):
81 | return self.sigmoid(mask)
82 |
83 | def forward(self, input):
84 | x = self.conv2d(input)
85 | mask = self.mask_conv2d(input)
86 | if self.activation is not None:
87 | x = self.activation(x) * self.gated(mask)
88 | else:
89 | x = x * self.gated(mask)
90 | if self.batch_norm:
91 | return self.batch_norm2d(x)
92 | else:
93 | return x
94 |
95 | class SNGatedDeConv2dWithActivation(torch.nn.Module):
96 | """
97 |     Gated Deconvolution layer with activation (default activation: LeakyReLU) and spectral normalization
98 | resize + conv
99 | Params: same as conv2d
100 | Input: The feature from last layer "I"
101 | Output:\phi(f(I))*\sigmoid(g(I))
102 | """
103 | def __init__(self, scale_factor, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True, batch_norm=True, activation=torch.nn.LeakyReLU(0.2, inplace=True)):
104 | super(SNGatedDeConv2dWithActivation, self).__init__()
105 | self.conv2d = SNGatedConv2dWithActivation(in_channels, out_channels, kernel_size, stride, padding, dilation, groups, bias, batch_norm, activation)
106 | self.scale_factor = scale_factor
107 |
108 | def forward(self, input):
109 | #print(input.size())
110 |         x = F.interpolate(input, scale_factor=self.scale_factor)
111 | return self.conv2d(x)
112 | ```
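113 |
114 | ### 3. Usage example
115 |
116 | A minimal smoke test for the blocks above (shapes and hyper-parameters are illustrative, not taken from any particular model): each gated layer computes `phi(f(I)) * sigmoid(g(I))`, so the output has the same spatial size as a plain convolution with the same kernel/stride/padding, while the deconvolution variant first upsamples the input by `scale_factor`.
117 | ```
118 | import torch
119 |
120 | # assumes the classes defined above are in scope
121 | x = torch.randn(2, 32, 64, 64)   # (batch, channels, H, W)
122 |
123 | conv = GatedConv2dWithActivation(32, 64, kernel_size=3, stride=1, padding=1)
124 | y = conv(x)
125 | print(y.shape)                   # torch.Size([2, 64, 64, 64])
126 |
127 | deconv = GatedDeConv2dWithActivation(scale_factor=2, in_channels=64, out_channels=32,
128 |                                      kernel_size=3, stride=1, padding=1)
129 | z = deconv(y)
130 | print(z.shape)                   # torch.Size([2, 32, 128, 128])
131 | ```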
--------------------------------------------------------------------------------
/Deeplearning/ComputerVision/docs/multihead_attn.md:
--------------------------------------------------------------------------------
1 | ## Multi-head Attention Block
2 |
3 | ### 1. Explanation
4 | ### 2. PyTorch Implementation
5 | ```
6 | import torch
7 | import torch.nn as nn
8 | import torch.nn.functional as F
9 |
10 | def Normalize(in_channels):
11 | return torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
12 |
13 | class MultiHeadAttnBlock(nn.Module):
14 | def __init__(self, in_channels, head_size=1):
15 | super().__init__()
16 | self.in_channels = in_channels
17 | self.head_size = head_size
18 | self.att_size = in_channels // head_size
19 |         assert in_channels % head_size == 0, 'in_channels must be divisible by head_size.'
20 |
21 | self.norm1 = Normalize(in_channels)
22 | self.norm2 = Normalize(in_channels)
23 |
24 | self.q = torch.nn.Conv2d(in_channels,
25 | in_channels,
26 | kernel_size=1,
27 | stride=1,
28 | padding=0)
29 | self.k = torch.nn.Conv2d(in_channels,
30 | in_channels,
31 | kernel_size=1,
32 | stride=1,
33 | padding=0)
34 | self.v = torch.nn.Conv2d(in_channels,
35 | in_channels,
36 | kernel_size=1,
37 | stride=1,
38 | padding=0)
39 | self.proj_out = torch.nn.Conv2d(in_channels,
40 | in_channels,
41 | kernel_size=1,
42 | stride=1,
43 | padding=0)
44 | self.num = 0
45 |
46 | def forward(self, x, y=None):
47 | h_ = x
48 | h_ = self.norm1(h_)
49 | if y is None:
50 | y = h_
51 | else:
52 | y = self.norm2(y)
53 |
54 | q = self.q(y)
55 | k = self.k(h_)
56 | v = self.v(h_)
57 |
58 | # compute attention
59 | b,c,h,w = q.shape
60 | q = q.reshape(b, self.head_size, self.att_size ,h*w)
61 | q = q.permute(0, 3, 1, 2) # b, hw, head, att
62 |
63 | k = k.reshape(b, self.head_size, self.att_size ,h*w)
64 | k = k.permute(0, 3, 1, 2)
65 |
66 | v = v.reshape(b, self.head_size, self.att_size ,h*w)
67 | v = v.permute(0, 3, 1, 2)
68 |
69 |
70 | q = q.transpose(1, 2)
71 | v = v.transpose(1, 2)
72 | k = k.transpose(1, 2).transpose(2,3)
73 |
74 | scale = int(self.att_size)**(-0.5)
75 | q.mul_(scale)
76 | w_ = torch.matmul(q, k)
77 | w_ = F.softmax(w_, dim=3)
78 |
79 | w_ = w_.matmul(v)
80 |
81 | w_ = w_.transpose(1, 2).contiguous() # [b, h*w, head, att]
82 | w_ = w_.view(b, h, w, -1)
83 | w_ = w_.permute(0, 3, 1, 2)
84 |
85 | w_ = self.proj_out(w_)
86 |
87 | return x+w_
88 | ```
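89 |
90 | ### 3. Usage example
91 |
92 | A minimal sketch of calling the block above (tensor sizes are illustrative): `in_channels` must be divisible by `head_size` and by the 32 groups used in `Normalize`. The block treats the `h*w` spatial positions as tokens, runs scaled dot-product attention per head, and returns `x + attention_output` (a residual connection). Passing a second tensor `y` turns it into cross-attention, with queries computed from `y` and keys/values from `x`.
93 | ```
94 | import torch
95 |
96 | # assumes MultiHeadAttnBlock and Normalize from above are in scope
97 | attn = MultiHeadAttnBlock(in_channels=64, head_size=8)
98 | x = torch.randn(2, 64, 16, 16)   # (batch, channels, H, W)
99 |
100 | out = attn(x)                    # self-attention
101 | print(out.shape)                 # torch.Size([2, 64, 16, 16])
102 |
103 | y = torch.randn(2, 64, 16, 16)
104 | out2 = attn(x, y)                # cross-attention: queries come from y
105 | print(out2.shape)                # torch.Size([2, 64, 16, 16])
106 | ```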
--------------------------------------------------------------------------------
/Deeplearning/ComputerVision/docs/resblock.md:
--------------------------------------------------------------------------------
1 | ## Resblock
2 |
3 | ### 1. Explanation
4 | ### 2. PyTorch Implementation
5 | ```
6 | import torch
7 | import torch.nn as nn
8 | import torch.nn.functional as F
9 |
10 | def Normalize(in_channels):
11 | return torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
12 |
13 | class ResnetBlock(nn.Module):
14 | def __init__(self, *, in_channels, out_channels=None, conv_shortcut=False,
15 | dropout, temb_channels=512):
16 | super().__init__()
17 | self.nonlinearity = torch.nn.LeakyReLU(0.2)
18 | self.in_channels = in_channels
19 | out_channels = in_channels if out_channels is None else out_channels
20 | self.out_channels = out_channels
21 | self.use_conv_shortcut = conv_shortcut
22 |
23 | self.norm1 = Normalize(in_channels)
24 | self.conv1 = torch.nn.Conv2d(in_channels,
25 | out_channels,
26 | kernel_size=3,
27 | stride=1,
28 | padding=1)
29 | if temb_channels > 0:
30 | self.temb_proj = torch.nn.Linear(temb_channels,
31 | out_channels)
32 | self.norm2 = Normalize(out_channels)
33 | self.dropout = torch.nn.Dropout(dropout)
34 | self.conv2 = torch.nn.Conv2d(out_channels,
35 | out_channels,
36 | kernel_size=3,
37 | stride=1,
38 | padding=1)
39 | if self.in_channels != self.out_channels:
40 | if self.use_conv_shortcut:
41 | self.conv_shortcut = torch.nn.Conv2d(in_channels,
42 | out_channels,
43 | kernel_size=3,
44 | stride=1,
45 | padding=1)
46 | else:
47 | self.nin_shortcut = torch.nn.Conv2d(in_channels,
48 | out_channels,
49 | kernel_size=1,
50 | stride=1,
51 | padding=0)
52 |
53 |
54 | def forward(self, x, temb):
55 | h = x
56 | h = self.norm1(h)
57 | h = self.nonlinearity(h)
58 | h = self.conv1(h)
59 |
60 | if temb is not None:
61 |             h = h + self.temb_proj(self.nonlinearity(temb))[:,:,None,None]
62 |
63 | h = self.norm2(h)
64 | h = self.nonlinearity(h)
65 | h = self.dropout(h)
66 | h = self.conv2(h)
67 |
68 | if self.in_channels != self.out_channels:
69 | if self.use_conv_shortcut:
70 | x = self.conv_shortcut(x)
71 | else:
72 | x = self.nin_shortcut(x)
73 |
74 | return x+h
75 | ```
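76 |
77 | ### 3. Usage example
78 |
79 | A minimal sketch of calling the block above (sizes are illustrative): the block applies GroupNorm -> LeakyReLU -> 3x3 conv twice, optionally injects a projected timestep embedding `temb` between the two convolutions, and adds a 1x1 (or 3x3) projection of the input when `in_channels != out_channels`, returning `x + h`.
80 | ```
81 | import torch
82 |
83 | # assumes ResnetBlock and Normalize from above are in scope
84 | block = ResnetBlock(in_channels=64, out_channels=128, dropout=0.1, temb_channels=512)
85 |
86 | x = torch.randn(2, 64, 32, 32)   # (batch, in_channels, H, W)
87 | temb = torch.randn(2, 512)       # timestep/conditioning embedding
88 |
89 | out = block(x, temb)
90 | print(out.shape)                 # torch.Size([2, 128, 32, 32])
91 |
92 | out_no_temb = block(x, None)     # the embedding is optional
93 | print(out_no_temb.shape)         # torch.Size([2, 128, 32, 32])
94 | ```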
--------------------------------------------------------------------------------
/Deeplearning/NLP/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deeplearning/NLP/README.md
--------------------------------------------------------------------------------
/Deploy/Deepstream/FAQ.md:
--------------------------------------------------------------------------------
1 | ## FAQ about Deepstream
2 |
3 |
--------------------------------------------------------------------------------
/Deploy/Deepstream/README.md:
--------------------------------------------------------------------------------
1 | ## 1. Requirements
2 | ```
3 | sudo apt install libgirepository1.0-dev libgstreamer1.0-dev
4 | ```
5 | ```
6 | sudo apt install \
7 | libssl1.0.0 \
8 | libgstreamer1.0-0 \
9 | gstreamer1.0-tools \
10 | gstreamer1.0-plugins-good \
11 | gstreamer1.0-plugins-bad \
12 | gstreamer1.0-plugins-ugly \
13 | gstreamer1.0-libav \
14 | libgstrtspserver-1.0-0 \
15 | libjansson4=2.11-1
16 | ```
17 | ## 2. Examples
18 | - [Sample Yolov4](sample-yolov4)
19 | - [Sample ALPR](sample-ALPR)
20 | - [Sample SCRFD Face Detection](sample-scrfd)
21 |
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-ALPR/README.md:
--------------------------------------------------------------------------------
1 | # Deepstream ALPR
2 |
3 |
4 |
5 |
6 |
7 | ## 1. Requirements
8 | - Deepstream 6.0
9 |
10 | ## 2. Run demo
11 | ```
12 | cd nvdsinfer_custom_impl_Yolo
13 | make
14 | cd ..
15 | deepstream-app -c config_deepstream.txt
16 | ```
17 |
18 |
19 |
20 |
21 |
22 | ## 3. Models
23 | ### 3.1 Object detection
24 | - Use the Darknet COCO yolov4-608x608 model
25 | - Convert it to ONNX
26 | - [Add the NMS plugin](../sample-yolov4/tools/add_nms_plugins.py) (a sketch of this step is included at the end of this README)
27 | - Customized parser: **NvDsInferParseCustomYoloV4**
28 |
29 | ### 3.2 Vehicle Type Net
30 | - Use the ResNet18 classification model from NVIDIA TAO
31 | - Perform training, pruning & INT8 quantization
32 |
33 | ### 3.3 License Plate Detection
34 | - Use the YOLOv4 model from NVIDIA TAO
35 | - Perform training, pruning & INT8 quantization
36 | - Customized parser: **NvDsInferParseCustomYoloV4TLT**
37 |
38 | ### 3.4 License Plate Recognition
39 | - Use the YOLOv4 model from NVIDIA TAO
40 | - Perform training, pruning & INT8 quantization
41 | - Customized parser: **NvDsInferParseCustomYoloV4LPR** (sorts the detected characters and assigns the result to **attributeLabel**; see the sketch at the end of this README)
42 |
43 | ## References
44 | - https://github.com/NVIDIA-AI-IOT/deepstream_lpr_app
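45 |
46 | ## Appendix: illustrative sketches
47 |
48 | A minimal sketch of the "add NMS plugin" step from section 3.1, using `onnx-graphsurgeon` to append a `BatchedNMSDynamic_TRT` node to the exported YOLOv4 graph. This is **not** the actual [add_nms_plugins.py](../sample-yolov4/tools/add_nms_plugins.py); tensor names, shapes, and attribute values below are assumptions that must be adapted to the real export (the thresholds mirror `config_yolov4.txt`).
49 | ```
50 | import numpy as np
51 | import onnx
52 | import onnx_graphsurgeon as gs
53 |
54 | graph = gs.import_onnx(onnx.load("yolov4_-1_3_608_608_dynamic.onnx"))
55 |
56 | # Assumption: the exported graph ends with two outputs,
57 | # boxes [batch, num_boxes, 1, 4] and scores [batch, num_boxes, num_classes]
58 | boxes, scores = graph.outputs
59 |
60 | keep_top_k = 200
61 | outputs = [
62 |     gs.Variable("num_detections", dtype=np.int32, shape=["batch", 1]),
63 |     gs.Variable("nmsed_boxes", dtype=np.float32, shape=["batch", keep_top_k, 4]),
64 |     gs.Variable("nmsed_scores", dtype=np.float32, shape=["batch", keep_top_k]),
65 |     gs.Variable("nmsed_classes", dtype=np.float32, shape=["batch", keep_top_k]),
66 | ]
67 |
68 | # Attributes of TensorRT's batchedNMSPlugin
69 | nms = gs.Node(
70 |     op="BatchedNMSDynamic_TRT",
71 |     attrs={
72 |         "shareLocation": True,
73 |         "backgroundLabelId": -1,
74 |         "numClasses": 80,
75 |         "topK": 1024,
76 |         "keepTopK": keep_top_k,
77 |         "scoreThreshold": 0.4,
78 |         "iouThreshold": 0.6,
79 |         "isNormalized": True,
80 |         "clipBoxes": True,
81 |     },
82 |     inputs=[boxes, scores],
83 |     outputs=outputs,
84 | )
85 | graph.nodes.append(nms)
86 | graph.outputs = outputs
87 | graph.cleanup().toposort()
88 | onnx.save(gs.export_onnx(graph), "yolov4_-1_3_608_608_dynamic.nms.onnx")
89 | ```
90 |
91 | For section 3.4, **NvDsInferParseCustomYoloV4LPR** is implemented in C++ inside `nvdsinfer_custom_impl_Yolo`, but its core idea can be sketched in a few lines: keep confident character detections, sort them left to right, and concatenate their labels into the plate string attached as **attributeLabel**. The sketch below is illustrative only and assumes a single-row plate; the 0.7 threshold mirrors `classifier-threshold` in `config_lpr.txt`.
92 | ```
93 | def assemble_plate(detections, labels, min_confidence=0.7):
94 |     """detections: list of (class_id, confidence, x_min, y_min, x_max, y_max)."""
95 |     kept = [d for d in detections if d[1] >= min_confidence]
96 |     kept.sort(key=lambda d: d[2])   # sort characters left-to-right by x_min
97 |     return "".join(labels[d[0]] for d in kept)
98 |
99 | # Characters as in weights/license-plate-recognition/labels.txt (truncated here)
100 | labels = ["A", "B", "C", "D", "E", "8", "F", "5", "4", "G"]
101 | dets = [(5, 0.90, 120.0, 10.0, 140.0, 40.0),   # "8"
102 |         (0, 0.95,  20.0, 12.0,  40.0, 41.0),   # "A"
103 |         (8, 0.85,  70.0, 11.0,  90.0, 40.0)]   # "4"
104 | print(assemble_plate(dets, labels))            # -> "A48"
105 | ```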
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-ALPR/config_deepstream.txt:
--------------------------------------------------------------------------------
1 | ################################################################################
2 | #
3 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 | ################################################################################
18 |
19 | [application]
20 | enable-perf-measurement=1
21 | perf-measurement-interval-sec=3
22 | #gie-kitti-output-dir=streamscl
23 |
24 | [tiled-display]
25 | enable=1
26 | rows=1
27 | columns=0
28 | width=1280
29 | height=720
30 | gpu-id=0
31 | #(0): nvbuf-mem-default - Default memory allocated, specific to particular platform
32 | #(1): nvbuf-mem-cuda-pinned - Allocate Pinned/Host cuda memory, applicable for Tesla
33 | #(2): nvbuf-mem-cuda-device - Allocate Device cuda memory, applicable for Tesla
34 | #(3): nvbuf-mem-cuda-unified - Allocate Unified cuda memory, applicable for Tesla
35 | #(4): nvbuf-mem-surface-array - Allocate Surface Array memory, applicable for Jetson
36 | nvbuf-memory-type=0
37 |
38 | [source0]
39 | enable=1
40 | #Type - 1=CameraV4L2 2=URI 3=MultiURI
41 | type=3
42 | uri=file:///home/damnguyen/Deploy/deepstream/videos/video_%d.mp4
43 | num-sources=1
44 | gpu-id=0
45 | #drop-frame-interval=2
46 | # (0): memtype_device - Memory type Device
47 | # (1): memtype_pinned - Memory type Host Pinned
48 | # (2): memtype_unified - Memory type Unified
49 | cudadec-memtype=0
50 |
51 | [sink0]
52 | enable=0
53 | #Type - 1=FakeSink 2=EglSink 3=File
54 | type=1
55 | sync=0
56 | source-id=0
57 | gpu-id=0
58 | qos=0
59 | nvbuf-memory-type=0
60 | overlay-id=1
61 |
62 | [sink1]
63 | enable=1
64 | type=3
65 | enc-type=1
66 | #1=mp4 2=mkv
67 | container=1
68 | #1=h264 2=h265
69 | codec=1
70 | sync=0
71 | #iframeinterval=10
72 | bitrate=2000000
73 | output-file=out1.mp4
74 |
75 |
76 | [osd]
77 | enable=1
78 | gpu-id=0
79 | border-width=1
80 | text-size=15
81 | text-color=1;1;1;1;
82 | text-bg-color=0.3;0.3;0.3;1
83 | font=Serif
84 | show-clock=0
85 | clock-x-offset=800
86 | clock-y-offset=820
87 | clock-text-size=12
88 | clock-color=1;0;0;0
89 | nvbuf-memory-type=0
90 |
91 | [streammux]
92 | gpu-id=0
93 | ##Boolean property to inform muxer that sources are live
94 | live-source=0
95 | batch-size=1
96 | ##time out in usec, to wait after the first buffer is available
97 | ##to push the batch even if the complete batch is not formed
98 | batched-push-timeout=40000
99 | ## Set muxer output width and height
100 | width=1920
101 | height=1080
102 | ##Enable to maintain aspect ratio wrt source, and allow black borders, works
103 | ##along with width, height properties
104 | enable-padding=0
105 | nvbuf-memory-type=0
106 |
107 |
108 | [primary-gie]
109 | enable=1
110 | gpu-id=0
111 | labelfile-path=labels.txt
112 | #Required by the app for OSD, not a plugin property
113 | bbox-border-color0=1;0;0;1
114 | bbox-border-color1=0;1;1;1
115 | bbox-border-color2=0;0;1;1
116 | bbox-border-color3=0;1;0;1
117 | gie-unique-id=1
118 | nvbuf-memory-type=0
119 | config-file=config_yolov4.txt
120 |
121 | [tracker]
122 | enable=1
123 | tracker-width=608
124 | tracker-height=608
125 | ll-lib-file=/opt/nvidia/deepstream/deepstream/lib/libnvds_nvmultiobjecttracker.so
126 | ll-config-file=/opt/nvidia/deepstream/deepstream/samples/configs/deepstream-app/config_tracker_NvDCF_max_perf.yml
127 | enable-batch-process=1
128 | display-tracking-id=1
129 | enable-past-frame=1
130 |
131 | [secondary-gie0]
132 | enable=1
133 | gpu-id=0
134 | gie-unique-id=2
135 | operate-on-gie-id=1
136 | operate-on-class-ids=2;5;7
137 | config-file=config_vehicletype.txt
138 |
139 | [secondary-gie1]
140 | enable=1
141 | gpu-id=0
142 | gie-unique-id=3
143 | operate-on-gie-id=1
144 | operate-on-class-ids=2;3;5;7
145 | config-file=config_lpd.txt
146 |
147 | [secondary-gie2]
148 | enable=1
149 | gpu-id=0
150 | gie-unique-id=4
151 | operate-on-gie-id=3
152 | operate-on-class-ids=0
153 | config-file=config_lpr.txt
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-ALPR/config_lpd.txt:
--------------------------------------------------------------------------------
1 | ################################################################################
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Permission is hereby granted, free of charge, to any person obtaining a
5 | # copy of this software and associated documentation files (the "Software"),
6 | # to deal in the Software without restriction, including without limitation
7 | # the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 | # and/or sell copies of the Software, and to permit persons to whom the
9 | # Software is furnished to do so, subject to the following conditions:
10 | #
11 | # The above copyright notice and this permission notice shall be included in
12 | # all copies or substantial portions of the Software.
13 | #
14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 | # DEALINGS IN THE SOFTWARE.
21 | ################################################################################
22 |
23 | [property]
24 | gpu-id=0
25 | net-scale-factor=1
26 | offsets=103.939;116.779;123.68
27 | tlt-model-key=license-plate-yolov4
28 | tlt-encoded-model=weights/license-plate-detection/yolov4_resnet18_epoch_050-fp32.etlt
29 | labelfile-path=weights/license-plate-detection/labels.txt
30 | int8-calib-file=weights/license-plate-detection/cal.bin
31 | model-engine-file=weights/license-plate-detection/yolov4_resnet18_epoch_050-fp32.etlt_b4_gpu0_fp32.engine
32 | infer-dims=3;320;320
33 | uff-input-blob-name=Input
34 | batch-size=4
35 | process-mode=2
36 | model-color-format=0
37 | ## 0=FP32, 1=INT8, 2=FP16 mode
38 | network-mode=0
39 | #0 detector 1 classifier 2 segmentation 3 instance segmentation
40 | network-type=0
41 | num-detected-classes=1
42 | interval=0
43 | gie-unique-id=5
44 | operate-on-class-ids=2;3;5;7
45 | operate-on-gie-id=1
46 | output-blob-names=BatchedNMS
47 | parse-bbox-func-name=NvDsInferParseCustomYoloV4TLT
48 | custom-lib-path=nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so
49 | input-object-min-width=64
50 | input-object-min-height=64
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-ALPR/config_lpr.txt:
--------------------------------------------------------------------------------
1 | ################################################################################
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Permission is hereby granted, free of charge, to any person obtaining a
5 | # copy of this software and associated documentation files (the "Software"),
6 | # to deal in the Software without restriction, including without limitation
7 | # the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 | # and/or sell copies of the Software, and to permit persons to whom the
9 | # Software is furnished to do so, subject to the following conditions:
10 | #
11 | # The above copyright notice and this permission notice shall be included in
12 | # all copies or substantial portions of the Software.
13 | #
14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 | # DEALINGS IN THE SOFTWARE.
21 | ################################################################################
22 |
23 | [property]
24 | gpu-id=0
25 | net-scale-factor=1
26 | offsets=103.939;116.779;123.68
27 | tlt-model-key=license-plate-recognition
28 | tlt-encoded-model=weights/license-plate-recognition/yolov4_resnet18-pruned-retrain-int8.etlt
29 | labelfile-path=weights/license-plate-recognition/labels.txt
30 | int8-calib-file=weights/license-plate-recognition/yolov4_resnet18-pruned-retrain.bin
31 | model-engine-file=weights/license-plate-recognition/yolov4_resnet18-pruned-retrain-int8.etlt_b4_gpu0_int8.engine
32 | infer-dims=3;224;224
33 | uff-input-blob-name=Input
34 | batch-size=4
35 | process-mode=2
36 | model-color-format=0
37 | ## 0=FP32, 1=INT8, 2=FP16 mode
38 | network-mode=1
39 | #0 detector 1 classifier 2 segmentation 3 instance segmentation
40 | network-type=1
41 | interval=0
42 | gie-unique-id=5
43 | operate-on-class-ids=0
44 | operate-on-gie-id=1
45 | output-blob-names=BatchedNMS
46 | classifier-threshold=0.7
47 | classifier-async-mode=0
48 | #parse-bbox-func-name=NvDsInferParseCustomYoloV4LPR
49 | parse-classifier-func-name=NvDsInferParseCustomYoloV4LPR
50 | custom-lib-path=nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so
51 | input-object-min-width=16
52 | input-object-min-height=16
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-ALPR/config_tracker.txt:
--------------------------------------------------------------------------------
1 | ################################################################################
2 | # SPDX-FileCopyrightText: Copyright (c) 2019-2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | ################################################################################
17 |
18 | # Mandatory properties for the tracker:
19 | # tracker-width
20 | # tracker-height: needs to be multiple of 6 for NvDCF
21 | # gpu-id
22 | # ll-lib-file: path to low-level tracker lib
23 | # ll-config-file: required for NvDCF, optional for KLT and IOU
24 | #
25 | [tracker]
26 | tracker-width=608
27 | tracker-height=608
28 | gpu-id=0
29 | ll-lib-file=/opt/nvidia/deepstream/deepstream/lib/libnvds_nvmultiobjecttracker.so
30 | ll-config-file=config_tracker_NvDCF_perf.yml
31 | #enable-past-frame=1
32 | enable-batch-process=1
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-ALPR/config_vehicletype.txt:
--------------------------------------------------------------------------------
1 | ################################################################################
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Permission is hereby granted, free of charge, to any person obtaining a
5 | # copy of this software and associated documentation files (the "Software"),
6 | # to deal in the Software without restriction, including without limitation
7 | # the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 | # and/or sell copies of the Software, and to permit persons to whom the
9 | # Software is furnished to do so, subject to the following conditions:
10 | #
11 | # The above copyright notice and this permission notice shall be included in
12 | # all copies or substantial portions of the Software.
13 | #
14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 | # DEALINGS IN THE SOFTWARE.
21 | ################################################################################
22 |
23 | [property]
24 | gpu-id=0
25 | net-scale-factor=1
26 | offsets=124;117;104
27 | tlt-model-key=vehicle-type-net
28 | tlt-encoded-model=weights/vehicletypenet/vehicle-type-net-r18-pruned-retrain-int8.etlt
29 | labelfile-path=weights/vehicletypenet/labels.txt
30 | int8-calib-file=weights/vehicletypenet/vehicle-type-net-r18-pruned-retrain.bin
31 | model-engine-file=weights/vehicletypenet/vehicle-type-net-r18-pruned-retrain-int8.etlt_b4_gpu0_int8.engine
32 | input-dims=3;224;224;0
33 | uff-input-blob-name=input_1
34 | batch-size=4
35 | process-mode=2
36 | model-color-format=0
37 | ## 0=FP32, 1=INT8, 2=FP16 mode
38 | network-mode=1
39 | #0 detector 1 classifier 2 segmentation 3 instance segmentation
40 | network-type=1
41 | interval=0
42 | gie-unique-id=4
43 | operate-on-class-ids=2;5;7
44 | operate-on-gie-id=1
45 | output-blob-names=predictions/Softmax
46 | classifier-threshold=0.2
47 | input-object-min-width=64
48 | input-object-min-height=64
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-ALPR/config_yolov4.txt:
--------------------------------------------------------------------------------
1 | ################################################################################
2 | #
3 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 | ################################################################################
18 |
19 | # Following properties are mandatory when engine files are not specified:
20 | # int8-calib-file(Only in INT8), model-file-format
21 | # Caffemodel mandatory properties: model-file, proto-file, output-blob-names
22 | # UFF: uff-file, input-dims, uff-input-blob-name, output-blob-names
23 | # ONNX: onnx-file
24 | #
25 | # Mandatory properties for detectors:
26 | # num-detected-classes
27 | #
28 | # Optional properties for detectors:
29 | # cluster-mode(Default=Group Rectangles), interval(Primary mode only, Default=0)
30 | # custom-lib-path
31 | # parse-bbox-func-name
32 | #
33 | # Mandatory properties for classifiers:
34 | # classifier-threshold, is-classifier
35 | #
36 | # Optional properties for classifiers:
37 | # classifier-async-mode(Secondary mode only, Default=false)
38 | #
39 | # Optional properties in secondary mode:
40 | # operate-on-gie-id(Default=0), operate-on-class-ids(Defaults to all classes),
41 | # input-object-min-width, input-object-min-height, input-object-max-width,
42 | # input-object-max-height
43 | #
44 | # Following properties are always recommended:
45 | # batch-size(Default=1)
46 | #
47 | # Other optional properties:
48 | # net-scale-factor(Default=1), network-mode(Default=0 i.e FP32),
49 | # model-color-format(Default=0 i.e. RGB) model-engine-file, labelfile-path,
50 | # mean-file, gie-unique-id(Default=0), offsets, process-mode (Default=1 i.e. primary),
51 | # custom-lib-path, network-mode(Default=0 i.e FP32)
52 | #
53 | # The values in the config file are overridden by values set through GObject
54 | # properties.
55 |
56 | [property]
57 | gpu-id=0
58 | net-scale-factor=0.0039215697906911373
59 | # Skip frame
60 | interval=0
61 | #0=RGB, 1=BGR
62 | model-color-format=0
63 | input-dims=3;608;608;0
64 | onnx-file=weights/yolov4_-1_3_608_608_dynamic.nms.onnx
65 | model-engine-file=weights/yolov4_-1_3_608_608_dynamic.nms.onnx_b4_gpu0_fp32.engine
66 | labelfile-path=labels.txt
67 | batch-size=4
68 | ## 0=FP32, 1=INT8, 2=FP16 mode
69 | network-mode=0
70 | num-detected-classes=80
71 | gie-unique-id=1
72 | network-type=0
73 | is-classifier=0
74 | ## 0=Group Rectangles, 1=DBSCAN, 2=NMS, 3= DBSCAN+NMS Hybrid, 4 = None(No clustering)
75 | cluster-mode=2
76 | maintain-aspect-ratio=1
77 | custom-lib-path=nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so
78 | parse-bbox-func-name=NvDsInferParseCustomYoloV4
79 | #scaling-filter=0
80 | #scaling-compute-hw=0
81 |
82 | [class-attrs-all]
83 | nms-iou-threshold=0.6
84 | pre-cluster-threshold=0.4
85 |
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-ALPR/dict.txt:
--------------------------------------------------------------------------------
1 | A
2 | B
3 | C
4 | D
5 | E
6 | 8
7 | F
8 | 5
9 | 4
10 | G
11 | H
12 | I
13 | J
14 | K
15 | L
16 | M
17 | N
18 | 9
19 | 1
20 | P
21 | Q
22 | R
23 | S
24 | 7
25 | 6
26 | T
27 | 3
28 | 2
29 | U
30 | V
31 | W
32 | X
33 | Y
34 | Z
35 | 0
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-ALPR/fig/lpr_pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deploy/Deepstream/sample-ALPR/fig/lpr_pipeline.png
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-ALPR/fig/lpr_result1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deploy/Deepstream/sample-ALPR/fig/lpr_result1.png
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-ALPR/fig/lpr_result2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deploy/Deepstream/sample-ALPR/fig/lpr_result2.png
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-ALPR/labels.txt:
--------------------------------------------------------------------------------
1 | person
2 | bicycle
3 | car
4 | motorbike
5 | aeroplane
6 | bus
7 | train
8 | truck
9 | boat
10 | traffic light
11 | fire hydrant
12 | stop sign
13 | parking meter
14 | bench
15 | bird
16 | cat
17 | dog
18 | horse
19 | sheep
20 | cow
21 | elephant
22 | bear
23 | zebra
24 | giraffe
25 | backpack
26 | umbrella
27 | handbag
28 | tie
29 | suitcase
30 | frisbee
31 | skis
32 | snowboard
33 | sports ball
34 | kite
35 | baseball bat
36 | baseball glove
37 | skateboard
38 | surfboard
39 | tennis racket
40 | bottle
41 | wine glass
42 | cup
43 | fork
44 | knife
45 | spoon
46 | bowl
47 | banana
48 | apple
49 | sandwich
50 | orange
51 | broccoli
52 | carrot
53 | hot dog
54 | pizza
55 | donut
56 | cake
57 | chair
58 | sofa
59 | pottedplant
60 | bed
61 | diningtable
62 | toilet
63 | tvmonitor
64 | laptop
65 | mouse
66 | remote
67 | keyboard
68 | cell phone
69 | microwave
70 | oven
71 | toaster
72 | sink
73 | refrigerator
74 | book
75 | clock
76 | vase
77 | scissors
78 | teddy bear
79 | hair drier
80 | toothbrush
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-ALPR/nvdsinfer_custom_impl_Yolo/Makefile:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 |
17 | CUDA_VER?=
18 | ifeq ($(CUDA_VER),)
19 | $(error "CUDA_VER is not set")
20 | endif
21 | CC:= g++
22 | NVCC:=/usr/local/cuda-$(CUDA_VER)/bin/nvcc
23 |
24 | CFLAGS:= -Wall -std=c++11 -shared -fPIC -Wno-error=deprecated-declarations
25 | CFLAGS+= -I../../includes -I/usr/local/cuda-$(CUDA_VER)/include -I/opt/nvidia/deepstream/deepstream/sources/includes
26 |
27 | LIBS:= -lnvinfer_plugin -lnvinfer -lnvparsers -L/usr/local/cuda-$(CUDA_VER)/lib64 -lcudart -lcublas -lstdc++fs
28 | LFLAGS:= -shared -Wl,--start-group $(LIBS) -Wl,--end-group
29 |
30 | INCS:= $(wildcard *.h)
31 | SRCFILES:= nvdsparsebbox_Yolo.cpp
32 |
33 | TARGET_LIB:= libnvdsinfer_custom_impl_Yolo.so
34 |
35 | TARGET_OBJS:= $(SRCFILES:.cpp=.o)
36 | TARGET_OBJS:= $(TARGET_OBJS:.cu=.o)
37 |
38 | all: $(TARGET_LIB)
39 |
40 | %.o: %.cpp $(INCS) Makefile
41 | $(CC) -c -o $@ $(CFLAGS) $<
42 |
43 | %.o: %.cu $(INCS) Makefile
44 | $(NVCC) -c -o $@ --compiler-options '-fPIC' $<
45 |
46 | $(TARGET_LIB) : $(TARGET_OBJS)
47 | $(CC) -o $@ $(TARGET_OBJS) $(LFLAGS)
48 |
49 | clean:
50 | rm -rf $(TARGET_LIB)
51 |
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-ALPR/weights/README.md:
--------------------------------------------------------------------------------
1 | # To do
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-ALPR/weights/license-plate-detection/labels.txt:
--------------------------------------------------------------------------------
1 | license_plate
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-ALPR/weights/license-plate-recognition/labels.txt:
--------------------------------------------------------------------------------
1 | A
2 | B
3 | C
4 | D
5 | E
6 | 8
7 | F
8 | 5
9 | 4
10 | G
11 | H
12 | I
13 | J
14 | K
15 | L
16 | M
17 | N
18 | 9
19 | 1
20 | P
21 | Q
22 | R
23 | S
24 | 7
25 | 6
26 | T
27 | 3
28 | 2
29 | U
30 | V
31 | W
32 | X
33 | Y
34 | Z
35 | 0
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-ALPR/weights/vehicletypenet/labels.txt:
--------------------------------------------------------------------------------
1 | hatchback;bus;pickup;sedan;suv;truck;van
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-scrfd/README.md:
--------------------------------------------------------------------------------
1 | ## Build custom plugins
2 |
3 | ```
4 | cd nvdsinfer_custom_impl_Yolo
5 | mkdir build && cd build
6 | cmake ..
7 | make -j8
8 | ```
9 |
10 | ## Run deepstream-python
11 | ```
12 | LD_PRELOAD= python3 run_scrfd.py file:/
13 | ```
14 |
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-scrfd/config_scrfd.txt:
--------------------------------------------------------------------------------
1 | ################################################################################
2 | #
3 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 | ################################################################################
18 |
19 | # Following properties are mandatory when engine files are not specified:
20 | # int8-calib-file(Only in INT8), model-file-format
21 | # Caffemodel mandatory properties: model-file, proto-file, output-blob-names
22 | # UFF: uff-file, input-dims, uff-input-blob-name, output-blob-names
23 | # ONNX: onnx-file
24 | #
25 | # Mandatory properties for detectors:
26 | # num-detected-classes
27 | #
28 | # Optional properties for detectors:
29 | # cluster-mode(Default=Group Rectangles), interval(Primary mode only, Default=0)
30 | # custom-lib-path
31 | # parse-bbox-func-name
32 | #
33 | # Mandatory properties for classifiers:
34 | # classifier-threshold, is-classifier
35 | #
36 | # Optional properties for classifiers:
37 | # classifier-async-mode(Secondary mode only, Default=false)
38 | #
39 | # Optional properties in secondary mode:
40 | # operate-on-gie-id(Default=0), operate-on-class-ids(Defaults to all classes),
41 | # input-object-min-width, input-object-min-height, input-object-max-width,
42 | # input-object-max-height
43 | #
44 | # Following properties are always recommended:
45 | # batch-size(Default=1)
46 | #
47 | # Other optional properties:
48 | # net-scale-factor(Default=1), network-mode(Default=0 i.e FP32),
49 | # model-color-format(Default=0 i.e. RGB) model-engine-file, labelfile-path,
50 | # mean-file, gie-unique-id(Default=0), offsets, process-mode (Default=1 i.e. primary),
51 | # custom-lib-path, network-mode(Default=0 i.e FP32)
52 | #
53 | # The values in the config file are overridden by values set through GObject
54 | # properties.
55 |
56 | [property]
57 | gpu-id=0
58 | net-scale-factor=0.0039215697906911373
59 | # Skip frame
60 | interval=0
61 | #0=RGB, 1=BGR
62 | model-color-format=0
63 | input-dims=3;640;640;0
64 | onnx-file=weights/face-detection/scrfd-nms-full.nms.onnx
65 | model-engine-file=weights/face-detection/scrfd-nms-full.nms.onnx_b4_gpu0_fp32.engine
66 | labelfile-path=weights/face-detection/labels.txt
67 | batch-size=4
68 | ## 0=FP32, 1=INT8, 2=FP16 mode
69 | network-mode=0
70 | num-detected-classes=2
71 | gie-unique-id=1
72 | network-type=100
73 | ## 0=Group Rectangles, 1=DBSCAN, 2=NMS, 3= DBSCAN+NMS Hybrid, 4 = None(No clustering)
74 | cluster-mode=4
75 | maintain-aspect-ratio=0
76 | custom-lib-path=nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so
77 | #parse-bbox-func-name=NvDsInferParseCustomFaceDetection
78 | #scaling-filter=0
79 | #scaling-compute-hw=0
80 | output-tensor-meta=1
81 | #[class-attrs-all]
82 | #nms-iou-threshold=0.6
83 | #pre-cluster-threshold=0.4
84 | input-object-min-width=0
85 | input-object-min-height=0
86 | process-mode=1
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
2 | include(cmake/set_ifndef.cmake)
3 |
4 | project(TensorRT
5 | LANGUAGES CXX CUDA
6 | VERSION 8.2
7 | DESCRIPTION "TensorRT is a C++ library that facilitates high performance inference on NVIDIA GPUs and deep learning accelerators."
8 | HOMEPAGE_URL "https://github.com/NVIDIA/TensorRT")
9 |
10 | # C++14
11 | set(CMAKE_CXX_STANDARD 14)
12 | set(CMAKE_CXX_STANDARD_REQUIRED ON)
13 | set(CMAKE_CXX_EXTENSIONS OFF)
14 | set(CMAKE_CXX_FLAGS "-Wno-deprecated-declarations ${CMAKE_CXX_FLAGS} -DBUILD_SYSTEM=cmake_oss")
15 |
16 | find_package(Threads REQUIRED)
17 |
18 | ## find_package(CUDA) is broken for cross-compilation. Enable CUDA language instead.
19 | if(NOT DEFINED CMAKE_TOOLCHAIN_FILE)
20 | find_package(CUDA ${CUDA_VERSION} REQUIRED)
21 | endif()
22 |
23 | include_directories(
24 | ${CUDA_INCLUDE_DIRS}
25 | ${CUDNN_ROOT_DIR}/include
26 | )
27 | find_library(CUDNN_LIB cudnn HINTS
28 | ${CUDA_TOOLKIT_ROOT_DIR} ${CUDNN_ROOT_DIR} PATH_SUFFIXES lib64 lib)
29 | find_library(CUBLAS_LIB cublas HINTS
30 | ${CUDA_TOOLKIT_ROOT_DIR} PATH_SUFFIXES lib64 lib lib/stubs)
31 | find_library(CUBLASLT_LIB cublasLt HINTS
32 | ${CUDA_TOOLKIT_ROOT_DIR} PATH_SUFFIXES lib64 lib lib/stubs)
33 | find_library(CUDART_LIB cudart HINTS ${CUDA_TOOLKIT_ROOT_DIR} PATH_SUFFIXES lib lib64)
34 | find_library(RT_LIB rt)
35 | set(CUDA_LIBRARIES ${CUDART_LIB})
36 |
37 |
38 | message(STATUS "CUBLAS_LIB: ${CUBLAS_LIB}")
39 | message(STATUS "CUBLASLT_LIB: ${CUBLASLT_LIB}")
40 | message(STATUS "CUDART_LIB: ${CUDART_LIB}")
41 | message(STATUS "CUDNN_LIB: ${CUDNN_LIB}")
42 |
43 | file(GLOB SRCS *.cpp)
44 | set(PLUGIN_SOURCES ${PLUGIN_SOURCES} ${SRCS})
45 | file(GLOB CU_SRCS *.cu)
46 | set(PLUGIN_CU_SOURCES ${PLUGIN_CU_SOURCES} ${CU_SRCS})
47 | file(GLOB COMMON_SRCS common/*.cpp)
48 | set(COMMON_SOURCES ${COMMON_SOURCES} ${COMMON_SRCS})
49 | file(GLOB COMMON_CU_SRCS common/kernels/*.cu)
50 | set(COMMON_CU_SOURCES ${COMMON_CU_SOURCES} ${COMMON_CU_SRCS})
51 |
52 | # Generate Gencode
53 | if (DEFINED GPU_ARCHS)
54 | message(STATUS "GPU_ARCHS defined as ${GPU_ARCHS}. Generating CUDA code for SM ${GPU_ARCHS}")
55 | separate_arguments(GPU_ARCHS)
56 | else()
57 | list(APPEND GPU_ARCHS
58 | 53
59 | 60
60 | 61
61 | 70
62 | 75
63 | )
64 |
65 | string(REGEX MATCH "aarch64" IS_ARM "${TRT_PLATFORM_ID}")
66 | if (IS_ARM)
67 | # Xavier (SM72) only supported for aarch64.
68 | list(APPEND GPU_ARCHS 72)
69 | endif()
70 |
71 | if (CUDA_VERSION VERSION_GREATER_EQUAL 11.0)
72 | # Ampere GPU (SM80) support is only available in CUDA versions > 11.0
73 | list(APPEND GPU_ARCHS 80)
74 | endif()
75 | if (CUDA_VERSION VERSION_GREATER_EQUAL 11.1)
76 | list(APPEND GPU_ARCHS 86)
77 | endif()
78 |
79 | message(STATUS "GPU_ARCHS is not defined. Generating CUDA code for default SMs: ${GPU_ARCHS}")
80 | endif()
81 | foreach(arch ${GPU_ARCHS})
82 | set(GENCODES "${GENCODES} -gencode arch=compute_${arch},code=sm_${arch}")
83 | endforeach()
84 | # Generate PTX for the last architecture in the list.
85 | list(GET GPU_ARCHS -1 LATEST_SM)
86 | set(GENCODES "${GENCODES} -gencode arch=compute_${LATEST_SM},code=compute_${LATEST_SM}")
87 | set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -Wno-deprecated-declarations")
88 |
89 |
90 | include_directories(common common/kernels)
91 | list(APPEND PLUGIN_CU_SOURCES "${COMMON_CU_SOURCES}")
92 | set_source_files_properties(${PLUGIN_CU_SOURCES} PROPERTIES COMPILE_FLAGS ${GENCODES})
93 | list(APPEND PLUGIN_SOURCES "${PLUGIN_CU_SOURCES}")
94 | list(APPEND PLUGIN_SOURCES "${COMMON_SOURCES}")
95 |
96 | message(STATUS "PLUGIN_SOURCES: ${PLUGIN_SOURCES}")
97 | message(STATUS "GENCODES: ${GENCODES}")
98 |
99 | add_library(my_plugin SHARED
100 | ${PLUGIN_SOURCES}
101 | )
102 |
103 | target_include_directories(my_plugin
104 | PUBLIC /opt/nvidia/deepstream/deepstream/sources/includes
105 | )
106 | target_include_directories(my_plugin
107 | PUBLIC /usr/include/gstreamer-1.0 /usr/include/glib-2.0 /usr/lib/x86_64-linux-gnu/glib-2.0/include
108 | )
109 |
110 | target_link_libraries(my_plugin
111 | ${CUBLAS_LIB}
112 | ${CUBLASLT_LIB}
113 | ${CUDART_LIB}
114 | ${CUDNN_LIB}
115 | nvinfer
116 | )
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/Makefile:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 |
17 | CUDA_VER?=
18 | ifeq ($(CUDA_VER),)
19 | $(error "CUDA_VER is not set")
20 | endif
21 | CC:= g++
22 | NVCC:=/usr/local/cuda-$(CUDA_VER)/bin/nvcc
23 |
24 | CFLAGS:= -Wall -std=c++11 -shared -fPIC -Wno-error=deprecated-declarations
25 | CFLAGS+= -I../../includes -I/home/damnguyen/Deploy/deepstream/nvdsinfer_custom_impl_Yolo/common -I/usr/local/cuda-$(CUDA_VER)/include -I/opt/nvidia/deepstream/deepstream/sources/includes
26 |
27 | LIBS:= -lnvinfer_plugin -lnvinfer -lnvparsers -L/usr/local/cuda-$(CUDA_VER)/lib64 -lcudart -lcublas -lstdc++fs
28 | LFLAGS:= -shared -Wl,--start-group $(LIBS) -Wl,--end-group
29 |
30 | INCS:= $(wildcard *.h)
31 | SRCFILES:= $(wildcard *.cpp *.cu common/*.cpp common/kernels/*.cu)
32 |
33 | TARGET_LIB:= libnvdsinfer_custom_impl_Yolo.so
34 |
35 | TARGET_OBJS:= $(SRCFILES:.cpp=.o)
36 | TARGET_OBJS:= $(TARGET_OBJS:.cu=.o)
37 |
38 | all: $(TARGET_LIB)
39 |
40 | %.o: %.cpp $(INCS) Makefile
41 | $(CC) -c -o $@ $(CFLAGS) $<
42 |
43 | %.o: %.cu $(INCS) Makefile
44 | $(NVCC) -c -o $@ --compiler-options '-fPIC' $<
45 |
46 | $(TARGET_LIB) : $(TARGET_OBJS)
47 | $(CC) -o $@ $(TARGET_OBJS) $(LFLAGS)
48 |
49 | clean:
50 | rm -rf $(TARGET_LIB)
51 |
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/batchedNMSCustomInference.cu:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | #include "bboxUtils.h"
17 | #include "cuda_runtime_api.h"
18 | #include "gatherNMSCustomOutputs.h"
19 | #include "kernel.h"
20 | #include "nmsUtils.h"
21 |
22 | pluginStatus_t nmsCustomInference(cudaStream_t stream, const int N, const int perBatchBoxesSize, const int perBatchScoresSize, const int perBatchLandmarksSize,
23 | const bool shareLocation, const int backgroundLabelId, const int numPredsPerClass, const int numClasses,
24 | const int topK, const int keepTopK, const float scoreThreshold, const float iouThreshold, const DataType DT_BBOX,
25 | const void* locData, const DataType DT_SCORE, const void* confData, const void* landData, void* keepCount, void* nmsedBoxes,
26 | void* nmsedScores, void* nmsedClasses, void* nmsedLandmarks, void* workspace, bool isNormalized, bool confSigmoid, bool clipBoxes, int scoreBits)
27 | {
28 | // locCount = batch_size * number_boxes_per_sample * 4
29 | const int locCount = N * perBatchBoxesSize;
30 | /*
31 | * shareLocation
32 | * Bounding box are shared among all classes, i.e., a bounding box could be classified as any candidate class.
33 | * Otherwise
34 | * Bounding box are designed for specific classes, i.e., a bounding box could be classified as one certain class or
35 | * not (binary classification).
36 | */
37 | const int numLocClasses = shareLocation ? 1 : numClasses;
38 |
39 | size_t bboxDataSize = detectionForwardBBoxDataSize(N, perBatchBoxesSize, DT_BBOX);
40 | void* bboxDataRaw = workspace;
41 | cudaMemcpyAsync(bboxDataRaw, locData, bboxDataSize, cudaMemcpyDeviceToDevice, stream);
42 | pluginStatus_t status;
43 |
44 | /*
45 | * bboxDataRaw format:
46 | * [batch size, numPriors (per sample), numLocClasses, 4]
47 | */
48 | // float for now
49 | void* bboxData;
50 | size_t bboxPermuteSize = detectionForwardBBoxPermuteSize(shareLocation, N, perBatchBoxesSize, DT_BBOX);
51 | void* bboxPermute = nextWorkspacePtr((int8_t*) bboxDataRaw, bboxDataSize);
52 |
53 | /*
54 | * After permutation, bboxData format:
55 | * [batch_size, numLocClasses, numPriors (per sample) (numPredsPerClass), 4]
56 | * This is equivalent to swapping axis
57 | */
58 | if (!shareLocation)
59 | {
60 | status = permuteData(
61 | stream, locCount, numLocClasses, numPredsPerClass, 4, DT_BBOX, false, bboxDataRaw, bboxPermute);
62 | ASSERT_FAILURE(status == STATUS_SUCCESS);
63 | bboxData = bboxPermute;
64 | }
65 | /*
66 | * If shareLocation, numLocClasses = 1
67 | * No need to permute data on linear memory
68 | */
69 | else
70 | {
71 | bboxData = bboxDataRaw;
72 | }
73 |
74 | /*
75 | * Conf data format
76 | * [batch size, numPriors * param.numClasses, 1, 1]
77 | */
78 | const int numScores = N * perBatchScoresSize;
79 | size_t totalScoresSize = detectionForwardPreNMSSize(N, perBatchScoresSize);
80 | if(DT_SCORE == DataType::kHALF) totalScoresSize /= 2; // detectionForwardPreNMSSize is implemented in terms of kFLOAT
81 | void* scores = nextWorkspacePtr((int8_t*) bboxPermute, bboxPermuteSize);
82 |
83 | // need a conf_scores
84 | /*
85 | * After permutation, bboxData format:
86 | * [batch_size, numClasses, numPredsPerClass, 1]
87 | */
88 | status = permuteData(
89 | stream, numScores, numClasses, numPredsPerClass, 1, DT_SCORE, confSigmoid, confData, scores);
90 | ASSERT_FAILURE(status == STATUS_SUCCESS);
91 |
92 | size_t indicesSize = detectionForwardPreNMSSize(N, perBatchScoresSize);
93 | void* indices = nextWorkspacePtr((int8_t*) scores, totalScoresSize);
94 |
95 | size_t postNMSScoresSize = detectionForwardPostNMSSize(N, numClasses, topK);
96 | if(DT_SCORE == DataType::kHALF) postNMSScoresSize /= 2; // detectionForwardPostNMSSize is implemented in terms of kFLOAT
97 | size_t postNMSIndicesSize = detectionForwardPostNMSSize(N, numClasses, topK); // indices are full int32
98 | void* postNMSScores = nextWorkspacePtr((int8_t*) indices, indicesSize);
99 | void* postNMSIndices = nextWorkspacePtr((int8_t*) postNMSScores, postNMSScoresSize);
100 |
101 | void* sortingWorkspace = nextWorkspacePtr((int8_t*) postNMSIndices, postNMSIndicesSize);
102 | // Sort the scores so that the following NMS could be applied.
103 | float scoreShift = 0.f;
104 | if(DT_SCORE == DataType::kHALF && scoreBits > 0 && scoreBits <= 10)
105 | scoreShift = 1.f;
106 | status = sortScoresPerClass(stream, N, numClasses, numPredsPerClass, backgroundLabelId, scoreThreshold,
107 | DT_SCORE, scores, indices, sortingWorkspace, scoreBits, scoreShift);
108 |
109 | ASSERT_FAILURE(status == STATUS_SUCCESS);
110 |
111 | // This is set to true as the input bounding boxes are of the format [ymin,
112 | // xmin, ymax, xmax]. The default implementation assumes [xmin, ymin, xmax, ymax]
113 | bool flipXY = true;
114 | // NMS
115 | status = allClassNMS(stream, N, numClasses, numPredsPerClass, topK, iouThreshold, shareLocation, isNormalized,
116 | DT_SCORE, DT_BBOX, bboxData, scores, indices, postNMSScores, postNMSIndices, flipXY, scoreShift);
117 | ASSERT_FAILURE(status == STATUS_SUCCESS);
118 |
119 | // Sort the bounding boxes after NMS using scores
120 | status = sortScoresPerImage(stream, N, numClasses * topK, DT_SCORE, postNMSScores, postNMSIndices, scores,
121 | indices, sortingWorkspace, scoreBits);
122 |
123 | ASSERT_FAILURE(status == STATUS_SUCCESS);
124 |
125 | // Gather data from the sorted bounding boxes after NMS
126 | status = gatherNMSCustomOutputs(stream, shareLocation, N, numPredsPerClass, numClasses, topK, keepTopK, DT_BBOX,
127 | DT_SCORE, indices, scores, bboxData, landData, keepCount, nmsedBoxes, nmsedScores, nmsedClasses, nmsedLandmarks, clipBoxes, scoreShift);
128 | ASSERT_FAILURE(status == STATUS_SUCCESS);
129 |
130 | return STATUS_SUCCESS;
131 | }
132 |
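All of the intermediate buffers above are carved out of one device workspace that the caller sizes in advance. A minimal sketch (not part of the plugin source) of how that allocation could look, using detectionInferenceWorkspaceSizeCustom from common/kernel.cpp; batchSize, numPriors, numClasses and topK are placeholders, and the C1/C2/C3 values are assumptions based on SCRFD's 4 box coordinates, per-class scores, and 10 landmark values per prior:

    // Sketch only: size and allocate the plugin workspace before enqueue().
    size_t wsBytes = detectionInferenceWorkspaceSizeCustom(
        /*shareLocation=*/true, /*N=*/batchSize,
        /*C1=*/numPriors * 4,          // box deltas
        /*C2=*/numPriors * numClasses, // per-class scores
        /*C3=*/numPriors * 10,         // 5 landmarks x 2 coordinates (assumption)
        numClasses, numPriors, topK, DataType::kFLOAT, DataType::kFLOAT);
    void* workspace = nullptr;
    cudaMalloc(&workspace, wsBytes);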
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/batchedNMSCustomPlugin.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/batchedNMSCustomPlugin.o
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/cmake/set_ifndef.cmake:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | function (set_ifndef variable value)
17 | if(NOT DEFINED ${variable})
18 | set(${variable} ${value} PARENT_SCOPE)
19 | endif()
20 | endfunction()
21 |
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/common/ErrorRecorder.h:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | #ifndef ERROR_RECORDER_H
18 | #define ERROR_RECORDER_H
19 | #include "NvInferRuntimeCommon.h"
20 | #include "logger.h"
21 | #include <atomic>
22 | #include <cstdint>
23 | #include <exception>
24 | #include <mutex>
25 | #include <vector>
26 |
27 | using nvinfer1::IErrorRecorder;
28 | using nvinfer1::ErrorCode;
29 |
30 | //!
31 | //! A simple implementation of the IErrorRecorder interface for
32 | //! use by samples. This interface also can be used as a reference
33 | //! implementation.
34 | //! The sample Error recorder is based on a vector that pairs the error
35 | //! code and the error string into a single element. It also uses
36 | //! standard mutex's and atomics in order to make sure that the code
37 | //! works in a multi-threaded environment.
38 | //!
39 | class SampleErrorRecorder : public IErrorRecorder
40 | {
41 | using errorPair = std::pair<ErrorCode, std::string>;
42 | using errorStack = std::vector<errorPair>;
43 |
44 | public:
45 | SampleErrorRecorder() = default;
46 |
47 | virtual ~SampleErrorRecorder() noexcept {}
48 | int32_t getNbErrors() const noexcept final
49 | {
50 | return mErrorStack.size();
51 | }
52 | ErrorCode getErrorCode(int32_t errorIdx) const noexcept final
53 | {
54 | return invalidIndexCheck(errorIdx) ? ErrorCode::kINVALID_ARGUMENT : (*this)[errorIdx].first;
55 | };
56 | IErrorRecorder::ErrorDesc getErrorDesc(int32_t errorIdx) const noexcept final
57 | {
58 | return invalidIndexCheck(errorIdx) ? "errorIdx out of range." : (*this)[errorIdx].second.c_str();
59 | }
60 | // This class can never overflow since we have dynamic resize via std::vector usage.
61 | bool hasOverflowed() const noexcept final
62 | {
63 | return false;
64 | }
65 |
66 | // Empty the errorStack.
67 | void clear() noexcept final
68 | {
69 | try
70 | {
71 | // grab a lock so that there is no addition while clearing.
72 | std::lock_guard<std::mutex> guard(mStackLock);
73 | mErrorStack.clear();
74 | }
75 | catch (const std::exception& e)
76 | {
77 | sample::gLogFatal << "Internal Error: " << e.what() << std::endl;
78 | }
79 | };
80 |
81 | //! Simple helper function that returns true if the error stack is empty.
82 | bool empty() const noexcept
83 | {
84 | return mErrorStack.empty();
85 | }
86 |
87 | bool reportError(ErrorCode val, IErrorRecorder::ErrorDesc desc) noexcept final
88 | {
89 | try
90 | {
91 | std::lock_guard<std::mutex> guard(mStackLock);
92 | sample::gLogError << "Error[" << static_cast<int32_t>(val) << "]: " << desc << std::endl;
93 | mErrorStack.push_back(errorPair(val, desc));
94 | }
95 | catch (const std::exception& e)
96 | {
97 | sample::gLogFatal << "Internal Error: " << e.what() << std::endl;
98 | }
99 | // All errors are considered fatal.
100 | return true;
101 | }
102 |
103 | // Atomically increment or decrement the ref counter.
104 | IErrorRecorder::RefCount incRefCount() noexcept final
105 | {
106 | return ++mRefCount;
107 | }
108 | IErrorRecorder::RefCount decRefCount() noexcept final
109 | {
110 | return --mRefCount;
111 | }
112 |
113 | private:
114 | // Simple helper functions.
115 | const errorPair& operator[](size_t index) const noexcept
116 | {
117 | return mErrorStack[index];
118 | }
119 |
120 | bool invalidIndexCheck(int32_t index) const noexcept
121 | {
122 | // By converting signed to unsigned, we only need a single check since
123 | // negative numbers turn into large positive greater than the size.
124 | size_t sIndex = index;
125 | return sIndex >= mErrorStack.size();
126 | }
127 | // Mutex to hold when locking mErrorStack.
128 | std::mutex mStackLock;
129 |
130 | // Reference count of the class. Destruction of the class when mRefCount
131 | // is not zero causes undefined behavior.
132 | std::atomic<int32_t> mRefCount{0};
133 |
134 | // The error stack that holds the errors recorded by TensorRT.
135 | errorStack mErrorStack;
136 | }; // class SampleErrorRecorder
137 | #endif // ERROR_RECORDER_H
138 |
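For context, a short usage sketch: the recorder is registered with a TensorRT object through setErrorRecorder and queried after a failing call. This assumes the sample Logger's getTRTLogger() helper from logging.h; the flow shown is illustrative, not part of the sample itself.

    // Sketch: attach the recorder to a runtime and inspect accumulated errors.
    SampleErrorRecorder recorder;
    nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(sample::gLogger.getTRTLogger());
    runtime->setErrorRecorder(&recorder);
    // ... deserialize an engine / run inference ...
    for (int32_t i = 0; i < recorder.getNbErrors(); ++i)
    {
        sample::gLogError << recorder.getErrorDesc(i) << std::endl;
    }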
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/common/bboxUtils.h:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | #ifndef TRT_BBOX_UTILS_H
17 | #define TRT_BBOX_UTILS_H
18 |
19 | #include "plugin.h"
20 |
21 | using namespace nvinfer1;
22 | using namespace nvinfer1::plugin;
23 |
24 | template <typename T>
25 | struct Bbox
26 | {
27 | T xmin, ymin, xmax, ymax;
28 | Bbox(T xmin, T ymin, T xmax, T ymax)
29 | : xmin(xmin)
30 | , ymin(ymin)
31 | , xmax(xmax)
32 | , ymax(ymax)
33 | {
34 | }
35 | Bbox() = default;
36 | };
37 |
38 | template <typename T>
39 | struct BboxInfo
40 | {
41 | T conf_score;
42 | int label;
43 | int bbox_idx;
44 | bool kept;
45 | BboxInfo(T conf_score, int label, int bbox_idx, bool kept)
46 | : conf_score(conf_score)
47 | , label(label)
48 | , bbox_idx(bbox_idx)
49 | , kept(kept)
50 | {
51 | }
52 | BboxInfo() = default;
53 | };
54 |
55 | template <typename T>
56 | bool operator<(const Bbox<T>& lhs, const Bbox<T>& rhs)
57 | {
58 | return lhs.xmin < rhs.xmin;
59 | }
60 |
61 | template <typename T>
62 | bool operator==(const Bbox<T>& lhs, const Bbox<T>& rhs)
63 | {
64 | return lhs.xmin == rhs.xmin && lhs.ymin == rhs.ymin && lhs.xmax == rhs.xmax && lhs.ymax == rhs.ymax;
65 | }
66 | // }}}
67 |
68 | int8_t* alignPtr(int8_t* ptr, uintptr_t to);
69 |
70 | int8_t* nextWorkspacePtr(int8_t* ptr, uintptr_t previousWorkspaceSize);
71 |
72 | size_t dataTypeSize(DataType dtype);
73 |
74 | void setUniformOffsets(cudaStream_t stream, int num_segments, int offset, int* d_offsets);
75 |
76 | #endif
77 |
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/common/checkMacrosPlugin.cpp:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | #include "checkMacrosPlugin.h"
18 | #include <cublas_v2.h>
19 | #include <cuda_runtime.h>
20 | #include <sstream>
21 |
22 | namespace nvinfer1
23 | {
24 | namespace plugin
25 | {
26 |
27 | // This will be populated by the logger supplied by the user to initLibNvInferPlugins()
28 | ILogger* gLogger{};
29 |
30 | template <ILogger::Severity kSeverity>
31 | int LogStream<kSeverity>::Buf::sync()
32 | {
33 | std::string s = str();
34 | while (!s.empty() && s.back() == '\n')
35 | {
36 | s.pop_back();
37 | }
38 | if (gLogger != nullptr)
39 | {
40 | gLogger->log(kSeverity, s.c_str());
41 | }
42 | str("");
43 | return 0;
44 | }
45 |
46 | // These use gLogger, and therefore require initLibNvInferPlugins() to be called with a logger
47 | // (otherwise, it will not log)
48 | LogStream<ILogger::Severity::kERROR> gLogError;
49 | LogStream<ILogger::Severity::kWARNING> gLogWarning;
50 | LogStream<ILogger::Severity::kINFO> gLogInfo;
51 | LogStream<ILogger::Severity::kVERBOSE> gLogVerbose;
52 |
53 | // break-pointable
54 | void throwCudaError(const char* file, const char* function, int line, int status, const char* msg)
55 | {
56 | CudaError error(file, function, line, status, msg);
57 | error.log(gLogError);
58 | throw error;
59 | }
60 |
61 | // break-pointable
62 | void throwCublasError(const char* file, const char* function, int line, int status, const char* msg)
63 | {
64 | if (msg == nullptr)
65 | {
66 | auto s_ = static_cast<cublasStatus_t>(status);
67 | switch (s_)
68 | {
69 | case CUBLAS_STATUS_SUCCESS: msg = "CUBLAS_STATUS_SUCCESS"; break;
70 | case CUBLAS_STATUS_NOT_INITIALIZED: msg = "CUBLAS_STATUS_NOT_INITIALIZED"; break;
71 | case CUBLAS_STATUS_ALLOC_FAILED: msg = "CUBLAS_STATUS_ALLOC_FAILED"; break;
72 | case CUBLAS_STATUS_INVALID_VALUE: msg = "CUBLAS_STATUS_INVALID_VALUE"; break;
73 | case CUBLAS_STATUS_ARCH_MISMATCH: msg = "CUBLAS_STATUS_ARCH_MISMATCH"; break;
74 | case CUBLAS_STATUS_MAPPING_ERROR: msg = "CUBLAS_STATUS_MAPPING_ERROR"; break;
75 | case CUBLAS_STATUS_EXECUTION_FAILED: msg = "CUBLAS_STATUS_EXECUTION_FAILED"; break;
76 | case CUBLAS_STATUS_INTERNAL_ERROR: msg = "CUBLAS_STATUS_INTERNAL_ERROR"; break;
77 | case CUBLAS_STATUS_NOT_SUPPORTED: msg = "CUBLAS_STATUS_NOT_SUPPORTED"; break;
78 | case CUBLAS_STATUS_LICENSE_ERROR: msg = "CUBLAS_STATUS_LICENSE_ERROR"; break;
79 | }
80 | }
81 | CublasError error(file, function, line, status, msg);
82 | error.log(gLogError);
83 | throw error;
84 | }
85 |
86 | // break-pointable
87 | void throwCudnnError(const char* file, const char* function, int line, int status, const char* msg)
88 | {
89 | CudnnError error(file, function, line, status, msg);
90 | error.log(gLogError);
91 | throw error;
92 | }
93 |
94 | void logError(const char* msg, const char* file, const char* fn, int line)
95 | {
96 | gLogError << "Parameter check failed at: " << file << "::" << fn << "::" << line;
97 | gLogError << ", condition: " << msg << std::endl;
98 | }
99 |
100 | // break-pointable
101 | void reportAssertion(const char* msg, const char* file, int line)
102 | {
103 | std::ostringstream stream;
104 | stream << "Assertion failed: " << msg << std::endl
105 | << file << ':' << line << std::endl
106 | << "Aborting..." << std::endl;
107 | getLogger()->log(nvinfer1::ILogger::Severity::kINTERNAL_ERROR, stream.str().c_str());
108 | cudaDeviceReset();
109 | abort();
110 | }
111 |
112 | void TRTException::log(std::ostream& logStream) const
113 | {
114 | logStream << file << " (" << line << ") - " << name << " Error in " << function << ": " << status;
115 | if (message != nullptr)
116 | {
117 | logStream << " (" << message << ")";
118 | }
119 | logStream << std::endl;
120 | }
121 |
122 | } // namespace plugin
123 |
124 | } // namespace nvinfer1
125 |
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/common/cub_helper.h:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | #include "kernel.h"
17 | template <typename KeyT, typename ValueT>
18 | size_t cubSortPairsWorkspaceSize(int num_items, int num_segments)
19 | {
20 | size_t temp_storage_bytes = 0;
21 | cub::DeviceSegmentedRadixSort::SortPairsDescending((void*) NULL, temp_storage_bytes, (const KeyT*) NULL,
22 | (KeyT*) NULL, (const ValueT*) NULL, (ValueT*) NULL,
23 | num_items, // # items
24 | num_segments, // # segments
25 | (const int*) NULL, (const int*) NULL);
26 | return temp_storage_bytes;
27 | }
28 |
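This follows the usual two-phase CUB pattern: calling the sort with null buffers only computes the required scratch size. A small sketch of how the helper pairs with the real call (the device buffers, counts, and stream below are placeholders):

    // Phase 1: query scratch size for sorting float keys with int values.
    size_t tempBytes = cubSortPairsWorkspaceSize<float, int>(numItems, numSegments);
    void* dTempStorage = nullptr;
    cudaMalloc(&dTempStorage, tempBytes);
    // Phase 2: run the actual segmented descending sort with that scratch buffer.
    cub::DeviceSegmentedRadixSort::SortPairsDescending(dTempStorage, tempBytes,
        dKeysIn, dKeysOut, dValsIn, dValsOut,
        numItems, numSegments, dOffsets, dOffsets + 1,
        0, sizeof(float) * 8, stream);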
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/common/cudaDriverWrapper.cpp:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | #define CUDA_LIB_NAME "cuda"
17 |
18 | #if defined(_WIN32)
19 | #if !defined(WIN32_LEAN_AND_MEAN)
20 | #define WIN32_LEAN_AND_MEAN
21 | #endif // defined(WIN32_LEAN_AND_MEAN)
22 | #include <windows.h>
23 | #define dllOpen(name) (void*) LoadLibraryA("nv" name ".dll")
24 | #define dllClose(handle) FreeLibrary(static_cast<HMODULE>(handle))
25 | #define dllGetSym(handle, name) GetProcAddress(static_cast<HMODULE>(handle), name)
26 | #else
27 | #include <dlfcn.h>
28 | #define dllOpen(name) dlopen("lib" name ".so.1", RTLD_LAZY)
29 | #define dllClose(handle) dlclose(handle)
30 | #define dllGetSym(handle, name) dlsym(handle, name)
31 | #endif
32 |
33 | #include "cudaDriverWrapper.h"
34 | #include "plugin.h"
35 | #include <cstdio>
36 | #include <cstring>
37 | #include <cuda.h>
38 |
39 | using namespace nvinfer1;
40 |
41 | CUDADriverWrapper::CUDADriverWrapper()
42 | {
43 | handle = dllOpen(CUDA_LIB_NAME);
44 | ASSERT(handle != nullptr);
45 |
46 | auto load_sym = [](void* handle, const char *name) {
47 | void* ret = dllGetSym(handle, name);
48 | ASSERT(ret != nullptr);
49 | return ret;
50 | };
51 |
52 | *(void**)(&_cuGetErrorName) = load_sym(handle, "cuGetErrorName");
53 | *(void**)(&_cuFuncSetAttribute) = load_sym(handle, "cuFuncSetAttribute");
54 | *(void**)(&_cuLinkComplete) = load_sym(handle, "cuLinkComplete");
55 | *(void**)(&_cuModuleUnload) = load_sym(handle, "cuModuleUnload");
56 | *(void**)(&_cuLinkDestroy) = load_sym(handle, "cuLinkDestroy");
57 | *(void**)(&_cuModuleLoadData) = load_sym(handle, "cuModuleLoadData");
58 | *(void**)(&_cuLinkCreate) = load_sym(handle, "cuLinkCreate_v2");
59 | *(void**)(&_cuModuleGetFunction) = load_sym(handle, "cuModuleGetFunction");
60 | *(void**)(&_cuLinkAddFile) = load_sym(handle, "cuLinkAddFile_v2");
61 | *(void**)(&_cuLinkAddData) = load_sym(handle, "cuLinkAddData_v2");
62 | *(void**)(&_cuLaunchCooperativeKernel) = load_sym(handle, "cuLaunchCooperativeKernel");
63 | *(void**)(&_cuLaunchKernel) = load_sym(handle, "cuLaunchKernel");
64 | }
65 |
66 | CUDADriverWrapper::~CUDADriverWrapper()
67 | {
68 | dllClose(handle);
69 | }
70 |
71 | CUresult CUDADriverWrapper::cuGetErrorName(CUresult error, const char** pStr) const
72 | {
73 | return (*_cuGetErrorName)(error, pStr);
74 | }
75 |
76 | CUresult CUDADriverWrapper::cuFuncSetAttribute(CUfunction hfunc, CUfunction_attribute attrib, int value) const
77 | {
78 | return (*_cuFuncSetAttribute)(hfunc, attrib, value);
79 | }
80 |
81 | CUresult CUDADriverWrapper::cuLinkComplete(CUlinkState state, void** cubinOut, size_t* sizeOut) const
82 | {
83 | return (*_cuLinkComplete)(state, cubinOut, sizeOut);
84 | }
85 |
86 | CUresult CUDADriverWrapper::cuModuleUnload(CUmodule hmod) const
87 | {
88 | return (*_cuModuleUnload)(hmod);
89 | }
90 |
91 | CUresult CUDADriverWrapper::cuLinkDestroy(CUlinkState state) const
92 | {
93 | return (*_cuLinkDestroy)(state);
94 | }
95 |
96 | CUresult CUDADriverWrapper::cuModuleLoadData(CUmodule* module, const void* image) const
97 | {
98 | return (*_cuModuleLoadData)(module, image);
99 | }
100 |
101 | CUresult CUDADriverWrapper::cuLinkCreate(
102 | uint32_t numOptions, CUjit_option* options, void** optionValues, CUlinkState* stateOut) const
103 | {
104 | return (*_cuLinkCreate)(numOptions, options, optionValues, stateOut);
105 | }
106 |
107 | CUresult CUDADriverWrapper::cuModuleGetFunction(CUfunction* hfunc, CUmodule hmod, const char* name) const
108 | {
109 | return (*_cuModuleGetFunction)(hfunc, hmod, name);
110 | }
111 |
112 | CUresult CUDADriverWrapper::cuLinkAddFile(CUlinkState state, CUjitInputType type, const char* path, uint32_t numOptions,
113 | CUjit_option* options, void** optionValues) const
114 | {
115 | return (*_cuLinkAddFile)(state, type, path, numOptions, options, optionValues);
116 | }
117 |
118 | CUresult CUDADriverWrapper::cuLinkAddData(CUlinkState state, CUjitInputType type, void* data, size_t size,
119 | const char* name, uint32_t numOptions, CUjit_option* options, void** optionValues) const
120 | {
121 | return (*_cuLinkAddData)(state, type, data, size, name, numOptions, options, optionValues);
122 | }
123 |
124 | CUresult CUDADriverWrapper::cuLaunchCooperativeKernel(CUfunction f, uint32_t gridDimX, uint32_t gridDimY,
125 | uint32_t gridDimZ, uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ, uint32_t sharedMemBytes,
126 | CUstream hStream, void** kernelParams) const
127 | {
128 | return (*_cuLaunchCooperativeKernel)(
129 | f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams);
130 | }
131 |
132 | CUresult CUDADriverWrapper::cuLaunchKernel(CUfunction f, uint32_t gridDimX, uint32_t gridDimY, uint32_t gridDimZ,
133 | uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ, uint32_t sharedMemBytes, CUstream hStream,
134 | void** kernelParams, void** extra) const
135 | {
136 | return (*_cuLaunchKernel)(
137 | f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, extra);
138 | }
139 |
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/common/cudaDriverWrapper.h:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | #ifndef CUDA_DRIVER_WRAPPER_H
18 | #define CUDA_DRIVER_WRAPPER_H
19 |
20 | #include <cstdint>
21 | #include <cstdio>
22 | #include <cuda.h>
23 |
24 | #define cuErrCheck(stat, wrap) \
25 | { \
26 | nvinfer1::cuErrCheck_((stat), wrap, __FILE__, __LINE__); \
27 | }
28 |
29 | namespace nvinfer1
30 | {
31 | class CUDADriverWrapper
32 | {
33 | public:
34 | CUDADriverWrapper();
35 |
36 | ~CUDADriverWrapper();
37 |
38 | // Delete default copy constructor and copy assignment constructor
39 | CUDADriverWrapper(const CUDADriverWrapper&) = delete;
40 | CUDADriverWrapper& operator=(const CUDADriverWrapper&) = delete;
41 |
42 | CUresult cuGetErrorName(CUresult error, const char** pStr) const;
43 |
44 | CUresult cuFuncSetAttribute(CUfunction hfunc, CUfunction_attribute attrib, int value) const;
45 |
46 | CUresult cuLinkComplete(CUlinkState state, void** cubinOut, size_t* sizeOut) const;
47 |
48 | CUresult cuModuleUnload(CUmodule hmod) const;
49 |
50 | CUresult cuLinkDestroy(CUlinkState state) const;
51 |
52 | CUresult cuModuleLoadData(CUmodule* module, const void* image) const;
53 |
54 | CUresult cuLinkCreate(uint32_t numOptions, CUjit_option* options, void** optionValues, CUlinkState* stateOut) const;
55 |
56 | CUresult cuModuleGetFunction(CUfunction* hfunc, CUmodule hmod, const char* name) const;
57 |
58 | CUresult cuLinkAddFile(CUlinkState state, CUjitInputType type, const char* path, uint32_t numOptions,
59 | CUjit_option* options, void** optionValues) const;
60 |
61 | CUresult cuLinkAddData(CUlinkState state, CUjitInputType type, void* data, size_t size, const char* name,
62 | uint32_t numOptions, CUjit_option* options, void** optionValues) const;
63 |
64 | CUresult cuLaunchCooperativeKernel(CUfunction f, uint32_t gridDimX, uint32_t gridDimY, uint32_t gridDimZ,
65 | uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ, uint32_t sharedMemBytes, CUstream hStream,
66 | void** kernelParams) const;
67 |
68 | CUresult cuLaunchKernel(CUfunction f, uint32_t gridDimX, uint32_t gridDimY, uint32_t gridDimZ, uint32_t blockDimX,
69 | uint32_t blockDimY, uint32_t blockDimZ, uint32_t sharedMemBytes, CUstream hStream, void** kernelParams,
70 | void** extra) const;
71 |
72 | private:
73 | void* handle;
74 | CUresult (*_cuGetErrorName)(CUresult, const char**);
75 | CUresult (*_cuFuncSetAttribute)(CUfunction, CUfunction_attribute, int);
76 | CUresult (*_cuLinkComplete)(CUlinkState, void**, size_t*);
77 | CUresult (*_cuModuleUnload)(CUmodule);
78 | CUresult (*_cuLinkDestroy)(CUlinkState);
79 | CUresult (*_cuLinkCreate)(unsigned int, CUjit_option*, void**, CUlinkState*);
80 | CUresult (*_cuModuleLoadData)(CUmodule*, const void*);
81 | CUresult (*_cuModuleGetFunction)(CUfunction*, CUmodule, const char*);
82 | CUresult (*_cuLinkAddFile)(CUlinkState, CUjitInputType, const char*, unsigned int, CUjit_option*, void**);
83 | CUresult (*_cuLinkAddData)(
84 | CUlinkState, CUjitInputType, void*, size_t, const char*, unsigned int, CUjit_option*, void**);
85 | CUresult (*_cuLaunchCooperativeKernel)(CUfunction, unsigned int, unsigned int, unsigned int, unsigned int,
86 | unsigned int, unsigned int, unsigned int, CUstream, void**);
87 | CUresult (*_cuLaunchKernel)(CUfunction f, uint32_t gridDimX, uint32_t gridDimY, uint32_t gridDimZ,
88 | uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ, uint32_t sharedMemBytes, CUstream hStream,
89 | void** kernelParams, void** extra);
90 | };
91 |
92 | inline void cuErrCheck_(CUresult stat, const CUDADriverWrapper& wrap, const char* file, int line)
93 | {
94 | if (stat != CUDA_SUCCESS)
95 | {
96 | const char* msg = nullptr;
97 | wrap.cuGetErrorName(stat, &msg);
98 | fprintf(stderr, "CUDA Error: %s %s %d\n", msg, file, line);
99 | }
100 | }
101 |
102 | } // namespace nvinfer1
103 |
104 | #endif // CUDA_DRIVER_WRAPPER_H
105 |
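A short sketch of the intended use: construct the wrapper once (its constructor dlopen's the CUDA driver library and resolves the symbols), route driver calls through it, and check each result with cuErrCheck. The cubin buffer and kernel name here are hypothetical.

    // Sketch only.
    nvinfer1::CUDADriverWrapper wrap;
    CUmodule module;
    cuErrCheck(wrap.cuModuleLoadData(&module, cubinImage), wrap); // cubinImage: hypothetical host buffer holding a cubin
    CUfunction kernel;
    cuErrCheck(wrap.cuModuleGetFunction(&kernel, module, "myKernel"), wrap);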
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/common/half.h:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | //
17 | // Custom wrapper around external half-precision header
18 | //
19 | // Header has some "extra parentheses" warnings when different rounding modes are used.
20 |
21 | #if defined(__GNUC__)
22 | #pragma GCC diagnostic push
23 | #pragma GCC diagnostic ignored "-Wparentheses"
24 | #endif
25 |
26 |
27 | #if defined(__clang__)
28 | #pragma clang diagnostic push
29 | #pragma clang diagnostic ignored "-Wmismatched-tags"
30 | #endif
31 |
32 | #include "ieee/half.h"
33 |
34 | #if defined(__clang__)
35 | #pragma clang diagnostic pop
36 | #endif
37 |
38 | #if defined(__GNUC__)
39 | #pragma GCC diagnostic pop
40 | #endif
41 |
42 |
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/common/kernel.cpp:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | #include "kernel.h"
18 | #include "plugin.h"
19 |
20 | size_t detectionInferenceWorkspaceSize(bool shareLocation, int N, int C1, int C2, int numClasses, int numPredsPerClass,
21 | int topK, DataType DT_BBOX, DataType DT_SCORE)
22 | {
23 | size_t wss[7];
24 | wss[0] = detectionForwardBBoxDataSize(N, C1, DT_BBOX);
25 | wss[1] = detectionForwardBBoxPermuteSize(shareLocation, N, C1, DT_BBOX);
26 | wss[2] = detectionForwardPreNMSSize(N, C2);
27 | wss[3] = detectionForwardPreNMSSize(N, C2);
28 | wss[4] = detectionForwardPostNMSSize(N, numClasses, topK);
29 | wss[5] = detectionForwardPostNMSSize(N, numClasses, topK);
30 | wss[6] = std::max(sortScoresPerClassWorkspaceSize(N, numClasses, numPredsPerClass, DT_SCORE),
31 | sortScoresPerImageWorkspaceSize(N, numClasses * topK, DT_SCORE));
32 | return calculateTotalWorkspaceSize(wss, 7);
33 | }
34 |
35 | size_t detectionInferenceWorkspaceSizeCustom(bool shareLocation, int N, int C1, int C2, int C3, int numClasses, int numPredsPerClass,
36 | int topK, DataType DT_BBOX, DataType DT_SCORE)
37 | {
38 | size_t wss[8];
39 | wss[0] = detectionForwardBBoxDataSize(N, C1, DT_BBOX);
40 | wss[1] = detectionForwardBBoxPermuteSize(shareLocation, N, C1, DT_BBOX);
41 | wss[2] = detectionForwardPreNMSSize(N, C2);
42 | wss[3] = detectionForwardPreNMSSize(N, C2);
43 | wss[4] = detectionForwardPostNMSSize(N, numClasses, topK);
44 | wss[5] = detectionForwardPostNMSSize(N, numClasses, topK);
45 | wss[6] = std::max(sortScoresPerClassWorkspaceSize(N, numClasses, numPredsPerClass, DT_SCORE),
46 | sortScoresPerImageWorkspaceSize(N, numClasses * topK, DT_SCORE));
47 | wss[7] = detectionForwardLandmarkDataSize(N, C3, DT_BBOX);
48 | return calculateTotalWorkspaceSize(wss, 8);
49 | }
50 |
51 | namespace nvinfer1
52 | {
53 | namespace plugin
54 | {
55 | size_t detectionInferenceWorkspaceSize(bool shareLocation, int N, int C1, int C2, int numClasses, int numPredsPerClass,
56 | int topK, DataType DT_BBOX, DataType DT_SCORE)
57 | {
58 | size_t wss[7];
59 | wss[0] = detectionForwardBBoxDataSize(N, C1, DT_BBOX);
60 | wss[1] = detectionForwardBBoxPermuteSize(shareLocation, N, C1, DT_BBOX);
61 | wss[2] = detectionForwardPreNMSSize(N, C2);
62 | wss[3] = detectionForwardPreNMSSize(N, C2);
63 | wss[4] = detectionForwardPostNMSSize(N, numClasses, topK);
64 | wss[5] = detectionForwardPostNMSSize(N, numClasses, topK);
65 | wss[6] = std::max(sortScoresPerClassWorkspaceSize(N, numClasses, numPredsPerClass, DT_SCORE),
66 | sortScoresPerImageWorkspaceSize(N, numClasses * topK, DT_SCORE));
67 | return calculateTotalWorkspaceSize(wss, 7);
68 | }
69 | } // namespace plugin
70 | } // namespace nvinfer1
71 |
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/common/kernels/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | file(GLOB SRCS *.cpp)
17 | set(PLUGIN_SOURCES ${PLUGIN_SOURCES} ${SRCS})
18 | set(PLUGIN_SOURCES ${PLUGIN_SOURCES} PARENT_SCOPE)
19 | file(GLOB CU_SRCS *.cu)
20 | set(PLUGIN_CU_SOURCES ${PLUGIN_CU_SOURCES} ${CU_SRCS})
21 | set(PLUGIN_CU_SOURCES ${PLUGIN_CU_SOURCES} PARENT_SCOPE)
22 |
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/common/kernels/common.cu:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | #include "cuda.h"
18 | #include "cublas_v2.h"
19 | #include <cstdint>
20 | #include <cstdio>
21 | #include "kernel.h"
22 | #include "bboxUtils.h"
23 |
24 | #define CUDA_MEM_ALIGN 256
25 |
26 | // HASH
27 | unsigned int hash(const void* array_, size_t size)
28 | {
29 | // Apply hashing only when debugging RPN codes.
30 | if (DEBUG_ENABLE)
31 | {
32 | const char* array_const;
33 | char* array;
34 | cudaMallocHost((void**) &array, size);
35 | cudaMemcpy(array, array_, size, cudaMemcpyDeviceToHost);
36 | array_const = array;
37 | unsigned int hash = 45599;
38 | for (size_t i = 0; i < size; i++)
39 | {
40 | unsigned int value = array_const[i];
41 | hash = hash * 1487 + value;
42 | hash = hash * 317;
43 | hash = hash % 105359;
44 | }
45 | return hash;
46 | }
47 | else
48 | {
49 | return 0;
50 | }
51 | }
52 |
53 | // ALIGNPTR
54 | int8_t* alignPtr(int8_t* ptr, uintptr_t to)
55 | {
56 | uintptr_t addr = (uintptr_t) ptr;
57 | if (addr % to)
58 | {
59 | addr += to - addr % to;
60 | }
61 | return (int8_t*) addr;
62 | }
63 |
64 | // NEXTWORKSPACEPTR
65 | int8_t* nextWorkspacePtr(int8_t* ptr, uintptr_t previousWorkspaceSize)
66 | {
67 | uintptr_t addr = (uintptr_t) ptr;
68 | addr += previousWorkspaceSize;
69 | return alignPtr((int8_t*) addr, CUDA_MEM_ALIGN);
70 | }
71 |
72 | // CALCULATE TOTAL WORKSPACE SIZE
73 | size_t calculateTotalWorkspaceSize(size_t* workspaces, int count)
74 | {
75 | size_t total = 0;
76 | for (int i = 0; i < count; i++)
77 | {
78 | total += workspaces[i];
79 | if (workspaces[i] % CUDA_MEM_ALIGN)
80 | {
81 | total += CUDA_MEM_ALIGN - (workspaces[i] % CUDA_MEM_ALIGN);
82 | }
83 | }
84 | return total;
85 | }
86 |
87 | using nvinfer1::DataType;
88 |
89 | // DATA TYPE SIZE
90 | size_t dataTypeSize(const DataType dtype)
91 | {
92 | switch (dtype)
93 | {
94 | case DataType::kINT8: return sizeof(char);
95 | case DataType::kHALF: return sizeof(short);
96 | case DataType::kFLOAT: return sizeof(float);
97 | default: return 0;
98 | }
99 | }
100 |
101 | // CUB
102 | /*
103 | size_t cubSortFloatIntPairsWorkspaceSize(int num_items, int num_segments)
104 | {
105 | size_t temp_storage_bytes = 0;
106 | cub::DeviceSegmentedRadixSort::SortPairsDescending(
107 | (int *)NULL, temp_storage_bytes,
108 | (const float *)NULL, (float *)NULL,
109 | (const int *)NULL, (int *)NULL,
110 | num_items, // # items
111 | num_segments, // # segments
112 | (const int *)NULL, (const int *)NULL);
113 | return temp_storage_bytes;
114 | }
115 |
116 | size_t cubSortFloatBboxInfoPairsWorkspaceSize(int num_items, int num_segments)
117 | {
118 | size_t temp_storage_bytes = 0;
119 | cub::DeviceSegmentedRadixSort::SortPairsDescending(
120 | (int *)NULL, temp_storage_bytes,
121 | (const float *)NULL, (float *)NULL,
122 | (const BboxInfo *)NULL, (BboxInfo *)NULL,
123 | num_items, // # items
124 | num_segments, // # segments
125 | (const int *)NULL, (const int *)NULL);
126 | return temp_storage_bytes;
127 | }
128 | */
129 |
130 | template <unsigned nthds_per_cta>
131 | __launch_bounds__(nthds_per_cta)
132 | __global__ void setUniformOffsets_kernel(
133 | const int num_segments,
134 | const int offset,
135 | int* d_offsets)
136 | {
137 | const int idx = blockIdx.x * nthds_per_cta + threadIdx.x;
138 | if (idx <= num_segments)
139 | d_offsets[idx] = idx * offset;
140 | }
141 |
142 | void setUniformOffsets(
143 | cudaStream_t stream,
144 | const int num_segments,
145 | const int offset,
146 | int* d_offsets)
147 | {
148 | const int BS = 32;
149 | const int GS = (num_segments + 1 + BS - 1) / BS;
150 | setUniformOffsets_kernel<BS><<<GS, BS, 0, stream>>>(num_segments, offset, d_offsets);
151 | }
152 |
153 |
154 | const char* cublasGetErrorString(cublasStatus_t error)
155 | {
156 | switch (error)
157 | {
158 | case CUBLAS_STATUS_SUCCESS:
159 | return "CUBLAS_STATUS_SUCCESS";
160 | case CUBLAS_STATUS_NOT_INITIALIZED:
161 | return "CUBLAS_STATUS_NOT_INITIALIZED";
162 | case CUBLAS_STATUS_ALLOC_FAILED:
163 | return "CUBLAS_STATUS_ALLOC_FAILED";
164 | case CUBLAS_STATUS_INVALID_VALUE:
165 | return "CUBLAS_STATUS_INVALID_VALUE";
166 | case CUBLAS_STATUS_ARCH_MISMATCH:
167 | return "CUBLAS_STATUS_ARCH_MISMATCH";
168 | case CUBLAS_STATUS_MAPPING_ERROR:
169 | return "CUBLAS_STATUS_MAPPING_ERROR";
170 | case CUBLAS_STATUS_EXECUTION_FAILED:
171 | return "CUBLAS_STATUS_EXECUTION_FAILED";
172 | case CUBLAS_STATUS_INTERNAL_ERROR:
173 | return "CUBLAS_STATUS_INTERNAL_ERROR";
174 | #if CUDA_VERSION >= 6000
175 | case CUBLAS_STATUS_NOT_SUPPORTED:
176 | return "CUBLAS_STATUS_NOT_SUPPORTED";
177 | #endif
178 | #if CUDA_VERSION >= 6050
179 | case CUBLAS_STATUS_LICENSE_ERROR:
180 | return "CUBLAS_STATUS_LICENSE_ERROR";
181 | #endif
182 | }
183 | return "Unknown cublas status";
184 | }
185 |
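As a concrete example of the 256-byte alignment above: calculateTotalWorkspaceSize on the two chunk sizes {1000, 513} reserves 1024 + 768 = 1792 bytes, since 1000 is padded by 24 bytes and 513 by 255 bytes up to the next multiple of CUDA_MEM_ALIGN; nextWorkspacePtr then places each successive chunk on one of those 256-byte boundaries.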
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/common/kernels/permuteData.cu:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | #include <array>
17 | #include "kernel.h"
18 |
19 | template <typename Dtype, unsigned nthds_per_cta>
20 | __launch_bounds__(nthds_per_cta)
21 | __global__ void permuteData_kernel(
22 | const int nthreads,
23 | const int num_classes,
24 | const int num_data,
25 | const int num_dim,
26 | bool confSigmoid,
27 | const Dtype* data,
28 | Dtype* new_data)
29 | {
30 | // data format: [batch_size, num_data, num_classes, num_dim]
31 | for (int index = blockIdx.x * nthds_per_cta + threadIdx.x;
32 | index < nthreads;
33 | index += nthds_per_cta * gridDim.x)
34 | {
35 | const int i = index % num_dim;
36 | const int c = (index / num_dim) % num_classes;
37 | const int d = (index / num_dim / num_classes) % num_data;
38 | const int n = index / num_dim / num_classes / num_data;
39 | const int new_index = ((n * num_classes + c) * num_data + d) * num_dim + i;
40 | float result = data[index];
41 | if (confSigmoid)
42 | result = exp(result) / (1 + exp(result));
43 |
44 | new_data[new_index] = result;
45 | }
46 | // new data format: [batch_size, num_classes, num_data, num_dim]
47 | }
48 |
49 | template <typename Dtype>
50 | pluginStatus_t permuteData_gpu(
51 | cudaStream_t stream,
52 | const int nthreads,
53 | const int num_classes,
54 | const int num_data,
55 | const int num_dim,
56 | bool confSigmoid,
57 | const void* data,
58 | void* new_data)
59 | {
60 | const int BS = 512;
61 | const int GS = (nthreads + BS - 1) / BS;
62 | permuteData_kernel<Dtype, BS><<<GS, BS, 0, stream>>>(nthreads, num_classes, num_data, num_dim, confSigmoid,
63 | (const Dtype*) data, (Dtype*) new_data);
64 | CSC(cudaGetLastError(), STATUS_FAILURE);
65 | return STATUS_SUCCESS;
66 | }
67 |
68 | // permuteData LAUNCH CONFIG
69 | typedef pluginStatus_t (*pdFunc)(cudaStream_t, const int, const int, const int, const int, bool, const void*, void*);
70 |
71 | struct pdLaunchConfig
72 | {
73 | DataType t_data;
74 | pdFunc function;
75 |
76 | pdLaunchConfig(DataType t_data)
77 | : t_data(t_data)
78 | {
79 | }
80 | pdLaunchConfig(DataType t_data, pdFunc function)
81 | : t_data(t_data)
82 | , function(function)
83 | {
84 | }
85 | bool operator==(const pdLaunchConfig& other)
86 | {
87 | return t_data == other.t_data;
88 | }
89 | };
90 |
91 | static std::array<pdLaunchConfig, 2> pdLCOptions = {
92 | pdLaunchConfig(DataType::kFLOAT, permuteData_gpu<float>), pdLaunchConfig(DataType::kHALF, permuteData_gpu<__half>)};
93 |
94 | pluginStatus_t permuteData(cudaStream_t stream, const int nthreads, const int num_classes, const int num_data,
95 | const int num_dim, const DataType DT_DATA, bool confSigmoid, const void* data, void* new_data)
96 | {
97 | pdLaunchConfig lc = pdLaunchConfig(DT_DATA);
98 | for (unsigned i = 0; i < pdLCOptions.size(); ++i)
99 | {
100 | if (lc == pdLCOptions[i])
101 | {
102 | DEBUG_PRINTF("permuteData kernel %d\n", i);
103 | return pdLCOptions[i].function(stream,
104 | nthreads,
105 | num_classes,
106 | num_data,
107 | num_dim,
108 | confSigmoid,
109 | data,
110 | new_data);
111 | }
112 | }
113 | return STATUS_BAD_PARAM;
114 | }
115 |
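The kernel is just an axis swap. A host-side reference of the same index mapping (useful, for instance, to unit-test the kernel against CPU output; the buffers and extents are placeholders) could look like this sketch:

    // Reference permutation: [batch, num_data, num_classes, num_dim] -> [batch, num_classes, num_data, num_dim]
    for (int n = 0; n < batch; ++n)
        for (int d = 0; d < num_data; ++d)
            for (int c = 0; c < num_classes; ++c)
                for (int i = 0; i < num_dim; ++i)
                    dst[((n * num_classes + c) * num_data + d) * num_dim + i] =
                        src[((n * num_data + d) * num_classes + c) * num_dim + i];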
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/common/kernels/reducedMathPlugin.h:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | #ifndef _REDUCED_MATH_PLUGIN_H
18 | #define _REDUCED_MATH_PLUGIN_H
19 | #include <cstdint>
20 | // Dynamically strength-reduced div and mod
21 | //
22 | // Ideas taken from Sean Baxter's MGPU library.
23 | // These classes provide for reduced complexity division and modulus
24 | // on integers, for the case where the same divisor or modulus will
25 | // be used repeatedly.
26 |
27 | namespace nvinfer1
28 | {
29 | namespace plugin
30 | {
31 | namespace detail
32 | {
33 |
34 | void findDivisor(int denom, unsigned int& mul_coeff, unsigned int& shift_coeff);
35 |
36 | __host__ __device__ __forceinline__ uint32_t umulhi(uint32_t x, uint32_t y)
37 | {
38 | #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 100
39 | return __umulhi(x, y);
40 | #else
41 | uint64_t z = (uint64_t) x * (uint64_t) y;
42 | return (uint32_t) (z >> 32);
43 | #endif
44 | }
45 |
46 | // This is a weird implementation that returns div_up(0,1)=0 but
47 | // div_up(0,2)=1 (wrong) -- just do not use it with a=0.
48 | __host__ __device__ inline int div_up(int a, int b)
49 | {
50 | return (a - 1) / b + 1;
51 | }
52 |
53 | } //end namespace detail
54 |
55 | class ReducedDivisor
56 | {
57 | public:
58 | ReducedDivisor() {}
59 | __host__ __forceinline__
60 | ReducedDivisor(int _y)
61 | : y(_y)
62 | {
63 | detail::findDivisor(y, mul_coeff, shift_coeff);
64 | }
65 | __host__ __device__ __forceinline__
66 | ReducedDivisor(unsigned _mul_coeff, unsigned _shift_coeff, int _y)
67 | : mul_coeff(_mul_coeff)
68 | , shift_coeff(_shift_coeff)
69 | , y(_y)
70 | {
71 | }
72 | __host__ __device__ __forceinline__ int div(int x) const
73 | {
74 | // if dividing by 1, then findDivisor wouldn't have worked because
75 | // mul_coeff would have had to be 2^32, which can't be represented,
76 | // so we have to special case that one.
77 | return (y != 1) ? detail::umulhi((uint32_t) x, mul_coeff) >> shift_coeff : x;
78 | }
79 | __host__ __device__ __forceinline__ int mod(int x) const
80 | {
81 | return x - (div(x) * y);
82 | }
83 | __host__ __device__ __forceinline__ void divmod(int x, int& q, int& mod) const
84 | {
85 | q = div(x);
86 | mod = x - (q * y);
87 | }
88 | __host__ __device__ __forceinline__ int get() const
89 | {
90 | return y;
91 | }
92 | inline __host__ void get_mul_shift(unsigned& mul, unsigned& shift)
93 | {
94 | mul = mul_coeff;
95 | shift = shift_coeff;
96 | }
97 |
98 | protected:
99 | uint32_t mul_coeff;
100 | uint32_t shift_coeff;
101 | int y;
102 | };
103 |
104 | } // namespace plugin
105 |
106 | } // namespace nvinfer1
107 | #endif /*_REDUCED_MATH_PLUGIN_H*/
108 |
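To make the div() trick concrete, a worked example assuming denom = 3: find_log_2(3, round_up) = 2, so p = 33; mul_coeff = ceil(2^33 / 3) = 2863311531 (0xAAAAAAAB) and shift_coeff = p - 32 = 1. Then div(10) = umulhi(10, 0xAAAAAAAB) >> 1 = 6 >> 1 = 3, which matches 10 / 3 without issuing an integer divide.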
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/common/kernels/sortScoresPerImage.cu:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | #include "cub/cub.cuh"
17 | #include <array>
18 | #include "kernel.h"
19 | #include "bboxUtils.h"
20 | #include "cub_helper.h"
21 |
22 | template <typename T_SCORE>
23 | pluginStatus_t sortScoresPerImage_gpu(
24 | cudaStream_t stream,
25 | const int num_images,
26 | const int num_items_per_image,
27 | void* unsorted_scores,
28 | void* unsorted_bbox_indices,
29 | void* sorted_scores,
30 | void* sorted_bbox_indices,
31 | void* workspace,
32 | int score_bits
33 | )
34 | {
35 | void* d_offsets = workspace;
36 | void* cubWorkspace = nextWorkspacePtr((int8_t*) d_offsets, (num_images + 1) * sizeof(int));
37 |
38 | setUniformOffsets(stream, num_images, num_items_per_image, (int*) d_offsets);
39 |
40 | const int arrayLen = num_images * num_items_per_image;
41 | size_t temp_storage_bytes = cubSortPairsWorkspaceSize<T_SCORE, int>(arrayLen, num_images);
42 | size_t begin_bit = 0;
43 | size_t end_bit = sizeof(T_SCORE) * 8;
44 | if (sizeof(T_SCORE) == 2 && score_bits > 0 && score_bits <= 10)
45 | {
46 | end_bit = 10;
47 | begin_bit = end_bit - score_bits;
48 | }
49 | cub::DeviceSegmentedRadixSort::SortPairsDescending(
50 | cubWorkspace, temp_storage_bytes,
51 | (const T_SCORE*) (unsorted_scores), (T_SCORE*) (sorted_scores),
52 | (const int*) (unsorted_bbox_indices), (int*) (sorted_bbox_indices),
53 | arrayLen, num_images,
54 | (const int*) d_offsets, (const int*) d_offsets + 1,
55 | begin_bit, end_bit,
56 | stream);
57 | CSC(cudaGetLastError(), STATUS_FAILURE);
58 | return STATUS_SUCCESS;
59 | }
60 |
61 | // sortScoresPerImage LAUNCH CONFIG
62 | typedef pluginStatus_t (*sspiFunc)(cudaStream_t,
63 | const int,
64 | const int,
65 | void*,
66 | void*,
67 | void*,
68 | void*,
69 | void*,
70 | int);
71 | struct sspiLaunchConfig
72 | {
73 | DataType t_score;
74 | sspiFunc function;
75 |
76 | sspiLaunchConfig(DataType t_score)
77 | : t_score(t_score)
78 | {
79 | }
80 | sspiLaunchConfig(DataType t_score, sspiFunc function)
81 | : t_score(t_score)
82 | , function(function)
83 | {
84 | }
85 | bool operator==(const sspiLaunchConfig& other)
86 | {
87 | return t_score == other.t_score;
88 | }
89 | };
90 |
91 | static std::array<sspiLaunchConfig, 2> sspiLCOptions = {
92 | sspiLaunchConfig(DataType::kFLOAT, sortScoresPerImage_gpu<float>),
93 | sspiLaunchConfig(DataType::kHALF, sortScoresPerImage_gpu<__half>),
94 | };
95 |
96 | pluginStatus_t sortScoresPerImage(
97 | cudaStream_t stream,
98 | const int num_images,
99 | const int num_items_per_image,
100 | const DataType DT_SCORE,
101 | void* unsorted_scores,
102 | void* unsorted_bbox_indices,
103 | void* sorted_scores,
104 | void* sorted_bbox_indices,
105 | void* workspace,
106 | int score_bits
107 | )
108 | {
109 | sspiLaunchConfig lc = sspiLaunchConfig(DT_SCORE);
110 | for (unsigned i = 0; i < sspiLCOptions.size(); ++i)
111 | {
112 | if (lc == sspiLCOptions[i])
113 | {
114 | DEBUG_PRINTF("sortScoresPerImage kernel %d\n", i);
115 | return sspiLCOptions[i].function(stream,
116 | num_images,
117 | num_items_per_image,
118 | unsorted_scores,
119 | unsorted_bbox_indices,
120 | sorted_scores,
121 | sorted_bbox_indices,
122 | workspace,
123 | score_bits);
124 | }
125 | }
126 | return STATUS_BAD_PARAM;
127 | }
128 |
129 | size_t sortScoresPerImageWorkspaceSize(
130 | const int num_images,
131 | const int num_items_per_image,
132 | const DataType DT_SCORE)
133 | {
134 | const int arrayLen = num_images * num_items_per_image;
135 | size_t wss[2];
136 | wss[0] = (num_images + 1) * sizeof(int); // offsets
137 | if (DT_SCORE == DataType::kFLOAT)
138 | {
139 | wss[1] = cubSortPairsWorkspaceSize<float, int>(arrayLen, num_images); // cub workspace
140 | }
141 | else if (DT_SCORE == DataType::kHALF)
142 | {
143 | wss[1] = cubSortPairsWorkspaceSize<__half, int>(arrayLen, num_images); // cub workspace
144 | }
145 | else
146 | {
147 | printf("SCORE type not supported.\n");
148 | return (size_t) -1;
149 | }
150 |
151 | return calculateTotalWorkspaceSize(wss, 2);
152 | }
153 |
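The score_bits / begin_bit logic relies on the FP16 bit layout (1 sign, 5 exponent, 10 mantissa bits) together with the scoreShift = 1.0 applied in batchedNMSCustomInference.cu: shifting a score s in [0, 1) to 1 + s puts every key in [1, 2), where the sign and exponent bits are identical, so ordering is decided entirely by the mantissa. Restricting the radix sort to bits [10 - score_bits, 10) therefore sorts only the top score_bits mantissa bits, trading a little precision for a faster sort. As a small worked check: scores 0.50 and 0.25 become 1.50 and 1.25, whose half-precision patterns are 0x3E00 and 0x3D00; they differ only in mantissa bits, and already their top two mantissa bits (10 vs 01) order them correctly.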
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/common/logger.cpp:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | #include "logger.h"
18 | #include "ErrorRecorder.h"
19 | #include "logging.h"
20 |
21 | SampleErrorRecorder gRecorder;
22 | namespace sample
23 | {
24 | Logger gLogger{Logger::Severity::kINFO};
25 | LogStreamConsumer gLogVerbose{LOG_VERBOSE(gLogger)};
26 | LogStreamConsumer gLogInfo{LOG_INFO(gLogger)};
27 | LogStreamConsumer gLogWarning{LOG_WARN(gLogger)};
28 | LogStreamConsumer gLogError{LOG_ERROR(gLogger)};
29 | LogStreamConsumer gLogFatal{LOG_FATAL(gLogger)};
30 |
31 | void setReportableSeverity(Logger::Severity severity)
32 | {
33 | gLogger.setReportableSeverity(severity);
34 | gLogVerbose.setReportableSeverity(severity);
35 | gLogInfo.setReportableSeverity(severity);
36 | gLogWarning.setReportableSeverity(severity);
37 | gLogError.setReportableSeverity(severity);
38 | gLogFatal.setReportableSeverity(severity);
39 | }
40 | } // namespace sample
41 |
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/common/logger.h:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | #ifndef LOGGER_H
18 | #define LOGGER_H
19 |
20 | #include "logging.h"
21 |
22 | class SampleErrorRecorder;
23 | extern SampleErrorRecorder gRecorder;
24 | namespace sample
25 | {
26 | extern Logger gLogger;
27 | extern LogStreamConsumer gLogVerbose;
28 | extern LogStreamConsumer gLogInfo;
29 | extern LogStreamConsumer gLogWarning;
30 | extern LogStreamConsumer gLogError;
31 | extern LogStreamConsumer gLogFatal;
32 |
33 | void setReportableSeverity(Logger::Severity severity);
34 | } // namespace sample
35 |
36 | #endif // LOGGER_H
37 |
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/common/nmsHelper.cpp:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | #include "cuda_fp16.h"
18 | #include "plugin.h"
19 | #include <cstdio>
20 |
21 | using namespace nvinfer1;
22 | using namespace nvinfer1::plugin;
23 |
24 | size_t detectionForwardBBoxDataSize(int N, int C1, DataType DT_BBOX)
25 | {
26 | if (DT_BBOX == DataType::kFLOAT)
27 | {
28 | return N * C1 * sizeof(float);
29 | }
30 | if (DT_BBOX == DataType::kHALF)
31 | {
32 | return N * C1 * sizeof(__half);
33 | }
34 |
35 | printf("Only FP32/FP16 type bounding boxes are supported.\n");
36 | return (size_t) -1;
37 | }
38 |
39 | size_t detectionForwardBBoxPermuteSize(bool shareLocation, int N, int C1, DataType DT_BBOX)
40 | {
41 | if (DT_BBOX == DataType::kFLOAT)
42 | {
43 | return shareLocation ? 0 : N * C1 * sizeof(float);
44 | }
45 | if (DT_BBOX == DataType::kHALF)
46 | {
47 | return shareLocation ? 0 : N * C1 * sizeof(__half);
48 | }
49 |
50 | printf("Only FP32/FP16 type bounding boxes are supported.\n");
51 | return (size_t) -1;
52 | }
53 |
54 | size_t detectionForwardLandmarkDataSize(int N, int C3, DataType DT_BBOX)
55 | {
56 | if (DT_BBOX == DataType::kFLOAT)
57 | {
58 | return N * C3 * sizeof(float);
59 | }
60 | if (DT_BBOX == DataType::kHALF)
61 | {
62 | return N * C3 * sizeof(__half);
63 | }
64 |
65 | printf("Only FP32/FP16 type bounding boxes are supported.\n");
66 | return (size_t) -1;
67 | }
68 |
69 | size_t detectionForwardLandmarkPermuteSize(bool shareLocation, int N, int C3, DataType DT_BBOX)
70 | {
71 | if (DT_BBOX == DataType::kFLOAT)
72 | {
73 | return shareLocation ? 0 : N * C3 * sizeof(float);
74 | }
75 | if (DT_BBOX == DataType::kHALF)
76 | {
77 | return shareLocation ? 0 : N * C3 * sizeof(__half);
78 | }
79 |
80 | printf("Only FP32/FP16 type bounding boxes are supported.\n");
81 | return (size_t) -1;
82 | }
83 |
84 | size_t detectionForwardPreNMSSize(int N, int C2)
85 | {
86 | ASSERT(sizeof(float) == sizeof(int));
87 | return N * C2 * sizeof(float);
88 | }
89 |
90 | size_t detectionForwardPostNMSSize(int N, int numClasses, int topK)
91 | {
92 | ASSERT(sizeof(float) == sizeof(int));
93 | return N * numClasses * topK * sizeof(float);
94 | }
95 |
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/common/nmsUtils.h:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | #ifndef TRT_NMS_UTILS_H
17 | #define TRT_NMS_UTILS_H
18 |
19 | #include "plugin.h"
20 |
21 | using namespace nvinfer1;
22 | using namespace nvinfer1::plugin;
23 |
24 | size_t detectionInferenceWorkspaceSize(bool shareLocation, int N, int C1, int C2, int numClasses, int numPredsPerClass,
25 | int topK, DataType DT_BBOX, DataType DT_SCORE);
26 | size_t detectionInferenceWorkspaceSizeCustom(bool shareLocation, int N, int C1, int C2, int C3, int numClasses, int numPredsPerClass,
27 | int topK, DataType DT_BBOX, DataType DT_SCORE);
28 | #endif
29 |
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/common/reducedMathPlugin.cpp:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | #include <cstdint>
17 | namespace nvinfer1
18 | {
19 | namespace plugin
20 | {
21 | namespace detail
22 | {
23 |
24 | // Count leading zeros - start from most significant bit.
25 | int clz(int x)
26 | {
27 | for (int i = 31; i >= 0; --i)
28 | {
29 | if ((1U << i) & x)
30 | {
31 | return 31 - i;
32 | }
33 | }
34 | return 32;
35 | }
36 |
37 | #define CUDNN_IS_POW_2(x) (0 == ((x) & ((x) -1)))
38 |
39 | int find_log_2(int x, bool round_up = false)
40 | {
41 | int a = 31 - clz(x);
42 | if (round_up)
43 | {
44 | a += !CUDNN_IS_POW_2(x);
45 | }
46 | return a;
47 | }
48 |
49 | void findDivisor(int denom,
50 | unsigned int& mul_coeff, unsigned int& shift_coeff)
51 | {
52 | if (denom == 0)
53 | {
54 | return;
55 | }
56 | if (denom == 1)
57 | {
58 | // if dividing by 1, reduced math doesn't work because mul_coeff would
59 | // need to be 2^32, which doesn't fit into unsigned int. the div()
60 | // routine handles this special case separately.
61 | mul_coeff = 0;
62 | shift_coeff = 0;
63 | return;
64 | }
65 | // To express the division N/D in terms of a multiplication, what we first
66 | // imagine is simply N*(1/D). However, 1/D will always evaluate to 0 (for D>1),
67 | // so we need another way. There's nothing that says we have to use exactly
68 | // the fraction 1/D; instead it could be any X/Y that reduces to 1/D (i.e.,
69 | // Y=X*D), or at least to "close enough" to it. If we pick Y that is a power
70 | // of two, then the N*(X/Y) can be N*X followed by a right-shift by some amount.
71 | // The power of two we should pick should be at least 2^32, because in the
72 | // div() routine we'll use umulhi(), which returns only the upper 32 bits --
73 | // this being equivalent to a right-shift by 32. But we might want a higher
74 | // power of two for better accuracy depending on the magnitude of the denominator.
75 | // Once we've picked Y, then X [our mul_coeff value] is simply Y/D, rounding up,
76 | // and we save shift_coeff as whatever further shift we have to do beyond
77 | // what the umulhi() implies.
78 | uint32_t p = 31 + find_log_2(denom, true);
79 | uint32_t m = ((1ull << p) + (uint32_t) denom - 1) / (uint32_t) denom;
80 | mul_coeff = m;
81 | shift_coeff = p - 32;
82 | }
83 |
84 | } // namespace detail
85 |
86 | } // namespace plugin
87 |
88 | } // namespace nvinfer1
89 |
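The following short Python sketch is an addition for illustration (not part of the repository): it mirrors `find_log_2`/`findDivisor` above and emulates CUDA's `__umulhi` with 64-bit integers, so the reduced-division trick described in the comments can be checked numerically.

```python
def find_log_2(x: int, round_up: bool = False) -> int:
    a = x.bit_length() - 1            # floor(log2(x)), same as 31 - clz(x)
    if round_up and (x & (x - 1)) != 0:
        a += 1                        # bump when x is not a power of two
    return a

def find_divisor(denom: int):
    """Return (mul_coeff, shift_coeff) as computed by findDivisor.
    Note: denom == 0 and denom == 1 are special-cased in the C++ code and skipped here."""
    p = 31 + find_log_2(denom, round_up=True)
    mul_coeff = ((1 << p) + denom - 1) // denom   # ceil(2^p / denom)
    return mul_coeff, p - 32

def fast_div(n: int, mul_coeff: int, shift_coeff: int) -> int:
    hi = (n * mul_coeff) >> 32        # emulates __umulhi(n, mul_coeff)
    return hi >> shift_coeff

if __name__ == "__main__":
    for denom in (3, 7, 416, 1000):
        m, s = find_divisor(denom)
        assert all(fast_div(n, m, s) == n // denom for n in range(1, 100000))
        print(f"denom={denom}: mul_coeff={m}, shift_coeff={s} OK")
```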
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/common/serialize.hpp:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | #pragma once
17 |
18 | #include <cassert>
19 | #include <cstring>
20 | #include <type_traits>
21 | #include <vector>
22 |
23 | #include <iostream>
24 | using std::cerr;
25 | using std::cout;
26 | using std::endl;
27 |
28 | template <typename T>
29 | inline void serialize_value(void** buffer, T const& value);
30 |
31 | template <typename T>
32 | inline void deserialize_value(void const** buffer, size_t* buffer_size, T* value);
33 |
34 | namespace
35 | {
36 |
37 | template <typename T, class Enable = void>
38 | struct Serializer
39 | {
40 | };
41 |
42 | template <typename T>
43 | struct Serializer<T, typename std::enable_if<std::is_arithmetic<T>::value || std::is_enum<T>::value
44 |     || std::is_pod<T>::value>::type>
45 | {
46 | static size_t serialized_size(T const&)
47 | {
48 | return sizeof(T);
49 | }
50 | static void serialize(void** buffer, T const& value)
51 | {
52 | ::memcpy(*buffer, &value, sizeof(T));
53 | reinterpret_cast<char*&>(*buffer) += sizeof(T);
54 | }
55 | static void deserialize(void const** buffer, size_t* buffer_size, T* value)
56 | {
57 | assert(*buffer_size >= sizeof(T));
58 | ::memcpy(value, *buffer, sizeof(T));
59 | reinterpret_cast<char const*&>(*buffer) += sizeof(T);
60 | *buffer_size -= sizeof(T);
61 | }
62 | };
63 |
64 | template <>
65 | struct Serializer<const char*>
66 | {
67 | static size_t serialized_size(const char* value)
68 | {
69 | return strlen(value) + 1;
70 | }
71 | static void serialize(void** buffer, const char* value)
72 | {
73 | ::strcpy(static_cast<char*>(*buffer), value);
74 | reinterpret_cast<char*&>(*buffer) += strlen(value) + 1;
75 | }
76 | static void deserialize(void const** buffer, size_t* buffer_size, const char** value)
77 | {
78 | *value = static_cast<char const*>(*buffer);
79 | size_t data_size = strnlen(*value, *buffer_size) + 1;
80 | assert(*buffer_size >= data_size);
81 | reinterpret_cast<char const*&>(*buffer) += data_size;
82 | *buffer_size -= data_size;
83 | }
84 | };
85 |
86 | template <typename T>
87 | struct Serializer<std::vector<T>,
88 |     typename std::enable_if<std::is_arithmetic<T>::value || std::is_enum<T>::value || std::is_pod<T>::value>::type>
89 | {
90 | static size_t serialized_size(std::vector<T> const& value)
91 | {
92 | return sizeof(value.size()) + value.size() * sizeof(T);
93 | }
94 | static void serialize(void** buffer, std::vector<T> const& value)
95 | {
96 | serialize_value(buffer, value.size());
97 | size_t nbyte = value.size() * sizeof(T);
98 | ::memcpy(*buffer, value.data(), nbyte);
99 | reinterpret_cast<char*&>(*buffer) += nbyte;
100 | }
101 | static void deserialize(void const** buffer, size_t* buffer_size, std::vector<T>* value)
102 | {
103 | size_t size;
104 | deserialize_value(buffer, buffer_size, &size);
105 | value->resize(size);
106 | size_t nbyte = value->size() * sizeof(T);
107 | assert(*buffer_size >= nbyte);
108 | ::memcpy(value->data(), *buffer, nbyte);
109 | reinterpret_cast<char const*&>(*buffer) += nbyte;
110 | *buffer_size -= nbyte;
111 | }
112 | };
113 |
114 | } // namespace
115 |
116 | template <typename T>
117 | inline size_t serialized_size(T const& value)
118 | {
119 | return Serializer<T>::serialized_size(value);
120 | }
121 |
122 | template <typename T>
123 | inline void serialize_value(void** buffer, T const& value)
124 | {
125 | return Serializer<T>::serialize(buffer, value);
126 | }
127 |
128 | template <typename T>
129 | inline void deserialize_value(void const** buffer, size_t* buffer_size, T* value)
130 | {
131 | return Serializer<T>::deserialize(buffer, buffer_size, value);
132 | }
133 |
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/gatherNMSCustomOutputs.h:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | #ifndef TRT_BATCHED_NMS_HELPER_H
17 | #define TRT_BATCHED_NMS_HELPER_H
18 | #include "plugin.h"
19 | using namespace nvinfer1;
20 | using namespace nvinfer1::plugin;
21 |
22 | pluginStatus_t gatherNMSCustomOutputs(cudaStream_t stream, bool shareLocation, int numImages, int numPredsPerClass,
23 | int numClasses, int topK, int keepTopK, DataType DT_BBOX, DataType DT_SCORE, const void* indices,
24 | const void* scores, const void* bboxData, const void* landData, void* keepCount, void* nmsedBoxes, void* nmsedScores, void* nmsedClasses, void* nmsedLandmarks,
25 | bool clipBoxes, const float scoreShift);
26 |
27 | #endif
28 |
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/nvdsparsebbox_Yolo.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/nvdsparsebbox_Yolo.o
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-scrfd/parser_scrfd.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import pyds
3 | import ctypes
4 | import numpy as np
5 |
6 | def layer_finder(output_layer_info, name):
7 | """ Return the layer contained in output_layer_info which corresponds
8 | to the given name.
9 | """
10 | for layer in output_layer_info:
11 | # dataType == 0 <=> dataType == FLOAT
12 | # print(layer.layerName)
13 | if layer.dataType == 0 and layer.layerName == name:
14 | return layer
15 | return None
16 |
17 |
18 | def clip(x):
19 | return min(max(0.0, x), 1.0)
20 |
21 | def make_object(index, layers, default_classId = 1):
22 | """ Creates a NvDsInferObjectDetectionInfo object from one layer of SSD.
23 | Return None if the class Id is invalid, if the detection confidence
24 | is under the threshold or if the width/height of the bounding box is
25 | null/negative.
26 | Return the created NvDsInferObjectDetectionInfo object otherwise.
27 | """
28 | box_layer, score_layer = layers
29 | res = pyds.NvDsInferObjectDetectionInfo()
30 | res.detectionConfidence = score_layer[index]
31 | res.classId = default_classId
32 |
33 | rect_x1_f = box_layer[index][0]
34 | rect_y1_f = box_layer[index][1]
35 | rect_x2_f = box_layer[index][2]
36 | rect_y2_f = box_layer[index][3]
37 | res.left = clip(rect_x1_f)
38 | res.top = clip(rect_y1_f)
39 | res.width = clip(rect_x2_f - rect_x1_f)
40 | res.height = clip(rect_y2_f - rect_y1_f)
41 |
42 | return res
43 |
44 | def nvds_infer_parse_scrfd(output_layer_info, input_size):
45 | """ Get data from output_layer_info and fill object_list
46 | num_detections: [1]
47 | nmsed_bboxes: [200, 4]
48 | nmsed_scores: [200]
49 | nmsed_classes: [200]
50 | nmsed_landmarks:[200, 10]
51 | """
52 | num_detection_layer = output_layer_info[0]
53 | box_layer = output_layer_info[1]
54 | score_layer = output_layer_info[2]
55 | class_layer = output_layer_info[3]
56 | landmark_layer = output_layer_info[4]
57 |
58 | # if not num_detection_layer or not score_layer or not class_layer or not box_layer or not landmark_layer:
59 | # sys.stderr.write("ERROR: some layers missing in output tensors\n")
60 | # return []
61 |
62 | ptr = ctypes.cast(pyds.get_ptr(num_detection_layer.buffer), ctypes.POINTER(ctypes.c_int32))
63 | num_detection = np.ctypeslib.as_array(ptr, shape=(1,))[0]
64 | object_list = []
65 | landmark_list = []
66 |
67 | if num_detection > 0:
68 | ptr = ctypes.cast(pyds.get_ptr(box_layer.buffer), ctypes.POINTER(ctypes.c_float))
69 | box_result = np.ctypeslib.as_array(ptr, shape=(200,4))
70 |
71 | # Normalize
72 | box_result = box_result.astype('float32')
73 | box_result[:, 0] /= input_size[0]
74 | box_result[:, 1] /= input_size[1]
75 | box_result[:, 2] /= input_size[0]
76 | box_result[:, 3] /= input_size[1]
77 |
78 | ptr = ctypes.cast(pyds.get_ptr(score_layer.buffer), ctypes.POINTER(ctypes.c_float))
79 | score_result = np.ctypeslib.as_array(ptr, shape=(200,))
80 | ptr = ctypes.cast(pyds.get_ptr(landmark_layer.buffer), ctypes.POINTER(ctypes.c_float))
81 | landmark_result = np.ctypeslib.as_array(ptr, shape=(200,10))
82 | x3_layers = box_result, score_result
83 | for i in range(num_detection):
84 | obj = make_object(i, x3_layers)
85 | if obj:
86 | object_list.append(obj)
87 | landmark_list.append(landmark_result[i])
88 | # print(landmark_list)
89 | return object_list, landmark_list
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-yolov4/config_deepstream.txt:
--------------------------------------------------------------------------------
1 | ################################################################################
2 | #
3 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 | ################################################################################
18 |
19 | [application]
20 | enable-perf-measurement=1
21 | perf-measurement-interval-sec=3
22 | #gie-kitti-output-dir=streamscl
23 |
24 | [tiled-display]
25 | enable=1
26 | rows=1
27 | columns=1
28 | width=1280
29 | height=720
30 | gpu-id=0
31 | #(0): nvbuf-mem-default - Default memory allocated, specific to particular platform
32 | #(1): nvbuf-mem-cuda-pinned - Allocate Pinned/Host cuda memory, applicable for Tesla
33 | #(2): nvbuf-mem-cuda-device - Allocate Device cuda memory, applicable for Tesla
34 | #(3): nvbuf-mem-cuda-unified - Allocate Unified cuda memory, applicable for Tesla
35 | #(4): nvbuf-mem-surface-array - Allocate Surface Array memory, applicable for Jetson
36 | nvbuf-memory-type=0
37 |
38 | [source0]
39 | enable=1
40 | #Type - 1=CameraV4L2 2=URI 3=MultiURI
41 | type=3
42 | uri=file:/home/nndam/Desktop/survelliance-videos/capture_0.mp4
43 | num-sources=1
44 | gpu-id=0
45 | # (0): memtype_device - Memory type Device
46 | # (1): memtype_pinned - Memory type Host Pinned
47 | # (2): memtype_unified - Memory type Unified
48 | cudadec-memtype=0
49 |
50 | [sink0]
51 | enable=1
52 | #Type - 1=FakeSink 2=EglSink 3=File
53 | type=2
54 | sync=0
55 | source-id=0
56 | gpu-id=0
57 | nvbuf-memory-type=0
58 | #1=mp4 2=mkv
59 | container=1
60 | #1=h264 2=h265
61 | codec=1
62 | output-file=yolov4.mp4
63 |
64 | [osd]
65 | enable=1
66 | gpu-id=0
67 | border-width=1
68 | text-size=12
69 | text-color=1;1;1;1;
70 | text-bg-color=0.3;0.3;0.3;1
71 | font=Serif
72 | show-clock=0
73 | clock-x-offset=800
74 | clock-y-offset=820
75 | clock-text-size=12
76 | clock-color=1;0;0;0
77 | nvbuf-memory-type=0
78 |
79 | [streammux]
80 | gpu-id=0
81 | ##Boolean property to inform muxer that sources are live
82 | live-source=0
83 | batch-size=1
84 | ##time out in usec, to wait after the first buffer is available
85 | ##to push the batch even if the complete batch is not formed
86 | batched-push-timeout=40000
87 | ## Set muxer output width and height
88 | width=1280
89 | height=720
90 | ##Enable to maintain aspect ratio wrt source, and allow black borders, works
91 | ##along with width, height properties
92 | enable-padding=0
93 | nvbuf-memory-type=0
94 |
95 | # config-file property is mandatory for any gie section.
96 | # Other properties are optional and if set will override the properties set in
97 | # the infer config file.
98 | [primary-gie]
99 | enable=1
100 | gpu-id=0
101 | labelfile-path=labels.txt
102 | batch-size=1
103 |
104 | #Required by the app for OSD, not a plugin property
105 | bbox-border-color0=1;0;0;1
106 | bbox-border-color1=0;1;1;1
107 | bbox-border-color2=0;0;1;1
108 | bbox-border-color3=0;1;0;1
109 | interval=0
110 | gie-unique-id=1
111 | nvbuf-memory-type=0
112 | config-file=config_yolov4.txt
113 |
114 | [tracker]
115 | enable=1
116 | tracker-width=416
117 | tracker-height=416
118 | ll-lib-file=/opt/nvidia/deepstream/deepstream/lib/libnvds_nvmultiobjecttracker.so
119 | ll-config-file=/opt/nvidia/deepstream/deepstream/samples/configs/deepstream-app/config_tracker_NvDCF_max_perf.yml
120 | enable-batch-process=1
121 | display-tracking-id=1
122 |
123 | [tests]
124 | file-loop=0
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-yolov4/config_tracker.txt:
--------------------------------------------------------------------------------
1 | ################################################################################
2 | # SPDX-FileCopyrightText: Copyright (c) 2019-2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | ################################################################################
17 |
18 | # Mandatory properties for the tracker:
19 | # tracker-width
20 | # tracker-height: needs to be multiple of 6 for NvDCF
21 | # gpu-id
22 | # ll-lib-file: path to low-level tracker lib
23 | # ll-config-file: required for NvDCF, optional for KLT and IOU
24 | #
25 | [tracker]
26 | tracker-width=608
27 | tracker-height=608
28 | gpu-id=0
29 | ll-lib-file=/opt/nvidia/deepstream/deepstream/lib/libnvds_nvmultiobjecttracker.so
30 | ll-config-file=config_tracker_NvDCF_perf.yml
31 | #enable-past-frame=1
32 | enable-batch-process=1
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-yolov4/config_yolov4.txt:
--------------------------------------------------------------------------------
1 | ################################################################################
2 | #
3 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 | ################################################################################
18 |
19 | # Following properties are mandatory when engine files are not specified:
20 | # int8-calib-file(Only in INT8), model-file-format
21 | # Caffemodel mandatory properties: model-file, proto-file, output-blob-names
22 | # UFF: uff-file, input-dims, uff-input-blob-name, output-blob-names
23 | # ONNX: onnx-file
24 | #
25 | # Mandatory properties for detectors:
26 | # num-detected-classes
27 | #
28 | # Optional properties for detectors:
29 | # cluster-mode(Default=Group Rectangles), interval(Primary mode only, Default=0)
30 | # custom-lib-path
31 | # parse-bbox-func-name
32 | #
33 | # Mandatory properties for classifiers:
34 | # classifier-threshold, is-classifier
35 | #
36 | # Optional properties for classifiers:
37 | # classifier-async-mode(Secondary mode only, Default=false)
38 | #
39 | # Optional properties in secondary mode:
40 | # operate-on-gie-id(Default=0), operate-on-class-ids(Defaults to all classes),
41 | # input-object-min-width, input-object-min-height, input-object-max-width,
42 | # input-object-max-height
43 | #
44 | # Following properties are always recommended:
45 | # batch-size(Default=1)
46 | #
47 | # Other optional properties:
48 | # net-scale-factor(Default=1), network-mode(Default=0 i.e FP32),
49 | # model-color-format(Default=0 i.e. RGB) model-engine-file, labelfile-path,
50 | # mean-file, gie-unique-id(Default=0), offsets, process-mode (Default=1 i.e. primary),
51 | # custom-lib-path, network-mode(Default=0 i.e FP32)
52 | #
53 | # The values in the config file are overridden by values set through GObject
54 | # properties.
55 |
56 | [property]
57 | gpu-id=0
58 | net-scale-factor=0.0039215697906911373
59 | #0=RGB, 1=BGR
60 | model-color-format=0
61 | model-engine-file=weights/model-1x3x416x416-fp16.engine
62 | labelfile-path=labels.txt
63 | batch-size=1
64 | ## 0=FP32, 1=INT8, 2=FP16 mode
65 | network-mode=2
66 | num-detected-classes=80
67 | gie-unique-id=1
68 | network-type=0
69 | is-classifier=0
70 | ## 0=Group Rectangles, 1=DBSCAN, 2=NMS, 3= DBSCAN+NMS Hybrid, 4 = None(No clustering)
71 | cluster-mode=2
72 | maintain-aspect-ratio=1
73 | custom-lib-path=nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so
74 | parse-bbox-func-name=NvDsInferParseCustomYoloV4
75 | #scaling-filter=0
76 | #scaling-compute-hw=0
77 |
78 | [class-attrs-all]
79 | nms-iou-threshold=0.6
80 | pre-cluster-threshold=0.4
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-yolov4/exec_backends/__pycache__/trt_backend.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deploy/Deepstream/sample-yolov4/exec_backends/__pycache__/trt_backend.cpython-36.pyc
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-yolov4/exec_backends/trt_backend.py:
--------------------------------------------------------------------------------
1 | import pycuda.driver as cuda
2 | import pycuda.autoinit
3 | import numpy as np
4 |
5 | import tensorrt as trt
6 |
7 | TRT_LOGGER = trt.Logger()
8 | trt.init_libnvinfer_plugins(None, "")
9 | # Simple helper data class that's a little nicer to use than a 2-tuple.
10 | class HostDeviceMem(object):
11 | def __init__(self, host_mem, device_mem):
12 | self.host = host_mem
13 | self.device = device_mem
14 |
15 | def __str__(self):
16 | return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)
17 |
18 | def __repr__(self):
19 | return self.__str__()
20 |
21 | # Allocates all buffers required for an engine, i.e. host/device inputs/outputs.
22 | def allocate_buffers(engine, max_boxes, total_classes):
23 | inputs = []
24 | outputs = []
25 | bindings = []
26 | stream = cuda.Stream()
27 | out_shapes = []
28 | input_shapes = []
29 | out_names = []
30 | max_batch_size = engine.get_profile_shape(0, 0)[2][0]
31 | # max_batch_size = 1
32 | for binding in engine:
33 | binding_shape = engine.get_binding_shape(binding)
34 |
35 | # #Fix -1 dimension for proper memory allocation for batch_size > 1
36 | # if binding == 'input':
37 | # max_width = engine.get_profile_shape(0, 0)[2][3]
38 | # max_height = engine.get_profile_shape(0, 0)[2][2]
39 | # size = max_batch_size * max_width * max_height * 3
40 | # elif binding == 'confs':
41 | # size = max_batch_size * max_boxes * (total_classes)
42 | # elif binding == 'boxes':
43 | # size = max_batch_size * max_boxes * (4)
44 | # else:
45 | # raise NotImplementedError("Not support binding: {}".format(binding))
46 | print(binding, binding_shape)
47 | assert min(binding_shape) > 0, "Invalid binding shape for {}: {}".format(binding, binding_shape)
48 | size = 1
49 | for i in range(len(binding_shape)):
50 | size *= binding_shape[i]
51 |
52 | dtype = trt.nptype(engine.get_binding_dtype(binding))
53 | # Allocate host and device buffers
54 | host_mem = cuda.pagelocked_empty(size, dtype)
55 | device_mem = cuda.mem_alloc(host_mem.nbytes)
56 | # Append the device buffer to device bindings.
57 | bindings.append(int(device_mem))
58 | # Append to the appropriate list.
59 | if engine.binding_is_input(binding):
60 | inputs.append(HostDeviceMem(host_mem, device_mem))
61 | input_shapes.append(engine.get_binding_shape(binding))
62 | else:
63 | outputs.append(HostDeviceMem(host_mem, device_mem))
64 | #Collect original output shapes and names from engine
65 | out_shapes.append(engine.get_binding_shape(binding))
66 | out_names.append(binding)
67 | return inputs, outputs, bindings, stream, input_shapes, out_shapes, out_names, max_batch_size
68 |
69 | # This function is generalized for multiple inputs/outputs.
70 | # inputs and outputs are expected to be lists of HostDeviceMem objects.
71 | def do_inference(context, bindings, inputs, outputs, stream):
72 | # Transfer input data to the GPU.
73 | [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
74 | # Run inference.
75 | context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
76 | # Transfer predictions back from the GPU.
77 | [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
78 | # Synchronize the stream
79 | stream.synchronize()
80 | # Return only the host outputs.
81 | return [out.host for out in outputs]
82 |
83 | class TrtModel(object):
84 | def __init__(self, model, max_size, total_classes = 80):
85 | self.engine_file = model
86 | self.engine = None
87 | self.inputs = None
88 | self.outputs = None
89 | self.bindings = None
90 | self.stream = None
91 | self.context = None
92 | self.input_shapes = None
93 | self.out_shapes = None
94 | self.max_batch_size = 1
95 | self.max_size = max_size
96 | self.total_classes = total_classes
97 |
98 | def build(self):
99 | with open(self.engine_file, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
100 | self.engine = runtime.deserialize_cuda_engine(f.read())
101 | # Allocate
102 | self.max_boxes = self.get_number_of_boxes(self.max_size, self.max_size)
103 | self.inputs, self.outputs, self.bindings, self.stream, self.input_shapes, self.out_shapes, self.out_names, self.max_batch_size = \
104 | allocate_buffers(self.engine, max_boxes = self.max_boxes, total_classes = self.total_classes)
105 | self.context = self.engine.create_execution_context()
106 | self.context.active_optimization_profile = 0
107 |
108 | def get_number_of_boxes(self, im_width, im_height):
109 | # Calculate total boxes (3 detect layers)
110 | assert im_width % 32 == 0 and im_height % 32 == 0
111 | return (int(im_width*im_height/32/32) + int(im_width*im_height/16/16) + int(im_width*im_height/8/8))*3
112 |
113 | def run(self, input, deflatten: bool = True, as_dict = False):
114 | # lazy load implementation
115 | if self.engine is None:
116 | self.build()
117 |
118 | input = np.asarray(input)
119 | batch_size, _, im_height, im_width = input.shape
120 | assert batch_size <= self.max_batch_size
121 | assert max(im_width, im_height) <= self.max_size, "Invalid shape: {}x{}, max shape: {}".format(im_width, im_height, self.max_size)
122 | allocate_place = np.prod(input.shape)
123 | # print('allocate_place', input.shape)
124 | self.inputs[0].host[:allocate_place] = input.flatten(order='C').astype(np.float32)
125 | self.context.set_binding_shape(0, input.shape)
126 | trt_outputs = do_inference(
127 | self.context, bindings=self.bindings,
128 | inputs=self.inputs, outputs=self.outputs, stream=self.stream)
129 | if deflatten:
130 | trt_outputs = [output[:np.prod(shape)].reshape(shape) for output, shape in zip(trt_outputs, self.out_shapes)]
131 | if as_dict:
132 | return {self.out_names[ix]: trt_output[:batch_size] for ix, trt_output in enumerate(trt_outputs)}
133 | return [trt_output[:batch_size] for trt_output in trt_outputs]
134 |
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-yolov4/labels.txt:
--------------------------------------------------------------------------------
1 | person
2 | bicycle
3 | car
4 | motorbike
5 | aeroplane
6 | bus
7 | train
8 | truck
9 | boat
10 | traffic light
11 | fire hydrant
12 | stop sign
13 | parking meter
14 | bench
15 | bird
16 | cat
17 | dog
18 | horse
19 | sheep
20 | cow
21 | elephant
22 | bear
23 | zebra
24 | giraffe
25 | backpack
26 | umbrella
27 | handbag
28 | tie
29 | suitcase
30 | frisbee
31 | skis
32 | snowboard
33 | sports ball
34 | kite
35 | baseball bat
36 | baseball glove
37 | skateboard
38 | surfboard
39 | tennis racket
40 | bottle
41 | wine glass
42 | cup
43 | fork
44 | knife
45 | spoon
46 | bowl
47 | banana
48 | apple
49 | sandwich
50 | orange
51 | broccoli
52 | carrot
53 | hot dog
54 | pizza
55 | donut
56 | cake
57 | chair
58 | sofa
59 | pottedplant
60 | bed
61 | diningtable
62 | toilet
63 | tvmonitor
64 | laptop
65 | mouse
66 | remote
67 | keyboard
68 | cell phone
69 | microwave
70 | oven
71 | toaster
72 | sink
73 | refrigerator
74 | book
75 | clock
76 | vase
77 | scissors
78 | teddy bear
79 | hair drier
80 | toothbrush
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-yolov4/nvdsinfer_custom_impl_Yolo/Makefile:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 |
17 | CUDA_VER?=
18 | ifeq ($(CUDA_VER),)
19 | $(error "CUDA_VER is not set")
20 | endif
21 | CC:= g++
22 | NVCC:=/usr/local/cuda-$(CUDA_VER)/bin/nvcc
23 |
24 | CFLAGS:= -Wall -std=c++11 -shared -fPIC -Wno-error=deprecated-declarations
25 | CFLAGS+= -I../../includes -I/usr/local/cuda-$(CUDA_VER)/include -I/opt/nvidia/deepstream/deepstream/sources/includes
26 |
27 | LIBS:= -lnvinfer_plugin -lnvinfer -lnvparsers -L/usr/local/cuda-$(CUDA_VER)/lib64 -lcudart -lcublas -lstdc++fs
28 | LFLAGS:= -shared -Wl,--start-group $(LIBS) -Wl,--end-group
29 |
30 | INCS:= $(wildcard *.h)
31 | SRCFILES:= nvdsparsebbox_Yolo.cpp
32 |
33 | TARGET_LIB:= libnvdsinfer_custom_impl_Yolo.so
34 |
35 | TARGET_OBJS:= $(SRCFILES:.cpp=.o)
36 | TARGET_OBJS:= $(TARGET_OBJS:.cu=.o)
37 |
38 | all: $(TARGET_LIB)
39 |
40 | %.o: %.cpp $(INCS) Makefile
41 | $(CC) -c -o $@ $(CFLAGS) $<
42 |
43 | %.o: %.cu $(INCS) Makefile
44 | $(NVCC) -c -o $@ --compiler-options '-fPIC' $<
45 |
46 | $(TARGET_LIB) : $(TARGET_OBJS)
47 | $(CC) -o $@ $(TARGET_OBJS) $(LFLAGS)
48 |
49 | clean:
50 | rm -rf $(TARGET_LIB)
51 |
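Usage note (added here, not part of the original Makefile): `CUDA_VER` must point at the installed CUDA toolkit, so the library is typically built with, for example, ```make CUDA_VER=11.4``` (replace 11.4 with whatever version lives under ```/usr/local/cuda-*```), which produces ```libnvdsinfer_custom_impl_Yolo.so``` referenced by ```config_yolov4.txt```.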
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-yolov4/nvdsinfer_custom_impl_Yolo/nvdsparsebbox_Yolo.cpp:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | #include <algorithm>
18 | #include <cassert>
19 | #include <cmath>
20 | #include <cstring>
21 | #include <fstream>
22 | #include <iostream>
23 | #include <unordered_map>
24 | #include "nvdsinfer_custom_impl.h"
25 |
26 | static const int NUM_CLASSES_YOLO = 80;
27 |
28 | float clamp(const float val, const float minVal, const float maxVal)
29 | {
30 | assert(minVal <= maxVal);
31 | return std::min(maxVal, std::max(minVal, val));
32 | }
33 |
34 | extern "C" bool NvDsInferParseCustomYoloV4(
35 | std::vector<NvDsInferLayerInfo> const& outputLayersInfo,
36 | NvDsInferNetworkInfo const& networkInfo,
37 | NvDsInferParseDetectionParams const& detectionParams,
38 | std::vector<NvDsInferParseObjectInfo>& objectList);
39 |
40 |
41 | /* YOLOv4 implementations */
42 | static NvDsInferParseObjectInfo convertBBoxYoloV4(const float& bx1, const float& by1, const float& bx2,
43 | const float& by2, const uint& netW, const uint& netH)
44 | {
45 | NvDsInferParseObjectInfo b;
46 | // Restore coordinates to network input resolution
47 |
48 | float x1 = bx1 * netW;
49 | float y1 = by1 * netH;
50 | float x2 = bx2 * netW;
51 | float y2 = by2 * netH;
52 |
53 | x1 = clamp(x1, 0, netW);
54 | y1 = clamp(y1, 0, netH);
55 | x2 = clamp(x2, 0, netW);
56 | y2 = clamp(y2, 0, netH);
57 |
58 | b.left = x1;
59 | b.width = clamp(x2 - x1, 0, netW);
60 | b.top = y1;
61 | b.height = clamp(y2 - y1, 0, netH);
62 |
63 | return b;
64 | }
65 |
66 | static void addBBoxProposalYoloV4(const float bx, const float by, const float bw, const float bh,
67 | const uint& netW, const uint& netH, const int maxIndex,
68 | const float maxProb, std::vector<NvDsInferParseObjectInfo>& binfo)
69 | {
70 | NvDsInferParseObjectInfo bbi = convertBBoxYoloV4(bx, by, bw, bh, netW, netH);
71 | if (bbi.width < 1 || bbi.height < 1) return;
72 |
73 | bbi.detectionConfidence = maxProb;
74 | bbi.classId = maxIndex;
75 | binfo.push_back(bbi);
76 | }
77 |
78 | static std::vector<NvDsInferParseObjectInfo>
79 | decodeYoloV4Tensor(
80 | const float* boxes, const float* scores, const float* classes,
81 | const uint num_bboxes, NvDsInferParseDetectionParams const& detectionParams,
82 | const uint& netW, const uint& netH)
83 | {
84 | std::vector<NvDsInferParseObjectInfo> binfo;
85 |
86 | uint bbox_location = 0;
87 | uint score_location = 0;
88 | for (uint b = 0; b < num_bboxes; ++b)
89 | {
90 | float bx1 = boxes[bbox_location];
91 | float by1 = boxes[bbox_location + 1];
92 | float bx2 = boxes[bbox_location + 2];
93 | float by2 = boxes[bbox_location + 3];
94 | float maxProb = scores[score_location];
95 | int maxIndex = (int) classes[score_location];
96 |
97 | if (maxProb > detectionParams.perClassPreclusterThreshold[maxIndex])
98 | {
99 | addBBoxProposalYoloV4(bx1, by1, bx2, by2, netW, netH, maxIndex, maxProb, binfo);
100 | }
101 |
102 | bbox_location += 4;
103 | score_location += 1;
104 | }
105 |
106 | return binfo;
107 | }
108 |
109 | extern "C" bool NvDsInferParseCustomYoloV4(
110 | std::vector<NvDsInferLayerInfo> const& outputLayersInfo,
111 | NvDsInferNetworkInfo const& networkInfo,
112 | NvDsInferParseDetectionParams const& detectionParams,
113 | std::vector<NvDsInferParseObjectInfo>& objectList)
114 | {
115 | if (NUM_CLASSES_YOLO != detectionParams.numClassesConfigured)
116 | {
117 | std::cerr << "WARNING: Num classes mismatch. Configured:"
118 | << detectionParams.numClassesConfigured
119 | << ", detected by network: " << NUM_CLASSES_YOLO << std::endl;
120 | }
121 |
122 | std::vector<NvDsInferParseObjectInfo> objects;
123 | const NvDsInferLayerInfo &n_bboxes = outputLayersInfo[0];
124 | const NvDsInferLayerInfo &boxes = outputLayersInfo[1]; // (num_boxes, 4)
125 | const NvDsInferLayerInfo &scores = outputLayersInfo[2]; // (num_boxes, )
126 | const NvDsInferLayerInfo &classes = outputLayersInfo[3]; // (num_boxes, )
127 |
128 |
129 | int num_bboxes = *(const int*)(n_bboxes.buffer);
130 |
131 |
132 | assert(boxes.inferDims.numDims == 2);
133 | assert(scores.inferDims.numDims == 1);
134 | assert(classes.inferDims.numDims == 1);
135 |
136 | // std::cout << "Network Info: " << networkInfo.height << " " << networkInfo.width << std::endl;
137 |
138 | std::vector<NvDsInferParseObjectInfo> outObjs =
139 | decodeYoloV4Tensor(
140 | (const float*)(boxes.buffer), (const float*)(scores.buffer), (const float*)(classes.buffer), num_bboxes, detectionParams,
141 | networkInfo.width, networkInfo.height);
142 |
143 | objects.insert(objects.end(), outObjs.begin(), outObjs.end());
144 |
145 | objectList = objects;
146 |
147 | return true;
148 | }
149 | /* YOLOv4 implementations end*/
150 |
151 |
152 | /* Check that the custom function has been defined correctly */
153 | CHECK_CUSTOM_PARSE_FUNC_PROTOTYPE(NvDsInferParseCustomYoloV4);
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-yolov4/test_images/test.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deploy/Deepstream/sample-yolov4/test_images/test.png
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-yolov4/test_onnx.py:
--------------------------------------------------------------------------------
1 | import cv2
2 | import numpy as np
3 | from exec_backends.trt_backend import TrtModel
4 |
5 |
6 | def preprocess(img, input_size = (416, 416)):
7 | resized_img = cv2.resize(img, (input_size[1], input_size[0]))
8 | resized_img = cv2.cvtColor(resized_img, cv2.COLOR_BGR2RGB)
9 | resized_img = np.expand_dims(resized_img, 0)
10 | resized_img = resized_img.astype('float32') / 255.0
11 | resized_img = np.transpose(resized_img, (0, 3, 1, 2))
12 | return resized_img
13 |
14 | def visualize(img, bboxes):
15 | height, width, _ = img.shape
16 | bboxes[:, 0] *= width
17 | bboxes[:, 1] *= height
18 | bboxes[:, 2] *= width
19 | bboxes[:, 3] *= height
20 | for x1, y1, x2, y2 in bboxes:
21 | x1 = int(x1)
22 | y1 = int(y1)
23 | x2 = int(x2)
24 | y2 = int(y2)
25 | cv2.rectangle(img, (x1, y1), (x2, y2), (0, 0, 255), 2)
26 | return img
27 |
28 | if __name__ == '__main__':
29 | model_path = 'weights/model-1x3x416x416-fp16.engine'
30 | img_path = 'test_images/test.png'
31 |
32 | model = TrtModel(model_path, max_size = 416)
33 | img = cv2.imread(img_path)
34 | batch = preprocess(img)
35 |
36 | num_detections, bboxes, confs, classes = model.run(batch)
37 | print(num_detections.shape, bboxes.shape, confs.shape, classes.shape)
38 | bboxes = bboxes[0][:num_detections[0][0]]
39 | confs = confs[0][:num_detections[0][0]]
40 | classes = classes[0][:num_detections[0][0]]
41 | print(bboxes)
42 | vis = visualize(img.copy(), bboxes)
43 | cv2.imshow('vis.jpg', vis)
44 | cv2.waitKey(0)
45 |
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-yolov4/tools/add_nms_plugins.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 |
17 | #!/usr/bin/env python3
18 | import onnx_graphsurgeon as gs
19 | import argparse
20 | import onnx
21 | import numpy as np
22 |
23 | def create_and_add_plugin_node(graph, topK, keepTopK):
24 |
25 | batch_size = graph.inputs[0].shape[0]
26 | input_h = graph.inputs[0].shape[2]
27 | input_w = graph.inputs[0].shape[3]
28 | print('batch_size', batch_size)
29 |
30 | tensors = graph.tensors()
31 | boxes_tensor = tensors["boxes"]
32 | confs_tensor = tensors["confs"]
33 |
34 | num_detections = gs.Variable(name="num_detections").to_variable(dtype=np.int32, shape=[-1, 1])
35 | nmsed_boxes = gs.Variable(name="nmsed_boxes").to_variable(dtype=np.float32, shape=[-1, keepTopK, 4])
36 | nmsed_scores = gs.Variable(name="nmsed_scores").to_variable(dtype=np.float32, shape=[-1, keepTopK])
37 | nmsed_classes = gs.Variable(name="nmsed_classes").to_variable(dtype=np.float32, shape=[-1, keepTopK])
38 |
39 | new_outputs = [num_detections, nmsed_boxes, nmsed_scores, nmsed_classes]
40 |
41 | mns_node = gs.Node(
42 | op="BatchedNMSDynamic_TRT",
43 | attrs=create_attrs(input_h, input_w, topK, keepTopK),
44 | inputs=[boxes_tensor, confs_tensor],
45 | outputs=new_outputs)
46 |
47 | graph.nodes.append(mns_node)
48 | graph.outputs = new_outputs
49 |
50 | return graph.cleanup().toposort()
51 |
52 |
53 |
54 |
55 | def create_attrs(input_h, input_w, topK, keepTopK):
56 |
57 | num_anchors = 3
58 |
59 | h1 = input_h // 8
60 | h2 = input_h // 16
61 | h3 = input_h // 32
62 |
63 | w1 = input_w // 8
64 | w2 = input_w // 16
65 | w3 = input_w // 32
66 |
67 | num_boxes = num_anchors * (h1 * w1 + h2 * w2 + h3 * w3)
68 |
69 | attrs = {}
70 |
71 | attrs["shareLocation"] = 1
72 | attrs["backgroundLabelId"] = -1
73 | attrs["numClasses"] = 80
74 | attrs["topK"] = topK
75 | attrs["keepTopK"] = keepTopK
76 | attrs["scoreThreshold"] = 0.4
77 | attrs["iouThreshold"] = 0.6
78 | attrs["isNormalized"] = 1
79 | attrs["clipBoxes"] = 1
80 |
81 | # 001 is the default plugin version the parser will search for, and therefore can be omitted,
82 | # but we include it here for illustrative purposes.
83 | attrs["plugin_version"] = "1"
84 |
85 | return attrs
86 |
87 |
88 | def main():
89 | parser = argparse.ArgumentParser(description="Add batchedNMSPlugin")
90 | parser.add_argument("-f", "--model", help="Path to the ONNX model generated by export_model.py", default="yolov4_1_3_416_416.onnx")
91 | parser.add_argument("-t", "--topK", help="number of bounding boxes for nms", default=2000)
92 | parser.add_argument("-k", "--keepTopK", help="bounding boxes to be kept per image", default=1000)
93 |
94 | args, _ = parser.parse_known_args()
95 |
96 | graph = gs.import_onnx(onnx.load(args.model))
97 |
98 | graph = create_and_add_plugin_node(graph, int(args.topK), int(args.keepTopK))
99 |
100 | onnx.save(gs.export_onnx(graph), args.model + ".nms.onnx")
101 |
102 |
103 | if __name__ == '__main__':
104 | main()
105 |
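Usage note (added here, based on the argparse defaults above): a typical invocation is ```python3 add_nms_plugins.py -f yolov4_1_3_416_416.onnx -t 2000 -k 1000```, which appends the ```BatchedNMSDynamic_TRT``` node and writes the result to ```yolov4_1_3_416_416.onnx.nms.onnx```.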
--------------------------------------------------------------------------------
/Deploy/NVIDIA/README.md:
--------------------------------------------------------------------------------
1 | # NVIDIA frameworks, platforms, engines, toolkits, blogs, ...
2 |
3 | - [Multi-instance GPU (MIG)](docs/multi_instance_gpu.md)
4 | - [FFMPEG with NVENC NVDEC hardware-acceleration](docs/nvidia_video_sdk.md)
--------------------------------------------------------------------------------
/Deploy/NVIDIA/docs/nvidia_video_sdk.md:
--------------------------------------------------------------------------------
1 | # FFMPEG hardware acceleration with Nvidia Video SDK
2 | ## 1. Requirements
3 | - GPU with hardware-acceleration support, check here: https://developer.nvidia.com/video-encode-and-decode-gpu-support-matrix-new
4 |
5 |
6 | Example of NVDEC support (figure omitted from this dump)
7 |
9 | - Nvidia Driver
10 | - CUDA Toolkit
11 |
12 | ## 2. Install FFMPEG with hardware acceleration
13 | System Information
14 | - OS: Ubuntu 18.04
15 | - CPU: Intel(R) Xeon(R) X5650 (12M Cache, 2.66 GHz, 6.40 GT/S Intel® QPI)
16 | - NVIDIA GTX 1060 OC 3Gb
17 |
18 | ```
19 | sudo apt-get install build-essential yasm cmake libtool libc6 libc6-dev unzip wget libnuma1 libnuma-dev libx264-dev libvpx-dev libvorbis-dev
20 |
21 | git clone --branch sdk/11.1 https://git.videolan.org/git/ffmpeg/nv-codec-headers.git
22 |
23 | cd nv-codec-headers && sudo make install && cd ..
24 |
25 | git clone --branch n4.4.3 https://git.ffmpeg.org/ffmpeg.git ffmpeg/ && cd ffmpeg
26 |
27 | ./configure --enable-nonfree --enable-cuda-nvcc --enable-nvenc --enable-cuvid --enable-nvdec --enable-libnpp --extra-cflags=-I/usr/local/cuda/include --extra-ldflags=-L/usr/local/cuda/lib64 --disable-static --enable-shared --enable-libx264 --enable-libvpx --enable-libvorbis --enable-gpl --enable-cuda
28 |
29 | make -j8
30 |
31 | sudo make install
32 |
33 | sudo ldconfig
34 |
35 | ffmpeg --help
36 | ```
37 | If you hit an error related to **nvcc**, try changing line 4355 of ```ffmpeg/configure``` to ```nvccflags_default="-gencode arch=compute_35,code=sm_35 -O2"```
38 |
39 | ## 3. Benchmark
40 | ### 3.1. Convert MPEG-4 to H264
41 | - Public **libx264**
42 | ```
43 | ffmpeg -y -i test.avi -c:v libx264 test.mp4
44 |
45 | Output #0, mp4, to 'test.mp4':
46 | Metadata:
47 | major_brand : mp42
48 | minor_version : 0
49 | compatible_brands: isommp42
50 | com.android.model: 21121210C
51 | com.android.version: 12
52 | com.android.manufacturer: Xiaomi
53 | encoder : Lavf58.76.100
54 | Stream #0:0(eng): Video: h264 (avc1 / 0x31637661), yuv420p(tv, bt709, progressive), 1920x1080, q=2-31, 30 fps, 15360 tbn (default)
55 | Metadata:
56 | creation_time : 2022-11-23T08:27:41.000000Z
57 | handler_name : VideoHandle
58 | vendor_id : [0][0][0][0]
59 | encoder : Lavc58.134.100 libx264
60 | Side data:
61 | cpb: bitrate max/min/avg: 0/0/0 buffer size: 0 vbv_delay: N/A
62 | Stream #0:1(eng): Audio: aac (LC) (mp4a / 0x6134706D), 48000 Hz, stereo, fltp, 128 kb/s (default)
63 | Metadata:
64 | creation_time : 2022-11-23T08:27:41.000000Z
65 | handler_name : SoundHandle
66 | vendor_id : [0][0][0][0]
67 | encoder : Lavc58.134.100 aac
68 | frame= 4871 fps= 44 q=-1.0 Lsize= 263346kB time=00:02:42.27 bitrate=13294.1kbits/s dup=0 drop=3 speed=1.48x
69 | video:260623kB audio:2550kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.065733%
70 | ```
71 | - Hardware acceleration
72 | ```
73 | ffmpeg -y -i test.avi -c:v h264_nvenc test.mp4
74 |
75 | Output #0, mp4, to 'test2.mp4':
76 | Metadata:
77 | major_brand : mp42
78 | minor_version : 0
79 | compatible_brands: isommp42
80 | com.android.model: 21121210C
81 | com.android.version: 12
82 | com.android.manufacturer: Xiaomi
83 | encoder : Lavf58.76.100
84 | Stream #0:0(eng): Video: h264 (Main) (avc1 / 0x31637661), yuv420p(tv, bt709, progressive), 1920x1080, q=2-31, 2000 kb/s, 30 fps, 15360 tbn (default)
85 | Metadata:
86 | creation_time : 2022-11-23T08:27:41.000000Z
87 | handler_name : VideoHandle
88 | vendor_id : [0][0][0][0]
89 | encoder : Lavc58.134.100 h264_nvenc
90 | Side data:
91 | cpb: bitrate max/min/avg: 0/0/2000000 buffer size: 4000000 vbv_delay: N/A
92 | Stream #0:1(eng): Audio: aac (LC) (mp4a / 0x6134706D), 48000 Hz, stereo, fltp, 128 kb/s (default)
93 | Metadata:
94 | creation_time : 2022-11-23T08:27:41.000000Z
95 | handler_name : SoundHandle
96 | vendor_id : [0][0][0][0]
97 | encoder : Lavc58.134.100 aac
98 | frame= 4871 fps=269 q=41.0 Lsize= 44291kB time=00:02:42.27 bitrate=2235.9kbits/s dup=0 drop=3 speed=8.95x
99 | video:41583kB audio:2550kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.356228%
100 | [aac @ 0x5590e2d37e00] Qavg: 182.528
101 | ```
102 | So, setting aside bitrate differences, encoding speed increases from **1.48x** to **8.95x** real-time with NVIDIA hardware acceleration
103 |
--------------------------------------------------------------------------------
/Deploy/NVIDIA/fig/gpu-mig-overview.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deploy/NVIDIA/fig/gpu-mig-overview.jpg
--------------------------------------------------------------------------------
/Deploy/NVIDIA/fig/mig_bert.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deploy/NVIDIA/fig/mig_bert.png
--------------------------------------------------------------------------------
/Deploy/NVIDIA/fig/support_nvenc_nvdec.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deploy/NVIDIA/fig/support_nvenc_nvdec.png
--------------------------------------------------------------------------------
/Deploy/README.md:
--------------------------------------------------------------------------------
1 | # Deploy
2 | Everything related to deployment & deployment engines
--------------------------------------------------------------------------------
/Deploy/Transfer-Learning-Toolkit/README.md:
--------------------------------------------------------------------------------
1 | # Transfer-Learning-Toolkit (TLT) from NVIDIA
2 |
3 | - [Yolov4](docs/yolov4.md)
4 | - [Detectnet_V2](docs/detectnet_v2.md)
--------------------------------------------------------------------------------
/Deploy/Transfer-Learning-Toolkit/fig/detectnet_v2-inference.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deploy/Transfer-Learning-Toolkit/fig/detectnet_v2-inference.jpg
--------------------------------------------------------------------------------
/Deploy/Transfer-Learning-Toolkit/fig/nvidia-retrain-qat.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deploy/Transfer-Learning-Toolkit/fig/nvidia-retrain-qat.png
--------------------------------------------------------------------------------
/Deploy/Transfer-Learning-Toolkit/fig/yolov4-inference.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deploy/Transfer-Learning-Toolkit/fig/yolov4-inference.png
--------------------------------------------------------------------------------
/Deploy/Triton-inference-server/README.md:
--------------------------------------------------------------------------------
1 | # AI-Engineer-Note
2 |
3 | Everything related to Triton-inference-server
4 | ## Basic
5 | - [1. Installing triton-server and triton-client](docs/install.md)
6 | + [1.1. Model management modes (load/unload/reload)](docs/model_management.md)
7 | - [2. Overview of the backends in Triton](docs/backend.md)
8 | - [3. Basic configuration when deploying a model](docs/model_configuration.md)
9 | - [4. Deploying models](#)
10 | - [4.1 ONNX-runtime](docs/triton_onnx.md)
11 | - [4.2 TensorRT](docs/triton_tensorrt.md)
12 | - [4.3 Pytorch & TorchScript](docs/triton_pytorch.md)
13 | - [4.4 Kaldi (Advanced)](docs/triton_kaldi.md)
14 | - [5. Model Batching](docs/model_batching.md)
15 | - [6. Ensemble models and pre/post-processing](docs/model_ensemble.md)
16 | ## Advanced
17 | - [Using the Performance Analyzer tool](docs/perf_analyzer.md)
18 | - [Optimizations](#)
19 | + [Optimizing the Pytorch backend](docs/optimization_pytorch.md)
20 |
--------------------------------------------------------------------------------
/Deploy/Triton-inference-server/docs/backend.md:
--------------------------------------------------------------------------------
1 | # Triton backend
2 |
3 | A Triton backend is the component that executes a model. A typical backend can be built by wrapping a deep-learning framework such as Pytorch, Tensorflow, TensorRT, ONNX-runtime, or OpenVINO, just as we usually do when deploying a model ourselves (for example, writing a class that loads the model, warms it up, and handles pre-processing, inference, and post-processing). Following that idea, a ```triton-backend``` bundles the backends of these deep-learning frameworks and exposes APIs so that users can connect to the deep-learning models loaded by ```triton-server```. As of the current release, ```triton-server``` supports the following backends:
4 | - TensorRT (platform: ```tensorrt_plan```)
5 | - Pytorch (platform: ```pytorch_libtorch```)
6 | - ONNX (platform: ```onnxruntime_onnx```)
7 | - Tensorflow (platform: ```tensorflow_savedmodel```)
8 | - Other backends (platform: depends on the specific backend being defined)
--------------------------------------------------------------------------------
/Deploy/Triton-inference-server/docs/install.md:
--------------------------------------------------------------------------------
1 | # Install Triton
2 |
3 | This section covers installing and quickly getting started with triton-server and triton-client.
4 |
5 | ## 1. Install triton-server
6 | If the server already has **triton-server** installed, you can skip this step and move on to installing and using **triton-client**. Currently the fastest way to use triton-inference-server is the Docker image from NVIDIA NGC. The build-from-source approach may be covered another time.
7 | ### 1.1 Install using NVIDIA NGC
8 | ```
9 | docker pull nvcr.io/nvidia/tritonserver:<xx.yy>-py3
10 | ```
11 | where ```<xx.yy>``` is the release version, for example
12 | ```
13 | docker pull nvcr.io/nvidia/tritonserver:21.12-py3
14 | ```
15 | ### 1.2 Run a test model
16 | Here I will try the wav2vec-base model (which I have converted to ONNX) using the ONNX-runtime backend. The directory structure I use is as follows:
17 | ```bash
18 | ├── models
19 | │ ├── wav2vec_general_v2
20 | │ │ ├── 1
21 | │ │ │ ├── model.onnx
22 | │ │ ├── config.pbtxt
23 | ```
24 | The ```config.pbtxt``` file
25 | ```
26 | name: "wav2vec_general_v2"
27 | platform: "onnxruntime_onnx"
28 | max_batch_size : 0
29 | input [
30 | {
31 | name: "input"
32 | data_type: TYPE_FP32
33 | dims: [1, -1]
34 | }
35 | ]
36 | output [
37 | {
38 | name: "output"
39 | data_type: TYPE_FP32
40 | dims: [-1, -1, 105]
41 | }
42 | ]
43 | ```
44 | Run triton-server using GPU 1 (I am currently in the directory at the same level as the ```models``` directory)
45 | ```
46 | docker run --gpus device=1 --rm -p8000:8000 -p8001:8001 -p8002:8002 -v<path-to-models-dir>:/models nvcr.io/nvidia/tritonserver:<xx.yy>-py3 tritonserver --model-repository=/models
47 | ```
48 | Or run triton-server using GPU 1 with ```share-memory``` enabled
49 | ```
50 | docker run --gpus device=1 --rm --ipc=host --shm-size=128m -p8000:8000 -p8001:8001 -p8002:8002 -v<path-to-models-dir>:/models nvcr.io/nvidia/tritonserver:<xx.yy>-py3 tritonserver --model-repository=/models
51 | ```
52 |
53 | Output (startup screenshot omitted from this dump)
59 | ## 2. Install triton-client
60 | ### 2.1 Basic installation
61 | For the basic use case of calling from ```python```, we can install quickly with ```pip```
62 | ```
63 | pip install tritonclient grpcio-tools
64 | ```
65 | ### 2.2 Advanced installation
66 | Unlike installing with ```pip``` and calling **triton-server** from ```python```, this part mainly covers ```build-from-source``` so that the bundled tools such as **Model Analyzer** and **Performance Analyzer** can be used
67 | - Install the required Linux packages
68 | ```
69 | sudo apt-get install curl libcurl4-openssl-dev libb64-dev default-jdk maven
70 | ```
71 | - Install ```rapidjson```
72 | ```
73 | git clone https://github.com/Tencent/rapidjson.git
74 | cd rapidjson
75 | cmake .
76 | make
77 | sudo make install
78 | ```
79 | - Add one more ```python``` package, otherwise the build fails halfway and part of it has to be rebuilt
80 | ```
81 | python3 -m pip install grpcio-tools
82 | ```
83 | - Build **triton-client** (here my **triton-server** is the Docker release r21.12)
84 | ```
85 | git clone --recursive https://github.com/triton-inference-server/client.git triton-client
86 | cd triton-client
87 | mkdir build && cd build
88 | cmake -DCMAKE_INSTALL_PREFIX=`pwd`/install -DTRITON_ENABLE_CC_HTTP=ON -DTRITON_ENABLE_CC_GRPC=ON -DTRITON_ENABLE_PERF_ANALYZER=ON -DTRITON_ENABLE_PYTHON_HTTP=ON -DTRITON_ENABLE_PYTHON_GRPC=ON -DTRITON_ENABLE_JAVA_HTTP=ON -DTRITON_ENABLE_GPU=ON -DTRITON_ENABLE_EXAMPLES=ON -DTRITON_ENABLE_TESTS=ON -DTRITON_COMMON_REPO_TAG=r21.12 -DTRITON_THIRD_PARTY_REPO_TAG=r21.12 -DTRITON_CORE_REPO_TAG=r21.12 -DTRITON_BACKEND_REPO_TAG=r21.12 ..
89 | make cc-clients python-clients java-clients
90 | ```
91 | - The libraries and binaries we just built will appear under ```triton-client/build/install```; the one we care about here is ```bin/perf_analyzer```
92 | ### 2.3 Using triton-client to send requests and get results
93 | See [src/sample_grpc.py](../src/sample_grpc.py)
94 |
95 |
--------------------------------------------------------------------------------
/Deploy/Triton-inference-server/docs/model_batching.md:
--------------------------------------------------------------------------------
1 | # Model Batching
2 |
3 | This section covers some of the batching mechanisms supported by Triton.
4 |
5 | ### Dynamic Batching
6 | Dynamic batching needs little introduction: requests arriving concurrently are grouped together and inferred as a batch. It mainly increases [throughput](../docs/perf_analyzer.md) (at the cost of higher [latency](../docs/perf_analyzer.md) under the same resources); a small client-side sketch follows the config snippets below.
7 | ```
8 | dynamic_batching { }
9 | ```
10 | or additionally configure the maximum time a request may wait in the queue for new requests to batch with (in microseconds)
11 | ```
12 | dynamic_batching {
13 | max_queue_delay_microseconds: 100
14 | }
15 | ```
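Dynamic batching is entirely server-side; the client only has to issue enough concurrent requests for the scheduler to have something to group. A minimal sketch, assuming a hypothetical model ```my_model``` configured with ```max_batch_size``` > 0, an FP32 input named ```input``` with ```dims: [-1]``` and an output named ```output```:
```python
import numpy as np
import tritonclient.grpc as grpcclient
from concurrent.futures import ThreadPoolExecutor

def send_one(_):
    # One request per call; a separate client per thread keeps the sketch simple
    client = grpcclient.InferenceServerClient(url="localhost:8001")
    data = np.random.rand(1, 16000).astype(np.float32)   # batch of 1, length 16000
    inp = grpcclient.InferInput("input", list(data.shape), "FP32")
    inp.set_data_from_numpy(data)
    out = grpcclient.InferRequestedOutput("output")
    return client.infer(model_name="my_model", inputs=[inp], outputs=[out])

# Fire 8 requests at once; with dynamic_batching enabled the scheduler may fuse them into batches
with ThreadPoolExecutor(max_workers=8) as pool:
    results = list(pool.map(send_one, range(8)))
print(len(results), "responses received")
```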
16 |
17 | ### Ragged Batching
18 |
--------------------------------------------------------------------------------
/Deploy/Triton-inference-server/docs/model_configuration.md:
--------------------------------------------------------------------------------
1 | # Model Configuration
2 | By default, the configuration must declare the model's parameters up front: the model name, the platform (```tensorrt_plan, pytorch_libtorch, tensorflow_savedmodel, ...```), data types and shapes of the inputs and outputs, warmup configuration, optimization configuration, ...
3 | ### 1. Minimal model configuration
4 | For TensorRT, Tensorflow saved-model and ONNX models we usually do not need to write the configuration ourselves, because Triton can generate it automatically: if no ```config.pbtxt``` exists and triton-server is started with ```--strict-model-config=false```, a basic ```config.pbtxt``` is generated automatically. We can also write ```config.pbtxt``` by hand. Below I build the configurations for the GFPGan Pre-processing, Inference and Post-processing steps, all implemented in Pytorch.
5 | - Pre-processing
6 | ```
7 | name: "pre_gfpgan_batch"
8 | platform: "pytorch_libtorch"
9 | max_batch_size: 8
10 | input [
11 | {
12 | name: "input__0"
13 | data_type: TYPE_UINT8
14 | dims: [-1, -1, 3]
15 | }
16 | ]
17 | output [
18 | {
19 | name: "output__0"
20 | data_type: TYPE_FP32
21 | dims: [3, -1, -1]
22 | }
23 | ]
24 | ```
25 | - Inference
26 | ```
27 | name: "infer_face_restoration_v2.1"
28 | platform: "pytorch_libtorch"
29 | max_batch_size: 8
30 | input [
31 | {
32 | name: "input__0"
33 | data_type: TYPE_FP32
34 | dims: [3, 512, 512]
35 | }
36 | ]
37 | output [
38 | {
39 | name: "output__0"
40 | data_type: TYPE_FP32
41 | dims: [3, 512, 512]
42 | }
43 | ]
44 | ```
45 | - Post-processing
46 | ```
47 | name: "post_gfpgan_batch"
48 | platform: "pytorch_libtorch"
49 | max_batch_size: 8
50 | input [
51 | {
52 | name: "input__0"
53 | data_type: TYPE_FP32
54 | dims: [3, -1, -1]
55 | }
56 | ]
57 | output [
58 | {
59 | name: "output__0"
60 | data_type: TYPE_UINT8
61 | dims: [-1, -1, 3]
62 | }
63 | ]
64 | ```
65 |
66 | A value of **-1** denotes a **dynamic shape**
67 |
68 | Pay attention to the ```max_batch_size``` value: when it is **non-zero**, ```dims``` is interpreted as the shape of **one input item**, and the model accepts inputs from ```1 x dims``` up to ```max_batch_size x dims``` (dynamic batch); when it is **zero**, ```dims``` is interpreted as the **full input shape** (static batch). A small client-side sketch follows below.
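As an illustration of the non-zero case on the client side, a sketch assuming the ```pre_gfpgan_batch``` config above is loaded and the server listens on the default gRPC port:
```python
import numpy as np
import tritonclient.grpc as grpcclient

client = grpcclient.InferenceServerClient(url="localhost:8001")

# max_batch_size: 8 and dims: [-1, -1, 3]  ->  the actual tensor is [N, H, W, 3] with 1 <= N <= 8
batch = np.random.randint(0, 255, size=(4, 512, 512, 3)).astype(np.uint8)
inp = grpcclient.InferInput("input__0", list(batch.shape), "UINT8")
inp.set_data_from_numpy(batch)
out = grpcclient.InferRequestedOutput("output__0")
res = client.infer(model_name="pre_gfpgan_batch", inputs=[inp], outputs=[out])
print(res.as_numpy("output__0").shape)  # [4, 3, 512, 512]: output dims [3, -1, -1] plus the batch dim
```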
--------------------------------------------------------------------------------
/Deploy/Triton-inference-server/docs/model_ensemble.md:
--------------------------------------------------------------------------------
1 | # Ensemble multiple models and pre/post-processing
2 |
3 | This section covers Model Ensemble as a solution to two situations:
4 | - Building an end-to-end pipeline that chains two or more models (the output of one model is the input of another)
5 | - Integrating pre-processing / post-processing into the pipeline
6 |
7 | Note: both situations are handled in the same way.
8 | ### 1. Problem statement
9 | For example, in my case I use the GFPGan model with several different datasets, which yields several versions of the model. All of these versions share the **same** **pre-processing** and **post-processing**. The current deployment keeps pre/post-processing on the ```client``` side, which becomes awkward when scaling. So the question is how to integrate these two steps into Triton quickly and flexibly, minimizing intermediate data transfer and the number of requests sent. Triton supports this through **Model Ensemble**. The idea boils down to two points:
10 | - Build the pre/post-processing code as a triton model
11 | - Create an ensemble model: ```pre-processing -> infer -> post-processing```. This is not a real model but a ```dataflow``` defined through the model configuration
12 | ### 2. Convert the pre/post-processing into models
13 | Take my image pre-processing as an example (numpy & opencv-python, pure CPU):
14 | ```
15 | def triton_preprocess(cropped_face):
16 | rgb = cv2.cvtColor(cropped_face, cv2.COLOR_BGR2RGB) # BGR sang RGB
17 | rgb = rgb.astype("float32") / 255.0 # Rescale về đoạn [0, 1]
18 | rgb = (rgb - 0.5)/0.5 # Rescale từ [0, 1] về [-1, 1]
19 | rgb = np.expand_dims(rgb, axis = 0) # [256, 256, 3] -> [1, 256, 256, 3]
20 | return np.transpose(rgb, (0, 3, 1, 2)) # [1, 256, 256, 3] -> [1, 3, 256, 256]
21 |
22 | def triton_postprocess(net_out, min_max = (-1, 1)):
23 | net_out = np.transpose(net_out, (0, 2, 3, 1)) # [1, 3, 256, 256] -> [1, 256, 256, 3]
24 | net_out = np.clip(net_out[0], min_max[0], min_max[1]) # [1, 256, 256, 3] -> [256, 256, 3] & clip
25 | net_out = (net_out - min_max[0]) / (min_max[1] - min_max[0]) # Rescale từ [-1, 1] về [0, 1]
26 | net_out = np.array(net_out * 255.0, dtype = np.uint8) # Rescale từ [0, 1] về [0, 255] với uint8
27 | return cv2.cvtColor(net_out, cv2.COLOR_RGB2BGR) # RGB sang BGR
28 | ```
29 | Convert it to pytorch:
30 | ```
31 | class GFPGanPreprocessor(nn.Module):
32 | def __init__(self):
33 | super(GFPGanPreprocessor, self).__init__()
34 | def forward(self, x):
35 | x = x[:, :, [2, 1, 0]]
36 | x = x / 255.0
37 | x = (x - 0.5)/0.5
38 | x = torch.unsqueeze(x, 0)
39 | return torch.permute(x, (0, 3, 1, 2))
40 |
41 | class GFPGanPostprocessor(nn.Module):
42 | def __init__(self):
43 | super(GFPGanPostprocessor, self).__init__()
44 | def forward(self, x):
45 | x = torch.permute(x, (0, 2, 3, 1))
46 | x = torch.clamp(x, -1.0, 1.0)
47 | x = ((x + 1.0)/2.0*255.0).byte()
48 | return x[:, :, [2, 1, 0]]
49 | ```
50 | Use pytorch JIT; if you are not familiar with JIT yet, see this article:
51 | - [Deploying a model with Pytorch (TorchScript) and Triton](./triton_pytorch.md)
52 | ```
53 | # JIT
54 | pre_model = GFPGanPreprocessor()
55 | post_model = GFPGanPostprocessor()
56 | pre_model.eval()
57 | post_model.eval()
58 |
59 | pre_x = torch.rand((256, 256, 3))
60 | pre_traced_cell = torch.jit.trace(pre_model, (pre_x,), strict=False, check_trace=True)
61 | print(pre_model(pre_x))
62 | print(pre_traced_cell(pre_x))
63 | pre_traced_cell.save('pre_traced_cell.pt')
64 |
65 | post_x = torch.rand((1, 3, 256, 256))
66 | post_traced_cell = torch.jit.trace(post_model, (post_x,), strict=False, check_trace=True)
67 | print(post_model(post_x))
68 | print(post_traced_cell(post_x))
69 | post_traced_cell.save('post_traced_cell.pt')
70 | ```
71 | As a result we obtain two files, ```pre_traced_cell.pt``` and ```post_traced_cell.pt```, which are the pre/post-processing models.
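Optionally, a small sketch to reload the traced modules and sanity-check the shapes before pushing them to Triton:
```python
import torch

# Reload the traced pre/post-processing modules and run a quick shape check
pre = torch.jit.load('pre_traced_cell.pt')
post = torch.jit.load('post_traced_cell.pt')

x = torch.rand((256, 256, 3))
y = pre(x)
print(y.shape)        # expect [1, 3, 256, 256]
print(post(y).shape)  # shape after post-processing
```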
72 | ### 3. Push the models to triton-server
73 | This step is straightforward: push the two models to triton with the following configurations
74 | - Pre-process
75 | ```
76 | name: "pre_gfpgan"
77 | platform: "pytorch_libtorch"
78 | max_batch_size: 0
79 | input [
80 | {
81 | name: "input__0"
82 | data_type: TYPE_UINT8
83 | dims: [-1, -1, 3]
84 | }
85 | ]
86 | output [
87 | {
88 | name: "output__0"
89 | data_type: TYPE_FP32
90 | dims: [1, 3, -1, -1]
91 | }
92 | ]
93 | ```
94 | - Post-process
95 | ```
96 | name: "post_gfpgan"
97 | platform: "pytorch_libtorch"
98 | max_batch_size: 0
99 | input [
100 | {
101 | name: "input__0"
102 | data_type: TYPE_FP32
103 | dims: [1, 3, -1, -1]
104 | }
105 | ]
106 | output [
107 | {
108 | name: "output__0"
109 | data_type: TYPE_UINT8
110 | dims: [-1, -1, 3]
111 | }
112 | ]
113 | ```
114 | - When pushing to triton, the EXPLICIT mode is recommended, as described here:
115 | + [Model management modes (load/unload/reload)](./model_management.md)
116 |
117 | ### 4. Create the Ensemble Model
118 | Set up the ensemble model with input ```raw_image``` and output ```image_out```:
119 | - During pre-processing, ```raw_image``` is fed into input ```input__0``` of the ```pre_gfpgan``` model we just loaded into triton above
120 | - ```pre_gfpgan``` returns ```preprocessed_image```, which is fed into ```input__0``` of the ```infer_face_restoration_v2.1``` model
121 | - The output of ```infer_face_restoration_v2.1```, which we name ```net_out```, becomes the input of ```post_gfpgan```; finally, the output of ```post_gfpgan``` is ```image_out```, which is also the final output of the ensemble
122 | ```
123 | name: "ens_face_restoration_v2.1"
124 | platform: "ensemble"
125 | max_batch_size: 0
126 | input [
127 | {
128 | name: "raw_image"
129 | data_type: TYPE_UINT8
130 | dims: [-1, -1, 3]
131 | }
132 | ]
133 | output [
134 | {
135 | name: "image_out"
136 | data_type: TYPE_UINT8
137 | dims: [-1, -1, 3]
138 | }
139 | ]
140 | ensemble_scheduling {
141 | step [
142 | {
143 | model_name: "pre_gfpgan"
144 | model_version: -1
145 | input_map {
146 | key: "input__0"
147 | value: "raw_image"
148 | }
149 | output_map {
150 | key: "output__0"
151 | value: "preprocessed_image"
152 | }
153 | },
154 | {
155 | model_name: "infer_face_restoration_v2.1"
156 | model_version: -1
157 | input_map {
158 | key: "input__0"
159 | value: "preprocessed_image"
160 | }
161 | output_map {
162 | key: "output__0"
163 | value: "net_out"
164 | }
165 | },
166 | {
167 | model_name: "post_gfpgan"
168 | model_version: -1
169 | input_map {
170 | key: "input__0"
171 | value: "net_out"
172 | }
173 | output_map {
174 | key: "output__0"
175 | value: "image_out"
176 | }
177 | }
178 | ]
179 | }
180 | ```
181 |
182 | Once the configuration is set up, create an **empty** directory ```1``` as the first version, push it to triton-server, and we are done.
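To exercise the ensemble end to end, a minimal client sketch (assuming the server runs on the default gRPC port and a hypothetical ```cropped_face.png``` input image):
```python
import cv2
import tritonclient.grpc as grpcclient

client = grpcclient.InferenceServerClient(url="localhost:8001")

# The ensemble takes the raw HxWx3 uint8 image and returns the restored HxWx3 uint8 image
raw = cv2.imread("cropped_face.png")  # hypothetical face crop
inp = grpcclient.InferInput("raw_image", list(raw.shape), "UINT8")
inp.set_data_from_numpy(raw)
out = grpcclient.InferRequestedOutput("image_out")
res = client.infer(model_name="ens_face_restoration_v2.1", inputs=[inp], outputs=[out])
cv2.imwrite("restored_face.png", res.as_numpy("image_out"))
```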
--------------------------------------------------------------------------------
/Deploy/Triton-inference-server/docs/model_instance.md:
--------------------------------------------------------------------------------
1 | # Model Instance
2 |
3 | When scaling the system up, we want to run multiple instances of a model on one or more GPUs to maximize throughput and reduce user-facing latency: an incoming request has more workers to choose from, which removes the inference bottleneck. This section therefore covers the Model Instance configuration of ```triton-server```.
4 | ### 1. How Model Instances work in triton-server
5 | The Triton architecture allows multiple models, and one or more instances of the same model, to execute in parallel on the system. The system may have zero, one or many GPUs. The figure below illustrates this with 2 models: assuming Triton is currently idle, when 2 requests arrive at the same time, one per model, Triton immediately schedules both on the GPU and executes them in parallel. If the system has no GPU, scheduling happens on CPU threads and depends on the host OS.
6 |
7 | ![](../fig/multi_model_exec.png)
8 |
9 |
10 | By default, if multiple requests for the same model arrive at the same time, Triton schedules them so that only one request is processed at a time
11 |
12 | ![](../fig/multi_model_serial_exec.png)
13 |
14 |
15 | Triton provides a model configuration option called **instance-group** that specifies how many executions may run in parallel; each such execution is called an **instance**. By default, Triton places the **instances** on different GPUs. In the figure below there are 3 instances and 4 incoming requests: the 4th request has to wait until one of the first 3 executions finishes before it can start.
16 |
17 | ![](../fig/multi_model_parallel_exec.png)
18 |
--------------------------------------------------------------------------------
/Deploy/Triton-inference-server/docs/model_management.md:
--------------------------------------------------------------------------------
1 | # Model Management
2 |
3 | Triton has 3 model management modes: **NONE** (default), **EXPLICIT** (dynamic) and **POLL**
4 |
5 | ### NONE Mode (Default)
6 | - Set with ```--model-control-mode=none```
7 | - Triton loads every model and its configuration into memory at startup; models that fail to load are skipped and unavailable.
8 | - Changing a model's repository while the server is running has no effect on the running system
9 | - The ```load``` and ```unload``` APIs from ```triton-client``` **cannot** be used
10 | - Pros:
11 | + Easy to use
12 | - Cons:
13 | + Hard to customize
14 | + Adding/removing models **requires** restarting ```triton-server```
15 | ### EXPLICIT Mode (Recommended)
16 | - Set with ```--model-control-mode=explicit```
17 | - By default triton loads **no** models into memory unless the ```--load-model``` flag is given. So with the default startup you have to call the ```load``` API **manually** for the models you need
18 | - Models can be ```load```ed and ```unload```ed at will through the ```triton-client``` API
19 | - Changing a model's repository while the server is running can take effect on the running system by **reloading that model**
20 | - Pros:
21 | + Easy to customize
22 | + Adding/removing models does **not** require restarting ```triton-server```
23 | - Cons:
24 | + A bit harder to get used to
25 |
26 | See the ```Load/Unload/Reload``` model API in Python [here](../src/sample_load_unload.py); a minimal sketch is shown below.
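A minimal sketch of the explicit-mode control API with the gRPC client (the model name is illustrative; the server must have been started with ```--model-control-mode=explicit```):
```python
import tritonclient.grpc as grpcclient

client = grpcclient.InferenceServerClient(url="localhost:8001")

# Load, query and unload a model by name
client.load_model("wav2vec_general_v2")
print(client.is_model_ready("wav2vec_general_v2"))  # True once loading succeeded
client.unload_model("wav2vec_general_v2")
```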
27 | ### POLL
28 | Reportedly not recommended for **production**, so it is not covered here.
--------------------------------------------------------------------------------
/Deploy/Triton-inference-server/docs/optimization_pytorch.md:
--------------------------------------------------------------------------------
1 | # Optimize Pytorch Backend
2 | When starting ```triton-server``` with models that use the ```pytorch``` backend, you will sometimes see messages like:
3 | ```
4 | I1227 03:45:06.216251 1 libtorch.cc:1255] TRITONBACKEND_ModelInitialize: license_plate_restoration_square_v1.1 (version 1)
5 | I1227 03:45:06.216786 1 libtorch.cc:251] Optimized execution is enabled for model instance 'license_plate_restoration_square_v1.1'
6 | I1227 03:45:06.216796 1 libtorch.cc:269] Inference Mode is disabled for model instance 'license_plate_restoration_square_v1.1'
7 | I1227 03:45:06.216800 1 libtorch.cc:344] NvFuser is not specified for model instance 'license_plate_restoration_square_v1.1'
8 | ```
9 | These messages mean that **Inference Mode** and **NvFuser** have not been enabled for speed. This section therefore covers how to configure the ```pytorch``` backend of ```triton-server``` with suitable parameters.
10 |
11 | ### 1. Inference Mode
12 |
13 | **InferenceMode** works like **NoGradMode**: autograd is disabled. So in the vast majority of cases, when the model is not special (i.e. it contains no operators affected by autograd tracking), we can enable **InferenceMode** in the model configuration as follows:
14 |
15 | ```
16 | parameters: {
17 | key: "INFERENCE_MODE"
18 | value: {
19 | string_value:"true"
20 | }
21 | }
22 | ```
23 |
24 | - Result with **Inference Mode** disabled (default)
25 | ```
26 | Inferences/Second vs. Client p95 Batch Latency
27 | Concurrency: 1, throughput: 46.4 infer/sec, latency 24657 usec
28 | Concurrency: 2, throughput: 53.8 infer/sec, latency 41444 usec
29 | Concurrency: 3, throughput: 54 infer/sec, latency 59257 usec
30 | Concurrency: 4, throughput: 53.4 infer/sec, latency 81955 usec
31 | ```
32 | - Result after enabling it (slightly improved)
33 | ```
34 | Inferences/Second vs. Client p95 Batch Latency
35 | Concurrency: 1, throughput: 42.6 infer/sec, latency 27506 usec
36 | Concurrency: 2, throughput: 54.4 infer/sec, latency 40857 usec
37 | Concurrency: 3, throughput: 54 infer/sec, latency 60192 usec
38 | Concurrency: 4, throughput: 53.6 infer/sec, latency 81830 usec
39 | ```
40 |
41 | ### 2. NvFuser (CUDA Graph Fuser)
42 | If you have read about **TensorRT Optimization**, **NvFuser** works in a similar spirit: it fuses several operators together to speed up execution. This kind of fusion has become very common and is built into most frameworks nowadays.
43 | Enable **NvFuser**:
44 | ```
45 | parameters: {
46 | key: "ENABLE_NVFUSER"
47 | value: {
48 | string_value:"true"
49 | }
50 | }
51 | ```
52 |
53 | ### 3. Other optimization flags
54 | There are some other **optimization flags** that can be tried
55 | ```
56 | ENABLE_JIT_EXECUTOR
57 | ```
58 | ```
59 | ENABLE_JIT_PROFILING
60 | ```
61 | ```
62 | ENABLE_TENSOR_FUSER
63 | ```
64 | Note that enabling all ```optimization flags``` does not necessarily give the best result. The recommendation is to use only **INFERENCE_MODE** by default. Below is the result with all ```optimization flags``` enabled
65 |
66 | ```
67 | Inferences/Second vs. Client p95 Batch Latency
68 | Concurrency: 1, throughput: 42.2 infer/sec, latency 27052 usec
69 | Concurrency: 2, throughput: 48.2 infer/sec, latency 46771 usec
70 | Concurrency: 3, throughput: 49.8 infer/sec, latency 65506 usec
71 | Concurrency: 4, throughput: 29 infer/sec, latency 189399 usec
72 | ```
73 |
74 | ### 4. Model Instance
75 | Running multiple **instances** speeds things up because the incoming request stream has more consumers to choose from. However, the ```optimization flags``` sometimes cause errors in this setup, so when using multiple **model instances** it is safer to set ```DISABLE_OPTIMIZED_EXECUTION```
76 | ```
77 | parameters: {
78 | key: "DISABLE_OPTIMIZED_EXECUTION"
79 | value: {
80 | string_value:"true"
81 | }
82 | }
83 | ```
--------------------------------------------------------------------------------
/Deploy/Triton-inference-server/docs/perf_analyzer.md:
--------------------------------------------------------------------------------
1 | # Performance Analyzer Tool
2 | **Performance Analyzer** (```perf_analyzer```) is a tool for analyzing performance from the client side. To use it, **triton-client** must be built from source as described here:
3 | - [Installing triton-client from source](install.md#22-advanced-installation)
4 |
5 | Some terms to pay attention to:
6 | - **Throughput**: the rate at which requests are processed (usually requests/s)
7 | - **Latency**: the time it takes to finish processing one request
8 |
9 | Example: if the throughput achieved at concurrency 1 is 50 requests/s with a latency of 100 ms, then raising the number of concurrencies to 2 may leave throughput unchanged while latency grows to 200 ms (the server is already saturated)
10 |
11 | As an example, let's analyze the ```wav2vec_general_v2``` model deployed earlier:
12 | ```
13 | perf_analyzer -m wav2vec_general_v2 --percentile=95 --concurrency-range 1:8 --shape input:1,320000
14 | ```
15 | The result gives us the **throughput** and **latency** with the **ONNX-runtime** backend
16 |
17 | ![](../fig/wav2vec_general_perf_onnx.jpg)
18 |
19 |
20 | Now change ```config.pbtxt``` to use the optimized **ONNX-TensorRT** path, restart Triton-inference-server and compare the results. Note that loading the model will take longer, because on every restart triton converts the model from **ONNX** to **TensorRT**
21 | ```
22 | optimization { execution_accelerators {
23 | gpu_execution_accelerator : [ {
24 | name : "tensorrt"
25 | parameters { key: "precision_mode" value: "FP32" }
26 | parameters { key: "max_workspace_size_bytes" value: "1073741824" }
27 | }]
28 | }}
29 | ```
30 |
31 | Note that 1073741824 = 1 x 1024 x 1024 x 1024 (bytes) = 1 GB is the default ```workspace``` value; for **large** models this should be raised, e.g. **4 GB = 4294967296**
32 |
33 | Result
34 |
35 | ![](../fig/wav2vec_general_perf_tensorrt.jpg)
36 |
37 |
38 | So for this model, the TensorRT backend (FP32) gives a significant speedup (**1.76x**) over the plain ONNX-runtime backend.
39 | *(Try FP16 as well; the speedup will not stop at **1.76x**.)*
--------------------------------------------------------------------------------
/Deploy/Triton-inference-server/docs/triton_kaldi.md:
--------------------------------------------------------------------------------
1 | # Kaldi ASR with Triton-inference-server
2 | This section covers how to use the Kaldi backend in Triton.
3 | ### 1. Build
4 | - Build docker image
5 | ```
6 | git clone https://github.com/NVIDIA/DeepLearningExamples.git
7 | cd DeepLearningExamples/Kaldi/SpeechRecognition
8 | scripts/docker/build.sh
9 | ```
10 | - Download the sample LibriSpeech model
11 | ```
12 | scripts/docker/launch_download.sh
13 | ```
14 | - Launch triton-kaldi-server with LibriSpeech
15 | ```
16 | scripts/docker/launch_server.sh
17 | ```
18 | ### 2. Load custom model
19 | In this part we use triton to load a customized model.
20 | - Create a new directory in the current working directory
21 | ```
22 | models/infer_asr_kaldi_radio_v1/1
23 | ```
24 | where ```infer_asr_kaldi_radio_v1``` is the name of my model.
25 | - Run triton from the current directory in ```EXPLICIT``` mode
26 | ```
27 | docker run --rm -it \
28 | --gpus device=0 \
29 | --shm-size=1g \
30 | --ulimit memlock=-1 \
31 | --ulimit stack=67108864 \
32 | -p8005:8000 \
33 | -p8006:8001 \
34 | -p8007:8002 \
35 | --name trt_server_asr \
36 | -v $PWD/data:/data \
37 | -v $PWD/model-repo:/mnt/model-repo \
38 | -v $PWD/models:/models \
39 | triton_kaldi_server tritonserver --model-repo=/models --model-control-mode=explicit
40 | ```
41 | where ```$PWD/models``` is the directory we just created
42 | - In another screen, copy ```libtriton_kaldi.so``` out of the running container
43 | ```
44 | docker ps
45 | docker exec -it <container-id> bash
46 | cp /workspace/model-repo/kaldi_online/1/libtriton_kaldi.so /models/infer_asr_kaldi_radio_v1/
47 | ```
48 | - Build the following directory structure (remember to fix the paths inside the ```.conf``` files accordingly):
49 | ```
50 | ├── models
51 | │ ├── infer_asr_kaldi_radio_v1
52 | │ │ ├── 1
53 | │ │ │ ├── conf
54 | │ │ │ │ ├── ivector_extractor.conf
55 | │ │ │ │ ├── mfcc.conf
56 | │ │ │ │ ├── online.conf
57 | │ │ │ │ ├── online_cmvn.conf
58 | │ │ │ │ ├── splice.conf
59 | │ │ │ ├── ivector_extractor
60 | │ │ │ │ ├── final.dubm
61 | │ │ │ │ ├── final.ie
62 | │ │ │ │ ├── final.mat
63 | │ │ │ │ ├── global_cmvn.stats
64 | │ │ │ │ ├── online_cmvn.conf
65 | │ │ │ │ ├── online_cmvn_iextractor
66 | │ │ │ │ ├── splice_opts
67 | │ │ │ ├── final.mdl
68 | │ │ │ ├── global_cmvn.stats
69 | │ │ │ ├── HCLG.fst
70 | │ │ │ ├── words.txt
71 | │ │ ├── config.pbtxt
72 | │ │ ├── libtriton_kaldi.so
73 | ```
74 | Note: the file ```/models/infer_asr_kaldi_radio_v1/1/global_cmvn.stats``` is different from ```/models/infer_asr_kaldi_radio_v1/1/ivector_extractor/global_cmvn.stats```
75 | - Load the model into triton via the [gRPC API](../docs/model_management.md)
76 |
--------------------------------------------------------------------------------
/Deploy/Triton-inference-server/docs/triton_onnx.md:
--------------------------------------------------------------------------------
1 | # ONNX-runtime with Triton-inference-server
2 |
3 | To deploy an ONNX model running on ONNX-runtime (besides ONNX-runtime, the TensorRT-runtime can also be used if supported), set the platform to ```onnxruntime_onnx```; the other basic configuration parameters are the same as usual. I deploy the ```wav2vec_general_v2``` model as follows:
4 | - Inside the ```models``` directory, create a ```wav2vec_general_v2``` directory holding the configuration file and the weights
5 | - Put the weights at ```models/wav2vec_general_v2/1/model.onnx```, where ```1``` is the model version
6 | - Put the config at ```models/wav2vec_general_v2/config.pbtxt```; note that it does not go inside the version directory
7 |
8 | ```
9 | name: "wav2vec_general_v2"
10 | platform: "onnxruntime_onnx"
11 | max_batch_size : 0
12 | input [
13 | {
14 | name: "input"
15 | data_type: TYPE_FP32
16 | dims: [1, -1]
17 | }
18 | ]
19 | output [
20 | {
21 | name: "output"
22 | data_type: TYPE_FP32
23 | dims: [-1, -1, 105]
24 | }
25 | ]
26 | ```
27 | - Push the model to triton-server
28 | ```
29 | python src/sample_load_unload.py --models wav2vec_general_v2
30 | ```
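Once the model is loaded, a minimal inference sketch against this config (here just random data of shape ```[1, 16000]``` to match ```dims: [1, -1]```; real input would be mono audio, assumed here to be 16 kHz):
```python
import numpy as np
import tritonclient.grpc as grpcclient

client = grpcclient.InferenceServerClient(url="localhost:8001")

# A [1, N] FP32 tensor matching the "input" definition above
audio = np.random.rand(1, 16000).astype(np.float32)
inp = grpcclient.InferInput("input", list(audio.shape), "FP32")
inp.set_data_from_numpy(audio)
out = grpcclient.InferRequestedOutput("output")
res = client.infer(model_name="wav2vec_general_v2", inputs=[inp], outputs=[out])
print(res.as_numpy("output").shape)  # config dims [-1, -1, 105], e.g. (1, T, 105)
```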
--------------------------------------------------------------------------------
/Deploy/Triton-inference-server/docs/triton_tensorrt.md:
--------------------------------------------------------------------------------
1 | # TensorRT-runtime with Triton-inference-server
2 |
3 | To deploy a model with the TensorRT-runtime instead of ONNX-runtime (the model is usually converted to ONNX first, then to TensorRT), the weights file **must** be converted with the **exact** TensorRT version used by the triton-inference-server docker image. So we enter the running docker environment as follows:
4 |
5 | - Get the ID of the container running triton-inference-server
6 | ```
7 | damnguyen@rnd3:~$ docker ps
8 |
9 | CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
10 | 6ef0b4972292 nvcr.io/nvidia/tritonserver:21.12-py3 "/opt/tritonserver/n…" 23 hours ago Up 23 hours 0.0.0.0:8000-8002->8000-8002/tcp, :::8000-8002->8000-8002/tcp cranky_hamilton
11 | b09d98350935 quay.io/cloudhut/kowl:master-645e3b4 "./kowl" 6 days ago Up 6 days gifted_davinci
12 | ```
13 | the triton CONTAINER ID here is ```6ef0b4972292```
14 | - Open a bash shell inside the triton container
15 | ```
16 | damnguyen@rnd3:~$ docker exec -it 6ef0b4972292 bash
17 | root@6ef0b4972292:/opt/tritonserver#
18 | ```
19 | - Convert the ONNX model to TensorRT (the syntax is the same as when working with a regular TensorRT engine)
20 | ```
21 | /usr/src/tensorrt/bin/trtexec --onnx=<model.onnx> --saveEngine=<model.plan>
22 | ```
23 | - Deploy the model to triton just like the ONNX case, using the platform name ```tensorrt_plan``` instead of ```onnxruntime_onnx```
24 | + [Deploying a model with ONNX-runtime and Triton](./triton_onnx.md)
--------------------------------------------------------------------------------
/Deploy/Triton-inference-server/fig/multi_model_exec.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deploy/Triton-inference-server/fig/multi_model_exec.png
--------------------------------------------------------------------------------
/Deploy/Triton-inference-server/fig/multi_model_parallel_exec.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deploy/Triton-inference-server/fig/multi_model_parallel_exec.png
--------------------------------------------------------------------------------
/Deploy/Triton-inference-server/fig/multi_model_serial_exec.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deploy/Triton-inference-server/fig/multi_model_serial_exec.png
--------------------------------------------------------------------------------
/Deploy/Triton-inference-server/fig/wav2vec_general_perf_onnx.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deploy/Triton-inference-server/fig/wav2vec_general_perf_onnx.jpg
--------------------------------------------------------------------------------
/Deploy/Triton-inference-server/fig/wav2vec_general_perf_tensorrt.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deploy/Triton-inference-server/fig/wav2vec_general_perf_tensorrt.jpg
--------------------------------------------------------------------------------
/Deploy/Triton-inference-server/fig/wav2vec_general_start.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deploy/Triton-inference-server/fig/wav2vec_general_start.jpg
--------------------------------------------------------------------------------
/Deploy/Triton-inference-server/src/sample_grpc.py:
--------------------------------------------------------------------------------
1 | import sys, time
2 | import tritonclient.grpc as grpcclient
3 | class TritonModelGRPC:
4 | '''
5 | Sample model-request triton-inference-server with gRPC
6 | '''
7 | def __init__(self,
8 | triton_host = 'localhost:8001', # default gRPC port
9 | triton_model_name = 'wav2vec_general_v2',
10 | verbose = False):
11 | print('Init connection from Triton-inference-server')
12 | print('- Host: {}'.format(triton_host))
13 | print('- Model: {}'.format(triton_model_name))
14 | self.triton_host = triton_host
15 | self.triton_model_name = triton_model_name
16 | self.model = grpcclient.InferenceServerClient(url=self.triton_host,
17 | verbose=verbose,
18 | ssl=False,
19 | root_certificates=None,
20 | private_key=None,
21 | certificate_chain=None)
22 | if not self.model.is_server_live():
23 | print("FAILED : is_server_live")
24 | sys.exit(1)
25 |
26 | if not self.model.is_server_ready():
27 | print("FAILED : is_server_ready")
28 | sys.exit(1)
29 |
30 | if not self.model.is_model_ready("wav2vec_general_v2"):
31 | print("FAILED : is_model_ready")
32 | sys.exit(1)
33 | self.verbose = verbose
34 |
35 | def run(self, feats):
36 | # Input shape must be [-1]
37 | assert len(feats.shape) == 2, "Shape not support: {}".format(feats.shape)
38 | assert feats.shape[0] == 1, "Shape not support: {}".format(feats.shape)
39 | feats_length = feats.shape[-1]
40 | if self.verbose:
41 | print('='*50)
42 | print('- Input shape: [1, {}]'.format(feats_length))
43 | inputs = []
44 | outputs = []
45 | inputs.append(grpcclient.InferInput('input', [1, feats_length], "FP32"))
46 | inputs[0].set_data_from_numpy(feats)
47 | outputs.append(grpcclient.InferRequestedOutput('output'))
48 | if self.verbose:
49 | tik = time.time()
50 | results = self.model.infer(
51 | model_name="wav2vec_general_v2",
52 | inputs=inputs,
53 | outputs=outputs,
54 | client_timeout=None)
55 | if self.verbose:
56 | tok = time.time()
57 | print('- Time cost:', tok - tik)
58 | output = results.as_numpy('output')
59 | return output
60 |
61 |
--------------------------------------------------------------------------------
/Deploy/Triton-inference-server/src/sample_load_unload.py:
--------------------------------------------------------------------------------
1 | '''
2 | Example
3 | - Load model
4 | python3 sample_load_unload.py --models emotion_recognition_v1.1
5 | - Unload model
6 | python3 sample_load_unload.py --unload --models emotion_recognition_v1.1
7 | - Load model from file
8 | python3 sample_load_unload.py --path --models model_list.txt
9 | '''
10 | import argparse
11 | import tritonclient.grpc as grpcclient
12 |
13 | parser = argparse.ArgumentParser(description='Load/Unload model')
14 | parser.add_argument('--models', default="", help='list of model names to load/unload')
15 | parser.add_argument('--unload', action = "store_true", help='load or unload model')
16 | parser.add_argument('--reload', action = "store_true", help='reload model')
17 | parser.add_argument('--path', action = "store_true", help='get list of models from filepath')
18 | parser.add_argument('--url', default="localhost:8001", help='default triton-server URL')
19 | args = parser.parse_args()
20 |
21 | if not args.path:
22 | MODEL_NAMES = args.models.strip().split(',')
23 | else:
24 | MODEL_NAMES = open(args.models).read().strip('\n').split('\n')
25 | URL = args.url
26 | triton_client = grpcclient.InferenceServerClient(url=URL, verbose=True)
27 | triton_client.is_server_live()
28 | triton_client.get_model_repository_index().models
29 | if args.unload:
30 | for MODEL_NAME in MODEL_NAMES:
31 | if triton_client.is_model_ready(MODEL_NAME):
32 | print('UNLOAD: {}'.format(MODEL_NAME))
33 | triton_client.unload_model(MODEL_NAME)
34 | else:
35 | print('Skip: {}'.format(MODEL_NAME))
36 | else:
37 | for MODEL_NAME in MODEL_NAMES:
38 | if triton_client.is_model_ready(MODEL_NAME):
39 | if args.reload:
40 | print('RELOAD: {}'.format(MODEL_NAME))
41 | triton_client.unload_model(MODEL_NAME)
42 | triton_client.load_model(MODEL_NAME)
43 | else:
44 | print('Skip: {}'.format(MODEL_NAME))
45 | else:
46 | print('LOAD: {}'.format(MODEL_NAME))
47 | triton_client.load_model(MODEL_NAME)
48 |
49 |
50 | print('='*70)
51 | triton_client.get_model_repository_index().models
--------------------------------------------------------------------------------
/Framework/ONNX/README.md:
--------------------------------------------------------------------------------
1 | # AI-Engineer-Howto
2 |
3 | Everything related to ONNX and ONNX-runtime
--------------------------------------------------------------------------------
/Framework/Pytorch/README.md:
--------------------------------------------------------------------------------
1 | # AI-Engineer-Howto
2 |
3 | Everything related to Pytorch & Pytorch-serving
4 | - [Build Pytorch from source](docs/build_from_source.md)
--------------------------------------------------------------------------------
/Framework/Pytorch/docs/build_from_source.md:
--------------------------------------------------------------------------------
1 | # Pytorch
2 |
3 | ## Build pytorch from source (best config for AMD CPU & NVIDIA-GPU)
4 | We will use OpenBLAS instead of MKL & MKLDNN
5 | ```
6 | # Install anaconda (if not)
7 | curl -O https://repo.anaconda.com/archive/Anaconda3-2020.07-Linux-x86_64.sh
8 | bash Anaconda3-2020.07-Linux-x86_64.sh
9 | source ~/anaconda3/bin/activate
10 |
11 | # Install dependencies
12 | conda create -n myenv_pytorch_1.9 python=3.8
13 | conda activate myenv_pytorch_1.9
14 | conda install astunparse numpy ninja pyyaml setuptools cmake cffi typing_extensions future six requests dataclasses
15 | pip install ninja
16 |
17 | # Build
18 | git clone --recursive --branch v1.9.1 https://github.com/pytorch/pytorch.git
19 | cd pytorch
20 | export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
21 | USE_NCCL=ON USE_CUDNN=OFF USE_CUDA=ON USE_MKL=OFF USE_MKLDNN=OFF python setup.py install
22 | ```
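A quick sanity check of the resulting build (a minimal sketch; the exact output depends on your build flags):
```python
import torch

# Verify the source build: version, CUDA support, and BLAS backend
print(torch.__version__)            # expect 1.9.1
print(torch.cuda.is_available())    # True if the CUDA build worked
print(torch.__config__.show())      # build flags; BLAS should be OpenBLAS, MKL/MKLDNN disabled
```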
23 | ## Compatible with
24 | - TorchVision: 0.10.1
25 | - OpenCV: 4.6.0
26 | - MMCV: 1.3.3
27 | - MMCV Compiler: GCC 9.4
28 | - MMCV CUDA Compiler: 11.3
29 | - MMDetection: 2.7.0+e78eee5
30 |
31 |
--------------------------------------------------------------------------------
/Framework/TensorRT/README.md:
--------------------------------------------------------------------------------
1 | # TensorRT
2 | - [Convert ONNX model to TensorRT](docs/tutorial.md)
3 |
--------------------------------------------------------------------------------
/Framework/TensorRT/docs/tutorial.md:
--------------------------------------------------------------------------------
1 | # AI-Engineer-Howto
2 | ## Convert model to TensorRT
3 | ### 1. Convert model to ONNX
4 | For convenient deployment, models built with different frameworks should first be converted to ONNX; converting from ONNX to other runtimes, especially TensorRT, is then much easier
5 | ### 2. Get input/output shape
6 | After converting the model to ONNX, we need to determine the input/output shapes (basically only the inputs matter) and the corresponding parameters. The easiest way is to use [netron](https://netron.app/) to inspect the architecture. The figure below shows the SCRFD face detection model visualized with [netron](https://netron.app/); if you prefer a script, see the sketch after the list below:
7 |
8 | ![](../fig/sample_netron_scrfd.png)
9 |
10 |
11 | - Input:
12 | - **input.1** (float32): [batch_size, 3, 640, 640], i.e. [-1, 3, 640, 640] (non-numeric values are interpreted as dynamic dimensions)
13 | - Output:
14 | - **num_detections** (int32): [-1, 1]
15 | - **nmsed_boxes** (float32): [-1, 200, 4]
16 | - **nmsed_scores** (float32): [-1, 200]
17 | - **nmsed_classes** (float32): [-1, 200]
18 | - **nmsed_landmarks** (float32): [-1, 200, 10]
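If you prefer a script over netron, a small sketch using the ```onnx``` package to print every input/output shape (the file name is hypothetical; symbolic dimensions such as ```batch_size``` are the dynamic ones):
```python
import onnx

# Print the input/output tensor shapes of an ONNX model
model = onnx.load("scrfd.onnx")
for tensors, kind in ((model.graph.input, "input"), (model.graph.output, "output")):
    for t in tensors:
        dims = [d.dim_param if d.dim_param else d.dim_value
                for d in t.type.tensor_type.shape.dim]
        print(kind, t.name, dims)
```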
19 | ### 3. Serialize Engine
20 | First we need to understand **dynamic** vs **static** with respect to **shape** and **batch**:
21 | - **batch** (batch size): the number of input items, usually the first dimension of the tensor
22 | - **shape**: the sizes of all dimensions of the tensor, including the **batch** dimension
23 | - **dynamic**: variable
24 | - **static**: fixed
25 |
26 | This gives us:
27 | - **dynamic batch**: only the **batch** dimension is dynamic, the other dimensions are fixed; e.g. for [-1, 3, 640, 640] the inputs [1, 3, 640, 640], [7, 3, 640, 640], ... are valid, while [1, 3, 640, 512], [1, 4, 640, 640], ... are not
28 | - **static shapes**: only one fixed shape is accepted; e.g. [4, 3, 640, 640] accepts only the input [4, 3, 640, 640], while [7, 4, 640, 512] accepts only [7, 4, 640, 512], ...
29 | - **dynamic shapes**: several dimensions are dynamic; e.g. [-1, 3, -1, 32] accepts inputs such as [4, 3, 214, 32], [12, 3, 320, 32], ...
30 |
31 | In practice we mostly care about **dynamic shapes** vs **static shapes**.
32 |
33 | Now convert (serialize). There are two main conversion modes: **implicitBatch** (default) and **explicitBatch**. The batch size is by default the first dimension; in the model above it is **-1**. If instead the model's input shape were **[1, 3, 640, 640]**, it would not support **dynamic shapes** but only **static shapes**, i.e. exactly one accepted input size. There are, however, ways to convert an ONNX model from **dynamic** to **static** and vice versa.
34 | - **implicitBatch** (default): works with models whose inputs have **static shapes**
35 | - **explicitBatch**: works with models whose inputs have **dynamic shapes**
36 |
37 | Example of converting a model with **implicitBatch** (the input shape is already **static** in the ONNX model metadata):
38 |
39 | ```
40 | /usr/src/tensorrt/bin/trtexec \
41 | --implicitBatch \
42 | --onnx=<model.onnx> \
43 | --saveEngine=output.plan \
44 | --device=0 \
45 | --verbose
46 | ```
47 |
48 | Example of converting a model with **explicitBatch** (here we must also specify the **minShapes**, **optShapes** and **maxShapes** of every input):
49 |
50 | ```
51 | /usr/src/tensorrt/bin/trtexec \
52 | --explicitBatch \
53 | --onnx=<model.onnx> \
54 | --minShapes=input.1:1x3x640x640 \
55 | --optShapes=input.1:1x3x640x640 \
56 | --maxShapes=input.1:4x3x640x640 \
57 | --saveEngine=output.plan \
58 | --device=0 \
59 | --verbose
60 | ```
61 |
62 | where:
63 | - **saveEngine**: path to the output TensorRT model, usually with a **.plan** or **.trt** extension
64 | - **device**: GPU ID
65 | - **verbose**: print the log of the conversion process
66 | - Shape syntax when there are several inputs: ```<input_name>:<shape>,<input_name>:<shape>,...```
67 |
68 | ### 4. Deserialize Engine & Inference
69 | Once we have the TensorRT model (also called the engine file), we need to load it and run inference
70 | - Install pycuda & the tensorrt python bindings as described [here](https://github.com/NNDam/Retinaface-TensorRT)
71 | - Wrap the model with 3 main functions: allocate_buffers, do_inference, post_process
72 | - See the example repositories for complete model wrappers
73 |
74 | #### 4.1. Allocate buffers
75 | Allocate memory for the inputs & outputs. Note that for a **dynamic shapes** model, the input and output buffers must be allocated according to **maxShapes**
76 |
77 | ```
78 | # Allocates all buffers required for an engine, i.e. host/device inputs/outputs. Assumes `import tensorrt as trt`, `import pycuda.driver as cuda`, `import pycuda.autoinit`, and a simple HostDeviceMem(host, device) container.
79 | def allocate_buffers(engine):
80 | inputs = []
81 | outputs = []
82 | bindings = []
83 | stream = cuda.Stream()
84 | out_shapes = []
85 | input_shapes = []
86 | out_names = []
87 | max_batch_size = engine.get_profile_shape(0, 0)[2][0]
88 | for binding in engine:
89 | binding_shape = engine.get_binding_shape(binding)
90 | # Fix -1 dimension for proper memory allocation for batch_size > 1
91 | if binding_shape[0] == -1: # Dynamic batch size
92 | binding_shape = (max_batch_size,) + binding_shape[1:]
93 | size = trt.volume(binding_shape)
94 | dtype = trt.nptype(engine.get_binding_dtype(binding))
95 | # Allocate host and device buffers
96 | host_mem = cuda.pagelocked_empty(size, dtype)
97 | device_mem = cuda.mem_alloc(host_mem.nbytes)
98 | # Append the device buffer to device bindings.
99 | bindings.append(int(device_mem))
100 | # Append to the appropriate list.
101 | if engine.binding_is_input(binding):
102 | inputs.append(HostDeviceMem(host_mem, device_mem))
103 | input_shapes.append(engine.get_binding_shape(binding))
104 | else:
105 | outputs.append(HostDeviceMem(host_mem, device_mem))
106 | #Collect original output shapes and names from engine
107 | out_shapes.append(engine.get_binding_shape(binding))
108 | out_names.append(binding)
109 | return inputs, outputs, bindings, stream, input_shapes, out_shapes, out_names, max_batch_size
110 | ```
111 |
112 | #### 4.2. Inference
113 | Run inference: copy the input data from host to the GPU device, execute on the GPU to produce the outputs, and copy the outputs from the GPU device back to the host
114 |
115 | ```
116 | def do_inference(context, bindings, inputs, outputs, stream):
117 | # Transfer input data to the GPU.
118 | [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
119 | # Run inference.
120 | context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
121 | # Transfer predictions back from the GPU.
122 | [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
123 | # Synchronize the stream
124 | stream.synchronize()
125 | # Return only the host outputs.
126 | return [out.host for out in outputs]
127 | ```
128 |
129 | #### 4.3. Post-processing
130 | Reshape the output buffers as required by the task and apply any other post-processing; a minimal sketch is shown below
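A minimal sketch of this step, assuming the ```out_shapes``` returned by ```allocate_buffers``` above and that the first dimension is the dynamic batch (the host buffers were sized for ```max_batch_size```, so we slice down to the batch actually used):
```python
import numpy as np

def post_process(flat_outputs, out_shapes, batch_size):
    # flat_outputs: the 1-D host arrays returned by do_inference (sized for max_batch_size)
    results = []
    for flat, shape in zip(flat_outputs, out_shapes):
        shape = (batch_size,) + tuple(shape[1:])  # replace the -1 batch dim with the real batch
        n = int(np.prod(shape))
        results.append(np.asarray(flat[:n]).reshape(shape))
    return results
```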
131 |
132 | ## Example repositories
133 | - [Retinaface](https://github.com/NNDam/Retinaface-TensorRT)
134 | - [vietocr](https://github.com/NNDam/vietocr-tensorrt)
135 | - [yolor](https://github.com/NNDam/yolor)
136 |
--------------------------------------------------------------------------------
/Framework/TensorRT/fig/sample_netron_scrfd.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Framework/TensorRT/fig/sample_netron_scrfd.png
--------------------------------------------------------------------------------
/Framework/Tensorflow/README.md:
--------------------------------------------------------------------------------
1 | # AI-Engineer-Howto
2 |
3 | Everything related to Tensorflow and Tensorflow-serving
--------------------------------------------------------------------------------
/Linux/README.md:
--------------------------------------------------------------------------------
1 | # Collection of FAQ about CUDA & Linux & apt-packages
2 |
3 | Build OpenCV from source
4 |
5 | - [Build OpenCV from source](docs/build_opencv.md)
6 |
7 |
8 |
9 | Install Math Kernel Library (MKL/BLAS/LAPACK/OPENBLAS)
10 | It is recommended to install all the math kernel libraries and then compile frameworks (e.g. pytorch, mxnet) from source with a custom config for optimization.
11 | Install all LAPACK+BLAS:
12 |
13 | ```
14 | sudo apt install libjpeg-dev libpng-dev libblas-dev libopenblas-dev libatlas-base-dev liblapack-dev liblapacke-dev gfortran
15 | ```
16 |
17 | Install MKL:
18 |
19 | ```
20 | # Get the key
21 | wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB
22 | # now install that key
23 | apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB
24 | # now remove the public key file exit the root shell
25 | rm GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB
26 | # Add to apt
27 | sudo wget https://apt.repos.intel.com/setup/intelproducts.list -O /etc/apt/sources.list.d/intelproducts.list
28 | sudo sh -c 'echo deb https://apt.repos.intel.com/mkl all main > /etc/apt/sources.list.d/intel-mkl.list'
29 | # Install
30 | sudo apt-get update
31 | sudo apt-get install intel-mkl-2020.4-912
32 | ```
33 |
34 |
35 |
36 | Fresh install NVIDIA driver (PC/Laptop/Workstation)
37 |
38 | ```
39 | # Remove old packages
40 | sudo apt-get remove --purge '^nvidia-.*'
41 | sudo apt-get install ubuntu-desktop
42 | sudo apt-get --purge remove "*cublas*" "cuda*"
43 | sudo apt-get --purge remove "*nvidia*"
44 | sudo add-apt-repository --remove ppa:graphics-drivers/ppa
45 | sudo rm /etc/X11/xorg.conf
46 | sudo apt autoremove
47 | sudo reboot
48 |
49 | # After restart
50 | sudo ubuntu-drivers devices
51 | sudo ubuntu-drivers autoinstall
52 | sudo reboot
53 | ```
54 |
55 |
56 |
57 | NVIDIA-SMI has failed because it couldn’t communicate with the NVIDIA driver
58 |
59 | First, make sure that you have done the "Fresh install NVIDIA driver" steps. If that does not work, try the steps below.
60 |
61 | - Make sure the package nvidia-prime is installed:
62 |
63 | ```
64 | sudo apt install nvidia-prime
65 | ```
66 |
67 | Afterwards, run
68 | ```
69 | sudo prime-select nvidia
70 | ```
71 |
72 | - Make sure that NVIDIA is not in blacklist
73 |
74 | ```
75 | grep nvidia /etc/modprobe.d/* /lib/modprobe.d/*
76 | ```
77 |
78 | to find a file containing ```blacklist nvidia``` and remove it, then run
79 |
80 | ```
81 | sudo update-initramfs -u
82 | ```
83 |
84 | - If get error ```This PCI I/O region assigned to your NVIDIA device is invalid```:
85 |
86 | ```
87 | sudo nano /etc/default/grub
88 | ```
89 |
90 | edit ```GRUB_CMDLINE_LINUX_DEFAULT="quiet splash pci=realloc=off"```
91 |
92 | ```
93 | sudo update-grub
94 | sudo reboot
95 | ```
96 |
97 |
98 |
99 | Check current CUDA version
100 |
101 | ```
102 | nvcc --version
103 | ```
104 |
105 |
106 |
107 | Check current supported CUDA versions
108 |
109 | ```
110 | ls /usr/local/
111 | ```
112 |
113 |
114 |
115 | Select GPU devices
116 |
117 | ```
118 | CUDA_VISIBLE_DEVICES=<gpu_ids>
119 | CUDA_VISIBLE_DEVICES=0 python abc.py
120 | CUDA_VISIBLE_DEVICES=0 ./sample.sh
121 | CUDA_VISIBLE_DEVICES=0,1,2,3 python abc.py
122 | CUDA_VISIBLE_DEVICES=0,1,2,3 ./sample.sh
123 | ```
124 |
125 |
126 |
127 | Switch CUDA version
128 |
129 | ```
130 | CUDA_VER=11.3
131 | export PATH="/usr/local/cuda-$CUDA_VER/bin:$PATH"
132 | export LD_LIBRARY_PATH=/usr/local/cuda-$CUDA_VER/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
133 | ```
134 |
135 |
136 |
137 | Check NVENV/NVDEC status
138 |
139 | ```
140 | nvidia-smi dmon
141 | ```
142 | see the **%enc** and **%dec** columns
143 |
144 |
145 | Error with distributed training NCCL (got freezed)
146 |
147 | ```
148 | export NCCL_P2P_DISABLE="1"
149 | ```
150 |
151 |
152 |
153 | Install CMake from source
154 |
155 | ```
156 | version=3.23
157 | build=2 ## don't modify from here
158 | mkdir ~/temp
159 | cd ~/temp
160 | wget https://cmake.org/files/v$version/cmake-$version.$build.tar.gz
161 | tar -xzvf cmake-$version.$build.tar.gz
162 | cd cmake-$version.$build/
163 | ./bootstrap
164 | make -j8
165 | sudo make install
166 | ```
167 |
168 |
169 |
170 | Install MXNet from source (for AMD CPU & NVIDIA GPU)
171 |
172 | ```
173 | git clone --recursive --branch 1.9.1 https://github.com/apache/incubator-mxnet.git mxnet
174 | cd mxnet
175 | cp config/linux_gpu.cmake config.cmake
176 | rm -rf build
177 | mkdir -p build && cd build
178 | cmake -DUSE_CUDA=ON -DUSE_CUDNN=OFF -DUSE_MKL_IF_AVAILABLE=OFF -DUSE_MKLDNN=OFF -DUSE_OPENMP=OFF -DUSE_OPENCV=ON -DUSE_BLAS=open ..
179 | make -j32
180 | cd ../python
181 | pip install --user -e .
182 | ```
183 |
184 |
185 |
186 |
187 | Tensorflow could not load dynamic library 'cudart64_101.dll'
188 | In the example above, tensorflow requires CUDA 10.1; either switch to CUDA 10.1 or change to a tensorflow version compatible with your CUDA version, see: https://www.tensorflow.org/install/source#gpu
189 |
190 |
191 | ### Computer Vision
192 | Fix Deepstream (6.2+) FFMPEG OpenCV installation
193 | Fixes errors about undefined references and missing libraries such as libavcodec, libavutil, libvpx, ...
194 |
195 | ```
196 | apt-get install --reinstall --no-install-recommends -y libavcodec58 libavcodec-dev libavformat58 libavformat-dev libavutil56 libavutil-dev gstreamer1.0-libav
197 | apt install --reinstall gstreamer1.0-plugins-good
198 | apt install --reinstall libvpx6 libx264-155 libx265-179 libmpg123-0 libmpeg2-4 libmpeg2encpp-2.1-0
199 | gst-inspect-1.0 | grep 264
200 | rm ~/.cache/gstreamer-1.0/registry.x86_64.bin
201 | apt install --reinstall libx264-155
202 | apt-get install gstreamer1.0-libav
203 | apt-get install --reinstall gstreamer1.0-plugins-ugly
204 | ```
205 |
206 |
207 |
208 | Gstreamer pipeline to convert MP4-MP4 with re-encoding
209 |
210 | ```
211 | gst-launch-1.0 filesrc location="<input.mp4>" ! qtdemux ! video/x-h264 ! h264parse ! avdec_h264 ! videoconvert ! x264enc ! h264parse ! qtmux ! filesink location=<output.mp4>
212 | ```
213 |
214 |
215 |
216 | Gstreamer pipeline to convert RTSP-RTMP
217 |
218 | ```
219 | gst-launch-1.0 rtspsrc location='rtsp://<rtsp-source>' ! rtph264depay ! h264parse ! flvmux ! rtmpsink location='rtmp://<rtmp-destination>'
220 | ```
221 |
222 |
223 |
224 | Gstreamer pipeline to convert RTSP-RTMP with reducing resolution
225 |
226 | ```
227 | gst-launch-1.0 rtspsrc location='rtsp://<rtsp-source>' ! rtpbin ! rtph264depay ! h264parse ! avdec_h264 ! videoconvert ! videoscale ! video/x-raw,width=640,height=640 ! x264enc ! h264parse ! flvmux streamable=true ! rtmpsink location='rtmp://<rtmp-destination>'
228 | ```
229 |
230 |
231 |
--------------------------------------------------------------------------------
/Linux/docs/build_opencv.md:
--------------------------------------------------------------------------------
1 | # Build OpenCV from source
2 |
3 | ### 1. Install the required dependencies
4 | ```
5 | sudo apt install build-essential cmake git pkg-config libgtk-3-dev \
6 | libavcodec-dev libavformat-dev libswscale-dev libv4l-dev \
7 | libxvidcore-dev libx264-dev libjpeg-dev libpng-dev libtiff-dev \
8 | gfortran openexr libatlas-base-dev python3-dev python3-numpy \
9 | libtbb2 libtbb-dev libdc1394-22-dev
10 | sudo apt install libopenblas-dev libopenblas-base
11 | ```
12 | ### 2. Clone the OpenCV’s and OpenCV contrib repositories
13 | ```
14 | mkdir ~/opencv_build && cd ~/opencv_build
15 | git clone https://github.com/opencv/opencv.git
16 | git clone https://github.com/opencv/opencv_contrib.git
17 | cd ~/opencv_build/opencv
18 | mkdir build && cd build
19 | ```
20 | ### 3. Fix OpenBlas search Path:
21 | ```
22 | https://github.com/opencv/opencv/issues/12957
23 | ```
24 | and copy the LAPACKE headers:
25 | ```
26 | sudo cp /usr/include/lapacke*.h /usr/include/x86_64-linux-gnu/
27 | ```
28 | ### 4. Check CPU tags for optimization
29 | ```
30 | damnguyen@rnd3:~/opencv_build/opencv/build$ lscpu
31 | Architecture: x86_64
32 | CPU op-mode(s): 32-bit, 64-bit
33 | Byte Order: Little Endian
34 | CPU(s): 96
35 | On-line CPU(s) list: 0-95
36 | Thread(s) per core: 2
37 | Core(s) per socket: 24
38 | Socket(s): 2
39 | NUMA node(s): 2
40 | Vendor ID: AuthenticAMD
41 | CPU family: 23
42 | Model: 49
43 | Model name: AMD EPYC 7352 24-Core Processor
44 | Stepping: 0
45 | CPU MHz: 1495.927
46 | CPU max MHz: 2300.0000
47 | CPU min MHz: 1500.0000
48 | BogoMIPS: 4600.06
49 | Virtualization: AMD-V
50 | L1d cache: 32K
51 | L1i cache: 32K
52 | L2 cache: 512K
53 | L3 cache: 16384K
54 | NUMA node0 CPU(s): 0-23,48-71
55 | NUMA node1 CPU(s): 24-47,72-95
56 | Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate sme ssbd ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif umip rdpid overflow_recov succor smca
57 | ```
58 | This **AMD EPYC 7352 24-Core Processor** supports **avx**, **avx2**, **sse4_1**, **sse4_2**
59 | ### 5. Config
60 | ```
61 | cmake -D CMAKE_BUILD_TYPE=RELEASE \
62 | -D CMAKE_INSTALL_PREFIX=$(python3 -c "import sys; print(sys.prefix)") \
63 | -D INSTALL_C_EXAMPLES=ON \
64 | -D INSTALL_PYTHON_EXAMPLES=ON \
65 | -D OPENCV_GENERATE_PKGCONFIG=ON \
66 | -D OPENCV_EXTRA_MODULES_PATH=~/opencv_build/opencv_contrib/modules \
67 | -D WITH_CUDA=OFF \
68 | -D BUILD_NEW_PYTHON_SUPPORT=ON \
69 | -D BUILD_opencv_python3=ON \
70 | -D HAVE_opencv_python3=ON \
71 | -D OPENCV_PYTHON3_INSTALL_PATH=$(python3 -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())") \
72 | -D PYTHON_EXECUTABLE=$(which python3) \
73 | -D BUILD_EXAMPLES=ON -D WITH_FFMPEG=OFF ..
74 | ```
75 | Remember to check the CMake output for any OpenBLAS-related errors
76 | ### 6. Build
77 | ```
78 | make -j8
79 | make install
80 | ```
81 | ### 7. Verify
82 | ```
83 | pkg-config --modversion opencv4
84 | python3 -c "import cv2; print(cv2.__version__)"
85 | ```
86 |
--------------------------------------------------------------------------------