├── Deeplearning ├── ComputerVision │ ├── README.md │ └── docs │ │ ├── gated_convolution.md │ │ ├── multihead_attn.md │ │ └── resblock.md └── NLP │ └── README.md ├── Deploy ├── Deepstream │ ├── FAQ.md │ ├── README.md │ ├── sample-ALPR │ │ ├── README.md │ │ ├── config_deepstream.txt │ │ ├── config_lpd.txt │ │ ├── config_lpr.txt │ │ ├── config_tracker.txt │ │ ├── config_vehicletype.txt │ │ ├── config_yolov4.txt │ │ ├── dict.txt │ │ ├── fig │ │ │ ├── lpr_pipeline.png │ │ │ ├── lpr_result1.png │ │ │ └── lpr_result2.png │ │ ├── labels.txt │ │ ├── nvdsinfer_custom_impl_Yolo │ │ │ ├── Makefile │ │ │ └── nvdsparsebbox_Yolo.cpp │ │ └── weights │ │ │ ├── README.md │ │ │ ├── license-plate-detection │ │ │ └── labels.txt │ │ │ ├── license-plate-recognition │ │ │ └── labels.txt │ │ │ └── vehicletypenet │ │ │ └── labels.txt │ ├── sample-scrfd │ │ ├── README.md │ │ ├── config_scrfd.txt │ │ ├── nvdsinfer_custom_impl_Yolo │ │ │ ├── CMakeLists.txt │ │ │ ├── Makefile │ │ │ ├── README.md │ │ │ ├── batchedNMSCustomInference.cu │ │ │ ├── batchedNMSCustomPlugin.cpp │ │ │ ├── batchedNMSCustomPlugin.h │ │ │ ├── batchedNMSCustomPlugin.o │ │ │ ├── cmake │ │ │ │ └── set_ifndef.cmake │ │ │ ├── common │ │ │ │ ├── ErrorRecorder.h │ │ │ │ ├── bboxUtils.h │ │ │ │ ├── checkMacrosPlugin.cpp │ │ │ │ ├── checkMacrosPlugin.h │ │ │ │ ├── common.cuh │ │ │ │ ├── cub_helper.h │ │ │ │ ├── cudaDriverWrapper.cpp │ │ │ │ ├── cudaDriverWrapper.h │ │ │ │ ├── half.h │ │ │ │ ├── kernel.cpp │ │ │ │ ├── kernel.h │ │ │ │ ├── kernels │ │ │ │ │ ├── CMakeLists.txt │ │ │ │ │ ├── allClassNMS.cu │ │ │ │ │ ├── common.cu │ │ │ │ │ ├── decodeBBoxes.cu │ │ │ │ │ ├── nmsLayer.cu │ │ │ │ │ ├── permuteData.cu │ │ │ │ │ ├── reducedMathPlugin.h │ │ │ │ │ ├── sortScoresPerClass.cu │ │ │ │ │ └── sortScoresPerImage.cu │ │ │ │ ├── logger.cpp │ │ │ │ ├── logger.h │ │ │ │ ├── logging.h │ │ │ │ ├── nmsHelper.cpp │ │ │ │ ├── nmsUtils.h │ │ │ │ ├── plugin.h │ │ │ │ ├── reducedMathPlugin.cpp │ │ │ │ └── serialize.hpp │ │ │ ├── gatherNMSCustomOutputs.cu │ │ │ ├── gatherNMSCustomOutputs.h │ │ │ ├── nvdsparsebbox_Yolo.cpp │ │ │ └── nvdsparsebbox_Yolo.o │ │ ├── parser_scrfd.py │ │ └── run_scrfd.py │ └── sample-yolov4 │ │ ├── config_deepstream.txt │ │ ├── config_tracker.txt │ │ ├── config_yolov4.txt │ │ ├── exec_backends │ │ ├── __pycache__ │ │ │ └── trt_backend.cpython-36.pyc │ │ └── trt_backend.py │ │ ├── labels.txt │ │ ├── nvdsinfer_custom_impl_Yolo │ │ ├── Makefile │ │ └── nvdsparsebbox_Yolo.cpp │ │ ├── run_yolov4.py │ │ ├── test_images │ │ └── test.png │ │ ├── test_onnx.py │ │ └── tools │ │ └── add_nms_plugins.py ├── NVIDIA │ ├── README.md │ ├── docs │ │ ├── multi_instance_gpu.md │ │ └── nvidia_video_sdk.md │ └── fig │ │ ├── gpu-mig-overview.jpg │ │ ├── mig_bert.png │ │ └── support_nvenc_nvdec.png ├── README.md ├── Transfer-Learning-Toolkit │ ├── README.md │ ├── docs │ │ ├── detectnet_v2.md │ │ └── yolov4.md │ └── fig │ │ ├── detectnet_v2-inference.jpg │ │ ├── nvidia-retrain-qat.png │ │ └── yolov4-inference.png └── Triton-inference-server │ ├── README.md │ ├── docs │ ├── backend.md │ ├── install.md │ ├── model_batching.md │ ├── model_configuration.md │ ├── model_ensemble.md │ ├── model_instance.md │ ├── model_management.md │ ├── optimization_pytorch.md │ ├── perf_analyzer.md │ ├── triton_kaldi.md │ ├── triton_onnx.md │ ├── triton_pytorch.md │ └── triton_tensorrt.md │ ├── fig │ ├── multi_model_exec.png │ ├── multi_model_parallel_exec.png │ ├── multi_model_serial_exec.png │ ├── wav2vec_general_perf_onnx.jpg │ ├── wav2vec_general_perf_tensorrt.jpg │ └── 
wav2vec_general_start.jpg │ └── src │ ├── sample_grpc.py │ └── sample_load_unload.py ├── Framework ├── ONNX │ └── README.md ├── Pytorch │ ├── README.md │ └── docs │ │ └── build_from_source.md ├── TensorRT │ ├── README.md │ ├── docs │ │ └── tutorial.md │ └── fig │ │ └── sample_netron_scrfd.png └── Tensorflow │ └── README.md ├── Linux ├── README.md └── docs │ └── build_opencv.md └── README.md /Deeplearning/ComputerVision/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deeplearning/ComputerVision/README.md -------------------------------------------------------------------------------- /Deeplearning/ComputerVision/docs/gated_convolution.md: -------------------------------------------------------------------------------- 1 | ## Gated Convolution 2 | 3 | ### 1. Expland 4 | ### 2. Pytorch Implementation 5 | ``` 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | 10 | class GatedConv2dWithActivation(torch.nn.Module): 11 | """ 12 | Gated Convlution layer with activation (default activation:LeakyReLU) 13 | Params: same as conv2d 14 | Input: The feature from last layer "I" 15 | Output:\phi(f(I))*\sigmoid(g(I)) 16 | """ 17 | 18 | def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True,batch_norm=True, activation=torch.nn.LeakyReLU(0.2, inplace=True)): 19 | super(GatedConv2dWithActivation, self).__init__() 20 | self.batch_norm = batch_norm 21 | self.activation = activation 22 | self.conv2d = torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, groups, bias) 23 | self.mask_conv2d = torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, groups, bias) 24 | self.batch_norm2d = torch.nn.BatchNorm2d(out_channels) 25 | self.sigmoid = torch.nn.Sigmoid() 26 | 27 | for m in self.modules(): 28 | if isinstance(m, nn.Conv2d): 29 | nn.init.kaiming_normal_(m.weight) 30 | def gated(self, mask): 31 | return self.sigmoid(mask) 32 | def forward(self, input): 33 | x = self.conv2d(input) 34 | mask = self.mask_conv2d(input) 35 | if self.activation is not None: 36 | x = self.activation(x) * self.gated(mask) 37 | else: 38 | x = x * self.gated(mask) 39 | if self.batch_norm: 40 | return self.batch_norm2d(x) 41 | else: 42 | return x 43 | 44 | class GatedDeConv2dWithActivation(torch.nn.Module): 45 | """ 46 | Gated DeConvlution layer with activation (default activation:LeakyReLU) 47 | resize + conv 48 | Params: same as conv2d 49 | Input: The feature from last layer "I" 50 | Output:\phi(f(I))*\sigmoid(g(I)) 51 | """ 52 | def __init__(self, scale_factor, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True, batch_norm=True,activation=torch.nn.LeakyReLU(0.2, inplace=True)): 53 | super(GatedDeConv2dWithActivation, self).__init__() 54 | self.conv2d = GatedConv2dWithActivation(in_channels, out_channels, kernel_size, stride, padding, dilation, groups, bias, batch_norm, activation) 55 | self.scale_factor = scale_factor 56 | 57 | def forward(self, input): 58 | #print(input.size()) 59 | x = F.interpolate(input, scale_factor=2) 60 | return self.conv2d(x) 61 | 62 | class SNGatedConv2dWithActivation(torch.nn.Module): 63 | """ 64 | Gated Convolution with spetral normalization 65 | """ 66 | def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True, batch_norm=True, 
activation=torch.nn.LeakyReLU(0.2, inplace=True)): 67 | super(SNGatedConv2dWithActivation, self).__init__() 68 | self.conv2d = torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, groups, bias) 69 | self.mask_conv2d = torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, groups, bias) 70 | self.activation = activation 71 | self.batch_norm = batch_norm 72 | self.batch_norm2d = torch.nn.BatchNorm2d(out_channels) 73 | self.sigmoid = torch.nn.Sigmoid() 74 | self.conv2d = torch.nn.utils.spectral_norm(self.conv2d) 75 | self.mask_conv2d = torch.nn.utils.spectral_norm(self.mask_conv2d) 76 | for m in self.modules(): 77 | if isinstance(m, nn.Conv2d): 78 | nn.init.kaiming_normal_(m.weight) 79 | 80 | def gated(self, mask): 81 | return self.sigmoid(mask) 82 | 83 | def forward(self, input): 84 | x = self.conv2d(input) 85 | mask = self.mask_conv2d(input) 86 | if self.activation is not None: 87 | x = self.activation(x) * self.gated(mask) 88 | else: 89 | x = x * self.gated(mask) 90 | if self.batch_norm: 91 | return self.batch_norm2d(x) 92 | else: 93 | return x 94 | 95 | class SNGatedDeConv2dWithActivation(torch.nn.Module): 96 | """ 97 | Gated DeConvlution layer with activation (default activation:LeakyReLU) 98 | resize + conv 99 | Params: same as conv2d 100 | Input: The feature from last layer "I" 101 | Output:\phi(f(I))*\sigmoid(g(I)) 102 | """ 103 | def __init__(self, scale_factor, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True, batch_norm=True, activation=torch.nn.LeakyReLU(0.2, inplace=True)): 104 | super(SNGatedDeConv2dWithActivation, self).__init__() 105 | self.conv2d = SNGatedConv2dWithActivation(in_channels, out_channels, kernel_size, stride, padding, dilation, groups, bias, batch_norm, activation) 106 | self.scale_factor = scale_factor 107 | 108 | def forward(self, input): 109 | #print(input.size()) 110 | x = F.interpolate(input, scale_factor=2) 111 | return self.conv2d(x) 112 | ``` -------------------------------------------------------------------------------- /Deeplearning/ComputerVision/docs/multihead_attn.md: -------------------------------------------------------------------------------- 1 | ## Multi-head Attention Block 2 | 3 | ### 1. Expland 4 | ### 2. Pytorch Implementation 5 | ``` 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | 10 | def Normalize(in_channels): 11 | return torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True) 12 | 13 | class MultiHeadAttnBlock(nn.Module): 14 | def __init__(self, in_channels, head_size=1): 15 | super().__init__() 16 | self.in_channels = in_channels 17 | self.head_size = head_size 18 | self.att_size = in_channels // head_size 19 | assert(in_channels % head_size == 0), 'The size of head should be divided by the number of channels.' 
20 | 21 | self.norm1 = Normalize(in_channels) 22 | self.norm2 = Normalize(in_channels) 23 | 24 | self.q = torch.nn.Conv2d(in_channels, 25 | in_channels, 26 | kernel_size=1, 27 | stride=1, 28 | padding=0) 29 | self.k = torch.nn.Conv2d(in_channels, 30 | in_channels, 31 | kernel_size=1, 32 | stride=1, 33 | padding=0) 34 | self.v = torch.nn.Conv2d(in_channels, 35 | in_channels, 36 | kernel_size=1, 37 | stride=1, 38 | padding=0) 39 | self.proj_out = torch.nn.Conv2d(in_channels, 40 | in_channels, 41 | kernel_size=1, 42 | stride=1, 43 | padding=0) 44 | self.num = 0 45 | 46 | def forward(self, x, y=None): 47 | h_ = x 48 | h_ = self.norm1(h_) 49 | if y is None: 50 | y = h_ 51 | else: 52 | y = self.norm2(y) 53 | 54 | q = self.q(y) 55 | k = self.k(h_) 56 | v = self.v(h_) 57 | 58 | # compute attention 59 | b,c,h,w = q.shape 60 | q = q.reshape(b, self.head_size, self.att_size ,h*w) 61 | q = q.permute(0, 3, 1, 2) # b, hw, head, att 62 | 63 | k = k.reshape(b, self.head_size, self.att_size ,h*w) 64 | k = k.permute(0, 3, 1, 2) 65 | 66 | v = v.reshape(b, self.head_size, self.att_size ,h*w) 67 | v = v.permute(0, 3, 1, 2) 68 | 69 | 70 | q = q.transpose(1, 2) 71 | v = v.transpose(1, 2) 72 | k = k.transpose(1, 2).transpose(2,3) 73 | 74 | scale = int(self.att_size)**(-0.5) 75 | q.mul_(scale) 76 | w_ = torch.matmul(q, k) 77 | w_ = F.softmax(w_, dim=3) 78 | 79 | w_ = w_.matmul(v) 80 | 81 | w_ = w_.transpose(1, 2).contiguous() # [b, h*w, head, att] 82 | w_ = w_.view(b, h, w, -1) 83 | w_ = w_.permute(0, 3, 1, 2) 84 | 85 | w_ = self.proj_out(w_) 86 | 87 | return x+w_ 88 | ``` -------------------------------------------------------------------------------- /Deeplearning/ComputerVision/docs/resblock.md: -------------------------------------------------------------------------------- 1 | ## Resblock 2 | 3 | ### 1. Expland 4 | ### 2. 
Pytorch Implementation 5 | ``` 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | 10 | def Normalize(in_channels): 11 | return torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True) 12 | 13 | class ResnetBlock(nn.Module): 14 | def __init__(self, *, in_channels, out_channels=None, conv_shortcut=False, 15 | dropout, temb_channels=512): 16 | super().__init__() 17 | self.nonlinearity = torch.nn.LeakyReLU(0.2) 18 | self.in_channels = in_channels 19 | out_channels = in_channels if out_channels is None else out_channels 20 | self.out_channels = out_channels 21 | self.use_conv_shortcut = conv_shortcut 22 | 23 | self.norm1 = Normalize(in_channels) 24 | self.conv1 = torch.nn.Conv2d(in_channels, 25 | out_channels, 26 | kernel_size=3, 27 | stride=1, 28 | padding=1) 29 | if temb_channels > 0: 30 | self.temb_proj = torch.nn.Linear(temb_channels, 31 | out_channels) 32 | self.norm2 = Normalize(out_channels) 33 | self.dropout = torch.nn.Dropout(dropout) 34 | self.conv2 = torch.nn.Conv2d(out_channels, 35 | out_channels, 36 | kernel_size=3, 37 | stride=1, 38 | padding=1) 39 | if self.in_channels != self.out_channels: 40 | if self.use_conv_shortcut: 41 | self.conv_shortcut = torch.nn.Conv2d(in_channels, 42 | out_channels, 43 | kernel_size=3, 44 | stride=1, 45 | padding=1) 46 | else: 47 | self.nin_shortcut = torch.nn.Conv2d(in_channels, 48 | out_channels, 49 | kernel_size=1, 50 | stride=1, 51 | padding=0) 52 | 53 | 54 | def forward(self, x, temb): 55 | h = x 56 | h = self.norm1(h) 57 | h = self.nonlinearity(h) 58 | h = self.conv1(h) 59 | 60 | if temb is not None: 61 | h = h + self.temb_proj(nonlinearity(temb))[:,:,None,None] 62 | 63 | h = self.norm2(h) 64 | h = self.nonlinearity(h) 65 | h = self.dropout(h) 66 | h = self.conv2(h) 67 | 68 | if self.in_channels != self.out_channels: 69 | if self.use_conv_shortcut: 70 | x = self.conv_shortcut(x) 71 | else: 72 | x = self.nin_shortcut(x) 73 | 74 | return x+h 75 | ``` -------------------------------------------------------------------------------- /Deeplearning/NLP/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deeplearning/NLP/README.md -------------------------------------------------------------------------------- /Deploy/Deepstream/FAQ.md: -------------------------------------------------------------------------------- 1 | ## FAQ about Deepstream 2 | 3 | -------------------------------------------------------------------------------- /Deploy/Deepstream/README.md: -------------------------------------------------------------------------------- 1 | ## 1. Requirement 2 | ``` 3 | sudo apt install libgirepository1.0-dev libgstreamer1.0-dev 4 | ``` 5 | ``` 6 | sudo apt install \ 7 | libssl1.0.0 \ 8 | libgstreamer1.0-0 \ 9 | gstreamer1.0-tools \ 10 | gstreamer1.0-plugins-good \ 11 | gstreamer1.0-plugins-bad \ 12 | gstreamer1.0-plugins-ugly \ 13 | gstreamer1.0-libav \ 14 | libgstrtspserver-1.0-0 \ 15 | libjansson4=2.11-1 16 | ``` 17 | ## 2. Examples 18 | - [Sample Yolov4](sample-yolov4) 19 | - [Sample ALPR](sample-ALPR) 20 | - [Sample SCRFD Face Detection](sample-scrfd) 21 | -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-ALPR/README.md: -------------------------------------------------------------------------------- 1 | # Deepstream ALPR 2 | 3 |

![LPR pipeline](fig/lpr_pipeline.png)

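The pipeline chains a COCO YOLOv4 primary detector with an NvDCF tracker and three secondary models (vehicle type classification, license plate detection and license plate recognition), as wired together in `config_deepstream.txt`.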
6 | 7 | ## 1. Requirement 8 | - Deepstream 6.0 9 | 10 | ## 2. Run demo 11 | ``` 12 | cd nvdsinfer_custom_impl_Yolo 13 | make 14 | cd .. 15 | deepstream-app -c config_deepstream.txt 16 | ``` 17 |
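Note: the plugin `Makefile` exits with an error when `CUDA_VER` is not set, so pass a value matching the CUDA toolkit installed on your machine when calling `make`; the version below is only an example.
```
# example only: replace 11.4 with your installed CUDA toolkit version
CUDA_VER=11.4 make
```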

![LPR result](fig/lpr_result1.png)

![LPR result](fig/lpr_result2.png)

 21 | 22 | ## 3. Models 23 | ### 3.1 Object detection 24 | - Uses the Darknet COCO YOLOv4 608x608 model 25 | - Converted to ONNX 26 | - [NMS plugin added](../sample-yolov4/tools/add_nms_plugins.py) 27 | - Customized parser: **NvDsInferParseCustomYoloV4** 28 | 29 | ### 3.2 Vehicle Type Net 30 | - Uses the ResNet18 classification model from NVIDIA TAO 31 | - Trained, pruned and INT8-quantized 32 | 33 | ### 3.3 License Plate Detection 34 | - Uses the YOLOv4 model from NVIDIA TAO 35 | - Trained, pruned and INT8-quantized 36 | - Customized parser: **NvDsInferParseCustomYoloV4TLT** 37 | 38 | ### 3.4 License Plate Recognition 39 | - Uses the YOLOv4 model from NVIDIA TAO 40 | - Trained, pruned and INT8-quantized 41 | - Customized parser: **NvDsInferParseCustomYoloV4LPR** (sorts the detected characters and assigns the result to **attributeLabel**) 42 | 43 | ## References 44 | - https://github.com/NVIDIA-AI-IOT/deepstream_lpr_app -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-ALPR/config_deepstream.txt: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License.
16 | # 17 | ################################################################################ 18 | 19 | [application] 20 | enable-perf-measurement=1 21 | perf-measurement-interval-sec=3 22 | #gie-kitti-output-dir=streamscl 23 | 24 | [tiled-display] 25 | enable=1 26 | rows=1 27 | columns=0 28 | width=1280 29 | height=720 30 | gpu-id=0 31 | #(0): nvbuf-mem-default - Default memory allocated, specific to particular platform 32 | #(1): nvbuf-mem-cuda-pinned - Allocate Pinned/Host cuda memory, applicable for Tesla 33 | #(2): nvbuf-mem-cuda-device - Allocate Device cuda memory, applicable for Tesla 34 | #(3): nvbuf-mem-cuda-unified - Allocate Unified cuda memory, applicable for Tesla 35 | #(4): nvbuf-mem-surface-array - Allocate Surface Array memory, applicable for Jetson 36 | nvbuf-memory-type=0 37 | 38 | [source0] 39 | enable=1 40 | #Type - 1=CameraV4L2 2=URI 3=MultiURI 41 | type=3 42 | uri=file:///home/damnguyen/Deploy/deepstream/videos/video_%d.mp4 43 | num-sources=1 44 | gpu-id=0 45 | #drop-frame-interval=2 46 | # (0): memtype_device - Memory type Device 47 | # (1): memtype_pinned - Memory type Host Pinned 48 | # (2): memtype_unified - Memory type Unified 49 | cudadec-memtype=0 50 | 51 | [sink0] 52 | enable=0 53 | #Type - 1=FakeSink 2=EglSink 3=File 54 | type=1 55 | sync=0 56 | source-id=0 57 | gpu-id=0 58 | qos=0 59 | nvbuf-memory-type=0 60 | overlay-id=1 61 | 62 | [sink1] 63 | enable=1 64 | type=3 65 | enc-type=1 66 | #1=mp4 2=mkv 67 | container=1 68 | #1=h264 2=h265 69 | codec=1 70 | sync=0 71 | #iframeinterval=10 72 | bitrate=2000000 73 | output-file=out1.mp4 74 | 75 | 76 | [osd] 77 | enable=1 78 | gpu-id=0 79 | border-width=1 80 | text-size=15 81 | text-color=1;1;1;1; 82 | text-bg-color=0.3;0.3;0.3;1 83 | font=Serif 84 | show-clock=0 85 | clock-x-offset=800 86 | clock-y-offset=820 87 | clock-text-size=12 88 | clock-color=1;0;0;0 89 | nvbuf-memory-type=0 90 | 91 | [streammux] 92 | gpu-id=0 93 | ##Boolean property to inform muxer that sources are live 94 | live-source=0 95 | batch-size=1 96 | ##time out in usec, to wait after the first buffer is available 97 | ##to push the batch even if the complete batch is not formed 98 | batched-push-timeout=40000 99 | ## Set muxer output width and height 100 | width=1920 101 | height=1080 102 | ##Enable to maintain aspect ratio wrt source, and allow black borders, works 103 | ##along with width, height properties 104 | enable-padding=0 105 | nvbuf-memory-type=0 106 | 107 | 108 | [primary-gie] 109 | enable=1 110 | gpu-id=0 111 | labelfile-path=labels.txt 112 | #Required by the app for OSD, not a plugin property 113 | bbox-border-color0=1;0;0;1 114 | bbox-border-color1=0;1;1;1 115 | bbox-border-color2=0;0;1;1 116 | bbox-border-color3=0;1;0;1 117 | gie-unique-id=1 118 | nvbuf-memory-type=0 119 | config-file=config_yolov4.txt 120 | 121 | [tracker] 122 | enable=1 123 | tracker-width=608 124 | tracker-height=608 125 | ll-lib-file=/opt/nvidia/deepstream/deepstream/lib/libnvds_nvmultiobjecttracker.so 126 | ll-config-file=/opt/nvidia/deepstream/deepstream/samples/configs/deepstream-app/config_tracker_NvDCF_max_perf.yml 127 | enable-batch-process=1 128 | display-tracking-id=1 129 | enable-past-frame=1 130 | 131 | [secondary-gie0] 132 | enable=1 133 | gpu-id=0 134 | gie-unique-id=2 135 | operate-on-gie-id=1 136 | operate-on-class-ids=2;5;7 137 | config-file=config_vehicletype.txt 138 | 139 | [secondary-gie1] 140 | enable=1 141 | gpu-id=0 142 | gie-unique-id=3 143 | operate-on-gie-id=1 144 | operate-on-class-ids=2;3;5;7 145 | config-file=config_lpd.txt 146 | 147 
| [secondary-gie2] 148 | enable=1 149 | gpu-id=0 150 | gie-unique-id=4 151 | operate-on-gie-id=3 152 | operate-on-class-ids=0 153 | config-file=config_lpr.txt -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-ALPR/config_lpd.txt: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a 5 | # copy of this software and associated documentation files (the "Software"), 6 | # to deal in the Software without restriction, including without limitation 7 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | # and/or sell copies of the Software, and to permit persons to whom the 9 | # Software is furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in 12 | # all copies or substantial portions of the Software. 13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | # DEALINGS IN THE SOFTWARE. 21 | ################################################################################ 22 | 23 | [property] 24 | gpu-id=0 25 | net-scale-factor=1 26 | offsets=103.939;116.779;123.68 27 | tlt-model-key=license-plate-yolov4 28 | tlt-encoded-model=weights/license-plate-detection/yolov4_resnet18_epoch_050-fp32.etlt 29 | labelfile-path=weights/license-plate-detection/labels.txt 30 | int8-calib-file=weights/license-plate-detection/cal.bin 31 | model-engine-file=weights/license-plate-detection/yolov4_resnet18_epoch_050-fp32.etlt_b4_gpu0_fp32.engine 32 | infer-dims=3;320;320 33 | uff-input-blob-name=Input 34 | batch-size=4 35 | process-mode=2 36 | model-color-format=0 37 | ## 0=FP32, 1=INT8, 2=FP16 mode 38 | network-mode=0 39 | #0 detector 1 classifier 2 segmentatio 3 instance segmentation 40 | network-type=0 41 | num-detected-classes=1 42 | interval=0 43 | gie-unique-id=5 44 | operate-on-class-ids=2;3;5;7 45 | operate-on-gie-id=1 46 | output-blob-names=BatchedNMS 47 | parse-bbox-func-name=NvDsInferParseCustomYoloV4TLT 48 | custom-lib-path=nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so 49 | input-object-min-width=64 50 | input-object-min-height=64 -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-ALPR/config_lpr.txt: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a 5 | # copy of this software and associated documentation files (the "Software"), 6 | # to deal in the Software without restriction, including without limitation 7 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | # and/or sell copies of the Software, and to permit persons to whom the 9 | # Software is furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in 12 | # all copies or substantial portions of the Software. 13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | # DEALINGS IN THE SOFTWARE. 21 | ################################################################################ 22 | 23 | [property] 24 | gpu-id=0 25 | net-scale-factor=1 26 | offsets=103.939;116.779;123.68 27 | tlt-model-key=license-plate-recognition 28 | tlt-encoded-model=weights/license-plate-recognition/yolov4_resnet18-pruned-retrain-int8.etlt 29 | labelfile-path=weights/license-plate-recognition/labels.txt 30 | int8-calib-file=weights/license-plate-recognition/yolov4_resnet18-pruned-retrain.bin 31 | model-engine-file=weights/license-plate-recognition/yolov4_resnet18-pruned-retrain-int8.etlt_b4_gpu0_int8.engine 32 | infer-dims=3;224;224 33 | uff-input-blob-name=Input 34 | batch-size=4 35 | process-mode=2 36 | model-color-format=0 37 | ## 0=FP32, 1=INT8, 2=FP16 mode 38 | network-mode=1 39 | #0 detector 1 classifier 2 segmentatio 3 instance segmentation 40 | network-type=1 41 | interval=0 42 | gie-unique-id=5 43 | operate-on-class-ids=0 44 | operate-on-gie-id=1 45 | output-blob-names=BatchedNMS 46 | classifier-threshold=0.7 47 | classifier-async-mode=0 48 | #parse-bbox-func-name=NvDsInferParseCustomYoloV4LPR 49 | parse-classifier-func-name=NvDsInferParseCustomYoloV4LPR 50 | custom-lib-path=nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so 51 | input-object-min-width=16 52 | input-object-min-height=16 -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-ALPR/config_tracker.txt: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # SPDX-FileCopyrightText: Copyright (c) 2019-2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | ################################################################################ 17 | 18 | # Mandatory properties for the tracker: 19 | # tracker-width 20 | # tracker-height: needs to be multiple of 6 for NvDCF 21 | # gpu-id 22 | # ll-lib-file: path to low-level tracker lib 23 | # ll-config-file: required for NvDCF, optional for KLT and IOU 24 | # 25 | [tracker] 26 | tracker-width=608 27 | tracker-height=608 28 | gpu-id=0 29 | ll-lib-file=/opt/nvidia/deepstream/deepstream/lib/libnvds_nvmultiobjecttracker.so 30 | ll-config-file=config_tracker_NvDCF_perf.yml 31 | #enable-past-frame=1 32 | enable-batch-process=1 -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-ALPR/config_vehicletype.txt: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a 5 | # copy of this software and associated documentation files (the "Software"), 6 | # to deal in the Software without restriction, including without limitation 7 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | # and/or sell copies of the Software, and to permit persons to whom the 9 | # Software is furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in 12 | # all copies or substantial portions of the Software. 13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | # DEALINGS IN THE SOFTWARE. 21 | ################################################################################ 22 | 23 | [property] 24 | gpu-id=0 25 | net-scale-factor=1 26 | offsets=124;117;104 27 | tlt-model-key=vehicle-type-net 28 | tlt-encoded-model=weights/vehicletypenet/vehicle-type-net-r18-pruned-retrain-int8.etlt 29 | labelfile-path=weights/vehicletypenet/labels.txt 30 | int8-calib-file=weights/vehicletypenet/vehicle-type-net-r18-pruned-retrain.bin 31 | model-engine-file=weights/vehicletypenet/vehicle-type-net-r18-pruned-retrain-int8.etlt_b4_gpu0_int8.engine 32 | input-dims=3;224;224;0 33 | uff-input-blob-name=input_1 34 | batch-size=4 35 | process-mode=2 36 | model-color-format=0 37 | ## 0=FP32, 1=INT8, 2=FP16 mode 38 | network-mode=1 39 | #0 detector 1 classifier 2 segmentatio 3 instance segmentation 40 | network-type=1 41 | interval=0 42 | gie-unique-id=4 43 | operate-on-class-ids=2;5;7 44 | operate-on-gie-id=1 45 | output-blob-names=predictions/Softmax 46 | classifier-threshold=0.2 47 | input-object-min-width=64 48 | input-object-min-height=64 -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-ALPR/config_yolov4.txt: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | ################################################################################ 18 | 19 | # Following properties are mandatory when engine files are not specified: 20 | # int8-calib-file(Only in INT8), model-file-format 21 | # Caffemodel mandatory properties: model-file, proto-file, output-blob-names 22 | # UFF: uff-file, input-dims, uff-input-blob-name, output-blob-names 23 | # ONNX: onnx-file 24 | # 25 | # Mandatory properties for detectors: 26 | # num-detected-classes 27 | # 28 | # Optional properties for detectors: 29 | # cluster-mode(Default=Group Rectangles), interval(Primary mode only, Default=0) 30 | # custom-lib-path 31 | # parse-bbox-func-name 32 | # 33 | # Mandatory properties for classifiers: 34 | # classifier-threshold, is-classifier 35 | # 36 | # Optional properties for classifiers: 37 | # classifier-async-mode(Secondary mode only, Default=false) 38 | # 39 | # Optional properties in secondary mode: 40 | # operate-on-gie-id(Default=0), operate-on-class-ids(Defaults to all classes), 41 | # input-object-min-width, input-object-min-height, input-object-max-width, 42 | # input-object-max-height 43 | # 44 | # Following properties are always recommended: 45 | # batch-size(Default=1) 46 | # 47 | # Other optional properties: 48 | # net-scale-factor(Default=1), network-mode(Default=0 i.e FP32), 49 | # model-color-format(Default=0 i.e. RGB) model-engine-file, labelfile-path, 50 | # mean-file, gie-unique-id(Default=0), offsets, process-mode (Default=1 i.e. primary), 51 | # custom-lib-path, network-mode(Default=0 i.e FP32) 52 | # 53 | # The values in the config file are overridden by values set through GObject 54 | # properties. 
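# Added note (not part of the original NVIDIA header): net-scale-factor below is 1/255
# (0.0039215697906911373), i.e. input pixel values are rescaled from [0, 255] to [0, 1] before inference.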
55 | 56 | [property] 57 | gpu-id=0 58 | net-scale-factor=0.0039215697906911373 59 | # Skip frame 60 | interval=0 61 | #0=RGB, 1=BGR 62 | model-color-format=0 63 | input-dims=3;608;608;0 64 | onnx-file=weights/yolov4_-1_3_608_608_dynamic.nms.onnx 65 | model-engine-file=weights/yolov4_-1_3_608_608_dynamic.nms.onnx_b4_gpu0_fp32.engine 66 | labelfile-path=labels.txt 67 | batch-size=4 68 | ## 0=FP32, 1=INT8, 2=FP16 mode 69 | network-mode=0 70 | num-detected-classes=80 71 | gie-unique-id=1 72 | network-type=0 73 | is-classifier=0 74 | ## 0=Group Rectangles, 1=DBSCAN, 2=NMS, 3= DBSCAN+NMS Hybrid, 4 = None(No clustering) 75 | cluster-mode=2 76 | maintain-aspect-ratio=1 77 | custom-lib-path=nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so 78 | parse-bbox-func-name=NvDsInferParseCustomYoloV4 79 | #scaling-filter=0 80 | #scaling-compute-hw=0 81 | 82 | [class-attrs-all] 83 | nms-iou-threshold=0.6 84 | pre-cluster-threshold=0.4 85 | -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-ALPR/dict.txt: -------------------------------------------------------------------------------- 1 | A 2 | B 3 | C 4 | D 5 | E 6 | 8 7 | F 8 | 5 9 | 4 10 | G 11 | H 12 | I 13 | J 14 | K 15 | L 16 | M 17 | N 18 | 9 19 | 1 20 | P 21 | Q 22 | R 23 | S 24 | 7 25 | 6 26 | T 27 | 3 28 | 2 29 | U 30 | V 31 | W 32 | X 33 | Y 34 | Z 35 | 0 -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-ALPR/fig/lpr_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deploy/Deepstream/sample-ALPR/fig/lpr_pipeline.png -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-ALPR/fig/lpr_result1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deploy/Deepstream/sample-ALPR/fig/lpr_result1.png -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-ALPR/fig/lpr_result2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deploy/Deepstream/sample-ALPR/fig/lpr_result2.png -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-ALPR/labels.txt: -------------------------------------------------------------------------------- 1 | person 2 | bicycle 3 | car 4 | motorbike 5 | aeroplane 6 | bus 7 | train 8 | truck 9 | boat 10 | traffic light 11 | fire hydrant 12 | stop sign 13 | parking meter 14 | bench 15 | bird 16 | cat 17 | dog 18 | horse 19 | sheep 20 | cow 21 | elephant 22 | bear 23 | zebra 24 | giraffe 25 | backpack 26 | umbrella 27 | handbag 28 | tie 29 | suitcase 30 | frisbee 31 | skis 32 | snowboard 33 | sports ball 34 | kite 35 | baseball bat 36 | baseball glove 37 | skateboard 38 | surfboard 39 | tennis racket 40 | bottle 41 | wine glass 42 | cup 43 | fork 44 | knife 45 | spoon 46 | bowl 47 | banana 48 | apple 49 | sandwich 50 | orange 51 | broccoli 52 | carrot 53 | hot dog 54 | pizza 55 | donut 56 | cake 57 | chair 58 | sofa 59 | pottedplant 60 | bed 61 | diningtable 62 | toilet 63 | tvmonitor 64 | laptop 65 | mouse 66 | remote 67 | keyboard 68 | cell phone 69 | microwave 70 | 
oven 71 | toaster 72 | sink 73 | refrigerator 74 | book 75 | clock 76 | vase 77 | scissors 78 | teddy bear 79 | hair drier 80 | toothbrush -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-ALPR/nvdsinfer_custom_impl_Yolo/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | CUDA_VER?= 18 | ifeq ($(CUDA_VER),) 19 | $(error "CUDA_VER is not set") 20 | endif 21 | CC:= g++ 22 | NVCC:=/usr/local/cuda-$(CUDA_VER)/bin/nvcc 23 | 24 | CFLAGS:= -Wall -std=c++11 -shared -fPIC -Wno-error=deprecated-declarations 25 | CFLAGS+= -I../../includes -I/usr/local/cuda-$(CUDA_VER)/include -I/opt/nvidia/deepstream/deepstream/sources/includes 26 | 27 | LIBS:= -lnvinfer_plugin -lnvinfer -lnvparsers -L/usr/local/cuda-$(CUDA_VER)/lib64 -lcudart -lcublas -lstdc++fs 28 | LFLAGS:= -shared -Wl,--start-group $(LIBS) -Wl,--end-group 29 | 30 | INCS:= $(wildcard *.h) 31 | SRCFILES:= nvdsparsebbox_Yolo.cpp 32 | 33 | TARGET_LIB:= libnvdsinfer_custom_impl_Yolo.so 34 | 35 | TARGET_OBJS:= $(SRCFILES:.cpp=.o) 36 | TARGET_OBJS:= $(TARGET_OBJS:.cu=.o) 37 | 38 | all: $(TARGET_LIB) 39 | 40 | %.o: %.cpp $(INCS) Makefile 41 | $(CC) -c -o $@ $(CFLAGS) $< 42 | 43 | %.o: %.cu $(INCS) Makefile 44 | $(NVCC) -c -o $@ --compiler-options '-fPIC' $< 45 | 46 | $(TARGET_LIB) : $(TARGET_OBJS) 47 | $(CC) -o $@ $(TARGET_OBJS) $(LFLAGS) 48 | 49 | clean: 50 | rm -rf $(TARGET_LIB) 51 | -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-ALPR/weights/README.md: -------------------------------------------------------------------------------- 1 | # To do -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-ALPR/weights/license-plate-detection/labels.txt: -------------------------------------------------------------------------------- 1 | license_plate -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-ALPR/weights/license-plate-recognition/labels.txt: -------------------------------------------------------------------------------- 1 | A 2 | B 3 | C 4 | D 5 | E 6 | 8 7 | F 8 | 5 9 | 4 10 | G 11 | H 12 | I 13 | J 14 | K 15 | L 16 | M 17 | N 18 | 9 19 | 1 20 | P 21 | Q 22 | R 23 | S 24 | 7 25 | 6 26 | T 27 | 3 28 | 2 29 | U 30 | V 31 | W 32 | X 33 | Y 34 | Z 35 | 0 -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-ALPR/weights/vehicletypenet/labels.txt: -------------------------------------------------------------------------------- 1 | hatchback;bus;pickup;sedan;suv;truck;van -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-scrfd/README.md: 
-------------------------------------------------------------------------------- 1 | ## Build custom plugins 2 | 3 | ``` 4 | cd nvdsinfer_custom_impl_Yolo 5 | mkdir build && cd build 6 | cmake .. 7 | make -j8 8 | ``` 9 | 10 | ## Run deepstream-python 11 | ``` 12 | LD_PRELOAD= python3 run_scrfd.py file:/ 13 | ``` 14 | -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-scrfd/config_scrfd.txt: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | ################################################################################ 18 | 19 | # Following properties are mandatory when engine files are not specified: 20 | # int8-calib-file(Only in INT8), model-file-format 21 | # Caffemodel mandatory properties: model-file, proto-file, output-blob-names 22 | # UFF: uff-file, input-dims, uff-input-blob-name, output-blob-names 23 | # ONNX: onnx-file 24 | # 25 | # Mandatory properties for detectors: 26 | # num-detected-classes 27 | # 28 | # Optional properties for detectors: 29 | # cluster-mode(Default=Group Rectangles), interval(Primary mode only, Default=0) 30 | # custom-lib-path 31 | # parse-bbox-func-name 32 | # 33 | # Mandatory properties for classifiers: 34 | # classifier-threshold, is-classifier 35 | # 36 | # Optional properties for classifiers: 37 | # classifier-async-mode(Secondary mode only, Default=false) 38 | # 39 | # Optional properties in secondary mode: 40 | # operate-on-gie-id(Default=0), operate-on-class-ids(Defaults to all classes), 41 | # input-object-min-width, input-object-min-height, input-object-max-width, 42 | # input-object-max-height 43 | # 44 | # Following properties are always recommended: 45 | # batch-size(Default=1) 46 | # 47 | # Other optional properties: 48 | # net-scale-factor(Default=1), network-mode(Default=0 i.e FP32), 49 | # model-color-format(Default=0 i.e. RGB) model-engine-file, labelfile-path, 50 | # mean-file, gie-unique-id(Default=0), offsets, process-mode (Default=1 i.e. primary), 51 | # custom-lib-path, network-mode(Default=0 i.e FP32) 52 | # 53 | # The values in the config file are overridden by values set through GObject 54 | # properties. 
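# Added note (not part of the original NVIDIA header): this config sets network-type=100 with
# output-tensor-meta=1 and leaves parse-bbox-func-name commented out, so raw output tensors are
# attached to the frame metadata and decoding is presumably done in Python (see parser_scrfd.py / run_scrfd.py).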
55 | 56 | [property] 57 | gpu-id=0 58 | net-scale-factor=0.0039215697906911373 59 | # Skip frame 60 | interval=0 61 | #0=RGB, 1=BGR 62 | model-color-format=0 63 | input-dims=3;640;640;0 64 | onnx-file=weights/face-detection/scrfd-nms-full.nms.onnx 65 | model-engine-file=weights/face-detection/scrfd-nms-full.nms.onnx_b4_gpu0_fp32.engine 66 | labelfile-path=weights/face-detection/labels.txt 67 | batch-size=4 68 | ## 0=FP32, 1=INT8, 2=FP16 mode 69 | network-mode=0 70 | num-detected-classes=2 71 | gie-unique-id=1 72 | network-type=100 73 | ## 0=Group Rectangles, 1=DBSCAN, 2=NMS, 3= DBSCAN+NMS Hybrid, 4 = None(No clustering) 74 | cluster-mode=4 75 | maintain-aspect-ratio=0 76 | custom-lib-path=nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so 77 | #parse-bbox-func-name=NvDsInferParseCustomFaceDetection 78 | #scaling-filter=0 79 | #scaling-compute-hw=0 80 | output-tensor-meta=1 81 | #[class-attrs-all] 82 | #nms-iou-threshold=0.6 83 | #pre-cluster-threshold=0.4 84 | input-object-min-width=0 85 | input-object-min-height=0 86 | process-mode=1 -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.13 FATAL_ERROR) 2 | include(cmake/set_ifndef.cmake) 3 | 4 | project(TensorRT 5 | LANGUAGES CXX CUDA 6 | VERSION 8.2 7 | DESCRIPTION "TensorRT is a C++ library that facilitates high performance inference on NVIDIA GPUs and deep learning accelerators." 8 | HOMEPAGE_URL "https://github.com/NVIDIA/TensorRT") 9 | 10 | # C++14 11 | set(CMAKE_CXX_STANDARD 14) 12 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 13 | set(CMAKE_CXX_EXTENSIONS OFF) 14 | set(CMAKE_CXX_FLAGS "-Wno-deprecated-declarations ${CMAKE_CXX_FLAGS} -DBUILD_SYSTEM=cmake_oss") 15 | 16 | find_package(Threads REQUIRED) 17 | 18 | ## find_package(CUDA) is broken for cross-compilation. Enable CUDA language instead. 19 | if(NOT DEFINED CMAKE_TOOLCHAIN_FILE) 20 | find_package(CUDA ${CUDA_VERSION} REQUIRED) 21 | endif() 22 | 23 | include_directories( 24 | ${CUDA_INCLUDE_DIRS} 25 | ${CUDNN_ROOT_DIR}/include 26 | ) 27 | find_library(CUDNN_LIB cudnn HINTS 28 | ${CUDA_TOOLKIT_ROOT_DIR} ${CUDNN_ROOT_DIR} PATH_SUFFIXES lib64 lib) 29 | find_library(CUBLAS_LIB cublas HINTS 30 | ${CUDA_TOOLKIT_ROOT_DIR} PATH_SUFFIXES lib64 lib lib/stubs) 31 | find_library(CUBLASLT_LIB cublasLt HINTS 32 | ${CUDA_TOOLKIT_ROOT_DIR} PATH_SUFFIXES lib64 lib lib/stubs) 33 | find_library(CUDART_LIB cudart HINTS ${CUDA_TOOLKIT_ROOT_DIR} PATH_SUFFIXES lib lib64) 34 | find_library(RT_LIB rt) 35 | set(CUDA_LIBRARIES ${CUDART_LIB}) 36 | 37 | 38 | message(STATUS "CUBLAS_LIB: ${CUBLAS_LIB}") 39 | message(STATUS "CUBLASLT_LIB: ${CUBLASLT_LIB}") 40 | message(STATUS "CUDART_LIB: ${CUDART_LIB}") 41 | message(STATUS "CUDNN_LIB: ${CUDNN_LIB}") 42 | 43 | file(GLOB SRCS *.cpp) 44 | set(PLUGIN_SOURCES ${PLUGIN_SOURCES} ${SRCS}) 45 | file(GLOB CU_SRCS *.cu) 46 | set(PLUGIN_CU_SOURCES ${PLUGIN_CU_SOURCES} ${CU_SRCS}) 47 | file(GLOB COMMON_SRCS common/*.cpp) 48 | set(COMMON_SOURCES ${COMMON_SOURCES} ${COMMON_SRCS}) 49 | file(GLOB COMMON_CU_SRCS common/kernels/*.cu) 50 | set(COMMON_CU_SOURCES ${COMMON_CU_SOURCES} ${COMMON_CU_SRCS}) 51 | 52 | # Generate Gencode 53 | if (DEFINED GPU_ARCHS) 54 | message(STATUS "GPU_ARCHS defined as ${GPU_ARCHS}. 
Generating CUDA code for SM ${GPU_ARCHS}") 55 | separate_arguments(GPU_ARCHS) 56 | else() 57 | list(APPEND GPU_ARCHS 58 | 53 59 | 60 60 | 61 61 | 70 62 | 75 63 | ) 64 | 65 | string(REGEX MATCH "aarch64" IS_ARM "${TRT_PLATFORM_ID}") 66 | if (IS_ARM) 67 | # Xavier (SM72) only supported for aarch64. 68 | list(APPEND GPU_ARCHS 72) 69 | endif() 70 | 71 | if (CUDA_VERSION VERSION_GREATER_EQUAL 11.0) 72 | # Ampere GPU (SM80) support is only available in CUDA versions > 11.0 73 | list(APPEND GPU_ARCHS 80) 74 | endif() 75 | if (CUDA_VERSION VERSION_GREATER_EQUAL 11.1) 76 | list(APPEND GPU_ARCHS 86) 77 | endif() 78 | 79 | message(STATUS "GPU_ARCHS is not defined. Generating CUDA code for default SMs: ${GPU_ARCHS}") 80 | endif() 81 | foreach(arch ${GPU_ARCHS}) 82 | set(GENCODES "${GENCODES} -gencode arch=compute_${arch},code=sm_${arch}") 83 | endforeach() 84 | # Generate PTX for the last architecture in the list. 85 | list(GET GPU_ARCHS -1 LATEST_SM) 86 | set(GENCODES "${GENCODES} -gencode arch=compute_${LATEST_SM},code=compute_${LATEST_SM}") 87 | set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -Wno-deprecated-declarations") 88 | 89 | 90 | include_directories(common common/kernels) 91 | list(APPEND PLUGIN_CU_SOURCES "${COMMON_CU_SOURCES}") 92 | set_source_files_properties(${PLUGIN_CU_SOURCES} PROPERTIES COMPILE_FLAGS ${GENCODES}) 93 | list(APPEND PLUGIN_SOURCES "${PLUGIN_CU_SOURCES}") 94 | list(APPEND PLUGIN_SOURCES "${COMMON_SOURCES}") 95 | 96 | message(STATUS "PLUGIN_SOURCES: ${PLUGIN_SOURCES}") 97 | message(STATUS "GENCODES: ${GENCODES}") 98 | 99 | add_library(my_plugin SHARED 100 | ${PLUGIN_SOURCES} 101 | ) 102 | 103 | target_include_directories(my_plugin 104 | PUBLIC /opt/nvidia/deepstream/deepstream/sources/includes 105 | ) 106 | target_include_directories(my_plugin 107 | PUBLIC /usr/include/gstreamer-1.0 /usr/include/glib-2.0 /usr/lib/x86_64-linux-gnu/glib-2.0/include 108 | ) 109 | 110 | target_link_libraries(my_plugin 111 | ${CUBLAS_LIB} 112 | ${CUBLASLT_LIB} 113 | ${CUDART_LIB} 114 | ${CUDNN_LIB} 115 | nvinfer 116 | ) -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | CUDA_VER?= 18 | ifeq ($(CUDA_VER),) 19 | $(error "CUDA_VER is not set") 20 | endif 21 | CC:= g++ 22 | NVCC:=/usr/local/cuda-$(CUDA_VER)/bin/nvcc 23 | 24 | CFLAGS:= -Wall -std=c++11 -shared -fPIC -Wno-error=deprecated-declarations 25 | CFLAGS+= -I../../includes -I/home/damnguyen/Deploy/deepstream/nvdsinfer_custom_impl_Yolo/common -I/usr/local/cuda-$(CUDA_VER)/include -I/opt/nvidia/deepstream/deepstream/sources/includes 26 | 27 | LIBS:= -lnvinfer_plugin -lnvinfer -lnvparsers -L/usr/local/cuda-$(CUDA_VER)/lib64 -lcudart -lcublas -lstdc++fs 28 | LFLAGS:= -shared -Wl,--start-group $(LIBS) -Wl,--end-group 29 | 30 | INCS:= $(wildcard *.h) 31 | SRCFILES:= *.cpp *.cu common/*.cpp common/kernels/*.cu 32 | 33 | TARGET_LIB:= libnvdsinfer_custom_impl_Yolo.so 34 | 35 | TARGET_OBJS:= $(SRCFILES:.cpp=.o) 36 | TARGET_OBJS:= $(TARGET_OBJS:.cu=.o) 37 | 38 | all: $(TARGET_LIB) 39 | 40 | %.o: %.cpp $(INCS) Makefile 41 | $(CC) -c -o $@ $(CFLAGS) $< 42 | 43 | %.o: %.cu $(INCS) Makefile 44 | $(NVCC) -c -o $@ --compiler-options '-fPIC' $< 45 | 46 | $(TARGET_LIB) : $(TARGET_OBJS) 47 | $(CC) -o $@ $(TARGET_OBJS) $(LFLAGS) 48 | 49 | clean: 50 | rm -rf $(TARGET_LIB) 51 | -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/batchedNMSCustomInference.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | #include "bboxUtils.h" 17 | #include "cuda_runtime_api.h" 18 | #include "gatherNMSCustomOutputs.h" 19 | #include "kernel.h" 20 | #include "nmsUtils.h" 21 | 22 | pluginStatus_t nmsCustomInference(cudaStream_t stream, const int N, const int perBatchBoxesSize, const int perBatchScoresSize, const int perBatchLandmarksSize, 23 | const bool shareLocation, const int backgroundLabelId, const int numPredsPerClass, const int numClasses, 24 | const int topK, const int keepTopK, const float scoreThreshold, const float iouThreshold, const DataType DT_BBOX, 25 | const void* locData, const DataType DT_SCORE, const void* confData, const void* landData, void* keepCount, void* nmsedBoxes, 26 | void* nmsedScores, void* nmsedClasses, void* nmsedLandmarks, void* workspace, bool isNormalized, bool confSigmoid, bool clipBoxes, int scoreBits) 27 | { 28 | // locCount = batch_size * number_boxes_per_sample * 4 29 | const int locCount = N * perBatchBoxesSize; 30 | /* 31 | * shareLocation 32 | * Bounding box are shared among all classes, i.e., a bounding box could be classified as any candidate class. 33 | * Otherwise 34 | * Bounding box are designed for specific classes, i.e., a bounding box could be classified as one certain class or 35 | * not (binary classification). 36 | */ 37 | const int numLocClasses = shareLocation ? 
1 : numClasses; 38 | 39 | size_t bboxDataSize = detectionForwardBBoxDataSize(N, perBatchBoxesSize, DT_BBOX); 40 | void* bboxDataRaw = workspace; 41 | cudaMemcpyAsync(bboxDataRaw, locData, bboxDataSize, cudaMemcpyDeviceToDevice, stream); 42 | pluginStatus_t status; 43 | 44 | /* 45 | * bboxDataRaw format: 46 | * [batch size, numPriors (per sample), numLocClasses, 4] 47 | */ 48 | // float for now 49 | void* bboxData; 50 | size_t bboxPermuteSize = detectionForwardBBoxPermuteSize(shareLocation, N, perBatchBoxesSize, DT_BBOX); 51 | void* bboxPermute = nextWorkspacePtr((int8_t*) bboxDataRaw, bboxDataSize); 52 | 53 | /* 54 | * After permutation, bboxData format: 55 | * [batch_size, numLocClasses, numPriors (per sample) (numPredsPerClass), 4] 56 | * This is equivalent to swapping axis 57 | */ 58 | if (!shareLocation) 59 | { 60 | status = permuteData( 61 | stream, locCount, numLocClasses, numPredsPerClass, 4, DT_BBOX, false, bboxDataRaw, bboxPermute); 62 | ASSERT_FAILURE(status == STATUS_SUCCESS); 63 | bboxData = bboxPermute; 64 | } 65 | /* 66 | * If shareLocation, numLocClasses = 1 67 | * No need to permute data on linear memory 68 | */ 69 | else 70 | { 71 | bboxData = bboxDataRaw; 72 | } 73 | 74 | /* 75 | * Conf data format 76 | * [batch size, numPriors * param.numClasses, 1, 1] 77 | */ 78 | const int numScores = N * perBatchScoresSize; 79 | size_t totalScoresSize = detectionForwardPreNMSSize(N, perBatchScoresSize); 80 | if(DT_SCORE == DataType::kHALF) totalScoresSize /= 2; // detectionForwardPreNMSSize is implemented in terms of kFLOAT 81 | void* scores = nextWorkspacePtr((int8_t*) bboxPermute, bboxPermuteSize); 82 | 83 | // need a conf_scores 84 | /* 85 | * After permutation, bboxData format: 86 | * [batch_size, numClasses, numPredsPerClass, 1] 87 | */ 88 | status = permuteData( 89 | stream, numScores, numClasses, numPredsPerClass, 1, DT_SCORE, confSigmoid, confData, scores); 90 | ASSERT_FAILURE(status == STATUS_SUCCESS); 91 | 92 | size_t indicesSize = detectionForwardPreNMSSize(N, perBatchScoresSize); 93 | void* indices = nextWorkspacePtr((int8_t*) scores, totalScoresSize); 94 | 95 | size_t postNMSScoresSize = detectionForwardPostNMSSize(N, numClasses, topK); 96 | if(DT_SCORE == DataType::kHALF) postNMSScoresSize /= 2; // detectionForwardPostNMSSize is implemented in terms of kFLOAT 97 | size_t postNMSIndicesSize = detectionForwardPostNMSSize(N, numClasses, topK); // indices are full int32 98 | void* postNMSScores = nextWorkspacePtr((int8_t*) indices, indicesSize); 99 | void* postNMSIndices = nextWorkspacePtr((int8_t*) postNMSScores, postNMSScoresSize); 100 | 101 | void* sortingWorkspace = nextWorkspacePtr((int8_t*) postNMSIndices, postNMSIndicesSize); 102 | // Sort the scores so that the following NMS could be applied. 103 | float scoreShift = 0.f; 104 | if(DT_SCORE == DataType::kHALF && scoreBits > 0 && scoreBits <= 10) 105 | scoreShift = 1.f; 106 | status = sortScoresPerClass(stream, N, numClasses, numPredsPerClass, backgroundLabelId, scoreThreshold, 107 | DT_SCORE, scores, indices, sortingWorkspace, scoreBits, scoreShift); 108 | 109 | ASSERT_FAILURE(status == STATUS_SUCCESS); 110 | 111 | // This is set to true as the input bounding boxes are of the format [ymin, 112 | // xmin, ymax, xmax]. 
The default implementation assumes [xmin, ymin, xmax, ymax] 113 | bool flipXY = true; 114 | // NMS 115 | status = allClassNMS(stream, N, numClasses, numPredsPerClass, topK, iouThreshold, shareLocation, isNormalized, 116 | DT_SCORE, DT_BBOX, bboxData, scores, indices, postNMSScores, postNMSIndices, flipXY, scoreShift); 117 | ASSERT_FAILURE(status == STATUS_SUCCESS); 118 | 119 | // Sort the bounding boxes after NMS using scores 120 | status = sortScoresPerImage(stream, N, numClasses * topK, DT_SCORE, postNMSScores, postNMSIndices, scores, 121 | indices, sortingWorkspace, scoreBits); 122 | 123 | ASSERT_FAILURE(status == STATUS_SUCCESS); 124 | 125 | // Gather data from the sorted bounding boxes after NMS 126 | status = gatherNMSCustomOutputs(stream, shareLocation, N, numPredsPerClass, numClasses, topK, keepTopK, DT_BBOX, 127 | DT_SCORE, indices, scores, bboxData, landData, keepCount, nmsedBoxes, nmsedScores, nmsedClasses, nmsedLandmarks, clipBoxes, scoreShift); 128 | ASSERT_FAILURE(status == STATUS_SUCCESS); 129 | 130 | return STATUS_SUCCESS; 131 | } 132 | -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/batchedNMSCustomPlugin.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/batchedNMSCustomPlugin.o -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/cmake/set_ifndef.cmake: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | function (set_ifndef variable value) 17 | if(NOT DEFINED ${variable}) 18 | set(${variable} ${value} PARENT_SCOPE) 19 | endif() 20 | endfunction() 21 | -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/common/ErrorRecorder.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 |  */
16 | 
17 | #ifndef ERROR_RECORDER_H
18 | #define ERROR_RECORDER_H
19 | #include "NvInferRuntimeCommon.h"
20 | #include "logger.h"
21 | #include <atomic>
22 | #include <cstdint>
23 | #include <exception>
24 | #include <mutex>
25 | #include <vector>
26 | 
27 | using nvinfer1::IErrorRecorder;
28 | using nvinfer1::ErrorCode;
29 | 
30 | //!
31 | //! A simple implementation of the IErrorRecorder interface for
32 | //! use by samples. This interface also can be used as a reference
33 | //! implementation.
34 | //! The sample Error recorder is based on a vector that pairs the error
35 | //! code and the error string into a single element. It also uses
36 | //! standard mutexes and atomics in order to make sure that the code
37 | //! works in a multi-threaded environment.
38 | //!
39 | class SampleErrorRecorder : public IErrorRecorder
40 | {
41 |     using errorPair = std::pair<ErrorCode, std::string>;
42 |     using errorStack = std::vector<errorPair>;
43 | 
44 | public:
45 |     SampleErrorRecorder() = default;
46 | 
47 |     virtual ~SampleErrorRecorder() noexcept {}
48 |     int32_t getNbErrors() const noexcept final
49 |     {
50 |         return mErrorStack.size();
51 |     }
52 |     ErrorCode getErrorCode(int32_t errorIdx) const noexcept final
53 |     {
54 |         return invalidIndexCheck(errorIdx) ? ErrorCode::kINVALID_ARGUMENT : (*this)[errorIdx].first;
55 |     };
56 |     IErrorRecorder::ErrorDesc getErrorDesc(int32_t errorIdx) const noexcept final
57 |     {
58 |         return invalidIndexCheck(errorIdx) ? "errorIdx out of range." : (*this)[errorIdx].second.c_str();
59 |     }
60 |     // This class can never overflow since we have dynamic resize via std::vector usage.
61 |     bool hasOverflowed() const noexcept final
62 |     {
63 |         return false;
64 |     }
65 | 
66 |     // Empty the errorStack.
67 |     void clear() noexcept final
68 |     {
69 |         try
70 |         {
71 |             // grab a lock so that there is no addition while clearing.
72 |             std::lock_guard<std::mutex> guard(mStackLock);
73 |             mErrorStack.clear();
74 |         }
75 |         catch (const std::exception& e)
76 |         {
77 |             sample::gLogFatal << "Internal Error: " << e.what() << std::endl;
78 |         }
79 |     };
80 | 
81 |     //! Simple helper function that checks whether the error stack is empty.
82 |     bool empty() const noexcept
83 |     {
84 |         return mErrorStack.empty();
85 |     }
86 | 
87 |     bool reportError(ErrorCode val, IErrorRecorder::ErrorDesc desc) noexcept final
88 |     {
89 |         try
90 |         {
91 |             std::lock_guard<std::mutex> guard(mStackLock);
92 |             sample::gLogError << "Error[" << static_cast<int32_t>(val) << "]: " << desc << std::endl;
93 |             mErrorStack.push_back(errorPair(val, desc));
94 |         }
95 |         catch (const std::exception& e)
96 |         {
97 |             sample::gLogFatal << "Internal Error: " << e.what() << std::endl;
98 |         }
99 |         // All errors are considered fatal.
100 |         return true;
101 |     }
102 | 
103 |     // Atomically increment or decrement the ref counter.
104 |     IErrorRecorder::RefCount incRefCount() noexcept final
105 |     {
106 |         return ++mRefCount;
107 |     }
108 |     IErrorRecorder::RefCount decRefCount() noexcept final
109 |     {
110 |         return --mRefCount;
111 |     }
112 | 
113 | private:
114 |     // Simple helper functions.
115 |     const errorPair& operator[](size_t index) const noexcept
116 |     {
117 |         return mErrorStack[index];
118 |     }
119 | 
120 |     bool invalidIndexCheck(int32_t index) const noexcept
121 |     {
122 |         // By converting signed to unsigned, we only need a single check since
123 |         // negative numbers turn into large positive greater than the size.
124 |         size_t sIndex = index;
125 |         return sIndex >= mErrorStack.size();
126 |     }
127 |     // Mutex to hold when locking mErrorStack.
128 |     std::mutex mStackLock;
129 | 
130 |     // Reference count of the class. Destruction of the class when mRefCount
131 |     // is not zero causes undefined behavior.
132 |     std::atomic<int32_t> mRefCount{0};
133 | 
134 |     // The error stack that holds the errors recorded by TensorRT.
135 |     errorStack mErrorStack;
136 | }; // class SampleErrorRecorder
137 | #endif // ERROR_RECORDER_H
138 | 
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/common/bboxUtils.h:
--------------------------------------------------------------------------------
1 | /*
2 |  * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License");
5 |  * you may not use this file except in compliance with the License.
6 |  * You may obtain a copy of the License at
7 |  *
8 |  *     http://www.apache.org/licenses/LICENSE-2.0
9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | #ifndef TRT_BBOX_UTILS_H
17 | #define TRT_BBOX_UTILS_H
18 | 
19 | #include "plugin.h"
20 | 
21 | using namespace nvinfer1;
22 | using namespace nvinfer1::plugin;
23 | 
24 | template <typename T>
25 | struct Bbox
26 | {
27 |     T xmin, ymin, xmax, ymax;
28 |     Bbox(T xmin, T ymin, T xmax, T ymax)
29 |         : xmin(xmin)
30 |         , ymin(ymin)
31 |         , xmax(xmax)
32 |         , ymax(ymax)
33 |     {
34 |     }
35 |     Bbox() = default;
36 | };
37 | 
38 | template <typename T>
39 | struct BboxInfo
40 | {
41 |     T conf_score;
42 |     int label;
43 |     int bbox_idx;
44 |     bool kept;
45 |     BboxInfo(T conf_score, int label, int bbox_idx, bool kept)
46 |         : conf_score(conf_score)
47 |         , label(label)
48 |         , bbox_idx(bbox_idx)
49 |         , kept(kept)
50 |     {
51 |     }
52 |     BboxInfo() = default;
53 | };
54 | 
55 | template <typename T>
56 | bool operator<(const Bbox<T>& lhs, const Bbox<T>& rhs)
57 | {
58 |     return lhs.xmin < rhs.xmin;
59 | }
60 | 
61 | template <typename T>
62 | bool operator==(const Bbox<T>& lhs, const Bbox<T>& rhs)
63 | {
64 |     return lhs.xmin == rhs.xmin && lhs.ymin == rhs.ymin && lhs.xmax == rhs.xmax && lhs.ymax == rhs.ymax;
65 | }
66 | // }}}
67 | 
68 | int8_t* alignPtr(int8_t* ptr, uintptr_t to);
69 | 
70 | int8_t* nextWorkspacePtr(int8_t* ptr, uintptr_t previousWorkspaceSize);
71 | 
72 | size_t dataTypeSize(DataType dtype);
73 | 
74 | void setUniformOffsets(cudaStream_t stream, int num_segments, int offset, int* d_offsets);
75 | 
76 | #endif
77 | 
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/common/checkMacrosPlugin.cpp:
--------------------------------------------------------------------------------
1 | /*
2 |  * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License");
5 |  * you may not use this file except in compliance with the License.
6 |  * You may obtain a copy of the License at
7 |  *
8 |  *     http://www.apache.org/licenses/LICENSE-2.0
9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | #include "checkMacrosPlugin.h"
18 | #include <cublas_v2.h>
19 | #include <cstdlib>
20 | #include <sstream>
21 | 
22 | namespace nvinfer1
23 | {
24 | namespace plugin
25 | {
26 | 
27 | // This will be populated by the logger supplied by the user to initLibNvInferPlugins()
28 | ILogger* gLogger{};
29 | 
30 | template <ILogger::Severity kSeverity>
31 | int LogStream<kSeverity>::Buf::sync()
32 | {
33 |     std::string s = str();
34 |     while (!s.empty() && s.back() == '\n')
35 |     {
36 |         s.pop_back();
37 |     }
38 |     if (gLogger != nullptr)
39 |     {
40 |         gLogger->log(kSeverity, s.c_str());
41 |     }
42 |     str("");
43 |     return 0;
44 | }
45 | 
46 | // These use gLogger, and therefore require initLibNvInferPlugins() to be called with a logger
47 | // (otherwise, it will not log)
48 | LogStream<ILogger::Severity::kERROR> gLogError;
49 | LogStream<ILogger::Severity::kWARNING> gLogWarning;
50 | LogStream<ILogger::Severity::kINFO> gLogInfo;
51 | LogStream<ILogger::Severity::kVERBOSE> gLogVerbose;
52 | 
53 | // break-pointable
54 | void throwCudaError(const char* file, const char* function, int line, int status, const char* msg)
55 | {
56 |     CudaError error(file, function, line, status, msg);
57 |     error.log(gLogError);
58 |     throw error;
59 | }
60 | 
61 | // break-pointable
62 | void throwCublasError(const char* file, const char* function, int line, int status, const char* msg)
63 | {
64 |     if (msg == nullptr)
65 |     {
66 |         auto s_ = static_cast<cublasStatus_t>(status);
67 |         switch (s_)
68 |         {
69 |         case CUBLAS_STATUS_SUCCESS: msg = "CUBLAS_STATUS_SUCCESS"; break;
70 |         case CUBLAS_STATUS_NOT_INITIALIZED: msg = "CUBLAS_STATUS_NOT_INITIALIZED"; break;
71 |         case CUBLAS_STATUS_ALLOC_FAILED: msg = "CUBLAS_STATUS_ALLOC_FAILED"; break;
72 |         case CUBLAS_STATUS_INVALID_VALUE: msg = "CUBLAS_STATUS_INVALID_VALUE"; break;
73 |         case CUBLAS_STATUS_ARCH_MISMATCH: msg = "CUBLAS_STATUS_ARCH_MISMATCH"; break;
74 |         case CUBLAS_STATUS_MAPPING_ERROR: msg = "CUBLAS_STATUS_MAPPING_ERROR"; break;
75 |         case CUBLAS_STATUS_EXECUTION_FAILED: msg = "CUBLAS_STATUS_EXECUTION_FAILED"; break;
76 |         case CUBLAS_STATUS_INTERNAL_ERROR: msg = "CUBLAS_STATUS_INTERNAL_ERROR"; break;
77 |         case CUBLAS_STATUS_NOT_SUPPORTED: msg = "CUBLAS_STATUS_NOT_SUPPORTED"; break;
78 |         case CUBLAS_STATUS_LICENSE_ERROR: msg = "CUBLAS_STATUS_LICENSE_ERROR"; break;
79 |         }
80 |     }
81 |     CublasError error(file, function, line, status, msg);
82 |     error.log(gLogError);
83 |     throw error;
84 | }
85 | 
86 | // break-pointable
87 | void throwCudnnError(const char* file, const char* function, int line, int status, const char* msg)
88 | {
89 |     CudnnError error(file, function, line, status, msg);
90 |     error.log(gLogError);
91 |     throw error;
92 | }
93 | 
94 | void logError(const char* msg, const char* file, const char* fn, int line)
95 | {
96 |     gLogError << "Parameter check failed at: " << file << "::" << fn << "::" << line;
97 |     gLogError << ", condition: " << msg << std::endl;
98 | }
99 | 
100 | // break-pointable
101 | void reportAssertion(const char* msg, const char* file, int line)
102 | {
103 |     std::ostringstream stream;
104 |     stream << "Assertion failed: " << msg << std::endl
105 |            << file << ':' << line << std::endl
106 |            << "Aborting..."
<< std::endl; 107 | getLogger()->log(nvinfer1::ILogger::Severity::kINTERNAL_ERROR, stream.str().c_str()); 108 | cudaDeviceReset(); 109 | abort(); 110 | } 111 | 112 | void TRTException::log(std::ostream& logStream) const 113 | { 114 | logStream << file << " (" << line << ") - " << name << " Error in " << function << ": " << status; 115 | if (message != nullptr) 116 | { 117 | logStream << " (" << message << ")"; 118 | } 119 | logStream << std::endl; 120 | } 121 | 122 | } // namespace plugin 123 | 124 | } // namespace nvinfer1 125 | -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/common/cub_helper.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | #include "kernel.h" 17 | template 18 | size_t cubSortPairsWorkspaceSize(int num_items, int num_segments) 19 | { 20 | size_t temp_storage_bytes = 0; 21 | cub::DeviceSegmentedRadixSort::SortPairsDescending((void*) NULL, temp_storage_bytes, (const KeyT*) NULL, 22 | (KeyT*) NULL, (const ValueT*) NULL, (ValueT*) NULL, 23 | num_items, // # items 24 | num_segments, // # segments 25 | (const int*) NULL, (const int*) NULL); 26 | return temp_storage_bytes; 27 | } 28 | -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/common/cudaDriverWrapper.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | #define CUDA_LIB_NAME "cuda" 17 | 18 | #if defined(_WIN32) 19 | #if !defined(WIN32_LEAN_AND_MEAN) 20 | #define WIN32_LEAN_AND_MEAN 21 | #endif // defined(WIN32_LEAN_AND_MEAN) 22 | #include 23 | #define dllOpen(name) (void*) LoadLibraryA("nv" name ".dll") 24 | #define dllClose(handle) FreeLibrary(static_cast(handle)) 25 | #define dllGetSym(handle, name) GetProcAddress(static_cast(handle), name) 26 | #else 27 | #include 28 | #define dllOpen(name) dlopen("lib" name ".so.1", RTLD_LAZY) 29 | #define dllClose(handle) dlclose(handle) 30 | #define dllGetSym(handle, name) dlsym(handle, name) 31 | #endif 32 | 33 | #include "cudaDriverWrapper.h" 34 | #include "plugin.h" 35 | #include 36 | #include 37 | #include 38 | 39 | using namespace nvinfer1; 40 | 41 | CUDADriverWrapper::CUDADriverWrapper() 42 | { 43 | handle = dllOpen(CUDA_LIB_NAME); 44 | ASSERT(handle != nullptr); 45 | 46 | auto load_sym = [](void* handle, const char *name) { 47 | void* ret = dllGetSym(handle, name); 48 | ASSERT(ret != nullptr); 49 | return ret; 50 | }; 51 | 52 | *(void**)(&_cuGetErrorName) = load_sym(handle, "cuGetErrorName"); 53 | *(void**)(&_cuFuncSetAttribute) = load_sym(handle, "cuFuncSetAttribute"); 54 | *(void**)(&_cuLinkComplete) = load_sym(handle, "cuLinkComplete"); 55 | *(void**)(&_cuModuleUnload) = load_sym(handle, "cuModuleUnload"); 56 | *(void**)(&_cuLinkDestroy) = load_sym(handle, "cuLinkDestroy"); 57 | *(void**)(&_cuModuleLoadData) = load_sym(handle, "cuModuleLoadData"); 58 | *(void**)(&_cuLinkCreate) = load_sym(handle, "cuLinkCreate_v2"); 59 | *(void**)(&_cuModuleGetFunction) = load_sym(handle, "cuModuleGetFunction"); 60 | *(void**)(&_cuLinkAddFile) = load_sym(handle, "cuLinkAddFile_v2"); 61 | *(void**)(&_cuLinkAddData) = load_sym(handle, "cuLinkAddData_v2"); 62 | *(void**)(&_cuLaunchCooperativeKernel) = load_sym(handle, "cuLaunchCooperativeKernel"); 63 | *(void**)(&_cuLaunchKernel) = load_sym(handle, "cuLaunchKernel"); 64 | } 65 | 66 | CUDADriverWrapper::~CUDADriverWrapper() 67 | { 68 | dllClose(handle); 69 | } 70 | 71 | CUresult CUDADriverWrapper::cuGetErrorName(CUresult error, const char** pStr) const 72 | { 73 | return (*_cuGetErrorName)(error, pStr); 74 | } 75 | 76 | CUresult CUDADriverWrapper::cuFuncSetAttribute(CUfunction hfunc, CUfunction_attribute attrib, int value) const 77 | { 78 | return (*_cuFuncSetAttribute)(hfunc, attrib, value); 79 | } 80 | 81 | CUresult CUDADriverWrapper::cuLinkComplete(CUlinkState state, void** cubinOut, size_t* sizeOut) const 82 | { 83 | return (*_cuLinkComplete)(state, cubinOut, sizeOut); 84 | } 85 | 86 | CUresult CUDADriverWrapper::cuModuleUnload(CUmodule hmod) const 87 | { 88 | return (*_cuModuleUnload)(hmod); 89 | } 90 | 91 | CUresult CUDADriverWrapper::cuLinkDestroy(CUlinkState state) const 92 | { 93 | return (*_cuLinkDestroy)(state); 94 | } 95 | 96 | CUresult CUDADriverWrapper::cuModuleLoadData(CUmodule* module, const void* image) const 97 | { 98 | return (*_cuModuleLoadData)(module, image); 99 | } 100 | 101 | CUresult CUDADriverWrapper::cuLinkCreate( 102 | uint32_t numOptions, CUjit_option* options, void** optionValues, CUlinkState* stateOut) const 103 | { 104 | return (*_cuLinkCreate)(numOptions, options, optionValues, stateOut); 105 | } 106 | 107 | CUresult CUDADriverWrapper::cuModuleGetFunction(CUfunction* hfunc, CUmodule hmod, const char* name) const 108 | { 109 | return (*_cuModuleGetFunction)(hfunc, hmod, name); 110 | } 111 | 112 | CUresult CUDADriverWrapper::cuLinkAddFile(CUlinkState state, CUjitInputType type, const char* path, uint32_t numOptions, 
113 | CUjit_option* options, void** optionValues) const 114 | { 115 | return (*_cuLinkAddFile)(state, type, path, numOptions, options, optionValues); 116 | } 117 | 118 | CUresult CUDADriverWrapper::cuLinkAddData(CUlinkState state, CUjitInputType type, void* data, size_t size, 119 | const char* name, uint32_t numOptions, CUjit_option* options, void** optionValues) const 120 | { 121 | return (*_cuLinkAddData)(state, type, data, size, name, numOptions, options, optionValues); 122 | } 123 | 124 | CUresult CUDADriverWrapper::cuLaunchCooperativeKernel(CUfunction f, uint32_t gridDimX, uint32_t gridDimY, 125 | uint32_t gridDimZ, uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ, uint32_t sharedMemBytes, 126 | CUstream hStream, void** kernelParams) const 127 | { 128 | return (*_cuLaunchCooperativeKernel)( 129 | f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams); 130 | } 131 | 132 | CUresult CUDADriverWrapper::cuLaunchKernel(CUfunction f, uint32_t gridDimX, uint32_t gridDimY, uint32_t gridDimZ, 133 | uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ, uint32_t sharedMemBytes, CUstream hStream, 134 | void** kernelParams, void** extra) const 135 | { 136 | return (*_cuLaunchKernel)( 137 | f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, extra); 138 | } 139 | -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/common/cudaDriverWrapper.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #ifndef CUDA_DRIVER_WRAPPER_H 18 | #define CUDA_DRIVER_WRAPPER_H 19 | 20 | #include 21 | #include 22 | #include 23 | 24 | #define cuErrCheck(stat, wrap) \ 25 | { \ 26 | nvinfer1::cuErrCheck_((stat), wrap, __FILE__, __LINE__); \ 27 | } 28 | 29 | namespace nvinfer1 30 | { 31 | class CUDADriverWrapper 32 | { 33 | public: 34 | CUDADriverWrapper(); 35 | 36 | ~CUDADriverWrapper(); 37 | 38 | // Delete default copy constructor and copy assignment constructor 39 | CUDADriverWrapper(const CUDADriverWrapper&) = delete; 40 | CUDADriverWrapper& operator=(const CUDADriverWrapper&) = delete; 41 | 42 | CUresult cuGetErrorName(CUresult error, const char** pStr) const; 43 | 44 | CUresult cuFuncSetAttribute(CUfunction hfunc, CUfunction_attribute attrib, int value) const; 45 | 46 | CUresult cuLinkComplete(CUlinkState state, void** cubinOut, size_t* sizeOut) const; 47 | 48 | CUresult cuModuleUnload(CUmodule hmod) const; 49 | 50 | CUresult cuLinkDestroy(CUlinkState state) const; 51 | 52 | CUresult cuModuleLoadData(CUmodule* module, const void* image) const; 53 | 54 | CUresult cuLinkCreate(uint32_t numOptions, CUjit_option* options, void** optionValues, CUlinkState* stateOut) const; 55 | 56 | CUresult cuModuleGetFunction(CUfunction* hfunc, CUmodule hmod, const char* name) const; 57 | 58 | CUresult cuLinkAddFile(CUlinkState state, CUjitInputType type, const char* path, uint32_t numOptions, 59 | CUjit_option* options, void** optionValues) const; 60 | 61 | CUresult cuLinkAddData(CUlinkState state, CUjitInputType type, void* data, size_t size, const char* name, 62 | uint32_t numOptions, CUjit_option* options, void** optionValues) const; 63 | 64 | CUresult cuLaunchCooperativeKernel(CUfunction f, uint32_t gridDimX, uint32_t gridDimY, uint32_t gridDimZ, 65 | uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ, uint32_t sharedMemBytes, CUstream hStream, 66 | void** kernelParams) const; 67 | 68 | CUresult cuLaunchKernel(CUfunction f, uint32_t gridDimX, uint32_t gridDimY, uint32_t gridDimZ, uint32_t blockDimX, 69 | uint32_t blockDimY, uint32_t blockDimZ, uint32_t sharedMemBytes, CUstream hStream, void** kernelParams, 70 | void** extra) const; 71 | 72 | private: 73 | void* handle; 74 | CUresult (*_cuGetErrorName)(CUresult, const char**); 75 | CUresult (*_cuFuncSetAttribute)(CUfunction, CUfunction_attribute, int); 76 | CUresult (*_cuLinkComplete)(CUlinkState, void**, size_t*); 77 | CUresult (*_cuModuleUnload)(CUmodule); 78 | CUresult (*_cuLinkDestroy)(CUlinkState); 79 | CUresult (*_cuLinkCreate)(unsigned int, CUjit_option*, void**, CUlinkState*); 80 | CUresult (*_cuModuleLoadData)(CUmodule*, const void*); 81 | CUresult (*_cuModuleGetFunction)(CUfunction*, CUmodule, const char*); 82 | CUresult (*_cuLinkAddFile)(CUlinkState, CUjitInputType, const char*, unsigned int, CUjit_option*, void**); 83 | CUresult (*_cuLinkAddData)( 84 | CUlinkState, CUjitInputType, void*, size_t, const char*, unsigned int, CUjit_option*, void**); 85 | CUresult (*_cuLaunchCooperativeKernel)(CUfunction, unsigned int, unsigned int, unsigned int, unsigned int, 86 | unsigned int, unsigned int, unsigned int, CUstream, void**); 87 | CUresult (*_cuLaunchKernel)(CUfunction f, uint32_t gridDimX, uint32_t gridDimY, uint32_t gridDimZ, 88 | uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ, uint32_t sharedMemBytes, CUstream hStream, 89 | void** kernelParams, void** extra); 90 | }; 91 | 92 | inline void cuErrCheck_(CUresult stat, const CUDADriverWrapper& wrap, const char* file, int line) 93 | { 94 | if (stat != CUDA_SUCCESS) 95 | { 
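        // Look up the symbolic name of the failing CUresult through the dynamically loaded driver API and report the call site.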
96 | const char* msg = nullptr; 97 | wrap.cuGetErrorName(stat, &msg); 98 | fprintf(stderr, "CUDA Error: %s %s %d\n", msg, file, line); 99 | } 100 | } 101 | 102 | } // namespace nvinfer1 103 | 104 | #endif // CUDA_DRIVER_WRAPPER_H 105 | -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/common/half.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | // 17 | // Custom wrapper around external half-precision header 18 | // 19 | // Header has some "extra parentheses" warnings when different rounding modes are used. 20 | 21 | #if defined(__GNUC__) 22 | #pragma GCC diagnostic push 23 | #pragma GCC diagnostic ignored "-Wparentheses" 24 | #endif 25 | 26 | 27 | #if defined(__clang__) 28 | #pragma clang diagnostic push 29 | #pragma clang diagnostic ignored "-Wmismatched-tags" 30 | #endif 31 | 32 | #include "ieee/half.h" 33 | 34 | #if defined(__clang__) 35 | #pragma clang diagnostic pop 36 | #endif 37 | 38 | #if defined(__GNUC__) 39 | #pragma GCC diagnostic pop 40 | #endif 41 | 42 | -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/common/kernel.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include "kernel.h" 18 | #include "plugin.h" 19 | 20 | size_t detectionInferenceWorkspaceSize(bool shareLocation, int N, int C1, int C2, int numClasses, int numPredsPerClass, 21 | int topK, DataType DT_BBOX, DataType DT_SCORE) 22 | { 23 | size_t wss[7]; 24 | wss[0] = detectionForwardBBoxDataSize(N, C1, DT_BBOX); 25 | wss[1] = detectionForwardBBoxPermuteSize(shareLocation, N, C1, DT_BBOX); 26 | wss[2] = detectionForwardPreNMSSize(N, C2); 27 | wss[3] = detectionForwardPreNMSSize(N, C2); 28 | wss[4] = detectionForwardPostNMSSize(N, numClasses, topK); 29 | wss[5] = detectionForwardPostNMSSize(N, numClasses, topK); 30 | wss[6] = std::max(sortScoresPerClassWorkspaceSize(N, numClasses, numPredsPerClass, DT_SCORE), 31 | sortScoresPerImageWorkspaceSize(N, numClasses * topK, DT_SCORE)); 32 | return calculateTotalWorkspaceSize(wss, 7); 33 | } 34 | 35 | size_t detectionInferenceWorkspaceSizeCustom(bool shareLocation, int N, int C1, int C2, int C3, int numClasses, int numPredsPerClass, 36 | int topK, DataType DT_BBOX, DataType DT_SCORE) 37 | { 38 | size_t wss[8]; 39 | wss[0] = detectionForwardBBoxDataSize(N, C1, DT_BBOX); 40 | wss[1] = detectionForwardBBoxPermuteSize(shareLocation, N, C1, DT_BBOX); 41 | wss[2] = detectionForwardPreNMSSize(N, C2); 42 | wss[3] = detectionForwardPreNMSSize(N, C2); 43 | wss[4] = detectionForwardPostNMSSize(N, numClasses, topK); 44 | wss[5] = detectionForwardPostNMSSize(N, numClasses, topK); 45 | wss[6] = std::max(sortScoresPerClassWorkspaceSize(N, numClasses, numPredsPerClass, DT_SCORE), 46 | sortScoresPerImageWorkspaceSize(N, numClasses * topK, DT_SCORE)); 47 | wss[7] = detectionForwardLandmarkDataSize(N, C3, DT_BBOX); 48 | return calculateTotalWorkspaceSize(wss, 8); 49 | } 50 | 51 | namespace nvinfer1 52 | { 53 | namespace plugin 54 | { 55 | size_t detectionInferenceWorkspaceSize(bool shareLocation, int N, int C1, int C2, int numClasses, int numPredsPerClass, 56 | int topK, DataType DT_BBOX, DataType DT_SCORE) 57 | { 58 | size_t wss[7]; 59 | wss[0] = detectionForwardBBoxDataSize(N, C1, DT_BBOX); 60 | wss[1] = detectionForwardBBoxPermuteSize(shareLocation, N, C1, DT_BBOX); 61 | wss[2] = detectionForwardPreNMSSize(N, C2); 62 | wss[3] = detectionForwardPreNMSSize(N, C2); 63 | wss[4] = detectionForwardPostNMSSize(N, numClasses, topK); 64 | wss[5] = detectionForwardPostNMSSize(N, numClasses, topK); 65 | wss[6] = std::max(sortScoresPerClassWorkspaceSize(N, numClasses, numPredsPerClass, DT_SCORE), 66 | sortScoresPerImageWorkspaceSize(N, numClasses * topK, DT_SCORE)); 67 | return calculateTotalWorkspaceSize(wss, 7); 68 | } 69 | } // namespace plugin 70 | } // namespace nvinfer1 71 | -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/common/kernels/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | file(GLOB SRCS *.cpp) 17 | set(PLUGIN_SOURCES ${PLUGIN_SOURCES} ${SRCS}) 18 | set(PLUGIN_SOURCES ${PLUGIN_SOURCES} PARENT_SCOPE) 19 | file(GLOB CU_SRCS *.cu) 20 | set(PLUGIN_CU_SOURCES ${PLUGIN_CU_SOURCES} ${CU_SRCS}) 21 | set(PLUGIN_CU_SOURCES ${PLUGIN_CU_SOURCES} PARENT_SCOPE) 22 | -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/common/kernels/common.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include "cuda.h" 18 | #include "cublas_v2.h" 19 | #include 20 | #include 21 | #include "kernel.h" 22 | #include "bboxUtils.h" 23 | 24 | #define CUDA_MEM_ALIGN 256 25 | 26 | // HASH 27 | unsigned int hash(const void* array_, size_t size) 28 | { 29 | // Apply hashing only when debugging RPN codes. 30 | if (DEBUG_ENABLE) 31 | { 32 | const char* array_const; 33 | char* array; 34 | cudaMallocHost((void**) &array, size); 35 | cudaMemcpy(array, array_, size, cudaMemcpyDeviceToHost); 36 | array_const = array; 37 | unsigned int hash = 45599; 38 | for (size_t i = 0; i < size; i++) 39 | { 40 | unsigned int value = array_const[i]; 41 | hash = hash * 1487 + value; 42 | hash = hash * 317; 43 | hash = hash % 105359; 44 | } 45 | return hash; 46 | } 47 | else 48 | { 49 | return 0; 50 | } 51 | } 52 | 53 | // ALIGNPTR 54 | int8_t* alignPtr(int8_t* ptr, uintptr_t to) 55 | { 56 | uintptr_t addr = (uintptr_t) ptr; 57 | if (addr % to) 58 | { 59 | addr += to - addr % to; 60 | } 61 | return (int8_t*) addr; 62 | } 63 | 64 | // NEXTWORKSPACEPTR 65 | int8_t* nextWorkspacePtr(int8_t* ptr, uintptr_t previousWorkspaceSize) 66 | { 67 | uintptr_t addr = (uintptr_t) ptr; 68 | addr += previousWorkspaceSize; 69 | return alignPtr((int8_t*) addr, CUDA_MEM_ALIGN); 70 | } 71 | 72 | // CALCULATE TOTAL WORKSPACE SIZE 73 | size_t calculateTotalWorkspaceSize(size_t* workspaces, int count) 74 | { 75 | size_t total = 0; 76 | for (int i = 0; i < count; i++) 77 | { 78 | total += workspaces[i]; 79 | if (workspaces[i] % CUDA_MEM_ALIGN) 80 | { 81 | total += CUDA_MEM_ALIGN - (workspaces[i] % CUDA_MEM_ALIGN); 82 | } 83 | } 84 | return total; 85 | } 86 | 87 | using nvinfer1::DataType; 88 | 89 | // DATA TYPE SIZE 90 | size_t dataTypeSize(const DataType dtype) 91 | { 92 | switch (dtype) 93 | { 94 | case DataType::kINT8: return sizeof(char); 95 | case DataType::kHALF: return sizeof(short); 96 | case DataType::kFLOAT: return sizeof(float); 97 | default: return 0; 98 | } 99 | } 100 | 101 | // CUB 102 | /* 103 | size_t cubSortFloatIntPairsWorkspaceSize(int num_items, int num_segments) 104 | { 105 | size_t temp_storage_bytes = 0; 106 | cub::DeviceSegmentedRadixSort::SortPairsDescending( 107 | (int *)NULL, temp_storage_bytes, 108 | 
(const float *)NULL, (float *)NULL, 109 | (const int *)NULL, (int *)NULL, 110 | num_items, // # items 111 | num_segments, // # segments 112 | (const int *)NULL, (const int *)NULL); 113 | return temp_storage_bytes; 114 | } 115 | 116 | size_t cubSortFloatBboxInfoPairsWorkspaceSize(int num_items, int num_segments) 117 | { 118 | size_t temp_storage_bytes = 0; 119 | cub::DeviceSegmentedRadixSort::SortPairsDescending( 120 | (int *)NULL, temp_storage_bytes, 121 | (const float *)NULL, (float *)NULL, 122 | (const BboxInfo *)NULL, (BboxInfo *)NULL, 123 | num_items, // # items 124 | num_segments, // # segments 125 | (const int *)NULL, (const int *)NULL); 126 | return temp_storage_bytes; 127 | } 128 | */ 129 | 130 | template 131 | __launch_bounds__(nthds_per_cta) 132 | __global__ void setUniformOffsets_kernel( 133 | const int num_segments, 134 | const int offset, 135 | int* d_offsets) 136 | { 137 | const int idx = blockIdx.x * nthds_per_cta + threadIdx.x; 138 | if (idx <= num_segments) 139 | d_offsets[idx] = idx * offset; 140 | } 141 | 142 | void setUniformOffsets( 143 | cudaStream_t stream, 144 | const int num_segments, 145 | const int offset, 146 | int* d_offsets) 147 | { 148 | const int BS = 32; 149 | const int GS = (num_segments + 1 + BS - 1) / BS; 150 | setUniformOffsets_kernel<<>>(num_segments, offset, d_offsets); 151 | } 152 | 153 | 154 | const char* cublasGetErrorString(cublasStatus_t error) 155 | { 156 | switch (error) 157 | { 158 | case CUBLAS_STATUS_SUCCESS: 159 | return "CUBLAS_STATUS_SUCCESS"; 160 | case CUBLAS_STATUS_NOT_INITIALIZED: 161 | return "CUBLAS_STATUS_NOT_INITIALIZED"; 162 | case CUBLAS_STATUS_ALLOC_FAILED: 163 | return "CUBLAS_STATUS_ALLOC_FAILED"; 164 | case CUBLAS_STATUS_INVALID_VALUE: 165 | return "CUBLAS_STATUS_INVALID_VALUE"; 166 | case CUBLAS_STATUS_ARCH_MISMATCH: 167 | return "CUBLAS_STATUS_ARCH_MISMATCH"; 168 | case CUBLAS_STATUS_MAPPING_ERROR: 169 | return "CUBLAS_STATUS_MAPPING_ERROR"; 170 | case CUBLAS_STATUS_EXECUTION_FAILED: 171 | return "CUBLAS_STATUS_EXECUTION_FAILED"; 172 | case CUBLAS_STATUS_INTERNAL_ERROR: 173 | return "CUBLAS_STATUS_INTERNAL_ERROR"; 174 | #if CUDA_VERSION >= 6000 175 | case CUBLAS_STATUS_NOT_SUPPORTED: 176 | return "CUBLAS_STATUS_NOT_SUPPORTED"; 177 | #endif 178 | #if CUDA_VERSION >= 6050 179 | case CUBLAS_STATUS_LICENSE_ERROR: 180 | return "CUBLAS_STATUS_LICENSE_ERROR"; 181 | #endif 182 | } 183 | return "Unknown cublas status"; 184 | } 185 | -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/common/kernels/permuteData.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | #include 17 | #include "kernel.h" 18 | 19 | template 20 | __launch_bounds__(nthds_per_cta) 21 | __global__ void permuteData_kernel( 22 | const int nthreads, 23 | const int num_classes, 24 | const int num_data, 25 | const int num_dim, 26 | bool confSigmoid, 27 | const Dtype* data, 28 | Dtype* new_data) 29 | { 30 | // data format: [batch_size, num_data, num_classes, num_dim] 31 | for (int index = blockIdx.x * nthds_per_cta + threadIdx.x; 32 | index < nthreads; 33 | index += nthds_per_cta * gridDim.x) 34 | { 35 | const int i = index % num_dim; 36 | const int c = (index / num_dim) % num_classes; 37 | const int d = (index / num_dim / num_classes) % num_data; 38 | const int n = index / num_dim / num_classes / num_data; 39 | const int new_index = ((n * num_classes + c) * num_data + d) * num_dim + i; 40 | float result = data[index]; 41 | if (confSigmoid) 42 | result = exp(result) / (1 + exp(result)); 43 | 44 | new_data[new_index] = result; 45 | } 46 | // new data format: [batch_size, num_classes, num_data, num_dim] 47 | } 48 | 49 | template 50 | pluginStatus_t permuteData_gpu( 51 | cudaStream_t stream, 52 | const int nthreads, 53 | const int num_classes, 54 | const int num_data, 55 | const int num_dim, 56 | bool confSigmoid, 57 | const void* data, 58 | void* new_data) 59 | { 60 | const int BS = 512; 61 | const int GS = (nthreads + BS - 1) / BS; 62 | permuteData_kernel<<>>(nthreads, num_classes, num_data, num_dim, confSigmoid, 63 | (const Dtype*) data, (Dtype*) new_data); 64 | CSC(cudaGetLastError(), STATUS_FAILURE); 65 | return STATUS_SUCCESS; 66 | } 67 | 68 | // permuteData LAUNCH CONFIG 69 | typedef pluginStatus_t (*pdFunc)(cudaStream_t, const int, const int, const int, const int, bool, const void*, void*); 70 | 71 | struct pdLaunchConfig 72 | { 73 | DataType t_data; 74 | pdFunc function; 75 | 76 | pdLaunchConfig(DataType t_data) 77 | : t_data(t_data) 78 | { 79 | } 80 | pdLaunchConfig(DataType t_data, pdFunc function) 81 | : t_data(t_data) 82 | , function(function) 83 | { 84 | } 85 | bool operator==(const pdLaunchConfig& other) 86 | { 87 | return t_data == other.t_data; 88 | } 89 | }; 90 | 91 | static std::array pdLCOptions = { 92 | pdLaunchConfig(DataType::kFLOAT, permuteData_gpu), pdLaunchConfig(DataType::kHALF, permuteData_gpu<__half>)}; 93 | 94 | pluginStatus_t permuteData(cudaStream_t stream, const int nthreads, const int num_classes, const int num_data, 95 | const int num_dim, const DataType DT_DATA, bool confSigmoid, const void* data, void* new_data) 96 | { 97 | pdLaunchConfig lc = pdLaunchConfig(DT_DATA); 98 | for (unsigned i = 0; i < pdLCOptions.size(); ++i) 99 | { 100 | if (lc == pdLCOptions[i]) 101 | { 102 | DEBUG_PRINTF("permuteData kernel %d\n", i); 103 | return pdLCOptions[i].function(stream, 104 | nthreads, 105 | num_classes, 106 | num_data, 107 | num_dim, 108 | confSigmoid, 109 | data, 110 | new_data); 111 | } 112 | } 113 | return STATUS_BAD_PARAM; 114 | } 115 | -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/common/kernels/reducedMathPlugin.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #ifndef _REDUCED_MATH_PLUGIN_H 18 | #define _REDUCED_MATH_PLUGIN_H 19 | #include 20 | // Dynamically strength-reduced div and mod 21 | // 22 | // Ideas taken from Sean Baxter's MGPU library. 23 | // These classes provide for reduced complexity division and modulus 24 | // on integers, for the case where the same divisor or modulus will 25 | // be used repeatedly. 26 | 27 | namespace nvinfer1 28 | { 29 | namespace plugin 30 | { 31 | namespace detail 32 | { 33 | 34 | void findDivisor(int denom, unsigned int& mul_coeff, unsigned int& shift_coeff); 35 | 36 | __host__ __device__ __forceinline__ uint32_t umulhi(uint32_t x, uint32_t y) 37 | { 38 | #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 100 39 | return __umulhi(x, y); 40 | #else 41 | uint64_t z = (uint64_t) x * (uint64_t) y; 42 | return (uint32_t) (z >> 32); 43 | #endif 44 | } 45 | 46 | // This is a weird implementation that returns div_up(0,1)=0 but 47 | // div_up(0,2)=1 (wrong) -- just do not use it with a=0. 48 | __host__ __device__ inline int div_up(int a, int b) 49 | { 50 | return (a - 1) / b + 1; 51 | } 52 | 53 | } //end namespace detail 54 | 55 | class ReducedDivisor 56 | { 57 | public: 58 | ReducedDivisor() {} 59 | __host__ __forceinline__ 60 | ReducedDivisor(int _y) 61 | : y(_y) 62 | { 63 | detail::findDivisor(y, mul_coeff, shift_coeff); 64 | } 65 | __host__ __device__ __forceinline__ 66 | ReducedDivisor(unsigned _mul_coeff, unsigned _shift_coeff, int _y) 67 | : mul_coeff(_mul_coeff) 68 | , shift_coeff(_shift_coeff) 69 | , y(_y) 70 | { 71 | } 72 | __host__ __device__ __forceinline__ int div(int x) const 73 | { 74 | // if dividing by 1, then findDivisor wouldn't have worked because 75 | // mul_coeff would have had to be 2^32, which can't be represented, 76 | // so we have to special case that one. 77 | return (y != 1) ? detail::umulhi((uint32_t) x, mul_coeff) >> shift_coeff : x; 78 | } 79 | __host__ __device__ __forceinline__ int mod(int x) const 80 | { 81 | return x - (div(x) * y); 82 | } 83 | __host__ __device__ __forceinline__ void divmod(int x, int& q, int& mod) const 84 | { 85 | q = div(x); 86 | mod = x - (q * y); 87 | } 88 | __host__ __device__ __forceinline__ int get() const 89 | { 90 | return y; 91 | } 92 | inline __host__ void get_mul_shift(unsigned& mul, unsigned& shift) 93 | { 94 | mul = mul_coeff; 95 | shift = shift_coeff; 96 | } 97 | 98 | protected: 99 | uint32_t mul_coeff; 100 | uint32_t shift_coeff; 101 | int y; 102 | }; 103 | 104 | } // namespace plugin 105 | 106 | } // namespace nvinfer1 107 | #endif /*_REDUCED_MATH_PLUGIN_H*/ 108 | -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/common/kernels/sortScoresPerImage.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | #include "cub/cub.cuh" 17 | #include 18 | #include "kernel.h" 19 | #include "bboxUtils.h" 20 | #include "cub_helper.h" 21 | 22 | template 23 | pluginStatus_t sortScoresPerImage_gpu( 24 | cudaStream_t stream, 25 | const int num_images, 26 | const int num_items_per_image, 27 | void* unsorted_scores, 28 | void* unsorted_bbox_indices, 29 | void* sorted_scores, 30 | void* sorted_bbox_indices, 31 | void* workspace, 32 | int score_bits 33 | ) 34 | { 35 | void* d_offsets = workspace; 36 | void* cubWorkspace = nextWorkspacePtr((int8_t*) d_offsets, (num_images + 1) * sizeof(int)); 37 | 38 | setUniformOffsets(stream, num_images, num_items_per_image, (int*) d_offsets); 39 | 40 | const int arrayLen = num_images * num_items_per_image; 41 | size_t temp_storage_bytes = cubSortPairsWorkspaceSize(arrayLen, num_images); 42 | size_t begin_bit = 0; 43 | size_t end_bit = sizeof(T_SCORE) * 8; 44 | if (sizeof(T_SCORE) == 2 && score_bits > 0 && score_bits <= 10) 45 | { 46 | end_bit = 10; 47 | begin_bit = end_bit - score_bits; 48 | } 49 | cub::DeviceSegmentedRadixSort::SortPairsDescending( 50 | cubWorkspace, temp_storage_bytes, 51 | (const T_SCORE*) (unsorted_scores), (T_SCORE*) (sorted_scores), 52 | (const int*) (unsorted_bbox_indices), (int*) (sorted_bbox_indices), 53 | arrayLen, num_images, 54 | (const int*) d_offsets, (const int*) d_offsets + 1, 55 | begin_bit, end_bit, 56 | stream); 57 | CSC(cudaGetLastError(), STATUS_FAILURE); 58 | return STATUS_SUCCESS; 59 | } 60 | 61 | // sortScoresPerImage LAUNCH CONFIG 62 | typedef pluginStatus_t (*sspiFunc)(cudaStream_t, 63 | const int, 64 | const int, 65 | void*, 66 | void*, 67 | void*, 68 | void*, 69 | void*, 70 | int); 71 | struct sspiLaunchConfig 72 | { 73 | DataType t_score; 74 | sspiFunc function; 75 | 76 | sspiLaunchConfig(DataType t_score) 77 | : t_score(t_score) 78 | { 79 | } 80 | sspiLaunchConfig(DataType t_score, sspiFunc function) 81 | : t_score(t_score) 82 | , function(function) 83 | { 84 | } 85 | bool operator==(const sspiLaunchConfig& other) 86 | { 87 | return t_score == other.t_score; 88 | } 89 | }; 90 | 91 | static std::array sspiLCOptions = { 92 | sspiLaunchConfig(DataType::kFLOAT, sortScoresPerImage_gpu), 93 | sspiLaunchConfig(DataType::kHALF, sortScoresPerImage_gpu<__half>), 94 | }; 95 | 96 | pluginStatus_t sortScoresPerImage( 97 | cudaStream_t stream, 98 | const int num_images, 99 | const int num_items_per_image, 100 | const DataType DT_SCORE, 101 | void* unsorted_scores, 102 | void* unsorted_bbox_indices, 103 | void* sorted_scores, 104 | void* sorted_bbox_indices, 105 | void* workspace, 106 | int score_bits 107 | ) 108 | { 109 | sspiLaunchConfig lc = sspiLaunchConfig(DT_SCORE); 110 | for (unsigned i = 0; i < sspiLCOptions.size(); ++i) 111 | { 112 | if (lc == sspiLCOptions[i]) 113 | { 114 | DEBUG_PRINTF("sortScoresPerImage kernel %d\n", i); 115 | return sspiLCOptions[i].function(stream, 116 | num_images, 117 | num_items_per_image, 118 | unsorted_scores, 119 | unsorted_bbox_indices, 120 | sorted_scores, 121 | sorted_bbox_indices, 122 | workspace, 123 | score_bits); 124 | } 125 | } 126 | return 
STATUS_BAD_PARAM; 127 | } 128 | 129 | size_t sortScoresPerImageWorkspaceSize( 130 | const int num_images, 131 | const int num_items_per_image, 132 | const DataType DT_SCORE) 133 | { 134 | const int arrayLen = num_images * num_items_per_image; 135 | size_t wss[2]; 136 | wss[0] = (num_images + 1) * sizeof(int); // offsets 137 | if (DT_SCORE == DataType::kFLOAT) 138 | { 139 | wss[1] = cubSortPairsWorkspaceSize(arrayLen, num_images); // cub workspace 140 | } 141 | else if (DT_SCORE == DataType::kHALF) 142 | { 143 | wss[1] = cubSortPairsWorkspaceSize<__half, int>(arrayLen, num_images); // cub workspace 144 | } 145 | else 146 | { 147 | printf("SCORE type not supported.\n"); 148 | return (size_t) -1; 149 | } 150 | 151 | return calculateTotalWorkspaceSize(wss, 2); 152 | } 153 | -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/common/logger.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include "logger.h" 18 | #include "ErrorRecorder.h" 19 | #include "logging.h" 20 | 21 | SampleErrorRecorder gRecorder; 22 | namespace sample 23 | { 24 | Logger gLogger{Logger::Severity::kINFO}; 25 | LogStreamConsumer gLogVerbose{LOG_VERBOSE(gLogger)}; 26 | LogStreamConsumer gLogInfo{LOG_INFO(gLogger)}; 27 | LogStreamConsumer gLogWarning{LOG_WARN(gLogger)}; 28 | LogStreamConsumer gLogError{LOG_ERROR(gLogger)}; 29 | LogStreamConsumer gLogFatal{LOG_FATAL(gLogger)}; 30 | 31 | void setReportableSeverity(Logger::Severity severity) 32 | { 33 | gLogger.setReportableSeverity(severity); 34 | gLogVerbose.setReportableSeverity(severity); 35 | gLogInfo.setReportableSeverity(severity); 36 | gLogWarning.setReportableSeverity(severity); 37 | gLogError.setReportableSeverity(severity); 38 | gLogFatal.setReportableSeverity(severity); 39 | } 40 | } // namespace sample 41 | -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/common/logger.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #ifndef LOGGER_H 18 | #define LOGGER_H 19 | 20 | #include "logging.h" 21 | 22 | class SampleErrorRecorder; 23 | extern SampleErrorRecorder gRecorder; 24 | namespace sample 25 | { 26 | extern Logger gLogger; 27 | extern LogStreamConsumer gLogVerbose; 28 | extern LogStreamConsumer gLogInfo; 29 | extern LogStreamConsumer gLogWarning; 30 | extern LogStreamConsumer gLogError; 31 | extern LogStreamConsumer gLogFatal; 32 | 33 | void setReportableSeverity(Logger::Severity severity); 34 | } // namespace sample 35 | 36 | #endif // LOGGER_H 37 | -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/common/nmsHelper.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include "cuda_fp16.h" 18 | #include "plugin.h" 19 | #include 20 | 21 | using namespace nvinfer1; 22 | using namespace nvinfer1::plugin; 23 | 24 | size_t detectionForwardBBoxDataSize(int N, int C1, DataType DT_BBOX) 25 | { 26 | if (DT_BBOX == DataType::kFLOAT) 27 | { 28 | return N * C1 * sizeof(float); 29 | } 30 | if (DT_BBOX == DataType::kHALF) 31 | { 32 | return N * C1 * sizeof(__half); 33 | } 34 | 35 | printf("Only FP32/FP16 type bounding boxes are supported.\n"); 36 | return (size_t) -1; 37 | } 38 | 39 | size_t detectionForwardBBoxPermuteSize(bool shareLocation, int N, int C1, DataType DT_BBOX) 40 | { 41 | if (DT_BBOX == DataType::kFLOAT) 42 | { 43 | return shareLocation ? 0 : N * C1 * sizeof(float); 44 | } 45 | if (DT_BBOX == DataType::kHALF) 46 | { 47 | return shareLocation ? 0 : N * C1 * sizeof(__half); 48 | } 49 | 50 | printf("Only FP32/FP16 type bounding boxes are supported.\n"); 51 | return (size_t) -1; 52 | } 53 | 54 | size_t detectionForwardLandmarkDataSize(int N, int C3, DataType DT_BBOX) 55 | { 56 | if (DT_BBOX == DataType::kFLOAT) 57 | { 58 | return N * C3 * sizeof(float); 59 | } 60 | if (DT_BBOX == DataType::kHALF) 61 | { 62 | return N * C3 * sizeof(__half); 63 | } 64 | 65 | printf("Only FP32/FP16 type bounding boxes are supported.\n"); 66 | return (size_t) -1; 67 | } 68 | 69 | size_t detectionForwardLandmarkPermuteSize(bool shareLocation, int N, int C3, DataType DT_BBOX) 70 | { 71 | if (DT_BBOX == DataType::kFLOAT) 72 | { 73 | return shareLocation ? 0 : N * C3 * sizeof(float); 74 | } 75 | if (DT_BBOX == DataType::kHALF) 76 | { 77 | return shareLocation ? 
0 : N * C3 * sizeof(__half); 78 | } 79 | 80 | printf("Only FP32/FP16 type bounding boxes are supported.\n"); 81 | return (size_t) -1; 82 | } 83 | 84 | size_t detectionForwardPreNMSSize(int N, int C2) 85 | { 86 | ASSERT(sizeof(float) == sizeof(int)); 87 | return N * C2 * sizeof(float); 88 | } 89 | 90 | size_t detectionForwardPostNMSSize(int N, int numClasses, int topK) 91 | { 92 | ASSERT(sizeof(float) == sizeof(int)); 93 | return N * numClasses * topK * sizeof(float); 94 | } 95 | -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/common/nmsUtils.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | #ifndef TRT_NMS_UTILS_H 17 | #define TRT_NMS_UTILS_H 18 | 19 | #include "plugin.h" 20 | 21 | using namespace nvinfer1; 22 | using namespace nvinfer1::plugin; 23 | 24 | size_t detectionInferenceWorkspaceSize(bool shareLocation, int N, int C1, int C2, int numClasses, int numPredsPerClass, 25 | int topK, DataType DT_BBOX, DataType DT_SCORE); 26 | size_t detectionInferenceWorkspaceSizeCustom(bool shareLocation, int N, int C1, int C2, int C3, int numClasses, int numPredsPerClass, 27 | int topK, DataType DT_BBOX, DataType DT_SCORE); 28 | #endif 29 | -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/common/reducedMathPlugin.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | #include 17 | namespace nvinfer1 18 | { 19 | namespace plugin 20 | { 21 | namespace detail 22 | { 23 | 24 | // Count leading zeros - start from most significant bit. 
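// clz(0) returns 32; otherwise it counts the zero bits above the most significant set bit,
// which lets find_log_2 below compute floor(log2(x)) as 31 - clz(x).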
25 | int clz(int x) 26 | { 27 | for (int i = 31; i >= 0; --i) 28 | { 29 | if ((1U << i) & x) 30 | { 31 | return 31 - i; 32 | } 33 | } 34 | return 32; 35 | } 36 | 37 | #define CUDNN_IS_POW_2(x) (0 == ((x) & ((x) -1))) 38 | 39 | int find_log_2(int x, bool round_up = false) 40 | { 41 | int a = 31 - clz(x); 42 | if (round_up) 43 | { 44 | a += !CUDNN_IS_POW_2(x); 45 | } 46 | return a; 47 | } 48 | 49 | void findDivisor(int denom, 50 | unsigned int& mul_coeff, unsigned int& shift_coeff) 51 | { 52 | if (denom == 0) 53 | { 54 | return; 55 | } 56 | if (denom == 1) 57 | { 58 | // if dividing by 1, reduced math doesn't work because mul_coeff would 59 | // need to be 2^32, which doesn't fit into unsigned int. the div() 60 | // routine handles this special case separately. 61 | mul_coeff = 0; 62 | shift_coeff = 0; 63 | return; 64 | } 65 | // To express the division N/D in terms of a multiplication, what we first 66 | // imagine is simply N*(1/D). However, 1/D will always evaluate to 0 (for D>1), 67 | // so we need another way. There's nothing that says we have to use exactly 68 | // the fraction 1/D; instead it could be any X/Y that reduces to 1/D (i.e., 69 | // Y=X*D), or at least to "close enough" to it. If we pick Y that is a power 70 | // of two, then the N*(X/Y) can be N*X followed by a right-shift by some amount. 71 | // The power of two we should pick should be at least 2^32, because in the 72 | // div() routine we'll use umulhi(), which returns only the upper 32 bits -- 73 | // this being equivalent to a right-shift by 32. But we might want a higher 74 | // power of two for better accuracy depending on the magnitude of the denominator. 75 | // Once we've picked Y, then X [our mul_coeff value] is simply Y/D, rounding up, 76 | // and we save shift_coeff as whatever further shift we have to do beyond 77 | // what the umulhi() implies. 78 | uint32_t p = 31 + find_log_2(denom, true); 79 | uint32_t m = ((1ull << p) + (uint32_t) denom - 1) / (uint32_t) denom; 80 | mul_coeff = m; 81 | shift_coeff = p - 32; 82 | } 83 | 84 | } // namespace detail 85 | 86 | } // namespace plugin 87 | 88 | } // namespace nvinfer1 89 | -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/common/serialize.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | #pragma once 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | #include 24 | using std::cerr; 25 | using std::cout; 26 | using std::endl; 27 | 28 | template 29 | inline void serialize_value(void** buffer, T const& value); 30 | 31 | template 32 | inline void deserialize_value(void const** buffer, size_t* buffer_size, T* value); 33 | 34 | namespace 35 | { 36 | 37 | template 38 | struct Serializer 39 | { 40 | }; 41 | 42 | template 43 | struct Serializer::value || std::is_enum::value || std::is_pod::value>::type> 45 | { 46 | static size_t serialized_size(T const&) 47 | { 48 | return sizeof(T); 49 | } 50 | static void serialize(void** buffer, T const& value) 51 | { 52 | ::memcpy(*buffer, &value, sizeof(T)); 53 | reinterpret_cast(*buffer) += sizeof(T); 54 | } 55 | static void deserialize(void const** buffer, size_t* buffer_size, T* value) 56 | { 57 | assert(*buffer_size >= sizeof(T)); 58 | ::memcpy(value, *buffer, sizeof(T)); 59 | reinterpret_cast(*buffer) += sizeof(T); 60 | *buffer_size -= sizeof(T); 61 | } 62 | }; 63 | 64 | template <> 65 | struct Serializer 66 | { 67 | static size_t serialized_size(const char* value) 68 | { 69 | return strlen(value) + 1; 70 | } 71 | static void serialize(void** buffer, const char* value) 72 | { 73 | ::strcpy(static_cast(*buffer), value); 74 | reinterpret_cast(*buffer) += strlen(value) + 1; 75 | } 76 | static void deserialize(void const** buffer, size_t* buffer_size, const char** value) 77 | { 78 | *value = static_cast(*buffer); 79 | size_t data_size = strnlen(*value, *buffer_size) + 1; 80 | assert(*buffer_size >= data_size); 81 | reinterpret_cast(*buffer) += data_size; 82 | *buffer_size -= data_size; 83 | } 84 | }; 85 | 86 | template 87 | struct Serializer, 88 | typename std::enable_if::value || std::is_enum::value || std::is_pod::value>::type> 89 | { 90 | static size_t serialized_size(std::vector const& value) 91 | { 92 | return sizeof(value.size()) + value.size() * sizeof(T); 93 | } 94 | static void serialize(void** buffer, std::vector const& value) 95 | { 96 | serialize_value(buffer, value.size()); 97 | size_t nbyte = value.size() * sizeof(T); 98 | ::memcpy(*buffer, value.data(), nbyte); 99 | reinterpret_cast(*buffer) += nbyte; 100 | } 101 | static void deserialize(void const** buffer, size_t* buffer_size, std::vector* value) 102 | { 103 | size_t size; 104 | deserialize_value(buffer, buffer_size, &size); 105 | value->resize(size); 106 | size_t nbyte = value->size() * sizeof(T); 107 | assert(*buffer_size >= nbyte); 108 | ::memcpy(value->data(), *buffer, nbyte); 109 | reinterpret_cast(*buffer) += nbyte; 110 | *buffer_size -= nbyte; 111 | } 112 | }; 113 | 114 | } // namespace 115 | 116 | template 117 | inline size_t serialized_size(T const& value) 118 | { 119 | return Serializer::serialized_size(value); 120 | } 121 | 122 | template 123 | inline void serialize_value(void** buffer, T const& value) 124 | { 125 | return Serializer::serialize(buffer, value); 126 | } 127 | 128 | template 129 | inline void deserialize_value(void const** buffer, size_t* buffer_size, T* value) 130 | { 131 | return Serializer::deserialize(buffer, buffer_size, value); 132 | } 133 | -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/gatherNMSCustomOutputs.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | #ifndef TRT_BATCHED_NMS_HELPER_H 17 | #define TRT_BATCHED_NMS_HELPER_H 18 | #include "plugin.h" 19 | using namespace nvinfer1; 20 | using namespace nvinfer1::plugin; 21 | 22 | pluginStatus_t gatherNMSCustomOutputs(cudaStream_t stream, bool shareLocation, int numImages, int numPredsPerClass, 23 | int numClasses, int topK, int keepTopK, DataType DT_BBOX, DataType DT_SCORE, const void* indices, 24 | const void* scores, const void* bboxData, const void* landData, void* keepCount, void* nmsedBoxes, void* nmsedScores, void* nmsedClasses, void* nmsedLandmarks, 25 | bool clipBoxes, const float scoreShift); 26 | 27 | #endif 28 | -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/nvdsparsebbox_Yolo.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/nvdsparsebbox_Yolo.o -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-scrfd/parser_scrfd.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import pyds 3 | import ctypes 4 | import numpy as np 5 | 6 | def layer_finder(output_layer_info, name): 7 | """ Return the layer contained in output_layer_info which corresponds 8 | to the given name. 9 | """ 10 | for layer in output_layer_info: 11 | # dataType == 0 <=> dataType == FLOAT 12 | # print(layer.layerName) 13 | if layer.dataType == 0 and layer.layerName == name: 14 | return layer 15 | return None 16 | 17 | 18 | def clip(x): 19 | return min(max(0.0, x), 1.0) 20 | 21 | def make_object(index, layers, default_classId = 1): 22 | """ Creates a NvDsInferObjectDetectionInfo object from one layer of SSD. 23 | Return None if the class Id is invalid, if the detection confidence 24 | is under the threshold or if the width/height of the bounding box is 25 | null/negative. 26 | Return the created NvDsInferObjectDetectionInfo object otherwise. 
27 | """ 28 | box_layer, score_layer = layers 29 | res = pyds.NvDsInferObjectDetectionInfo() 30 | res.detectionConfidence = score_layer[index] 31 | res.classId = default_classId 32 | 33 | rect_x1_f = box_layer[index][0] 34 | rect_y1_f = box_layer[index][1] 35 | rect_x2_f = box_layer[index][2] 36 | rect_y2_f = box_layer[index][3] 37 | res.left = clip(rect_x1_f) 38 | res.top = clip(rect_y1_f) 39 | res.width = clip(rect_x2_f - rect_x1_f) 40 | res.height = clip(rect_y2_f - rect_y1_f) 41 | 42 | return res 43 | 44 | def nvds_infer_parse_scrfd(output_layer_info, input_size): 45 | """ Get data from output_layer_info and fill object_list 46 | num_detections: [1] 47 | nmsed_bboxes: [200, 4] 48 | nmsed_scores: [200] 49 | nmsed_classes: [200] 50 | nmsed_landmarks:[200, 10] 51 | """ 52 | num_detection_layer = output_layer_info[0] 53 | box_layer = output_layer_info[1] 54 | score_layer = output_layer_info[2] 55 | class_layer = output_layer_info[3] 56 | landmark_layer = output_layer_info[4] 57 | 58 | # if not num_detection_layer or not score_layer or not class_layer or not box_layer or not landmark_layer: 59 | # sys.stderr.write("ERROR: some layers missing in output tensors\n") 60 | # return [] 61 | 62 | ptr = ctypes.cast(pyds.get_ptr(num_detection_layer.buffer), ctypes.POINTER(ctypes.c_int32)) 63 | num_detection = np.ctypeslib.as_array(ptr, shape=(1,))[0] 64 | object_list = [] 65 | landmark_list = [] 66 | 67 | if num_detection > 0: 68 | ptr = ctypes.cast(pyds.get_ptr(box_layer.buffer), ctypes.POINTER(ctypes.c_float)) 69 | box_result = np.ctypeslib.as_array(ptr, shape=(200,4)) 70 | 71 | # Normalize 72 | box_result = box_result.astype('float32') 73 | box_result[:, 0] /= input_size[0] 74 | box_result[:, 1] /= input_size[1] 75 | box_result[:, 2] /= input_size[0] 76 | box_result[:, 3] /= input_size[1] 77 | 78 | ptr = ctypes.cast(pyds.get_ptr(score_layer.buffer), ctypes.POINTER(ctypes.c_float)) 79 | score_result = np.ctypeslib.as_array(ptr, shape=(200,)) 80 | ptr = ctypes.cast(pyds.get_ptr(landmark_layer.buffer), ctypes.POINTER(ctypes.c_float)) 81 | landmark_result = np.ctypeslib.as_array(ptr, shape=(200,10)) 82 | x3_layers = box_result, score_result 83 | for i in range(num_detection): 84 | obj = make_object(i, x3_layers) 85 | if obj: 86 | object_list.append(obj) 87 | landmark_list.append(landmark_result[i]) 88 | # print(landmark_list) 89 | return object_list, landmark_list -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-yolov4/config_deepstream.txt: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | ################################################################################ 18 | 19 | [application] 20 | enable-perf-measurement=1 21 | perf-measurement-interval-sec=3 22 | #gie-kitti-output-dir=streamscl 23 | 24 | [tiled-display] 25 | enable=1 26 | rows=1 27 | columns=1 28 | width=1280 29 | height=720 30 | gpu-id=0 31 | #(0): nvbuf-mem-default - Default memory allocated, specific to particular platform 32 | #(1): nvbuf-mem-cuda-pinned - Allocate Pinned/Host cuda memory, applicable for Tesla 33 | #(2): nvbuf-mem-cuda-device - Allocate Device cuda memory, applicable for Tesla 34 | #(3): nvbuf-mem-cuda-unified - Allocate Unified cuda memory, applicable for Tesla 35 | #(4): nvbuf-mem-surface-array - Allocate Surface Array memory, applicable for Jetson 36 | nvbuf-memory-type=0 37 | 38 | [source0] 39 | enable=1 40 | #Type - 1=CameraV4L2 2=URI 3=MultiURI 41 | type=3 42 | uri=file:/home/nndam/Desktop/survelliance-videos/capture_0.mp4 43 | num-sources=1 44 | gpu-id=0 45 | # (0): memtype_device - Memory type Device 46 | # (1): memtype_pinned - Memory type Host Pinned 47 | # (2): memtype_unified - Memory type Unified 48 | cudadec-memtype=0 49 | 50 | [sink0] 51 | enable=1 52 | #Type - 1=FakeSink 2=EglSink 3=File 53 | type=2 54 | sync=0 55 | source-id=0 56 | gpu-id=0 57 | nvbuf-memory-type=0 58 | #1=mp4 2=mkv 59 | container=1 60 | #1=h264 2=h265 61 | codec=1 62 | output-file=yolov4.mp4 63 | 64 | [osd] 65 | enable=1 66 | gpu-id=0 67 | border-width=1 68 | text-size=12 69 | text-color=1;1;1;1; 70 | text-bg-color=0.3;0.3;0.3;1 71 | font=Serif 72 | show-clock=0 73 | clock-x-offset=800 74 | clock-y-offset=820 75 | clock-text-size=12 76 | clock-color=1;0;0;0 77 | nvbuf-memory-type=0 78 | 79 | [streammux] 80 | gpu-id=0 81 | ##Boolean property to inform muxer that sources are live 82 | live-source=0 83 | batch-size=1 84 | ##time out in usec, to wait after the first buffer is available 85 | ##to push the batch even if the complete batch is not formed 86 | batched-push-timeout=40000 87 | ## Set muxer output width and height 88 | width=1280 89 | height=720 90 | ##Enable to maintain aspect ratio wrt source, and allow black borders, works 91 | ##along with width, height properties 92 | enable-padding=0 93 | nvbuf-memory-type=0 94 | 95 | # config-file property is mandatory for any gie section. 96 | # Other properties are optional and if set will override the properties set in 97 | # the infer config file. 
98 | [primary-gie] 99 | enable=1 100 | gpu-id=0 101 | labelfile-path=labels.txt 102 | batch-size=1 103 | 104 | #Required by the app for OSD, not a plugin property 105 | bbox-border-color0=1;0;0;1 106 | bbox-border-color1=0;1;1;1 107 | bbox-border-color2=0;0;1;1 108 | bbox-border-color3=0;1;0;1 109 | interval=0 110 | gie-unique-id=1 111 | nvbuf-memory-type=0 112 | config-file=config_yolov4.txt 113 | 114 | [tracker] 115 | enable=1 116 | tracker-width=416 117 | tracker-height=416 118 | ll-lib-file=/opt/nvidia/deepstream/deepstream/lib/libnvds_nvmultiobjecttracker.so 119 | ll-config-file=/opt/nvidia/deepstream/deepstream/samples/configs/deepstream-app/config_tracker_NvDCF_max_perf.yml 120 | enable-batch-process=1 121 | display-tracking-id=1 122 | 123 | [tests] 124 | file-loop=0 -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-yolov4/config_tracker.txt: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # SPDX-FileCopyrightText: Copyright (c) 2019-2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | ################################################################################ 17 | 18 | # Mandatory properties for the tracker: 19 | # tracker-width 20 | # tracker-height: needs to be multiple of 6 for NvDCF 21 | # gpu-id 22 | # ll-lib-file: path to low-level tracker lib 23 | # ll-config-file: required for NvDCF, optional for KLT and IOU 24 | # 25 | [tracker] 26 | tracker-width=608 27 | tracker-height=608 28 | gpu-id=0 29 | ll-lib-file=/opt/nvidia/deepstream/deepstream/lib/libnvds_nvmultiobjecttracker.so 30 | ll-config-file=config_tracker_NvDCF_perf.yml 31 | #enable-past-frame=1 32 | enable-batch-process=1 -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-yolov4/config_yolov4.txt: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | ################################################################################ 18 | 19 | # Following properties are mandatory when engine files are not specified: 20 | # int8-calib-file(Only in INT8), model-file-format 21 | # Caffemodel mandatory properties: model-file, proto-file, output-blob-names 22 | # UFF: uff-file, input-dims, uff-input-blob-name, output-blob-names 23 | # ONNX: onnx-file 24 | # 25 | # Mandatory properties for detectors: 26 | # num-detected-classes 27 | # 28 | # Optional properties for detectors: 29 | # cluster-mode(Default=Group Rectangles), interval(Primary mode only, Default=0) 30 | # custom-lib-path 31 | # parse-bbox-func-name 32 | # 33 | # Mandatory properties for classifiers: 34 | # classifier-threshold, is-classifier 35 | # 36 | # Optional properties for classifiers: 37 | # classifier-async-mode(Secondary mode only, Default=false) 38 | # 39 | # Optional properties in secondary mode: 40 | # operate-on-gie-id(Default=0), operate-on-class-ids(Defaults to all classes), 41 | # input-object-min-width, input-object-min-height, input-object-max-width, 42 | # input-object-max-height 43 | # 44 | # Following properties are always recommended: 45 | # batch-size(Default=1) 46 | # 47 | # Other optional properties: 48 | # net-scale-factor(Default=1), network-mode(Default=0 i.e FP32), 49 | # model-color-format(Default=0 i.e. RGB) model-engine-file, labelfile-path, 50 | # mean-file, gie-unique-id(Default=0), offsets, process-mode (Default=1 i.e. primary), 51 | # custom-lib-path, network-mode(Default=0 i.e FP32) 52 | # 53 | # The values in the config file are overridden by values set through GObject 54 | # properties. 55 | 56 | [property] 57 | gpu-id=0 58 | net-scale-factor=0.0039215697906911373 59 | #0=RGB, 1=BGR 60 | model-color-format=0 61 | model-engine-file=weights/model-1x3x416x416-fp16.engine 62 | labelfile-path=labels.txt 63 | batch-size=1 64 | ## 0=FP32, 1=INT8, 2=FP16 mode 65 | network-mode=2 66 | num-detected-classes=80 67 | gie-unique-id=1 68 | network-type=0 69 | is-classifier=0 70 | ## 0=Group Rectangles, 1=DBSCAN, 2=NMS, 3= DBSCAN+NMS Hybrid, 4 = None(No clustering) 71 | cluster-mode=2 72 | maintain-aspect-ratio=1 73 | custom-lib-path=nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so 74 | parse-bbox-func-name=NvDsInferParseCustomYoloV4 75 | #scaling-filter=0 76 | #scaling-compute-hw=0 77 | 78 | [class-attrs-all] 79 | nms-iou-threshold=0.6 80 | pre-cluster-threshold=0.4 -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-yolov4/exec_backends/__pycache__/trt_backend.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deploy/Deepstream/sample-yolov4/exec_backends/__pycache__/trt_backend.cpython-36.pyc -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-yolov4/exec_backends/trt_backend.py: -------------------------------------------------------------------------------- 1 | import pycuda.driver as cuda 2 | import pycuda.autoinit 3 | import numpy as np 4 | 5 | import tensorrt as trt 6 | 7 | TRT_LOGGER = trt.Logger() 8 | trt.init_libnvinfer_plugins(None, "") 9 | # Simple helper data class that's a little nicer to use than a 2-tuple. 
10 | class HostDeviceMem(object): 11 | def __init__(self, host_mem, device_mem): 12 | self.host = host_mem 13 | self.device = device_mem 14 | 15 | def __str__(self): 16 | return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device) 17 | 18 | def __repr__(self): 19 | return self.__str__() 20 | 21 | # Allocates all buffers required for an engine, i.e. host/device inputs/outputs. 22 | def allocate_buffers(engine, max_boxes, total_classes): 23 | inputs = [] 24 | outputs = [] 25 | bindings = [] 26 | stream = cuda.Stream() 27 | out_shapes = [] 28 | input_shapes = [] 29 | out_names = [] 30 | max_batch_size = engine.get_profile_shape(0, 0)[2][0] 31 | # max_batch_size = 1 32 | for binding in engine: 33 | binding_shape = engine.get_binding_shape(binding) 34 | 35 | # #Fix -1 dimension for proper memory allocation for batch_size > 1 36 | # if binding == 'input': 37 | # max_width = engine.get_profile_shape(0, 0)[2][3] 38 | # max_height = engine.get_profile_shape(0, 0)[2][2] 39 | # size = max_batch_size * max_width * max_height * 3 40 | # elif binding == 'confs': 41 | # size = max_batch_size * max_boxes * (total_classes) 42 | # elif binding == 'boxes': 43 | # size = max_batch_size * max_boxes * (4) 44 | # else: 45 | # raise NotImplementedError("Not support binding: {}".format(binding)) 46 | print(binding, binding_shape) 47 | assert min(binding_shape) > 0, print(binding, binding_shape) 48 | size = 1 49 | for i in range(len(binding_shape)): 50 | size *= binding_shape[i] 51 | 52 | dtype = trt.nptype(engine.get_binding_dtype(binding)) 53 | # Allocate host and device buffers 54 | host_mem = cuda.pagelocked_empty(size, dtype) 55 | device_mem = cuda.mem_alloc(host_mem.nbytes) 56 | # Append the device buffer to device bindings. 57 | bindings.append(int(device_mem)) 58 | # Append to the appropriate list. 59 | if engine.binding_is_input(binding): 60 | inputs.append(HostDeviceMem(host_mem, device_mem)) 61 | input_shapes.append(engine.get_binding_shape(binding)) 62 | else: 63 | outputs.append(HostDeviceMem(host_mem, device_mem)) 64 | #Collect original output shapes and names from engine 65 | out_shapes.append(engine.get_binding_shape(binding)) 66 | out_names.append(binding) 67 | return inputs, outputs, bindings, stream, input_shapes, out_shapes, out_names, max_batch_size 68 | 69 | # This function is generalized for multiple inputs/outputs. 70 | # inputs and outputs are expected to be lists of HostDeviceMem objects. 71 | def do_inference(context, bindings, inputs, outputs, stream): 72 | # Transfer input data to the GPU. 73 | [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs] 74 | # Run inference. 75 | context.execute_async_v2(bindings=bindings, stream_handle=stream.handle) 76 | # Transfer predictions back from the GPU. 77 | [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs] 78 | # Synchronize the stream 79 | stream.synchronize() 80 | # Return only the host outputs. 
81 | return [out.host for out in outputs] 82 | 83 | class TrtModel(object): 84 | def __init__(self, model, max_size, total_classes = 80): 85 | self.engine_file = model 86 | self.engine = None 87 | self.inputs = None 88 | self.outputs = None 89 | self.bindings = None 90 | self.stream = None 91 | self.context = None 92 | self.input_shapes = None 93 | self.out_shapes = None 94 | self.max_batch_size = 1 95 | self.max_size = max_size 96 | self.total_classes = total_classes 97 | 98 | def build(self): 99 | with open(self.engine_file, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime: 100 | self.engine = runtime.deserialize_cuda_engine(f.read()) 101 | # Allocate 102 | self.max_boxes = self.get_number_of_boxes(self.max_size, self.max_size) 103 | self.inputs, self.outputs, self.bindings, self.stream, self.input_shapes, self.out_shapes, self.out_names, self.max_batch_size = \ 104 | allocate_buffers(self.engine, max_boxes = self.max_boxes, total_classes = self.total_classes) 105 | self.context = self.engine.create_execution_context() 106 | self.context.active_optimization_profile = 0 107 | 108 | def get_number_of_boxes(self, im_width, im_height): 109 | # Calculate total boxes (3 detect layers) 110 | assert im_width % 32 == 0 and im_height % 32 == 0 111 | return (int(im_width*im_height/32/32) + int(im_width*im_height/16/16) + int(im_width*im_height/8/8))*3 112 | 113 | def run(self, input, deflatten: bool = True, as_dict = False): 114 | # lazy load implementation 115 | if self.engine is None: 116 | self.build() 117 | 118 | input = np.asarray(input) 119 | batch_size, _, im_height, im_width = input.shape 120 | assert batch_size <= self.max_batch_size 121 | assert max(im_width, im_height) <= self.max_size, "Invalid shape: {}x{}, max shape: {}".format(im_width, im_height, self.max_size) 122 | allocate_place = np.prod(input.shape) 123 | # print('allocate_place', input.shape) 124 | self.inputs[0].host[:allocate_place] = input.flatten(order='C').astype(np.float32) 125 | self.context.set_binding_shape(0, input.shape) 126 | trt_outputs = do_inference( 127 | self.context, bindings=self.bindings, 128 | inputs=self.inputs, outputs=self.outputs, stream=self.stream) 129 | if deflatten: 130 | trt_outputs = [output[:np.prod(shape)].reshape(shape) for output, shape in zip(trt_outputs, self.out_shapes)] 131 | if as_dict: 132 | return {self.out_names[ix]: trt_output[:batch_size] for ix, trt_output in enumerate(trt_outputs)} 133 | return [trt_output[:batch_size] for trt_output in trt_outputs] 134 | -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-yolov4/labels.txt: -------------------------------------------------------------------------------- 1 | person 2 | bicycle 3 | car 4 | motorbike 5 | aeroplane 6 | bus 7 | train 8 | truck 9 | boat 10 | traffic light 11 | fire hydrant 12 | stop sign 13 | parking meter 14 | bench 15 | bird 16 | cat 17 | dog 18 | horse 19 | sheep 20 | cow 21 | elephant 22 | bear 23 | zebra 24 | giraffe 25 | backpack 26 | umbrella 27 | handbag 28 | tie 29 | suitcase 30 | frisbee 31 | skis 32 | snowboard 33 | sports ball 34 | kite 35 | baseball bat 36 | baseball glove 37 | skateboard 38 | surfboard 39 | tennis racket 40 | bottle 41 | wine glass 42 | cup 43 | fork 44 | knife 45 | spoon 46 | bowl 47 | banana 48 | apple 49 | sandwich 50 | orange 51 | broccoli 52 | carrot 53 | hot dog 54 | pizza 55 | donut 56 | cake 57 | chair 58 | sofa 59 | pottedplant 60 | bed 61 | diningtable 62 | toilet 63 | tvmonitor 64 | laptop 65 | mouse 66 | remote 67 | keyboard 
68 | cell phone 69 | microwave 70 | oven 71 | toaster 72 | sink 73 | refrigerator 74 | book 75 | clock 76 | vase 77 | scissors 78 | teddy bear 79 | hair drier 80 | toothbrush -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-yolov4/nvdsinfer_custom_impl_Yolo/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | CUDA_VER?= 18 | ifeq ($(CUDA_VER),) 19 | $(error "CUDA_VER is not set") 20 | endif 21 | CC:= g++ 22 | NVCC:=/usr/local/cuda-$(CUDA_VER)/bin/nvcc 23 | 24 | CFLAGS:= -Wall -std=c++11 -shared -fPIC -Wno-error=deprecated-declarations 25 | CFLAGS+= -I../../includes -I/usr/local/cuda-$(CUDA_VER)/include -I/opt/nvidia/deepstream/deepstream/sources/includes 26 | 27 | LIBS:= -lnvinfer_plugin -lnvinfer -lnvparsers -L/usr/local/cuda-$(CUDA_VER)/lib64 -lcudart -lcublas -lstdc++fs 28 | LFLAGS:= -shared -Wl,--start-group $(LIBS) -Wl,--end-group 29 | 30 | INCS:= $(wildcard *.h) 31 | SRCFILES:= nvdsparsebbox_Yolo.cpp 32 | 33 | TARGET_LIB:= libnvdsinfer_custom_impl_Yolo.so 34 | 35 | TARGET_OBJS:= $(SRCFILES:.cpp=.o) 36 | TARGET_OBJS:= $(TARGET_OBJS:.cu=.o) 37 | 38 | all: $(TARGET_LIB) 39 | 40 | %.o: %.cpp $(INCS) Makefile 41 | $(CC) -c -o $@ $(CFLAGS) $< 42 | 43 | %.o: %.cu $(INCS) Makefile 44 | $(NVCC) -c -o $@ --compiler-options '-fPIC' $< 45 | 46 | $(TARGET_LIB) : $(TARGET_OBJS) 47 | $(CC) -o $@ $(TARGET_OBJS) $(LFLAGS) 48 | 49 | clean: 50 | rm -rf $(TARGET_LIB) 51 | -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-yolov4/nvdsinfer_custom_impl_Yolo/nvdsparsebbox_Yolo.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include "nvdsinfer_custom_impl.h" 25 | 26 | static const int NUM_CLASSES_YOLO = 80; 27 | 28 | float clamp(const float val, const float minVal, const float maxVal) 29 | { 30 | assert(minVal <= maxVal); 31 | return std::min(maxVal, std::max(minVal, val)); 32 | } 33 | 34 | extern "C" bool NvDsInferParseCustomYoloV4( 35 | std::vector const& outputLayersInfo, 36 | NvDsInferNetworkInfo const& networkInfo, 37 | NvDsInferParseDetectionParams const& detectionParams, 38 | std::vector& objectList); 39 | 40 | 41 | /* YOLOv4 implementations */ 42 | static NvDsInferParseObjectInfo convertBBoxYoloV4(const float& bx1, const float& by1, const float& bx2, 43 | const float& by2, const uint& netW, const uint& netH) 44 | { 45 | NvDsInferParseObjectInfo b; 46 | // Restore coordinates to network input resolution 47 | 48 | float x1 = bx1 * netW; 49 | float y1 = by1 * netH; 50 | float x2 = bx2 * netW; 51 | float y2 = by2 * netH; 52 | 53 | x1 = clamp(x1, 0, netW); 54 | y1 = clamp(y1, 0, netH); 55 | x2 = clamp(x2, 0, netW); 56 | y2 = clamp(y2, 0, netH); 57 | 58 | b.left = x1; 59 | b.width = clamp(x2 - x1, 0, netW); 60 | b.top = y1; 61 | b.height = clamp(y2 - y1, 0, netH); 62 | 63 | return b; 64 | } 65 | 66 | static void addBBoxProposalYoloV4(const float bx, const float by, const float bw, const float bh, 67 | const uint& netW, const uint& netH, const int maxIndex, 68 | const float maxProb, std::vector& binfo) 69 | { 70 | NvDsInferParseObjectInfo bbi = convertBBoxYoloV4(bx, by, bw, bh, netW, netH); 71 | if (bbi.width < 1 || bbi.height < 1) return; 72 | 73 | bbi.detectionConfidence = maxProb; 74 | bbi.classId = maxIndex; 75 | binfo.push_back(bbi); 76 | } 77 | 78 | static std::vector 79 | decodeYoloV4Tensor( 80 | const float* boxes, const float* scores, const float* classes, 81 | const uint num_bboxes, NvDsInferParseDetectionParams const& detectionParams, 82 | const uint& netW, const uint& netH) 83 | { 84 | std::vector binfo; 85 | 86 | uint bbox_location = 0; 87 | uint score_location = 0; 88 | for (uint b = 0; b < num_bboxes; ++b) 89 | { 90 | float bx1 = boxes[bbox_location]; 91 | float by1 = boxes[bbox_location + 1]; 92 | float bx2 = boxes[bbox_location + 2]; 93 | float by2 = boxes[bbox_location + 3]; 94 | float maxProb = scores[score_location]; 95 | int maxIndex = (int) classes[score_location]; 96 | 97 | if (maxProb > detectionParams.perClassPreclusterThreshold[maxIndex]) 98 | { 99 | addBBoxProposalYoloV4(bx1, by1, bx2, by2, netW, netH, maxIndex, maxProb, binfo); 100 | } 101 | 102 | bbox_location += 4; 103 | score_location += 1; 104 | } 105 | 106 | return binfo; 107 | } 108 | 109 | extern "C" bool NvDsInferParseCustomYoloV4( 110 | std::vector const& outputLayersInfo, 111 | NvDsInferNetworkInfo const& networkInfo, 112 | NvDsInferParseDetectionParams const& detectionParams, 113 | std::vector& objectList) 114 | { 115 | if (NUM_CLASSES_YOLO != detectionParams.numClassesConfigured) 116 | { 117 | std::cerr << "WARNING: Num classes mismatch. 
Configured:" 118 | << detectionParams.numClassesConfigured 119 | << ", detected by network: " << NUM_CLASSES_YOLO << std::endl; 120 | } 121 | 122 | std::vector objects; 123 | const NvDsInferLayerInfo &n_bboxes = outputLayersInfo[0]; 124 | const NvDsInferLayerInfo &boxes = outputLayersInfo[1]; // (num_boxes, 4) 125 | const NvDsInferLayerInfo &scores = outputLayersInfo[2]; // (num_boxes, ) 126 | const NvDsInferLayerInfo &classes = outputLayersInfo[3]; // (num_boxes, ) 127 | 128 | 129 | int num_bboxes = *(const int*)(n_bboxes.buffer); 130 | 131 | 132 | assert(boxes.inferDims.numDims == 2); 133 | assert(scores.inferDims.numDims == 1); 134 | assert(classes.inferDims.numDims == 1); 135 | 136 | // std::cout << "Network Info: " << networkInfo.height << " " << networkInfo.width << std::endl; 137 | 138 | std::vector outObjs = 139 | decodeYoloV4Tensor( 140 | (const float*)(boxes.buffer), (const float*)(scores.buffer), (const float*)(classes.buffer), num_bboxes, detectionParams, 141 | networkInfo.width, networkInfo.height); 142 | 143 | objects.insert(objects.end(), outObjs.begin(), outObjs.end()); 144 | 145 | objectList = objects; 146 | 147 | return true; 148 | } 149 | /* YOLOv4 implementations end*/ 150 | 151 | 152 | /* Check that the custom function has been defined correctly */ 153 | CHECK_CUSTOM_PARSE_FUNC_PROTOTYPE(NvDsInferParseCustomYoloV4); -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-yolov4/test_images/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deploy/Deepstream/sample-yolov4/test_images/test.png -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-yolov4/test_onnx.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | from exec_backends.trt_backend import TrtModel 4 | 5 | 6 | def preprocess(img, input_size = (416, 416)): 7 | resized_img = cv2.resize(img, (input_size[1], input_size[0])) 8 | resized_img = cv2.cvtColor(resized_img, cv2.COLOR_BGR2RGB) 9 | resized_img = np.expand_dims(resized_img, 0) 10 | resized_img = resized_img.astype('float32') / 255.0 11 | resized_img = np.transpose(resized_img, (0, 3, 1, 2)) 12 | return resized_img 13 | 14 | def visualize(img, bboxes): 15 | height, width, _ = img.shape 16 | bboxes[:, 0] *= width 17 | bboxes[:, 1] *= height 18 | bboxes[:, 2] *= width 19 | bboxes[:, 3] *= height 20 | for x1, y1, x2, y2 in bboxes: 21 | x1 = int(x1) 22 | y1 = int(y1) 23 | x2 = int(x2) 24 | y2 = int(y2) 25 | cv2.rectangle(img, (x1, y1), (x2, y2), (0, 0, 255), 2) 26 | return img 27 | 28 | if __name__ == '__main__': 29 | model_path = 'weights/model-1x3x416x416-fp16.engine' 30 | img_path = 'test_images/test.png' 31 | 32 | model = TrtModel(model_path, max_size = 416) 33 | img = cv2.imread(img_path) 34 | batch = preprocess(img) 35 | 36 | num_detections, bboxes, confs, classes = model.run(batch) 37 | print(num_detections.shape, bboxes.shape, confs.shape, classes.shape) 38 | bboxes = bboxes[0][:num_detections[0][0]] 39 | confs = confs[0][:num_detections[0][0]] 40 | classes = classes[0][:num_detections[0][0]] 41 | print(bboxes) 42 | vis = visualize(img.copy(), bboxes) 43 | cv2.imshow('vis.jpg', vis) 44 | cv2.waitKey(0) 45 | -------------------------------------------------------------------------------- 
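Note: ```test_onnx.py``` above expects ```weights/model-1x3x416x416-fp16.engine```, but this snapshot does not show how that engine is built from the NMS-augmented ONNX written by ```tools/add_nms_plugins.py``` (next file). Below is only a sketch of how such an engine could be produced with the TensorRT Python API (TensorRT 8.x style); the ONNX file name and the handling of a possibly dynamic input come from the defaults in ```add_nms_plugins.py```, not from a build command recorded in the repository.

```python
import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.INFO)
trt.init_libnvinfer_plugins(TRT_LOGGER, "")   # registers BatchedNMSDynamic_TRT

def build_engine(onnx_path, engine_path, size=416, fp16=True):
    builder = trt.Builder(TRT_LOGGER)
    network = builder.create_network(
        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    parser = trt.OnnxParser(network, TRT_LOGGER)
    with open(onnx_path, "rb") as f:
        if not parser.parse(f.read()):
            for i in range(parser.num_errors):
                print(parser.get_error(i))
            raise RuntimeError("ONNX parsing failed")

    config = builder.create_builder_config()
    config.max_workspace_size = 1 << 30                  # 1 GB
    if fp16 and builder.platform_has_fast_fp16:
        config.set_flag(trt.BuilderFlag.FP16)

    # If the graph input is dynamic (e.g. batch = -1), an optimization profile
    # is required; here it is pinned to 1x3x416x416 to match the engine name
    # used by test_onnx.py.
    inp = network.get_input(0)
    if -1 in tuple(inp.shape):
        profile = builder.create_optimization_profile()
        profile.set_shape(inp.name, (1, 3, size, size),
                          (1, 3, size, size), (1, 3, size, size))
        config.add_optimization_profile(profile)

    engine = builder.build_engine(network, config)
    if engine is None:
        raise RuntimeError("Engine build failed")
    with open(engine_path, "wb") as f:
        f.write(engine.serialize())

if __name__ == "__main__":
    build_engine("yolov4_1_3_416_416.onnx.nms.onnx",
                 "weights/model-1x3x416x416-fp16.engine")
```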
/Deploy/Deepstream/sample-yolov4/tools/add_nms_plugins.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | #!/usr/bin/env python3 18 | import onnx_graphsurgeon as gs 19 | import argparse 20 | import onnx 21 | import numpy as np 22 | 23 | def create_and_add_plugin_node(graph, topK, keepTopK): 24 | 25 | batch_size = graph.inputs[0].shape[0] 26 | input_h = graph.inputs[0].shape[2] 27 | input_w = graph.inputs[0].shape[3] 28 | print('batch_size', batch_size) 29 | 30 | tensors = graph.tensors() 31 | boxes_tensor = tensors["boxes"] 32 | confs_tensor = tensors["confs"] 33 | 34 | num_detections = gs.Variable(name="num_detections").to_variable(dtype=np.int32, shape=[-1, 1]) 35 | nmsed_boxes = gs.Variable(name="nmsed_boxes").to_variable(dtype=np.float32, shape=[-1, keepTopK, 4]) 36 | nmsed_scores = gs.Variable(name="nmsed_scores").to_variable(dtype=np.float32, shape=[-1, keepTopK]) 37 | nmsed_classes = gs.Variable(name="nmsed_classes").to_variable(dtype=np.float32, shape=[-1, keepTopK]) 38 | 39 | new_outputs = [num_detections, nmsed_boxes, nmsed_scores, nmsed_classes] 40 | 41 | mns_node = gs.Node( 42 | op="BatchedNMSDynamic_TRT", 43 | attrs=create_attrs(input_h, input_w, topK, keepTopK), 44 | inputs=[boxes_tensor, confs_tensor], 45 | outputs=new_outputs) 46 | 47 | graph.nodes.append(mns_node) 48 | graph.outputs = new_outputs 49 | 50 | return graph.cleanup().toposort() 51 | 52 | 53 | 54 | 55 | def create_attrs(input_h, input_w, topK, keepTopK): 56 | 57 | num_anchors = 3 58 | 59 | h1 = input_h // 8 60 | h2 = input_h // 16 61 | h3 = input_h // 32 62 | 63 | w1 = input_w // 8 64 | w2 = input_w // 16 65 | w3 = input_w // 32 66 | 67 | num_boxes = num_anchors * (h1 * w1 + h2 * w2 + h3 * w3) 68 | 69 | attrs = {} 70 | 71 | attrs["shareLocation"] = 1 72 | attrs["backgroundLabelId"] = -1 73 | attrs["numClasses"] = 80 74 | attrs["topK"] = topK 75 | attrs["keepTopK"] = keepTopK 76 | attrs["scoreThreshold"] = 0.4 77 | attrs["iouThreshold"] = 0.6 78 | attrs["isNormalized"] = 1 79 | attrs["clipBoxes"] = 1 80 | 81 | # 001 is the default plugin version the parser will search for, and therefore can be omitted, 82 | # but we include it here for illustrative purposes. 
83 | attrs["plugin_version"] = "1" 84 | 85 | return attrs 86 | 87 | 88 | def main(): 89 | parser = argparse.ArgumentParser(description="Add batchedNMSPlugin") 90 | parser.add_argument("-f", "--model", help="Path to the ONNX model generated by export_model.py", default="yolov4_1_3_416_416.onnx") 91 | parser.add_argument("-t", "--topK", help="number of bounding boxes for nms", default=2000) 92 | parser.add_argument("-k", "--keepTopK", help="bounding boxes to be kept per image", default=1000) 93 | 94 | args, _ = parser.parse_known_args() 95 | 96 | graph = gs.import_onnx(onnx.load(args.model)) 97 | 98 | graph = create_and_add_plugin_node(graph, int(args.topK), int(args.keepTopK)) 99 | 100 | onnx.save(gs.export_onnx(graph), args.model + ".nms.onnx") 101 | 102 | 103 | if __name__ == '__main__': 104 | main() 105 | -------------------------------------------------------------------------------- /Deploy/NVIDIA/README.md: -------------------------------------------------------------------------------- 1 | # NVIDIA frameworks, platforms, engines, toolkits, blogs, ... 2 | 3 | - [Multi-instance GPU (MIG)](docs/multi_instance_gpu.md) 4 | - [FFMPEG with NVENC NVDEC hardware-acceleration](docs/nvidia_video_sdk.md) -------------------------------------------------------------------------------- /Deploy/NVIDIA/docs/nvidia_video_sdk.md: -------------------------------------------------------------------------------- 1 | # FFMPEG hardware acceleration with Nvidia Video SDK 2 | ## 1. Requirements 3 | - GPU with hardware-acceleration support, check here: https://developer.nvidia.com/video-encode-and-decode-gpu-support-matrix-new 4 |

5 | 6 | Example of NVDEC support 7 |

8 | 9 | - Nvidia Driver 10 | - CUDA Toolkit 11 | 12 | ## 2. Install FFMPEG with hardware acceleration 13 | System Information 14 | - OS: Ubuntu 18.04 15 | - CPU: Intel(R) Xeon(R) X5650 (12M Cache, 2.66 GHz, 6.40 GT/S Intel® QPI) 16 | - NVIDIA GTX 1060 OC 3Gb 17 | 18 | ``` 19 | sudo apt-get install build-essential yasm cmake libtool libc6 libc6-dev unzip wget libnuma1 libnuma-dev libx264-dev libvpx-dev libvorbis-dev 20 | 21 | git clone --branch sdk/11.1 https://git.videolan.org/git/ffmpeg/nv-codec-headers.git 22 | 23 | cd nv-codec-headers && sudo make install && cd .. 24 | 25 | git clone --branch n4.4.3 https://git.ffmpeg.org/ffmpeg.git ffmpeg/ && cd ffmpeg 26 | 27 | ./configure --enable-nonfree --enable-cuda-nvcc --enable-nvenc --enable-cuvid --enable-nvdec --enable-libnpp --extra-cflags=-I/usr/local/cuda/include --extra-ldflags=-L/usr/local/cuda/lib64 --disable-static --enable-shared --enable-libx264 --enable-libvpx --enable-libvorbis --enable-gpl --enable-cuda 28 | 29 | make -j8 30 | 31 | sudo make install 32 | 33 | sudo ldconfig 34 | 35 | ffmpeg --help 36 | ``` 37 | If you meet error about **nvcc**, try to change line 4355 of ```ffmpeg/configure``` to ```nvccflags_default="-gencode arch=compute_35,code=sm_35 -O2"``` 38 | 39 | ## 3. Benchmark 40 | ### 3.1. Convert MPEG-4 to H264 41 | - Public **libx264** 42 | ``` 43 | ffmpeg -y -i test.avi -c:v libx264 test.mp4 44 | 45 | Output #0, mp4, to 'test.mp4': 46 | Metadata: 47 | major_brand : mp42 48 | minor_version : 0 49 | compatible_brands: isommp42 50 | com.android.model: 21121210C 51 | com.android.version: 12 52 | com.android.manufacturer: Xiaomi 53 | encoder : Lavf58.76.100 54 | Stream #0:0(eng): Video: h264 (avc1 / 0x31637661), yuv420p(tv, bt709, progressive), 1920x1080, q=2-31, 30 fps, 15360 tbn (default) 55 | Metadata: 56 | creation_time : 2022-11-23T08:27:41.000000Z 57 | handler_name : VideoHandle 58 | vendor_id : [0][0][0][0] 59 | encoder : Lavc58.134.100 libx264 60 | Side data: 61 | cpb: bitrate max/min/avg: 0/0/0 buffer size: 0 vbv_delay: N/A 62 | Stream #0:1(eng): Audio: aac (LC) (mp4a / 0x6134706D), 48000 Hz, stereo, fltp, 128 kb/s (default) 63 | Metadata: 64 | creation_time : 2022-11-23T08:27:41.000000Z 65 | handler_name : SoundHandle 66 | vendor_id : [0][0][0][0] 67 | encoder : Lavc58.134.100 aac 68 | frame= 4871 fps= 44 q=-1.0 Lsize= 263346kB time=00:02:42.27 bitrate=13294.1kbits/s dup=0 drop=3 speed=1.48x 69 | video:260623kB audio:2550kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.065733% 70 | ``` 71 | - Hardware acceleration 72 | ``` 73 | ffmpeg -y -i test.avi -c:v h264_nvenc test.mp4 74 | 75 | Output #0, mp4, to 'test2.mp4': 76 | Metadata: 77 | major_brand : mp42 78 | minor_version : 0 79 | compatible_brands: isommp42 80 | com.android.model: 21121210C 81 | com.android.version: 12 82 | com.android.manufacturer: Xiaomi 83 | encoder : Lavf58.76.100 84 | Stream #0:0(eng): Video: h264 (Main) (avc1 / 0x31637661), yuv420p(tv, bt709, progressive), 1920x1080, q=2-31, 2000 kb/s, 30 fps, 15360 tbn (default) 85 | Metadata: 86 | creation_time : 2022-11-23T08:27:41.000000Z 87 | handler_name : VideoHandle 88 | vendor_id : [0][0][0][0] 89 | encoder : Lavc58.134.100 h264_nvenc 90 | Side data: 91 | cpb: bitrate max/min/avg: 0/0/2000000 buffer size: 4000000 vbv_delay: N/A 92 | Stream #0:1(eng): Audio: aac (LC) (mp4a / 0x6134706D), 48000 Hz, stereo, fltp, 128 kb/s (default) 93 | Metadata: 94 | creation_time : 2022-11-23T08:27:41.000000Z 95 | handler_name : SoundHandle 96 | vendor_id : [0][0][0][0] 97 | encoder : 
Lavc58.134.100 aac 98 | frame= 4871 fps=269 q=41.0 Lsize= 44291kB time=00:02:42.27 bitrate=2235.9kbits/s dup=0 drop=3 speed=8.95x 99 | video:41583kB audio:2550kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.356228% 100 | [aac @ 0x5590e2d37e00] Qavg: 182.528 101 | ``` 102 | So basically, without care about bitrate, we can increase performance from **1.48x** to **8.95x** with NVIDIA hardware-acceleration 103 | -------------------------------------------------------------------------------- /Deploy/NVIDIA/fig/gpu-mig-overview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deploy/NVIDIA/fig/gpu-mig-overview.jpg -------------------------------------------------------------------------------- /Deploy/NVIDIA/fig/mig_bert.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deploy/NVIDIA/fig/mig_bert.png -------------------------------------------------------------------------------- /Deploy/NVIDIA/fig/support_nvenc_nvdec.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deploy/NVIDIA/fig/support_nvenc_nvdec.png -------------------------------------------------------------------------------- /Deploy/README.md: -------------------------------------------------------------------------------- 1 | # Deploy 2 | Tất cả những thứ liên quan đến Deploy & Deploy engines -------------------------------------------------------------------------------- /Deploy/Transfer-Learning-Toolkit/README.md: -------------------------------------------------------------------------------- 1 | # Transfer-Learning-Toolkit (TLT) from NVIDIA 2 | 3 | - [Yolov4](docs/yolov4.md) 4 | - [Detectnet_V2](docs/detectnet_v2.md) -------------------------------------------------------------------------------- /Deploy/Transfer-Learning-Toolkit/fig/detectnet_v2-inference.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deploy/Transfer-Learning-Toolkit/fig/detectnet_v2-inference.jpg -------------------------------------------------------------------------------- /Deploy/Transfer-Learning-Toolkit/fig/nvidia-retrain-qat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deploy/Transfer-Learning-Toolkit/fig/nvidia-retrain-qat.png -------------------------------------------------------------------------------- /Deploy/Transfer-Learning-Toolkit/fig/yolov4-inference.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deploy/Transfer-Learning-Toolkit/fig/yolov4-inference.png -------------------------------------------------------------------------------- /Deploy/Triton-inference-server/README.md: -------------------------------------------------------------------------------- 1 | # AI-Engineer-Note 2 | 3 | Tất cả những thứ liên quan đến Triton-inference-server 4 | ## Basic 5 | - [1. 
Installing triton-server and triton-client](docs/install.md) 6 | + [1.1. Model management modes (load/unload/reload)](docs/model_management.md) 7 | - [2. An overview of Triton backends](docs/backend.md) 8 | - [3. Basic configuration when deploying a model](docs/model_configuration.md) 9 | - [4. Deploying models](#) 10 | - [4.1 ONNX-runtime](docs/triton_onnx.md) 11 | - [4.2 TensorRT](docs/triton_tensorrt.md) 12 | - [4.3 Pytorch & TorchScript](docs/triton_pytorch.md) 13 | - [4.4 Kaldi (Advanced)](docs/triton_kaldi.md) 14 | - [5. Model Batching](docs/model_batching.md) 15 | - [6. Ensemble models and pre/post processing](docs/model_ensemble.md) 16 | ## Advanced 17 | - [Using the Performance Analyzer tool](docs/perf_analyzer.md) 18 | - [Optimizations](#) 19 | + [Optimizing the Pytorch backend](docs/optimization_pytorch.md) 20 | -------------------------------------------------------------------------------- /Deploy/Triton-inference-server/docs/backend.md: -------------------------------------------------------------------------------- 1 | # Triton backend 2 | 3 | A Triton backend is the component that actually executes a model. A typical backend wraps a deep-learning framework such as Pytorch, Tensorflow, TensorRT, ONNX-runtime or OpenVINO, much as we normally do ourselves when deploying a model (for instance, writing a class that loads the model, warms it up, and handles pre-processing, inference and post-processing, ...). Following the same idea, a ```triton-backend``` bundles the backends of these deep-learning frameworks and exposes APIs so that users can connect to the deep-learning models loaded by ```triton-server```. As of the current release, ```triton-server``` supports the following backends: 4 | - TensorRT (platform: ```tensorrt_plan```) 5 | - Pytorch (platform: ```pytorch_libtorch```) 6 | - ONNX (platform: ```onnxruntime_onnx```) 7 | - Tensorflow (platform: ```tensorflow_savedmodel```) 8 | - Other backends (platform: depends on the backend definition) -------------------------------------------------------------------------------- /Deploy/Triton-inference-server/docs/install.md: -------------------------------------------------------------------------------- 1 | # Install Triton 2 | 3 | This section covers installing and quickly trying out triton-server and triton-client. 4 | 5 | ## 1. Installing triton-server 6 | If **triton-server** is already installed on the machine, you can skip this step and move on to installing and using **triton-client**. Currently the fastest way to use triton-inference-server is the Docker image from NVIDIA NGC; the build-from-source route will be covered another time. 7 | ### 1.1 Install from NVIDIA NGC 8 | ``` 9 | docker pull nvcr.io/nvidia/tritonserver:<xx.yy>-py3 10 | ``` 11 | where ```<xx.yy>``` is the release version, for example 12 | ``` 13 | docker pull nvcr.io/nvidia/tritonserver:21.12-py3 14 | ``` 15 | ### 1.2 Running a test model 16 | Here I will run a wav2vec-base model (which I have already converted to ONNX) on the ONNX-runtime backend. 
The directory structure I set up looks like this: 17 | ```bash 18 | ├── models 19 | │ ├── wav2vec_general_v2 20 | │ │ ├── 1 21 | │ │ │ ├── model.onnx 22 | │ │ ├── config.pbtxt 23 | ``` 24 | File ```config.pbtxt``` 25 | ``` 26 | name: "wav2vec_general_v2" 27 | platform: "onnxruntime_onnx" 28 | max_batch_size : 0 29 | input [ 30 | { 31 | name: "input" 32 | data_type: TYPE_FP32 33 | dims: [1, -1] 34 | } 35 | ] 36 | output [ 37 | { 38 | name: "output" 39 | data_type: TYPE_FP32 40 | dims: [-1, -1, 105] 41 | } 42 | ] 43 | ``` 44 | Run triton-server on GPU 1 (assuming the current working directory is the one that contains the ```models``` directory) 45 | ``` 46 | docker run --gpus device=1 --rm -p8000:8000 -p8001:8001 -p8002:8002 -v $(pwd)/models:/models nvcr.io/nvidia/tritonserver:21.12-py3 tritonserver --model-repository=/models 47 | ``` 48 | Or run triton-server on GPU 1 with shared memory enabled 49 | ``` 50 | docker run --gpus device=1 --rm --ipc=host --shm-size=128m -p8000:8000 -p8001:8001 -p8002:8002 -v $(pwd)/models:/models nvcr.io/nvidia/tritonserver:21.12-py3 tritonserver --model-repository=/models 51 | ``` 52 | 53 | Output 54 |

55 | 56 |

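Before installing the client, a quick way to confirm that the server from section 1.2 actually came up is to query Triton's HTTP endpoints (default port 8000). A minimal sketch, assuming the server runs on localhost and the model is named ```wav2vec_general_v2``` as above:

```python
import requests

base = "http://localhost:8000"

# Server-level readiness
print(requests.get(f"{base}/v2/health/ready").status_code)                    # 200 when ready

# Readiness and metadata of the model deployed in section 1.2
print(requests.get(f"{base}/v2/models/wav2vec_general_v2/ready").status_code)
print(requests.get(f"{base}/v2/models/wav2vec_general_v2").json())
```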
57 | 58 | 59 | ## 2. Installing triton-client 60 | ### 2.1 Basic installation 61 | For the basic use case of calling the server from ```python```, we can install it quickly with ```pip``` 62 | ``` 63 | pip install tritonclient grpcio-tools 64 | ``` 65 | ### 2.2 Advanced installation 66 | Unlike installing with ```pip``` and calling **triton-server** from ```python```, this part is mainly about building from source in order to use the bundled tools such as the **Model Analyzer** and the **Performance Analyzer** 67 | - Install the required Linux packages 68 | ``` 69 | sudo apt-get install curl libcurl4-openssl-dev libb64-dev default-jdk maven 70 | ``` 71 | - Install ```rapidjson``` 72 | ``` 73 | git clone https://github.com/Tencent/rapidjson.git 74 | cd rapidjson 75 | cmake . 76 | make 77 | sudo make install 78 | ``` 79 | - Install one more ```python``` package up front, otherwise the build throws an error halfway through and part of it has to be rebuilt 80 | ``` 81 | python3 -m pip install grpcio-tools 82 | ``` 83 | - Build **triton-client** (the **triton-server** used here is the r21.12 Docker release) 84 | ``` 85 | git clone --recursive https://github.com/triton-inference-server/client.git triton-client 86 | cd triton-client 87 | mkdir build && cd build 88 | cmake -DCMAKE_INSTALL_PREFIX=`pwd`/install -DTRITON_ENABLE_CC_HTTP=ON -DTRITON_ENABLE_CC_GRPC=ON -DTRITON_ENABLE_PERF_ANALYZER=ON -DTRITON_ENABLE_PYTHON_HTTP=ON -DTRITON_ENABLE_PYTHON_GRPC=ON -DTRITON_ENABLE_JAVA_HTTP=ON -DTRITON_ENABLE_GPU=ON -DTRITON_ENABLE_EXAMPLES=ON -DTRITON_ENABLE_TESTS=ON -DTRITON_COMMON_REPO_TAG=r21.12 -DTRITON_THIRD_PARTY_REPO_TAG=r21.12 -DTRITON_CORE_REPO_TAG=r21.12 -DTRITON_BACKEND_REPO_TAG=r21.12 .. 89 | make cc-clients python-clients java-clients 90 | ``` 91 | - The built artifacts then appear under ```triton-client/build/install```, and the one we care about here is ```bin/perf_analyzer``` 92 | ### 2.3 Using triton-client to send requests and read results 93 | See [src/sample_grpc.py](../src/sample_grpc.py) 94 | 95 |
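A minimal sketch of such a gRPC call for the ```wav2vec_general_v2``` model from section 1.2 is shown below. This is an illustrative example rather than the contents of ```src/sample_grpc.py```; the tensor names ```input```/```output``` come from the ```config.pbtxt``` above, and the random audio is only a placeholder.

```python
import numpy as np
import tritonclient.grpc as grpcclient

client = grpcclient.InferenceServerClient(url="localhost:8001")

# Placeholder input: 1 second of fake 16 kHz audio, shape [1, N], FP32
audio = np.random.randn(1, 16000).astype(np.float32)

inputs = [grpcclient.InferInput("input", list(audio.shape), "FP32")]
inputs[0].set_data_from_numpy(audio)
outputs = [grpcclient.InferRequestedOutput("output")]

result = client.infer(model_name="wav2vec_general_v2",
                      inputs=inputs, outputs=outputs)
logits = result.as_numpy("output")   # shape [1, T, 105]
print(logits.shape)
```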
-------------------------------------------------------------------------------- /Deploy/Triton-inference-server/docs/model_batching.md: -------------------------------------------------------------------------------- 1 | # Model Batching 2 | 3 | This section looks at some of the batching mechanisms supported by Triton 4 | 5 | ### Dynamic Batching 6 | Dynamic Batching needs little introduction: concurrently arriving messages are grouped together and inferred as a batch; this mainly aims to increase [throughput](../docs/perf_analyzer.md) (which also increases [latency](../docs/perf_analyzer.md) under the same resource budget) 7 | ``` 8 | dynamic_batching { } 9 | ``` 10 | or additionally configure the maximum time the queue waits for new messages (microseconds) 11 | ``` 12 | dynamic_batching { 13 | max_queue_delay_microseconds: 100 14 | } 15 | ``` 16 | 17 | ### Ragged Batching 18 | -------------------------------------------------------------------------------- /Deploy/Triton-inference-server/docs/model_configuration.md: -------------------------------------------------------------------------------- 1 | # Model Configuration 2 | By default, the configuration has to declare the model's parameters up front: the model name, the platform used (```tensorrt_plan, pytorch_libtorch, tensorflow_savedmodel, ...```), data types and shapes for inputs and outputs, warmup configuration, optimization configuration, ... 3 | ### 1. Basic configuration (minimal model configuration) 4 | By default we do not need to write a configuration for TensorRT, Tensorflow saved-model and ONNX models, because Triton can generate one automatically. For these models, if no ```config.pbtxt``` exists and triton-server is started with ```--strict-model-config=false```, triton-server will generate a basic ```config.pbtxt``` automatically. Alternatively, we can write ```config.pbtxt``` by hand. Here I will write the configuration for the GFPGan Pre-processing, Inference and Post-processing code, all of which use Pytorch. 5 | - Pre-processing 6 | ``` 7 | name: "pre_gfpgan_batch" 8 | platform: "pytorch_libtorch" 9 | max_batch_size: 8 10 | input [ 11 | { 12 | name: "input__0" 13 | data_type: TYPE_UINT8 14 | dims: [-1, -1, 3] 15 | } 16 | ] 17 | output [ 18 | { 19 | name: "output__0" 20 | data_type: TYPE_FP32 21 | dims: [3, -1, -1] 22 | } 23 | ] 24 | ``` 25 | - Inference 26 | ``` 27 | name: "infer_face_restoration_v2.1" 28 | platform: "pytorch_libtorch" 29 | max_batch_size: 8 30 | input [ 31 | { 32 | name: "input__0" 33 | data_type: TYPE_FP32 34 | dims: [3, 512, 512] 35 | } 36 | ] 37 | output [ 38 | { 39 | name: "output__0" 40 | data_type: TYPE_FP32 41 | dims: [3, 512, 512] 42 | } 43 | ] 44 | ``` 45 | - Post-processing 46 | ``` 47 | name: "post_gfpgan_batch" 48 | platform: "pytorch_libtorch" 49 | max_batch_size: 8 50 | input [ 51 | { 52 | name: "input__0" 53 | data_type: TYPE_FP32 54 | dims: [3, -1, -1] 55 | } 56 | ] 57 | output [ 58 | { 59 | name: "output__0" 60 | data_type: TYPE_UINT8 61 | dims: [-1, -1, 3] 62 | } 63 | ] 64 | ``` 65 | 66 | The value **-1** stands for a **dynamic shape** 67 | 68 | Pay attention to the ```max_batch_size``` value: when it is **non-zero**, ```dims``` is interpreted as the shape of **a single input sample** and the model accepts inputs from ```1 x dims``` up to ```max_batch_size x dims``` (dynamic batch); when it is **zero**, ```dims``` is interpreted as the **full input shape** (static batch)
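To make the ```max_batch_size``` rule concrete: with the ```infer_face_restoration_v2.1``` configuration above (```max_batch_size: 8```, ```dims: [3, 512, 512]```), a client may send any request shaped ```[B, 3, 512, 512]``` with ```1 <= B <= 8```. A short illustrative sketch using the gRPC client, assuming the model is served on localhost:

```python
import numpy as np
import tritonclient.grpc as grpcclient

client = grpcclient.InferenceServerClient(url="localhost:8001")

# dims in config.pbtxt describe ONE sample; the batch dim is added by the client
batch = np.random.rand(4, 3, 512, 512).astype(np.float32)   # B = 4 <= max_batch_size

inp = grpcclient.InferInput("input__0", list(batch.shape), "FP32")
inp.set_data_from_numpy(batch)
out = grpcclient.InferRequestedOutput("output__0")

res = client.infer("infer_face_restoration_v2.1", inputs=[inp], outputs=[out])
print(res.as_numpy("output__0").shape)   # (4, 3, 512, 512)
```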
--------------------------------------------------------------------------------
/Deploy/Triton-inference-server/docs/model_ensemble.md:
--------------------------------------------------------------------------------
# Ensemble multiple models and pre/post-processing

This section covers Model Ensemble as a way to handle two situations:
- Building an end-to-end pipeline from 2 or more models (the output of one model is the input of another)
- Integrating pre-processing / post-processing into the pipeline

Note: both situations are solved the same way.
### 1. Problem statement
In my case, I use the GFPGan model with several different datasets, which gives several versions of the model. All of these versions share the **same** **pre-processing** and **post-processing**. The current deployment keeps pre/post-processing on the ```client``` side, which becomes awkward when scaling out. So the question is how to integrate these two steps into triton quickly and flexibly, to cut down on intermediate hand-offs and on the number of requests sent. Triton supports this through **Model Ensemble**. The idea boils down to two bullet points:
- The pre/post-processing steps are each built into a triton model of their own
- An ensemble model is created: ```pre-processing -> infer -> post-processing```. This is not a real model but a ```dataflow``` defined through the model configuration
### 2. Converting the pre/post-processing into models
Take my image pre-processing as an example (numpy & opencv-python, pure CPU):
```
def triton_preprocess(cropped_face):
    rgb = cv2.cvtColor(cropped_face, cv2.COLOR_BGR2RGB)            # BGR to RGB
    rgb = rgb.astype("float32") / 255.0                            # Rescale to [0, 1]
    rgb = (rgb - 0.5)/0.5                                          # Rescale from [0, 1] to [-1, 1]
    rgb = np.expand_dims(rgb, axis = 0)                            # [256, 256, 3] -> [1, 256, 256, 3]
    return np.transpose(rgb, (0, 3, 1, 2))                         # [1, 256, 256, 3] -> [1, 3, 256, 256]

def triton_postprocess(net_out, min_max = (-1, 1)):
    net_out = np.transpose(net_out, (0, 2, 3, 1))                  # [1, 3, 256, 256] -> [1, 256, 256, 3]
    net_out = np.clip(net_out[0], min_max[0], min_max[1])          # [1, 256, 256, 3] -> [256, 256, 3] & clip
    net_out = (net_out - min_max[0]) / (min_max[1] - min_max[0])   # Rescale from [-1, 1] to [0, 1]
    net_out = np.array(net_out * 255.0, dtype = np.uint8)          # Rescale from [0, 1] to [0, 255] as uint8
    return cv2.cvtColor(net_out, cv2.COLOR_RGB2BGR)                # RGB to BGR
```
Convert it to pytorch:
```
class GFPGanPreprocessor(nn.Module):
    def __init__(self):
        super(GFPGanPreprocessor, self).__init__()
    def forward(self, x):
        x = x[:, :, [2, 1, 0]]                  # BGR -> RGB on the channel axis
        x = x / 255.0
        x = (x - 0.5)/0.5
        x = torch.unsqueeze(x, 0)               # [H, W, 3] -> [1, H, W, 3]
        return torch.permute(x, (0, 3, 1, 2))   # [1, H, W, 3] -> [1, 3, H, W]

class GFPGanPostprocessor(nn.Module):
    def __init__(self):
        super(GFPGanPostprocessor, self).__init__()
    def forward(self, x):
        x = torch.permute(x, (0, 2, 3, 1))      # [1, 3, H, W] -> [1, H, W, 3]
        x = torch.clamp(x, -1.0, 1.0)
        x = ((x + 1.0)/2.0*255.0).byte()
        x = x[0]                                # drop the batch dimension, as in the numpy reference
        return x[:, :, [2, 1, 0]]               # RGB -> BGR on the channel axis
```
Use pytorch JIT; if you are not familiar with JIT yet, see this article:
- [Deploying a model with Pytorch (TorchScript) and Triton](./triton_pytorch.md)
```
# JIT
pre_model = GFPGanPreprocessor()
post_model = GFPGanPostprocessor()
pre_model.eval()
post_model.eval()

pre_x = torch.rand((256, 256, 3))
pre_traced_cell = torch.jit.trace(pre_model, (pre_x,), strict=False, check_trace=True)
print(pre_model(pre_x))
print(pre_traced_cell(pre_x))
pre_traced_cell.save('pre_traced_cell.pt')

post_x = torch.rand((1, 3, 256, 256))
post_traced_cell = torch.jit.trace(post_model, (post_x,), strict=False, check_trace=True)
print(post_model(post_x))
print(post_traced_cell(post_x))
post_traced_cell.save('post_traced_cell.pt')
```
The result is two files, ```pre_traced_cell.pt``` and ```post_traced_cell.pt```, which are the pre/post-processing models.
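A quick sanity check is to load the traced files back with ```torch.jit.load``` (a sketch; the 512x512 shape follows the GFPGan deployment above, and the expected post-processing output assumes the batch dimension is dropped as in the module code):
```python
import torch

# Reload the traced modules and check their shapes on dummy data
pre = torch.jit.load('pre_traced_cell.pt')
post = torch.jit.load('post_traced_cell.pt')

face = torch.rand(512, 512, 3)                 # stand-in for an HxWx3 image
net_in = pre(face)
print(net_in.shape)                            # expected: [1, 3, 512, 512]

restored = post(torch.rand(1, 3, 512, 512))
print(restored.shape, restored.dtype)          # expected: [512, 512, 3], torch.uint8
```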
### 3. Pushing the models to triton-server
This step is straightforward: push the two models to triton with the following configurations
- Pre-process
```
name: "pre_gfpgan"
platform: "pytorch_libtorch"
max_batch_size: 0
input [
  {
    name: "input__0"
    data_type: TYPE_UINT8
    dims: [-1, -1, 3]
  }
]
output [
  {
    name: "output__0"
    data_type: TYPE_FP32
    dims: [1, 3, -1, -1]
  }
]
```
- Post-process
```
name: "post_gfpgan"
platform: "pytorch_libtorch"
max_batch_size: 0
input [
  {
    name: "input__0"
    data_type: TYPE_FP32
    dims: [1, 3, -1, -1]
  }
]
output [
  {
    name: "output__0"
    data_type: TYPE_UINT8
    dims: [-1, -1, 3]
  }
]
```
- When pushing to triton, EXPLICIT MODE is recommended, as described in:
  + [Model management modes (load/unload/reload)](./model_management.md)

### 4. Creating the Ensemble Model
Set up the ensemble model with ```raw_image``` as input and ```image_out``` as output
- For pre-processing, ```raw_image``` is fed into ```input__0``` of the ```pre_gfpgan``` model we just loaded onto triton above
- ```pre_gfpgan``` returns ```preprocessed_image```, which is fed into ```input__0``` of ```infer_face_restoration_v2.1```
- The output of ```infer_face_restoration_v2.1```, named ```net_out```, is in turn the input of ```post_gfpgan```
- Finally, the output of ```post_gfpgan``` is ```image_out```, which is also the final output of the ensemble
```
name: "ens_face_restoration_v2.1"
platform: "ensemble"
max_batch_size: 0
input [
  {
    name: "raw_image"
    data_type: TYPE_UINT8
    dims: [-1, -1, 3]
  }
]
output [
  {
    name: "image_out"
    data_type: TYPE_UINT8
    dims: [-1, -1, 3]
  }
]
ensemble_scheduling {
  step [
    {
      model_name: "pre_gfpgan"
      model_version: -1
      input_map {
        key: "input__0"
        value: "raw_image"
      }
      output_map {
        key: "output__0"
        value: "preprocessed_image"
      }
    },
    {
      model_name: "infer_face_restoration_v2.1"
      model_version: -1
      input_map {
        key: "input__0"
        value: "preprocessed_image"
      }
      output_map {
        key: "output__0"
        value: "net_out"
      }
    },
    {
      model_name: "post_gfpgan"
      model_version: -1
      input_map {
        key: "input__0"
        value: "net_out"
      }
      output_map {
        key: "output__0"
        value: "image_out"
      }
    }
  ]
}
```

With the configuration in place, create an **empty** directory ```1``` for the first version and push it to triton-server, and we are done.
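Once the ensemble is loaded, the client only needs a single request for the whole pre -> infer -> post pipeline. A minimal gRPC sketch (the image path and the ```localhost:8001``` URL are placeholders, and the face crop is assumed to already be 512x512 as expected by ```infer_face_restoration_v2.1```):
```python
import cv2
import tritonclient.grpc as grpcclient

client = grpcclient.InferenceServerClient(url="localhost:8001")

face = cv2.imread("face_crop_512.jpg")                         # HxWx3, BGR, uint8
inp = grpcclient.InferInput("raw_image", list(face.shape), "UINT8")
inp.set_data_from_numpy(face)
out = grpcclient.InferRequestedOutput("image_out")

result = client.infer(model_name="ens_face_restoration_v2.1",
                      inputs=[inp], outputs=[out])
restored = result.as_numpy("image_out")                        # HxWx3, BGR, uint8
cv2.imwrite("face_restored.jpg", restored)
```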
--------------------------------------------------------------------------------
/Deploy/Triton-inference-server/docs/model_instance.md:
--------------------------------------------------------------------------------
# Model Instance

When scaling the system up, we want to run multiple instances of a model on one or more GPUs to maximize speed and minimize user-facing latency. In other words, a user request has more workers to choose from, which removes the bottleneck on the inference side. This section therefore covers the Model Instance configuration of ```triton-server```.
### 1. How Model Instances work in triton-server
The Triton architecture allows multiple models, and one or more instances of the same model, to execute in parallel on the system. The system may have zero, one or several GPUs. The figure below illustrates this with 2 models: assuming Triton is currently idle, when 2 requests arrive at the same time, one per model, Triton immediately schedules both onto the GPU and executes them in parallel. On a system without GPUs, scheduling on the CPU runs on threads and depends on the host OS.


By default, if multiple requests for the same model arrive at the same time, Triton schedules them so that only one request is handled at a time.


Triton provides a per-model configuration called **instance-group** that specifies how many executions can run in parallel; each such execution is called an **instance**. By default, Triton creates the **instances** on different GPUs. For example, in the figure below there are 3 instances and 4 incoming requests; the 4th request has to wait until one of the first 3 executions finishes before it can start.
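A sketch of what that looks like in ```config.pbtxt``` (the counts and GPU ids are illustrative):
```
instance_group [
  {
    count: 2          # two instances on each listed GPU
    kind: KIND_GPU
    gpus: [ 0, 1 ]
  }
]
```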


--------------------------------------------------------------------------------
/Deploy/Triton-inference-server/docs/model_management.md:
--------------------------------------------------------------------------------
# Model Management

Triton has 3 model management modes: **NONE** (default), **EXPLICIT** (dynamic) and **POLL**

### NONE Mode (Default)
- Set ```--model-control-mode=none```
- Triton loads every model with its configuration into memory; models that fail to load are skipped and unavailable.
- Changing a model's repository while the server is running has no effect on the running system
- The ```load``` and ```unload``` APIs from ```triton-client``` **cannot** be used
- Pros:
  + Easy to use
- Cons:
  + Hard to customize
  + Adding/removing models **requires** restarting ```triton-server```
### EXPLICIT Mode (Recommended)
- Set ```--model-control-mode=explicit```
- By default triton loads **no** model into memory unless the ```--load-model``` flag is given. So with a default start-up you have to call the ```load``` API **manually** for the models you need
- Models can be ```load```ed and ```unload```ed at will through the ```triton-client``` API
- Changing a model's repository while the server is running does affect the running system: **that model is reloaded**
- Pros:
  + Easy to customize
  + Adding/removing models does **not** require restarting ```triton-server```
- Cons:
  + Takes a bit more effort to learn and use

See the ```Load/Unload/Reload``` model API in Python [here](../src/sample_load_unload.py)
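For reference, the corresponding launch commands look roughly like this (a sketch; the ```/models``` path and the model name are placeholders):
```
# NONE (default): load everything found in the model repository
tritonserver --model-repository=/models

# EXPLICIT: start without loading anything (or preload selected models),
# then load/unload at runtime through the client API
tritonserver --model-repository=/models --model-control-mode=explicit
tritonserver --model-repository=/models --model-control-mode=explicit --load-model=wav2vec_general_v2
```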
### POLL
Word is that it is not recommended in **production**, so I was too lazy to read about it ...
--------------------------------------------------------------------------------
/Deploy/Triton-inference-server/docs/optimization_pytorch.md:
--------------------------------------------------------------------------------
# Optimize Pytorch Backend
When ```triton-server``` starts up and loads models that use the ```pytorch``` backend, you sometimes see messages like:
```
I1227 03:45:06.216251 1 libtorch.cc:1255] TRITONBACKEND_ModelInitialize: license_plate_restoration_square_v1.1 (version 1)
I1227 03:45:06.216786 1 libtorch.cc:251] Optimized execution is enabled for model instance 'license_plate_restoration_square_v1.1'
I1227 03:45:06.216796 1 libtorch.cc:269] Inference Mode is disabled for model instance 'license_plate_restoration_square_v1.1'
I1227 03:45:06.216800 1 libtorch.cc:344] NvFuser is not specified for model instance 'license_plate_restoration_square_v1.1'
```
These messages mean **Inference Mode** and **NvFuser** have not been enabled for speed. This section therefore covers how to tune ```triton-server``` for the ```pytorch``` backend with suitable parameters.

### 1. Inference Mode

**InferenceMode** behaves like **NoGradMode** in that autograd is not used. So in the vast majority of cases, as long as the model is not special (i.e. does not contain operators affected by autograd), we can enable **InferenceMode** in the configuration file as follows:

```
parameters: {
  key: "INFERENCE_MODE"
  value: {
    string_value: "true"
  }
}
```

- Results with **Inference Mode** off (default)
```
Inferences/Second vs. Client p95 Batch Latency
Concurrency: 1, throughput: 46.4 infer/sec, latency 24657 usec
Concurrency: 2, throughput: 53.8 infer/sec, latency 41444 usec
Concurrency: 3, throughput: 54 infer/sec, latency 59257 usec
Concurrency: 4, throughput: 53.4 infer/sec, latency 81955 usec
```
- Results after enabling it (slightly improved)
```
Inferences/Second vs. Client p95 Batch Latency
Concurrency: 1, throughput: 42.6 infer/sec, latency 27506 usec
Concurrency: 2, throughput: 54.4 infer/sec, latency 40857 usec
Concurrency: 3, throughput: 54 infer/sec, latency 60192 usec
Concurrency: 4, throughput: 53.6 infer/sec, latency 81830 usec
```

### 2. NvFuser (CUDA Graph Fuser)
If you have read about **TensorRT Optimization**, the mechanism of **NvFuser** is similar: it simply fuses certain operators together to speed up execution. This fusing mechanism has become very common and is built into most frameworks today.
Enable **NvFuser**:
```
parameters: {
  key: "ENABLE_NVFUSER"
  value: {
    string_value: "true"
  }
}
```

### 3. Other optimization flags
There are a few other **optimization flags** worth trying
```
ENABLE_JIT_EXECUTOR
ENABLE_JIT_PROFILING
ENABLE_TENSOR_FUSER
```
Note that enabling all of the ```optimization flags``` does not necessarily give the best result. The recommendation is to use only **INFERENCE_MODE** by default. Below are the results with all ```optimization flags``` enabled

```
Inferences/Second vs. Client p95 Batch Latency
Concurrency: 1, throughput: 42.2 infer/sec, latency 27052 usec
Concurrency: 2, throughput: 48.2 infer/sec, latency 46771 usec
Concurrency: 3, throughput: 49.8 infer/sec, latency 65506 usec
Concurrency: 4, throughput: 29 infer/sec, latency 189399 usec
```

### 4. Model Instance
Enabling several **instances** improves speed because the incoming stream of requests has more consumers to choose from. However, the ```optimization flags``` sometimes cause errors in this setup, so when running multiple **model instances** it is better to set ```DISABLE_OPTIMIZED_EXECUTION```
```
parameters: {
  key: "DISABLE_OPTIMIZED_EXECUTION"
  value: {
    string_value: "true"
  }
}
```
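Putting the last two points together, a hypothetical ```config.pbtxt``` fragment for a ```pytorch_libtorch``` model running several instances with optimized execution disabled might look like this (a sketch, not a drop-in config):
```
instance_group [
  {
    count: 2
    kind: KIND_GPU
  }
]
parameters: {
  key: "DISABLE_OPTIMIZED_EXECUTION"
  value: {
    string_value: "true"
  }
}
```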
--------------------------------------------------------------------------------
/Deploy/Triton-inference-server/docs/perf_analyzer.md:
--------------------------------------------------------------------------------
# Performance Analyzer Tool
**Performance Analyzer** (```perf_analyzer```) is a tool for analyzing performance from the client side. To use it, **triton-client** has to be built from source as described in:
- [Installing Triton-inference-server](install.md#22-advanced-installation)

A few terms to keep in mind:
- **Throughput**: the rate at which requests are processed (usually requests/s)
- **Latency**: the time it takes for one request to complete

Example: with 1 concurrency the current throughput is 50 requests/s at a latency of 100 ms; raising the number of concurrencies to 2 keeps the throughput the same but latency rises to 200 ms

As an example, let's analyze the ```wav2vec_general_v2``` model I am deploying:
```
perf_analyzer -m wav2vec_general_v2 --percentile=95 --concurrency-range 1:8 --shape input:1,320000
```
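If triton-server is running on another machine, the same measurement can be pointed at it through the protocol and URL flags (a sketch; ```<triton-host>``` is a placeholder):
```
perf_analyzer -m wav2vec_general_v2 -u <triton-host>:8001 -i grpc \
              --percentile=95 --concurrency-range 1:8 --shape input:1,320000
```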


The results give us the **throughput** and **latency** for the **ONNX-runtime** backend

We now change ```config.pbtxt``` to use the optimized **ONNX-TensorRT** path, restart Triton-inference-server and compare the results. Note that reloading the model takes longer, because on every restart triton converts the model from **ONNX** to **TensorRT**
```
optimization { execution_accelerators {
  gpu_execution_accelerator : [ {
    name : "tensorrt"
    parameters { key: "precision_mode" value: "FP32" }
    parameters { key: "max_workspace_size_bytes" value: "1073741824" }
  }]
}}
```

Note that 1073741824 = 1 x 1024 x 1024 x 1024 (bytes) = 1 GB is the default ```workspace``` value; for a **large** model this should be raised, e.g. **4 GB = 4294967296**

Results


So for this model, the TensorRT (FP32) backend gives a significant speed-up (**1.76x**) over the plain ONNX-runtime backend.
*(You can try FP16 as well; it will not stop at that **1.76** figure)*
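For FP16, only the ```precision_mode``` in the accelerator config needs to change; a sketch (not benchmarked here, and it assumes the model is numerically stable in half precision):
```
optimization { execution_accelerators {
  gpu_execution_accelerator : [ {
    name : "tensorrt"
    parameters { key: "precision_mode" value: "FP16" }
    parameters { key: "max_workspace_size_bytes" value: "1073741824" }
  }]
}}
```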
--------------------------------------------------------------------------------
/Deploy/Triton-inference-server/docs/triton_kaldi.md:
--------------------------------------------------------------------------------
# Kaldi ASR with Triton-inference-server
This section covers how to use the Kaldi backend in Triton.
### 1. Build
- Build the docker image
```
git clone https://github.com/NVIDIA/DeepLearningExamples.git
cd DeepLearningExamples/Kaldi/SpeechRecognition
scripts/docker/build.sh
```
- Download the LibriSpeech sample model
```
scripts/docker/launch_download.sh
```
- Start triton-kaldi-server with LibriSpeech
```
scripts/docker/launch_server.sh
```
### 2. Load custom model
This part uses triton to load a customized model.
- Create a new directory inside the current working directory
```
models/infer_asr_kaldi_radio_v1/1
```
where ```infer_asr_kaldi_radio_v1``` is my model's name.
- Run triton from the current directory in ```EXPLICIT``` mode
```
docker run --rm -it \
    --gpus device=0 \
    --shm-size=1g \
    --ulimit memlock=-1 \
    --ulimit stack=67108864 \
    -p8005:8000 \
    -p8006:8001 \
    -p8007:8002 \
    --name trt_server_asr \
    -v $PWD/data:/data \
    -v $PWD/model-repo:/mnt/model-repo \
    -v $PWD/models:/models \
    triton_kaldi_server tritonserver --model-repo=/models --model-control-mode=explicit
```
where ```$PWD/models``` is the directory we just created
- In another screen, copy ```libtriton_kaldi.so```
```
docker ps
docker exec -it <container_id> bash
cp /workspace/model-repo/kaldi_online/1/libtriton_kaldi.so /models/infer_asr_kaldi_radio_v1/
```
- Build the following directory structure (remember to fix the paths inside the ```.conf``` files):
```
├── models
│   ├── infer_asr_kaldi_radio_v1
│   │   ├── 1
│   │   │   ├── conf
│   │   │   │   ├── ivector_extractor.conf
│   │   │   │   ├── mfcc.conf
│   │   │   │   ├── online.conf
│   │   │   │   ├── online_cmvn.conf
│   │   │   │   ├── splice.conf
│   │   │   ├── ivector_extractor
│   │   │   │   ├── final.dubm
│   │   │   │   ├── final.ie
│   │   │   │   ├── final.mat
│   │   │   │   ├── global_cmvn.stats
│   │   │   │   ├── online_cmvn.conf
│   │   │   │   ├── online_cmvn_iextractor
│   │   │   │   ├── splice_opts
│   │   │   ├── final.mdl
│   │   │   ├── global_cmvn.stats
│   │   │   ├── HCLG.fst
│   │   │   ├── words.txt
│   │   ├── config.pbtxt
│   │   ├── libtriton_kaldi.so
```
Note: the file ```/models/infer_asr_kaldi_radio_v1/1/global_cmvn.stats``` is different from ```/models/infer_asr_kaldi_radio_v1/1/ivector_extractor/global_cmvn.stats```
- Load the model into triton via the [gRPC API](../docs/model_management.md)

--------------------------------------------------------------------------------
/Deploy/Triton-inference-server/docs/triton_onnx.md:
--------------------------------------------------------------------------------
# ONNX-runtime with Triton-inference-server

To deploy an ONNX model with ONNX-runtime (besides ONNX-runtime, TensorRT-runtime can be used where supported), set the platform to ```onnxruntime_onnx```; the other basic configuration parameters are similar. I will deploy the ```wav2vec_general_v2``` model as follows:
- In the ```models``` directory, create a ```wav2vec_general_v2``` directory containing the config file and the weights
- Put the weights at ```models/wav2vec_general_v2/1/model.onnx```, where ```1``` is the model version
- Put the config at ```models/wav2vec_general_v2/config.pbtxt```; note that it does not go inside the version directory

```
name: "wav2vec_general_v2"
platform: "onnxruntime_onnx"
max_batch_size : 0
input [
  {
    name: "input"
    data_type: TYPE_FP32
    dims: [1, -1]
  }
]
output [
  {
    name: "output"
    data_type: TYPE_FP32
    dims: [-1, -1, 105]
  }
]
```
- Push the model to triton-server
```
python src/sample_load_unload.py wav2vec_general_v2
```
--------------------------------------------------------------------------------
/Deploy/Triton-inference-server/docs/triton_tensorrt.md:
--------------------------------------------------------------------------------
# TensorRT-runtime with Triton-inference-server

If you want to deploy a model with TensorRT-runtime instead of ONNX-runtime (the model usually has to be converted to ONNX before TensorRT), the weight file **must** be converted with the **exact** TensorRT version used by the triton-inference-server docker image. So we enter the running docker environment as follows:

- Get the ID of the Docker container running triton-inference-server
```
damnguyen@rnd3:~$ docker ps

CONTAINER ID   IMAGE                                   COMMAND                  CREATED        STATUS        PORTS                                                            NAMES
6ef0b4972292   nvcr.io/nvidia/tritonserver:21.12-py3   "/opt/tritonserver/n…"   23 hours ago   Up 23 hours   0.0.0.0:8000-8002->8000-8002/tcp, :::8000-8002->8000-8002/tcp   cranky_hamilton
b09d98350935   quay.io/cloudhut/kowl:master-645e3b4    "./kowl"                 6 days ago     Up 6 days                                                                      gifted_davinci
```
the triton CONTAINER ID is ```6ef0b4972292```
- Run bash inside the triton container
```
damnguyen@rnd3:~$ docker exec -it 6ef0b4972292 bash
root@6ef0b4972292:/opt/tritonserver#
```
- Convert the ONNX model to TensorRT (same syntax as working with a normal TensorRT engine)
```
/usr/src/tensorrt/bin/trtexec --onnx=<model.onnx> --saveEngine=<model.plan>
```
- Deploy the model to triton the same way as for ONNX, using the platform ```tensorrt_plan``` instead of ```onnxruntime_onnx```
  + [Deploying a model with ONNX-runtime and Triton](./triton_onnx.md)
--------------------------------------------------------------------------------
/Deploy/Triton-inference-server/fig/multi_model_exec.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deploy/Triton-inference-server/fig/multi_model_exec.png
--------------------------------------------------------------------------------
/Deploy/Triton-inference-server/fig/multi_model_parallel_exec.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deploy/Triton-inference-server/fig/multi_model_parallel_exec.png
--------------------------------------------------------------------------------
/Deploy/Triton-inference-server/fig/multi_model_serial_exec.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deploy/Triton-inference-server/fig/multi_model_serial_exec.png -------------------------------------------------------------------------------- /Deploy/Triton-inference-server/fig/wav2vec_general_perf_onnx.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deploy/Triton-inference-server/fig/wav2vec_general_perf_onnx.jpg -------------------------------------------------------------------------------- /Deploy/Triton-inference-server/fig/wav2vec_general_perf_tensorrt.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deploy/Triton-inference-server/fig/wav2vec_general_perf_tensorrt.jpg -------------------------------------------------------------------------------- /Deploy/Triton-inference-server/fig/wav2vec_general_start.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deploy/Triton-inference-server/fig/wav2vec_general_start.jpg -------------------------------------------------------------------------------- /Deploy/Triton-inference-server/src/sample_grpc.py: -------------------------------------------------------------------------------- 1 | import tritonclient.grpc as grpcclient 2 | 3 | class TritonModelGRPC: 4 | ''' 5 | Sample model-request triton-inference-server with gRPC 6 | ''' 7 | def __init__(self, 8 | triton_host = 'localhost:8001', # default gRPC port 9 | triton_model_name = 'wav2vec_general_v2', 10 | verbose = False): 11 | print('Init connection from Triton-inference-server') 12 | print('- Host: {}'.format(triton_host)) 13 | print('- Model: {}'.format(triton_model_name)) 14 | self.triton_host = triton_host 15 | self.triton_model_name = triton_model_name 16 | self.model = grpcclient.InferenceServerClient(url=self.triton_host, 17 | verbose=verbose, 18 | ssl=False, 19 | root_certificates=None, 20 | private_key=None, 21 | certificate_chain=None) 22 | if not self.model.is_server_live(): 23 | print("FAILED : is_server_live") 24 | sys.exit(1) 25 | 26 | if not self.model.is_server_ready(): 27 | print("FAILED : is_server_ready") 28 | sys.exit(1) 29 | 30 | if not self.model.is_model_ready("wav2vec_general_v2"): 31 | print("FAILED : is_model_ready") 32 | sys.exit(1) 33 | self.verbose = verbose 34 | 35 | def run(self, feats): 36 | # Input shape must be [-1] 37 | assert len(feats.shape) == 2, "Shape not support: {}".format(feats.shape) 38 | assert feats.shape[0] == 1, "Shape not support: {}".format(feats.shape) 39 | feats_length = feats.shape[-1] 40 | if self.verbose: 41 | print('='*50) 42 | print('- Input shape: [1, {}]'.format(feats_length)) 43 | inputs = [] 44 | outputs = [] 45 | inputs.append(grpcclient.InferInput('input', [1, feats_length], "FP32")) 46 | inputs[0].set_data_from_numpy(feats) 47 | outputs.append(grpcclient.InferRequestedOutput('output')) 48 | if self.verbose: 49 | tik = time.time() 50 | results = self.model.infer( 51 | model_name="wav2vec_general_v2", 52 | inputs=inputs, 53 | outputs=outputs, 54 | client_timeout=None) 55 | if self.verbose: 56 | tok = time.time() 57 | print('- Time cost:', tok - tik) 58 | output = results.as_numpy('output') 59 | return output 60 | 61 | 
-------------------------------------------------------------------------------- /Deploy/Triton-inference-server/src/sample_load_unload.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Example 3 | - Load model 4 | python3 sample_load_unload.py --models emotion_recognition_v1.1 5 | - Unload model 6 | python3 sample_load_unload.py --unload --models emotion_recognition_v1.1 7 | - Load model from file 8 | python3 sample_load_unload.py --path --models model_list.txt 9 | ''' 10 | import argparse 11 | import tritonclient.grpc as grpcclient 12 | 13 | parser = argparse.ArgumentParser(description='Load/Unload model') 14 | parser.add_argument('--models', default="", help='list of model names to load/unload') 15 | parser.add_argument('--unload', action = "store_true", help='load or unload model') 16 | parser.add_argument('--reload', action = "store_true", help='reload model') 17 | parser.add_argument('--path', action = "store_true", help='get list of models from filepath') 18 | parser.add_argument('--url', default="localhost:8001", help='default triton-server URL') 19 | args = parser.parse_args() 20 | 21 | if not args.path: 22 | MODEL_NAMES = args.models.strip().split(',') 23 | else: 24 | MODEL_NAMES = open(args.models).read().strip('\n').split('\n') 25 | URL = args.url 26 | triton_client = grpcclient.InferenceServerClient(url=URL, verbose=True) 27 | triton_client.is_server_live() 28 | triton_client.get_model_repository_index().models 29 | if args.unload: 30 | for MODEL_NAME in MODEL_NAMES: 31 | if triton_client.is_model_ready(MODEL_NAME): 32 | print('UNLOAD: {}'.format(MODEL_NAME)) 33 | triton_client.unload_model(MODEL_NAME) 34 | else: 35 | print('Skip: {}'.format(MODEL_NAME)) 36 | else: 37 | for MODEL_NAME in MODEL_NAMES: 38 | if triton_client.is_model_ready(MODEL_NAME): 39 | if args.reload: 40 | print('RELOAD: {}'.format(MODEL_NAME)) 41 | triton_client.unload_model(MODEL_NAME) 42 | triton_client.load_model(MODEL_NAME) 43 | else: 44 | print('Skip: {}'.format(MODEL_NAME)) 45 | else: 46 | print('LOAD: {}'.format(MODEL_NAME)) 47 | triton_client.load_model(MODEL_NAME) 48 | 49 | 50 | print('='*70) 51 | triton_client.get_model_repository_index().models -------------------------------------------------------------------------------- /Framework/ONNX/README.md: -------------------------------------------------------------------------------- 1 | # AI-Engineer-Howto 2 | 3 | Tất cả những thứ liên quan đến ONNX và ONNX-runtime -------------------------------------------------------------------------------- /Framework/Pytorch/README.md: -------------------------------------------------------------------------------- 1 | # AI-Engineer-Howto 2 | 3 | Tất cả những thứ liên quan đến Pytorch & Pytorch-serving 4 | - [Build Pytorch from source](docs/build_from_source.md) -------------------------------------------------------------------------------- /Framework/Pytorch/docs/build_from_source.md: -------------------------------------------------------------------------------- 1 | # Pytorch 2 | 3 | ## Build pytorch from source (best config for AMD CPU & NVIDIA-GPU) 4 | We will use OpenBLAS instead of MKL & MKLDNN 5 | ``` 6 | # Install anaconda (if not) 7 | curl -O https://repo.anaconda.com/archive/Anaconda3-2020.07-Linux-x86_64.sh 8 | bash Anaconda3-2020.07-Linux-x86_64.sh 9 | source ~/anaconda3/bin/activate 10 | 11 | # Install dependencies 12 | conda create -n myenv_pytorch_1.9 python=3.8 13 | conda activate myenv_pytorch_1.9 14 | conda install astunparse numpy ninja pyyaml 
setuptools cmake cffi typing_extensions future six requests dataclasses
pip install ninja

# Build
git clone --recursive --branch v1.9.1 https://github.com/pytorch/pytorch.git
cd pytorch
export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
USE_NCCL=ON USE_CUDNN=OFF USE_CUDA=ON USE_MKL=OFF USE_MKLDNN=OFF python setup.py install
```
## Compatible with
- TorchVision: 0.10.1
- OpenCV: 4.6.0
- MMCV: 1.3.3
- MMCV Compiler: GCC 9.4
- MMCV CUDA Compiler: 11.3
- MMDetection: 2.7.0+e78eee5

--------------------------------------------------------------------------------
/Framework/TensorRT/README.md:
--------------------------------------------------------------------------------
# TensorRT
- [Convert ONNX model to TensorRT](docs/tutorial.md)
--------------------------------------------------------------------------------
/Framework/TensorRT/docs/tutorial.md:
--------------------------------------------------------------------------------
# AI-Engineer-Howto
## Convert model to TensorRT
### 1. Convert model to ONNX
For convenient deployment, models built with different frameworks should be converted to ONNX first; converting from ONNX to other runtimes, especially TensorRT, is then much easier.
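For PyTorch models the usual route is ```torch.onnx.export```; a minimal sketch with a torchvision ResNet-18 standing in for your own model (the file name, tensor names and the dynamic batch axis are illustrative):
```python
import torch
import torchvision

# Any torch.nn.Module works here; ResNet-18 is just a convenient stand-in
model = torchvision.models.resnet18().eval()
dummy = torch.randn(1, 3, 224, 224)

torch.onnx.export(
    model, dummy, "resnet18.onnx",
    input_names=["input"],
    output_names=["output"],
    opset_version=13,
    dynamic_axes={"input": {0: "batch_size"}, "output": {0: "batch_size"}},  # dynamic batch
)
```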


### 2. Get input/output shape
After converting the model to ONNX, we need to determine the input/output shapes (basically only the input matters) and their parameters. The easiest way is to use [netron](https://netron.app/) to inspect the architecture. For example, the figure below shows an SCRFD face detection model visualized with [netron](https://netron.app/):

- Input:
  - **input.1** (float32): [batch_size, 3, 640, 640], i.e. [-1, 3, 640, 640] (dimensions whose value is **not a concrete number** are dynamic)
- Output:
  - **num_detections** (int32): [-1, 1]
  - **nmsed_boxes** (float32): [-1, 200, 4]
  - **nmsed_scores** (float32): [-1, 200]
  - **nmsed_classes** (float32): [-1, 200]
  - **nmsed_landmarks** (float32): [-1, 200, 10]
### 3. Serialize Engine
First, we need to understand **dynamic** vs **static** with respect to **shape** and **batch**:
- **batch** (batch size): the number of input samples, usually the first dimension of the tensor
- **shape**: the sizes of the tensor's dimensions, including the **batch** dimension
- **dynamic**: can vary
- **static**: fixed

So we have:
- **dynamic batch**: only the **batch** dimension is dynamic and the other dimensions stay fixed; for example [-1, 3, 640, 640] accepts inputs such as [1, 3, 640, 640], [7, 3, 640, 640], ... but not [1, 3, 640, 512], [1, 4, 640, 640], ...
- **static shapes**: only one fixed size is accepted; e.g. [4, 3, 640, 640] accepts only [4, 3, 640, 640], while [7, 4, 640, 512] accepts only [7, 4, 640, 512], ...
- **dynamic shapes**: some of the dimensions are dynamic; e.g. [-1, 3, -1, 32] can accept inputs [4, 3, 214, 32], [12, 3, 320, 32], ...
Usually we only care about **dynamic shapes** and **static shapes**.

To convert (serialize) the model, there are two main modes: **implicitBatch** (default) and **explicitBatch**. The batch_size is by default taken to be the first dimension; in my model above that value is **-1**. If instead my model's input shape were **[1, 3, 640, 640]**, the model would not support **dynamic shapes**, only **static shapes**, i.e. it would accept exactly one input size. There are, however, several ways to convert an ONNX model from **dynamic** to **static** and vice versa.
- **implicitBatch** (default): works with models whose inputs have **static shapes**
- **explicitBatch**: works with models whose inputs have **dynamic shapes**

Example of converting a model with **implicitBatch** (the input shape is already **static** in the ONNX model's metadata):

```
/usr/src/tensorrt/bin/trtexec \
        --implicitBatch \
        --onnx=<model.onnx> \
        --saveEngine=output.plan \
        --device=0 \
        --verbose
```

Example of converting a model with **explicitBatch** (here we must also specify **minShapes**, **optShapes** and **maxShapes** for each input):

```
/usr/src/tensorrt/bin/trtexec \
        --explicitBatch \
        --onnx=<model.onnx> \
        --minShapes=input.1:1x3x640x640 \
        --optShapes=input.1:1x3x640x640 \
        --maxShapes=input.1:4x3x640x640 \
        --saveEngine=output.plan \
        --device=0 \
        --verbose
```
where:
- **saveEngine**: path to the output TensorRT model, usually with a **.plan** or **.trt** extension
- **device**: GPU ID
- **verbose**: print the conversion log
- Syntax for specifying shapes when there are multiple inputs: ```<input_name>:<shape>,<input_name>:<shape>,...```

### 4. Deserialize Engine & Inference
Once we have the TensorRT model (also called the engine file), we need to load it and run inference:
- Install pycuda & the tensorrt python bindings as described [here](https://github.com/NNDam/Retinaface-TensorRT)
- Wrap the model with 3 main functions: allocate_buffers, do_inference, post_process
- See the example repositories for how the model is wrapped

#### 4.1. Allocate buffers
Allocate memory for the inputs & outputs. Note that for a **dynamic shapes** model we must allocate the input and output buffers according to **maxShapes**.

```
# Allocates all buffers required for an engine, i.e. host/device inputs/outputs.
def allocate_buffers(engine):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    out_shapes = []
    input_shapes = []
    out_names = []
    max_batch_size = engine.get_profile_shape(0, 0)[2][0]
    for binding in engine:
        binding_shape = engine.get_binding_shape(binding)
        # Fix -1 dimension for proper memory allocation for batch_size > 1
        if binding_shape[0] == -1: # Dynamic batch size
            binding_shape = (max_batch_size,) + binding_shape[1:]
        size = trt.volume(binding_shape)
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
            input_shapes.append(engine.get_binding_shape(binding))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
            # Collect original output shapes and names from engine
            out_shapes.append(engine.get_binding_shape(binding))
            out_names.append(binding)
    return inputs, outputs, bindings, stream, input_shapes, out_shapes, out_names, max_batch_size
```

#### 4.2. Inference
Run inference: copy the input data from host to the GPU device, execute on the GPU to produce the outputs, then copy the outputs from the GPU device back to the host.

```
def do_inference(context, bindings, inputs, outputs, stream):
    # Transfer input data to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference.
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]
```
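A minimal sketch of how the two helpers above fit together with engine deserialization (it assumes the ```HostDeviceMem``` wrapper from the example repositories, the ```output.plan``` engine built earlier, and a dynamic-batch input binding at index 0):
```python
import numpy as np
import pycuda.autoinit  # noqa: F401 -- creates the CUDA context
import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

# Deserialize the engine file produced by trtexec
with open("output.plan", "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())
context = engine.create_execution_context()

buffers = allocate_buffers(engine)
inputs, outputs, bindings, stream = buffers[0], buffers[1], buffers[2], buffers[3]

# For a dynamic-shape engine the concrete input shape must be set per request
batch = np.random.rand(1, 3, 640, 640).astype(np.float32)   # stand-in for preprocessed data
context.set_binding_shape(0, batch.shape)
np.copyto(inputs[0].host[:batch.size], batch.ravel())

trt_outputs = do_inference(context, bindings, inputs, outputs, stream)
```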
#### 4.3. Post-processing
Reshape the output data as required by the task and apply any other post-processing.

## Example repositories
- [Retinaface](https://github.com/NNDam/Retinaface-TensorRT)
- [vietocr](https://github.com/NNDam/vietocr-tensorrt)
- [yolor](https://github.com/NNDam/yolor)
--------------------------------------------------------------------------------
/Framework/TensorRT/fig/sample_netron_scrfd.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Framework/TensorRT/fig/sample_netron_scrfd.png
--------------------------------------------------------------------------------
/Framework/Tensorflow/README.md:
--------------------------------------------------------------------------------
# AI-Engineer-Howto

Everything related to Tensorflow and Tensorflow-serving
--------------------------------------------------------------------------------
/Linux/README.md:
--------------------------------------------------------------------------------
# Collection of FAQ about CUDA & Linux & apt-packages

Build OpenCV from source 4 | 5 | - [Build OpenCV from source](docs/build_opencv.md) 6 | 7 |
8 | 9 |
Install Math Kernel Library (MKL/BLAS/LAPACK/OPENBLAS)

It is recommended to install all of the math kernel libraries and then compile the framework (e.g. pytorch, mxnet) from source with a custom config for best performance.
11 | Install all LAPACK+BLAS: 12 | 13 | ``` 14 | sudo apt install libjpeg-dev libpng-dev libblas-dev libopenblas-dev libatlas-base-dev liblapack-dev liblapacke-dev gfortran 15 | ``` 16 | 17 | Install MKL: 18 | 19 | ``` 20 | # Get the key 21 | wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB 22 | # now install that key 23 | apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB 24 | # now remove the public key file exit the root shell 25 | rm GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB 26 | # Add to apt 27 | sudo wget https://apt.repos.intel.com/setup/intelproducts.list -O /etc/apt/sources.list.d/intelproducts.list 28 | sudo sh -c 'echo deb https://apt.repos.intel.com/mkl all main > /etc/apt/sources.list.d/intel-mkl.list' 29 | # Install 30 | sudo apt-get update 31 | sudo apt-get install intel-mkl-2020.4-912 32 | ``` 33 | 34 |
35 | 36 |
Fresh install NVIDIA driver (PC/Laptop/Workstation) 37 | 38 | ``` 39 | # Remove old packages 40 | sudo apt-get remove --purge '^nvidia-.*' 41 | sudo apt-get install ubuntu-desktop 42 | sudo apt-get --purge remove "*cublas*" "cuda*" 43 | sudo apt-get --purge remove "*nvidia*" 44 | sudo add-apt-repository --remove ppa:graphics-drivers/ppa 45 | sudo rm /etc/X11/xorg.conf 46 | sudo apt autoremove 47 | sudo reboot 48 | 49 | # After restart 50 | sudo ubuntu-drivers devices 51 | sudo ubuntu-drivers autoinstall 52 | sudo reboot 53 | ``` 54 | 55 |
56 | 57 |
NVIDIA-SMI has failed because it couldn’t communicate with the NVIDIA driver 58 | 59 | First, make sure that you have "Fresh install NVIDIA driver". If not work, try this bellow 60 | 61 | - Make sure the package nvidia-prime is installed: 62 | 63 | ``` 64 | sudo apt install nvidia-prime 65 | ``` 66 | 67 | Afterwards, run 68 | ``` 69 | sudo prime-select nvidia 70 | ``` 71 | 72 | - Make sure that NVIDIA is not in blacklist 73 | 74 | ``` 75 | grep nvidia /etc/modprobe.d/* /lib/modprobe.d/* 76 | ``` 77 | 78 | to find a file containing ```blacklist nvidia``` and remove it, then run 79 | 80 | ``` 81 | sudo update-initramfs -u 82 | ``` 83 | 84 | - If get error ```This PCI I/O region assigned to your NVIDIA device is invalid```: 85 | 86 | ``` 87 | sudo nano /etc/default/grub 88 | ``` 89 | 90 | edit ```GRUB_CMDLINE_LINUX_DEFAULT="quiet splash pci=realloc=off"``` 91 | 92 | ``` 93 | sudo update-grub 94 | sudo reboot 95 | ``` 96 | 97 |
98 | 99 |
Check current CUDA version 100 | 101 | ``` 102 | nvcc --version 103 | ``` 104 | 105 |
106 | 107 |
Check current supported CUDA versions 108 | 109 | ``` 110 | ls /usr/local/ 111 | ``` 112 | 113 |
114 | 115 |
Select GPU devices 116 | 117 | ``` 118 | CUDA_VISIBLE_DEVICES= 119 | CUDA_VISIBLE_DEVICES=0 python abc.py 120 | CUDA_VISIBLE_DEVICES=0 ./sample.sh 121 | CUDA_VISIBLE_DEVICES=0,1,2,3 python abc.py 122 | CUDA_VISIBLE_DEVICES=0,1,2,3 ./sample.sh 123 | ``` 124 | 125 |
126 | 127 |
Switch CUDA version 128 | 129 | ``` 130 | CUDA_VER=11.3 131 | export PATH="/usr/local/cuda-$CUDA_VER/bin:$PATH" 132 | export LD_LIBRARY_PATH=/usr/local/cuda-$CUDA_VER/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}} 133 | ``` 134 | 135 |
136 | 137 |
Check NVENC/NVDEC status

```
nvidia-smi dmon
```
see the **%enc** and **%dec** columns
144 | 145 |
Error with distributed training NCCL (got freezed) 146 | 147 | ``` 148 | export NCCL_P2P_DISABLE="1" 149 | ``` 150 | 151 |
152 | 153 |
Install CMake from source 154 | 155 | ``` 156 | version=3.23 157 | build=2 ## don't modify from here 158 | mkdir ~/temp 159 | cd ~/temp 160 | wget https://cmake.org/files/v$version/cmake-$version.$build.tar.gz 161 | tar -xzvf cmake-$version.$build.tar.gz 162 | cd cmake-$version.$build/ 163 | ./bootstrap 164 | make -j8 165 | sudo make install 166 | ``` 167 | 168 |
169 | 170 |
Install MXNet from source (for AMD CPU & NVIDIA GPU) 171 | 172 | ``` 173 | git clone --recursive --branch 1.9.1 https://github.com/apache/incubator-mxnet.git mxnet 174 | cd mxnet 175 | cp config/linux_gpu.cmake config.cmake 176 | rm -rf build 177 | mkdir -p build && cd build 178 | cmake -DUSE_CUDA=ON -DUSE_CUDNN=OFF -DUSE_MKL_IF_AVAILABLE=OFF -DUSE_MKLDNN=OFF -DUSE_OPENMP=OFF -DUSE_OPENCV=ON -DUSE_BLAS=open .. 179 | make -j32 180 | cd ../python 181 | pip install --user -e . 182 | ``` 183 | 184 |
185 | 186 | 187 |
Tensorflow could not load dynamic library 'cudart64_101.dll'

In this example Tensorflow requires CUDA 10.1: either switch to CUDA 10.1 or pick a Tensorflow version compatible with your CUDA version, see here: https://www.tensorflow.org/install/source#gpu
190 | 191 | ### Computer Vision 192 |
Fix Deepstream (6.2+) FFMPEG OpenCV installation
Fixes errors about undefined references to, or missing, libavcodec, libavutil, libvpx, ...

```
apt-get install --reinstall --no-install-recommends -y libavcodec58 libavcodec-dev libavformat58 libavformat-dev libavutil56 libavutil-dev gstreamer1.0-libav
apt install --reinstall gstreamer1.0-plugins-good
apt install --reinstall libvpx6 libx264-155 libx265-179 libmpg123-0 libmpeg2-4 libmpeg2encpp-2.1-0
gst-inspect-1.0 | grep 264
rm ~/.cache/gstreamer-1.0/registry.x86_64.bin
apt install --reinstall libx264-155
apt-get install gstreamer1.0-libav
apt-get install --reinstall gstreamer1.0-plugins-ugly
```
207 | 208 |
Gstreamer pipeline to convert MP4-MP4 with re-encoding

```
gst-launch-1.0 filesrc location="<input.mp4>" ! qtdemux ! video/x-h264 ! h264parse ! avdec_h264 ! videoconvert ! x264enc ! h264parse ! qtmux ! filesink location=<output.mp4>
```
215 | 216 |
Gstreamer pipeline to convert RTSP-RTMP

```
gst-launch-1.0 rtspsrc location='rtsp://<rtsp-source>' ! rtph264depay ! h264parse ! flvmux ! rtmpsink location='rtmp://<rtmp-destination>'
```
223 | 224 |
Gstreamer pipeline to convert RTSP-RTMP with reducing resolution

```
gst-launch-1.0 rtspsrc location='rtsp://<rtsp-source>' ! rtpbin ! rtph264depay ! h264parse ! avdec_h264 ! videoconvert ! videoscale ! video/x-raw,width=640,height=640 ! x264enc ! h264parse ! flvmux streamable=true ! rtmpsink location='rtmp://<rtmp-destination>'
```
231 | -------------------------------------------------------------------------------- /Linux/docs/build_opencv.md: -------------------------------------------------------------------------------- 1 | # Build OpenCV from source 2 | 3 | ### 1. Install the required dependencies 4 | ``` 5 | sudo apt install build-essential cmake git pkg-config libgtk-3-dev \ 6 | libavcodec-dev libavformat-dev libswscale-dev libv4l-dev \ 7 | libxvidcore-dev libx264-dev libjpeg-dev libpng-dev libtiff-dev \ 8 | gfortran openexr libatlas-base-dev python3-dev python3-numpy \ 9 | libtbb2 libtbb-dev libdc1394-22-dev 10 | sudo apt install libopenblas-dev libopenblas-base 11 | ``` 12 | ### 2. Clone the OpenCV’s and OpenCV contrib repositories 13 | ``` 14 | mkdir ~/opencv_build && cd ~/opencv_build 15 | git clone https://github.com/opencv/opencv.git 16 | git clone https://github.com/opencv/opencv_contrib.git 17 | cd ~/opencv_build/opencv 18 | mkdir build && cd build 19 | ``` 20 | ### 3. Fix OpenBlas search Path: 21 | ``` 22 | https://github.com/opencv/opencv/issues/12957 23 | ``` 24 | and header 25 | ``` 26 | sudo cp /usr/include/lapacke*.h /usr/include/x86_64-linux-gnu/ 27 | ``` 28 | ### 4. Check CPU tags for optimization 29 | ``` 30 | damnguyen@rnd3:~/opencv_build/opencv/build$ lscpu 31 | Architecture: x86_64 32 | CPU op-mode(s): 32-bit, 64-bit 33 | Byte Order: Little Endian 34 | CPU(s): 96 35 | On-line CPU(s) list: 0-95 36 | Thread(s) per core: 2 37 | Core(s) per socket: 24 38 | Socket(s): 2 39 | NUMA node(s): 2 40 | Vendor ID: AuthenticAMD 41 | CPU family: 23 42 | Model: 49 43 | Model name: AMD EPYC 7352 24-Core Processor 44 | Stepping: 0 45 | CPU MHz: 1495.927 46 | CPU max MHz: 2300.0000 47 | CPU min MHz: 1500.0000 48 | BogoMIPS: 4600.06 49 | Virtualization: AMD-V 50 | L1d cache: 32K 51 | L1i cache: 32K 52 | L2 cache: 512K 53 | L3 cache: 16384K 54 | NUMA node0 CPU(s): 0-23,48-71 55 | NUMA node1 CPU(s): 24-47,72-95 56 | Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate sme ssbd ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif umip rdpid overflow_recov succor smca 57 | ``` 58 | This **AMD EPYC 7352 24-Core Processor** support **avx**, **avx2**, **sse4_1**, **sse4_2** 59 | ### 5. 
Config 60 | ``` 61 | cmake -D CMAKE_BUILD_TYPE=RELEASE \ 62 | -D CMAKE_INSTALL_PREFIX=$(python3 -c "import sys; print(sys.prefix)") \ 63 | -D INSTALL_C_EXAMPLES=ON \ 64 | -D INSTALL_PYTHON_EXAMPLES=ON \ 65 | -D OPENCV_GENERATE_PKGCONFIG=ON \ 66 | -D OPENCV_EXTRA_MODULES_PATH=~/opencv_build/opencv_contrib/modules \ 67 | -D WITH_CUDA=OFF \ 68 | -D BUILD_NEW_PYTHON_SUPPORT=ON \ 69 | -D BUILD_opencv_python3=ON \ 70 | -D HAVE_opencv_python3=ON \ 71 | -D OPENCV_PYTHON3_INSTALL_PATH=$(python3 -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())") \ 72 | -D PYTHON_EXECUTABLE=$(which python3) \ 73 | -D BUILD_EXAMPLES=ON -D WITH_FFMPEG=OFF .. 74 | ``` 75 | Remember to check any error with OpenBLAS 76 | ### 6. Build 77 | ``` 78 | make -j8 79 | make install 80 | ``` 81 | ### 7. Verify 82 | ``` 83 | pkg-config --modversion opencv4 84 | python3 -c "import cv2; print(cv2.__version__)" 85 | ``` 86 | --------------------------------------------------------------------------------