├── Deeplearning ├── ComputerVision │ ├── README.md │ └── docs │ │ ├── gated_convolution.md │ │ ├── multihead_attn.md │ │ └── resblock.md └── NLP │ └── README.md ├── Deploy ├── Deepstream │ ├── FAQ.md │ ├── README.md │ ├── sample-ALPR │ │ ├── README.md │ │ ├── config_deepstream.txt │ │ ├── config_lpd.txt │ │ ├── config_lpr.txt │ │ ├── config_tracker.txt │ │ ├── config_vehicletype.txt │ │ ├── config_yolov4.txt │ │ ├── dict.txt │ │ ├── fig │ │ │ ├── lpr_pipeline.png │ │ │ ├── lpr_result1.png │ │ │ └── lpr_result2.png │ │ ├── labels.txt │ │ ├── nvdsinfer_custom_impl_Yolo │ │ │ ├── Makefile │ │ │ └── nvdsparsebbox_Yolo.cpp │ │ └── weights │ │ │ ├── README.md │ │ │ ├── license-plate-detection │ │ │ └── labels.txt │ │ │ ├── license-plate-recognition │ │ │ └── labels.txt │ │ │ └── vehicletypenet │ │ │ └── labels.txt │ ├── sample-scrfd │ │ ├── README.md │ │ ├── config_scrfd.txt │ │ ├── nvdsinfer_custom_impl_Yolo │ │ │ ├── CMakeLists.txt │ │ │ ├── Makefile │ │ │ ├── README.md │ │ │ ├── batchedNMSCustomInference.cu │ │ │ ├── batchedNMSCustomPlugin.cpp │ │ │ ├── batchedNMSCustomPlugin.h │ │ │ ├── batchedNMSCustomPlugin.o │ │ │ ├── cmake │ │ │ │ └── set_ifndef.cmake │ │ │ ├── common │ │ │ │ ├── ErrorRecorder.h │ │ │ │ ├── bboxUtils.h │ │ │ │ ├── checkMacrosPlugin.cpp │ │ │ │ ├── checkMacrosPlugin.h │ │ │ │ ├── common.cuh │ │ │ │ ├── cub_helper.h │ │ │ │ ├── cudaDriverWrapper.cpp │ │ │ │ ├── cudaDriverWrapper.h │ │ │ │ ├── half.h │ │ │ │ ├── kernel.cpp │ │ │ │ ├── kernel.h │ │ │ │ ├── kernels │ │ │ │ │ ├── CMakeLists.txt │ │ │ │ │ ├── allClassNMS.cu │ │ │ │ │ ├── common.cu │ │ │ │ │ ├── decodeBBoxes.cu │ │ │ │ │ ├── nmsLayer.cu │ │ │ │ │ ├── permuteData.cu │ │ │ │ │ ├── reducedMathPlugin.h │ │ │ │ │ ├── sortScoresPerClass.cu │ │ │ │ │ └── sortScoresPerImage.cu │ │ │ │ ├── logger.cpp │ │ │ │ ├── logger.h │ │ │ │ ├── logging.h │ │ │ │ ├── nmsHelper.cpp │ │ │ │ ├── nmsUtils.h │ │ │ │ ├── plugin.h │ │ │ │ ├── reducedMathPlugin.cpp │ │ │ │ └── serialize.hpp │ │ │ ├── gatherNMSCustomOutputs.cu │ │ │ ├── gatherNMSCustomOutputs.h │ │ │ ├── nvdsparsebbox_Yolo.cpp │ │ │ └── nvdsparsebbox_Yolo.o │ │ ├── parser_scrfd.py │ │ └── run_scrfd.py │ └── sample-yolov4 │ │ ├── config_deepstream.txt │ │ ├── config_tracker.txt │ │ ├── config_yolov4.txt │ │ ├── exec_backends │ │ ├── __pycache__ │ │ │ └── trt_backend.cpython-36.pyc │ │ └── trt_backend.py │ │ ├── labels.txt │ │ ├── nvdsinfer_custom_impl_Yolo │ │ ├── Makefile │ │ └── nvdsparsebbox_Yolo.cpp │ │ ├── run_yolov4.py │ │ ├── test_images │ │ └── test.png │ │ ├── test_onnx.py │ │ └── tools │ │ └── add_nms_plugins.py ├── NVIDIA │ ├── README.md │ ├── docs │ │ ├── multi_instance_gpu.md │ │ └── nvidia_video_sdk.md │ └── fig │ │ ├── gpu-mig-overview.jpg │ │ ├── mig_bert.png │ │ └── support_nvenc_nvdec.png ├── README.md ├── Transfer-Learning-Toolkit │ ├── README.md │ ├── docs │ │ ├── detectnet_v2.md │ │ └── yolov4.md │ └── fig │ │ ├── detectnet_v2-inference.jpg │ │ ├── nvidia-retrain-qat.png │ │ └── yolov4-inference.png └── Triton-inference-server │ ├── README.md │ ├── docs │ ├── backend.md │ ├── install.md │ ├── model_batching.md │ ├── model_configuration.md │ ├── model_ensemble.md │ ├── model_instance.md │ ├── model_management.md │ ├── optimization_pytorch.md │ ├── perf_analyzer.md │ ├── triton_kaldi.md │ ├── triton_onnx.md │ ├── triton_pytorch.md │ └── triton_tensorrt.md │ ├── fig │ ├── multi_model_exec.png │ ├── multi_model_parallel_exec.png │ ├── multi_model_serial_exec.png │ ├── wav2vec_general_perf_onnx.jpg │ ├── wav2vec_general_perf_tensorrt.jpg │ └── 
wav2vec_general_start.jpg │ └── src │ ├── sample_grpc.py │ └── sample_load_unload.py ├── Framework ├── ONNX │ └── README.md ├── Pytorch │ ├── README.md │ └── docs │ │ └── build_from_source.md ├── TensorRT │ ├── README.md │ ├── docs │ │ └── tutorial.md │ └── fig │ │ └── sample_netron_scrfd.png └── Tensorflow │ └── README.md ├── Linux ├── README.md └── docs │ └── build_opencv.md └── README.md /Deeplearning/ComputerVision/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deeplearning/ComputerVision/README.md -------------------------------------------------------------------------------- /Deeplearning/ComputerVision/docs/gated_convolution.md: -------------------------------------------------------------------------------- 1 | ## Gated Convolution 2 | 3 | ### 1. Expland 4 | ### 2. Pytorch Implementation 5 | ``` 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | 10 | class GatedConv2dWithActivation(torch.nn.Module): 11 | """ 12 | Gated Convlution layer with activation (default activation:LeakyReLU) 13 | Params: same as conv2d 14 | Input: The feature from last layer "I" 15 | Output:\phi(f(I))*\sigmoid(g(I)) 16 | """ 17 | 18 | def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True,batch_norm=True, activation=torch.nn.LeakyReLU(0.2, inplace=True)): 19 | super(GatedConv2dWithActivation, self).__init__() 20 | self.batch_norm = batch_norm 21 | self.activation = activation 22 | self.conv2d = torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, groups, bias) 23 | self.mask_conv2d = torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, groups, bias) 24 | self.batch_norm2d = torch.nn.BatchNorm2d(out_channels) 25 | self.sigmoid = torch.nn.Sigmoid() 26 | 27 | for m in self.modules(): 28 | if isinstance(m, nn.Conv2d): 29 | nn.init.kaiming_normal_(m.weight) 30 | def gated(self, mask): 31 | return self.sigmoid(mask) 32 | def forward(self, input): 33 | x = self.conv2d(input) 34 | mask = self.mask_conv2d(input) 35 | if self.activation is not None: 36 | x = self.activation(x) * self.gated(mask) 37 | else: 38 | x = x * self.gated(mask) 39 | if self.batch_norm: 40 | return self.batch_norm2d(x) 41 | else: 42 | return x 43 | 44 | class GatedDeConv2dWithActivation(torch.nn.Module): 45 | """ 46 | Gated DeConvlution layer with activation (default activation:LeakyReLU) 47 | resize + conv 48 | Params: same as conv2d 49 | Input: The feature from last layer "I" 50 | Output:\phi(f(I))*\sigmoid(g(I)) 51 | """ 52 | def __init__(self, scale_factor, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True, batch_norm=True,activation=torch.nn.LeakyReLU(0.2, inplace=True)): 53 | super(GatedDeConv2dWithActivation, self).__init__() 54 | self.conv2d = GatedConv2dWithActivation(in_channels, out_channels, kernel_size, stride, padding, dilation, groups, bias, batch_norm, activation) 55 | self.scale_factor = scale_factor 56 | 57 | def forward(self, input): 58 | #print(input.size()) 59 | x = F.interpolate(input, scale_factor=2) 60 | return self.conv2d(x) 61 | 62 | class SNGatedConv2dWithActivation(torch.nn.Module): 63 | """ 64 | Gated Convolution with spetral normalization 65 | """ 66 | def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True, batch_norm=True, 
activation=torch.nn.LeakyReLU(0.2, inplace=True)): 67 | super(SNGatedConv2dWithActivation, self).__init__() 68 | self.conv2d = torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, groups, bias) 69 | self.mask_conv2d = torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, groups, bias) 70 | self.activation = activation 71 | self.batch_norm = batch_norm 72 | self.batch_norm2d = torch.nn.BatchNorm2d(out_channels) 73 | self.sigmoid = torch.nn.Sigmoid() 74 | self.conv2d = torch.nn.utils.spectral_norm(self.conv2d) 75 | self.mask_conv2d = torch.nn.utils.spectral_norm(self.mask_conv2d) 76 | for m in self.modules(): 77 | if isinstance(m, nn.Conv2d): 78 | nn.init.kaiming_normal_(m.weight) 79 | 80 | def gated(self, mask): 81 | return self.sigmoid(mask) 82 | 83 | def forward(self, input): 84 | x = self.conv2d(input) 85 | mask = self.mask_conv2d(input) 86 | if self.activation is not None: 87 | x = self.activation(x) * self.gated(mask) 88 | else: 89 | x = x * self.gated(mask) 90 | if self.batch_norm: 91 | return self.batch_norm2d(x) 92 | else: 93 | return x 94 | 95 | class SNGatedDeConv2dWithActivation(torch.nn.Module): 96 | """ 97 | Gated DeConvlution layer with activation (default activation:LeakyReLU) 98 | resize + conv 99 | Params: same as conv2d 100 | Input: The feature from last layer "I" 101 | Output:\phi(f(I))*\sigmoid(g(I)) 102 | """ 103 | def __init__(self, scale_factor, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True, batch_norm=True, activation=torch.nn.LeakyReLU(0.2, inplace=True)): 104 | super(SNGatedDeConv2dWithActivation, self).__init__() 105 | self.conv2d = SNGatedConv2dWithActivation(in_channels, out_channels, kernel_size, stride, padding, dilation, groups, bias, batch_norm, activation) 106 | self.scale_factor = scale_factor 107 | 108 | def forward(self, input): 109 | #print(input.size()) 110 | x = F.interpolate(input, scale_factor=2) 111 | return self.conv2d(x) 112 | ``` -------------------------------------------------------------------------------- /Deeplearning/ComputerVision/docs/multihead_attn.md: -------------------------------------------------------------------------------- 1 | ## Multi-head Attention Block 2 | 3 | ### 1. Expland 4 | ### 2. Pytorch Implementation 5 | ``` 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | 10 | def Normalize(in_channels): 11 | return torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True) 12 | 13 | class MultiHeadAttnBlock(nn.Module): 14 | def __init__(self, in_channels, head_size=1): 15 | super().__init__() 16 | self.in_channels = in_channels 17 | self.head_size = head_size 18 | self.att_size = in_channels // head_size 19 | assert(in_channels % head_size == 0), 'The size of head should be divided by the number of channels.' 
20 | 21 | self.norm1 = Normalize(in_channels) 22 | self.norm2 = Normalize(in_channels) 23 | 24 | self.q = torch.nn.Conv2d(in_channels, 25 | in_channels, 26 | kernel_size=1, 27 | stride=1, 28 | padding=0) 29 | self.k = torch.nn.Conv2d(in_channels, 30 | in_channels, 31 | kernel_size=1, 32 | stride=1, 33 | padding=0) 34 | self.v = torch.nn.Conv2d(in_channels, 35 | in_channels, 36 | kernel_size=1, 37 | stride=1, 38 | padding=0) 39 | self.proj_out = torch.nn.Conv2d(in_channels, 40 | in_channels, 41 | kernel_size=1, 42 | stride=1, 43 | padding=0) 44 | self.num = 0 45 | 46 | def forward(self, x, y=None): 47 | h_ = x 48 | h_ = self.norm1(h_) 49 | if y is None: 50 | y = h_ 51 | else: 52 | y = self.norm2(y) 53 | 54 | q = self.q(y) 55 | k = self.k(h_) 56 | v = self.v(h_) 57 | 58 | # compute attention 59 | b,c,h,w = q.shape 60 | q = q.reshape(b, self.head_size, self.att_size ,h*w) 61 | q = q.permute(0, 3, 1, 2) # b, hw, head, att 62 | 63 | k = k.reshape(b, self.head_size, self.att_size ,h*w) 64 | k = k.permute(0, 3, 1, 2) 65 | 66 | v = v.reshape(b, self.head_size, self.att_size ,h*w) 67 | v = v.permute(0, 3, 1, 2) 68 | 69 | 70 | q = q.transpose(1, 2) 71 | v = v.transpose(1, 2) 72 | k = k.transpose(1, 2).transpose(2,3) 73 | 74 | scale = int(self.att_size)**(-0.5) 75 | q.mul_(scale) 76 | w_ = torch.matmul(q, k) 77 | w_ = F.softmax(w_, dim=3) 78 | 79 | w_ = w_.matmul(v) 80 | 81 | w_ = w_.transpose(1, 2).contiguous() # [b, h*w, head, att] 82 | w_ = w_.view(b, h, w, -1) 83 | w_ = w_.permute(0, 3, 1, 2) 84 | 85 | w_ = self.proj_out(w_) 86 | 87 | return x+w_ 88 | ``` -------------------------------------------------------------------------------- /Deeplearning/ComputerVision/docs/resblock.md: -------------------------------------------------------------------------------- 1 | ## Resblock 2 | 3 | ### 1. Expland 4 | ### 2. 
Pytorch Implementation 5 | ``` 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | 10 | def Normalize(in_channels): 11 | return torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True) 12 | 13 | class ResnetBlock(nn.Module): 14 | def __init__(self, *, in_channels, out_channels=None, conv_shortcut=False, 15 | dropout, temb_channels=512): 16 | super().__init__() 17 | self.nonlinearity = torch.nn.LeakyReLU(0.2) 18 | self.in_channels = in_channels 19 | out_channels = in_channels if out_channels is None else out_channels 20 | self.out_channels = out_channels 21 | self.use_conv_shortcut = conv_shortcut 22 | 23 | self.norm1 = Normalize(in_channels) 24 | self.conv1 = torch.nn.Conv2d(in_channels, 25 | out_channels, 26 | kernel_size=3, 27 | stride=1, 28 | padding=1) 29 | if temb_channels > 0: 30 | self.temb_proj = torch.nn.Linear(temb_channels, 31 | out_channels) 32 | self.norm2 = Normalize(out_channels) 33 | self.dropout = torch.nn.Dropout(dropout) 34 | self.conv2 = torch.nn.Conv2d(out_channels, 35 | out_channels, 36 | kernel_size=3, 37 | stride=1, 38 | padding=1) 39 | if self.in_channels != self.out_channels: 40 | if self.use_conv_shortcut: 41 | self.conv_shortcut = torch.nn.Conv2d(in_channels, 42 | out_channels, 43 | kernel_size=3, 44 | stride=1, 45 | padding=1) 46 | else: 47 | self.nin_shortcut = torch.nn.Conv2d(in_channels, 48 | out_channels, 49 | kernel_size=1, 50 | stride=1, 51 | padding=0) 52 | 53 | 54 | def forward(self, x, temb): 55 | h = x 56 | h = self.norm1(h) 57 | h = self.nonlinearity(h) 58 | h = self.conv1(h) 59 | 60 | if temb is not None: 61 | h = h + self.temb_proj(nonlinearity(temb))[:,:,None,None] 62 | 63 | h = self.norm2(h) 64 | h = self.nonlinearity(h) 65 | h = self.dropout(h) 66 | h = self.conv2(h) 67 | 68 | if self.in_channels != self.out_channels: 69 | if self.use_conv_shortcut: 70 | x = self.conv_shortcut(x) 71 | else: 72 | x = self.nin_shortcut(x) 73 | 74 | return x+h 75 | ``` -------------------------------------------------------------------------------- /Deeplearning/NLP/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deeplearning/NLP/README.md -------------------------------------------------------------------------------- /Deploy/Deepstream/FAQ.md: -------------------------------------------------------------------------------- 1 | ## FAQ about Deepstream 2 | 3 | -------------------------------------------------------------------------------- /Deploy/Deepstream/README.md: -------------------------------------------------------------------------------- 1 | ## 1. Requirement 2 | ``` 3 | sudo apt install libgirepository1.0-dev libgstreamer1.0-dev 4 | ``` 5 | ``` 6 | sudo apt install \ 7 | libssl1.0.0 \ 8 | libgstreamer1.0-0 \ 9 | gstreamer1.0-tools \ 10 | gstreamer1.0-plugins-good \ 11 | gstreamer1.0-plugins-bad \ 12 | gstreamer1.0-plugins-ugly \ 13 | gstreamer1.0-libav \ 14 | libgstrtspserver-1.0-0 \ 15 | libjansson4=2.11-1 16 | ``` 17 | ## 2. Examples 18 | - [Sample Yolov4](sample-yolov4) 19 | - [Sample ALPR](sample-ALPR) 20 | - [Sample SCRFD Face Detection](sample-scrfd) 21 | -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-ALPR/README.md: -------------------------------------------------------------------------------- 1 | # Deepstream ALPR 2 | 3 |

![LPR pipeline](fig/lpr_pipeline.png)

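The pipeline chains a COCO YOLOv4 primary detector with an NvDCF tracker and three secondary models (vehicle type classification, license plate detection and license plate recognition), as wired together in `config_deepstream.txt`.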
6 | 7 | ## 1. Requirement 8 | - Deepstream 6.0 9 | 10 | ## 2. Run demo 11 | ``` 12 | cd nvdsinfer_custom_impl_Yolo 13 | make 14 | cd .. 15 | deepstream-app -c config_deepstream.txt 16 | ``` 17 |
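Note: the plugin `Makefile` exits with an error when `CUDA_VER` is not set, so pass a value matching the CUDA toolkit installed on your machine when calling `make`; the version below is only an example.
```
# example only: replace 11.4 with your installed CUDA toolkit version
CUDA_VER=11.4 make
```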

![LPR result](fig/lpr_result1.png)

![LPR result](fig/lpr_result2.png)

 21 | 22 | ## 3. Models 23 | ### 3.1 Object detection 24 | - Uses the Darknet COCO YOLOv4 608x608 model 25 | - Converted to ONNX 26 | - [NMS plugin added](../sample-yolov4/tools/add_nms_plugins.py) 27 | - Customized parser: **NvDsInferParseCustomYoloV4** 28 | 29 | ### 3.2 Vehicle Type Net 30 | - Uses the ResNet18 classification model from NVIDIA TAO 31 | - Trained, pruned and INT8-quantized 32 | 33 | ### 3.3 License Plate Detection 34 | - Uses the YOLOv4 model from NVIDIA TAO 35 | - Trained, pruned and INT8-quantized 36 | - Customized parser: **NvDsInferParseCustomYoloV4TLT** 37 | 38 | ### 3.4 License Plate Recognition 39 | - Uses the YOLOv4 model from NVIDIA TAO 40 | - Trained, pruned and INT8-quantized 41 | - Customized parser: **NvDsInferParseCustomYoloV4LPR** (sorts the detected characters and assigns the result to **attributeLabel**) 42 | 43 | ## References 44 | - https://github.com/NVIDIA-AI-IOT/deepstream_lpr_app -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-ALPR/config_deepstream.txt: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License.
16 | # 17 | ################################################################################ 18 | 19 | [application] 20 | enable-perf-measurement=1 21 | perf-measurement-interval-sec=3 22 | #gie-kitti-output-dir=streamscl 23 | 24 | [tiled-display] 25 | enable=1 26 | rows=1 27 | columns=0 28 | width=1280 29 | height=720 30 | gpu-id=0 31 | #(0): nvbuf-mem-default - Default memory allocated, specific to particular platform 32 | #(1): nvbuf-mem-cuda-pinned - Allocate Pinned/Host cuda memory, applicable for Tesla 33 | #(2): nvbuf-mem-cuda-device - Allocate Device cuda memory, applicable for Tesla 34 | #(3): nvbuf-mem-cuda-unified - Allocate Unified cuda memory, applicable for Tesla 35 | #(4): nvbuf-mem-surface-array - Allocate Surface Array memory, applicable for Jetson 36 | nvbuf-memory-type=0 37 | 38 | [source0] 39 | enable=1 40 | #Type - 1=CameraV4L2 2=URI 3=MultiURI 41 | type=3 42 | uri=file:///home/damnguyen/Deploy/deepstream/videos/video_%d.mp4 43 | num-sources=1 44 | gpu-id=0 45 | #drop-frame-interval=2 46 | # (0): memtype_device - Memory type Device 47 | # (1): memtype_pinned - Memory type Host Pinned 48 | # (2): memtype_unified - Memory type Unified 49 | cudadec-memtype=0 50 | 51 | [sink0] 52 | enable=0 53 | #Type - 1=FakeSink 2=EglSink 3=File 54 | type=1 55 | sync=0 56 | source-id=0 57 | gpu-id=0 58 | qos=0 59 | nvbuf-memory-type=0 60 | overlay-id=1 61 | 62 | [sink1] 63 | enable=1 64 | type=3 65 | enc-type=1 66 | #1=mp4 2=mkv 67 | container=1 68 | #1=h264 2=h265 69 | codec=1 70 | sync=0 71 | #iframeinterval=10 72 | bitrate=2000000 73 | output-file=out1.mp4 74 | 75 | 76 | [osd] 77 | enable=1 78 | gpu-id=0 79 | border-width=1 80 | text-size=15 81 | text-color=1;1;1;1; 82 | text-bg-color=0.3;0.3;0.3;1 83 | font=Serif 84 | show-clock=0 85 | clock-x-offset=800 86 | clock-y-offset=820 87 | clock-text-size=12 88 | clock-color=1;0;0;0 89 | nvbuf-memory-type=0 90 | 91 | [streammux] 92 | gpu-id=0 93 | ##Boolean property to inform muxer that sources are live 94 | live-source=0 95 | batch-size=1 96 | ##time out in usec, to wait after the first buffer is available 97 | ##to push the batch even if the complete batch is not formed 98 | batched-push-timeout=40000 99 | ## Set muxer output width and height 100 | width=1920 101 | height=1080 102 | ##Enable to maintain aspect ratio wrt source, and allow black borders, works 103 | ##along with width, height properties 104 | enable-padding=0 105 | nvbuf-memory-type=0 106 | 107 | 108 | [primary-gie] 109 | enable=1 110 | gpu-id=0 111 | labelfile-path=labels.txt 112 | #Required by the app for OSD, not a plugin property 113 | bbox-border-color0=1;0;0;1 114 | bbox-border-color1=0;1;1;1 115 | bbox-border-color2=0;0;1;1 116 | bbox-border-color3=0;1;0;1 117 | gie-unique-id=1 118 | nvbuf-memory-type=0 119 | config-file=config_yolov4.txt 120 | 121 | [tracker] 122 | enable=1 123 | tracker-width=608 124 | tracker-height=608 125 | ll-lib-file=/opt/nvidia/deepstream/deepstream/lib/libnvds_nvmultiobjecttracker.so 126 | ll-config-file=/opt/nvidia/deepstream/deepstream/samples/configs/deepstream-app/config_tracker_NvDCF_max_perf.yml 127 | enable-batch-process=1 128 | display-tracking-id=1 129 | enable-past-frame=1 130 | 131 | [secondary-gie0] 132 | enable=1 133 | gpu-id=0 134 | gie-unique-id=2 135 | operate-on-gie-id=1 136 | operate-on-class-ids=2;5;7 137 | config-file=config_vehicletype.txt 138 | 139 | [secondary-gie1] 140 | enable=1 141 | gpu-id=0 142 | gie-unique-id=3 143 | operate-on-gie-id=1 144 | operate-on-class-ids=2;3;5;7 145 | config-file=config_lpd.txt 146 | 147 
| [secondary-gie2] 148 | enable=1 149 | gpu-id=0 150 | gie-unique-id=4 151 | operate-on-gie-id=3 152 | operate-on-class-ids=0 153 | config-file=config_lpr.txt -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-ALPR/config_lpd.txt: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a 5 | # copy of this software and associated documentation files (the "Software"), 6 | # to deal in the Software without restriction, including without limitation 7 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | # and/or sell copies of the Software, and to permit persons to whom the 9 | # Software is furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in 12 | # all copies or substantial portions of the Software. 13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | # DEALINGS IN THE SOFTWARE. 21 | ################################################################################ 22 | 23 | [property] 24 | gpu-id=0 25 | net-scale-factor=1 26 | offsets=103.939;116.779;123.68 27 | tlt-model-key=license-plate-yolov4 28 | tlt-encoded-model=weights/license-plate-detection/yolov4_resnet18_epoch_050-fp32.etlt 29 | labelfile-path=weights/license-plate-detection/labels.txt 30 | int8-calib-file=weights/license-plate-detection/cal.bin 31 | model-engine-file=weights/license-plate-detection/yolov4_resnet18_epoch_050-fp32.etlt_b4_gpu0_fp32.engine 32 | infer-dims=3;320;320 33 | uff-input-blob-name=Input 34 | batch-size=4 35 | process-mode=2 36 | model-color-format=0 37 | ## 0=FP32, 1=INT8, 2=FP16 mode 38 | network-mode=0 39 | #0 detector 1 classifier 2 segmentatio 3 instance segmentation 40 | network-type=0 41 | num-detected-classes=1 42 | interval=0 43 | gie-unique-id=5 44 | operate-on-class-ids=2;3;5;7 45 | operate-on-gie-id=1 46 | output-blob-names=BatchedNMS 47 | parse-bbox-func-name=NvDsInferParseCustomYoloV4TLT 48 | custom-lib-path=nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so 49 | input-object-min-width=64 50 | input-object-min-height=64 -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-ALPR/config_lpr.txt: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a 5 | # copy of this software and associated documentation files (the "Software"), 6 | # to deal in the Software without restriction, including without limitation 7 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | # and/or sell copies of the Software, and to permit persons to whom the 9 | # Software is furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in 12 | # all copies or substantial portions of the Software. 13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | # DEALINGS IN THE SOFTWARE. 21 | ################################################################################ 22 | 23 | [property] 24 | gpu-id=0 25 | net-scale-factor=1 26 | offsets=103.939;116.779;123.68 27 | tlt-model-key=license-plate-recognition 28 | tlt-encoded-model=weights/license-plate-recognition/yolov4_resnet18-pruned-retrain-int8.etlt 29 | labelfile-path=weights/license-plate-recognition/labels.txt 30 | int8-calib-file=weights/license-plate-recognition/yolov4_resnet18-pruned-retrain.bin 31 | model-engine-file=weights/license-plate-recognition/yolov4_resnet18-pruned-retrain-int8.etlt_b4_gpu0_int8.engine 32 | infer-dims=3;224;224 33 | uff-input-blob-name=Input 34 | batch-size=4 35 | process-mode=2 36 | model-color-format=0 37 | ## 0=FP32, 1=INT8, 2=FP16 mode 38 | network-mode=1 39 | #0 detector 1 classifier 2 segmentatio 3 instance segmentation 40 | network-type=1 41 | interval=0 42 | gie-unique-id=5 43 | operate-on-class-ids=0 44 | operate-on-gie-id=1 45 | output-blob-names=BatchedNMS 46 | classifier-threshold=0.7 47 | classifier-async-mode=0 48 | #parse-bbox-func-name=NvDsInferParseCustomYoloV4LPR 49 | parse-classifier-func-name=NvDsInferParseCustomYoloV4LPR 50 | custom-lib-path=nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so 51 | input-object-min-width=16 52 | input-object-min-height=16 -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-ALPR/config_tracker.txt: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # SPDX-FileCopyrightText: Copyright (c) 2019-2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | ################################################################################ 17 | 18 | # Mandatory properties for the tracker: 19 | # tracker-width 20 | # tracker-height: needs to be multiple of 6 for NvDCF 21 | # gpu-id 22 | # ll-lib-file: path to low-level tracker lib 23 | # ll-config-file: required for NvDCF, optional for KLT and IOU 24 | # 25 | [tracker] 26 | tracker-width=608 27 | tracker-height=608 28 | gpu-id=0 29 | ll-lib-file=/opt/nvidia/deepstream/deepstream/lib/libnvds_nvmultiobjecttracker.so 30 | ll-config-file=config_tracker_NvDCF_perf.yml 31 | #enable-past-frame=1 32 | enable-batch-process=1 -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-ALPR/config_vehicletype.txt: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a 5 | # copy of this software and associated documentation files (the "Software"), 6 | # to deal in the Software without restriction, including without limitation 7 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | # and/or sell copies of the Software, and to permit persons to whom the 9 | # Software is furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in 12 | # all copies or substantial portions of the Software. 13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | # DEALINGS IN THE SOFTWARE. 21 | ################################################################################ 22 | 23 | [property] 24 | gpu-id=0 25 | net-scale-factor=1 26 | offsets=124;117;104 27 | tlt-model-key=vehicle-type-net 28 | tlt-encoded-model=weights/vehicletypenet/vehicle-type-net-r18-pruned-retrain-int8.etlt 29 | labelfile-path=weights/vehicletypenet/labels.txt 30 | int8-calib-file=weights/vehicletypenet/vehicle-type-net-r18-pruned-retrain.bin 31 | model-engine-file=weights/vehicletypenet/vehicle-type-net-r18-pruned-retrain-int8.etlt_b4_gpu0_int8.engine 32 | input-dims=3;224;224;0 33 | uff-input-blob-name=input_1 34 | batch-size=4 35 | process-mode=2 36 | model-color-format=0 37 | ## 0=FP32, 1=INT8, 2=FP16 mode 38 | network-mode=1 39 | #0 detector 1 classifier 2 segmentatio 3 instance segmentation 40 | network-type=1 41 | interval=0 42 | gie-unique-id=4 43 | operate-on-class-ids=2;5;7 44 | operate-on-gie-id=1 45 | output-blob-names=predictions/Softmax 46 | classifier-threshold=0.2 47 | input-object-min-width=64 48 | input-object-min-height=64 -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-ALPR/config_yolov4.txt: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | ################################################################################ 18 | 19 | # Following properties are mandatory when engine files are not specified: 20 | # int8-calib-file(Only in INT8), model-file-format 21 | # Caffemodel mandatory properties: model-file, proto-file, output-blob-names 22 | # UFF: uff-file, input-dims, uff-input-blob-name, output-blob-names 23 | # ONNX: onnx-file 24 | # 25 | # Mandatory properties for detectors: 26 | # num-detected-classes 27 | # 28 | # Optional properties for detectors: 29 | # cluster-mode(Default=Group Rectangles), interval(Primary mode only, Default=0) 30 | # custom-lib-path 31 | # parse-bbox-func-name 32 | # 33 | # Mandatory properties for classifiers: 34 | # classifier-threshold, is-classifier 35 | # 36 | # Optional properties for classifiers: 37 | # classifier-async-mode(Secondary mode only, Default=false) 38 | # 39 | # Optional properties in secondary mode: 40 | # operate-on-gie-id(Default=0), operate-on-class-ids(Defaults to all classes), 41 | # input-object-min-width, input-object-min-height, input-object-max-width, 42 | # input-object-max-height 43 | # 44 | # Following properties are always recommended: 45 | # batch-size(Default=1) 46 | # 47 | # Other optional properties: 48 | # net-scale-factor(Default=1), network-mode(Default=0 i.e FP32), 49 | # model-color-format(Default=0 i.e. RGB) model-engine-file, labelfile-path, 50 | # mean-file, gie-unique-id(Default=0), offsets, process-mode (Default=1 i.e. primary), 51 | # custom-lib-path, network-mode(Default=0 i.e FP32) 52 | # 53 | # The values in the config file are overridden by values set through GObject 54 | # properties. 
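# Added note (not part of the original NVIDIA header): net-scale-factor below is 1/255
# (0.0039215697906911373), i.e. input pixel values are rescaled from [0, 255] to [0, 1] before inference.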
55 | 56 | [property] 57 | gpu-id=0 58 | net-scale-factor=0.0039215697906911373 59 | # Skip frame 60 | interval=0 61 | #0=RGB, 1=BGR 62 | model-color-format=0 63 | input-dims=3;608;608;0 64 | onnx-file=weights/yolov4_-1_3_608_608_dynamic.nms.onnx 65 | model-engine-file=weights/yolov4_-1_3_608_608_dynamic.nms.onnx_b4_gpu0_fp32.engine 66 | labelfile-path=labels.txt 67 | batch-size=4 68 | ## 0=FP32, 1=INT8, 2=FP16 mode 69 | network-mode=0 70 | num-detected-classes=80 71 | gie-unique-id=1 72 | network-type=0 73 | is-classifier=0 74 | ## 0=Group Rectangles, 1=DBSCAN, 2=NMS, 3= DBSCAN+NMS Hybrid, 4 = None(No clustering) 75 | cluster-mode=2 76 | maintain-aspect-ratio=1 77 | custom-lib-path=nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so 78 | parse-bbox-func-name=NvDsInferParseCustomYoloV4 79 | #scaling-filter=0 80 | #scaling-compute-hw=0 81 | 82 | [class-attrs-all] 83 | nms-iou-threshold=0.6 84 | pre-cluster-threshold=0.4 85 | -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-ALPR/dict.txt: -------------------------------------------------------------------------------- 1 | A 2 | B 3 | C 4 | D 5 | E 6 | 8 7 | F 8 | 5 9 | 4 10 | G 11 | H 12 | I 13 | J 14 | K 15 | L 16 | M 17 | N 18 | 9 19 | 1 20 | P 21 | Q 22 | R 23 | S 24 | 7 25 | 6 26 | T 27 | 3 28 | 2 29 | U 30 | V 31 | W 32 | X 33 | Y 34 | Z 35 | 0 -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-ALPR/fig/lpr_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deploy/Deepstream/sample-ALPR/fig/lpr_pipeline.png -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-ALPR/fig/lpr_result1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deploy/Deepstream/sample-ALPR/fig/lpr_result1.png -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-ALPR/fig/lpr_result2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deploy/Deepstream/sample-ALPR/fig/lpr_result2.png -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-ALPR/labels.txt: -------------------------------------------------------------------------------- 1 | person 2 | bicycle 3 | car 4 | motorbike 5 | aeroplane 6 | bus 7 | train 8 | truck 9 | boat 10 | traffic light 11 | fire hydrant 12 | stop sign 13 | parking meter 14 | bench 15 | bird 16 | cat 17 | dog 18 | horse 19 | sheep 20 | cow 21 | elephant 22 | bear 23 | zebra 24 | giraffe 25 | backpack 26 | umbrella 27 | handbag 28 | tie 29 | suitcase 30 | frisbee 31 | skis 32 | snowboard 33 | sports ball 34 | kite 35 | baseball bat 36 | baseball glove 37 | skateboard 38 | surfboard 39 | tennis racket 40 | bottle 41 | wine glass 42 | cup 43 | fork 44 | knife 45 | spoon 46 | bowl 47 | banana 48 | apple 49 | sandwich 50 | orange 51 | broccoli 52 | carrot 53 | hot dog 54 | pizza 55 | donut 56 | cake 57 | chair 58 | sofa 59 | pottedplant 60 | bed 61 | diningtable 62 | toilet 63 | tvmonitor 64 | laptop 65 | mouse 66 | remote 67 | keyboard 68 | cell phone 69 | microwave 70 | 
oven 71 | toaster 72 | sink 73 | refrigerator 74 | book 75 | clock 76 | vase 77 | scissors 78 | teddy bear 79 | hair drier 80 | toothbrush -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-ALPR/nvdsinfer_custom_impl_Yolo/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | CUDA_VER?= 18 | ifeq ($(CUDA_VER),) 19 | $(error "CUDA_VER is not set") 20 | endif 21 | CC:= g++ 22 | NVCC:=/usr/local/cuda-$(CUDA_VER)/bin/nvcc 23 | 24 | CFLAGS:= -Wall -std=c++11 -shared -fPIC -Wno-error=deprecated-declarations 25 | CFLAGS+= -I../../includes -I/usr/local/cuda-$(CUDA_VER)/include -I/opt/nvidia/deepstream/deepstream/sources/includes 26 | 27 | LIBS:= -lnvinfer_plugin -lnvinfer -lnvparsers -L/usr/local/cuda-$(CUDA_VER)/lib64 -lcudart -lcublas -lstdc++fs 28 | LFLAGS:= -shared -Wl,--start-group $(LIBS) -Wl,--end-group 29 | 30 | INCS:= $(wildcard *.h) 31 | SRCFILES:= nvdsparsebbox_Yolo.cpp 32 | 33 | TARGET_LIB:= libnvdsinfer_custom_impl_Yolo.so 34 | 35 | TARGET_OBJS:= $(SRCFILES:.cpp=.o) 36 | TARGET_OBJS:= $(TARGET_OBJS:.cu=.o) 37 | 38 | all: $(TARGET_LIB) 39 | 40 | %.o: %.cpp $(INCS) Makefile 41 | $(CC) -c -o $@ $(CFLAGS) $< 42 | 43 | %.o: %.cu $(INCS) Makefile 44 | $(NVCC) -c -o $@ --compiler-options '-fPIC' $< 45 | 46 | $(TARGET_LIB) : $(TARGET_OBJS) 47 | $(CC) -o $@ $(TARGET_OBJS) $(LFLAGS) 48 | 49 | clean: 50 | rm -rf $(TARGET_LIB) 51 | -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-ALPR/weights/README.md: -------------------------------------------------------------------------------- 1 | # To do -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-ALPR/weights/license-plate-detection/labels.txt: -------------------------------------------------------------------------------- 1 | license_plate -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-ALPR/weights/license-plate-recognition/labels.txt: -------------------------------------------------------------------------------- 1 | A 2 | B 3 | C 4 | D 5 | E 6 | 8 7 | F 8 | 5 9 | 4 10 | G 11 | H 12 | I 13 | J 14 | K 15 | L 16 | M 17 | N 18 | 9 19 | 1 20 | P 21 | Q 22 | R 23 | S 24 | 7 25 | 6 26 | T 27 | 3 28 | 2 29 | U 30 | V 31 | W 32 | X 33 | Y 34 | Z 35 | 0 -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-ALPR/weights/vehicletypenet/labels.txt: -------------------------------------------------------------------------------- 1 | hatchback;bus;pickup;sedan;suv;truck;van -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-scrfd/README.md: 
-------------------------------------------------------------------------------- 1 | ## Build custom plugins 2 | 3 | ``` 4 | cd nvdsinfer_custom_impl_Yolo 5 | mkdir build && cd build 6 | cmake .. 7 | make -j8 8 | ``` 9 | 10 | ## Run deepstream-python 11 | ``` 12 | LD_PRELOAD= python3 run_scrfd.py file:/ 13 | ``` 14 | -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-scrfd/config_scrfd.txt: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | ################################################################################ 18 | 19 | # Following properties are mandatory when engine files are not specified: 20 | # int8-calib-file(Only in INT8), model-file-format 21 | # Caffemodel mandatory properties: model-file, proto-file, output-blob-names 22 | # UFF: uff-file, input-dims, uff-input-blob-name, output-blob-names 23 | # ONNX: onnx-file 24 | # 25 | # Mandatory properties for detectors: 26 | # num-detected-classes 27 | # 28 | # Optional properties for detectors: 29 | # cluster-mode(Default=Group Rectangles), interval(Primary mode only, Default=0) 30 | # custom-lib-path 31 | # parse-bbox-func-name 32 | # 33 | # Mandatory properties for classifiers: 34 | # classifier-threshold, is-classifier 35 | # 36 | # Optional properties for classifiers: 37 | # classifier-async-mode(Secondary mode only, Default=false) 38 | # 39 | # Optional properties in secondary mode: 40 | # operate-on-gie-id(Default=0), operate-on-class-ids(Defaults to all classes), 41 | # input-object-min-width, input-object-min-height, input-object-max-width, 42 | # input-object-max-height 43 | # 44 | # Following properties are always recommended: 45 | # batch-size(Default=1) 46 | # 47 | # Other optional properties: 48 | # net-scale-factor(Default=1), network-mode(Default=0 i.e FP32), 49 | # model-color-format(Default=0 i.e. RGB) model-engine-file, labelfile-path, 50 | # mean-file, gie-unique-id(Default=0), offsets, process-mode (Default=1 i.e. primary), 51 | # custom-lib-path, network-mode(Default=0 i.e FP32) 52 | # 53 | # The values in the config file are overridden by values set through GObject 54 | # properties. 
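# Added note (not part of the original NVIDIA header): this config sets network-type=100 with
# output-tensor-meta=1 and leaves parse-bbox-func-name commented out, so raw output tensors are
# attached to the frame metadata and decoding is presumably done in Python (see parser_scrfd.py / run_scrfd.py).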
55 | 56 | [property] 57 | gpu-id=0 58 | net-scale-factor=0.0039215697906911373 59 | # Skip frame 60 | interval=0 61 | #0=RGB, 1=BGR 62 | model-color-format=0 63 | input-dims=3;640;640;0 64 | onnx-file=weights/face-detection/scrfd-nms-full.nms.onnx 65 | model-engine-file=weights/face-detection/scrfd-nms-full.nms.onnx_b4_gpu0_fp32.engine 66 | labelfile-path=weights/face-detection/labels.txt 67 | batch-size=4 68 | ## 0=FP32, 1=INT8, 2=FP16 mode 69 | network-mode=0 70 | num-detected-classes=2 71 | gie-unique-id=1 72 | network-type=100 73 | ## 0=Group Rectangles, 1=DBSCAN, 2=NMS, 3= DBSCAN+NMS Hybrid, 4 = None(No clustering) 74 | cluster-mode=4 75 | maintain-aspect-ratio=0 76 | custom-lib-path=nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so 77 | #parse-bbox-func-name=NvDsInferParseCustomFaceDetection 78 | #scaling-filter=0 79 | #scaling-compute-hw=0 80 | output-tensor-meta=1 81 | #[class-attrs-all] 82 | #nms-iou-threshold=0.6 83 | #pre-cluster-threshold=0.4 84 | input-object-min-width=0 85 | input-object-min-height=0 86 | process-mode=1 -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.13 FATAL_ERROR) 2 | include(cmake/set_ifndef.cmake) 3 | 4 | project(TensorRT 5 | LANGUAGES CXX CUDA 6 | VERSION 8.2 7 | DESCRIPTION "TensorRT is a C++ library that facilitates high performance inference on NVIDIA GPUs and deep learning accelerators." 8 | HOMEPAGE_URL "https://github.com/NVIDIA/TensorRT") 9 | 10 | # C++14 11 | set(CMAKE_CXX_STANDARD 14) 12 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 13 | set(CMAKE_CXX_EXTENSIONS OFF) 14 | set(CMAKE_CXX_FLAGS "-Wno-deprecated-declarations ${CMAKE_CXX_FLAGS} -DBUILD_SYSTEM=cmake_oss") 15 | 16 | find_package(Threads REQUIRED) 17 | 18 | ## find_package(CUDA) is broken for cross-compilation. Enable CUDA language instead. 19 | if(NOT DEFINED CMAKE_TOOLCHAIN_FILE) 20 | find_package(CUDA ${CUDA_VERSION} REQUIRED) 21 | endif() 22 | 23 | include_directories( 24 | ${CUDA_INCLUDE_DIRS} 25 | ${CUDNN_ROOT_DIR}/include 26 | ) 27 | find_library(CUDNN_LIB cudnn HINTS 28 | ${CUDA_TOOLKIT_ROOT_DIR} ${CUDNN_ROOT_DIR} PATH_SUFFIXES lib64 lib) 29 | find_library(CUBLAS_LIB cublas HINTS 30 | ${CUDA_TOOLKIT_ROOT_DIR} PATH_SUFFIXES lib64 lib lib/stubs) 31 | find_library(CUBLASLT_LIB cublasLt HINTS 32 | ${CUDA_TOOLKIT_ROOT_DIR} PATH_SUFFIXES lib64 lib lib/stubs) 33 | find_library(CUDART_LIB cudart HINTS ${CUDA_TOOLKIT_ROOT_DIR} PATH_SUFFIXES lib lib64) 34 | find_library(RT_LIB rt) 35 | set(CUDA_LIBRARIES ${CUDART_LIB}) 36 | 37 | 38 | message(STATUS "CUBLAS_LIB: ${CUBLAS_LIB}") 39 | message(STATUS "CUBLASLT_LIB: ${CUBLASLT_LIB}") 40 | message(STATUS "CUDART_LIB: ${CUDART_LIB}") 41 | message(STATUS "CUDNN_LIB: ${CUDNN_LIB}") 42 | 43 | file(GLOB SRCS *.cpp) 44 | set(PLUGIN_SOURCES ${PLUGIN_SOURCES} ${SRCS}) 45 | file(GLOB CU_SRCS *.cu) 46 | set(PLUGIN_CU_SOURCES ${PLUGIN_CU_SOURCES} ${CU_SRCS}) 47 | file(GLOB COMMON_SRCS common/*.cpp) 48 | set(COMMON_SOURCES ${COMMON_SOURCES} ${COMMON_SRCS}) 49 | file(GLOB COMMON_CU_SRCS common/kernels/*.cu) 50 | set(COMMON_CU_SOURCES ${COMMON_CU_SOURCES} ${COMMON_CU_SRCS}) 51 | 52 | # Generate Gencode 53 | if (DEFINED GPU_ARCHS) 54 | message(STATUS "GPU_ARCHS defined as ${GPU_ARCHS}. 
Generating CUDA code for SM ${GPU_ARCHS}") 55 | separate_arguments(GPU_ARCHS) 56 | else() 57 | list(APPEND GPU_ARCHS 58 | 53 59 | 60 60 | 61 61 | 70 62 | 75 63 | ) 64 | 65 | string(REGEX MATCH "aarch64" IS_ARM "${TRT_PLATFORM_ID}") 66 | if (IS_ARM) 67 | # Xavier (SM72) only supported for aarch64. 68 | list(APPEND GPU_ARCHS 72) 69 | endif() 70 | 71 | if (CUDA_VERSION VERSION_GREATER_EQUAL 11.0) 72 | # Ampere GPU (SM80) support is only available in CUDA versions > 11.0 73 | list(APPEND GPU_ARCHS 80) 74 | endif() 75 | if (CUDA_VERSION VERSION_GREATER_EQUAL 11.1) 76 | list(APPEND GPU_ARCHS 86) 77 | endif() 78 | 79 | message(STATUS "GPU_ARCHS is not defined. Generating CUDA code for default SMs: ${GPU_ARCHS}") 80 | endif() 81 | foreach(arch ${GPU_ARCHS}) 82 | set(GENCODES "${GENCODES} -gencode arch=compute_${arch},code=sm_${arch}") 83 | endforeach() 84 | # Generate PTX for the last architecture in the list. 85 | list(GET GPU_ARCHS -1 LATEST_SM) 86 | set(GENCODES "${GENCODES} -gencode arch=compute_${LATEST_SM},code=compute_${LATEST_SM}") 87 | set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -Wno-deprecated-declarations") 88 | 89 | 90 | include_directories(common common/kernels) 91 | list(APPEND PLUGIN_CU_SOURCES "${COMMON_CU_SOURCES}") 92 | set_source_files_properties(${PLUGIN_CU_SOURCES} PROPERTIES COMPILE_FLAGS ${GENCODES}) 93 | list(APPEND PLUGIN_SOURCES "${PLUGIN_CU_SOURCES}") 94 | list(APPEND PLUGIN_SOURCES "${COMMON_SOURCES}") 95 | 96 | message(STATUS "PLUGIN_SOURCES: ${PLUGIN_SOURCES}") 97 | message(STATUS "GENCODES: ${GENCODES}") 98 | 99 | add_library(my_plugin SHARED 100 | ${PLUGIN_SOURCES} 101 | ) 102 | 103 | target_include_directories(my_plugin 104 | PUBLIC /opt/nvidia/deepstream/deepstream/sources/includes 105 | ) 106 | target_include_directories(my_plugin 107 | PUBLIC /usr/include/gstreamer-1.0 /usr/include/glib-2.0 /usr/lib/x86_64-linux-gnu/glib-2.0/include 108 | ) 109 | 110 | target_link_libraries(my_plugin 111 | ${CUBLAS_LIB} 112 | ${CUBLASLT_LIB} 113 | ${CUDART_LIB} 114 | ${CUDNN_LIB} 115 | nvinfer 116 | ) -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | CUDA_VER?= 18 | ifeq ($(CUDA_VER),) 19 | $(error "CUDA_VER is not set") 20 | endif 21 | CC:= g++ 22 | NVCC:=/usr/local/cuda-$(CUDA_VER)/bin/nvcc 23 | 24 | CFLAGS:= -Wall -std=c++11 -shared -fPIC -Wno-error=deprecated-declarations 25 | CFLAGS+= -I../../includes -I/home/damnguyen/Deploy/deepstream/nvdsinfer_custom_impl_Yolo/common -I/usr/local/cuda-$(CUDA_VER)/include -I/opt/nvidia/deepstream/deepstream/sources/includes 26 | 27 | LIBS:= -lnvinfer_plugin -lnvinfer -lnvparsers -L/usr/local/cuda-$(CUDA_VER)/lib64 -lcudart -lcublas -lstdc++fs 28 | LFLAGS:= -shared -Wl,--start-group $(LIBS) -Wl,--end-group 29 | 30 | INCS:= $(wildcard *.h) 31 | SRCFILES:= *.cpp *.cu common/*.cpp common/kernels/*.cu 32 | 33 | TARGET_LIB:= libnvdsinfer_custom_impl_Yolo.so 34 | 35 | TARGET_OBJS:= $(SRCFILES:.cpp=.o) 36 | TARGET_OBJS:= $(TARGET_OBJS:.cu=.o) 37 | 38 | all: $(TARGET_LIB) 39 | 40 | %.o: %.cpp $(INCS) Makefile 41 | $(CC) -c -o $@ $(CFLAGS) $< 42 | 43 | %.o: %.cu $(INCS) Makefile 44 | $(NVCC) -c -o $@ --compiler-options '-fPIC' $< 45 | 46 | $(TARGET_LIB) : $(TARGET_OBJS) 47 | $(CC) -o $@ $(TARGET_OBJS) $(LFLAGS) 48 | 49 | clean: 50 | rm -rf $(TARGET_LIB) 51 | -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/batchedNMSCustomInference.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | #include "bboxUtils.h" 17 | #include "cuda_runtime_api.h" 18 | #include "gatherNMSCustomOutputs.h" 19 | #include "kernel.h" 20 | #include "nmsUtils.h" 21 | 22 | pluginStatus_t nmsCustomInference(cudaStream_t stream, const int N, const int perBatchBoxesSize, const int perBatchScoresSize, const int perBatchLandmarksSize, 23 | const bool shareLocation, const int backgroundLabelId, const int numPredsPerClass, const int numClasses, 24 | const int topK, const int keepTopK, const float scoreThreshold, const float iouThreshold, const DataType DT_BBOX, 25 | const void* locData, const DataType DT_SCORE, const void* confData, const void* landData, void* keepCount, void* nmsedBoxes, 26 | void* nmsedScores, void* nmsedClasses, void* nmsedLandmarks, void* workspace, bool isNormalized, bool confSigmoid, bool clipBoxes, int scoreBits) 27 | { 28 | // locCount = batch_size * number_boxes_per_sample * 4 29 | const int locCount = N * perBatchBoxesSize; 30 | /* 31 | * shareLocation 32 | * Bounding box are shared among all classes, i.e., a bounding box could be classified as any candidate class. 33 | * Otherwise 34 | * Bounding box are designed for specific classes, i.e., a bounding box could be classified as one certain class or 35 | * not (binary classification). 36 | */ 37 | const int numLocClasses = shareLocation ? 
1 : numClasses; 38 | 39 | size_t bboxDataSize = detectionForwardBBoxDataSize(N, perBatchBoxesSize, DT_BBOX); 40 | void* bboxDataRaw = workspace; 41 | cudaMemcpyAsync(bboxDataRaw, locData, bboxDataSize, cudaMemcpyDeviceToDevice, stream); 42 | pluginStatus_t status; 43 | 44 | /* 45 | * bboxDataRaw format: 46 | * [batch size, numPriors (per sample), numLocClasses, 4] 47 | */ 48 | // float for now 49 | void* bboxData; 50 | size_t bboxPermuteSize = detectionForwardBBoxPermuteSize(shareLocation, N, perBatchBoxesSize, DT_BBOX); 51 | void* bboxPermute = nextWorkspacePtr((int8_t*) bboxDataRaw, bboxDataSize); 52 | 53 | /* 54 | * After permutation, bboxData format: 55 | * [batch_size, numLocClasses, numPriors (per sample) (numPredsPerClass), 4] 56 | * This is equivalent to swapping axis 57 | */ 58 | if (!shareLocation) 59 | { 60 | status = permuteData( 61 | stream, locCount, numLocClasses, numPredsPerClass, 4, DT_BBOX, false, bboxDataRaw, bboxPermute); 62 | ASSERT_FAILURE(status == STATUS_SUCCESS); 63 | bboxData = bboxPermute; 64 | } 65 | /* 66 | * If shareLocation, numLocClasses = 1 67 | * No need to permute data on linear memory 68 | */ 69 | else 70 | { 71 | bboxData = bboxDataRaw; 72 | } 73 | 74 | /* 75 | * Conf data format 76 | * [batch size, numPriors * param.numClasses, 1, 1] 77 | */ 78 | const int numScores = N * perBatchScoresSize; 79 | size_t totalScoresSize = detectionForwardPreNMSSize(N, perBatchScoresSize); 80 | if(DT_SCORE == DataType::kHALF) totalScoresSize /= 2; // detectionForwardPreNMSSize is implemented in terms of kFLOAT 81 | void* scores = nextWorkspacePtr((int8_t*) bboxPermute, bboxPermuteSize); 82 | 83 | // need a conf_scores 84 | /* 85 | * After permutation, bboxData format: 86 | * [batch_size, numClasses, numPredsPerClass, 1] 87 | */ 88 | status = permuteData( 89 | stream, numScores, numClasses, numPredsPerClass, 1, DT_SCORE, confSigmoid, confData, scores); 90 | ASSERT_FAILURE(status == STATUS_SUCCESS); 91 | 92 | size_t indicesSize = detectionForwardPreNMSSize(N, perBatchScoresSize); 93 | void* indices = nextWorkspacePtr((int8_t*) scores, totalScoresSize); 94 | 95 | size_t postNMSScoresSize = detectionForwardPostNMSSize(N, numClasses, topK); 96 | if(DT_SCORE == DataType::kHALF) postNMSScoresSize /= 2; // detectionForwardPostNMSSize is implemented in terms of kFLOAT 97 | size_t postNMSIndicesSize = detectionForwardPostNMSSize(N, numClasses, topK); // indices are full int32 98 | void* postNMSScores = nextWorkspacePtr((int8_t*) indices, indicesSize); 99 | void* postNMSIndices = nextWorkspacePtr((int8_t*) postNMSScores, postNMSScoresSize); 100 | 101 | void* sortingWorkspace = nextWorkspacePtr((int8_t*) postNMSIndices, postNMSIndicesSize); 102 | // Sort the scores so that the following NMS could be applied. 103 | float scoreShift = 0.f; 104 | if(DT_SCORE == DataType::kHALF && scoreBits > 0 && scoreBits <= 10) 105 | scoreShift = 1.f; 106 | status = sortScoresPerClass(stream, N, numClasses, numPredsPerClass, backgroundLabelId, scoreThreshold, 107 | DT_SCORE, scores, indices, sortingWorkspace, scoreBits, scoreShift); 108 | 109 | ASSERT_FAILURE(status == STATUS_SUCCESS); 110 | 111 | // This is set to true as the input bounding boxes are of the format [ymin, 112 | // xmin, ymax, xmax]. 
The default implementation assumes [xmin, ymin, xmax, ymax] 113 | bool flipXY = true; 114 | // NMS 115 | status = allClassNMS(stream, N, numClasses, numPredsPerClass, topK, iouThreshold, shareLocation, isNormalized, 116 | DT_SCORE, DT_BBOX, bboxData, scores, indices, postNMSScores, postNMSIndices, flipXY, scoreShift); 117 | ASSERT_FAILURE(status == STATUS_SUCCESS); 118 | 119 | // Sort the bounding boxes after NMS using scores 120 | status = sortScoresPerImage(stream, N, numClasses * topK, DT_SCORE, postNMSScores, postNMSIndices, scores, 121 | indices, sortingWorkspace, scoreBits); 122 | 123 | ASSERT_FAILURE(status == STATUS_SUCCESS); 124 | 125 | // Gather data from the sorted bounding boxes after NMS 126 | status = gatherNMSCustomOutputs(stream, shareLocation, N, numPredsPerClass, numClasses, topK, keepTopK, DT_BBOX, 127 | DT_SCORE, indices, scores, bboxData, landData, keepCount, nmsedBoxes, nmsedScores, nmsedClasses, nmsedLandmarks, clipBoxes, scoreShift); 128 | ASSERT_FAILURE(status == STATUS_SUCCESS); 129 | 130 | return STATUS_SUCCESS; 131 | } 132 | -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/batchedNMSCustomPlugin.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/batchedNMSCustomPlugin.o -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/cmake/set_ifndef.cmake: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | function (set_ifndef variable value) 17 | if(NOT DEFINED ${variable}) 18 | set(${variable} ${value} PARENT_SCOPE) 19 | endif() 20 | endfunction() 21 | -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/common/ErrorRecorder.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 |  */
16 | 
17 | #ifndef ERROR_RECORDER_H
18 | #define ERROR_RECORDER_H
19 | #include "NvInferRuntimeCommon.h"
20 | #include "logger.h"
21 | #include <atomic>
22 | #include <cstdint>
23 | #include <exception>
24 | #include <mutex>
25 | #include <vector>
26 | 
27 | using nvinfer1::IErrorRecorder;
28 | using nvinfer1::ErrorCode;
29 | 
30 | //!
31 | //! A simple implementation of the IErrorRecorder interface for
32 | //! use by samples. This interface also can be used as a reference
33 | //! implementation.
34 | //! The sample Error recorder is based on a vector that pairs the error
35 | //! code and the error string into a single element. It also uses
36 | //! standard mutexes and atomics in order to make sure that the code
37 | //! works in a multi-threaded environment.
38 | //!
39 | class SampleErrorRecorder : public IErrorRecorder
40 | {
41 |     using errorPair = std::pair<ErrorCode, std::string>;
42 |     using errorStack = std::vector<errorPair>;
43 | 
44 | public:
45 |     SampleErrorRecorder() = default;
46 | 
47 |     virtual ~SampleErrorRecorder() noexcept {}
48 |     int32_t getNbErrors() const noexcept final
49 |     {
50 |         return mErrorStack.size();
51 |     }
52 |     ErrorCode getErrorCode(int32_t errorIdx) const noexcept final
53 |     {
54 |         return invalidIndexCheck(errorIdx) ? ErrorCode::kINVALID_ARGUMENT : (*this)[errorIdx].first;
55 |     };
56 |     IErrorRecorder::ErrorDesc getErrorDesc(int32_t errorIdx) const noexcept final
57 |     {
58 |         return invalidIndexCheck(errorIdx) ? "errorIdx out of range." : (*this)[errorIdx].second.c_str();
59 |     }
60 |     // This class can never overflow since we have dynamic resize via std::vector usage.
61 |     bool hasOverflowed() const noexcept final
62 |     {
63 |         return false;
64 |     }
65 | 
66 |     // Empty the errorStack.
67 |     void clear() noexcept final
68 |     {
69 |         try
70 |         {
71 |             // grab a lock so that there is no addition while clearing.
72 |             std::lock_guard<std::mutex> guard(mStackLock);
73 |             mErrorStack.clear();
74 |         }
75 |         catch (const std::exception& e)
76 |         {
77 |             sample::gLogFatal << "Internal Error: " << e.what() << std::endl;
78 |         }
79 |     };
80 | 
81 |     //! Simple helper function that checks whether the error stack is empty.
82 |     bool empty() const noexcept
83 |     {
84 |         return mErrorStack.empty();
85 |     }
86 | 
87 |     bool reportError(ErrorCode val, IErrorRecorder::ErrorDesc desc) noexcept final
88 |     {
89 |         try
90 |         {
91 |             std::lock_guard<std::mutex> guard(mStackLock);
92 |             sample::gLogError << "Error[" << static_cast<int32_t>(val) << "]: " << desc << std::endl;
93 |             mErrorStack.push_back(errorPair(val, desc));
94 |         }
95 |         catch (const std::exception& e)
96 |         {
97 |             sample::gLogFatal << "Internal Error: " << e.what() << std::endl;
98 |         }
99 |         // All errors are considered fatal.
100 |         return true;
101 |     }
102 | 
103 |     // Atomically increment or decrement the ref counter.
104 |     IErrorRecorder::RefCount incRefCount() noexcept final
105 |     {
106 |         return ++mRefCount;
107 |     }
108 |     IErrorRecorder::RefCount decRefCount() noexcept final
109 |     {
110 |         return --mRefCount;
111 |     }
112 | 
113 | private:
114 |     // Simple helper functions.
115 |     const errorPair& operator[](size_t index) const noexcept
116 |     {
117 |         return mErrorStack[index];
118 |     }
119 | 
120 |     bool invalidIndexCheck(int32_t index) const noexcept
121 |     {
122 |         // By converting signed to unsigned, we only need a single check since
123 |         // negative numbers turn into large positive greater than the size.
124 |         size_t sIndex = index;
125 |         return sIndex >= mErrorStack.size();
126 |     }
127 |     // Mutex to hold when locking mErrorStack.
128 |     std::mutex mStackLock;
129 | 
130 |     // Reference count of the class. Destruction of the class when mRefCount
131 |     // is not zero causes undefined behavior.
132 |     std::atomic<int32_t> mRefCount{0};
133 | 
134 |     // The error stack that holds the errors recorded by TensorRT.
135 |     errorStack mErrorStack;
136 | }; // class SampleErrorRecorder
137 | #endif // ERROR_RECORDER_H
138 | 
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/common/bboxUtils.h:
--------------------------------------------------------------------------------
1 | /*
2 |  * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License");
5 |  * you may not use this file except in compliance with the License.
6 |  * You may obtain a copy of the License at
7 |  *
8 |  *     http://www.apache.org/licenses/LICENSE-2.0
9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | #ifndef TRT_BBOX_UTILS_H
17 | #define TRT_BBOX_UTILS_H
18 | 
19 | #include "plugin.h"
20 | 
21 | using namespace nvinfer1;
22 | using namespace nvinfer1::plugin;
23 | 
24 | template <typename T>
25 | struct Bbox
26 | {
27 |     T xmin, ymin, xmax, ymax;
28 |     Bbox(T xmin, T ymin, T xmax, T ymax)
29 |         : xmin(xmin)
30 |         , ymin(ymin)
31 |         , xmax(xmax)
32 |         , ymax(ymax)
33 |     {
34 |     }
35 |     Bbox() = default;
36 | };
37 | 
38 | template <typename T>
39 | struct BboxInfo
40 | {
41 |     T conf_score;
42 |     int label;
43 |     int bbox_idx;
44 |     bool kept;
45 |     BboxInfo(T conf_score, int label, int bbox_idx, bool kept)
46 |         : conf_score(conf_score)
47 |         , label(label)
48 |         , bbox_idx(bbox_idx)
49 |         , kept(kept)
50 |     {
51 |     }
52 |     BboxInfo() = default;
53 | };
54 | 
55 | template <typename T>
56 | bool operator<(const Bbox<T>& lhs, const Bbox<T>& rhs)
57 | {
58 |     return lhs.xmin < rhs.xmin;
59 | }
60 | 
61 | template <typename T>
62 | bool operator==(const Bbox<T>& lhs, const Bbox<T>& rhs)
63 | {
64 |     return lhs.xmin == rhs.xmin && lhs.ymin == rhs.ymin && lhs.xmax == rhs.xmax && lhs.ymax == rhs.ymax;
65 | }
66 | // }}}
67 | 
68 | int8_t* alignPtr(int8_t* ptr, uintptr_t to);
69 | 
70 | int8_t* nextWorkspacePtr(int8_t* ptr, uintptr_t previousWorkspaceSize);
71 | 
72 | size_t dataTypeSize(DataType dtype);
73 | 
74 | void setUniformOffsets(cudaStream_t stream, int num_segments, int offset, int* d_offsets);
75 | 
76 | #endif
77 | 
--------------------------------------------------------------------------------
/Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/common/checkMacrosPlugin.cpp:
--------------------------------------------------------------------------------
1 | /*
2 |  * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License");
5 |  * you may not use this file except in compliance with the License.
6 |  * You may obtain a copy of the License at
7 |  *
8 |  *     http://www.apache.org/licenses/LICENSE-2.0
9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | #include "checkMacrosPlugin.h"
18 | #include <cublas_v2.h>
19 | #include <cstdlib>
20 | #include <sstream>
21 | 
22 | namespace nvinfer1
23 | {
24 | namespace plugin
25 | {
26 | 
27 | // This will be populated by the logger supplied by the user to initLibNvInferPlugins()
28 | ILogger* gLogger{};
29 | 
30 | template <ILogger::Severity kSeverity>
31 | int LogStream<kSeverity>::Buf::sync()
32 | {
33 |     std::string s = str();
34 |     while (!s.empty() && s.back() == '\n')
35 |     {
36 |         s.pop_back();
37 |     }
38 |     if (gLogger != nullptr)
39 |     {
40 |         gLogger->log(kSeverity, s.c_str());
41 |     }
42 |     str("");
43 |     return 0;
44 | }
45 | 
46 | // These use gLogger, and therefore require initLibNvInferPlugins() to be called with a logger
47 | // (otherwise, it will not log)
48 | LogStream<ILogger::Severity::kERROR> gLogError;
49 | LogStream<ILogger::Severity::kWARNING> gLogWarning;
50 | LogStream<ILogger::Severity::kINFO> gLogInfo;
51 | LogStream<ILogger::Severity::kVERBOSE> gLogVerbose;
52 | 
53 | // break-pointable
54 | void throwCudaError(const char* file, const char* function, int line, int status, const char* msg)
55 | {
56 |     CudaError error(file, function, line, status, msg);
57 |     error.log(gLogError);
58 |     throw error;
59 | }
60 | 
61 | // break-pointable
62 | void throwCublasError(const char* file, const char* function, int line, int status, const char* msg)
63 | {
64 |     if (msg == nullptr)
65 |     {
66 |         auto s_ = static_cast<cublasStatus_t>(status);
67 |         switch (s_)
68 |         {
69 |         case CUBLAS_STATUS_SUCCESS: msg = "CUBLAS_STATUS_SUCCESS"; break;
70 |         case CUBLAS_STATUS_NOT_INITIALIZED: msg = "CUBLAS_STATUS_NOT_INITIALIZED"; break;
71 |         case CUBLAS_STATUS_ALLOC_FAILED: msg = "CUBLAS_STATUS_ALLOC_FAILED"; break;
72 |         case CUBLAS_STATUS_INVALID_VALUE: msg = "CUBLAS_STATUS_INVALID_VALUE"; break;
73 |         case CUBLAS_STATUS_ARCH_MISMATCH: msg = "CUBLAS_STATUS_ARCH_MISMATCH"; break;
74 |         case CUBLAS_STATUS_MAPPING_ERROR: msg = "CUBLAS_STATUS_MAPPING_ERROR"; break;
75 |         case CUBLAS_STATUS_EXECUTION_FAILED: msg = "CUBLAS_STATUS_EXECUTION_FAILED"; break;
76 |         case CUBLAS_STATUS_INTERNAL_ERROR: msg = "CUBLAS_STATUS_INTERNAL_ERROR"; break;
77 |         case CUBLAS_STATUS_NOT_SUPPORTED: msg = "CUBLAS_STATUS_NOT_SUPPORTED"; break;
78 |         case CUBLAS_STATUS_LICENSE_ERROR: msg = "CUBLAS_STATUS_LICENSE_ERROR"; break;
79 |         }
80 |     }
81 |     CublasError error(file, function, line, status, msg);
82 |     error.log(gLogError);
83 |     throw error;
84 | }
85 | 
86 | // break-pointable
87 | void throwCudnnError(const char* file, const char* function, int line, int status, const char* msg)
88 | {
89 |     CudnnError error(file, function, line, status, msg);
90 |     error.log(gLogError);
91 |     throw error;
92 | }
93 | 
94 | void logError(const char* msg, const char* file, const char* fn, int line)
95 | {
96 |     gLogError << "Parameter check failed at: " << file << "::" << fn << "::" << line;
97 |     gLogError << ", condition: " << msg << std::endl;
98 | }
99 | 
100 | // break-pointable
101 | void reportAssertion(const char* msg, const char* file, int line)
102 | {
103 |     std::ostringstream stream;
104 |     stream << "Assertion failed: " << msg << std::endl
105 |            << file << ':' << line << std::endl
106 |            << "Aborting..."
<< std::endl; 107 | getLogger()->log(nvinfer1::ILogger::Severity::kINTERNAL_ERROR, stream.str().c_str()); 108 | cudaDeviceReset(); 109 | abort(); 110 | } 111 | 112 | void TRTException::log(std::ostream& logStream) const 113 | { 114 | logStream << file << " (" << line << ") - " << name << " Error in " << function << ": " << status; 115 | if (message != nullptr) 116 | { 117 | logStream << " (" << message << ")"; 118 | } 119 | logStream << std::endl; 120 | } 121 | 122 | } // namespace plugin 123 | 124 | } // namespace nvinfer1 125 | -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/common/cub_helper.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | #include "kernel.h" 17 | template 18 | size_t cubSortPairsWorkspaceSize(int num_items, int num_segments) 19 | { 20 | size_t temp_storage_bytes = 0; 21 | cub::DeviceSegmentedRadixSort::SortPairsDescending((void*) NULL, temp_storage_bytes, (const KeyT*) NULL, 22 | (KeyT*) NULL, (const ValueT*) NULL, (ValueT*) NULL, 23 | num_items, // # items 24 | num_segments, // # segments 25 | (const int*) NULL, (const int*) NULL); 26 | return temp_storage_bytes; 27 | } 28 | -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/common/cudaDriverWrapper.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | #define CUDA_LIB_NAME "cuda" 17 | 18 | #if defined(_WIN32) 19 | #if !defined(WIN32_LEAN_AND_MEAN) 20 | #define WIN32_LEAN_AND_MEAN 21 | #endif // defined(WIN32_LEAN_AND_MEAN) 22 | #include 23 | #define dllOpen(name) (void*) LoadLibraryA("nv" name ".dll") 24 | #define dllClose(handle) FreeLibrary(static_cast(handle)) 25 | #define dllGetSym(handle, name) GetProcAddress(static_cast(handle), name) 26 | #else 27 | #include 28 | #define dllOpen(name) dlopen("lib" name ".so.1", RTLD_LAZY) 29 | #define dllClose(handle) dlclose(handle) 30 | #define dllGetSym(handle, name) dlsym(handle, name) 31 | #endif 32 | 33 | #include "cudaDriverWrapper.h" 34 | #include "plugin.h" 35 | #include 36 | #include 37 | #include 38 | 39 | using namespace nvinfer1; 40 | 41 | CUDADriverWrapper::CUDADriverWrapper() 42 | { 43 | handle = dllOpen(CUDA_LIB_NAME); 44 | ASSERT(handle != nullptr); 45 | 46 | auto load_sym = [](void* handle, const char *name) { 47 | void* ret = dllGetSym(handle, name); 48 | ASSERT(ret != nullptr); 49 | return ret; 50 | }; 51 | 52 | *(void**)(&_cuGetErrorName) = load_sym(handle, "cuGetErrorName"); 53 | *(void**)(&_cuFuncSetAttribute) = load_sym(handle, "cuFuncSetAttribute"); 54 | *(void**)(&_cuLinkComplete) = load_sym(handle, "cuLinkComplete"); 55 | *(void**)(&_cuModuleUnload) = load_sym(handle, "cuModuleUnload"); 56 | *(void**)(&_cuLinkDestroy) = load_sym(handle, "cuLinkDestroy"); 57 | *(void**)(&_cuModuleLoadData) = load_sym(handle, "cuModuleLoadData"); 58 | *(void**)(&_cuLinkCreate) = load_sym(handle, "cuLinkCreate_v2"); 59 | *(void**)(&_cuModuleGetFunction) = load_sym(handle, "cuModuleGetFunction"); 60 | *(void**)(&_cuLinkAddFile) = load_sym(handle, "cuLinkAddFile_v2"); 61 | *(void**)(&_cuLinkAddData) = load_sym(handle, "cuLinkAddData_v2"); 62 | *(void**)(&_cuLaunchCooperativeKernel) = load_sym(handle, "cuLaunchCooperativeKernel"); 63 | *(void**)(&_cuLaunchKernel) = load_sym(handle, "cuLaunchKernel"); 64 | } 65 | 66 | CUDADriverWrapper::~CUDADriverWrapper() 67 | { 68 | dllClose(handle); 69 | } 70 | 71 | CUresult CUDADriverWrapper::cuGetErrorName(CUresult error, const char** pStr) const 72 | { 73 | return (*_cuGetErrorName)(error, pStr); 74 | } 75 | 76 | CUresult CUDADriverWrapper::cuFuncSetAttribute(CUfunction hfunc, CUfunction_attribute attrib, int value) const 77 | { 78 | return (*_cuFuncSetAttribute)(hfunc, attrib, value); 79 | } 80 | 81 | CUresult CUDADriverWrapper::cuLinkComplete(CUlinkState state, void** cubinOut, size_t* sizeOut) const 82 | { 83 | return (*_cuLinkComplete)(state, cubinOut, sizeOut); 84 | } 85 | 86 | CUresult CUDADriverWrapper::cuModuleUnload(CUmodule hmod) const 87 | { 88 | return (*_cuModuleUnload)(hmod); 89 | } 90 | 91 | CUresult CUDADriverWrapper::cuLinkDestroy(CUlinkState state) const 92 | { 93 | return (*_cuLinkDestroy)(state); 94 | } 95 | 96 | CUresult CUDADriverWrapper::cuModuleLoadData(CUmodule* module, const void* image) const 97 | { 98 | return (*_cuModuleLoadData)(module, image); 99 | } 100 | 101 | CUresult CUDADriverWrapper::cuLinkCreate( 102 | uint32_t numOptions, CUjit_option* options, void** optionValues, CUlinkState* stateOut) const 103 | { 104 | return (*_cuLinkCreate)(numOptions, options, optionValues, stateOut); 105 | } 106 | 107 | CUresult CUDADriverWrapper::cuModuleGetFunction(CUfunction* hfunc, CUmodule hmod, const char* name) const 108 | { 109 | return (*_cuModuleGetFunction)(hfunc, hmod, name); 110 | } 111 | 112 | CUresult CUDADriverWrapper::cuLinkAddFile(CUlinkState state, CUjitInputType type, const char* path, uint32_t numOptions, 
113 | CUjit_option* options, void** optionValues) const 114 | { 115 | return (*_cuLinkAddFile)(state, type, path, numOptions, options, optionValues); 116 | } 117 | 118 | CUresult CUDADriverWrapper::cuLinkAddData(CUlinkState state, CUjitInputType type, void* data, size_t size, 119 | const char* name, uint32_t numOptions, CUjit_option* options, void** optionValues) const 120 | { 121 | return (*_cuLinkAddData)(state, type, data, size, name, numOptions, options, optionValues); 122 | } 123 | 124 | CUresult CUDADriverWrapper::cuLaunchCooperativeKernel(CUfunction f, uint32_t gridDimX, uint32_t gridDimY, 125 | uint32_t gridDimZ, uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ, uint32_t sharedMemBytes, 126 | CUstream hStream, void** kernelParams) const 127 | { 128 | return (*_cuLaunchCooperativeKernel)( 129 | f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams); 130 | } 131 | 132 | CUresult CUDADriverWrapper::cuLaunchKernel(CUfunction f, uint32_t gridDimX, uint32_t gridDimY, uint32_t gridDimZ, 133 | uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ, uint32_t sharedMemBytes, CUstream hStream, 134 | void** kernelParams, void** extra) const 135 | { 136 | return (*_cuLaunchKernel)( 137 | f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, extra); 138 | } 139 | -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/common/cudaDriverWrapper.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #ifndef CUDA_DRIVER_WRAPPER_H 18 | #define CUDA_DRIVER_WRAPPER_H 19 | 20 | #include 21 | #include 22 | #include 23 | 24 | #define cuErrCheck(stat, wrap) \ 25 | { \ 26 | nvinfer1::cuErrCheck_((stat), wrap, __FILE__, __LINE__); \ 27 | } 28 | 29 | namespace nvinfer1 30 | { 31 | class CUDADriverWrapper 32 | { 33 | public: 34 | CUDADriverWrapper(); 35 | 36 | ~CUDADriverWrapper(); 37 | 38 | // Delete default copy constructor and copy assignment constructor 39 | CUDADriverWrapper(const CUDADriverWrapper&) = delete; 40 | CUDADriverWrapper& operator=(const CUDADriverWrapper&) = delete; 41 | 42 | CUresult cuGetErrorName(CUresult error, const char** pStr) const; 43 | 44 | CUresult cuFuncSetAttribute(CUfunction hfunc, CUfunction_attribute attrib, int value) const; 45 | 46 | CUresult cuLinkComplete(CUlinkState state, void** cubinOut, size_t* sizeOut) const; 47 | 48 | CUresult cuModuleUnload(CUmodule hmod) const; 49 | 50 | CUresult cuLinkDestroy(CUlinkState state) const; 51 | 52 | CUresult cuModuleLoadData(CUmodule* module, const void* image) const; 53 | 54 | CUresult cuLinkCreate(uint32_t numOptions, CUjit_option* options, void** optionValues, CUlinkState* stateOut) const; 55 | 56 | CUresult cuModuleGetFunction(CUfunction* hfunc, CUmodule hmod, const char* name) const; 57 | 58 | CUresult cuLinkAddFile(CUlinkState state, CUjitInputType type, const char* path, uint32_t numOptions, 59 | CUjit_option* options, void** optionValues) const; 60 | 61 | CUresult cuLinkAddData(CUlinkState state, CUjitInputType type, void* data, size_t size, const char* name, 62 | uint32_t numOptions, CUjit_option* options, void** optionValues) const; 63 | 64 | CUresult cuLaunchCooperativeKernel(CUfunction f, uint32_t gridDimX, uint32_t gridDimY, uint32_t gridDimZ, 65 | uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ, uint32_t sharedMemBytes, CUstream hStream, 66 | void** kernelParams) const; 67 | 68 | CUresult cuLaunchKernel(CUfunction f, uint32_t gridDimX, uint32_t gridDimY, uint32_t gridDimZ, uint32_t blockDimX, 69 | uint32_t blockDimY, uint32_t blockDimZ, uint32_t sharedMemBytes, CUstream hStream, void** kernelParams, 70 | void** extra) const; 71 | 72 | private: 73 | void* handle; 74 | CUresult (*_cuGetErrorName)(CUresult, const char**); 75 | CUresult (*_cuFuncSetAttribute)(CUfunction, CUfunction_attribute, int); 76 | CUresult (*_cuLinkComplete)(CUlinkState, void**, size_t*); 77 | CUresult (*_cuModuleUnload)(CUmodule); 78 | CUresult (*_cuLinkDestroy)(CUlinkState); 79 | CUresult (*_cuLinkCreate)(unsigned int, CUjit_option*, void**, CUlinkState*); 80 | CUresult (*_cuModuleLoadData)(CUmodule*, const void*); 81 | CUresult (*_cuModuleGetFunction)(CUfunction*, CUmodule, const char*); 82 | CUresult (*_cuLinkAddFile)(CUlinkState, CUjitInputType, const char*, unsigned int, CUjit_option*, void**); 83 | CUresult (*_cuLinkAddData)( 84 | CUlinkState, CUjitInputType, void*, size_t, const char*, unsigned int, CUjit_option*, void**); 85 | CUresult (*_cuLaunchCooperativeKernel)(CUfunction, unsigned int, unsigned int, unsigned int, unsigned int, 86 | unsigned int, unsigned int, unsigned int, CUstream, void**); 87 | CUresult (*_cuLaunchKernel)(CUfunction f, uint32_t gridDimX, uint32_t gridDimY, uint32_t gridDimZ, 88 | uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ, uint32_t sharedMemBytes, CUstream hStream, 89 | void** kernelParams, void** extra); 90 | }; 91 | 92 | inline void cuErrCheck_(CUresult stat, const CUDADriverWrapper& wrap, const char* file, int line) 93 | { 94 | if (stat != CUDA_SUCCESS) 95 | { 
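        // Look up the symbolic name of the failing CUresult through the dynamically loaded driver API and report the call site.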
96 | const char* msg = nullptr; 97 | wrap.cuGetErrorName(stat, &msg); 98 | fprintf(stderr, "CUDA Error: %s %s %d\n", msg, file, line); 99 | } 100 | } 101 | 102 | } // namespace nvinfer1 103 | 104 | #endif // CUDA_DRIVER_WRAPPER_H 105 | -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/common/half.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | // 17 | // Custom wrapper around external half-precision header 18 | // 19 | // Header has some "extra parentheses" warnings when different rounding modes are used. 20 | 21 | #if defined(__GNUC__) 22 | #pragma GCC diagnostic push 23 | #pragma GCC diagnostic ignored "-Wparentheses" 24 | #endif 25 | 26 | 27 | #if defined(__clang__) 28 | #pragma clang diagnostic push 29 | #pragma clang diagnostic ignored "-Wmismatched-tags" 30 | #endif 31 | 32 | #include "ieee/half.h" 33 | 34 | #if defined(__clang__) 35 | #pragma clang diagnostic pop 36 | #endif 37 | 38 | #if defined(__GNUC__) 39 | #pragma GCC diagnostic pop 40 | #endif 41 | 42 | -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/common/kernel.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include "kernel.h" 18 | #include "plugin.h" 19 | 20 | size_t detectionInferenceWorkspaceSize(bool shareLocation, int N, int C1, int C2, int numClasses, int numPredsPerClass, 21 | int topK, DataType DT_BBOX, DataType DT_SCORE) 22 | { 23 | size_t wss[7]; 24 | wss[0] = detectionForwardBBoxDataSize(N, C1, DT_BBOX); 25 | wss[1] = detectionForwardBBoxPermuteSize(shareLocation, N, C1, DT_BBOX); 26 | wss[2] = detectionForwardPreNMSSize(N, C2); 27 | wss[3] = detectionForwardPreNMSSize(N, C2); 28 | wss[4] = detectionForwardPostNMSSize(N, numClasses, topK); 29 | wss[5] = detectionForwardPostNMSSize(N, numClasses, topK); 30 | wss[6] = std::max(sortScoresPerClassWorkspaceSize(N, numClasses, numPredsPerClass, DT_SCORE), 31 | sortScoresPerImageWorkspaceSize(N, numClasses * topK, DT_SCORE)); 32 | return calculateTotalWorkspaceSize(wss, 7); 33 | } 34 | 35 | size_t detectionInferenceWorkspaceSizeCustom(bool shareLocation, int N, int C1, int C2, int C3, int numClasses, int numPredsPerClass, 36 | int topK, DataType DT_BBOX, DataType DT_SCORE) 37 | { 38 | size_t wss[8]; 39 | wss[0] = detectionForwardBBoxDataSize(N, C1, DT_BBOX); 40 | wss[1] = detectionForwardBBoxPermuteSize(shareLocation, N, C1, DT_BBOX); 41 | wss[2] = detectionForwardPreNMSSize(N, C2); 42 | wss[3] = detectionForwardPreNMSSize(N, C2); 43 | wss[4] = detectionForwardPostNMSSize(N, numClasses, topK); 44 | wss[5] = detectionForwardPostNMSSize(N, numClasses, topK); 45 | wss[6] = std::max(sortScoresPerClassWorkspaceSize(N, numClasses, numPredsPerClass, DT_SCORE), 46 | sortScoresPerImageWorkspaceSize(N, numClasses * topK, DT_SCORE)); 47 | wss[7] = detectionForwardLandmarkDataSize(N, C3, DT_BBOX); 48 | return calculateTotalWorkspaceSize(wss, 8); 49 | } 50 | 51 | namespace nvinfer1 52 | { 53 | namespace plugin 54 | { 55 | size_t detectionInferenceWorkspaceSize(bool shareLocation, int N, int C1, int C2, int numClasses, int numPredsPerClass, 56 | int topK, DataType DT_BBOX, DataType DT_SCORE) 57 | { 58 | size_t wss[7]; 59 | wss[0] = detectionForwardBBoxDataSize(N, C1, DT_BBOX); 60 | wss[1] = detectionForwardBBoxPermuteSize(shareLocation, N, C1, DT_BBOX); 61 | wss[2] = detectionForwardPreNMSSize(N, C2); 62 | wss[3] = detectionForwardPreNMSSize(N, C2); 63 | wss[4] = detectionForwardPostNMSSize(N, numClasses, topK); 64 | wss[5] = detectionForwardPostNMSSize(N, numClasses, topK); 65 | wss[6] = std::max(sortScoresPerClassWorkspaceSize(N, numClasses, numPredsPerClass, DT_SCORE), 66 | sortScoresPerImageWorkspaceSize(N, numClasses * topK, DT_SCORE)); 67 | return calculateTotalWorkspaceSize(wss, 7); 68 | } 69 | } // namespace plugin 70 | } // namespace nvinfer1 71 | -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/common/kernels/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | file(GLOB SRCS *.cpp) 17 | set(PLUGIN_SOURCES ${PLUGIN_SOURCES} ${SRCS}) 18 | set(PLUGIN_SOURCES ${PLUGIN_SOURCES} PARENT_SCOPE) 19 | file(GLOB CU_SRCS *.cu) 20 | set(PLUGIN_CU_SOURCES ${PLUGIN_CU_SOURCES} ${CU_SRCS}) 21 | set(PLUGIN_CU_SOURCES ${PLUGIN_CU_SOURCES} PARENT_SCOPE) 22 | -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/common/kernels/common.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include "cuda.h" 18 | #include "cublas_v2.h" 19 | #include 20 | #include 21 | #include "kernel.h" 22 | #include "bboxUtils.h" 23 | 24 | #define CUDA_MEM_ALIGN 256 25 | 26 | // HASH 27 | unsigned int hash(const void* array_, size_t size) 28 | { 29 | // Apply hashing only when debugging RPN codes. 30 | if (DEBUG_ENABLE) 31 | { 32 | const char* array_const; 33 | char* array; 34 | cudaMallocHost((void**) &array, size); 35 | cudaMemcpy(array, array_, size, cudaMemcpyDeviceToHost); 36 | array_const = array; 37 | unsigned int hash = 45599; 38 | for (size_t i = 0; i < size; i++) 39 | { 40 | unsigned int value = array_const[i]; 41 | hash = hash * 1487 + value; 42 | hash = hash * 317; 43 | hash = hash % 105359; 44 | } 45 | return hash; 46 | } 47 | else 48 | { 49 | return 0; 50 | } 51 | } 52 | 53 | // ALIGNPTR 54 | int8_t* alignPtr(int8_t* ptr, uintptr_t to) 55 | { 56 | uintptr_t addr = (uintptr_t) ptr; 57 | if (addr % to) 58 | { 59 | addr += to - addr % to; 60 | } 61 | return (int8_t*) addr; 62 | } 63 | 64 | // NEXTWORKSPACEPTR 65 | int8_t* nextWorkspacePtr(int8_t* ptr, uintptr_t previousWorkspaceSize) 66 | { 67 | uintptr_t addr = (uintptr_t) ptr; 68 | addr += previousWorkspaceSize; 69 | return alignPtr((int8_t*) addr, CUDA_MEM_ALIGN); 70 | } 71 | 72 | // CALCULATE TOTAL WORKSPACE SIZE 73 | size_t calculateTotalWorkspaceSize(size_t* workspaces, int count) 74 | { 75 | size_t total = 0; 76 | for (int i = 0; i < count; i++) 77 | { 78 | total += workspaces[i]; 79 | if (workspaces[i] % CUDA_MEM_ALIGN) 80 | { 81 | total += CUDA_MEM_ALIGN - (workspaces[i] % CUDA_MEM_ALIGN); 82 | } 83 | } 84 | return total; 85 | } 86 | 87 | using nvinfer1::DataType; 88 | 89 | // DATA TYPE SIZE 90 | size_t dataTypeSize(const DataType dtype) 91 | { 92 | switch (dtype) 93 | { 94 | case DataType::kINT8: return sizeof(char); 95 | case DataType::kHALF: return sizeof(short); 96 | case DataType::kFLOAT: return sizeof(float); 97 | default: return 0; 98 | } 99 | } 100 | 101 | // CUB 102 | /* 103 | size_t cubSortFloatIntPairsWorkspaceSize(int num_items, int num_segments) 104 | { 105 | size_t temp_storage_bytes = 0; 106 | cub::DeviceSegmentedRadixSort::SortPairsDescending( 107 | (int *)NULL, temp_storage_bytes, 108 | 
(const float *)NULL, (float *)NULL, 109 | (const int *)NULL, (int *)NULL, 110 | num_items, // # items 111 | num_segments, // # segments 112 | (const int *)NULL, (const int *)NULL); 113 | return temp_storage_bytes; 114 | } 115 | 116 | size_t cubSortFloatBboxInfoPairsWorkspaceSize(int num_items, int num_segments) 117 | { 118 | size_t temp_storage_bytes = 0; 119 | cub::DeviceSegmentedRadixSort::SortPairsDescending( 120 | (int *)NULL, temp_storage_bytes, 121 | (const float *)NULL, (float *)NULL, 122 | (const BboxInfo *)NULL, (BboxInfo *)NULL, 123 | num_items, // # items 124 | num_segments, // # segments 125 | (const int *)NULL, (const int *)NULL); 126 | return temp_storage_bytes; 127 | } 128 | */ 129 | 130 | template 131 | __launch_bounds__(nthds_per_cta) 132 | __global__ void setUniformOffsets_kernel( 133 | const int num_segments, 134 | const int offset, 135 | int* d_offsets) 136 | { 137 | const int idx = blockIdx.x * nthds_per_cta + threadIdx.x; 138 | if (idx <= num_segments) 139 | d_offsets[idx] = idx * offset; 140 | } 141 | 142 | void setUniformOffsets( 143 | cudaStream_t stream, 144 | const int num_segments, 145 | const int offset, 146 | int* d_offsets) 147 | { 148 | const int BS = 32; 149 | const int GS = (num_segments + 1 + BS - 1) / BS; 150 | setUniformOffsets_kernel<<>>(num_segments, offset, d_offsets); 151 | } 152 | 153 | 154 | const char* cublasGetErrorString(cublasStatus_t error) 155 | { 156 | switch (error) 157 | { 158 | case CUBLAS_STATUS_SUCCESS: 159 | return "CUBLAS_STATUS_SUCCESS"; 160 | case CUBLAS_STATUS_NOT_INITIALIZED: 161 | return "CUBLAS_STATUS_NOT_INITIALIZED"; 162 | case CUBLAS_STATUS_ALLOC_FAILED: 163 | return "CUBLAS_STATUS_ALLOC_FAILED"; 164 | case CUBLAS_STATUS_INVALID_VALUE: 165 | return "CUBLAS_STATUS_INVALID_VALUE"; 166 | case CUBLAS_STATUS_ARCH_MISMATCH: 167 | return "CUBLAS_STATUS_ARCH_MISMATCH"; 168 | case CUBLAS_STATUS_MAPPING_ERROR: 169 | return "CUBLAS_STATUS_MAPPING_ERROR"; 170 | case CUBLAS_STATUS_EXECUTION_FAILED: 171 | return "CUBLAS_STATUS_EXECUTION_FAILED"; 172 | case CUBLAS_STATUS_INTERNAL_ERROR: 173 | return "CUBLAS_STATUS_INTERNAL_ERROR"; 174 | #if CUDA_VERSION >= 6000 175 | case CUBLAS_STATUS_NOT_SUPPORTED: 176 | return "CUBLAS_STATUS_NOT_SUPPORTED"; 177 | #endif 178 | #if CUDA_VERSION >= 6050 179 | case CUBLAS_STATUS_LICENSE_ERROR: 180 | return "CUBLAS_STATUS_LICENSE_ERROR"; 181 | #endif 182 | } 183 | return "Unknown cublas status"; 184 | } 185 | -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/common/kernels/permuteData.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | #include 17 | #include "kernel.h" 18 | 19 | template 20 | __launch_bounds__(nthds_per_cta) 21 | __global__ void permuteData_kernel( 22 | const int nthreads, 23 | const int num_classes, 24 | const int num_data, 25 | const int num_dim, 26 | bool confSigmoid, 27 | const Dtype* data, 28 | Dtype* new_data) 29 | { 30 | // data format: [batch_size, num_data, num_classes, num_dim] 31 | for (int index = blockIdx.x * nthds_per_cta + threadIdx.x; 32 | index < nthreads; 33 | index += nthds_per_cta * gridDim.x) 34 | { 35 | const int i = index % num_dim; 36 | const int c = (index / num_dim) % num_classes; 37 | const int d = (index / num_dim / num_classes) % num_data; 38 | const int n = index / num_dim / num_classes / num_data; 39 | const int new_index = ((n * num_classes + c) * num_data + d) * num_dim + i; 40 | float result = data[index]; 41 | if (confSigmoid) 42 | result = exp(result) / (1 + exp(result)); 43 | 44 | new_data[new_index] = result; 45 | } 46 | // new data format: [batch_size, num_classes, num_data, num_dim] 47 | } 48 | 49 | template 50 | pluginStatus_t permuteData_gpu( 51 | cudaStream_t stream, 52 | const int nthreads, 53 | const int num_classes, 54 | const int num_data, 55 | const int num_dim, 56 | bool confSigmoid, 57 | const void* data, 58 | void* new_data) 59 | { 60 | const int BS = 512; 61 | const int GS = (nthreads + BS - 1) / BS; 62 | permuteData_kernel<<>>(nthreads, num_classes, num_data, num_dim, confSigmoid, 63 | (const Dtype*) data, (Dtype*) new_data); 64 | CSC(cudaGetLastError(), STATUS_FAILURE); 65 | return STATUS_SUCCESS; 66 | } 67 | 68 | // permuteData LAUNCH CONFIG 69 | typedef pluginStatus_t (*pdFunc)(cudaStream_t, const int, const int, const int, const int, bool, const void*, void*); 70 | 71 | struct pdLaunchConfig 72 | { 73 | DataType t_data; 74 | pdFunc function; 75 | 76 | pdLaunchConfig(DataType t_data) 77 | : t_data(t_data) 78 | { 79 | } 80 | pdLaunchConfig(DataType t_data, pdFunc function) 81 | : t_data(t_data) 82 | , function(function) 83 | { 84 | } 85 | bool operator==(const pdLaunchConfig& other) 86 | { 87 | return t_data == other.t_data; 88 | } 89 | }; 90 | 91 | static std::array pdLCOptions = { 92 | pdLaunchConfig(DataType::kFLOAT, permuteData_gpu), pdLaunchConfig(DataType::kHALF, permuteData_gpu<__half>)}; 93 | 94 | pluginStatus_t permuteData(cudaStream_t stream, const int nthreads, const int num_classes, const int num_data, 95 | const int num_dim, const DataType DT_DATA, bool confSigmoid, const void* data, void* new_data) 96 | { 97 | pdLaunchConfig lc = pdLaunchConfig(DT_DATA); 98 | for (unsigned i = 0; i < pdLCOptions.size(); ++i) 99 | { 100 | if (lc == pdLCOptions[i]) 101 | { 102 | DEBUG_PRINTF("permuteData kernel %d\n", i); 103 | return pdLCOptions[i].function(stream, 104 | nthreads, 105 | num_classes, 106 | num_data, 107 | num_dim, 108 | confSigmoid, 109 | data, 110 | new_data); 111 | } 112 | } 113 | return STATUS_BAD_PARAM; 114 | } 115 | -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/common/kernels/reducedMathPlugin.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #ifndef _REDUCED_MATH_PLUGIN_H 18 | #define _REDUCED_MATH_PLUGIN_H 19 | #include 20 | // Dynamically strength-reduced div and mod 21 | // 22 | // Ideas taken from Sean Baxter's MGPU library. 23 | // These classes provide for reduced complexity division and modulus 24 | // on integers, for the case where the same divisor or modulus will 25 | // be used repeatedly. 26 | 27 | namespace nvinfer1 28 | { 29 | namespace plugin 30 | { 31 | namespace detail 32 | { 33 | 34 | void findDivisor(int denom, unsigned int& mul_coeff, unsigned int& shift_coeff); 35 | 36 | __host__ __device__ __forceinline__ uint32_t umulhi(uint32_t x, uint32_t y) 37 | { 38 | #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 100 39 | return __umulhi(x, y); 40 | #else 41 | uint64_t z = (uint64_t) x * (uint64_t) y; 42 | return (uint32_t) (z >> 32); 43 | #endif 44 | } 45 | 46 | // This is a weird implementation that returns div_up(0,1)=0 but 47 | // div_up(0,2)=1 (wrong) -- just do not use it with a=0. 48 | __host__ __device__ inline int div_up(int a, int b) 49 | { 50 | return (a - 1) / b + 1; 51 | } 52 | 53 | } //end namespace detail 54 | 55 | class ReducedDivisor 56 | { 57 | public: 58 | ReducedDivisor() {} 59 | __host__ __forceinline__ 60 | ReducedDivisor(int _y) 61 | : y(_y) 62 | { 63 | detail::findDivisor(y, mul_coeff, shift_coeff); 64 | } 65 | __host__ __device__ __forceinline__ 66 | ReducedDivisor(unsigned _mul_coeff, unsigned _shift_coeff, int _y) 67 | : mul_coeff(_mul_coeff) 68 | , shift_coeff(_shift_coeff) 69 | , y(_y) 70 | { 71 | } 72 | __host__ __device__ __forceinline__ int div(int x) const 73 | { 74 | // if dividing by 1, then findDivisor wouldn't have worked because 75 | // mul_coeff would have had to be 2^32, which can't be represented, 76 | // so we have to special case that one. 77 | return (y != 1) ? detail::umulhi((uint32_t) x, mul_coeff) >> shift_coeff : x; 78 | } 79 | __host__ __device__ __forceinline__ int mod(int x) const 80 | { 81 | return x - (div(x) * y); 82 | } 83 | __host__ __device__ __forceinline__ void divmod(int x, int& q, int& mod) const 84 | { 85 | q = div(x); 86 | mod = x - (q * y); 87 | } 88 | __host__ __device__ __forceinline__ int get() const 89 | { 90 | return y; 91 | } 92 | inline __host__ void get_mul_shift(unsigned& mul, unsigned& shift) 93 | { 94 | mul = mul_coeff; 95 | shift = shift_coeff; 96 | } 97 | 98 | protected: 99 | uint32_t mul_coeff; 100 | uint32_t shift_coeff; 101 | int y; 102 | }; 103 | 104 | } // namespace plugin 105 | 106 | } // namespace nvinfer1 107 | #endif /*_REDUCED_MATH_PLUGIN_H*/ 108 | -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/common/kernels/sortScoresPerImage.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | #include "cub/cub.cuh" 17 | #include 18 | #include "kernel.h" 19 | #include "bboxUtils.h" 20 | #include "cub_helper.h" 21 | 22 | template 23 | pluginStatus_t sortScoresPerImage_gpu( 24 | cudaStream_t stream, 25 | const int num_images, 26 | const int num_items_per_image, 27 | void* unsorted_scores, 28 | void* unsorted_bbox_indices, 29 | void* sorted_scores, 30 | void* sorted_bbox_indices, 31 | void* workspace, 32 | int score_bits 33 | ) 34 | { 35 | void* d_offsets = workspace; 36 | void* cubWorkspace = nextWorkspacePtr((int8_t*) d_offsets, (num_images + 1) * sizeof(int)); 37 | 38 | setUniformOffsets(stream, num_images, num_items_per_image, (int*) d_offsets); 39 | 40 | const int arrayLen = num_images * num_items_per_image; 41 | size_t temp_storage_bytes = cubSortPairsWorkspaceSize(arrayLen, num_images); 42 | size_t begin_bit = 0; 43 | size_t end_bit = sizeof(T_SCORE) * 8; 44 | if (sizeof(T_SCORE) == 2 && score_bits > 0 && score_bits <= 10) 45 | { 46 | end_bit = 10; 47 | begin_bit = end_bit - score_bits; 48 | } 49 | cub::DeviceSegmentedRadixSort::SortPairsDescending( 50 | cubWorkspace, temp_storage_bytes, 51 | (const T_SCORE*) (unsorted_scores), (T_SCORE*) (sorted_scores), 52 | (const int*) (unsorted_bbox_indices), (int*) (sorted_bbox_indices), 53 | arrayLen, num_images, 54 | (const int*) d_offsets, (const int*) d_offsets + 1, 55 | begin_bit, end_bit, 56 | stream); 57 | CSC(cudaGetLastError(), STATUS_FAILURE); 58 | return STATUS_SUCCESS; 59 | } 60 | 61 | // sortScoresPerImage LAUNCH CONFIG 62 | typedef pluginStatus_t (*sspiFunc)(cudaStream_t, 63 | const int, 64 | const int, 65 | void*, 66 | void*, 67 | void*, 68 | void*, 69 | void*, 70 | int); 71 | struct sspiLaunchConfig 72 | { 73 | DataType t_score; 74 | sspiFunc function; 75 | 76 | sspiLaunchConfig(DataType t_score) 77 | : t_score(t_score) 78 | { 79 | } 80 | sspiLaunchConfig(DataType t_score, sspiFunc function) 81 | : t_score(t_score) 82 | , function(function) 83 | { 84 | } 85 | bool operator==(const sspiLaunchConfig& other) 86 | { 87 | return t_score == other.t_score; 88 | } 89 | }; 90 | 91 | static std::array sspiLCOptions = { 92 | sspiLaunchConfig(DataType::kFLOAT, sortScoresPerImage_gpu), 93 | sspiLaunchConfig(DataType::kHALF, sortScoresPerImage_gpu<__half>), 94 | }; 95 | 96 | pluginStatus_t sortScoresPerImage( 97 | cudaStream_t stream, 98 | const int num_images, 99 | const int num_items_per_image, 100 | const DataType DT_SCORE, 101 | void* unsorted_scores, 102 | void* unsorted_bbox_indices, 103 | void* sorted_scores, 104 | void* sorted_bbox_indices, 105 | void* workspace, 106 | int score_bits 107 | ) 108 | { 109 | sspiLaunchConfig lc = sspiLaunchConfig(DT_SCORE); 110 | for (unsigned i = 0; i < sspiLCOptions.size(); ++i) 111 | { 112 | if (lc == sspiLCOptions[i]) 113 | { 114 | DEBUG_PRINTF("sortScoresPerImage kernel %d\n", i); 115 | return sspiLCOptions[i].function(stream, 116 | num_images, 117 | num_items_per_image, 118 | unsorted_scores, 119 | unsorted_bbox_indices, 120 | sorted_scores, 121 | sorted_bbox_indices, 122 | workspace, 123 | score_bits); 124 | } 125 | } 126 | return 
STATUS_BAD_PARAM; 127 | } 128 | 129 | size_t sortScoresPerImageWorkspaceSize( 130 | const int num_images, 131 | const int num_items_per_image, 132 | const DataType DT_SCORE) 133 | { 134 | const int arrayLen = num_images * num_items_per_image; 135 | size_t wss[2]; 136 | wss[0] = (num_images + 1) * sizeof(int); // offsets 137 | if (DT_SCORE == DataType::kFLOAT) 138 | { 139 | wss[1] = cubSortPairsWorkspaceSize(arrayLen, num_images); // cub workspace 140 | } 141 | else if (DT_SCORE == DataType::kHALF) 142 | { 143 | wss[1] = cubSortPairsWorkspaceSize<__half, int>(arrayLen, num_images); // cub workspace 144 | } 145 | else 146 | { 147 | printf("SCORE type not supported.\n"); 148 | return (size_t) -1; 149 | } 150 | 151 | return calculateTotalWorkspaceSize(wss, 2); 152 | } 153 | -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/common/logger.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include "logger.h" 18 | #include "ErrorRecorder.h" 19 | #include "logging.h" 20 | 21 | SampleErrorRecorder gRecorder; 22 | namespace sample 23 | { 24 | Logger gLogger{Logger::Severity::kINFO}; 25 | LogStreamConsumer gLogVerbose{LOG_VERBOSE(gLogger)}; 26 | LogStreamConsumer gLogInfo{LOG_INFO(gLogger)}; 27 | LogStreamConsumer gLogWarning{LOG_WARN(gLogger)}; 28 | LogStreamConsumer gLogError{LOG_ERROR(gLogger)}; 29 | LogStreamConsumer gLogFatal{LOG_FATAL(gLogger)}; 30 | 31 | void setReportableSeverity(Logger::Severity severity) 32 | { 33 | gLogger.setReportableSeverity(severity); 34 | gLogVerbose.setReportableSeverity(severity); 35 | gLogInfo.setReportableSeverity(severity); 36 | gLogWarning.setReportableSeverity(severity); 37 | gLogError.setReportableSeverity(severity); 38 | gLogFatal.setReportableSeverity(severity); 39 | } 40 | } // namespace sample 41 | -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/common/logger.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #ifndef LOGGER_H 18 | #define LOGGER_H 19 | 20 | #include "logging.h" 21 | 22 | class SampleErrorRecorder; 23 | extern SampleErrorRecorder gRecorder; 24 | namespace sample 25 | { 26 | extern Logger gLogger; 27 | extern LogStreamConsumer gLogVerbose; 28 | extern LogStreamConsumer gLogInfo; 29 | extern LogStreamConsumer gLogWarning; 30 | extern LogStreamConsumer gLogError; 31 | extern LogStreamConsumer gLogFatal; 32 | 33 | void setReportableSeverity(Logger::Severity severity); 34 | } // namespace sample 35 | 36 | #endif // LOGGER_H 37 | -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/common/nmsHelper.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include "cuda_fp16.h" 18 | #include "plugin.h" 19 | #include 20 | 21 | using namespace nvinfer1; 22 | using namespace nvinfer1::plugin; 23 | 24 | size_t detectionForwardBBoxDataSize(int N, int C1, DataType DT_BBOX) 25 | { 26 | if (DT_BBOX == DataType::kFLOAT) 27 | { 28 | return N * C1 * sizeof(float); 29 | } 30 | if (DT_BBOX == DataType::kHALF) 31 | { 32 | return N * C1 * sizeof(__half); 33 | } 34 | 35 | printf("Only FP32/FP16 type bounding boxes are supported.\n"); 36 | return (size_t) -1; 37 | } 38 | 39 | size_t detectionForwardBBoxPermuteSize(bool shareLocation, int N, int C1, DataType DT_BBOX) 40 | { 41 | if (DT_BBOX == DataType::kFLOAT) 42 | { 43 | return shareLocation ? 0 : N * C1 * sizeof(float); 44 | } 45 | if (DT_BBOX == DataType::kHALF) 46 | { 47 | return shareLocation ? 0 : N * C1 * sizeof(__half); 48 | } 49 | 50 | printf("Only FP32/FP16 type bounding boxes are supported.\n"); 51 | return (size_t) -1; 52 | } 53 | 54 | size_t detectionForwardLandmarkDataSize(int N, int C3, DataType DT_BBOX) 55 | { 56 | if (DT_BBOX == DataType::kFLOAT) 57 | { 58 | return N * C3 * sizeof(float); 59 | } 60 | if (DT_BBOX == DataType::kHALF) 61 | { 62 | return N * C3 * sizeof(__half); 63 | } 64 | 65 | printf("Only FP32/FP16 type bounding boxes are supported.\n"); 66 | return (size_t) -1; 67 | } 68 | 69 | size_t detectionForwardLandmarkPermuteSize(bool shareLocation, int N, int C3, DataType DT_BBOX) 70 | { 71 | if (DT_BBOX == DataType::kFLOAT) 72 | { 73 | return shareLocation ? 0 : N * C3 * sizeof(float); 74 | } 75 | if (DT_BBOX == DataType::kHALF) 76 | { 77 | return shareLocation ? 
0 : N * C3 * sizeof(__half); 78 | } 79 | 80 | printf("Only FP32/FP16 type bounding boxes are supported.\n"); 81 | return (size_t) -1; 82 | } 83 | 84 | size_t detectionForwardPreNMSSize(int N, int C2) 85 | { 86 | ASSERT(sizeof(float) == sizeof(int)); 87 | return N * C2 * sizeof(float); 88 | } 89 | 90 | size_t detectionForwardPostNMSSize(int N, int numClasses, int topK) 91 | { 92 | ASSERT(sizeof(float) == sizeof(int)); 93 | return N * numClasses * topK * sizeof(float); 94 | } 95 | -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/common/nmsUtils.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | #ifndef TRT_NMS_UTILS_H 17 | #define TRT_NMS_UTILS_H 18 | 19 | #include "plugin.h" 20 | 21 | using namespace nvinfer1; 22 | using namespace nvinfer1::plugin; 23 | 24 | size_t detectionInferenceWorkspaceSize(bool shareLocation, int N, int C1, int C2, int numClasses, int numPredsPerClass, 25 | int topK, DataType DT_BBOX, DataType DT_SCORE); 26 | size_t detectionInferenceWorkspaceSizeCustom(bool shareLocation, int N, int C1, int C2, int C3, int numClasses, int numPredsPerClass, 27 | int topK, DataType DT_BBOX, DataType DT_SCORE); 28 | #endif 29 | -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/common/reducedMathPlugin.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | #include 17 | namespace nvinfer1 18 | { 19 | namespace plugin 20 | { 21 | namespace detail 22 | { 23 | 24 | // Count leading zeros - start from most significant bit. 
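// clz(0) returns 32; otherwise it counts the zero bits above the most significant set bit,
// which lets find_log_2 below compute floor(log2(x)) as 31 - clz(x).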
25 | int clz(int x) 26 | { 27 | for (int i = 31; i >= 0; --i) 28 | { 29 | if ((1U << i) & x) 30 | { 31 | return 31 - i; 32 | } 33 | } 34 | return 32; 35 | } 36 | 37 | #define CUDNN_IS_POW_2(x) (0 == ((x) & ((x) -1))) 38 | 39 | int find_log_2(int x, bool round_up = false) 40 | { 41 | int a = 31 - clz(x); 42 | if (round_up) 43 | { 44 | a += !CUDNN_IS_POW_2(x); 45 | } 46 | return a; 47 | } 48 | 49 | void findDivisor(int denom, 50 | unsigned int& mul_coeff, unsigned int& shift_coeff) 51 | { 52 | if (denom == 0) 53 | { 54 | return; 55 | } 56 | if (denom == 1) 57 | { 58 | // if dividing by 1, reduced math doesn't work because mul_coeff would 59 | // need to be 2^32, which doesn't fit into unsigned int. the div() 60 | // routine handles this special case separately. 61 | mul_coeff = 0; 62 | shift_coeff = 0; 63 | return; 64 | } 65 | // To express the division N/D in terms of a multiplication, what we first 66 | // imagine is simply N*(1/D). However, 1/D will always evaluate to 0 (for D>1), 67 | // so we need another way. There's nothing that says we have to use exactly 68 | // the fraction 1/D; instead it could be any X/Y that reduces to 1/D (i.e., 69 | // Y=X*D), or at least to "close enough" to it. If we pick Y that is a power 70 | // of two, then the N*(X/Y) can be N*X followed by a right-shift by some amount. 71 | // The power of two we should pick should be at least 2^32, because in the 72 | // div() routine we'll use umulhi(), which returns only the upper 32 bits -- 73 | // this being equivalent to a right-shift by 32. But we might want a higher 74 | // power of two for better accuracy depending on the magnitude of the denominator. 75 | // Once we've picked Y, then X [our mul_coeff value] is simply Y/D, rounding up, 76 | // and we save shift_coeff as whatever further shift we have to do beyond 77 | // what the umulhi() implies. 78 | uint32_t p = 31 + find_log_2(denom, true); 79 | uint32_t m = ((1ull << p) + (uint32_t) denom - 1) / (uint32_t) denom; 80 | mul_coeff = m; 81 | shift_coeff = p - 32; 82 | } 83 | 84 | } // namespace detail 85 | 86 | } // namespace plugin 87 | 88 | } // namespace nvinfer1 89 | -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/common/serialize.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | #pragma once 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | #include 24 | using std::cerr; 25 | using std::cout; 26 | using std::endl; 27 | 28 | template 29 | inline void serialize_value(void** buffer, T const& value); 30 | 31 | template 32 | inline void deserialize_value(void const** buffer, size_t* buffer_size, T* value); 33 | 34 | namespace 35 | { 36 | 37 | template 38 | struct Serializer 39 | { 40 | }; 41 | 42 | template 43 | struct Serializer::value || std::is_enum::value || std::is_pod::value>::type> 45 | { 46 | static size_t serialized_size(T const&) 47 | { 48 | return sizeof(T); 49 | } 50 | static void serialize(void** buffer, T const& value) 51 | { 52 | ::memcpy(*buffer, &value, sizeof(T)); 53 | reinterpret_cast(*buffer) += sizeof(T); 54 | } 55 | static void deserialize(void const** buffer, size_t* buffer_size, T* value) 56 | { 57 | assert(*buffer_size >= sizeof(T)); 58 | ::memcpy(value, *buffer, sizeof(T)); 59 | reinterpret_cast(*buffer) += sizeof(T); 60 | *buffer_size -= sizeof(T); 61 | } 62 | }; 63 | 64 | template <> 65 | struct Serializer 66 | { 67 | static size_t serialized_size(const char* value) 68 | { 69 | return strlen(value) + 1; 70 | } 71 | static void serialize(void** buffer, const char* value) 72 | { 73 | ::strcpy(static_cast(*buffer), value); 74 | reinterpret_cast(*buffer) += strlen(value) + 1; 75 | } 76 | static void deserialize(void const** buffer, size_t* buffer_size, const char** value) 77 | { 78 | *value = static_cast(*buffer); 79 | size_t data_size = strnlen(*value, *buffer_size) + 1; 80 | assert(*buffer_size >= data_size); 81 | reinterpret_cast(*buffer) += data_size; 82 | *buffer_size -= data_size; 83 | } 84 | }; 85 | 86 | template 87 | struct Serializer, 88 | typename std::enable_if::value || std::is_enum::value || std::is_pod::value>::type> 89 | { 90 | static size_t serialized_size(std::vector const& value) 91 | { 92 | return sizeof(value.size()) + value.size() * sizeof(T); 93 | } 94 | static void serialize(void** buffer, std::vector const& value) 95 | { 96 | serialize_value(buffer, value.size()); 97 | size_t nbyte = value.size() * sizeof(T); 98 | ::memcpy(*buffer, value.data(), nbyte); 99 | reinterpret_cast(*buffer) += nbyte; 100 | } 101 | static void deserialize(void const** buffer, size_t* buffer_size, std::vector* value) 102 | { 103 | size_t size; 104 | deserialize_value(buffer, buffer_size, &size); 105 | value->resize(size); 106 | size_t nbyte = value->size() * sizeof(T); 107 | assert(*buffer_size >= nbyte); 108 | ::memcpy(value->data(), *buffer, nbyte); 109 | reinterpret_cast(*buffer) += nbyte; 110 | *buffer_size -= nbyte; 111 | } 112 | }; 113 | 114 | } // namespace 115 | 116 | template 117 | inline size_t serialized_size(T const& value) 118 | { 119 | return Serializer::serialized_size(value); 120 | } 121 | 122 | template 123 | inline void serialize_value(void** buffer, T const& value) 124 | { 125 | return Serializer::serialize(buffer, value); 126 | } 127 | 128 | template 129 | inline void deserialize_value(void const** buffer, size_t* buffer_size, T* value) 130 | { 131 | return Serializer::deserialize(buffer, buffer_size, value); 132 | } 133 | -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/gatherNMSCustomOutputs.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | #ifndef TRT_BATCHED_NMS_HELPER_H 17 | #define TRT_BATCHED_NMS_HELPER_H 18 | #include "plugin.h" 19 | using namespace nvinfer1; 20 | using namespace nvinfer1::plugin; 21 | 22 | pluginStatus_t gatherNMSCustomOutputs(cudaStream_t stream, bool shareLocation, int numImages, int numPredsPerClass, 23 | int numClasses, int topK, int keepTopK, DataType DT_BBOX, DataType DT_SCORE, const void* indices, 24 | const void* scores, const void* bboxData, const void* landData, void* keepCount, void* nmsedBoxes, void* nmsedScores, void* nmsedClasses, void* nmsedLandmarks, 25 | bool clipBoxes, const float scoreShift); 26 | 27 | #endif 28 | -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/nvdsparsebbox_Yolo.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deploy/Deepstream/sample-scrfd/nvdsinfer_custom_impl_Yolo/nvdsparsebbox_Yolo.o -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-scrfd/parser_scrfd.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import pyds 3 | import ctypes 4 | import numpy as np 5 | 6 | def layer_finder(output_layer_info, name): 7 | """ Return the layer contained in output_layer_info which corresponds 8 | to the given name. 9 | """ 10 | for layer in output_layer_info: 11 | # dataType == 0 <=> dataType == FLOAT 12 | # print(layer.layerName) 13 | if layer.dataType == 0 and layer.layerName == name: 14 | return layer 15 | return None 16 | 17 | 18 | def clip(x): 19 | return min(max(0.0, x), 1.0) 20 | 21 | def make_object(index, layers, default_classId = 1): 22 | """ Creates a NvDsInferObjectDetectionInfo object from one layer of SSD. 23 | Return None if the class Id is invalid, if the detection confidence 24 | is under the threshold or if the width/height of the bounding box is 25 | null/negative. 26 | Return the created NvDsInferObjectDetectionInfo object otherwise. 
27 | """ 28 | box_layer, score_layer = layers 29 | res = pyds.NvDsInferObjectDetectionInfo() 30 | res.detectionConfidence = score_layer[index] 31 | res.classId = default_classId 32 | 33 | rect_x1_f = box_layer[index][0] 34 | rect_y1_f = box_layer[index][1] 35 | rect_x2_f = box_layer[index][2] 36 | rect_y2_f = box_layer[index][3] 37 | res.left = clip(rect_x1_f) 38 | res.top = clip(rect_y1_f) 39 | res.width = clip(rect_x2_f - rect_x1_f) 40 | res.height = clip(rect_y2_f - rect_y1_f) 41 | 42 | return res 43 | 44 | def nvds_infer_parse_scrfd(output_layer_info, input_size): 45 | """ Get data from output_layer_info and fill object_list 46 | num_detections: [1] 47 | nmsed_bboxes: [200, 4] 48 | nmsed_scores: [200] 49 | nmsed_classes: [200] 50 | nmsed_landmarks:[200, 10] 51 | """ 52 | num_detection_layer = output_layer_info[0] 53 | box_layer = output_layer_info[1] 54 | score_layer = output_layer_info[2] 55 | class_layer = output_layer_info[3] 56 | landmark_layer = output_layer_info[4] 57 | 58 | # if not num_detection_layer or not score_layer or not class_layer or not box_layer or not landmark_layer: 59 | # sys.stderr.write("ERROR: some layers missing in output tensors\n") 60 | # return [] 61 | 62 | ptr = ctypes.cast(pyds.get_ptr(num_detection_layer.buffer), ctypes.POINTER(ctypes.c_int32)) 63 | num_detection = np.ctypeslib.as_array(ptr, shape=(1,))[0] 64 | object_list = [] 65 | landmark_list = [] 66 | 67 | if num_detection > 0: 68 | ptr = ctypes.cast(pyds.get_ptr(box_layer.buffer), ctypes.POINTER(ctypes.c_float)) 69 | box_result = np.ctypeslib.as_array(ptr, shape=(200,4)) 70 | 71 | # Normalize 72 | box_result = box_result.astype('float32') 73 | box_result[:, 0] /= input_size[0] 74 | box_result[:, 1] /= input_size[1] 75 | box_result[:, 2] /= input_size[0] 76 | box_result[:, 3] /= input_size[1] 77 | 78 | ptr = ctypes.cast(pyds.get_ptr(score_layer.buffer), ctypes.POINTER(ctypes.c_float)) 79 | score_result = np.ctypeslib.as_array(ptr, shape=(200,)) 80 | ptr = ctypes.cast(pyds.get_ptr(landmark_layer.buffer), ctypes.POINTER(ctypes.c_float)) 81 | landmark_result = np.ctypeslib.as_array(ptr, shape=(200,10)) 82 | x3_layers = box_result, score_result 83 | for i in range(num_detection): 84 | obj = make_object(i, x3_layers) 85 | if obj: 86 | object_list.append(obj) 87 | landmark_list.append(landmark_result[i]) 88 | # print(landmark_list) 89 | return object_list, landmark_list -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-yolov4/config_deepstream.txt: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | ################################################################################ 18 | 19 | [application] 20 | enable-perf-measurement=1 21 | perf-measurement-interval-sec=3 22 | #gie-kitti-output-dir=streamscl 23 | 24 | [tiled-display] 25 | enable=1 26 | rows=1 27 | columns=1 28 | width=1280 29 | height=720 30 | gpu-id=0 31 | #(0): nvbuf-mem-default - Default memory allocated, specific to particular platform 32 | #(1): nvbuf-mem-cuda-pinned - Allocate Pinned/Host cuda memory, applicable for Tesla 33 | #(2): nvbuf-mem-cuda-device - Allocate Device cuda memory, applicable for Tesla 34 | #(3): nvbuf-mem-cuda-unified - Allocate Unified cuda memory, applicable for Tesla 35 | #(4): nvbuf-mem-surface-array - Allocate Surface Array memory, applicable for Jetson 36 | nvbuf-memory-type=0 37 | 38 | [source0] 39 | enable=1 40 | #Type - 1=CameraV4L2 2=URI 3=MultiURI 41 | type=3 42 | uri=file:/home/nndam/Desktop/survelliance-videos/capture_0.mp4 43 | num-sources=1 44 | gpu-id=0 45 | # (0): memtype_device - Memory type Device 46 | # (1): memtype_pinned - Memory type Host Pinned 47 | # (2): memtype_unified - Memory type Unified 48 | cudadec-memtype=0 49 | 50 | [sink0] 51 | enable=1 52 | #Type - 1=FakeSink 2=EglSink 3=File 53 | type=2 54 | sync=0 55 | source-id=0 56 | gpu-id=0 57 | nvbuf-memory-type=0 58 | #1=mp4 2=mkv 59 | container=1 60 | #1=h264 2=h265 61 | codec=1 62 | output-file=yolov4.mp4 63 | 64 | [osd] 65 | enable=1 66 | gpu-id=0 67 | border-width=1 68 | text-size=12 69 | text-color=1;1;1;1; 70 | text-bg-color=0.3;0.3;0.3;1 71 | font=Serif 72 | show-clock=0 73 | clock-x-offset=800 74 | clock-y-offset=820 75 | clock-text-size=12 76 | clock-color=1;0;0;0 77 | nvbuf-memory-type=0 78 | 79 | [streammux] 80 | gpu-id=0 81 | ##Boolean property to inform muxer that sources are live 82 | live-source=0 83 | batch-size=1 84 | ##time out in usec, to wait after the first buffer is available 85 | ##to push the batch even if the complete batch is not formed 86 | batched-push-timeout=40000 87 | ## Set muxer output width and height 88 | width=1280 89 | height=720 90 | ##Enable to maintain aspect ratio wrt source, and allow black borders, works 91 | ##along with width, height properties 92 | enable-padding=0 93 | nvbuf-memory-type=0 94 | 95 | # config-file property is mandatory for any gie section. 96 | # Other properties are optional and if set will override the properties set in 97 | # the infer config file. 
98 | [primary-gie] 99 | enable=1 100 | gpu-id=0 101 | labelfile-path=labels.txt 102 | batch-size=1 103 | 104 | #Required by the app for OSD, not a plugin property 105 | bbox-border-color0=1;0;0;1 106 | bbox-border-color1=0;1;1;1 107 | bbox-border-color2=0;0;1;1 108 | bbox-border-color3=0;1;0;1 109 | interval=0 110 | gie-unique-id=1 111 | nvbuf-memory-type=0 112 | config-file=config_yolov4.txt 113 | 114 | [tracker] 115 | enable=1 116 | tracker-width=416 117 | tracker-height=416 118 | ll-lib-file=/opt/nvidia/deepstream/deepstream/lib/libnvds_nvmultiobjecttracker.so 119 | ll-config-file=/opt/nvidia/deepstream/deepstream/samples/configs/deepstream-app/config_tracker_NvDCF_max_perf.yml 120 | enable-batch-process=1 121 | display-tracking-id=1 122 | 123 | [tests] 124 | file-loop=0 -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-yolov4/config_tracker.txt: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # SPDX-FileCopyrightText: Copyright (c) 2019-2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | ################################################################################ 17 | 18 | # Mandatory properties for the tracker: 19 | # tracker-width 20 | # tracker-height: needs to be multiple of 6 for NvDCF 21 | # gpu-id 22 | # ll-lib-file: path to low-level tracker lib 23 | # ll-config-file: required for NvDCF, optional for KLT and IOU 24 | # 25 | [tracker] 26 | tracker-width=608 27 | tracker-height=608 28 | gpu-id=0 29 | ll-lib-file=/opt/nvidia/deepstream/deepstream/lib/libnvds_nvmultiobjecttracker.so 30 | ll-config-file=config_tracker_NvDCF_perf.yml 31 | #enable-past-frame=1 32 | enable-batch-process=1 -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-yolov4/config_yolov4.txt: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # 3 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | ################################################################################ 18 | 19 | # Following properties are mandatory when engine files are not specified: 20 | # int8-calib-file(Only in INT8), model-file-format 21 | # Caffemodel mandatory properties: model-file, proto-file, output-blob-names 22 | # UFF: uff-file, input-dims, uff-input-blob-name, output-blob-names 23 | # ONNX: onnx-file 24 | # 25 | # Mandatory properties for detectors: 26 | # num-detected-classes 27 | # 28 | # Optional properties for detectors: 29 | # cluster-mode(Default=Group Rectangles), interval(Primary mode only, Default=0) 30 | # custom-lib-path 31 | # parse-bbox-func-name 32 | # 33 | # Mandatory properties for classifiers: 34 | # classifier-threshold, is-classifier 35 | # 36 | # Optional properties for classifiers: 37 | # classifier-async-mode(Secondary mode only, Default=false) 38 | # 39 | # Optional properties in secondary mode: 40 | # operate-on-gie-id(Default=0), operate-on-class-ids(Defaults to all classes), 41 | # input-object-min-width, input-object-min-height, input-object-max-width, 42 | # input-object-max-height 43 | # 44 | # Following properties are always recommended: 45 | # batch-size(Default=1) 46 | # 47 | # Other optional properties: 48 | # net-scale-factor(Default=1), network-mode(Default=0 i.e FP32), 49 | # model-color-format(Default=0 i.e. RGB) model-engine-file, labelfile-path, 50 | # mean-file, gie-unique-id(Default=0), offsets, process-mode (Default=1 i.e. primary), 51 | # custom-lib-path, network-mode(Default=0 i.e FP32) 52 | # 53 | # The values in the config file are overridden by values set through GObject 54 | # properties. 55 | 56 | [property] 57 | gpu-id=0 58 | net-scale-factor=0.0039215697906911373 59 | #0=RGB, 1=BGR 60 | model-color-format=0 61 | model-engine-file=weights/model-1x3x416x416-fp16.engine 62 | labelfile-path=labels.txt 63 | batch-size=1 64 | ## 0=FP32, 1=INT8, 2=FP16 mode 65 | network-mode=2 66 | num-detected-classes=80 67 | gie-unique-id=1 68 | network-type=0 69 | is-classifier=0 70 | ## 0=Group Rectangles, 1=DBSCAN, 2=NMS, 3= DBSCAN+NMS Hybrid, 4 = None(No clustering) 71 | cluster-mode=2 72 | maintain-aspect-ratio=1 73 | custom-lib-path=nvdsinfer_custom_impl_Yolo/libnvdsinfer_custom_impl_Yolo.so 74 | parse-bbox-func-name=NvDsInferParseCustomYoloV4 75 | #scaling-filter=0 76 | #scaling-compute-hw=0 77 | 78 | [class-attrs-all] 79 | nms-iou-threshold=0.6 80 | pre-cluster-threshold=0.4 -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-yolov4/exec_backends/__pycache__/trt_backend.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deploy/Deepstream/sample-yolov4/exec_backends/__pycache__/trt_backend.cpython-36.pyc -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-yolov4/exec_backends/trt_backend.py: -------------------------------------------------------------------------------- 1 | import pycuda.driver as cuda 2 | import pycuda.autoinit 3 | import numpy as np 4 | 5 | import tensorrt as trt 6 | 7 | TRT_LOGGER = trt.Logger() 8 | trt.init_libnvinfer_plugins(None, "") 9 | # Simple helper data class that's a little nicer to use than a 2-tuple. 
10 | class HostDeviceMem(object): 11 | def __init__(self, host_mem, device_mem): 12 | self.host = host_mem 13 | self.device = device_mem 14 | 15 | def __str__(self): 16 | return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device) 17 | 18 | def __repr__(self): 19 | return self.__str__() 20 | 21 | # Allocates all buffers required for an engine, i.e. host/device inputs/outputs. 22 | def allocate_buffers(engine, max_boxes, total_classes): 23 | inputs = [] 24 | outputs = [] 25 | bindings = [] 26 | stream = cuda.Stream() 27 | out_shapes = [] 28 | input_shapes = [] 29 | out_names = [] 30 | max_batch_size = engine.get_profile_shape(0, 0)[2][0] 31 | # max_batch_size = 1 32 | for binding in engine: 33 | binding_shape = engine.get_binding_shape(binding) 34 | 35 | # #Fix -1 dimension for proper memory allocation for batch_size > 1 36 | # if binding == 'input': 37 | # max_width = engine.get_profile_shape(0, 0)[2][3] 38 | # max_height = engine.get_profile_shape(0, 0)[2][2] 39 | # size = max_batch_size * max_width * max_height * 3 40 | # elif binding == 'confs': 41 | # size = max_batch_size * max_boxes * (total_classes) 42 | # elif binding == 'boxes': 43 | # size = max_batch_size * max_boxes * (4) 44 | # else: 45 | # raise NotImplementedError("Not support binding: {}".format(binding)) 46 | print(binding, binding_shape) 47 | assert min(binding_shape) > 0, print(binding, binding_shape) 48 | size = 1 49 | for i in range(len(binding_shape)): 50 | size *= binding_shape[i] 51 | 52 | dtype = trt.nptype(engine.get_binding_dtype(binding)) 53 | # Allocate host and device buffers 54 | host_mem = cuda.pagelocked_empty(size, dtype) 55 | device_mem = cuda.mem_alloc(host_mem.nbytes) 56 | # Append the device buffer to device bindings. 57 | bindings.append(int(device_mem)) 58 | # Append to the appropriate list. 59 | if engine.binding_is_input(binding): 60 | inputs.append(HostDeviceMem(host_mem, device_mem)) 61 | input_shapes.append(engine.get_binding_shape(binding)) 62 | else: 63 | outputs.append(HostDeviceMem(host_mem, device_mem)) 64 | #Collect original output shapes and names from engine 65 | out_shapes.append(engine.get_binding_shape(binding)) 66 | out_names.append(binding) 67 | return inputs, outputs, bindings, stream, input_shapes, out_shapes, out_names, max_batch_size 68 | 69 | # This function is generalized for multiple inputs/outputs. 70 | # inputs and outputs are expected to be lists of HostDeviceMem objects. 71 | def do_inference(context, bindings, inputs, outputs, stream): 72 | # Transfer input data to the GPU. 73 | [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs] 74 | # Run inference. 75 | context.execute_async_v2(bindings=bindings, stream_handle=stream.handle) 76 | # Transfer predictions back from the GPU. 77 | [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs] 78 | # Synchronize the stream 79 | stream.synchronize() 80 | # Return only the host outputs. 
81 | return [out.host for out in outputs] 82 | 83 | class TrtModel(object): 84 | def __init__(self, model, max_size, total_classes = 80): 85 | self.engine_file = model 86 | self.engine = None 87 | self.inputs = None 88 | self.outputs = None 89 | self.bindings = None 90 | self.stream = None 91 | self.context = None 92 | self.input_shapes = None 93 | self.out_shapes = None 94 | self.max_batch_size = 1 95 | self.max_size = max_size 96 | self.total_classes = total_classes 97 | 98 | def build(self): 99 | with open(self.engine_file, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime: 100 | self.engine = runtime.deserialize_cuda_engine(f.read()) 101 | # Allocate 102 | self.max_boxes = self.get_number_of_boxes(self.max_size, self.max_size) 103 | self.inputs, self.outputs, self.bindings, self.stream, self.input_shapes, self.out_shapes, self.out_names, self.max_batch_size = \ 104 | allocate_buffers(self.engine, max_boxes = self.max_boxes, total_classes = self.total_classes) 105 | self.context = self.engine.create_execution_context() 106 | self.context.active_optimization_profile = 0 107 | 108 | def get_number_of_boxes(self, im_width, im_height): 109 | # Calculate total boxes (3 detect layers) 110 | assert im_width % 32 == 0 and im_height % 32 == 0 111 | return (int(im_width*im_height/32/32) + int(im_width*im_height/16/16) + int(im_width*im_height/8/8))*3 112 | 113 | def run(self, input, deflatten: bool = True, as_dict = False): 114 | # lazy load implementation 115 | if self.engine is None: 116 | self.build() 117 | 118 | input = np.asarray(input) 119 | batch_size, _, im_height, im_width = input.shape 120 | assert batch_size <= self.max_batch_size 121 | assert max(im_width, im_height) <= self.max_size, "Invalid shape: {}x{}, max shape: {}".format(im_width, im_height, self.max_size) 122 | allocate_place = np.prod(input.shape) 123 | # print('allocate_place', input.shape) 124 | self.inputs[0].host[:allocate_place] = input.flatten(order='C').astype(np.float32) 125 | self.context.set_binding_shape(0, input.shape) 126 | trt_outputs = do_inference( 127 | self.context, bindings=self.bindings, 128 | inputs=self.inputs, outputs=self.outputs, stream=self.stream) 129 | if deflatten: 130 | trt_outputs = [output[:np.prod(shape)].reshape(shape) for output, shape in zip(trt_outputs, self.out_shapes)] 131 | if as_dict: 132 | return {self.out_names[ix]: trt_output[:batch_size] for ix, trt_output in enumerate(trt_outputs)} 133 | return [trt_output[:batch_size] for trt_output in trt_outputs] 134 | -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-yolov4/labels.txt: -------------------------------------------------------------------------------- 1 | person 2 | bicycle 3 | car 4 | motorbike 5 | aeroplane 6 | bus 7 | train 8 | truck 9 | boat 10 | traffic light 11 | fire hydrant 12 | stop sign 13 | parking meter 14 | bench 15 | bird 16 | cat 17 | dog 18 | horse 19 | sheep 20 | cow 21 | elephant 22 | bear 23 | zebra 24 | giraffe 25 | backpack 26 | umbrella 27 | handbag 28 | tie 29 | suitcase 30 | frisbee 31 | skis 32 | snowboard 33 | sports ball 34 | kite 35 | baseball bat 36 | baseball glove 37 | skateboard 38 | surfboard 39 | tennis racket 40 | bottle 41 | wine glass 42 | cup 43 | fork 44 | knife 45 | spoon 46 | bowl 47 | banana 48 | apple 49 | sandwich 50 | orange 51 | broccoli 52 | carrot 53 | hot dog 54 | pizza 55 | donut 56 | cake 57 | chair 58 | sofa 59 | pottedplant 60 | bed 61 | diningtable 62 | toilet 63 | tvmonitor 64 | laptop 65 | mouse 66 | remote 67 | keyboard 
68 | cell phone 69 | microwave 70 | oven 71 | toaster 72 | sink 73 | refrigerator 74 | book 75 | clock 76 | vase 77 | scissors 78 | teddy bear 79 | hair drier 80 | toothbrush -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-yolov4/nvdsinfer_custom_impl_Yolo/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | CUDA_VER?= 18 | ifeq ($(CUDA_VER),) 19 | $(error "CUDA_VER is not set") 20 | endif 21 | CC:= g++ 22 | NVCC:=/usr/local/cuda-$(CUDA_VER)/bin/nvcc 23 | 24 | CFLAGS:= -Wall -std=c++11 -shared -fPIC -Wno-error=deprecated-declarations 25 | CFLAGS+= -I../../includes -I/usr/local/cuda-$(CUDA_VER)/include -I/opt/nvidia/deepstream/deepstream/sources/includes 26 | 27 | LIBS:= -lnvinfer_plugin -lnvinfer -lnvparsers -L/usr/local/cuda-$(CUDA_VER)/lib64 -lcudart -lcublas -lstdc++fs 28 | LFLAGS:= -shared -Wl,--start-group $(LIBS) -Wl,--end-group 29 | 30 | INCS:= $(wildcard *.h) 31 | SRCFILES:= nvdsparsebbox_Yolo.cpp 32 | 33 | TARGET_LIB:= libnvdsinfer_custom_impl_Yolo.so 34 | 35 | TARGET_OBJS:= $(SRCFILES:.cpp=.o) 36 | TARGET_OBJS:= $(TARGET_OBJS:.cu=.o) 37 | 38 | all: $(TARGET_LIB) 39 | 40 | %.o: %.cpp $(INCS) Makefile 41 | $(CC) -c -o $@ $(CFLAGS) $< 42 | 43 | %.o: %.cu $(INCS) Makefile 44 | $(NVCC) -c -o $@ --compiler-options '-fPIC' $< 45 | 46 | $(TARGET_LIB) : $(TARGET_OBJS) 47 | $(CC) -o $@ $(TARGET_OBJS) $(LFLAGS) 48 | 49 | clean: 50 | rm -rf $(TARGET_LIB) 51 | -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-yolov4/nvdsinfer_custom_impl_Yolo/nvdsparsebbox_Yolo.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include "nvdsinfer_custom_impl.h" 25 | 26 | static const int NUM_CLASSES_YOLO = 80; 27 | 28 | float clamp(const float val, const float minVal, const float maxVal) 29 | { 30 | assert(minVal <= maxVal); 31 | return std::min(maxVal, std::max(minVal, val)); 32 | } 33 | 34 | extern "C" bool NvDsInferParseCustomYoloV4( 35 | std::vector const& outputLayersInfo, 36 | NvDsInferNetworkInfo const& networkInfo, 37 | NvDsInferParseDetectionParams const& detectionParams, 38 | std::vector& objectList); 39 | 40 | 41 | /* YOLOv4 implementations */ 42 | static NvDsInferParseObjectInfo convertBBoxYoloV4(const float& bx1, const float& by1, const float& bx2, 43 | const float& by2, const uint& netW, const uint& netH) 44 | { 45 | NvDsInferParseObjectInfo b; 46 | // Restore coordinates to network input resolution 47 | 48 | float x1 = bx1 * netW; 49 | float y1 = by1 * netH; 50 | float x2 = bx2 * netW; 51 | float y2 = by2 * netH; 52 | 53 | x1 = clamp(x1, 0, netW); 54 | y1 = clamp(y1, 0, netH); 55 | x2 = clamp(x2, 0, netW); 56 | y2 = clamp(y2, 0, netH); 57 | 58 | b.left = x1; 59 | b.width = clamp(x2 - x1, 0, netW); 60 | b.top = y1; 61 | b.height = clamp(y2 - y1, 0, netH); 62 | 63 | return b; 64 | } 65 | 66 | static void addBBoxProposalYoloV4(const float bx, const float by, const float bw, const float bh, 67 | const uint& netW, const uint& netH, const int maxIndex, 68 | const float maxProb, std::vector& binfo) 69 | { 70 | NvDsInferParseObjectInfo bbi = convertBBoxYoloV4(bx, by, bw, bh, netW, netH); 71 | if (bbi.width < 1 || bbi.height < 1) return; 72 | 73 | bbi.detectionConfidence = maxProb; 74 | bbi.classId = maxIndex; 75 | binfo.push_back(bbi); 76 | } 77 | 78 | static std::vector 79 | decodeYoloV4Tensor( 80 | const float* boxes, const float* scores, const float* classes, 81 | const uint num_bboxes, NvDsInferParseDetectionParams const& detectionParams, 82 | const uint& netW, const uint& netH) 83 | { 84 | std::vector binfo; 85 | 86 | uint bbox_location = 0; 87 | uint score_location = 0; 88 | for (uint b = 0; b < num_bboxes; ++b) 89 | { 90 | float bx1 = boxes[bbox_location]; 91 | float by1 = boxes[bbox_location + 1]; 92 | float bx2 = boxes[bbox_location + 2]; 93 | float by2 = boxes[bbox_location + 3]; 94 | float maxProb = scores[score_location]; 95 | int maxIndex = (int) classes[score_location]; 96 | 97 | if (maxProb > detectionParams.perClassPreclusterThreshold[maxIndex]) 98 | { 99 | addBBoxProposalYoloV4(bx1, by1, bx2, by2, netW, netH, maxIndex, maxProb, binfo); 100 | } 101 | 102 | bbox_location += 4; 103 | score_location += 1; 104 | } 105 | 106 | return binfo; 107 | } 108 | 109 | extern "C" bool NvDsInferParseCustomYoloV4( 110 | std::vector const& outputLayersInfo, 111 | NvDsInferNetworkInfo const& networkInfo, 112 | NvDsInferParseDetectionParams const& detectionParams, 113 | std::vector& objectList) 114 | { 115 | if (NUM_CLASSES_YOLO != detectionParams.numClassesConfigured) 116 | { 117 | std::cerr << "WARNING: Num classes mismatch. 
Configured:" 118 | << detectionParams.numClassesConfigured 119 | << ", detected by network: " << NUM_CLASSES_YOLO << std::endl; 120 | } 121 | 122 | std::vector objects; 123 | const NvDsInferLayerInfo &n_bboxes = outputLayersInfo[0]; 124 | const NvDsInferLayerInfo &boxes = outputLayersInfo[1]; // (num_boxes, 4) 125 | const NvDsInferLayerInfo &scores = outputLayersInfo[2]; // (num_boxes, ) 126 | const NvDsInferLayerInfo &classes = outputLayersInfo[3]; // (num_boxes, ) 127 | 128 | 129 | int num_bboxes = *(const int*)(n_bboxes.buffer); 130 | 131 | 132 | assert(boxes.inferDims.numDims == 2); 133 | assert(scores.inferDims.numDims == 1); 134 | assert(classes.inferDims.numDims == 1); 135 | 136 | // std::cout << "Network Info: " << networkInfo.height << " " << networkInfo.width << std::endl; 137 | 138 | std::vector outObjs = 139 | decodeYoloV4Tensor( 140 | (const float*)(boxes.buffer), (const float*)(scores.buffer), (const float*)(classes.buffer), num_bboxes, detectionParams, 141 | networkInfo.width, networkInfo.height); 142 | 143 | objects.insert(objects.end(), outObjs.begin(), outObjs.end()); 144 | 145 | objectList = objects; 146 | 147 | return true; 148 | } 149 | /* YOLOv4 implementations end*/ 150 | 151 | 152 | /* Check that the custom function has been defined correctly */ 153 | CHECK_CUSTOM_PARSE_FUNC_PROTOTYPE(NvDsInferParseCustomYoloV4); -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-yolov4/test_images/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deploy/Deepstream/sample-yolov4/test_images/test.png -------------------------------------------------------------------------------- /Deploy/Deepstream/sample-yolov4/test_onnx.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | from exec_backends.trt_backend import TrtModel 4 | 5 | 6 | def preprocess(img, input_size = (416, 416)): 7 | resized_img = cv2.resize(img, (input_size[1], input_size[0])) 8 | resized_img = cv2.cvtColor(resized_img, cv2.COLOR_BGR2RGB) 9 | resized_img = np.expand_dims(resized_img, 0) 10 | resized_img = resized_img.astype('float32') / 255.0 11 | resized_img = np.transpose(resized_img, (0, 3, 1, 2)) 12 | return resized_img 13 | 14 | def visualize(img, bboxes): 15 | height, width, _ = img.shape 16 | bboxes[:, 0] *= width 17 | bboxes[:, 1] *= height 18 | bboxes[:, 2] *= width 19 | bboxes[:, 3] *= height 20 | for x1, y1, x2, y2 in bboxes: 21 | x1 = int(x1) 22 | y1 = int(y1) 23 | x2 = int(x2) 24 | y2 = int(y2) 25 | cv2.rectangle(img, (x1, y1), (x2, y2), (0, 0, 255), 2) 26 | return img 27 | 28 | if __name__ == '__main__': 29 | model_path = 'weights/model-1x3x416x416-fp16.engine' 30 | img_path = 'test_images/test.png' 31 | 32 | model = TrtModel(model_path, max_size = 416) 33 | img = cv2.imread(img_path) 34 | batch = preprocess(img) 35 | 36 | num_detections, bboxes, confs, classes = model.run(batch) 37 | print(num_detections.shape, bboxes.shape, confs.shape, classes.shape) 38 | bboxes = bboxes[0][:num_detections[0][0]] 39 | confs = confs[0][:num_detections[0][0]] 40 | classes = classes[0][:num_detections[0][0]] 41 | print(bboxes) 42 | vis = visualize(img.copy(), bboxes) 43 | cv2.imshow('vis.jpg', vis) 44 | cv2.waitKey(0) 45 | -------------------------------------------------------------------------------- 
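Note: ```test_onnx.py``` above expects ```weights/model-1x3x416x416-fp16.engine```, but this snapshot does not show how that engine is built from the NMS-augmented ONNX written by ```tools/add_nms_plugins.py``` (next file). Below is only a sketch of how such an engine could be produced with the TensorRT Python API (TensorRT 8.x style); the ONNX file name and the handling of a possibly dynamic input come from the defaults in ```add_nms_plugins.py```, not from a build command recorded in the repository.

```python
import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.INFO)
trt.init_libnvinfer_plugins(TRT_LOGGER, "")   # registers BatchedNMSDynamic_TRT

def build_engine(onnx_path, engine_path, size=416, fp16=True):
    builder = trt.Builder(TRT_LOGGER)
    network = builder.create_network(
        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    parser = trt.OnnxParser(network, TRT_LOGGER)
    with open(onnx_path, "rb") as f:
        if not parser.parse(f.read()):
            for i in range(parser.num_errors):
                print(parser.get_error(i))
            raise RuntimeError("ONNX parsing failed")

    config = builder.create_builder_config()
    config.max_workspace_size = 1 << 30                  # 1 GB
    if fp16 and builder.platform_has_fast_fp16:
        config.set_flag(trt.BuilderFlag.FP16)

    # If the graph input is dynamic (e.g. batch = -1), an optimization profile
    # is required; here it is pinned to 1x3x416x416 to match the engine name
    # used by test_onnx.py.
    inp = network.get_input(0)
    if -1 in tuple(inp.shape):
        profile = builder.create_optimization_profile()
        profile.set_shape(inp.name, (1, 3, size, size),
                          (1, 3, size, size), (1, 3, size, size))
        config.add_optimization_profile(profile)

    engine = builder.build_engine(network, config)
    if engine is None:
        raise RuntimeError("Engine build failed")
    with open(engine_path, "wb") as f:
        f.write(engine.serialize())

if __name__ == "__main__":
    build_engine("yolov4_1_3_416_416.onnx.nms.onnx",
                 "weights/model-1x3x416x416-fp16.engine")
```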
/Deploy/Deepstream/sample-yolov4/tools/add_nms_plugins.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | #!/usr/bin/env python3 18 | import onnx_graphsurgeon as gs 19 | import argparse 20 | import onnx 21 | import numpy as np 22 | 23 | def create_and_add_plugin_node(graph, topK, keepTopK): 24 | 25 | batch_size = graph.inputs[0].shape[0] 26 | input_h = graph.inputs[0].shape[2] 27 | input_w = graph.inputs[0].shape[3] 28 | print('batch_size', batch_size) 29 | 30 | tensors = graph.tensors() 31 | boxes_tensor = tensors["boxes"] 32 | confs_tensor = tensors["confs"] 33 | 34 | num_detections = gs.Variable(name="num_detections").to_variable(dtype=np.int32, shape=[-1, 1]) 35 | nmsed_boxes = gs.Variable(name="nmsed_boxes").to_variable(dtype=np.float32, shape=[-1, keepTopK, 4]) 36 | nmsed_scores = gs.Variable(name="nmsed_scores").to_variable(dtype=np.float32, shape=[-1, keepTopK]) 37 | nmsed_classes = gs.Variable(name="nmsed_classes").to_variable(dtype=np.float32, shape=[-1, keepTopK]) 38 | 39 | new_outputs = [num_detections, nmsed_boxes, nmsed_scores, nmsed_classes] 40 | 41 | mns_node = gs.Node( 42 | op="BatchedNMSDynamic_TRT", 43 | attrs=create_attrs(input_h, input_w, topK, keepTopK), 44 | inputs=[boxes_tensor, confs_tensor], 45 | outputs=new_outputs) 46 | 47 | graph.nodes.append(mns_node) 48 | graph.outputs = new_outputs 49 | 50 | return graph.cleanup().toposort() 51 | 52 | 53 | 54 | 55 | def create_attrs(input_h, input_w, topK, keepTopK): 56 | 57 | num_anchors = 3 58 | 59 | h1 = input_h // 8 60 | h2 = input_h // 16 61 | h3 = input_h // 32 62 | 63 | w1 = input_w // 8 64 | w2 = input_w // 16 65 | w3 = input_w // 32 66 | 67 | num_boxes = num_anchors * (h1 * w1 + h2 * w2 + h3 * w3) 68 | 69 | attrs = {} 70 | 71 | attrs["shareLocation"] = 1 72 | attrs["backgroundLabelId"] = -1 73 | attrs["numClasses"] = 80 74 | attrs["topK"] = topK 75 | attrs["keepTopK"] = keepTopK 76 | attrs["scoreThreshold"] = 0.4 77 | attrs["iouThreshold"] = 0.6 78 | attrs["isNormalized"] = 1 79 | attrs["clipBoxes"] = 1 80 | 81 | # 001 is the default plugin version the parser will search for, and therefore can be omitted, 82 | # but we include it here for illustrative purposes. 
83 | attrs["plugin_version"] = "1" 84 | 85 | return attrs 86 | 87 | 88 | def main(): 89 | parser = argparse.ArgumentParser(description="Add batchedNMSPlugin") 90 | parser.add_argument("-f", "--model", help="Path to the ONNX model generated by export_model.py", default="yolov4_1_3_416_416.onnx") 91 | parser.add_argument("-t", "--topK", help="number of bounding boxes for nms", default=2000) 92 | parser.add_argument("-k", "--keepTopK", help="bounding boxes to be kept per image", default=1000) 93 | 94 | args, _ = parser.parse_known_args() 95 | 96 | graph = gs.import_onnx(onnx.load(args.model)) 97 | 98 | graph = create_and_add_plugin_node(graph, int(args.topK), int(args.keepTopK)) 99 | 100 | onnx.save(gs.export_onnx(graph), args.model + ".nms.onnx") 101 | 102 | 103 | if __name__ == '__main__': 104 | main() 105 | -------------------------------------------------------------------------------- /Deploy/NVIDIA/README.md: -------------------------------------------------------------------------------- 1 | # NVIDIA frameworks, platforms, engines, toolkits, blogs, ... 2 | 3 | - [Multi-instance GPU (MIG)](docs/multi_instance_gpu.md) 4 | - [FFMPEG with NVENC NVDEC hardware-acceleration](docs/nvidia_video_sdk.md) -------------------------------------------------------------------------------- /Deploy/NVIDIA/docs/nvidia_video_sdk.md: -------------------------------------------------------------------------------- 1 | # FFMPEG hardware acceleration with Nvidia Video SDK 2 | ## 1. Requirements 3 | - GPU with hardware-acceleration support, check here: https://developer.nvidia.com/video-encode-and-decode-gpu-support-matrix-new 4 |

5 | 6 | Example of NVDEC support 7 |

8 | 9 | - Nvidia Driver 10 | - CUDA Toolkit 11 | 12 | ## 2. Install FFMPEG with hardware acceleration 13 | System Information 14 | - OS: Ubuntu 18.04 15 | - CPU: Intel(R) Xeon(R) X5650 (12M Cache, 2.66 GHz, 6.40 GT/S Intel® QPI) 16 | - NVIDIA GTX 1060 OC 3Gb 17 | 18 | ``` 19 | sudo apt-get install build-essential yasm cmake libtool libc6 libc6-dev unzip wget libnuma1 libnuma-dev libx264-dev libvpx-dev libvorbis-dev 20 | 21 | git clone --branch sdk/11.1 https://git.videolan.org/git/ffmpeg/nv-codec-headers.git 22 | 23 | cd nv-codec-headers && sudo make install && cd .. 24 | 25 | git clone --branch n4.4.3 https://git.ffmpeg.org/ffmpeg.git ffmpeg/ && cd ffmpeg 26 | 27 | ./configure --enable-nonfree --enable-cuda-nvcc --enable-nvenc --enable-cuvid --enable-nvdec --enable-libnpp --extra-cflags=-I/usr/local/cuda/include --extra-ldflags=-L/usr/local/cuda/lib64 --disable-static --enable-shared --enable-libx264 --enable-libvpx --enable-libvorbis --enable-gpl --enable-cuda 28 | 29 | make -j8 30 | 31 | sudo make install 32 | 33 | sudo ldconfig 34 | 35 | ffmpeg --help 36 | ``` 37 | If you meet error about **nvcc**, try to change line 4355 of ```ffmpeg/configure``` to ```nvccflags_default="-gencode arch=compute_35,code=sm_35 -O2"``` 38 | 39 | ## 3. Benchmark 40 | ### 3.1. Convert MPEG-4 to H264 41 | - Public **libx264** 42 | ``` 43 | ffmpeg -y -i test.avi -c:v libx264 test.mp4 44 | 45 | Output #0, mp4, to 'test.mp4': 46 | Metadata: 47 | major_brand : mp42 48 | minor_version : 0 49 | compatible_brands: isommp42 50 | com.android.model: 21121210C 51 | com.android.version: 12 52 | com.android.manufacturer: Xiaomi 53 | encoder : Lavf58.76.100 54 | Stream #0:0(eng): Video: h264 (avc1 / 0x31637661), yuv420p(tv, bt709, progressive), 1920x1080, q=2-31, 30 fps, 15360 tbn (default) 55 | Metadata: 56 | creation_time : 2022-11-23T08:27:41.000000Z 57 | handler_name : VideoHandle 58 | vendor_id : [0][0][0][0] 59 | encoder : Lavc58.134.100 libx264 60 | Side data: 61 | cpb: bitrate max/min/avg: 0/0/0 buffer size: 0 vbv_delay: N/A 62 | Stream #0:1(eng): Audio: aac (LC) (mp4a / 0x6134706D), 48000 Hz, stereo, fltp, 128 kb/s (default) 63 | Metadata: 64 | creation_time : 2022-11-23T08:27:41.000000Z 65 | handler_name : SoundHandle 66 | vendor_id : [0][0][0][0] 67 | encoder : Lavc58.134.100 aac 68 | frame= 4871 fps= 44 q=-1.0 Lsize= 263346kB time=00:02:42.27 bitrate=13294.1kbits/s dup=0 drop=3 speed=1.48x 69 | video:260623kB audio:2550kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.065733% 70 | ``` 71 | - Hardware acceleration 72 | ``` 73 | ffmpeg -y -i test.avi -c:v h264_nvenc test.mp4 74 | 75 | Output #0, mp4, to 'test2.mp4': 76 | Metadata: 77 | major_brand : mp42 78 | minor_version : 0 79 | compatible_brands: isommp42 80 | com.android.model: 21121210C 81 | com.android.version: 12 82 | com.android.manufacturer: Xiaomi 83 | encoder : Lavf58.76.100 84 | Stream #0:0(eng): Video: h264 (Main) (avc1 / 0x31637661), yuv420p(tv, bt709, progressive), 1920x1080, q=2-31, 2000 kb/s, 30 fps, 15360 tbn (default) 85 | Metadata: 86 | creation_time : 2022-11-23T08:27:41.000000Z 87 | handler_name : VideoHandle 88 | vendor_id : [0][0][0][0] 89 | encoder : Lavc58.134.100 h264_nvenc 90 | Side data: 91 | cpb: bitrate max/min/avg: 0/0/2000000 buffer size: 4000000 vbv_delay: N/A 92 | Stream #0:1(eng): Audio: aac (LC) (mp4a / 0x6134706D), 48000 Hz, stereo, fltp, 128 kb/s (default) 93 | Metadata: 94 | creation_time : 2022-11-23T08:27:41.000000Z 95 | handler_name : SoundHandle 96 | vendor_id : [0][0][0][0] 97 | encoder : 
Lavc58.134.100 aac 98 | frame= 4871 fps=269 q=41.0 Lsize= 44291kB time=00:02:42.27 bitrate=2235.9kbits/s dup=0 drop=3 speed=8.95x 99 | video:41583kB audio:2550kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.356228% 100 | [aac @ 0x5590e2d37e00] Qavg: 182.528 101 | ``` 102 | So basically, without care about bitrate, we can increase performance from **1.48x** to **8.95x** with NVIDIA hardware-acceleration 103 | -------------------------------------------------------------------------------- /Deploy/NVIDIA/fig/gpu-mig-overview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deploy/NVIDIA/fig/gpu-mig-overview.jpg -------------------------------------------------------------------------------- /Deploy/NVIDIA/fig/mig_bert.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deploy/NVIDIA/fig/mig_bert.png -------------------------------------------------------------------------------- /Deploy/NVIDIA/fig/support_nvenc_nvdec.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deploy/NVIDIA/fig/support_nvenc_nvdec.png -------------------------------------------------------------------------------- /Deploy/README.md: -------------------------------------------------------------------------------- 1 | # Deploy 2 | Tất cả những thứ liên quan đến Deploy & Deploy engines -------------------------------------------------------------------------------- /Deploy/Transfer-Learning-Toolkit/README.md: -------------------------------------------------------------------------------- 1 | # Transfer-Learning-Toolkit (TLT) from NVIDIA 2 | 3 | - [Yolov4](docs/yolov4.md) 4 | - [Detectnet_V2](docs/detectnet_v2.md) -------------------------------------------------------------------------------- /Deploy/Transfer-Learning-Toolkit/fig/detectnet_v2-inference.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deploy/Transfer-Learning-Toolkit/fig/detectnet_v2-inference.jpg -------------------------------------------------------------------------------- /Deploy/Transfer-Learning-Toolkit/fig/nvidia-retrain-qat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deploy/Transfer-Learning-Toolkit/fig/nvidia-retrain-qat.png -------------------------------------------------------------------------------- /Deploy/Transfer-Learning-Toolkit/fig/yolov4-inference.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deploy/Transfer-Learning-Toolkit/fig/yolov4-inference.png -------------------------------------------------------------------------------- /Deploy/Triton-inference-server/README.md: -------------------------------------------------------------------------------- 1 | # AI-Engineer-Note 2 | 3 | Tất cả những thứ liên quan đến Triton-inference-server 4 | ## Basic 5 | - [1. 
Installing triton-server and triton-client](docs/install.md) 6 | + [1.1. Model management modes (load/unload/reload)](docs/model_management.md) 7 | - [2. An overview of Triton backends](docs/backend.md) 8 | - [3. Basic configuration when deploying a model](docs/model_configuration.md) 9 | - [4. Deploying models](#) 10 | - [4.1 ONNX-runtime](docs/triton_onnx.md) 11 | - [4.2 TensorRT](docs/triton_tensorrt.md) 12 | - [4.3 Pytorch & TorchScript](docs/triton_pytorch.md) 13 | - [4.4 Kaldi (Advanced)](docs/triton_kaldi.md) 14 | - [5. Model Batching](docs/model_batching.md) 15 | - [6. Ensemble models and pre/post processing](docs/model_ensemble.md) 16 | ## Advanced 17 | - [Using the Performance Analyzer tool](docs/perf_analyzer.md) 18 | - [Optimizations](#) 19 | + [Optimizing the Pytorch backend](docs/optimization_pytorch.md) 20 | -------------------------------------------------------------------------------- /Deploy/Triton-inference-server/docs/backend.md: -------------------------------------------------------------------------------- 1 | # Triton backend 2 | 3 | A Triton backend is the component that actually executes a model. A typical backend wraps a deep-learning framework such as Pytorch, Tensorflow, TensorRT, ONNX-runtime or OpenVINO, much as we normally do ourselves when deploying a model (for instance, writing a class that loads the model, warms it up, and handles pre-processing, inference and post-processing, ...). Following the same idea, a ```triton-backend``` bundles the backends of these deep-learning frameworks and exposes APIs so that users can connect to the deep-learning models loaded by ```triton-server```. As of the current release, ```triton-server``` supports the following backends: 4 | - TensorRT (platform: ```tensorrt_plan```) 5 | - Pytorch (platform: ```pytorch_libtorch```) 6 | - ONNX (platform: ```onnxruntime_onnx```) 7 | - Tensorflow (platform: ```tensorflow_savedmodel```) 8 | - Other backends (platform: depends on the backend definition) -------------------------------------------------------------------------------- /Deploy/Triton-inference-server/docs/install.md: -------------------------------------------------------------------------------- 1 | # Install Triton 2 | 3 | This section covers installing and quickly trying out triton-server and triton-client. 4 | 5 | ## 1. Installing triton-server 6 | If **triton-server** is already installed on the machine, you can skip this step and move on to installing and using **triton-client**. Currently the fastest way to use triton-inference-server is the Docker image from NVIDIA NGC; the build-from-source route will be covered another time. 7 | ### 1.1 Install from NVIDIA NGC 8 | ``` 9 | docker pull nvcr.io/nvidia/tritonserver:<xx.yy>-py3 10 | ``` 11 | where ```<xx.yy>``` is the release version, for example 12 | ``` 13 | docker pull nvcr.io/nvidia/tritonserver:21.12-py3 14 | ``` 15 | ### 1.2 Running a test model 16 | Here I will run a wav2vec-base model (which I have already converted to ONNX) on the ONNX-runtime backend. 
The directory structure I set up looks like this: 17 | ```bash 18 | ├── models 19 | │ ├── wav2vec_general_v2 20 | │ │ ├── 1 21 | │ │ │ ├── model.onnx 22 | │ │ ├── config.pbtxt 23 | ``` 24 | File ```config.pbtxt``` 25 | ``` 26 | name: "wav2vec_general_v2" 27 | platform: "onnxruntime_onnx" 28 | max_batch_size : 0 29 | input [ 30 | { 31 | name: "input" 32 | data_type: TYPE_FP32 33 | dims: [1, -1] 34 | } 35 | ] 36 | output [ 37 | { 38 | name: "output" 39 | data_type: TYPE_FP32 40 | dims: [-1, -1, 105] 41 | } 42 | ] 43 | ``` 44 | Run triton-server on GPU 1 (assuming the current working directory is the one that contains the ```models``` directory) 45 | ``` 46 | docker run --gpus device=1 --rm -p8000:8000 -p8001:8001 -p8002:8002 -v $(pwd)/models:/models nvcr.io/nvidia/tritonserver:21.12-py3 tritonserver --model-repository=/models 47 | ``` 48 | Or run triton-server on GPU 1 with shared memory enabled 49 | ``` 50 | docker run --gpus device=1 --rm --ipc=host --shm-size=128m -p8000:8000 -p8001:8001 -p8002:8002 -v $(pwd)/models:/models nvcr.io/nvidia/tritonserver:21.12-py3 tritonserver --model-repository=/models 51 | ``` 52 | 53 | Output 54 |

55 | 56 |

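Before installing the client, a quick way to confirm that the server from section 1.2 actually came up is to query Triton's HTTP endpoints (default port 8000). A minimal sketch, assuming the server runs on localhost and the model is named ```wav2vec_general_v2``` as above:

```python
import requests

base = "http://localhost:8000"

# Server-level readiness
print(requests.get(f"{base}/v2/health/ready").status_code)                    # 200 when ready

# Readiness and metadata of the model deployed in section 1.2
print(requests.get(f"{base}/v2/models/wav2vec_general_v2/ready").status_code)
print(requests.get(f"{base}/v2/models/wav2vec_general_v2").json())
```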
57 | 58 | 59 | ## 2. Installing triton-client 60 | ### 2.1 Basic installation 61 | For the basic use case of calling the server from ```python```, we can install it quickly with ```pip``` 62 | ``` 63 | pip install tritonclient grpcio-tools 64 | ``` 65 | ### 2.2 Advanced installation 66 | Unlike installing with ```pip``` and calling **triton-server** from ```python```, this part is mainly about building from source in order to use the bundled tools such as the **Model Analyzer** and the **Performance Analyzer** 67 | - Install the required Linux packages 68 | ``` 69 | sudo apt-get install curl libcurl4-openssl-dev libb64-dev default-jdk maven 70 | ``` 71 | - Install ```rapidjson``` 72 | ``` 73 | git clone https://github.com/Tencent/rapidjson.git 74 | cd rapidjson 75 | cmake . 76 | make 77 | sudo make install 78 | ``` 79 | - Install one more ```python``` package up front, otherwise the build throws an error halfway through and part of it has to be rebuilt 80 | ``` 81 | python3 -m pip install grpcio-tools 82 | ``` 83 | - Build **triton-client** (the **triton-server** used here is the r21.12 Docker release) 84 | ``` 85 | git clone --recursive https://github.com/triton-inference-server/client.git triton-client 86 | cd triton-client 87 | mkdir build && cd build 88 | cmake -DCMAKE_INSTALL_PREFIX=`pwd`/install -DTRITON_ENABLE_CC_HTTP=ON -DTRITON_ENABLE_CC_GRPC=ON -DTRITON_ENABLE_PERF_ANALYZER=ON -DTRITON_ENABLE_PYTHON_HTTP=ON -DTRITON_ENABLE_PYTHON_GRPC=ON -DTRITON_ENABLE_JAVA_HTTP=ON -DTRITON_ENABLE_GPU=ON -DTRITON_ENABLE_EXAMPLES=ON -DTRITON_ENABLE_TESTS=ON -DTRITON_COMMON_REPO_TAG=r21.12 -DTRITON_THIRD_PARTY_REPO_TAG=r21.12 -DTRITON_CORE_REPO_TAG=r21.12 -DTRITON_BACKEND_REPO_TAG=r21.12 .. 89 | make cc-clients python-clients java-clients 90 | ``` 91 | - The built artifacts then appear under ```triton-client/build/install```, and the one we care about here is ```bin/perf_analyzer``` 92 | ### 2.3 Using triton-client to send requests and read results 93 | See [src/sample_grpc.py](../src/sample_grpc.py) 94 | 95 |
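A minimal sketch of such a gRPC call for the ```wav2vec_general_v2``` model from section 1.2 is shown below. This is an illustrative example rather than the contents of ```src/sample_grpc.py```; the tensor names ```input```/```output``` come from the ```config.pbtxt``` above, and the random audio is only a placeholder.

```python
import numpy as np
import tritonclient.grpc as grpcclient

client = grpcclient.InferenceServerClient(url="localhost:8001")

# Placeholder input: 1 second of fake 16 kHz audio, shape [1, N], FP32
audio = np.random.randn(1, 16000).astype(np.float32)

inputs = [grpcclient.InferInput("input", list(audio.shape), "FP32")]
inputs[0].set_data_from_numpy(audio)
outputs = [grpcclient.InferRequestedOutput("output")]

result = client.infer(model_name="wav2vec_general_v2",
                      inputs=inputs, outputs=outputs)
logits = result.as_numpy("output")   # shape [1, T, 105]
print(logits.shape)
```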
-------------------------------------------------------------------------------- /Deploy/Triton-inference-server/docs/model_batching.md: -------------------------------------------------------------------------------- 1 | # Model Batching 2 | 3 | This section looks at some of the batching mechanisms supported by Triton 4 | 5 | ### Dynamic Batching 6 | Dynamic Batching needs little introduction: concurrently arriving messages are grouped together and inferred as a batch; this mainly aims to increase [throughput](../docs/perf_analyzer.md) (which also increases [latency](../docs/perf_analyzer.md) under the same resource budget) 7 | ``` 8 | dynamic_batching { } 9 | ``` 10 | or additionally configure the maximum time the queue waits for new messages (microseconds) 11 | ``` 12 | dynamic_batching { 13 | max_queue_delay_microseconds: 100 14 | } 15 | ``` 16 | 17 | ### Ragged Batching 18 | -------------------------------------------------------------------------------- /Deploy/Triton-inference-server/docs/model_configuration.md: -------------------------------------------------------------------------------- 1 | # Model Configuration 2 | By default, the configuration has to declare the model's parameters up front: the model name, the platform used (```tensorrt_plan, pytorch_libtorch, tensorflow_savedmodel, ...```), data types and shapes for inputs and outputs, warmup configuration, optimization configuration, ... 3 | ### 1. Basic configuration (minimal model configuration) 4 | By default we do not need to write a configuration for TensorRT, Tensorflow saved-model and ONNX models, because Triton can generate one automatically. For these models, if no ```config.pbtxt``` exists and triton-server is started with ```--strict-model-config=false```, triton-server will generate a basic ```config.pbtxt``` automatically. Alternatively, we can write ```config.pbtxt``` by hand. Here I will write the configuration for the GFPGan Pre-processing, Inference and Post-processing code, all of which use Pytorch. 5 | - Pre-processing 6 | ``` 7 | name: "pre_gfpgan_batch" 8 | platform: "pytorch_libtorch" 9 | max_batch_size: 8 10 | input [ 11 | { 12 | name: "input__0" 13 | data_type: TYPE_UINT8 14 | dims: [-1, -1, 3] 15 | } 16 | ] 17 | output [ 18 | { 19 | name: "output__0" 20 | data_type: TYPE_FP32 21 | dims: [3, -1, -1] 22 | } 23 | ] 24 | ``` 25 | - Inference 26 | ``` 27 | name: "infer_face_restoration_v2.1" 28 | platform: "pytorch_libtorch" 29 | max_batch_size: 8 30 | input [ 31 | { 32 | name: "input__0" 33 | data_type: TYPE_FP32 34 | dims: [3, 512, 512] 35 | } 36 | ] 37 | output [ 38 | { 39 | name: "output__0" 40 | data_type: TYPE_FP32 41 | dims: [3, 512, 512] 42 | } 43 | ] 44 | ``` 45 | - Post-processing 46 | ``` 47 | name: "post_gfpgan_batch" 48 | platform: "pytorch_libtorch" 49 | max_batch_size: 8 50 | input [ 51 | { 52 | name: "input__0" 53 | data_type: TYPE_FP32 54 | dims: [3, -1, -1] 55 | } 56 | ] 57 | output [ 58 | { 59 | name: "output__0" 60 | data_type: TYPE_UINT8 61 | dims: [-1, -1, 3] 62 | } 63 | ] 64 | ``` 65 | 66 | The value **-1** stands for a **dynamic shape** 67 | 68 | Pay attention to the ```max_batch_size``` value: when it is **non-zero**, ```dims``` is interpreted as the shape of **a single input sample** and the model accepts inputs from ```1 x dims``` up to ```max_batch_size x dims``` (dynamic batch); when it is **zero**, ```dims``` is interpreted as the **full input shape** (static batch)
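To make the ```max_batch_size``` rule concrete: with the ```infer_face_restoration_v2.1``` configuration above (```max_batch_size: 8```, ```dims: [3, 512, 512]```), a client may send any request shaped ```[B, 3, 512, 512]``` with ```1 <= B <= 8```. A short illustrative sketch using the gRPC client, assuming the model is served on localhost:

```python
import numpy as np
import tritonclient.grpc as grpcclient

client = grpcclient.InferenceServerClient(url="localhost:8001")

# dims in config.pbtxt describe ONE sample; the batch dim is added by the client
batch = np.random.rand(4, 3, 512, 512).astype(np.float32)   # B = 4 <= max_batch_size

inp = grpcclient.InferInput("input__0", list(batch.shape), "FP32")
inp.set_data_from_numpy(batch)
out = grpcclient.InferRequestedOutput("output__0")

res = client.infer("infer_face_restoration_v2.1", inputs=[inp], outputs=[out])
print(res.as_numpy("output__0").shape)   # (4, 3, 512, 512)
```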
--------------------------------------------------------------------------------
/Deploy/Triton-inference-server/docs/model_ensemble.md:
--------------------------------------------------------------------------------
# Ensemble multiple models and pre/post-processing

This section covers Model Ensemble as a way to handle two situations:
- Building an end-to-end pipeline from 2 or more models (the output of one model is the input of another)
- Integrating pre-processing / post-processing into the pipeline

Note: both situations are solved the same way.
### 1. Problem statement
In my case, I use the GFPGan model with several different datasets, which gives several versions of the model. All of these versions share the **same** **pre-processing** and **post-processing**. The current deployment keeps pre/post-processing on the ```client``` side, which becomes awkward when scaling out. So the question is how to integrate these two steps into triton quickly and flexibly, to cut down on intermediate hand-offs and on the number of requests sent. Triton supports this through **Model Ensemble**. The idea boils down to two bullet points:
- The pre/post-processing steps are each built into a triton model of their own
- An ensemble model is created: ```pre-processing -> infer -> post-processing```. This is not a real model but a ```dataflow``` defined through the model configuration
### 2. Converting the pre/post-processing into models
Take my image pre-processing as an example (numpy & opencv-python, pure CPU):
```
def triton_preprocess(cropped_face):
    rgb = cv2.cvtColor(cropped_face, cv2.COLOR_BGR2RGB)            # BGR to RGB
    rgb = rgb.astype("float32") / 255.0                            # Rescale to [0, 1]
    rgb = (rgb - 0.5)/0.5                                          # Rescale from [0, 1] to [-1, 1]
    rgb = np.expand_dims(rgb, axis = 0)                            # [256, 256, 3] -> [1, 256, 256, 3]
    return np.transpose(rgb, (0, 3, 1, 2))                         # [1, 256, 256, 3] -> [1, 3, 256, 256]

def triton_postprocess(net_out, min_max = (-1, 1)):
    net_out = np.transpose(net_out, (0, 2, 3, 1))                  # [1, 3, 256, 256] -> [1, 256, 256, 3]
    net_out = np.clip(net_out[0], min_max[0], min_max[1])          # [1, 256, 256, 3] -> [256, 256, 3] & clip
    net_out = (net_out - min_max[0]) / (min_max[1] - min_max[0])   # Rescale from [-1, 1] to [0, 1]
    net_out = np.array(net_out * 255.0, dtype = np.uint8)          # Rescale from [0, 1] to [0, 255] as uint8
    return cv2.cvtColor(net_out, cv2.COLOR_RGB2BGR)                # RGB to BGR
```
Convert it to pytorch:
```
class GFPGanPreprocessor(nn.Module):
    def __init__(self):
        super(GFPGanPreprocessor, self).__init__()
    def forward(self, x):
        x = x[:, :, [2, 1, 0]]                  # BGR -> RGB on the channel axis
        x = x / 255.0
        x = (x - 0.5)/0.5
        x = torch.unsqueeze(x, 0)               # [H, W, 3] -> [1, H, W, 3]
        return torch.permute(x, (0, 3, 1, 2))   # [1, H, W, 3] -> [1, 3, H, W]

class GFPGanPostprocessor(nn.Module):
    def __init__(self):
        super(GFPGanPostprocessor, self).__init__()
    def forward(self, x):
        x = torch.permute(x, (0, 2, 3, 1))      # [1, 3, H, W] -> [1, H, W, 3]
        x = torch.clamp(x, -1.0, 1.0)
        x = ((x + 1.0)/2.0*255.0).byte()
        x = x[0]                                # drop the batch dimension, as in the numpy reference
        return x[:, :, [2, 1, 0]]               # RGB -> BGR on the channel axis
```
Use pytorch JIT; if you are not familiar with JIT yet, see this article:
- [Deploying a model with Pytorch (TorchScript) and Triton](./triton_pytorch.md)
```
# JIT
pre_model = GFPGanPreprocessor()
post_model = GFPGanPostprocessor()
pre_model.eval()
post_model.eval()

pre_x = torch.rand((256, 256, 3))
pre_traced_cell = torch.jit.trace(pre_model, (pre_x,), strict=False, check_trace=True)
print(pre_model(pre_x))
print(pre_traced_cell(pre_x))
pre_traced_cell.save('pre_traced_cell.pt')

post_x = torch.rand((1, 3, 256, 256))
post_traced_cell = torch.jit.trace(post_model, (post_x,), strict=False, check_trace=True)
print(post_model(post_x))
print(post_traced_cell(post_x))
post_traced_cell.save('post_traced_cell.pt')
```
The result is two files, ```pre_traced_cell.pt``` and ```post_traced_cell.pt```, which are the pre/post-processing models.
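A quick sanity check is to load the traced files back with ```torch.jit.load``` (a sketch; the 512x512 shape follows the GFPGan deployment above, and the expected post-processing output assumes the batch dimension is dropped as in the module code):
```python
import torch

# Reload the traced modules and check their shapes on dummy data
pre = torch.jit.load('pre_traced_cell.pt')
post = torch.jit.load('post_traced_cell.pt')

face = torch.rand(512, 512, 3)                 # stand-in for an HxWx3 image
net_in = pre(face)
print(net_in.shape)                            # expected: [1, 3, 512, 512]

restored = post(torch.rand(1, 3, 512, 512))
print(restored.shape, restored.dtype)          # expected: [512, 512, 3], torch.uint8
```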
### 3. Pushing the models to triton-server
This step is straightforward: push the two models to triton with the following configurations
- Pre-process
```
name: "pre_gfpgan"
platform: "pytorch_libtorch"
max_batch_size: 0
input [
  {
    name: "input__0"
    data_type: TYPE_UINT8
    dims: [-1, -1, 3]
  }
]
output [
  {
    name: "output__0"
    data_type: TYPE_FP32
    dims: [1, 3, -1, -1]
  }
]
```
- Post-process
```
name: "post_gfpgan"
platform: "pytorch_libtorch"
max_batch_size: 0
input [
  {
    name: "input__0"
    data_type: TYPE_FP32
    dims: [1, 3, -1, -1]
  }
]
output [
  {
    name: "output__0"
    data_type: TYPE_UINT8
    dims: [-1, -1, 3]
  }
]
```
- When pushing to triton, EXPLICIT MODE is recommended, as described in:
  + [Model management modes (load/unload/reload)](./model_management.md)

### 4. Creating the Ensemble Model
Set up the ensemble model with ```raw_image``` as input and ```image_out``` as output
- For pre-processing, ```raw_image``` is fed into ```input__0``` of the ```pre_gfpgan``` model we just loaded onto triton above
- ```pre_gfpgan``` returns ```preprocessed_image```, which is fed into ```input__0``` of ```infer_face_restoration_v2.1```
- The output of ```infer_face_restoration_v2.1```, named ```net_out```, is in turn the input of ```post_gfpgan```
- Finally, the output of ```post_gfpgan``` is ```image_out```, which is also the final output of the ensemble
```
name: "ens_face_restoration_v2.1"
platform: "ensemble"
max_batch_size: 0
input [
  {
    name: "raw_image"
    data_type: TYPE_UINT8
    dims: [-1, -1, 3]
  }
]
output [
  {
    name: "image_out"
    data_type: TYPE_UINT8
    dims: [-1, -1, 3]
  }
]
ensemble_scheduling {
  step [
    {
      model_name: "pre_gfpgan"
      model_version: -1
      input_map {
        key: "input__0"
        value: "raw_image"
      }
      output_map {
        key: "output__0"
        value: "preprocessed_image"
      }
    },
    {
      model_name: "infer_face_restoration_v2.1"
      model_version: -1
      input_map {
        key: "input__0"
        value: "preprocessed_image"
      }
      output_map {
        key: "output__0"
        value: "net_out"
      }
    },
    {
      model_name: "post_gfpgan"
      model_version: -1
      input_map {
        key: "input__0"
        value: "net_out"
      }
      output_map {
        key: "output__0"
        value: "image_out"
      }
    }
  ]
}
```

With the configuration in place, create an **empty** directory ```1``` for the first version and push it to triton-server, and we are done.
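Once the ensemble is loaded, the client only needs a single request for the whole pre -> infer -> post pipeline. A minimal gRPC sketch (the image path and the ```localhost:8001``` URL are placeholders, and the face crop is assumed to already be 512x512 as expected by ```infer_face_restoration_v2.1```):
```python
import cv2
import tritonclient.grpc as grpcclient

client = grpcclient.InferenceServerClient(url="localhost:8001")

face = cv2.imread("face_crop_512.jpg")                         # HxWx3, BGR, uint8
inp = grpcclient.InferInput("raw_image", list(face.shape), "UINT8")
inp.set_data_from_numpy(face)
out = grpcclient.InferRequestedOutput("image_out")

result = client.infer(model_name="ens_face_restoration_v2.1",
                      inputs=[inp], outputs=[out])
restored = result.as_numpy("image_out")                        # HxWx3, BGR, uint8
cv2.imwrite("face_restored.jpg", restored)
```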
--------------------------------------------------------------------------------
/Deploy/Triton-inference-server/docs/model_instance.md:
--------------------------------------------------------------------------------
# Model Instance

When scaling the system up, we want to run multiple instances of a model on one or more GPUs to maximize speed and minimize user-facing latency. In other words, a user request has more workers to choose from, which removes the bottleneck on the inference side. This section therefore covers the Model Instance configuration of ```triton-server```.
### 1. How Model Instances work in triton-server
The Triton architecture allows multiple models, and one or more instances of the same model, to execute in parallel on the system. The system may have zero, one or several GPUs. The figure below illustrates this with 2 models: assuming Triton is currently idle, when 2 requests arrive at the same time, one per model, Triton immediately schedules both onto the GPU and executes them in parallel. On a system without GPUs, scheduling on the CPU runs on threads and depends on the host OS.


By default, if multiple requests for the same model arrive at the same time, Triton schedules them so that only one request is handled at a time.


Triton provides a per-model configuration called **instance-group** that specifies how many executions can run in parallel; each such execution is called an **instance**. By default, Triton creates the **instances** on different GPUs. For example, in the figure below there are 3 instances and 4 incoming requests; the 4th request has to wait until one of the first 3 executions finishes before it can start.
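A sketch of what that looks like in ```config.pbtxt``` (the counts and GPU ids are illustrative):
```
instance_group [
  {
    count: 2          # two instances on each listed GPU
    kind: KIND_GPU
    gpus: [ 0, 1 ]
  }
]
```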


--------------------------------------------------------------------------------
/Deploy/Triton-inference-server/docs/model_management.md:
--------------------------------------------------------------------------------
# Model Management

Triton has 3 model management modes: **NONE** (default), **EXPLICIT** (dynamic) and **POLL**

### NONE Mode (Default)
- Set ```--model-control-mode=none```
- Triton loads every model with its configuration into memory; models that fail to load are skipped and unavailable.
- Changing a model's repository while the server is running has no effect on the running system
- The ```load``` and ```unload``` APIs from ```triton-client``` **cannot** be used
- Pros:
  + Easy to use
- Cons:
  + Hard to customize
  + Adding/removing models **requires** restarting ```triton-server```
### EXPLICIT Mode (Recommended)
- Set ```--model-control-mode=explicit```
- By default triton loads **no** model into memory unless the ```--load-model``` flag is given. So with a default start-up you have to call the ```load``` API **manually** for the models you need
- Models can be ```load```ed and ```unload```ed at will through the ```triton-client``` API
- Changing a model's repository while the server is running does affect the running system: **that model is reloaded**
- Pros:
  + Easy to customize
  + Adding/removing models does **not** require restarting ```triton-server```
- Cons:
  + Takes a bit more effort to learn and use

See the ```Load/Unload/Reload``` model API in Python [here](../src/sample_load_unload.py)
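For reference, the corresponding launch commands look roughly like this (a sketch; the ```/models``` path and the model name are placeholders):
```
# NONE (default): load everything found in the model repository
tritonserver --model-repository=/models

# EXPLICIT: start without loading anything (or preload selected models),
# then load/unload at runtime through the client API
tritonserver --model-repository=/models --model-control-mode=explicit
tritonserver --model-repository=/models --model-control-mode=explicit --load-model=wav2vec_general_v2
```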
### POLL
Word is that it is not recommended in **production**, so I was too lazy to read about it ...
--------------------------------------------------------------------------------
/Deploy/Triton-inference-server/docs/optimization_pytorch.md:
--------------------------------------------------------------------------------
# Optimize Pytorch Backend
When ```triton-server``` starts up and loads models that use the ```pytorch``` backend, you sometimes see messages like:
```
I1227 03:45:06.216251 1 libtorch.cc:1255] TRITONBACKEND_ModelInitialize: license_plate_restoration_square_v1.1 (version 1)
I1227 03:45:06.216786 1 libtorch.cc:251] Optimized execution is enabled for model instance 'license_plate_restoration_square_v1.1'
I1227 03:45:06.216796 1 libtorch.cc:269] Inference Mode is disabled for model instance 'license_plate_restoration_square_v1.1'
I1227 03:45:06.216800 1 libtorch.cc:344] NvFuser is not specified for model instance 'license_plate_restoration_square_v1.1'
```
These messages mean **Inference Mode** and **NvFuser** have not been enabled for speed. This section therefore covers how to tune ```triton-server``` for the ```pytorch``` backend with suitable parameters.

### 1. Inference Mode

**InferenceMode** behaves like **NoGradMode** in that autograd is not used. So in the vast majority of cases, as long as the model is not special (i.e. does not contain operators affected by autograd), we can enable **InferenceMode** in the configuration file as follows:

```
parameters: {
  key: "INFERENCE_MODE"
  value: {
    string_value: "true"
  }
}
```

- Results with **Inference Mode** off (default)
```
Inferences/Second vs. Client p95 Batch Latency
Concurrency: 1, throughput: 46.4 infer/sec, latency 24657 usec
Concurrency: 2, throughput: 53.8 infer/sec, latency 41444 usec
Concurrency: 3, throughput: 54 infer/sec, latency 59257 usec
Concurrency: 4, throughput: 53.4 infer/sec, latency 81955 usec
```
- Results after enabling it (slightly improved)
```
Inferences/Second vs. Client p95 Batch Latency
Concurrency: 1, throughput: 42.6 infer/sec, latency 27506 usec
Concurrency: 2, throughput: 54.4 infer/sec, latency 40857 usec
Concurrency: 3, throughput: 54 infer/sec, latency 60192 usec
Concurrency: 4, throughput: 53.6 infer/sec, latency 81830 usec
```

### 2. NvFuser (CUDA Graph Fuser)
If you have read about **TensorRT Optimization**, the mechanism of **NvFuser** is similar: it simply fuses certain operators together to speed up execution. This fusing mechanism has become very common and is built into most frameworks today.
Enable **NvFuser**:
```
parameters: {
  key: "ENABLE_NVFUSER"
  value: {
    string_value: "true"
  }
}
```

### 3. Other optimization flags
There are a few other **optimization flags** worth trying
```
ENABLE_JIT_EXECUTOR
ENABLE_JIT_PROFILING
ENABLE_TENSOR_FUSER
```
Note that enabling all of the ```optimization flags``` does not necessarily give the best result. The recommendation is to use only **INFERENCE_MODE** by default. Below are the results with all ```optimization flags``` enabled

```
Inferences/Second vs. Client p95 Batch Latency
Concurrency: 1, throughput: 42.2 infer/sec, latency 27052 usec
Concurrency: 2, throughput: 48.2 infer/sec, latency 46771 usec
Concurrency: 3, throughput: 49.8 infer/sec, latency 65506 usec
Concurrency: 4, throughput: 29 infer/sec, latency 189399 usec
```

### 4. Model Instance
Enabling several **instances** improves speed because the incoming stream of requests has more consumers to choose from. However, the ```optimization flags``` sometimes cause errors in this setup, so when running multiple **model instances** it is better to set ```DISABLE_OPTIMIZED_EXECUTION```
```
parameters: {
  key: "DISABLE_OPTIMIZED_EXECUTION"
  value: {
    string_value: "true"
  }
}
```
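Putting the last two points together, a hypothetical ```config.pbtxt``` fragment for a ```pytorch_libtorch``` model running several instances with optimized execution disabled might look like this (a sketch, not a drop-in config):
```
instance_group [
  {
    count: 2
    kind: KIND_GPU
  }
]
parameters: {
  key: "DISABLE_OPTIMIZED_EXECUTION"
  value: {
    string_value: "true"
  }
}
```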
--------------------------------------------------------------------------------
/Deploy/Triton-inference-server/docs/perf_analyzer.md:
--------------------------------------------------------------------------------
# Performance Analyzer Tool
**Performance Analyzer** (```perf_analyzer```) is a tool for analyzing performance from the client side. To use it, **triton-client** has to be built from source as described in:
- [Installing Triton-inference-server](install.md#22-advanced-installation)

A few terms to keep in mind:
- **Throughput**: the rate at which requests are processed (usually requests/s)
- **Latency**: the time it takes for one request to complete

Example: with 1 concurrency the current throughput is 50 requests/s at a latency of 100 ms; raising the number of concurrencies to 2 keeps the throughput the same but latency rises to 200 ms

As an example, let's analyze the ```wav2vec_general_v2``` model I am deploying:
```
perf_analyzer -m wav2vec_general_v2 --percentile=95 --concurrency-range 1:8 --shape input:1,320000
```
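If triton-server is running on another machine, the same measurement can be pointed at it through the protocol and URL flags (a sketch; ```<triton-host>``` is a placeholder):
```
perf_analyzer -m wav2vec_general_v2 -u <triton-host>:8001 -i grpc \
              --percentile=95 --concurrency-range 1:8 --shape input:1,320000
```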


The results give us the **throughput** and **latency** for the **ONNX-runtime** backend

We now change ```config.pbtxt``` to use the optimized **ONNX-TensorRT** path, restart Triton-inference-server and compare the results. Note that reloading the model takes longer, because on every restart triton converts the model from **ONNX** to **TensorRT**
```
optimization { execution_accelerators {
  gpu_execution_accelerator : [ {
    name : "tensorrt"
    parameters { key: "precision_mode" value: "FP32" }
    parameters { key: "max_workspace_size_bytes" value: "1073741824" }
  }]
}}
```

Note that 1073741824 = 1 x 1024 x 1024 x 1024 (bytes) = 1 GB is the default ```workspace``` value; for a **large** model this should be raised, e.g. **4 GB = 4294967296**

Results


So for this model, the TensorRT (FP32) backend gives a significant speed-up (**1.76x**) over the plain ONNX-runtime backend.
*(You can try FP16 as well; it will not stop at that **1.76** figure)*
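For FP16, only the ```precision_mode``` in the accelerator config needs to change; a sketch (not benchmarked here, and it assumes the model is numerically stable in half precision):
```
optimization { execution_accelerators {
  gpu_execution_accelerator : [ {
    name : "tensorrt"
    parameters { key: "precision_mode" value: "FP16" }
    parameters { key: "max_workspace_size_bytes" value: "1073741824" }
  }]
}}
```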
--------------------------------------------------------------------------------
/Deploy/Triton-inference-server/docs/triton_kaldi.md:
--------------------------------------------------------------------------------
# Kaldi ASR with Triton-inference-server
This section covers how to use the Kaldi backend in Triton.
### 1. Build
- Build the docker image
```
git clone https://github.com/NVIDIA/DeepLearningExamples.git
cd DeepLearningExamples/Kaldi/SpeechRecognition
scripts/docker/build.sh
```
- Download the LibriSpeech sample model
```
scripts/docker/launch_download.sh
```
- Start triton-kaldi-server with LibriSpeech
```
scripts/docker/launch_server.sh
```
### 2. Load custom model
This part uses triton to load a customized model.
- Create a new directory inside the current working directory
```
models/infer_asr_kaldi_radio_v1/1
```
where ```infer_asr_kaldi_radio_v1``` is my model's name.
- Run triton from the current directory in ```EXPLICIT``` mode
```
docker run --rm -it \
    --gpus device=0 \
    --shm-size=1g \
    --ulimit memlock=-1 \
    --ulimit stack=67108864 \
    -p8005:8000 \
    -p8006:8001 \
    -p8007:8002 \
    --name trt_server_asr \
    -v $PWD/data:/data \
    -v $PWD/model-repo:/mnt/model-repo \
    -v $PWD/models:/models \
    triton_kaldi_server tritonserver --model-repo=/models --model-control-mode=explicit
```
where ```$PWD/models``` is the directory we just created
- In another screen, copy ```libtriton_kaldi.so```
```
docker ps
docker exec -it <container_id> bash
cp /workspace/model-repo/kaldi_online/1/libtriton_kaldi.so /models/infer_asr_kaldi_radio_v1/
```
- Build the following directory structure (remember to fix the paths inside the ```.conf``` files):
```
├── models
│   ├── infer_asr_kaldi_radio_v1
│   │   ├── 1
│   │   │   ├── conf
│   │   │   │   ├── ivector_extractor.conf
│   │   │   │   ├── mfcc.conf
│   │   │   │   ├── online.conf
│   │   │   │   ├── online_cmvn.conf
│   │   │   │   ├── splice.conf
│   │   │   ├── ivector_extractor
│   │   │   │   ├── final.dubm
│   │   │   │   ├── final.ie
│   │   │   │   ├── final.mat
│   │   │   │   ├── global_cmvn.stats
│   │   │   │   ├── online_cmvn.conf
│   │   │   │   ├── online_cmvn_iextractor
│   │   │   │   ├── splice_opts
│   │   │   ├── final.mdl
│   │   │   ├── global_cmvn.stats
│   │   │   ├── HCLG.fst
│   │   │   ├── words.txt
│   │   ├── config.pbtxt
│   │   ├── libtriton_kaldi.so
```
Note: the file ```/models/infer_asr_kaldi_radio_v1/1/global_cmvn.stats``` is different from ```/models/infer_asr_kaldi_radio_v1/1/ivector_extractor/global_cmvn.stats```
- Load the model into triton via the [gRPC API](../docs/model_management.md)

--------------------------------------------------------------------------------
/Deploy/Triton-inference-server/docs/triton_onnx.md:
--------------------------------------------------------------------------------
# ONNX-runtime with Triton-inference-server

To deploy an ONNX model with ONNX-runtime (besides ONNX-runtime, TensorRT-runtime can be used where supported), set the platform to ```onnxruntime_onnx```; the other basic configuration parameters are similar. I will deploy the ```wav2vec_general_v2``` model as follows:
- In the ```models``` directory, create a ```wav2vec_general_v2``` directory containing the config file and the weights
- Put the weights at ```models/wav2vec_general_v2/1/model.onnx```, where ```1``` is the model version
- Put the config at ```models/wav2vec_general_v2/config.pbtxt```; note that it does not go inside the version directory

```
name: "wav2vec_general_v2"
platform: "onnxruntime_onnx"
max_batch_size : 0
input [
  {
    name: "input"
    data_type: TYPE_FP32
    dims: [1, -1]
  }
]
output [
  {
    name: "output"
    data_type: TYPE_FP32
    dims: [-1, -1, 105]
  }
]
```
- Push the model to triton-server
```
python src/sample_load_unload.py wav2vec_general_v2
```
--------------------------------------------------------------------------------
/Deploy/Triton-inference-server/docs/triton_tensorrt.md:
--------------------------------------------------------------------------------
# TensorRT-runtime with Triton-inference-server

If you want to deploy a model with TensorRT-runtime instead of ONNX-runtime (the model usually has to be converted to ONNX before TensorRT), the weight file **must** be converted with the **exact** TensorRT version used by the triton-inference-server docker image. So we enter the running docker environment as follows:

- Get the ID of the Docker container running triton-inference-server
```
damnguyen@rnd3:~$ docker ps

CONTAINER ID   IMAGE                                   COMMAND                  CREATED        STATUS        PORTS                                                            NAMES
6ef0b4972292   nvcr.io/nvidia/tritonserver:21.12-py3   "/opt/tritonserver/n…"   23 hours ago   Up 23 hours   0.0.0.0:8000-8002->8000-8002/tcp, :::8000-8002->8000-8002/tcp   cranky_hamilton
b09d98350935   quay.io/cloudhut/kowl:master-645e3b4    "./kowl"                 6 days ago     Up 6 days                                                                      gifted_davinci
```
the triton CONTAINER ID is ```6ef0b4972292```
- Run bash inside the triton container
```
damnguyen@rnd3:~$ docker exec -it 6ef0b4972292 bash
root@6ef0b4972292:/opt/tritonserver#
```
- Convert the ONNX model to TensorRT (same syntax as working with a normal TensorRT engine)
```
/usr/src/tensorrt/bin/trtexec --onnx=<model.onnx> --saveEngine=<model.plan>
```
- Deploy the model to triton the same way as for ONNX, using the platform ```tensorrt_plan``` instead of ```onnxruntime_onnx```
  + [Deploying a model with ONNX-runtime and Triton](./triton_onnx.md)
--------------------------------------------------------------------------------
/Deploy/Triton-inference-server/fig/multi_model_exec.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deploy/Triton-inference-server/fig/multi_model_exec.png
--------------------------------------------------------------------------------
/Deploy/Triton-inference-server/fig/multi_model_parallel_exec.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deploy/Triton-inference-server/fig/multi_model_parallel_exec.png
--------------------------------------------------------------------------------
/Deploy/Triton-inference-server/fig/multi_model_serial_exec.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deploy/Triton-inference-server/fig/multi_model_serial_exec.png -------------------------------------------------------------------------------- /Deploy/Triton-inference-server/fig/wav2vec_general_perf_onnx.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deploy/Triton-inference-server/fig/wav2vec_general_perf_onnx.jpg -------------------------------------------------------------------------------- /Deploy/Triton-inference-server/fig/wav2vec_general_perf_tensorrt.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deploy/Triton-inference-server/fig/wav2vec_general_perf_tensorrt.jpg -------------------------------------------------------------------------------- /Deploy/Triton-inference-server/fig/wav2vec_general_start.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Deploy/Triton-inference-server/fig/wav2vec_general_start.jpg -------------------------------------------------------------------------------- /Deploy/Triton-inference-server/src/sample_grpc.py: -------------------------------------------------------------------------------- 1 | import tritonclient.grpc as grpcclient 2 | 3 | class TritonModelGRPC: 4 | ''' 5 | Sample model-request triton-inference-server with gRPC 6 | ''' 7 | def __init__(self, 8 | triton_host = 'localhost:8001', # default gRPC port 9 | triton_model_name = 'wav2vec_general_v2', 10 | verbose = False): 11 | print('Init connection from Triton-inference-server') 12 | print('- Host: {}'.format(triton_host)) 13 | print('- Model: {}'.format(triton_model_name)) 14 | self.triton_host = triton_host 15 | self.triton_model_name = triton_model_name 16 | self.model = grpcclient.InferenceServerClient(url=self.triton_host, 17 | verbose=verbose, 18 | ssl=False, 19 | root_certificates=None, 20 | private_key=None, 21 | certificate_chain=None) 22 | if not self.model.is_server_live(): 23 | print("FAILED : is_server_live") 24 | sys.exit(1) 25 | 26 | if not self.model.is_server_ready(): 27 | print("FAILED : is_server_ready") 28 | sys.exit(1) 29 | 30 | if not self.model.is_model_ready("wav2vec_general_v2"): 31 | print("FAILED : is_model_ready") 32 | sys.exit(1) 33 | self.verbose = verbose 34 | 35 | def run(self, feats): 36 | # Input shape must be [-1] 37 | assert len(feats.shape) == 2, "Shape not support: {}".format(feats.shape) 38 | assert feats.shape[0] == 1, "Shape not support: {}".format(feats.shape) 39 | feats_length = feats.shape[-1] 40 | if self.verbose: 41 | print('='*50) 42 | print('- Input shape: [1, {}]'.format(feats_length)) 43 | inputs = [] 44 | outputs = [] 45 | inputs.append(grpcclient.InferInput('input', [1, feats_length], "FP32")) 46 | inputs[0].set_data_from_numpy(feats) 47 | outputs.append(grpcclient.InferRequestedOutput('output')) 48 | if self.verbose: 49 | tik = time.time() 50 | results = self.model.infer( 51 | model_name="wav2vec_general_v2", 52 | inputs=inputs, 53 | outputs=outputs, 54 | client_timeout=None) 55 | if self.verbose: 56 | tok = time.time() 57 | print('- Time cost:', tok - tik) 58 | output = results.as_numpy('output') 59 | return output 60 | 61 | 
-------------------------------------------------------------------------------- /Deploy/Triton-inference-server/src/sample_load_unload.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Example 3 | - Load model 4 | python3 sample_load_unload.py --models emotion_recognition_v1.1 5 | - Unload model 6 | python3 sample_load_unload.py --unload --models emotion_recognition_v1.1 7 | - Load model from file 8 | python3 sample_load_unload.py --path --models model_list.txt 9 | ''' 10 | import argparse 11 | import tritonclient.grpc as grpcclient 12 | 13 | parser = argparse.ArgumentParser(description='Load/Unload model') 14 | parser.add_argument('--models', default="", help='list of model names to load/unload') 15 | parser.add_argument('--unload', action = "store_true", help='load or unload model') 16 | parser.add_argument('--reload', action = "store_true", help='reload model') 17 | parser.add_argument('--path', action = "store_true", help='get list of models from filepath') 18 | parser.add_argument('--url', default="localhost:8001", help='default triton-server URL') 19 | args = parser.parse_args() 20 | 21 | if not args.path: 22 | MODEL_NAMES = args.models.strip().split(',') 23 | else: 24 | MODEL_NAMES = open(args.models).read().strip('\n').split('\n') 25 | URL = args.url 26 | triton_client = grpcclient.InferenceServerClient(url=URL, verbose=True) 27 | triton_client.is_server_live() 28 | triton_client.get_model_repository_index().models 29 | if args.unload: 30 | for MODEL_NAME in MODEL_NAMES: 31 | if triton_client.is_model_ready(MODEL_NAME): 32 | print('UNLOAD: {}'.format(MODEL_NAME)) 33 | triton_client.unload_model(MODEL_NAME) 34 | else: 35 | print('Skip: {}'.format(MODEL_NAME)) 36 | else: 37 | for MODEL_NAME in MODEL_NAMES: 38 | if triton_client.is_model_ready(MODEL_NAME): 39 | if args.reload: 40 | print('RELOAD: {}'.format(MODEL_NAME)) 41 | triton_client.unload_model(MODEL_NAME) 42 | triton_client.load_model(MODEL_NAME) 43 | else: 44 | print('Skip: {}'.format(MODEL_NAME)) 45 | else: 46 | print('LOAD: {}'.format(MODEL_NAME)) 47 | triton_client.load_model(MODEL_NAME) 48 | 49 | 50 | print('='*70) 51 | triton_client.get_model_repository_index().models -------------------------------------------------------------------------------- /Framework/ONNX/README.md: -------------------------------------------------------------------------------- 1 | # AI-Engineer-Howto 2 | 3 | Tất cả những thứ liên quan đến ONNX và ONNX-runtime -------------------------------------------------------------------------------- /Framework/Pytorch/README.md: -------------------------------------------------------------------------------- 1 | # AI-Engineer-Howto 2 | 3 | Tất cả những thứ liên quan đến Pytorch & Pytorch-serving 4 | - [Build Pytorch from source](docs/build_from_source.md) -------------------------------------------------------------------------------- /Framework/Pytorch/docs/build_from_source.md: -------------------------------------------------------------------------------- 1 | # Pytorch 2 | 3 | ## Build pytorch from source (best config for AMD CPU & NVIDIA-GPU) 4 | We will use OpenBLAS instead of MKL & MKLDNN 5 | ``` 6 | # Install anaconda (if not) 7 | curl -O https://repo.anaconda.com/archive/Anaconda3-2020.07-Linux-x86_64.sh 8 | bash Anaconda3-2020.07-Linux-x86_64.sh 9 | source ~/anaconda3/bin/activate 10 | 11 | # Install dependencies 12 | conda create -n myenv_pytorch_1.9 python=3.8 13 | conda activate myenv_pytorch_1.9 14 | conda install astunparse numpy ninja pyyaml 
setuptools cmake cffi typing_extensions future six requests dataclasses
pip install ninja

# Build
git clone --recursive --branch v1.9.1 https://github.com/pytorch/pytorch.git
cd pytorch
export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
USE_NCCL=ON USE_CUDNN=OFF USE_CUDA=ON USE_MKL=OFF USE_MKLDNN=OFF python setup.py install
```
## Compatible with
- TorchVision: 0.10.1
- OpenCV: 4.6.0
- MMCV: 1.3.3
- MMCV Compiler: GCC 9.4
- MMCV CUDA Compiler: 11.3
- MMDetection: 2.7.0+e78eee5

--------------------------------------------------------------------------------
/Framework/TensorRT/README.md:
--------------------------------------------------------------------------------
# TensorRT
- [Convert ONNX model to TensorRT](docs/tutorial.md)
--------------------------------------------------------------------------------
/Framework/TensorRT/docs/tutorial.md:
--------------------------------------------------------------------------------
# AI-Engineer-Howto
## Convert model to TensorRT
### 1. Convert model to ONNX
For convenient deployment, models built with different frameworks should be converted to ONNX first; converting from ONNX to other runtimes, especially TensorRT, is then much easier.
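For PyTorch models the usual route is ```torch.onnx.export```; a minimal sketch with a torchvision ResNet-18 standing in for your own model (the file name, tensor names and the dynamic batch axis are illustrative):
```python
import torch
import torchvision

# Any torch.nn.Module works here; ResNet-18 is just a convenient stand-in
model = torchvision.models.resnet18().eval()
dummy = torch.randn(1, 3, 224, 224)

torch.onnx.export(
    model, dummy, "resnet18.onnx",
    input_names=["input"],
    output_names=["output"],
    opset_version=13,
    dynamic_axes={"input": {0: "batch_size"}, "output": {0: "batch_size"}},  # dynamic batch
)
```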


### 2. Get input/output shape
After converting the model to ONNX, we need to determine the input/output shapes (basically only the input matters) and their parameters. The easiest way is to use [netron](https://netron.app/) to inspect the architecture. For example, the figure below shows an SCRFD face detection model visualized with [netron](https://netron.app/):

- Input:
  - **input.1** (float32): [batch_size, 3, 640, 640], i.e. [-1, 3, 640, 640] (dimensions whose value is **not a concrete number** are dynamic)
- Output:
  - **num_detections** (int32): [-1, 1]
  - **nmsed_boxes** (float32): [-1, 200, 4]
  - **nmsed_scores** (float32): [-1, 200]
  - **nmsed_classes** (float32): [-1, 200]
  - **nmsed_landmarks** (float32): [-1, 200, 10]
### 3. Serialize Engine
First, we need to understand **dynamic** vs **static** with respect to **shape** and **batch**:
- **batch** (batch size): the number of input samples, usually the first dimension of the tensor
- **shape**: the sizes of the tensor's dimensions, including the **batch** dimension
- **dynamic**: can vary
- **static**: fixed

So we have:
- **dynamic batch**: only the **batch** dimension is dynamic and the other dimensions stay fixed; for example [-1, 3, 640, 640] accepts inputs such as [1, 3, 640, 640], [7, 3, 640, 640], ... but not [1, 3, 640, 512], [1, 4, 640, 640], ...
- **static shapes**: only one fixed size is accepted; e.g. [4, 3, 640, 640] accepts only [4, 3, 640, 640], while [7, 4, 640, 512] accepts only [7, 4, 640, 512], ...
- **dynamic shapes**: some of the dimensions are dynamic; e.g. [-1, 3, -1, 32] can accept inputs [4, 3, 214, 32], [12, 3, 320, 32], ...
Usually we only care about **dynamic shapes** and **static shapes**.

To convert (serialize) the model, there are two main modes: **implicitBatch** (default) and **explicitBatch**. The batch_size is by default taken to be the first dimension; in my model above that value is **-1**. If instead my model's input shape were **[1, 3, 640, 640]**, the model would not support **dynamic shapes**, only **static shapes**, i.e. it would accept exactly one input size. There are, however, several ways to convert an ONNX model from **dynamic** to **static** and vice versa.
- **implicitBatch** (default): works with models whose inputs have **static shapes**
- **explicitBatch**: works with models whose inputs have **dynamic shapes**

Example of converting a model with **implicitBatch** (the input shape is already **static** in the ONNX model's metadata):

```
/usr/src/tensorrt/bin/trtexec \
        --implicitBatch \
        --onnx=<model.onnx> \
        --saveEngine=output.plan \
        --device=0 \
        --verbose
```

Example of converting a model with **explicitBatch** (here we must also specify **minShapes**, **optShapes** and **maxShapes** for each input):

```
/usr/src/tensorrt/bin/trtexec \
        --explicitBatch \
        --onnx=<model.onnx> \
        --minShapes=input.1:1x3x640x640 \
        --optShapes=input.1:1x3x640x640 \
        --maxShapes=input.1:4x3x640x640 \
        --saveEngine=output.plan \
        --device=0 \
        --verbose
```
where:
- **saveEngine**: path to the output TensorRT model, usually with a **.plan** or **.trt** extension
- **device**: GPU ID
- **verbose**: print the conversion log
- Syntax for specifying shapes when there are multiple inputs: ```<input_name>:<shape>,<input_name>:<shape>,...```

### 4. Deserialize Engine & Inference
Once we have the TensorRT model (also called the engine file), we need to load it and run inference:
- Install pycuda & the tensorrt python bindings as described [here](https://github.com/NNDam/Retinaface-TensorRT)
- Wrap the model with 3 main functions: allocate_buffers, do_inference, post_process
- See the example repositories for how the model is wrapped

#### 4.1. Allocate buffers
Allocate memory for the inputs & outputs. Note that for a **dynamic shapes** model we must allocate the input and output buffers according to **maxShapes**.

```
# Allocates all buffers required for an engine, i.e. host/device inputs/outputs.
def allocate_buffers(engine):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    out_shapes = []
    input_shapes = []
    out_names = []
    max_batch_size = engine.get_profile_shape(0, 0)[2][0]
    for binding in engine:
        binding_shape = engine.get_binding_shape(binding)
        # Fix -1 dimension for proper memory allocation for batch_size > 1
        if binding_shape[0] == -1: # Dynamic batch size
            binding_shape = (max_batch_size,) + binding_shape[1:]
        size = trt.volume(binding_shape)
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
            input_shapes.append(engine.get_binding_shape(binding))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
            # Collect original output shapes and names from engine
            out_shapes.append(engine.get_binding_shape(binding))
            out_names.append(binding)
    return inputs, outputs, bindings, stream, input_shapes, out_shapes, out_names, max_batch_size
```

#### 4.2. Inference
Run inference: copy the input data from host to the GPU device, execute on the GPU to produce the outputs, then copy the outputs from the GPU device back to the host.

```
def do_inference(context, bindings, inputs, outputs, stream):
    # Transfer input data to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference.
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]
```
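A minimal sketch of how the two helpers above fit together with engine deserialization (it assumes the ```HostDeviceMem``` wrapper from the example repositories, the ```output.plan``` engine built earlier, and a dynamic-batch input binding at index 0):
```python
import numpy as np
import pycuda.autoinit  # noqa: F401 -- creates the CUDA context
import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

# Deserialize the engine file produced by trtexec
with open("output.plan", "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())
context = engine.create_execution_context()

buffers = allocate_buffers(engine)
inputs, outputs, bindings, stream = buffers[0], buffers[1], buffers[2], buffers[3]

# For a dynamic-shape engine the concrete input shape must be set per request
batch = np.random.rand(1, 3, 640, 640).astype(np.float32)   # stand-in for preprocessed data
context.set_binding_shape(0, batch.shape)
np.copyto(inputs[0].host[:batch.size], batch.ravel())

trt_outputs = do_inference(context, bindings, inputs, outputs, stream)
```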
#### 4.3. Post-processing
Reshape the output data as required by the task and apply any other post-processing.

## Example repositories
- [Retinaface](https://github.com/NNDam/Retinaface-TensorRT)
- [vietocr](https://github.com/NNDam/vietocr-tensorrt)
- [yolor](https://github.com/NNDam/yolor)
--------------------------------------------------------------------------------
/Framework/TensorRT/fig/sample_netron_scrfd.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NNDam/AI-Engineer-Note/0b9388ecb43a1d596111b38c66865b42e83b9852/Framework/TensorRT/fig/sample_netron_scrfd.png
--------------------------------------------------------------------------------
/Framework/Tensorflow/README.md:
--------------------------------------------------------------------------------
# AI-Engineer-Howto

Everything related to Tensorflow and Tensorflow-serving
--------------------------------------------------------------------------------
/Linux/README.md:
--------------------------------------------------------------------------------
# Collection of FAQ about CUDA & Linux & apt-packages

Build OpenCV from source 4 | 5 | - [Build OpenCV from source](docs/build_opencv.md) 6 | 7 |
8 | 9 |
Install Math Kernel Library (MKL/BLAS/LAPACK/OPENBLAS)

It is recommended to install all of the math kernel libraries and then compile the framework (e.g. pytorch, mxnet) from source with a custom config for best performance.
11 | Install all LAPACK+BLAS: 12 | 13 | ``` 14 | sudo apt install libjpeg-dev libpng-dev libblas-dev libopenblas-dev libatlas-base-dev liblapack-dev liblapacke-dev gfortran 15 | ``` 16 | 17 | Install MKL: 18 | 19 | ``` 20 | # Get the key 21 | wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB 22 | # now install that key 23 | apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB 24 | # now remove the public key file exit the root shell 25 | rm GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB 26 | # Add to apt 27 | sudo wget https://apt.repos.intel.com/setup/intelproducts.list -O /etc/apt/sources.list.d/intelproducts.list 28 | sudo sh -c 'echo deb https://apt.repos.intel.com/mkl all main > /etc/apt/sources.list.d/intel-mkl.list' 29 | # Install 30 | sudo apt-get update 31 | sudo apt-get install intel-mkl-2020.4-912 32 | ``` 33 | 34 |
35 | 36 |
Fresh install NVIDIA driver (PC/Laptop/Workstation) 37 | 38 | ``` 39 | # Remove old packages 40 | sudo apt-get remove --purge '^nvidia-.*' 41 | sudo apt-get install ubuntu-desktop 42 | sudo apt-get --purge remove "*cublas*" "cuda*" 43 | sudo apt-get --purge remove "*nvidia*" 44 | sudo add-apt-repository --remove ppa:graphics-drivers/ppa 45 | sudo rm /etc/X11/xorg.conf 46 | sudo apt autoremove 47 | sudo reboot 48 | 49 | # After restart 50 | sudo ubuntu-drivers devices 51 | sudo ubuntu-drivers autoinstall 52 | sudo reboot 53 | ``` 54 | 55 |
56 | 57 |
NVIDIA-SMI has failed because it couldn’t communicate with the NVIDIA driver 58 | 59 | First, make sure that you have "Fresh install NVIDIA driver". If not work, try this bellow 60 | 61 | - Make sure the package nvidia-prime is installed: 62 | 63 | ``` 64 | sudo apt install nvidia-prime 65 | ``` 66 | 67 | Afterwards, run 68 | ``` 69 | sudo prime-select nvidia 70 | ``` 71 | 72 | - Make sure that NVIDIA is not in blacklist 73 | 74 | ``` 75 | grep nvidia /etc/modprobe.d/* /lib/modprobe.d/* 76 | ``` 77 | 78 | to find a file containing ```blacklist nvidia``` and remove it, then run 79 | 80 | ``` 81 | sudo update-initramfs -u 82 | ``` 83 | 84 | - If get error ```This PCI I/O region assigned to your NVIDIA device is invalid```: 85 | 86 | ``` 87 | sudo nano /etc/default/grub 88 | ``` 89 | 90 | edit ```GRUB_CMDLINE_LINUX_DEFAULT="quiet splash pci=realloc=off"``` 91 | 92 | ``` 93 | sudo update-grub 94 | sudo reboot 95 | ``` 96 | 97 |
98 | 99 |
Check current CUDA version 100 | 101 | ``` 102 | nvcc --version 103 | ``` 104 | 105 |
106 | 107 |
Check current supported CUDA versions 108 | 109 | ``` 110 | ls /usr/local/ 111 | ``` 112 | 113 |
114 | 115 |
Select GPU devices 116 | 117 | ``` 118 | CUDA_VISIBLE_DEVICES= 119 | CUDA_VISIBLE_DEVICES=0 python abc.py 120 | CUDA_VISIBLE_DEVICES=0 ./sample.sh 121 | CUDA_VISIBLE_DEVICES=0,1,2,3 python abc.py 122 | CUDA_VISIBLE_DEVICES=0,1,2,3 ./sample.sh 123 | ``` 124 | 125 |
126 | 127 |
Switch CUDA version 128 | 129 | ``` 130 | CUDA_VER=11.3 131 | export PATH="/usr/local/cuda-$CUDA_VER/bin:$PATH" 132 | export LD_LIBRARY_PATH=/usr/local/cuda-$CUDA_VER/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}} 133 | ``` 134 | 135 |
136 | 137 |
Check NVENC/NVDEC status

```
nvidia-smi dmon
```
see the **%enc** and **%dec** columns
144 | 145 |
Error with distributed training NCCL (got freezed) 146 | 147 | ``` 148 | export NCCL_P2P_DISABLE="1" 149 | ``` 150 | 151 |
152 | 153 |
Install CMake from source 154 | 155 | ``` 156 | version=3.23 157 | build=2 ## don't modify from here 158 | mkdir ~/temp 159 | cd ~/temp 160 | wget https://cmake.org/files/v$version/cmake-$version.$build.tar.gz 161 | tar -xzvf cmake-$version.$build.tar.gz 162 | cd cmake-$version.$build/ 163 | ./bootstrap 164 | make -j8 165 | sudo make install 166 | ``` 167 | 168 |
169 | 170 |
Install MXNet from source (for AMD CPU & NVIDIA GPU) 171 | 172 | ``` 173 | git clone --recursive --branch 1.9.1 https://github.com/apache/incubator-mxnet.git mxnet 174 | cd mxnet 175 | cp config/linux_gpu.cmake config.cmake 176 | rm -rf build 177 | mkdir -p build && cd build 178 | cmake -DUSE_CUDA=ON -DUSE_CUDNN=OFF -DUSE_MKL_IF_AVAILABLE=OFF -DUSE_MKLDNN=OFF -DUSE_OPENMP=OFF -DUSE_OPENCV=ON -DUSE_BLAS=open .. 179 | make -j32 180 | cd ../python 181 | pip install --user -e . 182 | ``` 183 | 184 |
185 | 186 | 187 |
Tensorflow could not load dynamic library 'cudart64_101.dll'

In this example Tensorflow requires CUDA 10.1: either switch to CUDA 10.1 or pick a Tensorflow version compatible with your CUDA version, see here: https://www.tensorflow.org/install/source#gpu
190 | 191 | ### Computer Vision 192 |
Fix Deepstream (6.2+) FFMPEG OpenCV installation
Fixes errors about undefined references to, or missing, libavcodec, libavutil, libvpx, ...

```
apt-get install --reinstall --no-install-recommends -y libavcodec58 libavcodec-dev libavformat58 libavformat-dev libavutil56 libavutil-dev gstreamer1.0-libav
apt install --reinstall gstreamer1.0-plugins-good
apt install --reinstall libvpx6 libx264-155 libx265-179 libmpg123-0 libmpeg2-4 libmpeg2encpp-2.1-0
gst-inspect-1.0 | grep 264
rm ~/.cache/gstreamer-1.0/registry.x86_64.bin
apt install --reinstall libx264-155
apt-get install gstreamer1.0-libav
apt-get install --reinstall gstreamer1.0-plugins-ugly
```
207 | 208 |
Gstreamer pipeline to convert MP4-MP4 with re-encoding

```
gst-launch-1.0 filesrc location="<input.mp4>" ! qtdemux ! video/x-h264 ! h264parse ! avdec_h264 ! videoconvert ! x264enc ! h264parse ! qtmux ! filesink location=<output.mp4>
```
215 | 216 |
Gstreamer pipeline to convert RTSP-RTMP

```
gst-launch-1.0 rtspsrc location='rtsp://<rtsp-source>' ! rtph264depay ! h264parse ! flvmux ! rtmpsink location='rtmp://<rtmp-destination>'
```
223 | 224 |
Gstreamer pipeline to convert RTSP-RTMP with reducing resolution

```
gst-launch-1.0 rtspsrc location='rtsp://<rtsp-source>' ! rtpbin ! rtph264depay ! h264parse ! avdec_h264 ! videoconvert ! videoscale ! video/x-raw,width=640,height=640 ! x264enc ! h264parse ! flvmux streamable=true ! rtmpsink location='rtmp://<rtmp-destination>'
```
231 | -------------------------------------------------------------------------------- /Linux/docs/build_opencv.md: -------------------------------------------------------------------------------- 1 | # Build OpenCV from source 2 | 3 | ### 1. Install the required dependencies 4 | ``` 5 | sudo apt install build-essential cmake git pkg-config libgtk-3-dev \ 6 | libavcodec-dev libavformat-dev libswscale-dev libv4l-dev \ 7 | libxvidcore-dev libx264-dev libjpeg-dev libpng-dev libtiff-dev \ 8 | gfortran openexr libatlas-base-dev python3-dev python3-numpy \ 9 | libtbb2 libtbb-dev libdc1394-22-dev 10 | sudo apt install libopenblas-dev libopenblas-base 11 | ``` 12 | ### 2. Clone the OpenCV’s and OpenCV contrib repositories 13 | ``` 14 | mkdir ~/opencv_build && cd ~/opencv_build 15 | git clone https://github.com/opencv/opencv.git 16 | git clone https://github.com/opencv/opencv_contrib.git 17 | cd ~/opencv_build/opencv 18 | mkdir build && cd build 19 | ``` 20 | ### 3. Fix OpenBlas search Path: 21 | ``` 22 | https://github.com/opencv/opencv/issues/12957 23 | ``` 24 | and header 25 | ``` 26 | sudo cp /usr/include/lapacke*.h /usr/include/x86_64-linux-gnu/ 27 | ``` 28 | ### 4. Check CPU tags for optimization 29 | ``` 30 | damnguyen@rnd3:~/opencv_build/opencv/build$ lscpu 31 | Architecture: x86_64 32 | CPU op-mode(s): 32-bit, 64-bit 33 | Byte Order: Little Endian 34 | CPU(s): 96 35 | On-line CPU(s) list: 0-95 36 | Thread(s) per core: 2 37 | Core(s) per socket: 24 38 | Socket(s): 2 39 | NUMA node(s): 2 40 | Vendor ID: AuthenticAMD 41 | CPU family: 23 42 | Model: 49 43 | Model name: AMD EPYC 7352 24-Core Processor 44 | Stepping: 0 45 | CPU MHz: 1495.927 46 | CPU max MHz: 2300.0000 47 | CPU min MHz: 1500.0000 48 | BogoMIPS: 4600.06 49 | Virtualization: AMD-V 50 | L1d cache: 32K 51 | L1i cache: 32K 52 | L2 cache: 512K 53 | L3 cache: 16384K 54 | NUMA node0 CPU(s): 0-23,48-71 55 | NUMA node1 CPU(s): 24-47,72-95 56 | Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate sme ssbd ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif umip rdpid overflow_recov succor smca 57 | ``` 58 | This **AMD EPYC 7352 24-Core Processor** support **avx**, **avx2**, **sse4_1**, **sse4_2** 59 | ### 5. 
Config 60 | ``` 61 | cmake -D CMAKE_BUILD_TYPE=RELEASE \ 62 | -D CMAKE_INSTALL_PREFIX=$(python3 -c "import sys; print(sys.prefix)") \ 63 | -D INSTALL_C_EXAMPLES=ON \ 64 | -D INSTALL_PYTHON_EXAMPLES=ON \ 65 | -D OPENCV_GENERATE_PKGCONFIG=ON \ 66 | -D OPENCV_EXTRA_MODULES_PATH=~/opencv_build/opencv_contrib/modules \ 67 | -D WITH_CUDA=OFF \ 68 | -D BUILD_NEW_PYTHON_SUPPORT=ON \ 69 | -D BUILD_opencv_python3=ON \ 70 | -D HAVE_opencv_python3=ON \ 71 | -D OPENCV_PYTHON3_INSTALL_PATH=$(python3 -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())") \ 72 | -D PYTHON_EXECUTABLE=$(which python3) \ 73 | -D BUILD_EXAMPLES=ON -D WITH_FFMPEG=OFF .. 74 | ``` 75 | Remember to check any error with OpenBLAS 76 | ### 6. Build 77 | ``` 78 | make -j8 79 | make install 80 | ``` 81 | ### 7. Verify 82 | ``` 83 | pkg-config --modversion opencv4 84 | python3 -c "import cv2; print(cv2.__version__)" 85 | ``` 86 | --------------------------------------------------------------------------------