├── COCO_val2014_000000000564_deepstream_infer.jpg
├── COCO_val2014_000000000564_infer.jpg
├── README.md
├── libs
│   └── nvdsinfer
│       ├── Makefile
│       ├── README
│       ├── nms_cpu.cpp
│       ├── nms_cpu.h
│       ├── nvdsinfer_context_impl.cpp
│       ├── nvdsinfer_context_impl.h
│       ├── nvdsinfer_context_impl_capi.cpp
│       ├── nvdsinfer_context_impl_output_parsing.cpp
│       ├── nvdsinfer_conversion.cu
│       ├── nvdsinfer_conversion.h
│       ├── resize_merge_cpu.cpp
│       └── resize_merge_cpu.h
├── openpose_app
│   ├── COCO_val2014_000000000564.jpg
│   ├── COCO_val2014_000000000569.jpg
│   ├── Makefile
│   ├── README
│   ├── nvinfer_config.txt
│   └── openpose_app.c
└── todo.jpg
/COCO_val2014_000000000564_deepstream_infer.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cding-nv/deepstream-openpose/6db238cd4490655e2768bafed3ca4b8f5f778d25/COCO_val2014_000000000564_deepstream_infer.jpg
--------------------------------------------------------------------------------
/COCO_val2014_000000000564_infer.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cding-nv/deepstream-openpose/6db238cd4490655e2768bafed3ca4b8f5f778d25/COCO_val2014_000000000564_infer.jpg
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # deepstream-openpose
2 |
3 |
4 | ## 1. Run [CMU](https://github.com/CMU-Perceptual-Computing-Lab/openpose) pose demo
5 | Platform: Jetson Xavier, JetPack 4.3 with DeepStream 4.0.2
6 |
7 | Notes:
8 | 1. Porting to Tesla/x86 platform should be easy.
9 | 2. CUDA_cublas_device_LIBRARY NOTFOUND issue ->
10 | [Solution](https://forums.developer.nvidia.com/t/cuda-blas-libraries-not-installed/107908/18?u=chrisding)
11 | 3. Refer to `openpose/scripts/ubuntu/install_deps.sh` to install the dependency libraries.
12 | 4. Refer to `openpose/models/getModels.sh` to fetch the models.
13 | 5. Build.
14 | ```
15 | $ mkdir build && cd build && cmake -D CMAKE_BUILD_TYPE=Debug ..
16 | $ make -j4
17 | ```
18 | 6. These demos should now work:
19 | ```
20 | $ ./build/examples/openpose/openpose.bin
21 | $ ./build/examples/tutorial_api_cpp/01_body_from_image_default.bin
22 | $ ...
23 | ```
24 | The result looks like this:
25 |
26 |
27 |
28 |
29 |
30 | ## 2. Deploy the pose COCO model with DeepStream 4.0.2
31 | Model: `pose/coco/pose_iter_440000.caffemodel`, `pose/coco/pose_deploy_linevec.prototxt`
32 |
33 | Pipeline:
34 | > filesrc -> jpegparse -> nvv4l2decoder -> nvstreammux -> nvinfer (openpose and 18-part parsing) ->
35 | > nvsegvisual -> nvmultistreamtiler -> (nvegltransform) -> nveglglessink
36 |
37 | ### Build libnvds_infer.so
38 | ```
39 | $ cd libs/nvdsinfer
40 | $ make
41 | # Back up the original /opt/nvidia/deepstream/deepstream-4.0/lib/libnvds_infer.so before replacing it
42 | $ sudo ln -sf $(pwd)/libnvds_infer.so /opt/nvidia/deepstream/deepstream-4.0/lib/libnvds_infer.so
43 | ```
44 |
45 | ### Build openpose-app
46 | ```
47 | $ cd openpose_app
48 | $ make
49 | # Set "model-file" and "proto-file" in nvinfer_config.txt to your own model paths:
50 | #   model-file=
51 | #   proto-file=
52 | ```
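
For example, if the CMU models were fetched with `getModels.sh` into `/home/nvidia/openpose/models` (an example path only; use your own location):
```
model-file=/home/nvidia/openpose/models/pose/coco/pose_iter_440000.caffemodel
proto-file=/home/nvidia/openpose/models/pose/coco/pose_deploy_linevec.prototxt
```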
53 |
54 | ### Run
55 | ```
56 | $ ./openpose-app ./nvinfer_config.txt COCO_val2014_000000000564.jpg
57 | ```
58 |
59 |
60 |
61 |
62 | ## 3. TODO
63 | Add a dsexample plugin after nvinfer that runs [resize_merge](./libs/nvdsinfer/resize_merge_cpu.cpp), [nms](./libs/nvdsinfer/nms_cpu.cpp) and the BodyPartConnector, then show the result with nvosd like the image below; a rough sketch of the CPU peak-finding step follows.
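
A minimal, untested sketch of how the planned dsexample stage could call the CPU helpers already in `libs/nvdsinfer`. The tensor sizes follow the comments in `nms_cpu.cpp` (57 heatmap channels, 368x496 resized maps, up to 96 peaks per channel); the heatmap pointer, the 0.05 threshold and the function name are placeholders, and the BodyPartConnector is not part of this repo yet.
```
#include <array>
#include <vector>
#include "nms_cpu.h"

// Hypothetical glue code for the planned dsexample stage (not in this repo yet).
// 'heatmaps' is assumed to be the already resized/merged 57 x 368 x 496 output.
void findPeaks(const float* heatmaps, std::vector<float>& peaks)
{
    const std::array<int, 4> sourceSize{1, 57, 368, 496}; // N x C x H x W of the heatmaps
    const std::array<int, 4> targetSize{1, 57, 97, 3};    // per channel: peak count + up to 96 (x, y, score) entries

    peaks.assign(57 * 97 * 3, 0.f);
    std::vector<int> kernel(57 * 368 * 496, 0);           // scratch buffer used by nmsCpu

    nmsCpu(peaks.data(), kernel.data(), heatmaps, 0.05f, targetSize, sourceSize);

    // peaks[c * 97 * 3] now holds the number of peaks found for channel c,
    // followed by (x, y, score) triplets that would feed the BodyPartConnector.
}
```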
64 |
65 |
66 |
67 |
--------------------------------------------------------------------------------
/libs/nvdsinfer/Makefile:
--------------------------------------------------------------------------------
1 | ################################################################################
2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # NVIDIA Corporation and its licensors retain all intellectual property
5 | # and proprietary rights in and to this software, related documentation
6 | # and any modifications thereto. Any use, reproduction, disclosure or
7 | # distribution of this software and related documentation without an express
8 | # license agreement from NVIDIA Corporation is strictly prohibited.
9 | #################################################################################
10 |
11 | CUDA_VER?=
12 | ifeq ($(CUDA_VER),)
13 | $(error "CUDA_VER is not set")
14 | endif
15 |
16 | NVCC:=/usr/local/cuda-$(CUDA_VER)/bin/nvcc
17 | CXX:= g++
18 | SRCS:= nvdsinfer_context_impl.cpp nvdsinfer_context_impl_capi.cpp \
19 | nvdsinfer_context_impl_output_parsing.cpp nvdsinfer_conversion.cu \
20 | nms_cpu.cpp resize_merge_cpu.cpp
21 | INCS:= $(wildcard *.h)
22 | LIB:=libnvds_infer.so
23 |
24 | NVDS_VERSION:=4.0
25 |
26 | LIB_INSTALL_DIR?=/opt/nvidia/deepstream/deepstream-$(NVDS_VERSION)/lib/
27 |
28 | CFLAGS+= -fPIC -g -std=c++11 \
29 | -I /usr/local/cuda-$(CUDA_VER)/include \
30 | -I /opt/nvidia/deepstream/deepstream-4.0/sources/includes/
31 |
32 | CFLAGS+= `pkg-config --cflags gstreamer-1.0`
33 |
34 | LIBS := -shared -g -Wl,-no-undefined \
35 | -lnvinfer -lnvinfer_plugin -lnvonnxparser -lnvcaffe_parser \
36 | -L/usr/local/cuda-$(CUDA_VER)/lib64/ -lcudart \
37 | -lopencv_objdetect -lopencv_imgproc -lopencv_core
38 |
39 | LIBS+= `pkg-config --libs gstreamer-1.0`
40 |
41 | LIBS+= -L$(LIB_INSTALL_DIR) -lnvdsgst_helper -lnvdsgst_meta -lnvds_meta \
42 | -lnvds_inferutils -ldl \
43 | -Wl,-rpath,$(LIB_INSTALL_DIR)
44 |
45 |
46 | OBJS:= $(SRCS:.cpp=.o)
47 | OBJS:= $(OBJS:.cu=.o)
48 |
49 | all: $(LIB)
50 |
51 | %.o: %.cpp $(INCS) Makefile
52 | @echo $(CFLAGS)
53 | $(CXX) -c -o $@ $(CFLAGS) $<
54 |
55 | %.o: %.cu $(INCS) Makefile
56 | @echo $(CFLAGS)
57 | $(NVCC) -c -o $@ --compiler-options '-fPIC' $<
58 |
59 | $(LIB): $(OBJS) $(DEP) Makefile
60 | @echo $(CFLAGS)
61 | $(CXX) -o $@ $(OBJS) $(LIBS)
62 |
63 | install: $(LIB)
64 | cp -rv $(LIB) $(LIB_INSTALL_DIR)
65 |
66 | clean:
67 | rm -rf $(OBJS) $(LIB)
68 |
--------------------------------------------------------------------------------
/libs/nvdsinfer/README:
--------------------------------------------------------------------------------
1 | ################################################################################
2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # NVIDIA Corporation and its licensors retain all intellectual property
5 | # and proprietary rights in and to this software, related documentation
6 | # and any modifications thereto. Any use, reproduction, disclosure or
7 | # distribution of this software and related documentation without an express
8 | # license agreement from NVIDIA Corporation is strictly prohibited.
9 | #
10 | ################################################################################
11 |
12 | Refer to the DeepStream SDK documentation for a description of the "nvinfer"
13 | plugin and "NvDsInfer" API.
14 |
15 | --------------------------------------------------------------------------------
16 | Pre-requisites:
17 | - TensorRT 5.1+ development package
18 | - OpenCV 3.4.0+ development package
19 |
20 | Please refer to the TensorRT documentation for installing the TensorRT development
21 | package.
22 |
23 | To install the OpenCV development package:
24 | sudo apt-get install libopencv-dev
25 |
26 | --------------------------------------------------------------------------------
27 | Compiling and installing the plugin:
28 | Export CUDA_VER or set it in the Makefile to the appropriate CUDA version.
29 | Run "make" and then "sudo make install".
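
For example, on Jetson with JetPack 4.3 (CUDA 10.0; adjust CUDA_VER to match your CUDA installation):
    export CUDA_VER=10.0
    make
    sudo make install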
30 |
--------------------------------------------------------------------------------
/libs/nvdsinfer/nms_cpu.cpp:
--------------------------------------------------------------------------------
1 | #include "nms_cpu.h"
2 | //#include
3 |
4 | #define error printf
5 |
6 | template <typename T>
7 | void nmsRegisterKernelCPU(int* kernelPtr, const T* const sourcePtr, const int w, const int h,
8 | const T& threshold, const int x, const int y)
9 | {
10 | // We have three scenarios for NMS, one for the border, 1 for the 1st inner border, and
11 | // 1 for the rest. cv::resize adds artifacts around the 1st inner border, causing two
12 | // maximas to occur side by side. Eg. [1 1 0.8 0.8 0.5 ..]. The CUDA kernel gives
13 | // [0.8 1 0.8 0.8 0.5 ..] Hence for this special case in the 1st inner border, we look at the
14 | // visible regions.
15 |
16 | const auto index = y*w + x;
17 | if (1 < x && x < (w-2) && 1 < y && y < (h-2))
18 | {
19 | const auto value = sourcePtr[index];
20 | if (value > threshold)
21 | {
22 | const auto topLeft = sourcePtr[(y-1)*w + x-1];
23 | const auto top = sourcePtr[(y-1)*w + x];
24 | const auto topRight = sourcePtr[(y-1)*w + x+1];
25 | const auto left = sourcePtr[ y*w + x-1];
26 | const auto right = sourcePtr[ y*w + x+1];
27 | const auto bottomLeft = sourcePtr[(y+1)*w + x-1];
28 | const auto bottom = sourcePtr[(y+1)*w + x];
29 | const auto bottomRight = sourcePtr[(y+1)*w + x+1];
30 |
31 | if (value > topLeft && value > top && value > topRight
32 | && value > left && value > right
33 | && value > bottomLeft && value > bottom && value > bottomRight)
34 | kernelPtr[index] = 1;
35 | else
36 | kernelPtr[index] = 0;
37 | }
38 | else
39 | kernelPtr[index] = 0;
40 | }
41 | else if (x == 1 || x == (w-2) || y == 1 || y == (h-2))
42 | {
43 | //kernelPtr[index] = 0;
44 | const auto value = sourcePtr[index];
45 | if (value > threshold)
46 | {
47 | const auto topLeft = ((0 < x && 0 < y) ? sourcePtr[(y-1)*w + x-1] : threshold);
48 | const auto top = (0 < y ? sourcePtr[(y-1)*w + x] : threshold);
49 | const auto topRight = ((0 < y && x < (w-1)) ? sourcePtr[(y-1)*w + x+1] : threshold);
50 | const auto left = (0 < x ? sourcePtr[ y*w + x-1] : threshold);
51 | const auto right = (x < (w-1) ? sourcePtr[y*w + x+1] : threshold);
52 | const auto bottomLeft = ((y < (h-1) && 0 < x) ? sourcePtr[(y+1)*w + x-1] : threshold);
53 | const auto bottom = (y < (h-1) ? sourcePtr[(y+1)*w + x] : threshold);
54 | const auto bottomRight = ((x < (w-1) && y < (h-1)) ? sourcePtr[(y+1)*w + x+1] : threshold);
55 |
56 | if (value >= topLeft && value >= top && value >= topRight
57 | && value >= left && value >= right
58 | && value >= bottomLeft && value >= bottom && value >= bottomRight)
59 | kernelPtr[index] = 1;
60 | else
61 | kernelPtr[index] = 0;
62 | }
63 | else
64 | kernelPtr[index] = 0;
65 | }
66 | else
67 | kernelPtr[index] = 0;
68 | }
69 |
70 | template <typename T>
71 | void nmsAccuratePeakPosition(T* output, const T* const sourcePtr, const int& peakLocX, const int& peakLocY,
72 | const int& width, const int& height)
73 | {
74 | T xAcc = 0.f;
75 | T yAcc = 0.f;
76 | T scoreAcc = 0.f;
77 | const auto dWidth = 3;
78 | const auto dHeight = 3;
79 | for (auto dy = -dHeight ; dy <= dHeight ; dy++)
80 | {
81 | const auto y = peakLocY + dy;
82 | if (0 <= y && y < height) // Default height = 368
83 | {
84 | for (auto dx = -dWidth ; dx <= dWidth ; dx++)
85 | {
86 | const auto x = peakLocX + dx;
87 | if (0 <= x && x < width) // Default width = 656
88 | {
89 | const auto score = sourcePtr[y * width + x];
90 | if (score > 0)
91 | {
92 | xAcc += x*score;
93 | yAcc += y*score;
94 | scoreAcc += score;
95 | }
96 | }
97 | }
98 | }
99 | }
100 |
101 | // Offset to keep Matlab format (empirically higher acc)
102 | // Best results for 1 scale: x + 0, y + 0.5
103 | // +0.5 to both to keep Matlab format
104 | // Hard code offset x=y=0.5
105 | output[0] = xAcc / scoreAcc + 0.5;
106 | output[1] = yAcc / scoreAcc + 0.5;
107 | output[2] = sourcePtr[peakLocY*width + peakLocX];
108 | }
109 |
110 | template <typename T>
111 | void nmsCpu(T* targetPtr, int* kernelPtr, const T* const sourcePtr, const T threshold,
112 | const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize)
113 | {
114 | try
115 | {
116 | // Sanity checks
117 | if (sourceSize.empty())
118 | printf("sourceSize cannot be empty. %d, %s, %s\n", __LINE__, __FUNCTION__, __FILE__);
119 | if (targetSize.empty())
120 | printf("targetSize cannot be empty. %d, %s, %s\n", __LINE__, __FUNCTION__, __FILE__);
121 | if (threshold < 0 || threshold > 1.0)
122 | printf("threshold value invalid. %d, %s, %s\n", __LINE__, __FUNCTION__, __FILE__);
123 |
124 | // Params
125 | const auto channels = targetSize[1]; // 57
126 | const auto sourceHeight = sourceSize[2]; // 368
127 | const auto sourceWidth = sourceSize[3]; // 496
128 | const auto targetPeaks = targetSize[2]; // 97
129 | const auto targetPeakVec = targetSize[3]; // 3
130 | const auto sourceChannelOffset = sourceWidth * sourceHeight;
131 | const auto targetChannelOffset = targetPeaks * targetPeakVec;
132 |
133 | // Per channel operation
134 | for (auto c = 0 ; c < channels ; c++)
135 | {
136 | auto* currKernelPtr = &kernelPtr[c*sourceChannelOffset];
137 | const T* currSourcePtr = &sourcePtr[c*sourceChannelOffset];
138 |
139 | for (auto y = 0; y < sourceHeight; y++)
140 | for (auto x = 0; x < sourceWidth; x++)
141 | nmsRegisterKernelCPU(currKernelPtr, currSourcePtr, sourceWidth, sourceHeight, threshold, x, y);
142 |
143 | auto currentPeakCount = 1;
144 | auto* currTargetPtr = &targetPtr[c*targetChannelOffset];
145 | for (auto y = 0; y < sourceHeight; y++)
146 | {
147 | for (auto x = 0; x < sourceWidth; x++)
148 | {
149 | const auto index = y*sourceWidth + x;
150 | // Find high intensity points
151 | if (currentPeakCount < targetPeaks)
152 | {
153 | if (currKernelPtr[index] == 1)
154 | {
155 | // Accurate Peak Position
156 | nmsAccuratePeakPosition(&currTargetPtr[currentPeakCount*3], currSourcePtr, x, y,
157 | sourceWidth, sourceHeight);
158 | currentPeakCount++;
159 | }
160 | }
161 | }
162 | }
163 | currTargetPtr[0] = T(currentPeakCount-1);
164 | }
165 | }
166 | catch (const std::exception& e)
167 | {
168 | printf("exception: %s, %d, %s, %s\n", e.what(), __LINE__, __FUNCTION__, __FILE__);
169 | }
170 | }
171 |
172 | template void nmsCpu(
173 | float* targetPtr, int* kernelPtr, const float* const sourcePtr, const float threshold,
174 | const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize);
175 | template void nmsCpu(
176 | double* targetPtr, int* kernelPtr, const double* const sourcePtr, const double threshold,
177 | const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize);
178 |
179 |
--------------------------------------------------------------------------------
/libs/nvdsinfer/nms_cpu.h:
--------------------------------------------------------------------------------
1 | #ifndef NMS_CPU_H
2 | #define NMS_CPU_H
3 |
4 | #include <array>
5 |
6 | template <typename T>
7 | void nmsCpu(
8 | T* targetPtr, int* kernelPtr, const T* const sourcePtr, const T threshold, const std::array<int, 4>& targetSize,
9 | const std::array<int, 4>& sourceSize);
10 |
11 | #endif // NMS_CPU_H
--------------------------------------------------------------------------------
/libs/nvdsinfer/nvdsinfer_context_impl.cpp:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * NVIDIA Corporation and its licensors retain all intellectual property
5 | * and proprietary rights in and to this software, related documentation
6 | * and any modifications thereto. Any use, reproduction, disclosure or
7 | * distribution of this software and related documentation without an express
8 | * license agreement from NVIDIA Corporation is strictly prohibited.
9 | *
10 | */
11 |
12 | #include <dlfcn.h>
13 | #include <unistd.h>
14 | #include <string.h>
15 | #include <algorithm>
16 | #include <fstream>
17 | #include <iterator>
18 | #include <string>
19 |
20 | #include "nvtx3/nvToolsExtCudaRt.h"
21 |
22 | #include "nvdsinfer_context_impl.h"
23 | #include "nvdsinfer_conversion.h"
24 |
25 | #include <NvInferPlugin.h>
26 | #include <NvCaffeParser.h>
27 | #include "nvdsinfer_custom_impl.h"
28 |
29 | /* Function types for custom library interfaces. */
30 |
31 | using NvDsInferPluginFactoryCaffeGetFcn = decltype (&NvDsInferPluginFactoryCaffeGet);
32 | using NvDsInferPluginFactoryCaffeDestroyFcn = decltype (&NvDsInferPluginFactoryCaffeDestroy);
33 |
34 | using NvDsInferPluginFactoryUffGetFcn = decltype (&NvDsInferPluginFactoryUffGet);
35 | using NvDsInferPluginFactoryUffDestroyFcn = decltype (&NvDsInferPluginFactoryUffDestroy);
36 |
37 | using NvDsInferPluginFactoryRuntimeGetFcn = decltype (&NvDsInferPluginFactoryRuntimeGet);
38 | using NvDsInferPluginFactoryRuntimeDestroyFcn = decltype (&NvDsInferPluginFactoryRuntimeDestroy);
39 |
40 | using NvDsInferInitializeInputLayersFcn = decltype (&NvDsInferInitializeInputLayers);
41 |
42 | using NvDsInferCudaEngineGetFcn = decltype (&NvDsInferCudaEngineGet);
43 |
44 | /* Pair data type for returning input back to caller. */
45 | using NvDsInferReturnInputPair = std::pair<NvDsInferContextReturnInputAsyncFunc, void *>;
46 |
47 | static const int WORKSPACE_SIZE = 450 * 1024 * 1024;
48 |
49 | using namespace nvinfer1;
50 | using namespace std;
51 |
52 | /*
53 | * TensorRT INT8 Calibration implementation. This implementation requires
54 | * pre-generated INT8 Calibration Tables. Please refer TensorRT documentation
55 | * for information on the calibration tables and the procedure for creating the
56 | * tables.
57 | *
58 | * Since this implementation only reads from pre-generated calibration tables,
59 |  * only readCalibrationCache is required to be implemented.
60 | */
61 | class NvDsInferInt8Calibrator : public IInt8EntropyCalibrator2
62 | {
63 | public:
64 | NvDsInferInt8Calibrator(string calibrationTableFile) :
65 | m_CalibrationTableFile(calibrationTableFile)
66 | {
67 | }
68 |
69 | ~NvDsInferInt8Calibrator()
70 | {
71 | }
72 |
73 | int
74 | getBatchSize() const override
75 | {
76 | return 0;
77 | }
78 |
79 | bool
80 | getBatch(void* bindings[], const char* names[], int nbBindings) override
81 | {
82 | return false;
83 | }
84 |
85 | /* Reads calibration table file contents into a buffer and returns a pointer
86 | * to the buffer.
87 | */
88 | const void*
89 | readCalibrationCache(size_t& length) override
90 | {
91 | m_CalibrationCache.clear();
92 | ifstream input(m_CalibrationTableFile, std::ios::binary);
93 | input >> noskipws;
94 | if (input.good())
95 | copy(std::istream_iterator<char>(input),
96 | istream_iterator<char>(),
97 | back_inserter(m_CalibrationCache));
98 |
99 | length = m_CalibrationCache.size();
100 | return length ? m_CalibrationCache.data() : nullptr;
101 | }
102 |
103 | void
104 | writeCalibrationCache(const void* cache, size_t length) override
105 | {
106 | }
107 |
108 | private:
109 | string m_CalibrationTableFile;
110 | vector<char> m_CalibrationCache;
111 | };
112 |
113 | /**
114 | * Get the size of the element from the data type
115 | */
116 | inline unsigned int
117 | getElementSize(NvDsInferDataType t)
118 | {
119 | switch (t)
120 | {
121 | case INT32:
122 | return 4;
123 | case FLOAT:
124 | return 4;
125 | case HALF:
126 | return 2;
127 | case INT8:
128 | return 1;
129 | }
130 |
131 | return 0;
132 | }
133 |
134 | static inline bool
135 | string_empty(char *str)
136 | {
137 | return strlen(str) == 0;
138 | }
139 |
140 | static inline bool
141 | file_accessible (char *path)
142 | {
143 | return (access(path, F_OK) != -1);
144 | }
145 |
146 | /* Cuda callback function for returning input back to client. */
147 | static void
148 | returnInputCudaCallback(cudaStream_t stream, cudaError_t status, void* userData)
149 | {
150 | NvDsInferReturnInputPair *pair = (NvDsInferReturnInputPair *) userData;
151 | pair->first(pair->second);
152 | delete pair;
153 | }
154 |
155 | std::mutex NvDsInferContextImpl::DlaExecutionMutex;
156 |
157 | void
158 | NvDsInferContextImpl::NvDsInferLogger::log(Severity severity, const char *msg)
159 | {
160 | NvDsInferLogLevel level;
161 |
162 | switch (severity)
163 | {
164 | case Severity::kINTERNAL_ERROR:
165 | case Severity::kERROR:
166 | level = NVDSINFER_LOG_ERROR;
167 | break;
168 | case Severity::kWARNING:
169 | level = NVDSINFER_LOG_WARNING;
170 | break;
171 | case Severity::kINFO:
172 | level = NVDSINFER_LOG_DEBUG;
173 | break;
174 | default:
175 | return;
176 | }
177 |
178 | callLogFunc(handle, handle->m_UniqueID, level, __func__, handle->m_LoggingFunc,
179 | handle->m_UserCtx, msg);
180 | }
181 |
182 | /* Default constructor. */
183 | NvDsInferContextImpl::NvDsInferContextImpl() :
184 | INvDsInferContext(),
185 | m_UniqueID(0),
186 | m_DBScanHandle(nullptr),
187 | m_CustomLibHandle(nullptr),
188 | m_CustomBBoxParseFunc(nullptr),
189 | m_CustomClassifierParseFunc(nullptr),
190 | m_RuntimePluginFactory(nullptr),
191 | m_GpuID (0),
192 | m_DlaEnabled (false),
193 | m_InferRuntime(nullptr),
194 | m_CudaEngine(nullptr),
195 | m_InferExecutionContext(nullptr),
196 | m_PreProcessStream(nullptr),
197 | m_InferStream(nullptr),
198 | m_BufferCopyStream(nullptr),
199 | m_MeanDataBuffer(nullptr),
200 | m_Batches(NVDSINFER_MIN_OUTPUT_BUFFERPOOL_SIZE),
201 | m_InputConsumedEvent(nullptr),
202 | m_PreProcessCompleteEvent(nullptr),
203 | m_InferCompleteEvent(nullptr),
204 | m_LoggingFunc(nullptr),
205 | m_UserCtx(nullptr),
206 | m_Initialized(false)
207 | {
208 | m_Logger.handle = this;
209 | }
210 |
211 | /* The function performs all the initialization steps required by the inference
212 | * engine. */
213 | NvDsInferStatus
214 | NvDsInferContextImpl::initialize(NvDsInferContextInitParams &initParams,
215 | void *userCtx, NvDsInferContextLoggingFunc logFunc)
216 | {
217 | cudaError_t cudaReturn;
218 | bool generateModel = true;
219 | std::string nvtx_name;
220 |
221 | m_LoggingFunc = logFunc;
222 | m_UserCtx = userCtx;
223 |
224 | /* Synchronization using once_flag and call_once to ensure TensorRT plugin
225 | * initialization function is called only once in case of multiple instances
226 | * of this constructor being called from different threads. */
227 | {
228 | static once_flag pluginInitFlag;
229 | call_once(pluginInitFlag,
230 | [this]() { initLibNvInferPlugins(&this->m_Logger, ""); } );
231 | }
232 |
233 | m_UniqueID = initParams.uniqueID;
234 | m_MaxBatchSize = initParams.maxBatchSize;
235 | m_NetworkScaleFactor = initParams.networkScaleFactor;
236 | m_NetworkInputFormat = initParams.networkInputFormat;
237 | m_NetworkType = initParams.networkType;
238 | m_UseDBScan = initParams.useDBScan;
239 |
240 | m_ClassifierThreshold = initParams.classifierThreshold;
241 | m_SegmentationThreshold = initParams.segmentationThreshold;
242 | m_GpuID = initParams.gpuID;
243 | m_CopyInputToHostBuffers = initParams.copyInputToHostBuffers;
244 | m_OutputBufferPoolSize = initParams.outputBufferPoolSize;
245 | m_Batches.resize(m_OutputBufferPoolSize);
246 |
247 | if (m_UniqueID == 0)
248 | {
249 | printError("Unique ID not set");
250 | return NVDSINFER_CONFIG_FAILED;
251 | }
252 |
253 | if (m_MaxBatchSize > NVDSINFER_MAX_BATCH_SIZE)
254 | {
255 | printError ("Batch-size (%d) more than maximum allowed batch-size (%d)",
256 | initParams.maxBatchSize, NVDSINFER_MAX_BATCH_SIZE);
257 | return NVDSINFER_CONFIG_FAILED;
258 | }
259 |
260 | if (initParams.numOutputLayers > 0 && initParams.outputLayerNames == nullptr)
261 | {
262 | printError("NumOutputLayers > 0 but outputLayerNames array not specified");
263 | return NVDSINFER_CONFIG_FAILED;
264 | }
265 |
266 | switch (m_NetworkType)
267 | {
268 | case NvDsInferNetworkType_Detector:
269 | m_NumDetectedClasses = initParams.numDetectedClasses;
270 | if (initParams.numDetectedClasses > 0 && initParams.perClassDetectionParams == nullptr)
271 | {
272 | printError("NumDetectedClasses > 0 but PerClassDetectionParams array not specified");
273 | return NVDSINFER_CONFIG_FAILED;
274 | }
275 |
276 | m_PerClassDetectionParams.assign(initParams.perClassDetectionParams,
277 | initParams.perClassDetectionParams + m_NumDetectedClasses);
278 | m_DetectionParams.numClassesConfigured = initParams.numDetectedClasses;
279 | m_DetectionParams.perClassThreshold.resize(initParams.numDetectedClasses);
280 |
281 | /* Resize the per class vector to the number of detected classes. */
282 | m_PerClassObjectList.resize(initParams.numDetectedClasses);
283 | if (!m_UseDBScan)
284 | {
285 | m_PerClassCvRectList.resize(initParams.numDetectedClasses);
286 | }
287 |
288 | /* Fill the class thresholds in the m_DetectionParams structure. This
289 | * will be required during parsing. */
290 | for (unsigned int i = 0; i < initParams.numDetectedClasses; i++)
291 | {
292 | m_DetectionParams.perClassThreshold[i] =
293 | m_PerClassDetectionParams[i].threshold;
294 | }
295 | break;
296 | case NvDsInferNetworkType_Classifier:
297 | break;
298 | case NvDsInferNetworkType_Segmentation:
299 | break;
300 | case NvDsInferNetworkType_Other:
301 | break;
302 | default:
303 | printError("Unsupported network type");
304 | return NVDSINFER_CONFIG_FAILED;
305 | }
306 |
307 | switch (initParams.networkMode)
308 | {
309 | case NvDsInferNetworkMode_FP32:
310 | case NvDsInferNetworkMode_FP16:
311 | case NvDsInferNetworkMode_INT8:
312 | break;
313 | default:
314 | printError("Unsupported network dataType");
315 | return NVDSINFER_CONFIG_FAILED;
316 | }
317 |
318 | if (m_OutputBufferPoolSize < NVDSINFER_MIN_OUTPUT_BUFFERPOOL_SIZE)
319 | {
320 | printError("Output buffer pool size (%d) less than minimum required(%d)",
321 | m_OutputBufferPoolSize, NVDSINFER_MIN_OUTPUT_BUFFERPOOL_SIZE);
322 | return NVDSINFER_CONFIG_FAILED;
323 | }
324 |
325 | /* Set the cuda device to be used. */
326 | cudaReturn = cudaSetDevice(m_GpuID);
327 | if (cudaReturn != cudaSuccess)
328 | {
329 | printError("Failed to set cuda device (%s).", cudaGetErrorName(cudaReturn));
330 | return NVDSINFER_CUDA_ERROR;
331 | }
332 |
333 | /* Create the API root class. */
334 | m_InferRuntime = createInferRuntime(m_Logger);
335 | if (!m_InferRuntime)
336 | {
337 | printError("Failed to create Infer runtime engine.");
338 | return NVDSINFER_TENSORRT_ERROR;
339 | }
340 |
341 | /* Load the custom library if specified. */
342 | if (!string_empty(initParams.customLibPath))
343 | {
344 | m_CustomLibHandle = dlopen (initParams.customLibPath, RTLD_LAZY);
345 | if (!m_CustomLibHandle)
346 | {
347 | printError("Could not open custom lib: %s", dlerror());
348 | return NVDSINFER_CUSTOM_LIB_FAILED;
349 | }
350 | }
351 |
352 | /* If the custom library is specified, check if PluginFactory instance is
353 | * required during deserialization of cuda engine. */
354 | NvDsInferPluginFactoryRuntimeGetFcn fcn = nullptr;
355 | if (m_CustomLibHandle)
356 | {
357 | fcn = (NvDsInferPluginFactoryRuntimeGetFcn)
358 | dlsym(m_CustomLibHandle, "NvDsInferPluginFactoryRuntimeGet");
359 | if (fcn)
360 | {
361 | if (!fcn(m_RuntimePluginFactory))
362 | {
363 | printError("Failed to get runtime plugin factory instance"
364 | " from custom library.");
365 | return NVDSINFER_CUSTOM_LIB_FAILED;
366 | }
367 | }
368 | }
369 |
370 | if (!string_empty(initParams.modelEngineFilePath))
371 | {
372 | if (useEngineFile(initParams) == NVDSINFER_SUCCESS)
373 | {
374 | generateModel = false;
375 | }
376 | }
377 |
378 | if (generateModel)
379 | {
380 | NvDsInferStatus status;
381 | IHostMemory *gieModelStream;
382 | printInfo("Trying to create engine from model files");
383 |
384 | /* Create the gie Model stream from the model files and other parameters. */
385 | status = generateTRTModel(initParams, gieModelStream);
386 | if (status != NVDSINFER_SUCCESS)
387 | {
388 | printError("Failed to create engine from model files");
389 | return status;
390 | }
391 |
392 | /* Use DLA if specified. */
393 | if (initParams.useDLA)
394 | {
395 | m_InferRuntime->setDLACore(initParams.dlaCore);
396 | }
397 |
398 | /* Create the cuda engine from the serialized stream. */
399 | m_CudaEngine =
400 | m_InferRuntime->deserializeCudaEngine(gieModelStream->data(),
401 | gieModelStream->size(),
402 | m_RuntimePluginFactory);
403 | /* Destroy the model stream, since cuda engine has been serialized. */
404 | gieModelStream->destroy();
405 |
406 | if (!m_CudaEngine)
407 | {
408 | printError("Failed to create engine from serialized stream");
409 | return NVDSINFER_TENSORRT_ERROR;
410 | }
411 | if (checkEngineParams(initParams) != NVDSINFER_SUCCESS)
412 | {
413 | return NVDSINFER_CONFIG_FAILED;
414 | }
415 | }
416 |
417 | m_DlaEnabled = initParams.useDLA;
418 |
419 | /* Get the network input dimensions. */
420 | DimsCHW inputDims =
421 | static_cast<DimsCHW>(m_CudaEngine->getBindingDimensions(INPUT_LAYER_INDEX));
422 | m_NetworkInfo.width = inputDims.w();
423 | m_NetworkInfo.height = inputDims.h();
424 | m_NetworkInfo.channels = inputDims.c();
425 |
426 | switch (m_NetworkInputFormat)
427 | {
428 | case NvDsInferFormat_RGB:
429 | case NvDsInferFormat_BGR:
430 | if (m_NetworkInfo.channels != 3)
431 | {
432 | printError("RGB/BGR input format specified but network input"
433 | " channels is not 3");
434 | return NVDSINFER_CONFIG_FAILED;
435 | }
436 | break;
437 | case NvDsInferFormat_GRAY:
438 | if (m_NetworkInfo.channels != 1)
439 | {
440 | printError("GRAY input format specified but network input "
441 | "channels is not 1.");
442 | return NVDSINFER_CONFIG_FAILED;
443 | }
444 | break;
445 | default:
446 | printError("Unknown input format");
447 | return NVDSINFER_CONFIG_FAILED;
448 | }
449 |
450 | /* Create the mean data buffer from mean image file or per color component
451 | * offsets if either are specified. */
452 | if (!string_empty(initParams.meanImageFilePath) || initParams.numOffsets > 0)
453 | {
454 | /* Mean Image File specified. Allocate the mean image buffer on device
455 | * memory. */
456 | cudaReturn = cudaMalloc((void **)&m_MeanDataBuffer,
457 | m_NetworkInfo.width * m_NetworkInfo.height *
458 | m_NetworkInfo.channels * sizeof (float));
459 | if (cudaReturn != cudaSuccess)
460 | {
461 | printError("Failed to allocate cuda buffer for mean image(%s)",
462 | cudaGetErrorName(cudaReturn));
463 | return NVDSINFER_CUDA_ERROR;
464 | }
465 | /* Read the mean image file (PPM format) if specified and copy the
466 | * contents into the buffer. */
467 | if (!string_empty(initParams.meanImageFilePath))
468 | {
469 | if (!file_accessible(initParams.meanImageFilePath))
470 | {
471 | printError("Cannot access mean image file '%s'",
472 | initParams.meanImageFilePath);
473 | return NVDSINFER_CONFIG_FAILED;
474 | }
475 | NvDsInferStatus status = readMeanImageFile(initParams.meanImageFilePath);
476 | if (status != NVDSINFER_SUCCESS)
477 | {
478 | printError("Failed to read mean image file");
479 | return status;
480 | }
481 | }
482 | /* Create the mean data buffer from per-channel offsets. */
483 | else
484 | {
485 | /* Make sure the number of offsets are equal to the number of input
486 | * channels. */
487 | if (initParams.numOffsets != m_NetworkInfo.channels)
488 | {
489 | printError("Number of offsets(%d) not equal to number of input "
490 | "channels(%d)", initParams.numOffsets,
491 | m_NetworkInfo.channels);
492 | return NVDSINFER_CONFIG_FAILED;
493 | }
494 |
495 | vector<float> meanData(
496 | m_NetworkInfo.channels * m_NetworkInfo.width *
497 | m_NetworkInfo.height);
498 | for (size_t j = 0; j < m_NetworkInfo.width * m_NetworkInfo.height; j++)
499 | {
500 | for (size_t i = 0; i < m_NetworkInfo.channels; i++)
501 | {
502 | meanData[j * m_NetworkInfo.channels + i] = initParams.offsets[i];
503 | }
504 | }
505 | cudaReturn = cudaMemcpy(m_MeanDataBuffer, meanData.data(),
506 | meanData.size() * sizeof(float), cudaMemcpyHostToDevice);
507 | if (cudaReturn != cudaSuccess)
508 | {
509 | printError("Failed to copy mean data to mean data cuda buffer(%s)",
510 | cudaGetErrorName(cudaReturn));
511 | return NVDSINFER_CUDA_ERROR;
512 | }
513 | }
514 | }
515 |
516 | /* Get information on all bound layers. */
517 | getBoundLayersInfo();
518 |
519 | /* Create the Infer Execution Context. */
520 | m_InferExecutionContext = m_CudaEngine->createExecutionContext();
521 | if (!m_InferExecutionContext)
522 | {
523 | printError("Failed to create Infer Execution Context");
524 | return NVDSINFER_TENSORRT_ERROR;
525 | }
526 |
527 | /* Create the cuda stream on which pre-processing jobs will be executed. */
528 | cudaReturn = cudaStreamCreateWithFlags(&m_PreProcessStream,
529 | cudaStreamNonBlocking);
530 | if (cudaReturn != cudaSuccess)
531 | {
532 | printError("Failed to create cudaStream(%s)",
533 | cudaGetErrorName(cudaReturn));
534 | return NVDSINFER_TENSORRT_ERROR;
535 | }
536 | nvtx_name = "nvdsinfer_preprocess_uid=" + to_string(m_UniqueID);
537 | nvtxNameCudaStreamA (m_PreProcessStream, nvtx_name.c_str());
538 |
539 | /* Create the cuda stream on which inference jobs will be executed. */
540 | cudaReturn = cudaStreamCreateWithFlags(&m_InferStream, cudaStreamNonBlocking);
541 | if (cudaReturn != cudaSuccess)
542 | {
543 | printError("Failed to create cudaStream(%s)",
544 | cudaGetErrorName(cudaReturn));
545 | return NVDSINFER_CUDA_ERROR;
546 | }
547 | nvtx_name = "nvdsinfer_infer_uid=" + to_string(m_UniqueID);
548 | nvtxNameCudaStreamA (m_InferStream, nvtx_name.c_str());
549 |
550 | /* Create the cuda stream on which device to host memcpy jobs will be
551 | * executed. */
552 | cudaReturn = cudaStreamCreateWithFlags (&m_BufferCopyStream,
553 | cudaStreamNonBlocking);
554 | if (cudaReturn != cudaSuccess)
555 | {
556 | printError("Failed to create cudaStream(%s)",
557 | cudaGetErrorName(cudaReturn));
558 | return NVDSINFER_CUDA_ERROR;
559 | }
560 | nvtx_name = "nvdsinfer_DtoHcopy_uid=" + to_string(m_UniqueID);
561 | nvtxNameCudaStreamA (m_BufferCopyStream, nvtx_name.c_str());
562 |
563 | /* Allocate binding buffers on the device and the corresponding host
564 | * buffers. */
565 | NvDsInferStatus status = allocateBuffers();
566 | if (status != NVDSINFER_SUCCESS)
567 | {
568 | printError("Failed to allocate buffers");
569 | return status;
570 | }
571 |
572 | /* Parse the labels file if specified. */
573 | if (!string_empty(initParams.labelsFilePath))
574 | {
575 | if (!file_accessible(initParams.labelsFilePath))
576 | {
577 | printError("Could not access labels file '%s'", initParams.labelsFilePath);
578 | return NVDSINFER_CONFIG_FAILED;
579 | }
580 | NvDsInferStatus status = parseLabelsFile(initParams.labelsFilePath);
581 | if (status != NVDSINFER_SUCCESS)
582 | {
583 | printError("Failed to read labels file");
584 | return status;
585 | }
586 | }
587 |
588 | /* Cuda event to synchronize between consumption of input binding buffer by
589 | * the cuda engine and the pre-processing kernel which writes to the input
590 | * binding buffer. */
591 | cudaReturn = cudaEventCreateWithFlags(&m_InputConsumedEvent,
592 | cudaEventDisableTiming);
593 | if (cudaReturn != cudaSuccess)
594 | {
595 | printError("Failed to create cuda event(%s)", cudaGetErrorName(cudaReturn));
596 | return NVDSINFER_CUDA_ERROR;
597 | }
598 | nvtx_name = "nvdsinfer_TRT_input_consumed_uid=" + to_string(m_UniqueID);
599 | nvtxNameCudaEventA (m_InputConsumedEvent, nvtx_name.c_str());
600 |
601 | /* Cuda event to synchronize between completion of the pre-processing kernels
602 | * and enqueuing the next set of binding buffers for inference. */
603 | cudaReturn = cudaEventCreateWithFlags(&m_PreProcessCompleteEvent,
604 | cudaEventDisableTiming);
605 | if (cudaReturn != cudaSuccess)
606 | {
607 | printError("Failed to create cuda event(%s)", cudaGetErrorName(cudaReturn));
608 | return NVDSINFER_CUDA_ERROR;
609 | }
610 | nvtx_name = "nvdsinfer_preprocess_complete_uid=" + to_string(m_UniqueID);
611 | nvtxNameCudaEventA (m_PreProcessCompleteEvent, nvtx_name.c_str());
612 |
613 | /* Cuda event to synchronize between completion of inference on a batch
614 | * and copying the output contents from device to host memory. */
615 | cudaReturn = cudaEventCreateWithFlags(&m_InferCompleteEvent,
616 | cudaEventDisableTiming);
617 | if (cudaReturn != cudaSuccess)
618 | {
619 | printError("Failed to create cuda event(%s)", cudaGetErrorName(cudaReturn));
620 | return NVDSINFER_CUDA_ERROR;
621 | }
622 | nvtx_name = "nvdsinfer_infer_complete_uid=" + to_string(m_UniqueID);
623 | nvtxNameCudaEventA (m_InferCompleteEvent, nvtx_name.c_str());
624 |
625 | /* If custom parse function is specified get the function address from the
626 | * custom library. */
627 | if (m_CustomLibHandle && m_NetworkType == NvDsInferNetworkType_Detector &&
628 | !string_empty(initParams.customBBoxParseFuncName))
629 | {
630 | m_CustomBBoxParseFunc =
631 | (NvDsInferParseCustomFunc) dlsym(m_CustomLibHandle,
632 | initParams.customBBoxParseFuncName);
633 | if (!m_CustomBBoxParseFunc)
634 | {
635 | printError("Could not find parse func '%s' in custom library",
636 | initParams.customBBoxParseFuncName);
637 | return NVDSINFER_CONFIG_FAILED;
638 | }
639 | }
640 |
641 | if (m_CustomLibHandle && m_NetworkType == NvDsInferNetworkType_Classifier &&
642 | !string_empty(initParams.customClassifierParseFuncName))
643 | {
644 | m_CustomClassifierParseFunc =
645 | (NvDsInferClassiferParseCustomFunc) dlsym(m_CustomLibHandle,
646 | initParams.customClassifierParseFuncName);
647 | if (!m_CustomClassifierParseFunc)
648 | {
649 | printError("Could not find parse func '%s' in custom library",
650 | initParams.customClassifierParseFuncName);
651 | return NVDSINFER_CONFIG_FAILED;
652 | }
653 | }
654 |
655 | /* If there is more than one input layer (i.e. non-image inputs) and a custom
656 | * library is specified, try to initialize these layers. */
657 | if (m_AllLayerInfo.size() > 1 + m_OutputLayerInfo.size())
658 | {
659 | NvDsInferStatus status = initNonImageInputLayers();
660 | if (status != NVDSINFER_SUCCESS)
661 | {
662 | printError("Failed to initialize non-image input layers");
663 | return status;
664 | }
665 | }
666 |
667 | if (m_UseDBScan)
668 | {
669 | m_DBScanHandle = NvDsInferDBScanCreate();
670 | }
671 |
672 | m_Initialized = true;
673 |
674 | return NVDSINFER_SUCCESS;
675 | }
676 |
677 | /* Get the network input resolution. This is required since this implementation
678 | * requires that the caller supplies an input buffer having the network
679 | * resolution.
680 | */
681 | void
682 | NvDsInferContextImpl::getNetworkInfo(NvDsInferNetworkInfo &networkInfo)
683 | {
684 | networkInfo = m_NetworkInfo;
685 | }
686 |
687 | /* Allocate binding buffers for all bound layers on the device memory. The size
688 | * of the buffers allocated is calculated from the dimensions of the layers, the
689 | * data type of the layer and the max batch size of the infer cuda engine.
690 | *
691 | * NvDsInfer enqueue API requires an array of (void *) buffer pointers. The length
692 | * of the array is equal to the number of bound layers. The buffer corresponding
693 | * to a layer is placed at an index equal to the layer's binding index.
694 | *
695 | * Also allocate corresponding host buffers for output layers in system memory.
696 | *
697 | * Multiple sets of the device and host buffers are allocated so that (inference +
698 | * device to host copy) and output layers parsing can be parallelized.
699 | */
700 | NvDsInferStatus
701 | NvDsInferContextImpl::allocateBuffers()
702 | {
703 | cudaError_t cudaReturn;
704 |
705 | // m_CudaEngine->createExecutionContext();
706 | /* Resize the binding buffers vector to the number of bound layers. */
707 | m_BindingBuffers.assign(m_AllLayerInfo.size(), nullptr);
708 |
709 | for (unsigned int i = 0; i < m_AllLayerInfo.size(); i++)
710 | {
711 | size_t size = m_MaxBatchSize * m_AllLayerInfo[i].dims.numElements *
712 | getElementSize(m_AllLayerInfo[i].dataType);
713 |
714 | /* Do not allocate device memory for output layers here. */
715 | if (!m_CudaEngine->bindingIsInput(i))
716 | continue;
717 |
718 | /* Allocate device memory for the binding buffer. */
719 | cudaReturn = cudaMalloc(&m_BindingBuffers[i], size);
720 | if (cudaReturn != cudaSuccess)
721 | {
722 | printError("Failed to allocate cuda buffer(%s)",
723 | cudaGetErrorName(cudaReturn));
724 | return NVDSINFER_CUDA_ERROR;
725 | }
726 | }
727 |
728 | /* Initialize the batch vector, allocate host memory for the layers,
729 | * add all the free indexes to the free queue. */
730 | for (unsigned int i = 0; i < m_Batches.size(); i++)
731 | {
732 | NvDsInferBatch & batch = m_Batches[i];
733 | /* Resize the host buffers vector to the number of bound layers. */
734 | batch.m_HostBuffers.resize(m_AllLayerInfo.size());
735 | batch.m_DeviceBuffers.assign(m_AllLayerInfo.size(), nullptr);
736 |
737 |
738 | for (unsigned int j = 0; j < m_AllLayerInfo.size(); j++)
739 | {
740 | size_t size = m_MaxBatchSize * m_AllLayerInfo[j].dims.numElements *
741 | getElementSize(m_AllLayerInfo[j].dataType);
742 |
743 | if (m_CudaEngine->bindingIsInput(j))
744 | {
745 | /* Reuse input binding buffer pointers. */
746 | batch.m_DeviceBuffers[j] = m_BindingBuffers[j];
747 | }
748 | else
749 | {
750 | /* Allocate device memory for output layers here. */
751 | cudaReturn = cudaMalloc(&batch.m_DeviceBuffers[j], size);
752 | if (cudaReturn != cudaSuccess)
753 | {
754 | printError("Failed to allocate cuda buffer(%s)",
755 | cudaGetErrorName(cudaReturn));
756 | return NVDSINFER_CUDA_ERROR;
757 | }
758 | }
759 |
760 | /* Allocate host memory for input layers only if application
761 | * needs access to the input layer contents. */
762 | if (m_CudaEngine->bindingIsInput(j) && !m_CopyInputToHostBuffers)
763 | continue;
764 |
765 | /* Resize the uint8_t vector to the size (in bytes) of the buffer.
766 | * The underlying heap memory can be used as host buffer. */
767 | batch.m_HostBuffers[j].resize(size);
768 | }
769 | cudaReturn = cudaEventCreateWithFlags (&batch.m_CopyCompleteEvent,
770 | cudaEventDisableTiming | cudaEventBlockingSync);
771 | if (cudaReturn != cudaSuccess)
772 | {
773 | printError("Failed to create cuda event(%s)",
774 | cudaGetErrorName(cudaReturn));
775 | return NVDSINFER_CUDA_ERROR;
776 | }
777 |
778 | /* Add all the indexes to the free queue initially. */
779 | m_FreeIndexQueue.push(i);
780 | }
781 |
782 | return NVDSINFER_SUCCESS;
783 | }
784 |
785 | /* Get properties of bound layers like the name, dimension, datatype and
786 | * fill the m_AllLayerInfo and m_OutputLayerInfo vectors.
787 | */
788 | NvDsInferStatus
789 | NvDsInferContextImpl::getBoundLayersInfo()
790 | {
791 | for (int i = 0; i < m_CudaEngine->getNbBindings(); i++)
792 | {
793 | NvDsInferLayerInfo info;
794 | Dims d = m_CudaEngine->getBindingDimensions(i);
795 |
796 | info.isInput = m_CudaEngine->bindingIsInput(i);
797 | info.bindingIndex = i;
798 | info.layerName = m_CudaEngine->getBindingName(i);
799 | info.dims.numDims = d.nbDims;
800 | info.dims.numElements = 1;
801 | for (int j = 0; j < d.nbDims; j++)
802 | {
803 | info.dims.d[j] = d.d[j];
804 | info.dims.numElements *= d.d[j];
805 | }
806 |
807 | switch (m_CudaEngine->getBindingDataType(i))
808 | {
809 | case DataType::kFLOAT:
810 | info.dataType = FLOAT;
811 | break;
812 | case DataType::kHALF:
813 | info.dataType = HALF;
814 | break;
815 | case DataType::kINT32:
816 | info.dataType = INT32;
817 | break;
818 | case DataType::kINT8:
819 | info.dataType = INT8;
820 | break;
821 | default:
822 | printError("Unknown data type for bound layer i(%s)",
823 | info.layerName);
824 | return NVDSINFER_TENSORRT_ERROR;
825 | }
826 |
827 | m_AllLayerInfo.push_back(info);
828 | if (!m_CudaEngine->bindingIsInput(i))
829 | m_OutputLayerInfo.push_back(info);
830 | }
831 | return NVDSINFER_SUCCESS;
832 | }
833 |
834 | /* Initialize non-image input layers if the custom library has implemented
835 | * the interface. */
836 | NvDsInferStatus
837 | NvDsInferContextImpl::initNonImageInputLayers()
838 | {
839 | cudaError_t cudaReturn;
840 |
841 | /* Needs the custom library to be specified. */
842 | if (m_CustomLibHandle == nullptr)
843 | {
844 | printWarning("More than one input layers but custom initialization "
845 | "function not implemented");
846 | return NVDSINFER_SUCCESS;
847 | }
848 |
849 | /* Check if the interface to initialize the layers has been implemented. */
850 | NvDsInferInitializeInputLayersFcn fcn = (NvDsInferInitializeInputLayersFcn)
851 | dlsym(m_CustomLibHandle, "NvDsInferInitializeInputLayers");
852 | if (fcn == nullptr)
853 | {
854 | printWarning("More than one input layers but custom initialization "
855 | "function not implemented");
856 | return NVDSINFER_SUCCESS;
857 | }
858 |
859 | /* Interface implemented. */
860 | /* Vector of NvDsInferLayerInfo for non-image input layers. */
861 | vector<NvDsInferLayerInfo> inputLayers;
862 | for (auto &layer : m_AllLayerInfo)
863 | {
864 | if (m_CudaEngine->bindingIsInput(layer.bindingIndex) &&
865 | layer.bindingIndex != INPUT_LAYER_INDEX)
866 | {
867 | inputLayers.push_back(layer);
868 | }
869 | }
870 |
871 | /* Vector of host memories that can be initialized using CPUs. */
872 | vector<vector<uint8_t>> initBuffers(inputLayers.size());
873 |
874 | for (size_t i = 0; i < inputLayers.size(); i++)
875 | {
876 | /* For each layer calculate the size required for the layer, allocate
877 | * the host memory and assign the pointer to layer info structure. */
878 | size_t size = inputLayers[i].dims.numElements *
879 | getElementSize(inputLayers[i].dataType) * m_MaxBatchSize;
880 | initBuffers[i].resize(size);
881 | inputLayers[i].buffer = (void *) initBuffers[i].data();
882 | }
883 |
884 | /* Call the input layer initialization function. */
885 | if (!fcn(inputLayers, m_NetworkInfo, m_MaxBatchSize))
886 | {
887 | printError("Failed to initialize input layers using "
888 | "NvDsInferInitializeInputLayers() in custom lib");
889 | return NVDSINFER_CUSTOM_LIB_FAILED;
890 | }
891 |
892 | /* Memcpy the initialized contents from the host memory to device memory for
893 | * layer binding buffers. */
894 | for (size_t i = 0; i < inputLayers.size(); i++)
895 | {
896 | cudaReturn = cudaMemcpyAsync(m_BindingBuffers[inputLayers[i].bindingIndex],
897 | initBuffers[i].data(), initBuffers[i].size(),
898 | cudaMemcpyHostToDevice, m_InferStream);
899 | if (cudaReturn != cudaSuccess)
900 | {
901 | printError("Failed to copy from host to device memory (%s)",
902 | cudaGetErrorName(cudaReturn));
903 | return NVDSINFER_CUDA_ERROR;
904 | }
905 | /* Application has requested access to the bound buffer contents. Copy
906 | * the contents to all sets of host buffers. */
907 | if (m_CopyInputToHostBuffers)
908 | {
909 | for (size_t j = 0; j < m_Batches.size(); j++)
910 | {
911 | for (size_t i = 0; i < inputLayers.size(); i++)
912 | {
913 | m_Batches[j].m_HostBuffers[inputLayers[i].bindingIndex].
914 | assign(initBuffers[i].begin(), initBuffers[i].end());
915 | }
916 | }
917 | }
918 | }
919 | cudaReturn = cudaStreamSynchronize(m_InferStream);
920 | if (cudaReturn != cudaSuccess)
921 | {
922 | printError("Failed to synchronize cuda stream(%s)",
923 | cudaGetErrorName(cudaReturn));
924 | return NVDSINFER_CUDA_ERROR;
925 | }
926 |
927 | return NVDSINFER_SUCCESS;
928 | }
929 |
930 | /* Parse the labels file and extract the class label strings. For format of
931 | * the labels file, please refer to the custom models section in the DeepStreamSDK
932 | * documentation.
933 | */
934 | NvDsInferStatus
935 | NvDsInferContextImpl::parseLabelsFile(char *labelsFilePath)
936 | {
937 | ifstream labels_file(labelsFilePath);
938 | string delim { ';' };
939 | while (!labels_file.eof())
940 | {
941 | string line, word;
942 | vector<string> l;
943 | size_t pos = 0, oldpos = 0;
944 |
945 | getline(labels_file, line, '\n');
946 | if (line.empty())
947 | continue;
948 |
949 | while ((pos = line.find(delim, oldpos)) != string::npos)
950 | {
951 | word = line.substr(oldpos, pos - oldpos);
952 | l.push_back(word);
953 | oldpos = pos + delim.length();
954 | }
955 | l.push_back(line.substr(oldpos));
956 | m_Labels.push_back(l);
957 | }
958 | return NVDSINFER_SUCCESS;
959 | }
960 |
961 | /* Read the mean image ppm file and copy the mean image data to the mean
962 | * data buffer allocated on the device memory.
963 | */
964 | NvDsInferStatus
965 | NvDsInferContextImpl::readMeanImageFile(char *meanImageFilePath)
966 | {
967 | ifstream infile(meanImageFilePath, std::ifstream::binary);
968 | size_t size = m_NetworkInfo.width * m_NetworkInfo.height *
969 | m_NetworkInfo.channels;
970 | char tempMeanDataChar[size];
971 | float tempMeanDataFloat[size];
972 | cudaError_t cudaReturn;
973 |
974 | if (!infile.good())
975 | {
976 | printError("Could not open mean image file '%s'", meanImageFilePath);
977 | return NVDSINFER_CONFIG_FAILED;
978 | }
979 |
980 | string magic, max;
981 | unsigned int h, w;
982 | infile >> magic >> h >> w >> max;
983 |
984 | if (magic != "P3" && magic != "P6")
985 | {
986 | printError("Magic PPM identifier check failed");
987 | return NVDSINFER_CONFIG_FAILED;
988 | }
989 |
990 | if (w != m_NetworkInfo.width || h != m_NetworkInfo.height)
991 | {
992 | printError("Mismatch between ppm mean image resolution(%d x %d) and "
993 | "network resolution(%d x %d)", w, h, m_NetworkInfo.width,
994 | m_NetworkInfo.height);
995 | return NVDSINFER_CONFIG_FAILED;
996 | }
997 |
998 | infile.get();
999 | infile.read(tempMeanDataChar, size);
1000 | if (infile.gcount() != (int) size)
1001 | {
1002 | printError("Failed to read sufficient bytes from mean file");
1003 | return NVDSINFER_CONFIG_FAILED;
1004 | }
1005 |
1006 | for (size_t i = 0; i < size; i++)
1007 | {
1008 | tempMeanDataFloat[i] = (float) tempMeanDataChar[i];
1009 | }
1010 |
1011 | cudaReturn = cudaMemcpy(m_MeanDataBuffer, tempMeanDataFloat,
1012 | size * sizeof(float), cudaMemcpyHostToDevice);
1013 | if (cudaReturn != cudaSuccess)
1014 | {
1015 | printError("Failed to copy mean data to mean data buffer (%s)",
1016 | cudaGetErrorName(cudaReturn));
1017 | return NVDSINFER_CUDA_ERROR;
1018 | }
1019 |
1020 | return NVDSINFER_SUCCESS;
1021 | }
1022 |
1023 | NvDsInferStatus
1024 | NvDsInferContextImpl::queueInputBatch(NvDsInferContextBatchInput &batchInput)
1025 |
1026 | {
1027 | unsigned int batchSize = batchInput.numInputFrames;
1028 | unsigned int batchIndex;
1029 | void *bindingBuffers[m_AllLayerInfo.size()];
1030 | NvDsInferStatus status;
1031 | NvDsInferConvertFcn convertFcn = nullptr;
1032 |
1033 | /* Check that current batch size does not exceed max batch size. */
1034 | if (batchSize > m_MaxBatchSize)
1035 | {
1036 | printError("Not inferring on batch since it's size(%d) exceeds max batch"
1037 | " size(%d)", batchSize, m_MaxBatchSize);
1038 | return NVDSINFER_INVALID_PARAMS;
1039 | }
1040 |
1041 | /* DLA does not allow enqueuing batches smaller than the engine's maxBatchSize. */
1042 | int enqueueBatchSize = m_DlaEnabled ? m_MaxBatchSize : batchSize;
1043 |
1044 | /* Set the cuda device to be used. */
1045 | cudaError_t cudaReturn = cudaSetDevice(m_GpuID);
1046 | if (cudaReturn != cudaSuccess)
1047 | {
1048 | printError("Failed to set cuda device(%s)", cudaGetErrorName(cudaReturn));
1049 | return NVDSINFER_CUDA_ERROR;
1050 | }
1051 |
1052 |
1053 | /* Make the future jobs on the stream wait till the infer engine consumes
1054 | * the previous contents of the input binding buffer. */
1055 | cudaReturn = cudaStreamWaitEvent (m_PreProcessStream, m_InputConsumedEvent, 0);
1056 | if (cudaReturn != cudaSuccess)
1057 | {
1058 | printError("Failed to make stream wait on event(%s)",
1059 | cudaGetErrorName(cudaReturn));
1060 | return NVDSINFER_CUDA_ERROR;
1061 | }
1062 |
1063 | /* Find the required conversion function. */
1064 | switch (m_NetworkInputFormat)
1065 | {
1066 | case NvDsInferFormat_RGB:
1067 | switch (batchInput.inputFormat)
1068 | {
1069 | case NvDsInferFormat_RGB:
1070 | convertFcn = NvDsInferConvert_C3ToP3Float;
1071 | break;
1072 | case NvDsInferFormat_BGR:
1073 | convertFcn = NvDsInferConvert_C3ToP3RFloat;
1074 | break;
1075 | case NvDsInferFormat_RGBA:
1076 | convertFcn = NvDsInferConvert_C4ToP3Float;
1077 | break;
1078 | case NvDsInferFormat_BGRx:
1079 | convertFcn = NvDsInferConvert_C4ToP3RFloat;
1080 | break;
1081 | default:
1082 | printError("Input format conversion is not supported");
1083 | return NVDSINFER_INVALID_PARAMS;
1084 | }
1085 | break;
1086 | case NvDsInferFormat_BGR:
1087 | switch (batchInput.inputFormat)
1088 | {
1089 | case NvDsInferFormat_RGB:
1090 | convertFcn = NvDsInferConvert_C3ToP3RFloat;
1091 | break;
1092 | case NvDsInferFormat_BGR:
1093 | convertFcn = NvDsInferConvert_C3ToP3Float;
1094 | break;
1095 | case NvDsInferFormat_RGBA:
1096 | convertFcn = NvDsInferConvert_C4ToP3RFloat;
1097 | break;
1098 | case NvDsInferFormat_BGRx:
1099 | convertFcn = NvDsInferConvert_C4ToP3Float;
1100 | break;
1101 | default:
1102 | printError("Input format conversion is not supported");
1103 | return NVDSINFER_INVALID_PARAMS;
1104 | }
1105 | break;
1106 | case NvDsInferFormat_GRAY:
1107 | if (batchInput.inputFormat != NvDsInferFormat_GRAY)
1108 | {
1109 | printError("Input frame format is not GRAY.");
1110 | return NVDSINFER_INVALID_PARAMS;
1111 | }
1112 | convertFcn = NvDsInferConvert_C1ToP1Float;
1113 | break;
1114 | default:
1115 | printError("Unsupported network input format");
1116 | return NVDSINFER_INVALID_PARAMS;
1117 | }
1118 |
1119 | /* For each frame in the input batch convert/copy to the input binding buffer. */
1120 | for (unsigned int i = 0; i < batchSize; i++)
1121 | {
1122 | float *outPtr = (float *) m_BindingBuffers[INPUT_LAYER_INDEX] +
1123 | i * m_AllLayerInfo[INPUT_LAYER_INDEX].dims.numElements;
1124 |
1125 | /* Input needs to be pre-processed. */
1126 | convertFcn(outPtr, (unsigned char*) batchInput.inputFrames[i],
1127 | m_NetworkInfo.width, m_NetworkInfo.height,
1128 | batchInput.inputPitch, m_NetworkScaleFactor,
1129 | m_MeanDataBuffer, m_PreProcessStream);
1130 | }
1131 |
1132 | /* We may use multiple sets of the output device and host buffers since while the
1133 | * output of one batch is being parsed on the CPU, we can queue
1134 | * pre-processing and inference of another on the GPU. Pop an index from the
1135 | * free queue. Wait if queue is empty. */
1136 | {
1137 | unique_lock<mutex> lock(m_QueueMutex);
1138 | while (m_FreeIndexQueue.empty())
1139 | {
1140 | m_QueueCondition.wait(lock);
1141 | }
1142 | batchIndex = m_FreeIndexQueue.front();
1143 | m_FreeIndexQueue.pop();
1144 | }
1145 |
1146 | /* Inputs can be returned back once pre-processing is complete. */
1147 | if (batchInput.returnInputFunc)
1148 | {
1149 | cudaReturn = cudaStreamAddCallback(m_PreProcessStream, returnInputCudaCallback,
1150 | new NvDsInferReturnInputPair(batchInput.returnInputFunc,
1151 | batchInput.returnFuncData), 0);
1152 | if (cudaReturn != cudaSuccess)
1153 | {
1154 | printError("Failed to add cudaStream callback for returning input buffers (%s)",
1155 | cudaGetErrorName(cudaReturn));
1156 | return NVDSINFER_CUDA_ERROR;
1157 | }
1158 | }
1159 |
1160 | /* Fill the array of binding buffers for the current batch. */
1161 | std::copy(m_Batches[batchIndex].m_DeviceBuffers.begin(),
1162 | m_Batches[batchIndex].m_DeviceBuffers.end(), bindingBuffers);
1163 |
1164 | /* Record CUDA event to synchronize the completion of pre-processing kernels. */
1165 | cudaReturn = cudaEventRecord(m_PreProcessCompleteEvent, m_PreProcessStream);
1166 | if (cudaReturn != cudaSuccess)
1167 | {
1168 | printError("Failed to record cuda event (%s)",
1169 | cudaGetErrorName(cudaReturn));
1170 | status = NVDSINFER_CUDA_ERROR;
1171 | goto error;
1172 | }
1173 |
1174 | /* Make the future jobs on the stream wait till pre-processing kernels finish. */
1175 | cudaReturn = cudaStreamWaitEvent (m_InferStream, m_PreProcessCompleteEvent, 0);
1176 | if (cudaReturn != cudaSuccess)
1177 | {
1178 | printError("Failed to make stream wait on event(%s)",
1179 | cudaGetErrorName(cudaReturn));
1180 | status = NVDSINFER_CUDA_ERROR;
1181 | goto error;
1182 | }
1183 |
1184 | {
1185 | std::unique_lock<std::mutex> deferLock(DlaExecutionMutex, std::defer_lock);
1186 |
1187 | /* IExecutionContext::enqueue is not thread safe in case of DLA */
1188 | if (m_DlaEnabled)
1189 | deferLock.lock();
1190 |
1191 | /* Queue the bound buffers for inferencing. */
1192 | if (!m_InferExecutionContext->enqueue(enqueueBatchSize, bindingBuffers,
1193 | m_InferStream, &m_InputConsumedEvent))
1194 | {
1195 | printError("Failed to enqueue inference batch");
1196 | status = NVDSINFER_TENSORRT_ERROR;
1197 | goto error;
1198 | }
1199 | }
1200 |
1201 | /* Record event on m_InferStream to indicate completion of inference on the
1202 | * current batch. */
1203 | cudaReturn = cudaEventRecord (m_InferCompleteEvent, m_InferStream);
1204 | if (cudaReturn != cudaSuccess)
1205 | {
1206 | printError("Failed to record cuda event (%s)", cudaGetErrorName(cudaReturn));
1207 | status = NVDSINFER_CUDA_ERROR;
1208 | goto error;
1209 | }
1210 |
1211 | /* Make future copy jobs on the buffer copy stream wait on the infer
1212 | * completion event. */
1213 | cudaReturn = cudaStreamWaitEvent (m_BufferCopyStream, m_InferCompleteEvent, 0);
1214 | if (cudaReturn != cudaSuccess)
1215 | {
1216 | printError("CUDA Stream failed to wait on event (%s)",
1217 | cudaGetErrorName(cudaReturn));
1218 | status = NVDSINFER_CUDA_ERROR;
1219 | goto error;
1220 | }
1221 |
1222 | /* Queue the copy of output contents from device to host memory after the
1223 | * infer completion event. */
1224 | {
1225 | NvDsInferBatch &batch = m_Batches[batchIndex];
1226 | batch.m_BatchSize = batchSize;
1227 |
1228 | for (unsigned int i = 0; i < m_OutputLayerInfo.size(); i++)
1229 | {
1230 | NvDsInferLayerInfo & info = m_OutputLayerInfo[i];
1231 | cudaReturn =
1232 | cudaMemcpyAsync(batch.m_HostBuffers[info.bindingIndex].data(),
1233 | batch.m_DeviceBuffers[info.bindingIndex],
1234 | getElementSize(info.dataType) *
1235 | info.dims.numElements * batch.m_BatchSize,
1236 | cudaMemcpyDeviceToHost, m_BufferCopyStream);
1237 | if (cudaReturn != cudaSuccess)
1238 | {
1239 | printError("cudaMemcpyAsync for output buffers failed (%s)",
1240 | cudaGetErrorName(cudaReturn));
1241 | status = NVDSINFER_CUDA_ERROR;
1242 | goto error;
1243 | }
1244 | }
1245 | if (m_CopyInputToHostBuffers)
1246 | {
1247 | NvDsInferLayerInfo &info = m_AllLayerInfo[INPUT_LAYER_INDEX];
1248 | cudaReturn =
1249 | cudaMemcpyAsync(batch.m_HostBuffers[info.bindingIndex].data(),
1250 | m_BindingBuffers[info.bindingIndex],
1251 | getElementSize(info.dataType) *
1252 | info.dims.numElements * batch.m_BatchSize,
1253 | cudaMemcpyDeviceToHost, m_BufferCopyStream);
1254 | if (cudaReturn != cudaSuccess)
1255 | {
1256 | printError("cudaMemcpyAsync for input layer failed (%s)",
1257 | cudaGetErrorName(cudaReturn));
1258 | status = NVDSINFER_CUDA_ERROR;
1259 | goto error;
1260 | }
1261 | }
1262 | /* Record CUDA event to later synchronize for the copy to actually
1263 | * complete. */
1264 | cudaReturn = cudaEventRecord(batch.m_CopyCompleteEvent,
1265 | m_BufferCopyStream);
1266 | if (cudaReturn != cudaSuccess)
1267 | {
1268 | printError("Failed to record cuda event (%s)",
1269 | cudaGetErrorName(cudaReturn));
1270 | status = NVDSINFER_CUDA_ERROR;
1271 | goto error;
1272 | }
1273 | }
1274 |
1275 | /* Push the batch index into the processing queue. */
1276 | {
1277 |         unique_lock<std::mutex> lock(m_QueueMutex);
1278 | m_ProcessIndexQueue.push(batchIndex);
1279 | m_QueueCondition.notify_one();
1280 | }
1281 | return NVDSINFER_SUCCESS;
1282 |
1283 | error:
1284 | {
1285 |         unique_lock<std::mutex> lock(m_QueueMutex);
1286 | m_FreeIndexQueue.push(batchIndex);
1287 | }
1288 | return status;
1289 | }
1290 |
1291 | /* Dequeue batch output of the inference engine for each batch input. */
1292 | NvDsInferStatus
1293 | NvDsInferContextImpl::dequeueOutputBatch(NvDsInferContextBatchOutput &batchOutput)
1294 | {
1295 | unsigned int batchIndex;
1296 |
1297 | /* Set the cuda device */
1298 | cudaError_t cudaReturn = cudaSetDevice(m_GpuID);
1299 | if (cudaReturn != cudaSuccess)
1300 | {
1301 | printError("Failed to set cuda device (%s)", cudaGetErrorName(cudaReturn));
1302 | return NVDSINFER_CUDA_ERROR;
1303 | }
1304 |
1305 | /* Pop a batch index from the process queue. Wait if
1306 | * the queue is empty. */
1307 | {
1308 |         unique_lock<std::mutex> lock(m_QueueMutex);
1309 | while (m_ProcessIndexQueue.empty())
1310 | {
1311 | m_QueueCondition.wait(lock);
1312 | }
1313 | batchIndex = m_ProcessIndexQueue.front();
1314 | m_ProcessIndexQueue.pop();
1315 | }
1316 | NvDsInferBatch & batch = m_Batches[batchIndex];
1317 |
1318 | /* Wait for the copy to the current set of host buffers to complete. */
1319 | cudaReturn = cudaEventSynchronize (batch.m_CopyCompleteEvent);
1320 | if (cudaReturn != cudaSuccess)
1321 | {
1322 | printError("Failed to synchronize on cuda event (%s)",
1323 | cudaGetErrorName(cudaReturn));
1324 | {
1325 |             unique_lock<std::mutex> lock(m_QueueMutex);
1326 | m_FreeIndexQueue.push(batchIndex);
1327 | m_QueueCondition.notify_one();
1328 | }
1329 | return NVDSINFER_CUDA_ERROR;
1330 | }
1331 |
1332 | batchOutput.frames = new NvDsInferFrameOutput[batch.m_BatchSize];
1333 | batchOutput.numFrames = batch.m_BatchSize;
1334 | /* For each frame in the current batch, parse the output and add the frame
1335 | * output to the batch output. The number of frames output in one batch
1336 | * will be equal to the number of frames present in the batch during queuing
1337 | * at the input.
1338 | */
1339 | for (unsigned int index = 0; index < batch.m_BatchSize; index++)
1340 | {
1341 | NvDsInferFrameOutput &frameOutput = batchOutput.frames[index];
1342 | frameOutput.outputType = NvDsInferNetworkType_Other;
1343 |
1344 | /* Calculate the pointer to the output for each frame in the batch for
1345 | * each output layer buffer. The NvDsInferLayerInfo vector for output
1346 | * layers is passed to the output parsing function. */
1347 | for (unsigned int i = 0; i < m_OutputLayerInfo.size(); i++)
1348 | {
1349 | NvDsInferLayerInfo & info = m_OutputLayerInfo[i];
1350 | info.buffer =
1351 | (void *)(batch.m_HostBuffers[info.bindingIndex].data() +
1352 | info.dims.numElements *
1353 | getElementSize(info.dataType) * index);
1354 | }
1355 |
1356 | switch (m_NetworkType)
1357 | {
1358 | case NvDsInferNetworkType_Detector:
1359 | fillDetectionOutput(frameOutput.detectionOutput);
1360 | frameOutput.outputType = NvDsInferNetworkType_Detector;
1361 | break;
1362 | case NvDsInferNetworkType_Classifier:
1363 | fillClassificationOutput(frameOutput.classificationOutput);
1364 | frameOutput.outputType = NvDsInferNetworkType_Classifier;
1365 | break;
1366 | case NvDsInferNetworkType_Segmentation:
1367 | fillSegmentationOutput(frameOutput.segmentationOutput);
1368 | frameOutput.outputType = NvDsInferNetworkType_Segmentation;
1369 | break;
1370 | default:
1371 | break;
1372 | }
1373 | }
1374 |
1375 | /* Fill the host buffers information in the output. */
1376 | batchOutput.outputBatchID = batchIndex;
1377 | batchOutput.numHostBuffers = m_AllLayerInfo.size();
1378 | batchOutput.hostBuffers = new void*[m_AllLayerInfo.size()];
1379 | for (size_t i = 0; i < batchOutput.numHostBuffers; i++)
1380 | {
1381 | batchOutput.hostBuffers[i] = m_Batches[batchIndex].m_HostBuffers[i].data();
1382 | }
1383 |
1384 | batchOutput.numOutputDeviceBuffers = m_OutputLayerInfo.size();
1385 | batchOutput.outputDeviceBuffers = new void*[m_OutputLayerInfo.size()];
1386 | for (size_t i = 0; i < batchOutput.numOutputDeviceBuffers; i++)
1387 | {
1388 | batchOutput.outputDeviceBuffers[i] =
1389 | m_Batches[batchIndex].m_DeviceBuffers[m_OutputLayerInfo[i].bindingIndex];
1390 | }
1391 |
1392 | /* Mark the set of host buffers as not with the context. */
1393 | m_Batches[batchIndex].m_BuffersWithContext = false;
1394 | return NVDSINFER_SUCCESS;
1395 | }
1396 |
1397 | /**
1398 | * Release a set of host buffers back to the context.
1399 | */
1400 | void
1401 | NvDsInferContextImpl::releaseBatchOutput(NvDsInferContextBatchOutput &batchOutput)
1402 | {
1403 | unique_lock < std::mutex > lock (m_QueueMutex);
1404 | unsigned int outputBatchID = batchOutput.outputBatchID;
1405 |
1406 | /* Check for a valid id */
1407 | if (outputBatchID >= m_Batches.size())
1408 | {
1409 | printWarning("Tried to release an unknown outputBatchID");
1410 | return;
1411 | }
1412 | /* And if the batch is not already with the context. */
1413 | if (m_Batches[outputBatchID].m_BuffersWithContext)
1414 | {
1415 | printWarning("Tried to release an outputBatchID which is"
1416 | " already with the context");
1417 | return;
1418 | }
1419 | m_Batches[outputBatchID].m_BuffersWithContext = true;
1420 | m_FreeIndexQueue.push (outputBatchID);
1421 | m_QueueCondition.notify_one ();
1422 |
1423 | /* Free memory allocated in dequeueOutputBatch */
1424 | for (unsigned int i = 0; i < batchOutput.numFrames; i++)
1425 | {
1426 | releaseFrameOutput(batchOutput.frames[i]);
1427 | }
1428 |
1429 | delete[] batchOutput.frames;
1430 | delete[] batchOutput.hostBuffers;
1431 | delete[] batchOutput.outputDeviceBuffers;
1432 | }
1433 |
1434 | /**
1435 | * Fill all the bound layers information in the vector.
1436 | */
1437 | void
1438 | NvDsInferContextImpl::fillLayersInfo(vector<NvDsInferLayerInfo> &layersInfo)
1439 | {
1440 | layersInfo.assign (m_AllLayerInfo.begin(), m_AllLayerInfo.end());
1441 | }
1442 |
1443 | const vector<vector<string>> &
1444 | NvDsInferContextImpl::getLabels()
1445 | {
1446 | return m_Labels;
1447 | }
1448 |
1449 | /* Check if the runtime cuda engine is compatible with requested configuration. */
1450 | NvDsInferStatus
1451 | NvDsInferContextImpl::checkEngineParams(NvDsInferContextInitParams &initParams)
1452 | {
1453 | /* Check if the cuda engine can support requested max batch size. */
1454 | if ((int) m_MaxBatchSize > m_CudaEngine->getMaxBatchSize())
1455 | {
1456 |         printWarning("Requested Max Batch Size is greater than engine max batch size");
1457 | return NVDSINFER_CONFIG_FAILED;
1458 | }
1459 |
1460 | for (unsigned int i = 0; i < initParams.numOutputLayers; i++)
1461 | {
1462 | int bindingIndex = m_CudaEngine->getBindingIndex(initParams.outputLayerNames[i]);
1463 | if (bindingIndex == -1 || m_CudaEngine->bindingIsInput(bindingIndex))
1464 | {
1465 | printWarning("Could not find output layer '%s' in engine",
1466 | initParams.outputLayerNames[i]);
1467 | }
1468 | }
1469 |
1470 | return NVDSINFER_SUCCESS;
1471 | }
1472 |
1473 | /* Try to create the Cuda Engine from a serialized file. */
1474 | NvDsInferStatus
1475 | NvDsInferContextImpl::useEngineFile(NvDsInferContextInitParams &initParams)
1476 | {
1477 | NvDsInferStatus status;
1478 | size_t size = 0;
1479 | size_t i = 0;
1480 | ifstream gieModelFile(initParams.modelEngineFilePath);
1481 | if (!gieModelFile.good())
1482 | {
1483 | printWarning("Failed to read from model engine file");
1484 | return NVDSINFER_CONFIG_FAILED;
1485 | }
1486 |
1487 | /* Get the engine file size and read contents into a char buffer. */
1488 | gieModelFile.seekg(0, ios::end);
1489 | size = gieModelFile.tellg();
1490 | gieModelFile.seekg(0, ios::beg);
1491 |
1492 |     std::vector<char> buff(size);
1493 | while (gieModelFile.get(buff[i]))
1494 | i++;
1495 | gieModelFile.close();
1496 |
1497 | /* Use DLA if specified. */
1498 | if (initParams.useDLA)
1499 | {
1500 | m_InferRuntime->setDLACore(initParams.dlaCore);
1501 | }
1502 |
1503 | /* Create the cuda engine from the serialized engine file contents. */
1504 | m_CudaEngine = m_InferRuntime->deserializeCudaEngine((void *) buff.data(),
1505 | size, m_RuntimePluginFactory);
1506 | if (!m_CudaEngine)
1507 | {
1508 | printWarning("Failed to create engine from file");
1509 | return NVDSINFER_TENSORRT_ERROR;
1510 | }
1511 |
1512 | /* Check if the deserialized cuda engine is compatible with requested
1513 | * configuration. */
1514 | status = checkEngineParams(initParams);
1515 | if (status != NVDSINFER_SUCCESS)
1516 | {
1517 | /* Cannot use deserialized cuda engine. Destroy the engine. */
1518 | m_CudaEngine->destroy();
1519 | m_CudaEngine = nullptr;
1520 | }
1521 | return status;
1522 | }
1523 |
1524 | /* Custom unique_ptr subclass with deleter functions for TensorRT objects. */
1525 | template <typename T>
1526 | class NvDsInferUniquePtr : public std::unique_ptr<T, std::function<void (T *)>>
1527 | {
1528 | public:
1529 |     NvDsInferUniquePtr(T * t = nullptr) :
1530 |         std::unique_ptr<T, std::function<void (T *)>>(t, [](T *t){if (t) t->destroy();})
1531 | {}
1532 | };
1533 |
1534 | /* Create cudaengine for the model from the init params
1535 | * (caffemodel & prototxt/uff/onnx, int8 calibration tables, etc) and return the
1536 | * serialized cuda engine stream. */
1537 | NvDsInferStatus
1538 | NvDsInferContextImpl::generateTRTModel(
1539 | NvDsInferContextInitParams &initParams,
1540 | IHostMemory *&gieModelStream)
1541 | {
1542 | /* Custom implementation of unique_ptr ensures that corresponding destroy
1543 | * methods of TensorRT objects get called when the pointer variables go out
1544 | * of scope. */
1545 |     NvDsInferUniquePtr<nvinfer1::IBuilder> builder = nvinfer1::createInferBuilder(m_Logger);
1546 |     NvDsInferUniquePtr<nvinfer1::INetworkDefinition> network = builder->createNetwork ();
1547 |     NvDsInferUniquePtr<nvinfer1::ICudaEngine> cudaEngine;
1548 |
1549 |     NvDsInferUniquePtr<nvcaffeparser1::ICaffeParser> caffeParser;
1550 |     NvDsInferUniquePtr<nvuffparser::IUffParser> uffParser;
1551 |     NvDsInferUniquePtr<nvonnxparser::IParser> onnxParser;
1552 |
1553 | NvDsInferInt8Calibrator pCalibrator(initParams.int8CalibrationFilePath);
1554 | NvDsInferNetworkMode networkMode = initParams.networkMode;
1555 | DataType modelDataType;
1556 |
1557 | stringstream engineFileName;
1558 |
1559 | NvDsInferPluginFactoryCaffe caffePluginFactory{nullptr};
1560 | NvDsInferPluginFactoryUff uffPluginFactory{nullptr};
1561 |
1562 | NvDsInferCudaEngineGetFcn cudaEngineGetFcn = nullptr;
1563 |
1564 | switch (networkMode)
1565 | {
1566 | case NvDsInferNetworkMode_FP32:
1567 | case NvDsInferNetworkMode_FP16:
1568 | case NvDsInferNetworkMode_INT8:
1569 | break;
1570 | default:
1571 | printError("Unknown network mode %d", networkMode);
1572 | return NVDSINFER_CONFIG_FAILED;
1573 | }
1574 |
1575 | if (!string_empty(initParams.tltEncodedModelFilePath))
1576 | {
1577 | /* Use the CUDA engine creation function for TLT encoded models provided
1578 | * by NvDsInferUtils. */
1579 | cudaEngineGetFcn = NvDsInferCudaEngineGetFromTltModel;
1580 | }
1581 | else if (m_CustomLibHandle)
1582 | {
1583 | /* Get the address of the custom cuda engine creation function if available
1584 | * in the custom lib. */
1585 | cudaEngineGetFcn = (NvDsInferCudaEngineGetFcn) dlsym(m_CustomLibHandle,
1586 | "NvDsInferCudaEngineGet");
1587 | }
1588 |
1589 | if (networkMode == NvDsInferNetworkMode_INT8)
1590 | {
1591 | /* Check if platform supports INT8 else use FP16 */
1592 | if (builder->platformHasFastInt8())
1593 | {
1594 | if (!string_empty(initParams.int8CalibrationFilePath) &&
1595 | file_accessible(initParams.int8CalibrationFilePath))
1596 | {
1597 | /* Set INT8 mode and set the INT8 Calibrator */
1598 | builder->setInt8Mode(true);
1599 | builder->setInt8Calibrator(&pCalibrator);
1600 | /* modelDataType should be FLOAT for INT8 */
1601 | modelDataType = DataType::kFLOAT;
1602 | }
1603 | else if (cudaEngineGetFcn != nullptr)
1604 | {
1605 | printWarning("INT8 calibration file not specified/accessible. "
1606 | "INT8 calibration can be done through setDynamicRange "
1607 | "API in 'NvDsInferCreateNetwork' implementation");
1608 | }
1609 | else
1610 | {
1611 | printWarning("INT8 calibration file not specified. Trying FP16 mode.");
1612 | networkMode = NvDsInferNetworkMode_FP16;
1613 | }
1614 | }
1615 | else
1616 | {
1617 | printWarning("INT8 not supported by platform. Trying FP16 mode.");
1618 | networkMode = NvDsInferNetworkMode_FP16;
1619 | }
1620 | }
1621 |
1622 | if (networkMode == NvDsInferNetworkMode_FP16)
1623 | {
1624 | /* Check if platform supports FP16 else use FP32 */
1625 | if (builder->platformHasFastFp16())
1626 | {
1627 | builder->setHalf2Mode(true);
1628 | modelDataType = DataType::kHALF;
1629 | }
1630 | else
1631 | {
1632 | printWarning("FP16 not supported by platform. Using FP32 mode.");
1633 | networkMode = NvDsInferNetworkMode_FP32;
1634 | }
1635 | }
1636 |
1637 | if (networkMode == NvDsInferNetworkMode_FP32)
1638 | {
1639 | modelDataType = DataType::kFLOAT;
1640 | }
1641 |
1642 | /* Set the maximum batch size */
1643 | builder->setMaxBatchSize(m_MaxBatchSize);
1644 | builder->setMaxWorkspaceSize(WORKSPACE_SIZE);
1645 |
1646 | /* Use DLA if specified. */
1647 | if (initParams.useDLA)
1648 | {
1649 | builder->setDefaultDeviceType(DeviceType::kDLA);
1650 | builder->setDLACore(initParams.dlaCore);
1651 | builder->allowGPUFallback(true);
1652 | }
1653 |
1654 | /* If the custom network creation function has been specified use that. */
1655 | if (cudaEngineGetFcn)
1656 | {
1657 | nvinfer1::ICudaEngine *engine = nullptr;
1658 | if (!cudaEngineGetFcn (builder.get(), &initParams, modelDataType, engine) ||
1659 | engine == nullptr)
1660 | {
1661 | printError("Failed to create network using custom network creation"
1662 | " function");
1663 | return NVDSINFER_CUSTOM_LIB_FAILED;
1664 | }
1665 | cudaEngine = engine;
1666 | if (!string_empty(initParams.tltEncodedModelFilePath))
1667 | {
1668 | engineFileName << initParams.tltEncodedModelFilePath;
1669 | }
1670 | else
1671 | {
1672 | char *cwd = getcwd(NULL, 0);
1673 | engineFileName << cwd << "/model";
1674 | free(cwd);
1675 | }
1676 | }
1677 | /* Check for caffe model files first. */
1678 | else if (!string_empty(initParams.modelFilePath) &&
1679 | !string_empty(initParams.protoFilePath))
1680 | {
1681 | if (!file_accessible(initParams.modelFilePath))
1682 | {
1683 | printError("Cannot access caffemodel file '%s'",
1684 | initParams.modelFilePath);
1685 | return NVDSINFER_CONFIG_FAILED;
1686 | }
1687 | if (!file_accessible(initParams.protoFilePath))
1688 | {
1689 | printError("Cannot access prototxt file '%s'",
1690 | initParams.protoFilePath);
1691 | return NVDSINFER_CONFIG_FAILED;
1692 | }
1693 |
1694 | caffeParser = nvcaffeparser1::createCaffeParser();
1695 | /* Check if the custom library provides a PluginFactory for Caffe parsing. */
1696 | if (m_CustomLibHandle)
1697 | {
1698 | NvDsInferPluginFactoryCaffeGetFcn fcn =
1699 | (NvDsInferPluginFactoryCaffeGetFcn) dlsym(m_CustomLibHandle,
1700 | "NvDsInferPluginFactoryCaffeGet");
1701 | if (fcn)
1702 | {
1703 | NvDsInferPluginFactoryType type;
1704 | if (!fcn(caffePluginFactory, type))
1705 | {
1706 | printError("Could not get PluginFactory instance for "
1707 | "Caffe parsing from custom library");
1708 | return NVDSINFER_CUSTOM_LIB_FAILED;
1709 | }
1710 | /* Use the appropriate API to set the PluginFactory based on its
1711 | * type. */
1712 | switch (type)
1713 | {
1714 | case PLUGIN_FACTORY:
1715 | caffeParser->setPluginFactory(
1716 | caffePluginFactory.pluginFactory);
1717 | break;
1718 | case PLUGIN_FACTORY_EXT:
1719 | caffeParser->setPluginFactoryExt(
1720 | caffePluginFactory.pluginFactoryExt);
1721 | break;
1722 | case PLUGIN_FACTORY_V2:
1723 | caffeParser->setPluginFactoryV2(
1724 | caffePluginFactory.pluginFactoryV2);
1725 | break;
1726 | default:
1727 | printError("Invalid PluginFactory type returned by "
1728 | "custom library");
1729 | return NVDSINFER_CUSTOM_LIB_FAILED;
1730 | }
1731 | }
1732 | }
1733 |
1734 | /* Parse the caffe model. */
1735 | const nvcaffeparser1::IBlobNameToTensor *blobNameToTensor =
1736 | caffeParser->parse(initParams.protoFilePath,
1737 | initParams.modelFilePath, *network,
1738 | modelDataType);
1739 |
1740 | if (!blobNameToTensor)
1741 | {
1742 | printError("Failed while parsing network");
1743 | return NVDSINFER_TENSORRT_ERROR;
1744 | }
1745 |
1746 | for (unsigned int i = 0; i < initParams.numOutputLayers; i++)
1747 | {
1748 | char *layerName = initParams.outputLayerNames[i];
1749 | /* Find and mark the coverage layer as output */
1750 | ITensor *tensor = blobNameToTensor->find(layerName);
1751 | if (!tensor)
1752 | {
1753 | printError("Could not find output layer '%s'", layerName);
1754 | return NVDSINFER_CONFIG_FAILED;
1755 | }
1756 | network->markOutput(*tensor);
1757 | }
1758 | engineFileName << initParams.modelFilePath;
1759 | }
1760 | /* Check for UFF model next. */
1761 | else if (!string_empty(initParams.uffFilePath))
1762 | {
1763 | if (!file_accessible(initParams.uffFilePath))
1764 | {
1765 | printError("Cannot access UFF file '%s'", initParams.uffFilePath);
1766 | return NVDSINFER_CONFIG_FAILED;
1767 | }
1768 |
1769 |         uffParser = nvuffparser::createUffParser();
1770 | DimsCHW uffInputDims;
1771 | nvuffparser::UffInputOrder uffInputOrder;
1772 |
1773 | /* UFF parsing needs the input layer name. */
1774 | if (string_empty(initParams.uffInputBlobName))
1775 | {
1776 | printError("UFF input blob name not provided");
1777 | return NVDSINFER_CONFIG_FAILED;
1778 |
1779 | }
1780 |
1781 | uffInputDims.c() = initParams.uffDimsCHW.c;
1782 | uffInputDims.h() = initParams.uffDimsCHW.h;
1783 | uffInputDims.w() = initParams.uffDimsCHW.w;
1784 |
1785 | switch (initParams.uffInputOrder)
1786 | {
1787 | case NvDsInferUffInputOrder_kNCHW:
1788 | uffInputOrder = nvuffparser::UffInputOrder::kNCHW;
1789 | break;
1790 | case NvDsInferUffInputOrder_kNHWC:
1791 | uffInputOrder = nvuffparser::UffInputOrder::kNHWC;
1792 | break;
1793 | case NvDsInferUffInputOrder_kNC:
1794 | uffInputOrder = nvuffparser::UffInputOrder::kNC;
1795 | break;
1796 | default:
1797 | printError("Unrecognized uff input order");
1798 | return NVDSINFER_CONFIG_FAILED;
1799 | }
1800 |
1801 | /* Register the input layer (name, dims and input order). */
1802 | if (!uffParser->registerInput(initParams.uffInputBlobName,
1803 | uffInputDims, uffInputOrder))
1804 | {
1805 | printError("Failed to register input blob: %s DimsCHW:(%d,%d,%d) "
1806 | "Order: %s", initParams.uffInputBlobName, initParams.uffDimsCHW.c,
1807 | initParams.uffDimsCHW.h, initParams.uffDimsCHW.w,
1808 | (initParams.uffInputOrder == NvDsInferUffInputOrder_kNHWC ?
1809 | "HWC" : "CHW"));
1810 | return NVDSINFER_CONFIG_FAILED;
1811 |
1812 | }
1813 | /* Register outputs. */
1814 | for (unsigned int i = 0; i < initParams.numOutputLayers; i++) {
1815 | uffParser->registerOutput(initParams.outputLayerNames[i]);
1816 | }
1817 |
1818 | /* Check if the custom library provides a PluginFactory for UFF parsing. */
1819 | if (m_CustomLibHandle)
1820 | {
1821 | NvDsInferPluginFactoryUffGetFcn fcn =
1822 | (NvDsInferPluginFactoryUffGetFcn) dlsym(m_CustomLibHandle,
1823 | "NvDsInferPluginFactoryUffGet");
1824 | if (fcn)
1825 | {
1826 | NvDsInferPluginFactoryType type;
1827 | if (!fcn(uffPluginFactory, type))
1828 | {
1829 | printError("Could not get PluginFactory instance for UFF"
1830 | " parsing from custom library");
1831 | return NVDSINFER_CUSTOM_LIB_FAILED;
1832 | }
1833 | /* Use the appropriate API to set the PluginFactory based on its
1834 | * type. */
1835 | switch (type)
1836 | {
1837 | case PLUGIN_FACTORY:
1838 | uffParser->setPluginFactory(
1839 | uffPluginFactory.pluginFactory);
1840 | break;
1841 | case PLUGIN_FACTORY_EXT:
1842 | uffParser->setPluginFactoryExt(
1843 | uffPluginFactory.pluginFactoryExt);
1844 | break;
1845 | default:
1846 | printError("Invalid PluginFactory type returned by "
1847 | "custom library");
1848 | return NVDSINFER_CUSTOM_LIB_FAILED;
1849 | }
1850 | }
1851 | }
1852 |
1853 | if (!uffParser->parse(initParams.uffFilePath,
1854 | *network, modelDataType))
1855 | {
1856 | printError("Failed to parse UFF file: incorrect file or incorrect"
1857 | " input/output blob names");
1858 | return NVDSINFER_TENSORRT_ERROR;
1859 | }
1860 | engineFileName << initParams.uffFilePath;
1861 | }
1862 | else if (!string_empty(initParams.onnxFilePath))
1863 | {
1864 | if (!file_accessible(initParams.onnxFilePath))
1865 | {
1866 | printError("Cannot access ONNX file '%s'", initParams.onnxFilePath);
1867 | return NVDSINFER_CONFIG_FAILED;
1868 | }
1869 | onnxParser = nvonnxparser::createParser(*network, m_Logger);
1870 |
1871 | if (!onnxParser->parseFromFile(initParams.onnxFilePath,
1872 | (int) ILogger::Severity::kWARNING))
1873 | {
1874 | printError("Failed to parse onnx file");
1875 | return NVDSINFER_TENSORRT_ERROR;
1876 | }
1877 | engineFileName << initParams.onnxFilePath;
1878 | }
1879 | else
1880 | {
1881 | printError("No model files specified");
1882 | return NVDSINFER_CONFIG_FAILED;
1883 | }
1884 |
1885 | if (!cudaEngineGetFcn)
1886 | {
1887 | /* Build the engine */
1888 | cudaEngine = builder->buildCudaEngine(*network);
1889 | }
1890 | if (cudaEngine == nullptr)
1891 | {
1892 | printError("Failed while building cuda engine for network");
1893 | return NVDSINFER_TENSORRT_ERROR;
1894 | }
1895 |
1896 | /* Serialize the network into a stream and return the stream pointer since
1897 | * the cuda engine is valid only for the lifetime of the builder. */
1898 | gieModelStream = cudaEngine->serialize();
1899 |
1900 |     /* Optionally write the stream to a file which can be used during the next run. */
1901 | engineFileName << "_b" << m_MaxBatchSize << "_";
1902 | if (initParams.useDLA)
1903 | engineFileName << "dla_";
1904 | engineFileName << ((networkMode == NvDsInferNetworkMode_FP32) ? "fp32" :
1905 | (networkMode == NvDsInferNetworkMode_FP16) ? "fp16" : "int8")
1906 | << ".engine";
1907 | printInfo("Storing the serialized cuda engine to file at %s",
1908 | engineFileName.str().c_str());
1909 | ofstream gieModelFileOut(engineFileName.str());
1910 | gieModelFileOut.write((char *) gieModelStream->data(),
1911 | gieModelStream->size());
1912 |
1913 | cudaEngine.reset ();
1914 |
1915 | /* Destroy the plugin factory instances. */
1916 | if (caffePluginFactory.pluginFactory)
1917 | {
1918 | NvDsInferPluginFactoryCaffeDestroyFcn fcn =
1919 | (NvDsInferPluginFactoryCaffeDestroyFcn) dlsym(m_CustomLibHandle,
1920 | "NvDsInferPluginFactoryCaffeDestroy");
1921 | if (fcn)
1922 | {
1923 | fcn(caffePluginFactory);
1924 | }
1925 | }
1926 | if (uffPluginFactory.pluginFactory)
1927 | {
1928 | NvDsInferPluginFactoryUffDestroyFcn fcn =
1929 | (NvDsInferPluginFactoryUffDestroyFcn) dlsym(m_CustomLibHandle,
1930 | "NvDsInferPluginFactoryUffDestroy");
1931 | if (fcn)
1932 | {
1933 | fcn(uffPluginFactory);
1934 | }
1935 | }
1936 |
1937 | return NVDSINFER_SUCCESS;
1938 | }
1939 |
1940 | /**
1941 | * Clean up and free all resources
1942 | */
1943 | NvDsInferContextImpl::~NvDsInferContextImpl()
1944 | {
1945 | /* Set the cuda device to be used. */
1946 | cudaError_t cudaReturn = cudaSetDevice(m_GpuID);
1947 | if (cudaReturn != cudaSuccess)
1948 | {
1949 | printError("Failed to set cuda device %d (%s).", m_GpuID,
1950 | cudaGetErrorName(cudaReturn));
1951 | return;
1952 | }
1953 |
1954 | unique_lock < std::mutex > lock (m_QueueMutex);
1955 |
1956 | /* Clean up other cuda resources. */
1957 | if (m_PreProcessStream)
1958 | {
1959 | cudaStreamSynchronize(m_PreProcessStream);
1960 | cudaStreamDestroy(m_PreProcessStream);
1961 | }
1962 | if (m_InferStream)
1963 | {
1964 | cudaStreamSynchronize(m_InferStream);
1965 | cudaStreamDestroy(m_InferStream);
1966 | }
1967 | if (m_BufferCopyStream)
1968 | {
1969 | cudaStreamSynchronize(m_BufferCopyStream);
1970 | cudaStreamDestroy(m_BufferCopyStream);
1971 | }
1972 | if (m_InputConsumedEvent)
1973 | cudaEventDestroy (m_InputConsumedEvent);
1974 | if (m_PreProcessCompleteEvent)
1975 | cudaEventDestroy (m_PreProcessCompleteEvent);
1976 | if (m_InferCompleteEvent)
1977 | cudaEventDestroy (m_InferCompleteEvent);
1978 |
1979 | bool warn = false;
1980 |
1981 | for (auto & batch:m_Batches)
1982 | {
1983 | if (!batch.m_BuffersWithContext && !warn)
1984 | {
1985 | warn = true;
1986 | printWarning ("Not all output batches released back to the context "
1987 | "before destroy. Memory associated with the outputs will "
1988 | "no longer be valid.");
1989 | }
1990 | if (batch.m_CopyCompleteEvent)
1991 | cudaEventDestroy(batch.m_CopyCompleteEvent);
1992 | for (size_t i = 0; i < batch.m_DeviceBuffers.size(); i++)
1993 | {
1994 | if (batch.m_DeviceBuffers[i] && !m_CudaEngine->bindingIsInput(i))
1995 | cudaFree(batch.m_DeviceBuffers[i]);
1996 | }
1997 | }
1998 |
1999 |
2000 | if (m_DBScanHandle)
2001 | NvDsInferDBScanDestroy(m_DBScanHandle);
2002 |
2003 | if (m_InferExecutionContext)
2004 | m_InferExecutionContext->destroy();
2005 |
2006 | if (m_CudaEngine)
2007 | m_CudaEngine->destroy();
2008 |
2009 | if (m_InferRuntime)
2010 | m_InferRuntime->destroy();
2011 |
2012 | if (m_CustomLibHandle)
2013 | {
2014 | /* Destroy the PluginFactory instance required during runtime cuda engine
2015 | * deserialization. */
2016 | if (m_RuntimePluginFactory)
2017 | {
2018 | NvDsInferPluginFactoryRuntimeDestroyFcn fcn =
2019 | (NvDsInferPluginFactoryRuntimeDestroyFcn) dlsym(
2020 | m_CustomLibHandle, "NvDsInferPluginFactoryRuntimeDestroy");
2021 | if (fcn)
2022 | {
2023 | fcn(m_RuntimePluginFactory);
2024 | }
2025 | }
2026 | dlclose(m_CustomLibHandle);
2027 | }
2028 |
2029 | if (m_MeanDataBuffer)
2030 | {
2031 | cudaFree(m_MeanDataBuffer);
2032 | }
2033 |
2034 | for (auto & buffer:m_BindingBuffers)
2035 | {
2036 | if (buffer)
2037 | cudaFree(buffer);
2038 | }
2039 | }
2040 |
2041 | /*
2042 | * Destroy the context to release all resources.
2043 | */
2044 | void
2045 | NvDsInferContextImpl::destroy()
2046 | {
2047 | delete this;
2048 | }
2049 |
2050 | /*
2051 | * Factory function to create an NvDsInferContext instance and initialize it with
2052 | * supplied parameters.
2053 | */
2054 | NvDsInferStatus
2055 | createNvDsInferContext(NvDsInferContextHandle *handle,
2056 | NvDsInferContextInitParams &initParams, void *userCtx,
2057 | NvDsInferContextLoggingFunc logFunc)
2058 | {
2059 | NvDsInferStatus status;
2060 | NvDsInferContextImpl *ctx = new NvDsInferContextImpl();
2061 |
2062 | status = ctx->initialize(initParams, userCtx, logFunc);
2063 | if (status == NVDSINFER_SUCCESS)
2064 | {
2065 | *handle = ctx;
2066 | }
2067 | else
2068 | {
2069 |         static_cast<INvDsInferContext *>(ctx)->destroy();
2070 | }
2071 | return status;
2072 | }
2073 |
2074 | /*
2075 | * Reset the members inside the initParams structure to default values.
2076 | */
2077 | void
2078 | NvDsInferContext_ResetInitParams (NvDsInferContextInitParams *initParams)
2079 | {
2080 | if (initParams == nullptr)
2081 | {
2082 | fprintf(stderr, "Warning. NULL initParams passed to "
2083 | "NvDsInferContext_ResetInitParams()\n");
2084 | return;
2085 | }
2086 |
2087 | memset(initParams, 0, sizeof (*initParams));
2088 |
2089 | initParams->networkMode = NvDsInferNetworkMode_FP32;
2090 | initParams->networkInputFormat = NvDsInferFormat_Unknown;
2091 | initParams->uffInputOrder = NvDsInferUffInputOrder_kNCHW;
2092 | initParams->maxBatchSize = 1;
2093 | initParams->networkScaleFactor = 1.0;
2094 | initParams->networkType = NvDsInferNetworkType_Detector;
2095 | initParams->outputBufferPoolSize = NVDSINFER_MIN_OUTPUT_BUFFERPOOL_SIZE;
2096 | }
2097 |
2098 | const char *
2099 | NvDsInferContext_GetStatusName (NvDsInferStatus status)
2100 | {
2101 | #define CHECK_AND_RETURN_STRING(status_iter) \
2102 | if (status == status_iter) return #status_iter
2103 |
2104 | CHECK_AND_RETURN_STRING(NVDSINFER_SUCCESS);
2105 | CHECK_AND_RETURN_STRING(NVDSINFER_CONFIG_FAILED);
2106 | CHECK_AND_RETURN_STRING(NVDSINFER_CUSTOM_LIB_FAILED);
2107 | CHECK_AND_RETURN_STRING(NVDSINFER_INVALID_PARAMS);
2108 | CHECK_AND_RETURN_STRING(NVDSINFER_OUTPUT_PARSING_FAILED);
2109 | CHECK_AND_RETURN_STRING(NVDSINFER_CUDA_ERROR);
2110 | CHECK_AND_RETURN_STRING(NVDSINFER_TENSORRT_ERROR);
2111 | CHECK_AND_RETURN_STRING(NVDSINFER_UNKNOWN_ERROR);
2112 |
2113 | return nullptr;
2114 | #undef CHECK_AND_RETURN_STRING
2115 |
2116 | }
2117 |
--------------------------------------------------------------------------------
/libs/nvdsinfer/nvdsinfer_context_impl.h:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * NVIDIA Corporation and its licensors retain all intellectual property
5 | * and proprietary rights in and to this software, related documentation
6 | * and any modifications thereto. Any use, reproduction, disclosure or
7 | * distribution of this software and related documentation without an express
8 | * license agreement from NVIDIA Corporation is strictly prohibited.
9 | *
10 | */
11 |
12 | #ifndef __NVDSINFER_CONTEXT_IMPL_H__
13 | #define __NVDSINFER_CONTEXT_IMPL_H__
14 |
15 | #include <stdarg.h>
16 | #include <condition_variable>
17 | #include <mutex>
18 | #include <queue>
19 | #include <string>
20 |
21 | #include "cuda_runtime_api.h"
22 | #include <NvInfer.h>
23 | #include <NvInferPlugin.h>
24 |
25 | #include <opencv2/objdetect/objdetect.hpp>
26 |
27 | #include <nvdsinfer_context.h>
28 | #include <nvdsinfer_custom_impl.h>
29 | #include <nvdsinfer_utils.h>
30 |
31 |
32 | /**
33 | * Implementation of the INvDsInferContext interface.
34 | */
35 | class NvDsInferContextImpl : public INvDsInferContext
36 | {
37 | public:
38 | /**
39 | * Default constructor.
40 | */
41 | NvDsInferContextImpl();
42 |
43 | /**
44 | * Initializes the Infer engine, allocates layer buffers and other required
45 | * initialization steps.
46 | */
47 | NvDsInferStatus initialize(NvDsInferContextInitParams &initParams,
48 | void *userCtx, NvDsInferContextLoggingFunc logFunc);
49 |
50 | private:
51 | /**
52 |      * Free up resources and deinitialize the inference engine.
53 | */
54 | ~NvDsInferContextImpl();
55 |
56 | /* Implementation of the public methods of INvDsInferContext interface. */
57 | NvDsInferStatus queueInputBatch(NvDsInferContextBatchInput &batchInput) override;
58 | NvDsInferStatus dequeueOutputBatch(NvDsInferContextBatchOutput &batchOutput) override;
59 | void releaseBatchOutput(NvDsInferContextBatchOutput &batchOutput) override;
60 |     void fillLayersInfo(std::vector<NvDsInferLayerInfo> &layersInfo) override;
61 | void getNetworkInfo(NvDsInferNetworkInfo &networkInfo) override;
62 |     const std::vector<std::vector<std::string>>& getLabels() override;
63 | void destroy() override;
64 |
65 | /* Other private methods. */
66 | NvDsInferStatus checkEngineParams(NvDsInferContextInitParams &initParams);
67 | NvDsInferStatus useEngineFile(NvDsInferContextInitParams &initParams);
68 | NvDsInferStatus generateTRTModel(NvDsInferContextInitParams &initParams,
69 | nvinfer1::IHostMemory *&gieModelStream);
70 | NvDsInferStatus readMeanImageFile(char *meanImageFilePath);
71 | NvDsInferStatus getBoundLayersInfo();
72 | NvDsInferStatus allocateBuffers();
73 | NvDsInferStatus parseLabelsFile(char *labelsFilePath);
74 | bool parseBoundingBox(
75 |         std::vector<NvDsInferLayerInfo> const& outputLayersInfo,
76 | NvDsInferNetworkInfo const &networkInfo,
77 | NvDsInferParseDetectionParams const &detectionParams,
78 |         std::vector<NvDsInferObjectDetectionInfo> &objectList);
79 | bool parseAttributesFromSoftmaxLayers(
80 |         std::vector<NvDsInferLayerInfo> const &outputLayersInfo,
81 | NvDsInferNetworkInfo const &networkInfo,
82 | float classifierThreshold,
83 |         std::vector<NvDsInferAttribute> &attrList,
84 | std::string &attrString);
85 | void clusterAndFillDetectionOutputCV(NvDsInferDetectionOutput &output);
86 | void clusterAndFillDetectionOutputDBSCAN(NvDsInferDetectionOutput &output);
87 | NvDsInferStatus fillDetectionOutput(NvDsInferDetectionOutput &output);
88 | NvDsInferStatus fillClassificationOutput(NvDsInferClassificationOutput &output);
89 | NvDsInferStatus fillSegmentationOutput(NvDsInferSegmentationOutput &output);
90 | void releaseFrameOutput(NvDsInferFrameOutput &frameOutput);
91 | NvDsInferStatus initNonImageInputLayers();
92 |
93 | /* Input layer has a binding index of 0 */
94 | static const int INPUT_LAYER_INDEX = 0;
95 |
96 |     /* Mutex to keep DLA IExecutionContext::enqueue thread safe */
97 | static std::mutex DlaExecutionMutex;
98 |
99 | /** Unique identifier for the instance. This can be used to identify the
100 | * instance generating log and error messages. */
101 | unsigned int m_UniqueID;
102 |
103 | unsigned int m_MaxBatchSize;
104 |
105 | double m_NetworkScaleFactor;
106 |
107 | /** Input format for the network. */
108 | NvDsInferFormat m_NetworkInputFormat;
109 |
110 | NvDsInferNetworkType m_NetworkType;
111 |
112 | /* Network input information. */
113 | NvDsInferNetworkInfo m_NetworkInfo;
114 |
115 | bool m_UseDBScan;
116 |
117 | NvDsInferDBScanHandle m_DBScanHandle;
118 |
119 | /* Number of classes detected by the model. */
120 | unsigned int m_NumDetectedClasses;
121 |
122 | /* Detection / grouping parameters. */
123 |     std::vector<NvDsInferDetectionParams> m_PerClassDetectionParams;
124 | NvDsInferParseDetectionParams m_DetectionParams;
125 |
126 | /* Vector for all parsed objects. */
127 |     std::vector<NvDsInferObjectDetectionInfo> m_ObjectList;
128 | /* Vector of cv::Rect vectors for each class. */
129 |     std::vector<std::vector<cv::Rect>> m_PerClassCvRectList;
130 | /* Vector of NvDsInferObjectDetectionInfo vectors for each class. */
131 |     std::vector<std::vector<NvDsInferObjectDetectionInfo>> m_PerClassObjectList;
132 |
133 | float m_ClassifierThreshold;
134 | float m_SegmentationThreshold;
135 |
136 | /* Custom library implementation. */
137 | void *m_CustomLibHandle;
138 | NvDsInferParseCustomFunc m_CustomBBoxParseFunc;
139 | NvDsInferClassiferParseCustomFunc m_CustomClassifierParseFunc;
140 | nvinfer1::IPluginFactory *m_RuntimePluginFactory;
141 |
142 | unsigned int m_GpuID;
143 | bool m_DlaEnabled;
144 |
145 | /* Holds the string labels for classes. */
146 |     std::vector<std::vector<std::string>> m_Labels;
147 |
148 | /* Logger for GIE info/warning/errors */
149 | class NvDsInferLogger : public nvinfer1::ILogger
150 | {
151 | void log(Severity severity, const char *msg) override ;
152 | public:
153 | NvDsInferContextImpl *handle;
154 | };
155 | NvDsInferLogger m_Logger;
156 |
157 | /* Custom unique_ptrs. These TensorRT objects will get deleted automatically
158 | * when the NvDsInferContext object is deleted. */
159 | nvinfer1::IRuntime *m_InferRuntime;
160 | nvinfer1::ICudaEngine *m_CudaEngine;
161 | nvinfer1::IExecutionContext *m_InferExecutionContext;
162 |
163 | cudaStream_t m_PreProcessStream;
164 | cudaStream_t m_InferStream;
165 | cudaStream_t m_BufferCopyStream;
166 |
167 | /* Vectors for holding information about bound layers. */
168 |     std::vector<NvDsInferLayerInfo> m_AllLayerInfo;
169 |     std::vector<NvDsInferLayerInfo> m_OutputLayerInfo;
170 |
171 | float *m_MeanDataBuffer;
172 |
173 |     std::vector<void *> m_BindingBuffers;
174 |
175 | unsigned int m_OutputBufferPoolSize;
176 |
177 | /**
178 | * Holds information for one batch for processing.
179 | */
180 | typedef struct
181 | {
182 |         std::vector<std::vector<uint8_t>> m_HostBuffers;
183 |         std::vector<void *> m_DeviceBuffers;
184 |
185 | unsigned int m_BatchSize;
186 | cudaEvent_t m_CopyCompleteEvent = nullptr;
187 | bool m_BuffersWithContext = true;
188 |
189 | //NvDsInferContextReturnInputAsyncFunc m_ReturnFunc = nullptr;
190 | //void *m_ReturnFuncData = nullptr;
191 | } NvDsInferBatch;
192 |
193 |     std::vector<NvDsInferBatch> m_Batches;
194 |
195 | /* Queues and synchronization members for processing multiple batches
196 | * in parallel.
197 | */
198 | std::mutex m_QueueMutex;
199 | std::condition_variable m_QueueCondition;
200 |     std::queue<unsigned int> m_ProcessIndexQueue;
201 |     std::queue<unsigned int> m_FreeIndexQueue;
202 |
203 | bool m_CopyInputToHostBuffers;
204 |
205 | /* Cuda Event for synchronizing input consumption by TensorRT CUDA engine. */
206 | cudaEvent_t m_InputConsumedEvent;
207 | /* Cuda Event for synchronizing completion of pre-processing. */
208 | cudaEvent_t m_PreProcessCompleteEvent;
209 | /* Cuda Event for synchronizing infer completion by TensorRT CUDA engine. */
210 | cudaEvent_t m_InferCompleteEvent;
211 |
212 | NvDsInferContextLoggingFunc m_LoggingFunc;
213 |
214 | void *m_UserCtx;
215 |
216 | bool m_Initialized;
217 | };
218 |
219 | /* Calls clients logging callback function. */
220 | static inline void
221 | callLogFunc(NvDsInferContextImpl *ctx, unsigned int uniqueID, NvDsInferLogLevel level,
222 | const char *func, NvDsInferContextLoggingFunc logFunc, void *logCtx,
223 | const char *fmt, ...)
224 | {
225 | va_list args;
226 | va_start (args, fmt);
227 | char logMsgBuffer[_MAX_STR_LENGTH + 1];
228 | vsnprintf(logMsgBuffer, _MAX_STR_LENGTH, fmt, args);
229 | logFunc(ctx, uniqueID, level, func, logMsgBuffer, logCtx);
230 | va_end (args);
231 | }
232 |
233 | #define printMsg(level, tag_str, fmt, ...) \
234 | do { \
235 | char * baseName = strrchr((char *) __FILE__, '/'); \
236 | baseName = (baseName) ? (baseName + 1) : (char *) __FILE__; \
237 | if (m_LoggingFunc) \
238 | { \
239 | callLogFunc(this, m_UniqueID, level, __func__, m_LoggingFunc, \
240 | m_UserCtx, fmt, ## __VA_ARGS__); \
241 | } \
242 | else \
243 | { \
244 | fprintf(stderr, \
245 | tag_str " NvDsInferContextImpl::%s() <%s:%d> [UID = %d]: " fmt "\n", \
246 | __func__, baseName, __LINE__, m_UniqueID, ## __VA_ARGS__); \
247 | } \
248 | } while (0)
249 |
250 | #define printError(fmt, ...) \
251 | do { \
252 | printMsg (NVDSINFER_LOG_ERROR, "Error in", fmt, ##__VA_ARGS__); \
253 | } while (0)
254 |
255 | #define printWarning(fmt, ...) \
256 | do { \
257 | printMsg (NVDSINFER_LOG_WARNING, "Warning from", fmt, ##__VA_ARGS__); \
258 | } while (0)
259 |
260 | #define printInfo(fmt, ...) \
261 | do { \
262 | printMsg (NVDSINFER_LOG_INFO, "Info from", fmt, ##__VA_ARGS__); \
263 | } while (0)
264 |
265 | #define printDebug(fmt, ...) \
266 | do { \
267 | printMsg (NVDSINFER_LOG_DEBUG, "DEBUG", fmt, ##__VA_ARGS__); \
268 | } while (0)
269 |
270 | #endif
271 |
--------------------------------------------------------------------------------
/libs/nvdsinfer/nvdsinfer_context_impl_capi.cpp:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * NVIDIA Corporation and its licensors retain all intellectual property
5 | * and proprietary rights in and to this software, related documentation
6 | * and any modifications thereto. Any use, reproduction, disclosure or
7 | * distribution of this software and related documentation without an express
8 | * license agreement from NVIDIA Corporation is strictly prohibited.
9 | *
10 | */
11 |
12 | #include "nvdsinfer_context_impl.h"
13 | #include <string>
14 |
15 | /* This file implements the C interface for the NvDsInferContext class. The
16 | * interface is a simple wrapper over the C++ interface. */
17 |
18 | using namespace std;
19 |
20 | #define NULL_PARAM_CHECK(param, retvalue) \
21 | if (param == nullptr) \
22 | { \
23 | fprintf(stderr, "Warning: NULL parameter " #param " passed to %s\n", \
24 | __func__); \
25 | return retvalue; \
26 | }
27 |
28 |
29 | NvDsInferStatus
30 | NvDsInferContext_Create(NvDsInferContextHandle *handle,
31 | NvDsInferContextInitParams *initParams, void *userCtx,
32 | NvDsInferContextLoggingFunc logFunc)
33 | {
34 | NULL_PARAM_CHECK(handle, NVDSINFER_INVALID_PARAMS);
35 | NULL_PARAM_CHECK(initParams, NVDSINFER_INVALID_PARAMS);
36 |
37 | return createNvDsInferContext(handle, *initParams, userCtx, logFunc);
38 | }
39 |
40 | void
41 | NvDsInferContext_Destroy(NvDsInferContextHandle handle)
42 | {
43 | NULL_PARAM_CHECK(handle, );
44 |
45 | handle->destroy();
46 | }
47 |
48 | NvDsInferStatus
49 | NvDsInferContext_QueueInputBatch(NvDsInferContextHandle handle,
50 | NvDsInferContextBatchInput *batchInput)
51 | {
52 | NULL_PARAM_CHECK(handle, NVDSINFER_INVALID_PARAMS);
53 | NULL_PARAM_CHECK(batchInput, NVDSINFER_INVALID_PARAMS);
54 |
55 | return handle->queueInputBatch(*batchInput);
56 | }
57 |
58 | NvDsInferStatus
59 | NvDsInferContext_DequeueOutputBatch(NvDsInferContextHandle handle,
60 | NvDsInferContextBatchOutput *batchOutput)
61 | {
62 | NULL_PARAM_CHECK(handle, NVDSINFER_INVALID_PARAMS);
63 | NULL_PARAM_CHECK(batchOutput, NVDSINFER_INVALID_PARAMS);
64 |
65 | return handle->dequeueOutputBatch(*batchOutput);
66 | }
67 |
68 | void
69 | NvDsInferContext_ReleaseBatchOutput(NvDsInferContextHandle handle,
70 | NvDsInferContextBatchOutput *batchOutput)
71 | {
72 | NULL_PARAM_CHECK(handle, );
73 | NULL_PARAM_CHECK(batchOutput, );
74 |
75 | return handle->releaseBatchOutput(*batchOutput);
76 | }
77 |
78 | unsigned int
79 | NvDsInferContext_GetNumLayersInfo(NvDsInferContextHandle handle)
80 | {
81 | NULL_PARAM_CHECK(handle, 0);
82 |
83 |     std::vector<NvDsInferLayerInfo> layersInfo;
84 | handle->fillLayersInfo(layersInfo);
85 |
86 | return layersInfo.size();
87 | }
88 |
89 | void
90 | NvDsInferContext_FillLayersInfo(NvDsInferContextHandle handle,
91 | NvDsInferLayerInfo *layersInfo)
92 | {
93 | NULL_PARAM_CHECK(handle, );
94 |
95 |     std::vector<NvDsInferLayerInfo> layersInfoVec;
96 | handle->fillLayersInfo(layersInfoVec);
97 | for (unsigned int i = 0; i < layersInfoVec.size(); i++)
98 | layersInfo[i] = layersInfoVec[i];
99 | }
100 |
101 | void
102 | NvDsInferContext_GetNetworkInfo(NvDsInferContextHandle handle,
103 | NvDsInferNetworkInfo *networkInfo)
104 | {
105 | NULL_PARAM_CHECK(handle, );
106 | NULL_PARAM_CHECK(networkInfo, );
107 |
108 | return handle->getNetworkInfo(*networkInfo);
109 | }
110 |
111 | const char*
112 | NvDsInferContext_GetLabel(NvDsInferContextHandle handle, unsigned int id,
113 | unsigned int value)
114 | {
115 | NULL_PARAM_CHECK(handle, nullptr);
116 |
117 | auto labels = handle->getLabels();
118 | if (labels.size() > id && labels[id].size() > value)
119 | return labels[id][value].c_str();
120 |
121 | return nullptr;
122 | }
123 |
--------------------------------------------------------------------------------
/libs/nvdsinfer/nvdsinfer_context_impl_output_parsing.cpp:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * NVIDIA Corporation and its licensors retain all intellectual property
5 | * and proprietary rights in and to this software, related documentation
6 | * and any modifications thereto. Any use, reproduction, disclosure or
7 | * distribution of this software and related documentation without an express
8 | * license agreement from NVIDIA Corporation is strictly prohibited.
9 | *
10 | */
11 | #include <cstring>
12 | //#include
13 | #include <cstdlib>
14 |
15 | #include "nvdsinfer_context_impl.h"
16 | #include "nms_cpu.h"
17 | #include "resize_merge_cpu.h"
18 |
19 | static const bool ATHR_ENABLED = true;
20 | static const float ATHR_THRESHOLD = 60.0;
21 |
22 | using namespace std;
23 |
24 | #define DIVIDE_AND_ROUND_UP(a, b) ((a + b - 1) / b)
25 |
26 | /* Parse all object bounding boxes for the class `classIndex` in the frame
27 | * meeting the minimum threshold criteria.
28 | *
29 | * This parser function has been specifically written for the sample resnet10
30 | * model provided with the SDK. Other models will require this function to be
31 | * modified.
32 | */
33 | bool
34 | NvDsInferContextImpl::parseBoundingBox(
35 | vector < NvDsInferLayerInfo > const &outputLayersInfo,
36 | NvDsInferNetworkInfo const &networkInfo,
37 | NvDsInferParseDetectionParams const &detectionParams,
38 | vector < NvDsInferObjectDetectionInfo > &objectList) {
39 |
40 | int outputCoverageLayerIndex = -1;
41 | int outputBBoxLayerIndex = -1;
42 |
43 |
44 | for (unsigned int i = 0; i < outputLayersInfo.size(); i++) {
45 | if (strstr(outputLayersInfo[i].layerName, "bbox") != nullptr) {
46 | outputBBoxLayerIndex = i;
47 | }
48 | if (strstr(outputLayersInfo[i].layerName, "cov") != nullptr) {
49 | outputCoverageLayerIndex = i;
50 | }
51 | }
52 |
53 | if (outputCoverageLayerIndex == -1) {
54 | printError("Could not find output coverage layer for parsing objects");
55 | return false;
56 | }
57 | if (outputBBoxLayerIndex == -1) {
58 | printError("Could not find output bbox layer for parsing objects");
59 | return false;
60 | }
61 |
62 | float *outputCoverageBuffer =
63 | (float *)outputLayersInfo[outputCoverageLayerIndex].buffer;
64 | float *outputBboxBuffer =
65 | (float *)outputLayersInfo[outputBBoxLayerIndex].buffer;
66 |
67 | NvDsInferDimsCHW outputCoverageDims;
68 | NvDsInferDimsCHW outputBBoxDims;
69 |
70 | getDimsCHWFromDims(outputCoverageDims,
71 | outputLayersInfo[outputCoverageLayerIndex].dims);
72 | getDimsCHWFromDims(outputBBoxDims,
73 | outputLayersInfo[outputBBoxLayerIndex].dims);
74 |
75 | unsigned int targetShape[2] = { outputCoverageDims.w, outputCoverageDims.h };
76 | float bboxNorm[2] = { 35.0, 35.0 };
77 | float gcCenters0[targetShape[0]];
78 | float gcCenters1[targetShape[1]];
79 | int gridSize = outputCoverageDims.w * outputCoverageDims.h;
80 | int strideX = DIVIDE_AND_ROUND_UP(networkInfo.width, outputBBoxDims.w);
81 | int strideY = DIVIDE_AND_ROUND_UP(networkInfo.height, outputBBoxDims.h);
82 |
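/* The bbox layer outputs box offsets relative to grid-cell centres, normalized by
 * bboxNorm; precompute the normalized centre coordinate of every grid column and row. */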
83 | for (unsigned int i = 0; i < targetShape[0]; i++) {
84 | gcCenters0[i] = (float)(i * strideX + 0.5);
85 | gcCenters0[i] /= (float)bboxNorm[0];
86 | }
87 | for (unsigned int i = 0; i < targetShape[1]; i++) {
88 | gcCenters1[i] = (float)(i * strideY + 0.5);
89 | gcCenters1[i] /= (float)bboxNorm[1];
90 | }
91 |
92 | unsigned int numClasses =
93 | MIN(outputCoverageDims.c, detectionParams.numClassesConfigured);
94 | for (unsigned int classIndex = 0; classIndex < numClasses; classIndex++) {
95 |
96 | /* Pointers to memory regions containing the (x1,y1) and (x2,y2) coordinates
97 | * of rectangles in the output bounding box layer. */
98 | float *outputX1 = outputBboxBuffer
99 | + classIndex * sizeof (float) * outputBBoxDims.h * outputBBoxDims.w;
100 |
101 | float *outputY1 = outputX1 + gridSize;
102 | float *outputX2 = outputY1 + gridSize;
103 | float *outputY2 = outputX2 + gridSize;
104 |
105 | /* Iterate through each point in the grid and check if the rectangle at that
106 | * point meets the minimum threshold criteria. */
107 | for (unsigned int h = 0; h < outputCoverageDims.h; h++) {
108 | for (unsigned int w = 0; w < outputCoverageDims.w; w++) {
109 | int i = w + h * outputCoverageDims.w;
110 | float confidence = outputCoverageBuffer[classIndex * gridSize + i];
111 |
112 | if (confidence < detectionParams.perClassThreshold[classIndex])
113 | continue;
114 |
115 | int rectX1, rectY1, rectX2, rectY2;
116 | float rectX1Float, rectY1Float, rectX2Float, rectY2Float;
117 |
118 | /* Centering and normalization of the rectangle. */
119 | rectX1Float =
120 | outputX1[w + h * outputCoverageDims.w] - gcCenters0[w];
121 | rectY1Float =
122 | outputY1[w + h * outputCoverageDims.w] - gcCenters1[h];
123 | rectX2Float =
124 | outputX2[w + h * outputCoverageDims.w] + gcCenters0[w];
125 | rectY2Float =
126 | outputY2[w + h * outputCoverageDims.w] + gcCenters1[h];
127 |
128 | rectX1Float *= -bboxNorm[0];
129 | rectY1Float *= -bboxNorm[1];
130 | rectX2Float *= bboxNorm[0];
131 | rectY2Float *= bboxNorm[1];
132 |
133 | rectX1 = rectX1Float;
134 | rectY1 = rectY1Float;
135 | rectX2 = rectX2Float;
136 | rectY2 = rectY2Float;
137 |
138 | /* Clip parsed rectangles to frame bounds. */
139 | if (rectX1 >= (int)m_NetworkInfo.width)
140 | rectX1 = m_NetworkInfo.width - 1;
141 | if (rectX2 >= (int)m_NetworkInfo.width)
142 | rectX2 = m_NetworkInfo.width - 1;
143 | if (rectY1 >= (int)m_NetworkInfo.height)
144 | rectY1 = m_NetworkInfo.height - 1;
145 | if (rectY2 >= (int)m_NetworkInfo.height)
146 | rectY2 = m_NetworkInfo.height - 1;
147 |
148 | if (rectX1 < 0)
149 | rectX1 = 0;
150 | if (rectX2 < 0)
151 | rectX2 = 0;
152 | if (rectY1 < 0)
153 | rectY1 = 0;
154 | if (rectY2 < 0)
155 | rectY2 = 0;
156 |
157 | objectList.push_back({ classIndex, (unsigned int) rectX1,
158 | (unsigned int) rectY1, (unsigned int) (rectX2 - rectX1),
159 | (unsigned int) (rectY2 - rectY1), confidence});
160 | }
161 | }
162 | }
163 | return true;
164 | }
165 |
166 | /**
167 | * Cluster objects using OpenCV groupRectangles and fill the output structure.
168 | */
169 | void
170 | NvDsInferContextImpl::clusterAndFillDetectionOutputCV(NvDsInferDetectionOutput &output) {
171 | size_t totalObjects = 0;
172 |
173 | for (auto & list:m_PerClassCvRectList)
174 | list.clear();
175 |
176 |     /* The above functions will have added all objects to the m_ObjectList vector.
177 |      * They need to be separated per class for grouping. */
178 | for (auto & object:m_ObjectList) {
179 | m_PerClassCvRectList[object.classId].emplace_back(object.left,
180 | object.top, object.width, object.height);
181 | }
182 |
183 | for (unsigned int c = 0; c < m_NumDetectedClasses; c++) {
184 | /* Cluster together rectangles with similar locations and sizes
185 | * since these rectangles might represent the same object. Refer
186 | * to opencv documentation of groupRectangles for more
187 | * information about the tuning parameters for grouping. */
188 | if (m_PerClassDetectionParams[c].groupThreshold > 0)
189 | cv::groupRectangles(m_PerClassCvRectList[c],
190 | m_PerClassDetectionParams[c].groupThreshold,
191 | m_PerClassDetectionParams[c].eps);
192 | totalObjects += m_PerClassCvRectList[c].size();
193 | }
194 |
195 | output.objects = new NvDsInferObject[totalObjects];
196 | output.numObjects = 0;
197 |
198 | for (unsigned int c = 0; c < m_NumDetectedClasses; c++) {
199 | /* Add coordinates and class ID and the label of all objects
200 | * detected in the frame to the frame output. */
201 | for (auto & rect:m_PerClassCvRectList[c]) {
202 | NvDsInferObject &object = output.objects[output.numObjects];
203 | object.left = rect.x;
204 | object.top = rect.y;
205 | object.width = rect.width;
206 | object.height = rect.height;
207 | object.classIndex = c;
208 | object.label = nullptr;
209 | if (c < m_Labels.size() && m_Labels[c].size() > 0)
210 | object.label = strdup(m_Labels[c][0].c_str());
211 | output.numObjects++;
212 | }
213 | }
214 | }
215 |
216 | /**
217 | * Cluster objects using DBSCAN and fill the output structure.
218 | */
219 | void
220 | NvDsInferContextImpl::clusterAndFillDetectionOutputDBSCAN(NvDsInferDetectionOutput &output) {
221 | size_t totalObjects = 0;
222 | NvDsInferDBScanClusteringParams clusteringParams;
223 | clusteringParams.enableATHRFilter = ATHR_ENABLED;
224 | clusteringParams.thresholdATHR = ATHR_THRESHOLD;
225 |     vector<size_t> numObjectsList(m_NumDetectedClasses);
226 |
227 | for (auto & list:m_PerClassObjectList)
228 | list.clear();
229 |
230 |     /* The above functions will have added all objects to the m_ObjectList vector.
231 |      * They need to be separated per class for grouping. */
232 | for (auto & object:m_ObjectList) {
233 | m_PerClassObjectList[object.classId].emplace_back(object);
234 | }
235 |
236 | for (unsigned int c = 0; c < m_NumDetectedClasses; c++) {
237 | NvDsInferObjectDetectionInfo *objArray = m_PerClassObjectList[c].data();
238 | size_t numObjects = m_PerClassObjectList[c].size();
239 |
240 | clusteringParams.eps = m_PerClassDetectionParams[c].eps;
241 | clusteringParams.minBoxes = m_PerClassDetectionParams[c].minBoxes;
242 |
243 | /* Cluster together rectangles with similar locations and sizes
244 | * since these rectangles might represent the same object using
245 | * DBSCAN. */
246 | if (m_PerClassDetectionParams[c].minBoxes > 0)
247 | NvDsInferDBScanCluster(m_DBScanHandle, &clusteringParams,
248 | objArray, &numObjects);
249 | totalObjects += numObjects;
250 | numObjectsList[c] = numObjects;
251 | }
252 |
253 | output.objects = new NvDsInferObject[totalObjects];
254 | output.numObjects = 0;
255 |
256 | for (unsigned int c = 0; c < m_NumDetectedClasses; c++) {
257 | /* Add coordinates and class ID and the label of all objects
258 | * detected in the frame to the frame output. */
259 | for (size_t i = 0; i < numObjectsList[c]; i++) {
260 | NvDsInferObject &object = output.objects[output.numObjects];
261 | object.left = m_PerClassObjectList[c][i].left;
262 | object.top = m_PerClassObjectList[c][i].top;
263 | object.width = m_PerClassObjectList[c][i].width;
264 | object.height = m_PerClassObjectList[c][i].height;
265 | object.classIndex = c;
266 | object.label = nullptr;
267 | if (c < m_Labels.size() && m_Labels[c].size() > 0)
268 | object.label = strdup(m_Labels[c][0].c_str());
269 | output.numObjects++;
270 | }
271 | }
272 | }
273 |
274 | bool
275 | NvDsInferContextImpl::parseAttributesFromSoftmaxLayers(
276 |     std::vector<NvDsInferLayerInfo> const &outputLayersInfo,
277 | NvDsInferNetworkInfo const &networkInfo,
278 | float classifierThreshold,
279 |     std::vector<NvDsInferAttribute> &attrList,
280 | std::string &attrString) {
281 | /* Get the number of attributes supported by the classifier. */
282 | unsigned int numAttributes = m_OutputLayerInfo.size();
283 |
284 | /* Iterate through all the output coverage layers of the classifier.
285 | */
286 | for (unsigned int l = 0; l < numAttributes; l++) {
287 | /* outputCoverageBuffer for classifiers is usually a softmax layer.
288 | * The layer is an array of probabilities of the object belonging
289 | * to each class with each probability being in the range [0,1] and
290 |          * the sum of all probabilities being 1.
291 | */
292 | NvDsInferDimsCHW dims;
293 |
294 | getDimsCHWFromDims(dims, m_OutputLayerInfo[l].dims);
295 | unsigned int numClasses = dims.c;
296 | float *outputCoverageBuffer =
297 | (float *)m_OutputLayerInfo[l].buffer;
298 | float maxProbability = 0;
299 | bool attrFound = false;
300 | NvDsInferAttribute attr;
301 |
302 | /* Iterate through all the probabilities that the object belongs to
303 | * each class. Find the maximum probability and the corresponding class
304 | * which meets the minimum threshold. */
305 | for (unsigned int c = 0; c < numClasses; c++) {
306 | float probability = outputCoverageBuffer[c];
307 | if (probability > m_ClassifierThreshold
308 | && probability > maxProbability) {
309 | maxProbability = probability;
310 | attrFound = true;
311 | attr.attributeIndex = l;
312 | attr.attributeValue = c;
313 | attr.attributeConfidence = probability;
314 | }
315 | }
316 | if (attrFound) {
317 | if (m_Labels.size() > attr.attributeIndex &&
318 | attr.attributeValue < m_Labels[attr.attributeIndex].size())
319 | attr.attributeLabel =
320 | m_Labels[attr.attributeIndex][attr.attributeValue].c_str();
321 | else
322 | attr.attributeLabel = nullptr;
323 | attrList.push_back(attr);
324 | if (attr.attributeLabel)
325 | attrString.append(attr.attributeLabel).append(" ");
326 | }
327 | }
328 |
329 | return true;
330 | }
331 |
332 | NvDsInferStatus
333 | NvDsInferContextImpl::fillDetectionOutput(NvDsInferDetectionOutput &output) {
334 | /* Clear the object lists. */
335 | m_ObjectList.clear();
336 |
337 | /* Call custom parsing function if specified otherwise use the one
338 | * written along with this implementation. */
339 | if (m_CustomBBoxParseFunc) {
340 | if (!m_CustomBBoxParseFunc(m_OutputLayerInfo, m_NetworkInfo,
341 | m_DetectionParams, m_ObjectList)) {
342 | printError("Failed to parse bboxes using custom parse function");
343 | return NVDSINFER_CUSTOM_LIB_FAILED;
344 | }
345 | } else {
346 | if (!parseBoundingBox(m_OutputLayerInfo, m_NetworkInfo,
347 | m_DetectionParams, m_ObjectList)) {
348 | printError("Failed to parse bboxes");
349 | return NVDSINFER_OUTPUT_PARSING_FAILED;
350 | }
351 | }
352 |
353 | if (m_UseDBScan)
354 | clusterAndFillDetectionOutputDBSCAN(output);
355 | else
356 | clusterAndFillDetectionOutputCV(output);
357 |
358 | return NVDSINFER_SUCCESS;
359 | }
360 |
361 | NvDsInferStatus
362 | NvDsInferContextImpl::fillClassificationOutput(NvDsInferClassificationOutput &output) {
363 | string attrString;
364 |     vector<NvDsInferAttribute> attributes;
365 |
366 | /* Call custom parsing function if specified otherwise use the one
367 | * written along with this implementation. */
368 | if (m_CustomClassifierParseFunc) {
369 | if (!m_CustomClassifierParseFunc(m_OutputLayerInfo, m_NetworkInfo,
370 | m_ClassifierThreshold, attributes, attrString)) {
371 | printError("Failed to parse classification attributes using "
372 | "custom parse function");
373 | return NVDSINFER_CUSTOM_LIB_FAILED;
374 | }
375 | } else {
376 | if (!parseAttributesFromSoftmaxLayers(m_OutputLayerInfo, m_NetworkInfo,
377 | m_ClassifierThreshold, attributes, attrString)) {
378 | printError("Failed to parse classification attributes");
379 | return NVDSINFER_OUTPUT_PARSING_FAILED;
380 | }
381 | }
382 |
383 | /* Fill the output structure with the parsed attributes. */
384 | output.label = strdup(attrString.c_str());
385 | output.numAttributes = attributes.size();
386 | output.attributes = new NvDsInferAttribute[output.numAttributes];
387 | for (size_t i = 0; i < output.numAttributes; i++) {
388 | output.attributes[i].attributeIndex = attributes[i].attributeIndex;
389 | output.attributes[i].attributeValue = attributes[i].attributeValue;
390 | output.attributes[i].attributeConfidence = attributes[i].attributeConfidence;
391 | output.attributes[i].attributeLabel = attributes[i].attributeLabel;
392 | }
393 | return NVDSINFER_SUCCESS;
394 | }
395 |
396 | NvDsInferStatus
397 | NvDsInferContextImpl::fillSegmentationOutput(NvDsInferSegmentationOutput &output) {
398 | NvDsInferDimsCHW outputDimsCHW;
399 | getDimsCHWFromDims(outputDimsCHW, m_OutputLayerInfo[0].dims);
400 |
401 | //$6 = {numDims = 3, d = {57, 46, 62, 127, 2918418508, 127, 1443693648, 85}, numElements = 162564}
402 | // i.e. 57 channels x 46 (h) x 62 (w). For the COCO linevec model, 57 = 18 part heatmaps + 1 background + 38 PAF channels.
403 | const int SCALE = 8; // network output stride: 368 / 46 = 496 / 62 = 8
404 | output.width = outputDimsCHW.w * SCALE; // 62 * 8 = 496
405 | output.height = outputDimsCHW.h * SCALE; // 46 * 8 = 368
406 | output.classes = outputDimsCHW.c; // 57
407 |
408 | output.class_map = new int [output.width * output.height];
409 | output.class_probability_map = (float *) m_OutputLayerInfo[0].buffer;
410 |
411 | int out[46][62];
412 | for (int i = 0; i < 46; i++) {
413 | for (int j = 0; j < 62; j++) {
414 | out[i][j] = 1;
415 | }
416 | }
417 | for (int k = 0; k < 18; k++) {
418 | int below = 0;
419 | int x = 0, y = 0;
420 | float confidence = 0.0;
421 |
422 | for (int i = 0; i < 46; i++) {
423 | for (int j = 0; j < 62; j++) {
424 | if (output.class_probability_map[k*46*62 + i * 62 + j] > confidence) {
425 | confidence = output.class_probability_map[k*46*62 + i * 62 + j];
426 | x = j;
427 | y = i;
428 | }
429 |
430 | if (output.class_probability_map[k*46*62 + i * 62 + j] < 0) {
431 | below++;
432 | }
433 |
434 | }
435 | }
436 | out[y][x] = 0;
437 |
438 | //printf("k=%d,y/x=(%d,%d) below 0 = %d\n", k, y, x, below);
439 | }
440 |
441 | for (int i = 0; i < 46; i++) {
442 | for (int j = 0; j < 62; j++) {
443 | printf("%d", out[i][j]);
444 | for (int y = 0; y < SCALE; y++) {
445 | for (int x = 0; x < SCALE; x++) {
446 | output.class_map[(i * SCALE + y) * output.width + j * SCALE + x] = 6 - out[i][j];
447 | }
448 | }
449 | }
450 | printf("\n");
451 | }
452 |
453 | #if 1
454 | // Resize and merge: upsample the 57 x 46 x 62 output back to 57 x 368 x 496
455 | float* resize_target_ptr = (float*)malloc(sizeof(float) * 368*496*57);
456 | std::vector<const float*> resize_source_ptr = {output.class_probability_map};
457 | std::array<int, 4> resize_target_size = {1, 57, 368, 496};
458 | std::vector<std::array<int, 4>> resize_source_size = {{1, 57, 46, 62}};
459 | std::vector<float> scale_input_to_net_inputs = {1.f};
460 |
461 | resizeAndMergeCpu(resize_target_ptr, resize_source_ptr, resize_target_size, resize_source_size, scale_input_to_net_inputs);
462 |
463 | // nms
464 | float* nms_target_ptr = (float*)malloc(sizeof(float) * 18 * 128 * 3);
465 | int * kernel_ptr = (int*)malloc(sizeof(int) * 368 * 496 * 57);
466 | float* nms_source_ptr = resize_target_ptr;
467 | float threshold = 0.05f;
468 | int outputChannels = 18;
469 | int POSE_MAX_PEOPLE = 127+1;
470 | int x_y_score = 3;
471 |
472 | std::array<int, 4> nms_target_size = {1, outputChannels, POSE_MAX_PEOPLE, x_y_score};
473 | std::array<int, 4> nms_source_size = {1, 57, 368, 496};
474 |
475 | nmsCpu(nms_target_ptr, kernel_ptr, nms_source_ptr, threshold, nms_target_size, nms_source_size);
476 |
477 | for (int i=0; i < outputChannels*POSE_MAX_PEOPLE / 3; i++) {
478 | if (nms_target_ptr[i*3+2] > 0.1)
479 | printf("%f %f %f\n", nms_target_ptr[i*3], nms_target_ptr[i*3+1], nms_target_ptr[i*3+2]);
480 | }
481 | #endif
482 |
483 | output.classes = 1;
484 |
485 |
486 | #if 0
487 | for (unsigned int y = 0; y < output.height; y++) {
488 | for (unsigned int x = 0; x < output.width; x++) {
489 | float max_prob = -1;
490 | int &cls = output.class_map[y * output.width + x] = -1;
491 | for (unsigned int c = 0; c < output.classes; c++) {
492 | float prob = output.class_probability_map[c * output.width * output.height + y * output.width + x];
493 | if (prob > max_prob && prob > m_SegmentationThreshold) {
494 | cls = c;
495 | max_prob = prob;
496 | }
497 | }
498 | }
499 | }
500 | #endif
501 |
502 | return NVDSINFER_SUCCESS;
503 | }
504 |
505 | void
506 | NvDsInferContextImpl::releaseFrameOutput(NvDsInferFrameOutput &frameOutput) {
507 | switch (m_NetworkType) {
508 | case NvDsInferNetworkType_Detector:
509 | for (unsigned int j = 0; j < frameOutput.detectionOutput.numObjects; j++) {
510 | free(frameOutput.detectionOutput.objects[j].label);
511 | }
512 | delete[] frameOutput.detectionOutput.objects;
513 | break;
514 | case NvDsInferNetworkType_Classifier:
515 | free(frameOutput.classificationOutput.label);
516 | delete[] frameOutput.classificationOutput.attributes;
517 | break;
518 | case NvDsInferNetworkType_Segmentation:
519 | delete[] frameOutput.segmentationOutput.class_map;
520 | break;
521 | default:
522 | break;
523 | }
524 | }
525 |
--------------------------------------------------------------------------------
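Note on fillSegmentationOutput() above: it treats the first 18 of the 57 output channels as the COCO body-part heatmaps at 1/8 of the 368x496 network input and keeps only the single strongest response per part. A minimal standalone sketch of that peak search (the struct and helper names below are illustrative, not part of this repo):

```
#include <cstddef>

// Hypothetical helper: argmax over one body-part heatmap, mapped back to
// network-input pixel coordinates via the output stride (8 for this model).
struct Peak { int x; int y; float score; };

static Peak findPartPeak(const float* heatmaps, int part,
                         int mapH, int mapW, int stride)
{
    Peak p = {0, 0, 0.f};
    const float* map = heatmaps + (std::size_t)part * mapH * mapW;
    for (int i = 0; i < mapH; i++)
        for (int j = 0; j < mapW; j++)
            if (map[i * mapW + j] > p.score)
                p = {j * stride, i * stride, map[i * mapW + j]};
    return p;
}
// e.g. findPartPeak(output.class_probability_map, k, 46, 62, 8) for part k.
```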
/libs/nvdsinfer/nvdsinfer_conversion.cu:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * NVIDIA Corporation and its licensors retain all intellectual property
5 | * and proprietary rights in and to this software, related documentation
6 | * and any modifications thereto. Any use, reproduction, disclosure or
7 | * distribution of this software and related documentation without an express
8 | * license agreement from NVIDIA Corporation is strictly prohibited.
9 | *
10 | */
11 |
12 | #include <cuda_runtime_api.h>
13 | #include "nvdsinfer_conversion.h"
14 |
15 | #define THREADS_PER_BLOCK 32
16 | #define THREADS_PER_BLOCK_1 (THREADS_PER_BLOCK - 1)
17 |
18 | __global__ void
19 | NvDsInferConvert_CxToP3FloatKernel(
20 | float *outBuffer,
21 | unsigned char *inBuffer,
22 | unsigned int width,
23 | unsigned int height,
24 | unsigned int pitch,
25 | unsigned int inputPixelSize,
26 | float scaleFactor)
27 | {
28 | unsigned int row = blockIdx.y * blockDim.y + threadIdx.y;
29 | unsigned int col = blockIdx.x * blockDim.x + threadIdx.x;
30 |
31 | if (col < width && row < height)
32 | {
33 | for (unsigned int k = 0; k < 3; k++)
34 | {
35 | outBuffer[width * height * k + row * width + col] =
36 | scaleFactor * inBuffer[row * pitch + col * inputPixelSize + k];
37 | }
38 | }
39 | }
40 |
41 | __global__ void
42 | NvDsInferConvert_CxToP3FloatKernelWithMeanSubtraction(
43 | float *outBuffer,
44 | unsigned char *inBuffer,
45 | unsigned int width,
46 | unsigned int height,
47 | unsigned int pitch,
48 | unsigned int inputPixelSize,
49 | float scaleFactor,
50 | float *meanDataBuffer)
51 | {
52 | unsigned int row = blockIdx.y * blockDim.y + threadIdx.y;
53 | unsigned int col = blockIdx.x * blockDim.x + threadIdx.x;
54 |
55 | if (col < width && row < height)
56 | {
57 | for (unsigned int k = 0; k < 3; k++)
58 | {
59 | outBuffer[width * height * k + row * width + col] =
60 | scaleFactor * ((float) inBuffer[row * pitch + col * inputPixelSize + k] -
61 | meanDataBuffer[(row * width * 3) + (col * 3) + k]);
62 | }
63 | }
64 | }
65 |
66 | __global__ void
67 | NvDsInferConvert_CxToP3RFloatKernel(
68 | float *outBuffer,
69 | unsigned char *inBuffer,
70 | unsigned int width,
71 | unsigned int height,
72 | unsigned int pitch,
73 | unsigned int inputPixelSize,
74 | float scaleFactor)
75 | {
76 | unsigned int row = blockIdx.y * blockDim.y + threadIdx.y;
77 | unsigned int col = blockIdx.x * blockDim.x + threadIdx.x;
78 |
79 | if (col < width && row < height)
80 | {
81 | for (unsigned int k = 0; k < 3; k++)
82 | {
83 | outBuffer[width * height * k + row * width + col] =
84 | scaleFactor * inBuffer[row * pitch + col * inputPixelSize + (2 - k)];
85 | }
86 | }
87 | }
88 |
89 | __global__ void
90 | NvDsInferConvert_CxToP3RFloatKernelWithMeanSubtraction(
91 | float *outBuffer,
92 | unsigned char *inBuffer,
93 | unsigned int width,
94 | unsigned int height,
95 | unsigned int pitch,
96 | unsigned int inputPixelSize,
97 | float scaleFactor,
98 | float *meanDataBuffer)
99 | {
100 | unsigned int row = blockIdx.y * blockDim.y + threadIdx.y;
101 | unsigned int col = blockIdx.x * blockDim.x + threadIdx.x;
102 |
103 | if (col < width && row < height)
104 | {
105 | for (unsigned int k = 0; k < 3; k++)
106 | {
107 | outBuffer[width * height * k + row * width + col] =
108 | scaleFactor * ((float) inBuffer[row * pitch + col * inputPixelSize + (2 - k)] -
109 | meanDataBuffer[(row * width * 3) + (col * 3) + k]);
110 | }
111 | }
112 | }
113 |
114 | __global__ void
115 | NvDsInferConvert_C1ToP1FloatKernel(
116 | float *outBuffer,
117 | unsigned char *inBuffer,
118 | unsigned int width,
119 | unsigned int height,
120 | unsigned int pitch,
121 | float scaleFactor)
122 | {
123 | unsigned int row = blockIdx.y * blockDim.y + threadIdx.y;
124 | unsigned int col = blockIdx.x * blockDim.x + threadIdx.x;
125 |
126 | if (col < width && row < height)
127 | {
128 | outBuffer[row * width + col] = scaleFactor * inBuffer[row * pitch + col];
129 | }
130 | }
131 |
132 | __global__ void
133 | NvDsInferConvert_C1ToP1FloatKernelWithMeanSubtraction(
134 | float *outBuffer,
135 | unsigned char *inBuffer,
136 | unsigned int width,
137 | unsigned int height,
138 | unsigned int pitch,
139 | float scaleFactor,
140 | float *meanDataBuffer)
141 | {
142 | unsigned int row = blockIdx.y * blockDim.y + threadIdx.y;
143 | unsigned int col = blockIdx.x * blockDim.x + threadIdx.x;
144 |
145 | if (col < width && row < height)
146 | {
147 | outBuffer[row * width + col] =
148 | scaleFactor * ((float) inBuffer[row * pitch + col] -
149 | meanDataBuffer[(row * width) + col]);
150 | }
151 | }
152 |
153 | void
154 | NvDsInferConvert_C3ToP3Float(
155 | float *outBuffer,
156 | unsigned char *inBuffer,
157 | unsigned int width,
158 | unsigned int height,
159 | unsigned int pitch,
160 | float scaleFactor,
161 | float *meanDataBuffer,
162 | cudaStream_t stream)
163 | {
164 | dim3 threadsPerBlock(THREADS_PER_BLOCK, THREADS_PER_BLOCK);
165 | dim3 blocks((width+THREADS_PER_BLOCK_1)/threadsPerBlock.x, (height+THREADS_PER_BLOCK_1)/threadsPerBlock.y);
166 |
167 | if (meanDataBuffer == NULL)
168 | {
169 | NvDsInferConvert_CxToP3FloatKernel <<<blocks, threadsPerBlock, 0, stream>>>
170 | (outBuffer, inBuffer, width, height, pitch, 3, scaleFactor);
171 | }
172 | else
173 | {
174 | NvDsInferConvert_CxToP3FloatKernelWithMeanSubtraction <<<blocks, threadsPerBlock, 0, stream>>>
175 | (outBuffer, inBuffer, width, height, pitch, 3, scaleFactor, meanDataBuffer);
176 | }
177 | }
178 |
179 | void
180 | NvDsInferConvert_C4ToP3Float(
181 | float *outBuffer,
182 | unsigned char *inBuffer,
183 | unsigned int width,
184 | unsigned int height,
185 | unsigned int pitch,
186 | float scaleFactor,
187 | float *meanDataBuffer,
188 | cudaStream_t stream)
189 | {
190 | dim3 threadsPerBlock(THREADS_PER_BLOCK, THREADS_PER_BLOCK);
191 | dim3 blocks((width+THREADS_PER_BLOCK_1)/threadsPerBlock.x, (height+THREADS_PER_BLOCK_1)/threadsPerBlock.y);
192 |
193 | if (meanDataBuffer == NULL)
194 | {
195 | NvDsInferConvert_CxToP3FloatKernel <<<blocks, threadsPerBlock, 0, stream>>>
196 | (outBuffer, inBuffer, width, height, pitch, 4, scaleFactor);
197 | }
198 | else
199 | {
200 | NvDsInferConvert_CxToP3FloatKernelWithMeanSubtraction <<<blocks, threadsPerBlock, 0, stream>>>
201 | (outBuffer, inBuffer, width, height, pitch, 4, scaleFactor, meanDataBuffer);
202 | }
203 | }
204 |
205 | void
206 | NvDsInferConvert_C3ToP3RFloat(
207 | float *outBuffer,
208 | unsigned char *inBuffer,
209 | unsigned int width,
210 | unsigned int height,
211 | unsigned int pitch,
212 | float scaleFactor,
213 | float *meanDataBuffer,
214 | cudaStream_t stream)
215 | {
216 | dim3 threadsPerBlock(THREADS_PER_BLOCK, THREADS_PER_BLOCK);
217 | dim3 blocks((width+THREADS_PER_BLOCK_1)/threadsPerBlock.x, (height+THREADS_PER_BLOCK_1)/threadsPerBlock.y);
218 |
219 | if (meanDataBuffer == NULL)
220 | {
221 | NvDsInferConvert_CxToP3RFloatKernel <<<blocks, threadsPerBlock, 0, stream>>>
222 | (outBuffer, inBuffer, width, height, pitch, 3, scaleFactor);
223 | }
224 | else
225 | {
226 | NvDsInferConvert_CxToP3RFloatKernelWithMeanSubtraction <<<blocks, threadsPerBlock, 0, stream>>>
227 | (outBuffer, inBuffer, width, height, pitch, 3, scaleFactor, meanDataBuffer);
228 | }
229 | }
230 |
231 | void
232 | NvDsInferConvert_C4ToP3RFloat(
233 | float *outBuffer,
234 | unsigned char *inBuffer,
235 | unsigned int width,
236 | unsigned int height,
237 | unsigned int pitch,
238 | float scaleFactor,
239 | float *meanDataBuffer,
240 | cudaStream_t stream)
241 | {
242 | dim3 threadsPerBlock(THREADS_PER_BLOCK, THREADS_PER_BLOCK);
243 | dim3 blocks((width+THREADS_PER_BLOCK_1)/threadsPerBlock.x, (height+THREADS_PER_BLOCK_1)/threadsPerBlock.y);
244 |
245 | if (meanDataBuffer == NULL)
246 | {
247 | NvDsInferConvert_CxToP3RFloatKernel <<<blocks, threadsPerBlock, 0, stream>>>
248 | (outBuffer, inBuffer, width, height, pitch, 4, scaleFactor);
249 | }
250 | else
251 | {
252 | NvDsInferConvert_CxToP3RFloatKernelWithMeanSubtraction <<<blocks, threadsPerBlock, 0, stream>>>
253 | (outBuffer, inBuffer, width, height, pitch, 4, scaleFactor, meanDataBuffer);
254 | }
255 | }
256 |
257 | void
258 | NvDsInferConvert_C1ToP1Float(
259 | float *outBuffer,
260 | unsigned char *inBuffer,
261 | unsigned int width,
262 | unsigned int height,
263 | unsigned int pitch,
264 | float scaleFactor,
265 | float *meanDataBuffer,
266 | cudaStream_t stream)
267 | {
268 | dim3 threadsPerBlock(THREADS_PER_BLOCK, THREADS_PER_BLOCK);
269 | dim3 blocks((width+THREADS_PER_BLOCK_1)/threadsPerBlock.x, (height+THREADS_PER_BLOCK_1)/threadsPerBlock.y);
270 |
271 | if (meanDataBuffer == NULL)
272 | {
273 | NvDsInferConvert_C1ToP1FloatKernel <<<blocks, threadsPerBlock, 0, stream>>>
274 | (outBuffer, inBuffer, width, height, pitch, scaleFactor);
275 | }
276 | else
277 | {
278 | NvDsInferConvert_C1ToP1FloatKernelWithMeanSubtraction <<<blocks, threadsPerBlock, 0, stream>>>
279 | (outBuffer, inBuffer, width, height, pitch, scaleFactor, meanDataBuffer);
280 | }
281 |
282 | }
283 |
--------------------------------------------------------------------------------
/libs/nvdsinfer/nvdsinfer_conversion.h:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * NVIDIA Corporation and its licensors retain all intellectual property
5 | * and proprietary rights in and to this software, related documentation
6 | * and any modifications thereto. Any use, reproduction, disclosure or
7 | * distribution of this software and related documentation without an express
8 | * license agreement from NVIDIA Corporation is strictly prohibited.
9 | *
10 | */
11 |
12 | /**
13 | * This is a header file for pre-processing cuda kernels with normalization and
14 | * mean subtraction required by nvdsinfer.
15 | */
16 | #ifndef __NVDSINFER_CONVERSION_H__
17 | #define __NVDSINFER_CONVERSION_H__
18 |
19 | /**
20 | * Converts an input packed 3 channel buffer of width x height resolution into a
21 | * planar 3-channel float buffer of width x height resolution. The input buffer can
22 | * have a pitch > (width * 3). The cuda kernel supports normalization and mean
23 | * image subtraction.
24 | *
25 | * This kernel can be used for RGB -> RGB and BGR -> BGR conversions.
26 | *
27 | * @param outBuffer Cuda device buffer for planar float output. Should
28 | * be at least (width * height * 3 * sizeof(float)) bytes.
29 | * @param inBuffer Cuda device buffer for packed input. Should be
30 | * at least (pitch * height) bytes.
31 | * @param width Width of the buffers in pixels.
32 | * @param height Height of the buffers in pixels.
33 | * @param pitch Pitch of the input buffer in bytes.
34 | * @param scaleFactor Normalization factor.
35 | * @param meanDataBuffer Mean Image Data buffer. Should be at least
36 | * (width * height * 3 * sizeof(float)) bytes.
37 | * @param stream Cuda stream identifier.
38 | */
39 | void
40 | NvDsInferConvert_C3ToP3Float(
41 | float *outBuffer,
42 | unsigned char *inBuffer,
43 | unsigned int width,
44 | unsigned int height,
45 | unsigned int pitch,
46 | float scaleFactor,
47 | float *meanDataBuffer,
48 | cudaStream_t stream);
49 |
50 | /**
51 | * Converts an input packed 4 channel buffer of width x height resolution into a
52 | * planar 3-channel float buffer of width x height resolution. The input buffer can
53 | * have a pitch > (width * 4). The cuda kernel supports normalization and mean
54 | * image subtraction.
55 | *
56 | * This kernel can be used for RGBA -> RGB and BGRx -> BGR conversions.
57 | *
58 | * @param outBuffer Cuda device buffer for planar float output. Should
59 | * be at least (width * height * 3 * sizeof(float)) bytes.
60 | * @param inBuffer Cuda device buffer for packed input. Should be
61 | * at least (pitch * height) bytes.
62 | * @param width Width of the buffers in pixels.
63 | * @param height Height of the buffers in pixels.
64 | * @param pitch Pitch of the input buffer in bytes.
65 | * @param scaleFactor Normalization factor.
66 | * @param meanDataBuffer Mean Image Data buffer. Should be at least
67 | * (width * height * 3 * sizeof(float)) bytes.
68 | * @param stream Cuda stream identifier.
69 | */
70 | void
71 | NvDsInferConvert_C4ToP3Float(
72 | float *outBuffer,
73 | unsigned char *inBuffer,
74 | unsigned int width,
75 | unsigned int height,
76 | unsigned int pitch,
77 | float scaleFactor,
78 | float *meanDataBuffer,
79 | cudaStream_t stream);
80 |
81 | /**
82 | * Converts an input packed 3 channel buffer of width x height resolution into a
83 | * planar 3-channel float buffer of width x height resolution with plane order
84 | * reversed. The input buffer can have a pitch > (width * 3). The cuda kernel
85 | * supports normalization and mean image subtraction.
86 | *
87 | * This kernel can be used for BGR -> RGB and RGB -> BGR conversions.
88 | *
89 | * @param outBuffer Cuda device buffer for planar float output. Should
90 | * be at least (width * height * 3 * sizeof(float)) bytes.
91 | * @param inBuffer Cuda device buffer for packed input. Should be
92 | * at least (pitch * height) bytes.
93 | * @param width Width of the buffers in pixels.
94 | * @param height Height of the buffers in pixels.
95 | * @param pitch Pitch of the input buffer in bytes.
96 | * @param scaleFactor Normalization factor.
97 | * @param meanDataBuffer Mean Image Data buffer. Should be at least
98 | * (width * height * 3 * sizeof(float)) bytes.
99 | * @param stream Cuda stream identifier.
100 | */
101 | void
102 | NvDsInferConvert_C3ToP3RFloat(
103 | float *outBuffer,
104 | unsigned char *inBuffer,
105 | unsigned int width,
106 | unsigned int height,
107 | unsigned int pitch,
108 | float scaleFactor,
109 | float *meanDataBuffer,
110 | cudaStream_t stream);
111 |
112 | /**
113 | * Converts an input packed 4 channel buffer of width x height resolution into a
114 | * planar 3-channel float buffer of width x height resolution with plane order
115 | * reversed. The input buffer can have a pitch > (width * 4). The cuda kernel
116 | * supports normalization and mean image subtraction.
117 | *
118 | * This kernel can be used for BGRx -> RGB and RGBA -> BGR conversions.
119 | *
120 | * @param outBuffer Cuda device buffer for planar float output. Should
121 | * be at least (width * height * 3 * sizeof(float)) bytes.
122 | * @param inBuffer Cuda device buffer for packed input. Should be
123 | * at least (pitch * height) bytes.
124 | * @param width Width of the buffers in pixels.
125 | * @param height Height of the buffers in pixels.
126 | * @param pitch Pitch of the input buffer in bytes.
127 | * @param scaleFactor Normalization factor.
128 | * @param meanDataBuffer Mean Image Data buffer. Should be at least
129 | * (width * height * 3 * sizeof(float)) bytes.
130 | * @param stream Cuda stream identifier.
131 | */
132 | void
133 | NvDsInferConvert_C4ToP3RFloat(
134 | float *outBuffer,
135 | unsigned char *inBuffer,
136 | unsigned int width,
137 | unsigned int height,
138 | unsigned int pitch,
139 | float scaleFactor,
140 | float *meanDataBuffer,
141 | cudaStream_t stream);
142 |
143 | /**
144 | * Converts a 1 channel UINT8 input of width x height resolution into a
145 | * 1 channel float buffer of width x height resolution. The input buffer can
146 | * have a pitch > width. The cuda kernel supports normalization and mean
147 | * image subtraction.
148 | *
149 | * @param outBuffer Cuda device buffer for float output. Should
150 | * be at least (width * height * sizeof(float)) bytes.
151 | * @param inBuffer Cuda device buffer for UINT8 input. Should be
152 | * at least (pitch * height) bytes.
153 | * @param width Width of the buffers in pixels.
154 | * @param height Height of the buffers in pixels.
155 | * @param pitch Pitch of the input buffer in bytes.
156 | * @param scaleFactor Normalization factor.
157 | * @param meanDataBuffer Mean Image Data buffer. Should be at least
158 | * (width * height * sizeof(float)) bytes.
159 | * @param stream Cuda stream identifier.
160 | */
161 | void
162 | NvDsInferConvert_C1ToP1Float(
163 | float *outBuffer,
164 | unsigned char *inBuffer,
165 | unsigned int width,
166 | unsigned int height,
167 | unsigned int pitch,
168 | float scaleFactor,
169 | float *meanDataBuffer,
170 | cudaStream_t stream);
171 |
172 |
173 | /**
174 | * Function pointer type to which any of the NvDsInferConvert functions can be
175 | * assigned.
176 | */
177 | typedef void (* NvDsInferConvertFcn)(
178 | float *outBuffer,
179 | unsigned char *inBuffer,
180 | unsigned int width,
181 | unsigned int height,
182 | unsigned int pitch,
183 | float scaleFactor,
184 | float *meanDataBuffer,
185 | cudaStream_t stream);
186 |
187 | #endif /* __NVDSINFER_CONVERSION_H__ */
188 |
--------------------------------------------------------------------------------
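For orientation, a minimal usage sketch of the conversion entry points declared above (the wrapper name `preprocessFrame` and the buffers are illustrative; only NvDsInferConvert_C3ToP3Float and cudaStreamSynchronize are real APIs):

```
#include <cuda_runtime_api.h>
#include "nvdsinfer_conversion.h"

// Sketch: convert a packed RGB frame in device memory into the planar float
// tensor the network consumes, using the same scale as net-scale-factor=0.00390625.
void preprocessFrame(unsigned char* dPackedRGB, unsigned int width,
                     unsigned int height, unsigned int pitch,
                     float* dPlanarFloat, cudaStream_t stream)
{
    NvDsInferConvert_C3ToP3Float(dPlanarFloat, dPackedRGB, width, height,
                                 pitch, 1.0f / 256.0f,
                                 NULL /* no mean image */, stream);
    cudaStreamSynchronize(stream);  // make sure the conversion finished before inference
}
```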
/libs/nvdsinfer/resize_merge_cpu.cpp:
--------------------------------------------------------------------------------
1 | #include "resize_merge_cpu.h"
2 | #include <opencv2/opencv.hpp>
3 |
4 | #include <array>
5 | #include <cstdio>
6 | #include <memory>
7 | #include <vector>
8 |
9 | #define UNUSED(unusedVariable) (void)(unusedVariable)
10 |
11 | template <typename T>
12 | void resizeAndMergeCpu(T* targetPtr, const std::vector<const T*>& sourcePtrs,
13 | const std::array<int, 4>& targetSize,
14 | const std::vector<std::array<int, 4>>& sourceSizes,
15 | const std::vector<T>& scaleInputToNetInputs)
16 | {
17 | try
18 | {
19 | // Scale used in CUDA/CL to know scale ratio between input and output
20 | // CPU directly uses sourceWidth/Height and targetWidth/Height
21 | UNUSED(scaleInputToNetInputs);
22 |
23 | // Sanity check
24 | if (sourceSizes.empty())
25 | printf("sourceSizes cannot be empty. %d, %s, %s\n", __LINE__, __FUNCTION__, __FILE__);
26 |
27 | // Params
28 | const auto nums = (signed)sourceSizes.size();
29 | const auto channels = targetSize[1]; // 57
30 | const auto targetHeight = targetSize[2]; // 368
31 | const auto targetWidth = targetSize[3]; // 496
32 | const auto targetChannelOffset = targetWidth * targetHeight;
33 |
34 | // No multi-scale merging or no merging required
35 | if (sourceSizes.size() == 1)
36 | {
37 | // Params
38 | const auto& sourceSize = sourceSizes[0];
39 | const auto sourceHeight = sourceSize[2]; // 368/8 ..
40 | const auto sourceWidth = sourceSize[3]; // 496/8 ..
41 | const auto sourceChannelOffset = sourceHeight * sourceWidth;
42 | if (sourceSize[0] != 1)
43 | printf("It should never reache this point. Notify us otherwise. %d, %s, %s\n",
44 | __LINE__, __FUNCTION__, __FILE__);
45 |
46 | // Per channel resize
47 | const T* sourcePtr = sourcePtrs[0];
48 | for (auto c = 0 ; c < channels ; c++)
49 | {
50 | cv::Mat source(cv::Size(sourceWidth, sourceHeight), CV_32FC1,
51 | const_cast(&sourcePtr[c*sourceChannelOffset]));
52 | cv::Mat target(cv::Size(targetWidth, targetHeight), CV_32FC1,
53 | (&targetPtr[c*targetChannelOffset]));
54 | cv::resize(source, target, {targetWidth, targetHeight}, 0, 0, cv::INTER_CUBIC);
55 | }
56 | }
57 | // Multi-scale merging
58 | else
59 | {
60 | // Construct temp targets. We reuse targetPtr to store the first scale
61 | std::vector<std::unique_ptr<T[]>> tempTargetPtrs;
62 | for (auto n = 1; n < nums; n++){
63 | tempTargetPtrs.emplace_back(std::unique_ptr<T[]>(new T[targetChannelOffset * channels]()));
64 | }
65 |
66 | // Resize and sum
67 | for (auto n = 0; n < nums; n++){
68 |
69 | // Params
70 | const auto& sourceSize = sourceSizes[n];
71 | const auto sourceHeight = sourceSize[2]; // 368/8 ..
72 | const auto sourceWidth = sourceSize[3]; // 496/8 ..
73 | const auto sourceChannelOffset = sourceHeight * sourceWidth;
74 |
75 | // Access pointers
76 | const T* sourcePtr = sourcePtrs[n];
77 | T* tempTargetPtr;
78 | if (n != 0)
79 | tempTargetPtr = tempTargetPtrs[n-1].get();
80 | else
81 | tempTargetPtr = targetPtr;
82 |
83 | T* firstTempTargetPtr = targetPtr;
84 | for (auto c = 0 ; c < channels ; c++)
85 | {
86 | // Resize
87 | cv::Mat source(cv::Size(sourceWidth, sourceHeight), CV_32FC1,
88 | const_cast(&sourcePtr[c*sourceChannelOffset]));
89 | cv::Mat target(cv::Size(targetWidth, targetHeight), CV_32FC1,
90 | (&tempTargetPtr[c*targetChannelOffset]));
91 | cv::resize(source, target, {targetWidth, targetHeight}, 0, 0, cv::INTER_CUBIC);
92 |
93 | // Add
94 | if (n != 0)
95 | {
96 | cv::Mat addTarget(cv::Size(targetWidth, targetHeight), CV_32FC1,
97 | (&firstTempTargetPtr[c*targetChannelOffset]));
98 | cv::add(target, addTarget, addTarget);
99 | }
100 | }
101 | }
102 |
103 | // Average
104 | for (auto c = 0 ; c < channels ; c++)
105 | {
106 | cv::Mat target(cv::Size(targetWidth, targetHeight), CV_32FC1, (&targetPtr[c*targetChannelOffset]));
107 | target /= (float)nums;
108 | }
109 |
110 | }
111 | }
112 | catch (const std::exception& e)
113 | {
114 | printf("exception: %s, %d, %s, %s\n", e.what(), __LINE__, __FUNCTION__, __FILE__);
115 | }
116 | }
117 |
118 | template void resizeAndMergeCpu(
119 | float* targetPtr, const std::vector<const float*>& sourcePtrs, const std::array<int, 4>& targetSize,
120 | const std::vector<std::array<int, 4>>& sourceSizes, const std::vector<float>& scaleInputToNetInputs);
121 | template void resizeAndMergeCpu(
122 | double* targetPtr, const std::vector<const double*>& sourcePtrs, const std::array<int, 4>& targetSize,
123 | const std::vector<std::array<int, 4>>& sourceSizes, const std::vector<double>& scaleInputToNetInputs);
124 |
--------------------------------------------------------------------------------
/libs/nvdsinfer/resize_merge_cpu.h:
--------------------------------------------------------------------------------
1 | #ifndef RESIZE_MERGE_CPU_H
2 | #define RESIZE_MERGE_CPU_H
3 |
4 | #include <array>
5 | #include <vector>
6 |
7 | template <typename T>
8 | void resizeAndMergeCpu(
9 | T* targetPtr, const std::vector<const T*>& sourcePtrs, const std::array<int, 4>& targetSize,
10 | const std::vector<std::array<int, 4>>& sourceSizes, const std::vector<T>& scaleInputToNetInputs = {1.f});
11 |
12 | #endif // RESIZE_MERGE_CPU_H
13 |
--------------------------------------------------------------------------------
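A small sketch of how the multi-scale branch of resizeAndMergeCpu() would be driven: two heatmap pyramids at different resolutions are each resized to the target size, summed and averaged. The sizes and pointer names here are made up for illustration; the single-scale call is already shown in fillSegmentationOutput above.

```
#include <array>
#include <vector>
#include "resize_merge_cpu.h"

void mergeTwoScalesExample(const float* scale0 /* 57 x 46 x 62 */,
                           const float* scale1 /* 57 x 23 x 31 */,
                           float* merged       /* 57 x 368 x 496 */)
{
    std::vector<const float*> sources = { scale0, scale1 };
    std::array<int, 4> targetSize = { 1, 57, 368, 496 };
    std::vector<std::array<int, 4>> sourceSizes = { {1, 57, 46, 62},
                                                    {1, 57, 23, 31} };
    // Each source is bicubically resized to 368x496, accumulated into
    // 'merged', then divided by the number of scales (the "Average" step).
    resizeAndMergeCpu(merged, sources, targetSize, sourceSizes, {1.f, 1.f});
}
```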
/openpose_app/COCO_val2014_000000000564.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cding-nv/deepstream-openpose/6db238cd4490655e2768bafed3ca4b8f5f778d25/openpose_app/COCO_val2014_000000000564.jpg
--------------------------------------------------------------------------------
/openpose_app/COCO_val2014_000000000569.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cding-nv/deepstream-openpose/6db238cd4490655e2768bafed3ca4b8f5f778d25/openpose_app/COCO_val2014_000000000569.jpg
--------------------------------------------------------------------------------
/openpose_app/Makefile:
--------------------------------------------------------------------------------
1 | ################################################################################
2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Permission is hereby granted, free of charge, to any person obtaining a
5 | # copy of this software and associated documentation files (the "Software"),
6 | # to deal in the Software without restriction, including without limitation
7 | # the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 | # and/or sell copies of the Software, and to permit persons to whom the
9 | # Software is furnished to do so, subject to the following conditions:
10 | #
11 | # The above copyright notice and this permission notice shall be included in
12 | # all copies or substantial portions of the Software.
13 | #
14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 | # DEALINGS IN THE SOFTWARE.
21 | ################################################################################
22 |
23 | APP:= openpose-app
24 |
25 | TARGET_DEVICE = $(shell gcc -dumpmachine | cut -f1 -d -)
26 |
27 | NVDS_VERSION:=4.0
28 |
29 | LIB_INSTALL_DIR?=/opt/nvidia/deepstream/deepstream-$(NVDS_VERSION)/lib/
30 |
31 | ifeq ($(TARGET_DEVICE),aarch64)
32 | CFLAGS:= -DPLATFORM_TEGRA
33 | endif
34 |
35 | SRCS:= $(wildcard *.c)
36 |
37 | INCS:= $(wildcard *.h)
38 |
39 | PKGS:= gstreamer-1.0
40 |
41 | OBJS:= $(SRCS:.c=.o)
42 |
43 | CFLAGS+= -I/opt/nvidia/deepstream/deepstream-4.0/sources/includes
44 |
45 | CFLAGS+= `pkg-config --cflags $(PKGS)`
46 |
47 | LIBS:= `pkg-config --libs $(PKGS)`
48 |
49 | LIBS+= -lm -L$(LIB_INSTALL_DIR) -lnvdsgst_helper -lnvdsgst_meta \
50 | -Wl,-rpath,$(LIB_INSTALL_DIR)
51 |
52 | all: $(APP)
53 |
54 | %.o: %.c $(INCS) Makefile
55 | $(CC) -c -o $@ $(CFLAGS) $<
56 |
57 | $(APP): $(OBJS) Makefile
58 | $(CC) -o $(APP) $(OBJS) $(LIBS)
59 |
60 | clean:
61 | rm -rf $(OBJS) $(APP)
62 |
63 |
64 |
--------------------------------------------------------------------------------
/openpose_app/README:
--------------------------------------------------------------------------------
1 | *****************************************************************************
2 | * Copyright (c) 2019 NVIDIA Corporation. All rights reserved.
3 | *
4 | * NVIDIA Corporation and its licensors retain all intellectual property
5 | * and proprietary rights in and to this software, related documentation
6 | * and any modifications thereto. Any use, reproduction, disclosure or
7 | * distribution of this software and related documentation without an express
8 | * license agreement from NVIDIA Corporation is strictly prohibited.
9 | *****************************************************************************
10 |
11 | Prerequisites:
12 |
13 | Please follow instructions in the apps/sample_apps/deepstream-app/README on how
14 | to install the prerequisites for the DeepStream SDK, the DeepStream SDK itself and the
15 | apps.
16 |
17 | Pipeline:
18 | filesrc -> jpegparse -> nvv4l2decoder -> nvstreammux -> nvinfer (segmentation) ->
19 | nvsegvisual -> nvmultistreamtiler -> (nvegltransform) -> nveglglessink
20 |
--------------------------------------------------------------------------------
/openpose_app/nvinfer_config.txt:
--------------------------------------------------------------------------------
1 | ################################################################################
2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Permission is hereby granted, free of charge, to any person obtaining a
5 | # copy of this software and associated documentation files (the "Software"),
6 | # to deal in the Software without restriction, including without limitation
7 | # the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 | # and/or sell copies of the Software, and to permit persons to whom the
9 | # Software is furnished to do so, subject to the following conditions:
10 | #
11 | # The above copyright notice and this permission notice shall be included in
12 | # all copies or substantial portions of the Software.
13 | #
14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 | # DEALINGS IN THE SOFTWARE.
21 | ################################################################################
22 |
23 | # Following properties are mandatory when engine files are not specified:
24 | # int8-calib-file(Only in INT8), model-file-format
25 | # Caffemodel mandatory properties: model-file, proto-file, output-blob-names
26 | # UFF: uff-file, input-dims, uff-input-blob-name, output-blob-names
27 | # ONNX: onnx-file
28 | #
29 | # Mandatory properties for detectors:
30 | # num-detected-classes
31 | #
32 | # Optional properties for detectors:
33 | # enable-dbscan(Default=false), interval(Primary mode only, Default=0)
34 | # custom-lib-path,
35 | # parse-bbox-func-name
36 | #
37 | # Mandatory properties for classifiers:
38 | # classifier-threshold, is-classifier
39 | #
40 | # Optional properties for classifiers:
41 | # classifier-async-mode(Secondary mode only, Default=false)
42 | #
43 | # Optional properties in secondary mode:
44 | # operate-on-gie-id(Default=0), operate-on-class-ids(Defaults to all classes),
45 | # input-object-min-width, input-object-min-height, input-object-max-width,
46 | # input-object-max-height
47 | #
48 | # Following properties are always recommended:
49 | # batch-size(Default=1)
50 | #
51 | # Other optional properties:
52 | # net-scale-factor(Default=1), network-mode(Default=0 i.e FP32),
53 | # model-color-format(Default=0 i.e. RGB) model-engine-file, labelfile-path,
54 | # mean-file, gie-unique-id(Default=0), offsets, gie-mode (Default=1 i.e. primary),
55 | # custom-lib-path, network-mode(Default=0 i.e FP32)
56 | #
57 | # The values in the config file are overridden by values set through GObject
58 | # properties.
59 |
60 | [property]
61 | gpu-id=0
62 | net-scale-factor=0.00390625
63 | #net-scale-factor=0.003921568627451
64 | #net-scale-factor=0.007843137254902
65 | offsets=128;128;128
66 | model-color-format=1
67 | model-file=/home/nvidia/openpose/models/pose/coco/pose_iter_440000.caffemodel
68 | proto-file=/home/nvidia/openpose/models/pose/coco/pose_deploy_linevec.prototxt
69 | #model-engine-file=/home/nvidia/openpose/models/pose/coco/pose_iter_440000.caffemodel_b1_fp32.engine
70 | batch-size=1
71 | ## 0=FP32, 1=INT8, 2=FP16 mode
72 | #network-mode=0
73 | num-detected-classes=4
74 | interval=0
75 | gie-unique-id=1
76 | network-type=2
77 | output-blob-names=net_output
78 | segmentation-threshold=0.0
79 | #parse-bbox-func-name=NvDsInferParseCustomSSD
80 | #custom-lib-path=nvdsinfer_custom_impl_ssd/libnvdsinfer_custom_impl_ssd.so
81 |
82 | [class-attrs-all]
83 | roi-top-offset=0
84 | roi-bottom-offset=0
85 | detected-min-w=0
86 | detected-min-h=0
87 | detected-max-w=0
88 | detected-max-h=0
89 |
90 | ## Per class configuration
91 | #[class-attrs-2]
92 | #threshold=0.6
93 | #roi-top-offset=20
94 | #roi-bottom-offset=10
95 | #detected-min-w=40
96 | #detected-min-h=40
97 | #detected-max-w=400
98 | #detected-max-h=800
99 |
--------------------------------------------------------------------------------
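A quick check of the preprocessing implied by the [property] values above, mirroring the mean-subtraction kernels in nvdsinfer_conversion.cu (out = net-scale-factor * (in - offset)); the notes below are arithmetic, not extra configuration:

```
// net-scale-factor = 0.00390625 = 1/256, offsets = 128;128;128, so each
// 8-bit pixel value v reaches the network as:
//   out = 0.00390625f * (v - 128.0f);
//   v = 0   -> -0.5
//   v = 128 ->  0.0
//   v = 255 ->  0.49609375
// model-color-format=1 selects BGR channel order for this Caffe model.
```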
/openpose_app/openpose_app.c:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
3 | *
4 | * Permission is hereby granted, free of charge, to any person obtaining a
5 | * copy of this software and associated documentation files (the "Software"),
6 | * to deal in the Software without restriction, including without limitation
7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 | * and/or sell copies of the Software, and to permit persons to whom the
9 | * Software is furnished to do so, subject to the following conditions:
10 | *
11 | * The above copyright notice and this permission notice shall be included in
12 | * all copies or substantial portions of the Software.
13 | *
14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 | * DEALINGS IN THE SOFTWARE.
21 | */
22 |
23 | #include <gst/gst.h>
24 | #include <glib.h>
25 | #include <stdio.h>
26 | #include <math.h>
27 | #include <string.h>
28 | #include <sys/time.h>
29 | #include <unistd.h>
30 |
31 | #include "gstnvdsmeta.h"
32 | #ifndef PLATFORM_TEGRA
33 | #include "gst-nvmessage.h"
34 | #endif
35 |
36 | /* The muxer output resolution must be set if the input streams will be of
37 | * different resolution. The muxer will scale all the input frames to this
38 | * resolution. */
39 | #define MUXER_OUTPUT_WIDTH 1280
40 | #define MUXER_OUTPUT_HEIGHT 720
41 |
42 | /* Muxer batch formation timeout, in microseconds. Should ideally be set
43 | * based on the fastest source's framerate. */
44 | #define MUXER_BATCH_TIMEOUT_USEC 4000000
45 |
46 | #define TILED_OUTPUT_WIDTH 1280
47 | #define TILED_OUTPUT_HEIGHT 720
48 |
49 | /* tiler_src_pad_buffer_probe will extract metadata received on the
50 | * segmentation (nvinfer) src pad */
51 | static GstPadProbeReturn
52 | tiler_src_pad_buffer_probe (GstPad * pad, GstPadProbeInfo * info,
53 | gpointer u_data)
54 | {
55 | GstBuffer *buf = (GstBuffer *) info->data;
56 | NvDsMetaList * l_frame = NULL;
57 | NvDsBatchMeta *batch_meta = gst_buffer_get_nvds_batch_meta (buf);
58 |
59 | for (l_frame = batch_meta->frame_meta_list; l_frame != NULL;
60 | l_frame = l_frame->next) {
61 | // TODO:
62 | }
63 | return GST_PAD_PROBE_OK;
64 | }
65 |
66 | static gboolean
67 | bus_call (GstBus * bus, GstMessage * msg, gpointer data)
68 | {
69 | GMainLoop *loop = (GMainLoop *) data;
70 | switch (GST_MESSAGE_TYPE (msg)) {
71 | case GST_MESSAGE_EOS:
72 | g_print ("End of stream\n");
73 | // Add the delay to show the result
74 | usleep(100000000);
75 | g_main_loop_quit (loop);
76 | break;
77 | case GST_MESSAGE_WARNING:
78 | {
79 | gchar *debug;
80 | GError *error;
81 | gst_message_parse_warning (msg, &error, &debug);
82 | g_printerr ("WARNING from element %s: %s\n",
83 | GST_OBJECT_NAME (msg->src), error->message);
84 | g_free (debug);
85 | g_printerr ("Warning: %s\n", error->message);
86 | g_error_free (error);
87 | break;
88 | }
89 | case GST_MESSAGE_ERROR:
90 | {
91 | gchar *debug;
92 | GError *error;
93 | gst_message_parse_error (msg, &error, &debug);
94 | g_printerr ("ERROR from element %s: %s\n",
95 | GST_OBJECT_NAME (msg->src), error->message);
96 | if (debug)
97 | g_printerr ("Error details: %s\n", debug);
98 | g_free (debug);
99 | g_error_free (error);
100 | g_main_loop_quit (loop);
101 | break;
102 | }
103 | #ifndef PLATFORM_TEGRA
104 | case GST_MESSAGE_ELEMENT:
105 | {
106 | if (gst_nvmessage_is_stream_eos (msg)) {
107 | guint stream_id;
108 | if (gst_nvmessage_parse_stream_eos (msg, &stream_id)) {
109 | g_print ("Got EOS from stream %d\n", stream_id);
110 | }
111 | }
112 | break;
113 | }
114 | #endif
115 | default:
116 | break;
117 | }
118 | return TRUE;
119 | }
120 |
121 | static GstElement *
122 | create_source_bin (guint index, gchar * uri)
123 | {
124 | GstElement *bin = NULL;
125 | gchar bin_name[16] = { };
126 |
127 | g_snprintf (bin_name, 15, "source-bin-%02d", index);
128 | /* Create a source GstBin to abstract this bin's content from the rest of the
129 | * pipeline */
130 | bin = gst_bin_new (bin_name);
131 |
132 | GstElement *source, *jpegparser, *decoder;
133 |
134 | source = gst_element_factory_make ("filesrc", "source");
135 |
136 | jpegparser = gst_element_factory_make ("jpegparse", "jpeg-parser");
137 |
138 | decoder = gst_element_factory_make ("nvv4l2decoder", "nvv4l2-decoder");
139 |
140 | if (!source || !jpegparser || !decoder)
141 | {
142 | g_printerr ("One element could not be created. Exiting.\n");
143 | return NULL;
144 | }
145 | g_object_set (G_OBJECT (source), "location", uri, NULL);
146 | const char *dot = strrchr(uri, '.');
147 | if (dot && ((!strcmp (dot+1, "mjpeg")) || (!strcmp (dot+1, "mjpg"))))
148 | {
149 | #ifdef PLATFORM_TEGRA
150 | g_object_set (G_OBJECT (decoder), "mjpeg", 1, NULL);
151 | #endif
152 | }
153 |
154 | gst_bin_add_many (GST_BIN (bin), source, jpegparser, decoder, NULL);
155 |
156 | gst_element_link_many (source, jpegparser, decoder, NULL);
157 |
158 | /* We need to create a ghost pad for the source bin which will act as a proxy
159 | * for the video decoder src pad. The ghost pad will not have a target right
160 | * now. Once the decode bin creates the video decoder and generates the
161 | * cb_newpad callback, we will set the ghost pad target to the video decoder
162 | * src pad. */
163 | if (!gst_element_add_pad (bin, gst_ghost_pad_new_no_target ("src",
164 | GST_PAD_SRC))) {
165 | g_printerr ("Failed to add ghost pad in source bin\n");
166 | return NULL;
167 | }
168 |
169 | GstPad *srcpad = gst_element_get_static_pad (decoder, "src");
170 | if (!srcpad) {
171 | g_printerr ("Failed to get src pad of source bin. Exiting.\n");
172 | return NULL;
173 | }
174 | GstPad *bin_ghost_pad = gst_element_get_static_pad (bin, "src");
175 | if (!gst_ghost_pad_set_target (GST_GHOST_PAD (bin_ghost_pad),
176 | srcpad)) {
177 | g_printerr ("Failed to link decoder src pad to source bin ghost pad\n");
178 | }
179 |
180 | return bin;
181 | }
182 |
183 | int
184 | main (int argc, char *argv[])
185 | {
186 | GMainLoop *loop = NULL;
187 | GstElement *pipeline = NULL, *streammux = NULL, *sink = NULL, *seg = NULL,
188 | *nvsegvisual = NULL, *tiler = NULL;
189 | #ifdef PLATFORM_TEGRA
190 | GstElement *transform = NULL;
191 | #endif
192 | GstBus *bus = NULL;
193 | guint bus_watch_id;
194 | GstPad *seg_src_pad = NULL;
195 | guint i, num_sources;
196 | guint tiler_rows, tiler_columns;
197 | guint pgie_batch_size;
198 |
199 | /* Check input arguments */
200 | if (argc < 3) {
201 | g_printerr ("Usage: %s config_file \n", argv[0]);
202 | return -1;
203 | }
204 | num_sources = argc - 2;
205 |
206 | /* Standard GStreamer initialization */
207 | gst_init (&argc, &argv);
208 | loop = g_main_loop_new (NULL, FALSE);
209 |
210 | /* Create gstreamer elements */
211 | /* Create Pipeline element that will form a connection of other elements */
212 | pipeline = gst_pipeline_new ("dstest-image-decode-pipeline");
213 |
214 | /* Create nvstreammux instance to form batches from one or more sources. */
215 | streammux = gst_element_factory_make ("nvstreammux", "stream-muxer");
216 |
217 | if (!pipeline || !streammux) {
218 | g_printerr ("One element could not be created. Exiting.\n");
219 | return -1;
220 | }
221 | gst_bin_add (GST_BIN (pipeline), streammux);
222 |
223 | for (i = 0; i < num_sources; i++) {
224 | GstPad *sinkpad, *srcpad;
225 | gchar pad_name[16] = { };
226 | GstElement *source_bin = create_source_bin (i, argv[i + 2]);
227 |
228 | if (!source_bin) {
229 | g_printerr ("Failed to create source bin. Exiting.\n");
230 | return -1;
231 | }
232 |
233 | gst_bin_add (GST_BIN (pipeline), source_bin);
234 |
235 | g_snprintf (pad_name, 15, "sink_%u", i);
236 | sinkpad = gst_element_get_request_pad (streammux, pad_name);
237 | if (!sinkpad) {
238 | g_printerr ("Streammux request sink pad failed. Exiting.\n");
239 | return -1;
240 | }
241 |
242 | srcpad = gst_element_get_static_pad (source_bin, "src");
243 | if (!srcpad) {
244 | g_printerr ("Failed to get src pad of source bin. Exiting.\n");
245 | return -1;
246 | }
247 |
248 | if (gst_pad_link (srcpad, sinkpad) != GST_PAD_LINK_OK) {
249 | g_printerr ("Failed to link source bin to stream muxer. Exiting.\n");
250 | return -1;
251 | }
252 |
253 | gst_object_unref (srcpad);
254 | gst_object_unref (sinkpad);
255 | }
256 |
257 | /* Use nvinfer to infer on batched frame. */
258 | seg = gst_element_factory_make ("nvinfer", "primary-nvinference-engine");
259 |
260 | nvsegvisual = gst_element_factory_make ("nvsegvisual", "nvsegvisual");
261 |
262 | /* Use nvtiler to composite the batched frames into a 2D tiled array based
263 | * on the source of the frames. */
264 | tiler = gst_element_factory_make ("nvmultistreamtiler", "nvtiler");
265 |
266 | #ifdef PLATFORM_TEGRA
267 | transform = gst_element_factory_make ("nvegltransform", "transform");
268 | #endif
269 |
270 | sink = gst_element_factory_make ("nveglglessink", "nvvideo-renderer");
271 |
272 | if (!seg || !nvsegvisual || !tiler || !sink) {
273 | g_printerr ("One element could not be created. Exiting.\n");
274 | return -1;
275 | }
276 |
277 | #ifdef PLATFORM_TEGRA
278 | if(!transform) {
279 | g_printerr ("One tegra element could not be created. Exiting.\n");
280 | return -1;
281 | }
282 | #endif
283 |
284 | g_object_set (G_OBJECT (streammux), "width", MUXER_OUTPUT_WIDTH, "height",
285 | MUXER_OUTPUT_HEIGHT, "batch-size", num_sources,
286 | "batched-push-timeout", MUXER_BATCH_TIMEOUT_USEC, NULL);
287 |
288 | /* Configure the nvinfer element using the nvinfer config file. */
289 | g_object_set (G_OBJECT (seg), "config-file-path", argv[1], NULL);
290 |
291 | /* Override the batch-size set in the config file with the number of sources. */
292 | g_object_get (G_OBJECT (seg), "batch-size", &pgie_batch_size, NULL);
293 | if (pgie_batch_size != num_sources) {
294 | g_printerr
295 | ("WARNING: Overriding infer-config batch-size (%d) with number of sources (%d)\n",
296 | pgie_batch_size, num_sources);
297 | g_object_set (G_OBJECT (seg), "batch-size", num_sources, NULL);
298 | }
299 |
300 | g_object_set (G_OBJECT (nvsegvisual), "batch-size", num_sources, NULL);
301 | g_object_set (G_OBJECT (nvsegvisual), "width", 496, NULL);
302 | g_object_set (G_OBJECT (nvsegvisual), "height", 368, NULL);
303 |
304 | tiler_rows = (guint) sqrt (num_sources);
305 | tiler_columns = (guint) ceil (1.0 * num_sources / tiler_rows);
306 | /* we set the tiler properties here */
307 | g_object_set (G_OBJECT (tiler), "rows", tiler_rows, "columns", tiler_columns,
308 | "width", TILED_OUTPUT_WIDTH, "height", TILED_OUTPUT_HEIGHT, NULL);
309 |
310 | g_object_set(G_OBJECT(sink), "async", FALSE, NULL);
311 |
312 | /* we add a message handler */
313 | bus = gst_pipeline_get_bus (GST_PIPELINE (pipeline));
314 | bus_watch_id = gst_bus_add_watch (bus, bus_call, loop);
315 | gst_object_unref (bus);
316 |
317 | /* Set up the pipeline */
318 | /* Add all elements into the pipeline */
319 | #ifdef PLATFORM_TEGRA
320 | gst_bin_add_many (GST_BIN (pipeline), seg, nvsegvisual, tiler, transform, sink, NULL);
321 | /* we link the elements together
322 | * nvstreammux -> nvinfer -> nvsegvidsual -> nvtiler -> transform -> video-renderer */
323 | if (!gst_element_link_many (streammux, seg, nvsegvisual, tiler, transform, sink, NULL))
324 | {
325 | g_printerr ("Elements could not be linked. Exiting.\n");
326 | return -1;
327 | }
328 | #else
329 | gst_bin_add_many (GST_BIN (pipeline), seg, nvsegvisual, tiler, sink, NULL);
330 | /* Link the elements together
331 | * nvstreammux -> nvinfer -> nvsegvisual -> nvtiler -> video-renderer */
332 | if (!gst_element_link_many (streammux, seg, nvsegvisual, tiler, sink, NULL)) {
333 | g_printerr ("Elements could not be linked. Exiting.\n");
334 | return -1;
335 | }
336 | #endif
337 |
338 | /* Add a probe to get informed of the generated metadata. We add the probe to
339 | * the src pad of the nvinfer element, since by that time the buffer will
340 | * have all the segmentation metadata. */
341 | seg_src_pad = gst_element_get_static_pad (seg, "src");
342 | if (!seg_src_pad)
343 | g_print ("Unable to get src pad\n");
344 | else
345 | gst_pad_add_probe (seg_src_pad, GST_PAD_PROBE_TYPE_BUFFER,
346 | tiler_src_pad_buffer_probe, NULL, NULL);
347 |
348 | /* Set the pipeline to "playing" state */
349 | g_print ("Now playing:");
350 | for (i = 0; i < num_sources; i++) {
351 | g_print (" %s,", argv[i + 2]);
352 | }
353 | g_print ("\n");
354 | gst_element_set_state (pipeline, GST_STATE_PLAYING);
355 |
356 | /* Wait till pipeline encounters an error or EOS */
357 | g_print ("Running...\n");
358 | g_main_loop_run (loop);
359 |
360 | /* Out of the main loop, clean up nicely */
361 | g_print ("Returned, stopping playback\n");
362 | gst_element_set_state (pipeline, GST_STATE_NULL);
363 | g_print ("Deleting pipeline\n");
364 | gst_object_unref (GST_OBJECT (pipeline));
365 | g_source_remove (bus_watch_id);
366 | g_main_loop_unref (loop);
367 | return 0;
368 | }
369 |
--------------------------------------------------------------------------------
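One way the empty TODO loop in tiler_src_pad_buffer_probe() above could be filled in: a sketch based on the segmentation user meta that nvinfer attaches in this mode (the types and the NVDSINFER_SEGMENTATION_META constant come from gstnvdsinfer.h, which would need to be included; treat the snippet as an assumption, not part of the repo).

```
/* Sketch: body of the per-frame loop in tiler_src_pad_buffer_probe(). */
NvDsFrameMeta *frame_meta = (NvDsFrameMeta *) l_frame->data;
NvDsMetaList *l_user;
for (l_user = frame_meta->frame_user_meta_list; l_user != NULL;
     l_user = l_user->next) {
  NvDsUserMeta *user_meta = (NvDsUserMeta *) l_user->data;
  if (user_meta->base_meta.meta_type == NVDSINFER_SEGMENTATION_META) {
    NvDsInferSegmentationMeta *seg_meta =
        (NvDsInferSegmentationMeta *) user_meta->user_meta_data;
    /* width/height/classes/class_map are the fields filled by
     * fillSegmentationOutput() in libs/nvdsinfer above. */
    g_print ("segmentation map %ux%u, %u classes\n",
        seg_meta->width, seg_meta->height, seg_meta->classes);
  }
}
```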
/todo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cding-nv/deepstream-openpose/6db238cd4490655e2768bafed3ca4b8f5f778d25/todo.jpg
--------------------------------------------------------------------------------