├── COCO_val2014_000000000564_deepstream_infer.jpg ├── COCO_val2014_000000000564_infer.jpg ├── README.md ├── libs └── nvdsinfer │ ├── Makefile │ ├── README │ ├── nms_cpu.cpp │ ├── nms_cpu.h │ ├── nvdsinfer_context_impl.cpp │ ├── nvdsinfer_context_impl.h │ ├── nvdsinfer_context_impl_capi.cpp │ ├── nvdsinfer_context_impl_output_parsing.cpp │ ├── nvdsinfer_conversion.cu │ ├── nvdsinfer_conversion.h │ ├── resize_merge_cpu.cpp │ └── resize_merge_cpu.h ├── openpose_app ├── COCO_val2014_000000000564.jpg ├── COCO_val2014_000000000569.jpg ├── Makefile ├── README ├── nvinfer_config.txt └── openpose_app.c └── todo.jpg /COCO_val2014_000000000564_deepstream_infer.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cding-nv/deepstream-openpose/6db238cd4490655e2768bafed3ca4b8f5f778d25/COCO_val2014_000000000564_deepstream_infer.jpg -------------------------------------------------------------------------------- /COCO_val2014_000000000564_infer.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cding-nv/deepstream-openpose/6db238cd4490655e2768bafed3ca4b8f5f778d25/COCO_val2014_000000000564_infer.jpg -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # deepstream-openpose 2 | 3 | 4 | ## 1. Run [CMU](https://github.com/CMU-Perceptual-Computing-Lab/openpose) pose demo 5 | Platform: Xavier, JetPack 4.3 with DeepStream 4.0.2 6 | 7 | Notes: 8 | 1. Porting to the Tesla/x86 platform should be easy. 9 | 2. CUDA_cublas_device_LIBRARY NOTFOUND issue -> 10 | [Solution](https://forums.developer.nvidia.com/t/cuda-blas-libraries-not-installed/107908/18?u=chrisding) 11 | 3. Refer to `openpose/scripts/ubuntu/install_deps.sh` to install the dependency libraries. 12 | 4. Refer to `openpose/models/getModels.sh` to fetch the models. 13 | 5. Build. 14 | ``` 15 | $ cmake -D CMAKE_BUILD_TYPE=Debug .. 16 | $ make -j4 17 | ``` 18 | 6. These demos should work: 19 | ``` 20 | $ ./build/examples/openpose/openpose.bin 21 | $ ./build/examples/tutorial_api_cpp/01_body_from_image_default.bin 22 | $ ... 23 | ``` 24 | The result looks like this. 25 | 26 |

27 | (inference result image) 28 |

29 | 30 | ## 2. Deploy the pose COCO model with DeepStream 4.0.2 31 | Model: `pose/coco/pose_iter_440000.caffemodel`, `pose/coco/pose_deploy_linevec.prototxt` 32 | 33 | Pipeline (a code sketch of this pipeline follows the Run step below): 34 | > filesrc -> jpegparse -> nvv4l2decoder -> nvstreammux -> nvinfer (openpose inference and 18-part parsing) 35 | nvsegvisual -> nvmultistreamtiler -> (nvegltransform) -> nveglglessink 36 | 37 | ### Build libnvds_infer.so 38 | ``` 39 | $ cd libs/nvdsinfer 40 | $ make 41 | Back up /opt/nvidia/deepstream/deepstream-4.0/lib/libnvds_infer.so first 42 | $ sudo ln -sf $(pwd)/libnvds_infer.so /opt/nvidia/deepstream/deepstream-4.0/lib/libnvds_infer.so 43 | ``` 44 | 45 | ### Build openpose-app 46 | ``` 47 | $ cd openpose_app 48 | $ make 49 | Change "model-file" and "proto-file" in nvinfer_config.txt to your own paths 50 | model-file= 51 | proto-file= 52 | ``` 53 | 54 | ### Run 55 | ``` 56 | $ ./openpose-app ./nvinfer_config.txt COCO_val2014_000000000564.jpg 57 | ``` 58 |

59 | (DeepStream inference result image) 60 |

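The repo's `openpose_app/openpose_app.c` (not reproduced in this dump) is what actually builds the pipeline quoted above. Purely as orientation, a pipeline of that shape is typically assembled as in the C++-style sketch below; the element names come from the README, but the variable names, property values, and the 1280x720 nvstreammux resolution are illustrative assumptions rather than the app's real code, and element/error checking is omitted for brevity.

```
// Illustrative sketch only -- see openpose_app/openpose_app.c for the real application.
// Build roughly with: g++ sketch.cpp $(pkg-config --cflags --libs gstreamer-1.0)
#include <gst/gst.h>

int main(int argc, char *argv[]) {
    gst_init(&argc, &argv);

    GstElement *pipeline  = gst_pipeline_new("openpose-pipeline");
    GstElement *source    = gst_element_factory_make("filesrc",            "file-source");
    GstElement *parser    = gst_element_factory_make("jpegparse",          "jpeg-parser");
    GstElement *decoder   = gst_element_factory_make("nvv4l2decoder",      "nv-decoder");
    GstElement *streammux = gst_element_factory_make("nvstreammux",        "stream-muxer");
    GstElement *infer     = gst_element_factory_make("nvinfer",            "primary-infer");
    GstElement *segvisual = gst_element_factory_make("nvsegvisual",        "seg-visual");
    GstElement *tiler     = gst_element_factory_make("nvmultistreamtiler", "tiler");
    GstElement *transform = gst_element_factory_make("nvegltransform",     "egl-transform"); // Jetson only
    GstElement *sink      = gst_element_factory_make("nveglglessink",      "egl-sink");

    // Input image and the nvinfer configuration file shipped in this repo.
    g_object_set(G_OBJECT(source), "location", "COCO_val2014_000000000564.jpg", NULL);
    g_object_set(G_OBJECT(infer), "config-file-path", "nvinfer_config.txt", NULL);
    g_object_set(G_OBJECT(streammux), "batch-size", 1, "width", 1280, "height", 720, NULL); // illustrative size

    gst_bin_add_many(GST_BIN(pipeline), source, parser, decoder, streammux,
                     infer, segvisual, tiler, transform, sink, NULL);

    // The decoder feeds nvstreammux through a request pad ("sink_0").
    GstPad *mux_sink = gst_element_get_request_pad(streammux, "sink_0");
    GstPad *dec_src  = gst_element_get_static_pad(decoder, "src");
    gst_pad_link(dec_src, mux_sink);
    gst_object_unref(dec_src);
    gst_object_unref(mux_sink);

    gst_element_link_many(source, parser, decoder, NULL);
    gst_element_link_many(streammux, infer, segvisual, tiler, transform, sink, NULL);

    gst_element_set_state(pipeline, GST_STATE_PLAYING);
    GstBus *bus = gst_element_get_bus(pipeline);
    gst_bus_timed_pop_filtered(bus, GST_CLOCK_TIME_NONE,
                               (GstMessageType)(GST_MESSAGE_ERROR | GST_MESSAGE_EOS));
    gst_element_set_state(pipeline, GST_STATE_NULL);
    gst_object_unref(bus);
    gst_object_unref(pipeline);
    return 0;
}
```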
61 | 62 | ## 3. TODO 63 | Add a dsexample plugin after nvinfer and run [resize_merge](./libs/nvdsinfer/resize_merge_cpu.cpp), [nms](./libs/nvdsinfer/nms_cpu.cpp) and BodyPartConnector there, then show the result with nvosd as in the image below (a sketch of how these CPU helpers might be invoked follows this section). 64 |

65 | (expected result image) 66 |

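Once a dsexample-style plugin (or a pad probe) has the raw nvinfer output tensor on the CPU, the helpers above can be driven directly. The snippet below is a minimal, hypothetical driver for `nmsCpu()` from `libs/nvdsinfer/nms_cpu.cpp`, using the tensor dimensions quoted in that file's comments (57 channels, 368x496 heatmaps, 97x3 peak entries per channel). It assumes the `std::array` template arguments stripped from the listing in this dump are `<int, 4>`, as in the upstream OpenPose implementation, fills the heatmap buffer with dummy zeros, and uses an arbitrary 0.05 threshold.

```
// Hypothetical driver for the CPU NMS helper in libs/nvdsinfer/nms_cpu.cpp.
// Assumes nmsCpu's size parameters are std::array<int, 4> (as in upstream OpenPose).
#include <array>
#include <vector>
#include "nms_cpu.h"

int main() {
    // NCHW-style descriptors, values taken from the comments inside nmsCpu().
    const std::array<int, 4> sourceSize{1, 57, 368, 496}; // heatmaps: 57 x 368 x 496
    const std::array<int, 4> targetSize{1, 57, 97, 3};    // per channel: peak count + up to 96 (x, y, score) triples

    const int srcElems = 57 * 368 * 496;
    const int dstElems = 57 * 97 * 3;

    std::vector<float> heatmaps(srcElems, 0.f); // in practice, copied from the nvinfer output layer
    std::vector<float> peaks(dstElems, 0.f);    // peak output written by nmsCpu()
    std::vector<int>   kernel(srcElems, 0);     // scratch buffer for the per-pixel NMS pass

    const float threshold = 0.05f;              // illustrative confidence threshold
    nmsCpu(peaks.data(), kernel.data(), heatmaps.data(), threshold, targetSize, sourceSize);

    // peaks[c * 97 * 3] now holds the peak count for channel c; the (x, y, score)
    // triples that follow are what a BodyPartConnector-style pairing step would consume.
    return 0;
}
```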
67 | -------------------------------------------------------------------------------- /libs/nvdsinfer/Makefile: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # NVIDIA Corporation and its licensors retain all intellectual property 5 | # and proprietary rights in and to this software, related documentation 6 | # and any modifications thereto. Any use, reproduction, disclosure or 7 | # distribution of this software and related documentation without an express 8 | # license agreement from NVIDIA Corporation is strictly prohibited. 9 | ################################################################################# 10 | 11 | CUDA_VER?= 12 | ifeq ($(CUDA_VER),) 13 | $(error "CUDA_VER is not set") 14 | endif 15 | 16 | NVCC:=/usr/local/cuda-$(CUDA_VER)/bin/nvcc 17 | CXX:= g++ 18 | SRCS:= nvdsinfer_context_impl.cpp nvdsinfer_context_impl_capi.cpp \ 19 | nvdsinfer_context_impl_output_parsing.cpp nvdsinfer_conversion.cu \ 20 | nms_cpu.cpp resize_merge_cpu.cpp 21 | INCS:= $(wildcard *.h) 22 | LIB:=libnvds_infer.so 23 | 24 | NVDS_VERSION:=4.0 25 | 26 | LIB_INSTALL_DIR?=/opt/nvidia/deepstream/deepstream-$(NVDS_VERSION)/lib/ 27 | 28 | CFLAGS+= -fPIC -g -std=c++11 \ 29 | -I /usr/local/cuda-$(CUDA_VER)/include \ 30 | -I /opt/nvidia/deepstream/deepstream-4.0/sources/includes/ 31 | 32 | CFLAGS+= `pkg-config --cflags gstreamer-1.0` 33 | 34 | LIBS := -shared -g -Wl,-no-undefined \ 35 | -lnvinfer -lnvinfer_plugin -lnvonnxparser -lnvcaffe_parser \ 36 | -L/usr/local/cuda-$(CUDA_VER)/lib64/ -lcudart \ 37 | -lopencv_objdetect -lopencv_imgproc -lopencv_core 38 | 39 | LIBS+= `pkg-config --libs gstreamer-1.0` 40 | 41 | LIBS+= -L$(LIB_INSTALL_DIR) -lnvdsgst_helper -lnvdsgst_meta -lnvds_meta \ 42 | -lnvds_inferutils -ldl \ 43 | -Wl,-rpath,$(LIB_INSTALL_DIR) 44 | 45 | 46 | OBJS:= $(SRCS:.cpp=.o) 47 | OBJS:= $(OBJS:.cu=.o) 48 | 49 | all: $(LIB) 50 | 51 | %.o: %.cpp $(INCS) Makefile 52 | @echo $(CFLAGS) 53 | $(CXX) -c -o $@ $(CFLAGS) $< 54 | 55 | %.o: %.cu $(INCS) Makefile 56 | @echo $(CFLAGS) 57 | $(NVCC) -c -o $@ --compiler-options '-fPIC' $< 58 | 59 | $(LIB): $(OBJS) $(DEP) Makefile 60 | @echo $(CFLAGS) 61 | $(CXX) -o $@ $(OBJS) $(LIBS) 62 | 63 | install: $(LIB) 64 | cp -rv $(LIB) $(LIB_INSTALL_DIR) 65 | 66 | clean: 67 | rm -rf $(OBJS) $(LIB) 68 | -------------------------------------------------------------------------------- /libs/nvdsinfer/README: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # NVIDIA Corporation and its licensors retain all intellectual property 5 | # and proprietary rights in and to this software, related documentation 6 | # and any modifications thereto. Any use, reproduction, disclosure or 7 | # distribution of this software and related documentation without an express 8 | # license agreement from NVIDIA Corporation is strictly prohibited. 9 | # 10 | ################################################################################ 11 | 12 | Refer to the DeepStream SDK documentation for a description of the "nvinfer" 13 | plugin and "NvDsInfer" API. 
14 | 15 | -------------------------------------------------------------------------------- 16 | Pre-requisites: 17 | - TensorRT 5.1+ development package 18 | - OpenCV 3.4.0+ development package 19 | 20 | Please refer to the TensorRT documentation for installing the TensorRT development 21 | package. 22 | 23 | To install OpenCV development pacakge 24 | sudo apt-get install libopencv-dev 25 | 26 | -------------------------------------------------------------------------------- 27 | Compiling and installing the plugin: 28 | Export or set in Makefile the appropriate CUDA_VER 29 | Run make and sudo make install 30 | -------------------------------------------------------------------------------- /libs/nvdsinfer/nms_cpu.cpp: -------------------------------------------------------------------------------- 1 | #include "nms_cpu.h" 2 | //#include 3 | 4 | #define error printf 5 | 6 | template 7 | void nmsRegisterKernelCPU(int* kernelPtr, const T* const sourcePtr, const int w, const int h, 8 | const T& threshold, const int x, const int y) 9 | { 10 | // We have three scenarios for NMS, one for the border, 1 for the 1st inner border, and 11 | // 1 for the rest. cv::resize adds artifacts around the 1st inner border, causing two 12 | // maximas to occur side by side. Eg. [1 1 0.8 0.8 0.5 ..]. The CUDA kernel gives 13 | // [0.8 1 0.8 0.8 0.5 ..] Hence for this special case in the 1st inner border, we look at the 14 | // visible regions. 15 | 16 | const auto index = y*w + x; 17 | if (1 < x && x < (w-2) && 1 < y && y < (h-2)) 18 | { 19 | const auto value = sourcePtr[index]; 20 | if (value > threshold) 21 | { 22 | const auto topLeft = sourcePtr[(y-1)*w + x-1]; 23 | const auto top = sourcePtr[(y-1)*w + x]; 24 | const auto topRight = sourcePtr[(y-1)*w + x+1]; 25 | const auto left = sourcePtr[ y*w + x-1]; 26 | const auto right = sourcePtr[ y*w + x+1]; 27 | const auto bottomLeft = sourcePtr[(y+1)*w + x-1]; 28 | const auto bottom = sourcePtr[(y+1)*w + x]; 29 | const auto bottomRight = sourcePtr[(y+1)*w + x+1]; 30 | 31 | if (value > topLeft && value > top && value > topRight 32 | && value > left && value > right 33 | && value > bottomLeft && value > bottom && value > bottomRight) 34 | kernelPtr[index] = 1; 35 | else 36 | kernelPtr[index] = 0; 37 | } 38 | else 39 | kernelPtr[index] = 0; 40 | } 41 | else if (x == 1 || x == (w-2) || y == 1 || y == (h-2)) 42 | { 43 | //kernelPtr[index] = 0; 44 | const auto value = sourcePtr[index]; 45 | if (value > threshold) 46 | { 47 | const auto topLeft = ((0 < x && 0 < y) ? sourcePtr[(y-1)*w + x-1] : threshold); 48 | const auto top = (0 < y ? sourcePtr[(y-1)*w + x] : threshold); 49 | const auto topRight = ((0 < y && x < (w-1)) ? sourcePtr[(y-1)*w + x+1] : threshold); 50 | const auto left = (0 < x ? sourcePtr[ y*w + x-1] : threshold); 51 | const auto right = (x < (w-1) ? sourcePtr[y*w + x+1] : threshold); 52 | const auto bottomLeft = ((y < (h-1) && 0 < x) ? sourcePtr[(y+1)*w + x-1] : threshold); 53 | const auto bottom = (y < (h-1) ? sourcePtr[(y+1)*w + x] : threshold); 54 | const auto bottomRight = ((x < (w-1) && y < (h-1)) ? 
sourcePtr[(y+1)*w + x+1] : threshold); 55 | 56 | if (value >= topLeft && value >= top && value >= topRight 57 | && value >= left && value >= right 58 | && value >= bottomLeft && value >= bottom && value >= bottomRight) 59 | kernelPtr[index] = 1; 60 | else 61 | kernelPtr[index] = 0; 62 | } 63 | else 64 | kernelPtr[index] = 0; 65 | } 66 | else 67 | kernelPtr[index] = 0; 68 | } 69 | 70 | template 71 | void nmsAccuratePeakPosition(T* output, const T* const sourcePtr, const int& peakLocX, const int& peakLocY, 72 | const int& width, const int& height) 73 | { 74 | T xAcc = 0.f; 75 | T yAcc = 0.f; 76 | T scoreAcc = 0.f; 77 | const auto dWidth = 3; 78 | const auto dHeight = 3; 79 | for (auto dy = -dHeight ; dy <= dHeight ; dy++) 80 | { 81 | const auto y = peakLocY + dy; 82 | if (0 <= y && y < height) // Default height = 368 83 | { 84 | for (auto dx = -dWidth ; dx <= dWidth ; dx++) 85 | { 86 | const auto x = peakLocX + dx; 87 | if (0 <= x && x < width) // Default width = 656 88 | { 89 | const auto score = sourcePtr[y * width + x]; 90 | if (score > 0) 91 | { 92 | xAcc += x*score; 93 | yAcc += y*score; 94 | scoreAcc += score; 95 | } 96 | } 97 | } 98 | } 99 | } 100 | 101 | // Offset to keep Matlab format (empirically higher acc) 102 | // Best results for 1 scale: x + 0, y + 0.5 103 | // +0.5 to both to keep Matlab format 104 | // Hard code offset x=y=0.5 105 | output[0] = xAcc / scoreAcc + 0.5; 106 | output[1] = yAcc / scoreAcc + 0.5; 107 | output[2] = sourcePtr[peakLocY*width + peakLocX]; 108 | } 109 | 110 | template 111 | void nmsCpu(T* targetPtr, int* kernelPtr, const T* const sourcePtr, const T threshold, 112 | const std::array& targetSize, const std::array& sourceSize) 113 | { 114 | try 115 | { 116 | // Sanity checks 117 | if (sourceSize.empty()) 118 | printf("sourceSize cannot be empty. %d, %s, %s\n", __LINE__, __FUNCTION__, __FILE__); 119 | if (targetSize.empty()) 120 | printf("targetSize cannot be empty. %d, %s, %s\n", __LINE__, __FUNCTION__, __FILE__); 121 | if (threshold < 0 || threshold > 1.0) 122 | printf("threshold value invalid. 
%d, %s, %s\n", __LINE__, __FUNCTION__, __FILE__); 123 | 124 | // Params 125 | const auto channels = targetSize[1]; // 57 126 | const auto sourceHeight = sourceSize[2]; // 368 127 | const auto sourceWidth = sourceSize[3]; // 496 128 | const auto targetPeaks = targetSize[2]; // 97 129 | const auto targetPeakVec = targetSize[3]; // 3 130 | const auto sourceChannelOffset = sourceWidth * sourceHeight; 131 | const auto targetChannelOffset = targetPeaks * targetPeakVec; 132 | 133 | // Per channel operation 134 | for (auto c = 0 ; c < channels ; c++) 135 | { 136 | auto* currKernelPtr = &kernelPtr[c*sourceChannelOffset]; 137 | const T* currSourcePtr = &sourcePtr[c*sourceChannelOffset]; 138 | 139 | for (auto y = 0; y < sourceHeight; y++) 140 | for (auto x = 0; x < sourceWidth; x++) 141 | nmsRegisterKernelCPU(currKernelPtr, currSourcePtr, sourceWidth, sourceHeight, threshold, x, y); 142 | 143 | auto currentPeakCount = 1; 144 | auto* currTargetPtr = &targetPtr[c*targetChannelOffset]; 145 | for (auto y = 0; y < sourceHeight; y++) 146 | { 147 | for (auto x = 0; x < sourceWidth; x++) 148 | { 149 | const auto index = y*sourceWidth + x; 150 | // Find high intensity points 151 | if (currentPeakCount < targetPeaks) 152 | { 153 | if (currKernelPtr[index] == 1) 154 | { 155 | // Accurate Peak Position 156 | nmsAccuratePeakPosition(&currTargetPtr[currentPeakCount*3], currSourcePtr, x, y, 157 | sourceWidth, sourceHeight); 158 | currentPeakCount++; 159 | } 160 | } 161 | } 162 | } 163 | currTargetPtr[0] = T(currentPeakCount-1); 164 | } 165 | } 166 | catch (const std::exception& e) 167 | { 168 | printf("exception: %s, %d, %s, %s\n", e.what(), __LINE__, __FUNCTION__, __FILE__); 169 | } 170 | } 171 | 172 | template void nmsCpu( 173 | float* targetPtr, int* kernelPtr, const float* const sourcePtr, const float threshold, 174 | const std::array& targetSize, const std::array& sourceSize); 175 | template void nmsCpu( 176 | double* targetPtr, int* kernelPtr, const double* const sourcePtr, const double threshold, 177 | const std::array& targetSize, const std::array& sourceSize); 178 | 179 | -------------------------------------------------------------------------------- /libs/nvdsinfer/nms_cpu.h: -------------------------------------------------------------------------------- 1 | #ifndef NMS_CPU_H 2 | #define NMS_CPU_H 3 | 4 | #include 5 | 6 | template 7 | void nmsCpu( 8 | T* targetPtr, int* kernelPtr, const T* const sourcePtr, const T threshold, const std::array& targetSize, 9 | const std::array& sourceSize); 10 | 11 | #endif // NMS_CPU_H -------------------------------------------------------------------------------- /libs/nvdsinfer/nvdsinfer_context_impl.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * NVIDIA Corporation and its licensors retain all intellectual property 5 | * and proprietary rights in and to this software, related documentation 6 | * and any modifications thereto. Any use, reproduction, disclosure or 7 | * distribution of this software and related documentation without an express 8 | * license agreement from NVIDIA Corporation is strictly prohibited. 
9 | * 10 | */ 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | #include "nvtx3/nvToolsExtCudaRt.h" 21 | 22 | #include "nvdsinfer_context_impl.h" 23 | #include "nvdsinfer_conversion.h" 24 | 25 | #include 26 | #include 27 | #include 28 | 29 | /* Function types for custom library interfaces. */ 30 | 31 | using NvDsInferPluginFactoryCaffeGetFcn = decltype (&NvDsInferPluginFactoryCaffeGet); 32 | using NvDsInferPluginFactoryCaffeDestroyFcn = decltype (&NvDsInferPluginFactoryCaffeDestroy); 33 | 34 | using NvDsInferPluginFactoryUffGetFcn = decltype (&NvDsInferPluginFactoryUffGet); 35 | using NvDsInferPluginFactoryUffDestroyFcn = decltype (&NvDsInferPluginFactoryUffDestroy); 36 | 37 | using NvDsInferPluginFactoryRuntimeGetFcn = decltype (&NvDsInferPluginFactoryRuntimeGet); 38 | using NvDsInferPluginFactoryRuntimeDestroyFcn = decltype (&NvDsInferPluginFactoryRuntimeDestroy); 39 | 40 | using NvDsInferInitializeInputLayersFcn = decltype (&NvDsInferInitializeInputLayers); 41 | 42 | using NvDsInferCudaEngineGetFcn = decltype (&NvDsInferCudaEngineGet); 43 | 44 | /* Pair data type for returning input back to caller. */ 45 | using NvDsInferReturnInputPair = std::pair; 46 | 47 | static const int WORKSPACE_SIZE = 450 * 1024 * 1024; 48 | 49 | using namespace nvinfer1; 50 | using namespace std; 51 | 52 | /* 53 | * TensorRT INT8 Calibration implementation. This implementation requires 54 | * pre-generated INT8 Calibration Tables. Please refer TensorRT documentation 55 | * for information on the calibration tables and the procedure for creating the 56 | * tables. 57 | * 58 | * Since this implementation only reads from pre-generated calibration tables, 59 | * readCalibrationCache is requires to be implemented. 60 | */ 61 | class NvDsInferInt8Calibrator : public IInt8EntropyCalibrator2 62 | { 63 | public: 64 | NvDsInferInt8Calibrator(string calibrationTableFile) : 65 | m_CalibrationTableFile(calibrationTableFile) 66 | { 67 | } 68 | 69 | ~NvDsInferInt8Calibrator() 70 | { 71 | } 72 | 73 | int 74 | getBatchSize() const override 75 | { 76 | return 0; 77 | } 78 | 79 | bool 80 | getBatch(void* bindings[], const char* names[], int nbBindings) override 81 | { 82 | return false; 83 | } 84 | 85 | /* Reads calibration table file contents into a buffer and returns a pointer 86 | * to the buffer. 87 | */ 88 | const void* 89 | readCalibrationCache(size_t& length) override 90 | { 91 | m_CalibrationCache.clear(); 92 | ifstream input(m_CalibrationTableFile, std::ios::binary); 93 | input >> noskipws; 94 | if (input.good()) 95 | copy(std::istream_iterator(input), 96 | istream_iterator(), 97 | back_inserter(m_CalibrationCache)); 98 | 99 | length = m_CalibrationCache.size(); 100 | return length ? 
m_CalibrationCache.data() : nullptr; 101 | } 102 | 103 | void 104 | writeCalibrationCache(const void* cache, size_t length) override 105 | { 106 | } 107 | 108 | private: 109 | string m_CalibrationTableFile; 110 | vector m_CalibrationCache; 111 | }; 112 | 113 | /** 114 | * Get the size of the element from the data type 115 | */ 116 | inline unsigned int 117 | getElementSize(NvDsInferDataType t) 118 | { 119 | switch (t) 120 | { 121 | case INT32: 122 | return 4; 123 | case FLOAT: 124 | return 4; 125 | case HALF: 126 | return 2; 127 | case INT8: 128 | return 1; 129 | } 130 | 131 | return 0; 132 | } 133 | 134 | static inline bool 135 | string_empty(char *str) 136 | { 137 | return strlen(str) == 0; 138 | } 139 | 140 | static inline bool 141 | file_accessible (char *path) 142 | { 143 | return (access(path, F_OK) != -1); 144 | } 145 | 146 | /* Cuda callback function for returning input back to client. */ 147 | static void 148 | returnInputCudaCallback(cudaStream_t stream, cudaError_t status, void* userData) 149 | { 150 | NvDsInferReturnInputPair *pair = (NvDsInferReturnInputPair *) userData; 151 | pair->first(pair->second); 152 | delete pair; 153 | } 154 | 155 | std::mutex NvDsInferContextImpl::DlaExecutionMutex; 156 | 157 | void 158 | NvDsInferContextImpl::NvDsInferLogger::log(Severity severity, const char *msg) 159 | { 160 | NvDsInferLogLevel level; 161 | 162 | switch (severity) 163 | { 164 | case Severity::kINTERNAL_ERROR: 165 | case Severity::kERROR: 166 | level = NVDSINFER_LOG_ERROR; 167 | break; 168 | case Severity::kWARNING: 169 | level = NVDSINFER_LOG_WARNING; 170 | break; 171 | case Severity::kINFO: 172 | level = NVDSINFER_LOG_DEBUG; 173 | break; 174 | default: 175 | return; 176 | } 177 | 178 | callLogFunc(handle, handle->m_UniqueID, level, __func__, handle->m_LoggingFunc, 179 | handle->m_UserCtx, msg); 180 | } 181 | 182 | /* Default constructor. */ 183 | NvDsInferContextImpl::NvDsInferContextImpl() : 184 | INvDsInferContext(), 185 | m_UniqueID(0), 186 | m_DBScanHandle(nullptr), 187 | m_CustomLibHandle(nullptr), 188 | m_CustomBBoxParseFunc(nullptr), 189 | m_CustomClassifierParseFunc(nullptr), 190 | m_RuntimePluginFactory(nullptr), 191 | m_GpuID (0), 192 | m_DlaEnabled (false), 193 | m_InferRuntime(nullptr), 194 | m_CudaEngine(nullptr), 195 | m_InferExecutionContext(nullptr), 196 | m_PreProcessStream(nullptr), 197 | m_InferStream(nullptr), 198 | m_BufferCopyStream(nullptr), 199 | m_MeanDataBuffer(nullptr), 200 | m_Batches(NVDSINFER_MIN_OUTPUT_BUFFERPOOL_SIZE), 201 | m_InputConsumedEvent(nullptr), 202 | m_PreProcessCompleteEvent(nullptr), 203 | m_InferCompleteEvent(nullptr), 204 | m_LoggingFunc(nullptr), 205 | m_UserCtx(nullptr), 206 | m_Initialized(false) 207 | { 208 | m_Logger.handle = this; 209 | } 210 | 211 | /* The function performs all the initialization steps required by the inference 212 | * engine. */ 213 | NvDsInferStatus 214 | NvDsInferContextImpl::initialize(NvDsInferContextInitParams &initParams, 215 | void *userCtx, NvDsInferContextLoggingFunc logFunc) 216 | { 217 | cudaError_t cudaReturn; 218 | bool generateModel = true; 219 | std::string nvtx_name; 220 | 221 | m_LoggingFunc = logFunc; 222 | m_UserCtx = userCtx; 223 | 224 | /* Synchronization using once_flag and call_once to ensure TensorRT plugin 225 | * initialization function is called only once in case of multiple instances 226 | * of this constructor being called from different threads. 
*/ 227 | { 228 | static once_flag pluginInitFlag; 229 | call_once(pluginInitFlag, 230 | [this]() { initLibNvInferPlugins(&this->m_Logger, ""); } ); 231 | } 232 | 233 | m_UniqueID = initParams.uniqueID; 234 | m_MaxBatchSize = initParams.maxBatchSize; 235 | m_NetworkScaleFactor = initParams.networkScaleFactor; 236 | m_NetworkInputFormat = initParams.networkInputFormat; 237 | m_NetworkType = initParams.networkType; 238 | m_UseDBScan = initParams.useDBScan; 239 | 240 | m_ClassifierThreshold = initParams.classifierThreshold; 241 | m_SegmentationThreshold = initParams.segmentationThreshold; 242 | m_GpuID = initParams.gpuID; 243 | m_CopyInputToHostBuffers = initParams.copyInputToHostBuffers; 244 | m_OutputBufferPoolSize = initParams.outputBufferPoolSize; 245 | m_Batches.resize(m_OutputBufferPoolSize); 246 | 247 | if (m_UniqueID == 0) 248 | { 249 | printError("Unique ID not set"); 250 | return NVDSINFER_CONFIG_FAILED; 251 | } 252 | 253 | if (m_MaxBatchSize > NVDSINFER_MAX_BATCH_SIZE) 254 | { 255 | printError ("Batch-size (%d) more than maximum allowed batch-size (%d)", 256 | initParams.maxBatchSize, NVDSINFER_MAX_BATCH_SIZE); 257 | return NVDSINFER_CONFIG_FAILED; 258 | } 259 | 260 | if (initParams.numOutputLayers > 0 && initParams.outputLayerNames == nullptr) 261 | { 262 | printError("NumOutputLayers > 0 but outputLayerNames array not specified"); 263 | return NVDSINFER_CONFIG_FAILED; 264 | } 265 | 266 | switch (m_NetworkType) 267 | { 268 | case NvDsInferNetworkType_Detector: 269 | m_NumDetectedClasses = initParams.numDetectedClasses; 270 | if (initParams.numDetectedClasses > 0 && initParams.perClassDetectionParams == nullptr) 271 | { 272 | printError("NumDetectedClasses > 0 but PerClassDetectionParams array not specified"); 273 | return NVDSINFER_CONFIG_FAILED; 274 | } 275 | 276 | m_PerClassDetectionParams.assign(initParams.perClassDetectionParams, 277 | initParams.perClassDetectionParams + m_NumDetectedClasses); 278 | m_DetectionParams.numClassesConfigured = initParams.numDetectedClasses; 279 | m_DetectionParams.perClassThreshold.resize(initParams.numDetectedClasses); 280 | 281 | /* Resize the per class vector to the number of detected classes. */ 282 | m_PerClassObjectList.resize(initParams.numDetectedClasses); 283 | if (!m_UseDBScan) 284 | { 285 | m_PerClassCvRectList.resize(initParams.numDetectedClasses); 286 | } 287 | 288 | /* Fill the class thresholds in the m_DetectionParams structure. This 289 | * will be required during parsing. 
*/ 290 | for (unsigned int i = 0; i < initParams.numDetectedClasses; i++) 291 | { 292 | m_DetectionParams.perClassThreshold[i] = 293 | m_PerClassDetectionParams[i].threshold; 294 | } 295 | break; 296 | case NvDsInferNetworkType_Classifier: 297 | break; 298 | case NvDsInferNetworkType_Segmentation: 299 | break; 300 | case NvDsInferNetworkType_Other: 301 | break; 302 | default: 303 | printError("Unsupported network type"); 304 | return NVDSINFER_CONFIG_FAILED; 305 | } 306 | 307 | switch (initParams.networkMode) 308 | { 309 | case NvDsInferNetworkMode_FP32: 310 | case NvDsInferNetworkMode_FP16: 311 | case NvDsInferNetworkMode_INT8: 312 | break; 313 | default: 314 | printError("Unsupported network dataType"); 315 | return NVDSINFER_CONFIG_FAILED; 316 | } 317 | 318 | if (m_OutputBufferPoolSize < NVDSINFER_MIN_OUTPUT_BUFFERPOOL_SIZE) 319 | { 320 | printError("Output buffer pool size (%d) less than minimum required(%d)", 321 | m_OutputBufferPoolSize, NVDSINFER_MIN_OUTPUT_BUFFERPOOL_SIZE); 322 | return NVDSINFER_CONFIG_FAILED; 323 | } 324 | 325 | /* Set the cuda device to be used. */ 326 | cudaReturn = cudaSetDevice(m_GpuID); 327 | if (cudaReturn != cudaSuccess) 328 | { 329 | printError("Failed to set cuda device (%s).", cudaGetErrorName(cudaReturn)); 330 | return NVDSINFER_CUDA_ERROR; 331 | } 332 | 333 | /* Create the API root class. */ 334 | m_InferRuntime = createInferRuntime(m_Logger); 335 | if (!m_InferRuntime) 336 | { 337 | printError("Failed to create Infer runtime engine."); 338 | return NVDSINFER_TENSORRT_ERROR; 339 | } 340 | 341 | /* Load the custom library if specified. */ 342 | if (!string_empty(initParams.customLibPath)) 343 | { 344 | m_CustomLibHandle = dlopen (initParams.customLibPath, RTLD_LAZY); 345 | if (!m_CustomLibHandle) 346 | { 347 | printError("Could not open custom lib: %s", dlerror()); 348 | return NVDSINFER_CUSTOM_LIB_FAILED; 349 | } 350 | } 351 | 352 | /* If the custom library is specified, check if PluginFactory instance is 353 | * required during deserialization of cuda engine. */ 354 | NvDsInferPluginFactoryRuntimeGetFcn fcn = nullptr; 355 | if (m_CustomLibHandle) 356 | { 357 | fcn = (NvDsInferPluginFactoryRuntimeGetFcn) 358 | dlsym(m_CustomLibHandle, "NvDsInferPluginFactoryRuntimeGet"); 359 | if (fcn) 360 | { 361 | if (!fcn(m_RuntimePluginFactory)) 362 | { 363 | printError("Failed to get runtime plugin factory instance" 364 | " from custom library."); 365 | return NVDSINFER_CUSTOM_LIB_FAILED; 366 | } 367 | } 368 | } 369 | 370 | if (!string_empty(initParams.modelEngineFilePath)) 371 | { 372 | if (useEngineFile(initParams) == NVDSINFER_SUCCESS) 373 | { 374 | generateModel = false; 375 | } 376 | } 377 | 378 | if (generateModel) 379 | { 380 | NvDsInferStatus status; 381 | IHostMemory *gieModelStream; 382 | printInfo("Trying to create engine from model files"); 383 | 384 | /* Create the gie Model stream from the model files and other parameters. */ 385 | status = generateTRTModel(initParams, gieModelStream); 386 | if (status != NVDSINFER_SUCCESS) 387 | { 388 | printError("Failed to create engine from model files"); 389 | return status; 390 | } 391 | 392 | /* Use DLA if specified. */ 393 | if (initParams.useDLA) 394 | { 395 | m_InferRuntime->setDLACore(initParams.dlaCore); 396 | } 397 | 398 | /* Create the cuda engine from the serialized stream. */ 399 | m_CudaEngine = 400 | m_InferRuntime->deserializeCudaEngine(gieModelStream->data(), 401 | gieModelStream->size(), 402 | m_RuntimePluginFactory); 403 | /* Destroy the model stream, since cuda engine has been serialized. 
*/ 404 | gieModelStream->destroy(); 405 | 406 | if (!m_CudaEngine) 407 | { 408 | printError("Failed to create engine from serialized stream"); 409 | return NVDSINFER_TENSORRT_ERROR; 410 | } 411 | if (checkEngineParams(initParams) != NVDSINFER_SUCCESS) 412 | { 413 | return NVDSINFER_CONFIG_FAILED; 414 | } 415 | } 416 | 417 | m_DlaEnabled = initParams.useDLA; 418 | 419 | /* Get the network input dimensions. */ 420 | DimsCHW inputDims = 421 | static_cast(m_CudaEngine->getBindingDimensions(INPUT_LAYER_INDEX)); 422 | m_NetworkInfo.width = inputDims.w(); 423 | m_NetworkInfo.height = inputDims.h(); 424 | m_NetworkInfo.channels = inputDims.c(); 425 | 426 | switch (m_NetworkInputFormat) 427 | { 428 | case NvDsInferFormat_RGB: 429 | case NvDsInferFormat_BGR: 430 | if (m_NetworkInfo.channels != 3) 431 | { 432 | printError("RGB/BGR input format specified but network input" 433 | " channels is not 3"); 434 | return NVDSINFER_CONFIG_FAILED; 435 | } 436 | break; 437 | case NvDsInferFormat_GRAY: 438 | if (m_NetworkInfo.channels != 1) 439 | { 440 | printError("GRAY input format specified but network input " 441 | "channels is not 1."); 442 | return NVDSINFER_CONFIG_FAILED; 443 | } 444 | break; 445 | default: 446 | printError("Unknown input format"); 447 | return NVDSINFER_CONFIG_FAILED; 448 | } 449 | 450 | /* Create the mean data buffer from mean image file or per color component 451 | * offsets if either are specified. */ 452 | if (!string_empty(initParams.meanImageFilePath) || initParams.numOffsets > 0) 453 | { 454 | /* Mean Image File specified. Allocate the mean image buffer on device 455 | * memory. */ 456 | cudaReturn = cudaMalloc((void **)&m_MeanDataBuffer, 457 | m_NetworkInfo.width * m_NetworkInfo.height * 458 | m_NetworkInfo.channels * sizeof (float)); 459 | if (cudaReturn != cudaSuccess) 460 | { 461 | printError("Failed to allocate cuda buffer for mean image(%s)", 462 | cudaGetErrorName(cudaReturn)); 463 | return NVDSINFER_CUDA_ERROR; 464 | } 465 | /* Read the mean image file (PPM format) if specified and copy the 466 | * contents into the buffer. */ 467 | if (!string_empty(initParams.meanImageFilePath)) 468 | { 469 | if (!file_accessible(initParams.meanImageFilePath)) 470 | { 471 | printError("Cannot access mean image file '%s'", 472 | initParams.meanImageFilePath); 473 | return NVDSINFER_CONFIG_FAILED; 474 | } 475 | NvDsInferStatus status = readMeanImageFile(initParams.meanImageFilePath); 476 | if (status != NVDSINFER_SUCCESS) 477 | { 478 | printError("Failed to read mean image file"); 479 | return status; 480 | } 481 | } 482 | /* Create the mean data buffer from per-channel offsets. */ 483 | else 484 | { 485 | /* Make sure the number of offsets are equal to the number of input 486 | * channels. 
*/ 487 | if (initParams.numOffsets != m_NetworkInfo.channels) 488 | { 489 | printError("Number of offsets(%d) not equal to number of input " 490 | "channels(%d)", initParams.numOffsets, 491 | m_NetworkInfo.channels); 492 | return NVDSINFER_CONFIG_FAILED; 493 | } 494 | 495 | vector meanData( 496 | m_NetworkInfo.channels * m_NetworkInfo.width * 497 | m_NetworkInfo.height); 498 | for (size_t j = 0; j < m_NetworkInfo.width * m_NetworkInfo.height; j++) 499 | { 500 | for (size_t i = 0; i < m_NetworkInfo.channels; i++) 501 | { 502 | meanData[j * m_NetworkInfo.channels + i] = initParams.offsets[i]; 503 | } 504 | } 505 | cudaReturn = cudaMemcpy(m_MeanDataBuffer, meanData.data(), 506 | meanData.size() * sizeof(float), cudaMemcpyHostToDevice); 507 | if (cudaReturn != cudaSuccess) 508 | { 509 | printError("Failed to copy mean data to mean data cuda buffer(%s)", 510 | cudaGetErrorName(cudaReturn)); 511 | return NVDSINFER_CUDA_ERROR; 512 | } 513 | } 514 | } 515 | 516 | /* Get information on all bound layers. */ 517 | getBoundLayersInfo(); 518 | 519 | /* Create the Infer Execution Context. */ 520 | m_InferExecutionContext = m_CudaEngine->createExecutionContext(); 521 | if (!m_InferExecutionContext) 522 | { 523 | printError("Failed to create Infer Execution Context"); 524 | return NVDSINFER_TENSORRT_ERROR; 525 | } 526 | 527 | /* Create the cuda stream on which pre-processing jobs will be executed. */ 528 | cudaReturn = cudaStreamCreateWithFlags(&m_PreProcessStream, 529 | cudaStreamNonBlocking); 530 | if (cudaReturn != cudaSuccess) 531 | { 532 | printError("Failed to create cudaStream(%s)", 533 | cudaGetErrorName(cudaReturn)); 534 | return NVDSINFER_TENSORRT_ERROR; 535 | } 536 | nvtx_name = "nvdsinfer_preprocess_uid=" + to_string(m_UniqueID); 537 | nvtxNameCudaStreamA (m_PreProcessStream, nvtx_name.c_str()); 538 | 539 | /* Create the cuda stream on which inference jobs will be executed. */ 540 | cudaReturn = cudaStreamCreateWithFlags(&m_InferStream, cudaStreamNonBlocking); 541 | if (cudaReturn != cudaSuccess) 542 | { 543 | printError("Failed to create cudaStream(%s)", 544 | cudaGetErrorName(cudaReturn)); 545 | return NVDSINFER_CUDA_ERROR; 546 | } 547 | nvtx_name = "nvdsinfer_infer_uid=" + to_string(m_UniqueID); 548 | nvtxNameCudaStreamA (m_InferStream, nvtx_name.c_str()); 549 | 550 | /* Create the cuda stream on which device to host memcpy jobs will be 551 | * executed. */ 552 | cudaReturn = cudaStreamCreateWithFlags (&m_BufferCopyStream, 553 | cudaStreamNonBlocking); 554 | if (cudaReturn != cudaSuccess) 555 | { 556 | printError("Failed to create cudaStream(%s)", 557 | cudaGetErrorName(cudaReturn)); 558 | return NVDSINFER_CUDA_ERROR; 559 | } 560 | nvtx_name = "nvdsinfer_DtoHcopy_uid=" + to_string(m_UniqueID); 561 | nvtxNameCudaStreamA (m_BufferCopyStream, nvtx_name.c_str()); 562 | 563 | /* Allocate binding buffers on the device and the corresponding host 564 | * buffers. */ 565 | NvDsInferStatus status = allocateBuffers(); 566 | if (status != NVDSINFER_SUCCESS) 567 | { 568 | printError("Failed to allocate buffers"); 569 | return status; 570 | } 571 | 572 | /* Parse the labels file if specified. 
*/ 573 | if (!string_empty(initParams.labelsFilePath)) 574 | { 575 | if (!file_accessible(initParams.labelsFilePath)) 576 | { 577 | printError("Could not access labels file '%s'", initParams.labelsFilePath); 578 | return NVDSINFER_CONFIG_FAILED; 579 | } 580 | NvDsInferStatus status = parseLabelsFile(initParams.labelsFilePath); 581 | if (status != NVDSINFER_SUCCESS) 582 | { 583 | printError("Failed to read labels file"); 584 | return status; 585 | } 586 | } 587 | 588 | /* Cuda event to synchronize between consumption of input binding buffer by 589 | * the cuda engine and the pre-processing kernel which writes to the input 590 | * binding buffer. */ 591 | cudaReturn = cudaEventCreateWithFlags(&m_InputConsumedEvent, 592 | cudaEventDisableTiming); 593 | if (cudaReturn != cudaSuccess) 594 | { 595 | printError("Failed to create cuda event(%s)", cudaGetErrorName(cudaReturn)); 596 | return NVDSINFER_CUDA_ERROR; 597 | } 598 | nvtx_name = "nvdsinfer_TRT_input_consumed_uid=" + to_string(m_UniqueID); 599 | nvtxNameCudaEventA (m_InputConsumedEvent, nvtx_name.c_str()); 600 | 601 | /* Cuda event to synchronize between completion of the pre-processing kernels 602 | * and enqueuing the next set of binding buffers for inference. */ 603 | cudaReturn = cudaEventCreateWithFlags(&m_PreProcessCompleteEvent, 604 | cudaEventDisableTiming); 605 | if (cudaReturn != cudaSuccess) 606 | { 607 | printError("Failed to create cuda event(%s)", cudaGetErrorName(cudaReturn)); 608 | return NVDSINFER_CUDA_ERROR; 609 | } 610 | nvtx_name = "nvdsinfer_preprocess_complete_uid=" + to_string(m_UniqueID); 611 | nvtxNameCudaEventA (m_PreProcessCompleteEvent, nvtx_name.c_str()); 612 | 613 | /* Cuda event to synchronize between completion of inference on a batch 614 | * and copying the output contents from device to host memory. */ 615 | cudaReturn = cudaEventCreateWithFlags(&m_InferCompleteEvent, 616 | cudaEventDisableTiming); 617 | if (cudaReturn != cudaSuccess) 618 | { 619 | printError("Failed to create cuda event(%s)", cudaGetErrorName(cudaReturn)); 620 | return NVDSINFER_CUDA_ERROR; 621 | } 622 | nvtx_name = "nvdsinfer_infer_complete_uid=" + to_string(m_UniqueID); 623 | nvtxNameCudaEventA (m_InferCompleteEvent, nvtx_name.c_str()); 624 | 625 | /* If custom parse function is specified get the function address from the 626 | * custom library. */ 627 | if (m_CustomLibHandle && m_NetworkType == NvDsInferNetworkType_Detector && 628 | !string_empty(initParams.customBBoxParseFuncName)) 629 | { 630 | m_CustomBBoxParseFunc = 631 | (NvDsInferParseCustomFunc) dlsym(m_CustomLibHandle, 632 | initParams.customBBoxParseFuncName); 633 | if (!m_CustomBBoxParseFunc) 634 | { 635 | printError("Could not find parse func '%s' in custom library", 636 | initParams.customBBoxParseFuncName); 637 | return NVDSINFER_CONFIG_FAILED; 638 | } 639 | } 640 | 641 | if (m_CustomLibHandle && m_NetworkType == NvDsInferNetworkType_Classifier && 642 | !string_empty(initParams.customClassifierParseFuncName)) 643 | { 644 | m_CustomClassifierParseFunc = 645 | (NvDsInferClassiferParseCustomFunc) dlsym(m_CustomLibHandle, 646 | initParams.customClassifierParseFuncName); 647 | if (!m_CustomClassifierParseFunc) 648 | { 649 | printError("Could not find parse func '%s' in custom library", 650 | initParams.customClassifierParseFuncName); 651 | return NVDSINFER_CONFIG_FAILED; 652 | } 653 | } 654 | 655 | /* If there are more than one input layers (non-image input) and custom 656 | * library is specified, try to initialize these layers. 
*/ 657 | if (m_AllLayerInfo.size() > 1 + m_OutputLayerInfo.size()) 658 | { 659 | NvDsInferStatus status = initNonImageInputLayers(); 660 | if (status != NVDSINFER_SUCCESS) 661 | { 662 | printError("Failed to initialize non-image input layers"); 663 | return status; 664 | } 665 | } 666 | 667 | if (m_UseDBScan) 668 | { 669 | m_DBScanHandle = NvDsInferDBScanCreate(); 670 | } 671 | 672 | m_Initialized = true; 673 | 674 | return NVDSINFER_SUCCESS; 675 | } 676 | 677 | /* Get the network input resolution. This is required since this implementation 678 | * requires that the caller supplies an input buffer having the network 679 | * resolution. 680 | */ 681 | void 682 | NvDsInferContextImpl::getNetworkInfo(NvDsInferNetworkInfo &networkInfo) 683 | { 684 | networkInfo = m_NetworkInfo; 685 | } 686 | 687 | /* Allocate binding buffers for all bound layers on the device memory. The size 688 | * of the buffers allocated is calculated from the dimensions of the layers, the 689 | * data type of the layer and the max batch size of the infer cuda engine. 690 | * 691 | * NvDsInfer enqueue API requires an array of (void *) buffer pointers. The length 692 | * of the array is equal to the number of bound layers. The buffer corresponding 693 | * to a layer is placed at an index equal to the layer's binding index. 694 | * 695 | * Also allocate corresponding host buffers for output layers in system memory. 696 | * 697 | * Multiple sets of the device and host buffers are allocated so that (inference + 698 | * device to host copy) and output layers parsing can be parallelized. 699 | */ 700 | NvDsInferStatus 701 | NvDsInferContextImpl::allocateBuffers() 702 | { 703 | cudaError_t cudaReturn; 704 | 705 | // m_CudaEngine->createExecutionContext(); 706 | /* Resize the binding buffers vector to the number of bound layers. */ 707 | m_BindingBuffers.assign(m_AllLayerInfo.size(), nullptr); 708 | 709 | for (unsigned int i = 0; i < m_AllLayerInfo.size(); i++) 710 | { 711 | size_t size = m_MaxBatchSize * m_AllLayerInfo[i].dims.numElements * 712 | getElementSize(m_AllLayerInfo[i].dataType); 713 | 714 | /* Do not allocate device memory for output layers here. */ 715 | if (!m_CudaEngine->bindingIsInput(i)) 716 | continue; 717 | 718 | /* Allocate device memory for the binding buffer. */ 719 | cudaReturn = cudaMalloc(&m_BindingBuffers[i], size); 720 | if (cudaReturn != cudaSuccess) 721 | { 722 | printError("Failed to allocate cuda buffer(%s)", 723 | cudaGetErrorName(cudaReturn)); 724 | return NVDSINFER_CUDA_ERROR; 725 | } 726 | } 727 | 728 | /* Initialize the batch vector, allocate host memory for the layers, 729 | * add all the free indexes to the free queue. */ 730 | for (unsigned int i = 0; i < m_Batches.size(); i++) 731 | { 732 | NvDsInferBatch & batch = m_Batches[i]; 733 | /* Resize the host buffers vector to the number of bound layers. */ 734 | batch.m_HostBuffers.resize(m_AllLayerInfo.size()); 735 | batch.m_DeviceBuffers.assign(m_AllLayerInfo.size(), nullptr); 736 | 737 | 738 | for (unsigned int j = 0; j < m_AllLayerInfo.size(); j++) 739 | { 740 | size_t size = m_MaxBatchSize * m_AllLayerInfo[j].dims.numElements * 741 | getElementSize(m_AllLayerInfo[j].dataType); 742 | 743 | if (m_CudaEngine->bindingIsInput(j)) 744 | { 745 | /* Reuse input binding buffer pointers. */ 746 | batch.m_DeviceBuffers[j] = m_BindingBuffers[j]; 747 | } 748 | else 749 | { 750 | /* Allocate device memory for output layers here. 
*/ 751 | cudaReturn = cudaMalloc(&batch.m_DeviceBuffers[j], size); 752 | if (cudaReturn != cudaSuccess) 753 | { 754 | printError("Failed to allocate cuda buffer(%s)", 755 | cudaGetErrorName(cudaReturn)); 756 | return NVDSINFER_CUDA_ERROR; 757 | } 758 | } 759 | 760 | /* Allocate host memory for input layers only if application 761 | * needs access to the input layer contents. */ 762 | if (m_CudaEngine->bindingIsInput(j) && !m_CopyInputToHostBuffers) 763 | continue; 764 | 765 | /* Resize the uint8_t vector to the size (in bytes) of the buffer. 766 | * The underlying heap memory can be used as host buffer. */ 767 | batch.m_HostBuffers[j].resize(size); 768 | } 769 | cudaReturn = cudaEventCreateWithFlags (&batch.m_CopyCompleteEvent, 770 | cudaEventDisableTiming | cudaEventBlockingSync); 771 | if (cudaReturn != cudaSuccess) 772 | { 773 | printError("Failed to create cuda event(%s)", 774 | cudaGetErrorName(cudaReturn)); 775 | return NVDSINFER_CUDA_ERROR; 776 | } 777 | 778 | /* Add all the indexes to the free queue initially. */ 779 | m_FreeIndexQueue.push(i); 780 | } 781 | 782 | return NVDSINFER_SUCCESS; 783 | } 784 | 785 | /* Get properties of bound layers like the name, dimension, datatype and 786 | * fill the m_AllLayerInfo and m_OutputLayerInfo vectors. 787 | */ 788 | NvDsInferStatus 789 | NvDsInferContextImpl::getBoundLayersInfo() 790 | { 791 | for (int i = 0; i < m_CudaEngine->getNbBindings(); i++) 792 | { 793 | NvDsInferLayerInfo info; 794 | Dims d = m_CudaEngine->getBindingDimensions(i); 795 | 796 | info.isInput = m_CudaEngine->bindingIsInput(i); 797 | info.bindingIndex = i; 798 | info.layerName = m_CudaEngine->getBindingName(i); 799 | info.dims.numDims = d.nbDims; 800 | info.dims.numElements = 1; 801 | for (int j = 0; j < d.nbDims; j++) 802 | { 803 | info.dims.d[j] = d.d[j]; 804 | info.dims.numElements *= d.d[j]; 805 | } 806 | 807 | switch (m_CudaEngine->getBindingDataType(i)) 808 | { 809 | case DataType::kFLOAT: 810 | info.dataType = FLOAT; 811 | break; 812 | case DataType::kHALF: 813 | info.dataType = HALF; 814 | break; 815 | case DataType::kINT32: 816 | info.dataType = INT32; 817 | break; 818 | case DataType::kINT8: 819 | info.dataType = INT8; 820 | break; 821 | default: 822 | printError("Unknown data type for bound layer i(%s)", 823 | info.layerName); 824 | return NVDSINFER_TENSORRT_ERROR; 825 | } 826 | 827 | m_AllLayerInfo.push_back(info); 828 | if (!m_CudaEngine->bindingIsInput(i)) 829 | m_OutputLayerInfo.push_back(info); 830 | } 831 | return NVDSINFER_SUCCESS; 832 | } 833 | 834 | /* Initialize non-image input layers if the custom library has implemented 835 | * the interface. */ 836 | NvDsInferStatus 837 | NvDsInferContextImpl::initNonImageInputLayers() 838 | { 839 | cudaError_t cudaReturn; 840 | 841 | /* Needs the custom library to be specified. */ 842 | if (m_CustomLibHandle == nullptr) 843 | { 844 | printWarning("More than one input layers but custom initialization " 845 | "function not implemented"); 846 | return NVDSINFER_SUCCESS; 847 | } 848 | 849 | /* Check if the interface to initialize the layers has been implemented. */ 850 | NvDsInferInitializeInputLayersFcn fcn = (NvDsInferInitializeInputLayersFcn) 851 | dlsym(m_CustomLibHandle, "NvDsInferInitializeInputLayers"); 852 | if (fcn == nullptr) 853 | { 854 | printWarning("More than one input layers but custom initialization " 855 | "function not implemented"); 856 | return NVDSINFER_SUCCESS; 857 | } 858 | 859 | /* Interface implemented. */ 860 | /* Vector of NvDsInferLayerInfo for non-image input layers. 
*/ 861 | vector inputLayers; 862 | for (auto &layer : m_AllLayerInfo) 863 | { 864 | if (m_CudaEngine->bindingIsInput(layer.bindingIndex) && 865 | layer.bindingIndex != INPUT_LAYER_INDEX) 866 | { 867 | inputLayers.push_back(layer); 868 | } 869 | } 870 | 871 | /* Vector of host memories that can be initialized using CPUs. */ 872 | vector> initBuffers(inputLayers.size()); 873 | 874 | for (size_t i = 0; i < inputLayers.size(); i++) 875 | { 876 | /* For each layer calculate the size required for the layer, allocate 877 | * the host memory and assign the pointer to layer info structure. */ 878 | size_t size = inputLayers[i].dims.numElements * 879 | getElementSize(inputLayers[i].dataType) * m_MaxBatchSize; 880 | initBuffers[i].resize(size); 881 | inputLayers[i].buffer = (void *) initBuffers[i].data(); 882 | } 883 | 884 | /* Call the input layer initialization function. */ 885 | if (!fcn(inputLayers, m_NetworkInfo, m_MaxBatchSize)) 886 | { 887 | printError("Failed to initialize input layers using " 888 | "NvDsInferInitializeInputLayers() in custom lib"); 889 | return NVDSINFER_CUSTOM_LIB_FAILED; 890 | } 891 | 892 | /* Memcpy the initialized contents from the host memory to device memory for 893 | * layer binding buffers. */ 894 | for (size_t i = 0; i < inputLayers.size(); i++) 895 | { 896 | cudaReturn = cudaMemcpyAsync(m_BindingBuffers[inputLayers[i].bindingIndex], 897 | initBuffers[i].data(), initBuffers[i].size(), 898 | cudaMemcpyHostToDevice, m_InferStream); 899 | if (cudaReturn != cudaSuccess) 900 | { 901 | printError("Failed to copy from host to device memory (%s)", 902 | cudaGetErrorName(cudaReturn)); 903 | return NVDSINFER_CUDA_ERROR; 904 | } 905 | /* Application has requested access to the bound buffer contents. Copy 906 | * the contents to all sets of host buffers. */ 907 | if (m_CopyInputToHostBuffers) 908 | { 909 | for (size_t j = 0; j < m_Batches.size(); j++) 910 | { 911 | for (size_t i = 0; i < inputLayers.size(); i++) 912 | { 913 | m_Batches[j].m_HostBuffers[inputLayers[i].bindingIndex]. 914 | assign(initBuffers[i].begin(), initBuffers[i].end()); 915 | } 916 | } 917 | } 918 | } 919 | cudaReturn = cudaStreamSynchronize(m_InferStream); 920 | if (cudaReturn != cudaSuccess) 921 | { 922 | printError("Failed to synchronize cuda stream(%s)", 923 | cudaGetErrorName(cudaReturn)); 924 | return NVDSINFER_CUDA_ERROR; 925 | } 926 | 927 | return NVDSINFER_SUCCESS; 928 | } 929 | 930 | /* Parse the labels file and extract the class label strings. For format of 931 | * the labels file, please refer to the custom models section in the DeepStreamSDK 932 | * documentation. 933 | */ 934 | NvDsInferStatus 935 | NvDsInferContextImpl::parseLabelsFile(char *labelsFilePath) 936 | { 937 | ifstream labels_file(labelsFilePath); 938 | string delim { ';' }; 939 | while (!labels_file.eof()) 940 | { 941 | string line, word; 942 | vector l; 943 | size_t pos = 0, oldpos = 0; 944 | 945 | getline(labels_file, line, '\n'); 946 | if (line.empty()) 947 | continue; 948 | 949 | while ((pos = line.find(delim, oldpos)) != string::npos) 950 | { 951 | word = line.substr(oldpos, pos - oldpos); 952 | l.push_back(word); 953 | oldpos = pos + delim.length(); 954 | } 955 | l.push_back(line.substr(oldpos)); 956 | m_Labels.push_back(l); 957 | } 958 | return NVDSINFER_SUCCESS; 959 | } 960 | 961 | /* Read the mean image ppm file and copy the mean image data to the mean 962 | * data buffer allocated on the device memory. 
963 | */ 964 | NvDsInferStatus 965 | NvDsInferContextImpl::readMeanImageFile(char *meanImageFilePath) 966 | { 967 | ifstream infile(meanImageFilePath, std::ifstream::binary); 968 | size_t size = m_NetworkInfo.width * m_NetworkInfo.height * 969 | m_NetworkInfo.channels; 970 | char tempMeanDataChar[size]; 971 | float tempMeanDataFloat[size]; 972 | cudaError_t cudaReturn; 973 | 974 | if (!infile.good()) 975 | { 976 | printError("Could not open mean image file '%s'", meanImageFilePath); 977 | return NVDSINFER_CONFIG_FAILED; 978 | } 979 | 980 | string magic, max; 981 | unsigned int h, w; 982 | infile >> magic >> h >> w >> max; 983 | 984 | if (magic != "P3" && magic != "P6") 985 | { 986 | printError("Magic PPM identifier check failed"); 987 | return NVDSINFER_CONFIG_FAILED; 988 | } 989 | 990 | if (w != m_NetworkInfo.width || h != m_NetworkInfo.height) 991 | { 992 | printError("Mismatch between ppm mean image resolution(%d x %d) and " 993 | "network resolution(%d x %d)", w, h, m_NetworkInfo.width, 994 | m_NetworkInfo.height); 995 | return NVDSINFER_CONFIG_FAILED; 996 | } 997 | 998 | infile.get(); 999 | infile.read(tempMeanDataChar, size); 1000 | if (infile.gcount() != (int) size) 1001 | { 1002 | printError("Failed to read sufficient bytes from mean file"); 1003 | return NVDSINFER_CONFIG_FAILED; 1004 | } 1005 | 1006 | for (size_t i = 0; i < size; i++) 1007 | { 1008 | tempMeanDataFloat[i] = (float) tempMeanDataChar[i]; 1009 | } 1010 | 1011 | cudaReturn = cudaMemcpy(m_MeanDataBuffer, tempMeanDataFloat, 1012 | size * sizeof(float), cudaMemcpyHostToDevice); 1013 | if (cudaReturn != cudaSuccess) 1014 | { 1015 | printError("Failed to copy mean data to mean data buffer (%s)", 1016 | cudaGetErrorName(cudaReturn)); 1017 | return NVDSINFER_CUDA_ERROR; 1018 | } 1019 | 1020 | return NVDSINFER_SUCCESS; 1021 | } 1022 | 1023 | NvDsInferStatus 1024 | NvDsInferContextImpl::queueInputBatch(NvDsInferContextBatchInput &batchInput) 1025 | 1026 | { 1027 | unsigned int batchSize = batchInput.numInputFrames; 1028 | unsigned int batchIndex; 1029 | void *bindingBuffers[m_AllLayerInfo.size()]; 1030 | NvDsInferStatus status; 1031 | NvDsInferConvertFcn convertFcn = nullptr; 1032 | 1033 | /* Check that current batch size does not exceed max batch size. */ 1034 | if (batchSize > m_MaxBatchSize) 1035 | { 1036 | printError("Not inferring on batch since it's size(%d) exceeds max batch" 1037 | " size(%d)", batchSize, m_MaxBatchSize); 1038 | return NVDSINFER_INVALID_PARAMS; 1039 | } 1040 | 1041 | /* DLA does not allow enqueuing batches smaller than the engine's maxBatchSize. */ 1042 | int enqueueBatchSize = m_DlaEnabled ? m_MaxBatchSize : batchSize; 1043 | 1044 | /* Set the cuda device to be used. */ 1045 | cudaError_t cudaReturn = cudaSetDevice(m_GpuID); 1046 | if (cudaReturn != cudaSuccess) 1047 | { 1048 | printError("Failed to set cuda device(%s)", cudaGetErrorName(cudaReturn)); 1049 | return NVDSINFER_CUDA_ERROR; 1050 | } 1051 | 1052 | 1053 | /* Make the future jobs on the stream wait till the infer engine consumes 1054 | * the previous contents of the input binding buffer. */ 1055 | cudaReturn = cudaStreamWaitEvent (m_PreProcessStream, m_InputConsumedEvent, 0); 1056 | if (cudaReturn != cudaSuccess) 1057 | { 1058 | printError("Failed to make stream wait on event(%s)", 1059 | cudaGetErrorName(cudaReturn)); 1060 | return NVDSINFER_CUDA_ERROR; 1061 | } 1062 | 1063 | /* Find the required conversion function. 
*/ 1064 | switch (m_NetworkInputFormat) 1065 | { 1066 | case NvDsInferFormat_RGB: 1067 | switch (batchInput.inputFormat) 1068 | { 1069 | case NvDsInferFormat_RGB: 1070 | convertFcn = NvDsInferConvert_C3ToP3Float; 1071 | break; 1072 | case NvDsInferFormat_BGR: 1073 | convertFcn = NvDsInferConvert_C3ToP3RFloat; 1074 | break; 1075 | case NvDsInferFormat_RGBA: 1076 | convertFcn = NvDsInferConvert_C4ToP3Float; 1077 | break; 1078 | case NvDsInferFormat_BGRx: 1079 | convertFcn = NvDsInferConvert_C4ToP3RFloat; 1080 | break; 1081 | default: 1082 | printError("Input format conversion is not supported"); 1083 | return NVDSINFER_INVALID_PARAMS; 1084 | } 1085 | break; 1086 | case NvDsInferFormat_BGR: 1087 | switch (batchInput.inputFormat) 1088 | { 1089 | case NvDsInferFormat_RGB: 1090 | convertFcn = NvDsInferConvert_C3ToP3RFloat; 1091 | break; 1092 | case NvDsInferFormat_BGR: 1093 | convertFcn = NvDsInferConvert_C3ToP3Float; 1094 | break; 1095 | case NvDsInferFormat_RGBA: 1096 | convertFcn = NvDsInferConvert_C4ToP3RFloat; 1097 | break; 1098 | case NvDsInferFormat_BGRx: 1099 | convertFcn = NvDsInferConvert_C4ToP3Float; 1100 | break; 1101 | default: 1102 | printError("Input format conversion is not supported"); 1103 | return NVDSINFER_INVALID_PARAMS; 1104 | } 1105 | break; 1106 | case NvDsInferFormat_GRAY: 1107 | if (batchInput.inputFormat != NvDsInferFormat_GRAY) 1108 | { 1109 | printError("Input frame format is not GRAY."); 1110 | return NVDSINFER_INVALID_PARAMS; 1111 | } 1112 | convertFcn = NvDsInferConvert_C1ToP1Float; 1113 | break; 1114 | default: 1115 | printError("Unsupported network input format"); 1116 | return NVDSINFER_INVALID_PARAMS; 1117 | } 1118 | 1119 | /* For each frame in the input batch convert/copy to the input binding buffer. */ 1120 | for (unsigned int i = 0; i < batchSize; i++) 1121 | { 1122 | float *outPtr = (float *) m_BindingBuffers[INPUT_LAYER_INDEX] + 1123 | i * m_AllLayerInfo[INPUT_LAYER_INDEX].dims.numElements; 1124 | 1125 | /* Input needs to be pre-processed. */ 1126 | convertFcn(outPtr, (unsigned char*) batchInput.inputFrames[i], 1127 | m_NetworkInfo.width, m_NetworkInfo.height, 1128 | batchInput.inputPitch, m_NetworkScaleFactor, 1129 | m_MeanDataBuffer, m_PreProcessStream); 1130 | } 1131 | 1132 | /* We may use multiple sets of the output device and host buffers since while the 1133 | * output of one batch is being parsed on the CPU, we can queue 1134 | * pre-processing and inference of another on the GPU. Pop an index from the 1135 | * free queue. Wait if queue is empty. */ 1136 | { 1137 | unique_lock lock(m_QueueMutex); 1138 | while (m_FreeIndexQueue.empty()) 1139 | { 1140 | m_QueueCondition.wait(lock); 1141 | } 1142 | batchIndex = m_FreeIndexQueue.front(); 1143 | m_FreeIndexQueue.pop(); 1144 | } 1145 | 1146 | /* Inputs can be returned back once pre-processing is complete. */ 1147 | if (batchInput.returnInputFunc) 1148 | { 1149 | cudaReturn = cudaStreamAddCallback(m_PreProcessStream, returnInputCudaCallback, 1150 | new NvDsInferReturnInputPair(batchInput.returnInputFunc, 1151 | batchInput.returnFuncData), 0); 1152 | if (cudaReturn != cudaSuccess) 1153 | { 1154 | printError("Failed to add cudaStream callback for returning input buffers (%s)", 1155 | cudaGetErrorName(cudaReturn)); 1156 | return NVDSINFER_CUDA_ERROR; 1157 | } 1158 | } 1159 | 1160 | /* Fill the array of binding buffers for the current batch. 
*/ 1161 | std::copy(m_Batches[batchIndex].m_DeviceBuffers.begin(), 1162 | m_Batches[batchIndex].m_DeviceBuffers.end(), bindingBuffers); 1163 | 1164 | /* Record CUDA event to synchronize the completion of pre-processing kernels. */ 1165 | cudaReturn = cudaEventRecord(m_PreProcessCompleteEvent, m_PreProcessStream); 1166 | if (cudaReturn != cudaSuccess) 1167 | { 1168 | printError("Failed to record cuda event (%s)", 1169 | cudaGetErrorName(cudaReturn)); 1170 | status = NVDSINFER_CUDA_ERROR; 1171 | goto error; 1172 | } 1173 | 1174 | /* Make the future jobs on the stream wait till pre-processing kernels finish. */ 1175 | cudaReturn = cudaStreamWaitEvent (m_InferStream, m_PreProcessCompleteEvent, 0); 1176 | if (cudaReturn != cudaSuccess) 1177 | { 1178 | printError("Failed to make stream wait on event(%s)", 1179 | cudaGetErrorName(cudaReturn)); 1180 | status = NVDSINFER_CUDA_ERROR; 1181 | goto error; 1182 | } 1183 | 1184 | { 1185 | std::unique_lock deferLock(DlaExecutionMutex, std::defer_lock); 1186 | 1187 | /* IExecutionContext::enqueue is not thread safe in case of DLA */ 1188 | if (m_DlaEnabled) 1189 | deferLock.lock(); 1190 | 1191 | /* Queue the bound buffers for inferencing. */ 1192 | if (!m_InferExecutionContext->enqueue(enqueueBatchSize, bindingBuffers, 1193 | m_InferStream, &m_InputConsumedEvent)) 1194 | { 1195 | printError("Failed to enqueue inference batch"); 1196 | status = NVDSINFER_TENSORRT_ERROR; 1197 | goto error; 1198 | } 1199 | } 1200 | 1201 | /* Record event on m_InferStream to indicate completion of inference on the 1202 | * current batch. */ 1203 | cudaReturn = cudaEventRecord (m_InferCompleteEvent, m_InferStream); 1204 | if (cudaReturn != cudaSuccess) 1205 | { 1206 | printError("Failed to record cuda event (%s)", cudaGetErrorName(cudaReturn)); 1207 | status = NVDSINFER_CUDA_ERROR; 1208 | goto error; 1209 | } 1210 | 1211 | /* Make future copy jobs on the buffer copy stream wait on the infer 1212 | * completion event. */ 1213 | cudaReturn = cudaStreamWaitEvent (m_BufferCopyStream, m_InferCompleteEvent, 0); 1214 | if (cudaReturn != cudaSuccess) 1215 | { 1216 | printError("CUDA Stream failed to wait on event (%s)", 1217 | cudaGetErrorName(cudaReturn)); 1218 | status = NVDSINFER_CUDA_ERROR; 1219 | goto error; 1220 | } 1221 | 1222 | /* Queue the copy of output contents from device to host memory after the 1223 | * infer completion event. 
*/ 1224 | { 1225 | NvDsInferBatch &batch = m_Batches[batchIndex]; 1226 | batch.m_BatchSize = batchSize; 1227 | 1228 | for (unsigned int i = 0; i < m_OutputLayerInfo.size(); i++) 1229 | { 1230 | NvDsInferLayerInfo & info = m_OutputLayerInfo[i]; 1231 | cudaReturn = 1232 | cudaMemcpyAsync(batch.m_HostBuffers[info.bindingIndex].data(), 1233 | batch.m_DeviceBuffers[info.bindingIndex], 1234 | getElementSize(info.dataType) * 1235 | info.dims.numElements * batch.m_BatchSize, 1236 | cudaMemcpyDeviceToHost, m_BufferCopyStream); 1237 | if (cudaReturn != cudaSuccess) 1238 | { 1239 | printError("cudaMemcpyAsync for output buffers failed (%s)", 1240 | cudaGetErrorName(cudaReturn)); 1241 | status = NVDSINFER_CUDA_ERROR; 1242 | goto error; 1243 | } 1244 | } 1245 | if (m_CopyInputToHostBuffers) 1246 | { 1247 | NvDsInferLayerInfo &info = m_AllLayerInfo[INPUT_LAYER_INDEX]; 1248 | cudaReturn = 1249 | cudaMemcpyAsync(batch.m_HostBuffers[info.bindingIndex].data(), 1250 | m_BindingBuffers[info.bindingIndex], 1251 | getElementSize(info.dataType) * 1252 | info.dims.numElements * batch.m_BatchSize, 1253 | cudaMemcpyDeviceToHost, m_BufferCopyStream); 1254 | if (cudaReturn != cudaSuccess) 1255 | { 1256 | printError("cudaMemcpyAsync for input layer failed (%s)", 1257 | cudaGetErrorName(cudaReturn)); 1258 | status = NVDSINFER_CUDA_ERROR; 1259 | goto error; 1260 | } 1261 | } 1262 | /* Record CUDA event to later synchronize for the copy to actually 1263 | * complete. */ 1264 | cudaReturn = cudaEventRecord(batch.m_CopyCompleteEvent, 1265 | m_BufferCopyStream); 1266 | if (cudaReturn != cudaSuccess) 1267 | { 1268 | printError("Failed to record cuda event (%s)", 1269 | cudaGetErrorName(cudaReturn)); 1270 | status = NVDSINFER_CUDA_ERROR; 1271 | goto error; 1272 | } 1273 | } 1274 | 1275 | /* Push the batch index into the processing queue. */ 1276 | { 1277 | unique_lock lock(m_QueueMutex); 1278 | m_ProcessIndexQueue.push(batchIndex); 1279 | m_QueueCondition.notify_one(); 1280 | } 1281 | return NVDSINFER_SUCCESS; 1282 | 1283 | error: 1284 | { 1285 | unique_lock lock(m_QueueMutex); 1286 | m_FreeIndexQueue.push(batchIndex); 1287 | } 1288 | return status; 1289 | } 1290 | 1291 | /* Dequeue batch output of the inference engine for each batch input. */ 1292 | NvDsInferStatus 1293 | NvDsInferContextImpl::dequeueOutputBatch(NvDsInferContextBatchOutput &batchOutput) 1294 | { 1295 | unsigned int batchIndex; 1296 | 1297 | /* Set the cuda device */ 1298 | cudaError_t cudaReturn = cudaSetDevice(m_GpuID); 1299 | if (cudaReturn != cudaSuccess) 1300 | { 1301 | printError("Failed to set cuda device (%s)", cudaGetErrorName(cudaReturn)); 1302 | return NVDSINFER_CUDA_ERROR; 1303 | } 1304 | 1305 | /* Pop a batch index from the process queue. Wait if 1306 | * the queue is empty. */ 1307 | { 1308 | unique_lock lock(m_QueueMutex); 1309 | while (m_ProcessIndexQueue.empty()) 1310 | { 1311 | m_QueueCondition.wait(lock); 1312 | } 1313 | batchIndex = m_ProcessIndexQueue.front(); 1314 | m_ProcessIndexQueue.pop(); 1315 | } 1316 | NvDsInferBatch & batch = m_Batches[batchIndex]; 1317 | 1318 | /* Wait for the copy to the current set of host buffers to complete. 
*/ 1319 | cudaReturn = cudaEventSynchronize (batch.m_CopyCompleteEvent); 1320 | if (cudaReturn != cudaSuccess) 1321 | { 1322 | printError("Failed to synchronize on cuda event (%s)", 1323 | cudaGetErrorName(cudaReturn)); 1324 | { 1325 | unique_lock lock(m_QueueMutex); 1326 | m_FreeIndexQueue.push(batchIndex); 1327 | m_QueueCondition.notify_one(); 1328 | } 1329 | return NVDSINFER_CUDA_ERROR; 1330 | } 1331 | 1332 | batchOutput.frames = new NvDsInferFrameOutput[batch.m_BatchSize]; 1333 | batchOutput.numFrames = batch.m_BatchSize; 1334 | /* For each frame in the current batch, parse the output and add the frame 1335 | * output to the batch output. The number of frames output in one batch 1336 | * will be equal to the number of frames present in the batch during queuing 1337 | * at the input. 1338 | */ 1339 | for (unsigned int index = 0; index < batch.m_BatchSize; index++) 1340 | { 1341 | NvDsInferFrameOutput &frameOutput = batchOutput.frames[index]; 1342 | frameOutput.outputType = NvDsInferNetworkType_Other; 1343 | 1344 | /* Calculate the pointer to the output for each frame in the batch for 1345 | * each output layer buffer. The NvDsInferLayerInfo vector for output 1346 | * layers is passed to the output parsing function. */ 1347 | for (unsigned int i = 0; i < m_OutputLayerInfo.size(); i++) 1348 | { 1349 | NvDsInferLayerInfo & info = m_OutputLayerInfo[i]; 1350 | info.buffer = 1351 | (void *)(batch.m_HostBuffers[info.bindingIndex].data() + 1352 | info.dims.numElements * 1353 | getElementSize(info.dataType) * index); 1354 | } 1355 | 1356 | switch (m_NetworkType) 1357 | { 1358 | case NvDsInferNetworkType_Detector: 1359 | fillDetectionOutput(frameOutput.detectionOutput); 1360 | frameOutput.outputType = NvDsInferNetworkType_Detector; 1361 | break; 1362 | case NvDsInferNetworkType_Classifier: 1363 | fillClassificationOutput(frameOutput.classificationOutput); 1364 | frameOutput.outputType = NvDsInferNetworkType_Classifier; 1365 | break; 1366 | case NvDsInferNetworkType_Segmentation: 1367 | fillSegmentationOutput(frameOutput.segmentationOutput); 1368 | frameOutput.outputType = NvDsInferNetworkType_Segmentation; 1369 | break; 1370 | default: 1371 | break; 1372 | } 1373 | } 1374 | 1375 | /* Fill the host buffers information in the output. */ 1376 | batchOutput.outputBatchID = batchIndex; 1377 | batchOutput.numHostBuffers = m_AllLayerInfo.size(); 1378 | batchOutput.hostBuffers = new void*[m_AllLayerInfo.size()]; 1379 | for (size_t i = 0; i < batchOutput.numHostBuffers; i++) 1380 | { 1381 | batchOutput.hostBuffers[i] = m_Batches[batchIndex].m_HostBuffers[i].data(); 1382 | } 1383 | 1384 | batchOutput.numOutputDeviceBuffers = m_OutputLayerInfo.size(); 1385 | batchOutput.outputDeviceBuffers = new void*[m_OutputLayerInfo.size()]; 1386 | for (size_t i = 0; i < batchOutput.numOutputDeviceBuffers; i++) 1387 | { 1388 | batchOutput.outputDeviceBuffers[i] = 1389 | m_Batches[batchIndex].m_DeviceBuffers[m_OutputLayerInfo[i].bindingIndex]; 1390 | } 1391 | 1392 | /* Mark the set of host buffers as not with the context. */ 1393 | m_Batches[batchIndex].m_BuffersWithContext = false; 1394 | return NVDSINFER_SUCCESS; 1395 | } 1396 | 1397 | /** 1398 | * Release a set of host buffers back to the context. 
1399 |  */
1400 | void
1401 | NvDsInferContextImpl::releaseBatchOutput(NvDsInferContextBatchOutput &batchOutput)
1402 | {
1403 |     unique_lock < std::mutex > lock (m_QueueMutex);
1404 |     unsigned int outputBatchID = batchOutput.outputBatchID;
1405 | 
1406 |     /* Check for a valid id */
1407 |     if (outputBatchID >= m_Batches.size())
1408 |     {
1409 |         printWarning("Tried to release an unknown outputBatchID");
1410 |         return;
1411 |     }
1412 |     /* And if the batch is not already with the context. */
1413 |     if (m_Batches[outputBatchID].m_BuffersWithContext)
1414 |     {
1415 |         printWarning("Tried to release an outputBatchID which is"
1416 |                 " already with the context");
1417 |         return;
1418 |     }
1419 |     m_Batches[outputBatchID].m_BuffersWithContext = true;
1420 |     m_FreeIndexQueue.push (outputBatchID);
1421 |     m_QueueCondition.notify_one ();
1422 | 
1423 |     /* Free memory allocated in dequeueOutputBatch */
1424 |     for (unsigned int i = 0; i < batchOutput.numFrames; i++)
1425 |     {
1426 |         releaseFrameOutput(batchOutput.frames[i]);
1427 |     }
1428 | 
1429 |     delete[] batchOutput.frames;
1430 |     delete[] batchOutput.hostBuffers;
1431 |     delete[] batchOutput.outputDeviceBuffers;
1432 | }
1433 | 
1434 | /**
1435 |  * Fill all the bound layers information in the vector.
1436 |  */
1437 | void
1438 | NvDsInferContextImpl::fillLayersInfo(vector<NvDsInferLayerInfo> &layersInfo)
1439 | {
1440 |     layersInfo.assign (m_AllLayerInfo.begin(), m_AllLayerInfo.end());
1441 | }
1442 | 
1443 | const vector<vector<string>> &
1444 | NvDsInferContextImpl::getLabels()
1445 | {
1446 |     return m_Labels;
1447 | }
1448 | 
1449 | /* Check if the runtime cuda engine is compatible with requested configuration. */
1450 | NvDsInferStatus
1451 | NvDsInferContextImpl::checkEngineParams(NvDsInferContextInitParams &initParams)
1452 | {
1453 |     /* Check if the cuda engine can support requested max batch size. */
1454 |     if ((int) m_MaxBatchSize > m_CudaEngine->getMaxBatchSize())
1455 |     {
1456 |         printWarning("Requested max batch size is greater than the engine's max batch size");
1457 |         return NVDSINFER_CONFIG_FAILED;
1458 |     }
1459 | 
1460 |     for (unsigned int i = 0; i < initParams.numOutputLayers; i++)
1461 |     {
1462 |         int bindingIndex = m_CudaEngine->getBindingIndex(initParams.outputLayerNames[i]);
1463 |         if (bindingIndex == -1 || m_CudaEngine->bindingIsInput(bindingIndex))
1464 |         {
1465 |             printWarning("Could not find output layer '%s' in engine",
1466 |                     initParams.outputLayerNames[i]);
1467 |         }
1468 |     }
1469 | 
1470 |     return NVDSINFER_SUCCESS;
1471 | }
1472 | 
1473 | /* Try to create the Cuda Engine from a serialized file. */
1474 | NvDsInferStatus
1475 | NvDsInferContextImpl::useEngineFile(NvDsInferContextInitParams &initParams)
1476 | {
1477 |     NvDsInferStatus status;
1478 |     size_t size = 0;
1479 |     size_t i = 0;
1480 |     ifstream gieModelFile(initParams.modelEngineFilePath);
1481 |     if (!gieModelFile.good())
1482 |     {
1483 |         printWarning("Failed to read from model engine file");
1484 |         return NVDSINFER_CONFIG_FAILED;
1485 |     }
1486 | 
1487 |     /* Get the engine file size and read contents into a char buffer. */
1488 |     gieModelFile.seekg(0, ios::end);
1489 |     size = gieModelFile.tellg();
1490 |     gieModelFile.seekg(0, ios::beg);
1491 | 
1492 |     std::vector<char> buff(size);
1493 |     while (gieModelFile.get(buff[i]))
1494 |         i++;
1495 |     gieModelFile.close();
1496 | 
1497 |     /* Use DLA if specified. */
1498 |     if (initParams.useDLA)
1499 |     {
1500 |         m_InferRuntime->setDLACore(initParams.dlaCore);
1501 |     }
1502 | 
1503 |     /* Create the cuda engine from the serialized engine file contents.
*/ 1504 | m_CudaEngine = m_InferRuntime->deserializeCudaEngine((void *) buff.data(), 1505 | size, m_RuntimePluginFactory); 1506 | if (!m_CudaEngine) 1507 | { 1508 | printWarning("Failed to create engine from file"); 1509 | return NVDSINFER_TENSORRT_ERROR; 1510 | } 1511 | 1512 | /* Check if the deserialized cuda engine is compatible with requested 1513 | * configuration. */ 1514 | status = checkEngineParams(initParams); 1515 | if (status != NVDSINFER_SUCCESS) 1516 | { 1517 | /* Cannot use deserialized cuda engine. Destroy the engine. */ 1518 | m_CudaEngine->destroy(); 1519 | m_CudaEngine = nullptr; 1520 | } 1521 | return status; 1522 | } 1523 | 1524 | /* Custom unique_ptr subclass with deleter functions for TensorRT objects. */ 1525 | template 1526 | class NvDsInferUniquePtr : public std::unique_ptr 1527 | { 1528 | public: 1529 | NvDsInferUniquePtr(T * t = nullptr) : 1530 | std::unique_ptr(t, [](T *t){if (t) t->destroy();}) 1531 | {} 1532 | }; 1533 | 1534 | /* Create cudaengine for the model from the init params 1535 | * (caffemodel & prototxt/uff/onnx, int8 calibration tables, etc) and return the 1536 | * serialized cuda engine stream. */ 1537 | NvDsInferStatus 1538 | NvDsInferContextImpl::generateTRTModel( 1539 | NvDsInferContextInitParams &initParams, 1540 | IHostMemory *&gieModelStream) 1541 | { 1542 | /* Custom implementation of unique_ptr ensures that corresponding destroy 1543 | * methods of TensorRT objects get called when the pointer variables go out 1544 | * of scope. */ 1545 | NvDsInferUniquePtr builder = nvinfer1::createInferBuilder(m_Logger); 1546 | NvDsInferUniquePtr network = builder->createNetwork (); 1547 | NvDsInferUniquePtr cudaEngine; 1548 | 1549 | NvDsInferUniquePtr caffeParser; 1550 | NvDsInferUniquePtr uffParser; 1551 | NvDsInferUniquePtr onnxParser; 1552 | 1553 | NvDsInferInt8Calibrator pCalibrator(initParams.int8CalibrationFilePath); 1554 | NvDsInferNetworkMode networkMode = initParams.networkMode; 1555 | DataType modelDataType; 1556 | 1557 | stringstream engineFileName; 1558 | 1559 | NvDsInferPluginFactoryCaffe caffePluginFactory{nullptr}; 1560 | NvDsInferPluginFactoryUff uffPluginFactory{nullptr}; 1561 | 1562 | NvDsInferCudaEngineGetFcn cudaEngineGetFcn = nullptr; 1563 | 1564 | switch (networkMode) 1565 | { 1566 | case NvDsInferNetworkMode_FP32: 1567 | case NvDsInferNetworkMode_FP16: 1568 | case NvDsInferNetworkMode_INT8: 1569 | break; 1570 | default: 1571 | printError("Unknown network mode %d", networkMode); 1572 | return NVDSINFER_CONFIG_FAILED; 1573 | } 1574 | 1575 | if (!string_empty(initParams.tltEncodedModelFilePath)) 1576 | { 1577 | /* Use the CUDA engine creation function for TLT encoded models provided 1578 | * by NvDsInferUtils. */ 1579 | cudaEngineGetFcn = NvDsInferCudaEngineGetFromTltModel; 1580 | } 1581 | else if (m_CustomLibHandle) 1582 | { 1583 | /* Get the address of the custom cuda engine creation function if available 1584 | * in the custom lib. 
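 *
 * A custom library opts in by exporting a function with exactly that symbol
 * name. A rough sketch of such an implementation (the prototype is inferred
 * from how cudaEngineGetFcn is invoked below; the real declaration lives in the
 * SDK's nvdsinfer_custom_impl.h, and the tiny ReLU-only network is only a
 * placeholder):
 *
 *     extern "C" bool NvDsInferCudaEngineGet(nvinfer1::IBuilder *builder,
 *             NvDsInferContextInitParams *initParams,
 *             nvinfer1::DataType dataType,
 *             nvinfer1::ICudaEngine *&cudaEngine)
 *     {
 *         nvinfer1::INetworkDefinition *network = builder->createNetwork();
 *         nvinfer1::ITensor *data = network->addInput("data",
 *                 nvinfer1::DataType::kFLOAT, nvinfer1::DimsCHW(3, 368, 496));
 *         nvinfer1::IActivationLayer *relu = network->addActivation(*data,
 *                 nvinfer1::ActivationType::kRELU);
 *         network->markOutput(*relu->getOutput(0));
 *         cudaEngine = builder->buildCudaEngine(*network);
 *         network->destroy();
 *         return cudaEngine != nullptr;
 *     }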
*/ 1585 | cudaEngineGetFcn = (NvDsInferCudaEngineGetFcn) dlsym(m_CustomLibHandle, 1586 | "NvDsInferCudaEngineGet"); 1587 | } 1588 | 1589 | if (networkMode == NvDsInferNetworkMode_INT8) 1590 | { 1591 | /* Check if platform supports INT8 else use FP16 */ 1592 | if (builder->platformHasFastInt8()) 1593 | { 1594 | if (!string_empty(initParams.int8CalibrationFilePath) && 1595 | file_accessible(initParams.int8CalibrationFilePath)) 1596 | { 1597 | /* Set INT8 mode and set the INT8 Calibrator */ 1598 | builder->setInt8Mode(true); 1599 | builder->setInt8Calibrator(&pCalibrator); 1600 | /* modelDataType should be FLOAT for INT8 */ 1601 | modelDataType = DataType::kFLOAT; 1602 | } 1603 | else if (cudaEngineGetFcn != nullptr) 1604 | { 1605 | printWarning("INT8 calibration file not specified/accessible. " 1606 | "INT8 calibration can be done through setDynamicRange " 1607 | "API in 'NvDsInferCreateNetwork' implementation"); 1608 | } 1609 | else 1610 | { 1611 | printWarning("INT8 calibration file not specified. Trying FP16 mode."); 1612 | networkMode = NvDsInferNetworkMode_FP16; 1613 | } 1614 | } 1615 | else 1616 | { 1617 | printWarning("INT8 not supported by platform. Trying FP16 mode."); 1618 | networkMode = NvDsInferNetworkMode_FP16; 1619 | } 1620 | } 1621 | 1622 | if (networkMode == NvDsInferNetworkMode_FP16) 1623 | { 1624 | /* Check if platform supports FP16 else use FP32 */ 1625 | if (builder->platformHasFastFp16()) 1626 | { 1627 | builder->setHalf2Mode(true); 1628 | modelDataType = DataType::kHALF; 1629 | } 1630 | else 1631 | { 1632 | printWarning("FP16 not supported by platform. Using FP32 mode."); 1633 | networkMode = NvDsInferNetworkMode_FP32; 1634 | } 1635 | } 1636 | 1637 | if (networkMode == NvDsInferNetworkMode_FP32) 1638 | { 1639 | modelDataType = DataType::kFLOAT; 1640 | } 1641 | 1642 | /* Set the maximum batch size */ 1643 | builder->setMaxBatchSize(m_MaxBatchSize); 1644 | builder->setMaxWorkspaceSize(WORKSPACE_SIZE); 1645 | 1646 | /* Use DLA if specified. */ 1647 | if (initParams.useDLA) 1648 | { 1649 | builder->setDefaultDeviceType(DeviceType::kDLA); 1650 | builder->setDLACore(initParams.dlaCore); 1651 | builder->allowGPUFallback(true); 1652 | } 1653 | 1654 | /* If the custom network creation function has been specified use that. */ 1655 | if (cudaEngineGetFcn) 1656 | { 1657 | nvinfer1::ICudaEngine *engine = nullptr; 1658 | if (!cudaEngineGetFcn (builder.get(), &initParams, modelDataType, engine) || 1659 | engine == nullptr) 1660 | { 1661 | printError("Failed to create network using custom network creation" 1662 | " function"); 1663 | return NVDSINFER_CUSTOM_LIB_FAILED; 1664 | } 1665 | cudaEngine = engine; 1666 | if (!string_empty(initParams.tltEncodedModelFilePath)) 1667 | { 1668 | engineFileName << initParams.tltEncodedModelFilePath; 1669 | } 1670 | else 1671 | { 1672 | char *cwd = getcwd(NULL, 0); 1673 | engineFileName << cwd << "/model"; 1674 | free(cwd); 1675 | } 1676 | } 1677 | /* Check for caffe model files first. 
*/ 1678 | else if (!string_empty(initParams.modelFilePath) && 1679 | !string_empty(initParams.protoFilePath)) 1680 | { 1681 | if (!file_accessible(initParams.modelFilePath)) 1682 | { 1683 | printError("Cannot access caffemodel file '%s'", 1684 | initParams.modelFilePath); 1685 | return NVDSINFER_CONFIG_FAILED; 1686 | } 1687 | if (!file_accessible(initParams.protoFilePath)) 1688 | { 1689 | printError("Cannot access prototxt file '%s'", 1690 | initParams.protoFilePath); 1691 | return NVDSINFER_CONFIG_FAILED; 1692 | } 1693 | 1694 | caffeParser = nvcaffeparser1::createCaffeParser(); 1695 | /* Check if the custom library provides a PluginFactory for Caffe parsing. */ 1696 | if (m_CustomLibHandle) 1697 | { 1698 | NvDsInferPluginFactoryCaffeGetFcn fcn = 1699 | (NvDsInferPluginFactoryCaffeGetFcn) dlsym(m_CustomLibHandle, 1700 | "NvDsInferPluginFactoryCaffeGet"); 1701 | if (fcn) 1702 | { 1703 | NvDsInferPluginFactoryType type; 1704 | if (!fcn(caffePluginFactory, type)) 1705 | { 1706 | printError("Could not get PluginFactory instance for " 1707 | "Caffe parsing from custom library"); 1708 | return NVDSINFER_CUSTOM_LIB_FAILED; 1709 | } 1710 | /* Use the appropriate API to set the PluginFactory based on its 1711 | * type. */ 1712 | switch (type) 1713 | { 1714 | case PLUGIN_FACTORY: 1715 | caffeParser->setPluginFactory( 1716 | caffePluginFactory.pluginFactory); 1717 | break; 1718 | case PLUGIN_FACTORY_EXT: 1719 | caffeParser->setPluginFactoryExt( 1720 | caffePluginFactory.pluginFactoryExt); 1721 | break; 1722 | case PLUGIN_FACTORY_V2: 1723 | caffeParser->setPluginFactoryV2( 1724 | caffePluginFactory.pluginFactoryV2); 1725 | break; 1726 | default: 1727 | printError("Invalid PluginFactory type returned by " 1728 | "custom library"); 1729 | return NVDSINFER_CUSTOM_LIB_FAILED; 1730 | } 1731 | } 1732 | } 1733 | 1734 | /* Parse the caffe model. */ 1735 | const nvcaffeparser1::IBlobNameToTensor *blobNameToTensor = 1736 | caffeParser->parse(initParams.protoFilePath, 1737 | initParams.modelFilePath, *network, 1738 | modelDataType); 1739 | 1740 | if (!blobNameToTensor) 1741 | { 1742 | printError("Failed while parsing network"); 1743 | return NVDSINFER_TENSORRT_ERROR; 1744 | } 1745 | 1746 | for (unsigned int i = 0; i < initParams.numOutputLayers; i++) 1747 | { 1748 | char *layerName = initParams.outputLayerNames[i]; 1749 | /* Find and mark the coverage layer as output */ 1750 | ITensor *tensor = blobNameToTensor->find(layerName); 1751 | if (!tensor) 1752 | { 1753 | printError("Could not find output layer '%s'", layerName); 1754 | return NVDSINFER_CONFIG_FAILED; 1755 | } 1756 | network->markOutput(*tensor); 1757 | } 1758 | engineFileName << initParams.modelFilePath; 1759 | } 1760 | /* Check for UFF model next. */ 1761 | else if (!string_empty(initParams.uffFilePath)) 1762 | { 1763 | if (!file_accessible(initParams.uffFilePath)) 1764 | { 1765 | printError("Cannot access UFF file '%s'", initParams.uffFilePath); 1766 | return NVDSINFER_CONFIG_FAILED; 1767 | } 1768 | 1769 | //uffParser = nvuffparser::createUffParser(); 1770 | DimsCHW uffInputDims; 1771 | nvuffparser::UffInputOrder uffInputOrder; 1772 | 1773 | /* UFF parsing needs the input layer name. 
*/ 1774 | if (string_empty(initParams.uffInputBlobName)) 1775 | { 1776 | printError("UFF input blob name not provided"); 1777 | return NVDSINFER_CONFIG_FAILED; 1778 | 1779 | } 1780 | 1781 | uffInputDims.c() = initParams.uffDimsCHW.c; 1782 | uffInputDims.h() = initParams.uffDimsCHW.h; 1783 | uffInputDims.w() = initParams.uffDimsCHW.w; 1784 | 1785 | switch (initParams.uffInputOrder) 1786 | { 1787 | case NvDsInferUffInputOrder_kNCHW: 1788 | uffInputOrder = nvuffparser::UffInputOrder::kNCHW; 1789 | break; 1790 | case NvDsInferUffInputOrder_kNHWC: 1791 | uffInputOrder = nvuffparser::UffInputOrder::kNHWC; 1792 | break; 1793 | case NvDsInferUffInputOrder_kNC: 1794 | uffInputOrder = nvuffparser::UffInputOrder::kNC; 1795 | break; 1796 | default: 1797 | printError("Unrecognized uff input order"); 1798 | return NVDSINFER_CONFIG_FAILED; 1799 | } 1800 | 1801 | /* Register the input layer (name, dims and input order). */ 1802 | if (!uffParser->registerInput(initParams.uffInputBlobName, 1803 | uffInputDims, uffInputOrder)) 1804 | { 1805 | printError("Failed to register input blob: %s DimsCHW:(%d,%d,%d) " 1806 | "Order: %s", initParams.uffInputBlobName, initParams.uffDimsCHW.c, 1807 | initParams.uffDimsCHW.h, initParams.uffDimsCHW.w, 1808 | (initParams.uffInputOrder == NvDsInferUffInputOrder_kNHWC ? 1809 | "HWC" : "CHW")); 1810 | return NVDSINFER_CONFIG_FAILED; 1811 | 1812 | } 1813 | /* Register outputs. */ 1814 | for (unsigned int i = 0; i < initParams.numOutputLayers; i++) { 1815 | uffParser->registerOutput(initParams.outputLayerNames[i]); 1816 | } 1817 | 1818 | /* Check if the custom library provides a PluginFactory for UFF parsing. */ 1819 | if (m_CustomLibHandle) 1820 | { 1821 | NvDsInferPluginFactoryUffGetFcn fcn = 1822 | (NvDsInferPluginFactoryUffGetFcn) dlsym(m_CustomLibHandle, 1823 | "NvDsInferPluginFactoryUffGet"); 1824 | if (fcn) 1825 | { 1826 | NvDsInferPluginFactoryType type; 1827 | if (!fcn(uffPluginFactory, type)) 1828 | { 1829 | printError("Could not get PluginFactory instance for UFF" 1830 | " parsing from custom library"); 1831 | return NVDSINFER_CUSTOM_LIB_FAILED; 1832 | } 1833 | /* Use the appropriate API to set the PluginFactory based on its 1834 | * type. 
*/ 1835 | switch (type) 1836 | { 1837 | case PLUGIN_FACTORY: 1838 | uffParser->setPluginFactory( 1839 | uffPluginFactory.pluginFactory); 1840 | break; 1841 | case PLUGIN_FACTORY_EXT: 1842 | uffParser->setPluginFactoryExt( 1843 | uffPluginFactory.pluginFactoryExt); 1844 | break; 1845 | default: 1846 | printError("Invalid PluginFactory type returned by " 1847 | "custom library"); 1848 | return NVDSINFER_CUSTOM_LIB_FAILED; 1849 | } 1850 | } 1851 | } 1852 | 1853 | if (!uffParser->parse(initParams.uffFilePath, 1854 | *network, modelDataType)) 1855 | { 1856 | printError("Failed to parse UFF file: incorrect file or incorrect" 1857 | " input/output blob names"); 1858 | return NVDSINFER_TENSORRT_ERROR; 1859 | } 1860 | engineFileName << initParams.uffFilePath; 1861 | } 1862 | else if (!string_empty(initParams.onnxFilePath)) 1863 | { 1864 | if (!file_accessible(initParams.onnxFilePath)) 1865 | { 1866 | printError("Cannot access ONNX file '%s'", initParams.onnxFilePath); 1867 | return NVDSINFER_CONFIG_FAILED; 1868 | } 1869 | onnxParser = nvonnxparser::createParser(*network, m_Logger); 1870 | 1871 | if (!onnxParser->parseFromFile(initParams.onnxFilePath, 1872 | (int) ILogger::Severity::kWARNING)) 1873 | { 1874 | printError("Failed to parse onnx file"); 1875 | return NVDSINFER_TENSORRT_ERROR; 1876 | } 1877 | engineFileName << initParams.onnxFilePath; 1878 | } 1879 | else 1880 | { 1881 | printError("No model files specified"); 1882 | return NVDSINFER_CONFIG_FAILED; 1883 | } 1884 | 1885 | if (!cudaEngineGetFcn) 1886 | { 1887 | /* Build the engine */ 1888 | cudaEngine = builder->buildCudaEngine(*network); 1889 | } 1890 | if (cudaEngine == nullptr) 1891 | { 1892 | printError("Failed while building cuda engine for network"); 1893 | return NVDSINFER_TENSORRT_ERROR; 1894 | } 1895 | 1896 | /* Serialize the network into a stream and return the stream pointer since 1897 | * the cuda engine is valid only for the lifetime of the builder. */ 1898 | gieModelStream = cudaEngine->serialize(); 1899 | 1900 | /* Optionally write the stream to a file which can used during next run. */ 1901 | engineFileName << "_b" << m_MaxBatchSize << "_"; 1902 | if (initParams.useDLA) 1903 | engineFileName << "dla_"; 1904 | engineFileName << ((networkMode == NvDsInferNetworkMode_FP32) ? "fp32" : 1905 | (networkMode == NvDsInferNetworkMode_FP16) ? "fp16" : "int8") 1906 | << ".engine"; 1907 | printInfo("Storing the serialized cuda engine to file at %s", 1908 | engineFileName.str().c_str()); 1909 | ofstream gieModelFileOut(engineFileName.str()); 1910 | gieModelFileOut.write((char *) gieModelStream->data(), 1911 | gieModelStream->size()); 1912 | 1913 | cudaEngine.reset (); 1914 | 1915 | /* Destroy the plugin factory instances. */ 1916 | if (caffePluginFactory.pluginFactory) 1917 | { 1918 | NvDsInferPluginFactoryCaffeDestroyFcn fcn = 1919 | (NvDsInferPluginFactoryCaffeDestroyFcn) dlsym(m_CustomLibHandle, 1920 | "NvDsInferPluginFactoryCaffeDestroy"); 1921 | if (fcn) 1922 | { 1923 | fcn(caffePluginFactory); 1924 | } 1925 | } 1926 | if (uffPluginFactory.pluginFactory) 1927 | { 1928 | NvDsInferPluginFactoryUffDestroyFcn fcn = 1929 | (NvDsInferPluginFactoryUffDestroyFcn) dlsym(m_CustomLibHandle, 1930 | "NvDsInferPluginFactoryUffDestroy"); 1931 | if (fcn) 1932 | { 1933 | fcn(uffPluginFactory); 1934 | } 1935 | } 1936 | 1937 | return NVDSINFER_SUCCESS; 1938 | } 1939 | 1940 | /** 1941 | * Clean up and free all resources 1942 | */ 1943 | NvDsInferContextImpl::~NvDsInferContextImpl() 1944 | { 1945 | /* Set the cuda device to be used. 
*/ 1946 | cudaError_t cudaReturn = cudaSetDevice(m_GpuID); 1947 | if (cudaReturn != cudaSuccess) 1948 | { 1949 | printError("Failed to set cuda device %d (%s).", m_GpuID, 1950 | cudaGetErrorName(cudaReturn)); 1951 | return; 1952 | } 1953 | 1954 | unique_lock < std::mutex > lock (m_QueueMutex); 1955 | 1956 | /* Clean up other cuda resources. */ 1957 | if (m_PreProcessStream) 1958 | { 1959 | cudaStreamSynchronize(m_PreProcessStream); 1960 | cudaStreamDestroy(m_PreProcessStream); 1961 | } 1962 | if (m_InferStream) 1963 | { 1964 | cudaStreamSynchronize(m_InferStream); 1965 | cudaStreamDestroy(m_InferStream); 1966 | } 1967 | if (m_BufferCopyStream) 1968 | { 1969 | cudaStreamSynchronize(m_BufferCopyStream); 1970 | cudaStreamDestroy(m_BufferCopyStream); 1971 | } 1972 | if (m_InputConsumedEvent) 1973 | cudaEventDestroy (m_InputConsumedEvent); 1974 | if (m_PreProcessCompleteEvent) 1975 | cudaEventDestroy (m_PreProcessCompleteEvent); 1976 | if (m_InferCompleteEvent) 1977 | cudaEventDestroy (m_InferCompleteEvent); 1978 | 1979 | bool warn = false; 1980 | 1981 | for (auto & batch:m_Batches) 1982 | { 1983 | if (!batch.m_BuffersWithContext && !warn) 1984 | { 1985 | warn = true; 1986 | printWarning ("Not all output batches released back to the context " 1987 | "before destroy. Memory associated with the outputs will " 1988 | "no longer be valid."); 1989 | } 1990 | if (batch.m_CopyCompleteEvent) 1991 | cudaEventDestroy(batch.m_CopyCompleteEvent); 1992 | for (size_t i = 0; i < batch.m_DeviceBuffers.size(); i++) 1993 | { 1994 | if (batch.m_DeviceBuffers[i] && !m_CudaEngine->bindingIsInput(i)) 1995 | cudaFree(batch.m_DeviceBuffers[i]); 1996 | } 1997 | } 1998 | 1999 | 2000 | if (m_DBScanHandle) 2001 | NvDsInferDBScanDestroy(m_DBScanHandle); 2002 | 2003 | if (m_InferExecutionContext) 2004 | m_InferExecutionContext->destroy(); 2005 | 2006 | if (m_CudaEngine) 2007 | m_CudaEngine->destroy(); 2008 | 2009 | if (m_InferRuntime) 2010 | m_InferRuntime->destroy(); 2011 | 2012 | if (m_CustomLibHandle) 2013 | { 2014 | /* Destroy the PluginFactory instance required during runtime cuda engine 2015 | * deserialization. */ 2016 | if (m_RuntimePluginFactory) 2017 | { 2018 | NvDsInferPluginFactoryRuntimeDestroyFcn fcn = 2019 | (NvDsInferPluginFactoryRuntimeDestroyFcn) dlsym( 2020 | m_CustomLibHandle, "NvDsInferPluginFactoryRuntimeDestroy"); 2021 | if (fcn) 2022 | { 2023 | fcn(m_RuntimePluginFactory); 2024 | } 2025 | } 2026 | dlclose(m_CustomLibHandle); 2027 | } 2028 | 2029 | if (m_MeanDataBuffer) 2030 | { 2031 | cudaFree(m_MeanDataBuffer); 2032 | } 2033 | 2034 | for (auto & buffer:m_BindingBuffers) 2035 | { 2036 | if (buffer) 2037 | cudaFree(buffer); 2038 | } 2039 | } 2040 | 2041 | /* 2042 | * Destroy the context to release all resources. 2043 | */ 2044 | void 2045 | NvDsInferContextImpl::destroy() 2046 | { 2047 | delete this; 2048 | } 2049 | 2050 | /* 2051 | * Factory function to create an NvDsInferContext instance and initialize it with 2052 | * supplied parameters. 
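 *
 * Typical lifetime as seen from a caller of this library, as a minimal sketch
 * (error handling, the per-batch queue/dequeue loop and the model paths from
 * openpose_app/nvinfer_config.txt are elided; a null logging callback is
 * acceptable since printMsg() falls back to stderr):
 *
 *     NvDsInferContextInitParams params;
 *     NvDsInferContext_ResetInitParams(&params);
 *     params.maxBatchSize = 1;
 *     params.networkType = NvDsInferNetworkType_Segmentation;
 *     // model-file / proto-file / output layer names go here, as in the
 *     // openpose_app sample config.
 *
 *     NvDsInferContextHandle ctx = NULL;
 *     if (createNvDsInferContext(&ctx, params, NULL, NULL) == NVDSINFER_SUCCESS)
 *     {
 *         // queueInputBatch() / dequeueOutputBatch() / releaseBatchOutput()
 *         // once per batch, then:
 *         ctx->destroy();
 *     }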
2053 | */ 2054 | NvDsInferStatus 2055 | createNvDsInferContext(NvDsInferContextHandle *handle, 2056 | NvDsInferContextInitParams &initParams, void *userCtx, 2057 | NvDsInferContextLoggingFunc logFunc) 2058 | { 2059 | NvDsInferStatus status; 2060 | NvDsInferContextImpl *ctx = new NvDsInferContextImpl(); 2061 | 2062 | status = ctx->initialize(initParams, userCtx, logFunc); 2063 | if (status == NVDSINFER_SUCCESS) 2064 | { 2065 | *handle = ctx; 2066 | } 2067 | else 2068 | { 2069 | static_cast(ctx)->destroy(); 2070 | } 2071 | return status; 2072 | } 2073 | 2074 | /* 2075 | * Reset the members inside the initParams structure to default values. 2076 | */ 2077 | void 2078 | NvDsInferContext_ResetInitParams (NvDsInferContextInitParams *initParams) 2079 | { 2080 | if (initParams == nullptr) 2081 | { 2082 | fprintf(stderr, "Warning. NULL initParams passed to " 2083 | "NvDsInferContext_ResetInitParams()\n"); 2084 | return; 2085 | } 2086 | 2087 | memset(initParams, 0, sizeof (*initParams)); 2088 | 2089 | initParams->networkMode = NvDsInferNetworkMode_FP32; 2090 | initParams->networkInputFormat = NvDsInferFormat_Unknown; 2091 | initParams->uffInputOrder = NvDsInferUffInputOrder_kNCHW; 2092 | initParams->maxBatchSize = 1; 2093 | initParams->networkScaleFactor = 1.0; 2094 | initParams->networkType = NvDsInferNetworkType_Detector; 2095 | initParams->outputBufferPoolSize = NVDSINFER_MIN_OUTPUT_BUFFERPOOL_SIZE; 2096 | } 2097 | 2098 | const char * 2099 | NvDsInferContext_GetStatusName (NvDsInferStatus status) 2100 | { 2101 | #define CHECK_AND_RETURN_STRING(status_iter) \ 2102 | if (status == status_iter) return #status_iter 2103 | 2104 | CHECK_AND_RETURN_STRING(NVDSINFER_SUCCESS); 2105 | CHECK_AND_RETURN_STRING(NVDSINFER_CONFIG_FAILED); 2106 | CHECK_AND_RETURN_STRING(NVDSINFER_CUSTOM_LIB_FAILED); 2107 | CHECK_AND_RETURN_STRING(NVDSINFER_INVALID_PARAMS); 2108 | CHECK_AND_RETURN_STRING(NVDSINFER_OUTPUT_PARSING_FAILED); 2109 | CHECK_AND_RETURN_STRING(NVDSINFER_CUDA_ERROR); 2110 | CHECK_AND_RETURN_STRING(NVDSINFER_TENSORRT_ERROR); 2111 | CHECK_AND_RETURN_STRING(NVDSINFER_UNKNOWN_ERROR); 2112 | 2113 | return nullptr; 2114 | #undef CHECK_AND_RETURN_STRING 2115 | 2116 | } 2117 | -------------------------------------------------------------------------------- /libs/nvdsinfer/nvdsinfer_context_impl.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * NVIDIA Corporation and its licensors retain all intellectual property 5 | * and proprietary rights in and to this software, related documentation 6 | * and any modifications thereto. Any use, reproduction, disclosure or 7 | * distribution of this software and related documentation without an express 8 | * license agreement from NVIDIA Corporation is strictly prohibited. 9 | * 10 | */ 11 | 12 | #ifndef __NVDSINFER_CONTEXT_IMPL_H__ 13 | #define __NVDSINFER_CONTEXT_IMPL_H__ 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | #include "cuda_runtime_api.h" 22 | #include 23 | #include 24 | 25 | #include 26 | 27 | #include 28 | #include 29 | #include 30 | 31 | 32 | /** 33 | * Implementation of the INvDsInferContext interface. 34 | */ 35 | class NvDsInferContextImpl : public INvDsInferContext 36 | { 37 | public: 38 | /** 39 | * Default constructor. 40 | */ 41 | NvDsInferContextImpl(); 42 | 43 | /** 44 | * Initializes the Infer engine, allocates layer buffers and other required 45 | * initialization steps. 
46 |  */
47 |     NvDsInferStatus initialize(NvDsInferContextInitParams &initParams,
48 |             void *userCtx, NvDsInferContextLoggingFunc logFunc);
49 | 
50 | private:
51 |     /**
52 |      * Free up resources and deinitialize the inference engine.
53 |      */
54 |     ~NvDsInferContextImpl();
55 | 
56 |     /* Implementation of the public methods of INvDsInferContext interface. */
57 |     NvDsInferStatus queueInputBatch(NvDsInferContextBatchInput &batchInput) override;
58 |     NvDsInferStatus dequeueOutputBatch(NvDsInferContextBatchOutput &batchOutput) override;
59 |     void releaseBatchOutput(NvDsInferContextBatchOutput &batchOutput) override;
60 |     void fillLayersInfo(std::vector<NvDsInferLayerInfo> &layersInfo) override;
61 |     void getNetworkInfo(NvDsInferNetworkInfo &networkInfo) override;
62 |     const std::vector<std::vector<std::string>>& getLabels() override;
63 |     void destroy() override;
64 | 
65 |     /* Other private methods. */
66 |     NvDsInferStatus checkEngineParams(NvDsInferContextInitParams &initParams);
67 |     NvDsInferStatus useEngineFile(NvDsInferContextInitParams &initParams);
68 |     NvDsInferStatus generateTRTModel(NvDsInferContextInitParams &initParams,
69 |             nvinfer1::IHostMemory *&gieModelStream);
70 |     NvDsInferStatus readMeanImageFile(char *meanImageFilePath);
71 |     NvDsInferStatus getBoundLayersInfo();
72 |     NvDsInferStatus allocateBuffers();
73 |     NvDsInferStatus parseLabelsFile(char *labelsFilePath);
74 |     bool parseBoundingBox(
75 |             std::vector<NvDsInferLayerInfo> const& outputLayersInfo,
76 |             NvDsInferNetworkInfo const &networkInfo,
77 |             NvDsInferParseDetectionParams const &detectionParams,
78 |             std::vector<NvDsInferObjectDetectionInfo> &objectList);
79 |     bool parseAttributesFromSoftmaxLayers(
80 |             std::vector<NvDsInferLayerInfo> const &outputLayersInfo,
81 |             NvDsInferNetworkInfo const &networkInfo,
82 |             float classifierThreshold,
83 |             std::vector<NvDsInferAttribute> &attrList,
84 |             std::string &attrString);
85 |     void clusterAndFillDetectionOutputCV(NvDsInferDetectionOutput &output);
86 |     void clusterAndFillDetectionOutputDBSCAN(NvDsInferDetectionOutput &output);
87 |     NvDsInferStatus fillDetectionOutput(NvDsInferDetectionOutput &output);
88 |     NvDsInferStatus fillClassificationOutput(NvDsInferClassificationOutput &output);
89 |     NvDsInferStatus fillSegmentationOutput(NvDsInferSegmentationOutput &output);
90 |     void releaseFrameOutput(NvDsInferFrameOutput &frameOutput);
91 |     NvDsInferStatus initNonImageInputLayers();
92 | 
93 |     /* Input layer has a binding index of 0 */
94 |     static const int INPUT_LAYER_INDEX = 0;
95 | 
96 |     /* Mutex to keep DLA IExecutionContext::enqueue thread-safe */
97 |     static std::mutex DlaExecutionMutex;
98 | 
99 |     /** Unique identifier for the instance. This can be used to identify the
100 |      * instance generating log and error messages. */
101 |     unsigned int m_UniqueID;
102 | 
103 |     unsigned int m_MaxBatchSize;
104 | 
105 |     double m_NetworkScaleFactor;
106 | 
107 |     /** Input format for the network. */
108 |     NvDsInferFormat m_NetworkInputFormat;
109 | 
110 |     NvDsInferNetworkType m_NetworkType;
111 | 
112 |     /* Network input information. */
113 |     NvDsInferNetworkInfo m_NetworkInfo;
114 | 
115 |     bool m_UseDBScan;
116 | 
117 |     NvDsInferDBScanHandle m_DBScanHandle;
118 | 
119 |     /* Number of classes detected by the model. */
120 |     unsigned int m_NumDetectedClasses;
121 | 
122 |     /* Detection / grouping parameters. */
123 |     std::vector<NvDsInferDetectionParams> m_PerClassDetectionParams;
124 |     NvDsInferParseDetectionParams m_DetectionParams;
125 | 
126 |     /* Vector for all parsed objects. */
127 |     std::vector<NvDsInferObjectDetectionInfo> m_ObjectList;
128 |     /* Vector of cv::Rect vectors for each class.
*/ 129 | std::vector> m_PerClassCvRectList; 130 | /* Vector of NvDsInferObjectDetectionInfo vectors for each class. */ 131 | std::vector> m_PerClassObjectList; 132 | 133 | float m_ClassifierThreshold; 134 | float m_SegmentationThreshold; 135 | 136 | /* Custom library implementation. */ 137 | void *m_CustomLibHandle; 138 | NvDsInferParseCustomFunc m_CustomBBoxParseFunc; 139 | NvDsInferClassiferParseCustomFunc m_CustomClassifierParseFunc; 140 | nvinfer1::IPluginFactory *m_RuntimePluginFactory; 141 | 142 | unsigned int m_GpuID; 143 | bool m_DlaEnabled; 144 | 145 | /* Holds the string labels for classes. */ 146 | std::vector> m_Labels; 147 | 148 | /* Logger for GIE info/warning/errors */ 149 | class NvDsInferLogger : public nvinfer1::ILogger 150 | { 151 | void log(Severity severity, const char *msg) override ; 152 | public: 153 | NvDsInferContextImpl *handle; 154 | }; 155 | NvDsInferLogger m_Logger; 156 | 157 | /* Custom unique_ptrs. These TensorRT objects will get deleted automatically 158 | * when the NvDsInferContext object is deleted. */ 159 | nvinfer1::IRuntime *m_InferRuntime; 160 | nvinfer1::ICudaEngine *m_CudaEngine; 161 | nvinfer1::IExecutionContext *m_InferExecutionContext; 162 | 163 | cudaStream_t m_PreProcessStream; 164 | cudaStream_t m_InferStream; 165 | cudaStream_t m_BufferCopyStream; 166 | 167 | /* Vectors for holding information about bound layers. */ 168 | std::vector m_AllLayerInfo; 169 | std::vector m_OutputLayerInfo; 170 | 171 | float *m_MeanDataBuffer; 172 | 173 | std::vector m_BindingBuffers; 174 | 175 | unsigned int m_OutputBufferPoolSize; 176 | 177 | /** 178 | * Holds information for one batch for processing. 179 | */ 180 | typedef struct 181 | { 182 | std::vector> m_HostBuffers; 183 | std::vector m_DeviceBuffers; 184 | 185 | unsigned int m_BatchSize; 186 | cudaEvent_t m_CopyCompleteEvent = nullptr; 187 | bool m_BuffersWithContext = true; 188 | 189 | //NvDsInferContextReturnInputAsyncFunc m_ReturnFunc = nullptr; 190 | //void *m_ReturnFuncData = nullptr; 191 | } NvDsInferBatch; 192 | 193 | std::vector m_Batches; 194 | 195 | /* Queues and synchronization members for processing multiple batches 196 | * in parallel. 197 | */ 198 | std::mutex m_QueueMutex; 199 | std::condition_variable m_QueueCondition; 200 | std::queue m_ProcessIndexQueue; 201 | std::queue m_FreeIndexQueue; 202 | 203 | bool m_CopyInputToHostBuffers; 204 | 205 | /* Cuda Event for synchronizing input consumption by TensorRT CUDA engine. */ 206 | cudaEvent_t m_InputConsumedEvent; 207 | /* Cuda Event for synchronizing completion of pre-processing. */ 208 | cudaEvent_t m_PreProcessCompleteEvent; 209 | /* Cuda Event for synchronizing infer completion by TensorRT CUDA engine. */ 210 | cudaEvent_t m_InferCompleteEvent; 211 | 212 | NvDsInferContextLoggingFunc m_LoggingFunc; 213 | 214 | void *m_UserCtx; 215 | 216 | bool m_Initialized; 217 | }; 218 | 219 | /* Calls clients logging callback function. */ 220 | static inline void 221 | callLogFunc(NvDsInferContextImpl *ctx, unsigned int uniqueID, NvDsInferLogLevel level, 222 | const char *func, NvDsInferContextLoggingFunc logFunc, void *logCtx, 223 | const char *fmt, ...) 224 | { 225 | va_list args; 226 | va_start (args, fmt); 227 | char logMsgBuffer[_MAX_STR_LENGTH + 1]; 228 | vsnprintf(logMsgBuffer, _MAX_STR_LENGTH, fmt, args); 229 | logFunc(ctx, uniqueID, level, func, logMsgBuffer, logCtx); 230 | va_end (args); 231 | } 232 | 233 | #define printMsg(level, tag_str, fmt, ...) 
\ 234 | do { \ 235 | char * baseName = strrchr((char *) __FILE__, '/'); \ 236 | baseName = (baseName) ? (baseName + 1) : (char *) __FILE__; \ 237 | if (m_LoggingFunc) \ 238 | { \ 239 | callLogFunc(this, m_UniqueID, level, __func__, m_LoggingFunc, \ 240 | m_UserCtx, fmt, ## __VA_ARGS__); \ 241 | } \ 242 | else \ 243 | { \ 244 | fprintf(stderr, \ 245 | tag_str " NvDsInferContextImpl::%s() <%s:%d> [UID = %d]: " fmt "\n", \ 246 | __func__, baseName, __LINE__, m_UniqueID, ## __VA_ARGS__); \ 247 | } \ 248 | } while (0) 249 | 250 | #define printError(fmt, ...) \ 251 | do { \ 252 | printMsg (NVDSINFER_LOG_ERROR, "Error in", fmt, ##__VA_ARGS__); \ 253 | } while (0) 254 | 255 | #define printWarning(fmt, ...) \ 256 | do { \ 257 | printMsg (NVDSINFER_LOG_WARNING, "Warning from", fmt, ##__VA_ARGS__); \ 258 | } while (0) 259 | 260 | #define printInfo(fmt, ...) \ 261 | do { \ 262 | printMsg (NVDSINFER_LOG_INFO, "Info from", fmt, ##__VA_ARGS__); \ 263 | } while (0) 264 | 265 | #define printDebug(fmt, ...) \ 266 | do { \ 267 | printMsg (NVDSINFER_LOG_DEBUG, "DEBUG", fmt, ##__VA_ARGS__); \ 268 | } while (0) 269 | 270 | #endif 271 | -------------------------------------------------------------------------------- /libs/nvdsinfer/nvdsinfer_context_impl_capi.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * NVIDIA Corporation and its licensors retain all intellectual property 5 | * and proprietary rights in and to this software, related documentation 6 | * and any modifications thereto. Any use, reproduction, disclosure or 7 | * distribution of this software and related documentation without an express 8 | * license agreement from NVIDIA Corporation is strictly prohibited. 9 | * 10 | */ 11 | 12 | #include "nvdsinfer_context_impl.h" 13 | #include 14 | 15 | /* This file implements the C interface for the NvDsInferContext class. The 16 | * interface is a simple wrapper over the C++ interface. 
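 *
 * A minimal sketch of the per-batch flow through this C interface, assuming a
 * handle "ctx" obtained from NvDsInferContext_Create() and a batch already
 * submitted with NvDsInferContext_QueueInputBatch() (consumeSegmentation() is a
 * hypothetical application callback, not part of this API):
 *
 *     NvDsInferContextBatchOutput out = {0};
 *     if (NvDsInferContext_DequeueOutputBatch(ctx, &out) == NVDSINFER_SUCCESS)
 *     {
 *         for (unsigned int f = 0; f < out.numFrames; f++)
 *         {
 *             NvDsInferFrameOutput &fo = out.frames[f];
 *             if (fo.outputType == NvDsInferNetworkType_Segmentation)
 *                 consumeSegmentation(fo.segmentationOutput);
 *         }
 *         // hand the host/device buffers back to the context
 *         NvDsInferContext_ReleaseBatchOutput(ctx, &out);
 *     }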
*/ 17 | 18 | using namespace std; 19 | 20 | #define NULL_PARAM_CHECK(param, retvalue) \ 21 | if (param == nullptr) \ 22 | { \ 23 | fprintf(stderr, "Warning: NULL parameter " #param " passed to %s\n", \ 24 | __func__); \ 25 | return retvalue; \ 26 | } 27 | 28 | 29 | NvDsInferStatus 30 | NvDsInferContext_Create(NvDsInferContextHandle *handle, 31 | NvDsInferContextInitParams *initParams, void *userCtx, 32 | NvDsInferContextLoggingFunc logFunc) 33 | { 34 | NULL_PARAM_CHECK(handle, NVDSINFER_INVALID_PARAMS); 35 | NULL_PARAM_CHECK(initParams, NVDSINFER_INVALID_PARAMS); 36 | 37 | return createNvDsInferContext(handle, *initParams, userCtx, logFunc); 38 | } 39 | 40 | void 41 | NvDsInferContext_Destroy(NvDsInferContextHandle handle) 42 | { 43 | NULL_PARAM_CHECK(handle, ); 44 | 45 | handle->destroy(); 46 | } 47 | 48 | NvDsInferStatus 49 | NvDsInferContext_QueueInputBatch(NvDsInferContextHandle handle, 50 | NvDsInferContextBatchInput *batchInput) 51 | { 52 | NULL_PARAM_CHECK(handle, NVDSINFER_INVALID_PARAMS); 53 | NULL_PARAM_CHECK(batchInput, NVDSINFER_INVALID_PARAMS); 54 | 55 | return handle->queueInputBatch(*batchInput); 56 | } 57 | 58 | NvDsInferStatus 59 | NvDsInferContext_DequeueOutputBatch(NvDsInferContextHandle handle, 60 | NvDsInferContextBatchOutput *batchOutput) 61 | { 62 | NULL_PARAM_CHECK(handle, NVDSINFER_INVALID_PARAMS); 63 | NULL_PARAM_CHECK(batchOutput, NVDSINFER_INVALID_PARAMS); 64 | 65 | return handle->dequeueOutputBatch(*batchOutput); 66 | } 67 | 68 | void 69 | NvDsInferContext_ReleaseBatchOutput(NvDsInferContextHandle handle, 70 | NvDsInferContextBatchOutput *batchOutput) 71 | { 72 | NULL_PARAM_CHECK(handle, ); 73 | NULL_PARAM_CHECK(batchOutput, ); 74 | 75 | return handle->releaseBatchOutput(*batchOutput); 76 | } 77 | 78 | unsigned int 79 | NvDsInferContext_GetNumLayersInfo(NvDsInferContextHandle handle) 80 | { 81 | NULL_PARAM_CHECK(handle, 0); 82 | 83 | std::vector layersInfo; 84 | handle->fillLayersInfo(layersInfo); 85 | 86 | return layersInfo.size(); 87 | } 88 | 89 | void 90 | NvDsInferContext_FillLayersInfo(NvDsInferContextHandle handle, 91 | NvDsInferLayerInfo *layersInfo) 92 | { 93 | NULL_PARAM_CHECK(handle, ); 94 | 95 | std::vector layersInfoVec; 96 | handle->fillLayersInfo(layersInfoVec); 97 | for (unsigned int i = 0; i < layersInfoVec.size(); i++) 98 | layersInfo[i] = layersInfoVec[i]; 99 | } 100 | 101 | void 102 | NvDsInferContext_GetNetworkInfo(NvDsInferContextHandle handle, 103 | NvDsInferNetworkInfo *networkInfo) 104 | { 105 | NULL_PARAM_CHECK(handle, ); 106 | NULL_PARAM_CHECK(networkInfo, ); 107 | 108 | return handle->getNetworkInfo(*networkInfo); 109 | } 110 | 111 | const char* 112 | NvDsInferContext_GetLabel(NvDsInferContextHandle handle, unsigned int id, 113 | unsigned int value) 114 | { 115 | NULL_PARAM_CHECK(handle, nullptr); 116 | 117 | auto labels = handle->getLabels(); 118 | if (labels.size() > id && labels[id].size() > value) 119 | return labels[id][value].c_str(); 120 | 121 | return nullptr; 122 | } 123 | -------------------------------------------------------------------------------- /libs/nvdsinfer/nvdsinfer_context_impl_output_parsing.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * NVIDIA Corporation and its licensors retain all intellectual property 5 | * and proprietary rights in and to this software, related documentation 6 | * and any modifications thereto. 
Any use, reproduction, disclosure or 7 | * distribution of this software and related documentation without an express 8 | * license agreement from NVIDIA Corporation is strictly prohibited. 9 | * 10 | */ 11 | #include 12 | //#include 13 | #include 14 | 15 | #include "nvdsinfer_context_impl.h" 16 | #include "nms_cpu.h" 17 | #include "resize_merge_cpu.h" 18 | 19 | static const bool ATHR_ENABLED = true; 20 | static const float ATHR_THRESHOLD = 60.0; 21 | 22 | using namespace std; 23 | 24 | #define DIVIDE_AND_ROUND_UP(a, b) ((a + b - 1) / b) 25 | 26 | /* Parse all object bounding boxes for the class `classIndex` in the frame 27 | * meeting the minimum threshold criteria. 28 | * 29 | * This parser function has been specifically written for the sample resnet10 30 | * model provided with the SDK. Other models will require this function to be 31 | * modified. 32 | */ 33 | bool 34 | NvDsInferContextImpl::parseBoundingBox( 35 | vector < NvDsInferLayerInfo > const &outputLayersInfo, 36 | NvDsInferNetworkInfo const &networkInfo, 37 | NvDsInferParseDetectionParams const &detectionParams, 38 | vector < NvDsInferObjectDetectionInfo > &objectList) { 39 | 40 | int outputCoverageLayerIndex = -1; 41 | int outputBBoxLayerIndex = -1; 42 | 43 | 44 | for (unsigned int i = 0; i < outputLayersInfo.size(); i++) { 45 | if (strstr(outputLayersInfo[i].layerName, "bbox") != nullptr) { 46 | outputBBoxLayerIndex = i; 47 | } 48 | if (strstr(outputLayersInfo[i].layerName, "cov") != nullptr) { 49 | outputCoverageLayerIndex = i; 50 | } 51 | } 52 | 53 | if (outputCoverageLayerIndex == -1) { 54 | printError("Could not find output coverage layer for parsing objects"); 55 | return false; 56 | } 57 | if (outputBBoxLayerIndex == -1) { 58 | printError("Could not find output bbox layer for parsing objects"); 59 | return false; 60 | } 61 | 62 | float *outputCoverageBuffer = 63 | (float *)outputLayersInfo[outputCoverageLayerIndex].buffer; 64 | float *outputBboxBuffer = 65 | (float *)outputLayersInfo[outputBBoxLayerIndex].buffer; 66 | 67 | NvDsInferDimsCHW outputCoverageDims; 68 | NvDsInferDimsCHW outputBBoxDims; 69 | 70 | getDimsCHWFromDims(outputCoverageDims, 71 | outputLayersInfo[outputCoverageLayerIndex].dims); 72 | getDimsCHWFromDims(outputBBoxDims, 73 | outputLayersInfo[outputBBoxLayerIndex].dims); 74 | 75 | unsigned int targetShape[2] = { outputCoverageDims.w, outputCoverageDims.h }; 76 | float bboxNorm[2] = { 35.0, 35.0 }; 77 | float gcCenters0[targetShape[0]]; 78 | float gcCenters1[targetShape[1]]; 79 | int gridSize = outputCoverageDims.w * outputCoverageDims.h; 80 | int strideX = DIVIDE_AND_ROUND_UP(networkInfo.width, outputBBoxDims.w); 81 | int strideY = DIVIDE_AND_ROUND_UP(networkInfo.height, outputBBoxDims.h); 82 | 83 | for (unsigned int i = 0; i < targetShape[0]; i++) { 84 | gcCenters0[i] = (float)(i * strideX + 0.5); 85 | gcCenters0[i] /= (float)bboxNorm[0]; 86 | } 87 | for (unsigned int i = 0; i < targetShape[1]; i++) { 88 | gcCenters1[i] = (float)(i * strideY + 0.5); 89 | gcCenters1[i] /= (float)bboxNorm[1]; 90 | } 91 | 92 | unsigned int numClasses = 93 | MIN(outputCoverageDims.c, detectionParams.numClassesConfigured); 94 | for (unsigned int classIndex = 0; classIndex < numClasses; classIndex++) { 95 | 96 | /* Pointers to memory regions containing the (x1,y1) and (x2,y2) coordinates 97 | * of rectangles in the output bounding box layer. 
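 *
 * Layout note: for every class the bbox layer stores four consecutive H x W
 * planes (x1, y1, x2, y2), so the per-class stride is 4 * H * W float values.
 * The pointer arithmetic below obtains that stride as
 * "classIndex * sizeof (float) * h * w", which only gives the right offset
 * because sizeof(float) happens to equal the number of coordinate planes (4);
 * "classIndex * 4 * outputBBoxDims.h * outputBBoxDims.w" would state the
 * intent more directly.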
*/ 98 | float *outputX1 = outputBboxBuffer 99 | + classIndex * sizeof (float) * outputBBoxDims.h * outputBBoxDims.w; 100 | 101 | float *outputY1 = outputX1 + gridSize; 102 | float *outputX2 = outputY1 + gridSize; 103 | float *outputY2 = outputX2 + gridSize; 104 | 105 | /* Iterate through each point in the grid and check if the rectangle at that 106 | * point meets the minimum threshold criteria. */ 107 | for (unsigned int h = 0; h < outputCoverageDims.h; h++) { 108 | for (unsigned int w = 0; w < outputCoverageDims.w; w++) { 109 | int i = w + h * outputCoverageDims.w; 110 | float confidence = outputCoverageBuffer[classIndex * gridSize + i]; 111 | 112 | if (confidence < detectionParams.perClassThreshold[classIndex]) 113 | continue; 114 | 115 | int rectX1, rectY1, rectX2, rectY2; 116 | float rectX1Float, rectY1Float, rectX2Float, rectY2Float; 117 | 118 | /* Centering and normalization of the rectangle. */ 119 | rectX1Float = 120 | outputX1[w + h * outputCoverageDims.w] - gcCenters0[w]; 121 | rectY1Float = 122 | outputY1[w + h * outputCoverageDims.w] - gcCenters1[h]; 123 | rectX2Float = 124 | outputX2[w + h * outputCoverageDims.w] + gcCenters0[w]; 125 | rectY2Float = 126 | outputY2[w + h * outputCoverageDims.w] + gcCenters1[h]; 127 | 128 | rectX1Float *= -bboxNorm[0]; 129 | rectY1Float *= -bboxNorm[1]; 130 | rectX2Float *= bboxNorm[0]; 131 | rectY2Float *= bboxNorm[1]; 132 | 133 | rectX1 = rectX1Float; 134 | rectY1 = rectY1Float; 135 | rectX2 = rectX2Float; 136 | rectY2 = rectY2Float; 137 | 138 | /* Clip parsed rectangles to frame bounds. */ 139 | if (rectX1 >= (int)m_NetworkInfo.width) 140 | rectX1 = m_NetworkInfo.width - 1; 141 | if (rectX2 >= (int)m_NetworkInfo.width) 142 | rectX2 = m_NetworkInfo.width - 1; 143 | if (rectY1 >= (int)m_NetworkInfo.height) 144 | rectY1 = m_NetworkInfo.height - 1; 145 | if (rectY2 >= (int)m_NetworkInfo.height) 146 | rectY2 = m_NetworkInfo.height - 1; 147 | 148 | if (rectX1 < 0) 149 | rectX1 = 0; 150 | if (rectX2 < 0) 151 | rectX2 = 0; 152 | if (rectY1 < 0) 153 | rectY1 = 0; 154 | if (rectY2 < 0) 155 | rectY2 = 0; 156 | 157 | objectList.push_back({ classIndex, (unsigned int) rectX1, 158 | (unsigned int) rectY1, (unsigned int) (rectX2 - rectX1), 159 | (unsigned int) (rectY2 - rectY1), confidence}); 160 | } 161 | } 162 | } 163 | return true; 164 | } 165 | 166 | /** 167 | * Cluster objects using OpenCV groupRectangles and fill the output structure. 168 | */ 169 | void 170 | NvDsInferContextImpl::clusterAndFillDetectionOutputCV(NvDsInferDetectionOutput &output) { 171 | size_t totalObjects = 0; 172 | 173 | for (auto & list:m_PerClassCvRectList) 174 | list.clear(); 175 | 176 | /* The above functions will add all objects in the m_ObjectList vector. 177 | * Need to seperate them per class for grouping. */ 178 | for (auto & object:m_ObjectList) { 179 | m_PerClassCvRectList[object.classId].emplace_back(object.left, 180 | object.top, object.width, object.height); 181 | } 182 | 183 | for (unsigned int c = 0; c < m_NumDetectedClasses; c++) { 184 | /* Cluster together rectangles with similar locations and sizes 185 | * since these rectangles might represent the same object. Refer 186 | * to opencv documentation of groupRectangles for more 187 | * information about the tuning parameters for grouping. 
*/ 188 | if (m_PerClassDetectionParams[c].groupThreshold > 0) 189 | cv::groupRectangles(m_PerClassCvRectList[c], 190 | m_PerClassDetectionParams[c].groupThreshold, 191 | m_PerClassDetectionParams[c].eps); 192 | totalObjects += m_PerClassCvRectList[c].size(); 193 | } 194 | 195 | output.objects = new NvDsInferObject[totalObjects]; 196 | output.numObjects = 0; 197 | 198 | for (unsigned int c = 0; c < m_NumDetectedClasses; c++) { 199 | /* Add coordinates and class ID and the label of all objects 200 | * detected in the frame to the frame output. */ 201 | for (auto & rect:m_PerClassCvRectList[c]) { 202 | NvDsInferObject &object = output.objects[output.numObjects]; 203 | object.left = rect.x; 204 | object.top = rect.y; 205 | object.width = rect.width; 206 | object.height = rect.height; 207 | object.classIndex = c; 208 | object.label = nullptr; 209 | if (c < m_Labels.size() && m_Labels[c].size() > 0) 210 | object.label = strdup(m_Labels[c][0].c_str()); 211 | output.numObjects++; 212 | } 213 | } 214 | } 215 | 216 | /** 217 | * Cluster objects using DBSCAN and fill the output structure. 218 | */ 219 | void 220 | NvDsInferContextImpl::clusterAndFillDetectionOutputDBSCAN(NvDsInferDetectionOutput &output) { 221 | size_t totalObjects = 0; 222 | NvDsInferDBScanClusteringParams clusteringParams; 223 | clusteringParams.enableATHRFilter = ATHR_ENABLED; 224 | clusteringParams.thresholdATHR = ATHR_THRESHOLD; 225 | vector numObjectsList(m_NumDetectedClasses); 226 | 227 | for (auto & list:m_PerClassObjectList) 228 | list.clear(); 229 | 230 | /* The above functions will add all objects in the m_ObjectList vector. 231 | * Need to seperate them per class for grouping. */ 232 | for (auto & object:m_ObjectList) { 233 | m_PerClassObjectList[object.classId].emplace_back(object); 234 | } 235 | 236 | for (unsigned int c = 0; c < m_NumDetectedClasses; c++) { 237 | NvDsInferObjectDetectionInfo *objArray = m_PerClassObjectList[c].data(); 238 | size_t numObjects = m_PerClassObjectList[c].size(); 239 | 240 | clusteringParams.eps = m_PerClassDetectionParams[c].eps; 241 | clusteringParams.minBoxes = m_PerClassDetectionParams[c].minBoxes; 242 | 243 | /* Cluster together rectangles with similar locations and sizes 244 | * since these rectangles might represent the same object using 245 | * DBSCAN. */ 246 | if (m_PerClassDetectionParams[c].minBoxes > 0) 247 | NvDsInferDBScanCluster(m_DBScanHandle, &clusteringParams, 248 | objArray, &numObjects); 249 | totalObjects += numObjects; 250 | numObjectsList[c] = numObjects; 251 | } 252 | 253 | output.objects = new NvDsInferObject[totalObjects]; 254 | output.numObjects = 0; 255 | 256 | for (unsigned int c = 0; c < m_NumDetectedClasses; c++) { 257 | /* Add coordinates and class ID and the label of all objects 258 | * detected in the frame to the frame output. 
*/ 259 | for (size_t i = 0; i < numObjectsList[c]; i++) { 260 | NvDsInferObject &object = output.objects[output.numObjects]; 261 | object.left = m_PerClassObjectList[c][i].left; 262 | object.top = m_PerClassObjectList[c][i].top; 263 | object.width = m_PerClassObjectList[c][i].width; 264 | object.height = m_PerClassObjectList[c][i].height; 265 | object.classIndex = c; 266 | object.label = nullptr; 267 | if (c < m_Labels.size() && m_Labels[c].size() > 0) 268 | object.label = strdup(m_Labels[c][0].c_str()); 269 | output.numObjects++; 270 | } 271 | } 272 | } 273 | 274 | bool 275 | NvDsInferContextImpl::parseAttributesFromSoftmaxLayers( 276 | std::vector const &outputLayersInfo, 277 | NvDsInferNetworkInfo const &networkInfo, 278 | float classifierThreshold, 279 | std::vector &attrList, 280 | std::string &attrString) { 281 | /* Get the number of attributes supported by the classifier. */ 282 | unsigned int numAttributes = m_OutputLayerInfo.size(); 283 | 284 | /* Iterate through all the output coverage layers of the classifier. 285 | */ 286 | for (unsigned int l = 0; l < numAttributes; l++) { 287 | /* outputCoverageBuffer for classifiers is usually a softmax layer. 288 | * The layer is an array of probabilities of the object belonging 289 | * to each class with each probability being in the range [0,1] and 290 | * sum all probabilities will be 1. 291 | */ 292 | NvDsInferDimsCHW dims; 293 | 294 | getDimsCHWFromDims(dims, m_OutputLayerInfo[l].dims); 295 | unsigned int numClasses = dims.c; 296 | float *outputCoverageBuffer = 297 | (float *)m_OutputLayerInfo[l].buffer; 298 | float maxProbability = 0; 299 | bool attrFound = false; 300 | NvDsInferAttribute attr; 301 | 302 | /* Iterate through all the probabilities that the object belongs to 303 | * each class. Find the maximum probability and the corresponding class 304 | * which meets the minimum threshold. */ 305 | for (unsigned int c = 0; c < numClasses; c++) { 306 | float probability = outputCoverageBuffer[c]; 307 | if (probability > m_ClassifierThreshold 308 | && probability > maxProbability) { 309 | maxProbability = probability; 310 | attrFound = true; 311 | attr.attributeIndex = l; 312 | attr.attributeValue = c; 313 | attr.attributeConfidence = probability; 314 | } 315 | } 316 | if (attrFound) { 317 | if (m_Labels.size() > attr.attributeIndex && 318 | attr.attributeValue < m_Labels[attr.attributeIndex].size()) 319 | attr.attributeLabel = 320 | m_Labels[attr.attributeIndex][attr.attributeValue].c_str(); 321 | else 322 | attr.attributeLabel = nullptr; 323 | attrList.push_back(attr); 324 | if (attr.attributeLabel) 325 | attrString.append(attr.attributeLabel).append(" "); 326 | } 327 | } 328 | 329 | return true; 330 | } 331 | 332 | NvDsInferStatus 333 | NvDsInferContextImpl::fillDetectionOutput(NvDsInferDetectionOutput &output) { 334 | /* Clear the object lists. */ 335 | m_ObjectList.clear(); 336 | 337 | /* Call custom parsing function if specified otherwise use the one 338 | * written along with this implementation. 
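 *
 * A custom parser is a function exported by the custom library with the same
 * signature as parseBoundingBox() above (see NvDsInferParseCustomFunc in the
 * header). A rough sketch, with a made-up function name, layer choice and a
 * fixed 10x10 box purely as placeholders:
 *
 *     extern "C" bool NvDsInferParseCustomOpenPose(
 *             std::vector<NvDsInferLayerInfo> const &outputLayersInfo,
 *             NvDsInferNetworkInfo const &networkInfo,
 *             NvDsInferParseDetectionParams const &detectionParams,
 *             std::vector<NvDsInferObjectDetectionInfo> &objectList)
 *     {
 *         const float *heat = (const float *) outputLayersInfo[0].buffer;
 *         if (heat && heat[0] > detectionParams.perClassThreshold[0])
 *             // class id, left, top, width, height, confidence
 *             objectList.push_back({0, 0, 0, 10, 10, heat[0]});
 *         return true;
 *     }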
*/ 339 | if (m_CustomBBoxParseFunc) { 340 | if (!m_CustomBBoxParseFunc(m_OutputLayerInfo, m_NetworkInfo, 341 | m_DetectionParams, m_ObjectList)) { 342 | printError("Failed to parse bboxes using custom parse function"); 343 | return NVDSINFER_CUSTOM_LIB_FAILED; 344 | } 345 | } else { 346 | if (!parseBoundingBox(m_OutputLayerInfo, m_NetworkInfo, 347 | m_DetectionParams, m_ObjectList)) { 348 | printError("Failed to parse bboxes"); 349 | return NVDSINFER_OUTPUT_PARSING_FAILED; 350 | } 351 | } 352 | 353 | if (m_UseDBScan) 354 | clusterAndFillDetectionOutputDBSCAN(output); 355 | else 356 | clusterAndFillDetectionOutputCV(output); 357 | 358 | return NVDSINFER_SUCCESS; 359 | } 360 | 361 | NvDsInferStatus 362 | NvDsInferContextImpl::fillClassificationOutput(NvDsInferClassificationOutput &output) { 363 | string attrString; 364 | vector attributes; 365 | 366 | /* Call custom parsing function if specified otherwise use the one 367 | * written along with this implementation. */ 368 | if (m_CustomClassifierParseFunc) { 369 | if (!m_CustomClassifierParseFunc(m_OutputLayerInfo, m_NetworkInfo, 370 | m_ClassifierThreshold, attributes, attrString)) { 371 | printError("Failed to parse classification attributes using " 372 | "custom parse function"); 373 | return NVDSINFER_CUSTOM_LIB_FAILED; 374 | } 375 | } else { 376 | if (!parseAttributesFromSoftmaxLayers(m_OutputLayerInfo, m_NetworkInfo, 377 | m_ClassifierThreshold, attributes, attrString)) { 378 | printError("Failed to parse bboxes"); 379 | return NVDSINFER_OUTPUT_PARSING_FAILED; 380 | } 381 | } 382 | 383 | /* Fill the output structure with the parsed attributes. */ 384 | output.label = strdup(attrString.c_str()); 385 | output.numAttributes = attributes.size(); 386 | output.attributes = new NvDsInferAttribute[output.numAttributes]; 387 | for (size_t i = 0; i < output.numAttributes; i++) { 388 | output.attributes[i].attributeIndex = attributes[i].attributeIndex; 389 | output.attributes[i].attributeValue = attributes[i].attributeValue; 390 | output.attributes[i].attributeConfidence = attributes[i].attributeConfidence; 391 | output.attributes[i].attributeLabel = attributes[i].attributeLabel; 392 | } 393 | return NVDSINFER_SUCCESS; 394 | } 395 | 396 | NvDsInferStatus 397 | NvDsInferContextImpl::fillSegmentationOutput(NvDsInferSegmentationOutput &output) { 398 | NvDsInferDimsCHW outputDimsCHW; 399 | getDimsCHWFromDims(outputDimsCHW, m_OutputLayerInfo[0].dims); 400 | 401 | //$6 = {numDims = 3, d = {57, 46, 62, 127, 2918418508, 127, 1443693648, 85}, numElements = 162564} 402 | 403 | const int SCALE = 8; 404 | output.width = outputDimsCHW.w * SCALE; //62 405 | output.height = outputDimsCHW.h * SCALE; //46 406 | output.classes = outputDimsCHW.c; //57 407 | 408 | output.class_map = new int [output.width * output.height]; 409 | output.class_probability_map = (float *) m_OutputLayerInfo[0].buffer; 410 | 411 | int out[46][62]; 412 | for (int i = 0; i < 46; i++) { 413 | for (int j = 0; j < 62; j++) { 414 | out[i][j] = 1; 415 | } 416 | } 417 | for (int k = 0; k < 18; k++) { 418 | int below = 0; 419 | int x = 0, y = 0; 420 | float confidence = 0.0; 421 | 422 | for (int i = 0; i < 46; i++) { 423 | for (int j = 0; j < 62; j++) { 424 | if (output.class_probability_map[k*46*62 + i * 62 + j] > confidence) { 425 | confidence = output.class_probability_map[k*46*62 + i * 62 + j]; 426 | x = j; 427 | y = i; 428 | } 429 | 430 | if (output.class_probability_map[k*46*62 + i * 62 + j] < 0) { 431 | below++; 432 | } 433 | 434 | } 435 | } 436 | out[y][x] = 0; 437 | 438 | 
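        // At this point (x, y) is the arg-max cell of heat-map channel k on the
        // 46 x 62 grid. For reference, with the standard OpenPose COCO output
        // layout the 57 channels are 18 key-point heat maps + 1 background +
        // 38 part-affinity-field channels (only the first 18 are scanned here),
        // and 46 x 62 upsampled by SCALE = 8 gives the 368 x 496 resolution
        // used by the resize/NMS step below.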
//printf("k=%d,y/x=(%d,%d) below 0 = %d\n", k, y, x, below); 439 | } 440 | 441 | for (int i = 0; i < 46; i++) { 442 | for (int j = 0; j < 62; j++) { 443 | printf("%d", out[i][j]); 444 | for (int y = 0; y < SCALE; y++) { 445 | for (int x = 0; x < SCALE; x++) { 446 | output.class_map[(i * SCALE + y) * output.width + j * SCALE + x] = 6 - out[i][j]; 447 | } 448 | } 449 | } 450 | printf("\n"); 451 | } 452 | 453 | #if 1 454 | // Reszie and merge 455 | float* resize_target_ptr = (float*)malloc(sizeof(float) * 368*496*57); 456 | std::vector resize_source_ptr = {output.class_probability_map}; 457 | std::array resize_target_size = {1, 57, 368, 496}; 458 | std::vector> resize_source_size = {{1, 57, 46, 62}}; 459 | std::vector scale_input_to_net_inputs = {1.0}; 460 | 461 | resizeAndMergeCpu(resize_target_ptr, resize_source_ptr, resize_target_size, resize_source_size, scale_input_to_net_inputs); 462 | 463 | // nms 464 | float* nms_target_ptr = (float*)malloc(sizeof(float) * 18 * 128 * 3); 465 | int * kernel_ptr = (int*)malloc(sizeof(int) * 368 * 496 * 57); 466 | float* nms_source_ptr = resize_target_ptr; 467 | float threshold = 0.05f; 468 | int outputChannels = 18; 469 | int POSE_MAX_PEOPLE = 127+1; 470 | int x_y_sorce = 3; 471 | 472 | std::array nms_target_size = {1, outputChannels, POSE_MAX_PEOPLE, x_y_sorce}; 473 | std::array nms_source_size = {1, 57, 368, 496}; 474 | 475 | nmsCpu(nms_target_ptr, kernel_ptr, nms_source_ptr, threshold, nms_target_size, nms_source_size); 476 | 477 | for (int i=0; i < outputChannels*POSE_MAX_PEOPLE / 3; i++) { 478 | if (nms_target_ptr[i*3+2] > 0.1) 479 | printf("%f %f %f\n", nms_target_ptr[i*3], nms_target_ptr[i*3+1], nms_target_ptr[i*3+2]); 480 | } 481 | #endif 482 | 483 | output.classes = 1; 484 | 485 | 486 | #if 0 487 | for (unsigned int y = 0; y < output.height; y++) { 488 | for (unsigned int x = 0; x < output.width; x++) { 489 | float max_prob = -1; 490 | int &cls = output.class_map[y * output.width + x] = -1; 491 | for (unsigned int c = 0; c < output.classes; c++) { 492 | float prob = output.class_probability_map[c * output.width * output.height + y * output.width + x]; 493 | if (prob > max_prob && prob > m_SegmentationThreshold) { 494 | cls = c; 495 | max_prob = prob; 496 | } 497 | } 498 | } 499 | } 500 | #endif 501 | 502 | return NVDSINFER_SUCCESS; 503 | } 504 | 505 | void 506 | NvDsInferContextImpl::releaseFrameOutput(NvDsInferFrameOutput &frameOutput) { 507 | switch (m_NetworkType) { 508 | case NvDsInferNetworkType_Detector: 509 | for (unsigned int j = 0; j < frameOutput.detectionOutput.numObjects; j++) { 510 | free(frameOutput.detectionOutput.objects[j].label); 511 | } 512 | delete[] frameOutput.detectionOutput.objects; 513 | break; 514 | case NvDsInferNetworkType_Classifier: 515 | free(frameOutput.classificationOutput.label); 516 | delete[] frameOutput.classificationOutput.attributes; 517 | break; 518 | case NvDsInferNetworkType_Segmentation: 519 | delete[] frameOutput.segmentationOutput.class_map; 520 | break; 521 | default: 522 | break; 523 | } 524 | } 525 | -------------------------------------------------------------------------------- /libs/nvdsinfer/nvdsinfer_conversion.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * NVIDIA Corporation and its licensors retain all intellectual property 5 | * and proprietary rights in and to this software, related documentation 6 | * and any modifications thereto. 
Any use, reproduction, disclosure or 7 | * distribution of this software and related documentation without an express 8 | * license agreement from NVIDIA Corporation is strictly prohibited. 9 | * 10 | */ 11 | 12 | #include 13 | #include "nvdsinfer_conversion.h" 14 | 15 | #define THREADS_PER_BLOCK 32 16 | #define THREADS_PER_BLOCK_1 (THREADS_PER_BLOCK - 1) 17 | 18 | __global__ void 19 | NvDsInferConvert_CxToP3FloatKernel( 20 | float *outBuffer, 21 | unsigned char *inBuffer, 22 | unsigned int width, 23 | unsigned int height, 24 | unsigned int pitch, 25 | unsigned int inputPixelSize, 26 | float scaleFactor) 27 | { 28 | unsigned int row = blockIdx.y * blockDim.y + threadIdx.y; 29 | unsigned int col = blockIdx.x * blockDim.x + threadIdx.x; 30 | 31 | if (col < width && row < height) 32 | { 33 | for (unsigned int k = 0; k < 3; k++) 34 | { 35 | outBuffer[width * height * k + row * width + col] = 36 | scaleFactor * inBuffer[row * pitch + col * inputPixelSize + k]; 37 | } 38 | } 39 | } 40 | 41 | __global__ void 42 | NvDsInferConvert_CxToP3FloatKernelWithMeanSubtraction( 43 | float *outBuffer, 44 | unsigned char *inBuffer, 45 | unsigned int width, 46 | unsigned int height, 47 | unsigned int pitch, 48 | unsigned int inputPixelSize, 49 | float scaleFactor, 50 | float *meanDataBuffer) 51 | { 52 | unsigned int row = blockIdx.y * blockDim.y + threadIdx.y; 53 | unsigned int col = blockIdx.x * blockDim.x + threadIdx.x; 54 | 55 | if (col < width && row < height) 56 | { 57 | for (unsigned int k = 0; k < 3; k++) 58 | { 59 | outBuffer[width * height * k + row * width + col] = 60 | scaleFactor * ((float) inBuffer[row * pitch + col * inputPixelSize + k] - 61 | meanDataBuffer[(row * width * 3) + (col * 3) + k]); 62 | } 63 | } 64 | } 65 | 66 | __global__ void 67 | NvDsInferConvert_CxToP3RFloatKernel( 68 | float *outBuffer, 69 | unsigned char *inBuffer, 70 | unsigned int width, 71 | unsigned int height, 72 | unsigned int pitch, 73 | unsigned int inputPixelSize, 74 | float scaleFactor) 75 | { 76 | unsigned int row = blockIdx.y * blockDim.y + threadIdx.y; 77 | unsigned int col = blockIdx.x * blockDim.x + threadIdx.x; 78 | 79 | if (col < width && row < height) 80 | { 81 | for (unsigned int k = 0; k < 3; k++) 82 | { 83 | outBuffer[width * height * k + row * width + col] = 84 | scaleFactor * inBuffer[row * pitch + col * inputPixelSize + (2 - k)]; 85 | } 86 | } 87 | } 88 | 89 | __global__ void 90 | NvDsInferConvert_CxToP3RFloatKernelWithMeanSubtraction( 91 | float *outBuffer, 92 | unsigned char *inBuffer, 93 | unsigned int width, 94 | unsigned int height, 95 | unsigned int pitch, 96 | unsigned int inputPixelSize, 97 | float scaleFactor, 98 | float *meanDataBuffer) 99 | { 100 | unsigned int row = blockIdx.y * blockDim.y + threadIdx.y; 101 | unsigned int col = blockIdx.x * blockDim.x + threadIdx.x; 102 | 103 | if (col < width && row < height) 104 | { 105 | for (unsigned int k = 0; k < 3; k++) 106 | { 107 | outBuffer[width * height * k + row * width + col] = 108 | scaleFactor * ((float) inBuffer[row * pitch + col * inputPixelSize + (2 - k)] - 109 | meanDataBuffer[(row * width * 3) + (col * 3) + k]); 110 | } 111 | } 112 | } 113 | 114 | __global__ void 115 | NvDsInferConvert_C1ToP1FloatKernel( 116 | float *outBuffer, 117 | unsigned char *inBuffer, 118 | unsigned int width, 119 | unsigned int height, 120 | unsigned int pitch, 121 | float scaleFactor) 122 | { 123 | unsigned int row = blockIdx.y * blockDim.y + threadIdx.y; 124 | unsigned int col = blockIdx.x * blockDim.x + threadIdx.x; 125 | 126 | if (col < width && row < height) 
127 | { 128 | outBuffer[row * width + col] = scaleFactor * inBuffer[row * pitch + col]; 129 | } 130 | } 131 | 132 | __global__ void 133 | NvDsInferConvert_C1ToP1FloatKernelWithMeanSubtraction( 134 | float *outBuffer, 135 | unsigned char *inBuffer, 136 | unsigned int width, 137 | unsigned int height, 138 | unsigned int pitch, 139 | float scaleFactor, 140 | float *meanDataBuffer) 141 | { 142 | unsigned int row = blockIdx.y * blockDim.y + threadIdx.y; 143 | unsigned int col = blockIdx.x * blockDim.x + threadIdx.x; 144 | 145 | if (col < width && row < height) 146 | { 147 | outBuffer[row * width + col] = 148 | scaleFactor * ((float) inBuffer[row * pitch + col] - 149 | meanDataBuffer[(row * width) + col]); 150 | } 151 | } 152 | 153 | void 154 | NvDsInferConvert_C3ToP3Float( 155 | float *outBuffer, 156 | unsigned char *inBuffer, 157 | unsigned int width, 158 | unsigned int height, 159 | unsigned int pitch, 160 | float scaleFactor, 161 | float *meanDataBuffer, 162 | cudaStream_t stream) 163 | { 164 | dim3 threadsPerBlock(THREADS_PER_BLOCK, THREADS_PER_BLOCK); 165 | dim3 blocks((width+THREADS_PER_BLOCK_1)/threadsPerBlock.x, (height+THREADS_PER_BLOCK_1)/threadsPerBlock.y); 166 | 167 | if (meanDataBuffer == NULL) 168 | { 169 | NvDsInferConvert_CxToP3FloatKernel <<>> 170 | (outBuffer, inBuffer, width, height, pitch, 3, scaleFactor); 171 | } 172 | else 173 | { 174 | NvDsInferConvert_CxToP3FloatKernelWithMeanSubtraction <<>> 175 | (outBuffer, inBuffer, width, height, pitch, 3, scaleFactor, meanDataBuffer); 176 | } 177 | } 178 | 179 | void 180 | NvDsInferConvert_C4ToP3Float( 181 | float *outBuffer, 182 | unsigned char *inBuffer, 183 | unsigned int width, 184 | unsigned int height, 185 | unsigned int pitch, 186 | float scaleFactor, 187 | float *meanDataBuffer, 188 | cudaStream_t stream) 189 | { 190 | dim3 threadsPerBlock(THREADS_PER_BLOCK, THREADS_PER_BLOCK); 191 | dim3 blocks((width+THREADS_PER_BLOCK_1)/threadsPerBlock.x, (height+THREADS_PER_BLOCK_1)/threadsPerBlock.y); 192 | 193 | if (meanDataBuffer == NULL) 194 | { 195 | NvDsInferConvert_CxToP3FloatKernel <<>> 196 | (outBuffer, inBuffer, width, height, pitch, 4, scaleFactor); 197 | } 198 | else 199 | { 200 | NvDsInferConvert_CxToP3FloatKernelWithMeanSubtraction <<>> 201 | (outBuffer, inBuffer, width, height, pitch, 4, scaleFactor, meanDataBuffer); 202 | } 203 | } 204 | 205 | void 206 | NvDsInferConvert_C3ToP3RFloat( 207 | float *outBuffer, 208 | unsigned char *inBuffer, 209 | unsigned int width, 210 | unsigned int height, 211 | unsigned int pitch, 212 | float scaleFactor, 213 | float *meanDataBuffer, 214 | cudaStream_t stream) 215 | { 216 | dim3 threadsPerBlock(THREADS_PER_BLOCK, THREADS_PER_BLOCK); 217 | dim3 blocks((width+THREADS_PER_BLOCK_1)/threadsPerBlock.x, (height+THREADS_PER_BLOCK_1)/threadsPerBlock.y); 218 | 219 | if (meanDataBuffer == NULL) 220 | { 221 | NvDsInferConvert_CxToP3RFloatKernel <<>> 222 | (outBuffer, inBuffer, width, height, pitch, 3, scaleFactor); 223 | } 224 | else 225 | { 226 | NvDsInferConvert_CxToP3RFloatKernelWithMeanSubtraction <<>> 227 | (outBuffer, inBuffer, width, height, pitch, 3, scaleFactor, meanDataBuffer); 228 | } 229 | } 230 | 231 | void 232 | NvDsInferConvert_C4ToP3RFloat( 233 | float *outBuffer, 234 | unsigned char *inBuffer, 235 | unsigned int width, 236 | unsigned int height, 237 | unsigned int pitch, 238 | float scaleFactor, 239 | float *meanDataBuffer, 240 | cudaStream_t stream) 241 | { 242 | dim3 threadsPerBlock(THREADS_PER_BLOCK, THREADS_PER_BLOCK); 243 | dim3 
blocks((width+THREADS_PER_BLOCK_1)/threadsPerBlock.x, (height+THREADS_PER_BLOCK_1)/threadsPerBlock.y); 244 | 245 | if (meanDataBuffer == NULL) 246 | { 247 | NvDsInferConvert_CxToP3RFloatKernel <<>> 248 | (outBuffer, inBuffer, width, height, pitch, 4, scaleFactor); 249 | } 250 | else 251 | { 252 | NvDsInferConvert_CxToP3RFloatKernelWithMeanSubtraction <<>> 253 | (outBuffer, inBuffer, width, height, pitch, 4, scaleFactor, meanDataBuffer); 254 | } 255 | } 256 | 257 | void 258 | NvDsInferConvert_C1ToP1Float( 259 | float *outBuffer, 260 | unsigned char *inBuffer, 261 | unsigned int width, 262 | unsigned int height, 263 | unsigned int pitch, 264 | float scaleFactor, 265 | float *meanDataBuffer, 266 | cudaStream_t stream) 267 | { 268 | dim3 threadsPerBlock(THREADS_PER_BLOCK, THREADS_PER_BLOCK); 269 | dim3 blocks((width+THREADS_PER_BLOCK_1)/threadsPerBlock.x, (height+THREADS_PER_BLOCK_1)/threadsPerBlock.y); 270 | 271 | if (meanDataBuffer == NULL) 272 | { 273 | NvDsInferConvert_C1ToP1FloatKernel <<>> 274 | (outBuffer, inBuffer, width, height, pitch, scaleFactor); 275 | } 276 | else 277 | { 278 | NvDsInferConvert_C1ToP1FloatKernelWithMeanSubtraction <<>> 279 | (outBuffer, inBuffer, width, height, pitch, scaleFactor, meanDataBuffer); 280 | } 281 | 282 | } 283 | -------------------------------------------------------------------------------- /libs/nvdsinfer/nvdsinfer_conversion.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * NVIDIA Corporation and its licensors retain all intellectual property 5 | * and proprietary rights in and to this software, related documentation 6 | * and any modifications thereto. Any use, reproduction, disclosure or 7 | * distribution of this software and related documentation without an express 8 | * license agreement from NVIDIA Corporation is strictly prohibited. 9 | * 10 | */ 11 | 12 | /** 13 | * This is a header file for pre-processing cuda kernels with normalization and 14 | * mean subtraction required by nvdsinfer. 15 | */ 16 | #ifndef __NVDSINFER_CONVERSION_H__ 17 | #define __NVDSINFER_CONVERSION_H__ 18 | 19 | /** 20 | * Converts an input packed 3 channel buffer of width x height resolution into an 21 | * planar 3-channel float buffer of width x height resolution. The input buffer can 22 | * have a pitch > (width * 3). The cuda kernel supports normalization and mean 23 | * image subtraction. 24 | * 25 | * This kernel can be used for RGB -> RGB and BGR -> BGR conversions. 26 | * 27 | * @param outBuffer Cuda device buffer for planar float output. Should 28 | * be at least (width * height * 3 * sizeof(float)) bytes. 29 | * @param inBuffer Cuda device buffer for packed input. Should be 30 | * at least (pitch * height) bytes. 31 | * @param width Width of the buffers in pixels. 32 | * @param height Height of the buffers in pixels. 33 | * @param pitch Pitch of the input buffer in bytes. 34 | * @param scaleFactor Normalization factor. 35 | * @param meanDataBuffer Mean Image Data buffer. Should be at least 36 | * (width * height * 3 * sizeof(float)) bytes. 37 | * @param stream Cuda stream identifier. 
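 *
 * Illustrative usage sketch (the buffer names here are assumptions, not part
 * of this repo): with inBuffer holding a packed RGB frame on the device and
 * outBuffer sized for width * height * 3 floats,
 *   NvDsInferConvert_C3ToP3Float(outBuffer, inBuffer, width, height, pitch,
 *                                1.0f / 255.0f, NULL, stream);
 * scales pixels to [0,1] with no mean subtraction, queued on the given stream.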
38 | */ 39 | void 40 | NvDsInferConvert_C3ToP3Float( 41 | float *outBuffer, 42 | unsigned char *inBuffer, 43 | unsigned int width, 44 | unsigned int height, 45 | unsigned int pitch, 46 | float scaleFactor, 47 | float *meanDataBuffer, 48 | cudaStream_t stream); 49 | 50 | /** 51 | * Converts an input packed 3 channel buffer of width x height resolution into an 52 | * planar 3-channel float buffer of width x height resolution. The input buffer can 53 | * have a pitch > (width * 3). The cuda kernel supports normalization and mean 54 | * image subtraction. 55 | * 56 | * This kernel can be used for RGBA -> RGB and BGRx -> BGR conversions. 57 | * 58 | * @param outBuffer Cuda device buffer for planar float output. Should 59 | * be at least (width * height * 3 * sizeof(float)) bytes. 60 | * @param inBuffer Cuda device buffer for packed input. Should be 61 | * at least (pitch * height) bytes. 62 | * @param width Width of the buffers in pixels. 63 | * @param height Height of the buffers in pixels. 64 | * @param pitch Pitch of the input buffer in bytes. 65 | * @param scaleFactor Normalization factor. 66 | * @param meanDataBuffer Mean Image Data buffer. Should be at least 67 | * (width * height * 3 * sizeof(float)) bytes. 68 | * @param stream Cuda stream identifier. 69 | */ 70 | void 71 | NvDsInferConvert_C4ToP3Float( 72 | float *outBuffer, 73 | unsigned char *inBuffer, 74 | unsigned int width, 75 | unsigned int height, 76 | unsigned int pitch, 77 | float scaleFactor, 78 | float *meanDataBuffer, 79 | cudaStream_t stream); 80 | 81 | /** 82 | * Converts an input packed 3 channel buffer of width x height resolution into an 83 | * planar 3-channel float buffer of width x height resolution with plane order 84 | * reversed. The input buffer can have a pitch > (width * 3). The cuda kernel 85 | * supports normalization and mean image subtraction. 86 | * 87 | * This kernel can be used for BGR -> RGB and RGB -> BGR conversions. 88 | * 89 | * @param outBuffer Cuda device buffer for planar float output. Should 90 | * be at least (width * height * 3 * sizeof(float)) bytes. 91 | * @param inBuffer Cuda device buffer for packed input. Should be 92 | * at least (pitch * height) bytes. 93 | * @param width Width of the buffers in pixels. 94 | * @param height Height of the buffers in pixels. 95 | * @param pitch Pitch of the input buffer in bytes. 96 | * @param scaleFactor Normalization factor. 97 | * @param meanDataBuffer Mean Image Data buffer. Should be at least 98 | * (width * height * 3 * sizeof(float)) bytes. 99 | * @param stream Cuda stream identifier. 100 | */ 101 | void 102 | NvDsInferConvert_C3ToP3RFloat( 103 | float *outBuffer, 104 | unsigned char *inBuffer, 105 | unsigned int width, 106 | unsigned int height, 107 | unsigned int pitch, 108 | float scaleFactor, 109 | float *meanDataBuffer, 110 | cudaStream_t stream); 111 | 112 | /** 113 | * Converts an input packed 4 channel buffer of width x height resolution into an 114 | * planar 3-channel float buffer of width x height resolution with plane order 115 | * reversed. The input buffer can have a pitch > (width * 3). The cuda kernel 116 | * supports normalization and mean image subtraction. 117 | * 118 | * This kernel can be used for BGRx -> RGB and RGBA -> BGR conversions. 119 | * 120 | * @param outBuffer Cuda device buffer for planar float output. Should 121 | * be at least (width * height * 3 * sizeof(float)) bytes. 122 | * @param inBuffer Cuda device buffer for packed input. Should be 123 | * at least (pitch * height) bytes. 
124 | * @param width Width of the buffers in pixels. 125 | * @param height Height of the buffers in pixels. 126 | * @param pitch Pitch of the input buffer in bytes. 127 | * @param scaleFactor Normalization factor. 128 | * @param meanDataBuffer Mean Image Data buffer. Should be at least 129 | * (width * height * 3 * sizeof(float)) bytes. 130 | * @param stream Cuda stream identifier. 131 | */ 132 | void 133 | NvDsInferConvert_C4ToP3RFloat( 134 | float *outBuffer, 135 | unsigned char *inBuffer, 136 | unsigned int width, 137 | unsigned int height, 138 | unsigned int pitch, 139 | float scaleFactor, 140 | float *meanDataBuffer, 141 | cudaStream_t stream); 142 | 143 | /** 144 | * Converts an 1 channel UINT8 input of width x height resolution into an 145 | * 1 channel float buffer of width x height resolution. The input buffer can 146 | * have a pitch > width . The cuda kernel supports normalization and mean 147 | * image subtraction. 148 | * 149 | * @param outBuffer Cuda device buffer for float output. Should 150 | * be at least (width * height * sizeof(float)) bytes. 151 | * @param inBuffer Cuda device buffer for UINT8 input. Should be 152 | * at least (pitch * height) bytes. 153 | * @param width Width of the buffers in pixels. 154 | * @param height Height of the buffers in pixels. 155 | * @param pitch Pitch of the input buffer in bytes. 156 | * @param scaleFactor Normalization factor. 157 | * @param meanDataBuffer Mean Image Data buffer. Should be at least 158 | * (width * height * sizeof(float)) bytes. 159 | * @param stream Cuda stream identifier. 160 | */ 161 | void 162 | NvDsInferConvert_C1ToP1Float( 163 | float *outBuffer, 164 | unsigned char *inBuffer, 165 | unsigned int width, 166 | unsigned int height, 167 | unsigned int pitch, 168 | float scaleFactor, 169 | float *meanDataBuffer, 170 | cudaStream_t stream); 171 | 172 | 173 | /** 174 | * Function pointer type to which any of the NvDsInferConvert functions can be 175 | * assigned. 176 | */ 177 | typedef void (* NvDsInferConvertFcn)( 178 | float *outBuffer, 179 | unsigned char *inBuffer, 180 | unsigned int width, 181 | unsigned int height, 182 | unsigned int pitch, 183 | float scaleFactor, 184 | float *meanDataBuffer, 185 | cudaStream_t stream); 186 | 187 | #endif /* __NVDSINFER_CONVERSION_H__ */ 188 | -------------------------------------------------------------------------------- /libs/nvdsinfer/resize_merge_cpu.cpp: -------------------------------------------------------------------------------- 1 | #include "resize_merge_cpu.h" 2 | #include 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #define UNUSED(unusedVariable) (void)(unusedVariable) 10 | 11 | template 12 | void resizeAndMergeCpu(T* targetPtr, const std::vector& sourcePtrs, 13 | const std::array& targetSize, 14 | const std::vector>& sourceSizes, 15 | const std::vector& scaleInputToNetInputs) 16 | { 17 | try 18 | { 19 | // Scale used in CUDA/CL to know scale ratio between input and output 20 | // CPU directly uses sourceWidth/Height and targetWidth/Height 21 | UNUSED(scaleInputToNetInputs); 22 | 23 | // Sanity check 24 | if (sourceSizes.empty()) 25 | printf("sourceSizes cannot be empty. 
%d, %s, %s\n", __LINE__, __FUNCTION__, __FILE__); 26 | 27 | // Params 28 | const auto nums = (signed)sourceSizes.size(); 29 | const auto channels = targetSize[1]; // 57 30 | const auto targetHeight = targetSize[2]; // 368 31 | const auto targetWidth = targetSize[3]; // 496 32 | const auto targetChannelOffset = targetWidth * targetHeight; 33 | 34 | // No multi-scale merging or no merging required 35 | if (sourceSizes.size() == 1) 36 | { 37 | // Params 38 | const auto& sourceSize = sourceSizes[0]; 39 | const auto sourceHeight = sourceSize[2]; // 368/8 .. 40 | const auto sourceWidth = sourceSize[3]; // 496/8 .. 41 | const auto sourceChannelOffset = sourceHeight * sourceWidth; 42 | if (sourceSize[0] != 1) 43 | printf("It should never reache this point. Notify us otherwise. %d, %s, %s\n", 44 | __LINE__, __FUNCTION__, __FILE__); 45 | 46 | // Per channel resize 47 | const T* sourcePtr = sourcePtrs[0]; 48 | for (auto c = 0 ; c < channels ; c++) 49 | { 50 | cv::Mat source(cv::Size(sourceWidth, sourceHeight), CV_32FC1, 51 | const_cast(&sourcePtr[c*sourceChannelOffset])); 52 | cv::Mat target(cv::Size(targetWidth, targetHeight), CV_32FC1, 53 | (&targetPtr[c*targetChannelOffset])); 54 | cv::resize(source, target, {targetWidth, targetHeight}, 0, 0, cv::INTER_CUBIC); 55 | } 56 | } 57 | // Multi-scale merging 58 | else 59 | { 60 | // Construct temp targets. We resuse targetPtr to store first scale 61 | std::vector> tempTargetPtrs; 62 | for (auto n = 1; n < nums; n++){ 63 | tempTargetPtrs.emplace_back(std::unique_ptr(new T[targetChannelOffset * channels]())); 64 | } 65 | 66 | // Resize and sum 67 | for (auto n = 0; n < nums; n++){ 68 | 69 | // Params 70 | const auto& sourceSize = sourceSizes[n]; 71 | const auto sourceHeight = sourceSize[2]; // 368/6 .. 72 | const auto sourceWidth = sourceSize[3]; // 496/8 .. 
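// Note: in this multi-scale branch every source map is resized to the target
// resolution; scale 0 writes directly into targetPtr, later scales go into
// temporaries and are added onto it, and the sums are divided by the number
// of scales further below to produce the averaged heatmaps.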
73 | const auto sourceChannelOffset = sourceHeight * sourceWidth; 74 | 75 | // Access pointers 76 | const T* sourcePtr = sourcePtrs[n]; 77 | T* tempTargetPtr; 78 | if (n != 0) 79 | tempTargetPtr = tempTargetPtrs[n-1].get(); 80 | else 81 | tempTargetPtr = targetPtr; 82 | 83 | T* firstTempTargetPtr = targetPtr; 84 | for (auto c = 0 ; c < channels ; c++) 85 | { 86 | // Resize 87 | cv::Mat source(cv::Size(sourceWidth, sourceHeight), CV_32FC1, 88 | const_cast(&sourcePtr[c*sourceChannelOffset])); 89 | cv::Mat target(cv::Size(targetWidth, targetHeight), CV_32FC1, 90 | (&tempTargetPtr[c*targetChannelOffset])); 91 | cv::resize(source, target, {targetWidth, targetHeight}, 0, 0, cv::INTER_CUBIC); 92 | 93 | // Add 94 | if (n != 0) 95 | { 96 | cv::Mat addTarget(cv::Size(targetWidth, targetHeight), CV_32FC1, 97 | (&firstTempTargetPtr[c*targetChannelOffset])); 98 | cv::add(target, addTarget, addTarget); 99 | } 100 | } 101 | } 102 | 103 | // Average 104 | for (auto c = 0 ; c < channels ; c++) 105 | { 106 | cv::Mat target(cv::Size(targetWidth, targetHeight), CV_32FC1, (&targetPtr[c*targetChannelOffset])); 107 | target /= (float)nums; 108 | } 109 | 110 | } 111 | } 112 | catch (const std::exception& e) 113 | { 114 | printf("exception: %s, %d, %s, %s\n", e.what(), __LINE__, __FUNCTION__, __FILE__); 115 | } 116 | } 117 | 118 | template void resizeAndMergeCpu( 119 | float* targetPtr, const std::vector& sourcePtrs, const std::array& targetSize, 120 | const std::vector>& sourceSizes, const std::vector& scaleInputToNetInputs); 121 | template void resizeAndMergeCpu( 122 | double* targetPtr, const std::vector& sourcePtrs, const std::array& targetSize, 123 | const std::vector>& sourceSizes, const std::vector& scaleInputToNetInputs); 124 | -------------------------------------------------------------------------------- /libs/nvdsinfer/resize_merge_cpu.h: -------------------------------------------------------------------------------- 1 | #ifndef RESIZE_MERGE_CPU_H 2 | #define RESIZE_MERGE_CPU_H 3 | 4 | #include 5 | #include 6 | 7 | template 8 | void resizeAndMergeCpu( 9 | T* targetPtr, const std::vector& sourcePtrs, const std::array& targetSize, 10 | const std::vector>& sourceSizes, const std::vector& scaleInputToNetInputs = {1.f}); 11 | 12 | #endif // RESIZE_MERGE_CPU_H 13 | -------------------------------------------------------------------------------- /openpose_app/COCO_val2014_000000000564.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cding-nv/deepstream-openpose/6db238cd4490655e2768bafed3ca4b8f5f778d25/openpose_app/COCO_val2014_000000000564.jpg -------------------------------------------------------------------------------- /openpose_app/COCO_val2014_000000000569.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cding-nv/deepstream-openpose/6db238cd4490655e2768bafed3ca4b8f5f778d25/openpose_app/COCO_val2014_000000000569.jpg -------------------------------------------------------------------------------- /openpose_app/Makefile: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 
3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a 5 | # copy of this software and associated documentation files (the "Software"), 6 | # to deal in the Software without restriction, including without limitation 7 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | # and/or sell copies of the Software, and to permit persons to whom the 9 | # Software is furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in 12 | # all copies or substantial portions of the Software. 13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | # DEALINGS IN THE SOFTWARE. 21 | ################################################################################ 22 | 23 | APP:= openpose-app 24 | 25 | TARGET_DEVICE = $(shell gcc -dumpmachine | cut -f1 -d -) 26 | 27 | NVDS_VERSION:=4.0 28 | 29 | LIB_INSTALL_DIR?=/opt/nvidia/deepstream/deepstream-$(NVDS_VERSION)/lib/ 30 | 31 | ifeq ($(TARGET_DEVICE),aarch64) 32 | CFLAGS:= -DPLATFORM_TEGRA 33 | endif 34 | 35 | SRCS:= $(wildcard *.c) 36 | 37 | INCS:= $(wildcard *.h) 38 | 39 | PKGS:= gstreamer-1.0 40 | 41 | OBJS:= $(SRCS:.c=.o) 42 | 43 | CFLAGS+= -I/opt/nvidia/deepstream/deepstream-4.0/sources/includes 44 | 45 | CFLAGS+= `pkg-config --cflags $(PKGS)` 46 | 47 | LIBS:= `pkg-config --libs $(PKGS)` 48 | 49 | LIBS+= -lm -L$(LIB_INSTALL_DIR) -lnvdsgst_helper -lnvdsgst_meta \ 50 | -Wl,-rpath,$(LIB_INSTALL_DIR) 51 | 52 | all: $(APP) 53 | 54 | %.o: %.c $(INCS) Makefile 55 | $(CC) -c -o $@ $(CFLAGS) $< 56 | 57 | $(APP): $(OBJS) Makefile 58 | $(CC) -o $(APP) $(OBJS) $(LIBS) 59 | 60 | clean: 61 | rm -rf $(OBJS) $(APP) 62 | 63 | 64 | -------------------------------------------------------------------------------- /openpose_app/README: -------------------------------------------------------------------------------- 1 | ***************************************************************************** 2 | * Copyright (c) 2019 NVIDIA Corporation. All rights reserved. 3 | * 4 | * NVIDIA Corporation and its licensors retain all intellectual property 5 | * and proprietary rights in and to this software, related documentation 6 | * and any modifications thereto. Any use, reproduction, disclosure or 7 | * distribution of this software and related documentation without an express 8 | * license agreement from NVIDIA Corporation is strictly prohibited. 9 | ***************************************************************************** 10 | 11 | Prequisites: 12 | 13 | Please follow instructions in the apps/sample_apps/deepstream-app/README on how 14 | to install the prequisites for Deepstream SDK, the DeepStream SDK itself and the 15 | apps. 
16 | 17 | Pipeline: 18 | filesrc -> jpegparse -> nvv4l2decoder -> nvstreammux -> nvinfer (segmentation) 19 | nvsegvidsual -> nvmultistreamtiler -> (nvegltransform) -> nveglglessink 20 | -------------------------------------------------------------------------------- /openpose_app/nvinfer_config.txt: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a 5 | # copy of this software and associated documentation files (the "Software"), 6 | # to deal in the Software without restriction, including without limitation 7 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | # and/or sell copies of the Software, and to permit persons to whom the 9 | # Software is furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in 12 | # all copies or substantial portions of the Software. 13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | # DEALINGS IN THE SOFTWARE. 21 | ################################################################################ 22 | 23 | # Following properties are mandatory when engine files are not specified: 24 | # int8-calib-file(Only in INT8), model-file-format 25 | # Caffemodel mandatory properties: model-file, proto-file, output-blob-names 26 | # UFF: uff-file, input-dims, uff-input-blob-name, output-blob-names 27 | # ONNX: onnx-file 28 | # 29 | # Mandatory properties for detectors: 30 | # num-detected-classes 31 | # 32 | # Optional properties for detectors: 33 | # enable-dbscan(Default=false), interval(Primary mode only, Default=0) 34 | # custom-lib-path, 35 | # parse-bbox-func-name 36 | # 37 | # Mandatory properties for classifiers: 38 | # classifier-threshold, is-classifier 39 | # 40 | # Optional properties for classifiers: 41 | # classifier-async-mode(Secondary mode only, Default=false) 42 | # 43 | # Optional properties in secondary mode: 44 | # operate-on-gie-id(Default=0), operate-on-class-ids(Defaults to all classes), 45 | # input-object-min-width, input-object-min-height, input-object-max-width, 46 | # input-object-max-height 47 | # 48 | # Following properties are always recommended: 49 | # batch-size(Default=1) 50 | # 51 | # Other optional properties: 52 | # net-scale-factor(Default=1), network-mode(Default=0 i.e FP32), 53 | # model-color-format(Default=0 i.e. RGB) model-engine-file, labelfile-path, 54 | # mean-file, gie-unique-id(Default=0), offsets, gie-mode (Default=1 i.e. primary), 55 | # custom-lib-path, network-mode(Default=0 i.e FP32) 56 | # 57 | # The values in the config file are overridden by values set through GObject 58 | # properties. 
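#
# Notes on this OpenPose config (see [property] below): the Caffe model takes
# BGR input (model-color-format=1), scaled by 1/256 (net-scale-factor=0.00390625)
# after subtracting 128 per channel (offsets=128;128;128). network-type=2 makes
# nvinfer treat the net_output blob as segmentation output, so the patched
# fillSegmentationOutput() in libs/nvdsinfer receives the 57-channel heatmaps.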
59 | 60 | [property] 61 | gpu-id=0 62 | net-scale-factor=0.00390625 63 | #net-scale-factor=0.003921568627451 64 | #net-scale-factor=0.007843137254902 65 | offsets=128;128;128 66 | model-color-format=1 67 | model-file=/home/nvidia/openpose/models/pose/coco/pose_iter_440000.caffemodel 68 | proto-file=/home/nvidia/openpose/models/pose/coco/pose_deploy_linevec.prototxt 69 | #model-engine-file=/home/nvidia/openpose/models/pose/coco/pose_iter_440000.caffemodel_b1_fp32.engine 70 | batch-size=1 71 | ## 0=FP32, 1=INT8, 2=FP16 mode 72 | #network-mode=0 73 | num-detected-classes=4 74 | interval=0 75 | gie-unique-id=1 76 | network-type=2 77 | output-blob-names=net_output 78 | segmentation-threshold=0.0 79 | #parse-bbox-func-name=NvDsInferParseCustomSSD 80 | #custom-lib-path=nvdsinfer_custom_impl_ssd/libnvdsinfer_custom_impl_ssd.so 81 | 82 | [class-attrs-all] 83 | roi-top-offset=0 84 | roi-bottom-offset=0 85 | detected-min-w=0 86 | detected-min-h=0 87 | detected-max-w=0 88 | detected-max-h=0 89 | 90 | ## Per class configuration 91 | #[class-attrs-2] 92 | #threshold=0.6 93 | #roi-top-offset=20 94 | #roi-bottom-offset=10 95 | #detected-min-w=40 96 | #detected-min-h=40 97 | #detected-max-w=400 98 | #detected-max-h=800 99 | -------------------------------------------------------------------------------- /openpose_app/openpose_app.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a 5 | * copy of this software and associated documentation files (the "Software"), 6 | * to deal in the Software without restriction, including without limitation 7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | * and/or sell copies of the Software, and to permit persons to whom the 9 | * Software is furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | * DEALINGS IN THE SOFTWARE. 21 | */ 22 | 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | #include "gstnvdsmeta.h" 32 | #ifndef PLATFORM_TEGRA 33 | #include "gst-nvmessage.h" 34 | #endif 35 | 36 | /* The muxer output resolution must be set if the input streams will be of 37 | * different resolution. The muxer will scale all the input frames to this 38 | * resolution. */ 39 | #define MUXER_OUTPUT_WIDTH 1280 40 | #define MUXER_OUTPUT_HEIGHT 720 41 | 42 | /* Muxer batch formation timeout, for e.g. 40 millisec. Should ideally be set 43 | * based on the fastest source's framerate. 
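 * (Here it is set to 4000000 usec, i.e. 4 seconds, far larger than a typical
 * video timeout; presumably a deliberately generous value for single-JPEG
 * input. See the define below.)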
*/ 44 | #define MUXER_BATCH_TIMEOUT_USEC 4000000 45 | 46 | #define TILED_OUTPUT_WIDTH 1280 47 | #define TILED_OUTPUT_HEIGHT 720 48 | 49 | /* tiler_sink_pad_buffer_probe will extract metadata received on segmentation 50 | * src pad */ 51 | static GstPadProbeReturn 52 | tiler_src_pad_buffer_probe (GstPad * pad, GstPadProbeInfo * info, 53 | gpointer u_data) 54 | { 55 | GstBuffer *buf = (GstBuffer *) info->data; 56 | NvDsMetaList * l_frame = NULL; 57 | NvDsBatchMeta *batch_meta = gst_buffer_get_nvds_batch_meta (buf); 58 | 59 | for (l_frame = batch_meta->frame_meta_list; l_frame != NULL; 60 | l_frame = l_frame->next) { 61 | // TODO: 62 | } 63 | return GST_PAD_PROBE_OK; 64 | } 65 | 66 | static gboolean 67 | bus_call (GstBus * bus, GstMessage * msg, gpointer data) 68 | { 69 | GMainLoop *loop = (GMainLoop *) data; 70 | switch (GST_MESSAGE_TYPE (msg)) { 71 | case GST_MESSAGE_EOS: 72 | g_print ("End of stream\n"); 73 | // Add the delay to show the result 74 | usleep(100000000); 75 | g_main_loop_quit (loop); 76 | break; 77 | case GST_MESSAGE_WARNING: 78 | { 79 | gchar *debug; 80 | GError *error; 81 | gst_message_parse_warning (msg, &error, &debug); 82 | g_printerr ("WARNING from element %s: %s\n", 83 | GST_OBJECT_NAME (msg->src), error->message); 84 | g_free (debug); 85 | g_printerr ("Warning: %s\n", error->message); 86 | g_error_free (error); 87 | break; 88 | } 89 | case GST_MESSAGE_ERROR: 90 | { 91 | gchar *debug; 92 | GError *error; 93 | gst_message_parse_error (msg, &error, &debug); 94 | g_printerr ("ERROR from element %s: %s\n", 95 | GST_OBJECT_NAME (msg->src), error->message); 96 | if (debug) 97 | g_printerr ("Error details: %s\n", debug); 98 | g_free (debug); 99 | g_error_free (error); 100 | g_main_loop_quit (loop); 101 | break; 102 | } 103 | #ifndef PLATFORM_TEGRA 104 | case GST_MESSAGE_ELEMENT: 105 | { 106 | if (gst_nvmessage_is_stream_eos (msg)) { 107 | guint stream_id; 108 | if (gst_nvmessage_parse_stream_eos (msg, &stream_id)) { 109 | g_print ("Got EOS from stream %d\n", stream_id); 110 | } 111 | } 112 | break; 113 | } 114 | #endif 115 | default: 116 | break; 117 | } 118 | return TRUE; 119 | } 120 | 121 | static GstElement * 122 | create_source_bin (guint index, gchar * uri) 123 | { 124 | GstElement *bin = NULL; 125 | gchar bin_name[16] = { }; 126 | 127 | g_snprintf (bin_name, 15, "source-bin-%02d", index); 128 | /* Create a source GstBin to abstract this bin's content from the rest of the 129 | * pipeline */ 130 | bin = gst_bin_new (bin_name); 131 | 132 | GstElement *source, *jpegparser, *decoder; 133 | 134 | source = gst_element_factory_make ("filesrc", "source"); 135 | 136 | jpegparser = gst_element_factory_make ("jpegparse", "jpeg-parser"); 137 | 138 | decoder = gst_element_factory_make ("nvv4l2decoder", "nvv4l2-decoder"); 139 | 140 | if (!source || !jpegparser || !decoder) 141 | { 142 | g_printerr ("One element could not be created. Exiting.\n"); 143 | return NULL; 144 | } 145 | g_object_set (G_OBJECT (source), "location", uri, NULL); 146 | const char *dot = strrchr(uri, '.'); 147 | if ((!strcmp (dot+1, "mjpeg")) || (!strcmp (dot+1, "mjpg"))) 148 | { 149 | #ifdef PLATFORM_TEGRA 150 | g_object_set (G_OBJECT (decoder), "mjpeg", 1, NULL); 151 | #endif 152 | } 153 | 154 | gst_bin_add_many (GST_BIN (bin), source, jpegparser, decoder, NULL); 155 | 156 | gst_element_link_many (source, jpegparser, decoder, NULL); 157 | 158 | /* We need to create a ghost pad for the source bin which will act as a proxy 159 | * for the video decoder src pad. 
The ghost pad will not have a target right 160 | * now. Once the decode bin creates the video decoder and generates the 161 | * cb_newpad callback, we will set the ghost pad target to the video decoder 162 | * src pad. */ 163 | if (!gst_element_add_pad (bin, gst_ghost_pad_new_no_target ("src", 164 | GST_PAD_SRC))) { 165 | g_printerr ("Failed to add ghost pad in source bin\n"); 166 | return NULL; 167 | } 168 | 169 | GstPad *srcpad = gst_element_get_static_pad (decoder, "src"); 170 | if (!srcpad) { 171 | g_printerr ("Failed to get src pad of source bin. Exiting.\n"); 172 | return NULL; 173 | } 174 | GstPad *bin_ghost_pad = gst_element_get_static_pad (bin, "src"); 175 | if (!gst_ghost_pad_set_target (GST_GHOST_PAD (bin_ghost_pad), 176 | srcpad)) { 177 | g_printerr ("Failed to link decoder src pad to source bin ghost pad\n"); 178 | } 179 | 180 | return bin; 181 | } 182 | 183 | int 184 | main (int argc, char *argv[]) 185 | { 186 | GMainLoop *loop = NULL; 187 | GstElement *pipeline = NULL, *streammux = NULL, *sink = NULL, *seg = NULL, 188 | *nvsegvisual = NULL, *tiler = NULL; 189 | #ifdef PLATFORM_TEGRA 190 | GstElement *transform = NULL; 191 | #endif 192 | GstBus *bus = NULL; 193 | guint bus_watch_id; 194 | GstPad *seg_src_pad = NULL; 195 | guint i, num_sources; 196 | guint tiler_rows, tiler_columns; 197 | guint pgie_batch_size; 198 | 199 | /* Check input arguments */ 200 | if (argc < 3) { 201 | g_printerr ("Usage: %s config_file \n", argv[0]); 202 | return -1; 203 | } 204 | num_sources = argc - 2; 205 | 206 | /* Standard GStreamer initialization */ 207 | gst_init (&argc, &argv); 208 | loop = g_main_loop_new (NULL, FALSE); 209 | 210 | /* Create gstreamer elements */ 211 | /* Create Pipeline element that will form a connection of other elements */ 212 | pipeline = gst_pipeline_new ("dstest-image-decode-pipeline"); 213 | 214 | /* Create nvstreammux instance to form batches from one or more sources. */ 215 | streammux = gst_element_factory_make ("nvstreammux", "stream-muxer"); 216 | 217 | if (!pipeline || !streammux) { 218 | g_printerr ("One element could not be created. Exiting.\n"); 219 | return -1; 220 | } 221 | gst_bin_add (GST_BIN (pipeline), streammux); 222 | 223 | for (i = 0; i < num_sources; i++) { 224 | GstPad *sinkpad, *srcpad; 225 | gchar pad_name[16] = { }; 226 | GstElement *source_bin = create_source_bin (i, argv[i + 2]); 227 | 228 | if (!source_bin) { 229 | g_printerr ("Failed to create source bin. Exiting.\n"); 230 | return -1; 231 | } 232 | 233 | gst_bin_add (GST_BIN (pipeline), source_bin); 234 | 235 | g_snprintf (pad_name, 15, "sink_%u", i); 236 | sinkpad = gst_element_get_request_pad (streammux, pad_name); 237 | if (!sinkpad) { 238 | g_printerr ("Streammux request sink pad failed. Exiting.\n"); 239 | return -1; 240 | } 241 | 242 | srcpad = gst_element_get_static_pad (source_bin, "src"); 243 | if (!srcpad) { 244 | g_printerr ("Failed to get src pad of source bin. Exiting.\n"); 245 | return -1; 246 | } 247 | 248 | if (gst_pad_link (srcpad, sinkpad) != GST_PAD_LINK_OK) { 249 | g_printerr ("Failed to link source bin to stream muxer. Exiting.\n"); 250 | return -1; 251 | } 252 | 253 | gst_object_unref (srcpad); 254 | gst_object_unref (sinkpad); 255 | } 256 | 257 | /* Use nvinfer to infer on batched frame. 
*/ 258 | seg = gst_element_factory_make ("nvinfer", "primary-nvinference-engine"); 259 | 260 | nvsegvisual = gst_element_factory_make ("nvsegvisual", "nvsegvisual"); 261 | 262 | /* Use nvtiler to composite the batched frames into a 2D tiled array based 263 | * on the source of the frames. */ 264 | tiler = gst_element_factory_make ("nvmultistreamtiler", "nvtiler"); 265 | 266 | #ifdef PLATFORM_TEGRA 267 | transform = gst_element_factory_make ("nvegltransform", "transform"); 268 | #endif 269 | 270 | sink = gst_element_factory_make ("nveglglessink", "nvvideo-renderer"); 271 | 272 | if (!seg || !nvsegvisual || !tiler || !sink) { 273 | g_printerr ("One element could not be created. Exiting.\n"); 274 | return -1; 275 | } 276 | 277 | #ifdef PLATFORM_TEGRA 278 | if(!transform) { 279 | g_printerr ("One tegra element could not be created. Exiting.\n"); 280 | return -1; 281 | } 282 | #endif 283 | 284 | g_object_set (G_OBJECT (streammux), "width", MUXER_OUTPUT_WIDTH, "height", 285 | MUXER_OUTPUT_HEIGHT, "batch-size", num_sources, 286 | "batched-push-timeout", MUXER_BATCH_TIMEOUT_USEC, NULL); 287 | 288 | /* Configure the nvinfer element using the nvinfer config file. */ 289 | g_object_set (G_OBJECT (seg), "config-file-path", argv[1], NULL); 290 | 291 | /* Override the batch-size set in the config file with the number of sources. */ 292 | g_object_get (G_OBJECT (seg), "batch-size", &pgie_batch_size, NULL); 293 | if (pgie_batch_size != num_sources) { 294 | g_printerr 295 | ("WARNING: Overriding infer-config batch-size (%d) with number of sources (%d)\n", 296 | pgie_batch_size, num_sources); 297 | g_object_set (G_OBJECT (seg), "batch-size", num_sources, NULL); 298 | } 299 | 300 | g_object_set (G_OBJECT (nvsegvisual), "batch-size", num_sources, NULL); 301 | g_object_set (G_OBJECT (nvsegvisual), "width", 496, NULL); 302 | g_object_set (G_OBJECT (nvsegvisual), "height", 368, NULL); 303 | 304 | tiler_rows = (guint) sqrt (num_sources); 305 | tiler_columns = (guint) ceil (1.0 * num_sources / tiler_rows); 306 | /* we set the tiler properties here */ 307 | g_object_set (G_OBJECT (tiler), "rows", tiler_rows, "columns", tiler_columns, 308 | "width", TILED_OUTPUT_WIDTH, "height", TILED_OUTPUT_HEIGHT, NULL); 309 | 310 | g_object_set(G_OBJECT(sink), "async", FALSE, NULL); 311 | 312 | /* we add a message handler */ 313 | bus = gst_pipeline_get_bus (GST_PIPELINE (pipeline)); 314 | bus_watch_id = gst_bus_add_watch (bus, bus_call, loop); 315 | gst_object_unref (bus); 316 | 317 | /* Set up the pipeline */ 318 | /* Add all elements into the pipeline */ 319 | #ifdef PLATFORM_TEGRA 320 | gst_bin_add_many (GST_BIN (pipeline), seg, nvsegvisual, tiler, transform, sink, NULL); 321 | /* we link the elements together 322 | * nvstreammux -> nvinfer -> nvsegvidsual -> nvtiler -> transform -> video-renderer */ 323 | if (!gst_element_link_many (streammux, seg, nvsegvisual, tiler, transform, sink, NULL)) 324 | { 325 | g_printerr ("Elements could not be linked. Exiting.\n"); 326 | return -1; 327 | } 328 | #else 329 | gst_bin_add_many (GST_BIN (pipeline), seg, nvsegvisual, tiler, sink, NULL); 330 | /* Link the elements together 331 | * nvstreammux -> nvinfer -> nvsegvisual -> nvtiler -> video-renderer */ 332 | if (!gst_element_link_many (streammux, seg, nvsegvisual, tiler, sink, NULL)) { 333 | g_printerr ("Elements could not be linked. 
Exiting.\n"); 334 | return -1; 335 | } 336 | #endif 337 | 338 | /* Lets add probe to get informed of the meta data generated, we add probe to 339 | * the src pad of the nvseg element, since by that time, the buffer would have 340 | * had got all the segmentation metadata. */ 341 | seg_src_pad = gst_element_get_static_pad (seg, "src"); 342 | if (!seg_src_pad) 343 | g_print ("Unable to get src pad\n"); 344 | else 345 | gst_pad_add_probe (seg_src_pad, GST_PAD_PROBE_TYPE_BUFFER, 346 | tiler_src_pad_buffer_probe, NULL, NULL); 347 | 348 | /* Set the pipeline to "playing" state */ 349 | g_print ("Now playing:"); 350 | for (i = 0; i < num_sources; i++) { 351 | g_print (" %s,", argv[i + 2]); 352 | } 353 | g_print ("\n"); 354 | gst_element_set_state (pipeline, GST_STATE_PLAYING); 355 | 356 | /* Wait till pipeline encounters an error or EOS */ 357 | g_print ("Running...\n"); 358 | g_main_loop_run (loop); 359 | 360 | /* Out of the main loop, clean up nicely */ 361 | g_print ("Returned, stopping playback\n"); 362 | gst_element_set_state (pipeline, GST_STATE_NULL); 363 | g_print ("Deleting pipeline\n"); 364 | gst_object_unref (GST_OBJECT (pipeline)); 365 | g_source_remove (bus_watch_id); 366 | g_main_loop_unref (loop); 367 | return 0; 368 | } 369 | -------------------------------------------------------------------------------- /todo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cding-nv/deepstream-openpose/6db238cd4490655e2768bafed3ca4b8f5f778d25/todo.jpg --------------------------------------------------------------------------------