├── .DS_Store
├── .gitignore
├── CMakeLists.txt
├── README.md
├── cmake_install.cmake
├── common.cpp
├── common.h
├── cudaUtility.h
├── imageBuffer.h
├── kernel.cu
├── main.cpp
├── mathFunctions.cpp
├── mathFunctions.cu
├── mathFunctions.h
├── model
│   └── pelee
│       ├── pelee_deploy_iplugin.prototxt
│       ├── pelee_merged.caffemodel
│       └── pelee_merged.prototxt
├── pluginImplement.cpp
├── pluginImplement.h
├── tensorNet.cpp
├── tensorNet.h
├── testPic
│   └── test.png
├── testVideo
│   └── test.avi
└── util
    ├── cuda
    │   ├── cudaMappedMemory.h
    │   ├── cudaNormalize.cu
    │   ├── cudaNormalize.h
    │   ├── cudaOverlay.cu
    │   ├── cudaOverlay.h
    │   ├── cudaRGB.cu
    │   ├── cudaRGB.h
    │   ├── cudaResize.cu
    │   ├── cudaResize.h
    │   ├── cudaUtility.h
    │   ├── cudaYUV-NV12.cu
    │   ├── cudaYUV-YUYV.cu
    │   ├── cudaYUV-YV12.cu
    │   └── cudaYUV.h
    ├── loadImage.cpp
    └── loadImage.h

/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eric612/Pelee-Seg-TensorRT/05bf0b31c5891adaf64f40b784ef4a1927d68862/.DS_Store
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | ## General
2 | jobs/*
3 | CMakeFiles/*
4 | 3rdparty/*
5 | cmake/*
6 | tools/*
7 | lib/*
8 | build/*
9 | 
10 | # Compiled Object files
11 | *.slo
12 | *.lo
13 | *.o
14 | *.cuo
15 | 
16 | # Compiled Dynamic libraries
17 | *.so
18 | *.dylib
19 | 
20 | # Compiled Static libraries
21 | *.lai
22 | *.la
23 | *.a
24 | 
25 | # Compiled protocol buffers
26 | *.pb.h
27 | *.pb.cc
28 | *_pb2.py
29 | 
30 | # Compiled python
31 | *.pyc
32 | 
33 | # Compiled MATLAB
34 | *.mex*
35 | 
36 | # IPython notebook checkpoints
37 | .ipynb_checkpoints
38 | 
39 | # Editor temporaries
40 | *.swp
41 | *~
42 | 
43 | # Sublime Text settings
44 | *.sublime-workspace
45 | *.sublime-project
46 | 
47 | # Eclipse Project settings
48 | *.*project
49 | .settings
50 | 
51 | # QtCreator files
52 | *.user
53 | 
54 | # PyCharm files
55 | .idea
56 | 
57 | # OSX dir files
58 | .DS_Store
59 | 
60 | ## Caffe
61 | 
62 | # User's build configuration
63 | Makefile.config
64 | Makefile
65 | 
66 | # Data and models are either
67 | # 1. reference, and not casually committed
68 | # 2. custom, and live on their own unless they're deliberately contributed
69 | data/*
70 | models/*
71 | *.caffemodel
72 | *.caffemodel.h5
73 | *.solverstate
74 | *.solverstate.h5
75 | *.binaryproto
76 | *leveldb
77 | *lmdb
78 | 
79 | # build, distribute, and bins (+ python proto bindings)
80 | build
81 | .build_debug/*
82 | .build_release/*
83 | distribute/*
84 | *.testbin
85 | *.bin
86 | python/caffe/proto/
87 | cmake_build
88 | .cmake_build
89 | 
90 | # Generated documentation
91 | docs/_site
92 | docs/gathered
93 | _site
94 | doxygen
95 | docs/dev
96 | 
97 | # LevelDB files
98 | *.sst
99 | *.ldb
100 | LOCK
101 | LOG*
102 | CURRENT
103 | MANIFEST-*
104 | 
105 | 
106 | *.tar.gz
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | cmake_minimum_required(VERSION 2.8)
2 | project(pelee)
3 | 
4 | #set(inference_VERSION_MAJOR 2)
5 | #set(inference_VERSION_MINOR 1)
6 | 
7 | #set(CMAKE_CXX_STANDARD 11)
8 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
9 | find_package(OpenMP)
10 | if (OPENMP_FOUND)
11 |     set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
12 |     set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
13 |     set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
14 | endif()
15 | set(BUILD_DEPS "YES" CACHE BOOL "If YES, will install dependencies into sandbox. Automatically reset to NO after dependencies are installed.")
16 | 
17 | set(PROJECT_OUTPUT_DIR ${PROJECT_BINARY_DIR}/build)
18 | set(PROJECT_INCLUDE_DIR ${PROJECT_OUTPUT_DIR}/include)
19 | 
20 | file(MAKE_DIRECTORY ${PROJECT_INCLUDE_DIR})
21 | file(MAKE_DIRECTORY ${PROJECT_OUTPUT_DIR}/bin)
22 | 
23 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_OUTPUT_DIR}/bin)
24 | set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_OUTPUT_DIR}/lib)
25 | set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_OUTPUT_DIR}/lib)
26 | 
27 | message("The runtime libraries are included in ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}")
28 | message("The library files are included in ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}")
29 | 
30 | message("-- system arch: ${CMAKE_SYSTEM_PROCESSOR}")
31 | message("-- output path: ${PROJECT_OUTPUT_DIR}")
32 | 
33 | find_package(CUDA)
34 | find_package(OpenCV REQUIRED)
35 | message(" -- CUDA and OpenCV found ")
36 | message(" -- opencv_version: ${OpenCV_VERSION}")
37 | 
38 | 
39 | set(CUDA_NVCC_FLAGS
40 |     ${CUDA_NVCC_FLAGS};--disable-warnings;
41 |     -O3
42 |     -gencode arch=compute_30,code=sm_30
43 |     -gencode arch=compute_35,code=sm_35
44 |     -gencode arch=compute_50,code=sm_50
45 |     -gencode arch=compute_50,code=compute_50
46 |     -gencode arch=compute_52,code=sm_52
47 |     -gencode arch=compute_61,code=sm_61
48 |     -gencode arch=compute_62,code=sm_62
49 | )
50 | 
51 | file(GLOB cudaSources util/cuda/*.cu)
52 | file(GLOB cudaIncludes util/cuda/*.h)
53 | 
54 | file(GLOB sources *.cu *.cpp util/*.cpp util/cuda/*.cu)
55 | file(GLOB includes util/*.h util/cuda/*.h)
56 | 
57 | include_directories(${PROJECT_INCLUDE_DIR}/util)
58 | include_directories(${PROJECT_BINARY_DIR}/util)
59 | include_directories(${OpenCV_INCLUDE_DIRS})
60 | ##
61 | 
62 | link_directories(${OpenCV_LIBRARY_DIRS})
63 | 
64 | cuda_add_library(inferLib SHARED ${sources})
65 | ##
66 | target_link_libraries(inferLib /usr/lib/aarch64-linux-gnu/libnvcaffe_parser.so)
67 | target_link_libraries(inferLib /usr/lib/aarch64-linux-gnu/libnvinfer.so)
68 | target_link_libraries(inferLib /usr/lib/aarch64-linux-gnu/libnvinfer_plugin.so)
69 | target_link_libraries(inferLib /usr/lib/aarch64-linux-gnu/libnvparsers.so)
70 | 
71 | 
72 | # transfer all headers to the include directory
73 | foreach(include ${includes})
74 |     message("-- Copying ${include}")
75 |     configure_file(${include} ${PROJECT_INCLUDE_DIR} COPYONLY)
76 | endforeach()
77 | 
78 | ## install
79 | foreach(include ${includes})
80 |     install(FILES "${include}" DESTINATION include/inferLib)
81 | endforeach()
82 | 
83 | add_executable(pelee main.cpp)
84 | target_link_libraries(pelee inferLib ${OpenCV_LIBS})
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Pelee-TensorRT
2 | 
3 | **Accelerate Pelee with TensorRT**
4 | Pelee: A Real-Time Object Detection System on Mobile Devices (NeurIPS 2018)
5 | 
6 | **TensorRT-Pelee can run at over 70 FPS (11 ms) on a Jetson TX2 (FP32)**
7 | 
8 | ---
9 | 
10 | **Performance:**
11 | Jetson TX2: 72 FPS, 11~13.2 ms (FP32)
12 | Titan V: 200 FPS, 5 ms (FP32)
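
(For reference, 72 FPS works out to 1000 / 72 ≈ 13.9 ms per frame end to end, so the 11 ms end of the range appears to be the inference call by itself.)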
13 | 
14 | **Requirements:**
15 | 
16 | 1. TensorRT 4 (JetPack 3.3 on TX2)
17 | 2. CUDA 9.0
18 | 3. cuDNN 7
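
If you are not sure which versions are installed, one quick sanity check is to print the version macros the three dependencies expose (a minimal sketch; `NV_TENSORRT_*`, `CUDA_VERSION`, and `CUDNN_*` are the standard macros from `NvInfer.h`, `cuda.h`, and `cudnn.h`):

```cpp
// check_versions.cpp -- print the TensorRT / CUDA / cuDNN versions.
#include <NvInfer.h> // NV_TENSORRT_MAJOR / NV_TENSORRT_MINOR / NV_TENSORRT_PATCH
#include <cuda.h>    // CUDA_VERSION, e.g. 9000 for CUDA 9.0
#include <cudnn.h>   // CUDNN_MAJOR / CUDNN_MINOR / CUDNN_PATCHLEVEL
#include <cstdio>

int main() {
    std::printf("TensorRT %d.%d.%d\n", NV_TENSORRT_MAJOR, NV_TENSORRT_MINOR, NV_TENSORRT_PATCH);
    std::printf("CUDA     %d.%d\n", CUDA_VERSION / 1000, (CUDA_VERSION % 1000) / 10);
    std::printf("cuDNN    %d.%d.%d\n", CUDNN_MAJOR, CUDNN_MINOR, CUDNN_PATCHLEVEL);
    return 0;
}
```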
19 | 20 | --- 21 | 22 | **Run:** 23 | 24 | ```shell 25 | cmake . 26 | make 27 | ./build/bin/pelee 28 | ``` 29 | 30 | --- 31 | 32 | **Reference:** 33 | 34 | https://github.com/Ghustwb/MobileNet-SSD-TensorRT 35 | 36 | --- 37 | 38 | **TODO:** 39 | - [ ] FP16 Implementation 40 | - [ ] Change Custom layers IPlugin to IPluginExt 41 | 42 | 43 | 44 | 45 | **The bug has been fixed** 46 | 47 | ![image](testPic/test.png) 48 | -------------------------------------------------------------------------------- /cmake_install.cmake: -------------------------------------------------------------------------------- 1 | # Install script for directory: /home/nvidia/TRT-Pelee 2 | 3 | # Set the install prefix 4 | if(NOT DEFINED CMAKE_INSTALL_PREFIX) 5 | set(CMAKE_INSTALL_PREFIX "/usr/local") 6 | endif() 7 | string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}") 8 | 9 | # Set the install configuration name. 10 | if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME) 11 | if(BUILD_TYPE) 12 | string(REGEX REPLACE "^[^A-Za-z0-9_]+" "" 13 | CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}") 14 | else() 15 | set(CMAKE_INSTALL_CONFIG_NAME "") 16 | endif() 17 | message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"") 18 | endif() 19 | 20 | # Set the component getting installed. 21 | if(NOT CMAKE_INSTALL_COMPONENT) 22 | if(COMPONENT) 23 | message(STATUS "Install component: \"${COMPONENT}\"") 24 | set(CMAKE_INSTALL_COMPONENT "${COMPONENT}") 25 | else() 26 | set(CMAKE_INSTALL_COMPONENT) 27 | endif() 28 | endif() 29 | 30 | # Install shared libraries without execute permission? 31 | if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE) 32 | set(CMAKE_INSTALL_SO_NO_EXE "1") 33 | endif() 34 | 35 | if(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified") 36 | file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/include/inferLib" TYPE FILE FILES "/home/nvidia/TRT-Pelee/util/loadImage.h") 37 | endif() 38 | 39 | if(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified") 40 | file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/include/inferLib" TYPE FILE FILES "/home/nvidia/TRT-Pelee/util/cuda/cudaOverlay.h") 41 | endif() 42 | 43 | if(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified") 44 | file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/include/inferLib" TYPE FILE FILES "/home/nvidia/TRT-Pelee/util/cuda/cudaResize.h") 45 | endif() 46 | 47 | if(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified") 48 | file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/include/inferLib" TYPE FILE FILES "/home/nvidia/TRT-Pelee/util/cuda/cudaRGB.h") 49 | endif() 50 | 51 | if(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified") 52 | file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/include/inferLib" TYPE FILE FILES "/home/nvidia/TRT-Pelee/util/cuda/cudaYUV.h") 53 | endif() 54 | 55 | if(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified") 56 | file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/include/inferLib" TYPE FILE FILES "/home/nvidia/TRT-Pelee/util/cuda/cudaMappedMemory.h") 57 | endif() 58 | 59 | if(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified") 60 | file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/include/inferLib" TYPE FILE FILES "/home/nvidia/TRT-Pelee/util/cuda/cudaNormalize.h") 61 | endif() 62 | 63 | if(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified") 64 | file(INSTALL DESTINATION 
"${CMAKE_INSTALL_PREFIX}/include/inferLib" TYPE FILE FILES "/home/nvidia/TRT-Pelee/util/cuda/cudaUtility.h") 65 | endif() 66 | 67 | if(CMAKE_INSTALL_COMPONENT) 68 | set(CMAKE_INSTALL_MANIFEST "install_manifest_${CMAKE_INSTALL_COMPONENT}.txt") 69 | else() 70 | set(CMAKE_INSTALL_MANIFEST "install_manifest.txt") 71 | endif() 72 | 73 | string(REPLACE ";" "\n" CMAKE_INSTALL_MANIFEST_CONTENT 74 | "${CMAKE_INSTALL_MANIFEST_FILES}") 75 | file(WRITE "/home/nvidia/TRT-Pelee/${CMAKE_INSTALL_MANIFEST}" 76 | "${CMAKE_INSTALL_MANIFEST_CONTENT}") 77 | -------------------------------------------------------------------------------- /common.cpp: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | std::string locateFile(const std::string& input, const std::vector & directories) 3 | { 4 | std::string file; 5 | const int MAX_DEPTH{10}; 6 | bool found{false}; 7 | for (auto &dir : directories) 8 | { 9 | file = dir + input; 10 | std::cout << file << std::endl; 11 | for (int i = 0; i < MAX_DEPTH && !found; i++) 12 | { 13 | std::ifstream checkFile(file); 14 | found = checkFile.is_open(); 15 | if (found) break; 16 | file = "../" + file; 17 | } 18 | if (found) break; 19 | file.clear(); 20 | } 21 | std::cout << file << std::endl; 22 | assert(!file.empty() && "Could not find a file due to it not existing in the data directory."); 23 | return file; 24 | } 25 | 26 | void readPGMFile(const std::string& fileName, uint8_t *buffer, int inH, int inW) 27 | { 28 | std::ifstream infile(fileName, std::ifstream::binary); 29 | assert(infile.is_open() && "Attempting to read from a file that is not open."); 30 | std::string magic, h, w, max; 31 | infile >> magic >> h >> w >> max; 32 | infile.seekg(1, infile.cur); 33 | infile.read(reinterpret_cast(buffer), inH*inW); 34 | } 35 | 36 | /*********************************/ 37 | /* Updated date: 2018.3.7 38 | /*This is my own implementation of the detectout layer code, because I met a mistake with the detectout api of 39 | /*tensorrt3.0 a few months ago. You can use the detectout api of tensorrt3.0 correctly by adding an extra output 40 | /*in the deploy prototxt file. Please refer to my deploy prototxt. 41 | /********************************/ 42 | // Retrieve all location predictions. 43 | void GetLocPredictions(const float* loc_data, 44 | const int num_preds_per_class, const int num_loc_classes, 45 | std::vector >* loc_preds) { 46 | for (int p = 0; p < num_preds_per_class; ++p) { 47 | int start_idx = p * num_loc_classes * 4; 48 | vector labelbbox; 49 | for (int c = 0; c < num_loc_classes; ++c) { 50 | labelbbox.push_back(loc_data[start_idx + c * 4]); 51 | labelbbox.push_back(loc_data[start_idx + c * 4 + 1]); 52 | labelbbox.push_back(loc_data[start_idx + c * 4 + 2]); 53 | labelbbox.push_back(loc_data[start_idx + c * 4 + 3]); 54 | 55 | loc_preds->push_back(labelbbox); 56 | } 57 | 58 | } 59 | } 60 | 61 | // Retrieve all confidences. 62 | void GetConfidenceScores(const float* conf_data, 63 | const int num_preds_per_class, const int num_classes, 64 | vector >* conf_preds) { 65 | for (int p = 0; p < num_preds_per_class; ++p) { 66 | int start_idx = p * num_classes; 67 | vector conf_classes; 68 | for (int c = 0; c < num_classes; ++c) { 69 | conf_classes.push_back(conf_data[start_idx + c]); 70 | } 71 | conf_preds->push_back(conf_classes); 72 | } 73 | } 74 | 75 | // Retrieve all prior bboxes. 
bboxes and variances 76 | void GetPriorBBoxes(const float* prior_data, const int num_priors, 77 | vector >* prior_bboxes, 78 | vector >* prior_variances) { 79 | for (int i = 0; i < num_priors; ++i) { 80 | int start_idx = i * 4; 81 | vector prior_bbox; 82 | prior_bbox.push_back(prior_data[start_idx]); 83 | prior_bbox.push_back(prior_data[start_idx + 1]); 84 | prior_bbox.push_back(prior_data[start_idx + 2]); 85 | prior_bbox.push_back(prior_data[start_idx + 3]); 86 | prior_bboxes->push_back(prior_bbox); 87 | } 88 | 89 | for (int i = 0; i < num_priors; ++i) { 90 | int start_idx = (num_priors + i) * 4; 91 | vector prior_variance; 92 | vector var; 93 | for (int j = 0; j < 4; ++j) { 94 | prior_variance.push_back(prior_data[start_idx + j]); 95 | } 96 | prior_variances->push_back(prior_variance); 97 | } 98 | } 99 | 100 | /* code_type: 0 = CORNER; 1 = CENTER_SIZE; 2 = CORNER_SIZE 101 | * 102 | */ 103 | void DecodeBBox( 104 | const vector& prior_bbox, const vector& prior_variance, 105 | const int code_type, const bool variance_encoded_in_target, 106 | const bool clip_bbox, const vector& bbox, 107 | vector* decode_bbox) { 108 | if (0 == code_type) { 109 | if (variance_encoded_in_target) { 110 | // variance is encoded in target, we simply need to add the offset 111 | // predictions. 112 | decode_bbox->push_back(prior_bbox[0] + bbox[0]); 113 | decode_bbox->push_back(prior_bbox[1] + bbox[1]); 114 | decode_bbox->push_back(prior_bbox[2] + bbox[2]); 115 | decode_bbox->push_back(prior_bbox[3] + bbox[3]); 116 | } else { 117 | // variance is encoded in bbox, we need to scale the offset accordingly. 118 | decode_bbox->push_back( 119 | prior_bbox[0]+ prior_variance[0] * bbox[0]); 120 | decode_bbox->push_back( 121 | prior_bbox[1] + prior_variance[1] * bbox[1]); 122 | decode_bbox->push_back( 123 | prior_bbox[2] + prior_variance[2] * bbox[2]); 124 | decode_bbox->push_back( 125 | prior_bbox[3] + prior_variance[3] * bbox[3]); 126 | } 127 | } else if (1 == code_type) { 128 | float prior_width = prior_bbox[2] - prior_bbox[0]; 129 | //CHECK_GT(prior_width, 0); 130 | float prior_height = prior_bbox[3] - prior_bbox[1]; 131 | //CHECK_GT(prior_height, 0); 132 | float prior_center_x = (prior_bbox[0] + prior_bbox[2]) / 2.; 133 | float prior_center_y = (prior_bbox[1] + prior_bbox[3]) / 2.; 134 | 135 | float decode_bbox_center_x, decode_bbox_center_y; 136 | float decode_bbox_width, decode_bbox_height; 137 | if (variance_encoded_in_target) { 138 | // variance is encoded in target, we simply need to retore the offset 139 | // predictions. 140 | decode_bbox_center_x = bbox[0] * prior_width + prior_center_x; 141 | decode_bbox_center_y = bbox[1] * prior_height + prior_center_y; 142 | decode_bbox_width = exp(bbox[2]) * prior_width; 143 | decode_bbox_height = exp(bbox[3]) * prior_height; 144 | } else { 145 | // variance is encoded in bbox, we need to scale the offset accordingly. 
146 |       decode_bbox_center_x =
147 |           prior_variance[0] * bbox[0] * prior_width + prior_center_x;
148 |       decode_bbox_center_y =
149 |           prior_variance[1] * bbox[1] * prior_height + prior_center_y;
150 |       decode_bbox_width =
151 |           exp(prior_variance[2] * bbox[2]) * prior_width;
152 |       decode_bbox_height =
153 |           exp(prior_variance[3] * bbox[3]) * prior_height;
154 |     }
155 | 
156 |     decode_bbox->push_back(decode_bbox_center_x - decode_bbox_width / 2.);
157 |     decode_bbox->push_back(decode_bbox_center_y - decode_bbox_height / 2.);
158 |     decode_bbox->push_back(decode_bbox_center_x + decode_bbox_width / 2.);
159 |     decode_bbox->push_back(decode_bbox_center_y + decode_bbox_height / 2.);
160 |   } else if (2 == code_type) {
161 |     float prior_width = prior_bbox[2] - prior_bbox[0];
162 |     //CHECK_GT(prior_width, 0);
163 |     float prior_height = prior_bbox[3] - prior_bbox[1];
164 |     //CHECK_GT(prior_height, 0);
165 |     if (variance_encoded_in_target) {
166 |       // variance is encoded in target, we simply need to add the offset
167 |       // predictions.
168 |       decode_bbox->push_back(prior_bbox[0] + bbox[0] * prior_width);
169 |       decode_bbox->push_back(prior_bbox[1] + bbox[1] * prior_height);
170 |       decode_bbox->push_back(prior_bbox[2] + bbox[2] * prior_width);
171 |       decode_bbox->push_back(prior_bbox[3] + bbox[3] * prior_height);
172 |     } else {
173 |       // variance is encoded in bbox, we need to scale the offset accordingly.
174 |       decode_bbox->push_back(
175 |           prior_bbox[0] + prior_variance[0] * bbox[0] * prior_width);
176 |       decode_bbox->push_back(
177 |           prior_bbox[1] + prior_variance[1] * bbox[1] * prior_height);
178 |       decode_bbox->push_back(
179 |           prior_bbox[2] + prior_variance[2] * bbox[2] * prior_width);
180 |       decode_bbox->push_back(
181 |           prior_bbox[3] + prior_variance[3] * bbox[3] * prior_height);
182 |     }
183 |   } else {
184 |     std::cout << "Unknown LocLossType." << std::endl;
185 |   }
186 | }
187 | 
188 | 
189 | 
190 | 
191 | 
192 | // Decode all loc predictions to bboxes.
193 | void DecodeBBoxes(
194 |     const vector<vector<float> >& prior_bboxes,
195 |     const vector<vector<float> >& prior_variances,
196 |     const int code_type, const bool variance_encoded_in_target,
197 |     const bool clip_bbox, const vector<vector<float> >& bboxes,
198 |     vector<vector<float> >* decode_bboxes) {
199 |   //CHECK_EQ(prior_bboxes.size(), prior_variances.size());
200 |   //CHECK_EQ(prior_bboxes.size(), bboxes.size());
201 |   int num_bboxes = prior_bboxes.size();
202 | 
203 |   for (int i = 0; i < num_bboxes; ++i) {
204 |     vector<float> decode_bbox;
205 |     DecodeBBox(prior_bboxes[i], prior_variances[i], code_type,
206 |         variance_encoded_in_target, clip_bbox, bboxes[i], &decode_bbox);
207 |     decode_bboxes->push_back(decode_bbox);
208 |   }
209 | }
210 | 
211 | // Transpose conf predictions to class-major order, then softmax per prior.
212 | void ConfData(const float* data, const int num_classes, const int num_prior, float* new_data) {
213 |   int idx = 0;
214 |   for (int c = 0; c < num_classes; ++c) {
215 |     for (int p = 0; p < num_prior; ++p) {
216 |       new_data[idx] = data[p*num_classes + c];
217 |       idx++;
218 |     }
219 |   }
220 |   //softmax
221 |   for (int p = 0; p < num_prior; ++p) {
222 |     float sum = 0;
223 |     float _max = new_data[p]; //new_data[0*num_prior + p]
224 |     for (int c = 1; c < num_classes; ++c) {
225 |       _max = std::max(_max, new_data[c*num_prior + p]);
226 |     }
227 |     for (int c = 0; c < num_classes; ++c) {
228 |       sum += exp(new_data[c*num_prior + p]-_max);
229 |     }
230 |     for (int j = 0; j < num_classes; ++j) {
231 |       new_data[j*num_prior + p] = exp(new_data[j*num_prior + p]-_max)/sum;
232 |     }
233 |   }
234 | 
235 | }
236 | 
237 | template <typename Dtype>
238 | void DecodeBBoxes_2(const Dtype* loc_data, const Dtype* prior_data,
239 |     const int code_type, const bool variance_encoded_in_target,
240 |     const int num_priors, const bool share_location,
241 |     const int num_loc_classes, const int background_label_id,
242 |     const bool clip_bbox, Dtype* bbox_data) {
243 | 
244 |   if(code_type == 0){
245 |     for(int p = 0; p < num_priors; p++) {
246 |       if (variance_encoded_in_target) {
247 |         for (int i = 0; i < 4; i++) {
248 |           bbox_data[4 * p + i] = prior_data[4 * p + i] + loc_data[4 * p + i];
249 |         }
250 |       } else {
251 |         for (int i = 0; i < 4; i++) {
252 |           bbox_data[4 * p + i] = prior_data[4 * p + i] + prior_data[4 * num_priors + 4 * p + i] * loc_data[4 * p + i];
253 |         }
254 |       }
255 |     }
256 |   }else if(code_type == 1){
257 |     for(int p = 0; p < num_priors; p++) {
258 |       float prior_width = prior_data[4 * p + 2] - prior_data[4 * p + 0];
259 |       float prior_height = prior_data[4 * p + 3] - prior_data[4 * p + 1];
260 |       float prior_center_x = (prior_data[4 * p + 0] + prior_data[4 * p + 2]) / 2.;
261 |       float prior_center_y = (prior_data[4 * p + 1] + prior_data[4 * p + 3]) / 2.;
262 |       float decode_bbox_center_x, decode_bbox_center_y;
263 |       float decode_bbox_width, decode_bbox_height;
264 |       if (variance_encoded_in_target) {
265 |         decode_bbox_center_x = loc_data[4 * p + 0] * prior_width + prior_center_x;
266 |         decode_bbox_center_y = loc_data[4 * p + 1] * prior_height + prior_center_y;
267 |         decode_bbox_width = exp(loc_data[4 * p + 2]) * prior_width;
268 |         decode_bbox_height = exp(loc_data[4 * p + 3]) * prior_height;
269 |       }else{
270 |         decode_bbox_center_x = prior_data[4 * num_priors + 4 * p + 0] * loc_data[4 * p + 0] * prior_width + prior_center_x;
271 |         decode_bbox_center_y = prior_data[4 * num_priors + 4 * p + 1] * loc_data[4 * p + 1] * prior_height + prior_center_y;
272 |         decode_bbox_width = exp(prior_data[4 * num_priors + 4 * p + 2] * loc_data[4 * p + 2]) * prior_width;
273 |         decode_bbox_height = exp(prior_data[4 * num_priors + 4 * p + 3] * loc_data[4 * p + 3]) * prior_height;
274 |       }
275 |       bbox_data[4 * p + 0] = (decode_bbox_center_x - decode_bbox_width / 2.);
276 |       bbox_data[4 * p + 1] = (decode_bbox_center_y - decode_bbox_height / 2.);
277 |       bbox_data[4 * p + 2] = (decode_bbox_center_x + decode_bbox_width / 2.);
278 |       bbox_data[4 * p + 3] = (decode_bbox_center_y + decode_bbox_height / 2.);
279 |     }
280 | 
281 |   }else if(code_type == 2){
282 |     for(int p = 0; p < num_priors; p++) {
283 |       float prior_width = prior_data[4 * p + 2] - prior_data[4 * p + 0];
284 |       float prior_height = prior_data[4 * p + 3] - prior_data[4 * p + 1];
285 | 
286 |       if (variance_encoded_in_target) {
287 |         bbox_data[4 * p + 0] = prior_data[4 * p + 0] + loc_data[4 * p + 0] * prior_width;
288 |         bbox_data[4 * p + 1] = prior_data[4 * p + 1] + loc_data[4 * p + 1] * prior_height;
289 |         bbox_data[4 * p + 2] = prior_data[4 * p + 2] + loc_data[4 * p + 2] * prior_width;
290 |         bbox_data[4 * p + 3] = prior_data[4 * p + 3] + loc_data[4 * p + 3] * prior_height;
291 |       }else {
292 |         bbox_data[4 * p + 0] = prior_data[4 * p + 0] +
293 |             prior_data[4 * num_priors + 4 * p + 0] * loc_data[4 * p + 0] * prior_width;
294 |         bbox_data[4 * p + 1] = prior_data[4 * p + 1] +
295 |             prior_data[4 * num_priors + 4 * p + 1] * loc_data[4 * p + 1] * prior_height;
296 |         bbox_data[4 * p + 2] = prior_data[4 * p + 2] +
297 |             prior_data[4 * num_priors + 4 * p + 2] * loc_data[4 * p + 2] * prior_width;
298 |         bbox_data[4 * p + 3] = prior_data[4 * p + 3] +
299 |             prior_data[4 * num_priors + 4 * p + 3] * loc_data[4 * p + 3] * prior_height;
300 |       }
301 |     }
302 | 
303 |   }else{
304 |     std::cout << "Unknown LocLossType." << std::endl;
<< std::endl; 305 | } 306 | } 307 | 308 | 309 | template 310 | Dtype BBoxSize(const Dtype* bbox, const bool normalized = true) { 311 | if (bbox[2] < bbox[0] || bbox[3] < bbox[1]) { 312 | // If bbox is invalid (e.g. xmax < xmin or ymax < ymin), return 0. 313 | return Dtype(0.); 314 | } else { 315 | const Dtype width = bbox[2] - bbox[0]; 316 | const Dtype height = bbox[3] - bbox[1]; 317 | if (normalized) { 318 | return width * height; 319 | } else { 320 | // If bbox is not within range [0, 1]. 321 | return (width + 1) * (height + 1); 322 | } 323 | } 324 | } 325 | 326 | template 327 | Dtype JaccardOverlap(const Dtype* bbox1, const Dtype* bbox2) { 328 | if (bbox2[0] > bbox1[2] || bbox2[2] < bbox1[0] || 329 | bbox2[1] > bbox1[3] || bbox2[3] < bbox1[1]) { 330 | return Dtype(0.); 331 | } else { 332 | const Dtype inter_xmin = std::max(bbox1[0], bbox2[0]); 333 | const Dtype inter_ymin = std::max(bbox1[1], bbox2[1]); 334 | const Dtype inter_xmax = std::min(bbox1[2], bbox2[2]); 335 | const Dtype inter_ymax = std::min(bbox1[3], bbox2[3]); 336 | 337 | const Dtype inter_width = inter_xmax - inter_xmin; 338 | const Dtype inter_height = inter_ymax - inter_ymin; 339 | const Dtype inter_size = inter_width * inter_height; 340 | 341 | const Dtype bbox1_size = BBoxSize(bbox1); 342 | const Dtype bbox2_size = BBoxSize(bbox2); 343 | 344 | return inter_size / (bbox1_size + bbox2_size - inter_size); 345 | } 346 | } 347 | 348 | template 349 | bool SortScorePairDescend(const pair& pair1, 350 | const pair& pair2) { 351 | return pair1.first > pair2.first; 352 | } 353 | 354 | template 355 | void GetMaxScoreIndex(const Dtype* scores, const int num, const float threshold, 356 | const int top_k, vector >* score_index_vec) { 357 | // Generate index score pairs. 358 | for (int i = 0; i < num; ++i) { 359 | if (scores[i] > threshold) { 360 | score_index_vec->push_back(std::make_pair(scores[i], i)); 361 | } 362 | } 363 | 364 | // Sort the score pair according to the scores in descending order 365 | std::sort(score_index_vec->begin(), score_index_vec->end(), 366 | SortScorePairDescend); 367 | 368 | // Keep top_k scores if needed. 369 | if (top_k > -1 && top_k < score_index_vec->size()) { 370 | score_index_vec->resize(top_k); 371 | } 372 | } 373 | 374 | template 375 | void ApplyNMSFast(const Dtype* bboxes, const Dtype* scores, const int num, 376 | const float score_threshold, const float nms_threshold, 377 | const float eta, const int top_k, vector* indices) { 378 | // Get top_k scores (with corresponding indices). 379 | vector > score_index_vec; 380 | //float n1 = cv::getTickCount(); 381 | GetMaxScoreIndex(scores, num, score_threshold, top_k, &score_index_vec); 382 | // n1 = (cv::getTickCount()-n1) / cv::getTickFrequency(); 383 | //printf("======n==1 Forward_DetectionOutputLayer time is %f \n", n1); 384 | 385 | // Do nms. 
386 |   float adaptive_threshold = nms_threshold;
387 |   indices->clear();
388 |   //float n2 = cv::getTickCount();
389 |   std::cout << "======n==n" << std::endl;
390 |   while (score_index_vec.size() != 0) {
391 |     const int idx = score_index_vec.front().second;
392 |     bool keep = true;
393 |     for (int k = 0; k < indices->size(); ++k) {
394 |       if (keep) {
395 |         const int kept_idx = (*indices)[k];
396 |         float overlap = JaccardOverlap(bboxes + idx * 4, bboxes + kept_idx * 4);
397 |         keep = overlap <= adaptive_threshold;
398 |       } else {
399 |         break;
400 |       }
401 |     }
402 |     if (keep) {
403 |       indices->push_back(idx);
404 |     }
405 |     score_index_vec.erase(score_index_vec.begin());
406 |     if (keep && eta < 1 && adaptive_threshold > 0.5) {
407 |       adaptive_threshold *= eta;
408 |     }
409 |   }
410 |   //n2 = (cv::getTickCount()-n2) / cv::getTickFrequency();
411 |   //printf("======n==2 Forward_DetectionOutputLayer time is %f \n", n2);
412 | }
413 | 
414 | 
415 | void Forward_DetectionOutputLayer(float* loc_data, float* conf_data, float* prior_data, int num_priors_, int num_classes_, vector<vector<float> >* detections) {
416 |   // Retrieve all location predictions.
417 |   /*vector<vector<float> > all_loc_preds;
418 |   GetLocPredictions(loc_data, num_priors_, num_loc_classes_, &all_loc_preds);
419 |   // Retrieve all confidences.
420 |   vector<vector<float> > all_conf_scores;
421 |   GetConfidenceScores(conf_data, num_priors_, num_classes_,
422 |       &all_conf_scores);
423 |   // Retrieve all prior bboxes.
424 |   vector<vector<float> > prior_bboxes;
425 |   vector<vector<float> > prior_variances;
426 |   GetPriorBBoxes(prior_data, num_priors_, &prior_bboxes, &prior_variances);
427 |   // Decode all loc predictions to bboxes.
428 |   vector<vector<float> > all_decode_bboxes;
429 |   //const bool clip_bbox = false;
430 |   DecodeBBoxes(prior_bboxes, prior_variances, code_type_,
431 |       variance_encoded_in_target_, clip_bbox, all_loc_preds,
432 |       &all_decode_bboxes);*/
433 | 
434 | 
435 |   int num_kept = 0;
436 |   vector<map<int, vector<int> > > all_indices;
437 | 
438 |   map<int, vector<int> > indices;
439 |   int num_det = 0;
440 |   const int conf_idx = num_classes_ * num_priors_;
441 |   const bool share_location_ = true;
442 |   const int num_loc_classes = 1;
443 |   int background_label_id_ = 0;
444 |   float confidence_threshold_ = 0.1;
445 |   float nms_threshold_ = 0.45;
446 |   float eta_ = 1.0; // default 1.0
447 |   int top_k_ = 400;
448 |   int keep_top_k_ = 200;
449 | 
450 |   const int code_type = 1; //center
451 |   const bool variance_encoded_in_target = false; //default
452 |   const bool clip_bbox = false;
453 | 
454 |   float* decode_bboxes = new float[4 * num_priors_];
455 |   float t = cv::getTickCount();
456 |   DecodeBBoxes_2(loc_data, prior_data, code_type, variance_encoded_in_target, num_priors_, share_location_, num_loc_classes, background_label_id_, clip_bbox, decode_bboxes);
457 |   t = (cv::getTickCount()-t) / cv::getTickFrequency();
458 |   printf("======1 Forward_DetectionOutputLayer time is %f \n", t);
459 |   float* new_conf_data = new float[num_priors_ * num_classes_];
460 |   float t1 = cv::getTickCount();
461 |   ConfData(conf_data, num_classes_, num_priors_, new_conf_data);
462 |   t1 = (cv::getTickCount()-t1) / cv::getTickFrequency();
463 |   printf("======2 Forward_DetectionOutputLayer time is %f \n", t1);
464 | 
465 |   float t2 = cv::getTickCount();
466 |   for(int c = 0; c < num_classes_; c++){
467 |     if(c == background_label_id_){
468 |       continue;
469 |     }
470 |     float* cur_conf_data = new_conf_data + c * num_priors_;
471 |     //float* cur_bbox_data = all_decode_bboxes
472 |     float tt = cv::getTickCount();
473 |     ApplyNMSFast(decode_bboxes, cur_conf_data, num_priors_,
474 |         confidence_threshold_, nms_threshold_, eta_, top_k_, &(indices[c]));
475 |     tt = (cv::getTickCount()-tt) / cv::getTickFrequency();
476 |     std::cout << "===nms===" << tt << std::endl;
477 |     num_det += indices[c].size();
478 |   }
479 |   t2 = (cv::getTickCount()-t2) / cv::getTickFrequency();
480 |   printf("======3 Forward_DetectionOutputLayer time is %f \n", t2);
481 | 
482 |   float t3 = cv::getTickCount();
483 | 
484 |   if(keep_top_k_ > -1 && num_det > keep_top_k_){
485 |     vector<pair<float, pair<int, int> > > score_index_pairs;
486 |     for(map<int, vector<int> >::iterator it = indices.begin(); it != indices.end(); ++it){
487 |       int label = it->first;
488 |       const vector<int>& label_indices = it->second;
489 |       for(int j = 0; j < label_indices.size(); ++j){
490 |         int idx = label_indices[j];
491 |         float score = new_conf_data[label * num_priors_ + idx];
492 |         score_index_pairs.push_back(std::make_pair(score, std::make_pair(label, idx)));
493 |       }
494 |     }
495 |     // Keep top k results per image.
496 |     std::sort(score_index_pairs.begin(), score_index_pairs.end(), SortScorePairDescend<pair<int, int> >);
497 |     score_index_pairs.resize(keep_top_k_);
498 |     // Store the new indices.
499 |     map<int, vector<int> > new_indices;
500 |     for(int j = 0; j < score_index_pairs.size(); ++j){
501 |       int label = score_index_pairs[j].second.first;
502 |       int idx = score_index_pairs[j].second.second;
503 |       new_indices[label].push_back(idx);
504 |     }
505 |     all_indices.push_back(new_indices);
506 |     num_kept += keep_top_k_;
507 |   }else{
508 |     all_indices.push_back(indices);
509 |     num_kept += num_det;
510 |   }
511 |   if(num_kept == 0){
512 |     printf("Couldn't find any detections\n");
513 |   }else{
514 |     for(map<int, vector<int> >::iterator it = all_indices[0].begin(); it != all_indices[0].end(); ++it){
515 |       int label = it->first;
516 |       vector<int>& _indices = it->second;
517 |       const float* _cur_conf_data = new_conf_data + label * num_priors_;
518 | 
519 |       for(int j = 0; j < _indices.size(); ++j){
520 |         int idx = _indices[j];
521 |         vector<float> detect;
522 |         for(int k = 0; k < 4; ++k){
523 |           detect.push_back(decode_bboxes[idx * 4 + k]);
524 |         }
525 |         detect.push_back(_cur_conf_data[idx]);
526 |         detect.push_back(label);
527 |         detections->push_back(detect);
528 |       }
529 |     }
530 |   }
531 |   t3 = (cv::getTickCount()-t3) / cv::getTickFrequency();
532 |   printf("======4 Forward_DetectionOutputLayer time is %f \n", t3);
533 | 
534 |   delete[] decode_bboxes;
535 |   delete[] new_conf_data;
536 | }
--------------------------------------------------------------------------------
/common.h:
--------------------------------------------------------------------------------
1 | #ifndef _TRT_COMMON_H_
2 | #define _TRT_COMMON_H_
3 | #include "NvInfer.h"
4 | #include <fstream>
5 | #include <iostream>
6 | #include <vector>
7 | #include <string>
8 | #include <map>
9 | #include <algorithm>
10 | #include <cmath>
11 | #include <cassert>
12 | 
13 | #include <cstring>
14 | #include <cuda_runtime_api.h>
15 | #include <opencv2/opencv.hpp>
16 | 
17 | #define CHECK(status)                                \
18 | {                                                    \
19 |     if (status != 0)                                 \
20 |     {                                                \
21 |         std::cout << "Cuda failure: " << status;     \
22 |         abort();                                     \
23 |     }                                                \
24 | }
25 | using namespace std;
26 | 
27 | 
28 | std::string locateFile(const std::string& input, const std::vector<std::string>& directories);
29 | void readPGMFile(const std::string& fileName, uint8_t *buffer, int inH, int inW);
30 | void Forward_DetectionOutputLayer(float* loc_data, float* conf_data, float* prior_data, int num_priors_, int num_classes_, vector<vector<float> >* detections);
31 | #endif // _TRT_COMMON_H_
--------------------------------------------------------------------------------
/cudaUtility.h:
--------------------------------------------------------------------------------
1 | #ifndef __CUDA_UTILITY_H_
2 | #define __CUDA_UTILITY_H_
3 | 
4 | 
5 | #include <cuda_runtime.h>
6 | #include <cuda.h>
7 | #include <stdint.h>
8 | #include <stdio.h>
9 | 
10 | 
11 | /**
12 |  * Execute a CUDA call and print out any errors
13 |  * @return the original cudaError_t result
14 |  * @ingroup util
15 |  */
16 | #define CUDA(x) cudaCheckError((x), #x, __FILE__, __LINE__)
17 | 
18 | /**
19 |  * Evaluates to true on success
20 |  * @ingroup util
21 |  */
22 | #define CUDA_SUCCESS(x) (CUDA(x) == cudaSuccess)
23 | 
24 | /**
25 |  * Evaluates to true on failure
26 |  * @ingroup
util 27 | */ 28 | #define CUDA_FAILED(x) (CUDA(x) != cudaSuccess) 29 | 30 | /** 31 | * Return from the boolean function if CUDA call fails 32 | * @ingroup util 33 | */ 34 | #define CUDA_VERIFY(x) if(CUDA_FAILED(x)) return false; 35 | 36 | /** 37 | * LOG_CUDA string. 38 | * @ingroup util 39 | */ 40 | #define LOG_CUDA "[cuda] " 41 | 42 | /* 43 | * define this if you want all cuda calls to be printed 44 | */ 45 | //#define CUDA_TRACE 46 | 47 | 48 | 49 | /** 50 | * cudaCheckError 51 | * @ingroup util 52 | */ 53 | inline cudaError_t cudaCheckError(cudaError_t retval, const char* txt, const char* file, int line ) 54 | { 55 | #if !defined(CUDA_TRACE) 56 | if( retval == cudaSuccess) 57 | return cudaSuccess; 58 | #endif 59 | 60 | //int activeDevice = -1; 61 | //cudaGetDevice(&activeDevice); 62 | 63 | //Log("[cuda] device %i - %s\n", activeDevice, txt); 64 | 65 | printf(LOG_CUDA "%s\n", txt); 66 | 67 | 68 | if( retval != cudaSuccess ) 69 | { 70 | printf(LOG_CUDA " %s (error %u) (hex 0x%02X)\n", cudaGetErrorString(retval), retval, retval); 71 | printf(LOG_CUDA " %s:%i\n", file, line); 72 | } 73 | 74 | return retval; 75 | } 76 | 77 | 78 | /** 79 | * iDivUp 80 | * @ingroup util 81 | */ 82 | inline __device__ __host__ int iDivUp( int a, int b ) { return (a % b != 0) ? (a / b + 1) : (a / b); } 83 | 84 | 85 | 86 | #endif 87 | -------------------------------------------------------------------------------- /imageBuffer.h: -------------------------------------------------------------------------------- 1 | #ifndef IMAGEBUFFER_H 2 | #define IMAGEBUFFER_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | 10 | template 11 | class ConsumerProducerQueue 12 | { 13 | 14 | public: 15 | ConsumerProducerQueue(int mxsz,bool dropFrame) : 16 | maxSize(mxsz),dropFrame(dropFrame) 17 | { } 18 | 19 | bool add(T request) 20 | { 21 | std::unique_lock lock(mutex); 22 | if(dropFrame && isFull()) 23 | { 24 | lock.unlock(); 25 | return false; 26 | } 27 | else { 28 | cond.wait(lock, [this]() { return !isFull(); }); 29 | cpq.push(request); 30 | //lock.unlock(); 31 | cond.notify_all(); 32 | return true; 33 | } 34 | } 35 | 36 | void consume(T &request) 37 | { 38 | std::unique_lock lock(mutex); 39 | cond.wait(lock, [this]() 40 | { return !isEmpty(); }); 41 | request = cpq.front(); 42 | cpq.pop(); 43 | //lock.unlock(); 44 | cond.notify_all(); 45 | 46 | } 47 | 48 | bool isFull() const 49 | { 50 | return cpq.size() >= maxSize; 51 | } 52 | 53 | bool isEmpty() const 54 | { 55 | return cpq.size() == 0; 56 | } 57 | 58 | int length() const 59 | { 60 | return cpq.size(); 61 | } 62 | 63 | void clear() 64 | { 65 | std::unique_lock lock(mutex); 66 | while (!isEmpty()) 67 | { 68 | cpq.pop(); 69 | } 70 | lock.unlock(); 71 | cond.notify_all(); 72 | } 73 | 74 | private: 75 | std::condition_variable cond; 76 | std::mutex mutex; 77 | std::queue cpq; 78 | int maxSize; 79 | bool dropFrame; 80 | }; 81 | 82 | 83 | 84 | #endif 85 | -------------------------------------------------------------------------------- /kernel.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * http://github.com/dusty-nv/jetson-inference 3 | */ 4 | 5 | #include "cuda/cudaUtility.h" 6 | #include 7 | 8 | 9 | // gpuPreImageNet 10 | __global__ void gpuPreImageNet( float2 scale, float4* input, int iWidth, float* output, int oWidth, int oHeight ) 11 | { 12 | const int x = blockIdx.x * blockDim.x + threadIdx.x; 13 | const int y = blockIdx.y * blockDim.y + threadIdx.y; 14 | const int n = oWidth * oHeight; 15 | 16 | if( x >= oWidth 
|| y >= oHeight ) 17 | return; 18 | 19 | const int dx = ((float)x * scale.x); 20 | const int dy = ((float)y * scale.y); 21 | 22 | const float4 px = input[ dy * iWidth + dx ]; 23 | const float3 bgr = make_float3(px.z, px.y, px.x); 24 | 25 | output[n * 0 + y * oWidth + x] = bgr.x; 26 | output[n * 1 + y * oWidth + x] = bgr.y; 27 | output[n * 2 + y * oWidth + x] = bgr.z; 28 | } 29 | 30 | // cudaPreImageNet 31 | cudaError_t cudaPreImageNet( float4* input, size_t inputWidth, size_t inputHeight, 32 | float* output, size_t outputWidth, size_t outputHeight ) 33 | { 34 | if( !input || !output ) 35 | return cudaErrorInvalidDevicePointer; 36 | 37 | if( inputWidth == 0 || outputWidth == 0 || inputHeight == 0 || outputHeight == 0 ) 38 | return cudaErrorInvalidValue; 39 | 40 | const float2 scale = make_float2( float(inputWidth) / float(outputWidth), 41 | float(inputHeight) / float(outputHeight) ); 42 | 43 | // launch kernel 44 | const dim3 blockDim(8, 8); 45 | const dim3 gridDim(iDivUp(outputWidth,blockDim.x), iDivUp(outputHeight,blockDim.y)); 46 | 47 | gpuPreImageNet<<>>(scale, input, inputWidth, output, outputWidth, outputHeight); 48 | 49 | return CUDA(cudaGetLastError()); 50 | } 51 | 52 | // gpuPreImageNetMean 53 | __global__ void gpuPreImageNetMean( float2 scale, float3* input, int iWidth, float* output, int oWidth, int oHeight, float3 mean_value ) 54 | { 55 | const int x = blockIdx.x * blockDim.x + threadIdx.x; 56 | const int y = blockIdx.y * blockDim.y + threadIdx.y; 57 | const int n = oWidth * oHeight; 58 | 59 | if( x >= oWidth || y >= oHeight ) 60 | return; 61 | 62 | const int dx = ((float)x * scale.x); 63 | const int dy = ((float)y * scale.y); 64 | 65 | const float3 px = input[ dy * iWidth + dx ]; 66 | const float3 bgr = make_float3(px.z - mean_value.x, px.y - mean_value.y, px.x - mean_value.z); 67 | 68 | output[n * 0 + y * oWidth + x] = bgr.x; 69 | output[n * 1 + y * oWidth + x] = bgr.y; 70 | output[n * 2 + y * oWidth + x] = bgr.z; 71 | } 72 | 73 | // cudaPreImageNetMean 74 | cudaError_t cudaPreImageNetMean( float3* input, size_t inputWidth, size_t inputHeight, 75 | float* output, size_t outputWidth, size_t outputHeight, const float3& mean_value ) 76 | 77 | { 78 | if( !input || !output ){ 79 | std::cout << "error here. "<< std::endl; 80 | return cudaErrorInvalidDevicePointer; 81 | } 82 | 83 | if( inputWidth == 0 || outputWidth == 0 || inputHeight == 0 || outputHeight == 0 ){ 84 | std::cout << "Or here. 
" << std::endl; 85 | return cudaErrorInvalidValue; 86 | } 87 | 88 | 89 | const float2 scale = make_float2( float(inputWidth) / float(outputWidth), 90 | float(inputHeight) / float(outputHeight) ); 91 | 92 | 93 | // launch kernel 94 | 95 | const dim3 blockDim(8, 8); 96 | const dim3 gridDim(iDivUp(outputWidth,blockDim.x), iDivUp(outputHeight,blockDim.y)); 97 | 98 | gpuPreImageNetMean<<>>(scale, input, inputWidth, output, outputWidth, outputHeight, mean_value); 99 | 100 | return CUDA(cudaGetLastError()); 101 | 102 | } 103 | 104 | __global__ void kernel_extract_roi(float* input, float* output, char* mean, 105 | const int input_w, const int output_w, const int output_h, 106 | const int in_plane_r, const int in_plane_g, const int in_plane_b, 107 | const int out_plane_r, const int out_plane_g, const int out_plane_b, 108 | const int bbox_x, const int bbox_y, const int bbox_w, const int bbox_h) 109 | { 110 | uint x = blockIdx.x * blockDim.x + threadIdx.x; 111 | uint y = blockIdx.y * blockDim.y + threadIdx.y; 112 | 113 | if( x < output_w && y < output_h) 114 | { 115 | float r[2] = { float(x) * bbox_w / output_w + bbox_x, 116 | float(y) * bbox_h / output_h + bbox_y }; 117 | 118 | int pos[4][2] = { { int(floor(r[0])), int(floor(r[1])) }, 119 | { int( ceil(r[0])), int(floor(r[1])) }, 120 | { int(floor(r[0])), int(ceil(r[1])) }, 121 | { int( ceil(r[0])), int(ceil(r[1])) } }; 122 | 123 | float u = r[0]-floor(r[0]); 124 | float v = r[1]-floor(r[1]); 125 | 126 | float s[4] = { (1-u)*(1-v), u*(1-v), (1-u)*v, u*v }; 127 | 128 | int map[4] = { pos[0][1]*input_w + pos[0][0], pos[1][1]*input_w + pos[1][0], 129 | pos[2][1]*input_w + pos[2][0], pos[3][1]*input_w + pos[3][0]}; 130 | 131 | int idx = y * output_w + x; 132 | output[idx+out_plane_r] = round( s[0]*input[map[0]+in_plane_r] 133 | + s[1]*input[map[1]+in_plane_r] 134 | + s[2]*input[map[2]+in_plane_r] 135 | + s[3]*input[map[3]+in_plane_r] );// float(mean[idx+out_plane_r])); 136 | output[idx+out_plane_g] = round( s[0]*input[map[0]+in_plane_g] 137 | + s[1]*input[map[1]+in_plane_g] 138 | + s[2]*input[map[2]+in_plane_g] 139 | + s[3]*input[map[3]+in_plane_g] );//float(mean[idx+out_plane_g])); 140 | output[idx+out_plane_b] = round( s[0]*input[map[0]+in_plane_b] 141 | + s[1]*input[map[1]+in_plane_b] 142 | + s[2]*input[map[2]+in_plane_b] 143 | + s[3]*input[map[3]+in_plane_b] );//float(mean[idx+out_plane_b])); 144 | } 145 | } 146 | 147 | void convertROI(float* input, float* output, char* mean, const int* srcSize, const int* dstSize, const int* roi, cudaStream_t stream) 148 | { 149 | int in_plane_r = 0; 150 | int in_plane_g = srcSize[1] * srcSize[2]; 151 | int in_plane_b = srcSize[1] * srcSize[2] * 2; 152 | 153 | int out_plane_r = 0; 154 | int out_plane_g = dstSize[1] * dstSize[2]; 155 | int out_plane_b = dstSize[1] * dstSize[2] * 2; 156 | 157 | int bbox_x = min(max(roi[0], 0), srcSize[2]-1); 158 | int bbox_y = min(max(roi[1], 0), srcSize[1]-1); 159 | int bbox_w = min(max(roi[2]-roi[0], 0), srcSize[2]-bbox_x-1 ); 160 | int bbox_h = min(max(roi[3]-roi[1], 0), srcSize[1]-bbox_y-1 ); 161 | 162 | dim3 dimBlock(32,32); 163 | dim3 dimGrid(dstSize[2]/dimBlock.x+1, dstSize[1]/dimBlock.y+1); 164 | 165 | std::cout << "ROI: " << bbox_x << " " << bbox_y << " " << bbox_w << " " << bbox_h << std::endl; 166 | 167 | kernel_extract_roi <<< dimGrid, dimBlock, 0, stream >>> (input, output, mean, 168 | srcSize[2], dstSize[2], dstSize[1], 169 | in_plane_r, in_plane_g, in_plane_b, 170 | out_plane_r, out_plane_g, out_plane_b, 171 | bbox_x, bbox_y, bbox_w, bbox_h); 172 | } 173 | 174 | 175 | 
__global__ void kernelSoftmax( float* x, int channels, float* y)
176 | {
177 |     extern __shared__ float mem[];
178 |     __shared__ float sum_value;
179 |     if (threadIdx.x == 0)
180 |         sum_value = 0;
181 |     __syncthreads();
182 | 
183 |     float number = *(x + blockDim.x*blockIdx.x + threadIdx.x);
184 |     float number_exp = __expf(number);
185 | 
186 |     /* *
187 |      * @TODO: Could also stage number_exp in mem[] and reduce with a for loop.
188 |      *        Try different methods and find the time taken.
189 |      * */
190 |     atomicAdd(&sum_value, number_exp);
191 |     __syncthreads();
192 | 
193 |     y[blockDim.x*blockIdx.x + threadIdx.x] = number_exp / sum_value;
194 | }
195 | 
196 | // Host wrapper: one block per spatial position, blockDim.x == channels.
197 | void cudaSoftmax(int n, int channels, float* x, float* y)
198 | {
199 |     kernelSoftmax<<< n / channels, channels, channels * sizeof(float) >>>( x, channels, y);
200 |     cudaDeviceSynchronize();
201 | }
--------------------------------------------------------------------------------
/main.cpp:
--------------------------------------------------------------------------------
1 | #include "common.h"
2 | #include "cudaUtility.h"
3 | #include "mathFunctions.h"
4 | #include "pluginImplement.h"
5 | #include "tensorNet.h"
6 | #include "loadImage.h"
7 | #include "imageBuffer.h"
8 | #include <chrono>
9 | #include <thread>
10 | 
11 | #define BOUND(a,min_val,max_val) ( (a < min_val) ? min_val : ((a >= max_val) ? (max_val) : a) )
12 | 
13 | const char* model = "model/pelee/pelee_deploy_iplugin.prototxt";
14 | const char* weight = "model/pelee/pelee_merged.caffemodel";
15 | 
16 | const char* INPUT_BLOB_NAME = "data";
17 | 
18 | const char* OUTPUT_BLOB_NAME = "detection_out";
19 | static const uint32_t BATCH_SIZE = 1;
20 | volatile bool endvideo = false;
21 | bool csi_cam = false;
22 | //image buffer size = 5
23 | //dropFrame = false
24 | ConsumerProducerQueue<cv::Mat> *imageBuffer = new ConsumerProducerQueue<cv::Mat>(5, csi_cam);
25 | 
26 | class Timer {
27 | public:
28 |     void tic() {
29 |         start_ticking_ = true;
30 |         start_ = std::chrono::high_resolution_clock::now();
31 |     }
32 |     void toc() {
33 |         if (!start_ticking_) return;
34 |         end_ = std::chrono::high_resolution_clock::now();
35 |         start_ticking_ = false;
36 |         t = std::chrono::duration<double, std::milli>(end_ - start_).count();
37 |         //std::cout << "Time: " << t << " ms" << std::endl;
38 |     }
39 |     double t;
40 | private:
41 |     bool start_ticking_ = false;
42 |     std::chrono::time_point<std::chrono::high_resolution_clock> start_;
43 |     std::chrono::time_point<std::chrono::high_resolution_clock> end_;
44 | };
45 | 
46 | 
47 | /* *
48 |  * @TODO: unified memory is used here via cudaMallocManaged.
49 |  * */
50 | float* allocateMemory(DimsCHW dims, char* info)
51 | {
52 |     float* ptr;
53 |     size_t size;
54 |     std::cout << "Allocate memory: " << info << std::endl;
55 |     size = BATCH_SIZE * dims.c() * dims.h() * dims.w();
56 |     assert(!cudaMallocManaged(&ptr, size*sizeof(float)));
57 |     return ptr;
58 | }
59 | 
60 | 
61 | void loadImg( cv::Mat &input, int re_width, int re_height, float *data_unifrom, const float3 mean, const float scale )
62 | {
63 |     int i;
64 |     int j;
65 |     int line_offset;
66 |     int offset_g;
67 |     int offset_r;
68 |     cv::Mat dst;
69 | 
70 |     unsigned char *line = NULL;
71 |     float *unifrom_data = data_unifrom;
72 | 
73 |     cv::resize( input, dst, cv::Size( re_width, re_height ), 0, 0, cv::INTER_LINEAR );
74 |     offset_g = re_width * re_height;
75 |     offset_r = re_width * re_height * 2;
76 |     //#pragma omp parallel for
77 |     for( i = 0; i < re_height; ++i )
78 |     {
79 |         line = dst.ptr< unsigned char >( i );
80 |         line_offset = i * re_width;
81 |         for( j = 0; j < re_width; ++j )
82 |         {
83 |             // b
84 |             unifrom_data[ line_offset + j ] = (( float )(line[ j * 3 ] - mean.x) * scale);
85 |             // g
86 |             unifrom_data[ offset_g + line_offset + j ] = (( float )(line[ j * 3 + 1 ] - mean.y)
* scale); 87 | // r 88 | unifrom_data[ offset_r + line_offset + j ] = (( float )(line[ j * 3 + 2 ] - mean.z) * scale); 89 | } 90 | } 91 | } 92 | std::string gstreamer_pipeline (int capture_width, int capture_height, int display_width, int display_height, int framerate, int flip_method) { 93 | return "nvarguscamerasrc ! video/x-raw(memory:NVMM), width=(int)" + std::to_string(capture_width) + ", height=(int)" + 94 | std::to_string(capture_height) + ", format=(string)NV12, framerate=(fraction)" + std::to_string(framerate) + 95 | "/1 ! nvvidconv flip-method=" + std::to_string(flip_method) + " ! video/x-raw, width=(int)" + std::to_string(display_width) + ", height=(int)" + 96 | std::to_string(display_height) + ", format=(string)BGRx ! videoconvert ! video/x-raw, format=(string)BGR ! appsink"; 97 | } 98 | //thread read video 99 | void readPicture() 100 | { 101 | cv::VideoCapture cap; 102 | if(csi_cam) { 103 | int capture_width = 1280 ; 104 | int capture_height = 720 ; 105 | int display_width = 1280 ; 106 | int display_height = 720 ; 107 | int framerate = 30 ; 108 | int flip_method = 0 ; 109 | 110 | std::string pipeline = gstreamer_pipeline(capture_width, 111 | capture_height, 112 | display_width, 113 | display_height, 114 | framerate, 115 | flip_method); 116 | std::cout << "Using pipeline: \n\t" << pipeline << "\n"; 117 | cap = cv::VideoCapture(pipeline, cv::CAP_GSTREAMER); 118 | } 119 | else { 120 | cap = cv::VideoCapture("testVideo/test.avi"); 121 | } 122 | 123 | cv::Mat image; 124 | while(cap.isOpened()) 125 | { 126 | cap >> image; 127 | if(image.empty()) { 128 | endvideo = true; 129 | break; 130 | } 131 | if(!imageBuffer->add(image)) { 132 | image.release(); 133 | } 134 | } 135 | } 136 | 137 | void MatMul(cv::Mat img1, cv::Mat img2,int r,int g,int b , bool show_mode = false) 138 | { 139 | int i, j; 140 | int height = img1.rows; 141 | int width = img1.cols; 142 | //LOG(INFO) << width << "," << height << "," << img2.rows << "," << img2.cols; 143 | //#pragma omp parallel for 144 | 145 | for (i = 0; i < height; i++) { 146 | unsigned char* ptr1 = img1.ptr(i); 147 | const unsigned char* ptr2 = img2.ptr(i); 148 | int img_index1 = 0; 149 | int img_index2 = 0; 150 | for (j = 0; j < width; j++) { 151 | if(ptr2[img_index2]>90) { 152 | if(show_mode) { 153 | ptr1[img_index1] = b; 154 | ptr1[img_index1+1] = g; 155 | ptr1[img_index1+2] = r; 156 | } 157 | else { 158 | ptr1[img_index1] = b/2 + ptr1[img_index1]/2; 159 | ptr1[img_index1+1] = g/2 + ptr1[img_index1]/2; 160 | ptr1[img_index1+2] = r/2 + ptr1[img_index1]/2; 161 | } 162 | 163 | } 164 | //ptr1[img_index1+idx] = (unsigned char) BOUND(ptr1[img_index1] + ptr2[img_index2] * 1.0,0,255); 165 | //ptr1[img_index1+1] = (ptr2[img_index2]); 166 | //ptr1[img_index1+2] = (unsigned char) BOUND(ptr1[img_index1+2] + (255-ptr2[img_index2]) * 0.4,0,255); 167 | //ptr1[img_index1+2] = (unsigned char) BOUND((ptr2[img_index2]) ,0,255); 168 | img_index1+=3; 169 | img_index2++; 170 | } 171 | } 172 | 173 | } 174 | int main(int argc, char *argv[]) 175 | { 176 | std::vector output_vector = {OUTPUT_BLOB_NAME,"sigmoid"}; 177 | TensorNet tensorNet; 178 | tensorNet.LoadNetwork(model,weight,INPUT_BLOB_NAME, output_vector,BATCH_SIZE); 179 | 180 | DimsCHW dimsData = tensorNet.getTensorDims(INPUT_BLOB_NAME); 181 | DimsCHW dimsOut = tensorNet.getTensorDims(OUTPUT_BLOB_NAME); 182 | DimsCHW dimsOut2 = tensorNet.getTensorDims("sigmoid"); 183 | float* data = allocateMemory( dimsData , (char*)"input blob"); 184 | std::cout << "allocate data" << std::endl; 185 | float* output = 
allocateMemory( dimsOut , (char*)"output blob"); 186 | std::cout << "allocate output" << std::endl; 187 | float* output2 = allocateMemory( dimsOut2 , (char*)"output blob 2"); 188 | std::cout << "allocate output2" << std::endl; 189 | int height = 304; 190 | int width = 304; 191 | void* imgCPU; 192 | void* imgCUDA; 193 | const size_t size = width * height * sizeof(float3); 194 | 195 | if( CUDA_FAILED( cudaMalloc( &imgCUDA, size)) ) 196 | { 197 | cout <<"Cuda Memory allocation error occured."< seg_img; 210 | for(int i = 0; i color = {128,255,128,244,35,232}; 214 | while(1){ 215 | if(endvideo && imageBuffer->isEmpty()) { 216 | break; 217 | } 218 | imageBuffer->consume(frame); 219 | 220 | if(!frame.rows) { 221 | break; 222 | } 223 | //srcImg = frame.clone(); 224 | cv::resize(frame, srcImg, cv::Size(304,304)); 225 | 226 | 227 | void* imgData = malloc(size); 228 | //memset(imgData,0,size); 229 | 230 | loadImg(srcImg,height,width,(float*)imgData,make_float3(103.94,116.78,123.68),0.017); 231 | 232 | cudaMemcpyAsync(imgCUDA,imgData,size,cudaMemcpyHostToDevice); 233 | 234 | void* buffers[] = { imgCUDA, output , output2}; 235 | 236 | 237 | 238 | timer.tic(); 239 | tensorNet.imageInference( buffers, output_vector.size() + 1, BATCH_SIZE); 240 | timer.toc(); 241 | double msTime = timer.t; 242 | 243 | msTime_avg+= msTime; 244 | count++; 245 | std::cout< > detections; 247 | 248 | for (int k=0; k<100; k++) 249 | { 250 | if(output[7*k+1] == -1) 251 | break; 252 | float classIndex = output[7*k+1]; 253 | float confidence = output[7*k+2]; 254 | float xmin = output[7*k + 3]; 255 | float ymin = output[7*k + 4]; 256 | float xmax = output[7*k + 5]; 257 | float ymax = output[7*k + 6]; 258 | //std::cout << classIndex << " , " << confidence << " , " << xmin << " , " << ymin<< " , " << xmax<< " , " << ymax << std::endl; 259 | int x1 = static_cast(xmin * frame.cols); 260 | int y1 = static_cast(ymin * frame.rows); 261 | int x2 = static_cast(xmax * frame.cols); 262 | int y2 = static_cast(ymax * frame.rows); 263 | cv::rectangle(frame,cv::Rect2f(cv::Point(x1,y1),cv::Point(x2,y2)),cv::Scalar(255,0,255),1); 264 | 265 | } 266 | int scale = 4; 267 | 268 | int w = width / scale; 269 | int h = height / scale; 270 | 271 | for(int c = 0; c(y); 275 | int img_index2 = 0; 276 | for (int j = 0; j < w; j++) { 277 | int val = output2[img_index1+c*w*h] * 255; 278 | if (val>255) val = 255; 279 | if (val<0) val = 0; 280 | ptr2[img_index2] = (unsigned char)val; 281 | //if(c==1) 282 | // printf("%f\n",result2[img_index1+c*w*h]); 283 | img_index1++; 284 | img_index2++; 285 | } 286 | } 287 | } 288 | cv::Mat seg_img_resized; 289 | for(int i=0;i= 6000 24 | case CUBLAS_STATUS_NOT_SUPPORTED: 25 | return "CUBLAS_STATUS_NOT_SUPPORTED"; 26 | #endif 27 | #if CUDA_VERSION >= 6050 28 | case CUBLAS_STATUS_LICENSE_ERROR: 29 | return "CUBLAS_STATUS_LICENSE_ERROR"; 30 | #endif 31 | } 32 | return "Unknown cublas status"; 33 | } 34 | 35 | const char* curandGetErrorString(curandStatus_t error) { 36 | switch (error) { 37 | case CURAND_STATUS_SUCCESS: 38 | return "CURAND_STATUS_SUCCESS"; 39 | case CURAND_STATUS_VERSION_MISMATCH: 40 | return "CURAND_STATUS_VERSION_MISMATCH"; 41 | case CURAND_STATUS_NOT_INITIALIZED: 42 | return "CURAND_STATUS_NOT_INITIALIZED"; 43 | case CURAND_STATUS_ALLOCATION_FAILED: 44 | return "CURAND_STATUS_ALLOCATION_FAILED"; 45 | case CURAND_STATUS_TYPE_ERROR: 46 | return "CURAND_STATUS_TYPE_ERROR"; 47 | case CURAND_STATUS_OUT_OF_RANGE: 48 | return "CURAND_STATUS_OUT_OF_RANGE"; 49 | case CURAND_STATUS_LENGTH_NOT_MULTIPLE: 50 | return 
"CURAND_STATUS_LENGTH_NOT_MULTIPLE"; 51 | case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: 52 | return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED"; 53 | case CURAND_STATUS_LAUNCH_FAILURE: 54 | return "CURAND_STATUS_LAUNCH_FAILURE"; 55 | case CURAND_STATUS_PREEXISTING_FAILURE: 56 | return "CURAND_STATUS_PREEXISTING_FAILURE"; 57 | case CURAND_STATUS_INITIALIZATION_FAILED: 58 | return "CURAND_STATUS_INITIALIZATION_FAILED"; 59 | case CURAND_STATUS_ARCH_MISMATCH: 60 | return "CURAND_STATUS_ARCH_MISMATCH"; 61 | case CURAND_STATUS_INTERNAL_ERROR: 62 | return "CURAND_STATUS_INTERNAL_ERROR"; 63 | } 64 | return "Unknown curand status"; 65 | } 66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /mathFunctions.cu: -------------------------------------------------------------------------------- 1 | #include "mathFunctions.h" 2 | #include 3 | #include "cudaUtility.h" 4 | 5 | 6 | //concatlayer 7 | template 8 | __global__ void Concat(const int nthreads, const Dtype* in_data, 9 | const bool forward, const int num_concats, const int concat_size, 10 | const int top_concat_axis, const int bottom_concat_axis, 11 | const int offset_concat_axis, Dtype* out_data) { 12 | CUDA_KERNEL_LOOP(index, nthreads) { 13 | const int total_concat_size = concat_size * bottom_concat_axis; 14 | const int concat_num = index / total_concat_size; 15 | const int concat_index = index % total_concat_size; 16 | const int top_index = concat_index + 17 | (concat_num * top_concat_axis + offset_concat_axis) * concat_size; 18 | if (forward) { 19 | out_data[top_index] = in_data[index]; 20 | } else { 21 | out_data[index] = in_data[top_index]; 22 | } 23 | } 24 | } 25 | 26 | cudaError_t ConcatLayer(int nthreads, const float *bottom_data, bool kForward, int num_concats_, int concat_input_size_, 27 | int top_concat_axis, int bottom_concat_axis, int offset_concat_axis, float *top_data, cudaStream_t stream) 28 | { 29 | Concat<<>>(nthreads, bottom_data, 30 | kForward, num_concats_, concat_input_size_, top_concat_axis, bottom_concat_axis, offset_concat_axis, top_data); 31 | return cudaPeekAtLastError(); 32 | } 33 | 34 | -------------------------------------------------------------------------------- /mathFunctions.h: -------------------------------------------------------------------------------- 1 | 2 | 3 | #ifndef __MATH_FUNCTINS_H__ 4 | #define __MATH_FUNCTINS_H__ 5 | 6 | #include 7 | #include // for std::fabs and std::signbit 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include // cuda driver types 15 | #include 16 | 17 | #include 18 | #define PERMUTELAYER_ORDERNUM 4 19 | #define BLOCK 512 20 | // 21 | // CUDA macros 22 | // 23 | 24 | // CUDA: various checks for different function calls. 
25 | #define CUDA_CHECK(condition) \ 26 | /* Code block avoids redefinition of cudaError_t error */ \ 27 | do { \ 28 | cudaError_t error = condition; \ 29 | CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \ 30 | } while (0) 31 | 32 | #define CUBLAS_CHECK(condition) \ 33 | do { \ 34 | cublasStatus_t status = condition; \ 35 | CHECK_EQ(status, CUBLAS_STATUS_SUCCESS) << " " \ 36 | << cublasGetErrorString(status); \ 37 | } while (0) 38 | 39 | #define CURAND_CHECK(condition) \ 40 | do { \ 41 | curandStatus_t status = condition; \ 42 | CHECK_EQ(status, CURAND_STATUS_SUCCESS) << " " \ 43 | << curandGetErrorString(status); \ 44 | } while (0) 45 | 46 | // CUDA: grid stride looping 47 | #define CUDA_KERNEL_LOOP(i, n) \ 48 | for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ 49 | i < (n); \ 50 | i += blockDim.x * gridDim.x) 51 | 52 | // CUDA: check for error after kernel execution and exit loudly if there is one. 53 | #define CUDA_POST_KERNEL_CHECK CUDA_CHECK(cudaPeekAtLastError()) 54 | 55 | 56 | // CUDA: library error reporting. 57 | const char* cublasGetErrorString(cublasStatus_t error); 58 | const char* curandGetErrorString(curandStatus_t error); 59 | 60 | // CUDA: use 512 threads per block 61 | const int TENSORRT_CUDA_NUM_THREADS = 256; 62 | 63 | // CUDA: number of blocks for threads. 64 | inline int TENSORRT_GET_BLOCKS(const int N) { 65 | return (N + TENSORRT_CUDA_NUM_THREADS - 1) / TENSORRT_CUDA_NUM_THREADS; 66 | } 67 | 68 | 69 | /* 70 | * function: X[i] = alpha,initialize X with constant alpha 71 | * 72 | */ 73 | template 74 | void tensorrt_gpu_set(const int N, const Dtype alpha, Dtype *X); 75 | 76 | /* 77 | * function: y[index] = pow(a[index], alpha) 78 | *@params n: the dims of matrix a 79 | *@params a: matrix 80 | *@params y: vector 81 | */ 82 | template 83 | void tensorrt_gpu_powx(const int n, const Dtype* a, const Dtype alpha, Dtype* y); 84 | 85 | 86 | /* 87 | *function:y = alpha*A*x + beta*y; 88 | *@params handle: handle 89 | *@params TransA: transpose flag 90 | *@params M: the rows of A 91 | *@params N: the cols of A 92 | *@params alpha: the coefficient of A*x 93 | *@params A: matrix [M x N] 94 | *@params x: vector x 95 | *@params beta: the coefficient of y 96 | *@params y: vector y 97 | */ 98 | template 99 | void tensorrt_gpu_gemv(cublasHandle_t handle,const CBLAS_TRANSPOSE TransA, const int M, const int N, 100 | const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta, 101 | Dtype* y); 102 | 103 | 104 | 105 | template 106 | void tensorrt_gpu_divbsx(const int nthreads, const Dtype* A, 107 | const Dtype* v, const int rows, const int cols, const CBLAS_TRANSPOSE trans, 108 | Dtype* B); 109 | 110 | template 111 | void tensorrt_gpu_mulbsx(const int nthreads, const Dtype* A, 112 | const Dtype* v, const int rows, const int cols, const CBLAS_TRANSPOSE trans, 113 | Dtype* B); 114 | cudaError_t tensorrt_gpu_permute(const int nthreads,float* const bottom_data,const bool forward, 115 | const int* permute_order,const int* old_steps,const int* new_steps,const int num_axes,float* const top_data,cudaStream_t stream); 116 | 117 | cudaError_t SoftmaxLayer(const float *bottom_data, int count, int channels, int outer_num_, int inner_num_, float *scale_data, float *top_data, cudaStream_t stream); 118 | 119 | cudaError_t ConcatLayer(int nthreads, const float *bottom_data, bool kForward, int num_concats_, int concat_input_size_, int top_concat_axis, int bottom_concat_axis, int offset_concat_axis, float *top_data, cudaStream_t stream); 120 | 121 | //cudaError_t cudaSoftmax(int 
n, int channels, float* x, float*y, cudaStream_t stream); 122 | 123 | //virtual void Forward_gpu(const vector*>& bottom,const vector*>& top); 124 | cudaError_t cudaSoftmax_caffe(int count,int channels,float* x,float* y, cudaStream_t stream); 125 | 126 | cudaError_t cudaDetectionOutput_caffe( int bottom0_count, 127 | int bottom1_count, 128 | float* loc_data, 129 | float* bottom1, 130 | float* prior_data, 131 | float* bottom3, 132 | float* bottom4, 133 | float* y, 134 | cudaStream_t stream); 135 | 136 | #endif 137 | -------------------------------------------------------------------------------- /model/pelee/pelee_merged.caffemodel: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eric612/Pelee-Seg-TensorRT/05bf0b31c5891adaf64f40b784ef4a1927d68862/model/pelee/pelee_merged.caffemodel -------------------------------------------------------------------------------- /pluginImplement.cpp: -------------------------------------------------------------------------------- 1 | #include "pluginImplement.h" 2 | #include "mathFunctions.h" 3 | #include 4 | #include 5 | 6 | 7 | 8 | 9 | 10 | /******************************/ 11 | // PluginFactory // 12 | /******************************/ 13 | nvinfer1::IPlugin* PluginFactory::createPlugin(const char* layerName, const nvinfer1::Weights* weights, int nbWeights) 14 | { 15 | assert(isPlugin(layerName)); 16 | 17 | if (!strcmp(layerName, "ext/pm1_mbox_loc_perm")) 18 | { 19 | std::cout << layerName << std::endl; 20 | assert(mExt_pm1_mbox_loc_perm_layer.get() == nullptr); 21 | mExt_pm1_mbox_loc_perm_layer = std::unique_ptr 22 | (createSSDPermutePlugin({{0, 2, 3, 1}}), nvPluginDeleter); 23 | return mExt_pm1_mbox_loc_perm_layer.get(); 24 | } 25 | else if (!strcmp(layerName, "ext/pm1_mbox_conf_perm")) 26 | { 27 | std::cout << layerName << std::endl; 28 | assert(mExt_pm1_mbox_conf_perm_layer.get() == nullptr); 29 | mExt_pm1_mbox_conf_perm_layer = std::unique_ptr 30 | (createSSDPermutePlugin({{0, 2, 3, 1}}), nvPluginDeleter); 31 | return mExt_pm1_mbox_conf_perm_layer.get(); 32 | } 33 | else if (!strcmp(layerName, "ext/pm2_mbox_loc_perm")) 34 | { 35 | std::cout << layerName << std::endl; 36 | assert(mExt_pm2_mbox_loc_perm_layer.get() == nullptr); 37 | mExt_pm2_mbox_loc_perm_layer = std::unique_ptr 38 | (createSSDPermutePlugin({{0, 2, 3, 1}}), nvPluginDeleter); 39 | return mExt_pm2_mbox_loc_perm_layer.get(); 40 | } 41 | else if (!strcmp(layerName, "ext/pm2_mbox_conf_perm")) 42 | { 43 | std::cout << layerName << std::endl; 44 | assert(mExt_pm2_mbox_conf_perm_layer.get() == nullptr); 45 | mExt_pm2_mbox_conf_perm_layer = std::unique_ptr 46 | (createSSDPermutePlugin({{0, 2, 3, 1}}), nvPluginDeleter); 47 | return mExt_pm2_mbox_conf_perm_layer.get(); 48 | } 49 | else if (!strcmp(layerName, "ext/pm3_mbox_loc_perm")) 50 | { 51 | std::cout << layerName << std::endl; 52 | assert(mExt_pm3_mbox_loc_perm_layer.get() == nullptr); 53 | mExt_pm3_mbox_loc_perm_layer = std::unique_ptr 54 | (createSSDPermutePlugin({{0, 2, 3, 1}}), nvPluginDeleter); 55 | return mExt_pm3_mbox_loc_perm_layer.get(); 56 | } 57 | else if (!strcmp(layerName, "ext/pm3_mbox_conf_perm")) 58 | { 59 | std::cout << layerName << std::endl; 60 | assert(mExt_pm3_mbox_conf_perm_layer.get() == nullptr); 61 | mExt_pm3_mbox_conf_perm_layer = std::unique_ptr 62 | (createSSDPermutePlugin({{0, 2, 3, 1}}), nvPluginDeleter); 63 | return mExt_pm3_mbox_conf_perm_layer.get(); 64 | } 65 | else if (!strcmp(layerName, "ext/pm4_mbox_loc_perm")) 66 | { 67 | std::cout << 
layerName << std::endl; 68 | assert(mExt_pm4_mbox_loc_perm_layer.get() == nullptr); 69 | mExt_pm4_mbox_loc_perm_layer = std::unique_ptr 70 | (createSSDPermutePlugin({{0, 2, 3, 1}}), nvPluginDeleter); 71 | return mExt_pm4_mbox_loc_perm_layer.get(); 72 | } 73 | else if (!strcmp(layerName, "ext/pm4_mbox_conf_perm")) 74 | { 75 | std::cout << layerName << std::endl; 76 | assert(mExt_pm4_mbox_conf_perm_layer.get() == nullptr); 77 | mExt_pm4_mbox_conf_perm_layer = std::unique_ptr 78 | (createSSDPermutePlugin({{0, 2, 3, 1}}), nvPluginDeleter); 79 | return mExt_pm4_mbox_conf_perm_layer.get(); 80 | } 81 | else if (!strcmp(layerName, "ext/pm5_mbox_loc_perm")) 82 | { 83 | std::cout << layerName << std::endl; 84 | assert(mExt_pm5_mbox_loc_perm_layer.get() == nullptr); 85 | mExt_pm5_mbox_loc_perm_layer = std::unique_ptr 86 | (createSSDPermutePlugin({{0, 2, 3, 1}}), nvPluginDeleter); 87 | return mExt_pm5_mbox_loc_perm_layer.get(); 88 | } 89 | else if (!strcmp(layerName, "ext/pm5_mbox_conf_perm")) 90 | { 91 | std::cout << layerName << std::endl; 92 | assert(mExt_pm5_mbox_conf_perm_layer.get() == nullptr); 93 | mExt_pm5_mbox_conf_perm_layer = std::unique_ptr 94 | (createSSDPermutePlugin({{0, 2, 3, 1}}), nvPluginDeleter); 95 | return mExt_pm5_mbox_conf_perm_layer.get(); 96 | } 97 | else if (!strcmp(layerName, "ext/pm6_mbox_loc_perm")) 98 | { 99 | std::cout << layerName << std::endl; 100 | assert(mExt_pm6_mbox_loc_perm_layer.get() == nullptr); 101 | mExt_pm6_mbox_loc_perm_layer = std::unique_ptr 102 | (createSSDPermutePlugin({{0, 2, 3, 1}}), nvPluginDeleter); 103 | return mExt_pm6_mbox_loc_perm_layer.get(); 104 | } 105 | else if (!strcmp(layerName, "ext/pm6_mbox_conf_perm")) 106 | { 107 | std::cout << layerName << std::endl; 108 | assert(mExt_pm6_mbox_conf_perm_layer.get() == nullptr); 109 | mExt_pm6_mbox_conf_perm_layer = std::unique_ptr 110 | (createSSDPermutePlugin({{0, 2, 3, 1}}), nvPluginDeleter); 111 | return mExt_pm6_mbox_conf_perm_layer.get(); 112 | } 113 | else if (!strcmp(layerName, "ext/pm1_mbox_priorbox")) 114 | { 115 | std::cout << layerName << std::endl; 116 | assert(mExt_pm1_mbox_priorbox_layer.get() == nullptr); 117 | PriorBoxParameters params; 118 | float min_size[1] = {30.3999996185}, max_size[1] = {60.7999992371}, aspect_ratio[3] = {1.0, 2.0, 3.0}; //aspect_ratio[2] = {1.0, 2.0}; 119 | params.minSize=min_size; 120 | params.aspectRatios=aspect_ratio; 121 | params.numMinSize = 1; 122 | params.numAspectRatios = 3; 123 | params.maxSize = max_size; 124 | params.numMaxSize = 1; 125 | params.flip = true; 126 | params.clip = false; 127 | params.variance[0] = 0.1; 128 | params.variance[1] = 0.1; 129 | params.variance[2] = 0.2; 130 | params.variance[3] = 0.2; 131 | params.imgH = 0; 132 | params.imgW = 0; 133 | params.stepH = 0; 134 | params.stepW = 0; 135 | params.offset = 0.5; 136 | mExt_pm1_mbox_priorbox_layer = std::unique_ptr 137 | (createSSDPriorBoxPlugin(params), nvPluginDeleter); 138 | return mExt_pm1_mbox_priorbox_layer.get(); 139 | } 140 | else if (!strcmp(layerName, "ext/pm2_mbox_priorbox")) 141 | { 142 | std::cout << layerName << std::endl; 143 | assert(mExt_pm2_mbox_priorbox_layer.get() == nullptr); 144 | float min_size[1] = {60.7999992371}, max_size[1] = {112.480003357}, aspect_ratio[3] = {1.0, 2.0, 3.0}; //aspect_ratio[2] = {1.0, 2.0}; 145 | PriorBoxParameters params; 146 | params.minSize=min_size; 147 | params.aspectRatios=aspect_ratio; 148 | params.numMinSize = 1; 149 | params.numAspectRatios = 3; 150 | params.maxSize = max_size; 151 | params.numMaxSize = 1; 152 | params.flip = true; 
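        // The hard-coded sizes in the six ext/pmN_mbox_priorbox branches follow the usual SSD
        // scale schedule, assuming the 304x304 input resolution this Pelee model appears to use:
        //   scales       s_k = {0.10, 0.20, 0.37, 0.54, 0.71, 0.88, 1.05}
        //   min_size_k       = s_k     * 304   (pm2: 0.20 * 304 = 60.7999992371)
        //   max_size_k       = s_(k+1) * 304   (pm2: 0.37 * 304 = 112.480003357)
        // A hypothetical helper that reproduces these constants (sketch only, not part of the
        // plugin API; the 304 is an assumption inferred from the numbers above):
        //   inline float peleePriorSize(int k)   // k = 0..6
        //   {
        //       static const float s[7] = {0.10f, 0.20f, 0.37f, 0.54f, 0.71f, 0.88f, 1.05f};
        //       return s[k] * 304.0f;
        //   }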
153 | params.clip = false; 154 | params.variance[0] = 0.1; 155 | params.variance[1] = 0.1; 156 | params.variance[2] = 0.2; 157 | params.variance[3] = 0.2; 158 | params.imgH = 0; 159 | params.imgW = 0; 160 | params.stepH = 0; 161 | params.stepW = 0; 162 | params.offset = 0.5; 163 | 164 | 165 | 166 | mExt_pm2_mbox_priorbox_layer = std::unique_ptr 167 | (createSSDPriorBoxPlugin(params), nvPluginDeleter); 168 | return mExt_pm2_mbox_priorbox_layer.get(); 169 | } 170 | 171 | else if (!strcmp(layerName, "ext/pm3_mbox_priorbox")) 172 | { 173 | std::cout << layerName << std::endl; 174 | assert(mExt_pm3_mbox_priorbox_layer.get() == nullptr); 175 | float min_size[1] = {112.480003357}, max_size[1] = {164.160003662}, aspect_ratio[3] = {1.0, 2.0, 3.0}; //aspect_ratio[2] = {1.0, 2.0}; 176 | PriorBoxParameters params; 177 | params.minSize=min_size; 178 | params.aspectRatios=aspect_ratio; 179 | params.numMinSize = 1; 180 | params.numAspectRatios = 3; 181 | params.maxSize = max_size; 182 | params.numMaxSize = 1; 183 | params.flip = true; 184 | params.clip = false; 185 | params.variance[0] = 0.1; 186 | params.variance[1] = 0.1; 187 | params.variance[2] = 0.2; 188 | params.variance[3] = 0.2; 189 | params.imgH = 0; 190 | params.imgW = 0; 191 | params.stepH = 0; 192 | params.stepW = 0; 193 | params.offset = 0.5; 194 | 195 | mExt_pm3_mbox_priorbox_layer = std::unique_ptr 196 | (createSSDPriorBoxPlugin(params), nvPluginDeleter); 197 | return mExt_pm3_mbox_priorbox_layer.get(); 198 | } 199 | 200 | else if (!strcmp(layerName, "ext/pm4_mbox_priorbox")) 201 | { 202 | std::cout << layerName << std::endl; 203 | assert(mExt_pm4_mbox_priorbox_layer.get() == nullptr); 204 | float min_size[1] = {164.160003662}, max_size[1] = {215.839996338}, aspect_ratio[3] = {1.0, 2.0, 3.0}; //aspect_ratio[2] = {1.0, 2.0}; 205 | PriorBoxParameters params; 206 | params.minSize=min_size; 207 | params.aspectRatios=aspect_ratio; 208 | params.numMinSize = 1; 209 | params.numAspectRatios = 3; 210 | params.maxSize = max_size; 211 | params.numMaxSize = 1; 212 | params.flip = true; 213 | params.clip = false; 214 | params.variance[0] = 0.1; 215 | params.variance[1] = 0.1; 216 | params.variance[2] = 0.2; 217 | params.variance[3] = 0.2; 218 | params.imgH = 0; 219 | params.imgW = 0; 220 | params.stepH = 0; 221 | params.stepW = 0; 222 | params.offset = 0.5; 223 | mExt_pm4_mbox_priorbox_layer = std::unique_ptr 224 | (createSSDPriorBoxPlugin(params), nvPluginDeleter); 225 | return mExt_pm4_mbox_priorbox_layer.get(); 226 | } 227 | 228 | else if (!strcmp(layerName, "ext/pm5_mbox_priorbox")) 229 | { 230 | std::cout << layerName << std::endl; 231 | assert(mExt_pm5_mbox_priorbox_layer.get() == nullptr); 232 | float min_size[1]= {215.839996338}, max_size[1]= {267.519989014}, aspect_ratio[3] = {1.0, 2.0, 3.0}; //aspect_ratio[2] = {1.0, 2.0}; 233 | PriorBoxParameters params; 234 | params.minSize=min_size; 235 | params.aspectRatios=aspect_ratio; 236 | params.numMinSize = 1; 237 | params.numAspectRatios = 3; 238 | params.maxSize = max_size; 239 | params.numMaxSize = 1; 240 | params.flip = true; 241 | params.clip = false; 242 | params.variance[0] = 0.1; 243 | params.variance[1] = 0.1; 244 | params.variance[2] = 0.2; 245 | params.variance[3] = 0.2; 246 | params.imgH = 0; 247 | params.imgW = 0; 248 | params.stepH = 0; 249 | params.stepW = 0; 250 | params.offset = 0.5; 251 | mExt_pm5_mbox_priorbox_layer = std::unique_ptr 252 | (createSSDPriorBoxPlugin(params), nvPluginDeleter); 253 | return mExt_pm5_mbox_priorbox_layer.get(); 254 | } 255 | 256 | else if 
(!strcmp(layerName, "ext/pm6_mbox_priorbox")) 257 | { 258 | std::cout << layerName << std::endl; 259 | assert(mExt_pm6_mbox_priorbox_layer.get() == nullptr); 260 | float min_size[1] = {267.519989014}, max_size[1] = {319.200012207}, aspect_ratio[3] = {1.0, 2.0, 3.0}; //aspect_ratio[2] = {1.0, 2.0}; 261 | PriorBoxParameters params; 262 | params.minSize=min_size; 263 | params.aspectRatios=aspect_ratio; 264 | params.numMinSize = 1; 265 | params.numAspectRatios = 3; 266 | params.maxSize = max_size; 267 | params.numMaxSize = 1; 268 | params.flip = true; 269 | params.clip = false; 270 | params.variance[0] = 0.1; 271 | params.variance[1] = 0.1; 272 | params.variance[2] = 0.2; 273 | params.variance[3] = 0.2; 274 | params.imgH = 0; 275 | params.imgW = 0; 276 | params.stepH = 0; 277 | params.stepW = 0; 278 | params.offset = 0.5; 279 | 280 | mExt_pm6_mbox_priorbox_layer = std::unique_ptr 281 | (createSSDPriorBoxPlugin(params), nvPluginDeleter); 282 | return mExt_pm6_mbox_priorbox_layer.get(); 283 | } 284 | 285 | else if (!strcmp(layerName, "stem/concat")) 286 | { 287 | std::cout << layerName << std::endl; 288 | assert(mStem_concat_layer.get() == nullptr); 289 | mStem_concat_layer = std::unique_ptr 290 | (createConcatPlugin(1, true), nvPluginDeleter); 291 | return mStem_concat_layer.get(); 292 | } 293 | 294 | else if (!strcmp(layerName, "stage1_1/concat")) 295 | { 296 | std::cout << layerName << std::endl; 297 | assert(mStage1_1_concat_layer.get() == nullptr); 298 | mStage1_1_concat_layer = std::unique_ptr 299 | (createConcatPlugin(1, true), nvPluginDeleter); 300 | return mStage1_1_concat_layer.get(); 301 | } 302 | else if (!strcmp(layerName, "stage1_2/concat")) 303 | { 304 | std::cout << layerName << std::endl; 305 | assert(mStage1_2_concat_layer.get() == nullptr); 306 | mStage1_2_concat_layer = std::unique_ptr 307 | (createConcatPlugin(1, true), nvPluginDeleter); 308 | return mStage1_2_concat_layer.get(); 309 | } 310 | else if (!strcmp(layerName, "stage1_3/concat")) 311 | { 312 | std::cout << layerName << std::endl; 313 | assert(mStage1_3_concat_layer.get() == nullptr); 314 | mStage1_3_concat_layer = std::unique_ptr 315 | (createConcatPlugin(1, true), nvPluginDeleter); 316 | return mStage1_3_concat_layer.get(); 317 | } 318 | 319 | else if (!strcmp(layerName, "stage2_1/concat")) 320 | { 321 | std::cout << layerName << std::endl; 322 | assert(mStage2_1_concat_layer.get() == nullptr); 323 | mStage2_1_concat_layer = std::unique_ptr 324 | (createConcatPlugin(1, true), nvPluginDeleter); 325 | return mStage2_1_concat_layer.get(); 326 | } 327 | else if (!strcmp(layerName, "stage2_2/concat")) 328 | { 329 | std::cout << layerName << std::endl; 330 | assert(mStage2_2_concat_layer.get() == nullptr); 331 | mStage2_2_concat_layer = std::unique_ptr 332 | (createConcatPlugin(1, true), nvPluginDeleter); 333 | return mStage2_2_concat_layer.get(); 334 | } 335 | else if (!strcmp(layerName, "stage2_3/concat")) 336 | { 337 | std::cout << layerName << std::endl; 338 | assert(mStage2_3_concat_layer.get() == nullptr); 339 | mStage2_3_concat_layer = std::unique_ptr 340 | (createConcatPlugin(1, true), nvPluginDeleter); 341 | return mStage2_3_concat_layer.get(); 342 | } 343 | else if (!strcmp(layerName, "stage2_4/concat")) 344 | { 345 | std::cout << layerName << std::endl; 346 | assert(mStage2_4_concat_layer.get() == nullptr); 347 | mStage2_4_concat_layer = std::unique_ptr 348 | (createConcatPlugin(1, true), nvPluginDeleter); 349 | return mStage2_4_concat_layer.get(); 350 | } 351 | 352 | else if (!strcmp(layerName, 
"stage3_1/concat")) 353 | { 354 | std::cout << layerName << std::endl; 355 | assert(mStage3_1_concat_layer.get() == nullptr); 356 | mStage3_1_concat_layer = std::unique_ptr 357 | (createConcatPlugin(1, true), nvPluginDeleter); 358 | return mStage3_1_concat_layer.get(); 359 | } 360 | else if (!strcmp(layerName, "stage3_2/concat")) 361 | { 362 | std::cout << layerName << std::endl; 363 | assert(mStage3_2_concat_layer.get() == nullptr); 364 | mStage3_2_concat_layer = std::unique_ptr 365 | (createConcatPlugin(1, true), nvPluginDeleter); 366 | return mStage3_2_concat_layer.get(); 367 | } 368 | else if (!strcmp(layerName, "stage3_3/concat")) 369 | { 370 | std::cout << layerName << std::endl; 371 | assert(mStage3_3_concat_layer.get() == nullptr); 372 | mStage3_3_concat_layer = std::unique_ptr 373 | (createConcatPlugin(1, true), nvPluginDeleter); 374 | return mStage3_3_concat_layer.get(); 375 | } 376 | else if (!strcmp(layerName, "stage3_4/concat")) 377 | { 378 | std::cout << layerName << std::endl; 379 | assert(mStage3_4_concat_layer.get() == nullptr); 380 | mStage3_4_concat_layer = std::unique_ptr 381 | (createConcatPlugin(1, true), nvPluginDeleter); 382 | return mStage3_4_concat_layer.get(); 383 | } 384 | else if (!strcmp(layerName, "stage3_5/concat")) 385 | { 386 | std::cout << layerName << std::endl; 387 | assert(mStage3_5_concat_layer.get() == nullptr); 388 | mStage3_5_concat_layer = std::unique_ptr 389 | (createConcatPlugin(1, true), nvPluginDeleter); 390 | return mStage3_5_concat_layer.get(); 391 | } 392 | else if (!strcmp(layerName, "stage3_6/concat")) 393 | { 394 | std::cout << layerName << std::endl; 395 | assert(mStage3_6_concat_layer.get() == nullptr); 396 | mStage3_6_concat_layer = std::unique_ptr 397 | (createConcatPlugin(1, true), nvPluginDeleter); 398 | return mStage3_6_concat_layer.get(); 399 | } 400 | else if (!strcmp(layerName, "stage3_7/concat")) 401 | { 402 | std::cout << layerName << std::endl; 403 | assert(mStage3_7_concat_layer.get() == nullptr); 404 | mStage3_7_concat_layer = std::unique_ptr 405 | (createConcatPlugin(1, true), nvPluginDeleter); 406 | return mStage3_7_concat_layer.get(); 407 | } 408 | else if (!strcmp(layerName, "stage3_8/concat")) 409 | { 410 | std::cout << layerName << std::endl; 411 | assert(mStage3_8_concat_layer.get() == nullptr); 412 | mStage3_8_concat_layer = std::unique_ptr 413 | (createConcatPlugin(1, true), nvPluginDeleter); 414 | return mStage3_8_concat_layer.get(); 415 | } 416 | 417 | else if (!strcmp(layerName, "stage4_1/concat")) 418 | { 419 | std::cout << layerName << std::endl; 420 | assert(mStage4_1_concat_layer.get() == nullptr); 421 | mStage4_1_concat_layer = std::unique_ptr 422 | (createConcatPlugin(1, true), nvPluginDeleter); 423 | return mStage4_1_concat_layer.get(); 424 | } 425 | else if (!strcmp(layerName, "stage4_2/concat")) 426 | { 427 | std::cout << layerName << std::endl; 428 | assert(mStage4_2_concat_layer.get() == nullptr); 429 | mStage4_2_concat_layer = std::unique_ptr 430 | (createConcatPlugin(1, true), nvPluginDeleter); 431 | return mStage4_2_concat_layer.get(); 432 | } 433 | else if (!strcmp(layerName, "stage4_3/concat")) 434 | { 435 | std::cout << layerName << std::endl; 436 | assert(mStage4_3_concat_layer.get() == nullptr); 437 | mStage4_3_concat_layer = std::unique_ptr 438 | (createConcatPlugin(1, true), nvPluginDeleter); 439 | return mStage4_3_concat_layer.get(); 440 | } 441 | else if (!strcmp(layerName, "stage4_4/concat")) 442 | { 443 | std::cout << layerName << std::endl; 444 | assert(mStage4_4_concat_layer.get() == 
nullptr); 445 | mStage4_4_concat_layer = std::unique_ptr 446 | (createConcatPlugin(1, true), nvPluginDeleter); 447 | return mStage4_4_concat_layer.get(); 448 | } 449 | else if (!strcmp(layerName, "stage4_5/concat")) 450 | { 451 | std::cout << layerName << std::endl; 452 | assert(mStage4_5_concat_layer.get() == nullptr); 453 | mStage4_5_concat_layer = std::unique_ptr 454 | (createConcatPlugin(1, true), nvPluginDeleter); 455 | return mStage4_5_concat_layer.get(); 456 | } 457 | else if (!strcmp(layerName, "stage4_6/concat")) 458 | { 459 | std::cout << layerName << std::endl; 460 | assert(mStage4_6_concat_layer.get() == nullptr); 461 | mStage4_6_concat_layer = std::unique_ptr 462 | (createConcatPlugin(1, true), nvPluginDeleter); 463 | return mStage4_6_concat_layer.get(); 464 | } 465 | else if (!strcmp(layerName, "mbox_priorbox")) 466 | { 467 | std::cout << layerName << std::endl; 468 | assert(mBox_priorbox_layer.get() == nullptr); 469 | mBox_priorbox_layer = std::unique_ptr 470 | (createConcatPlugin(2, true), nvPluginDeleter); 471 | return mBox_priorbox_layer.get(); 472 | } 473 | 474 | 475 | else if (!strcmp(layerName, "mbox_loc")) 476 | { 477 | std::cout << layerName << std::endl; 478 | assert(mBox_loc_layer.get() == nullptr); 479 | mBox_loc_layer = std::unique_ptr 480 | (createConcatPlugin(1, true), nvPluginDeleter); 481 | return mBox_loc_layer.get(); 482 | } 483 | else if (!strcmp(layerName, "mbox_conf")) 484 | { 485 | std::cout << layerName << std::endl; 486 | assert(mBox_conf_layer.get() == nullptr); 487 | mBox_conf_layer = std::unique_ptr 488 | (createConcatPlugin(1, true), nvPluginDeleter); 489 | return mBox_conf_layer.get(); 490 | } 491 | 492 | //flatten 493 | else if (!strcmp(layerName, "ext/pm1_mbox_loc_flat")) 494 | { 495 | std::cout << layerName << std::endl; 496 | assert(mExt_pm1_mbox_loc_flat_layer.get() == nullptr); 497 | mExt_pm1_mbox_loc_flat_layer = std::unique_ptr(new FlattenLayer()); 498 | return mExt_pm1_mbox_loc_flat_layer.get(); 499 | } 500 | else if (!strcmp(layerName, "ext/pm1_mbox_conf_flat")) 501 | { 502 | std::cout << layerName << std::endl; 503 | assert(mExt_pm1_mbox_conf_flat_layer.get() == nullptr); 504 | mExt_pm1_mbox_conf_flat_layer = std::unique_ptr(new FlattenLayer()); 505 | return mExt_pm1_mbox_conf_flat_layer.get(); 506 | } 507 | else if (!strcmp(layerName, "ext/pm2_mbox_loc_flat")) 508 | { 509 | std::cout << layerName << std::endl; 510 | assert(mExt_pm2_mbox_loc_flat_layer.get() == nullptr); 511 | mExt_pm2_mbox_loc_flat_layer = std::unique_ptr(new FlattenLayer()); 512 | return mExt_pm2_mbox_loc_flat_layer.get(); 513 | } 514 | else if (!strcmp(layerName, "ext/pm2_mbox_conf_flat")) 515 | { 516 | std::cout << layerName << std::endl; 517 | assert(mExt_pm2_mbox_conf_flat_layer.get() == nullptr); 518 | mExt_pm2_mbox_conf_flat_layer = std::unique_ptr(new FlattenLayer()); 519 | return mExt_pm2_mbox_conf_flat_layer.get(); 520 | } 521 | else if (!strcmp(layerName, "ext/pm3_mbox_loc_flat")) 522 | { 523 | std::cout << layerName << std::endl; 524 | assert(mExt_pm3_mbox_loc_flat_layer.get() == nullptr); 525 | mExt_pm3_mbox_loc_flat_layer = std::unique_ptr(new FlattenLayer()); 526 | return mExt_pm3_mbox_loc_flat_layer.get(); 527 | } 528 | else if (!strcmp(layerName, "ext/pm3_mbox_conf_flat")) 529 | { 530 | std::cout << layerName << std::endl; 531 | assert(mExt_pm3_mbox_conf_flat_layer.get() == nullptr); 532 | mExt_pm3_mbox_conf_flat_layer = std::unique_ptr(new FlattenLayer()); 533 | return mExt_pm3_mbox_conf_flat_layer.get(); 534 | } 535 | else if (!strcmp(layerName, 
"ext/pm4_mbox_loc_flat")) 536 | { 537 | std::cout << layerName << std::endl; 538 | assert(mExt_pm4_mbox_loc_flat_layer.get() == nullptr); 539 | mExt_pm4_mbox_loc_flat_layer = std::unique_ptr(new FlattenLayer()); 540 | return mExt_pm4_mbox_loc_flat_layer.get(); 541 | } 542 | else if (!strcmp(layerName, "ext/pm4_mbox_conf_flat")) 543 | { 544 | std::cout << layerName << std::endl; 545 | assert(mExt_pm4_mbox_conf_flat_layer.get() == nullptr); 546 | mExt_pm4_mbox_conf_flat_layer = std::unique_ptr(new FlattenLayer()); 547 | return mExt_pm4_mbox_conf_flat_layer.get(); 548 | } 549 | else if (!strcmp(layerName, "ext/pm5_mbox_loc_flat")) 550 | { 551 | std::cout << layerName << std::endl; 552 | assert(mExt_pm5_mbox_loc_flat_layer.get() == nullptr); 553 | mExt_pm5_mbox_loc_flat_layer = std::unique_ptr(new FlattenLayer()); 554 | return mExt_pm5_mbox_loc_flat_layer.get(); 555 | } 556 | else if (!strcmp(layerName, "ext/pm5_mbox_conf_flat")) 557 | { 558 | std::cout << layerName << std::endl; 559 | assert(mExt_pm5_mbox_conf_flat_layer.get() == nullptr); 560 | mExt_pm5_mbox_conf_flat_layer = std::unique_ptr(new FlattenLayer()); 561 | return mExt_pm5_mbox_conf_flat_layer.get(); 562 | } 563 | else if (!strcmp(layerName, "ext/pm6_mbox_loc_flat")) 564 | { 565 | std::cout << layerName << std::endl; 566 | assert(mExt_pm6_mbox_loc_flat_layer.get() == nullptr); 567 | mExt_pm6_mbox_loc_flat_layer = std::unique_ptr(new FlattenLayer()); 568 | return mExt_pm6_mbox_loc_flat_layer.get(); 569 | } 570 | else if (!strcmp(layerName, "ext/pm6_mbox_conf_flat")) 571 | { 572 | std::cout << layerName << std::endl; 573 | assert(mExt_pm6_mbox_conf_flat_layer.get() == nullptr); 574 | mExt_pm6_mbox_conf_flat_layer = std::unique_ptr(new FlattenLayer()); 575 | return mExt_pm6_mbox_conf_flat_layer.get(); 576 | } 577 | 578 | else if (!strcmp(layerName, "mbox_conf_flatten")) 579 | { 580 | std::cout << layerName << std::endl; 581 | assert(mMbox_conf_flat_layer.get() == nullptr); 582 | mMbox_conf_flat_layer = std::unique_ptr(new FlattenLayer()); 583 | return mMbox_conf_flat_layer.get(); 584 | } 585 | 586 | 587 | else if (!strcmp(layerName, "mbox_conf_reshape")) 588 | { 589 | std::cout << layerName << std::endl; 590 | assert(mMbox_conf_reshape.get() == nullptr); 591 | assert(nbWeights == 0 && weights == nullptr); 592 | mMbox_conf_reshape = std::unique_ptr>(new Reshape<11>()); 593 | return mMbox_conf_reshape.get(); 594 | } 595 | //softmax layer 596 | else if (!strcmp(layerName, "mbox_conf_softmax")) 597 | { 598 | std::cout << layerName << std::endl; 599 | assert( mPluginSoftmax == nullptr); 600 | assert( nbWeights == 0 && weights == nullptr); 601 | mPluginSoftmax = std::unique_ptr(new SoftmaxPlugin()); 602 | return mPluginSoftmax.get(); 603 | } 604 | else if (!strcmp(layerName, "detection_out")) 605 | { 606 | std::cout << layerName << std::endl; 607 | assert(mDetection_out.get() == nullptr); 608 | //tensor rt 3.0 609 | //mDetection_out = std::unique_ptr(createSSDDetectionOutputPlugin({true, false, 0, 21, 400, 200, 0.5, 0.45, CodeType_t::CENTER_SIZE}), nvPluginDeleter); 610 | //tensor rt 5 611 | 612 | 613 | 614 | DetectionOutputParameters params; 615 | params.backgroundLabelId = 0; 616 | params.codeType = CodeTypeSSD::CENTER_SIZE; 617 | params.keepTopK = 200; 618 | params.shareLocation = true; 619 | params.varianceEncodedInTarget = false; 620 | params.topK = 400; 621 | params.nmsThreshold = 0.4499; 622 | params.numClasses = 11; 623 | params.inputOrder[0] = 0; 624 | params.inputOrder[1] = 1; 625 | params.inputOrder[2] = 2; 626 | 
params.confidenceThreshold = 0.3; 627 | params.confSigmoid = false; 628 | params.isNormalized = true; 629 | 630 | 631 | 632 | mDetection_out = std::unique_ptr 633 | (createSSDDetectionOutputPlugin(params), nvPluginDeleter); 634 | return mDetection_out.get(); 635 | } 636 | else 637 | { 638 | std::cout << layerName << std::endl; 639 | assert(0); 640 | return nullptr; 641 | } 642 | } 643 | 644 | IPlugin* PluginFactory::createPlugin(const char* layerName, const void* serialData, size_t serialLength) 645 | { 646 | assert(isPlugin(layerName)); 647 | if (!strcmp(layerName, "ext/pm1_mbox_loc_perm")) 648 | { 649 | std::cout << layerName << std::endl; 650 | assert(mExt_pm1_mbox_loc_perm_layer.get() == nullptr); 651 | mExt_pm1_mbox_loc_perm_layer = std::unique_ptr 652 | (createSSDPermutePlugin(serialData, serialLength), nvPluginDeleter); 653 | return mExt_pm1_mbox_loc_perm_layer.get(); 654 | } 655 | else if (!strcmp(layerName, "ext/pm1_mbox_conf_perm")) 656 | { 657 | assert(mExt_pm1_mbox_conf_perm_layer.get() == nullptr); 658 | mExt_pm1_mbox_conf_perm_layer = std::unique_ptr 659 | (createSSDPermutePlugin(serialData, serialLength), nvPluginDeleter); 660 | return mExt_pm1_mbox_conf_perm_layer.get(); 661 | } 662 | else if (!strcmp(layerName, "ext/pm2_mbox_loc_perm")) 663 | { 664 | assert(mExt_pm2_mbox_loc_perm_layer.get() == nullptr); 665 | mExt_pm2_mbox_loc_perm_layer = std::unique_ptr 666 | (createSSDPermutePlugin(serialData, serialLength), nvPluginDeleter); 667 | return mExt_pm2_mbox_loc_perm_layer.get(); 668 | } 669 | else if (!strcmp(layerName, "ext/pm2_mbox_conf_perm")) 670 | { 671 | assert(mExt_pm2_mbox_conf_perm_layer.get() == nullptr); 672 | mExt_pm2_mbox_conf_perm_layer = std::unique_ptr 673 | (createSSDPermutePlugin(serialData, serialLength), nvPluginDeleter); 674 | return mExt_pm2_mbox_conf_perm_layer.get(); 675 | } 676 | else if (!strcmp(layerName, "ext/pm3_mbox_loc_perm")) 677 | { 678 | assert(mExt_pm3_mbox_loc_perm_layer.get() == nullptr); 679 | mExt_pm3_mbox_loc_perm_layer = std::unique_ptr 680 | (createSSDPermutePlugin(serialData, serialLength), nvPluginDeleter); 681 | return mExt_pm3_mbox_loc_perm_layer.get(); 682 | } 683 | else if (!strcmp(layerName, "ext/pm3_mbox_conf_perm")) 684 | { 685 | assert(mExt_pm3_mbox_conf_perm_layer.get() == nullptr); 686 | mExt_pm3_mbox_conf_perm_layer = std::unique_ptr 687 | (createSSDPermutePlugin(serialData, serialLength), nvPluginDeleter); 688 | return mExt_pm3_mbox_conf_perm_layer.get(); 689 | } 690 | else if (!strcmp(layerName, "ext/pm4_mbox_loc_perm")) 691 | { 692 | assert(mExt_pm4_mbox_loc_perm_layer.get() == nullptr); 693 | mExt_pm4_mbox_loc_perm_layer = std::unique_ptr 694 | (createSSDPermutePlugin(serialData, serialLength), nvPluginDeleter); 695 | return mExt_pm4_mbox_loc_perm_layer.get(); 696 | } 697 | else if (!strcmp(layerName, "ext/pm4_mbox_conf_perm")) 698 | { 699 | assert(mExt_pm4_mbox_conf_perm_layer.get() == nullptr); 700 | mExt_pm4_mbox_conf_perm_layer = std::unique_ptr 701 | (createSSDPermutePlugin(serialData, serialLength), nvPluginDeleter); 702 | return mExt_pm4_mbox_conf_perm_layer.get(); 703 | } 704 | else if (!strcmp(layerName, "ext/pm5_mbox_loc_perm")) 705 | { 706 | assert(mExt_pm5_mbox_loc_perm_layer.get() == nullptr); 707 | mExt_pm5_mbox_loc_perm_layer = std::unique_ptr 708 | (createSSDPermutePlugin(serialData, serialLength), nvPluginDeleter); 709 | return mExt_pm5_mbox_loc_perm_layer.get(); 710 | } 711 | else if (!strcmp(layerName, "ext/pm5_mbox_conf_perm")) 712 | { 713 | assert(mExt_pm5_mbox_conf_perm_layer.get() == nullptr); 
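    // In this overload the plugin state comes out of the serialized engine rather than the
    // prototxt: at build time each plugin wrote its parameters through IPlugin::serialize(),
    // and here the matching create*Plugin(serialData, serialLength) factory rebuilds an
    // identical instance. Rough shape of that round trip (sketch):
    //   size_t len = plugin->getSerializationSize();
    //   std::vector<char> buf(len);
    //   plugin->serialize(buf.data());                         // during engine serialization
    //   IPlugin* p = createSSDPermutePlugin(buf.data(), len);  // during engine deserialization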
714 | mExt_pm5_mbox_conf_perm_layer = std::unique_ptr 715 | (createSSDPermutePlugin(serialData, serialLength), nvPluginDeleter); 716 | return mExt_pm5_mbox_conf_perm_layer.get(); 717 | } 718 | else if (!strcmp(layerName, "ext/pm6_mbox_loc_perm")) 719 | { 720 | assert(mExt_pm6_mbox_loc_perm_layer.get() == nullptr); 721 | mExt_pm6_mbox_loc_perm_layer = std::unique_ptr 722 | (createSSDPermutePlugin(serialData, serialLength), nvPluginDeleter); 723 | return mExt_pm6_mbox_loc_perm_layer.get(); 724 | } 725 | else if (!strcmp(layerName, "ext/pm6_mbox_conf_perm")) 726 | { 727 | assert(mExt_pm6_mbox_conf_perm_layer.get() == nullptr); 728 | mExt_pm6_mbox_conf_perm_layer = std::unique_ptr 729 | (createSSDPermutePlugin(serialData, serialLength), nvPluginDeleter); 730 | return mExt_pm6_mbox_conf_perm_layer.get(); 731 | } 732 | else if (!strcmp(layerName, "ext/pm1_mbox_priorbox")) 733 | { 734 | assert(mExt_pm1_mbox_priorbox_layer.get() == nullptr); 735 | float min_size = 30.3999996185, max_size = 60.7999992371, aspect_ratio[3] = {1.0, 2.0, 3.0}; //aspect_ratio[2] = {1.0, 2.0}; 736 | mExt_pm1_mbox_priorbox_layer = std::unique_ptr 737 | (createSSDPriorBoxPlugin(serialData, serialLength), nvPluginDeleter); 738 | return mExt_pm1_mbox_priorbox_layer.get(); 739 | } 740 | else if (!strcmp(layerName, "ext/pm2_mbox_priorbox")) 741 | { 742 | assert(mExt_pm2_mbox_priorbox_layer.get() == nullptr); 743 | float min_size = 60.7999992371, max_size = 112.480003357, aspect_ratio[3] = {1.0, 2.0, 3.0}; //aspect_ratio[2] = {1.0, 2.0}; 744 | mExt_pm2_mbox_priorbox_layer = std::unique_ptr 745 | (createSSDPriorBoxPlugin(serialData, serialLength), nvPluginDeleter); 746 | return mExt_pm2_mbox_priorbox_layer.get(); 747 | } 748 | 749 | else if (!strcmp(layerName, "ext/pm3_mbox_priorbox")) 750 | { 751 | assert(mExt_pm3_mbox_priorbox_layer.get() == nullptr); 752 | float min_size = 112.480003357, max_size = 164.160003662, aspect_ratio[3] = {1.0, 2.0, 3.0}; //aspect_ratio[2] = {1.0, 2.0}; 753 | mExt_pm3_mbox_priorbox_layer = std::unique_ptr 754 | (createSSDPriorBoxPlugin(serialData, serialLength), nvPluginDeleter); 755 | return mExt_pm3_mbox_priorbox_layer.get(); 756 | } 757 | 758 | else if (!strcmp(layerName, "ext/pm4_mbox_priorbox")) 759 | { 760 | assert(mExt_pm4_mbox_priorbox_layer.get() == nullptr); 761 | float min_size = 164.160003662, max_size = 215.839996338, aspect_ratio[3] = {1.0, 2.0, 3.0}; //aspect_ratio[2] = {1.0, 2.0}; 762 | mExt_pm4_mbox_priorbox_layer = std::unique_ptr 763 | (createSSDPriorBoxPlugin(serialData, serialLength), nvPluginDeleter); 764 | return mExt_pm4_mbox_priorbox_layer.get(); 765 | } 766 | 767 | else if (!strcmp(layerName, "ext/pm5_mbox_priorbox")) 768 | { 769 | assert(mExt_pm5_mbox_priorbox_layer.get() == nullptr); 770 | float min_size = 215.839996338, max_size = 267.519989014, aspect_ratio[3] = {1.0, 2.0, 3.0}; //aspect_ratio[2] = {1.0, 2.0}; 771 | mExt_pm5_mbox_priorbox_layer = std::unique_ptr 772 | (createSSDPriorBoxPlugin(serialData, serialLength), nvPluginDeleter); 773 | return mExt_pm5_mbox_priorbox_layer.get(); 774 | } 775 | 776 | else if (!strcmp(layerName, "ext/pm6_mbox_priorbox")) 777 | { 778 | assert(mExt_pm6_mbox_priorbox_layer.get() == nullptr); 779 | float min_size = 267.519989014, max_size = 319.200012207, aspect_ratio[3] = {1.0, 2.0, 3.0}; //aspect_ratio[2] = {1.0, 2.0}; 780 | mExt_pm6_mbox_priorbox_layer = std::unique_ptr 781 | (createSSDPriorBoxPlugin(serialData, serialLength), nvPluginDeleter); 782 | return mExt_pm6_mbox_priorbox_layer.get(); 783 | } 784 | 785 | else if 
(!strcmp(layerName, "stem/concat")) 786 | { 787 | assert(mStem_concat_layer.get() == nullptr); 788 | mStem_concat_layer = std::unique_ptr 789 | (createConcatPlugin(serialData, serialLength), nvPluginDeleter); 790 | return mStem_concat_layer.get(); 791 | } 792 | 793 | else if (!strcmp(layerName, "stage1_1/concat")) 794 | { 795 | assert(mStage1_1_concat_layer.get() == nullptr); 796 | mStage1_1_concat_layer = std::unique_ptr 797 | (createConcatPlugin(serialData, serialLength), nvPluginDeleter); 798 | return mStage1_1_concat_layer.get(); 799 | } 800 | else if (!strcmp(layerName, "stage1_2/concat")) 801 | { 802 | assert(mStage1_2_concat_layer.get() == nullptr); 803 | mStage1_2_concat_layer = std::unique_ptr 804 | (createConcatPlugin(serialData, serialLength), nvPluginDeleter); 805 | return mStage1_2_concat_layer.get(); 806 | } 807 | else if (!strcmp(layerName, "stage1_3/concat")) 808 | { 809 | assert(mStage1_3_concat_layer.get() == nullptr); 810 | mStage1_3_concat_layer = std::unique_ptr 811 | (createConcatPlugin(serialData, serialLength), nvPluginDeleter); 812 | return mStage1_3_concat_layer.get(); 813 | } 814 | 815 | else if (!strcmp(layerName, "stage2_1/concat")) 816 | { 817 | assert(mStage2_1_concat_layer.get() == nullptr); 818 | mStage2_1_concat_layer = std::unique_ptr 819 | (createConcatPlugin(serialData, serialLength), nvPluginDeleter); 820 | return mStage2_1_concat_layer.get(); 821 | } 822 | else if (!strcmp(layerName, "stage2_2/concat")) 823 | { 824 | assert(mStage2_2_concat_layer.get() == nullptr); 825 | mStage2_2_concat_layer = std::unique_ptr 826 | (createConcatPlugin(serialData, serialLength), nvPluginDeleter); 827 | return mStage2_2_concat_layer.get(); 828 | } 829 | else if (!strcmp(layerName, "stage2_3/concat")) 830 | { 831 | assert(mStage2_3_concat_layer.get() == nullptr); 832 | mStage2_3_concat_layer = std::unique_ptr 833 | (createConcatPlugin(serialData, serialLength), nvPluginDeleter); 834 | return mStage2_3_concat_layer.get(); 835 | } 836 | else if (!strcmp(layerName, "stage2_4/concat")) 837 | { 838 | assert(mStage2_4_concat_layer.get() == nullptr); 839 | mStage2_4_concat_layer = std::unique_ptr 840 | (createConcatPlugin(serialData, serialLength), nvPluginDeleter); 841 | return mStage2_4_concat_layer.get(); 842 | } 843 | 844 | else if (!strcmp(layerName, "stage3_1/concat")) 845 | { 846 | assert(mStage3_1_concat_layer.get() == nullptr); 847 | mStage3_1_concat_layer = std::unique_ptr 848 | (createConcatPlugin(serialData, serialLength), nvPluginDeleter); 849 | return mStage3_1_concat_layer.get(); 850 | } 851 | else if (!strcmp(layerName, "stage3_2/concat")) 852 | { 853 | assert(mStage3_2_concat_layer.get() == nullptr); 854 | mStage3_2_concat_layer = std::unique_ptr 855 | (createConcatPlugin(serialData, serialLength), nvPluginDeleter); 856 | return mStage3_2_concat_layer.get(); 857 | } 858 | else if (!strcmp(layerName, "stage3_3/concat")) 859 | { 860 | assert(mStage3_3_concat_layer.get() == nullptr); 861 | mStage3_3_concat_layer = std::unique_ptr 862 | (createConcatPlugin(serialData, serialLength), nvPluginDeleter); 863 | return mStage3_3_concat_layer.get(); 864 | } 865 | else if (!strcmp(layerName, "stage3_4/concat")) 866 | { 867 | assert(mStage3_4_concat_layer.get() == nullptr); 868 | mStage3_4_concat_layer = std::unique_ptr 869 | (createConcatPlugin(serialData, serialLength), nvPluginDeleter); 870 | return mStage3_4_concat_layer.get(); 871 | } 872 | else if (!strcmp(layerName, "stage3_5/concat")) 873 | { 874 | assert(mStage3_5_concat_layer.get() == nullptr); 875 | 
mStage3_5_concat_layer = std::unique_ptr 876 | (createConcatPlugin(serialData, serialLength), nvPluginDeleter); 877 | return mStage3_5_concat_layer.get(); 878 | } 879 | else if (!strcmp(layerName, "stage3_6/concat")) 880 | { 881 | assert(mStage3_6_concat_layer.get() == nullptr); 882 | mStage3_6_concat_layer = std::unique_ptr 883 | (createConcatPlugin(serialData, serialLength), nvPluginDeleter); 884 | return mStage3_6_concat_layer.get(); 885 | } 886 | else if (!strcmp(layerName, "stage3_7/concat")) 887 | { 888 | assert(mStage3_7_concat_layer.get() == nullptr); 889 | mStage3_7_concat_layer = std::unique_ptr 890 | (createConcatPlugin(serialData, serialLength), nvPluginDeleter); 891 | return mStage3_7_concat_layer.get(); 892 | } 893 | else if (!strcmp(layerName, "stage3_8/concat")) 894 | { 895 | assert(mStage3_8_concat_layer.get() == nullptr); 896 | mStage3_8_concat_layer = std::unique_ptr 897 | (createConcatPlugin(serialData, serialLength), nvPluginDeleter); 898 | return mStage3_8_concat_layer.get(); 899 | } 900 | 901 | else if (!strcmp(layerName, "stage4_1/concat")) 902 | { 903 | assert(mStage4_1_concat_layer.get() == nullptr); 904 | mStage4_1_concat_layer = std::unique_ptr 905 | (createConcatPlugin(serialData, serialLength), nvPluginDeleter); 906 | return mStage4_1_concat_layer.get(); 907 | } 908 | else if (!strcmp(layerName, "stage4_2/concat")) 909 | { 910 | assert(mStage4_2_concat_layer.get() == nullptr); 911 | mStage4_2_concat_layer = std::unique_ptr 912 | (createConcatPlugin(serialData, serialLength), nvPluginDeleter); 913 | return mStage4_2_concat_layer.get(); 914 | } 915 | else if (!strcmp(layerName, "stage4_3/concat")) 916 | { 917 | assert(mStage4_3_concat_layer.get() == nullptr); 918 | mStage4_3_concat_layer = std::unique_ptr 919 | (createConcatPlugin(serialData, serialLength), nvPluginDeleter); 920 | return mStage4_3_concat_layer.get(); 921 | } 922 | else if (!strcmp(layerName, "stage4_4/concat")) 923 | { 924 | assert(mStage4_4_concat_layer.get() == nullptr); 925 | mStage4_4_concat_layer = std::unique_ptr 926 | (createConcatPlugin(serialData, serialLength), nvPluginDeleter); 927 | return mStage4_4_concat_layer.get(); 928 | } 929 | else if (!strcmp(layerName, "stage4_5/concat")) 930 | { 931 | assert(mStage4_5_concat_layer.get() == nullptr); 932 | mStage4_5_concat_layer = std::unique_ptr 933 | (createConcatPlugin(serialData, serialLength), nvPluginDeleter); 934 | return mStage4_5_concat_layer.get(); 935 | } 936 | else if (!strcmp(layerName, "stage4_6/concat")) 937 | { 938 | assert(mStage4_6_concat_layer.get() == nullptr); 939 | mStage4_6_concat_layer = std::unique_ptr 940 | (createConcatPlugin(serialData, serialLength), nvPluginDeleter); 941 | return mStage4_6_concat_layer.get(); 942 | } 943 | else if (!strcmp(layerName, "mbox_priorbox")) 944 | { 945 | assert(mBox_priorbox_layer.get() == nullptr); 946 | mBox_priorbox_layer = std::unique_ptr 947 | (createConcatPlugin(serialData, serialLength), nvPluginDeleter); 948 | return mBox_priorbox_layer.get(); 949 | } 950 | 951 | else if (!strcmp(layerName, "mbox_loc")) 952 | { 953 | assert(mBox_loc_layer.get() == nullptr); 954 | mBox_loc_layer = std::unique_ptr 955 | (createConcatPlugin(serialData, serialLength), nvPluginDeleter); 956 | return mBox_loc_layer.get(); 957 | } 958 | else if (!strcmp(layerName, "mbox_conf")) 959 | { 960 | assert(mBox_conf_layer.get() == nullptr); 961 | mBox_conf_layer = std::unique_ptr 962 | (createConcatPlugin(serialData, serialLength), nvPluginDeleter); 963 | return mBox_conf_layer.get(); 964 | } 965 | 966 | 
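    // FlattenLayer (defined in pluginImplement.h) serializes only the bottom CHW dims as three
    // ints, so the (serialData, serialLength) constructor used below asserts
    // serialLength == 3 * sizeof(int) and recomputes _size = c * h * w from them. The layer is
    // a no-op on memory: CHW data is already contiguous, so enqueue() is just a
    // device-to-device cudaMemcpyAsync of batchSize * _size elements.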
//flatten 967 | else if (!strcmp(layerName, "ext/pm1_mbox_loc_flat")) 968 | { 969 | assert(mExt_pm1_mbox_loc_flat_layer.get() == nullptr); 970 | mExt_pm1_mbox_loc_flat_layer = std::unique_ptr(new FlattenLayer(serialData, serialLength)); 971 | return mExt_pm1_mbox_loc_flat_layer.get(); 972 | } 973 | else if (!strcmp(layerName, "ext/pm1_mbox_conf_flat")) 974 | { 975 | assert(mExt_pm1_mbox_conf_flat_layer.get() == nullptr); 976 | mExt_pm1_mbox_conf_flat_layer = std::unique_ptr(new FlattenLayer(serialData, serialLength)); 977 | return mExt_pm1_mbox_conf_flat_layer.get(); 978 | } 979 | else if (!strcmp(layerName, "ext/pm2_mbox_loc_flat")) 980 | { 981 | assert(mExt_pm2_mbox_loc_flat_layer.get() == nullptr); 982 | mExt_pm2_mbox_loc_flat_layer = std::unique_ptr(new FlattenLayer(serialData, serialLength)); 983 | return mExt_pm2_mbox_loc_flat_layer.get(); 984 | } 985 | else if (!strcmp(layerName, "ext/pm2_mbox_conf_flat")) 986 | { 987 | assert(mExt_pm2_mbox_conf_flat_layer.get() == nullptr); 988 | mExt_pm2_mbox_conf_flat_layer = std::unique_ptr(new FlattenLayer(serialData, serialLength)); 989 | return mExt_pm2_mbox_conf_flat_layer.get(); 990 | } 991 | else if (!strcmp(layerName, "ext/pm3_mbox_loc_flat")) 992 | { 993 | assert(mExt_pm3_mbox_loc_flat_layer.get() == nullptr); 994 | mExt_pm3_mbox_loc_flat_layer = std::unique_ptr(new FlattenLayer(serialData, serialLength)); 995 | return mExt_pm3_mbox_loc_flat_layer.get(); 996 | } 997 | else if (!strcmp(layerName, "ext/pm3_mbox_conf_flat")) 998 | { 999 | assert(mExt_pm3_mbox_conf_flat_layer.get() == nullptr); 1000 | mExt_pm3_mbox_conf_flat_layer = std::unique_ptr(new FlattenLayer(serialData, serialLength)); 1001 | return mExt_pm3_mbox_conf_flat_layer.get(); 1002 | } 1003 | else if (!strcmp(layerName, "ext/pm4_mbox_loc_flat")) 1004 | { 1005 | assert(mExt_pm4_mbox_loc_flat_layer.get() == nullptr); 1006 | mExt_pm4_mbox_loc_flat_layer = std::unique_ptr(new FlattenLayer(serialData, serialLength)); 1007 | return mExt_pm4_mbox_loc_flat_layer.get(); 1008 | } 1009 | else if (!strcmp(layerName, "ext/pm4_mbox_conf_flat")) 1010 | { 1011 | assert(mExt_pm4_mbox_conf_flat_layer.get() == nullptr); 1012 | mExt_pm4_mbox_conf_flat_layer = std::unique_ptr(new FlattenLayer(serialData, serialLength)); 1013 | return mExt_pm4_mbox_conf_flat_layer.get(); 1014 | } 1015 | else if (!strcmp(layerName, "ext/pm5_mbox_loc_flat")) 1016 | { 1017 | assert(mExt_pm5_mbox_loc_flat_layer.get() == nullptr); 1018 | mExt_pm5_mbox_loc_flat_layer = std::unique_ptr(new FlattenLayer(serialData, serialLength)); 1019 | return mExt_pm5_mbox_loc_flat_layer.get(); 1020 | } 1021 | else if (!strcmp(layerName, "ext/pm5_mbox_conf_flat")) 1022 | { 1023 | assert(mExt_pm5_mbox_conf_flat_layer.get() == nullptr); 1024 | mExt_pm5_mbox_conf_flat_layer = std::unique_ptr(new FlattenLayer(serialData, serialLength)); 1025 | return mExt_pm5_mbox_conf_flat_layer.get(); 1026 | } 1027 | else if (!strcmp(layerName, "ext/pm6_mbox_loc_flat")) 1028 | { 1029 | assert(mExt_pm6_mbox_loc_flat_layer.get() == nullptr); 1030 | mExt_pm6_mbox_loc_flat_layer = std::unique_ptr(new FlattenLayer(serialData, serialLength)); 1031 | return mExt_pm6_mbox_loc_flat_layer.get(); 1032 | } 1033 | else if (!strcmp(layerName, "ext/pm6_mbox_conf_flat")) 1034 | { 1035 | assert(mExt_pm6_mbox_conf_flat_layer.get() == nullptr); 1036 | mExt_pm6_mbox_conf_flat_layer = std::unique_ptr(new FlattenLayer(serialData, serialLength)); 1037 | return mExt_pm6_mbox_conf_flat_layer.get(); 1038 | } 1039 | 1040 | else if (!strcmp(layerName, "mbox_conf_flatten")) 1041 | { 
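    // The confidence head runs through four plugins in sequence: mbox_conf_reshape
    // (Reshape<11>, i.e. view the flat scores as num_priors groups of 11 class logits),
    // mbox_conf_softmax (a custom SoftmaxPlugin, needed because this TensorRT version's
    // built-in softmax only works across channels, not along axis 2), mbox_conf_flatten,
    // and finally detection_out.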
1042 | assert(mMbox_conf_flat_layer.get() == nullptr); 1043 | mMbox_conf_flat_layer = std::unique_ptr(new FlattenLayer(serialData, serialLength)); 1044 | return mMbox_conf_flat_layer.get(); 1045 | } 1046 | 1047 | 1048 | else if (!strcmp(layerName, "mbox_conf_reshape")) 1049 | { 1050 | assert(mMbox_conf_reshape.get() == nullptr); 1051 | // assert(nbWeights == 0 && weights == nullptr); 1052 | mMbox_conf_reshape = std::unique_ptr>(new Reshape<11>(serialData, serialLength)); 1053 | return mMbox_conf_reshape.get(); 1054 | } 1055 | //softmax layer 1056 | else if (!strcmp(layerName, "mbox_conf_softmax")) 1057 | { 1058 | assert( mPluginSoftmax == nullptr); 1059 | 1060 | mPluginSoftmax = std::unique_ptr(new SoftmaxPlugin(serialData, serialLength)); 1061 | return mPluginSoftmax.get(); 1062 | } 1063 | else if (!strcmp(layerName, "detection_out")) 1064 | { 1065 | assert(mDetection_out.get() == nullptr); 1066 | //tensor rt 3.0 1067 | //mDetection_out = std::unique_ptr(createSSDDetectionOutputPlugin({true, false, 0, 21, 400, 200, 0.5, 0.45, CodeType_t::CENTER_SIZE}), nvPluginDeleter); 1068 | //tensor rt 5 1069 | 1070 | 1071 | mDetection_out = std::unique_ptr 1072 | (createSSDDetectionOutputPlugin(serialData, serialLength), nvPluginDeleter); 1073 | return mDetection_out.get(); 1074 | } 1075 | else 1076 | { 1077 | assert(0); 1078 | return nullptr; 1079 | } 1080 | } 1081 | 1082 | bool PluginFactory::isPlugin(const char* name) 1083 | { 1084 | return (!strcmp(name, "ext/pm1_mbox_loc_perm") 1085 | || !strcmp(name, "ext/pm1_mbox_conf_perm") 1086 | || !strcmp(name, "ext/pm2_mbox_loc_perm") 1087 | || !strcmp(name, "ext/pm2_mbox_conf_perm") 1088 | || !strcmp(name, "ext/pm3_mbox_loc_perm") 1089 | || !strcmp(name, "ext/pm3_mbox_conf_perm") 1090 | || !strcmp(name, "ext/pm4_mbox_loc_perm") 1091 | || !strcmp(name, "ext/pm4_mbox_conf_perm") 1092 | || !strcmp(name, "ext/pm5_mbox_loc_perm") 1093 | || !strcmp(name, "ext/pm5_mbox_conf_perm") 1094 | || !strcmp(name, "ext/pm6_mbox_loc_perm") 1095 | || !strcmp(name, "ext/pm6_mbox_conf_perm") 1096 | || !strcmp(name, "ext/pm1_mbox_priorbox") 1097 | || !strcmp(name, "ext/pm2_mbox_priorbox") 1098 | || !strcmp(name, "ext/pm3_mbox_priorbox") 1099 | || !strcmp(name, "ext/pm4_mbox_priorbox") 1100 | || !strcmp(name, "ext/pm5_mbox_priorbox") 1101 | || !strcmp(name, "ext/pm6_mbox_priorbox") 1102 | || !strcmp(name, "stem/concat") 1103 | || !strcmp(name, "stage1_1/concat") 1104 | || !strcmp(name, "stage1_2/concat") 1105 | || !strcmp(name, "stage1_3/concat") 1106 | || !strcmp(name, "stage2_1/concat") 1107 | || !strcmp(name, "stage2_2/concat") 1108 | || !strcmp(name, "stage2_3/concat") 1109 | || !strcmp(name, "stage2_4/concat") 1110 | || !strcmp(name, "stage3_1/concat") 1111 | || !strcmp(name, "stage3_2/concat") 1112 | || !strcmp(name, "stage3_3/concat") 1113 | || !strcmp(name, "stage3_4/concat") 1114 | || !strcmp(name, "stage3_5/concat") 1115 | || !strcmp(name, "stage3_6/concat") 1116 | || !strcmp(name, "stage3_7/concat") 1117 | || !strcmp(name, "stage3_8/concat") 1118 | || !strcmp(name, "stage4_1/concat") 1119 | || !strcmp(name, "stage4_2/concat") 1120 | || !strcmp(name, "stage4_3/concat") 1121 | || !strcmp(name, "stage4_4/concat") 1122 | || !strcmp(name, "stage4_5/concat") 1123 | || !strcmp(name, "stage4_6/concat") 1124 | || !strcmp(name, "mbox_loc") 1125 | || !strcmp(name, "mbox_conf") 1126 | || !strcmp(name, "ext/pm1_mbox_loc_flat") 1127 | || !strcmp(name, "ext/pm1_mbox_conf_flat") 1128 | || !strcmp(name, "ext/pm2_mbox_loc_flat") 1129 | || !strcmp(name, "ext/pm2_mbox_conf_flat") 1130 | 
|| !strcmp(name, "ext/pm3_mbox_loc_flat") 1131 | || !strcmp(name, "ext/pm3_mbox_conf_flat") 1132 | || !strcmp(name, "ext/pm4_mbox_loc_flat") 1133 | || !strcmp(name, "ext/pm4_mbox_conf_flat") 1134 | || !strcmp(name, "ext/pm5_mbox_loc_flat") 1135 | || !strcmp(name, "ext/pm5_mbox_conf_flat") 1136 | || !strcmp(name, "ext/pm6_mbox_loc_flat") 1137 | || !strcmp(name, "ext/pm6_mbox_conf_flat") 1138 | || !strcmp(name, "mbox_conf_reshape") 1139 | || !strcmp(name, "mbox_conf_flatten") 1140 | || !strcmp(name, "mbox_loc") 1141 | || !strcmp(name, "mbox_conf") 1142 | || !strcmp(name, "mbox_priorbox") 1143 | || !strcmp(name, "detection_out") 1144 | || !strcmp(name, "mbox_conf_softmax")); 1145 | 1146 | 1147 | } 1148 | 1149 | void PluginFactory::destroyPlugin() 1150 | { 1151 | 1152 | 1153 | mExt_pm1_mbox_loc_perm_layer.release(); 1154 | mExt_pm1_mbox_conf_perm_layer.release(); 1155 | mExt_pm2_mbox_loc_perm_layer.release(); 1156 | mExt_pm2_mbox_conf_perm_layer.release(); 1157 | mExt_pm3_mbox_loc_perm_layer.release(); 1158 | mExt_pm3_mbox_conf_perm_layer.release(); 1159 | mExt_pm4_mbox_loc_perm_layer.release(); 1160 | mExt_pm4_mbox_conf_perm_layer.release(); 1161 | mExt_pm5_mbox_loc_perm_layer.release(); 1162 | mExt_pm5_mbox_conf_perm_layer.release(); 1163 | mExt_pm6_mbox_loc_perm_layer.release(); 1164 | mExt_pm6_mbox_conf_perm_layer.release(); 1165 | 1166 | mExt_pm1_mbox_priorbox_layer.release(); 1167 | mExt_pm2_mbox_priorbox_layer.release(); 1168 | mExt_pm3_mbox_priorbox_layer.release(); 1169 | mExt_pm4_mbox_priorbox_layer.release(); 1170 | mExt_pm5_mbox_priorbox_layer.release(); 1171 | mExt_pm6_mbox_priorbox_layer.release(); 1172 | 1173 | mStem_concat_layer.release(); 1174 | mStage1_1_concat_layer.release(); 1175 | mStage1_2_concat_layer.release(); 1176 | mStage1_3_concat_layer.release(); 1177 | 1178 | mStage2_1_concat_layer.release(); 1179 | mStage2_2_concat_layer.release(); 1180 | mStage2_3_concat_layer.release(); 1181 | mStage2_4_concat_layer.release(); 1182 | 1183 | 1184 | mStage3_1_concat_layer.release(); 1185 | mStage3_2_concat_layer.release(); 1186 | mStage3_3_concat_layer.release(); 1187 | mStage3_4_concat_layer.release(); 1188 | mStage3_5_concat_layer.release(); 1189 | mStage3_6_concat_layer.release(); 1190 | mStage3_7_concat_layer.release(); 1191 | mStage3_8_concat_layer.release(); 1192 | 1193 | 1194 | mStage4_1_concat_layer.release(); 1195 | mStage4_2_concat_layer.release(); 1196 | mStage4_3_concat_layer.release(); 1197 | mStage4_4_concat_layer.release(); 1198 | mStage4_5_concat_layer.release(); 1199 | mStage4_6_concat_layer.release(); 1200 | 1201 | 1202 | mExt_pm1_mbox_loc_perm_layer= nullptr; 1203 | mExt_pm1_mbox_conf_perm_layer= nullptr; 1204 | mExt_pm2_mbox_loc_perm_layer= nullptr; 1205 | mExt_pm2_mbox_conf_perm_layer= nullptr; 1206 | mExt_pm3_mbox_loc_perm_layer= nullptr; 1207 | mExt_pm3_mbox_conf_perm_layer = nullptr; 1208 | mExt_pm4_mbox_loc_perm_layer= nullptr; 1209 | mExt_pm4_mbox_conf_perm_layer= nullptr; 1210 | mExt_pm5_mbox_loc_perm_layer= nullptr; 1211 | mExt_pm5_mbox_conf_perm_layer= nullptr; 1212 | mExt_pm6_mbox_loc_perm_layer= nullptr; 1213 | mExt_pm6_mbox_conf_perm_layer= nullptr; 1214 | 1215 | mExt_pm1_mbox_priorbox_layer= nullptr; 1216 | mExt_pm2_mbox_priorbox_layer= nullptr; 1217 | mExt_pm3_mbox_priorbox_layer= nullptr; 1218 | mExt_pm4_mbox_priorbox_layer= nullptr; 1219 | mExt_pm5_mbox_priorbox_layer= nullptr; 1220 | mExt_pm6_mbox_priorbox_layer= nullptr; 1221 | 1222 | mStem_concat_layer= nullptr; 1223 | mStage1_1_concat_layer = nullptr; 1224 | mStage1_2_concat_layer= 
nullptr; 1225 | mStage1_3_concat_layer= nullptr; 1226 | 1227 | mStage2_1_concat_layer = nullptr; 1228 | mStage2_2_concat_layer= nullptr; 1229 | mStage2_3_concat_layer= nullptr; 1230 | mStage2_4_concat_layer= nullptr; 1231 | 1232 | 1233 | mStage3_1_concat_layer = nullptr; 1234 | mStage3_2_concat_layer= nullptr; 1235 | mStage3_3_concat_layer= nullptr; 1236 | mStage3_4_concat_layer= nullptr; 1237 | mStage3_5_concat_layer = nullptr; 1238 | mStage3_6_concat_layer= nullptr; 1239 | mStage3_7_concat_layer= nullptr; 1240 | mStage3_8_concat_layer= nullptr; 1241 | 1242 | 1243 | mStage4_1_concat_layer = nullptr; 1244 | mStage4_2_concat_layer= nullptr; 1245 | mStage4_3_concat_layer= nullptr; 1246 | mStage4_4_concat_layer= nullptr; 1247 | mStage4_5_concat_layer = nullptr; 1248 | mStage4_6_concat_layer= nullptr; 1249 | 1250 | mBox_priorbox_layer.release(); 1251 | mBox_priorbox_layer = nullptr; 1252 | mBox_loc_layer.release(); 1253 | mBox_loc_layer = nullptr; 1254 | mBox_conf_layer.release(); 1255 | mBox_conf_layer = nullptr; 1256 | 1257 | mExt_pm1_mbox_loc_flat_layer.release(); 1258 | mExt_pm1_mbox_conf_flat_layer.release(); 1259 | mExt_pm2_mbox_loc_flat_layer.release(); 1260 | mExt_pm2_mbox_conf_flat_layer.release(); 1261 | mExt_pm3_mbox_loc_flat_layer.release(); 1262 | mExt_pm3_mbox_conf_flat_layer.release(); 1263 | mExt_pm4_mbox_loc_flat_layer.release(); 1264 | mExt_pm4_mbox_conf_flat_layer.release(); 1265 | mExt_pm5_mbox_loc_flat_layer.release(); 1266 | mExt_pm5_mbox_conf_flat_layer.release(); 1267 | mExt_pm6_mbox_loc_flat_layer.release(); 1268 | mExt_pm6_mbox_conf_flat_layer.release(); 1269 | 1270 | mExt_pm1_mbox_loc_flat_layer= nullptr; 1271 | mExt_pm1_mbox_conf_flat_layer= nullptr; 1272 | mExt_pm2_mbox_loc_flat_layer= nullptr; 1273 | mExt_pm2_mbox_conf_flat_layer= nullptr; 1274 | mExt_pm3_mbox_loc_flat_layer= nullptr; 1275 | mExt_pm3_mbox_conf_flat_layer= nullptr; 1276 | mExt_pm4_mbox_loc_flat_layer= nullptr; 1277 | mExt_pm4_mbox_conf_flat_layer= nullptr; 1278 | mExt_pm5_mbox_loc_flat_layer= nullptr; 1279 | mExt_pm5_mbox_conf_flat_layer= nullptr; 1280 | mExt_pm6_mbox_loc_flat_layer= nullptr; 1281 | mExt_pm6_mbox_conf_flat_layer= nullptr; 1282 | 1283 | mMbox_conf_flat_layer.release(); 1284 | mMbox_conf_flat_layer = nullptr; 1285 | mMbox_conf_reshape.release(); 1286 | mMbox_conf_reshape = nullptr; 1287 | mPluginSoftmax.release(); 1288 | mPluginSoftmax = nullptr; 1289 | mDetection_out.release(); 1290 | mDetection_out = nullptr; 1291 | } 1292 | -------------------------------------------------------------------------------- /pluginImplement.h: -------------------------------------------------------------------------------- 1 | #ifndef __PLUGIN_LAYER_H__ 2 | #define __PLUGIN_LAYER_H__ 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include "NvCaffeParser.h" 13 | #include "NvInfer.h" 14 | #include "NvInferPlugin.h" 15 | #include "NvUtils.h" 16 | //#include "fp16.h" 17 | 18 | #define CHECK(status) \ 19 | { \ 20 | if (status != 0) \ 21 | { \ 22 | std::cout << "Cuda failure: " << cudaGetErrorString(status) \ 23 | << " at line " << __LINE__ \ 24 | << std::endl; \ 25 | abort(); \ 26 | } \ 27 | } 28 | 29 | 30 | using namespace nvinfer1; 31 | using namespace nvcaffeparser1; 32 | using namespace plugin; 33 | 34 | static const int TIMING_ITERATIONS = 1000; 35 | 36 | 37 | 38 | 39 | enum FunctionType 40 | { 41 | SELECT=0, 42 | SUMMARY 43 | }; 44 | 45 | void cudaSoftmax(int n, int channels, float* x, float*y); 46 | //void cudaSoftmax(int n, int channels, 
__half* x, __half* y); 47 | 48 | 49 | 50 | class bboxProfile { 51 | public: 52 | bboxProfile(float4& p, int idx): pos(p), bboxNum(idx) {} 53 | 54 | float4 pos; 55 | int bboxNum = -1; 56 | int labelID = -1; 57 | 58 | }; 59 | 60 | class tagProfile 61 | { 62 | public: 63 | tagProfile(int b, int l): bboxID(b), label(l) {} 64 | int bboxID; 65 | int label; 66 | }; 67 | 68 | //SSD Reshape layer : shape{0,-1,21} 69 | template 70 | // @TODO: I think the OutC is the Out Channels and it is equal to 21. 71 | class Reshape : public IPlugin 72 | { 73 | public: 74 | Reshape() 75 | { 76 | } 77 | Reshape(const void* buffer, size_t size) 78 | { 79 | assert(size == sizeof(mCopySize)); 80 | mCopySize = *reinterpret_cast(buffer); 81 | } 82 | int getNbOutputs() const override 83 | { 84 | return 1; 85 | } 86 | Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override 87 | { 88 | assert(nbInputDims == 1); 89 | assert(index == 0); 90 | assert(inputs[index].nbDims == 3); 91 | assert((inputs[0].d[0])*(inputs[0].d[1]) % OutC == 0); 92 | 93 | // @TODO: Understood this. 94 | return DimsCHW( inputs[0].d[0] * inputs[0].d[1] / OutC, OutC, inputs[0].d[2]); 95 | } 96 | 97 | int initialize() override { return 0; } 98 | void terminate() override {} 99 | 100 | size_t getWorkspaceSize(int) const override 101 | { 102 | // @TODO: 1 is the batch size. 103 | return mCopySize*1; 104 | } 105 | 106 | // currently it is not possible for a plugin to execute "in place". Therefore we memcpy the data from the input to the output buffer 107 | int enqueue(int batchSize, const void*const *inputs, void** outputs, void* workspace, cudaStream_t stream) override 108 | { 109 | if(mDataType == DataType::kFLOAT){ // FP32 110 | CHECK(cudaMemcpyAsync(outputs[0], inputs[0] , mCopySize * batchSize, cudaMemcpyDeviceToDevice, stream)); 111 | } 112 | else{ //FP16 113 | CHECK(cudaMemcpyAsync( 114 | reinterpret_cast<__half*>(outputs[0]), 115 | reinterpret_cast(inputs[0]), mCopySize * batchSize, 116 | cudaMemcpyDeviceToDevice, stream)); 117 | } 118 | //CHECK(cudaMemcpyAsync(outputs[0], inputs[0] , mCopySize * batchSize, cudaMemcpyDeviceToDevice, stream)); 119 | return 0; 120 | } 121 | size_t getSerializationSize() override 122 | { 123 | return sizeof(mCopySize); 124 | } 125 | void serialize(void* buffer) override 126 | { 127 | *reinterpret_cast(buffer) = mCopySize; 128 | } 129 | void configure(const Dims*inputs, int nbInputs, const Dims* outputs, int nbOutputs, int) override 130 | { 131 | mCopySize = inputs[0].d[0] * inputs[0].d[1] * inputs[0].d[2] * sizeof(float); 132 | } 133 | 134 | protected: 135 | size_t mCopySize; 136 | DataType mDataType{DataType::kFLOAT}; 137 | 138 | }; 139 | 140 | //Softmax layer.TensorRT softmax only support cross channel 141 | class SoftmaxPlugin : public IPlugin 142 | { 143 | //You need to implement it when softmax parameter axis is 2. 144 | public: 145 | int initialize() override { return 0; } 146 | inline void terminate() override {} 147 | 148 | SoftmaxPlugin(){} 149 | SoftmaxPlugin( const void* buffer, size_t size) 150 | { 151 | assert(size == sizeof(mCopySize)); 152 | mCopySize = *reinterpret_cast(buffer); 153 | } 154 | inline int getNbOutputs() const override 155 | { 156 | //@TODO: As the number of outputs are only 1, because there is only layer in top. 
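        // A minimal sketch of what the cudaSoftmax(n, channels, x, y) kernel declared earlier
        // in this header is assumed to compute: one numerically stable softmax per consecutive
        // group of `channels` class scores (n = num_priors * channels). Illustration only:
        //   __global__ void softmaxKernel(int nBoxes, int channels, const float* x, float* y)
        //   {
        //       for (int b = blockIdx.x * blockDim.x + threadIdx.x; b < nBoxes;
        //            b += blockDim.x * gridDim.x)                  // grid-stride over boxes
        //       {
        //           const float* in  = x + b * channels;
        //           float*       out = y + b * channels;
        //           float m = in[0];
        //           for (int c = 1; c < channels; ++c) m = fmaxf(m, in[c]);  // max for stability
        //           float sum = 0.0f;
        //           for (int c = 0; c < channels; ++c) { out[c] = expf(in[c] - m); sum += out[c]; }
        //           for (int c = 0; c < channels; ++c) out[c] /= sum;
        //       }
        //   }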
157 | return 1; 158 | } 159 | Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override 160 | { 161 | assert(nbInputDims == 1); 162 | assert(index == 0); 163 | assert(inputs[index].nbDims == 3); 164 | // assert((inputs[0].d[0])*(inputs[0].d[1]) % OutC == 0); 165 | 166 | // @TODO: Understood this. 167 | return DimsCHW( inputs[0].d[0] , inputs[0].d[1] , inputs[0].d[2] ); 168 | } 169 | 170 | size_t getWorkspaceSize(int) const override 171 | { 172 | // @TODO: 1 is the batch size. 173 | return mCopySize*1; 174 | } 175 | 176 | int enqueue(int batchSize, const void*const *inputs, void** outputs, void* workspace, cudaStream_t stream) override 177 | { 178 | //std::cout<<"flatten enqueue:"<(*outputs)); 183 | 184 | return 0; 185 | } 186 | 187 | size_t getSerializationSize() override 188 | { 189 | return sizeof(mCopySize); 190 | } 191 | void serialize(void* buffer) override 192 | { 193 | *reinterpret_cast(buffer) = mCopySize; 194 | } 195 | void configure(const Dims*inputs, int nbInputs, const Dims* outputs, int nbOutputs, int) override 196 | { 197 | mCopySize = inputs[0].d[0] * inputs[0].d[1] * inputs[0].d[2] * sizeof(float); 198 | } 199 | 200 | protected: 201 | size_t mCopySize; 202 | DataType mDataType{DataType::kFLOAT}; 203 | 204 | }; 205 | 206 | 207 | //SSD Flatten layer 208 | class FlattenLayer : public IPlugin 209 | { 210 | public: 211 | 212 | FlattenLayer(){} 213 | FlattenLayer(const void* buffer, size_t size) 214 | { 215 | assert(size == 3 * sizeof(int)); 216 | const int* d = reinterpret_cast(buffer); 217 | _size = d[0] * d[1] * d[2]; 218 | dimBottom = DimsCHW{d[0], d[1], d[2]}; 219 | } 220 | 221 | inline int getNbOutputs() const override { return 1; }; 222 | Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override 223 | { 224 | assert(1 == nbInputDims); 225 | assert(0 == index); 226 | assert(3 == inputs[index].nbDims); 227 | _size = inputs[0].d[0] * inputs[0].d[1] * inputs[0].d[2]; 228 | return DimsCHW(_size, 1, 1); 229 | } 230 | 231 | int initialize() override 232 | { 233 | return 0; 234 | } 235 | inline void terminate() override {} 236 | 237 | inline size_t getWorkspaceSize(int) const override { return 0; } 238 | 239 | int enqueue(int batchSize, const void*const *inputs, void** outputs, void*, cudaStream_t stream) override 240 | { 241 | //std::cout<<"flatten enqueue:"<(outputs[0]), 248 | reinterpret_cast(inputs[0]), 249 | batchSize*_size*sizeof(__half), 250 | cudaMemcpyDeviceToDevice,stream)); 251 | } 252 | 253 | //CHECK(cudaMemcpyAsync(outputs[0],inputs[0],batchSize*_size*sizeof(float),cudaMemcpyDeviceToDevice,stream)); 254 | return 0; 255 | } 256 | 257 | size_t getSerializationSize() override 258 | { 259 | return 3 * sizeof(int); 260 | } 261 | 262 | void serialize(void* buffer) override 263 | { 264 | int* d = reinterpret_cast(buffer); 265 | d[0] = dimBottom.c(); d[1] = dimBottom.h(); d[2] = dimBottom.w(); 266 | } 267 | 268 | void configure(const Dims*inputs, int nbInputs, const Dims* outputs, int nbOutputs, int) override 269 | { 270 | dimBottom = DimsCHW(inputs[0].d[0], inputs[0].d[1], inputs[0].d[2]); 271 | } 272 | protected: 273 | DataType mDataType{DataType::kFLOAT}; 274 | DimsCHW dimBottom; 275 | int _size; 276 | }; 277 | 278 | 279 | class PluginFactory : public nvinfer1::IPluginFactory, public nvcaffeparser1::IPluginFactory 280 | { 281 | public: 282 | virtual nvinfer1::IPlugin* createPlugin(const char* layerName, const nvinfer1::Weights* weights, int nbWeights) override; 283 | IPlugin* createPlugin(const char* layerName, const void* 
class PluginFactory : public nvinfer1::IPluginFactory, public nvcaffeparser1::IPluginFactory
{
public:
    virtual nvinfer1::IPlugin* createPlugin(const char* layerName, const nvinfer1::Weights* weights, int nbWeights) override;
    IPlugin* createPlugin(const char* layerName, const void* serialData, size_t serialLength) override;

    void(*nvPluginDeleter)(INvPlugin*) { [](INvPlugin* ptr) { ptr->destroy(); } };

    bool isPlugin(const char* name) override;
    void destroyPlugin();


    //pelee: permute plugins for the loc/conf heads
    std::unique_ptr<INvPlugin, void(*)(INvPlugin*)> mExt_pm1_mbox_loc_perm_layer{ nullptr, nvPluginDeleter };
    std::unique_ptr<INvPlugin, void(*)(INvPlugin*)> mExt_pm1_mbox_conf_perm_layer{ nullptr, nvPluginDeleter };
    std::unique_ptr<INvPlugin, void(*)(INvPlugin*)> mExt_pm2_mbox_loc_perm_layer{ nullptr, nvPluginDeleter };
    std::unique_ptr<INvPlugin, void(*)(INvPlugin*)> mExt_pm2_mbox_conf_perm_layer{ nullptr, nvPluginDeleter };
    std::unique_ptr<INvPlugin, void(*)(INvPlugin*)> mExt_pm3_mbox_loc_perm_layer{ nullptr, nvPluginDeleter };
    std::unique_ptr<INvPlugin, void(*)(INvPlugin*)> mExt_pm3_mbox_conf_perm_layer{ nullptr, nvPluginDeleter };
    std::unique_ptr<INvPlugin, void(*)(INvPlugin*)> mExt_pm4_mbox_loc_perm_layer{ nullptr, nvPluginDeleter };
    std::unique_ptr<INvPlugin, void(*)(INvPlugin*)> mExt_pm4_mbox_conf_perm_layer{ nullptr, nvPluginDeleter };
    std::unique_ptr<INvPlugin, void(*)(INvPlugin*)> mExt_pm5_mbox_loc_perm_layer{ nullptr, nvPluginDeleter };
    std::unique_ptr<INvPlugin, void(*)(INvPlugin*)> mExt_pm5_mbox_conf_perm_layer{ nullptr, nvPluginDeleter };
    std::unique_ptr<INvPlugin, void(*)(INvPlugin*)> mExt_pm6_mbox_loc_perm_layer{ nullptr, nvPluginDeleter };
    std::unique_ptr<INvPlugin, void(*)(INvPlugin*)> mExt_pm6_mbox_conf_perm_layer{ nullptr, nvPluginDeleter };

    //pelee: priorbox plugins
    std::unique_ptr<INvPlugin, void(*)(INvPlugin*)> mExt_pm1_mbox_priorbox_layer{ nullptr, nvPluginDeleter };
    std::unique_ptr<INvPlugin, void(*)(INvPlugin*)> mExt_pm2_mbox_priorbox_layer{ nullptr, nvPluginDeleter };
    std::unique_ptr<INvPlugin, void(*)(INvPlugin*)> mExt_pm3_mbox_priorbox_layer{ nullptr, nvPluginDeleter };
    std::unique_ptr<INvPlugin, void(*)(INvPlugin*)> mExt_pm4_mbox_priorbox_layer{ nullptr, nvPluginDeleter };
    std::unique_ptr<INvPlugin, void(*)(INvPlugin*)> mExt_pm5_mbox_priorbox_layer{ nullptr, nvPluginDeleter };
    std::unique_ptr<INvPlugin, void(*)(INvPlugin*)> mExt_pm6_mbox_priorbox_layer{ nullptr, nvPluginDeleter };

    //detection output layer
    std::unique_ptr<INvPlugin, void(*)(INvPlugin*)> mDetection_out{ nullptr, nvPluginDeleter };
    //pelee: dense-block concat plugins
    std::unique_ptr<INvPlugin, void(*)(INvPlugin*)> mStem_concat_layer{ nullptr, nvPluginDeleter };
    std::unique_ptr<INvPlugin, void(*)(INvPlugin*)> mStage1_1_concat_layer{ nullptr, nvPluginDeleter };
    std::unique_ptr<INvPlugin, void(*)(INvPlugin*)> mStage1_2_concat_layer{ nullptr, nvPluginDeleter };
    std::unique_ptr<INvPlugin, void(*)(INvPlugin*)> mStage1_3_concat_layer{ nullptr, nvPluginDeleter };
    std::unique_ptr<INvPlugin, void(*)(INvPlugin*)> mStage2_1_concat_layer{ nullptr, nvPluginDeleter };
    std::unique_ptr<INvPlugin, void(*)(INvPlugin*)> mStage2_2_concat_layer{ nullptr, nvPluginDeleter };
    std::unique_ptr<INvPlugin, void(*)(INvPlugin*)> mStage2_3_concat_layer{ nullptr, nvPluginDeleter };
    std::unique_ptr<INvPlugin, void(*)(INvPlugin*)> mStage2_4_concat_layer{ nullptr, nvPluginDeleter };
    std::unique_ptr<INvPlugin, void(*)(INvPlugin*)> mStage3_1_concat_layer{ nullptr, nvPluginDeleter };
    std::unique_ptr<INvPlugin, void(*)(INvPlugin*)> mStage3_2_concat_layer{ nullptr, nvPluginDeleter };
    std::unique_ptr<INvPlugin, void(*)(INvPlugin*)> mStage3_3_concat_layer{ nullptr, nvPluginDeleter };
    std::unique_ptr<INvPlugin, void(*)(INvPlugin*)> mStage3_4_concat_layer{ nullptr, nvPluginDeleter };
    std::unique_ptr<INvPlugin, void(*)(INvPlugin*)> mStage3_5_concat_layer{ nullptr, nvPluginDeleter };
    std::unique_ptr<INvPlugin, void(*)(INvPlugin*)> mStage3_6_concat_layer{ nullptr, nvPluginDeleter };
    std::unique_ptr<INvPlugin, void(*)(INvPlugin*)> mStage3_7_concat_layer{ nullptr, nvPluginDeleter };
    std::unique_ptr<INvPlugin, void(*)(INvPlugin*)> mStage3_8_concat_layer{ nullptr, nvPluginDeleter };
    std::unique_ptr<INvPlugin, void(*)(INvPlugin*)> mStage4_1_concat_layer{ nullptr, nvPluginDeleter };
    std::unique_ptr<INvPlugin, void(*)(INvPlugin*)> mStage4_2_concat_layer{ nullptr, nvPluginDeleter };
    std::unique_ptr<INvPlugin, void(*)(INvPlugin*)> mStage4_3_concat_layer{ nullptr, nvPluginDeleter };
    std::unique_ptr<INvPlugin, void(*)(INvPlugin*)> mStage4_4_concat_layer{ nullptr, nvPluginDeleter };
    std::unique_ptr<INvPlugin, void(*)(INvPlugin*)> mStage4_5_concat_layer{ nullptr, nvPluginDeleter };
    std::unique_ptr<INvPlugin, void(*)(INvPlugin*)> mStage4_6_concat_layer{ nullptr, nvPluginDeleter };

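    // How one of these members is typically populated inside createPlugin()
    // -- a sketch against the legacy NvInferPlugin factory API this header
    // builds with (the axis value is an assumption; see pluginImplement.cpp
    // for the real wiring):
    //
    //   mStem_concat_layer = std::unique_ptr<INvPlugin, void(*)(INvPlugin*)>(
    //       plugin::createConcatPlugin(/*concatAxis=*/1, /*ignoreBatch=*/false),
    //       nvPluginDeleter);
    //   return mStem_concat_layer.get();
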
    std::unique_ptr<INvPlugin, void(*)(INvPlugin*)> mBox_loc_layer{ nullptr, nvPluginDeleter };
    std::unique_ptr<INvPlugin, void(*)(INvPlugin*)> mBox_conf_layer{ nullptr, nvPluginDeleter };
    std::unique_ptr<INvPlugin, void(*)(INvPlugin*)> mBox_priorbox_layer{ nullptr, nvPluginDeleter };

    //reshape layer
    std::unique_ptr<Reshape<21>> mMbox_conf_reshape{ nullptr };
    //flatten layers
    //pelee
    std::unique_ptr<FlattenLayer> mExt_pm1_mbox_loc_flat_layer{ nullptr };
    std::unique_ptr<FlattenLayer> mExt_pm1_mbox_conf_flat_layer{ nullptr };
    std::unique_ptr<FlattenLayer> mExt_pm2_mbox_loc_flat_layer{ nullptr };
    std::unique_ptr<FlattenLayer> mExt_pm2_mbox_conf_flat_layer{ nullptr };
    std::unique_ptr<FlattenLayer> mExt_pm3_mbox_loc_flat_layer{ nullptr };
    std::unique_ptr<FlattenLayer> mExt_pm3_mbox_conf_flat_layer{ nullptr };
    std::unique_ptr<FlattenLayer> mExt_pm4_mbox_loc_flat_layer{ nullptr };
    std::unique_ptr<FlattenLayer> mExt_pm4_mbox_conf_flat_layer{ nullptr };
    std::unique_ptr<FlattenLayer> mExt_pm5_mbox_loc_flat_layer{ nullptr };
    std::unique_ptr<FlattenLayer> mExt_pm5_mbox_conf_flat_layer{ nullptr };
    std::unique_ptr<FlattenLayer> mExt_pm6_mbox_loc_flat_layer{ nullptr };
    std::unique_ptr<FlattenLayer> mExt_pm6_mbox_conf_flat_layer{ nullptr };
    std::unique_ptr<FlattenLayer> mBox_conf_flat_layer{ nullptr };


    //softmax layer
    std::unique_ptr<SoftmaxPlugin> mPluginSoftmax{ nullptr };
    std::unique_ptr<FlattenLayer> mMbox_conf_flat_layer{ nullptr };


};

#endif
--------------------------------------------------------------------------------
/tensorNet.cpp:
--------------------------------------------------------------------------------
#include <fstream>
#include "common.h"
#include "tensorNet.h"
#include <sstream>
#include <iostream>

using namespace nvinfer1;


bool TensorNet::LoadNetwork(const char* prototxt_path,
                            const char* model_path,
                            const char* input_blob,
                            const std::vector<std::string>& output_blobs,
                            uint32_t maxBatchSize)
{
    //assert( !prototxt_path || !model_path );

    // attempt to load the network from a cache before profiling with TensorRT
    std::stringstream gieModelStdStream;
    gieModelStdStream.seekg(0, gieModelStdStream.beg);
    char cache_path[512];
    sprintf(cache_path, "%s.%u.tensorcache", model_path, maxBatchSize);
    printf("attempting to open cache file %s\n", cache_path);

    std::ifstream cache(cache_path);

    if( !cache )
    {
        printf("cache file not found, profiling network model\n");

        bool load = caffeToTRTModel(prototxt_path, model_path, output_blobs, maxBatchSize, gieModelStdStream);
        if(!load){
            printf("failed to load %s\n", model_path);
            return false;
        }else{
            printf("network profiling complete, writing cache to %s\n", cache_path);
        }

        std::ofstream outFile;
        outFile.open(cache_path);
        outFile << gieModelStdStream.rdbuf();
        outFile.close();
        gieModelStdStream.seekg(0, gieModelStdStream.beg);
        printf("completed writing cache to %s\n", cache_path);

        infer = createInferRuntime(gLogger);
        /**
         * deserializeCudaEngine loads the serialized CUDA engine (plan file).
         */
        std::cout << "createInference" << std::endl;
        engine = infer->deserializeCudaEngine(gieModelStream->data(), gieModelStream->size(), &pluginFactory);
        std::cout << "createInference_end" << std::endl;
        printf("Bindings after deserializing:\n");
        for (int bi = 0; bi < engine->getNbBindings(); bi++) {
            if (engine->bindingIsInput(bi) == true) printf("Binding %d (%s): Input.\n", bi, engine->getBindingName(bi));
            else printf("Binding %d (%s): Output.\n", bi, engine->getBindingName(bi));
        }
    }
    else
    {
        std::cout << "loading network profile from cache..." << std::endl;
        gieModelStdStream << cache.rdbuf();
        cache.close();
        gieModelStdStream.seekg(0, std::ios::end);
        const int modelSize = gieModelStdStream.tellg();
        gieModelStdStream.seekg(0, std::ios::beg);
        void* modelMem = malloc(modelSize);
        gieModelStdStream.read((char*)modelMem, modelSize);

        infer = createInferRuntime(gLogger);
        std::cout << "createInference" << std::endl;
        engine = infer->deserializeCudaEngine(modelMem, modelSize, &pluginFactory);
        //free(modelMem);
        std::cout << "createInference_end" << std::endl;
        printf("Bindings after deserializing:\n");
        for (int bi = 0; bi < engine->getNbBindings(); bi++) {
            if (engine->bindingIsInput(bi) == true) printf("Binding %d (%s): Input.\n", bi, engine->getBindingName(bi));
            else printf("Binding %d (%s): Output.\n", bi, engine->getBindingName(bi));
        }
    }

    return true;   // both paths end with a live engine
}
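/*
 * Example call (a sketch; the output blob name is an assumption based on
 * typical SSD deployments, not a value read from this repository's prototxt):
 *
 *   TensorNet net;
 *   std::vector<std::string> outputs{ "detection_out" };
 *   net.LoadNetwork("model/pelee/pelee_deploy_iplugin.prototxt",
 *                   "model/pelee/pelee_merged.caffemodel",
 *                   "data", outputs, 1);
 */
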
bool TensorNet::caffeToTRTModel(const char* deployFile,
                                const char* modelFile,
                                const std::vector<std::string>& outputs,
                                unsigned int maxBatchSize,
                                std::ostream& gieModelStdStream)
{
    IBuilder* builder = createInferBuilder(gLogger);
    INetworkDefinition* network = builder->createNetwork();
    // builder->setMinFindIterations(3);    // allow time for TX1 GPU to spin up
    // builder->setAverageFindIterations(2);
    ICaffeParser* parser = createCaffeParser();
    parser->setPluginFactory(&pluginFactory);
    //builder->setFp16Mode(true);
    bool useFp16 = false;
    //builder->platformHasFastFp16();
    //@Seojin: uncomment to build in fp16
    //useFp16 = true;

    DataType modelDataType = useFp16 ? DataType::kHALF : DataType::kFLOAT;

    //modelDataType = DataType::kHALF;

    // std::cout << deployFile << std::endl;
    const IBlobNameToTensor* blobNameToTensor = parser->parse(deployFile,
                                                              modelFile,
                                                              *network,
                                                              modelDataType);
    assert(blobNameToTensor != nullptr);
    for (auto& s : outputs) network->markOutput(*blobNameToTensor->find(s.c_str()));

    builder->setMaxBatchSize(maxBatchSize);
    builder->setMaxWorkspaceSize(16 << 20);

    if(useFp16)
    {
        builder->setHalf2Mode(true);
        std::cout << "Use FP16 Mode:" << useFp16 << std::endl;
    }

    ICudaEngine* engine = builder->buildCudaEngine(*network);
    assert(engine);
    // we don't need the network any more, and we can destroy the parser
    network->destroy();
    parser->destroy();
    // serialize the engine, then close everything down
    gieModelStream = engine->serialize();
    if(!gieModelStream)
    {
        std::cout << "failed to serialize CUDA engine" << std::endl;
        return false;
    }
    gieModelStdStream.write((const char*)gieModelStream->data(), gieModelStream->size());
    engine->destroy();
    builder->destroy();
    pluginFactory.destroyPlugin();
    shutdownProtobufLibrary();

    std::cout << "caffeToTRTModel Finished" << std::endl;
    return true;
}
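/*
 * FP16 path (a sketch against this TensorRT 3.x-era API; newer releases
 * replace setHalf2Mode() with setFp16Mode()/BuilderFlag::kFP16):
 *
 *   bool useFp16 = builder->platformHasFastFp16();   // probe the device
 *   if (useFp16) builder->setHalf2Mode(true);        // half2 kernels
 */
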
"<getNbBindings()==nbBuffer); 174 | IExecutionContext* context = engine->createExecutionContext(); 175 | context->setProfiler(&gProfiler); 176 | context->execute(batchSize, buffers); 177 | context->destroy(); 178 | } 179 | 180 | void TensorNet::timeInference(int iteration, int batchSize) 181 | { 182 | int inputIdx = 0; 183 | size_t inputSize = 0; 184 | void* buffers[engine->getNbBindings()]; 185 | 186 | for (int b = 0; b < engine->getNbBindings(); b++) 187 | { 188 | DimsCHW dims = static_cast(engine->getBindingDimensions(b)); 189 | size_t size = batchSize * dims.c() * dims.h() * dims.w() * sizeof(float); 190 | CHECK(cudaMalloc(&buffers[b], size)); 191 | 192 | if(engine->bindingIsInput(b) == true) 193 | { 194 | inputIdx = b; 195 | inputSize = size; 196 | } 197 | } 198 | 199 | IExecutionContext* context = engine->createExecutionContext(); 200 | context->setProfiler(&gProfiler); 201 | 202 | CHECK(cudaMemset(buffers[inputIdx], 0, inputSize)); 203 | 204 | for (int i = 0; i < iteration;i++) context->execute(batchSize, buffers); 205 | 206 | context->destroy(); 207 | for (int b = 0; b < engine->getNbBindings(); b++) CHECK(cudaFree(buffers[b])); 208 | 209 | } 210 | 211 | DimsCHW TensorNet::getTensorDims(const char* name) 212 | { 213 | for (int b = 0; b < engine->getNbBindings(); b++) { 214 | if( !strcmp( name, engine->getBindingName(b)) ) 215 | return static_cast(engine->getBindingDimensions(b)); 216 | } 217 | return DimsCHW{0,0,0}; 218 | } 219 | 220 | //void TensorNet::getLayerOutput(void** buffers, int nbBuffer, int batchSize) 221 | //{ 222 | // /* * 223 | // * @TODO: Get the layer with name name in the network 224 | // * */ 225 | // std::cout << "Came into the image inference method here. "<getNbBindings()==nbBuffer); 227 | // IExecutionContext* context = engine->createExecutionContext(); 228 | // context->setProfiler(&gProfiler); 229 | // context->execute( batchSize , buffers); 230 | // 231 | // context->destroy(); 232 | // 233 | //} 234 | 235 | void TensorNet::printTimes(int iteration) 236 | { 237 | gProfiler.printLayerTimes(iteration); 238 | } 239 | 240 | void TensorNet::destroy() 241 | { 242 | pluginFactory.destroyPlugin(); 243 | engine->destroy(); 244 | infer->destroy(); 245 | } 246 | -------------------------------------------------------------------------------- /tensorNet.h: -------------------------------------------------------------------------------- 1 | #include "pluginImplement.h" 2 | 3 | using namespace nvinfer1; 4 | using namespace nvcaffeparser1; 5 | 6 | 7 | /******************************/ 8 | // TensorRT utility 9 | /******************************/ 10 | class Logger : public ILogger 11 | { 12 | void log(Severity severity, const char* msg) override 13 | { 14 | if (severity!=Severity::kINFO) std::cout << msg << std::endl; 15 | } 16 | }; 17 | 18 | struct Profiler : public IProfiler 19 | { 20 | typedef std::pair Record; 21 | std::vector mProfile; 22 | 23 | virtual void reportLayerTime(const char* layerName, float ms) 24 | { 25 | auto record = std::find_if(mProfile.begin(), mProfile.end(), [&](const Record& r){ return r.first == layerName; }); 26 | 27 | if (record == mProfile.end()) mProfile.push_back(std::make_pair(layerName, ms)); 28 | else record->second += ms; 29 | } 30 | 31 | void printLayerTimes(const int TIMING_ITERATIONS) 32 | { 33 | float totalTime = 0; 34 | for (size_t i = 0; i < mProfile.size(); i++) 35 | { 36 | printf("%-40.40s %4.3fms\n", mProfile[i].first.c_str(), mProfile[i].second / TIMING_ITERATIONS); 37 | totalTime += mProfile[i].second; 38 | } 39 | printf("Time 
--------------------------------------------------------------------------------
/tensorNet.h:
--------------------------------------------------------------------------------
#include "pluginImplement.h"

using namespace nvinfer1;
using namespace nvcaffeparser1;


/******************************/
// TensorRT utility
/******************************/
class Logger : public ILogger
{
    void log(Severity severity, const char* msg) override
    {
        if (severity != Severity::kINFO) std::cout << msg << std::endl;
    }
};

struct Profiler : public IProfiler
{
    typedef std::pair<std::string, float> Record;
    std::vector<Record> mProfile;

    virtual void reportLayerTime(const char* layerName, float ms)
    {
        auto record = std::find_if(mProfile.begin(), mProfile.end(), [&](const Record& r){ return r.first == layerName; });

        if (record == mProfile.end()) mProfile.push_back(std::make_pair(layerName, ms));
        else record->second += ms;
    }

    void printLayerTimes(const int TIMING_ITERATIONS)
    {
        float totalTime = 0;
        for (size_t i = 0; i < mProfile.size(); i++)
        {
            printf("%-40.40s %4.3fms\n", mProfile[i].first.c_str(), mProfile[i].second / TIMING_ITERATIONS);
            totalTime += mProfile[i].second;
        }
        printf("Time over all layers: %4.3fms\n", totalTime / TIMING_ITERATIONS);
    }
};


/******************************/
// TensorRT Main
/******************************/
class TensorNet
{
public:
    bool caffeToTRTModel(const char* deployFile,
                         const char* modelFile,
                         const std::vector<std::string>& outputs,
                         unsigned int maxBatchSize,
                         std::ostream& gieModelStream);
    bool LoadNetwork( const char* prototxt_path,
                      const char* model_path,
                      const char* input_blob,
                      const std::vector<std::string>& output_blobs,
                      uint32_t maxBatchSize );
    void createInference();

    void imageInference(void** buffers, int nbBuffer, int batchSize);
    void timeInference(int iteration, int batchSize);

    DimsCHW getTensorDims(const char* name);

    // void getLayerOutput(const char* name);

    void printTimes(int iteration);
    void destroy();

private:

    PluginFactory pluginFactory;
    IHostMemory* gieModelStream{nullptr};

    IRuntime* infer;
    ICudaEngine* engine;

    Logger gLogger;
    Profiler gProfiler;

};


//#endif

--------------------------------------------------------------------------------
/testPic/test.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eric612/Pelee-Seg-TensorRT/05bf0b31c5891adaf64f40b784ef4a1927d68862/testPic/test.png
--------------------------------------------------------------------------------
/testVideo/test.avi:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eric612/Pelee-Seg-TensorRT/05bf0b31c5891adaf64f40b784ef4a1927d68862/testVideo/test.avi
--------------------------------------------------------------------------------
/util/cuda/cudaMappedMemory.h:
--------------------------------------------------------------------------------
/*
 * inference-101
 */

#ifndef __CUDA_MAPPED_MEMORY_H_
#define __CUDA_MAPPED_MEMORY_H_


#include "cudaUtility.h"


/**
 * Allocate ZeroCopy mapped memory, shared between CUDA and CPU.
 * @ingroup util
 */
inline bool cudaAllocMapped( void** cpuPtr, void** gpuPtr, size_t size )
{
    if( !cpuPtr || !gpuPtr || size == 0 )
        return false;

    //CUDA(cudaSetDeviceFlags(cudaDeviceMapHost));

    if( CUDA_FAILED(cudaHostAlloc(cpuPtr, size, cudaHostAllocMapped)) )
        return false;

    if( CUDA_FAILED(cudaHostGetDevicePointer(gpuPtr, *cpuPtr, 0)) )
        return false;

    memset(*cpuPtr, 0, size);
    printf("[cuda] cudaAllocMapped %zu bytes, CPU %p GPU %p\n", size, *cpuPtr, *gpuPtr);
    return true;
}


#endif
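/*
 * Example use (a sketch): one zero-copy allocation visible from both sides,
 * so the CPU can fill the frame while CUDA kernels read it without a copy.
 *
 *   void* imgCPU = NULL;  void* imgGPU = NULL;
 *   if( !cudaAllocMapped(&imgCPU, &imgGPU, 1280 * 720 * sizeof(float4)) )
 *       printf("failed to allocate mapped buffer\n");
 *   // ... write pixels through imgCPU, launch kernels on imgGPU ...
 *   cudaFreeHost(imgCPU);   // a single free releases both views
 */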
--------------------------------------------------------------------------------
/util/cuda/cudaNormalize.cu:
--------------------------------------------------------------------------------
/*
 * inference-101
 */

#include "cudaNormalize.h"



// gpuNormalize
template<typename T>
__global__ void gpuNormalize( T* input, T* output, int width, int height, float scaling_factor )
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    if( x >= width || y >= height )
        return;

    const T px = input[ y * width + x ];

    output[y*width+x] = make_float4(px.x * scaling_factor,
                                    px.y * scaling_factor,
                                    px.z * scaling_factor,
                                    px.w * scaling_factor);
}


// cudaNormalizeRGBA
cudaError_t cudaNormalizeRGBA( float4* input, const float2& input_range,
                               float4* output, const float2& output_range,
                               size_t width, size_t height )
{
    if( !input || !output )
        return cudaErrorInvalidDevicePointer;

    if( width == 0 || height == 0 )
        return cudaErrorInvalidValue;

    const float multiplier = output_range.y / input_range.y;

    // launch kernel
    const dim3 blockDim(8, 8);
    const dim3 gridDim(iDivUp(width,blockDim.x), iDivUp(height,blockDim.y));

    gpuNormalize<float4><<<gridDim, blockDim>>>(input, output, width, height, multiplier);

    return CUDA(cudaGetLastError());
}

--------------------------------------------------------------------------------
/util/cuda/cudaNormalize.h:
--------------------------------------------------------------------------------
/*
 * inference-101
 */

#ifndef __CUDA_NORMALIZE_H__
#define __CUDA_NORMALIZE_H__


#include "cudaUtility.h"


/**
 * Rebase the pixel intensities of an image between two scales.
 * For example, convert an image with values 0.0-255 to 0.0-1.0.
 * @ingroup util
 */
cudaError_t cudaNormalizeRGBA( float4* input, const float2& input_range,
                               float4* output, const float2& output_range,
                               size_t width, size_t height );

#endif
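/*
 * Example (a sketch): rescale a mapped RGBA frame from 0-255 to 0-1 in place.
 * Only the range maxima matter, since the kernel applies a single multiplier.
 *
 *   CUDA(cudaNormalizeRGBA((float4*)imgGPU, make_float2(0.0f, 255.0f),
 *                          (float4*)imgGPU, make_float2(0.0f, 1.0f),
 *                          width, height));
 */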
--------------------------------------------------------------------------------
/util/cuda/cudaOverlay.cu:
--------------------------------------------------------------------------------
/*
 * http://github.com/dusty-nv/jetson-inference
 */

#include "cudaOverlay.h"


static inline __device__ __host__ bool eq_less( float a, float b, float epsilon )
{
    return (a > (b - epsilon) && a < (b + epsilon)) ? true : false;
}

template<typename T>
__global__ void gpuRectOutlines( T* input, T* output, int width, int height,
                                 float4* rects, int numRects, float4 color )
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    if( x >= width || y >= height )
        return;

    const T px_in = input[ y * width + x ];
    T px_out = px_in;

    const float fx = x;
    const float fy = y;

    const float thick = 10.0f;
    const float alpha = color.w / 255.0f;
    const float ialph = 1.0f - alpha;

    for( int nr=0; nr < numRects; nr++ )
    {
        const float4 r = rects[nr];

        //printf("%i %i %i %f %f %f %f\n", numRects, x, y, r.x, r.y, r.z, r.w);

        if( fy >= r.y && fy <= r.w /*&& (eq_less(fx, r.x, ep) || eq_less(fx, r.z, ep))*/ )
        {
            if( fx >= r.x && fx <= r.z /*&& (eq_less(fy, r.y, ep) || eq_less(fy, r.w, ep))*/ )
            {
                //printf("cuda rect %i %i\n", x, y);

                px_out.x = alpha * color.x + ialph * px_out.x;
                px_out.y = alpha * color.y + ialph * px_out.y;
                px_out.z = alpha * color.z + ialph * px_out.z;
            }
        }
    }

    output[y * width + x] = px_out;
}


cudaError_t cudaRectOutlineOverlay( float4* input, float4* output, uint32_t width, uint32_t height, float4* boundingBoxes, int numBoxes, const float4& color )
{
    if( !input || !output || width == 0 || height == 0 || !boundingBoxes || numBoxes == 0 )
        return cudaErrorInvalidValue;

    // launch kernel
    const dim3 blockDim(8, 8);
    const dim3 gridDim(iDivUp(width,blockDim.x), iDivUp(height,blockDim.y));

    gpuRectOutlines<float4><<<gridDim, blockDim>>>(input, output, width, height, boundingBoxes, numBoxes, color);

    return cudaGetLastError();
}

--------------------------------------------------------------------------------
/util/cuda/cudaOverlay.h:
--------------------------------------------------------------------------------
/*
 * http://github.com/dusty-nv/jetson-inference
 */

#ifndef __CUDA_OVERLAY_H__
#define __CUDA_OVERLAY_H__

#include "cudaUtility.h"


/**
 * cudaRectOutlineOverlay
 * @ingroup util
 */
cudaError_t cudaRectOutlineOverlay( float4* input, float4* output, uint32_t width, uint32_t height, float4* boundingBoxes, int numBoxes, const float4& color );


/**
 * cudaRectFillOverlay
 * @ingroup util
 */
//cudaError_t cudaRectFillOverlay( float4* input, float4* output, uint32_t width, uint32_t height, float4* boundingBoxes, int numBoxes, const float4& color );

#endif
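/*
 * Example (a sketch): blend two detection boxes, roughly 40%-opaque green,
 * into an RGBA frame. Boxes are float4(x1, y1, x2, y2) in pixel coordinates
 * and must live in memory the kernel can read (device or zero-copy mapped).
 *
 *   float4* boxesGPU;   // 2 * sizeof(float4), filled with box corners
 *   CUDA(cudaRectOutlineOverlay((float4*)imgGPU, (float4*)imgGPU,
 *                               width, height, boxesGPU, 2,
 *                               make_float4(0.0f, 255.0f, 0.0f, 100.0f)));
 */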
--------------------------------------------------------------------------------
/util/cuda/cudaRGB.cu:
--------------------------------------------------------------------------------
/*
 * http://github.com/dusty-nv
 */

#include "cudaRGB.h"

__global__ void loadImage(uchar3* srcImage,
                          float3* dstImage,
                          uint32_t width,
                          uint32_t height)
{
    int x, y, pixel;

    x = (blockIdx.x * blockDim.x) + threadIdx.x;
    y = (blockIdx.y * blockDim.y) + threadIdx.y;

    pixel = y * width + x;

    if (x >= width)
        return;

    if (y >= height)
        return;

    // printf("cuda thread %i %i %i %i pixel %i \n", x, y, width, height, pixel);

    const float s = 1.0f;
    const uchar3 px = srcImage[pixel];

    // scale each 8-bit channel into float
    dstImage[pixel] = make_float3(px.x * s, px.y * s, px.z * s);

}



__global__ void RGBToRGBAf(uchar3* srcImage,
                           float4* dstImage,
                           uint32_t width, uint32_t height)
{
    int x, y, pixel;

    x = (blockIdx.x * blockDim.x) + threadIdx.x;
    y = (blockIdx.y * blockDim.y) + threadIdx.y;

    pixel = y * width + x;

    if (x >= width)
        return;

    if (y >= height)
        return;

    // printf("cuda thread %i %i %i %i pixel %i \n", x, y, width, height, pixel);

    const float s = 1.0f;
    const uchar3 px = srcImage[pixel];

    dstImage[pixel] = make_float4(px.x * s, px.y * s, px.z * s, 255.0f * s);
}

cudaError_t cudaRGBToRGBAf( uchar3* srcDev, float3* destDev, size_t width, size_t height )
{
    if( !srcDev || !destDev )
        return cudaErrorInvalidDevicePointer;

    const dim3 blockDim(8,8,1);
    const dim3 gridDim(iDivUp(width,blockDim.x), iDivUp(height,blockDim.y), 1);

    loadImage<<<gridDim, blockDim>>>( srcDev, destDev, width, height );

    return CUDA(cudaGetLastError());
}

--------------------------------------------------------------------------------
/util/cuda/cudaRGB.h:
--------------------------------------------------------------------------------
/*
 * http://github.com/dusty-nv/jetson-inference
 */

#ifndef __CUDA_RGB_CONVERT_H
#define __CUDA_RGB_CONVERT_H


#include "cudaUtility.h"
#include <stdint.h>


/**
 * Convert an 8-bit fixed-point RGB image to a 32-bit floating-point RGB image
 * (note: despite the name, this path writes packed float3 RGB, not RGBA)
 * @ingroup util
 */
cudaError_t cudaRGBToRGBAf( uchar3* input, float3* output, size_t width, size_t height );


#endif
--------------------------------------------------------------------------------
/util/cuda/cudaResize.cu:
--------------------------------------------------------------------------------
/*
 * inference-101
 */

#include "cudaResize.h"

// gpuResample
template<typename T>
__global__ void gpuResize( float2 scale, T* input, int iWidth, T* output, int oWidth, int oHeight )
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    if( x >= oWidth || y >= oHeight )
        return;

    const int dx = ((float)x * scale.x);
    const int dy = ((float)y * scale.y);

    const T px = input[ dy * iWidth + dx ];

    output[y*oWidth+x] = px;

}

// cudaResize
cudaError_t cudaResize( float* input, size_t inputWidth, size_t inputHeight,
                        float* output, size_t outputWidth, size_t outputHeight )
{
    if( !input || !output )
        return cudaErrorInvalidDevicePointer;

    if( inputWidth == 0 || outputWidth == 0 || inputHeight == 0 || outputHeight == 0 )
        return cudaErrorInvalidValue;

    const float2 scale = make_float2( float(inputWidth) / float(outputWidth),
                                      float(inputHeight) / float(outputHeight) );

    // launch kernel
    const dim3 blockDim(8, 8);
    const dim3 gridDim(iDivUp(outputWidth,blockDim.x), iDivUp(outputHeight,blockDim.y));

    gpuResize<float><<<gridDim, blockDim>>>(scale, input, inputWidth, output, outputWidth, outputHeight);

    return CUDA(cudaGetLastError());

}
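/*
 * Example (a sketch): nearest-neighbour downscale of a single-channel float
 * image to 304x304 -- the usual Pelee-SSD input resolution, which is an
 * assumption here rather than a value taken from this file.
 *
 *   CUDA(cudaResize(srcGPU, srcWidth, srcHeight, dstGPU, 304, 304));
 */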
// cudaResizeRGBA
cudaError_t cudaResizeRGBA( float4* input, size_t inputWidth, size_t inputHeight,
                            float4* output, size_t outputWidth, size_t outputHeight )
{
    if( !input || !output )
        return cudaErrorInvalidDevicePointer;

    if( inputWidth == 0 || outputWidth == 0 || inputHeight == 0 || outputHeight == 0 )
        return cudaErrorInvalidValue;

    const float2 scale = make_float2( float(inputWidth) / float(outputWidth),
                                      float(inputHeight) / float(outputHeight) );

    // launch kernel
    const dim3 blockDim(8, 8);
    const dim3 gridDim(iDivUp(outputWidth,blockDim.x), iDivUp(outputHeight,blockDim.y));

    gpuResize<float4><<<gridDim, blockDim>>>(scale, input, inputWidth, output, outputWidth, outputHeight);

    return CUDA(cudaGetLastError());
}

--------------------------------------------------------------------------------
/util/cuda/cudaResize.h:
--------------------------------------------------------------------------------
/*
 * inference-101
 */

#ifndef __CUDA_RESIZE_H__
#define __CUDA_RESIZE_H__


#include "cudaUtility.h"


/**
 * Function for increasing or decreasing the size of an image on the GPU.
 * @ingroup util
 */
cudaError_t cudaResize( float* input, size_t inputWidth, size_t inputHeight,
                        float* output, size_t outputWidth, size_t outputHeight );


/**
 * Function for increasing or decreasing the size of an image on the GPU.
 * @ingroup util
 */
cudaError_t cudaResizeRGBA( float4* input, size_t inputWidth, size_t inputHeight,
                            float4* output, size_t outputWidth, size_t outputHeight );


#endif

--------------------------------------------------------------------------------
/util/cuda/cudaUtility.h:
--------------------------------------------------------------------------------
/*
 * http://github.com/dusty-nv/jetson-inference
 */

#ifndef __CUDA_UTILITY_H_
#define __CUDA_UTILITY_H_


#include <cuda_runtime.h>
#include <cuda.h>
#include <stdint.h>
#include <stdio.h>


/**
 * Execute a CUDA call and print out any errors
 * @return the original cudaError_t result
 * @ingroup util
 */
#define CUDA(x)            cudaCheckError((x), #x, __FILE__, __LINE__)

/**
 * Evaluates to true on success
 * @ingroup util
 */
#define CUDA_SUCCESS(x)    (CUDA(x) == cudaSuccess)

/**
 * Evaluates to true on failure
 * @ingroup util
 */
#define CUDA_FAILED(x)     (CUDA(x) != cudaSuccess)

/**
 * Return from the boolean function if CUDA call fails
 * @ingroup util
 */
#define CUDA_VERIFY(x)     if(CUDA_FAILED(x)) return false;

/**
 * LOG_CUDA string.
 * @ingroup util
 */
#define LOG_CUDA "[cuda] "

/*
 * define this if you want all cuda calls to be printed
 */
//#define CUDA_TRACE



/**
 * cudaCheckError
 * @ingroup util
 */
inline cudaError_t cudaCheckError(cudaError_t retval, const char* txt, const char* file, int line )
{
#if !defined(CUDA_TRACE)
    if( retval == cudaSuccess)
        return cudaSuccess;
#endif

    //int activeDevice = -1;
    //cudaGetDevice(&activeDevice);

    //Log("[cuda] device %i - %s\n", activeDevice, txt);

    printf(LOG_CUDA "%s\n", txt);


    if( retval != cudaSuccess )
    {
        printf(LOG_CUDA "   %s (error %u) (hex 0x%02X)\n", cudaGetErrorString(retval), retval, retval);
        printf(LOG_CUDA "   %s:%i\n", file, line);
    }

    return retval;
}


/**
 * iDivUp
 * @ingroup util
 */
inline __device__ __host__ int iDivUp( int a, int b )  { return (a % b != 0) ?
(a / b + 1) : (a / b); } 87 | 88 | 89 | 90 | #endif 91 | -------------------------------------------------------------------------------- /util/cuda/cudaYUV-NV12.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * inference-101 3 | */ 4 | 5 | #include "cudaYUV.h" 6 | 7 | 8 | #define COLOR_COMPONENT_MASK 0x3FF 9 | #define COLOR_COMPONENT_BIT_SIZE 10 10 | 11 | #define FIXED_DECIMAL_POINT 24 12 | #define FIXED_POINT_MULTIPLIER 1.0f 13 | #define FIXED_COLOR_COMPONENT_MASK 0xffffffff 14 | 15 | #define MUL(x,y) (x*y) 16 | 17 | 18 | 19 | __constant__ uint32_t constAlpha; 20 | __constant__ float constHueColorSpaceMat[9]; 21 | 22 | 23 | 24 | __device__ void YUV2RGB(uint32_t *yuvi, float *red, float *green, float *blue) 25 | { 26 | 27 | 28 | // Prepare for hue adjustment 29 | /* 30 | float luma, chromaCb, chromaCr; 31 | 32 | luma = (float)yuvi[0]; 33 | chromaCb = (float)((int)yuvi[1] - 512.0f); 34 | chromaCr = (float)((int)yuvi[2] - 512.0f); 35 | 36 | // Convert YUV To RGB with hue adjustment 37 | *red = MUL(luma, constHueColorSpaceMat[0]) + 38 | MUL(chromaCb, constHueColorSpaceMat[1]) + 39 | MUL(chromaCr, constHueColorSpaceMat[2]); 40 | *green= MUL(luma, constHueColorSpaceMat[3]) + 41 | MUL(chromaCb, constHueColorSpaceMat[4]) + 42 | MUL(chromaCr, constHueColorSpaceMat[5]); 43 | *blue = MUL(luma, constHueColorSpaceMat[6]) + 44 | MUL(chromaCb, constHueColorSpaceMat[7]) + 45 | MUL(chromaCr, constHueColorSpaceMat[8]);*/ 46 | 47 | const float luma = float(yuvi[0]); 48 | const float u = float(yuvi[1]) - 512.0f; 49 | const float v = float(yuvi[2]) - 512.0f; 50 | 51 | /*R = Y + 1.140V 52 | G = Y - 0.395U - 0.581V 53 | B = Y + 2.032U*/ 54 | 55 | /**green = luma + 1.140f * v; 56 | *blue = luma - 0.395f * u - 0.581f * v; 57 | *red = luma + 2.032f * u;*/ 58 | 59 | *red = luma + 1.140f * v; 60 | *green = luma - 0.395f * u - 0.581f * v; 61 | *blue = luma + 2.032f * u; 62 | } 63 | 64 | 65 | __device__ uint32_t RGBAPACK_8bit(float red, float green, float blue, uint32_t alpha) 66 | { 67 | uint32_t ARGBpixel = 0; 68 | 69 | // Clamp final 10 bit results 70 | red = min(max(red, 0.0f), 255.0f); 71 | green = min(max(green, 0.0f), 255.0f); 72 | blue = min(max(blue, 0.0f), 255.0f); 73 | 74 | // Convert to 8 bit unsigned integers per color component 75 | ARGBpixel = ((((uint32_t)red) << 24) | 76 | (((uint32_t)green) << 16) | 77 | (((uint32_t)blue) << 8) | (uint32_t)alpha); 78 | 79 | return ARGBpixel; 80 | } 81 | 82 | 83 | __device__ uint32_t RGBAPACK_10bit(float red, float green, float blue, uint32_t alpha) 84 | { 85 | uint32_t ARGBpixel = 0; 86 | 87 | // Clamp final 10 bit results 88 | red = min(max(red, 0.0f), 1023.f); 89 | green = min(max(green, 0.0f), 1023.f); 90 | blue = min(max(blue, 0.0f), 1023.f); 91 | 92 | // Convert to 8 bit unsigned integers per color component 93 | ARGBpixel = ((((uint32_t)red >> 2) << 24) | 94 | (((uint32_t)green >> 2) << 16) | 95 | (((uint32_t)blue >> 2) << 8) | (uint32_t)alpha); 96 | 97 | return ARGBpixel; 98 | } 99 | 100 | 101 | // CUDA kernel for outputing the final ARGB output from NV12; 102 | /*extern "C"*/ 103 | __global__ void Passthru(uint32_t *srcImage, size_t nSourcePitch, 104 | uint32_t *dstImage, size_t nDestPitch, 105 | uint32_t width, uint32_t height) 106 | { 107 | int x, y; 108 | uint32_t yuv101010Pel[2]; 109 | uint32_t processingPitch = ((width) + 63) & ~63; 110 | uint32_t dstImagePitch = nDestPitch >> 2; 111 | uint8_t *srcImageU8 = (uint8_t *)srcImage; 112 | 113 | processingPitch = nSourcePitch; 114 | 115 | // Pad borders with 
duplicate pixels, and we multiply by 2 because we process 2 pixels per thread 116 | x = blockIdx.x * (blockDim.x << 1) + (threadIdx.x << 1); 117 | y = blockIdx.y * blockDim.y + threadIdx.y; 118 | 119 | if (x >= width) 120 | return; //x = width - 1; 121 | 122 | if (y >= height) 123 | return; // y = height - 1; 124 | 125 | // Read 2 Luma components at a time, so we don't waste processing since CbCr are decimated this way. 126 | // if we move to texture we could read 4 luminance values 127 | yuv101010Pel[0] = (srcImageU8[y * processingPitch + x ]); 128 | yuv101010Pel[1] = (srcImageU8[y * processingPitch + x + 1]); 129 | 130 | // this steps performs the color conversion 131 | float luma[2]; 132 | 133 | luma[0] = (yuv101010Pel[0] & 0x00FF); 134 | luma[1] = (yuv101010Pel[1] & 0x00FF); 135 | 136 | // Clamp the results to RGBA 137 | dstImage[y * dstImagePitch + x ] = RGBAPACK_8bit(luma[0], luma[0], luma[0], constAlpha); 138 | dstImage[y * dstImagePitch + x + 1 ] = RGBAPACK_8bit(luma[1], luma[1], luma[1], constAlpha); 139 | } 140 | 141 | 142 | // CUDA kernel for outputing the final ARGB output from NV12; 143 | /*extern "C"*/ 144 | __global__ void NV12ToARGB(uint32_t *srcImage, size_t nSourcePitch, 145 | uint32_t *dstImage, size_t nDestPitch, 146 | uint32_t width, uint32_t height) 147 | { 148 | int x, y; 149 | uint32_t yuv101010Pel[2]; 150 | uint32_t processingPitch = ((width) + 63) & ~63; 151 | uint32_t dstImagePitch = nDestPitch >> 2; 152 | uint8_t *srcImageU8 = (uint8_t *)srcImage; 153 | 154 | processingPitch = nSourcePitch; 155 | 156 | // Pad borders with duplicate pixels, and we multiply by 2 because we process 2 pixels per thread 157 | x = blockIdx.x * (blockDim.x << 1) + (threadIdx.x << 1); 158 | y = blockIdx.y * blockDim.y + threadIdx.y; 159 | 160 | if (x >= width) 161 | return; //x = width - 1; 162 | 163 | if (y >= height) 164 | return; // y = height - 1; 165 | 166 | // Read 2 Luma components at a time, so we don't waste processing since CbCr are decimated this way. 167 | // if we move to texture we could read 4 luminance values 168 | yuv101010Pel[0] = (srcImageU8[y * processingPitch + x ]) << 2; 169 | yuv101010Pel[1] = (srcImageU8[y * processingPitch + x + 1]) << 2; 170 | 171 | uint32_t chromaOffset = processingPitch * height; 172 | int y_chroma = y >> 1; 173 | 174 | if (y & 1) // odd scanline ? 
175 | { 176 | uint32_t chromaCb; 177 | uint32_t chromaCr; 178 | 179 | chromaCb = srcImageU8[chromaOffset + y_chroma * processingPitch + x ]; 180 | chromaCr = srcImageU8[chromaOffset + y_chroma * processingPitch + x + 1]; 181 | 182 | if (y_chroma < ((height >> 1) - 1)) // interpolate chroma vertically 183 | { 184 | chromaCb = (chromaCb + srcImageU8[chromaOffset + (y_chroma + 1) * processingPitch + x ] + 1) >> 1; 185 | chromaCr = (chromaCr + srcImageU8[chromaOffset + (y_chroma + 1) * processingPitch + x + 1] + 1) >> 1; 186 | } 187 | 188 | yuv101010Pel[0] |= (chromaCb << (COLOR_COMPONENT_BIT_SIZE + 2)); 189 | yuv101010Pel[0] |= (chromaCr << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2)); 190 | 191 | yuv101010Pel[1] |= (chromaCb << (COLOR_COMPONENT_BIT_SIZE + 2)); 192 | yuv101010Pel[1] |= (chromaCr << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2)); 193 | } 194 | else 195 | { 196 | yuv101010Pel[0] |= ((uint32_t)srcImageU8[chromaOffset + y_chroma * processingPitch + x ] << (COLOR_COMPONENT_BIT_SIZE + 2)); 197 | yuv101010Pel[0] |= ((uint32_t)srcImageU8[chromaOffset + y_chroma * processingPitch + x + 1] << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2)); 198 | 199 | yuv101010Pel[1] |= ((uint32_t)srcImageU8[chromaOffset + y_chroma * processingPitch + x ] << (COLOR_COMPONENT_BIT_SIZE + 2)); 200 | yuv101010Pel[1] |= ((uint32_t)srcImageU8[chromaOffset + y_chroma * processingPitch + x + 1] << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2)); 201 | } 202 | 203 | // this steps performs the color conversion 204 | uint32_t yuvi[6]; 205 | float red[2], green[2], blue[2]; 206 | 207 | yuvi[0] = (yuv101010Pel[0] & COLOR_COMPONENT_MASK); 208 | yuvi[1] = ((yuv101010Pel[0] >> COLOR_COMPONENT_BIT_SIZE) & COLOR_COMPONENT_MASK); 209 | yuvi[2] = ((yuv101010Pel[0] >> (COLOR_COMPONENT_BIT_SIZE << 1)) & COLOR_COMPONENT_MASK); 210 | 211 | yuvi[3] = (yuv101010Pel[1] & COLOR_COMPONENT_MASK); 212 | yuvi[4] = ((yuv101010Pel[1] >> COLOR_COMPONENT_BIT_SIZE) & COLOR_COMPONENT_MASK); 213 | yuvi[5] = ((yuv101010Pel[1] >> (COLOR_COMPONENT_BIT_SIZE << 1)) & COLOR_COMPONENT_MASK); 214 | 215 | // YUV to RGB Transformation conversion 216 | YUV2RGB(&yuvi[0], &red[0], &green[0], &blue[0]); 217 | YUV2RGB(&yuvi[3], &red[1], &green[1], &blue[1]); 218 | 219 | // Clamp the results to RGBA 220 | dstImage[y * dstImagePitch + x ] = RGBAPACK_10bit(red[0], green[0], blue[0], constAlpha); 221 | dstImage[y * dstImagePitch + x + 1 ] = RGBAPACK_10bit(red[1], green[1], blue[1], constAlpha); 222 | } 223 | 224 | 225 | bool nv12ColorspaceSetup = false; 226 | 227 | 228 | // cudaNV12ToARGB32 229 | cudaError_t cudaNV12ToRGBA( uint8_t* srcDev, size_t srcPitch, uchar4* destDev, size_t destPitch, size_t width, size_t height ) 230 | { 231 | if( !srcDev || !destDev ) 232 | return cudaErrorInvalidDevicePointer; 233 | 234 | if( srcPitch == 0 || destPitch == 0 || width == 0 || height == 0 ) 235 | return cudaErrorInvalidValue; 236 | 237 | if( !nv12ColorspaceSetup ) 238 | cudaNV12SetupColorspace(); 239 | 240 | const dim3 blockDim(32,16,1); 241 | const dim3 gridDim((width+(2*blockDim.x-1))/(2*blockDim.x), (height+(blockDim.y-1))/blockDim.y, 1); 242 | 243 | NV12ToARGB<<>>( (uint32_t*)srcDev, srcPitch, (uint32_t*)destDev, destPitch, width, height ); 244 | 245 | return CUDA(cudaGetLastError()); 246 | } 247 | 248 | cudaError_t cudaNV12ToRGBA( uint8_t* srcDev, uchar4* destDev, size_t width, size_t height ) 249 | { 250 | return cudaNV12ToRGBA(srcDev, width * sizeof(uint8_t), destDev, width * sizeof(uchar4), width, height); 251 | } 252 | 253 | 254 | 
//------------------------------------------------------------------------------------------------------------------------- 255 | 256 | __global__ void NV12ToRGBAf(uint32_t* srcImage, size_t nSourcePitch, 257 | float4* dstImage, size_t nDestPitch, 258 | uint32_t width, uint32_t height) 259 | { 260 | int x, y; 261 | uint32_t yuv101010Pel[2]; 262 | uint32_t processingPitch = ((width) + 63) & ~63; 263 | uint8_t *srcImageU8 = (uint8_t *)srcImage; 264 | 265 | processingPitch = nSourcePitch; 266 | 267 | // Pad borders with duplicate pixels, and we multiply by 2 because we process 2 pixels per thread 268 | x = blockIdx.x * (blockDim.x << 1) + (threadIdx.x << 1); 269 | y = blockIdx.y * blockDim.y + threadIdx.y; 270 | 271 | if (x >= width) 272 | return; //x = width - 1; 273 | 274 | if (y >= height) 275 | return; // y = height - 1; 276 | 277 | #if 1 278 | // Read 2 Luma components at a time, so we don't waste processing since CbCr are decimated this way. 279 | // if we move to texture we could read 4 luminance values 280 | yuv101010Pel[0] = (srcImageU8[y * processingPitch + x ]) << 2; 281 | yuv101010Pel[1] = (srcImageU8[y * processingPitch + x + 1]) << 2; 282 | 283 | uint32_t chromaOffset = processingPitch * height; 284 | int y_chroma = y >> 1; 285 | 286 | if (y & 1) // odd scanline ? 287 | { 288 | uint32_t chromaCb; 289 | uint32_t chromaCr; 290 | 291 | chromaCb = srcImageU8[chromaOffset + y_chroma * processingPitch + x ]; 292 | chromaCr = srcImageU8[chromaOffset + y_chroma * processingPitch + x + 1]; 293 | 294 | if (y_chroma < ((height >> 1) - 1)) // interpolate chroma vertically 295 | { 296 | chromaCb = (chromaCb + srcImageU8[chromaOffset + (y_chroma + 1) * processingPitch + x ] + 1) >> 1; 297 | chromaCr = (chromaCr + srcImageU8[chromaOffset + (y_chroma + 1) * processingPitch + x + 1] + 1) >> 1; 298 | } 299 | 300 | yuv101010Pel[0] |= (chromaCb << (COLOR_COMPONENT_BIT_SIZE + 2)); 301 | yuv101010Pel[0] |= (chromaCr << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2)); 302 | 303 | yuv101010Pel[1] |= (chromaCb << (COLOR_COMPONENT_BIT_SIZE + 2)); 304 | yuv101010Pel[1] |= (chromaCr << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2)); 305 | } 306 | else 307 | { 308 | yuv101010Pel[0] |= ((uint32_t)srcImageU8[chromaOffset + y_chroma * processingPitch + x ] << (COLOR_COMPONENT_BIT_SIZE + 2)); 309 | yuv101010Pel[0] |= ((uint32_t)srcImageU8[chromaOffset + y_chroma * processingPitch + x + 1] << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2)); 310 | 311 | yuv101010Pel[1] |= ((uint32_t)srcImageU8[chromaOffset + y_chroma * processingPitch + x ] << (COLOR_COMPONENT_BIT_SIZE + 2)); 312 | yuv101010Pel[1] |= ((uint32_t)srcImageU8[chromaOffset + y_chroma * processingPitch + x + 1] << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2)); 313 | } 314 | 315 | // this steps performs the color conversion 316 | uint32_t yuvi[6]; 317 | float red[2], green[2], blue[2]; 318 | 319 | yuvi[0] = (yuv101010Pel[0] & COLOR_COMPONENT_MASK); 320 | yuvi[1] = ((yuv101010Pel[0] >> COLOR_COMPONENT_BIT_SIZE) & COLOR_COMPONENT_MASK); 321 | yuvi[2] = ((yuv101010Pel[0] >> (COLOR_COMPONENT_BIT_SIZE << 1)) & COLOR_COMPONENT_MASK); 322 | 323 | yuvi[3] = (yuv101010Pel[1] & COLOR_COMPONENT_MASK); 324 | yuvi[4] = ((yuv101010Pel[1] >> COLOR_COMPONENT_BIT_SIZE) & COLOR_COMPONENT_MASK); 325 | yuvi[5] = ((yuv101010Pel[1] >> (COLOR_COMPONENT_BIT_SIZE << 1)) & COLOR_COMPONENT_MASK); 326 | 327 | // YUV to RGB Transformation conversion 328 | YUV2RGB(&yuvi[0], &red[0], &green[0], &blue[0]); 329 | YUV2RGB(&yuvi[3], &red[1], &green[1], &blue[1]); 330 | 331 | // Clamp the results to RGBA 332 | 
//printf("cuda thread %i %i %f %f %f\n", x, y, red[0], green[0], blue[0]); 333 | 334 | const float s = 1.0f / 1024.0f * 255.0f; 335 | 336 | dstImage[y * width + x] = make_float4(red[0] * s, green[0] * s, blue[0] * s, 1.0f); 337 | dstImage[y * width + x + 1] = make_float4(red[1] * s, green[1] * s, blue[1] * s, 1.0f); 338 | #else 339 | //printf("cuda thread %i %i %i %i \n", x, y, width, height); 340 | 341 | dstImage[y * width + x] = make_float4(1.0f, 0.0f, 0.0f, 1.0f); 342 | dstImage[y * width + x + 1] = make_float4(1.0f, 0.0f, 0.0f, 1.0f); 343 | #endif 344 | } 345 | 346 | 347 | 348 | // cudaNV12ToRGBA 349 | cudaError_t cudaNV12ToRGBAf( uint8_t* srcDev, size_t srcPitch, float4* destDev, size_t destPitch, size_t width, size_t height ) 350 | { 351 | if( !srcDev || !destDev ) 352 | return cudaErrorInvalidDevicePointer; 353 | 354 | if( srcPitch == 0 || destPitch == 0 || width == 0 || height == 0 ) 355 | return cudaErrorInvalidValue; 356 | 357 | if( !nv12ColorspaceSetup ) 358 | cudaNV12SetupColorspace(); 359 | 360 | const dim3 blockDim(8,8,1); 361 | //const dim3 gridDim((width+(2*blockDim.x-1))/(2*blockDim.x), (height+(blockDim.y-1))/blockDim.y, 1); 362 | const dim3 gridDim(iDivUp(width,blockDim.x), iDivUp(height, blockDim.y), 1); 363 | 364 | NV12ToRGBAf<<>>( (uint32_t*)srcDev, srcPitch, destDev, destPitch, width, height ); 365 | 366 | return CUDA(cudaGetLastError()); 367 | } 368 | 369 | cudaError_t cudaNV12ToRGBAf( uint8_t* srcDev, float4* destDev, size_t width, size_t height ) 370 | { 371 | return cudaNV12ToRGBAf(srcDev, width * sizeof(uint8_t), destDev, width * sizeof(float4), width, height); 372 | } 373 | 374 | 375 | // cudaNV12SetupColorspace 376 | cudaError_t cudaNV12SetupColorspace( float hue ) 377 | { 378 | const float hueSin = sin(hue); 379 | const float hueCos = cos(hue); 380 | 381 | float hueCSC[9]; 382 | 383 | const bool itu601 = false; 384 | 385 | if( itu601 /*CSC == ITU601*/) 386 | { 387 | //CCIR 601 388 | hueCSC[0] = 1.1644f; 389 | hueCSC[1] = hueSin * 1.5960f; 390 | hueCSC[2] = hueCos * 1.5960f; 391 | hueCSC[3] = 1.1644f; 392 | hueCSC[4] = (hueCos * -0.3918f) - (hueSin * 0.8130f); 393 | hueCSC[5] = (hueSin * 0.3918f) - (hueCos * 0.8130f); 394 | hueCSC[6] = 1.1644f; 395 | hueCSC[7] = hueCos * 2.0172f; 396 | hueCSC[8] = hueSin * -2.0172f; 397 | } 398 | else /*if(CSC == ITU709)*/ 399 | { 400 | //CCIR 709 401 | hueCSC[0] = 1.0f; 402 | hueCSC[1] = hueSin * 1.57480f; 403 | hueCSC[2] = hueCos * 1.57480f; 404 | hueCSC[3] = 1.0; 405 | hueCSC[4] = (hueCos * -0.18732f) - (hueSin * 0.46812f); 406 | hueCSC[5] = (hueSin * 0.18732f) - (hueCos * 0.46812f); 407 | hueCSC[6] = 1.0f; 408 | hueCSC[7] = hueCos * 1.85560f; 409 | hueCSC[8] = hueSin * -1.85560f; 410 | } 411 | 412 | 413 | if( CUDA_FAILED(cudaMemcpyToSymbol(constHueColorSpaceMat, hueCSC, sizeof(float) * 9)) ) 414 | return cudaErrorInvalidSymbol; 415 | 416 | uint32_t cudaAlpha = ((uint32_t)0xff<< 24); 417 | 418 | if( CUDA_FAILED(cudaMemcpyToSymbol(constAlpha, &cudaAlpha, sizeof(uint32_t))) ) 419 | return cudaErrorInvalidSymbol; 420 | 421 | nv12ColorspaceSetup = true; 422 | return cudaSuccess; 423 | } 424 | 425 | -------------------------------------------------------------------------------- /util/cuda/cudaYUV-YUYV.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * http://github.com/dusty-nv/jetson-inference 3 | */ 4 | 5 | #include "cudaYUV.h" 6 | 7 | 8 | inline __device__ __host__ float clamp(float f, float a, float b) 9 | { 10 | return fmaxf(a, fminf(f, b)); 11 | } 12 | 13 | 14 | /* From RGB 
to YUV 15 | 16 | Y = 0.299R + 0.587G + 0.114B 17 | U = 0.492 (B-Y) 18 | V = 0.877 (R-Y) 19 | 20 | It can also be represented as: 21 | 22 | Y = 0.299R + 0.587G + 0.114B 23 | U = -0.147R - 0.289G + 0.436B 24 | V = 0.615R - 0.515G - 0.100B 25 | 26 | From YUV to RGB 27 | 28 | R = Y + 1.140V 29 | G = Y - 0.395U - 0.581V 30 | B = Y + 2.032U 31 | */ 32 | 33 | struct __align__(8) uchar8 34 | { 35 | uint8_t a0, a1, a2, a3, a4, a5, a6, a7; 36 | }; 37 | static __host__ __device__ __forceinline__ uchar8 make_uchar8(uint8_t a0, uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7) 38 | { 39 | uchar8 val = {a0, a1, a2, a3, a4, a5, a6, a7}; 40 | return val; 41 | } 42 | 43 | 44 | //----------------------------------------------------------------------------------- 45 | // YUYV/UYVY to RGBA 46 | //----------------------------------------------------------------------------------- 47 | template 48 | __global__ void yuyvToRgba( uchar4* src, int srcAlignedWidth, uchar8* dst, int dstAlignedWidth, int width, int height ) 49 | { 50 | const int x = blockIdx.x * blockDim.x + threadIdx.x; 51 | const int y = blockIdx.y * blockDim.y + threadIdx.y; 52 | 53 | if( x >= srcAlignedWidth || y >= height ) 54 | return; 55 | 56 | const uchar4 macroPx = src[y * srcAlignedWidth + x]; 57 | 58 | // Y0 is the brightness of pixel 0, Y1 the brightness of pixel 1. 59 | // U0 and V0 is the color of both pixels. 60 | // UYVY [ U0 | Y0 | V0 | Y1 ] 61 | // YUYV [ Y0 | U0 | Y1 | V0 ] 62 | const float y0 = formatUYVY ? macroPx.y : macroPx.x; 63 | const float y1 = formatUYVY ? macroPx.w : macroPx.z; 64 | const float u = (formatUYVY ? macroPx.x : macroPx.y) - 128.0f; 65 | const float v = (formatUYVY ? macroPx.z : macroPx.w) - 128.0f; 66 | 67 | const float4 px0 = make_float4( y0 + 1.4065f * v, 68 | y0 - 0.3455f * u - 0.7169f * v, 69 | y0 + 1.7790f * u, 255.0f ); 70 | 71 | const float4 px1 = make_float4( y1 + 1.4065f * v, 72 | y1 - 0.3455f * u - 0.7169f * v, 73 | y1 + 1.7790f * u, 255.0f ); 74 | 75 | dst[y * dstAlignedWidth + x] = make_uchar8( clamp(px0.x, 0.0f, 255.0f), 76 | clamp(px0.y, 0.0f, 255.0f), 77 | clamp(px0.z, 0.0f, 255.0f), 78 | clamp(px0.w, 0.0f, 255.0f), 79 | clamp(px1.x, 0.0f, 255.0f), 80 | clamp(px1.y, 0.0f, 255.0f), 81 | clamp(px1.z, 0.0f, 255.0f), 82 | clamp(px1.w, 0.0f, 255.0f) ); 83 | } 84 | 85 | template 86 | cudaError_t launchYUYV( uchar2* input, size_t inputPitch, uchar4* output, size_t outputPitch, size_t width, size_t height) 87 | { 88 | if( !input || !inputPitch || !output || !outputPitch || !width || !height ) 89 | return cudaErrorInvalidValue; 90 | 91 | const dim3 block(8,8); 92 | const dim3 grid(iDivUp(width/2, block.x), iDivUp(height, block.y)); 93 | 94 | const int srcAlignedWidth = inputPitch / sizeof(uchar4); // normally would be uchar2, but we're doubling up pixels 95 | const int dstAlignedWidth = outputPitch / sizeof(uchar8); // normally would be uchar4 ^^^ 96 | 97 | //printf("yuyvToRgba %zu %zu %i %i %i %i %i\n", width, height, (int)formatUYVY, srcAlignedWidth, dstAlignedWidth, grid.x, grid.y); 98 | 99 | yuyvToRgba<<>>((uchar4*)input, srcAlignedWidth, (uchar8*)output, dstAlignedWidth, width, height); 100 | 101 | return CUDA(cudaGetLastError()); 102 | } 103 | 104 | 105 | cudaError_t cudaUYVYToRGBA( uchar2* input, uchar4* output, size_t width, size_t height ) 106 | { 107 | return cudaUYVYToRGBA(input, width * sizeof(uchar2), output, width * sizeof(uchar4), width, height); 108 | } 109 | 110 | cudaError_t cudaUYVYToRGBA( uchar2* input, size_t inputPitch, uchar4* output, size_t 
outputPitch, size_t width, size_t height ) 111 | { 112 | return launchYUYV(input, inputPitch, output, outputPitch, width, height); 113 | } 114 | 115 | cudaError_t cudaYUYVToRGBA( uchar2* input, uchar4* output, size_t width, size_t height ) 116 | { 117 | return cudaYUYVToRGBA(input, width * sizeof(uchar2), output, width * sizeof(uchar4), width, height); 118 | } 119 | 120 | cudaError_t cudaYUYVToRGBA( uchar2* input, size_t inputPitch, uchar4* output, size_t outputPitch, size_t width, size_t height ) 121 | { 122 | return launchYUYV(input, inputPitch, output, outputPitch, width, height); 123 | } 124 | 125 | 126 | //----------------------------------------------------------------------------------- 127 | // YUYV/UYVY to grayscale 128 | //----------------------------------------------------------------------------------- 129 | 130 | template 131 | __global__ void yuyvToGray( uchar4* src, int srcAlignedWidth, float2* dst, int dstAlignedWidth, int width, int height ) 132 | { 133 | const int x = blockIdx.x * blockDim.x + threadIdx.x; 134 | const int y = blockIdx.y * blockDim.y + threadIdx.y; 135 | 136 | if( x >= srcAlignedWidth || y >= height ) 137 | return; 138 | 139 | const uchar4 macroPx = src[y * srcAlignedWidth + x]; 140 | 141 | const float y0 = formatUYVY ? macroPx.y : macroPx.x; 142 | const float y1 = formatUYVY ? macroPx.w : macroPx.z; 143 | 144 | dst[y * dstAlignedWidth + x] = make_float2(y0/255.0f, y1/255.0f); 145 | } 146 | 147 | template 148 | cudaError_t launchGrayYUYV( uchar2* input, size_t inputPitch, float* output, size_t outputPitch, size_t width, size_t height) 149 | { 150 | if( !input || !inputPitch || !output || !outputPitch || !width || !height ) 151 | return cudaErrorInvalidValue; 152 | 153 | const dim3 block(8,8); 154 | const dim3 grid(iDivUp(width/2, block.x), iDivUp(height, block.y)); 155 | 156 | const int srcAlignedWidth = inputPitch / sizeof(uchar4); // normally would be uchar2, but we're doubling up pixels 157 | const int dstAlignedWidth = outputPitch / sizeof(float2); // normally would be float ^^^ 158 | 159 | yuyvToGray<<>>((uchar4*)input, srcAlignedWidth, (float2*)output, dstAlignedWidth, width, height); 160 | 161 | return CUDA(cudaGetLastError()); 162 | } 163 | 164 | cudaError_t cudaUYVYToGray( uchar2* input, float* output, size_t width, size_t height ) 165 | { 166 | return cudaUYVYToGray(input, width * sizeof(uchar2), output, width * sizeof(uint8_t), width, height); 167 | } 168 | 169 | cudaError_t cudaUYVYToGray( uchar2* input, size_t inputPitch, float* output, size_t outputPitch, size_t width, size_t height ) 170 | { 171 | return launchGrayYUYV(input, inputPitch, output, outputPitch, width, height); 172 | } 173 | 174 | cudaError_t cudaYUYVToGray( uchar2* input, float* output, size_t width, size_t height ) 175 | { 176 | return cudaYUYVToGray(input, width * sizeof(uchar2), output, width * sizeof(float), width, height); 177 | } 178 | 179 | cudaError_t cudaYUYVToGray( uchar2* input, size_t inputPitch, float* output, size_t outputPitch, size_t width, size_t height ) 180 | { 181 | return launchGrayYUYV(input, inputPitch, output, outputPitch, width, height); 182 | } 183 | 184 | -------------------------------------------------------------------------------- /util/cuda/cudaYUV-YV12.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * inference-101 3 | */ 4 | 5 | #include "cudaYUV.h" 6 | 7 | 8 | 9 | 10 | 11 | inline __device__ void rgb_to_y(const uint8_t r, const uint8_t g, const uint8_t b, uint8_t& y) 12 | { 13 | y = 
static_cast(((int)(30 * r) + (int)(59 * g) + (int)(11 * b)) / 100); 14 | } 15 | 16 | inline __device__ void rgb_to_yuv(const uint8_t r, const uint8_t g, const uint8_t b, uint8_t& y, uint8_t& u, uint8_t& v) 17 | { 18 | rgb_to_y(r, g, b, y); 19 | u = static_cast(((int)(-17 * r) - (int)(33 * g) + (int)(50 * b) + 12800) / 100); 20 | v = static_cast(((int)(50 * r) - (int)(42 * g) - (int)(8 * b) + 12800) / 100); 21 | } 22 | 23 | template 24 | __global__ void RGB_to_YV12( T* src, int srcAlignedWidth, uint8_t* dst, int dstPitch, int width, int height ) 25 | { 26 | const int x = (blockIdx.x * blockDim.x + threadIdx.x) * 2; 27 | const int y = (blockIdx.y * blockDim.y + threadIdx.y) * 2; 28 | 29 | const int x1 = x + 1; 30 | const int y1 = y + 1; 31 | 32 | if( x1 >= width || y1 >= height ) 33 | return; 34 | 35 | const int planeSize = height * dstPitch; 36 | 37 | uint8_t* y_plane = dst; 38 | uint8_t* u_plane; 39 | uint8_t* v_plane; 40 | 41 | if( formatYV12 ) 42 | { 43 | u_plane = y_plane + planeSize; 44 | v_plane = u_plane + (planeSize / 4); // size of U & V planes is 25% of Y plane 45 | } 46 | else 47 | { 48 | v_plane = y_plane + planeSize; // in I420, order of U & V planes is reversed 49 | u_plane = v_plane + (planeSize / 4); 50 | } 51 | 52 | T px; 53 | uint8_t y_val, u_val, v_val; 54 | 55 | px = src[y * srcAlignedWidth + x]; 56 | rgb_to_y(px.x, px.y, px.z, y_val); 57 | y_plane[y * dstPitch + x] = y_val; 58 | 59 | px = src[y * srcAlignedWidth + x1]; 60 | rgb_to_y(px.x, px.y, px.z, y_val); 61 | y_plane[y * dstPitch + x1] = y_val; 62 | 63 | px = src[y1 * srcAlignedWidth + x]; 64 | rgb_to_y(px.x, px.y, px.z, y_val); 65 | y_plane[y1 * dstPitch + x] = y_val; 66 | 67 | px = src[y1 * srcAlignedWidth + x1]; 68 | rgb_to_yuv(px.x, px.y, px.z, y_val, u_val, v_val); 69 | y_plane[y1 * dstPitch + x1] = y_val; 70 | 71 | const int uvPitch = dstPitch / 2; 72 | const int uvIndex = (y / 2) * uvPitch + (x / 2); 73 | 74 | u_plane[uvIndex] = u_val; 75 | v_plane[uvIndex] = v_val; 76 | } 77 | 78 | template 79 | cudaError_t launch420( T* input, size_t inputPitch, uint8_t* output, size_t outputPitch, size_t width, size_t height) 80 | { 81 | if( !input || !inputPitch || !output || !outputPitch || !width || !height ) 82 | return cudaErrorInvalidValue; 83 | 84 | const dim3 block(32, 8); 85 | const dim3 grid(iDivUp(width, block.x * 2), iDivUp(height, block.y * 2)); 86 | 87 | const int inputAlignedWidth = inputPitch / sizeof(T); 88 | 89 | RGB_to_YV12<<>>(input, inputAlignedWidth, output, outputPitch, width, height); 90 | 91 | return CUDA(cudaGetLastError()); 92 | } 93 | 94 | 95 | 96 | // cudaRGBAToYV12 97 | cudaError_t cudaRGBAToYV12( uchar4* input, size_t inputPitch, uint8_t* output, size_t outputPitch, size_t width, size_t height ) 98 | { 99 | return launch420( input, inputPitch, output, outputPitch, width, height ); 100 | } 101 | 102 | // cudaRGBAToYV12 103 | cudaError_t cudaRGBAToYV12( uchar4* input, uint8_t* output, size_t width, size_t height ) 104 | { 105 | return cudaRGBAToYV12( input, width * sizeof(uchar4), output, width * sizeof(uint8_t), width, height ); 106 | } 107 | 108 | // cudaRGBAToI420 109 | cudaError_t cudaRGBAToI420( uchar4* input, size_t inputPitch, uint8_t* output, size_t outputPitch, size_t width, size_t height ) 110 | { 111 | return launch420( input, inputPitch, output, outputPitch, width, height ); 112 | } 113 | 114 | // cudaRGBAToI420 115 | cudaError_t cudaRGBAToI420( uchar4* input, uint8_t* output, size_t width, size_t height ) 116 | { 117 | return cudaRGBAToI420( input, width * sizeof(uchar4), output, 
119 | 
120 | 
121 | 
122 | #if 0
123 | __global__ void Gray_to_YV12(const GlobPtrSz<uint8_t> src, GlobPtr<uint8_t> dst)
124 | {
125 | 	const int x = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
126 | 	const int y = (blockIdx.y * blockDim.y + threadIdx.y) * 2;
127 | 
128 | 	if (x + 1 >= src.cols || y + 1 >= src.rows)
129 | 		return;
130 | 
131 | 	// get pointers to the data
132 | 	const size_t planeSize = src.rows * dst.step;
133 | 	GlobPtr<uint8_t> y_plane = globPtr(dst.data, dst.step);
134 | 	GlobPtr<uint8_t> u_plane = globPtr(y_plane.data + planeSize, dst.step / 2);
135 | 	GlobPtr<uint8_t> v_plane = globPtr(u_plane.data + (planeSize / 4), dst.step / 2);
136 | 
137 | 	uint8_t pix;
138 | 	uint8_t y_val, u_val, v_val;
139 | 
140 | 	pix = src(y, x);
141 | 	rgb_to_y(pix, pix, pix, y_val);
142 | 	y_plane(y, x) = y_val;
143 | 
144 | 	pix = src(y, x + 1);
145 | 	rgb_to_y(pix, pix, pix, y_val);
146 | 	y_plane(y, x + 1) = y_val;
147 | 
148 | 	pix = src(y + 1, x);
149 | 	rgb_to_y(pix, pix, pix, y_val);
150 | 	y_plane(y + 1, x) = y_val;
151 | 
152 | 	pix = src(y + 1, x + 1);
153 | 	rgb_to_yuv(pix, pix, pix, y_val, u_val, v_val);
154 | 	y_plane(y + 1, x + 1) = y_val;
155 | 	u_plane(y / 2, x / 2) = u_val;
156 | 	v_plane(y / 2, x / 2) = v_val;
157 | }
158 | #endif
159 | 
160 | 
--------------------------------------------------------------------------------
/util/cuda/cudaYUV.h:
--------------------------------------------------------------------------------
1 | /*
2 |  * http://github.com/dusty-nv/jetson-inference
3 |  */
4 | 
5 | #ifndef __CUDA_YUV_CONVERT_H
6 | #define __CUDA_YUV_CONVERT_H
7 | 
8 | 
9 | #include "cudaUtility.h"
10 | #include <stdint.h>
11 | 
12 | 
13 | //////////////////////////////////////////////////////////////////////////////////
14 | /// @name RGBA to YUV 4:2:0 planar (I420 & YV12)
15 | /// @ingroup util
16 | //////////////////////////////////////////////////////////////////////////////////
17 | 
18 | ///@{
19 | 
20 | /**
21 |  * Convert an RGBA uchar4 buffer into YUV I420 planar.
22 |  */
23 | cudaError_t cudaRGBAToI420( uchar4* input, uint8_t* output, size_t width, size_t height );
24 | 
25 | /**
26 |  * Convert an RGBA uchar4 texture into YUV I420 planar.
27 |  */
28 | cudaError_t cudaRGBAToI420( uchar4* input, size_t inputPitch, uint8_t* output, size_t outputPitch, size_t width, size_t height );
29 | 
30 | /**
31 |  * Convert an RGBA uchar4 buffer into YUV YV12 planar.
32 |  */
33 | cudaError_t cudaRGBAToYV12( uchar4* input, uint8_t* output, size_t width, size_t height );
34 | 
35 | /**
36 |  * Convert an RGBA uchar4 texture into YUV YV12 planar.
37 |  */
38 | cudaError_t cudaRGBAToYV12( uchar4* input, size_t inputPitch, uint8_t* output, size_t outputPitch, size_t width, size_t height );
39 | 
40 | ///@}
41 | 
42 | 
43 | //////////////////////////////////////////////////////////////////////////////////
44 | /// @name YUV 4:2:2 packed (UYVY & YUYV) to RGBA
45 | /// @ingroup util
46 | //////////////////////////////////////////////////////////////////////////////////
47 | 
48 | ///@{
49 | 
50 | /**
51 |  * Convert a UYVY 422 packed image into RGBA uchar4.
52 |  */
53 | cudaError_t cudaUYVYToRGBA( uchar2* input, uchar4* output, size_t width, size_t height );
54 | 
55 | /**
56 |  * Convert a UYVY 422 packed image into RGBA uchar4.
57 |  */
58 | cudaError_t cudaUYVYToRGBA( uchar2* input, size_t inputPitch, uchar4* output, size_t outputPitch, size_t width, size_t height );
59 | 
60 | /**
61 |  * Convert a YUYV 422 packed image into RGBA uchar4.
62 |  */
63 | cudaError_t cudaYUYVToRGBA( uchar2* input, uchar4* output, size_t width, size_t height );
64 | 
65 | /**
66 |  * Convert a YUYV 422 packed image into RGBA uchar4.
67 |  */
68 | cudaError_t cudaYUYVToRGBA( uchar2* input, size_t inputPitch, uchar4* output, size_t outputPitch, size_t width, size_t height );
69 | 
70 | ///@}
71 | 
72 | 
73 | //////////////////////////////////////////////////////////////////////////////////
74 | /// @name YUV 4:2:2 packed (UYVY & YUYV) to grayscale
75 | /// @ingroup util
76 | //////////////////////////////////////////////////////////////////////////////////
77 | 
78 | ///@{
79 | 
80 | /**
81 |  * Convert a UYVY 422 packed image into floating-point grayscale (float output, normalized to [0,1]).
82 |  */
83 | cudaError_t cudaUYVYToGray( uchar2* input, float* output, size_t width, size_t height );
84 | 
85 | /**
86 |  * Convert a UYVY 422 packed image into floating-point grayscale.
87 |  */
88 | cudaError_t cudaUYVYToGray( uchar2* input, size_t inputPitch, float* output, size_t outputPitch, size_t width, size_t height );
89 | 
90 | /**
91 |  * Convert a YUYV 422 packed image into floating-point grayscale (float output, normalized to [0,1]).
92 |  */
93 | cudaError_t cudaYUYVToGray( uchar2* input, float* output, size_t width, size_t height );
94 | 
95 | /**
96 |  * Convert a YUYV 422 packed image into floating-point grayscale.
97 |  */
98 | cudaError_t cudaYUYVToGray( uchar2* input, size_t inputPitch, float* output, size_t outputPitch, size_t width, size_t height );
99 | 
100 | ///@}
101 | 
102 | 
103 | //////////////////////////////////////////////////////////////////////////////////
104 | /// @name YUV NV12 to RGBA
105 | /// @ingroup util
106 | //////////////////////////////////////////////////////////////////////////////////
107 | 
108 | ///@{
109 | 
110 | /**
111 |  * Convert an NV12 texture (semi-planar 4:2:0) to RGBA uchar4 format.
112 |  * NV12 = 8-bit Y plane followed by an interleaved U/V plane with 2x2 subsampling.
113 |  */
114 | cudaError_t cudaNV12ToRGBA( uint8_t* input, size_t inputPitch, uchar4* output, size_t outputPitch, size_t width, size_t height );
115 | cudaError_t cudaNV12ToRGBA( uint8_t* input, uchar4* output, size_t width, size_t height );
116 | 
117 | cudaError_t cudaNV12ToRGBAf( uint8_t* input, size_t inputPitch, float4* output, size_t outputPitch, size_t width, size_t height );
118 | cudaError_t cudaNV12ToRGBAf( uint8_t* input, float4* output, size_t width, size_t height );
119 | 
120 | /**
121 |  * Setup the NV12 color conversion constants.
122 |  * The user doesn't need to call cudaNV12SetupColorspace() explicitly; it will be
123 |  * called automatically by cudaNV12ToRGBA() with a hue of 0.0.
124 |  * However, to set up custom constants (i.e. with a hue other than 0),
125 |  * cudaNV12SetupColorspace() can be called at any time, overriding the default.
126 |  */
127 | cudaError_t cudaNV12SetupColorspace( float hue = 0.0f );
128 | 
129 | ///@}
130 | 
131 | #endif
132 | 
133 | 
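A minimal usage sketch of the NV12 path declared above, for example for frames arriving from a hardware decoder. It assumes the NV12 frame is already tightly packed in device-accessible memory; the buffer and function names are illustrative:

#include "cudaYUV.h"

bool nv12ToRGBA( uint8_t* nv12Dev, uchar4* rgbaDev, size_t width, size_t height )
{
	// optional: set up the colorspace constants explicitly (the default hue is 0.0)
	if( cudaNV12SetupColorspace(0.0f) != cudaSuccess )
		return false;

	// the non-pitched overload assumes tightly-packed NV12: width*height luma bytes
	// followed by (width*height)/2 interleaved U/V bytes
	return cudaNV12ToRGBA(nv12Dev, rgbaDev, width, height) == cudaSuccess;
}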
--------------------------------------------------------------------------------
/util/loadImage.cpp:
--------------------------------------------------------------------------------
1 | /*
2 |  * Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
3 |  *
4 |  * Permission is hereby granted, free of charge, to any person obtaining a
5 |  * copy of this software and associated documentation files (the "Software"),
6 |  * to deal in the Software without restriction, including without limitation
7 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 |  * and/or sell copies of the Software, and to permit persons to whom the
9 |  * Software is furnished to do so, subject to the following conditions:
10 |  *
11 |  * The above copyright notice and this permission notice shall be included in
12 |  * all copies or substantial portions of the Software.
13 |  *
14 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 |  * DEALINGS IN THE SOFTWARE.
21 |  */
22 | #include <opencv2/opencv.hpp>
23 | #include "loadImage.h"
24 | #include "../util/cuda/cudaMappedMemory.h"
25 | 
26 | //#include <QImage>
27 | #include <stdint.h>
28 | 
29 | 
30 | // loadImageRGBA
31 | //bool loadImageRGBA( const char* filename, float4** cpu, float4** gpu, int* width, int* height )
32 | //{
33 | //	if( !filename || !cpu || !gpu || !width || !height )
34 | //	{
35 | //		printf("loadImageRGBA - invalid parameter\n");
36 | //		return false;
37 | //	}
38 | //
39 | //	// load original image
40 | //	QImage qImg;
41 | //
42 | //	if( !qImg.load(filename) )
43 | //	{
44 | //		printf("failed to load image %s\n", filename);
45 | //		return false;
46 | //	}
47 | //
48 | //	if( *width != 0 && *height != 0 )
49 | //		qImg = qImg.scaled(*width, *height, Qt::IgnoreAspectRatio);
50 | //
51 | //	const uint32_t imgWidth  = qImg.width();
52 | //	const uint32_t imgHeight = qImg.height();
53 | //	const uint32_t imgPixels = imgWidth * imgHeight;
54 | //	const size_t   imgSize   = imgWidth * imgHeight * sizeof(float) * 4;
55 | //
56 | //	printf("loaded image %s (%u x %u) %zu bytes\n", filename, imgWidth, imgHeight, imgSize);
57 | //
58 | //	// allocate buffer for the image
59 | //	if( !cudaAllocMapped((void**)cpu, (void**)gpu, imgSize) )
60 | //	{
61 | //		printf(LOG_CUDA "failed to allocate %zu bytes for image %s\n", imgSize, filename);
62 | //		return false;
63 | //	}
64 | //
65 | //	float4* cpuPtr = *cpu;
66 | //
67 | //	for( uint32_t y=0; y < imgHeight; y++ )
68 | //	{
69 | //		for( uint32_t x=0; x < imgWidth; x++ )
70 | //		{
71 | //			const QRgb rgb  = qImg.pixel(x,y);
72 | //			const float4 px = make_float4(float(qRed(rgb)),
73 | //						      float(qGreen(rgb)),
74 | //						      float(qBlue(rgb)),
75 | //						      float(qAlpha(rgb)));
76 | //
77 | //			cpuPtr[y*imgWidth+x] = px;
78 | //		}
79 | //	}
80 | //
81 | //	*width  = imgWidth;
82 | //	*height = imgHeight;
83 | //	return true;
84 | //}
85 | //
86 | //
87 | 
88 | /*
89 | // loadImageRGB
90 | bool loadImageRGB( const char* filename, float3** cpu, float3** gpu, int* width, int* height, const float3& mean )
91 | {
92 | 	if( !filename || !cpu || !gpu || !width || !height )
93 | 	{
94 | 		printf("loadImageRGB - invalid parameter\n");
95 | 		return false;
96 | 	}
97 | 
98 | 	// load original image
99 | 	QImage qImg;
100 | 
101 | 	if( !qImg.load(filename) )
102 | 	{
103 | 		printf("failed to load image %s\n", filename);
104 | 		return false;
105 | 	}
106 | 
107 | 	if( *width != 0 && *height != 0 )
108 | 		qImg = qImg.scaled(*width, *height, Qt::IgnoreAspectRatio);
109 | 
110 | 	const uint32_t imgWidth  = qImg.width();
111 | 	const uint32_t imgHeight = qImg.height();
112 | 	const uint32_t imgPixels = imgWidth * imgHeight;
113 | 	const size_t   imgSize   = imgWidth * imgHeight * sizeof(float) * 3;
114 | 
115 | 	printf("loaded image %s (%u x %u) %zu bytes\n", filename, imgWidth, imgHeight, imgSize);
116 | 
117 | 	// allocate buffer for the image
118 | 	if( !cudaAllocMapped((void**)cpu, (void**)gpu, imgSize) )
119 | 	{
120 | 		printf(LOG_CUDA "failed to allocate %zu bytes for image %s\n", imgSize, filename);
121 | 		return false;
122 | 	}
123 | 
124 | 	float* cpuPtr = (float*)*cpu;
125 | 
126 | 	for( uint32_t y=0; y < imgHeight; y++ )
127 | 	{
128 | 		for( uint32_t x=0; x < imgWidth; x++ )
129 | 		{
130 | 			const QRgb rgb   = qImg.pixel(x,y);
131 | 			const float  mul = 0.007843f; 	//1.0f / 255.0f;
132 | 			const float3 px  = make_float3((float(qRed(rgb))   - mean.x) * mul,
133 | 						       (float(qGreen(rgb)) - mean.y) * mul,
134 | 						       (float(qBlue(rgb))  - mean.z) * mul );
135 | 
136 | 			// note: caffe/GIE is band-sequential (as opposed to the typical Band Interleaved by Pixel)
137 | 			cpuPtr[imgPixels * 0 + y * imgWidth + x] = px.x;
138 | 			cpuPtr[imgPixels * 1 + y * imgWidth + x] = px.y;
139 | 			cpuPtr[imgPixels * 2 + y * imgWidth + x] = px.z;
140 | 		}
141 | 	}
142 | 
143 | 	*width  = imgWidth;
144 | 	*height = imgHeight;
145 | 	return true;
146 | }
147 | */
148 | 
149 | 
150 | bool loadImageBGR( cv::Mat frame, float3** cpu, float3** gpu, int* width, int* height, const float3& mean )
151 | {
152 | 	const uint32_t imgWidth  = 300;
153 | 	const uint32_t imgHeight = 300;
154 | 	const uint32_t imgPixels = imgWidth * imgHeight;
155 | 	const size_t   imgSize   = imgWidth * imgHeight * sizeof(float) * 3;
156 | 	// note: assumes the caller passes a frame already resized to 300x300; 'mean' is currently unused
157 | 	// allocate buffer for the image
158 | 	if( !cudaAllocMapped((void**)cpu, (void**)gpu, imgSize) )
159 | 	{
160 | 		printf(LOG_CUDA "failed to allocate %zu bytes for image\n", imgSize);
161 | 		return false;
162 | 	}
163 | 
164 | 	float* cpuPtr = (float*)*cpu;
165 | 
166 | 	for( uint32_t y=0; y < imgHeight; y++ )
167 | 	{
168 | 		for( uint32_t x=0; x < imgWidth; x++ )
169 | 		{
170 | 			cv::Vec3b intensity = frame.at<cv::Vec3b>(y,x);
171 | 			cpuPtr[imgPixels * 0 + y * imgWidth + x] = (float)intensity.val[0];
172 | 			cpuPtr[imgPixels * 1 + y * imgWidth + x] = (float)intensity.val[1];
173 | 			cpuPtr[imgPixels * 2 + y * imgWidth + x] = (float)intensity.val[2];
174 | 		}
175 | 	}
176 | 	*width = imgWidth; *height = imgHeight; return true;
177 | }
178 | 
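The band-sequential note above is the key detail of this file: OpenCV stores pixels interleaved (HWC, all three channels of a pixel adjacent), while caffe/TensorRT expect planar CHW data, one contiguous plane per channel. A short sketch of the resulting index math; the function name is illustrative, not part of this repo:

#include <stdint.h>

// returns channel c of pixel (x, y) from the planar CHW buffer that
// loadImageBGR() fills (c = 0/1/2 -> B/G/R for a BGR source frame)
inline float chwPixel( const float* cpuPtr, uint32_t c, uint32_t x, uint32_t y,
                       uint32_t width, uint32_t height )
{
	const uint32_t pixels = width * height;
	return cpuPtr[c * pixels + y * width + x];	// channel plane, then row, then column
}

// for comparison, the interleaved (HWC) equivalent would be:
//     buffer[(y * width + x) * 3 + c]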
--------------------------------------------------------------------------------
/util/loadImage.h:
--------------------------------------------------------------------------------
1 | /*
2 |  * Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
3 |  *
4 |  * Permission is hereby granted, free of charge, to any person obtaining a
5 |  * copy of this software and associated documentation files (the "Software"),
6 |  * to deal in the Software without restriction, including without limitation
7 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 |  * and/or sell copies of the Software, and to permit persons to whom the
9 |  * Software is furnished to do so, subject to the following conditions:
10 |  *
11 |  * The above copyright notice and this permission notice shall be included in
12 |  * all copies or substantial portions of the Software.
13 |  *
14 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 |  * DEALINGS IN THE SOFTWARE.
21 |  */
22 | 
23 | #ifndef __IMAGE_LOADER_H_
24 | #define __IMAGE_LOADER_H_
25 | 
26 | 
27 | #include <opencv2/opencv.hpp>
28 | #include "../util/cuda/cudaUtility.h"
29 | 
30 | 
31 | /**
32 |  * Load a color image from disk into CUDA memory with alpha.
33 |  * This function loads the image into shared CPU/GPU memory, using the functions from cudaMappedMemory.h
34 |  *
35 |  * @param filename Path to the image file on disk.
36 |  * @param cpu Pointer to the allocated CPU buffer containing the image.
37 |  * @param gpu Pointer to the CUDA device buffer residing on the GPU containing the image.
38 |  * @param width Variable containing the width in pixels of the image.
39 |  * @param height Variable containing the height in pixels of the image.
40 |  *
41 |  * @ingroup util
42 |  */
43 | bool loadImageRGBA( const char* filename, float4** cpu, float4** gpu, int* width, int* height );
44 | 
45 | 
46 | /**
47 |  * Save an image to disk.
48 |  * @ingroup util
49 |  */
50 | bool saveImageRGBA( const char* filename, float4* cpu, int width, int height, float max_pixel=255.0f );
51 | 
52 | 
53 | /**
54 |  * Load a color image from disk into CUDA memory.
55 |  * This function loads the image into shared CPU/GPU memory, using the functions from cudaMappedMemory.h
56 |  *
57 |  * @param filename Path to the image file on disk.
58 |  * @param cpu Pointer to the allocated CPU buffer containing the image.
59 |  * @param gpu Pointer to the CUDA device buffer residing on the GPU containing the image.
60 |  * @param width Variable containing the width in pixels of the image.
61 |  * @param height Variable containing the height in pixels of the image.
62 |  *
63 |  * @ingroup util
64 |  */
65 | bool loadImageRGB( const char* filename, float3** cpu, float3** gpu, int* width, int* height, const float3& mean=make_float3(0,0,0) );
66 | 
67 | 
68 | /**
69 |  * Convert an OpenCV BGR frame into CUDA memory.
70 |  * This function copies the frame into shared CPU/GPU memory in band-sequential (planar CHW) order, using the functions from cudaMappedMemory.h
71 |  *
72 |  * @param frame OpenCV cv::Mat frame in BGR order, already sized to the network input.
73 |  * @param cpu Pointer to the allocated CPU buffer containing the image.
74 |  * @param gpu Pointer to the CUDA device buffer residing on the GPU containing the image.
75 |  * @param width Variable containing the width in pixels of the image.
76 |  * @param height Variable containing the height in pixels of the image.
77 |  *
78 |  * @ingroup util
79 |  */
80 | bool loadImageBGR( cv::Mat frame, float3** cpu, float3** gpu, int* width, int* height, const float3& mean=make_float3(0,0,0) );
81 | 
82 | 
83 | 
84 | #endif
85 | 
--------------------------------------------------------------------------------
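Finally, a minimal sketch of how a caller might feed a video frame through loadImageBGR(). The video path matches testVideo/test.avi in this repo, the explicit resize reflects the 300x300 size hard-coded in the implementation, and the zero-copy aliasing of imgCPU/imgGPU follows cudaMappedMemory.h; the rest of the names are illustrative:

#include <opencv2/opencv.hpp>
#include "util/loadImage.h"

int main()
{
	cv::VideoCapture cap("testVideo/test.avi");
	cv::Mat frame;

	if( !cap.read(frame) )
		return 1;

	// loadImageBGR() reads a fixed 300x300 region, so resize the frame first
	cv::resize(frame, frame, cv::Size(300, 300));

	float3* imgCPU = NULL;
	float3* imgGPU = NULL;
	int width = 0, height = 0;

	if( !loadImageBGR(frame, &imgCPU, &imgGPU, &width, &height) )
		return 1;

	// imgGPU now holds the planar float3 image, ready to bind as a TensorRT
	// input buffer; imgCPU aliases the same zero-copy mapped memory
	return 0;
}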