├── .DS_Store
├── .gitignore
├── CMakeLists.txt
├── README.md
├── cmake_install.cmake
├── common.cpp
├── common.h
├── cudaUtility.h
├── imageBuffer.h
├── kernel.cu
├── main.cpp
├── mathFunctions.cpp
├── mathFunctions.cu
├── mathFunctions.h
├── model
    └── pelee
    │   ├── pelee_deploy_iplugin.prototxt
    │   ├── pelee_merged.caffemodel
    │   └── pelee_merged.prototxt
├── pluginImplement.cpp
├── pluginImplement.h
├── tensorNet.cpp
├── tensorNet.h
├── testPic
    └── test.png
├── testVideo
    └── test.avi
└── util
    ├── cuda
        ├── cudaMappedMemory.h
        ├── cudaNormalize.cu
        ├── cudaNormalize.h
        ├── cudaOverlay.cu
        ├── cudaOverlay.h
        ├── cudaRGB.cu
        ├── cudaRGB.h
        ├── cudaResize.cu
        ├── cudaResize.h
        ├── cudaUtility.h
        ├── cudaYUV-NV12.cu
        ├── cudaYUV-YUYV.cu
        ├── cudaYUV-YV12.cu
        └── cudaYUV.h
    ├── loadImage.cpp
    └── loadImage.h


/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eric612/Pelee-Seg-TensorRT/05bf0b31c5891adaf64f40b784ef4a1927d68862/.DS_Store


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | ## General
  2 | jobs/* 
  3 | CMakeFiles/*
  4 | 3rdparty/*
  5 | cmake/*
  6 | tools/*
  7 | lib/*
  8 | build/* 
  9 | 
 10 | # Compiled Object files
 11 | *.slo
 12 | *.lo
 13 | *.o
 14 | *.cuo
 15 | 
 16 | # Compiled Dynamic libraries
 17 | *.so
 18 | *.dylib
 19 | 
 20 | # Compiled Static libraries
 21 | *.lai
 22 | *.la
 23 | *.a
 24 | 
 25 | # Compiled protocol buffers
 26 | *.pb.h
 27 | *.pb.cc
 28 | *_pb2.py
 29 | 
 30 | # Compiled python
 31 | *.pyc
 32 | 
 33 | # Compiled MATLAB
 34 | *.mex*
 35 | 
 36 | # IPython notebook checkpoints
 37 | .ipynb_checkpoints
 38 | 
 39 | # Editor temporaries
 40 | *.swp
 41 | *~
 42 | 
 43 | # Sublime Text settings
 44 | *.sublime-workspace
 45 | *.sublime-project
 46 | 
 47 | # Eclipse Project settings
 48 | *.*project
 49 | .settings
 50 | 
 51 | # QtCreator files
 52 | *.user
 53 | 
 54 | # PyCharm files
 55 | .idea
 56 | 
 57 | # OSX dir files
 58 | .DS_Store
 59 | 
 60 | ## Caffe
 61 | 
 62 | # User's build configuration
 63 | Makefile.config
 64 | Makefile
 65 | 
 66 | # Data and models are either
 67 | # 1. reference, and not casually committed
 68 | # 2. custom, and live on their own unless they're deliberated contributed
 69 | data/*
 70 | models/*
 71 | *.caffemodel
 72 | *.caffemodel.h5
 73 | *.solverstate
 74 | *.solverstate.h5
 75 | *.binaryproto
 76 | *leveldb
 77 | *lmdb
 78 | 
 79 | # build, distribute, and bins (+ python proto bindings)
 80 | build
 81 | .build_debug/*
 82 | .build_release/*
 83 | distribute/*
 84 | *.testbin
 85 | *.bin
 86 | python/caffe/proto/
 87 | cmake_build
 88 | .cmake_build
 89 | 
 90 | # Generated documentation
 91 | docs/_site
 92 | docs/gathered
 93 | _site
 94 | doxygen
 95 | docs/dev
 96 | 
 97 | # LevelDB files
 98 | *.sst
 99 | *.ldb
100 | LOCK
101 | LOG*
102 | CURRENT
103 | MANIFEST-*
104 | 
105 | 
106 | *.tar.gz
107 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 2.8)
 2 | project(pelee)
 3 | 
 4 | #set(inference_VERSION_MAJOR 2)
 5 | #set(inference_VERSION_MINOR 1)
 6 | 
 7 | #set(CMAKE_CXX_STANDARD 11)
 8 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
 9 | find_package(OpenMP)
10 | if (OPENMP_FOUND)
11 |     set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
12 |     set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
13 |     set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
14 | endif()
15 | set(BUILD_DEPS "YES" CACHE BOOL "If YES, will install dependencies into sandbox.  Automatically reset to NO after dependencies are installed.")
16 | 
17 | set(PROJECT_OUTPUT_DIR  ${PROJECT_BINARY_DIR}/build)
18 | set(PROJECT_INCLUDE_DIR ${PROJECT_OUTPUT_DIR}/include)
19 | 
20 | file(MAKE_DIRECTORY ${PROJECT_INCLUDE_DIR})
21 | file(MAKE_DIRECTORY ${PROJECT_OUTPUT_DIR}/bin)
22 | 
23 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_OUTPUT_DIR}/bin)
24 | set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_OUTPUT_DIR}/lib)
25 | set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_OUTPUT_DIR}/lib)
26 | 
27 | message("The runtime libraries are included in ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}")
28 | message("The library files are included in ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}")
29 | 
30 | message("-- system arch:  ${CMAKE_SYSTEM_PROCESSOR}")
31 | message("-- output path:  ${PROJECT_OUTPUT_DIR}")
32 | 
33 | find_package(CUDA)
34 | find_package(OpenCV REQUIRED)
35 | message(" -- CUDA and Opencv Found ")
36 | message(" -- opencv_version  "${OpenCV_VERSION})
37 | 
38 | 
39 | set(CUDA_NVCC_FLAGS
40 |         ${CUDA_NVCC_FLAGS};--disable-warnings;
41 |         -O3
42 |         -gencode arch=compute_30,code=sm_30
43 |         -gencode arch=compute_35,code=sm_35
44 |         -gencode arch=compute_50,code=sm_50
45 |         -gencode arch=compute_50,code=compute_50
46 |         -gencode arch=compute_52,code=sm_52
47 |         -gencode arch=compute_61,code=sm_61
48 | 	-gencode arch=compute_62,code=sm_62
49 |         )
50 | 
51 | file(GLOB cudaSources util/cuda/*.cu)
52 | file(GLOB cudaIncludes util/cuda/*.h)
53 | 
54 | file(GLOB sources *.cu *.cpp util/*.cpp util/cuda/*.cu)
55 | file(GLOB includes util/*.h util/cuda/*.h)
56 | 
57 | include_directories(${PROJECT_INCLUDE_DIR}/util)
58 | include_directories(${PROJECT_BINARY_DIR}/util)
59 | include_directories(${OpenCV_INCLUDE_DIRS})
60 | ##
61 | 
62 | link_directories(${OpenCV_LIBRARY_DIRS})
63 | 
64 | cuda_add_library(inferLib SHARED ${sources})
65 | ##
66 | target_link_libraries(inferLib /usr/lib/aarch64-linux-gnu/libnvcaffe_parser.so)
67 | target_link_libraries(inferLib /usr/lib/aarch64-linux-gnu/libnvinfer.so)
68 | target_link_libraries(inferLib /usr/lib/aarch64-linux-gnu/libnvinfer_plugin.so)
69 | target_link_libraries(inferLib /usr/lib/aarch64-linux-gnu/libnvparsers.so)
70 | 
71 | 
72 | # transfer all headers to the include directory
73 | foreach(include ${includes})
74 |     message("-- Copying ${include}")
75 |     configure_file(${include} ${PROJECT_INCLUDE_DIR} COPYONLY)
76 | endforeach()
77 | 
78 | ## install
79 | foreach(include ${includes})
80 |     install(FILES "${include}" DESTINATION include/inferLib)
81 | endforeach()
82 | 
83 | add_executable(pelee main.cpp )
84 | target_link_libraries(pelee inferLib ${OpenCV_LIBS})
85 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Pelee-TensorRT
 2 | 
 3 | **Accelerate Pelee with TensorRT**
 4 | Pelee: A Real-Time Object Detection System on Mobile Devices (NeurIPS 2018) 
 5 | 
 6 | **TensorRT-Pelee can run over 70FPS(11ms) on Jetson TX2(FP32)**
 7 | 
 8 | ---
 9 | 
10 | **Performance:** <br>
11 | Jetson TX2: 72 FPS, 13.2~11 ms (FP32) <br>
12 | Titan V:  200FPS, 5 ms (FP32)<br>
13 | 
14 | **Requierments:**
15 | 
16 | 1.TensorRT4 (Jetpack 3.3 on TX2)  <br>
17 | 2.CUDA 9.0 <br>
18 | 3.cudnn7 <br>
19 | 
20 | ---
21 | 
22 | **Run:**
23 | 
24 | ```shell
25 | cmake .
26 | make
27 | ./build/bin/pelee
28 | ```
29 | 
30 | ---
31 | 
32 | **Reference:**
33 | 
34 | https://github.com/Ghustwb/MobileNet-SSD-TensorRT
35 | 
36 | ---
37 | 
38 | **TODO:**
39 | - [ ] FP16 Implementation 
40 | - [ ] Change Custom layers IPlugin to IPluginExt
41 | 
42 | 
43 | 
44 | 
45 | **The bug has been fixed**
46 | 
47 | ![image](testPic/test.png)
48 | 


--------------------------------------------------------------------------------
/cmake_install.cmake:
--------------------------------------------------------------------------------
 1 | # Install script for directory: /home/nvidia/TRT-Pelee
 2 | 
 3 | # Set the install prefix
 4 | if(NOT DEFINED CMAKE_INSTALL_PREFIX)
 5 |   set(CMAKE_INSTALL_PREFIX "/usr/local")
 6 | endif()
 7 | string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
 8 | 
 9 | # Set the install configuration name.
10 | if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
11 |   if(BUILD_TYPE)
12 |     string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
13 |            CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
14 |   else()
15 |     set(CMAKE_INSTALL_CONFIG_NAME "")
16 |   endif()
17 |   message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
18 | endif()
19 | 
20 | # Set the component getting installed.
21 | if(NOT CMAKE_INSTALL_COMPONENT)
22 |   if(COMPONENT)
23 |     message(STATUS "Install component: \"${COMPONENT}\"")
24 |     set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
25 |   else()
26 |     set(CMAKE_INSTALL_COMPONENT)
27 |   endif()
28 | endif()
29 | 
30 | # Install shared libraries without execute permission?
31 | if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
32 |   set(CMAKE_INSTALL_SO_NO_EXE "1")
33 | endif()
34 | 
35 | if(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified")
36 |   file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/include/inferLib" TYPE FILE FILES "/home/nvidia/TRT-Pelee/util/loadImage.h")
37 | endif()
38 | 
39 | if(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified")
40 |   file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/include/inferLib" TYPE FILE FILES "/home/nvidia/TRT-Pelee/util/cuda/cudaOverlay.h")
41 | endif()
42 | 
43 | if(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified")
44 |   file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/include/inferLib" TYPE FILE FILES "/home/nvidia/TRT-Pelee/util/cuda/cudaResize.h")
45 | endif()
46 | 
47 | if(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified")
48 |   file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/include/inferLib" TYPE FILE FILES "/home/nvidia/TRT-Pelee/util/cuda/cudaRGB.h")
49 | endif()
50 | 
51 | if(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified")
52 |   file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/include/inferLib" TYPE FILE FILES "/home/nvidia/TRT-Pelee/util/cuda/cudaYUV.h")
53 | endif()
54 | 
55 | if(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified")
56 |   file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/include/inferLib" TYPE FILE FILES "/home/nvidia/TRT-Pelee/util/cuda/cudaMappedMemory.h")
57 | endif()
58 | 
59 | if(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified")
60 |   file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/include/inferLib" TYPE FILE FILES "/home/nvidia/TRT-Pelee/util/cuda/cudaNormalize.h")
61 | endif()
62 | 
63 | if(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified")
64 |   file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/include/inferLib" TYPE FILE FILES "/home/nvidia/TRT-Pelee/util/cuda/cudaUtility.h")
65 | endif()
66 | 
67 | if(CMAKE_INSTALL_COMPONENT)
68 |   set(CMAKE_INSTALL_MANIFEST "install_manifest_${CMAKE_INSTALL_COMPONENT}.txt")
69 | else()
70 |   set(CMAKE_INSTALL_MANIFEST "install_manifest.txt")
71 | endif()
72 | 
73 | string(REPLACE ";" "\n" CMAKE_INSTALL_MANIFEST_CONTENT
74 |        "${CMAKE_INSTALL_MANIFEST_FILES}")
75 | file(WRITE "/home/nvidia/TRT-Pelee/${CMAKE_INSTALL_MANIFEST}"
76 |      "${CMAKE_INSTALL_MANIFEST_CONTENT}")
77 | 


--------------------------------------------------------------------------------
/common.cpp:
--------------------------------------------------------------------------------
  1 | #include "common.h"
  2 | std::string locateFile(const std::string& input, const std::vector<std::string> & directories)
  3 | {
  4 |     std::string file;
  5 |     const int MAX_DEPTH{10};
  6 |     bool found{false};
  7 |     for (auto &dir : directories)
  8 |     {
  9 |         file = dir + input;
 10 |         std::cout << file << std::endl;
 11 |         for (int i = 0; i < MAX_DEPTH && !found; i++)
 12 |         {
 13 |             std::ifstream checkFile(file);
 14 |             found = checkFile.is_open();
 15 |             if (found) break;
 16 |             file = "../" + file;
 17 |         }
 18 |         if (found) break;
 19 |         file.clear();
 20 |     }
 21 |     std::cout << file << std::endl;
 22 |     assert(!file.empty() && "Could not find a file due to it not existing in the data directory.");
 23 |     return file;
 24 | }
 25 | 
 26 | void readPGMFile(const std::string& fileName,  uint8_t *buffer, int inH, int inW)
 27 | {
 28 |     std::ifstream infile(fileName, std::ifstream::binary);
 29 |     assert(infile.is_open() && "Attempting to read from a file that is not open.");
 30 |     std::string magic, h, w, max;
 31 |     infile >> magic >> h >> w >> max;
 32 |     infile.seekg(1, infile.cur);
 33 |     infile.read(reinterpret_cast<char*>(buffer), inH*inW);
 34 | }
 35 | 
 36 | /*********************************/
 37 | /* Updated date： 2018.3.7
 38 | /*This is my own implementation of the detectout layer code， because I met a mistake with the detectout api of 
 39 | /*tensorrt3.0 a few months ago. You can use the detectout api of tensorrt3.0 correctly by adding an extra output 
 40 | /*in the deploy prototxt file. Please refer to my deploy prototxt.
 41 | /********************************/
 42 | // Retrieve all location predictions.
 43 | void GetLocPredictions(const float* loc_data,
 44 |                        const int num_preds_per_class, const int num_loc_classes,
 45 |                        std::vector<std::vector<float> >* loc_preds) {
 46 |     for (int p = 0; p < num_preds_per_class; ++p) {
 47 |         int start_idx = p * num_loc_classes * 4;
 48 |         vector<float> labelbbox;
 49 |         for (int c = 0; c < num_loc_classes; ++c) {
 50 |             labelbbox.push_back(loc_data[start_idx + c * 4]);
 51 |             labelbbox.push_back(loc_data[start_idx + c * 4 + 1]);
 52 |             labelbbox.push_back(loc_data[start_idx + c * 4 + 2]);
 53 |             labelbbox.push_back(loc_data[start_idx + c * 4 + 3]);
 54 | 
 55 |             loc_preds->push_back(labelbbox);
 56 |         }
 57 | 
 58 |     }
 59 | }
 60 | 
 61 | // Retrieve all confidences.
 62 | void GetConfidenceScores(const float* conf_data,
 63 |                          const int num_preds_per_class, const int num_classes,
 64 |                          vector<vector<float> >* conf_preds) {
 65 |     for (int p = 0; p < num_preds_per_class; ++p) {
 66 |         int start_idx = p * num_classes;
 67 |         vector<float> conf_classes;
 68 |         for (int c = 0; c < num_classes; ++c) {
 69 |             conf_classes.push_back(conf_data[start_idx + c]);
 70 |         }
 71 |         conf_preds->push_back(conf_classes);
 72 |     }
 73 | }
 74 | 
 75 | // Retrieve all prior bboxes. bboxes and variances
 76 | void GetPriorBBoxes(const float* prior_data, const int num_priors,
 77 |                     vector<vector<float> >* prior_bboxes,
 78 |                     vector<vector<float> >* prior_variances) {
 79 |     for (int i = 0; i < num_priors; ++i) {
 80 |         int start_idx = i * 4;
 81 |         vector<float> prior_bbox;
 82 |         prior_bbox.push_back(prior_data[start_idx]);
 83 |         prior_bbox.push_back(prior_data[start_idx + 1]);
 84 |         prior_bbox.push_back(prior_data[start_idx + 2]);
 85 |         prior_bbox.push_back(prior_data[start_idx + 3]);
 86 |         prior_bboxes->push_back(prior_bbox);
 87 |     }
 88 | 
 89 |     for (int i = 0; i < num_priors; ++i) {
 90 |         int start_idx = (num_priors + i) * 4;
 91 |         vector<float> prior_variance;
 92 |         vector<float> var;
 93 |         for (int j = 0; j < 4; ++j) {
 94 |             prior_variance.push_back(prior_data[start_idx + j]);
 95 |         }
 96 |         prior_variances->push_back(prior_variance);
 97 |     }
 98 | }
 99 | 
100 | /* code_type: 0 = CORNER; 1 = CENTER_SIZE; 2 = CORNER_SIZE
101 |  *
102 |  */
103 | void DecodeBBox(
104 |         const vector<float>& prior_bbox, const vector<float>& prior_variance,
105 |         const int code_type, const bool variance_encoded_in_target,
106 |         const bool clip_bbox, const vector<float>& bbox,
107 |         vector<float>* decode_bbox) {
108 |     if (0 == code_type) {
109 |         if (variance_encoded_in_target) {
110 |             // variance is encoded in target, we simply need to add the offset
111 |             // predictions.
112 |             decode_bbox->push_back(prior_bbox[0] + bbox[0]);
113 |             decode_bbox->push_back(prior_bbox[1] + bbox[1]);
114 |             decode_bbox->push_back(prior_bbox[2] + bbox[2]);
115 |             decode_bbox->push_back(prior_bbox[3] + bbox[3]);
116 |         } else {
117 |             // variance is encoded in bbox, we need to scale the offset accordingly.
118 |             decode_bbox->push_back(
119 |                         prior_bbox[0]+ prior_variance[0] * bbox[0]);
120 |             decode_bbox->push_back(
121 |                         prior_bbox[1] + prior_variance[1] * bbox[1]);
122 |             decode_bbox->push_back(
123 |                         prior_bbox[2] + prior_variance[2] * bbox[2]);
124 |             decode_bbox->push_back(
125 |                         prior_bbox[3] + prior_variance[3] * bbox[3]);
126 |         }
127 |     } else if (1 == code_type) {
128 |         float prior_width = prior_bbox[2] - prior_bbox[0];
129 |         //CHECK_GT(prior_width, 0);
130 |         float prior_height = prior_bbox[3] - prior_bbox[1];
131 |         //CHECK_GT(prior_height, 0);
132 |         float prior_center_x = (prior_bbox[0] + prior_bbox[2]) / 2.;
133 |         float prior_center_y = (prior_bbox[1] + prior_bbox[3]) / 2.;
134 | 
135 |         float decode_bbox_center_x, decode_bbox_center_y;
136 |         float decode_bbox_width, decode_bbox_height;
137 |         if (variance_encoded_in_target) {
138 |             // variance is encoded in target, we simply need to retore the offset
139 |             // predictions.
140 |             decode_bbox_center_x = bbox[0] * prior_width + prior_center_x;
141 |             decode_bbox_center_y = bbox[1] * prior_height + prior_center_y;
142 |             decode_bbox_width = exp(bbox[2]) * prior_width;
143 |             decode_bbox_height = exp(bbox[3]) * prior_height;
144 |         } else {
145 |             // variance is encoded in bbox, we need to scale the offset accordingly.
146 |             decode_bbox_center_x =
147 |                     prior_variance[0] * bbox[0] * prior_width + prior_center_x;
148 |             decode_bbox_center_y =
149 |                     prior_variance[1] * bbox[1] * prior_height + prior_center_y;
150 |             decode_bbox_width =
151 |                     exp(prior_variance[2] * bbox[2]) * prior_width;
152 |             decode_bbox_height =
153 |                     exp(prior_variance[3] * bbox[3]) * prior_height;
154 |         }
155 | 
156 |         decode_bbox->push_back(decode_bbox_center_x - decode_bbox_width / 2.);
157 |         decode_bbox->push_back(decode_bbox_center_y - decode_bbox_height / 2.);
158 |         decode_bbox->push_back(decode_bbox_center_x + decode_bbox_width / 2.);
159 |         decode_bbox->push_back(decode_bbox_center_y + decode_bbox_height / 2.);
160 |     } else if (2 == code_type) {
161 |         float prior_width = prior_bbox[2] - prior_bbox[0];
162 |         //CHECK_GT(prior_width, 0);
163 |         float prior_height = prior_bbox[3] - prior_bbox[1];
164 |         //CHECK_GT(prior_height, 0);
165 |         if (variance_encoded_in_target) {
166 |             // variance is encoded in target, we simply need to add the offset
167 |             // predictions.
168 |             decode_bbox->push_back(prior_bbox[0] + bbox[0] * prior_width);
169 |             decode_bbox->push_back(prior_bbox[1] + bbox[1] * prior_height);
170 |             decode_bbox->push_back(prior_bbox[2] + bbox[2] * prior_width);
171 |             decode_bbox->push_back(prior_bbox[3] + bbox[3] * prior_height);
172 |         } else {
173 |             // variance is encoded in bbox, we need to scale the offset accordingly.
174 |             decode_bbox->push_back(
175 |                         prior_bbox[0] + prior_variance[0] * bbox[0] * prior_width);
176 |             decode_bbox->push_back(
177 |                         prior_bbox[1] + prior_variance[1] * bbox[1] * prior_height);
178 |             decode_bbox->push_back(
179 |                         prior_bbox[2] + prior_variance[2] * bbox[2] * prior_width);
180 |             decode_bbox->push_back(
181 |                         prior_bbox[3] + prior_variance[3] * bbox[3] * prior_height);
182 |         }
183 |     } else {
184 |         std::cout<< "Unknown LocLossType."<<std::endl;
185 |     }
186 |     //clip_bbox = false, 所以没实现
187 |     /*if (clip_bbox) {
188 |         ClipBBox(*decode_bbox, decode_bbox);
189 |     }*/
190 | }
191 | 
192 | 
193 | void DecodeBBoxes(
194 |         const vector<vector<float> >& prior_bboxes,
195 |         const vector<vector<float> >& prior_variances,
196 |         const int code_type, const bool variance_encoded_in_target,
197 |         const bool clip_bbox, const vector<vector<float> >& bboxes,
198 |         vector<vector<float> >* decode_bboxes) {
199 |     //CHECK_EQ(prior_bboxes.size(), prior_variances.size());
200 |     //CHECK_EQ(prior_bboxes.size(), bboxes.size());
201 |     int num_bboxes = prior_bboxes.size();
202 |     
203 |     for (int i = 0; i < num_bboxes; ++i) {
204 |         vector<float> decode_bbox;
205 |         DecodeBBox(prior_bboxes[i], prior_variances[i], code_type,
206 |                    variance_encoded_in_target, clip_bbox, bboxes[i], &decode_bbox);
207 |         decode_bboxes->push_back(decode_bbox);
208 |     }
209 | }
210 | 
211 | //
212 | void ConfData(const float* data, const int num_classes, const int num_prior, float* new_data) {
213 |     int idx = 0;
214 |     for (int c = 0; c < num_classes; ++c) {
215 |         for (int p = 0; p < num_prior; ++p) {
216 |             new_data[idx] = data[p*num_classes + c];
217 |             idx++;
218 |         }
219 |     }
220 |     //softmax
221 |     for (int p = 0; p < num_prior; ++p) {
222 |         int sum = 0;
223 |         float _max = new_data[p];//new_data[0*num_prior + p]
224 |         for (int c = 1; c < num_classes; ++c) {
225 |             _max = std::max(_max, new_data[c*num_prior + p]);
226 |         }
227 |         for (int c = 0; c < num_classes; ++c) {
228 |             sum += exp(new_data[c*num_prior + p]-_max);
229 |         }
230 |         for (int j = 0; j < num_classes; ++j) {
231 |             new_data[j*num_prior + p] =  exp(new_data[j*num_prior + p]-_max)/sum;
232 |         }
233 |     }
234 | 
235 | }
236 | 
237 | template <typename Dtype>
238 | void DecodeBBoxes_2(const Dtype* loc_data, const Dtype* prior_data,
239 |                     const int code_type, const bool variance_encoded_in_target,
240 |                     const int num_priors, const bool share_location,
241 |                     const int num_loc_classes, const int background_label_id,
242 |                     const bool clip_bbox, Dtype* bbox_data) {
243 | 
244 |     if(code_type == 0){
245 |         for(int p = 0; p < num_priors; p++) {
246 |             if (variance_encoded_in_target) {
247 |                 for (int i = 0; i < 4; i++) {
248 |                     bbox_data[4 * p + i] = prior_data[4 * p + i] + loc_data[4 * p + i];
249 |                 }
250 |             } else {
251 |                 for (int i = 0; i < 4; i++) {
252 |                     bbox_data[4 * p + i] = prior_data[4 * p + i] + prior_data[4 * num_priors + 4 * p + i] + loc_data[4 * p + i];
253 |                 }
254 |             }
255 |         }
256 |     }else if(code_type == 1){
257 |         for(int p = 0; p < num_priors; p++) {
258 |             float prior_width = prior_data[4 * p + 2] - prior_data[4 * p + 0];
259 |             float prior_height = prior_data[4 * p + 3] - prior_data[4 * p + 1];
260 |             float prior_center_x = (prior_data[4 * p + 0] + prior_data[4 * p + 2]) / 2.;
261 |             float prior_center_y = (prior_data[4 * p + 1] + prior_data[4 * p + 3]) / 2.;
262 |             float decode_bbox_center_x, decode_bbox_center_y;
263 |             float decode_bbox_width, decode_bbox_height;;
264 |             if (variance_encoded_in_target) {
265 |                 decode_bbox_center_x = loc_data[4 * p + 0] * prior_width + prior_center_x;
266 |                 decode_bbox_center_y = loc_data[4 * p + 1] * prior_height + prior_center_y;
267 |                 decode_bbox_width = exp(loc_data[4 * p + 2]) * prior_width;
268 |                 decode_bbox_height = exp(loc_data[4 * p + 3]) * prior_height;
269 |             }else{
270 |                 decode_bbox_center_x = prior_data[4 * num_priors + 4 * p + 0] * loc_data[4 * p + 0] * prior_width + prior_center_x;
271 |                 decode_bbox_center_y = prior_data[4 * num_priors + 4 * p + 1] * loc_data[4 * p + 1] * prior_height + prior_center_y;
272 |                 decode_bbox_width = exp(prior_data[4 * num_priors + 4 * p + 2] * loc_data[4 * p + 2]) * prior_width;
273 |                 decode_bbox_height = exp(prior_data[4 * num_priors + 4 * p + 3] * loc_data[4 * p + 3]) * prior_height;
274 |             }
275 |             bbox_data[4 * p + 0] = (decode_bbox_center_x - decode_bbox_width / 2.);
276 |             bbox_data[4 * p + 1] = (decode_bbox_center_y - decode_bbox_height / 2.);
277 |             bbox_data[4 * p + 2] = (decode_bbox_center_x + decode_bbox_width / 2.);
278 |             bbox_data[4 * p + 3] = (decode_bbox_center_y + decode_bbox_height / 2.);
279 |         }
280 | 
281 |     }else if(code_type == 2){
282 |         for(int p = 0; p < num_priors; p++) {
283 |             float prior_width = prior_data[4 * p + 2] - prior_data[4 * p + 0];
284 |             float prior_height = prior_data[4 * p + 3] - prior_data[4 * p + 1];
285 | 
286 |             if (variance_encoded_in_target) {
287 |                 bbox_data[4 * p + 0] = prior_data[4 * p + 0] + loc_data[4 * p + 0] * prior_width;
288 |                 bbox_data[4 * p + 1] = prior_data[4 * p + 1] + loc_data[4 * p + 1] * prior_height;
289 |                 bbox_data[4 * p + 2] = exp(prior_data[4 * p + 2]) + loc_data[4 * p + 2] * prior_width;
290 |                 bbox_data[4 * p + 3] = exp(prior_data[4 * p + 3]) + loc_data[4 * p + 3] * prior_height;
291 |             }else {
292 |                 bbox_data[4 * p + 0] = prior_data[4 * p + 0] +
293 |                         prior_data[4 * num_priors + 4 * p + 0] * loc_data[4 * p + 0] * prior_width;
294 |                 bbox_data[4 * p + 1] = prior_data[4 * p + 1] +
295 |                         prior_data[4 * num_priors + 4 * p + 1] * loc_data[4 * p + 1] * prior_height;
296 |                 bbox_data[4 * p + 2] = prior_data[4 * p + 2] +
297 |                         prior_data[4 * num_priors + 4 * p + 2] * loc_data[4 * p + 2] * prior_width;
298 |                 bbox_data[4 * p + 3] = prior_data[4 * p + 3] +
299 |                         prior_data[4 * num_priors + 4 * p + 3] * loc_data[4 * p + 3] * prior_height;
300 |             }
301 |         }
302 | 
303 |     }else{
304 |         std::cout << "Unknown LocLossType." << std::endl;
305 |     }
306 | }
307 | 
308 | 
309 | template <typename Dtype>
310 | Dtype BBoxSize(const Dtype* bbox, const bool normalized = true) {
311 |     if (bbox[2] < bbox[0] || bbox[3] < bbox[1]) {
312 |         // If bbox is invalid (e.g. xmax < xmin or ymax < ymin), return 0.
313 |         return Dtype(0.);
314 |     } else {
315 |         const Dtype width = bbox[2] - bbox[0];
316 |         const Dtype height = bbox[3] - bbox[1];
317 |         if (normalized) {
318 |             return width * height;
319 |         } else {
320 |             // If bbox is not within range [0, 1].
321 |             return (width + 1) * (height + 1);
322 |         }
323 |     }
324 | }
325 | 
326 | template <typename Dtype>
327 | Dtype JaccardOverlap(const Dtype* bbox1, const Dtype* bbox2) {
328 |     if (bbox2[0] > bbox1[2] || bbox2[2] < bbox1[0] ||
329 |             bbox2[1] > bbox1[3] || bbox2[3] < bbox1[1]) {
330 |         return Dtype(0.);
331 |     } else {
332 |         const Dtype inter_xmin = std::max(bbox1[0], bbox2[0]);
333 |         const Dtype inter_ymin = std::max(bbox1[1], bbox2[1]);
334 |         const Dtype inter_xmax = std::min(bbox1[2], bbox2[2]);
335 |         const Dtype inter_ymax = std::min(bbox1[3], bbox2[3]);
336 | 
337 |         const Dtype inter_width = inter_xmax - inter_xmin;
338 |         const Dtype inter_height = inter_ymax - inter_ymin;
339 |         const Dtype inter_size = inter_width * inter_height;
340 | 
341 |         const Dtype bbox1_size = BBoxSize(bbox1);
342 |         const Dtype bbox2_size = BBoxSize(bbox2);
343 | 
344 |         return inter_size / (bbox1_size + bbox2_size - inter_size);
345 |     }
346 | }
347 | 
348 | template <typename T>
349 | bool SortScorePairDescend(const pair<float, T>& pair1,
350 |                           const pair<float, T>& pair2) {
351 |     return pair1.first > pair2.first;
352 | }
353 | 
354 | template <typename Dtype>
355 | void GetMaxScoreIndex(const Dtype* scores, const int num, const float threshold,
356 |                       const int top_k, vector<pair<Dtype, int> >* score_index_vec) {
357 |     // Generate index score pairs.
358 |     for (int i = 0; i < num; ++i) {
359 |         if (scores[i] > threshold) {
360 |             score_index_vec->push_back(std::make_pair(scores[i], i));
361 |         }
362 |     }
363 | 
364 |     // Sort the score pair according to the scores in descending order
365 |     std::sort(score_index_vec->begin(), score_index_vec->end(),
366 |               SortScorePairDescend<int>);
367 | 
368 |     // Keep top_k scores if needed.
369 |     if (top_k > -1 && top_k < score_index_vec->size()) {
370 |         score_index_vec->resize(top_k);
371 |     }
372 | }
373 | 
374 | template <typename Dtype>
375 | void ApplyNMSFast(const Dtype* bboxes, const Dtype* scores, const int num,
376 |                   const float score_threshold, const float nms_threshold,
377 |                   const float eta, const int top_k, vector<int>* indices) {
378 |     // Get top_k scores (with corresponding indices).
379 |     vector<pair<Dtype, int> > score_index_vec;
380 |     //float n1 = cv::getTickCount();
381 |     GetMaxScoreIndex(scores, num, score_threshold, top_k, &score_index_vec);
382 |     // n1 = (cv::getTickCount()-n1) / cv::getTickFrequency();
383 |     //printf("======n==1 Forward_DetectionOutputLayer time is %f \n", n1);
384 | 
385 |     // Do nms.
386 |     float adaptive_threshold = nms_threshold;
387 |     indices->clear();
388 |     //float n2 = cv::getTickCount();
389 |     std::cout<<"======n==n" <<score_index_vec.size()<<std::endl;
390 |     while (score_index_vec.size() != 0) {
391 |         const int idx = score_index_vec.front().second;
392 |         bool keep = true;
393 |         for (int k = 0; k < indices->size(); ++k) {
394 |             if (keep) {
395 |                 const int kept_idx = (*indices)[k];
396 |                 float overlap = JaccardOverlap(bboxes + idx * 4, bboxes + kept_idx * 4);
397 |                 keep = overlap <= adaptive_threshold;
398 |             } else {
399 |                 break;
400 |             }
401 |         }
402 |         if (keep) {
403 |             indices->push_back(idx);
404 |         }
405 |         score_index_vec.erase(score_index_vec.begin());
406 |         if (keep && eta < 1 && adaptive_threshold > 0.5) {
407 |             adaptive_threshold *= eta;
408 |         }
409 |     }
410 |     //n2 = (cv::getTickCount()-n2) / cv::getTickFrequency();
411 |     //printf("======n==2 Forward_DetectionOutputLayer time is %f \n", n2);
412 | }
413 | 
414 | 
415 | void Forward_DetectionOutputLayer(float* loc_data, float* conf_data, float* prior_data, int num_priors_, int num_classes_, vector<vector<float> >* detecions) {
416 |     // Retrieve all location predictions.
417 |     /*vector<vector<float>> all_loc_preds;
418 |     GetLocPredictions(loc_data, num_priors_, num_loc_classes_, &all_loc_preds);
419 |     // Retrieve all confidences.
420 |     vector <vector<float>> all_conf_scores;
421 |     GetConfidenceScores(conf_data, num_priors_, num_classes_,
422 |                         &all_conf_scores);
423 |     // Retrieve all prior bboxes.
424 |     vector<vector<float>> prior_bboxes;
425 |     vector<vector<float>> prior_variances;
426 |     GetPriorBBoxes(prior_data, num_priors_, &prior_bboxes, &prior_variances);
427 |     // Decode all loc predictions to bboxes.
428 |     vector<vector<float>> all_decode_bboxes;
429 |     //const bool clip_bbox = false;
430 |     DecodeBBoxes(prior_bboxes, prior_variances, code_type_,
431 |                   variance_encoded_in_target_, clip_bbox, all_loc_preds,
432 |                  &all_decode_bboxes);*/
433 | 
434 | 
435 |     int num_kept = 0;
436 |     vector<map<int, vector<int> > > all_indices;
437 | 
438 |     map<int , vector<int>> indices;
439 |     int num_det = 0;
440 |     const int conf_idx = num_classes_ * num_priors_;
441 |     const bool share_location_ = true;
442 |     const int num_loc_classes = 1;
443 |     int background_label_id_ = 0;
444 |     float confidence_threshold_ = 0.1;
445 |     float nms_threshold_ = 0.45;
446 |     float eta_ = 1.0;//默认1.0
447 |     int top_k_ = 400;
448 |     int keep_top_k_ = 200;
449 | 
450 |     const int code_type = 1;//center
451 |     const bool variance_encoded_in_target = false;//default
452 |     const bool clip_bbox = false;
453 | 
454 |     float* decode_bboxes = new float[4 * num_priors_];
455 |     float t = cv::getTickCount();
456 |     DecodeBBoxes_2<float>(loc_data, prior_data, code_type, variance_encoded_in_target, num_priors_, share_location_, num_loc_classes,background_label_id_, clip_bbox, decode_bboxes);
457 |     t = (cv::getTickCount()-t) / cv::getTickFrequency();
458 |     printf("======1 Forward_DetectionOutputLayer time is %f \n", t);
459 |     float* new_conf_data = new float[num_priors_ * num_classes_];
460 |     float t1 = cv::getTickCount();
461 |     ConfData(conf_data, num_classes_, num_priors_, new_conf_data);
462 |     t1 = (cv::getTickCount()-t1) / cv::getTickFrequency();
463 |     printf("======2 Forward_DetectionOutputLayer time is %f \n", t1);
464 | 
465 |     float t2 = cv::getTickCount();
466 |     for(int c = 0; c < num_classes_; c++){
467 |         if(c == background_label_id_){
468 |             continue;
469 |         }
470 |         float* cur_conf_data = new_conf_data + c * num_priors_;
471 |         //float* cur_bbox_data = all_decode_bboxes
472 |         float tt = cv::getTickCount();
473 |         ApplyNMSFast<float>(decode_bboxes, cur_conf_data, num_priors_,
474 |                             confidence_threshold_, nms_threshold_, eta_, top_k_, &(indices[c]));
475 |         tt = (cv::getTickCount()-tt) / cv::getTickFrequency();
476 |         std::cout<<"===nms==="<<c<<"==nms=="<<std::endl;
477 |         printf("======nms Forward_DetectionOutputLayer time is %f \n", tt);
478 |         num_det += indices[c].size();
479 |     }
480 |     t2 = (cv::getTickCount()-t2) / cv::getTickFrequency();
481 |     printf("======3 Forward_DetectionOutputLayer time is %f \n", t2);
482 | 
483 |     float t3 = cv::getTickCount();
484 |     if(keep_top_k_ > -1 && num_det > keep_top_k_){
485 |         vector<pair<float, pair<int, int> > > score_index_pairs;
486 |         for(map<int, vector<int> >::iterator it = indices.begin(); it != indices.end(); ++it){
487 |             int label = it->first;
488 |             const vector<int>& label_indices = it->second;
489 |             for(int j = 0; j < label_indices.size(); ++j){
490 |                 int idx = label_indices[j];
491 |                 float score = new_conf_data[label * num_priors_ + idx];
492 |                 score_index_pairs.push_back(std::make_pair(score, std::make_pair(label, idx)));
493 |             }
494 |         }
495 |         // Keep top k results per image.
496 |         std::sort(score_index_pairs.begin(), score_index_pairs.end(), SortScorePairDescend<pair<int, int> >);
497 |         score_index_pairs.resize(keep_top_k_);
498 |         // Store the new indices.
499 |         map<int, vector<int> > new_indices;
500 |         for(int j = 0; j < score_index_pairs.size(); ++j){
501 |             int label = score_index_pairs[j].second.first;
502 |             int idx = score_index_pairs[j].second.second;
503 |             new_indices[label].push_back(idx);
504 |         }
505 |         all_indices.push_back(new_indices);
506 |         num_kept += keep_top_k_;
507 |     }else{
508 |         all_indices.push_back(indices);
509 |         num_kept += num_det;
510 |     }
511 |     if(num_kept == 0){
512 |         printf("Couldn't find any detections");
513 |     }else{
514 |         for(map<int, vector<int> >::iterator it = all_indices[0].begin(); it != all_indices[0].end(); ++it){
515 |             int label = it->first;
516 |             vector<int>& _indices = it->second;
517 |             const float* _cur_conf_data = new_conf_data + label * num_priors_;
518 | 
519 |             for(int j = 0; j < _indices.size(); ++j){
520 |                 int idx = _indices[j];
521 |                 vector<float> detect;
522 |                 for(int k = 0; k < 4; ++k){
523 |                     detect.push_back(decode_bboxes[idx * 4 + k]);
524 |                 }
525 |                 detect.push_back(_cur_conf_data[idx]);
526 |                 detect.push_back(label);
527 |                 detecions->push_back(detect);
528 |             }
529 |         }
530 |     }
531 |     t3 = (cv::getTickCount()-t3) / cv::getTickFrequency();
532 |     printf("======4 Forward_DetectionOutputLayer time is %f \n", t3);
533 | 
534 |     delete[] decode_bboxes;
535 |     delete[] new_conf_data;
536 | }
537 | 


--------------------------------------------------------------------------------
/common.h:
--------------------------------------------------------------------------------
 1 | #ifndef _TRT_COMMON_H_
 2 | #define _TRT_COMMON_H_
 3 | #include "NvInfer.h"
 4 | #include <string>
 5 | #include <vector>
 6 | #include <map>
 7 | #include <cmath>
 8 | #include <algorithm>
 9 | #include <fstream>
10 | #include <cassert>
11 | #include <iostream>
12 | 
13 | #include <opencv2/core.hpp>
14 | #include <opencv2/highgui/highgui.hpp>
15 | #include <opencv2/imgproc/imgproc.hpp>
16 | 
17 | #define CHECK(status)									\
18 | {														\
19 | 	if (status != 0)									\
20 | 	{													\
21 | 		std::cout << "Cuda failure: " << status;		\
22 | 		abort();										\
23 | 	}													\
24 | }
25 | using namespace std;
26 | 
27 | 
28 | std::string locateFile(const std::string& input, const std::vector<std::string> & directories);
29 | void readPGMFile(const std::string& fileName,  uint8_t *buffer, int inH, int inW);
30 | void Forward_DetectionOutputLayer(float* loc_data, float* conf_data, float* prior_data, int  num_priors_, int num_classes_, vector<vector<float> >* detecions);
31 | #endif // _TRT_COMMON_H_
32 | 


--------------------------------------------------------------------------------
/cudaUtility.h:
--------------------------------------------------------------------------------
 1 | #ifndef __CUDA_UTILITY_H_
 2 | #define __CUDA_UTILITY_H_
 3 | 
 4 | 
 5 | #include <cuda_runtime.h>
 6 | #include <cuda.h>
 7 | #include <stdio.h>
 8 | #include <string.h>
 9 | 
10 | 
11 | /**
12 |  * Execute a CUDA call and print out any errors
13 |  * @return the original cudaError_t result
14 |  * @ingroup util
15 |  */
16 | #define CUDA(x)				cudaCheckError((x), #x, __FILE__, __LINE__)
17 | 
18 | /**
19 |  * Evaluates to true on success
20 |  * @ingroup util
21 |  */
22 | #define CUDA_SUCCESS(x)			(CUDA(x) == cudaSuccess)
23 | 
24 | /**
25 |  * Evaluates to true on failure
26 |  * @ingroup util
27 |  */
28 | #define CUDA_FAILED(x)			(CUDA(x) != cudaSuccess)
29 | 
30 | /**
31 |  * Return from the boolean function if CUDA call fails
32 |  * @ingroup util
33 |  */
34 | #define CUDA_VERIFY(x)			if(CUDA_FAILED(x))	return false;
35 | 
36 | /**
37 |  * LOG_CUDA string.
38 |  * @ingroup util
39 |  */
40 | #define LOG_CUDA "[cuda]   "
41 | 
42 | /*
43 |  * define this if you want all cuda calls to be printed
44 |  */
45 | //#define CUDA_TRACE
46 | 
47 | 
48 | 
49 | /**
50 |  * cudaCheckError
51 |  * @ingroup util
52 |  */
53 | inline cudaError_t cudaCheckError(cudaError_t retval, const char* txt, const char* file, int line )
54 | {
55 | #if !defined(CUDA_TRACE)
56 | 	if( retval == cudaSuccess)
57 | 		return cudaSuccess;
58 | #endif
59 | 
60 | 	//int activeDevice = -1;
61 | 	//cudaGetDevice(&activeDevice);
62 | 
63 | 	//Log("[cuda]   device %i  -  %s\n", activeDevice, txt);
64 | 	
65 | 	printf(LOG_CUDA "%s\n", txt);
66 | 
67 | 
68 | 	if( retval != cudaSuccess )
69 | 	{
70 | 		printf(LOG_CUDA "   %s (error %u) (hex 0x%02X)\n", cudaGetErrorString(retval), retval, retval);
71 | 		printf(LOG_CUDA "   %s:%i\n", file, line);	
72 | 	}
73 | 
74 | 	return retval;
75 | }
76 | 
77 | 
78 | /**
79 |  * iDivUp
80 |  * @ingroup util
81 |  */
82 | inline __device__ __host__ int iDivUp( int a, int b )  		{ return (a % b != 0) ? (a / b + 1) : (a / b); }
83 | 
84 | 
85 | 
86 | #endif
87 | 


--------------------------------------------------------------------------------
/imageBuffer.h:
--------------------------------------------------------------------------------
 1 | #ifndef IMAGEBUFFER_H
 2 | #define IMAGEBUFFER_H
 3 | 
 4 | #include <opencv2/opencv.hpp>
 5 | #include <mutex>
 6 | #include <condition_variable>
 7 | #include <queue>
 8 | 
 9 | 
10 | template<typename T>
11 | class ConsumerProducerQueue
12 | {
13 | 
14 | public:
15 |     ConsumerProducerQueue(int mxsz,bool dropFrame) :
16 |             maxSize(mxsz),dropFrame(dropFrame)
17 |     { }
18 | 
19 |     bool add(T request)
20 |     {
21 |         std::unique_lock<std::mutex> lock(mutex);
22 |         if(dropFrame && isFull())
23 |         {
24 |             lock.unlock();
25 | 			return false;
26 |         }
27 |         else {
28 |             cond.wait(lock, [this]() { return !isFull(); });
29 |             cpq.push(request);
30 |             //lock.unlock();
31 |             cond.notify_all();
32 |             return true;
33 |         }
34 |     }
35 | 
36 |     void consume(T &request)
37 |     {
38 |         std::unique_lock<std::mutex> lock(mutex);
39 |         cond.wait(lock, [this]()
40 |         { return !isEmpty(); });
41 |         request = cpq.front();
42 |         cpq.pop();
43 |         //lock.unlock();
44 |         cond.notify_all();
45 | 
46 |     }
47 | 
48 |     bool isFull() const
49 |     {
50 |         return cpq.size() >= maxSize;
51 |     }
52 | 
53 |     bool isEmpty() const
54 |     {
55 |         return cpq.size() == 0;
56 |     }
57 | 
58 |     int length() const
59 |     {
60 |         return cpq.size();
61 |     }
62 | 
63 |     void clear()
64 |     {
65 |         std::unique_lock<std::mutex> lock(mutex);
66 |         while (!isEmpty())
67 |         {
68 |             cpq.pop();
69 |         }
70 |         lock.unlock();
71 |         cond.notify_all();
72 |     }
73 | 
74 | private:
75 |     std::condition_variable cond;
76 |     std::mutex mutex;
77 |     std::queue<T> cpq;
78 |     int maxSize;
79 |     bool dropFrame;
80 | };
81 | 
82 | 
83 | 
84 | #endif
85 | 


--------------------------------------------------------------------------------
/kernel.cu:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * http://github.com/dusty-nv/jetson-inference
  3 |  */
  4 | 
  5 | #include "cuda/cudaUtility.h"
  6 | #include <iostream>
  7 | 
  8 | 
  9 | // gpuPreImageNet
 10 | __global__ void gpuPreImageNet( float2 scale, float4* input, int iWidth, float* output, int oWidth, int oHeight )
 11 | {
 12 | 	const int x = blockIdx.x * blockDim.x + threadIdx.x;
 13 | 	const int y = blockIdx.y * blockDim.y + threadIdx.y;
 14 | 	const int n = oWidth * oHeight;
 15 | 	
 16 | 	if( x >= oWidth || y >= oHeight )
 17 | 		return;
 18 | 
 19 | 	const int dx = ((float)x * scale.x);
 20 | 	const int dy = ((float)y * scale.y);
 21 | 
 22 | 	const float4 px  = input[ dy * iWidth + dx ];
 23 | 	const float3 bgr = make_float3(px.z, px.y, px.x);
 24 | 	
 25 | 	output[n * 0 + y * oWidth + x] = bgr.x;
 26 | 	output[n * 1 + y * oWidth + x] = bgr.y;
 27 | 	output[n * 2 + y * oWidth + x] = bgr.z;
 28 | }
 29 | 
 30 | // cudaPreImageNet
 31 | cudaError_t cudaPreImageNet( float4* input, size_t inputWidth, size_t inputHeight,
 32 | 				         float* output, size_t outputWidth, size_t outputHeight )
 33 | {
 34 | 	if( !input || !output )
 35 | 		return cudaErrorInvalidDevicePointer;
 36 | 
 37 | 	if( inputWidth == 0 || outputWidth == 0 || inputHeight == 0 || outputHeight == 0 )
 38 | 		return cudaErrorInvalidValue;
 39 | 
 40 | 	const float2 scale = make_float2( float(inputWidth) / float(outputWidth),
 41 | 							    float(inputHeight) / float(outputHeight) );
 42 | 
 43 | 	// launch kernel
 44 | 	const dim3 blockDim(8, 8);
 45 | 	const dim3 gridDim(iDivUp(outputWidth,blockDim.x), iDivUp(outputHeight,blockDim.y));
 46 | 
 47 | 	gpuPreImageNet<<<gridDim, blockDim>>>(scale, input, inputWidth, output, outputWidth, outputHeight);
 48 | 
 49 | 	return CUDA(cudaGetLastError());
 50 | }
 51 | 
 52 | // gpuPreImageNetMean
 53 | __global__ void gpuPreImageNetMean( float2 scale, float3* input, int iWidth, float* output, int oWidth, int oHeight, float3 mean_value )
 54 | {
 55 | 	const int x = blockIdx.x * blockDim.x + threadIdx.x;
 56 | 	const int y = blockIdx.y * blockDim.y + threadIdx.y;
 57 | 	const int n = oWidth * oHeight;
 58 | 	
 59 | 	if( x >= oWidth || y >= oHeight )
 60 | 		return;
 61 | 
 62 | 	const int dx = ((float)x * scale.x);
 63 | 	const int dy = ((float)y * scale.y);
 64 | 
 65 | 	const float3 px  = input[ dy * iWidth + dx ];
 66 | 	const float3 bgr = make_float3(px.z - mean_value.x, px.y - mean_value.y, px.x - mean_value.z);
 67 | 	
 68 | 	output[n * 0 + y * oWidth + x] = bgr.x;
 69 | 	output[n * 1 + y * oWidth + x] = bgr.y;
 70 | 	output[n * 2 + y * oWidth + x] = bgr.z;
 71 | }
 72 | 
 73 | // cudaPreImageNetMean
 74 | cudaError_t cudaPreImageNetMean( float3* input, size_t inputWidth, size_t inputHeight,
 75 | 				             float* output, size_t outputWidth, size_t outputHeight, const float3& mean_value )
 76 | 
 77 | {
 78 | 	if( !input || !output ){
 79 |         std::cout << "error here. "<< std::endl;
 80 |         return cudaErrorInvalidDevicePointer;
 81 |     }
 82 | 
 83 | 	if( inputWidth == 0 || outputWidth == 0 || inputHeight == 0 || outputHeight == 0 ){
 84 |         std::cout << "Or here. " << std::endl;
 85 |         return cudaErrorInvalidValue;
 86 |     }
 87 | 
 88 | 
 89 | 	const float2 scale = make_float2( float(inputWidth) / float(outputWidth),
 90 | 							    float(inputHeight) / float(outputHeight) );
 91 | 
 92 | 
 93 | 	// launch kernel
 94 | 
 95 | 	const dim3 blockDim(8, 8);
 96 | 	const dim3 gridDim(iDivUp(outputWidth,blockDim.x), iDivUp(outputHeight,blockDim.y));
 97 | 
 98 | 	gpuPreImageNetMean<<<gridDim, blockDim>>>(scale, input, inputWidth, output, outputWidth, outputHeight, mean_value);
 99 | 
100 | 	return CUDA(cudaGetLastError());
101 | 
102 | }
103 | 
104 | __global__ void kernel_extract_roi(float* input, float* output, char* mean,
105 |     const int input_w, const int output_w, const int output_h,
106 |     const int in_plane_r, const int in_plane_g, const int in_plane_b,
107 |     const int out_plane_r, const int out_plane_g, const int out_plane_b,
108 |     const int bbox_x, const int bbox_y, const int bbox_w, const int bbox_h)
109 | {
110 |     uint x = blockIdx.x * blockDim.x + threadIdx.x;
111 |     uint y = blockIdx.y * blockDim.y + threadIdx.y;
112 | 
113 |     if( x < output_w && y < output_h)
114 |     {
115 |         float r[2] = { float(x) * bbox_w / output_w + bbox_x,
116 |                        float(y) * bbox_h / output_h + bbox_y };
117 | 
118 |         int   pos[4][2] = { { int(floor(r[0])), int(floor(r[1])) },
119 |                             { int( ceil(r[0])), int(floor(r[1])) },
120 |                             { int(floor(r[0])),  int(ceil(r[1])) },
121 |                             { int( ceil(r[0])),  int(ceil(r[1])) } };
122 | 
123 |         float u = r[0]-floor(r[0]);
124 |         float v = r[1]-floor(r[1]);
125 | 
126 |         float s[4] = { (1-u)*(1-v), u*(1-v), (1-u)*v, u*v };
127 | 
128 |         int map[4] = { pos[0][1]*input_w + pos[0][0], pos[1][1]*input_w + pos[1][0],
129 |                        pos[2][1]*input_w + pos[2][0], pos[3][1]*input_w + pos[3][0]};
130 | 
131 |         int idx = y * output_w + x;
132 |         output[idx+out_plane_r] = round( s[0]*input[map[0]+in_plane_r]
133 |                                        + s[1]*input[map[1]+in_plane_r]
134 |                                        + s[2]*input[map[2]+in_plane_r]
135 |                                        + s[3]*input[map[3]+in_plane_r] );// float(mean[idx+out_plane_r]));
136 |         output[idx+out_plane_g] = round( s[0]*input[map[0]+in_plane_g]
137 |                                        + s[1]*input[map[1]+in_plane_g]
138 |                                        + s[2]*input[map[2]+in_plane_g]
139 |                                        + s[3]*input[map[3]+in_plane_g] );//float(mean[idx+out_plane_g]));
140 |         output[idx+out_plane_b] = round( s[0]*input[map[0]+in_plane_b]
141 |                                        + s[1]*input[map[1]+in_plane_b]
142 |                                        + s[2]*input[map[2]+in_plane_b]
143 |                                        + s[3]*input[map[3]+in_plane_b] );//float(mean[idx+out_plane_b]));
144 |     }
145 | }
146 | 
147 | void convertROI(float* input, float* output, char* mean, const int* srcSize, const int* dstSize, const int* roi, cudaStream_t stream)
148 | {
149 |     int in_plane_r = 0;
150 |     int in_plane_g = srcSize[1] * srcSize[2];
151 |     int in_plane_b = srcSize[1] * srcSize[2] * 2;
152 | 
153 |     int out_plane_r = 0;
154 |     int out_plane_g = dstSize[1] * dstSize[2];
155 |     int out_plane_b = dstSize[1] * dstSize[2] * 2;
156 | 
157 |     int bbox_x = min(max(roi[0], 0), srcSize[2]-1);
158 |     int bbox_y = min(max(roi[1], 0), srcSize[1]-1);
159 |     int bbox_w = min(max(roi[2]-roi[0], 0), srcSize[2]-bbox_x-1 );
160 |     int bbox_h = min(max(roi[3]-roi[1], 0), srcSize[1]-bbox_y-1 );
161 | 
162 |     dim3 dimBlock(32,32);
163 |     dim3 dimGrid(dstSize[2]/dimBlock.x+1, dstSize[1]/dimBlock.y+1);
164 | 
165 |     std::cout << "ROI: " << bbox_x << " " << bbox_y << " " << bbox_w << " " << bbox_h << std::endl;
166 | 
167 |     kernel_extract_roi <<< dimGrid, dimBlock, 0, stream >>> (input, output, mean,
168 |                        srcSize[2], dstSize[2], dstSize[1],
169 |                        in_plane_r,   in_plane_g,  in_plane_b,
170 |                        out_plane_r, out_plane_g, out_plane_b,
171 |                        bbox_x, bbox_y, bbox_w, bbox_h);
172 | }
173 | 
174 | 
175 | __global__  void kernelSoftmax( float* x, int channels, float* y)
176 | {
177 | 
178 | 	extern __shared__ float mem[];
179 |     __shared__ float sum_value;
180 | 	sum_value=0;
181 | 	float number = *(x + blockDim.x*blockIdx.x + threadIdx.x);
182 | 	float number_exp = __expf(number);
183 | 
184 | //    sum_value += number_exp ;
185 |     /* *
186 |      * @TODO: Can do with the help of atomicAdd.
187 |      * */
188 |     atomicAdd(&sum_value, number_exp);
189 |     __syncthreads();
190 | 
191 | //	mem[threadIdx.x] = number_exp;
192 | 
193 |     /* *
194 |      * @TODO: Can do with the help of a for loop. Try different methods and find the time taken.
195 |      * */
196 |     //	float sum = 0.0f;
197 |     //	for (int i=0;i<channels;i++)
198 |     //	{
199 |     //		sum += mem[i];
200 |     //	}
201 | 
202 | 	y[blockDim.x*blockIdx.x + threadIdx.x] = __fdiv_rd(number_exp, sum_value);
203 | 
204 | }
205 | 
206 | void cudaSoftmax(int n, int channels,  float* x, float*y)
207 | {
208 | 	kernelSoftmax<<< (n/channels), channels, channels*sizeof(float)>>>( x, channels, y);
209 | 	cudaDeviceSynchronize();
210 | }
211 | 
212 | 


--------------------------------------------------------------------------------
/main.cpp:
--------------------------------------------------------------------------------
  1 | #include "common.h"
  2 | #include "cudaUtility.h"
  3 | #include "mathFunctions.h"
  4 | #include "pluginImplement.h"
  5 | #include "tensorNet.h"
  6 | #include "loadImage.h"
  7 | #include "imageBuffer.h"
  8 | #include <chrono>
  9 | #include <thread>
 10 | 
 11 | #define BOUND(a,min_val,max_val)           ( (a < min_val) ? min_val : (a >= max_val) ? (max_val) : a 
 12 | 
 13 | const char* model  = "model/pelee/pelee_deploy_iplugin.prototxt";
 14 | const char* weight = "model/pelee/pelee_merged.caffemodel";
 15 | 
 16 | const char* INPUT_BLOB_NAME = "data";
 17 | 
 18 | const char* OUTPUT_BLOB_NAME = "detection_out";
 19 | static const uint32_t BATCH_SIZE = 1;
 20 | volatile bool endvideo = false;
 21 | bool csi_cam = false;
 22 | //image buffer size = 10
 23 | //dropFrame = false
 24 | ConsumerProducerQueue<cv::Mat> *imageBuffer = new ConsumerProducerQueue<cv::Mat>(5,csi_cam);
 25 | 
 26 | class Timer {
 27 | public:
 28 |     void tic() {
 29 |         start_ticking_ = true;
 30 |         start_ = std::chrono::high_resolution_clock::now();
 31 |     }
 32 |     void toc() {
 33 |         if (!start_ticking_)return;
 34 |         end_ = std::chrono::high_resolution_clock::now();
 35 |         start_ticking_ = false;
 36 |         t = std::chrono::duration<double, std::milli>(end_ - start_).count();
 37 |         //std::cout << "Time: " << t << " ms" << std::endl;
 38 |     }
 39 |     double t;
 40 | private:
 41 |     bool start_ticking_ = false;
 42 |     std::chrono::time_point<std::chrono::high_resolution_clock> start_;
 43 |     std::chrono::time_point<std::chrono::high_resolution_clock> end_;
 44 | };
 45 | 
 46 | 
 47 | /* *
 48 |  * @TODO: unifiedMemory is used here under -> ( cudaMallocManaged )
 49 |  * */
 50 | float* allocateMemory(DimsCHW dims, char* info)
 51 | {
 52 |     float* ptr;
 53 |     size_t size;
 54 |     std::cout << "Allocate memory: " << info << std::endl;
 55 |     size = BATCH_SIZE * dims.c() * dims.h() * dims.w();
 56 |     assert(!cudaMallocManaged( &ptr, size*sizeof(float)));
 57 |     return ptr;
 58 | }
 59 | 
 60 | 
 61 | void loadImg( cv::Mat &input, int re_width, int re_height, float *data_unifrom,const float3 mean,const float scale )
 62 | {
 63 |     int i;
 64 |     int j;
 65 |     int line_offset;
 66 |     int offset_g;
 67 |     int offset_r;
 68 |     cv::Mat dst;
 69 | 
 70 |     unsigned char *line = NULL;
 71 |     float *unifrom_data = data_unifrom;
 72 | 
 73 |     cv::resize( input, dst, cv::Size( re_width, re_height ), cv::INTER_LINEAR );
 74 |     offset_g = re_width * re_height;
 75 |     offset_r = re_width * re_height * 2;
 76 | 	//#pragma omp parallel for
 77 |     for( i = 0; i < re_height; ++i )
 78 |     {
 79 |         line = dst.ptr< unsigned char >( i );
 80 |         line_offset = i * re_width;
 81 |         for( j = 0; j < re_width; ++j )
 82 |         {
 83 |             // b
 84 |             unifrom_data[ line_offset + j  ] = (( float )(line[ j * 3 ] - mean.x) * scale);
 85 |             // g
 86 |             unifrom_data[ offset_g + line_offset + j ] = (( float )(line[ j * 3 + 1 ] - mean.y) * scale);
 87 |             // r
 88 |             unifrom_data[ offset_r + line_offset + j ] = (( float )(line[ j * 3 + 2 ] - mean.z) * scale);
 89 |         }
 90 |     }
 91 | }
 92 | std::string gstreamer_pipeline (int capture_width, int capture_height, int display_width, int display_height, int framerate, int flip_method) {
 93 |     return "nvarguscamerasrc ! video/x-raw(memory:NVMM), width=(int)" + std::to_string(capture_width) + ", height=(int)" +
 94 |            std::to_string(capture_height) + ", format=(string)NV12, framerate=(fraction)" + std::to_string(framerate) +
 95 |            "/1 ! nvvidconv flip-method=" + std::to_string(flip_method) + " ! video/x-raw, width=(int)" + std::to_string(display_width) + ", height=(int)" +
 96 |            std::to_string(display_height) + ", format=(string)BGRx ! videoconvert ! video/x-raw, format=(string)BGR ! appsink";
 97 | }
 98 | //thread read video
 99 | void readPicture()
100 | {
101 | 	cv::VideoCapture cap;
102 | 	if(csi_cam) {
103 | 		int capture_width = 1280 ;
104 | 		int capture_height = 720 ;
105 | 		int display_width = 1280 ;
106 | 		int display_height = 720 ;
107 | 		int framerate = 30 ;
108 | 		int flip_method = 0 ;
109 | 
110 | 		std::string pipeline = gstreamer_pipeline(capture_width,
111 | 		capture_height,
112 | 		display_width,
113 | 		display_height,
114 | 		framerate,
115 | 		flip_method);
116 | 		std::cout << "Using pipeline: \n\t" << pipeline << "\n";
117 | 		cap = cv::VideoCapture(pipeline, cv::CAP_GSTREAMER);
118 | 	}
119 | 	else {
120 | 		cap = cv::VideoCapture("testVideo/test.avi");
121 | 	}
122 |     
123 |     cv::Mat image;
124 |     while(cap.isOpened())
125 |     {
126 |         cap >> image;
127 |         if(image.empty()) {
128 |             endvideo = true;
129 |             break;
130 |         }
131 |         if(!imageBuffer->add(image)) {
132 | 			image.release();
133 | 		}
134 |     }
135 | }
136 | 
137 | void MatMul(cv::Mat img1, cv::Mat img2,int r,int g,int b , bool show_mode = false)
138 | {
139 |   int i, j;
140 |   int height = img1.rows;
141 |   int width = img1.cols;
142 |   //LOG(INFO) << width << "," << height << "," << img2.rows << "," << img2.cols;
143 |   //#pragma omp parallel for
144 | 
145 |   for (i = 0; i < height; i++) {
146 |     unsigned char* ptr1 = img1.ptr<unsigned char>(i);
147 |     const unsigned char* ptr2 = img2.ptr<unsigned char>(i);
148 |     int img_index1 = 0;
149 |     int img_index2 = 0;
150 |     for (j = 0; j < width; j++) {
151 |       if(ptr2[img_index2]>90) {
152 |         if(show_mode) {
153 |           ptr1[img_index1] = b;
154 |           ptr1[img_index1+1] = g;
155 |           ptr1[img_index1+2] = r;
156 |         }
157 |         else {
158 |           ptr1[img_index1] = b/2 + ptr1[img_index1]/2;
159 |           ptr1[img_index1+1] = g/2 + ptr1[img_index1]/2;
160 |           ptr1[img_index1+2] = r/2 + ptr1[img_index1]/2;
161 |         }
162 | 
163 |       }
164 |       //ptr1[img_index1+idx] = (unsigned char) BOUND(ptr1[img_index1] + ptr2[img_index2] * 1.0,0,255);
165 |       //ptr1[img_index1+1] = (ptr2[img_index2]);
166 |       //ptr1[img_index1+2] = (unsigned char) BOUND(ptr1[img_index1+2] + (255-ptr2[img_index2]) * 0.4,0,255);
167 |       //ptr1[img_index1+2] = (unsigned char) BOUND((ptr2[img_index2]) ,0,255);
168 |       img_index1+=3;
169 |       img_index2++;
170 |     }
171 |   }
172 | 
173 | }
174 | int main(int argc, char *argv[])
175 | {
176 |     std::vector<std::string> output_vector = {OUTPUT_BLOB_NAME,"sigmoid"};
177 |     TensorNet tensorNet;
178 |     tensorNet.LoadNetwork(model,weight,INPUT_BLOB_NAME, output_vector,BATCH_SIZE);
179 | 
180 |     DimsCHW dimsData = tensorNet.getTensorDims(INPUT_BLOB_NAME);
181 |     DimsCHW dimsOut  = tensorNet.getTensorDims(OUTPUT_BLOB_NAME);
182 | 	DimsCHW dimsOut2  = tensorNet.getTensorDims("sigmoid");
183 |     float* data    = allocateMemory( dimsData , (char*)"input blob");
184 |     std::cout << "allocate data" << std::endl;
185 |     float* output  = allocateMemory( dimsOut  , (char*)"output blob");
186 |     std::cout << "allocate output" << std::endl;
187 | 	float* output2  = allocateMemory( dimsOut2  , (char*)"output blob 2");
188 |     std::cout << "allocate output2" << std::endl;
189 |     int height = 304;
190 |     int width  = 304;
191 | 	void* imgCPU;
192 |     void* imgCUDA;
193 | 	const size_t size = width * height * sizeof(float3);
194 | 
195 | 	if( CUDA_FAILED( cudaMalloc( &imgCUDA, size)) )
196 | 	{
197 | 		cout <<"Cuda Memory allocation error occured."<<endl;
198 | 		return false;
199 | 	}
200 |     cv::Mat frame,srcImg;
201 | 
202 | 	
203 |     Timer timer;
204 |     std::thread readTread(readPicture);
205 |     readTread.detach();
206 |     double msTime_avg = 0.;
207 |     int count = 0;
208 |     int ch_size = dimsOut2.c();
209 | 	std::vector<cv::Mat> seg_img;
210 | 	for(int i = 0; i<ch_size;i++) {   
211 | 		seg_img.push_back(cv::Mat(76, 76, CV_8UC1));
212 | 	}
213 | 	std::vector<int> color = {128,255,128,244,35,232};
214 |     while(1){
215 | 		if(endvideo && imageBuffer->isEmpty()) {
216 | 			break;
217 | 		}
218 | 		imageBuffer->consume(frame);
219 | 
220 | 		if(!frame.rows) {
221 | 			break;
222 | 		}
223 | 		//srcImg = frame.clone();
224 | 		cv::resize(frame, srcImg, cv::Size(304,304));
225 | 
226 | 
227 | 		void* imgData = malloc(size);
228 | 		//memset(imgData,0,size);
229 | 		
230 | 		loadImg(srcImg,height,width,(float*)imgData,make_float3(103.94,116.78,123.68),0.017);
231 | 		
232 | 		cudaMemcpyAsync(imgCUDA,imgData,size,cudaMemcpyHostToDevice);
233 | 		
234 | 		void* buffers[] = { imgCUDA, output , output2}; 
235 | 
236 | 		
237 | 
238 | 		timer.tic();
239 | 		tensorNet.imageInference( buffers, output_vector.size() + 1, BATCH_SIZE);
240 | 		timer.toc();
241 | 		double msTime = timer.t;
242 | 		
243 | 		msTime_avg+= msTime;
244 | 		count++;
245 | 		std::cout<<msTime_avg/(float)count<< std::endl;	
246 | 		vector<vector<float> > detections;
247 | 
248 | 		for (int k=0; k<100; k++)
249 | 		{
250 | 			if(output[7*k+1] == -1)
251 | 				break;
252 | 			float classIndex = output[7*k+1];
253 | 			float confidence = output[7*k+2];
254 | 			float xmin = output[7*k + 3];
255 | 			float ymin = output[7*k + 4];
256 | 			float xmax = output[7*k + 5];
257 | 			float ymax = output[7*k + 6];
258 | 			//std::cout << classIndex << " , " << confidence << " , "  << xmin << " , " << ymin<< " , " << xmax<< " , " << ymax << std::endl;
259 | 			int x1 = static_cast<int>(xmin * frame.cols);
260 | 			int y1 = static_cast<int>(ymin * frame.rows);
261 | 			int x2 = static_cast<int>(xmax * frame.cols);
262 | 			int y2 = static_cast<int>(ymax * frame.rows);
263 | 			cv::rectangle(frame,cv::Rect2f(cv::Point(x1,y1),cv::Point(x2,y2)),cv::Scalar(255,0,255),1);
264 | 
265 | 		}
266 | 		int scale = 4;
267 | 
268 | 		int w = width / scale;
269 | 		int h = height / scale;
270 | 		
271 | 		for(int c = 0; c<seg_img.size();c++) { 
272 | 			int img_index1 = 0;  
273 | 			for (int y = 0; y < h; y++) {
274 | 				uchar* ptr2 = seg_img[c].ptr<uchar>(y);
275 | 				int img_index2 = 0;
276 | 				for (int j = 0; j < w; j++) {
277 | 					int val = output2[img_index1+c*w*h] * 255;
278 | 					if (val>255) val = 255; 
279 | 					if (val<0) val = 0;
280 | 					ptr2[img_index2] = (unsigned char)val;
281 | 					//if(c==1)
282 | 					//  printf("%f\n",result2[img_index1+c*w*h]);
283 | 					img_index1++;
284 | 					img_index2++;
285 | 				}
286 | 			}
287 | 		}
288 | 		cv::Mat seg_img_resized;
289 | 		for(int i=0;i<seg_img.size();i++) {
290 |           cv::resize(seg_img[i], seg_img_resized, cv::Size(frame.cols, frame.rows),cv::INTER_AREA);
291 |           int color_index = (i)*3;
292 |           MatMul(frame, seg_img_resized,color[color_index],color[color_index+1],color[color_index+2]);
293 |         }
294 | 		//cv::namedWindow("show", cv::WINDOW_NORMAL);
295 | 		//cv::resizeWindow("show", 400, 400);
296 | 		cv::imshow("show", frame);
297 | 		cv::waitKey(1);
298 | 		free(imgData);
299 | 		frame.release();
300 | 		srcImg.release();
301 |     }
302 |     cudaFree(imgCUDA);
303 |     cudaFreeHost(imgCPU);
304 |     cudaFree(output);
305 |     tensorNet.destroy();
306 |     return 0;
307 | }
308 | 


--------------------------------------------------------------------------------
/mathFunctions.cpp:
--------------------------------------------------------------------------------
 1 | 
 2 | #include "mathFunctions.h"
 3 | 
 4 | 
 5 | const char* cublasGetErrorString(cublasStatus_t error){  
 6 |   switch (error) {  
 7 |   case CUBLAS_STATUS_SUCCESS:  
 8 |     return "CUBLAS_STATUS_SUCCESS";  
 9 |   case CUBLAS_STATUS_NOT_INITIALIZED:  
10 |     return "CUBLAS_STATUS_NOT_INITIALIZED";  
11 |   case CUBLAS_STATUS_ALLOC_FAILED:  
12 |     return "CUBLAS_STATUS_ALLOC_FAILED";  
13 |   case CUBLAS_STATUS_INVALID_VALUE:  
14 |     return "CUBLAS_STATUS_INVALID_VALUE";  
15 |   case CUBLAS_STATUS_ARCH_MISMATCH:  
16 |     return "CUBLAS_STATUS_ARCH_MISMATCH";  
17 |   case CUBLAS_STATUS_MAPPING_ERROR:  
18 |     return "CUBLAS_STATUS_MAPPING_ERROR";  
19 |   case CUBLAS_STATUS_EXECUTION_FAILED:  
20 |     return "CUBLAS_STATUS_EXECUTION_FAILED";  
21 |   case CUBLAS_STATUS_INTERNAL_ERROR:  
22 |     return "CUBLAS_STATUS_INTERNAL_ERROR";  
23 | #if CUDA_VERSION >= 6000  
24 |   case CUBLAS_STATUS_NOT_SUPPORTED:  
25 |     return "CUBLAS_STATUS_NOT_SUPPORTED";  
26 | #endif  
27 | #if CUDA_VERSION >= 6050  
28 |   case CUBLAS_STATUS_LICENSE_ERROR:  
29 |     return "CUBLAS_STATUS_LICENSE_ERROR";  
30 | #endif  
31 |   }  
32 |   return "Unknown cublas status";  
33 | }  
34 | 
35 | const char* curandGetErrorString(curandStatus_t error) {  
36 |   switch (error) {  
37 |   case CURAND_STATUS_SUCCESS:  
38 |     return "CURAND_STATUS_SUCCESS";  
39 |   case CURAND_STATUS_VERSION_MISMATCH:  
40 |     return "CURAND_STATUS_VERSION_MISMATCH";  
41 |   case CURAND_STATUS_NOT_INITIALIZED:  
42 |     return "CURAND_STATUS_NOT_INITIALIZED";  
43 |   case CURAND_STATUS_ALLOCATION_FAILED:  
44 |     return "CURAND_STATUS_ALLOCATION_FAILED";  
45 |   case CURAND_STATUS_TYPE_ERROR:  
46 |     return "CURAND_STATUS_TYPE_ERROR";  
47 |   case CURAND_STATUS_OUT_OF_RANGE:  
48 |     return "CURAND_STATUS_OUT_OF_RANGE";  
49 |   case CURAND_STATUS_LENGTH_NOT_MULTIPLE:  
50 |     return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";  
51 |   case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:  
52 |     return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";  
53 |   case CURAND_STATUS_LAUNCH_FAILURE:  
54 |     return "CURAND_STATUS_LAUNCH_FAILURE";  
55 |   case CURAND_STATUS_PREEXISTING_FAILURE:  
56 |     return "CURAND_STATUS_PREEXISTING_FAILURE";  
57 |   case CURAND_STATUS_INITIALIZATION_FAILED:  
58 |     return "CURAND_STATUS_INITIALIZATION_FAILED";  
59 |   case CURAND_STATUS_ARCH_MISMATCH:  
60 |     return "CURAND_STATUS_ARCH_MISMATCH";  
61 |   case CURAND_STATUS_INTERNAL_ERROR:  
62 |     return "CURAND_STATUS_INTERNAL_ERROR";  
63 |   }  
64 |   return "Unknown curand status";  
65 | }  
66 | 
67 | 
68 | 
69 | 


--------------------------------------------------------------------------------
/mathFunctions.cu:
--------------------------------------------------------------------------------
 1 | #include "mathFunctions.h"
 2 | #include <iostream>
 3 | #include "cudaUtility.h"
 4 | 
 5 | 
 6 | //concatlayer
 7 | template <typename Dtype>
 8 | __global__ void Concat(const int nthreads, const Dtype* in_data,
 9 |                        const bool forward, const int num_concats, const int concat_size,
10 |                        const int top_concat_axis, const int bottom_concat_axis,
11 |                        const int offset_concat_axis, Dtype* out_data) {
12 |     CUDA_KERNEL_LOOP(index, nthreads) {
13 |         const int total_concat_size = concat_size * bottom_concat_axis;
14 |         const int concat_num = index / total_concat_size;
15 |         const int concat_index = index % total_concat_size;
16 |         const int top_index = concat_index +
17 |                               (concat_num * top_concat_axis + offset_concat_axis) * concat_size;
18 |         if (forward) {
19 |             out_data[top_index] = in_data[index];
20 |         } else {
21 |             out_data[index] = in_data[top_index];
22 |         }
23 |     }
24 | }
25 | 
26 | cudaError_t ConcatLayer(int nthreads, const float *bottom_data, bool kForward, int num_concats_, int concat_input_size_,
27 |                         int top_concat_axis, int bottom_concat_axis, int offset_concat_axis, float *top_data, cudaStream_t stream)
28 | {
29 |     Concat<float><<<TENSORRT_GET_BLOCKS(nthreads), TENSORRT_CUDA_NUM_THREADS,0,stream>>>(nthreads, bottom_data,
30 |     kForward, num_concats_, concat_input_size_, top_concat_axis, bottom_concat_axis, offset_concat_axis, top_data);
31 |     return cudaPeekAtLastError();
32 | }
33 | 
34 | 


--------------------------------------------------------------------------------
/mathFunctions.h:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | #ifndef __MATH_FUNCTINS_H__
  4 | #define __MATH_FUNCTINS_H__
  5 | 
  6 | #include <stdint.h>
  7 | #include <cmath>  // for std::fabs and std::signbit
  8 | #include <cblas.h>
  9 | #include <cudnn.h>
 10 | #include <cublas_v2.h>
 11 | #include <cuda.h>
 12 | #include <cuda_runtime.h>
 13 | #include <curand.h>
 14 | #include <driver_types.h>  // cuda driver types
 15 | #include <algorithm>
 16 | 
 17 | #include <glog/logging.h>
 18 | #define PERMUTELAYER_ORDERNUM 4
 19 | #define BLOCK 512
 20 | //
 21 | // CUDA macros
 22 | //
 23 | 
 24 | // CUDA: various checks for different function calls.
 25 | #define CUDA_CHECK(condition) \
 26 |   /* Code block avoids redefinition of cudaError_t error */ \
 27 |   do { \
 28 |     cudaError_t error = condition; \
 29 |     CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \
 30 |   } while (0)
 31 | 
 32 | #define CUBLAS_CHECK(condition) \
 33 |   do { \
 34 |     cublasStatus_t status = condition; \
 35 |     CHECK_EQ(status, CUBLAS_STATUS_SUCCESS) << " " \
 36 |       << cublasGetErrorString(status); \
 37 |   } while (0)
 38 | 
 39 | #define CURAND_CHECK(condition) \
 40 |   do { \
 41 |     curandStatus_t status = condition; \
 42 |     CHECK_EQ(status, CURAND_STATUS_SUCCESS) << " " \
 43 |       << curandGetErrorString(status); \
 44 |   } while (0)
 45 | 
 46 | // CUDA: grid stride looping
 47 | #define CUDA_KERNEL_LOOP(i, n) \
 48 |   for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
 49 |        i < (n); \
 50 |        i += blockDim.x * gridDim.x)
 51 | 
 52 | // CUDA: check for error after kernel execution and exit loudly if there is one.
 53 | #define CUDA_POST_KERNEL_CHECK CUDA_CHECK(cudaPeekAtLastError())
 54 | 
 55 | 
 56 | // CUDA: library error reporting.
 57 | const char* cublasGetErrorString(cublasStatus_t error);
 58 | const char* curandGetErrorString(curandStatus_t error);
 59 | 
 60 | // CUDA: use 512 threads per block
 61 | const int TENSORRT_CUDA_NUM_THREADS = 256;
 62 | 
 63 | // CUDA: number of blocks for threads.
 64 | inline int TENSORRT_GET_BLOCKS(const int N) {
 65 |   return (N + TENSORRT_CUDA_NUM_THREADS - 1) / TENSORRT_CUDA_NUM_THREADS;
 66 | }
 67 | 
 68 | 
 69 | /* 
 70 |  * function: X[i] = alpha,initialize X with constant alpha
 71 |  * 
 72 |  */
 73 | template <typename Dtype>
 74 | void tensorrt_gpu_set(const int N, const Dtype alpha, Dtype *X);
 75 | 
 76 | /*
 77 |  * function: y[index] = pow(a[index], alpha)
 78 |  *@params n: the dims of matrix a
 79 |  *@params a: matrix
 80 |  *@params y: vector
 81 |  */
 82 | template <typename Dtype>
 83 | void tensorrt_gpu_powx(const int n, const Dtype* a, const Dtype alpha, Dtype* y);
 84 | 
 85 | 
 86 | /*
 87 |  *function:y = alpha*A*x + beta*y;
 88 |  *@params handle: handle
 89 |  *@params TransA: transpose flag
 90 |  *@params M: the rows of A
 91 |  *@params N: the cols of A
 92 |  *@params alpha: the coefficient of A*x
 93 |  *@params A: matrix [M x N]
 94 |  *@params x: vector x
 95 |  *@params beta: the coefficient of y
 96 |  *@params y: vector y
 97 |  */
 98 | template <typename Dtype>
 99 | void tensorrt_gpu_gemv(cublasHandle_t handle,const CBLAS_TRANSPOSE TransA, const int M, const int N,
100 |     const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta,
101 |     Dtype* y);
102 | 
103 | 
104 | 
105 | template <typename Dtype>
106 | void tensorrt_gpu_divbsx(const int nthreads, const Dtype* A,
107 |     const Dtype* v, const int rows, const int cols, const CBLAS_TRANSPOSE trans,
108 |     Dtype* B);
109 | 
110 | template <typename Dtype>
111 | void tensorrt_gpu_mulbsx(const int nthreads, const Dtype* A,
112 |     const Dtype* v, const int rows, const int cols, const CBLAS_TRANSPOSE trans,
113 |     Dtype* B);
114 | cudaError_t tensorrt_gpu_permute(const int nthreads,float* const  bottom_data,const bool forward,
115 | 	const int* permute_order,const int* old_steps,const int* new_steps,const int num_axes,float* const top_data,cudaStream_t stream);
116 | 
117 | cudaError_t SoftmaxLayer(const float *bottom_data, int count, int channels, int outer_num_, int inner_num_, float *scale_data, float *top_data, cudaStream_t stream);
118 | 
119 | cudaError_t ConcatLayer(int nthreads, const float *bottom_data, bool kForward, int num_concats_, int concat_input_size_, int top_concat_axis, int bottom_concat_axis, int offset_concat_axis, float *top_data, cudaStream_t stream);
120 | 
121 | //cudaError_t cudaSoftmax(int n, int channels,  float* x, float*y, cudaStream_t stream);
122 | 
123 | //virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top);
124 | cudaError_t cudaSoftmax_caffe(int count,int channels,float* x,float* y, cudaStream_t stream);
125 | 
126 | cudaError_t cudaDetectionOutput_caffe( int bottom0_count,
127 |                                        int bottom1_count,
128 |                                        float* loc_data,
129 |                                        float* bottom1,
130 |                                        float* prior_data,
131 |                                        float* bottom3,
132 |                                        float* bottom4,
133 |                                        float* y,
134 |                                        cudaStream_t stream);
135 | 
136 | #endif
137 | 


--------------------------------------------------------------------------------
/model/pelee/pelee_merged.caffemodel:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eric612/Pelee-Seg-TensorRT/05bf0b31c5891adaf64f40b784ef4a1927d68862/model/pelee/pelee_merged.caffemodel


--------------------------------------------------------------------------------
/pluginImplement.cpp:
--------------------------------------------------------------------------------
   1 | #include "pluginImplement.h"
   2 | #include "mathFunctions.h"
   3 | #include <vector>
   4 | #include <algorithm>
   5 | 
   6 | 
   7 | 
   8 | 
   9 | 
  10 | /******************************/
  11 | // PluginFactory //
  12 | /******************************/
  13 | nvinfer1::IPlugin* PluginFactory::createPlugin(const char* layerName, const nvinfer1::Weights* weights, int nbWeights)
  14 | {
  15 |     assert(isPlugin(layerName));
  16 | 
  17 |     if (!strcmp(layerName, "ext/pm1_mbox_loc_perm"))
  18 |     {
  19 |         std::cout << layerName << std::endl;
  20 |         assert(mExt_pm1_mbox_loc_perm_layer.get() == nullptr);
  21 |         mExt_pm1_mbox_loc_perm_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
  22 |                 (createSSDPermutePlugin({{0, 2, 3, 1}}), nvPluginDeleter);
  23 |         return mExt_pm1_mbox_loc_perm_layer.get();
  24 |     }
  25 |     else if (!strcmp(layerName, "ext/pm1_mbox_conf_perm"))
  26 |     {
  27 |         std::cout << layerName << std::endl;
  28 |         assert(mExt_pm1_mbox_conf_perm_layer.get() == nullptr);
  29 |         mExt_pm1_mbox_conf_perm_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
  30 |                 (createSSDPermutePlugin({{0, 2, 3, 1}}), nvPluginDeleter);
  31 |         return mExt_pm1_mbox_conf_perm_layer.get();
  32 |     }
  33 |     else if (!strcmp(layerName, "ext/pm2_mbox_loc_perm"))
  34 |     {
  35 |         std::cout << layerName << std::endl;
  36 |         assert(mExt_pm2_mbox_loc_perm_layer.get() == nullptr);
  37 |         mExt_pm2_mbox_loc_perm_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
  38 |                 (createSSDPermutePlugin({{0, 2, 3, 1}}), nvPluginDeleter);
  39 |         return mExt_pm2_mbox_loc_perm_layer.get();
  40 |     }
  41 |     else if (!strcmp(layerName, "ext/pm2_mbox_conf_perm"))
  42 |     {
  43 |         std::cout << layerName << std::endl;
  44 |         assert(mExt_pm2_mbox_conf_perm_layer.get() == nullptr);
  45 |         mExt_pm2_mbox_conf_perm_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
  46 |                 (createSSDPermutePlugin({{0, 2, 3, 1}}), nvPluginDeleter);
  47 |         return mExt_pm2_mbox_conf_perm_layer.get();
  48 |     }
  49 |      else if (!strcmp(layerName, "ext/pm3_mbox_loc_perm"))
  50 |     {
  51 |         std::cout << layerName << std::endl;
  52 |         assert(mExt_pm3_mbox_loc_perm_layer.get() == nullptr);
  53 |         mExt_pm3_mbox_loc_perm_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
  54 |                 (createSSDPermutePlugin({{0, 2, 3, 1}}), nvPluginDeleter);
  55 |         return mExt_pm3_mbox_loc_perm_layer.get();
  56 |     }
  57 |     else if (!strcmp(layerName, "ext/pm3_mbox_conf_perm"))
  58 |     {
  59 |         std::cout << layerName << std::endl;
  60 |         assert(mExt_pm3_mbox_conf_perm_layer.get() == nullptr);
  61 |         mExt_pm3_mbox_conf_perm_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
  62 |                 (createSSDPermutePlugin({{0, 2, 3, 1}}), nvPluginDeleter);
  63 |         return mExt_pm3_mbox_conf_perm_layer.get();
  64 |     }
  65 |      else if (!strcmp(layerName, "ext/pm4_mbox_loc_perm"))
  66 |     {
  67 |         std::cout << layerName << std::endl;
  68 |         assert(mExt_pm4_mbox_loc_perm_layer.get() == nullptr);
  69 |         mExt_pm4_mbox_loc_perm_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
  70 |                 (createSSDPermutePlugin({{0, 2, 3, 1}}), nvPluginDeleter);
  71 |         return mExt_pm4_mbox_loc_perm_layer.get();
  72 |     }
  73 |     else if (!strcmp(layerName, "ext/pm4_mbox_conf_perm"))
  74 |     {
  75 |         std::cout << layerName << std::endl;
  76 |         assert(mExt_pm4_mbox_conf_perm_layer.get() == nullptr);
  77 |         mExt_pm4_mbox_conf_perm_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
  78 |                 (createSSDPermutePlugin({{0, 2, 3, 1}}), nvPluginDeleter);
  79 |         return mExt_pm4_mbox_conf_perm_layer.get();
  80 |     }
  81 |      else if (!strcmp(layerName, "ext/pm5_mbox_loc_perm"))
  82 |     {
  83 |         std::cout << layerName << std::endl;
  84 |         assert(mExt_pm5_mbox_loc_perm_layer.get() == nullptr);
  85 |         mExt_pm5_mbox_loc_perm_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
  86 |                 (createSSDPermutePlugin({{0, 2, 3, 1}}), nvPluginDeleter);
  87 |         return mExt_pm5_mbox_loc_perm_layer.get();
  88 |     }
  89 |     else if (!strcmp(layerName, "ext/pm5_mbox_conf_perm"))
  90 |     {
  91 |         std::cout << layerName << std::endl;
  92 |         assert(mExt_pm5_mbox_conf_perm_layer.get() == nullptr);
  93 |         mExt_pm5_mbox_conf_perm_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
  94 |                 (createSSDPermutePlugin({{0, 2, 3, 1}}), nvPluginDeleter);
  95 |         return mExt_pm5_mbox_conf_perm_layer.get();
  96 |     }
  97 |      else if (!strcmp(layerName, "ext/pm6_mbox_loc_perm"))
  98 |     {
  99 |         std::cout << layerName << std::endl;
 100 |         assert(mExt_pm6_mbox_loc_perm_layer.get() == nullptr);
 101 |         mExt_pm6_mbox_loc_perm_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 102 |                 (createSSDPermutePlugin({{0, 2, 3, 1}}), nvPluginDeleter);
 103 |         return mExt_pm6_mbox_loc_perm_layer.get();
 104 |     }
 105 |     else if (!strcmp(layerName, "ext/pm6_mbox_conf_perm"))
 106 |     {
 107 |         std::cout << layerName << std::endl;
 108 |         assert(mExt_pm6_mbox_conf_perm_layer.get() == nullptr);
 109 |         mExt_pm6_mbox_conf_perm_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 110 |                 (createSSDPermutePlugin({{0, 2, 3, 1}}), nvPluginDeleter);
 111 |         return mExt_pm6_mbox_conf_perm_layer.get();
 112 |     }
 113 |      else if (!strcmp(layerName, "ext/pm1_mbox_priorbox"))
 114 |     {
 115 |         std::cout << layerName << std::endl;
 116 |         assert(mExt_pm1_mbox_priorbox_layer.get() == nullptr);
 117 |         PriorBoxParameters params; 
 118 |         float min_size[1] = {30.3999996185}, max_size[1] = {60.7999992371}, aspect_ratio[3] = {1.0, 2.0, 3.0}; //aspect_ratio[2] = {1.0, 2.0}; 
 119 |         params.minSize=min_size;
 120 |         params.aspectRatios=aspect_ratio;
 121 |         params.numMinSize = 1;
 122 |         params.numAspectRatios = 3;
 123 |         params.maxSize = max_size;
 124 |         params.numMaxSize = 1;
 125 |         params.flip = true;
 126 |         params.clip = false;
 127 |         params.variance[0] = 0.1;
 128 |         params.variance[1] = 0.1;
 129 |         params.variance[2] = 0.2;
 130 |         params.variance[3] = 0.2;
 131 |         params.imgH = 0;
 132 |         params.imgW = 0;
 133 |         params.stepH = 0;
 134 |         params.stepW = 0;
 135 |         params.offset = 0.5;
 136 |         mExt_pm1_mbox_priorbox_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 137 |                 (createSSDPriorBoxPlugin(params), nvPluginDeleter);
 138 |         return mExt_pm1_mbox_priorbox_layer.get();
 139 |     }
 140 |       else if (!strcmp(layerName, "ext/pm2_mbox_priorbox"))
 141 |     {
 142 |         std::cout << layerName << std::endl;
 143 |         assert(mExt_pm2_mbox_priorbox_layer.get() == nullptr);
 144 |         float min_size[1] = {60.7999992371}, max_size[1] = {112.480003357}, aspect_ratio[3] = {1.0, 2.0, 3.0}; //aspect_ratio[2] = {1.0, 2.0}; 
 145 |         PriorBoxParameters params; 
 146 |         params.minSize=min_size;
 147 |         params.aspectRatios=aspect_ratio;
 148 |         params.numMinSize = 1;
 149 |         params.numAspectRatios = 3;
 150 |         params.maxSize = max_size;
 151 |         params.numMaxSize = 1;
 152 |         params.flip = true;
 153 |         params.clip = false;
 154 |         params.variance[0] = 0.1;
 155 |         params.variance[1] = 0.1;
 156 |         params.variance[2] = 0.2;
 157 |         params.variance[3] = 0.2;
 158 |         params.imgH = 0;
 159 |         params.imgW = 0;
 160 |         params.stepH = 0;
 161 |         params.stepW = 0;
 162 |         params.offset = 0.5;
 163 | 
 164 | 
 165 | 
 166 |         mExt_pm2_mbox_priorbox_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 167 |                 (createSSDPriorBoxPlugin(params), nvPluginDeleter);
 168 |         return mExt_pm2_mbox_priorbox_layer.get();
 169 |     }
 170 | 
 171 |       else if (!strcmp(layerName, "ext/pm3_mbox_priorbox"))
 172 |     {
 173 |         std::cout << layerName << std::endl;
 174 |         assert(mExt_pm3_mbox_priorbox_layer.get() == nullptr);
 175 |         float min_size[1] = {112.480003357}, max_size[1] = {164.160003662}, aspect_ratio[3] = {1.0, 2.0, 3.0}; //aspect_ratio[2] = {1.0, 2.0}; 
 176 |         PriorBoxParameters params; 
 177 |         params.minSize=min_size;
 178 |         params.aspectRatios=aspect_ratio;
 179 |         params.numMinSize = 1;
 180 |         params.numAspectRatios = 3;
 181 |         params.maxSize = max_size;
 182 |         params.numMaxSize = 1;
 183 |         params.flip = true;
 184 |         params.clip = false;
 185 |         params.variance[0] = 0.1;
 186 |         params.variance[1] = 0.1;
 187 |         params.variance[2] = 0.2;
 188 |         params.variance[3] = 0.2;
 189 |         params.imgH = 0;
 190 |         params.imgW = 0;
 191 |         params.stepH = 0;
 192 |         params.stepW = 0;
 193 |         params.offset = 0.5;
 194 | 
 195 |         mExt_pm3_mbox_priorbox_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 196 |                 (createSSDPriorBoxPlugin(params), nvPluginDeleter);
 197 |         return mExt_pm3_mbox_priorbox_layer.get();
 198 |     }
 199 | 
 200 |       else if (!strcmp(layerName, "ext/pm4_mbox_priorbox"))
 201 |     {
 202 |         std::cout << layerName << std::endl;
 203 |         assert(mExt_pm4_mbox_priorbox_layer.get() == nullptr);
 204 |         float min_size[1] = {164.160003662}, max_size[1] = {215.839996338}, aspect_ratio[3] = {1.0, 2.0, 3.0}; //aspect_ratio[2] = {1.0, 2.0}; 
 205 |         PriorBoxParameters params; 
 206 |         params.minSize=min_size;
 207 |         params.aspectRatios=aspect_ratio;
 208 |         params.numMinSize = 1;
 209 |         params.numAspectRatios = 3;
 210 |         params.maxSize = max_size;
 211 |         params.numMaxSize = 1;
 212 |         params.flip = true;
 213 |         params.clip = false;
 214 |         params.variance[0] = 0.1;
 215 |         params.variance[1] = 0.1;
 216 |         params.variance[2] = 0.2;
 217 |         params.variance[3] = 0.2;
 218 |         params.imgH = 0;
 219 |         params.imgW = 0;
 220 |         params.stepH = 0;
 221 |         params.stepW = 0;
 222 |         params.offset = 0.5;
 223 |         mExt_pm4_mbox_priorbox_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 224 |                 (createSSDPriorBoxPlugin(params), nvPluginDeleter);
 225 |         return mExt_pm4_mbox_priorbox_layer.get();
 226 |     }
 227 | 
 228 |       else if (!strcmp(layerName, "ext/pm5_mbox_priorbox"))
 229 |     {
 230 |         std::cout << layerName << std::endl;
 231 |         assert(mExt_pm5_mbox_priorbox_layer.get() == nullptr);
 232 |         float min_size[1]= {215.839996338}, max_size[1]= {267.519989014}, aspect_ratio[3] = {1.0, 2.0, 3.0}; //aspect_ratio[2] = {1.0, 2.0}; 
 233 |         PriorBoxParameters params; 
 234 |         params.minSize=min_size;
 235 |         params.aspectRatios=aspect_ratio;
 236 |         params.numMinSize = 1;
 237 |         params.numAspectRatios = 3;
 238 |         params.maxSize = max_size;
 239 |         params.numMaxSize = 1;
 240 |         params.flip = true;
 241 |         params.clip = false;
 242 |         params.variance[0] = 0.1;
 243 |         params.variance[1] = 0.1;
 244 |         params.variance[2] = 0.2;
 245 |         params.variance[3] = 0.2;
 246 |         params.imgH = 0;
 247 |         params.imgW = 0;
 248 |         params.stepH = 0;
 249 |         params.stepW = 0;
 250 |         params.offset = 0.5;
 251 |         mExt_pm5_mbox_priorbox_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 252 |                 (createSSDPriorBoxPlugin(params), nvPluginDeleter);
 253 |         return mExt_pm5_mbox_priorbox_layer.get();
 254 |     }
 255 | 
 256 |       else if (!strcmp(layerName, "ext/pm6_mbox_priorbox"))
 257 |     {
 258 |         std::cout << layerName << std::endl;
 259 |         assert(mExt_pm6_mbox_priorbox_layer.get() == nullptr);
 260 |         float min_size[1] = {267.519989014}, max_size[1] = {319.200012207}, aspect_ratio[3] = {1.0, 2.0, 3.0}; //aspect_ratio[2] = {1.0, 2.0}; 
 261 |         PriorBoxParameters params; 
 262 |         params.minSize=min_size;
 263 |         params.aspectRatios=aspect_ratio;
 264 |         params.numMinSize = 1;
 265 |         params.numAspectRatios = 3;
 266 |         params.maxSize = max_size;
 267 |         params.numMaxSize = 1;
 268 |         params.flip = true;
 269 |         params.clip = false;
 270 |         params.variance[0] = 0.1;
 271 |         params.variance[1] = 0.1;
 272 |         params.variance[2] = 0.2;
 273 |         params.variance[3] = 0.2;
 274 |         params.imgH = 0;
 275 |         params.imgW = 0;
 276 |         params.stepH = 0;
 277 |         params.stepW = 0;
 278 |         params.offset = 0.5;
 279 | 
 280 |         mExt_pm6_mbox_priorbox_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 281 |                 (createSSDPriorBoxPlugin(params), nvPluginDeleter);
 282 |         return mExt_pm6_mbox_priorbox_layer.get();
 283 |     }
 284 | 
 285 |     else if (!strcmp(layerName, "stem/concat"))
 286 |     {
 287 |         std::cout << layerName << std::endl;
 288 |         assert(mStem_concat_layer.get() == nullptr);
 289 |         mStem_concat_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 290 |                 (createConcatPlugin(1, true), nvPluginDeleter);
 291 |         return mStem_concat_layer.get();
 292 |     }
 293 | 
 294 |     else if (!strcmp(layerName, "stage1_1/concat"))
 295 |     {
 296 |         std::cout << layerName << std::endl;
 297 |         assert(mStage1_1_concat_layer.get() == nullptr);
 298 |         mStage1_1_concat_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 299 |                 (createConcatPlugin(1, true), nvPluginDeleter);
 300 |         return mStage1_1_concat_layer.get();
 301 |     }
 302 |     else if (!strcmp(layerName, "stage1_2/concat"))
 303 |     {
 304 |         std::cout << layerName << std::endl;
 305 |         assert(mStage1_2_concat_layer.get() == nullptr);
 306 |         mStage1_2_concat_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 307 |                 (createConcatPlugin(1, true), nvPluginDeleter);
 308 |         return mStage1_2_concat_layer.get();
 309 |     }
 310 |     else if (!strcmp(layerName, "stage1_3/concat"))
 311 |     {
 312 |         std::cout << layerName << std::endl;
 313 |         assert(mStage1_3_concat_layer.get() == nullptr);
 314 |         mStage1_3_concat_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 315 |                 (createConcatPlugin(1, true), nvPluginDeleter);
 316 |         return mStage1_3_concat_layer.get();
 317 |     }
 318 | 
 319 |     else if (!strcmp(layerName, "stage2_1/concat"))
 320 |     {
 321 |         std::cout << layerName << std::endl;
 322 |         assert(mStage2_1_concat_layer.get() == nullptr);
 323 |         mStage2_1_concat_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 324 |                 (createConcatPlugin(1, true), nvPluginDeleter);
 325 |         return mStage2_1_concat_layer.get();
 326 |     }
 327 |     else if (!strcmp(layerName, "stage2_2/concat"))
 328 |     {
 329 |         std::cout << layerName << std::endl;
 330 |         assert(mStage2_2_concat_layer.get() == nullptr);
 331 |         mStage2_2_concat_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 332 |                 (createConcatPlugin(1, true), nvPluginDeleter);
 333 |         return mStage2_2_concat_layer.get();
 334 |     }
 335 |     else if (!strcmp(layerName, "stage2_3/concat"))
 336 |     {
 337 |         std::cout << layerName << std::endl;
 338 |         assert(mStage2_3_concat_layer.get() == nullptr);
 339 |         mStage2_3_concat_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 340 |                 (createConcatPlugin(1, true), nvPluginDeleter);
 341 |         return mStage2_3_concat_layer.get();
 342 |     }
 343 |      else if (!strcmp(layerName, "stage2_4/concat"))
 344 |     {
 345 |         std::cout << layerName << std::endl;
 346 |         assert(mStage2_4_concat_layer.get() == nullptr);
 347 |         mStage2_4_concat_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 348 |                 (createConcatPlugin(1, true), nvPluginDeleter);
 349 |         return mStage2_4_concat_layer.get();
 350 |     }
 351 | 
 352 |       else if (!strcmp(layerName, "stage3_1/concat"))
 353 |     {
 354 |         std::cout << layerName << std::endl;
 355 |         assert(mStage3_1_concat_layer.get() == nullptr);
 356 |         mStage3_1_concat_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 357 |                 (createConcatPlugin(1, true), nvPluginDeleter);
 358 |         return mStage3_1_concat_layer.get();
 359 |     }
 360 |     else if (!strcmp(layerName, "stage3_2/concat"))
 361 |     {
 362 |         std::cout << layerName << std::endl;
 363 |         assert(mStage3_2_concat_layer.get() == nullptr);
 364 |         mStage3_2_concat_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 365 |                 (createConcatPlugin(1, true), nvPluginDeleter);
 366 |         return mStage3_2_concat_layer.get();
 367 |     }
 368 |     else if (!strcmp(layerName, "stage3_3/concat"))
 369 |     {
 370 |         std::cout << layerName << std::endl;
 371 |         assert(mStage3_3_concat_layer.get() == nullptr);
 372 |         mStage3_3_concat_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 373 |                 (createConcatPlugin(1, true), nvPluginDeleter);
 374 |         return mStage3_3_concat_layer.get();
 375 |     }
 376 |      else if (!strcmp(layerName, "stage3_4/concat"))
 377 |     {
 378 |         std::cout << layerName << std::endl;
 379 |         assert(mStage3_4_concat_layer.get() == nullptr);
 380 |         mStage3_4_concat_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 381 |                 (createConcatPlugin(1, true), nvPluginDeleter);
 382 |         return mStage3_4_concat_layer.get();
 383 |     }
 384 |      else if (!strcmp(layerName, "stage3_5/concat"))
 385 |     {
 386 |         std::cout << layerName << std::endl;
 387 |         assert(mStage3_5_concat_layer.get() == nullptr);
 388 |         mStage3_5_concat_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 389 |                 (createConcatPlugin(1, true), nvPluginDeleter);
 390 |         return mStage3_5_concat_layer.get();
 391 |     }
 392 |      else if (!strcmp(layerName, "stage3_6/concat"))
 393 |     {
 394 |         std::cout << layerName << std::endl;
 395 |         assert(mStage3_6_concat_layer.get() == nullptr);
 396 |         mStage3_6_concat_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 397 |                 (createConcatPlugin(1, true), nvPluginDeleter);
 398 |         return mStage3_6_concat_layer.get();
 399 |     }
 400 |      else if (!strcmp(layerName, "stage3_7/concat"))
 401 |     {
 402 |         std::cout << layerName << std::endl;
 403 |         assert(mStage3_7_concat_layer.get() == nullptr);
 404 |         mStage3_7_concat_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 405 |                 (createConcatPlugin(1, true), nvPluginDeleter);
 406 |         return mStage3_7_concat_layer.get();
 407 |     }
 408 |      else if (!strcmp(layerName, "stage3_8/concat"))
 409 |     {
 410 |         std::cout << layerName << std::endl;
 411 |         assert(mStage3_8_concat_layer.get() == nullptr);
 412 |         mStage3_8_concat_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 413 |                 (createConcatPlugin(1, true), nvPluginDeleter);
 414 |         return mStage3_8_concat_layer.get();
 415 |     }
 416 | 
 417 |       else if (!strcmp(layerName, "stage4_1/concat"))
 418 |     {
 419 |         std::cout << layerName << std::endl;
 420 |         assert(mStage4_1_concat_layer.get() == nullptr);
 421 |         mStage4_1_concat_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 422 |                 (createConcatPlugin(1, true), nvPluginDeleter);
 423 |         return mStage4_1_concat_layer.get();
 424 |     }
 425 |     else if (!strcmp(layerName, "stage4_2/concat"))
 426 |     {
 427 |         std::cout << layerName << std::endl;
 428 |         assert(mStage4_2_concat_layer.get() == nullptr);
 429 |         mStage4_2_concat_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 430 |                 (createConcatPlugin(1, true), nvPluginDeleter);
 431 |         return mStage4_2_concat_layer.get();
 432 |     }
 433 |     else if (!strcmp(layerName, "stage4_3/concat"))
 434 |     {
 435 |         std::cout << layerName << std::endl;
 436 |         assert(mStage4_3_concat_layer.get() == nullptr);
 437 |         mStage4_3_concat_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 438 |                 (createConcatPlugin(1, true), nvPluginDeleter);
 439 |         return mStage4_3_concat_layer.get();
 440 |     }
 441 |      else if (!strcmp(layerName, "stage4_4/concat"))
 442 |     {
 443 |         std::cout << layerName << std::endl;
 444 |         assert(mStage4_4_concat_layer.get() == nullptr);
 445 |         mStage4_4_concat_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 446 |                 (createConcatPlugin(1, true), nvPluginDeleter);
 447 |         return mStage4_4_concat_layer.get();
 448 |     }
 449 |      else if (!strcmp(layerName, "stage4_5/concat"))
 450 |     {
 451 |         std::cout << layerName << std::endl;
 452 |         assert(mStage4_5_concat_layer.get() == nullptr);
 453 |         mStage4_5_concat_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 454 |                 (createConcatPlugin(1, true), nvPluginDeleter);
 455 |         return mStage4_5_concat_layer.get();
 456 |     }
 457 |      else if (!strcmp(layerName, "stage4_6/concat"))
 458 |     {
 459 |         std::cout << layerName << std::endl;
 460 |         assert(mStage4_6_concat_layer.get() == nullptr);
 461 |         mStage4_6_concat_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 462 |                 (createConcatPlugin(1, true), nvPluginDeleter);
 463 |         return mStage4_6_concat_layer.get();
 464 |     }
 465 |      else if (!strcmp(layerName, "mbox_priorbox"))
 466 |     {
 467 |         std::cout << layerName << std::endl;
 468 |         assert(mBox_priorbox_layer.get() == nullptr);
 469 |         mBox_priorbox_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 470 |                 (createConcatPlugin(2, true), nvPluginDeleter);
 471 |         return mBox_priorbox_layer.get();
 472 |     }
 473 | 
 474 | 
 475 |     else if (!strcmp(layerName, "mbox_loc"))
 476 |     {
 477 |         std::cout << layerName << std::endl;
 478 |         assert(mBox_loc_layer.get() == nullptr);
 479 |         mBox_loc_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 480 |                 (createConcatPlugin(1, true), nvPluginDeleter);
 481 |         return mBox_loc_layer.get();
 482 |     }
 483 |     else if (!strcmp(layerName, "mbox_conf"))
 484 |     {
 485 |         std::cout << layerName << std::endl;
 486 |         assert(mBox_conf_layer.get() == nullptr);
 487 |         mBox_conf_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 488 |                 (createConcatPlugin(1, true), nvPluginDeleter);
 489 |         return mBox_conf_layer.get();
 490 |     }
 491 | 
 492 |         //flatten
 493 |     else if (!strcmp(layerName, "ext/pm1_mbox_loc_flat"))
 494 |     {
 495 |         std::cout << layerName << std::endl;
 496 |         assert(mExt_pm1_mbox_loc_flat_layer.get() == nullptr);
 497 |         mExt_pm1_mbox_loc_flat_layer = std::unique_ptr<FlattenLayer>(new FlattenLayer());
 498 |         return mExt_pm1_mbox_loc_flat_layer.get();
 499 |     }
 500 |     else if (!strcmp(layerName, "ext/pm1_mbox_conf_flat"))
 501 |     {
 502 |         std::cout << layerName << std::endl;
 503 |         assert(mExt_pm1_mbox_conf_flat_layer.get() == nullptr);
 504 |         mExt_pm1_mbox_conf_flat_layer = std::unique_ptr<FlattenLayer>(new FlattenLayer());
 505 |         return mExt_pm1_mbox_conf_flat_layer.get();
 506 |     }
 507 |      else if (!strcmp(layerName, "ext/pm2_mbox_loc_flat"))
 508 |     {
 509 |         std::cout << layerName << std::endl;
 510 |         assert(mExt_pm2_mbox_loc_flat_layer.get() == nullptr);
 511 |         mExt_pm2_mbox_loc_flat_layer = std::unique_ptr<FlattenLayer>(new FlattenLayer());
 512 |         return mExt_pm2_mbox_loc_flat_layer.get();
 513 |     }
 514 |     else if (!strcmp(layerName, "ext/pm2_mbox_conf_flat"))
 515 |     {
 516 |         std::cout << layerName << std::endl;
 517 |         assert(mExt_pm2_mbox_conf_flat_layer.get() == nullptr);
 518 |         mExt_pm2_mbox_conf_flat_layer = std::unique_ptr<FlattenLayer>(new FlattenLayer());
 519 |         return mExt_pm2_mbox_conf_flat_layer.get();
 520 |     }
 521 |      else if (!strcmp(layerName, "ext/pm3_mbox_loc_flat"))
 522 |     {
 523 |         std::cout << layerName << std::endl;
 524 |         assert(mExt_pm3_mbox_loc_flat_layer.get() == nullptr);
 525 |         mExt_pm3_mbox_loc_flat_layer = std::unique_ptr<FlattenLayer>(new FlattenLayer());
 526 |         return mExt_pm3_mbox_loc_flat_layer.get();
 527 |     }
 528 |     else if (!strcmp(layerName, "ext/pm3_mbox_conf_flat"))
 529 |     {
 530 |         std::cout << layerName << std::endl;
 531 |         assert(mExt_pm3_mbox_conf_flat_layer.get() == nullptr);
 532 |         mExt_pm3_mbox_conf_flat_layer = std::unique_ptr<FlattenLayer>(new FlattenLayer());
 533 |         return mExt_pm3_mbox_conf_flat_layer.get();
 534 |     }
 535 |      else if (!strcmp(layerName, "ext/pm4_mbox_loc_flat"))
 536 |     {
 537 |         std::cout << layerName << std::endl;
 538 |         assert(mExt_pm4_mbox_loc_flat_layer.get() == nullptr);
 539 |         mExt_pm4_mbox_loc_flat_layer = std::unique_ptr<FlattenLayer>(new FlattenLayer());
 540 |         return mExt_pm4_mbox_loc_flat_layer.get();
 541 |     }
 542 |     else if (!strcmp(layerName, "ext/pm4_mbox_conf_flat"))
 543 |     {
 544 |         std::cout << layerName << std::endl;
 545 |         assert(mExt_pm4_mbox_conf_flat_layer.get() == nullptr);
 546 |         mExt_pm4_mbox_conf_flat_layer = std::unique_ptr<FlattenLayer>(new FlattenLayer());
 547 |         return mExt_pm4_mbox_conf_flat_layer.get();
 548 |     }
 549 |      else if (!strcmp(layerName, "ext/pm5_mbox_loc_flat"))
 550 |     {
 551 |         std::cout << layerName << std::endl;
 552 |         assert(mExt_pm5_mbox_loc_flat_layer.get() == nullptr);
 553 |         mExt_pm5_mbox_loc_flat_layer = std::unique_ptr<FlattenLayer>(new FlattenLayer());
 554 |         return mExt_pm5_mbox_loc_flat_layer.get();
 555 |     }
 556 |     else if (!strcmp(layerName, "ext/pm5_mbox_conf_flat"))
 557 |     {
 558 |         std::cout << layerName << std::endl;
 559 |         assert(mExt_pm5_mbox_conf_flat_layer.get() == nullptr);
 560 |         mExt_pm5_mbox_conf_flat_layer = std::unique_ptr<FlattenLayer>(new FlattenLayer());
 561 |         return mExt_pm5_mbox_conf_flat_layer.get();
 562 |     }
 563 |      else if (!strcmp(layerName, "ext/pm6_mbox_loc_flat"))
 564 |     {
 565 |         std::cout << layerName << std::endl;
 566 |         assert(mExt_pm6_mbox_loc_flat_layer.get() == nullptr);
 567 |         mExt_pm6_mbox_loc_flat_layer = std::unique_ptr<FlattenLayer>(new FlattenLayer());
 568 |         return mExt_pm6_mbox_loc_flat_layer.get();
 569 |     }
 570 |     else if (!strcmp(layerName, "ext/pm6_mbox_conf_flat"))
 571 |     {
 572 |         std::cout << layerName << std::endl;
 573 |         assert(mExt_pm6_mbox_conf_flat_layer.get() == nullptr);
 574 |         mExt_pm6_mbox_conf_flat_layer = std::unique_ptr<FlattenLayer>(new FlattenLayer());
 575 |         return mExt_pm6_mbox_conf_flat_layer.get();
 576 |     }
 577 |    
 578 |     else if (!strcmp(layerName, "mbox_conf_flatten"))
 579 |     {
 580 |         std::cout << layerName << std::endl;
 581 |         assert(mMbox_conf_flat_layer.get() == nullptr);
 582 |         mMbox_conf_flat_layer = std::unique_ptr<FlattenLayer>(new FlattenLayer());
 583 |         return mMbox_conf_flat_layer.get();
 584 |     }
 585 | 
 586 | 
 587 |     else if (!strcmp(layerName, "mbox_conf_reshape"))
 588 |     {
 589 |         std::cout << layerName << std::endl;
 590 |         assert(mMbox_conf_reshape.get() == nullptr);
 591 |         assert(nbWeights == 0 && weights == nullptr);
 592 |         mMbox_conf_reshape = std::unique_ptr<Reshape<11>>(new Reshape<11>());
 593 |         return mMbox_conf_reshape.get();
 594 |     }
 595 |     //softmax layer
 596 |     else if (!strcmp(layerName, "mbox_conf_softmax"))
 597 |     {
 598 |         std::cout << layerName << std::endl;
 599 |         assert( mPluginSoftmax == nullptr);
 600 |         assert( nbWeights == 0 && weights == nullptr);
 601 |         mPluginSoftmax = std::unique_ptr<SoftmaxPlugin>(new SoftmaxPlugin());
 602 |         return mPluginSoftmax.get();
 603 |     }
 604 |     else if (!strcmp(layerName, "detection_out"))
 605 |     {
 606 |         std::cout << layerName << std::endl;
 607 |         assert(mDetection_out.get() == nullptr);
 608 |         //tensor rt 3.0 
 609 |         //mDetection_out = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>(createSSDDetectionOutputPlugin({true, false, 0, 21, 400, 200, 0.5, 0.45, CodeType_t::CENTER_SIZE}), nvPluginDeleter);
 610 |         //tensor rt 5
 611 | 
 612 | 
 613 | 
 614 |         DetectionOutputParameters params;
 615 |         params.backgroundLabelId = 0;
 616 |         params.codeType = CodeTypeSSD::CENTER_SIZE;
 617 |         params.keepTopK = 200;
 618 |         params.shareLocation = true;
 619 |         params.varianceEncodedInTarget = false;
 620 |         params.topK = 400;
 621 |         params.nmsThreshold = 0.4499;
 622 |         params.numClasses = 11;
 623 |         params.inputOrder[0] = 0;
 624 |         params.inputOrder[1] = 1;
 625 |         params.inputOrder[2] = 2;
 626 |         params.confidenceThreshold = 0.3;
 627 |         params.confSigmoid = false;
 628 |         params.isNormalized = true;
 629 | 
 630 | 
 631 | 
 632 |         mDetection_out = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 633 |                 (createSSDDetectionOutputPlugin(params), nvPluginDeleter);
 634 |         return mDetection_out.get();
 635 |     }
 636 |     else
 637 |     {
 638 |         std::cout << layerName << std::endl;
 639 |         assert(0);
 640 |         return nullptr;
 641 |     }
 642 | }
 643 | 
 644 | IPlugin* PluginFactory::createPlugin(const char* layerName, const void* serialData, size_t serialLength)
 645 | {
 646 |     assert(isPlugin(layerName));
 647 |        if (!strcmp(layerName, "ext/pm1_mbox_loc_perm"))
 648 |     {
 649 |         std::cout << layerName << std::endl;
 650 |         assert(mExt_pm1_mbox_loc_perm_layer.get() == nullptr);
 651 |         mExt_pm1_mbox_loc_perm_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 652 |                 (createSSDPermutePlugin(serialData, serialLength), nvPluginDeleter);
 653 |         return mExt_pm1_mbox_loc_perm_layer.get();
 654 |     }
 655 |     else if (!strcmp(layerName, "ext/pm1_mbox_conf_perm"))
 656 |     {
 657 |         assert(mExt_pm1_mbox_conf_perm_layer.get() == nullptr);
 658 |         mExt_pm1_mbox_conf_perm_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 659 |                 (createSSDPermutePlugin(serialData, serialLength), nvPluginDeleter);
 660 |         return mExt_pm1_mbox_conf_perm_layer.get();
 661 |     }
 662 |     else if (!strcmp(layerName, "ext/pm2_mbox_loc_perm"))
 663 |     {
 664 |         assert(mExt_pm2_mbox_loc_perm_layer.get() == nullptr);
 665 |         mExt_pm2_mbox_loc_perm_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 666 |                 (createSSDPermutePlugin(serialData, serialLength), nvPluginDeleter);
 667 |         return mExt_pm2_mbox_loc_perm_layer.get();
 668 |     }
 669 |     else if (!strcmp(layerName, "ext/pm2_mbox_conf_perm"))
 670 |     {
 671 |         assert(mExt_pm2_mbox_conf_perm_layer.get() == nullptr);
 672 |         mExt_pm2_mbox_conf_perm_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 673 |                 (createSSDPermutePlugin(serialData, serialLength), nvPluginDeleter);
 674 |         return mExt_pm2_mbox_conf_perm_layer.get();
 675 |     }
 676 |      else if (!strcmp(layerName, "ext/pm3_mbox_loc_perm"))
 677 |     {
 678 |         assert(mExt_pm3_mbox_loc_perm_layer.get() == nullptr);
 679 |         mExt_pm3_mbox_loc_perm_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 680 |                 (createSSDPermutePlugin(serialData, serialLength), nvPluginDeleter);
 681 |         return mExt_pm3_mbox_loc_perm_layer.get();
 682 |     }
 683 |     else if (!strcmp(layerName, "ext/pm3_mbox_conf_perm"))
 684 |     {
 685 |         assert(mExt_pm3_mbox_conf_perm_layer.get() == nullptr);
 686 |         mExt_pm3_mbox_conf_perm_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 687 |                 (createSSDPermutePlugin(serialData, serialLength), nvPluginDeleter);
 688 |         return mExt_pm3_mbox_conf_perm_layer.get();
 689 |     }
 690 |      else if (!strcmp(layerName, "ext/pm4_mbox_loc_perm"))
 691 |     {
 692 |         assert(mExt_pm4_mbox_loc_perm_layer.get() == nullptr);
 693 |         mExt_pm4_mbox_loc_perm_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 694 |                 (createSSDPermutePlugin(serialData, serialLength), nvPluginDeleter);
 695 |         return mExt_pm4_mbox_loc_perm_layer.get();
 696 |     }
 697 |     else if (!strcmp(layerName, "ext/pm4_mbox_conf_perm"))
 698 |     {
 699 |         assert(mExt_pm4_mbox_conf_perm_layer.get() == nullptr);
 700 |         mExt_pm4_mbox_conf_perm_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 701 |                 (createSSDPermutePlugin(serialData, serialLength), nvPluginDeleter);
 702 |         return mExt_pm4_mbox_conf_perm_layer.get();
 703 |     }
 704 |      else if (!strcmp(layerName, "ext/pm5_mbox_loc_perm"))
 705 |     {
 706 |         assert(mExt_pm5_mbox_loc_perm_layer.get() == nullptr);
 707 |         mExt_pm5_mbox_loc_perm_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 708 |                 (createSSDPermutePlugin(serialData, serialLength), nvPluginDeleter);
 709 |         return mExt_pm5_mbox_loc_perm_layer.get();
 710 |     }
 711 |     else if (!strcmp(layerName, "ext/pm5_mbox_conf_perm"))
 712 |     {
 713 |         assert(mExt_pm5_mbox_conf_perm_layer.get() == nullptr);
 714 |         mExt_pm5_mbox_conf_perm_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 715 |                 (createSSDPermutePlugin(serialData, serialLength), nvPluginDeleter);
 716 |         return mExt_pm5_mbox_conf_perm_layer.get();
 717 |     }
 718 |      else if (!strcmp(layerName, "ext/pm6_mbox_loc_perm"))
 719 |     {
 720 |         assert(mExt_pm6_mbox_loc_perm_layer.get() == nullptr);
 721 |         mExt_pm6_mbox_loc_perm_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 722 |                 (createSSDPermutePlugin(serialData, serialLength), nvPluginDeleter);
 723 |         return mExt_pm6_mbox_loc_perm_layer.get();
 724 |     }
 725 |     else if (!strcmp(layerName, "ext/pm6_mbox_conf_perm"))
 726 |     {
 727 |         assert(mExt_pm6_mbox_conf_perm_layer.get() == nullptr);
 728 |         mExt_pm6_mbox_conf_perm_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 729 |                 (createSSDPermutePlugin(serialData, serialLength), nvPluginDeleter);
 730 |         return mExt_pm6_mbox_conf_perm_layer.get();
 731 |     }
 732 |      else if (!strcmp(layerName, "ext/pm1_mbox_priorbox"))
 733 |     {
 734 |         assert(mExt_pm1_mbox_priorbox_layer.get() == nullptr);
 735 |         float min_size = 30.3999996185, max_size = 60.7999992371, aspect_ratio[3] = {1.0, 2.0, 3.0}; //aspect_ratio[2] = {1.0, 2.0}; 
 736 |         mExt_pm1_mbox_priorbox_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 737 |                 (createSSDPriorBoxPlugin(serialData, serialLength), nvPluginDeleter);
 738 |         return mExt_pm1_mbox_priorbox_layer.get();
 739 |     }
 740 |       else if (!strcmp(layerName, "ext/pm2_mbox_priorbox"))
 741 |     {
 742 |         assert(mExt_pm2_mbox_priorbox_layer.get() == nullptr);
 743 |         float min_size = 60.7999992371, max_size = 112.480003357, aspect_ratio[3] = {1.0, 2.0, 3.0}; //aspect_ratio[2] = {1.0, 2.0}; 
 744 |         mExt_pm2_mbox_priorbox_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 745 |                 (createSSDPriorBoxPlugin(serialData, serialLength), nvPluginDeleter);
 746 |         return mExt_pm2_mbox_priorbox_layer.get();
 747 |     }
 748 | 
 749 |       else if (!strcmp(layerName, "ext/pm3_mbox_priorbox"))
 750 |     {
 751 |         assert(mExt_pm3_mbox_priorbox_layer.get() == nullptr);
 752 |         float min_size = 112.480003357, max_size = 164.160003662, aspect_ratio[3] = {1.0, 2.0, 3.0}; //aspect_ratio[2] = {1.0, 2.0}; 
 753 |         mExt_pm3_mbox_priorbox_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 754 |                 (createSSDPriorBoxPlugin(serialData, serialLength), nvPluginDeleter);
 755 |         return mExt_pm3_mbox_priorbox_layer.get();
 756 |     }
 757 | 
 758 |       else if (!strcmp(layerName, "ext/pm4_mbox_priorbox"))
 759 |     {
 760 |         assert(mExt_pm4_mbox_priorbox_layer.get() == nullptr);
 761 |         float min_size = 164.160003662, max_size = 215.839996338, aspect_ratio[3] = {1.0, 2.0, 3.0}; //aspect_ratio[2] = {1.0, 2.0}; 
 762 |         mExt_pm4_mbox_priorbox_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 763 |                 (createSSDPriorBoxPlugin(serialData, serialLength), nvPluginDeleter);
 764 |         return mExt_pm4_mbox_priorbox_layer.get();
 765 |     }
 766 | 
 767 |       else if (!strcmp(layerName, "ext/pm5_mbox_priorbox"))
 768 |     {
 769 |         assert(mExt_pm5_mbox_priorbox_layer.get() == nullptr);
 770 |         float min_size = 215.839996338, max_size = 267.519989014, aspect_ratio[3] = {1.0, 2.0, 3.0}; //aspect_ratio[2] = {1.0, 2.0}; 
 771 |         mExt_pm5_mbox_priorbox_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 772 |                 (createSSDPriorBoxPlugin(serialData, serialLength), nvPluginDeleter);
 773 |         return mExt_pm5_mbox_priorbox_layer.get();
 774 |     }
 775 | 
 776 |       else if (!strcmp(layerName, "ext/pm6_mbox_priorbox"))
 777 |     {
 778 |         assert(mExt_pm6_mbox_priorbox_layer.get() == nullptr);
 779 |         float min_size = 267.519989014, max_size = 319.200012207, aspect_ratio[3] = {1.0, 2.0, 3.0}; //aspect_ratio[2] = {1.0, 2.0}; 
 780 |         mExt_pm6_mbox_priorbox_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 781 |                 (createSSDPriorBoxPlugin(serialData, serialLength), nvPluginDeleter);
 782 |         return mExt_pm6_mbox_priorbox_layer.get();
 783 |     }
 784 | 
 785 |     else if (!strcmp(layerName, "stem/concat"))
 786 |     {
 787 |         assert(mStem_concat_layer.get() == nullptr);
 788 |         mStem_concat_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 789 |                 (createConcatPlugin(serialData, serialLength), nvPluginDeleter);
 790 |         return mStem_concat_layer.get();
 791 |     }
 792 | 
 793 |     else if (!strcmp(layerName, "stage1_1/concat"))
 794 |     {
 795 |         assert(mStage1_1_concat_layer.get() == nullptr);
 796 |         mStage1_1_concat_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 797 |                 (createConcatPlugin(serialData, serialLength), nvPluginDeleter);
 798 |         return mStage1_1_concat_layer.get();
 799 |     }
 800 |     else if (!strcmp(layerName, "stage1_2/concat"))
 801 |     {
 802 |         assert(mStage1_2_concat_layer.get() == nullptr);
 803 |         mStage1_2_concat_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 804 |                 (createConcatPlugin(serialData, serialLength), nvPluginDeleter);
 805 |         return mStage1_2_concat_layer.get();
 806 |     }
 807 |     else if (!strcmp(layerName, "stage1_3/concat"))
 808 |     {
 809 |         assert(mStage1_3_concat_layer.get() == nullptr);
 810 |         mStage1_3_concat_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 811 |                 (createConcatPlugin(serialData, serialLength), nvPluginDeleter);
 812 |         return mStage1_3_concat_layer.get();
 813 |     }
 814 | 
 815 |     else if (!strcmp(layerName, "stage2_1/concat"))
 816 |     {
 817 |         assert(mStage2_1_concat_layer.get() == nullptr);
 818 |         mStage2_1_concat_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 819 |                 (createConcatPlugin(serialData, serialLength), nvPluginDeleter);
 820 |         return mStage2_1_concat_layer.get();
 821 |     }
 822 |     else if (!strcmp(layerName, "stage2_2/concat"))
 823 |     {
 824 |         assert(mStage2_2_concat_layer.get() == nullptr);
 825 |         mStage2_2_concat_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 826 |                 (createConcatPlugin(serialData, serialLength), nvPluginDeleter);
 827 |         return mStage2_2_concat_layer.get();
 828 |     }
 829 |     else if (!strcmp(layerName, "stage2_3/concat"))
 830 |     {
 831 |         assert(mStage2_3_concat_layer.get() == nullptr);
 832 |         mStage2_3_concat_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 833 |                 (createConcatPlugin(serialData, serialLength), nvPluginDeleter);
 834 |         return mStage2_3_concat_layer.get();
 835 |     }
 836 |      else if (!strcmp(layerName, "stage2_4/concat"))
 837 |     {
 838 |         assert(mStage2_4_concat_layer.get() == nullptr);
 839 |         mStage2_4_concat_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 840 |                 (createConcatPlugin(serialData, serialLength), nvPluginDeleter);
 841 |         return mStage2_4_concat_layer.get();
 842 |     }
 843 | 
 844 |       else if (!strcmp(layerName, "stage3_1/concat"))
 845 |     {
 846 |         assert(mStage3_1_concat_layer.get() == nullptr);
 847 |         mStage3_1_concat_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 848 |                 (createConcatPlugin(serialData, serialLength), nvPluginDeleter);
 849 |         return mStage3_1_concat_layer.get();
 850 |     }
 851 |     else if (!strcmp(layerName, "stage3_2/concat"))
 852 |     {
 853 |         assert(mStage3_2_concat_layer.get() == nullptr);
 854 |         mStage3_2_concat_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 855 |                 (createConcatPlugin(serialData, serialLength), nvPluginDeleter);
 856 |         return mStage3_2_concat_layer.get();
 857 |     }
 858 |     else if (!strcmp(layerName, "stage3_3/concat"))
 859 |     {
 860 |         assert(mStage3_3_concat_layer.get() == nullptr);
 861 |         mStage3_3_concat_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 862 |                 (createConcatPlugin(serialData, serialLength), nvPluginDeleter);
 863 |         return mStage3_3_concat_layer.get();
 864 |     }
 865 |      else if (!strcmp(layerName, "stage3_4/concat"))
 866 |     {
 867 |         assert(mStage3_4_concat_layer.get() == nullptr);
 868 |         mStage3_4_concat_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 869 |                 (createConcatPlugin(serialData, serialLength), nvPluginDeleter);
 870 |         return mStage3_4_concat_layer.get();
 871 |     }
 872 |      else if (!strcmp(layerName, "stage3_5/concat"))
 873 |     {
 874 |         assert(mStage3_5_concat_layer.get() == nullptr);
 875 |         mStage3_5_concat_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 876 |                 (createConcatPlugin(serialData, serialLength), nvPluginDeleter);
 877 |         return mStage3_5_concat_layer.get();
 878 |     }
 879 |      else if (!strcmp(layerName, "stage3_6/concat"))
 880 |     {
 881 |         assert(mStage3_6_concat_layer.get() == nullptr);
 882 |         mStage3_6_concat_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 883 |                 (createConcatPlugin(serialData, serialLength), nvPluginDeleter);
 884 |         return mStage3_6_concat_layer.get();
 885 |     }
 886 |      else if (!strcmp(layerName, "stage3_7/concat"))
 887 |     {
 888 |         assert(mStage3_7_concat_layer.get() == nullptr);
 889 |         mStage3_7_concat_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 890 |                 (createConcatPlugin(serialData, serialLength), nvPluginDeleter);
 891 |         return mStage3_7_concat_layer.get();
 892 |     }
 893 |      else if (!strcmp(layerName, "stage3_8/concat"))
 894 |     {
 895 |         assert(mStage3_8_concat_layer.get() == nullptr);
 896 |         mStage3_8_concat_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 897 |                 (createConcatPlugin(serialData, serialLength), nvPluginDeleter);
 898 |         return mStage3_8_concat_layer.get();
 899 |     }
 900 | 
 901 |       else if (!strcmp(layerName, "stage4_1/concat"))
 902 |     {
 903 |         assert(mStage4_1_concat_layer.get() == nullptr);
 904 |         mStage4_1_concat_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 905 |                 (createConcatPlugin(serialData, serialLength), nvPluginDeleter);
 906 |         return mStage4_1_concat_layer.get();
 907 |     }
 908 |     else if (!strcmp(layerName, "stage4_2/concat"))
 909 |     {
 910 |         assert(mStage4_2_concat_layer.get() == nullptr);
 911 |         mStage4_2_concat_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 912 |                 (createConcatPlugin(serialData, serialLength), nvPluginDeleter);
 913 |         return mStage4_2_concat_layer.get();
 914 |     }
 915 |     else if (!strcmp(layerName, "stage4_3/concat"))
 916 |     {
 917 |         assert(mStage4_3_concat_layer.get() == nullptr);
 918 |         mStage4_3_concat_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 919 |                 (createConcatPlugin(serialData, serialLength), nvPluginDeleter);
 920 |         return mStage4_3_concat_layer.get();
 921 |     }
 922 |      else if (!strcmp(layerName, "stage4_4/concat"))
 923 |     {
 924 |         assert(mStage4_4_concat_layer.get() == nullptr);
 925 |         mStage4_4_concat_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 926 |                 (createConcatPlugin(serialData, serialLength), nvPluginDeleter);
 927 |         return mStage4_4_concat_layer.get();
 928 |     }
 929 |      else if (!strcmp(layerName, "stage4_5/concat"))
 930 |     {
 931 |         assert(mStage4_5_concat_layer.get() == nullptr);
 932 |         mStage4_5_concat_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 933 |                 (createConcatPlugin(serialData, serialLength), nvPluginDeleter);
 934 |         return mStage4_5_concat_layer.get();
 935 |     }
 936 |      else if (!strcmp(layerName, "stage4_6/concat"))
 937 |     {
 938 |         assert(mStage4_6_concat_layer.get() == nullptr);
 939 |         mStage4_6_concat_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 940 |                 (createConcatPlugin(serialData, serialLength), nvPluginDeleter);
 941 |         return mStage4_6_concat_layer.get();
 942 |     }
 943 |     else if (!strcmp(layerName, "mbox_priorbox"))
 944 |     {
 945 |         assert(mBox_priorbox_layer.get() == nullptr);
 946 |         mBox_priorbox_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 947 |                 (createConcatPlugin(serialData, serialLength), nvPluginDeleter);
 948 |         return mBox_priorbox_layer.get();
 949 |     }
 950 | 
 951 |     else if (!strcmp(layerName, "mbox_loc"))
 952 |     {
 953 |         assert(mBox_loc_layer.get() == nullptr);
 954 |         mBox_loc_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 955 |                 (createConcatPlugin(serialData, serialLength), nvPluginDeleter);
 956 |         return mBox_loc_layer.get();
 957 |     }
 958 |     else if (!strcmp(layerName, "mbox_conf"))
 959 |     {
 960 |         assert(mBox_conf_layer.get() == nullptr);
 961 |         mBox_conf_layer = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
 962 |                 (createConcatPlugin(serialData, serialLength), nvPluginDeleter);
 963 |         return mBox_conf_layer.get();
 964 |     }
 965 | 
 966 |         //flatten
 967 |     else if (!strcmp(layerName, "ext/pm1_mbox_loc_flat"))
 968 |     {
 969 |         assert(mExt_pm1_mbox_loc_flat_layer.get() == nullptr);
 970 |         mExt_pm1_mbox_loc_flat_layer = std::unique_ptr<FlattenLayer>(new FlattenLayer(serialData, serialLength));
 971 |         return mExt_pm1_mbox_loc_flat_layer.get();
 972 |     }
 973 |     else if (!strcmp(layerName, "ext/pm1_mbox_conf_flat"))
 974 |     {
 975 |         assert(mExt_pm1_mbox_conf_flat_layer.get() == nullptr);
 976 |         mExt_pm1_mbox_conf_flat_layer = std::unique_ptr<FlattenLayer>(new FlattenLayer(serialData, serialLength));
 977 |         return mExt_pm1_mbox_conf_flat_layer.get();
 978 |     }
 979 |      else if (!strcmp(layerName, "ext/pm2_mbox_loc_flat"))
 980 |     {
 981 |         assert(mExt_pm2_mbox_loc_flat_layer.get() == nullptr);
 982 |         mExt_pm2_mbox_loc_flat_layer = std::unique_ptr<FlattenLayer>(new FlattenLayer(serialData, serialLength));
 983 |         return mExt_pm2_mbox_loc_flat_layer.get();
 984 |     }
 985 |     else if (!strcmp(layerName, "ext/pm2_mbox_conf_flat"))
 986 |     {
 987 |         assert(mExt_pm2_mbox_conf_flat_layer.get() == nullptr);
 988 |         mExt_pm2_mbox_conf_flat_layer = std::unique_ptr<FlattenLayer>(new FlattenLayer(serialData, serialLength));
 989 |         return mExt_pm2_mbox_conf_flat_layer.get();
 990 |     }
 991 |      else if (!strcmp(layerName, "ext/pm3_mbox_loc_flat"))
 992 |     {
 993 |         assert(mExt_pm3_mbox_loc_flat_layer.get() == nullptr);
 994 |         mExt_pm3_mbox_loc_flat_layer = std::unique_ptr<FlattenLayer>(new FlattenLayer(serialData, serialLength));
 995 |         return mExt_pm3_mbox_loc_flat_layer.get();
 996 |     }
 997 |     else if (!strcmp(layerName, "ext/pm3_mbox_conf_flat"))
 998 |     {
 999 |         assert(mExt_pm3_mbox_conf_flat_layer.get() == nullptr);
1000 |         mExt_pm3_mbox_conf_flat_layer = std::unique_ptr<FlattenLayer>(new FlattenLayer(serialData, serialLength));
1001 |         return mExt_pm3_mbox_conf_flat_layer.get();
1002 |     }
1003 |      else if (!strcmp(layerName, "ext/pm4_mbox_loc_flat"))
1004 |     {
1005 |         assert(mExt_pm4_mbox_loc_flat_layer.get() == nullptr);
1006 |         mExt_pm4_mbox_loc_flat_layer = std::unique_ptr<FlattenLayer>(new FlattenLayer(serialData, serialLength));
1007 |         return mExt_pm4_mbox_loc_flat_layer.get();
1008 |     }
1009 |     else if (!strcmp(layerName, "ext/pm4_mbox_conf_flat"))
1010 |     {
1011 |         assert(mExt_pm4_mbox_conf_flat_layer.get() == nullptr);
1012 |         mExt_pm4_mbox_conf_flat_layer = std::unique_ptr<FlattenLayer>(new FlattenLayer(serialData, serialLength));
1013 |         return mExt_pm4_mbox_conf_flat_layer.get();
1014 |     }
1015 |      else if (!strcmp(layerName, "ext/pm5_mbox_loc_flat"))
1016 |     {
1017 |         assert(mExt_pm5_mbox_loc_flat_layer.get() == nullptr);
1018 |         mExt_pm5_mbox_loc_flat_layer = std::unique_ptr<FlattenLayer>(new FlattenLayer(serialData, serialLength));
1019 |         return mExt_pm5_mbox_loc_flat_layer.get();
1020 |     }
1021 |     else if (!strcmp(layerName, "ext/pm5_mbox_conf_flat"))
1022 |     {
1023 |         assert(mExt_pm5_mbox_conf_flat_layer.get() == nullptr);
1024 |         mExt_pm5_mbox_conf_flat_layer = std::unique_ptr<FlattenLayer>(new FlattenLayer(serialData, serialLength));
1025 |         return mExt_pm5_mbox_conf_flat_layer.get();
1026 |     }
1027 |      else if (!strcmp(layerName, "ext/pm6_mbox_loc_flat"))
1028 |     {
1029 |         assert(mExt_pm6_mbox_loc_flat_layer.get() == nullptr);
1030 |         mExt_pm6_mbox_loc_flat_layer = std::unique_ptr<FlattenLayer>(new FlattenLayer(serialData, serialLength));
1031 |         return mExt_pm6_mbox_loc_flat_layer.get();
1032 |     }
1033 |     else if (!strcmp(layerName, "ext/pm6_mbox_conf_flat"))
1034 |     {
1035 |         assert(mExt_pm6_mbox_conf_flat_layer.get() == nullptr);
1036 |         mExt_pm6_mbox_conf_flat_layer = std::unique_ptr<FlattenLayer>(new FlattenLayer(serialData, serialLength));
1037 |         return mExt_pm6_mbox_conf_flat_layer.get();
1038 |     }
1039 |    
1040 |     else if (!strcmp(layerName, "mbox_conf_flatten"))
1041 |     {
1042 |         assert(mMbox_conf_flat_layer.get() == nullptr);
1043 |         mMbox_conf_flat_layer = std::unique_ptr<FlattenLayer>(new FlattenLayer(serialData, serialLength));
1044 |         return mMbox_conf_flat_layer.get();
1045 |     }
1046 | 
1047 | 
1048 |     else if (!strcmp(layerName, "mbox_conf_reshape"))
1049 |     {
1050 |         assert(mMbox_conf_reshape.get() == nullptr);
1051 |        // assert(nbWeights == 0 && weights == nullptr);
1052 |         mMbox_conf_reshape = std::unique_ptr<Reshape<11>>(new Reshape<11>(serialData, serialLength));
1053 |         return mMbox_conf_reshape.get();
1054 |     }
1055 |     //softmax layer
1056 |     else if (!strcmp(layerName, "mbox_conf_softmax"))
1057 |     {
1058 |         assert( mPluginSoftmax == nullptr);
1059 |        
1060 |         mPluginSoftmax = std::unique_ptr<SoftmaxPlugin>(new SoftmaxPlugin(serialData, serialLength));
1061 |         return mPluginSoftmax.get();
1062 |     }
1063 |     else if (!strcmp(layerName, "detection_out"))
1064 |     {
1065 |         assert(mDetection_out.get() == nullptr);
1066 |         //tensor rt 3.0 
1067 |         //mDetection_out = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>(createSSDDetectionOutputPlugin({true, false, 0, 21, 400, 200, 0.5, 0.45, CodeType_t::CENTER_SIZE}), nvPluginDeleter);
1068 |         //tensor rt 5
1069 |        
1070 | 
1071 |         mDetection_out = std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)>
1072 |                 (createSSDDetectionOutputPlugin(serialData, serialLength), nvPluginDeleter);
1073 |         return mDetection_out.get();
1074 |     }
1075 |     else
1076 |     {
1077 |         assert(0);
1078 |         return nullptr;
1079 |     }
1080 | }
1081 | 
1082 | bool PluginFactory::isPlugin(const char* name)
1083 | {
1084 |     return (!strcmp(name, "ext/pm1_mbox_loc_perm")
1085 |             || !strcmp(name, "ext/pm1_mbox_conf_perm")
1086 |             || !strcmp(name, "ext/pm2_mbox_loc_perm")
1087 |             || !strcmp(name, "ext/pm2_mbox_conf_perm") 
1088 |             || !strcmp(name, "ext/pm3_mbox_loc_perm")
1089 |             || !strcmp(name, "ext/pm3_mbox_conf_perm") 
1090 |             || !strcmp(name, "ext/pm4_mbox_loc_perm")
1091 |             || !strcmp(name, "ext/pm4_mbox_conf_perm") 
1092 |             || !strcmp(name, "ext/pm5_mbox_loc_perm")
1093 |             || !strcmp(name, "ext/pm5_mbox_conf_perm") 
1094 |             || !strcmp(name, "ext/pm6_mbox_loc_perm")
1095 |             || !strcmp(name, "ext/pm6_mbox_conf_perm") 
1096 |             || !strcmp(name, "ext/pm1_mbox_priorbox") 
1097 |             || !strcmp(name, "ext/pm2_mbox_priorbox") 
1098 |             || !strcmp(name, "ext/pm3_mbox_priorbox") 
1099 |             || !strcmp(name, "ext/pm4_mbox_priorbox") 
1100 |             || !strcmp(name, "ext/pm5_mbox_priorbox") 
1101 |             || !strcmp(name, "ext/pm6_mbox_priorbox") 
1102 |             || !strcmp(name, "stem/concat") 
1103 |             || !strcmp(name, "stage1_1/concat") 
1104 |             || !strcmp(name, "stage1_2/concat") 
1105 |             || !strcmp(name, "stage1_3/concat") 
1106 |             || !strcmp(name, "stage2_1/concat") 
1107 |             || !strcmp(name, "stage2_2/concat") 
1108 |             || !strcmp(name, "stage2_3/concat") 
1109 |             || !strcmp(name, "stage2_4/concat") 
1110 |             || !strcmp(name, "stage3_1/concat") 
1111 |             || !strcmp(name, "stage3_2/concat") 
1112 |             || !strcmp(name, "stage3_3/concat") 
1113 |             || !strcmp(name, "stage3_4/concat") 
1114 |             || !strcmp(name, "stage3_5/concat") 
1115 |             || !strcmp(name, "stage3_6/concat") 
1116 |             || !strcmp(name, "stage3_7/concat") 
1117 |             || !strcmp(name, "stage3_8/concat") 
1118 |             || !strcmp(name, "stage4_1/concat") 
1119 |             || !strcmp(name, "stage4_2/concat") 
1120 |             || !strcmp(name, "stage4_3/concat") 
1121 |             || !strcmp(name, "stage4_4/concat") 
1122 |             || !strcmp(name, "stage4_5/concat") 
1123 |             || !strcmp(name, "stage4_6/concat") 
1124 |             || !strcmp(name, "mbox_loc")
1125 |             || !strcmp(name, "mbox_conf")
1126 |             || !strcmp(name, "ext/pm1_mbox_loc_flat")
1127 |             || !strcmp(name, "ext/pm1_mbox_conf_flat")
1128 |             || !strcmp(name, "ext/pm2_mbox_loc_flat")
1129 |             || !strcmp(name, "ext/pm2_mbox_conf_flat")
1130 |             || !strcmp(name, "ext/pm3_mbox_loc_flat")
1131 |             || !strcmp(name, "ext/pm3_mbox_conf_flat")
1132 |             || !strcmp(name, "ext/pm4_mbox_loc_flat")
1133 |             || !strcmp(name, "ext/pm4_mbox_conf_flat")
1134 |             || !strcmp(name, "ext/pm5_mbox_loc_flat")
1135 |             || !strcmp(name, "ext/pm5_mbox_conf_flat")
1136 |             || !strcmp(name, "ext/pm6_mbox_loc_flat")
1137 |             || !strcmp(name, "ext/pm6_mbox_conf_flat")
1138 |             || !strcmp(name, "mbox_conf_reshape")
1139 |             || !strcmp(name, "mbox_conf_flatten")
1140 |             || !strcmp(name, "mbox_loc")
1141 |             || !strcmp(name, "mbox_conf")
1142 |             || !strcmp(name, "mbox_priorbox")
1143 |             || !strcmp(name, "detection_out")
1144 |             || !strcmp(name, "mbox_conf_softmax"));
1145 | 
1146 | 
1147 | }
1148 | 
1149 | void PluginFactory::destroyPlugin()
1150 | {
1151 |    
1152 | 
1153 |     mExt_pm1_mbox_loc_perm_layer.release();
1154 |     mExt_pm1_mbox_conf_perm_layer.release();
1155 |     mExt_pm2_mbox_loc_perm_layer.release();
1156 |     mExt_pm2_mbox_conf_perm_layer.release();
1157 |     mExt_pm3_mbox_loc_perm_layer.release();
1158 |     mExt_pm3_mbox_conf_perm_layer.release();
1159 |     mExt_pm4_mbox_loc_perm_layer.release();
1160 |     mExt_pm4_mbox_conf_perm_layer.release();
1161 |     mExt_pm5_mbox_loc_perm_layer.release();
1162 |     mExt_pm5_mbox_conf_perm_layer.release();
1163 |     mExt_pm6_mbox_loc_perm_layer.release();
1164 |     mExt_pm6_mbox_conf_perm_layer.release();
1165 | 
1166 |     mExt_pm1_mbox_priorbox_layer.release();
1167 |     mExt_pm2_mbox_priorbox_layer.release();
1168 |     mExt_pm3_mbox_priorbox_layer.release();
1169 |     mExt_pm4_mbox_priorbox_layer.release();
1170 |     mExt_pm5_mbox_priorbox_layer.release();
1171 |     mExt_pm6_mbox_priorbox_layer.release();
1172 | 
1173 |     mStem_concat_layer.release();
1174 |     mStage1_1_concat_layer.release(); 
1175 |     mStage1_2_concat_layer.release();
1176 |     mStage1_3_concat_layer.release();
1177 | 
1178 |     mStage2_1_concat_layer.release(); 
1179 |     mStage2_2_concat_layer.release();
1180 |     mStage2_3_concat_layer.release();
1181 |     mStage2_4_concat_layer.release();
1182 | 
1183 | 
1184 |     mStage3_1_concat_layer.release();
1185 |     mStage3_2_concat_layer.release();
1186 |     mStage3_3_concat_layer.release();
1187 |     mStage3_4_concat_layer.release();
1188 |     mStage3_5_concat_layer.release(); 
1189 |     mStage3_6_concat_layer.release();
1190 |     mStage3_7_concat_layer.release();
1191 |     mStage3_8_concat_layer.release();
1192 | 
1193 | 
1194 |     mStage4_1_concat_layer.release(); 
1195 |     mStage4_2_concat_layer.release();
1196 |     mStage4_3_concat_layer.release();
1197 |     mStage4_4_concat_layer.release();
1198 |     mStage4_5_concat_layer.release();
1199 |     mStage4_6_concat_layer.release();
1200 | 
1201 | 
1202 |     mExt_pm1_mbox_loc_perm_layer= nullptr;
1203 |     mExt_pm1_mbox_conf_perm_layer= nullptr;
1204 |     mExt_pm2_mbox_loc_perm_layer= nullptr;
1205 |     mExt_pm2_mbox_conf_perm_layer= nullptr;
1206 |     mExt_pm3_mbox_loc_perm_layer= nullptr;
1207 |     mExt_pm3_mbox_conf_perm_layer = nullptr;
1208 |     mExt_pm4_mbox_loc_perm_layer= nullptr;
1209 |     mExt_pm4_mbox_conf_perm_layer= nullptr;
1210 |     mExt_pm5_mbox_loc_perm_layer= nullptr;
1211 |     mExt_pm5_mbox_conf_perm_layer= nullptr;
1212 |     mExt_pm6_mbox_loc_perm_layer= nullptr;
1213 |     mExt_pm6_mbox_conf_perm_layer= nullptr;
1214 | 
1215 |     mExt_pm1_mbox_priorbox_layer= nullptr;
1216 |     mExt_pm2_mbox_priorbox_layer= nullptr;
1217 |     mExt_pm3_mbox_priorbox_layer= nullptr;
1218 |     mExt_pm4_mbox_priorbox_layer= nullptr;
1219 |     mExt_pm5_mbox_priorbox_layer= nullptr;
1220 |     mExt_pm6_mbox_priorbox_layer= nullptr;
1221 | 
1222 |     mStem_concat_layer= nullptr;
1223 |     mStage1_1_concat_layer = nullptr;
1224 |     mStage1_2_concat_layer= nullptr;
1225 |     mStage1_3_concat_layer= nullptr;
1226 | 
1227 |     mStage2_1_concat_layer = nullptr;
1228 |     mStage2_2_concat_layer= nullptr;
1229 |     mStage2_3_concat_layer= nullptr;
1230 |     mStage2_4_concat_layer= nullptr;
1231 | 
1232 | 
1233 |     mStage3_1_concat_layer = nullptr;
1234 |     mStage3_2_concat_layer= nullptr;
1235 |     mStage3_3_concat_layer= nullptr;
1236 |     mStage3_4_concat_layer= nullptr;
1237 |     mStage3_5_concat_layer = nullptr;
1238 |     mStage3_6_concat_layer= nullptr;
1239 |     mStage3_7_concat_layer= nullptr;
1240 |     mStage3_8_concat_layer= nullptr;
1241 | 
1242 | 
1243 |     mStage4_1_concat_layer = nullptr;
1244 |     mStage4_2_concat_layer= nullptr;
1245 |     mStage4_3_concat_layer= nullptr;
1246 |     mStage4_4_concat_layer= nullptr;
1247 |     mStage4_5_concat_layer = nullptr;
1248 |     mStage4_6_concat_layer= nullptr;
1249 |     
1250 |     mBox_priorbox_layer.release();
1251 |     mBox_priorbox_layer = nullptr;
1252 |     mBox_loc_layer.release();
1253 |     mBox_loc_layer = nullptr;
1254 |     mBox_conf_layer.release();
1255 |     mBox_conf_layer = nullptr;
1256 | 
1257 |     mExt_pm1_mbox_loc_flat_layer.release();
1258 |     mExt_pm1_mbox_conf_flat_layer.release();
1259 |     mExt_pm2_mbox_loc_flat_layer.release();
1260 |     mExt_pm2_mbox_conf_flat_layer.release();
1261 |     mExt_pm3_mbox_loc_flat_layer.release();
1262 |     mExt_pm3_mbox_conf_flat_layer.release();
1263 |     mExt_pm4_mbox_loc_flat_layer.release();
1264 |     mExt_pm4_mbox_conf_flat_layer.release();
1265 |     mExt_pm5_mbox_loc_flat_layer.release();
1266 |     mExt_pm5_mbox_conf_flat_layer.release();
1267 |     mExt_pm6_mbox_loc_flat_layer.release();
1268 |     mExt_pm6_mbox_conf_flat_layer.release();
1269 | 
1270 |     mExt_pm1_mbox_loc_flat_layer= nullptr;
1271 |     mExt_pm1_mbox_conf_flat_layer= nullptr;
1272 |     mExt_pm2_mbox_loc_flat_layer= nullptr;
1273 |     mExt_pm2_mbox_conf_flat_layer= nullptr;
1274 |     mExt_pm3_mbox_loc_flat_layer= nullptr;
1275 |     mExt_pm3_mbox_conf_flat_layer= nullptr;
1276 |     mExt_pm4_mbox_loc_flat_layer= nullptr;
1277 |     mExt_pm4_mbox_conf_flat_layer= nullptr;
1278 |     mExt_pm5_mbox_loc_flat_layer= nullptr;
1279 |     mExt_pm5_mbox_conf_flat_layer= nullptr;
1280 |     mExt_pm6_mbox_loc_flat_layer= nullptr;
1281 |     mExt_pm6_mbox_conf_flat_layer= nullptr;
1282 | 
1283 |     mMbox_conf_flat_layer.release();
1284 |     mMbox_conf_flat_layer = nullptr;
1285 |     mMbox_conf_reshape.release();
1286 |     mMbox_conf_reshape = nullptr;
1287 |     mPluginSoftmax.release();
1288 |     mPluginSoftmax = nullptr;
1289 |     mDetection_out.release();
1290 |     mDetection_out = nullptr;
1291 | }
1292 | 


--------------------------------------------------------------------------------
/pluginImplement.h:
--------------------------------------------------------------------------------
  1 | #ifndef __PLUGIN_LAYER_H__
  2 | #define __PLUGIN_LAYER_H__
  3 | #include <memory>
  4 | 
  5 | #include <cassert>
  6 | #include <iostream>
  7 | #include <cudnn.h>
  8 | #include <cstring>
  9 | #include <cuda_runtime.h>
 10 | #include <cublas_v2.h>
 11 | 
 12 | #include "NvCaffeParser.h"
 13 | #include "NvInfer.h"
 14 | #include "NvInferPlugin.h"
 15 | #include "NvUtils.h"
 16 | //#include "fp16.h"
 17 | 
 18 | #define CHECK(status)                                                                                           \
 19 |     {                                                                                                                           \
 20 |         if (status != 0)                                                                                                \
 21 |         {                                                                                                                               \
 22 |             std::cout << "Cuda failure: " << cudaGetErrorString(status) \
 23 |                       << " at line " << __LINE__                                                        \
 24 |                       << std::endl;                                                                     \
 25 |             abort();                                                                                                    \
 26 |         }                                                                                                                               \
 27 |     }
 28 | 
 29 | 
 30 | using namespace nvinfer1;
 31 | using namespace nvcaffeparser1;
 32 | using namespace plugin;
 33 | 
 34 | static const int TIMING_ITERATIONS = 1000; 
 35 | 
 36 | 
 37 | 
 38 | 
 39 | enum FunctionType
 40 | {
 41 |     SELECT=0,
 42 |     SUMMARY
 43 | };
 44 | 
 45 | void cudaSoftmax(int n, int channels,  float* x, float*y);
 46 | //void cudaSoftmax(int n, int channels, __half* x, __half* y);
 47 | 
 48 | 
 49 | 
 50 | class bboxProfile {
 51 | public:
 52 |     bboxProfile(float4& p, int idx): pos(p), bboxNum(idx) {}
 53 | 
 54 |     float4 pos;
 55 |     int bboxNum = -1;
 56 |     int labelID = -1;
 57 | 
 58 | };
 59 | 
 60 | class tagProfile
 61 | {
 62 | public:
 63 |     tagProfile(int b, int l): bboxID(b), label(l) {}
 64 |     int bboxID;
 65 |     int label;
 66 | };
 67 | 
 68 | //SSD Reshape layer : shape{0,-1,21}
 69 | template<int OutC>
 70 | // @TODO: I think the OutC is the Out Channels and it is equal to 21.
 71 | class Reshape : public IPlugin
 72 | {
 73 | public:
 74 |     Reshape()
 75 |     {
 76 |     }
 77 |     Reshape(const void* buffer, size_t size)
 78 |     {
 79 |         assert(size == sizeof(mCopySize));
 80 |         mCopySize = *reinterpret_cast<const size_t*>(buffer);
 81 |     }
 82 |     int getNbOutputs() const override
 83 |     {
 84 |         return 1;
 85 |     }
 86 |     Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override
 87 |     {
 88 |         assert(nbInputDims == 1);
 89 |         assert(index == 0);
 90 |         assert(inputs[index].nbDims == 3);
 91 |         assert((inputs[0].d[0])*(inputs[0].d[1]) % OutC == 0);
 92 | 
 93 |         // @TODO: Understood this.
 94 |         return DimsCHW( inputs[0].d[0] * inputs[0].d[1] / OutC, OutC, inputs[0].d[2]);
 95 |     }
 96 | 
 97 |     int initialize() override { return 0; }
 98 |     void terminate() override {}
 99 | 
100 |     size_t getWorkspaceSize(int) const override
101 |     {
102 |         // @TODO: 1 is the batch size.
103 |         return mCopySize*1;
104 |     }
105 | 
106 |     // currently it is not possible for a plugin to execute "in place". Therefore we memcpy the data from the input to the output buffer
107 |     int enqueue(int batchSize, const void*const *inputs, void** outputs, void* workspace, cudaStream_t stream) override
108 |     {  	
109 |    	if(mDataType == DataType::kFLOAT){ // FP32 
110 | 	CHECK(cudaMemcpyAsync(outputs[0], inputs[0] , mCopySize * batchSize, cudaMemcpyDeviceToDevice, stream)); 
111 | 	} 
112 | 	else{  //FP16 
113 | 	CHECK(cudaMemcpyAsync(
114 | 		reinterpret_cast<__half*>(outputs[0]), 
115 | 		reinterpret_cast<const __half*>(inputs[0]), mCopySize * batchSize, 
116 | 		cudaMemcpyDeviceToDevice, stream)); 
117 | 	}
118 |         //CHECK(cudaMemcpyAsync(outputs[0], inputs[0] , mCopySize * batchSize, cudaMemcpyDeviceToDevice, stream));
119 |         return 0;
120 |     }
121 |     size_t getSerializationSize() override
122 |     {
123 |         return sizeof(mCopySize);
124 |     }
125 |     void serialize(void* buffer) override
126 |     {
127 |         *reinterpret_cast<size_t*>(buffer) = mCopySize;
128 |     }
129 |     void configure(const Dims*inputs, int nbInputs, const Dims* outputs, int nbOutputs, int)	override
130 |     {
131 |         mCopySize = inputs[0].d[0] * inputs[0].d[1] * inputs[0].d[2] * sizeof(float);
132 |     }
133 | 
134 | protected:
135 |     size_t mCopySize;
136 |     DataType mDataType{DataType::kFLOAT};
137 | 
138 | };
139 | 
140 | //Softmax layer.TensorRT softmax only support cross channel
141 | class SoftmaxPlugin : public IPlugin
142 | {
143 |     //You need to implement it when softmax parameter axis is 2.
144 | public:
145 |     int initialize() override { return 0; }
146 |     inline void terminate() override {}
147 | 
148 |     SoftmaxPlugin(){}
149 |     SoftmaxPlugin( const void* buffer, size_t size)
150 |     {
151 |         assert(size == sizeof(mCopySize));
152 |         mCopySize = *reinterpret_cast<const size_t*>(buffer);
153 |     }
154 |     inline int getNbOutputs() const override
155 |     {
156 |         //@TODO:  As the number of outputs are only 1, because there is only layer in top.
157 |         return 1;
158 |     }
159 |     Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override
160 |     {
161 |         assert(nbInputDims == 1);
162 |         assert(index == 0);
163 |         assert(inputs[index].nbDims == 3);
164 | //        assert((inputs[0].d[0])*(inputs[0].d[1]) % OutC == 0);
165 | 
166 |         // @TODO: Understood this.
167 |         return DimsCHW( inputs[0].d[0] , inputs[0].d[1] , inputs[0].d[2] );
168 |     }
169 | 
170 |     size_t getWorkspaceSize(int) const override
171 |     {
172 |         // @TODO: 1 is the batch size.
173 |         return mCopySize*1;
174 |     }
175 | 
176 |     int enqueue(int batchSize, const void*const *inputs, void** outputs, void* workspace, cudaStream_t stream) override
177 |     {
178 |         //std::cout<<"flatten enqueue:"<<batchSize<<";"<< mCopySize<<std::endl;
179 | //        CHECK(cudaMemcpyAsync(outputs[0],inputs[0],batchSize*mCopySize*sizeof(float),cudaMemcpyDeviceToDevice,stream));
180 |         //@Seojin add fp16 inference code 
181 | 	//if(mDataType == DataType::kFLOAT){ //FP32 
182 | 	  cudaSoftmax( 8732*11, 11, (float *) *inputs, static_cast<float *>(*outputs));
183 | 
184 |         return 0;
185 |     }
186 | 
187 |     size_t getSerializationSize() override
188 |     {
189 |         return sizeof(mCopySize);
190 |     }
191 |     void serialize(void* buffer) override
192 |     {
193 |         *reinterpret_cast<size_t*>(buffer) = mCopySize;
194 |     }
195 |     void configure(const Dims*inputs, int nbInputs, const Dims* outputs, int nbOutputs, int)	override
196 |     {
197 |         mCopySize = inputs[0].d[0] * inputs[0].d[1] * inputs[0].d[2] * sizeof(float);
198 |     }
199 | 
200 | protected:
201 |     size_t mCopySize;
202 |     DataType mDataType{DataType::kFLOAT}; 
203 | 
204 | };
205 | 
206 | 
207 | //SSD Flatten layer
208 | class FlattenLayer : public IPlugin
209 | {
210 | public:
211 | 
212 |     FlattenLayer(){}
213 |     FlattenLayer(const void* buffer, size_t size)
214 |     {
215 |         assert(size == 3 * sizeof(int));
216 |         const int* d = reinterpret_cast<const int*>(buffer);
217 |         _size = d[0] * d[1] * d[2];
218 |         dimBottom = DimsCHW{d[0], d[1], d[2]};
219 |     }
220 | 
221 |     inline int getNbOutputs() const override { return 1; };
222 |     Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override
223 |     {
224 |         assert(1 == nbInputDims);
225 |         assert(0 == index);
226 |         assert(3 == inputs[index].nbDims);
227 |         _size = inputs[0].d[0] * inputs[0].d[1] * inputs[0].d[2];
228 |         return DimsCHW(_size, 1, 1);
229 |     }
230 | 
231 |     int initialize() override
232 |     {
233 |         return 0;
234 |     }
235 |     inline void terminate() override {}
236 | 
237 |     inline size_t getWorkspaceSize(int) const override { return 0; }
238 | 
239 |     int enqueue(int batchSize, const void*const *inputs, void** outputs, void*, cudaStream_t stream) override
240 |     {
241 |         //std::cout<<"flatten enqueue:"<<batchSize<<";"<<_size<<std::endl;
242 |         if(mDataType == DataType::kFLOAT){ //FP32 
243 | 	  CHECK(cudaMemcpyAsync(outputs[0],inputs[0],batchSize*_size*sizeof(float),cudaMemcpyDeviceToDevice,stream));
244 | 	} 
245 | 	else{ //FP16
246 | 	  CHECK(cudaMemcpyAsync(
247 | 		reinterpret_cast<__half*>(outputs[0]),
248 | 		reinterpret_cast<const __half*>(inputs[0]),
249 | 		batchSize*_size*sizeof(__half),
250 | 		cudaMemcpyDeviceToDevice,stream));
251 | 	}
252 | 
253 | 	//CHECK(cudaMemcpyAsync(outputs[0],inputs[0],batchSize*_size*sizeof(float),cudaMemcpyDeviceToDevice,stream));
254 |         return 0;
255 |     }
256 | 
257 |     size_t getSerializationSize() override
258 |     {
259 |         return 3 * sizeof(int);
260 |     }
261 | 
262 |     void serialize(void* buffer) override
263 |     {
264 |         int* d = reinterpret_cast<int*>(buffer);
265 |         d[0] = dimBottom.c(); d[1] = dimBottom.h(); d[2] = dimBottom.w();
266 |     }
267 | 
268 |     void configure(const Dims*inputs, int nbInputs, const Dims* outputs, int nbOutputs, int) override
269 |     {
270 |         dimBottom = DimsCHW(inputs[0].d[0], inputs[0].d[1], inputs[0].d[2]);
271 |     }
272 | protected: 
273 |     DataType mDataType{DataType::kFLOAT}; 
274 |     DimsCHW dimBottom;
275 |     int _size;
276 | };
277 | 
278 | 
279 | class PluginFactory : public nvinfer1::IPluginFactory, public nvcaffeparser1::IPluginFactory
280 | {
281 | public:
282 |     virtual nvinfer1::IPlugin* createPlugin(const char* layerName, const nvinfer1::Weights* weights, int nbWeights) override;
283 |     IPlugin* createPlugin(const char* layerName, const void* serialData, size_t serialLength) override;
284 | 
285 |     void(*nvPluginDeleter)(INvPlugin*) { [](INvPlugin* ptr) {ptr->destroy(); } };
286 | 
287 |     bool isPlugin(const char* name) override;
288 |     void destroyPlugin();
289 | 
290 |    
291 |     //pelee
292 |     std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)> mExt_pm1_mbox_loc_perm_layer{ nullptr, nvPluginDeleter };
293 |     std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)> mExt_pm1_mbox_conf_perm_layer{ nullptr, nvPluginDeleter };
294 |     std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)> mExt_pm2_mbox_loc_perm_layer{ nullptr, nvPluginDeleter };
295 |     std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)> mExt_pm2_mbox_conf_perm_layer{ nullptr, nvPluginDeleter };
296 |     std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)> mExt_pm3_mbox_loc_perm_layer{ nullptr, nvPluginDeleter };
297 |     std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)> mExt_pm3_mbox_conf_perm_layer{ nullptr, nvPluginDeleter };
298 |     std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)> mExt_pm4_mbox_loc_perm_layer{ nullptr, nvPluginDeleter };
299 |     std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)> mExt_pm4_mbox_conf_perm_layer{ nullptr, nvPluginDeleter };
300 |     std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)> mExt_pm5_mbox_loc_perm_layer{ nullptr, nvPluginDeleter };
301 |     std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)> mExt_pm5_mbox_conf_perm_layer{ nullptr, nvPluginDeleter };
302 |     std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)> mExt_pm6_mbox_loc_perm_layer{ nullptr, nvPluginDeleter };
303 |     std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)> mExt_pm6_mbox_conf_perm_layer{ nullptr, nvPluginDeleter };
304 | 
305 |     //pelee 
306 |     std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)> mExt_pm1_mbox_priorbox_layer{ nullptr, nvPluginDeleter };
307 |     std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)> mExt_pm2_mbox_priorbox_layer{ nullptr, nvPluginDeleter };
308 |     std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)> mExt_pm3_mbox_priorbox_layer{ nullptr, nvPluginDeleter };
309 |     std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)> mExt_pm4_mbox_priorbox_layer{ nullptr, nvPluginDeleter };
310 |     std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)> mExt_pm5_mbox_priorbox_layer{ nullptr, nvPluginDeleter };
311 |     std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)> mExt_pm6_mbox_priorbox_layer{ nullptr, nvPluginDeleter };
312 | 
313 |     //detection output layer
314 |     std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)> mDetection_out{ nullptr, nvPluginDeleter };
315 |     //pelee 
316 |     std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)> mStem_concat_layer{ nullptr, nvPluginDeleter };
317 |     std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)> mStage1_1_concat_layer{ nullptr, nvPluginDeleter };
318 |     std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)> mStage1_2_concat_layer{ nullptr, nvPluginDeleter };
319 |     std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)> mStage1_3_concat_layer{ nullptr, nvPluginDeleter };
320 |     std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)> mStage2_1_concat_layer{ nullptr, nvPluginDeleter };
321 |     std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)> mStage2_2_concat_layer{ nullptr, nvPluginDeleter };
322 |     std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)> mStage2_3_concat_layer{ nullptr, nvPluginDeleter };
323 |     std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)> mStage2_4_concat_layer{ nullptr, nvPluginDeleter };
324 |     std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)> mStage3_1_concat_layer{ nullptr, nvPluginDeleter };
325 |     std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)> mStage3_2_concat_layer{ nullptr, nvPluginDeleter };
326 |     std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)> mStage3_3_concat_layer{ nullptr, nvPluginDeleter };
327 |     std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)> mStage3_4_concat_layer{ nullptr, nvPluginDeleter };
328 |     std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)> mStage3_5_concat_layer{ nullptr, nvPluginDeleter };
329 |     std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)> mStage3_6_concat_layer{ nullptr, nvPluginDeleter };
330 |     std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)> mStage3_7_concat_layer{ nullptr, nvPluginDeleter };
331 |     std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)> mStage3_8_concat_layer{ nullptr, nvPluginDeleter };
332 |     std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)> mStage4_1_concat_layer{ nullptr, nvPluginDeleter };
333 |     std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)> mStage4_2_concat_layer{ nullptr, nvPluginDeleter };
334 |     std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)> mStage4_3_concat_layer{ nullptr, nvPluginDeleter };
335 |     std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)> mStage4_4_concat_layer{ nullptr, nvPluginDeleter };
336 |     std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)> mStage4_5_concat_layer{ nullptr, nvPluginDeleter };
337 |     std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)> mStage4_6_concat_layer{ nullptr, nvPluginDeleter }; 
338 | 
339 |      std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)> mBox_loc_layer{ nullptr, nvPluginDeleter };
340 |     std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)> mBox_conf_layer{ nullptr, nvPluginDeleter };
341 |     std::unique_ptr<INvPlugin, decltype(nvPluginDeleter)> mBox_priorbox_layer{ nullptr, nvPluginDeleter };
342 |    
343 |     //reshape layer
344 |     std::unique_ptr<Reshape<11>> mMbox_conf_reshape{ nullptr };
345 |     //flatten layers
346 |     //pelee 
347 |     std::unique_ptr<FlattenLayer> mExt_pm1_mbox_loc_flat_layer{ nullptr };
348 |     std::unique_ptr<FlattenLayer> mExt_pm1_mbox_conf_flat_layer{ nullptr };
349 |     std::unique_ptr<FlattenLayer> mExt_pm2_mbox_loc_flat_layer{ nullptr };
350 |     std::unique_ptr<FlattenLayer> mExt_pm2_mbox_conf_flat_layer{ nullptr };
351 |     std::unique_ptr<FlattenLayer> mExt_pm3_mbox_loc_flat_layer{ nullptr };
352 |     std::unique_ptr<FlattenLayer> mExt_pm3_mbox_conf_flat_layer{ nullptr };
353 |     std::unique_ptr<FlattenLayer> mExt_pm4_mbox_loc_flat_layer{ nullptr };
354 |     std::unique_ptr<FlattenLayer> mExt_pm4_mbox_conf_flat_layer{ nullptr };
355 |     std::unique_ptr<FlattenLayer> mExt_pm5_mbox_loc_flat_layer{ nullptr };
356 |     std::unique_ptr<FlattenLayer> mExt_pm5_mbox_conf_flat_layer{ nullptr };
357 |     std::unique_ptr<FlattenLayer> mExt_pm6_mbox_loc_flat_layer{ nullptr };
358 |     std::unique_ptr<FlattenLayer> mExt_pm6_mbox_conf_flat_layer{ nullptr };
359 |     std::unique_ptr<FlattenLayer> mBox_conf_flat_layer{ nullptr };
360 | 
361 | 
362 | 
363 | 
364 | 
365 | 
366 |     //softmax layer
367 |     std::unique_ptr<SoftmaxPlugin> mPluginSoftmax{ nullptr };
368 |     std::unique_ptr<FlattenLayer> mMbox_conf_flat_layer{ nullptr };
369 | 
370 | 
371 | };
372 | 
373 | #endif
374 | 


--------------------------------------------------------------------------------
/tensorNet.cpp:
--------------------------------------------------------------------------------
  1 | #include <algorithm>
  2 | #include "common.h"
  3 | #include "tensorNet.h"
  4 | #include <sstream>
  5 | #include <fstream>
  6 | 
  7 | using namespace nvinfer1;
  8 | 
  9 | 
 10 | bool TensorNet::LoadNetwork(const char* prototxt_path,
 11 |                             const char* model_path,
 12 |                             const char* input_blob,
 13 |                             const std::vector<std::string>& output_blobs,
 14 |                             uint32_t maxBatchSize)
 15 | {
 16 |     //assert( !prototxt_path || !model_path );
 17 | 
 18 |     // attempt to load network from cache before profiling with tensorRT
 19 |     std::stringstream gieModelStdStream;
 20 |     gieModelStdStream.seekg(0, gieModelStdStream.beg);
 21 |     char cache_path[512];
 22 |     sprintf(cache_path, "%s.%u.tensorcache", model_path, maxBatchSize);
 23 |     printf( "attempting to open cache file %s\n", cache_path);
 24 | 
 25 |     std::ifstream cache( cache_path );
 26 | 
 27 |     if( !cache )
 28 |     {
 29 |         printf( "cache file not found, profiling network model\n");
 30 | 
 31 |         // if( !caffeToTRTModel(prototxt_path, model_path, output_blobs, maxBatchSize, gieModelStdStream) )
 32 |         // {
 33 |         //     printf("failed to load %s\n", model_path);
 34 |         //     return 0;
 35 |         // }
 36 |         bool load = caffeToTRTModel(prototxt_path, model_path, output_blobs, maxBatchSize, gieModelStdStream);
 37 |         if(!load){
 38 |              printf("failed to load %s\n", model_path);
 39 |              return 0;
 40 |         }else{
 41 |             printf( "network profiling complete, writing cache to %s\n", cache_path);
 42 |         }
 43 |         
 44 |         std::ofstream outFile;
 45 |         outFile.open(cache_path);
 46 |         outFile << gieModelStdStream.rdbuf();
 47 |         outFile.close();
 48 |         gieModelStdStream.seekg(0, gieModelStdStream.beg);
 49 |         printf( "completed writing cache to %s\n", cache_path);
 50 | 
 51 |         infer = createInferRuntime(gLogger);
 52 |         /**
 53 |          * deserializeCudaEngine can be used to load the serialized CuDA Engine (Plan file).
 54 |          * */
 55 |         std::cout << "createInference" << std::endl;
 56 |         engine = infer->deserializeCudaEngine(gieModelStream->data(), gieModelStream->size(), &pluginFactory);
 57 |         std::cout << "createInference_end" << std::endl;
 58 |         printf("Bindings after deserializing:\n");
 59 |         for (int bi = 0; bi < engine->getNbBindings(); bi++) {
 60 |             if (engine->bindingIsInput(bi) == true) printf("Binding %d (%s): Input.\n",  bi, engine->getBindingName(bi));
 61 |             else printf("Binding %d (%s): Output.\n", bi, engine->getBindingName(bi));
 62 |         }
 63 |     }
 64 |     else
 65 |     {
 66 |         std::cout << "loading network profile from cache..." << std::endl;
 67 |         gieModelStdStream << cache.rdbuf();
 68 |         cache.close();
 69 |         gieModelStdStream.seekg(0, std::ios::end);
 70 |         const int modelSize = gieModelStdStream.tellg();
 71 |         gieModelStdStream.seekg(0, std::ios::beg);
 72 |         void* modelMem = malloc(modelSize);
 73 |         gieModelStdStream.read((char*)modelMem, modelSize);
 74 | 
 75 |         infer = createInferRuntime(gLogger);
 76 |         std::cout << "createInference" << std::endl;
 77 |         engine = infer->deserializeCudaEngine(modelMem, modelSize, &pluginFactory);
 78 |         //free(modelMem);
 79 |         std::cout << "createInference_end" << std::endl;
 80 |         printf("Bindings after deserializing:\n");
 81 |         for (int bi = 0; bi < engine->getNbBindings(); bi++) {
 82 |             if (engine->bindingIsInput(bi) == true) printf("Binding %d (%s): Input.\n",  bi, engine->getBindingName(bi));
 83 |             else printf("Binding %d (%s): Output.\n", bi, engine->getBindingName(bi));
 84 |         }
 85 |     }
 86 | }
 87 | 
 88 | bool TensorNet::caffeToTRTModel(const char* deployFile,
 89 |                                 const char* modelFile,
 90 |                                 const std::vector<std::string>& outputs,
 91 |                                 unsigned int maxBatchSize,
 92 |                                 std::ostream& gieModelStdStream)
 93 | {
 94 |     IBuilder* builder = createInferBuilder(gLogger);
 95 |     INetworkDefinition* network = builder->createNetwork();
 96 |     //    builder->setMinFindIterations(3);	// allow time for TX1 GPU to spin up
 97 |     //    builder->setAverageFindIterations(2);
 98 |     ICaffeParser* parser = createCaffeParser();
 99 |     parser->setPluginFactory(&pluginFactory);
100 |     //builder->setFp16Mode(true);
101 |     bool useFp16 = false;
102 |     //builder->platformHasFastFp16();
103 |     //@Seojin to fp16 
104 |     //useFp16 = true; 
105 | 
106 |     DataType modelDataType = useFp16 ? DataType::kHALF : DataType::kFLOAT;
107 |  
108 |     //modelDataType = DataType::kHALF;
109 | 
110 |    // std::cout << deployFile <<std::endl;
111 |    // std::cout << modelFile <<std::endl;
112 |     //std::cout << useFp16 <<std::endl;
113 | 
114 |     const IBlobNameToTensor* blobNameToTensor =	parser->parse(deployFile,
115 |                                                               modelFile,
116 |                                                               *network,
117 |                                                               modelDataType);
118 |     assert(blobNameToTensor != nullptr);
119 |     for (auto& s : outputs) network->markOutput(*blobNameToTensor->find(s.c_str()));
120 | 
121 |     builder->setMaxBatchSize(maxBatchSize);
122 |     builder->setMaxWorkspaceSize(16 << 20);
123 | 
124 |     if(useFp16)
125 |     {
126 |         builder->setHalf2Mode(true);
127 | 	std::cout <<"Use FP16 Mode:" << useFp16 <<std::endl;
128 | 
129 |     }
130 |     ICudaEngine* engine = builder->buildCudaEngine( *network );
131 |     assert(engine);
132 |     // we don't need the network any more, and we can destroy the parser
133 |     network->destroy();
134 |     parser->destroy();
135 |     // serialize the engine, then close everything down
136 |     gieModelStream = engine->serialize();
137 |     if(!gieModelStream)
138 |     {
139 |         std::cout << "failed to serialize CUDA engine" << std::endl;
140 |         return false;
141 |     }
142 |     gieModelStdStream.write((const char*)gieModelStream->data(),gieModelStream->size());
143 |     engine->destroy();
144 |     builder->destroy();
145 |     pluginFactory.destroyPlugin();
146 |     shutdownProtobufLibrary();
147 | 
148 |     std::cout << "caffeToTRTModel Finished" << std::endl;
149 |     return true;
150 | }
151 | 
152 | /**
153 |  * This function de-serializes the cuda engine.
154 |  * */
155 | void TensorNet::createInference()
156 | {
157 |     infer = createInferRuntime(gLogger);
158 |     /**
159 |      * deserializeCudaEngine can be used to load the serialized CuDA Engine (Plan file).
160 |      * */
161 |     engine = infer->deserializeCudaEngine(gieModelStream->data(), gieModelStream->size(), &pluginFactory);
162 | 
163 |     printf("Bindings after deserializing:\n");
164 |     for (int bi = 0; bi < engine->getNbBindings(); bi++) {
165 |         if (engine->bindingIsInput(bi) == true) printf("Binding %d (%s): Input.\n",  bi, engine->getBindingName(bi));
166 |         else printf("Binding %d (%s): Output.\n", bi, engine->getBindingName(bi));
167 |     }
168 | }
169 | 
170 | void TensorNet::imageInference(void** buffers, int nbBuffer, int batchSize)
171 | {
172 |     //std::cout << "Came into the image inference method here. "<<std::endl;
173 |     assert( engine->getNbBindings()==nbBuffer);
174 |     IExecutionContext* context = engine->createExecutionContext();
175 |     context->setProfiler(&gProfiler);
176 |     context->execute(batchSize, buffers);
177 |     context->destroy();
178 | }
179 | 
180 | void TensorNet::timeInference(int iteration, int batchSize)
181 | {
182 |     int inputIdx = 0;
183 |     size_t inputSize = 0;
184 |     void* buffers[engine->getNbBindings()];
185 | 
186 |     for (int b = 0; b < engine->getNbBindings(); b++)
187 |     {
188 |         DimsCHW dims = static_cast<DimsCHW&&>(engine->getBindingDimensions(b));
189 |         size_t size = batchSize * dims.c() * dims.h() * dims.w() * sizeof(float);
190 |         CHECK(cudaMalloc(&buffers[b], size));
191 | 
192 |         if(engine->bindingIsInput(b) == true)
193 |         {
194 |             inputIdx = b;
195 |             inputSize = size;
196 |         }
197 |     }
198 | 
199 |     IExecutionContext* context = engine->createExecutionContext();
200 |     context->setProfiler(&gProfiler);
201 | 
202 |     CHECK(cudaMemset(buffers[inputIdx], 0, inputSize));
203 | 
204 |     for (int i = 0; i < iteration;i++) context->execute(batchSize, buffers);
205 | 
206 |     context->destroy();
207 |     for (int b = 0; b < engine->getNbBindings(); b++) CHECK(cudaFree(buffers[b]));
208 | 
209 | }
210 | 
211 | DimsCHW TensorNet::getTensorDims(const char* name)
212 | {
213 |     for (int b = 0; b < engine->getNbBindings(); b++) {
214 |         if( !strcmp( name, engine->getBindingName(b)) )
215 |             return static_cast<DimsCHW&&>(engine->getBindingDimensions(b));
216 |     }
217 |     return DimsCHW{0,0,0};
218 | }
219 | 
220 | //void TensorNet::getLayerOutput(void** buffers, int nbBuffer, int batchSize)
221 | //{
222 | //    /* *
223 | //     * @TODO: Get the layer with name name in the network
224 | //     * */
225 | //    std::cout << "Came into the image inference method here. "<<std::endl;
226 | //    assert( engine->getNbBindings()==nbBuffer);
227 | //    IExecutionContext* context = engine->createExecutionContext();
228 | //    context->setProfiler(&gProfiler);
229 | //    context->execute( batchSize , buffers);
230 | //
231 | //    context->destroy();
232 | //
233 | //}
234 | 
235 | void TensorNet::printTimes(int iteration)
236 | {
237 |     gProfiler.printLayerTimes(iteration);
238 | }
239 | 
240 | void TensorNet::destroy()
241 | {
242 |     pluginFactory.destroyPlugin();
243 |     engine->destroy();
244 |     infer->destroy();
245 | }
246 | 


--------------------------------------------------------------------------------
/tensorNet.h:
--------------------------------------------------------------------------------
 1 | #include "pluginImplement.h"
 2 | 
 3 | using namespace nvinfer1;
 4 | using namespace nvcaffeparser1;
 5 | 
 6 | 
 7 | /******************************/
 8 | // TensorRT utility
 9 | /******************************/
10 | class Logger : public ILogger
11 | {
12 |     void log(Severity severity, const char* msg) override
13 |     {
14 |         if (severity!=Severity::kINFO) std::cout << msg << std::endl;
15 |     }
16 | };
17 | 
18 | struct Profiler : public IProfiler
19 | {
20 |     typedef std::pair<std::string, float> Record;
21 |     std::vector<Record> mProfile;
22 | 
23 |     virtual void reportLayerTime(const char* layerName, float ms)
24 |     {
25 |         auto record = std::find_if(mProfile.begin(), mProfile.end(), [&](const Record& r){ return r.first == layerName; });
26 | 
27 |         if (record == mProfile.end()) mProfile.push_back(std::make_pair(layerName, ms));
28 |         else record->second += ms;
29 |     }
30 | 
31 |     void printLayerTimes(const int TIMING_ITERATIONS)
32 |     {
33 |         float totalTime = 0;
34 |         for (size_t i = 0; i < mProfile.size(); i++)
35 |         {
36 |             printf("%-40.40s %4.3fms\n", mProfile[i].first.c_str(), mProfile[i].second / TIMING_ITERATIONS);
37 |             totalTime += mProfile[i].second;
38 |         }
39 |         printf("Time over all layers: %4.3f\n", totalTime / TIMING_ITERATIONS);
40 |     }
41 | };
42 | 
43 | 
44 | /******************************/
45 | // TensorRT Main
46 | /******************************/
47 | class TensorNet
48 | {
49 | public:
50 |     bool caffeToTRTModel(const char* deployFile,
51 |                          const char* modelFile,
52 |                          const std::vector<std::string>& outputs,
53 |                          unsigned int maxBatchSize,
54 |                          std::ostream& gieModelStream);
55 |     bool LoadNetwork( const char* prototxt_path,
56 |                       const char* model_path,
57 |                       const char* input_blob,
58 |                       const std::vector<std::string>& output_blobs,
59 |                       uint32_t maxBatchSize );
60 |     void createInference();
61 | 
62 |     void imageInference(void** buffers, int nbBuffer, int batchSize);
63 |     void timeInference(int iteration, int batchSize);
64 | 
65 |     DimsCHW getTensorDims(const char* name);
66 | 
67 | //    void getLayerOutput(const char* name);
68 | 
69 |     void printTimes(int iteration);
70 |     void destroy();
71 | 
72 | private:
73 | 
74 |     PluginFactory pluginFactory;
75 |     IHostMemory *gieModelStream{nullptr};
76 | 
77 |     IRuntime* infer;
78 |     ICudaEngine* engine;
79 | 
80 |     Logger gLogger;
81 |     Profiler gProfiler;
82 | 
83 | };
84 | 
85 | 
86 | //#endif
87 | 
88 | 


--------------------------------------------------------------------------------
/testPic/test.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eric612/Pelee-Seg-TensorRT/05bf0b31c5891adaf64f40b784ef4a1927d68862/testPic/test.png


--------------------------------------------------------------------------------
/testVideo/test.avi:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eric612/Pelee-Seg-TensorRT/05bf0b31c5891adaf64f40b784ef4a1927d68862/testVideo/test.avi


--------------------------------------------------------------------------------
/util/cuda/cudaMappedMemory.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * inference-101
 3 |  */
 4 | 
 5 | #ifndef __CUDA_MAPPED_MEMORY_H_
 6 | #define __CUDA_MAPPED_MEMORY_H_
 7 | 
 8 | 
 9 | #include "cudaUtility.h"
10 | 
11 | 
12 | /**
13 |  * Allocate ZeroCopy mapped memory, shared between CUDA and CPU.
14 |  * @ingroup util
15 |  */
16 | inline bool cudaAllocMapped( void** cpuPtr, void** gpuPtr, size_t size )
17 | {
18 | 	if( !cpuPtr || !gpuPtr || size == 0 )
19 | 		return false;
20 | 
21 | 	//CUDA(cudaSetDeviceFlags(cudaDeviceMapHost));
22 | 
23 | 	if( CUDA_FAILED(cudaHostAlloc(cpuPtr, size, cudaHostAllocMapped)) )
24 | 		return false;
25 | 
26 | 	if( CUDA_FAILED(cudaHostGetDevicePointer(gpuPtr, *cpuPtr, 0)) )
27 | 		return false;
28 | 
29 | 	memset(*cpuPtr, 0, size);
30 | 	printf("[cuda]  cudaAllocMapped %zu bytes, CPU %p GPU %p\n", size, *cpuPtr, *gpuPtr);
31 | 	return true;
32 | }
33 | 
34 | 
35 | #endif
36 | 


--------------------------------------------------------------------------------
/util/cuda/cudaNormalize.cu:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * inference-101
 3 |  */
 4 | 
 5 | #include "cudaNormalize.h"
 6 | 
 7 | 
 8 | 
 9 | // gpuNormalize
10 | template <typename T>
11 | __global__ void gpuNormalize( T* input, T* output, int width, int height, float scaling_factor )
12 | {
13 | 	const int x = blockIdx.x * blockDim.x + threadIdx.x;
14 | 	const int y = blockIdx.y * blockDim.y + threadIdx.y;
15 | 
16 | 	if( x >= width || y >= height )
17 | 		return;
18 | 
19 | 	const T px = input[ y * width + x ];
20 | 
21 | 	output[y*width+x] = make_float4(px.x * scaling_factor,
22 | 							  px.y * scaling_factor,
23 | 							  px.z * scaling_factor,
24 | 							  px.w * scaling_factor);
25 | }
26 | 
27 | 
28 | // cudaNormalizeRGBA
29 | cudaError_t cudaNormalizeRGBA( float4* input, const float2& input_range,
30 | 						 float4* output, const float2& output_range,
31 | 						 size_t  width,  size_t height )
32 | {
33 | 	if( !input || !output )
34 | 		return cudaErrorInvalidDevicePointer;
35 | 
36 | 	if( width == 0 || height == 0  )
37 | 		return cudaErrorInvalidValue;
38 | 
39 | 	const float multiplier = output_range.y / input_range.y;
40 | 
41 | 	// launch kernel
42 | 	const dim3 blockDim(8, 8);
43 | 	const dim3 gridDim(iDivUp(width,blockDim.x), iDivUp(height,blockDim.y));
44 | 
45 | 	gpuNormalize<float4><<<gridDim, blockDim>>>(input, output, width, height, multiplier);
46 | 
47 | 	return CUDA(cudaGetLastError());
48 | }
49 | 
50 | 
51 | 
52 | 
53 | 
54 | 


--------------------------------------------------------------------------------
/util/cuda/cudaNormalize.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * inference-101
 3 |  */
 4 | 
 5 | #ifndef __CUDA_NORMALIZE_H__
 6 | #define __CUDA_NORMALIZE_H__
 7 | 
 8 | 
 9 | #include "cudaUtility.h"
10 | 
11 | 
12 | /**
13 |  * Rebase the pixel intensities of an image between two scales.
14 |  * For example, convert an image with values 0.0-255 to 0.0-1.0.
15 |  * @ingroup util
16 |  */
17 | cudaError_t cudaNormalizeRGBA( float4* input,  const float2& input_range,
18 | 						 float4* output, const float2& output_range,
19 | 						 size_t  width,  size_t height );
20 | 
21 | #endif
22 | 
23 | 


--------------------------------------------------------------------------------
/util/cuda/cudaOverlay.cu:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * http://github.com/dusty-nv/jetson-inference
 3 |  */
 4 | 
 5 | #include "cudaOverlay.h"
 6 | 
 7 | 
 8 | static inline __device__ __host__ bool eq_less( float a, float b, float epsilon )
 9 | {
10 | 	return (a > (b - epsilon) && a < (b + epsilon)) ? true : false;
11 | }
12 | 
13 | template<typename T>
14 | __global__ void gpuRectOutlines( T* input, T* output, int width, int height,
15 | 						        float4* rects, int numRects, float4 color ) 
16 | {
17 | 	const int x = blockIdx.x * blockDim.x + threadIdx.x;
18 | 	const int y = blockIdx.y * blockDim.y + threadIdx.y;
19 | 
20 | 	if( x >= width || y >= height )
21 | 		return;
22 | 
23 | 	const T px_in = input[ y * width + x ];
24 | 	T px_out = px_in;
25 | 	
26 | 	const float fx = x;
27 | 	const float fy = y;
28 | 	
29 | 	const float thick = 10.0f;
30 | 	const float alpha = color.w / 255.0f;
31 | 	const float ialph = 1.0f - alpha;
32 | 	
33 | 	for( int nr=0; nr < numRects; nr++ )
34 | 	{
35 | 		const float4 r = rects[nr];
36 | 		
37 | 		//printf("%i %i %i  %f %f %f %f\n", numRects, x, y, r.x, r.y, r.z, r.w);
38 | 		
39 | 		if( fy >= r.y && fy <= r.w /*&& (eq_less(fx, r.x, ep) || eq_less(fx, r.z, ep))*/ )
40 | 		{
41 | 			if( fx >= r.x && fx <= r.z /*&& (eq_less(fy, r.y, ep) || eq_less(fy, r.w, ep))*/ )
42 | 			{
43 | 				//printf("cuda rect %i %i\n", x, y);
44 | 
45 | 				px_out.x = alpha * color.x + ialph * px_out.x;
46 | 				px_out.y = alpha * color.y + ialph * px_out.y;
47 | 				px_out.z = alpha * color.z + ialph * px_out.z;
48 | 			}
49 | 		}
50 | 	}
51 | 	
52 | 	output[y * width + x] = px_out;	 
53 | }
54 | 
55 | 
56 | cudaError_t cudaRectOutlineOverlay( float4* input, float4* output, uint32_t width, uint32_t height, float4* boundingBoxes, int numBoxes, const float4& color )
57 | {
58 | 	if( !input || !output || width == 0 || height == 0 || !boundingBoxes || numBoxes == 0 )
59 | 		return cudaErrorInvalidValue;
60 | 
61 | 	// launch kernel
62 | 	const dim3 blockDim(8, 8);
63 | 	const dim3 gridDim(iDivUp(width,blockDim.x), iDivUp(height,blockDim.y));
64 | 
65 | 	gpuRectOutlines<float4><<<gridDim, blockDim>>>(input, output, width, height, boundingBoxes, numBoxes, color); 
66 | 
67 | 	return cudaGetLastError();
68 | }
69 | 


--------------------------------------------------------------------------------
/util/cuda/cudaOverlay.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * http://github.com/dusty-nv/jetson-inference
 3 |  */
 4 | 
 5 | #ifndef __CUDA_OVERLAY_H__
 6 | #define __CUDA_OVERLAY_H__
 7 | 
 8 | #include "cudaUtility.h"
 9 | 
10 | 
11 | /**
12 |  * cudaRectOutlineOverlay
13 |  * @ingroup util
14 |  */
15 | cudaError_t cudaRectOutlineOverlay( float4* input, float4* output, uint32_t width, uint32_t height, float4* boundingBoxes, int numBoxes, const float4& color );
16 | 
17 | 
18 | /**
19 |  * cudaRectFillOverlay
20 |  * @ingroup util
21 |  */
22 | //cudaError_t cudaRectFillOverlay( float4* input, float4* output, uint32_t width, uint32_t height, float4* boundingBoxes, int numBoxes, const float4& color );
23 | 
24 | #endif
25 | 


--------------------------------------------------------------------------------
/util/cuda/cudaRGB.cu:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * http://github.com/dusty-nv
 3 |  */
 4 | 
 5 | #include "cudaRGB.h"
 6 | 
 7 | __global__ void loadImage(uchar3* srcImage,
 8 |                           float3* dstImage,
 9 |                           uint32_t width,
10 |                           uint32_t height)
11 | {
12 |     int x, y, pixel;
13 | 
14 |     x = (blockIdx.x * blockDim.x) + threadIdx.x;
15 |     y = (blockIdx.y * blockDim.y) + threadIdx.y;
16 | 
17 |     pixel = y * width + x;
18 | 
19 |     if (x >= width)
20 |         return;
21 | 
22 |     if (y >= height)
23 |         return;
24 | 
25 | //	printf("cuda thread %i %i  %i %i pixel %i \n", x, y, width, height, pixel);
26 | 
27 |     const float  s  = 1.0f;
28 |     const uchar3 px = srcImage[pixel];
29 | 
30 |     dstImage[pixel] = make_float3( 255, 255, 255);
31 | 
32 | }
33 | 
34 | 
35 | 
36 | __global__ void RGBToRGBAf(uchar3* srcImage,
37 |                            float4* dstImage,
38 |                            uint32_t width,       uint32_t height)
39 | {
40 |     int x, y, pixel;
41 | 
42 |     x = (blockIdx.x * blockDim.x) + threadIdx.x;
43 |     y = (blockIdx.y * blockDim.y) + threadIdx.y;
44 | 	
45 |     pixel = y * width + x;
46 | 
47 |     if (x >= width)
48 |         return; 
49 | 
50 |     if (y >= height)
51 |         return;
52 | 
53 | //	printf("cuda thread %i %i  %i %i pixel %i \n", x, y, width, height, pixel);
54 | 		
55 | 	const float  s  = 1.0f;
56 | 	const uchar3 px = srcImage[pixel];
57 | 	
58 | 	dstImage[pixel] = make_float4(px.x * s, px.y * s, px.z * s, 255.0f * s);
59 | }
60 | 
61 | cudaError_t cudaRGBToRGBAf( uchar3* srcDev, float3* destDev, size_t width, size_t height )
62 | {
63 | 	if( !srcDev || !destDev )
64 | 		return cudaErrorInvalidDevicePointer;
65 | 
66 | 	const dim3 blockDim(8,8,1);
67 | 	const dim3 gridDim(iDivUp(width,blockDim.x), iDivUp(height,blockDim.y), 1);
68 | 
69 | 	loadImage<<<gridDim, blockDim>>>( srcDev, destDev, width, height );
70 | 	
71 | 	return CUDA(cudaGetLastError());
72 | }
73 | 
74 | 


--------------------------------------------------------------------------------
/util/cuda/cudaRGB.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * http://github.com/dusty-nv/jetson-inference
 3 |  */
 4 | 
 5 | #ifndef __CUDA_RGB_CONVERT_H
 6 | #define __CUDA_RGB_CONVERT_H
 7 | 
 8 | 
 9 | #include "cudaUtility.h"
10 | #include <stdint.h>
11 | 
12 | 
13 | /**
14 |  * Convert 8-bit fixed-point RGB image to 32-bit floating-point RGBA image
15 |  * @ingroup util
16 |  */
17 | cudaError_t cudaRGBToRGBAf( uchar3* input, float3* output, size_t width, size_t height );
18 | 
19 | 
20 | #endif
21 | 


--------------------------------------------------------------------------------
/util/cuda/cudaResize.cu:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * inference-101
 3 |  */
 4 | 
 5 | #include "cudaResize.h"
 6 | 
 7 | // gpuResample
 8 | template <typename T>
 9 | __global__ void gpuResize( float2 scale, T* input, int iWidth, T* output, int oWidth, int oHeight )
10 | {
11 | 	const int x = blockIdx.x * blockDim.x + threadIdx.x;
12 | 	const int y = blockIdx.y * blockDim.y + threadIdx.y;
13 | 
14 | 	if( x >= oWidth || y >= oHeight )
15 | 		return;
16 | 
17 | 	const int dx = ((float)x * scale.x);
18 | 	const int dy = ((float)y * scale.y);
19 | 
20 | 	const T px = input[ dy * iWidth + dx ];
21 | 
22 | 	output[y*oWidth+x] = px;
23 | 
24 | }
25 | 
26 | // cudaResize
27 | cudaError_t cudaResize( float* input, size_t inputWidth, size_t inputHeight,
28 | 				        float* output, size_t outputWidth, size_t outputHeight )
29 | {
30 | 	if( !input || !output )
31 | 		return cudaErrorInvalidDevicePointer;
32 | 
33 | 	if( inputWidth == 0 || outputWidth == 0 || inputHeight == 0 || outputHeight == 0 )
34 | 		return cudaErrorInvalidValue;
35 | 
36 | 	const float2 scale = make_float2( float(inputWidth) / float(outputWidth),
37 | 							          float(inputHeight) / float(outputHeight) );
38 | 
39 | 	// launch kernel
40 | 	const dim3 blockDim(8, 8);
41 | 	const dim3 gridDim(iDivUp(outputWidth,blockDim.x), iDivUp(outputHeight,blockDim.y));
42 | 
43 | 	gpuResize<float><<<gridDim, blockDim>>>(scale, input, inputWidth, output, outputWidth, outputHeight);
44 | 
45 | 	return CUDA(cudaGetLastError());
46 | 
47 | }
48 | 
49 | 
50 | // cudaResizeRGBA
51 | cudaError_t cudaResizeRGBA( float4* input,  size_t inputWidth, size_t inputHeight,
52 | 				            float4* output, size_t outputWidth, size_t outputHeight )
53 | {
54 | 	if( !input || !output )
55 | 		return cudaErrorInvalidDevicePointer;
56 | 
57 | 	if( inputWidth == 0 || outputWidth == 0 || inputHeight == 0 || outputHeight == 0 )
58 | 		return cudaErrorInvalidValue;
59 | 
60 | 	const float2 scale = make_float2( float(inputWidth) / float(outputWidth),
61 | 							    float(inputHeight) / float(outputHeight) );
62 | 
63 | 	// launch kernel
64 | 	const dim3 blockDim(8, 8);
65 | 	const dim3 gridDim(iDivUp(outputWidth,blockDim.x), iDivUp(outputHeight,blockDim.y));
66 | 
67 | 	gpuResize<float4><<<gridDim, blockDim>>>(scale, input, inputWidth, output, outputWidth, outputHeight);
68 | 
69 | 	return CUDA(cudaGetLastError());
70 | }
71 | 
72 | 
73 | 
74 | 
75 | 
76 | 


--------------------------------------------------------------------------------
/util/cuda/cudaResize.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * inference-101
 3 |  */
 4 | 
 5 | #ifndef __CUDA_RESIZE_H__
 6 | #define __CUDA_RESIZE_H__
 7 | 
 8 | 
 9 | #include "cudaUtility.h"
10 | 
11 | 
12 | /**
13 |  * Function for increasing or decreasing the size of an image on the GPU.
14 |  * @ingroup util
15 |  */
16 | cudaError_t cudaResize( float* input,  size_t inputWidth,  size_t inputHeight,
17 | 				    float* output, size_t outputWidth, size_t outputHeight );
18 | 
19 | 
20 | /**
21 |  * Function for increasing or decreasing the size of an image on the GPU.
22 |  * @ingroup util
23 |  */
24 | cudaError_t cudaResizeRGBA( float4* input,  size_t inputWidth,  size_t inputHeight,
25 | 				        float4* output, size_t outputWidth, size_t outputHeight );
26 | 
27 | 
28 | 						
29 | 
30 | #endif
31 | 
32 | 


--------------------------------------------------------------------------------
/util/cuda/cudaUtility.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * http://github.com/dusty-nv/jetson-inference
 3 |  */
 4 | 
 5 | #ifndef __CUDA_UTILITY_H_
 6 | #define __CUDA_UTILITY_H_
 7 | 
 8 | 
 9 | #include <cuda_runtime.h>
10 | #include <cuda.h>
11 | #include <stdio.h>
12 | #include <string.h>
13 | 
14 | 
15 | /**
16 |  * Execute a CUDA call and print out any errors
17 |  * @return the original cudaError_t result
18 |  * @ingroup util
19 |  */
20 | #define CUDA(x)				cudaCheckError((x), #x, __FILE__, __LINE__)
21 | 
22 | /**
23 |  * Evaluates to true on success
24 |  * @ingroup util
25 |  */
26 | #define CUDA_SUCCESS(x)			(CUDA(x) == cudaSuccess)
27 | 
28 | /**
29 |  * Evaluates to true on failure
30 |  * @ingroup util
31 |  */
32 | #define CUDA_FAILED(x)			(CUDA(x) != cudaSuccess)
33 | 
34 | /**
35 |  * Return from the boolean function if CUDA call fails
36 |  * @ingroup util
37 |  */
38 | #define CUDA_VERIFY(x)			if(CUDA_FAILED(x))	return false;
39 | 
40 | /**
41 |  * LOG_CUDA string.
42 |  * @ingroup util
43 |  */
44 | #define LOG_CUDA "[cuda]   "
45 | 
46 | /*
47 |  * define this if you want all cuda calls to be printed
48 |  */
49 | //#define CUDA_TRACE
50 | 
51 | 
52 | 
53 | /**
54 |  * cudaCheckError
55 |  * @ingroup util
56 |  */
57 | inline cudaError_t cudaCheckError(cudaError_t retval, const char* txt, const char* file, int line )
58 | {
59 | #if !defined(CUDA_TRACE)
60 | 	if( retval == cudaSuccess)
61 | 		return cudaSuccess;
62 | #endif
63 | 
64 | 	//int activeDevice = -1;
65 | 	//cudaGetDevice(&activeDevice);
66 | 
67 | 	//Log("[cuda]   device %i  -  %s\n", activeDevice, txt);
68 | 	
69 | 	printf(LOG_CUDA "%s\n", txt);
70 | 
71 | 
72 | 	if( retval != cudaSuccess )
73 | 	{
74 | 		printf(LOG_CUDA "   %s (error %u) (hex 0x%02X)\n", cudaGetErrorString(retval), retval, retval);
75 | 		printf(LOG_CUDA "   %s:%i\n", file, line);	
76 | 	}
77 | 
78 | 	return retval;
79 | }
80 | 
81 | 
82 | /**
83 |  * iDivUp
84 |  * @ingroup util
85 |  */
86 | inline __device__ __host__ int iDivUp( int a, int b )  		{ return (a % b != 0) ? (a / b + 1) : (a / b); }
87 | 
88 | 
89 | 
90 | #endif
91 | 


--------------------------------------------------------------------------------
/util/cuda/cudaYUV-NV12.cu:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * inference-101
  3 |  */
  4 | 
  5 | #include "cudaYUV.h"
  6 | 
  7 | 
  8 | #define COLOR_COMPONENT_MASK            0x3FF
  9 | #define COLOR_COMPONENT_BIT_SIZE        10
 10 | 
 11 | #define FIXED_DECIMAL_POINT             24
 12 | #define FIXED_POINT_MULTIPLIER          1.0f
 13 | #define FIXED_COLOR_COMPONENT_MASK      0xffffffff
 14 | 
 15 | #define MUL(x,y)    (x*y)
 16 | 
 17 | 
 18 | 
 19 | __constant__ uint32_t constAlpha;
 20 | __constant__ float  constHueColorSpaceMat[9];
 21 | 
 22 | 
 23 | 
 24 | __device__ void YUV2RGB(uint32_t *yuvi, float *red, float *green, float *blue)
 25 | {
 26 |    
 27 | 
 28 |     // Prepare for hue adjustment
 29 |     /*
 30 | 	 float luma, chromaCb, chromaCr;
 31 | 
 32 | 	luma     = (float)yuvi[0];
 33 |     chromaCb = (float)((int)yuvi[1] - 512.0f);
 34 |     chromaCr = (float)((int)yuvi[2] - 512.0f);
 35 | 
 36 |     // Convert YUV To RGB with hue adjustment
 37 |     *red  = MUL(luma,     constHueColorSpaceMat[0]) +
 38 |             MUL(chromaCb, constHueColorSpaceMat[1]) +
 39 |             MUL(chromaCr, constHueColorSpaceMat[2]);
 40 |     *green= MUL(luma,     constHueColorSpaceMat[3]) +
 41 |             MUL(chromaCb, constHueColorSpaceMat[4]) +
 42 |             MUL(chromaCr, constHueColorSpaceMat[5]);
 43 |     *blue = MUL(luma,     constHueColorSpaceMat[6]) +
 44 |             MUL(chromaCb, constHueColorSpaceMat[7]) +
 45 |             MUL(chromaCr, constHueColorSpaceMat[8]);*/
 46 | 
 47 | 	const float luma = float(yuvi[0]);
 48 | 	const float u    = float(yuvi[1]) - 512.0f;
 49 | 	const float v    = float(yuvi[2]) - 512.0f;
 50 | 
 51 |    /*R = Y + 1.140V
 52 |    G = Y - 0.395U - 0.581V
 53 |    B = Y + 2.032U*/
 54 | 
 55 | 	/**green = luma + 1.140f * v;
 56 | 	*blue  = luma - 0.395f * u - 0.581f * v;
 57 | 	*red   = luma + 2.032f * u;*/
 58 | 
 59 | 	*red    = luma + 1.140f * v;
 60 | 	*green  = luma - 0.395f * u - 0.581f * v;
 61 | 	*blue   = luma + 2.032f * u;
 62 | }
 63 | 
 64 | 
 65 | __device__ uint32_t RGBAPACK_8bit(float red, float green, float blue, uint32_t alpha)
 66 | {
 67 |     uint32_t ARGBpixel = 0;
 68 | 
 69 |     // Clamp final 10 bit results
 70 |     red   = min(max(red,   0.0f), 255.0f);
 71 |     green = min(max(green, 0.0f), 255.0f);
 72 |     blue  = min(max(blue,  0.0f), 255.0f);
 73 | 
 74 |     // Convert to 8 bit unsigned integers per color component
 75 |     ARGBpixel = ((((uint32_t)red)   << 24) |
 76 |                  (((uint32_t)green) << 16) |
 77 | 		       (((uint32_t)blue)  <<  8) | (uint32_t)alpha);
 78 | 
 79 |     return  ARGBpixel;
 80 | }
 81 | 
 82 | 
 83 | __device__ uint32_t RGBAPACK_10bit(float red, float green, float blue, uint32_t alpha)
 84 | {
 85 |     uint32_t ARGBpixel = 0;
 86 | 
 87 |     // Clamp final 10 bit results
 88 |     red   = min(max(red,   0.0f), 1023.f);
 89 |     green = min(max(green, 0.0f), 1023.f);
 90 |     blue  = min(max(blue,  0.0f), 1023.f);
 91 | 
 92 |     // Convert to 8 bit unsigned integers per color component
 93 |     ARGBpixel = ((((uint32_t)red   >> 2) << 24) |
 94 |                  (((uint32_t)green >> 2) << 16) |
 95 |                  (((uint32_t)blue  >> 2) <<  8) | (uint32_t)alpha);
 96 | 
 97 |     return  ARGBpixel;
 98 | }
 99 | 
100 | 
101 | // CUDA kernel for outputing the final ARGB output from NV12;
102 | /*extern "C"*/
103 | __global__ void Passthru(uint32_t *srcImage,   size_t nSourcePitch,
104 |                          uint32_t *dstImage,   size_t nDestPitch,
105 |                          uint32_t width,       uint32_t height)
106 | {
107 |     int x, y;
108 |     uint32_t yuv101010Pel[2];
109 |     uint32_t processingPitch = ((width) + 63) & ~63;
110 |     uint32_t dstImagePitch   = nDestPitch >> 2;
111 |     uint8_t *srcImageU8     = (uint8_t *)srcImage;
112 | 
113 |     processingPitch = nSourcePitch;
114 | 
115 |     // Pad borders with duplicate pixels, and we multiply by 2 because we process 2 pixels per thread
116 |     x = blockIdx.x * (blockDim.x << 1) + (threadIdx.x << 1);
117 |     y = blockIdx.y *  blockDim.y       +  threadIdx.y;
118 | 
119 |     if (x >= width)
120 |         return; //x = width - 1;
121 | 
122 |     if (y >= height)
123 |         return; // y = height - 1;
124 | 
125 |     // Read 2 Luma components at a time, so we don't waste processing since CbCr are decimated this way.
126 |     // if we move to texture we could read 4 luminance values
127 |     yuv101010Pel[0] = (srcImageU8[y * processingPitch + x    ]);
128 |     yuv101010Pel[1] = (srcImageU8[y * processingPitch + x + 1]);
129 | 
130 |     // this steps performs the color conversion
131 |     float luma[2];
132 | 
133 |     luma[0]   = (yuv101010Pel[0]        & 0x00FF);
134 |     luma[1]   = (yuv101010Pel[1]        & 0x00FF);
135 | 
136 |     // Clamp the results to RGBA
137 |     dstImage[y * dstImagePitch + x     ] = RGBAPACK_8bit(luma[0], luma[0], luma[0], constAlpha);
138 |     dstImage[y * dstImagePitch + x + 1 ] = RGBAPACK_8bit(luma[1], luma[1], luma[1], constAlpha);
139 | }
140 | 
141 | 
142 | // CUDA kernel for outputing the final ARGB output from NV12;
143 | /*extern "C"*/
144 | __global__ void NV12ToARGB(uint32_t *srcImage,     size_t nSourcePitch,
145 |                            uint32_t *dstImage,     size_t nDestPitch,
146 |                            uint32_t width,         uint32_t height)
147 | {
148 |     int x, y;
149 |     uint32_t yuv101010Pel[2];
150 |     uint32_t processingPitch = ((width) + 63) & ~63;
151 |     uint32_t dstImagePitch   = nDestPitch >> 2;
152 |     uint8_t *srcImageU8     = (uint8_t *)srcImage;
153 | 
154 |     processingPitch = nSourcePitch;
155 | 
156 |     // Pad borders with duplicate pixels, and we multiply by 2 because we process 2 pixels per thread
157 |     x = blockIdx.x * (blockDim.x << 1) + (threadIdx.x << 1);
158 |     y = blockIdx.y *  blockDim.y       +  threadIdx.y;
159 | 
160 |     if (x >= width)
161 |         return; //x = width - 1;
162 | 
163 |     if (y >= height)
164 |         return; // y = height - 1;
165 | 
166 |     // Read 2 Luma components at a time, so we don't waste processing since CbCr are decimated this way.
167 |     // if we move to texture we could read 4 luminance values
168 |     yuv101010Pel[0] = (srcImageU8[y * processingPitch + x    ]) << 2;
169 |     yuv101010Pel[1] = (srcImageU8[y * processingPitch + x + 1]) << 2;
170 | 
171 |     uint32_t chromaOffset    = processingPitch * height;
172 |     int y_chroma = y >> 1;
173 | 
174 |     if (y & 1)  // odd scanline ?
175 |     {
176 |         uint32_t chromaCb;
177 |         uint32_t chromaCr;
178 | 
179 |         chromaCb = srcImageU8[chromaOffset + y_chroma * processingPitch + x    ];
180 |         chromaCr = srcImageU8[chromaOffset + y_chroma * processingPitch + x + 1];
181 | 
182 |         if (y_chroma < ((height >> 1) - 1)) // interpolate chroma vertically
183 |         {
184 |             chromaCb = (chromaCb + srcImageU8[chromaOffset + (y_chroma + 1) * processingPitch + x    ] + 1) >> 1;
185 |             chromaCr = (chromaCr + srcImageU8[chromaOffset + (y_chroma + 1) * processingPitch + x + 1] + 1) >> 1;
186 |         }
187 | 
188 |         yuv101010Pel[0] |= (chromaCb << (COLOR_COMPONENT_BIT_SIZE       + 2));
189 |         yuv101010Pel[0] |= (chromaCr << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
190 | 
191 |         yuv101010Pel[1] |= (chromaCb << (COLOR_COMPONENT_BIT_SIZE       + 2));
192 |         yuv101010Pel[1] |= (chromaCr << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
193 |     }
194 |     else
195 |     {
196 |         yuv101010Pel[0] |= ((uint32_t)srcImageU8[chromaOffset + y_chroma * processingPitch + x    ] << (COLOR_COMPONENT_BIT_SIZE       + 2));
197 |         yuv101010Pel[0] |= ((uint32_t)srcImageU8[chromaOffset + y_chroma * processingPitch + x + 1] << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
198 | 
199 |         yuv101010Pel[1] |= ((uint32_t)srcImageU8[chromaOffset + y_chroma * processingPitch + x    ] << (COLOR_COMPONENT_BIT_SIZE       + 2));
200 |         yuv101010Pel[1] |= ((uint32_t)srcImageU8[chromaOffset + y_chroma * processingPitch + x + 1] << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
201 |     }
202 | 
203 |     // this steps performs the color conversion
204 |     uint32_t yuvi[6];
205 |     float red[2], green[2], blue[2];
206 | 
207 |     yuvi[0] = (yuv101010Pel[0] &   COLOR_COMPONENT_MASK);
208 |     yuvi[1] = ((yuv101010Pel[0] >>  COLOR_COMPONENT_BIT_SIZE)       & COLOR_COMPONENT_MASK);
209 |     yuvi[2] = ((yuv101010Pel[0] >> (COLOR_COMPONENT_BIT_SIZE << 1)) & COLOR_COMPONENT_MASK);
210 | 
211 |     yuvi[3] = (yuv101010Pel[1] &   COLOR_COMPONENT_MASK);
212 |     yuvi[4] = ((yuv101010Pel[1] >>  COLOR_COMPONENT_BIT_SIZE)       & COLOR_COMPONENT_MASK);
213 |     yuvi[5] = ((yuv101010Pel[1] >> (COLOR_COMPONENT_BIT_SIZE << 1)) & COLOR_COMPONENT_MASK);
214 | 
215 |     // YUV to RGB Transformation conversion
216 |     YUV2RGB(&yuvi[0], &red[0], &green[0], &blue[0]);
217 |     YUV2RGB(&yuvi[3], &red[1], &green[1], &blue[1]);
218 | 
219 |     // Clamp the results to RGBA
220 |     dstImage[y * dstImagePitch + x     ] = RGBAPACK_10bit(red[0], green[0], blue[0], constAlpha);
221 |     dstImage[y * dstImagePitch + x + 1 ] = RGBAPACK_10bit(red[1], green[1], blue[1], constAlpha);
222 | }
223 | 
224 | 
225 | bool nv12ColorspaceSetup = false;
226 | 
227 | 
228 | // cudaNV12ToARGB32
229 | cudaError_t cudaNV12ToRGBA( uint8_t* srcDev, size_t srcPitch, uchar4* destDev, size_t destPitch, size_t width, size_t height )
230 | {
231 | 	if( !srcDev || !destDev )
232 | 		return cudaErrorInvalidDevicePointer;
233 | 
234 | 	if( srcPitch == 0 || destPitch == 0 || width == 0 || height == 0 )
235 | 		return cudaErrorInvalidValue;
236 | 
237 | 	if( !nv12ColorspaceSetup )
238 | 		cudaNV12SetupColorspace();
239 | 
240 | 	const dim3 blockDim(32,16,1);
241 | 	const dim3 gridDim((width+(2*blockDim.x-1))/(2*blockDim.x), (height+(blockDim.y-1))/blockDim.y, 1);
242 | 
243 | 	NV12ToARGB<<<gridDim, blockDim>>>( (uint32_t*)srcDev, srcPitch, (uint32_t*)destDev, destPitch, width, height );
244 | 	
245 | 	return CUDA(cudaGetLastError());
246 | }
247 | 
248 | cudaError_t cudaNV12ToRGBA( uint8_t* srcDev, uchar4* destDev, size_t width, size_t height )
249 | {
250 | 	return cudaNV12ToRGBA(srcDev, width * sizeof(uint8_t), destDev, width * sizeof(uchar4), width, height);
251 | }
252 | 
253 | 
254 | //-------------------------------------------------------------------------------------------------------------------------
255 | 
256 | __global__ void NV12ToRGBAf(uint32_t* srcImage,  size_t nSourcePitch,
257 |                            float4* dstImage,     size_t nDestPitch,
258 |                            uint32_t width,       uint32_t height)
259 | {
260 |     int x, y;
261 |     uint32_t yuv101010Pel[2];
262 |     uint32_t processingPitch = ((width) + 63) & ~63;
263 |     uint8_t *srcImageU8     = (uint8_t *)srcImage;
264 | 
265 |     processingPitch = nSourcePitch;
266 | 
267 |     // Pad borders with duplicate pixels, and we multiply by 2 because we process 2 pixels per thread
268 |     x = blockIdx.x * (blockDim.x << 1) + (threadIdx.x << 1);
269 |     y = blockIdx.y *  blockDim.y       +  threadIdx.y;
270 | 
271 |     if (x >= width)
272 |         return; //x = width - 1;
273 | 
274 |     if (y >= height)
275 |         return; // y = height - 1;
276 | 
277 | #if 1	
278 |     // Read 2 Luma components at a time, so we don't waste processing since CbCr are decimated this way.
279 |     // if we move to texture we could read 4 luminance values
280 |     yuv101010Pel[0] = (srcImageU8[y * processingPitch + x    ]) << 2;
281 |     yuv101010Pel[1] = (srcImageU8[y * processingPitch + x + 1]) << 2;
282 | 
283 |     uint32_t chromaOffset    = processingPitch * height;
284 |     int y_chroma = y >> 1;
285 | 
286 |     if (y & 1)  // odd scanline ?
287 |     {
288 |         uint32_t chromaCb;
289 |         uint32_t chromaCr;
290 | 
291 |         chromaCb = srcImageU8[chromaOffset + y_chroma * processingPitch + x    ];
292 |         chromaCr = srcImageU8[chromaOffset + y_chroma * processingPitch + x + 1];
293 | 
294 |         if (y_chroma < ((height >> 1) - 1)) // interpolate chroma vertically
295 |         {
296 |             chromaCb = (chromaCb + srcImageU8[chromaOffset + (y_chroma + 1) * processingPitch + x    ] + 1) >> 1;
297 |             chromaCr = (chromaCr + srcImageU8[chromaOffset + (y_chroma + 1) * processingPitch + x + 1] + 1) >> 1;
298 |         }
299 | 
300 |         yuv101010Pel[0] |= (chromaCb << (COLOR_COMPONENT_BIT_SIZE       + 2));
301 |         yuv101010Pel[0] |= (chromaCr << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
302 | 
303 |         yuv101010Pel[1] |= (chromaCb << (COLOR_COMPONENT_BIT_SIZE       + 2));
304 |         yuv101010Pel[1] |= (chromaCr << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
305 |     }
306 |     else
307 |     {
308 |         yuv101010Pel[0] |= ((uint32_t)srcImageU8[chromaOffset + y_chroma * processingPitch + x    ] << (COLOR_COMPONENT_BIT_SIZE       + 2));
309 |         yuv101010Pel[0] |= ((uint32_t)srcImageU8[chromaOffset + y_chroma * processingPitch + x + 1] << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
310 | 
311 |         yuv101010Pel[1] |= ((uint32_t)srcImageU8[chromaOffset + y_chroma * processingPitch + x    ] << (COLOR_COMPONENT_BIT_SIZE       + 2));
312 |         yuv101010Pel[1] |= ((uint32_t)srcImageU8[chromaOffset + y_chroma * processingPitch + x + 1] << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
313 |     }
314 | 
315 |     // this steps performs the color conversion
316 |     uint32_t yuvi[6];
317 |     float red[2], green[2], blue[2];
318 | 
319 |     yuvi[0] = (yuv101010Pel[0] &   COLOR_COMPONENT_MASK);
320 |     yuvi[1] = ((yuv101010Pel[0] >>  COLOR_COMPONENT_BIT_SIZE)       & COLOR_COMPONENT_MASK);
321 |     yuvi[2] = ((yuv101010Pel[0] >> (COLOR_COMPONENT_BIT_SIZE << 1)) & COLOR_COMPONENT_MASK);
322 | 
323 |     yuvi[3] = (yuv101010Pel[1] &   COLOR_COMPONENT_MASK);
324 |     yuvi[4] = ((yuv101010Pel[1] >>  COLOR_COMPONENT_BIT_SIZE)       & COLOR_COMPONENT_MASK);
325 |     yuvi[5] = ((yuv101010Pel[1] >> (COLOR_COMPONENT_BIT_SIZE << 1)) & COLOR_COMPONENT_MASK);
326 | 
327 |     // YUV to RGB Transformation conversion
328 |     YUV2RGB(&yuvi[0], &red[0], &green[0], &blue[0]);
329 |     YUV2RGB(&yuvi[3], &red[1], &green[1], &blue[1]);
330 | 
331 |     // Clamp the results to RGBA
332 | 	//printf("cuda thread %i %i  %f %f %f\n", x, y, red[0], green[0], blue[0]);
333 | 
334 | 	const float s = 1.0f / 1024.0f * 255.0f;
335 | 
336 | 	dstImage[y * width + x]     = make_float4(red[0] * s, green[0] * s, blue[0] * s, 1.0f);
337 | 	dstImage[y * width + x + 1] = make_float4(red[1] * s, green[1] * s, blue[1] * s, 1.0f);
338 | #else
339 | 	//printf("cuda thread %i %i  %i %i \n", x, y, width, height);
340 | 		
341 | 	dstImage[y * width + x]     = make_float4(1.0f, 0.0f, 0.0f, 1.0f);
342 | 	dstImage[y * width + x + 1] = make_float4(1.0f, 0.0f, 0.0f, 1.0f);
343 | #endif
344 | }
345 | 
346 | 
347 | 
348 | // cudaNV12ToRGBA
349 | cudaError_t cudaNV12ToRGBAf( uint8_t* srcDev, size_t srcPitch, float4* destDev, size_t destPitch, size_t width, size_t height )
350 | {
351 | 	if( !srcDev || !destDev )
352 | 		return cudaErrorInvalidDevicePointer;
353 | 
354 | 	if( srcPitch == 0 || destPitch == 0 || width == 0 || height == 0 )
355 | 		return cudaErrorInvalidValue;
356 | 
357 | 	if( !nv12ColorspaceSetup )
358 | 		cudaNV12SetupColorspace();
359 | 
360 | 	const dim3 blockDim(8,8,1);
361 | 	//const dim3 gridDim((width+(2*blockDim.x-1))/(2*blockDim.x), (height+(blockDim.y-1))/blockDim.y, 1);
362 | 	const dim3 gridDim(iDivUp(width,blockDim.x), iDivUp(height, blockDim.y), 1);
363 | 
364 | 	NV12ToRGBAf<<<gridDim, blockDim>>>( (uint32_t*)srcDev, srcPitch, destDev, destPitch, width, height );
365 | 	
366 | 	return CUDA(cudaGetLastError());
367 | }
368 | 
369 | cudaError_t cudaNV12ToRGBAf( uint8_t* srcDev, float4* destDev, size_t width, size_t height )
370 | {
371 | 	return cudaNV12ToRGBAf(srcDev, width * sizeof(uint8_t), destDev, width * sizeof(float4), width, height);
372 | }
373 | 
374 | 
375 | // cudaNV12SetupColorspace
376 | cudaError_t cudaNV12SetupColorspace( float hue )
377 | {
378 | 	const float hueSin = sin(hue);
379 | 	const float hueCos = cos(hue);
380 | 
381 | 	float hueCSC[9];
382 | 
383 | 	const bool itu601 = false;
384 | 
385 | 	if( itu601 /*CSC == ITU601*/)
386 | 	{
387 | 		//CCIR 601
388 | 		hueCSC[0] = 1.1644f;
389 | 		hueCSC[1] = hueSin * 1.5960f;
390 | 		hueCSC[2] = hueCos * 1.5960f;
391 | 		hueCSC[3] = 1.1644f;
392 | 		hueCSC[4] = (hueCos * -0.3918f) - (hueSin * 0.8130f);
393 | 		hueCSC[5] = (hueSin *  0.3918f) - (hueCos * 0.8130f);
394 | 		hueCSC[6] = 1.1644f;
395 | 		hueCSC[7] = hueCos *  2.0172f;
396 | 		hueCSC[8] = hueSin * -2.0172f;
397 | 	}
398 | 	else /*if(CSC == ITU709)*/
399 | 	{
400 | 		//CCIR 709
401 | 		hueCSC[0] = 1.0f;
402 | 		hueCSC[1] = hueSin * 1.57480f;
403 | 		hueCSC[2] = hueCos * 1.57480f;
404 | 		hueCSC[3] = 1.0;
405 | 		hueCSC[4] = (hueCos * -0.18732f) - (hueSin * 0.46812f);
406 | 		hueCSC[5] = (hueSin *  0.18732f) - (hueCos * 0.46812f);
407 | 		hueCSC[6] = 1.0f;
408 | 		hueCSC[7] = hueCos *  1.85560f;
409 | 		hueCSC[8] = hueSin * -1.85560f;
410 | 	}
411 | 
412 | 
413 | 	if( CUDA_FAILED(cudaMemcpyToSymbol(constHueColorSpaceMat, hueCSC, sizeof(float) * 9)) )
414 | 		return cudaErrorInvalidSymbol;
415 | 
416 | 	uint32_t cudaAlpha = ((uint32_t)0xff<< 24);
417 | 
418 | 	if( CUDA_FAILED(cudaMemcpyToSymbol(constAlpha, &cudaAlpha, sizeof(uint32_t))) )
419 | 		return cudaErrorInvalidSymbol;
420 | 
421 | 	nv12ColorspaceSetup = true;
422 | 	return cudaSuccess;
423 | }
424 | 
425 | 


--------------------------------------------------------------------------------
/util/cuda/cudaYUV-YUYV.cu:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * http://github.com/dusty-nv/jetson-inference
  3 |  */
  4 | 
  5 | #include "cudaYUV.h"
  6 | 
  7 | 
  8 | inline __device__ __host__ float clamp(float f, float a, float b)
  9 | {
 10 |     return fmaxf(a, fminf(f, b));
 11 | }
 12 | 
 13 | 
 14 | /* From RGB to YUV
 15 | 
 16 |    Y = 0.299R + 0.587G + 0.114B
 17 |    U = 0.492 (B-Y)
 18 |    V = 0.877 (R-Y)
 19 | 
 20 |    It can also be represented as:
 21 | 
 22 |    Y =  0.299R + 0.587G + 0.114B
 23 |    U = -0.147R - 0.289G + 0.436B
 24 |    V =  0.615R - 0.515G - 0.100B
 25 | 
 26 |    From YUV to RGB
 27 | 
 28 |    R = Y + 1.140V
 29 |    G = Y - 0.395U - 0.581V
 30 |    B = Y + 2.032U
 31 |  */
 32 | 
 33 | struct __align__(8) uchar8
 34 | {
 35 |    uint8_t a0, a1, a2, a3, a4, a5, a6, a7;
 36 | };
 37 | static __host__ __device__ __forceinline__ uchar8 make_uchar8(uint8_t a0, uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7)
 38 | {
 39 |    uchar8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
 40 |    return val;
 41 | }
 42 | 
 43 | 
 44 | //-----------------------------------------------------------------------------------
 45 | // YUYV/UYVY to RGBA
 46 | //-----------------------------------------------------------------------------------
 47 | template <bool formatUYVY>
 48 | __global__ void yuyvToRgba( uchar4* src, int srcAlignedWidth, uchar8* dst, int dstAlignedWidth, int width, int height )
 49 | {
 50 | 	const int x = blockIdx.x * blockDim.x + threadIdx.x;
 51 | 	const int y = blockIdx.y * blockDim.y + threadIdx.y;
 52 | 
 53 | 	if( x >= srcAlignedWidth || y >= height )
 54 | 		return;
 55 | 
 56 | 	const uchar4 macroPx = src[y * srcAlignedWidth + x];
 57 | 
 58 | 	// Y0 is the brightness of pixel 0, Y1 the brightness of pixel 1.
 59 | 	// U0 and V0 is the color of both pixels.
 60 | 	// UYVY [ U0 | Y0 | V0 | Y1 ] 
 61 | 	// YUYV [ Y0 | U0 | Y1 | V0 ]
 62 | 	const float y0 = formatUYVY ? macroPx.y : macroPx.x;
 63 | 	const float y1 = formatUYVY ? macroPx.w : macroPx.z; 
 64 | 	const float u = (formatUYVY ? macroPx.x : macroPx.y) - 128.0f;
 65 | 	const float v = (formatUYVY ? macroPx.z : macroPx.w) - 128.0f;
 66 | 
 67 | 	const float4 px0 = make_float4( y0 + 1.4065f * v,
 68 | 							  y0 - 0.3455f * u - 0.7169f * v,
 69 | 							  y0 + 1.7790f * u, 255.0f );
 70 | 
 71 | 	const float4 px1 = make_float4( y1 + 1.4065f * v,
 72 | 							  y1 - 0.3455f * u - 0.7169f * v,
 73 | 							  y1 + 1.7790f * u, 255.0f );
 74 | 
 75 | 	dst[y * dstAlignedWidth + x] = make_uchar8( clamp(px0.x, 0.0f, 255.0f), 
 76 | 									    clamp(px0.y, 0.0f, 255.0f),
 77 | 									    clamp(px0.z, 0.0f, 255.0f),
 78 | 									    clamp(px0.w, 0.0f, 255.0f),
 79 | 									    clamp(px1.x, 0.0f, 255.0f),
 80 | 									    clamp(px1.y, 0.0f, 255.0f),
 81 | 									    clamp(px1.z, 0.0f, 255.0f),
 82 | 									    clamp(px1.w, 0.0f, 255.0f) );
 83 | } 
 84 | 
 85 | template<bool formatUYVY>
 86 | cudaError_t launchYUYV( uchar2* input, size_t inputPitch, uchar4* output, size_t outputPitch, size_t width, size_t height)
 87 | {
 88 | 	if( !input || !inputPitch || !output || !outputPitch || !width || !height )
 89 | 		return cudaErrorInvalidValue;
 90 | 
 91 | 	const dim3 block(8,8);
 92 | 	const dim3 grid(iDivUp(width/2, block.x), iDivUp(height, block.y));
 93 | 
 94 | 	const int srcAlignedWidth = inputPitch / sizeof(uchar4);	// normally would be uchar2, but we're doubling up pixels
 95 | 	const int dstAlignedWidth = outputPitch / sizeof(uchar8);	// normally would be uchar4 ^^^
 96 | 
 97 | 	//printf("yuyvToRgba %zu %zu %i %i %i %i %i\n", width, height, (int)formatUYVY, srcAlignedWidth, dstAlignedWidth, grid.x, grid.y);
 98 | 
 99 | 	yuyvToRgba<formatUYVY><<<grid, block>>>((uchar4*)input, srcAlignedWidth, (uchar8*)output, dstAlignedWidth, width, height);
100 | 
101 | 	return CUDA(cudaGetLastError());
102 | }
103 | 
104 | 
105 | cudaError_t cudaUYVYToRGBA( uchar2* input, uchar4* output, size_t width, size_t height )
106 | {
107 | 	return cudaUYVYToRGBA(input, width * sizeof(uchar2), output, width * sizeof(uchar4), width, height);
108 | }
109 | 
110 | cudaError_t cudaUYVYToRGBA( uchar2* input, size_t inputPitch, uchar4* output, size_t outputPitch, size_t width, size_t height )
111 | {
112 | 	return launchYUYV<true>(input, inputPitch, output, outputPitch, width, height);
113 | }
114 | 
115 | cudaError_t cudaYUYVToRGBA( uchar2* input, uchar4* output, size_t width, size_t height )
116 | {
117 | 	return cudaYUYVToRGBA(input, width * sizeof(uchar2), output, width * sizeof(uchar4), width, height);
118 | }
119 | 
120 | cudaError_t cudaYUYVToRGBA( uchar2* input, size_t inputPitch, uchar4* output, size_t outputPitch, size_t width, size_t height )
121 | {
122 | 	return launchYUYV<false>(input, inputPitch, output, outputPitch, width, height);
123 | }
124 | 
125 | 
126 | //-----------------------------------------------------------------------------------
127 | // YUYV/UYVY to grayscale
128 | //-----------------------------------------------------------------------------------
129 | 
130 | template <bool formatUYVY>
131 | __global__ void yuyvToGray( uchar4* src, int srcAlignedWidth, float2* dst, int dstAlignedWidth, int width, int height )
132 | {
133 | 	const int x = blockIdx.x * blockDim.x + threadIdx.x;
134 | 	const int y = blockIdx.y * blockDim.y + threadIdx.y;
135 | 
136 | 	if( x >= srcAlignedWidth || y >= height )
137 | 		return;
138 | 
139 | 	const uchar4 macroPx = src[y * srcAlignedWidth + x];
140 | 
141 | 	const float y0 = formatUYVY ? macroPx.y : macroPx.x;
142 | 	const float y1 = formatUYVY ? macroPx.w : macroPx.z; 
143 | 
144 | 	dst[y * dstAlignedWidth + x] = make_float2(y0/255.0f, y1/255.0f);
145 | } 
146 | 
147 | template<bool formatUYVY>
148 | cudaError_t launchGrayYUYV( uchar2* input, size_t inputPitch, float* output, size_t outputPitch, size_t width, size_t height)
149 | {
150 | 	if( !input || !inputPitch || !output || !outputPitch || !width || !height )
151 | 		return cudaErrorInvalidValue;
152 | 
153 | 	const dim3 block(8,8);
154 | 	const dim3 grid(iDivUp(width/2, block.x), iDivUp(height, block.y));
155 | 
156 | 	const int srcAlignedWidth = inputPitch / sizeof(uchar4);	// normally would be uchar2, but we're doubling up pixels
157 | 	const int dstAlignedWidth = outputPitch / sizeof(float2);	// normally would be float ^^^
158 | 
159 | 	yuyvToGray<formatUYVY><<<grid, block>>>((uchar4*)input, srcAlignedWidth, (float2*)output, dstAlignedWidth, width, height);
160 | 
161 | 	return CUDA(cudaGetLastError());
162 | }
163 | 
164 | cudaError_t cudaUYVYToGray( uchar2* input, float* output, size_t width, size_t height )
165 | {
166 | 	return cudaUYVYToGray(input, width * sizeof(uchar2), output, width * sizeof(uint8_t), width, height);
167 | }
168 | 
169 | cudaError_t cudaUYVYToGray( uchar2* input, size_t inputPitch, float* output, size_t outputPitch, size_t width, size_t height )
170 | {
171 | 	return launchGrayYUYV<true>(input, inputPitch, output, outputPitch, width, height);
172 | }
173 | 
174 | cudaError_t cudaYUYVToGray( uchar2* input, float* output, size_t width, size_t height )
175 | {
176 | 	return cudaYUYVToGray(input, width * sizeof(uchar2), output, width * sizeof(float), width, height);
177 | }
178 | 
179 | cudaError_t cudaYUYVToGray( uchar2* input, size_t inputPitch, float* output, size_t outputPitch, size_t width, size_t height )
180 | {
181 | 	return launchGrayYUYV<false>(input, inputPitch, output, outputPitch, width, height);
182 | }
183 | 
184 | 


--------------------------------------------------------------------------------
/util/cuda/cudaYUV-YV12.cu:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * inference-101
  3 |  */
  4 | 
  5 | #include "cudaYUV.h"
  6 | 
  7 | 
  8 | 
  9 | 
 10 | 
 11 | inline __device__ void rgb_to_y(const uint8_t r, const uint8_t g, const uint8_t b, uint8_t& y)
 12 | {
 13 | 	y = static_cast<uint8_t>(((int)(30 * r) + (int)(59 * g) + (int)(11 * b)) / 100);
 14 | }
 15 | 
 16 | inline __device__ void rgb_to_yuv(const uint8_t r, const uint8_t g, const uint8_t b, uint8_t& y, uint8_t& u, uint8_t& v)
 17 | {
 18 | 	rgb_to_y(r, g, b, y);
 19 | 	u = static_cast<uint8_t>(((int)(-17 * r) - (int)(33 * g) + (int)(50 * b) + 12800) / 100);
 20 | 	v = static_cast<uint8_t>(((int)(50 * r) - (int)(42 * g) - (int)(8 * b) + 12800) / 100);
 21 | }
 22 | 
 23 | template <typename T, bool formatYV12>
 24 | __global__ void RGB_to_YV12( T* src, int srcAlignedWidth, uint8_t* dst, int dstPitch, int width, int height )
 25 | {
 26 | 	const int x = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
 27 | 	const int y = (blockIdx.y * blockDim.y + threadIdx.y) * 2;
 28 | 
 29 | 	const int x1 = x + 1;
 30 | 	const int y1 = y + 1;
 31 | 
 32 | 	if( x1 >= width || y1 >= height )
 33 | 		return;
 34 | 
 35 | 	const int planeSize = height * dstPitch;
 36 | 	
 37 | 	uint8_t* y_plane = dst;
 38 | 	uint8_t* u_plane;
 39 | 	uint8_t* v_plane;
 40 | 
 41 | 	if( formatYV12 )
 42 | 	{
 43 | 		u_plane = y_plane + planeSize;
 44 | 		v_plane = u_plane + (planeSize / 4);	// size of U & V planes is 25% of Y plane
 45 | 	}
 46 | 	else
 47 | 	{
 48 | 		v_plane = y_plane + planeSize;		// in I420, order of U & V planes is reversed
 49 | 		u_plane = v_plane + (planeSize / 4);
 50 | 	}
 51 | 
 52 | 	T px;
 53 | 	uint8_t y_val, u_val, v_val;
 54 | 
 55 | 	px = src[y * srcAlignedWidth + x];
 56 | 	rgb_to_y(px.x, px.y, px.z, y_val);
 57 | 	y_plane[y * dstPitch + x] = y_val;
 58 | 
 59 | 	px = src[y * srcAlignedWidth + x1];
 60 | 	rgb_to_y(px.x, px.y, px.z, y_val);
 61 | 	y_plane[y * dstPitch + x1] = y_val;
 62 | 
 63 | 	px = src[y1 * srcAlignedWidth + x];
 64 | 	rgb_to_y(px.x, px.y, px.z, y_val);
 65 | 	y_plane[y1 * dstPitch + x] = y_val;
 66 | 	
 67 | 	px = src[y1 * srcAlignedWidth + x1];
 68 | 	rgb_to_yuv(px.x, px.y, px.z, y_val, u_val, v_val);
 69 | 	y_plane[y1 * dstPitch + x1] = y_val;
 70 | 
 71 | 	const int uvPitch = dstPitch / 2;
 72 | 	const int uvIndex = (y / 2) * uvPitch + (x / 2);
 73 | 
 74 | 	u_plane[uvIndex] = u_val;
 75 | 	v_plane[uvIndex] = v_val;
 76 | } 
 77 | 
 78 | template<typename T, bool formatYV12>
 79 | cudaError_t launch420( T* input, size_t inputPitch, uint8_t* output, size_t outputPitch, size_t width, size_t height)
 80 | {
 81 | 	if( !input || !inputPitch || !output || !outputPitch || !width || !height )
 82 | 		return cudaErrorInvalidValue;
 83 | 
 84 | 	const dim3 block(32, 8);
 85 | 	const dim3 grid(iDivUp(width, block.x * 2), iDivUp(height, block.y * 2));
 86 | 
 87 | 	const int inputAlignedWidth = inputPitch / sizeof(T);
 88 | 
 89 | 	RGB_to_YV12<T, formatYV12><<<grid, block>>>(input, inputAlignedWidth, output, outputPitch, width, height);
 90 | 
 91 | 	return CUDA(cudaGetLastError());
 92 | }
 93 | 
 94 | 
 95 | 
 96 | // cudaRGBAToYV12
 97 | cudaError_t cudaRGBAToYV12( uchar4* input, size_t inputPitch, uint8_t* output, size_t outputPitch, size_t width, size_t height )
 98 | {
 99 | 	return launch420<uchar4,false>( input, inputPitch, output, outputPitch, width, height );
100 | }
101 | 
102 | // cudaRGBAToYV12
103 | cudaError_t cudaRGBAToYV12( uchar4* input, uint8_t* output, size_t width, size_t height )
104 | {
105 | 	return cudaRGBAToYV12( input, width * sizeof(uchar4), output, width * sizeof(uint8_t), width, height );
106 | }
107 | 
108 | // cudaRGBAToI420
109 | cudaError_t cudaRGBAToI420( uchar4* input, size_t inputPitch, uint8_t* output, size_t outputPitch, size_t width, size_t height )
110 | {
111 | 	return launch420<uchar4,true>( input, inputPitch, output, outputPitch, width, height );
112 | }
113 | 
114 | // cudaRGBAToI420
115 | cudaError_t cudaRGBAToI420( uchar4* input, uint8_t* output, size_t width, size_t height )
116 | {
117 | 	return cudaRGBAToI420( input, width * sizeof(uchar4), output, width * sizeof(uint8_t), width, height );
118 | }
119 | 
120 | 
121 | 
122 | #if 0
123 | __global__ void Gray_to_YV12(const GlobPtrSz<uint8_t> src, GlobPtr<uint8_t> dst)
124 | {
125 | 	const int x = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
126 | 	const int y = (blockIdx.y * blockDim.y + threadIdx.y) * 2;
127 | 
128 | 	if (x + 1 >= src.cols || y + 1 >= src.rows)
129 | 		return;
130 | 
131 | 	// get pointers to the data
132 | 	const size_t planeSize = src.rows * dst.step;
133 |    GlobPtr<uint8_t> y_plane = globPtr(dst.data, dst.step);
134 |    GlobPtr<uint8_t> u_plane = globPtr(y_plane.data + planeSize, dst.step / 2);
135 |    GlobPtr<uint8_t> v_plane = globPtr(u_plane.data + (planeSize / 4), dst.step / 2);
136 | 
137 |    uint8_t pix;
138 |    uint8_t y_val, u_val, v_val;
139 | 
140 |    pix = src(y, x);
141 |    rgb_to_y(pix, pix, pix, y_val);
142 |    y_plane(y, x) = y_val;
143 | 
144 |    pix = src(y, x + 1);
145 |    rgb_to_y(pix, pix, pix, y_val);
146 |    y_plane(y, x + 1) = y_val;
147 | 
148 |    pix = src(y + 1, x);
149 |    rgb_to_y(pix, pix, pix, y_val);
150 |    y_plane(y + 1, x) = y_val;
151 | 
152 |    pix = src(y + 1, x + 1);
153 |    rgb_to_yuv(pix, pix, pix, y_val, u_val, v_val);
154 |    y_plane(y + 1, x + 1) = y_val;
155 |    u_plane(y / 2, x / 2) = u_val;
156 |    v_plane(y / 2, x / 2) = v_val;
157 | }
158 | #endif
159 | 
160 | 


--------------------------------------------------------------------------------
/util/cuda/cudaYUV.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * http://github.com/dusty-nv/jetson-inference
  3 |  */
  4 | 
  5 | #ifndef __CUDA_YUV_CONVERT_H
  6 | #define __CUDA_YUV_CONVERT_H
  7 | 
  8 | 
  9 | #include "cudaUtility.h"
 10 | #include <stdint.h>
 11 | 
 12 | 
 13 | //////////////////////////////////////////////////////////////////////////////////
 14 | /// @name RGBA to YUV 4:2:0 planar (I420 & YV12)
 15 | /// @ingroup util
 16 | //////////////////////////////////////////////////////////////////////////////////
 17 | 
 18 | ///@{
 19 | 
 20 | /**
 21 |  * Convert an RGBA uchar4 buffer into YUV I420 planar.
 22 |  */
 23 | cudaError_t cudaRGBAToI420( uchar4* input, uint8_t* output, size_t width, size_t height );
 24 | 
 25 | /**
 26 |  * Convert an RGBA uchar4 texture into YUV I420 planar.
 27 |  */
 28 | cudaError_t cudaRGBAToI420( uchar4* input, size_t inputPitch, uint8_t* output, size_t outputPitch, size_t width, size_t height );
 29 | 
 30 | /**
 31 |  * Convert an RGBA uchar4 buffer into YUV YV12 planar.
 32 |  */
 33 | cudaError_t cudaRGBAToYV12( uchar4* input, uint8_t* output, size_t width, size_t height );
 34 | 
 35 | /**
 36 |  * Convert an RGBA uchar4 texture into YUV YV12 planar.
 37 |  */
 38 | cudaError_t cudaRGBAToYV12( uchar4* input, size_t inputPitch, uint8_t* output, size_t outputPitch, size_t width, size_t height );
 39 | 
 40 | ///@}
 41 | 
 42 | 
 43 | //////////////////////////////////////////////////////////////////////////////////
 44 | /// @name YUV 4:2:2 packed (UYVY & YUYV) to RGBA
 45 | /// @ingroup util
 46 | //////////////////////////////////////////////////////////////////////////////////
 47 | 
 48 | ///@{
 49 | 
 50 | /**
 51 |  * Convert a UYVY 422 packed image into RGBA uchar4.
 52 |  */
 53 | cudaError_t cudaUYVYToRGBA( uchar2* input, uchar4* output, size_t width, size_t height );
 54 | 
 55 | /**
 56 |  * Convert a UYVY 422 packed image into RGBA uchar4.
 57 |  */
 58 | cudaError_t cudaUYVYToRGBA( uchar2* input, size_t inputPitch, uchar4* output, size_t outputPitch, size_t width, size_t height );
 59 | 
 60 | /**
 61 |  * Convert a YUYV 422 packed image into RGBA uchar4.
 62 |  */
 63 | cudaError_t cudaYUYVToRGBA( uchar2* input, uchar4* output, size_t width, size_t height );
 64 | 
 65 | /**
 66 |  * Convert a YUYV 422 packed image into RGBA uchar4.
 67 |  */
 68 | cudaError_t cudaYUYVToRGBA( uchar2* input, size_t inputPitch, uchar4* output, size_t outputPitch, size_t width, size_t height );
 69 | 
 70 | ///@}
 71 | 
 72 | 
 73 | //////////////////////////////////////////////////////////////////////////////////
 74 | /// @name UYUV 4:2:2 packed (UYVY & YUYV) to grayscale
 75 | /// @ingroup util
 76 | //////////////////////////////////////////////////////////////////////////////////
 77 | 
 78 | ///@{
 79 | 
 80 | /**
 81 |  * Convert a UYVY 422 packed image into a uint8 grayscale.
 82 |  */
 83 | cudaError_t cudaUYVYToGray( uchar2* input, float* output, size_t width, size_t height );
 84 | 
 85 | /**
 86 |  * Convert a UYVY 422 packed image into a uint8 grayscale.
 87 |  */
 88 | cudaError_t cudaUYVYToGray( uchar2* input, size_t inputPitch, float* output, size_t outputPitch, size_t width, size_t height );
 89 | 
 90 | /**
 91 |  * Convert a YUYV 422 packed image into a uint8 grayscale.
 92 |  */
 93 | cudaError_t cudaYUYVToGray( uchar2* input, float* output, size_t width, size_t height );
 94 | 
 95 | /**
 96 |  * Convert a YUYV 422 packed image into a uint8 grayscale.
 97 |  */
 98 | cudaError_t cudaYUYVToGray( uchar2* input, size_t inputPitch, float* output, size_t outputPitch, size_t width, size_t height );
 99 | 
100 | ///@}
101 | 
102 | 
103 | //////////////////////////////////////////////////////////////////////////////////
104 | /// @name YUV NV12 to RGBA
105 | /// @ingroup util
106 | //////////////////////////////////////////////////////////////////////////////////
107 | 
108 | ///@{
109 | 
110 | /**
111 |  * Convert an NV12 texture (semi-planar 4:2:0) to ARGB uchar4 format.
112 |  * NV12 = 8-bit Y plane followed by an interleaved U/V plane with 2x2 subsampling.
113 |  */
114 | cudaError_t cudaNV12ToRGBA( uint8_t* input, size_t inputPitch, uchar4* output, size_t outputPitch, size_t width, size_t height );
115 | cudaError_t cudaNV12ToRGBA( uint8_t* input, uchar4* output, size_t width, size_t height );
116 | 
117 | cudaError_t cudaNV12ToRGBAf( uint8_t* input, size_t inputPitch, float4* output, size_t outputPitch, size_t width, size_t height );
118 | cudaError_t cudaNV12ToRGBAf( uint8_t* input, float4* output, size_t width, size_t height );
119 | 
120 | /**
121 |  * Setup NV12 color conversion constants.
122 |  * cudaNV12SetupColorspace() isn't necessary for the user to call, it will be
123 |  * called automatically by cudaNV12ToRGBA() with a hue of 0.0.
124 |  * However if you want to setup custom constants (ie with a hue different than 0),
125 |  * then you can call cudaNV12SetupColorspace() at any time, overriding the default.
126 |  */
127 | cudaError_t cudaNV12SetupColorspace( float hue = 0.0f ); 
128 | 
129 | ///@}
130 | 
131 | #endif
132 | 
133 | 


--------------------------------------------------------------------------------
/util/loadImage.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
  3 |  *
  4 |  * Permission is hereby granted, free of charge, to any person obtaining a
  5 |  * copy of this software and associated documentation files (the "Software"),
  6 |  * to deal in the Software without restriction, including without limitation
  7 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  8 |  * and/or sell copies of the Software, and to permit persons to whom the
  9 |  * Software is furnished to do so, subject to the following conditions:
 10 |  *
 11 |  * The above copyright notice and this permission notice shall be included in
 12 |  * all copies or substantial portions of the Software.
 13 |  *
 14 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 15 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 16 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 17 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 18 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 19 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 20 |  * DEALINGS IN THE SOFTWARE.
 21 |  */
 22 | #include <iostream>
 23 | #include "loadImage.h"
 24 | #include "../util/cuda/cudaMappedMemory.h"
 25 | 
 26 | //#include <QImage>
 27 | #include <opencv2/core/mat.hpp>
 28 | 
 29 | 
 30 | // loadImageRGBA
 31 | //bool loadImageRGBA( const char* filename, float4** cpu, float4** gpu, int* width, int* height )
 32 | //{
 33 | //	if( !filename || !cpu || !gpu || !width || !height )
 34 | //	{
 35 | //		printf("loadImageRGBA - invalid parameter\n");
 36 | //		return false;
 37 | //	}
 38 | //
 39 | //	// load original image
 40 | //	QImage qImg;
 41 | //
 42 | //	if( !qImg.load(filename) )
 43 | //	{
 44 | //		printf("failed to load image %s\n", filename);
 45 | //		return false;
 46 | //	}
 47 | //
 48 | //	if( *width != 0 && *height != 0 )
 49 | //		qImg = qImg.scaled(*width, *height, Qt::IgnoreAspectRatio);
 50 | //
 51 | //	const uint32_t imgWidth  = qImg.width();
 52 | //	const uint32_t imgHeight = qImg.height();
 53 | //	const uint32_t imgPixels = imgWidth * imgHeight;
 54 | //	const size_t   imgSize   = imgWidth * imgHeight * sizeof(float) * 4;
 55 | //
 56 | //	printf("loaded image  %s  (%u x %u)  %zu bytes\n", filename, imgWidth, imgHeight, imgSize);
 57 | //
 58 | //	// allocate buffer for the image
 59 | //	if( !cudaAllocMapped((void**)cpu, (void**)gpu, imgSize) )
 60 | //	{
 61 | //		printf(LOG_CUDA "failed to allocated %zu bytes for image %s\n", imgSize, filename);
 62 | //		return false;
 63 | //	}
 64 | //
 65 | //	float4* cpuPtr = *cpu;
 66 | //
 67 | //	for( uint32_t y=0; y < imgHeight; y++ )
 68 | //	{
 69 | //		for( uint32_t x=0; x < imgWidth; x++ )
 70 | //		{
 71 | //			const QRgb rgb  = qImg.pixel(x,y);
 72 | //			const float4 px = make_float4(float(qRed(rgb)),
 73 | //										  float(qGreen(rgb)),
 74 | //										  float(qBlue(rgb)),
 75 | //										  float(qAlpha(rgb)));
 76 | //
 77 | //			cpuPtr[y*imgWidth+x] = px;
 78 | //		}
 79 | //	}
 80 | //
 81 | //	*width  = imgWidth;
 82 | //	*height = imgHeight;
 83 | //	return true;
 84 | //}
 85 | //
 86 | //
 87 | 
 88 | /*
 89 | // loadImageRGB
 90 | bool loadImageRGB( const char* filename, float3** cpu, float3** gpu, int* width, int* height, const float3& mean )
 91 | {
 92 |     if( !filename || !cpu || !gpu || !width || !height )
 93 |     {
 94 |         printf("loadImageRGB - invalid parameter\n");
 95 |         return false;
 96 |     }
 97 | 
 98 |     // load original image
 99 |     QImage qImg;
100 | 
101 |     if( !qImg.load(filename) )
102 |     {
103 |         printf("failed to load image %s\n", filename);
104 |         return false;
105 |     }
106 | 
107 |     if( *width != 0 && *height != 0 )
108 |         qImg = qImg.scaled(*width, *height, Qt::IgnoreAspectRatio);
109 | 
110 |     const uint32_t imgWidth  = qImg.width();
111 |     const uint32_t imgHeight = qImg.height();
112 |     const uint32_t imgPixels = imgWidth * imgHeight;
113 |     const size_t   imgSize   = imgWidth * imgHeight * sizeof(float) * 3;
114 | 
115 |     printf("loaded image  %s  (%u x %u)  %zu bytes\n", filename, imgWidth, imgHeight, imgSize);
116 | 
117 |     // allocate buffer for the image
118 |     if( !cudaAllocMapped((void**)cpu, (void**)gpu, imgSize) )
119 |     {
120 |         printf(LOG_CUDA "failed to allocated %zu bytes for image %s\n", imgSize, filename);
121 |         return false;
122 |     }
123 | 
124 |     float* cpuPtr = (float*)*cpu;
125 | 
126 |     for( uint32_t y=0; y < imgHeight; y++ )
127 |     {
128 |         for( uint32_t x=0; x < imgWidth; x++ )
129 |         {
130 |             const QRgb rgb  = qImg.pixel(x,y);
131 |             const float mul = 0.007843f; 	//1.0f / 255.0f;
132 |             const float3 px = make_float3((float(qRed(rgb))   - mean.x) * mul,
133 |                                           (float(qGreen(rgb)) - mean.y) * mul,
134 |                                           (float(qBlue(rgb))  - mean.z) * mul );
135 | 
136 |             // note:  caffe/GIE is band-sequential (as opposed to the typical Band Interleaved by Pixel)
137 |             cpuPtr[imgPixels * 0 + y * imgWidth + x] = px.x;
138 |             cpuPtr[imgPixels * 1 + y * imgWidth + x] = px.y;
139 |             cpuPtr[imgPixels * 2 + y * imgWidth + x] = px.z;
140 |         }
141 |     }
142 | 
143 |     *width  = imgWidth;
144 |     *height = imgHeight;
145 |     return true;
146 | }
147 | */
148 | 
149 | 
150 | bool    loadImageBGR( cv::Mat frame, float3** cpu, float3** gpu, int* width, int* height, const float3& mean )
151 | {
152 | 	const uint32_t imgWidth  = 300;
153 | 	const uint32_t imgHeight = 300;
154 | 	const uint32_t imgPixels = imgWidth * imgHeight;
155 | 	const size_t   imgSize   = imgWidth * imgHeight * sizeof(float) * 3;
156 | 
157 | 	// allocate buffer for the image
158 | 	if( !cudaAllocMapped((void**)cpu, (void**)gpu, imgSize) )
159 | 	{
160 | 		printf(LOG_CUDA "failed to allocated bytes for image");
161 | 		return false;
162 | 	}
163 | 
164 | 	float* cpuPtr = (float*)*cpu;
165 | 
166 | 	for( uint32_t y=0; y < imgHeight; y++ )
167 | 	{
168 | 		for( uint32_t x=0; x < imgWidth; x++ )
169 | 		{
170 |       cv::Vec3b intensity = frame.at<cv::Vec3b>(y,x);
171 | 			cpuPtr[imgPixels * 0 + y * imgWidth + x] = (float)intensity.val[0];
172 | 			cpuPtr[imgPixels * 1 + y * imgWidth + x] = (float)intensity.val[1];
173 | 			cpuPtr[imgPixels * 2 + y * imgWidth + x] = (float)intensity.val[2];
174 | 		}
175 | 	}
176 | 	return true;
177 | }
178 | 


--------------------------------------------------------------------------------
/util/loadImage.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * Permission is hereby granted, free of charge, to any person obtaining a
 5 |  * copy of this software and associated documentation files (the "Software"),
 6 |  * to deal in the Software without restriction, including without limitation
 7 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 8 |  * and/or sell copies of the Software, and to permit persons to whom the
 9 |  * Software is furnished to do so, subject to the following conditions:
10 |  *
11 |  * The above copyright notice and this permission notice shall be included in
12 |  * all copies or substantial portions of the Software.
13 |  *
14 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 |  * DEALINGS IN THE SOFTWARE.
21 |  */
22 |  
23 | #ifndef __IMAGE_LOADER_H_
24 | #define __IMAGE_LOADER_H_
25 | 
26 | 
27 | #include <opencv2/core/mat.hpp>
28 | #include "../util/cuda/cudaUtility.h"
29 | 
30 | 
31 | /**
32 |  * Load a color image from disk into CUDA memory with alpha.
33 |  * This function loads the image into shared CPU/GPU memory, using the functions from cudaMappedMemory.h
34 |  *
35 |  * @param filename Path to the image file on disk.
36 |  * @param cpu Pointer to CPU buffer allocated containing the image.
37 |  * @param gpu Pointer to CUDA device buffer residing on GPU containing image.
38 |  * @param width Variable containing width in pixels of the image.
39 |  * @param height Variable containing height in pixels of the image.
40 |  *
41 |  * @ingroup util
42 |  */
43 | bool loadImageRGBA( const char* filename, float4** cpu, float4** gpu, int* width, int* height );
44 | 
45 | 
46 | /**
47 |  * Save an image to disk
48 |  * @ingroup util
49 |  */
50 | bool saveImageRGBA( const char* filename, float4* cpu, int width, int height, float max_pixel=255.0f );
51 | 
52 | 
53 | /**
54 |  * Load a color image from disk into CUDA memory.
55 |  * This function loads the image into shared CPU/GPU memory, using the functions from cudaMappedMemory.h
56 |  *
57 |  * @param filename Path to the image file on disk.
58 |  * @param cpu Pointer to CPU buffer allocated containing the image.
59 |  * @param gpu Pointer to CUDA device buffer residing on GPU containing image.
60 |  * @param width Variable containing width in pixels of the image.
61 |  * @param height Variable containing height in pixels of the image.
62 |  *
63 |  * @ingroup util
64 |  */
65 | bool loadImageRGB( const char* filename, float3** cpu, float3** gpu, int* width, int* height, const float3& mean=make_float3(0,0,0) );
66 | 
67 | 
68 | /**
69 |  * Load a color image from disk into CUDA memory.
70 |  * This function loads the image into shared CPU/GPU memory, using the functions from cudaMappedMemory.h
71 |  *
72 |  * @param filename Path to the image file on disk.
73 |  * @param cpu Pointer to CPU buffer allocated containing the image.
74 |  * @param gpu Pointer to CUDA device buffer residing on GPU containing image.
75 |  * @param width Variable containing width in pixels of the image.
76 |  * @param height Variable containing height in pixels of the image.
77 |  *
78 |  * @ingroup util
79 |  */
80 | bool loadImageBGR( cv::Mat frame, float3** cpu, float3** gpu, int* width, int* height, const float3& mean=make_float3(0,0,0) );
81 | 
82 | 
83 | 
84 | #endif
85 | 


--------------------------------------------------------------------------------