├── Makefile ├── Makefile~ ├── README.md ├── bboxParser.cu ├── bboxParser.cu~ ├── bboxParser.h ├── bboxParser.h~ ├── classifier.h ├── common.cu ├── common.h ├── common.h~ ├── draw.h ├── draw.h~ ├── gridSearchParam.sh ├── gridSearchParam.sh~ ├── include ├── bitmap_image.hpp ├── cub │ ├── agent │ │ ├── agent_histogram.cuh │ │ ├── agent_radix_sort_downsweep.cuh │ │ ├── agent_radix_sort_upsweep.cuh │ │ ├── agent_reduce.cuh │ │ ├── agent_reduce_by_key.cuh │ │ ├── agent_rle.cuh │ │ ├── agent_scan.cuh │ │ ├── agent_segment_fixup.cuh │ │ ├── agent_select_if.cuh │ │ ├── agent_spmv_orig.cuh │ │ └── single_pass_scan_operators.cuh │ ├── block │ │ ├── block_adjacent_difference.cuh │ │ ├── block_discontinuity.cuh │ │ ├── block_exchange.cuh │ │ ├── block_histogram.cuh │ │ ├── block_load.cuh │ │ ├── block_radix_rank.cuh │ │ ├── block_radix_sort.cuh │ │ ├── block_raking_layout.cuh │ │ ├── block_reduce.cuh │ │ ├── block_scan.cuh │ │ ├── block_shuffle.cuh │ │ ├── block_store.cuh │ │ └── specializations │ │ │ ├── block_histogram_atomic.cuh │ │ │ ├── block_histogram_sort.cuh │ │ │ ├── block_reduce_raking.cuh │ │ │ ├── block_reduce_raking_commutative_only.cuh │ │ │ ├── block_reduce_warp_reductions.cuh │ │ │ ├── block_scan_raking.cuh │ │ │ ├── block_scan_warp_scans.cuh │ │ │ ├── block_scan_warp_scans2.cuh │ │ │ └── block_scan_warp_scans3.cuh │ ├── cub.cuh │ ├── device │ │ ├── device_histogram.cuh │ │ ├── device_partition.cuh │ │ ├── device_radix_sort.cuh │ │ ├── device_reduce.cuh │ │ ├── device_run_length_encode.cuh │ │ ├── device_scan.cuh │ │ ├── device_segmented_radix_sort.cuh │ │ ├── device_segmented_reduce.cuh │ │ ├── device_select.cuh │ │ ├── device_spmv.cuh │ │ └── dispatch │ │ │ ├── dispatch_histogram.cuh │ │ │ ├── dispatch_radix_sort.cuh │ │ │ ├── dispatch_reduce.cuh │ │ │ ├── dispatch_reduce_by_key.cuh │ │ │ ├── dispatch_rle.cuh │ │ │ ├── dispatch_scan.cuh │ │ │ ├── dispatch_select_if.cuh │ │ │ └── dispatch_spmv_orig.cuh │ ├── grid │ │ ├── grid_barrier.cuh │ │ ├── grid_even_share.cuh │ │ ├── grid_mapping.cuh │ │ └── grid_queue.cuh │ ├── host │ │ └── mutex.cuh │ ├── iterator │ │ ├── arg_index_input_iterator.cuh │ │ ├── cache_modified_input_iterator.cuh │ │ ├── cache_modified_output_iterator.cuh │ │ ├── constant_input_iterator.cuh │ │ ├── counting_input_iterator.cuh │ │ ├── discard_output_iterator.cuh │ │ ├── tex_obj_input_iterator.cuh │ │ ├── tex_ref_input_iterator.cuh │ │ └── transform_input_iterator.cuh │ ├── thread │ │ ├── thread_load.cuh │ │ ├── thread_operators.cuh │ │ ├── thread_reduce.cuh │ │ ├── thread_scan.cuh │ │ ├── thread_search.cuh │ │ └── thread_store.cuh │ ├── util_allocator.cuh │ ├── util_arch.cuh │ ├── util_debug.cuh │ ├── util_device.cuh │ ├── util_macro.cuh │ ├── util_namespace.cuh │ ├── util_ptx.cuh │ ├── util_type.cuh │ └── warp │ │ ├── specializations │ │ ├── warp_reduce_shfl.cuh │ │ ├── warp_reduce_smem.cuh │ │ ├── warp_scan_shfl.cuh │ │ └── warp_scan_smem.cuh │ │ ├── warp_reduce.cuh │ │ └── warp_scan.cuh ├── helper_cuda.h ├── helper_string.h └── logger.h ├── interpPlugin.cu ├── interpPlugin.h ├── main.cpp ├── main.cpp~ ├── nvUtils.h ├── predictions_fp32.jpg ├── preproc_yolov3.h ├── procInferOutput.h ├── procInferOutput.h~ ├── regionLayer.cu ├── regionLayer.cu~ ├── regionLayer.h ├── regionLayer.h~ ├── results ├── calc_mAP.py ├── calc_mAP.py~ ├── mAP.csv ├── mAP.csv~ ├── voc_eval.py └── voc_eval.pyc ├── run.sh ├── run.sh~ ├── src ├── bboxParser.cu ├── bboxParser.cu~ ├── bboxParser.h ├── bboxParser.h~ ├── classifier.h ├── common.cu ├── common.h ├── common.h~ ├── draw.h ├── draw.h~ 
├── interpPlugin.cu ├── interpPlugin.h ├── main.cpp ├── main.cpp~ ├── nvUtils.h ├── preproc_yolov3.h ├── procInferOutput.h ├── procInferOutput.h~ ├── regionLayer.cu ├── regionLayer.cu~ ├── regionLayer.h ├── regionLayer.h~ ├── tags ├── tensorRTClassifier.cpp └── tensorRTClassifier.h ├── tags ├── tensorRTClassifier.cpp ├── tensorRTClassifier.h ├── test.py~ └── test.sh~ /Makefile: -------------------------------------------------------------------------------- 1 | DEBUG := 1 2 | NVPROFILER := 0 3 | 4 | #-DVOC # model trained on VOC 5 | #-Dcal_mAP # calculate mAP 6 | #-DPRINT_LOG # print the prediction result 7 | #-DVISULIZATION # draw boxes on the image && save 8 | CUSTOM_MICRO := -DVOC -DPRINT_LOG # -DVISULIZATION 9 | 10 | GCC := g++ 11 | CCFLAGS := -m64 -std=c++11 -O3 $(CUSTOM_MICRO) 12 | NVCC := nvcc 13 | # Choose your arch for fast compilation, 14 | # sm_60 and sm_61 are for pascal gpu, 15 | # sm_30 and sm_35 are for Tesla K40 gpu 16 | NVCC_FLAGS := -gencode arch=compute_60,code=compute_60 \ 17 | -gencode arch=compute_61,code=compute_61 18 | ifeq ($(DEBUG), 1) 19 | CCFLAGS += -g 20 | NVCC_FLAGS += -G 21 | endif 22 | ifeq ($(NVPROFILER), 1) 23 | NVCC_FLAGS += -lineinfo 24 | endif 25 | NVCC_FLAGS += $(CCFLAGS) 26 | 27 | TENSORRT_VERSION := 212GA 28 | SRC_PATH := ./src 29 | INC_PATH := ./include 30 | 31 | TENSORRT_INC_PATH := ./tensorRT_$(TENSORRT_VERSION)/include 32 | TENSORRT_LIB_PATH := ./tensorRT_$(TENSORRT_VERSION)/lib 33 | 34 | INCLUDES := -I$(SRC_PATH) -I$(INC_PATH) -I$(TENSORRT_INC_PATH) -I/usr/local/cuda/include -I/usr/local/include 35 | 36 | LDPATH := -L/usr/local/lib -L/usr/lib -L$(TENSORRT_LIB_PATH) -L/usr/local/cuda/lib64 -Wl,-rpath,$(TENSORRT_LIB_PATH) 37 | LDFLAGS := $(LDPATH) -ldl -lcudart -lcudnn -lnvinfer -lnvcaffe_parser $(shell pkg-config opencv --libs) 38 | 39 | OBJ_PATH := ./bin/obj 40 | BIN_PATH := ./bin 41 | EXE_FILE := runYOLOv3 42 | 43 | all: build 44 | 45 | build: $(BIN_PATH)/$(EXE_FILE) 46 | 47 | $(OBJ_PATH)/tensorRTClassifier.o: $(SRC_PATH)/tensorRTClassifier.cpp 48 | $(GCC) $(CCFLAGS) $(INCLUDES) -o $@ -c $< 49 | 50 | $(OBJ_PATH)/main.o: $(SRC_PATH)/main.cpp 51 | $(GCC) $(CCFLAGS) $(INCLUDES) -o $@ -c $< 52 | 53 | $(OBJ_PATH)/interpPlugin.o: $(SRC_PATH)/interpPlugin.cu 54 | $(NVCC) $(NVCC_FLAGS) $(INCLUDES) -o $@ -c $< 55 | 56 | $(OBJ_PATH)/bboxParser.o: $(SRC_PATH)/bboxParser.cu 57 | $(NVCC) $(NVCC_FLAGS) $(INCLUDES) -o $@ -c $< 58 | 59 | $(OBJ_PATH)/regionLayer.o: $(SRC_PATH)/regionLayer.cu 60 | $(NVCC) $(NVCC_FLAGS) $(INCLUDES) -o $@ -c $< 61 | 62 | $(OBJ_PATH)/common.o: $(SRC_PATH)/common.cu 63 | $(NVCC) $(NVCC_FLAGS) $(INCLUDES) -o $@ -c $< 64 | 65 | $(BIN_PATH)/$(EXE_FILE): $(OBJ_PATH)/tensorRTClassifier.o $(OBJ_PATH)/main.o $(OBJ_PATH)/interpPlugin.o $(OBJ_PATH)/bboxParser.o $(OBJ_PATH)/regionLayer.o $(OBJ_PATH)/common.o 66 | $(GCC) $+ $(CCFLAGS) $(LDFLAGS) -o $@ 67 | 68 | clean: 69 | rm -rf $(OBJ_PATH)/* $(BIN_PATH)/$(EXE_FILE) 70 | -------------------------------------------------------------------------------- /Makefile~: -------------------------------------------------------------------------------- 1 | DEBUG := 0 2 | NVPROFILER := 1 3 | 4 | #-DVOC # model trained on VOC 5 | #-Dcal_mAP # calculate mAP 6 | #-DPRINT_LOG # print the prediction result 7 | #-DVISULIZATION # draw boxes on the image && save 8 | CUSTOM_MICRO := -DVOC # -DPRINT_LOG -DVISULIZATION 9 | 10 | GCC := g++ 11 | CCFLAGS := -m64 -std=c++11 -O3 $(CUSTOM_MICRO) 12 | NVCC := nvcc 13 | # Choose your arch for fast compilation, 14 | # sm_60 and sm_61 are for pascal gpu, 15 | # sm_30 and 
sm_35 are for Tesla K40 gpu 16 | NVCC_FLAGS := -gencode arch=compute_60,code=compute_60 \ 17 | -gencode arch=compute_61,code=compute_61 18 | ifeq ($(DEBUG), 1) 19 | CCFLAGS += -g 20 | NVCC_FLAGS += -G 21 | endif 22 | ifeq ($(NVPROFILER), 1) 23 | NVCC_FLAGS += -lineinfo 24 | endif 25 | NVCC_FLAGS += $(CCFLAGS) 26 | 27 | TENSORRT_VERSION := 212GA 28 | SRC_PATH := ./src 29 | INC_PATH := ./include 30 | 31 | TENSORRT_INC_PATH := ./tensorRT_$(TENSORRT_VERSION)/include 32 | TENSORRT_LIB_PATH := ./tensorRT_$(TENSORRT_VERSION)/lib 33 | 34 | INCLUDES := -I$(SRC_PATH) -I$(INC_PATH) -I$(TENSORRT_INC_PATH) -I/usr/local/cuda/include -I/usr/local/include 35 | 36 | LDPATH := -L/usr/local/lib -L/usr/lib -L$(TENSORRT_LIB_PATH) -L/usr/local/cuda/lib64 -Wl,-rpath,$(TENSORRT_LIB_PATH) 37 | LDFLAGS := $(LDPATH) -ldl -lcudart -lcudnn -lnvinfer -lnvcaffe_parser $(shell pkg-config opencv --libs) 38 | 39 | OBJ_PATH := ./bin/obj 40 | BIN_PATH := ./bin 41 | EXE_FILE := runYOLOv3 42 | 43 | all: build 44 | 45 | build: $(BIN_PATH)/$(EXE_FILE) 46 | 47 | $(OBJ_PATH)/tensorRTClassifier.o: $(SRC_PATH)/tensorRTClassifier.cpp 48 | $(GCC) $(CCFLAGS) $(INCLUDES) -o $@ -c $< 49 | 50 | $(OBJ_PATH)/main.o: $(SRC_PATH)/main.cpp 51 | $(GCC) $(CCFLAGS) $(INCLUDES) -o $@ -c $< 52 | 53 | $(OBJ_PATH)/interpPlugin.o: $(SRC_PATH)/interpPlugin.cu 54 | $(NVCC) $(NVCC_FLAGS) $(INCLUDES) -o $@ -c $< 55 | 56 | $(OBJ_PATH)/bboxParser.o: $(SRC_PATH)/bboxParser.cu 57 | $(NVCC) $(NVCC_FLAGS) $(INCLUDES) -o $@ -c $< 58 | 59 | $(OBJ_PATH)/regionLayer.o: $(SRC_PATH)/regionLayer.cu 60 | $(NVCC) $(NVCC_FLAGS) $(INCLUDES) -o $@ -c $< 61 | 62 | $(OBJ_PATH)/common.o: $(SRC_PATH)/common.cu 63 | $(NVCC) $(NVCC_FLAGS) $(INCLUDES) -o $@ -c $< 64 | 65 | $(BIN_PATH)/$(EXE_FILE): $(OBJ_PATH)/tensorRTClassifier.o $(OBJ_PATH)/main.o $(OBJ_PATH)/interpPlugin.o $(OBJ_PATH)/bboxParser.o $(OBJ_PATH)/regionLayer.o $(OBJ_PATH)/common.o 66 | $(GCC) $+ $(CCFLAGS) $(LDFLAGS) -o $@ 67 | 68 | clean: 69 | rm -rf $(OBJ_PATH)/* $(BIN_PATH)/$(EXE_FILE) 70 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # YOLO-V3-Acceleration 2 | Using TensorRT to implement and accelerate YOLO v3. Multi-scale inference and NMS are included. The speedup reaches about 3x compared with the original darknet. 3 | Model: 4 | /data/model 5 | 6 | Image: 7 | /data/images 8 | 9 | Build the sample: 10 | $ make -j 11 | 12 | Run the sample: 13 | $ ./run.sh 14 | 15 | 16 | Plugin 17 | =========================================== 18 | 19 | 1. Upsample layer with nearest-neighbour interpolation (Interp85, Interp97). 20 | 21 | 22 | Bounding box parser 23 | =========================================== 24 | 25 | * solution 1 (used): launch reorgOutputKernel to fuse the 3 output layers into a single-layer form (at the cost of an extra copy), then run the parser and NMS; see the sketches below. 26 | 27 | * solution 2 (to be implemented): run the parser on every output layer separately, then collect all bboxes for NMS; this also costs copy time during collection. 28 | 29 | * solution 3 (to be implemented): allocate temporary GPU memory for a (float**) variable referring to the 3 output layers, then run the parser and NMS as if on ONE layer using the index relation; however, this fake-ONE layer still needs 3 kernel launches.
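For illustration, here is a minimal CUDA sketch of a nearest-neighbour upsample like the one the plugin performs. This is not the code from interpPlugin.cu; the kernel name, the NCHW layout and the integer scale factor are assumptions made for the sketch.

```cuda
// Hypothetical nearest-neighbour upsample for one image (assumed NCHW layout,
// integer scale factor); the real TensorRT plugin in interpPlugin.cu may differ.
__global__ void upsampleNearestKernel(const float* src, float* dst,
                                      int C, int H, int W, int scale)
{
    const int outH = H * scale, outW = W * scale;
    const int total = C * outH * outW;
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= total) return;

    const int ox = idx % outW;            // output x
    const int oy = (idx / outW) % outH;   // output y
    const int c  = idx / (outW * outH);   // channel
    // Each output pixel copies the source pixel it maps onto.
    dst[idx] = src[(c * H + oy / scale) * W + ox / scale];
}
```

And a minimal sketch of the fusion step in solution 1: copy the three per-scale outputs into one contiguous buffer per image so that the parser and NMS can treat them as a single layer. The kernel/helper names, the flat element counts and the launch configuration below are assumptions for illustration, not the actual reorgOutputKernel from bboxParser.cu.

```cuda
#include <cuda_runtime.h>

// Copy one scale's output into its slice of the fused per-image buffer.
__global__ void fuseOutputsKernel(const float* src, float* dst,
                                  int nPerImageSrc,  // elements per image in this scale
                                  int nPerImageDst,  // elements per image in the fused buffer
                                  int dstOffset,     // where this scale starts in the fused buffer
                                  int nBatch)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= nBatch * nPerImageSrc) return;
    const int b = idx / nPerImageSrc;   // image index within the batch
    const int i = idx % nPerImageSrc;   // element index within this scale
    dst[b * nPerImageDst + dstOffset + i] = src[idx];
}

// One launch per scale, all on the inference stream; the extra copy is the
// cost mentioned in solution 1.
void fuseYoloOutputs(const float* const srcs[3], const int nPerImageSrc[3],
                     float* dst, int nBatch, cudaStream_t stream)
{
    const int nPerImageDst = nPerImageSrc[0] + nPerImageSrc[1] + nPerImageSrc[2];
    int offset = 0;
    for (int s = 0; s < 3; ++s) {
        const int total   = nBatch * nPerImageSrc[s];
        const int threads = 256;
        const int blocks  = (total + threads - 1) / threads;
        fuseOutputsKernel<<<blocks, threads, 0, stream>>>(
            srcs[s], dst, nPerImageSrc[s], nPerImageDst, offset, nBatch);
        offset += nPerImageSrc[s];
    }
}
```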
30 | 31 | -------------------------------------------------------------------------------- /bboxParser.h: -------------------------------------------------------------------------------- 1 | #ifndef BBOX_PARSER_H 2 | #define BBOX_PARSER_H 3 | 4 | #include 5 | // cub for sort 6 | #include "regionLayer.h" 7 | 8 | void sortScoresPerImage_gpu( 9 | const int nBatch, 10 | const int nItemsPerImage, 11 | void * unsorted_scores, 12 | void * unsorted_bbox_indices, 13 | void * sorted_scores, 14 | void * sorted_bbox_indices, 15 | void * workspace, 16 | const size_t maxSizeofWorkspaceInByte, 17 | cudaStream_t stream); 18 | 19 | void splitOutputData_gpu( 20 | const int nBatch, // batch 21 | const int nClasses, 22 | const int nBboxesPerLoc, // #box 23 | const int coords, // x,y,w,h 24 | const int l0_w, 25 | const int l0_h, 26 | const int nCells, 27 | const bool background, // use background conf or not 28 | const bool only_objectness, // no class conf 29 | const float thres, 30 | const float* predictions, 31 | const float* biases, 32 | float* probes, 33 | box* bboxes, 34 | cudaStream_t stream); 35 | 36 | 37 | void correct_region_boxes_gpu( 38 | const int nBatch, // batch 39 | const int nClasses, 40 | const int nBboxesPerLoc, // #box 41 | const int nCells, 42 | const int image_w, 43 | const int image_h, 44 | const int net_input_w, 45 | const int net_input_h, 46 | box* bboxes, 47 | cudaStream_t stream); 48 | 49 | 50 | void sortScoresPerClass_gpu( 51 | const int nBatch, 52 | const int nClasses, 53 | const int nBboxesPerLoc, 54 | const void * probes, 55 | void * sorted_boxIdx, 56 | void * workspace, 57 | const size_t maxSizeofWorkspaceInByte, 58 | cudaStream_t stream); 59 | 60 | 61 | void allClassNMS_gpu( 62 | const int nBatch, //batch 63 | const int nClasses, 64 | const int nBboxesPerLoc, 65 | const int nCells, 66 | const float nms_threshold, 67 | void * bboxes, 68 | void * probes, 69 | void * afterNMS_probes, 70 | void * indexes, 71 | void * afterNMS_indexes, 72 | cudaStream_t stream); 73 | 74 | 75 | size_t getWorkspaceSizeInByte( 76 | const int nBatch, 77 | const int nClasses, 78 | const int nBboxesPerLoc, 79 | const int nCells); 80 | 81 | #endif 82 | -------------------------------------------------------------------------------- /bboxParser.h~: -------------------------------------------------------------------------------- 1 | #ifndef BBOX_PARSER_H 2 | #define BBOX_PARSER_H 3 | 4 | #include 5 | // cub for sort 6 | #include "regionLayer.h" 7 | 8 | void sortScoresPerImage_gpu( 9 | const int nBatch, 10 | const int nItemsPerImage, 11 | void * unsorted_scores, 12 | void * unsorted_bbox_indices, 13 | void * sorted_scores, 14 | void * sorted_bbox_indices, 15 | void * workspace, 16 | const size_t maxSizeofWorkspaceInByte, 17 | cudaStream_t stream); 18 | 19 | void splitOutputData_gpu( 20 | const int nBatch, // batch 21 | const int nClasses, 22 | const int nBboxesPerLoc, // #box 23 | const int coords, // x,y,w,h 24 | const int l0_w, 25 | const int l0_h, 26 | const int nCells, 27 | const bool background, // use background conf or not 28 | const bool only_objectness, // no class conf 29 | const float thres, 30 | const float* predictions, 31 | const float* biases, 32 | float* probes, 33 | box* bboxes, 34 | cudaStream_t stream); 35 | 36 | 37 | void correct_region_boxes_gpu( 38 | const int nBatch, // batch 39 | const int nClasses, 40 | const int nBboxesPerLoc, // #box 41 | const int nCells, 42 | const int image_w, 43 | const int image_h, 44 | const int net_input_w, 45 | const int net_input_h, 46 | box* bboxes, 47 | 
cudaStream_t stream); 48 | 49 | 50 | void sortScoresPerClass_gpu( 51 | const int nBatch, 52 | const int nClasses, 53 | const int nBboxesPerLoc, 54 | const void * probes, 55 | void * sorted_boxIdx, 56 | void * workspace, 57 | const size_t maxSizeofWorkspaceInByte, 58 | cudaStream_t stream); 59 | 60 | 61 | void allClassNMS_gpu( 62 | const int nBatch, //batch 63 | const int nClasses, 64 | const int nBboxesPerLoc, 65 | const int w, 66 | const int h, 67 | const float nms_threshold, 68 | void * bboxes, 69 | void * probes, 70 | void * afterNMS_probes, 71 | void * indexes, 72 | void * afterNMS_indexes, 73 | cudaStream_t stream); 74 | 75 | 76 | size_t getWorkspaceSizeInByte( 77 | const int nBatch, 78 | const int nClasses, 79 | const int nBboxesPerLoc, 80 | const int w, 81 | const int h); 82 | 83 | #endif 84 | -------------------------------------------------------------------------------- /classifier.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 1993-2016 NVIDIA Corporation. All rights reserved. 3 | * 4 | * NOTICE TO USER: 5 | * 6 | * This source code is subject to NVIDIA ownership rights under U.S. and 7 | * international Copyright laws. 8 | * 9 | * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE 10 | * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR 11 | * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH 12 | * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF 13 | * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 14 | * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, 15 | * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS 16 | * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE 17 | * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE 18 | * OR PERFORMANCE OF THIS SOURCE CODE. 19 | * 20 | * U.S. Government End Users. This source code is a "commercial item" as 21 | * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of 22 | * "commercial computer software" and "commercial computer software 23 | * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) 24 | * and is provided to the U.S. Government only as a commercial end item. 25 | * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through 26 | * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the 27 | * source code with only those rights set forth herein. 
28 | */ 29 | 30 | #ifndef CLASSIFIER_H 31 | #define CLASSIFIER_H 32 | 33 | #include 34 | #include "NvInfer.h" 35 | #include 36 | 37 | using namespace nvinfer1; 38 | 39 | typedef struct INFER_OUTPUT_PARAMS_ { 40 | int nBatchSize_; 41 | std::vector vpInferResults_; 42 | std::vector vnLens_; 43 | std::vector vOutputDims_; 44 | } INFER_OUTPUT_PARAMS; 45 | 46 | class IClassifier { 47 | public: 48 | virtual void setInputData(float *pBGR, 49 | const int nWidth, 50 | const int nHeight, 51 | const int nBatchSize) = 0; 52 | 53 | virtual void forward(INFER_OUTPUT_PARAMS *) = 0; 54 | 55 | virtual int getInferWidth() const = 0; 56 | 57 | virtual int getInferHeight() const = 0; 58 | 59 | virtual std::vector getMeanValues() const = 0; 60 | 61 | protected: 62 | virtual ~IClassifier() {} 63 | }; 64 | 65 | #endif 66 | 67 | 68 | -------------------------------------------------------------------------------- /common.cu: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | 3 | // alignptr 4 | int8_t * alignPtr(int8_t * ptr, uintptr_t to) 5 | { 6 | uintptr_t addr = (uintptr_t)ptr; 7 | if (addr % to) { 8 | addr += to - addr % to; 9 | } 10 | return (int8_t *)addr; 11 | } 12 | 13 | // calc next ptr (consider alignment) 14 | int8_t * nextWorkspacePtr(int8_t * ptr, uintptr_t previousWorkspaceSize) 15 | { 16 | uintptr_t addr = (uintptr_t) ptr; 17 | addr += previousWorkspaceSize; 18 | return alignPtr((int8_t *)addr, CUDA_MEM_ALIGN); 19 | } 20 | 21 | 22 | template 23 | __launch_bounds__ (nthds_per_cta) 24 | __global__ void setUniformOffsets_kernel( 25 | const int num_segments, 26 | const int offset, 27 | int * d_offsets) 28 | { 29 | const int idx = blockIdx.x * nthds_per_cta + threadIdx.x; 30 | if (idx <= num_segments){ 31 | d_offsets[idx] = idx * offset; 32 | } 33 | } 34 | 35 | void setUniformOffsets( 36 | const int num_segments, 37 | const int offset, 38 | int * d_offsets, 39 | cudaStream_t stream) 40 | { 41 | const int blockSize = 32; 42 | const int gridSize = (num_segments + 1 + blockSize - 1) / blockSize; 43 | setUniformOffsets_kernel 44 | <<>> 45 | (num_segments, offset, d_offsets); 46 | } 47 | -------------------------------------------------------------------------------- /common.h: -------------------------------------------------------------------------------- 1 | #ifndef COMMON_H_ 2 | #define COMMON_H_ 3 | 4 | #include 5 | 6 | #define CUDA_MEM_ALIGN 256 7 | 8 | // alignptr 9 | int8_t * alignPtr(int8_t * ptr, uintptr_t to); 10 | 11 | int8_t * nextWorkspacePtr(int8_t * ptr, uintptr_t previousWorkspaceSize); 12 | 13 | void setUniformOffsets(const int num_segments, const int offset, int * d_offsets, cudaStream_t stream); 14 | 15 | /** 16 | * Determine the usage of temporary memory for cub sort 17 | * The cub::DeviceSegmentedRadixSort can be used for batched (segmented) sort. 
18 | */ 19 | template 20 | size_t cubSortPairsWorkspaceSize(int num_items, int num_segments) 21 | { 22 | size_t temp_storage_bytes = 0; 23 | cub::DeviceSegmentedRadixSort::SortPairsDescending( 24 | (void *)NULL, temp_storage_bytes, 25 | (const KeyT *)NULL, (KeyT *)NULL, 26 | (const ValueT *)NULL, (ValueT *)NULL, 27 | num_items, // # items 28 | num_segments, // # segments 29 | (const int *)NULL, (const int *)NULL); 30 | return temp_storage_bytes; 31 | } 32 | 33 | #endif 34 | -------------------------------------------------------------------------------- /common.h~: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangjinsong3/YOLO-V3-Acceleration/384746bb6d8f61c8def70bbc0b5e04b98c60356e/common.h~ -------------------------------------------------------------------------------- /draw.h: -------------------------------------------------------------------------------- 1 | #include "preproc_yolov3.h" 2 | #include "regionLayer.h" 3 | #include "bboxParser.h" 4 | 5 | void draw_box(image a, int x1, int y1, int x2, int y2, float r, float g, float b) 6 | { 7 | int i; 8 | if(x1 < 0) x1 = 0; 9 | if(x1 >= a.w) x1 = a.w-1; 10 | if(x2 < 0) x2 = 0; 11 | if(x2 >= a.w) x2 = a.w-1; 12 | 13 | if(y1 < 0) y1 = 0; 14 | if(y1 >= a.h) y1 = a.h-1; 15 | if(y2 < 0) y2 = 0; 16 | if(y2 >= a.h) y2 = a.h-1; 17 | 18 | for(i = x1; i <= x2; ++i){ 19 | a.data[i + y1*a.w + 0*a.w*a.h] = r; 20 | a.data[i + y2*a.w + 0*a.w*a.h] = r; 21 | 22 | a.data[i + y1*a.w + 1*a.w*a.h] = g; 23 | a.data[i + y2*a.w + 1*a.w*a.h] = g; 24 | 25 | a.data[i + y1*a.w + 2*a.w*a.h] = b; 26 | a.data[i + y2*a.w + 2*a.w*a.h] = b; 27 | } 28 | for(i = y1; i <= y2; ++i){ 29 | a.data[x1 + i*a.w + 0*a.w*a.h] = r; 30 | a.data[x2 + i*a.w + 0*a.w*a.h] = r; 31 | 32 | a.data[x1 + i*a.w + 1*a.w*a.h] = g; 33 | a.data[x2 + i*a.w + 1*a.w*a.h] = g; 34 | 35 | a.data[x1 + i*a.w + 2*a.w*a.h] = b; 36 | a.data[x2 + i*a.w + 2*a.w*a.h] = b; 37 | } 38 | } 39 | 40 | 41 | 42 | void draw_box_width(image a, int x1, int y1, int x2, int y2, int w) 43 | { 44 | int i; 45 | for(i = 0; i < w; ++i){ 46 | draw_box(a, x1+i, y1+i, x2-i, y2-i, 255, 0, 0); 47 | } 48 | } 49 | 50 | 51 | void draw_detections(image im, 52 | int batchIdx, 53 | float thresh, 54 | box *boxes, 55 | float *probs, 56 | int * indexes, 57 | int sizeOfClass, 58 | int sizeOfBatch) 59 | { 60 | int n = batchIdx; 61 | // int sizeOfClass = l.n * l.h * l.w; 62 | // int sizeOfBatch = l.classes * sizeOfClass; 63 | 64 | int count = 0; 65 | for(int i = 0; i < sizeOfBatch; ++i){ 66 | int id = n * sizeOfBatch + i; 67 | int indexes_idx = indexes[id]; 68 | 69 | if (probs[id] > thresh){ 70 | int category = (indexes_idx % sizeOfBatch) / sizeOfClass; 71 | int boxId = indexes_idx % sizeOfClass; 72 | 73 | int width = im.h * .006; 74 | box b = boxes[boxId]; 75 | 76 | int left = (b.x-b.w/2.)*im.w; 77 | int right = (b.x+b.w/2.)*im.w; 78 | int top = (b.y-b.h/2.)*im.h; 79 | int bot = (b.y+b.h/2.)*im.h; 80 | 81 | if(left < 0) left = 0; 82 | if(right > im.w-1) right = im.w-1; 83 | if(top < 0) top = 0; 84 | if(bot > im.h-1) bot = im.h-1; 85 | 86 | draw_box_width(im, left, top, right, bot, width); 87 | } 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /draw.h~: -------------------------------------------------------------------------------- 1 | #include "preproc_yolov2.h" 2 | #include "regionLayer.h" 3 | #include "bboxParser.h" 4 | 5 | void draw_box(image a, int x1, int y1, int x2, int y2, float r, float g, float b) 6 | { 7 | int i; 8 
| if(x1 < 0) x1 = 0; 9 | if(x1 >= a.w) x1 = a.w-1; 10 | if(x2 < 0) x2 = 0; 11 | if(x2 >= a.w) x2 = a.w-1; 12 | 13 | if(y1 < 0) y1 = 0; 14 | if(y1 >= a.h) y1 = a.h-1; 15 | if(y2 < 0) y2 = 0; 16 | if(y2 >= a.h) y2 = a.h-1; 17 | 18 | for(i = x1; i <= x2; ++i){ 19 | a.data[i + y1*a.w + 0*a.w*a.h] = r; 20 | a.data[i + y2*a.w + 0*a.w*a.h] = r; 21 | 22 | a.data[i + y1*a.w + 1*a.w*a.h] = g; 23 | a.data[i + y2*a.w + 1*a.w*a.h] = g; 24 | 25 | a.data[i + y1*a.w + 2*a.w*a.h] = b; 26 | a.data[i + y2*a.w + 2*a.w*a.h] = b; 27 | } 28 | for(i = y1; i <= y2; ++i){ 29 | a.data[x1 + i*a.w + 0*a.w*a.h] = r; 30 | a.data[x2 + i*a.w + 0*a.w*a.h] = r; 31 | 32 | a.data[x1 + i*a.w + 1*a.w*a.h] = g; 33 | a.data[x2 + i*a.w + 1*a.w*a.h] = g; 34 | 35 | a.data[x1 + i*a.w + 2*a.w*a.h] = b; 36 | a.data[x2 + i*a.w + 2*a.w*a.h] = b; 37 | } 38 | } 39 | 40 | 41 | 42 | void draw_box_width(image a, int x1, int y1, int x2, int y2, int w) 43 | { 44 | int i; 45 | for(i = 0; i < w; ++i){ 46 | draw_box(a, x1+i, y1+i, x2-i, y2-i, 255, 0, 0); 47 | } 48 | } 49 | 50 | 51 | void draw_detections(image im, 52 | int batchIdx, 53 | float thresh, 54 | box *boxes, 55 | float *probs, 56 | int * indexes, 57 | int sizeOfClass, 58 | int sizeOfBatch) 59 | { 60 | int n = batchIdx; 61 | // int sizeOfClass = l.n * l.h * l.w; 62 | // int sizeOfBatch = l.classes * sizeOfClass; 63 | 64 | int count = 0; 65 | for(int i = 0; i < sizeOfBatch; ++i){ 66 | int id = n * sizeOfBatch + i; 67 | int indexes_idx = indexes[id]; 68 | 69 | if (probs[id] > thresh){ 70 | int category = (indexes_idx % sizeOfBatch) / sizeOfClass; 71 | int boxId = indexes_idx % sizeOfClass; 72 | 73 | int width = im.h * .006; 74 | box b = boxes[boxId]; 75 | 76 | int left = (b.x-b.w/2.)*im.w; 77 | int right = (b.x+b.w/2.)*im.w; 78 | int top = (b.y-b.h/2.)*im.h; 79 | int bot = (b.y+b.h/2.)*im.h; 80 | 81 | if(left < 0) left = 0; 82 | if(right > im.w-1) right = im.w-1; 83 | if(top < 0) top = 0; 84 | if(bot > im.h-1) bot = im.h-1; 85 | 86 | draw_box_width(im, left, top, right, bot, width); 87 | } 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /gridSearchParam.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | result=mAP.csv 4 | mAP=0.0 5 | for nms in `seq 0.4 0.05 0.7` 6 | do 7 | for conf in `seq 0.001 0.001 0.005` 8 | do 9 | ./run.sh 0 ${nms} ${conf} 10 | cd results/ 11 | mAP=`python calc_mAP.py 0.5 2>&1 1>/dev/null` 12 | echo $( printf '%f %f %f' ${nms} ${conf} ${mAP} ) >> ${result} 13 | cat ${result} 14 | rm -rf cache comp4_det_test_* 15 | cd ../ 16 | done 17 | done 18 | -------------------------------------------------------------------------------- /gridSearchParam.sh~: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | result=mAP.csv 4 | mAP=0.0 5 | for nms in `seq 0.05 0.05 0.4` 6 | do 7 | for conf in `seq 0.005 0.005 0.03` 8 | do 9 | ./run.sh 0 ${nms} ${conf} 10 | cd results/ 11 | mAP=`python calc_mAP.py 0.5 2>&1 1>/dev/null` 12 | echo $( printf '%f %f %f' ${nms} ${conf} ${mAP} ) >> ${result} 13 | cat ${result} 14 | rm -rf cache comp4_det_test_* 15 | cd ../ 16 | done 17 | done 18 | -------------------------------------------------------------------------------- /include/cub/block/block_raking_layout.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 
3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * cub::BlockRakingLayout provides a conflict-free shared memory layout abstraction for warp-raking across thread block data. 32 | */ 33 | 34 | 35 | #pragma once 36 | 37 | #include "../util_macro.cuh" 38 | #include "../util_arch.cuh" 39 | #include "../util_type.cuh" 40 | #include "../util_namespace.cuh" 41 | 42 | /// Optional outer namespace(s) 43 | CUB_NS_PREFIX 44 | 45 | /// CUB namespace 46 | namespace cub { 47 | 48 | /** 49 | * \brief BlockRakingLayout provides a conflict-free shared memory layout abstraction for 1D raking across thread block data. ![](raking.png) 50 | * \ingroup BlockModule 51 | * 52 | * \par Overview 53 | * This type facilitates a shared memory usage pattern where a block of CUDA 54 | * threads places elements into shared memory and then reduces the active 55 | * parallelism to one "raking" warp of threads for serially aggregating consecutive 56 | * sequences of shared items. Padding is inserted to eliminate bank conflicts 57 | * (for most data types). 58 | * 59 | * \tparam T The data type to be exchanged. 60 | * \tparam BLOCK_THREADS The thread block size in threads. 
61 | * \tparam PTX_ARCH [optional] \ptxversion 62 | */ 63 | template < 64 | typename T, 65 | int BLOCK_THREADS, 66 | int PTX_ARCH = CUB_PTX_ARCH> 67 | struct BlockRakingLayout 68 | { 69 | //--------------------------------------------------------------------- 70 | // Constants and type definitions 71 | //--------------------------------------------------------------------- 72 | 73 | enum 74 | { 75 | /// The total number of elements that need to be cooperatively reduced 76 | SHARED_ELEMENTS = BLOCK_THREADS, 77 | 78 | /// Maximum number of warp-synchronous raking threads 79 | MAX_RAKING_THREADS = CUB_MIN(BLOCK_THREADS, CUB_WARP_THREADS(PTX_ARCH)), 80 | 81 | /// Number of raking elements per warp-synchronous raking thread (rounded up) 82 | SEGMENT_LENGTH = (SHARED_ELEMENTS + MAX_RAKING_THREADS - 1) / MAX_RAKING_THREADS, 83 | 84 | /// Never use a raking thread that will have no valid data (e.g., when BLOCK_THREADS is 62 and SEGMENT_LENGTH is 2, we should only use 31 raking threads) 85 | RAKING_THREADS = (SHARED_ELEMENTS + SEGMENT_LENGTH - 1) / SEGMENT_LENGTH, 86 | 87 | /// Whether we will have bank conflicts (technically we should find out if the GCD is > 1) 88 | HAS_CONFLICTS = (CUB_SMEM_BANKS(PTX_ARCH) % SEGMENT_LENGTH == 0), 89 | 90 | /// Degree of bank conflicts (e.g., 4-way) 91 | CONFLICT_DEGREE = (HAS_CONFLICTS) ? 92 | (MAX_RAKING_THREADS * SEGMENT_LENGTH) / CUB_SMEM_BANKS(PTX_ARCH) : 93 | 1, 94 | 95 | /// Pad each segment length with one element if segment length is not relatively prime to warp size and can't be optimized as a vector load 96 | USE_SEGMENT_PADDING = ((SEGMENT_LENGTH & 1) == 0) && (SEGMENT_LENGTH > 2), 97 | 98 | /// Total number of elements in the raking grid 99 | GRID_ELEMENTS = RAKING_THREADS * (SEGMENT_LENGTH + USE_SEGMENT_PADDING), 100 | 101 | /// Whether or not we need bounds checking during raking (the number of reduction elements is not a multiple of the number of raking threads) 102 | UNGUARDED = (SHARED_ELEMENTS % RAKING_THREADS == 0), 103 | }; 104 | 105 | 106 | /** 107 | * \brief Shared memory storage type 108 | */ 109 | struct __align__(16) _TempStorage 110 | { 111 | T buff[BlockRakingLayout::GRID_ELEMENTS]; 112 | }; 113 | 114 | /// Alias wrapper allowing storage to be unioned 115 | struct TempStorage : Uninitialized<_TempStorage> {}; 116 | 117 | 118 | /** 119 | * \brief Returns the location for the calling thread to place data into the grid 120 | */ 121 | static __device__ __forceinline__ T* PlacementPtr( 122 | TempStorage &temp_storage, 123 | unsigned int linear_tid) 124 | { 125 | // Offset for partial 126 | unsigned int offset = linear_tid; 127 | 128 | // Add in one padding element for every segment 129 | if (USE_SEGMENT_PADDING > 0) 130 | { 131 | offset += offset / SEGMENT_LENGTH; 132 | } 133 | 134 | // Incorporating a block of padding partials every shared memory segment 135 | return temp_storage.Alias().buff + offset; 136 | } 137 | 138 | 139 | /** 140 | * \brief Returns the location for the calling thread to begin sequential raking 141 | */ 142 | static __device__ __forceinline__ T* RakingPtr( 143 | TempStorage &temp_storage, 144 | unsigned int linear_tid) 145 | { 146 | return temp_storage.Alias().buff + (linear_tid * (SEGMENT_LENGTH + USE_SEGMENT_PADDING)); 147 | } 148 | }; 149 | 150 | } // CUB namespace 151 | CUB_NS_POSTFIX // Optional outer namespace(s) 152 | 153 | -------------------------------------------------------------------------------- /include/cub/block/specializations/block_histogram_atomic.cuh: 
-------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * The cub::BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../../util_namespace.cuh" 37 | 38 | /// Optional outer namespace(s) 39 | CUB_NS_PREFIX 40 | 41 | /// CUB namespace 42 | namespace cub { 43 | 44 | 45 | /** 46 | * \brief The BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. 
47 | */ 48 | template 49 | struct BlockHistogramAtomic 50 | { 51 | /// Shared memory storage layout type 52 | struct TempStorage {}; 53 | 54 | 55 | /// Constructor 56 | __device__ __forceinline__ BlockHistogramAtomic( 57 | TempStorage &temp_storage) 58 | {} 59 | 60 | 61 | /// Composite data onto an existing histogram 62 | template < 63 | typename T, 64 | typename CounterT, 65 | int ITEMS_PER_THREAD> 66 | __device__ __forceinline__ void Composite( 67 | T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram 68 | CounterT histogram[BINS]) ///< [out] Reference to shared/device-accessible memory histogram 69 | { 70 | // Update histogram 71 | #pragma unroll 72 | for (int i = 0; i < ITEMS_PER_THREAD; ++i) 73 | { 74 | atomicAdd(histogram + items[i], 1); 75 | } 76 | } 77 | 78 | }; 79 | 80 | } // CUB namespace 81 | CUB_NS_POSTFIX // Optional outer namespace(s) 82 | 83 | -------------------------------------------------------------------------------- /include/cub/cub.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * CUB umbrella include file 32 | */ 33 | 34 | #pragma once 35 | 36 | 37 | // Block 38 | #include "block/block_histogram.cuh" 39 | #include "block/block_discontinuity.cuh" 40 | #include "block/block_exchange.cuh" 41 | #include "block/block_load.cuh" 42 | #include "block/block_radix_rank.cuh" 43 | #include "block/block_radix_sort.cuh" 44 | #include "block/block_reduce.cuh" 45 | #include "block/block_scan.cuh" 46 | #include "block/block_store.cuh" 47 | //#include "block/block_shift.cuh" 48 | 49 | // Device 50 | #include "device/device_histogram.cuh" 51 | #include "device/device_partition.cuh" 52 | #include "device/device_radix_sort.cuh" 53 | #include "device/device_reduce.cuh" 54 | #include "device/device_run_length_encode.cuh" 55 | #include "device/device_scan.cuh" 56 | #include "device/device_segmented_radix_sort.cuh" 57 | #include "device/device_segmented_reduce.cuh" 58 | #include "device/device_select.cuh" 59 | #include "device/device_spmv.cuh" 60 | 61 | // Grid 62 | //#include "grid/grid_barrier.cuh" 63 | #include "grid/grid_even_share.cuh" 64 | #include "grid/grid_mapping.cuh" 65 | #include "grid/grid_queue.cuh" 66 | 67 | // Thread 68 | #include "thread/thread_load.cuh" 69 | #include "thread/thread_operators.cuh" 70 | #include "thread/thread_reduce.cuh" 71 | #include "thread/thread_scan.cuh" 72 | #include "thread/thread_store.cuh" 73 | 74 | // Warp 75 | #include "warp/warp_reduce.cuh" 76 | #include "warp/warp_scan.cuh" 77 | 78 | // Iterator 79 | #include "iterator/arg_index_input_iterator.cuh" 80 | #include "iterator/cache_modified_input_iterator.cuh" 81 | #include "iterator/cache_modified_output_iterator.cuh" 82 | #include "iterator/constant_input_iterator.cuh" 83 | #include "iterator/counting_input_iterator.cuh" 84 | #include "iterator/tex_obj_input_iterator.cuh" 85 | #include "iterator/tex_ref_input_iterator.cuh" 86 | #include "iterator/transform_input_iterator.cuh" 87 | 88 | // Util 89 | #include "util_arch.cuh" 90 | #include "util_debug.cuh" 91 | #include "util_device.cuh" 92 | #include "util_macro.cuh" 93 | #include "util_ptx.cuh" 94 | #include "util_type.cuh" 95 | 96 | -------------------------------------------------------------------------------- /include/cub/grid/grid_barrier.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 
15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * cub::GridBarrier implements a software global barrier among thread blocks within a CUDA grid 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../util_debug.cuh" 37 | #include "../util_namespace.cuh" 38 | #include "../thread/thread_load.cuh" 39 | 40 | /// Optional outer namespace(s) 41 | CUB_NS_PREFIX 42 | 43 | /// CUB namespace 44 | namespace cub { 45 | 46 | 47 | /** 48 | * \addtogroup GridModule 49 | * @{ 50 | */ 51 | 52 | 53 | /** 54 | * \brief GridBarrier implements a software global barrier among thread blocks within a CUDA grid 55 | */ 56 | class GridBarrier 57 | { 58 | protected : 59 | 60 | typedef unsigned int SyncFlag; 61 | 62 | // Counters in global device memory 63 | SyncFlag* d_sync; 64 | 65 | public: 66 | 67 | /** 68 | * Constructor 69 | */ 70 | GridBarrier() : d_sync(NULL) {} 71 | 72 | 73 | /** 74 | * Synchronize 75 | */ 76 | __device__ __forceinline__ void Sync() const 77 | { 78 | volatile SyncFlag *d_vol_sync = d_sync; 79 | 80 | // Threadfence and syncthreads to make sure global writes are visible before 81 | // thread-0 reports in with its sync counter 82 | __threadfence(); 83 | CTA_SYNC(); 84 | 85 | if (blockIdx.x == 0) 86 | { 87 | // Report in ourselves 88 | if (threadIdx.x == 0) 89 | { 90 | d_vol_sync[blockIdx.x] = 1; 91 | } 92 | 93 | CTA_SYNC(); 94 | 95 | // Wait for everyone else to report in 96 | for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x) 97 | { 98 | while (ThreadLoad(d_sync + peer_block) == 0) 99 | { 100 | __threadfence_block(); 101 | } 102 | } 103 | 104 | CTA_SYNC(); 105 | 106 | // Let everyone know it's safe to proceed 107 | for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x) 108 | { 109 | d_vol_sync[peer_block] = 0; 110 | } 111 | } 112 | else 113 | { 114 | if (threadIdx.x == 0) 115 | { 116 | // Report in 117 | d_vol_sync[blockIdx.x] = 1; 118 | 119 | // Wait for acknowledgment 120 | while (ThreadLoad(d_sync + blockIdx.x) == 1) 121 | { 122 | __threadfence_block(); 123 | } 124 | } 125 | 126 | CTA_SYNC(); 127 | } 128 | } 129 | }; 130 | 131 | 132 | /** 133 | * \brief GridBarrierLifetime extends GridBarrier to provide lifetime management of the temporary device storage needed for cooperation. 134 | * 135 | * Uses RAII for lifetime, i.e., device resources are reclaimed when 136 | * the destructor is called. 
137 | */ 138 | class GridBarrierLifetime : public GridBarrier 139 | { 140 | protected: 141 | 142 | // Number of bytes backed by d_sync 143 | size_t sync_bytes; 144 | 145 | public: 146 | 147 | /** 148 | * Constructor 149 | */ 150 | GridBarrierLifetime() : GridBarrier(), sync_bytes(0) {} 151 | 152 | 153 | /** 154 | * DeviceFrees and resets the progress counters 155 | */ 156 | cudaError_t HostReset() 157 | { 158 | cudaError_t retval = cudaSuccess; 159 | if (d_sync) 160 | { 161 | CubDebug(retval = cudaFree(d_sync)); 162 | d_sync = NULL; 163 | } 164 | sync_bytes = 0; 165 | return retval; 166 | } 167 | 168 | 169 | /** 170 | * Destructor 171 | */ 172 | virtual ~GridBarrierLifetime() 173 | { 174 | HostReset(); 175 | } 176 | 177 | 178 | /** 179 | * Sets up the progress counters for the next kernel launch (lazily 180 | * allocating and initializing them if necessary) 181 | */ 182 | cudaError_t Setup(int sweep_grid_size) 183 | { 184 | cudaError_t retval = cudaSuccess; 185 | do { 186 | size_t new_sync_bytes = sweep_grid_size * sizeof(SyncFlag); 187 | if (new_sync_bytes > sync_bytes) 188 | { 189 | if (d_sync) 190 | { 191 | if (CubDebug(retval = cudaFree(d_sync))) break; 192 | } 193 | 194 | sync_bytes = new_sync_bytes; 195 | 196 | // Allocate and initialize to zero 197 | if (CubDebug(retval = cudaMalloc((void**) &d_sync, sync_bytes))) break; 198 | if (CubDebug(retval = cudaMemset(d_sync, 0, new_sync_bytes))) break; 199 | } 200 | } while (0); 201 | 202 | return retval; 203 | } 204 | }; 205 | 206 | 207 | /** @} */ // end group GridModule 208 | 209 | } // CUB namespace 210 | CUB_NS_POSTFIX // Optional outer namespace(s) 211 | 212 | -------------------------------------------------------------------------------- /include/cub/grid/grid_mapping.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks. 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../util_namespace.cuh" 37 | 38 | /// Optional outer namespace(s) 39 | CUB_NS_PREFIX 40 | 41 | /// CUB namespace 42 | namespace cub { 43 | 44 | 45 | /** 46 | * \addtogroup GridModule 47 | * @{ 48 | */ 49 | 50 | 51 | /****************************************************************************** 52 | * Mapping policies 53 | *****************************************************************************/ 54 | 55 | 56 | /** 57 | * \brief cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks. 58 | */ 59 | enum GridMappingStrategy 60 | { 61 | /** 62 | * \brief An a "raking" access pattern in which each thread block is 63 | * assigned a consecutive sequence of input tiles 64 | * 65 | * \par Overview 66 | * The input is evenly partitioned into \p p segments, where \p p is 67 | * constant and corresponds loosely to the number of thread blocks that may 68 | * actively reside on the target device. Each segment is comprised of 69 | * consecutive tiles, where a tile is a small, constant-sized unit of input 70 | * to be processed to completion before the thread block terminates or 71 | * obtains more work. The kernel invokes \p p thread blocks, each 72 | * of which iteratively consumes a segment of n/p elements 73 | * in tile-size increments. 74 | */ 75 | GRID_MAPPING_RAKE, 76 | 77 | /** 78 | * \brief An a "strip mining" access pattern in which the input tiles assigned 79 | * to each thread block are separated by a stride equal to the the extent of 80 | * the grid. 81 | * 82 | * \par Overview 83 | * The input is evenly partitioned into \p p sets, where \p p is 84 | * constant and corresponds loosely to the number of thread blocks that may 85 | * actively reside on the target device. Each set is comprised of 86 | * data tiles separated by stride \p tiles, where a tile is a small, 87 | * constant-sized unit of input to be processed to completion before the 88 | * thread block terminates or obtains more work. The kernel invokes \p p 89 | * thread blocks, each of which iteratively consumes a segment of 90 | * n/p elements in tile-size increments. 91 | */ 92 | GRID_MAPPING_STRIP_MINE, 93 | 94 | /** 95 | * \brief A dynamic "queue-based" strategy for assigning input tiles to thread blocks. 96 | * 97 | * \par Overview 98 | * The input is treated as a queue to be dynamically consumed by a grid of 99 | * thread blocks. Work is atomically dequeued in tiles, where a tile is a 100 | * unit of input to be processed to completion before the thread block 101 | * terminates or obtains more work. 
The grid size \p p is constant, 102 | * loosely corresponding to the number of thread blocks that may actively 103 | * reside on the target device. 104 | */ 105 | GRID_MAPPING_DYNAMIC, 106 | }; 107 | 108 | 109 | /** @} */ // end group GridModule 110 | 111 | } // CUB namespace 112 | CUB_NS_POSTFIX // Optional outer namespace(s) 113 | 114 | -------------------------------------------------------------------------------- /include/cub/host/mutex.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Simple portable mutex 32 | */ 33 | 34 | 35 | #pragma once 36 | 37 | #if (__cplusplus > 199711L) || (defined(_MSC_VER) && _MSC_VER >= 1800) 38 | #include 39 | #else 40 | #if defined(_WIN32) || defined(_WIN64) 41 | #include 42 | 43 | #define WIN32_LEAN_AND_MEAN 44 | #define NOMINMAX 45 | #include 46 | #undef WIN32_LEAN_AND_MEAN 47 | #undef NOMINMAX 48 | 49 | /** 50 | * Compiler read/write barrier 51 | */ 52 | #pragma intrinsic(_ReadWriteBarrier) 53 | 54 | #endif 55 | #endif 56 | 57 | #include "../util_namespace.cuh" 58 | 59 | 60 | /// Optional outer namespace(s) 61 | CUB_NS_PREFIX 62 | 63 | /// CUB namespace 64 | namespace cub { 65 | 66 | 67 | /** 68 | * Simple portable mutex 69 | * - Wraps std::mutex when compiled with C++11 or newer (supported on all platforms) 70 | * - Uses GNU/Windows spinlock mechanisms for pre C++11 (supported on x86/x64 when compiled with cl.exe or g++) 71 | */ 72 | struct Mutex 73 | { 74 | #if (__cplusplus > 199711L) || (defined(_MSC_VER) && _MSC_VER >= 1800) 75 | 76 | std::mutex mtx; 77 | 78 | void Lock() 79 | { 80 | mtx.lock(); 81 | } 82 | 83 | void Unlock() 84 | { 85 | mtx.unlock(); 86 | } 87 | 88 | void TryLock() 89 | { 90 | mtx.try_lock(); 91 | } 92 | 93 | #else //__cplusplus > 199711L 94 | 95 | #if defined(_MSC_VER) 96 | 97 | // Microsoft VC++ 98 | typedef long Spinlock; 99 | 100 | #else 101 | 102 | // GNU g++ 103 | typedef int Spinlock; 104 | 105 | /** 106 | * Compiler read/write barrier 107 | */ 108 | __forceinline__ void _ReadWriteBarrier() 109 | { 110 | __sync_synchronize(); 111 | } 112 | 113 | /** 114 | * Atomic exchange 115 | */ 116 | __forceinline__ long _InterlockedExchange(volatile int * const Target, const int Value) 117 | { 118 | // NOTE: __sync_lock_test_and_set would be an acquire barrier, so we force a full barrier 119 | _ReadWriteBarrier(); 120 | return __sync_lock_test_and_set(Target, Value); 121 | } 122 | 123 | /** 124 | * Pause instruction to prevent excess processor bus usage 125 | */ 126 | __forceinline__ void YieldProcessor() 127 | { 128 | } 129 | 130 | #endif // defined(_MSC_VER) 131 | 132 | /// Lock member 133 | volatile Spinlock lock; 134 | 135 | /** 136 | * Constructor 137 | */ 138 | Mutex() : lock(0) {} 139 | 140 | /** 141 | * Return when the specified spinlock has been acquired 142 | */ 143 | __forceinline__ void Lock() 144 | { 145 | while (1) 146 | { 147 | if (!_InterlockedExchange(&lock, 1)) return; 148 | while (lock) YieldProcessor(); 149 | } 150 | } 151 | 152 | 153 | /** 154 | * Release the specified spinlock 155 | */ 156 | __forceinline__ void Unlock() 157 | { 158 | _ReadWriteBarrier(); 159 | lock = 0; 160 | } 161 | 162 | #endif // __cplusplus > 199711L 163 | 164 | }; 165 | 166 | 167 | 168 | 169 | } // CUB namespace 170 | CUB_NS_POSTFIX // Optional outer namespace(s) 171 | 172 | -------------------------------------------------------------------------------- /include/cub/iterator/discard_output_iterator.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Random-access iterator types 32 | */ 33 | 34 | #pragma once 35 | 36 | #include 37 | #include 38 | 39 | #include "../util_namespace.cuh" 40 | #include "../util_macro.cuh" 41 | 42 | #if (THRUST_VERSION >= 100700) 43 | // This iterator is compatible with Thrust API 1.7 and newer 44 | #include 45 | #include 46 | #endif // THRUST_VERSION 47 | 48 | 49 | /// Optional outer namespace(s) 50 | CUB_NS_PREFIX 51 | 52 | /// CUB namespace 53 | namespace cub { 54 | 55 | 56 | /** 57 | * \addtogroup UtilIterator 58 | * @{ 59 | */ 60 | 61 | 62 | /** 63 | * \brief A discard iterator 64 | */ 65 | template 66 | class DiscardOutputIterator 67 | { 68 | public: 69 | 70 | // Required iterator traits 71 | typedef DiscardOutputIterator self_type; ///< My own type 72 | typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another 73 | typedef void value_type; ///< The type of the element the iterator can point to 74 | typedef void pointer; ///< The type of a pointer to an element the iterator can point to 75 | typedef void reference; ///< The type of a reference to an element the iterator can point to 76 | 77 | #if (THRUST_VERSION >= 100700) 78 | // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods 79 | typedef typename thrust::detail::iterator_facade_category< 80 | thrust::any_system_tag, 81 | thrust::random_access_traversal_tag, 82 | value_type, 83 | reference 84 | >::type iterator_category; ///< The iterator category 85 | #else 86 | typedef std::random_access_iterator_tag iterator_category; ///< The iterator category 87 | #endif // THRUST_VERSION 88 | 89 | private: 90 | 91 | OffsetT offset; 92 | 93 | #if defined(_WIN32) || !defined(_WIN64) 94 | // Workaround for win32 parameter-passing bug (ulonglong2 argmin DeviceReduce) 95 | OffsetT pad[CUB_MAX(1, (16 / sizeof(OffsetT) - 1))]; 
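// Descriptive note (added): together with `offset` above, this pad appears to round the object up to at least 16 bytes, so the iterator is passed by value with a consistent size regardless of how small OffsetT is.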
96 | #endif 97 | 98 | public: 99 | 100 | /// Constructor 101 | __host__ __device__ __forceinline__ DiscardOutputIterator( 102 | OffsetT offset = 0) ///< Base offset 103 | : 104 | offset(offset) 105 | {} 106 | 107 | /// Postfix increment 108 | __host__ __device__ __forceinline__ self_type operator++(int) 109 | { 110 | self_type retval = *this; 111 | offset++; 112 | return retval; 113 | } 114 | 115 | /// Prefix increment 116 | __host__ __device__ __forceinline__ self_type operator++() 117 | { 118 | offset++; 119 | return *this; 120 | } 121 | 122 | /// Indirection 123 | __host__ __device__ __forceinline__ self_type& operator*() 124 | { 125 | // return self reference, which can be assigned to anything 126 | return *this; 127 | } 128 | 129 | /// Addition 130 | template 131 | __host__ __device__ __forceinline__ self_type operator+(Distance n) const 132 | { 133 | self_type retval(offset + n); 134 | return retval; 135 | } 136 | 137 | /// Addition assignment 138 | template 139 | __host__ __device__ __forceinline__ self_type& operator+=(Distance n) 140 | { 141 | offset += n; 142 | return *this; 143 | } 144 | 145 | /// Subtraction 146 | template 147 | __host__ __device__ __forceinline__ self_type operator-(Distance n) const 148 | { 149 | self_type retval(offset - n); 150 | return retval; 151 | } 152 | 153 | /// Subtraction assignment 154 | template 155 | __host__ __device__ __forceinline__ self_type& operator-=(Distance n) 156 | { 157 | offset -= n; 158 | return *this; 159 | } 160 | 161 | /// Distance 162 | __host__ __device__ __forceinline__ difference_type operator-(self_type other) const 163 | { 164 | return offset - other.offset; 165 | } 166 | 167 | /// Array subscript 168 | template 169 | __host__ __device__ __forceinline__ self_type& operator[](Distance n) 170 | { 171 | // return self reference, which can be assigned to anything 172 | return *this; 173 | } 174 | 175 | /// Structure dereference 176 | __host__ __device__ __forceinline__ pointer operator->() 177 | { 178 | return; 179 | } 180 | 181 | /// Assignment to self (no-op) 182 | __host__ __device__ __forceinline__ void operator=(self_type const& other) 183 | { 184 | offset = other.offset; 185 | } 186 | 187 | /// Assignment to anything else (no-op) 188 | template 189 | __host__ __device__ __forceinline__ void operator=(T const&) 190 | {} 191 | 192 | /// Cast to void* operator 193 | __host__ __device__ __forceinline__ operator void*() const { return NULL; } 194 | 195 | /// Equal to 196 | __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) 197 | { 198 | return (offset == rhs.offset); 199 | } 200 | 201 | /// Not equal to 202 | __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) 203 | { 204 | return (offset != rhs.offset); 205 | } 206 | 207 | /// ostream operator 208 | friend std::ostream& operator<<(std::ostream& os, const self_type& itr) 209 | { 210 | os << "[" << itr.offset << "]"; 211 | return os; 212 | } 213 | 214 | }; 215 | 216 | 217 | /** @} */ // end group UtilIterator 218 | 219 | } // CUB namespace 220 | CUB_NS_POSTFIX // Optional outer namespace(s) 221 | -------------------------------------------------------------------------------- /include/cub/thread/thread_reduce.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Thread utilities for sequential reduction over statically-sized array types 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../thread/thread_operators.cuh" 37 | #include "../util_namespace.cuh" 38 | 39 | /// Optional outer namespace(s) 40 | CUB_NS_PREFIX 41 | 42 | /// CUB namespace 43 | namespace cub { 44 | 45 | /// Internal namespace (to prevent ADL mishaps between static functions when mixing different CUB installations) 46 | namespace internal { 47 | 48 | /** 49 | * Sequential reduction over statically-sized array types 50 | */ 51 | template < 52 | int LENGTH, 53 | typename T, 54 | typename ReductionOp> 55 | __device__ __forceinline__ T ThreadReduce( 56 | T* input, ///< [in] Input array 57 | ReductionOp reduction_op, ///< [in] Binary reduction operator 58 | T prefix, ///< [in] Prefix to seed reduction with 59 | Int2Type /*length*/) 60 | { 61 | T retval = prefix; 62 | 63 | #pragma unroll 64 | for (int i = 0; i < LENGTH; ++i) 65 | retval = reduction_op(retval, input[i]); 66 | 67 | return retval; 68 | } 69 | 70 | 71 | /** 72 | * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned. 73 | * 74 | * \tparam LENGTH LengthT of input array 75 | * \tparam T [inferred] The data type to be reduced. 
76 | * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) 77 | */ 78 | template < 79 | int LENGTH, 80 | typename T, 81 | typename ReductionOp> 82 | __device__ __forceinline__ T ThreadReduce( 83 | T* input, ///< [in] Input array 84 | ReductionOp reduction_op, ///< [in] Binary reduction operator 85 | T prefix) ///< [in] Prefix to seed reduction with 86 | { 87 | return ThreadReduce(input, reduction_op, prefix, Int2Type()); 88 | } 89 | 90 | 91 | /** 92 | * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array. The aggregate is returned. 93 | * 94 | * \tparam LENGTH LengthT of input array 95 | * \tparam T [inferred] The data type to be reduced. 96 | * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) 97 | */ 98 | template < 99 | int LENGTH, 100 | typename T, 101 | typename ReductionOp> 102 | __device__ __forceinline__ T ThreadReduce( 103 | T* input, ///< [in] Input array 104 | ReductionOp reduction_op) ///< [in] Binary reduction operator 105 | { 106 | T prefix = input[0]; 107 | return ThreadReduce(input + 1, reduction_op, prefix); 108 | } 109 | 110 | 111 | /** 112 | * \brief Perform a sequential reduction over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned. 113 | * 114 | * \tparam LENGTH [inferred] LengthT of \p input array 115 | * \tparam T [inferred] The data type to be reduced. 116 | * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) 117 | */ 118 | template < 119 | int LENGTH, 120 | typename T, 121 | typename ReductionOp> 122 | __device__ __forceinline__ T ThreadReduce( 123 | T (&input)[LENGTH], ///< [in] Input array 124 | ReductionOp reduction_op, ///< [in] Binary reduction operator 125 | T prefix) ///< [in] Prefix to seed reduction with 126 | { 127 | return ThreadReduce(input, reduction_op, prefix, Int2Type()); 128 | } 129 | 130 | 131 | /** 132 | * \brief Serial reduction with the specified operator 133 | * 134 | * \tparam LENGTH [inferred] LengthT of \p input array 135 | * \tparam T [inferred] The data type to be reduced. 136 | * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) 137 | */ 138 | template < 139 | int LENGTH, 140 | typename T, 141 | typename ReductionOp> 142 | __device__ __forceinline__ T ThreadReduce( 143 | T (&input)[LENGTH], ///< [in] Input array 144 | ReductionOp reduction_op) ///< [in] Binary reduction operator 145 | { 146 | return ThreadReduce((T*) input, reduction_op); 147 | } 148 | 149 | 150 | } // internal namespace 151 | } // CUB namespace 152 | CUB_NS_POSTFIX // Optional outer namespace(s) 153 | -------------------------------------------------------------------------------- /include/cub/thread/thread_search.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 
9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Thread utilities for sequential search 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../util_namespace.cuh" 37 | 38 | /// Optional outer namespace(s) 39 | CUB_NS_PREFIX 40 | 41 | /// CUB namespace 42 | namespace cub { 43 | 44 | 45 | /** 46 | * Computes the begin offsets into A and B for the specific diagonal 47 | */ 48 | template < 49 | typename AIteratorT, 50 | typename BIteratorT, 51 | typename OffsetT, 52 | typename CoordinateT> 53 | __host__ __device__ __forceinline__ void MergePathSearch( 54 | OffsetT diagonal, 55 | AIteratorT a, 56 | BIteratorT b, 57 | OffsetT a_len, 58 | OffsetT b_len, 59 | CoordinateT& path_coordinate) 60 | { 61 | /// The value type of the input iterator 62 | typedef typename std::iterator_traits::value_type T; 63 | 64 | OffsetT split_min = CUB_MAX(diagonal - b_len, 0); 65 | OffsetT split_max = CUB_MIN(diagonal, a_len); 66 | 67 | while (split_min < split_max) 68 | { 69 | OffsetT split_pivot = (split_min + split_max) >> 1; 70 | if (a[split_pivot] <= b[diagonal - split_pivot - 1]) 71 | { 72 | // Move candidate split range up A, down B 73 | split_min = split_pivot + 1; 74 | } 75 | else 76 | { 77 | // Move candidate split range up B, down A 78 | split_max = split_pivot; 79 | } 80 | } 81 | 82 | path_coordinate.x = CUB_MIN(split_min, a_len); 83 | path_coordinate.y = diagonal - split_min; 84 | } 85 | 86 | 87 | 88 | /** 89 | * \brief Returns the offset of the first value within \p input which does not compare less than \p val 90 | */ 91 | template < 92 | typename InputIteratorT, 93 | typename OffsetT, 94 | typename T> 95 | __device__ __forceinline__ OffsetT LowerBound( 96 | InputIteratorT input, ///< [in] Input sequence 97 | OffsetT num_items, ///< [in] Input sequence length 98 | T val) ///< [in] Search key 99 | { 100 | OffsetT retval = 0; 101 | while (num_items > 0) 102 | { 103 | OffsetT half = num_items >> 1; 104 | if (input[retval + half] < val) 105 | { 106 | retval = retval + (half + 1); 107 | num_items = num_items - (half + 1); 108 | } 109 | else 110 | { 111 | num_items = half; 112 | } 113 | } 114 | 115 | return retval; 116 | } 117 | 118 | 119 | /** 120 | * \brief Returns the offset of the first value within \p 
input which compares greater than \p val 121 | */ 122 | template < 123 | typename InputIteratorT, 124 | typename OffsetT, 125 | typename T> 126 | __device__ __forceinline__ OffsetT UpperBound( 127 | InputIteratorT input, ///< [in] Input sequence 128 | OffsetT num_items, ///< [in] Input sequence length 129 | T val) ///< [in] Search key 130 | { 131 | OffsetT retval = 0; 132 | while (num_items > 0) 133 | { 134 | OffsetT half = num_items >> 1; 135 | if (val < input[retval + half]) 136 | { 137 | num_items = half; 138 | } 139 | else 140 | { 141 | retval = retval + (half + 1); 142 | num_items = num_items - (half + 1); 143 | } 144 | } 145 | 146 | return retval; 147 | } 148 | 149 | 150 | 151 | 152 | 153 | } // CUB namespace 154 | CUB_NS_POSTFIX // Optional outer namespace(s) 155 | -------------------------------------------------------------------------------- /include/cub/util_arch.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Static architectural properties by SM version. 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "util_namespace.cuh" 37 | 38 | /// Optional outer namespace(s) 39 | CUB_NS_PREFIX 40 | 41 | /// CUB namespace 42 | namespace cub { 43 | 44 | #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document 45 | 46 | #if (__CUDACC_VER_MAJOR__ >= 9) && !defined(CUB_USE_COOPERATIVE_GROUPS) 47 | #define CUB_USE_COOPERATIVE_GROUPS 48 | #endif 49 | 50 | /// CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host pass). 
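/// For example, a device compilation pass targeting compute_61 sees CUB_PTX_ARCH == 610, while the host pass sees 0.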
51 | #ifndef CUB_PTX_ARCH 52 | #ifndef __CUDA_ARCH__ 53 | #define CUB_PTX_ARCH 0 54 | #else 55 | #define CUB_PTX_ARCH __CUDA_ARCH__ 56 | #endif 57 | #endif 58 | 59 | 60 | /// Whether or not the source targeted by the active compiler pass is allowed to invoke device kernels or methods from the CUDA runtime API. 61 | #ifndef CUB_RUNTIME_FUNCTION 62 | #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__>= 350 && defined(__CUDACC_RDC__)) 63 | #define CUB_RUNTIME_ENABLED 64 | #define CUB_RUNTIME_FUNCTION __host__ __device__ 65 | #else 66 | #define CUB_RUNTIME_FUNCTION __host__ 67 | #endif 68 | #endif 69 | 70 | 71 | /// Number of threads per warp 72 | #ifndef CUB_LOG_WARP_THREADS 73 | #define CUB_LOG_WARP_THREADS(arch) \ 74 | (5) 75 | #define CUB_WARP_THREADS(arch) \ 76 | (1 << CUB_LOG_WARP_THREADS(arch)) 77 | 78 | #define CUB_PTX_WARP_THREADS CUB_WARP_THREADS(CUB_PTX_ARCH) 79 | #define CUB_PTX_LOG_WARP_THREADS CUB_LOG_WARP_THREADS(CUB_PTX_ARCH) 80 | #endif 81 | 82 | 83 | /// Number of smem banks 84 | #ifndef CUB_LOG_SMEM_BANKS 85 | #define CUB_LOG_SMEM_BANKS(arch) \ 86 | ((arch >= 200) ? \ 87 | (5) : \ 88 | (4)) 89 | #define CUB_SMEM_BANKS(arch) \ 90 | (1 << CUB_LOG_SMEM_BANKS(arch)) 91 | 92 | #define CUB_PTX_LOG_SMEM_BANKS CUB_LOG_SMEM_BANKS(CUB_PTX_ARCH) 93 | #define CUB_PTX_SMEM_BANKS CUB_SMEM_BANKS(CUB_PTX_ARCH) 94 | #endif 95 | 96 | 97 | /// Oversubscription factor 98 | #ifndef CUB_SUBSCRIPTION_FACTOR 99 | #define CUB_SUBSCRIPTION_FACTOR(arch) \ 100 | ((arch >= 300) ? \ 101 | (5) : \ 102 | ((arch >= 200) ? \ 103 | (3) : \ 104 | (10))) 105 | #define CUB_PTX_SUBSCRIPTION_FACTOR CUB_SUBSCRIPTION_FACTOR(CUB_PTX_ARCH) 106 | #endif 107 | 108 | 109 | /// Prefer padding overhead vs X-way conflicts greater than this threshold 110 | #ifndef CUB_PREFER_CONFLICT_OVER_PADDING 111 | #define CUB_PREFER_CONFLICT_OVER_PADDING(arch) \ 112 | ((arch >= 300) ? \ 113 | (1) : \ 114 | (4)) 115 | #define CUB_PTX_PREFER_CONFLICT_OVER_PADDING CUB_PREFER_CONFLICT_OVER_PADDING(CUB_PTX_ARCH) 116 | #endif 117 | 118 | 119 | /// Scale down the number of threads to keep same amount of scratch storage as the nominal configuration for 4B data. Minimum of two warps. 120 | #ifndef CUB_SCALED_BLOCK_THREADS 121 | #define CUB_SCALED_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH) \ 122 | (CUB_MIN( \ 123 | NOMINAL_4B_BLOCK_THREADS, \ 124 | CUB_WARP_THREADS(PTX_ARCH) * CUB_MAX( \ 125 | 2, \ 126 | (NOMINAL_4B_BLOCK_THREADS / CUB_WARP_THREADS(PTX_ARCH)) * 4 / sizeof(T)))) 127 | #endif 128 | 129 | /// Scale down number of items per thread to keep the same amount of register storage as the nominal configuration for 4B data. Minimum 1 item per thread 130 | #ifndef CUB_SCALED_ITEMS_PER_THREAD 131 | #define CUB_SCALED_ITEMS_PER_THREAD(NOMINAL_4B_ITEMS_PER_THREAD, NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH) \ 132 | CUB_MAX( \ 133 | 1, \ 134 | (sizeof(T) < 4) ? 
\ 135 | ((NOMINAL_4B_ITEMS_PER_THREAD * NOMINAL_4B_BLOCK_THREADS * 4) / CUB_MAX(4, sizeof(T))) / CUB_SCALED_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH) / 2 : \ 136 | ((NOMINAL_4B_ITEMS_PER_THREAD * NOMINAL_4B_BLOCK_THREADS * 4) / CUB_MAX(4, sizeof(T))) / CUB_SCALED_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH)) 137 | #endif 138 | 139 | /// Define both nominal threads-per-block and items-per-thread 140 | #ifndef CUB_SCALED_GRANULARITIES 141 | #define CUB_SCALED_GRANULARITIES(NOMINAL_4B_BLOCK_THREADS, NOMINAL_4B_ITEMS_PER_THREAD, T) \ 142 | CUB_SCALED_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, 200), \ 143 | CUB_SCALED_ITEMS_PER_THREAD(NOMINAL_4B_ITEMS_PER_THREAD, NOMINAL_4B_BLOCK_THREADS, T, 200) 144 | #endif 145 | 146 | 147 | 148 | #endif // Do not document 149 | 150 | } // CUB namespace 151 | CUB_NS_POSTFIX // Optional outer namespace(s) 152 | -------------------------------------------------------------------------------- /include/cub/util_debug.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Error and event logging routines. 32 | * 33 | * The following macros definitions are supported: 34 | * - \p CUB_LOG. Simple event messages are printed to \p stdout. 
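 * - \p CubDebug / \p CubDebugExit. Wrappers around cub::Debug() defined below; when \p CUB_STDERR is defined they print CUDA error messages (with source context) to \p stderr.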
35 | */ 36 | 37 | #pragma once 38 | 39 | #include 40 | #include "util_namespace.cuh" 41 | #include "util_arch.cuh" 42 | 43 | /// Optional outer namespace(s) 44 | CUB_NS_PREFIX 45 | 46 | /// CUB namespace 47 | namespace cub { 48 | 49 | 50 | /** 51 | * \addtogroup UtilMgmt 52 | * @{ 53 | */ 54 | 55 | 56 | /// CUB error reporting macro (prints error messages to stderr) 57 | #if (defined(DEBUG) || defined(_DEBUG)) && !defined(CUB_STDERR) 58 | #define CUB_STDERR 59 | #endif 60 | 61 | 62 | 63 | /** 64 | * \brief %If \p CUB_STDERR is defined and \p error is not \p cudaSuccess, the corresponding error message is printed to \p stderr (or \p stdout in device code) along with the supplied source context. 65 | * 66 | * \return The CUDA error. 67 | */ 68 | __host__ __device__ __forceinline__ cudaError_t Debug( 69 | cudaError_t error, 70 | const char* filename, 71 | int line) 72 | { 73 | (void)filename; 74 | (void)line; 75 | #ifdef CUB_STDERR 76 | if (error) 77 | { 78 | #if (CUB_PTX_ARCH == 0) 79 | fprintf(stderr, "CUDA error %d [%s, %d]: %s\n", error, filename, line, cudaGetErrorString(error)); 80 | fflush(stderr); 81 | #elif (CUB_PTX_ARCH >= 200) 82 | printf("CUDA error %d [block (%d,%d,%d) thread (%d,%d,%d), %s, %d]\n", error, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, filename, line); 83 | #endif 84 | } 85 | #endif 86 | return error; 87 | } 88 | 89 | 90 | /** 91 | * \brief Debug macro 92 | */ 93 | #ifndef CubDebug 94 | #define CubDebug(e) cub::Debug((cudaError_t) (e), __FILE__, __LINE__) 95 | #endif 96 | 97 | 98 | /** 99 | * \brief Debug macro with exit 100 | */ 101 | #ifndef CubDebugExit 102 | #define CubDebugExit(e) if (cub::Debug((cudaError_t) (e), __FILE__, __LINE__)) { exit(1); } 103 | #endif 104 | 105 | 106 | /** 107 | * \brief Log macro for printf statements. 108 | */ 109 | #if !defined(_CubLog) 110 | #if !(defined(__clang__) && defined(__CUDA__)) 111 | #if (CUB_PTX_ARCH == 0) 112 | #define _CubLog(format, ...) printf(format,__VA_ARGS__); 113 | #elif (CUB_PTX_ARCH >= 200) 114 | #define _CubLog(format, ...) printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, __VA_ARGS__); 115 | #endif 116 | #else 117 | // XXX shameless hack for clang around variadic printf... 118 | // Compilies w/o supplying -std=c++11 but shows warning, 119 | // so we sielence them :) 120 | #pragma clang diagnostic ignored "-Wc++11-extensions" 121 | #pragma clang diagnostic ignored "-Wunnamed-type-template-args" 122 | template 123 | inline __host__ __device__ void va_printf(char const* format, Args const&... args) 124 | { 125 | #ifdef __CUDA_ARCH__ 126 | printf(format, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, args...); 127 | #else 128 | printf(format, args...); 129 | #endif 130 | } 131 | #ifndef __CUDA_ARCH__ 132 | #define _CubLog(format, ...) va_printf(format,__VA_ARGS__); 133 | #else 134 | #define _CubLog(format, ...) 
va_printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, __VA_ARGS__); 135 | #endif 136 | #endif 137 | #endif 138 | 139 | 140 | 141 | 142 | /** @} */ // end group UtilMgmt 143 | 144 | } // CUB namespace 145 | CUB_NS_POSTFIX // Optional outer namespace(s) 146 | -------------------------------------------------------------------------------- /include/cub/util_macro.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /****************************************************************************** 30 | * Common C/C++ macro utilities 31 | ******************************************************************************/ 32 | 33 | #pragma once 34 | 35 | #include "util_namespace.cuh" 36 | 37 | /// Optional outer namespace(s) 38 | CUB_NS_PREFIX 39 | 40 | /// CUB namespace 41 | namespace cub { 42 | 43 | 44 | /** 45 | * \addtogroup UtilModule 46 | * @{ 47 | */ 48 | 49 | #ifndef CUB_ALIGN 50 | #if defined(_WIN32) || defined(_WIN64) 51 | /// Align struct 52 | #define CUB_ALIGN(bytes) __declspec(align(32)) 53 | #else 54 | /// Align struct 55 | #define CUB_ALIGN(bytes) __attribute__((aligned(bytes))) 56 | #endif 57 | #endif 58 | 59 | #ifndef CUB_MAX 60 | /// Select maximum(a, b) 61 | #define CUB_MAX(a, b) (((b) > (a)) ? (b) : (a)) 62 | #endif 63 | 64 | #ifndef CUB_MIN 65 | /// Select minimum(a, b) 66 | #define CUB_MIN(a, b) (((b) < (a)) ? 
(b) : (a)) 67 | #endif 68 | 69 | #ifndef CUB_QUOTIENT_FLOOR 70 | /// Quotient of x/y rounded down to nearest integer 71 | #define CUB_QUOTIENT_FLOOR(x, y) ((x) / (y)) 72 | #endif 73 | 74 | #ifndef CUB_QUOTIENT_CEILING 75 | /// Quotient of x/y rounded up to nearest integer 76 | #define CUB_QUOTIENT_CEILING(x, y) (((x) + (y) - 1) / (y)) 77 | #endif 78 | 79 | #ifndef CUB_ROUND_UP_NEAREST 80 | /// x rounded up to the nearest multiple of y 81 | #define CUB_ROUND_UP_NEAREST(x, y) ((((x) + (y) - 1) / (y)) * y) 82 | #endif 83 | 84 | #ifndef CUB_ROUND_DOWN_NEAREST 85 | /// x rounded down to the nearest multiple of y 86 | #define CUB_ROUND_DOWN_NEAREST(x, y) (((x) / (y)) * y) 87 | #endif 88 | 89 | 90 | #ifndef CUB_STATIC_ASSERT 91 | #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document 92 | #define CUB_CAT_(a, b) a ## b 93 | #define CUB_CAT(a, b) CUB_CAT_(a, b) 94 | #endif // DOXYGEN_SHOULD_SKIP_THIS 95 | 96 | /// Static assert 97 | #define CUB_STATIC_ASSERT(cond, msg) typedef int CUB_CAT(cub_static_assert, __LINE__)[(cond) ? 1 : -1] 98 | #endif 99 | 100 | /** @} */ // end group UtilModule 101 | 102 | } // CUB namespace 103 | CUB_NS_POSTFIX // Optional outer namespace(s) 104 | -------------------------------------------------------------------------------- /include/cub/util_namespace.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Place-holder for prefixing the cub namespace 32 | */ 33 | 34 | #pragma once 35 | 36 | // For example: 37 | //#define CUB_NS_PREFIX namespace thrust{ namespace detail { 38 | //#define CUB_NS_POSTFIX } } 39 | 40 | #ifndef CUB_NS_PREFIX 41 | #define CUB_NS_PREFIX 42 | #endif 43 | 44 | #ifndef CUB_NS_POSTFIX 45 | #define CUB_NS_POSTFIX 46 | #endif 47 | -------------------------------------------------------------------------------- /include/logger.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 1993-2016 NVIDIA Corporation. All rights reserved. 3 | * 4 | * NOTICE TO USER: 5 | * 6 | * This source code is subject to NVIDIA ownership rights under U.S. and 7 | * international Copyright laws. 8 | * 9 | * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE 10 | * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR 11 | * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH 12 | * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF 13 | * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 14 | * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, 15 | * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS 16 | * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE 17 | * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE 18 | * OR PERFORMANCE OF THIS SOURCE CODE. 19 | * 20 | * U.S. Government End Users. This source code is a "commercial item" as 21 | * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of 22 | * "commercial computer software" and "commercial computer software 23 | * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) 24 | * and is provided to the U.S. Government only as a commercial end item. 25 | * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through 26 | * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the 27 | * source code with only those rights set forth herein. 28 | */ 29 | 30 | #ifndef LOGGER_H 31 | #define LOGGER_H 32 | 33 | #pragma once 34 | 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | 42 | #ifdef _WIN32 43 | #include 44 | #include 45 | 46 | #pragma comment(lib, "ws2_32.lib") 47 | #else 48 | #include 49 | #include 50 | #include 51 | #include 52 | #define SOCKET int 53 | #define INVALID_SOCKET -1 54 | #endif 55 | 56 | namespace simplelogger{ 57 | 58 | enum LogLevel { 59 | TRACE, 60 | DEBUG, 61 | INFO, 62 | WARN, 63 | ERR 64 | }; 65 | 66 | class Logger { 67 | public: 68 | Logger(LogLevel level, bool bPrintTimeStamp) : level(level), bPrintTimeStamp(bPrintTimeStamp) {} 69 | virtual ~Logger() {} 70 | virtual std::ostream& GetStream() = 0; 71 | virtual void FlushStream() {} 72 | bool ShouldLogFor(LogLevel l) { 73 | return l >= level; 74 | } 75 | char* GetLead(LogLevel l, const char *szFile, int nLine, const char *szFunc) { 76 | if (l < TRACE || l > ERR) { 77 | sprintf(szLead, "[?????] 
"); 78 | return szLead; 79 | } 80 | const char *szLevels[] = {"TRACE", "DEBUG", "INFO", "WARN", "ERROR"}; 81 | if (bPrintTimeStamp) { 82 | time_t t = time(NULL); 83 | struct tm *ptm = localtime(&t); 84 | sprintf(szLead, "[%-5s][%02d:%02d:%02d] ", 85 | szLevels[l], ptm->tm_hour, ptm->tm_min, ptm->tm_sec); 86 | } else { 87 | sprintf(szLead, "[%-5s] ", szLevels[l]); 88 | } 89 | return szLead; 90 | } 91 | void EnterCriticalSection() { 92 | mtx.lock(); 93 | } 94 | void LeaveCriticalSection() { 95 | mtx.unlock(); 96 | } 97 | private: 98 | LogLevel level; 99 | char szLead[80]; 100 | bool bPrintTimeStamp; 101 | std::mutex mtx; 102 | }; 103 | 104 | class LoggerFactory { 105 | public: 106 | static Logger* CreateFileLogger(std::string strFilePath, 107 | LogLevel level = DEBUG, bool bPrintTimeStamp = true) { 108 | return new FileLogger(strFilePath, level, bPrintTimeStamp); 109 | } 110 | static Logger* CreateConsoleLogger(LogLevel level = DEBUG, 111 | bool bPrintTimeStamp = true) { 112 | return new ConsoleLogger(level, bPrintTimeStamp); 113 | } 114 | static Logger* CreateUdpLogger(char *szHost, unsigned uPort, LogLevel level = DEBUG, 115 | bool bPrintTimeStamp = true) { 116 | return new UdpLogger(szHost, uPort, level, bPrintTimeStamp); 117 | } 118 | private: 119 | LoggerFactory() {} 120 | 121 | class FileLogger : public Logger { 122 | public: 123 | FileLogger(std::string strFilePath, LogLevel level, bool bPrintTimeStamp) 124 | : Logger(level, bPrintTimeStamp) { 125 | pFileOut = new std::ofstream(); 126 | pFileOut->open(strFilePath.c_str()); 127 | } 128 | ~FileLogger() { 129 | pFileOut->close(); 130 | } 131 | std::ostream& GetStream() { 132 | return *pFileOut; 133 | } 134 | private: 135 | std::ofstream *pFileOut; 136 | }; 137 | 138 | class ConsoleLogger : public Logger { 139 | public: 140 | ConsoleLogger(LogLevel level, bool bPrintTimeStamp) 141 | : Logger(level, bPrintTimeStamp) {} 142 | std::ostream& GetStream() { 143 | return std::cout; 144 | } 145 | }; 146 | 147 | class UdpLogger : public Logger { 148 | private: 149 | class UdpOstream : public std::ostream { 150 | public: 151 | UdpOstream(char *szHost, unsigned short uPort) : std::ostream(&sb), socket(INVALID_SOCKET){ 152 | #ifdef _WIN32 153 | WSADATA w; 154 | if (WSAStartup(0x0101, &w) != 0) { 155 | fprintf(stderr, "WSAStartup() failed.\n"); 156 | return; 157 | } 158 | #endif 159 | socket = ::socket(AF_INET, SOCK_DGRAM, 0); 160 | if (socket == INVALID_SOCKET) { 161 | #ifdef _WIN32 162 | WSACleanup(); 163 | #endif 164 | fprintf(stderr, "socket() failed.\n"); 165 | return; 166 | } 167 | #ifdef _WIN32 168 | unsigned int b1, b2, b3, b4; 169 | sscanf(szHost, "%u.%u.%u.%u", &b1, &b2, &b3, &b4); 170 | struct in_addr addr = {(unsigned char)b1, (unsigned char)b2, (unsigned char)b3, (unsigned char)b4}; 171 | #else 172 | struct in_addr addr = {inet_addr(szHost)}; 173 | #endif 174 | struct sockaddr_in s = {AF_INET, htons(uPort), addr}; 175 | server = s; 176 | } 177 | virtual ~UdpOstream() { 178 | if (socket == INVALID_SOCKET) { 179 | return; 180 | } 181 | #ifdef _WIN32 182 | closesocket(socket); 183 | WSACleanup(); 184 | #else 185 | close(socket); 186 | #endif 187 | } 188 | void Flush() { 189 | if (sendto(socket, sb.str().c_str(), (int)sb.str().length() + 1, 190 | 0, (struct sockaddr *)&server, (int)sizeof(sockaddr_in)) == -1) { 191 | fprintf(stderr, "sendto() failed.\n"); 192 | } 193 | sb.str(""); 194 | } 195 | 196 | private: 197 | std::stringbuf sb; 198 | SOCKET socket; 199 | struct sockaddr_in server; 200 | }; 201 | public: 202 | UdpLogger(char *szHost, 
unsigned uPort, LogLevel level, bool bPrintTimeStamp) 203 | : Logger(level, bPrintTimeStamp), udpOut(szHost, (unsigned short)uPort) {} 204 | UdpOstream& GetStream() { 205 | return udpOut; 206 | } 207 | virtual void FlushStream() { 208 | udpOut.Flush(); 209 | } 210 | private: 211 | UdpOstream udpOut; 212 | }; 213 | }; 214 | 215 | } 216 | 217 | #define LOG_(pLogger, event, level) \ 218 | do { \ 219 | if (!pLogger || !pLogger->ShouldLogFor(level)) { \ 220 | break; \ 221 | } \ 222 | pLogger->EnterCriticalSection(); \ 223 | pLogger->GetStream() \ 224 | << pLogger->GetLead(level, __FILE__, __LINE__, \ 225 | __FUNCTION__) \ 226 | << event << std::endl; \ 227 | pLogger->FlushStream(); \ 228 | pLogger->LeaveCriticalSection(); \ 229 | } while (0); 230 | 231 | #define LOG_TRACE(pLogger, event) LOG_(pLogger, event, simplelogger::TRACE) 232 | #define LOG_DEBUG(pLogger, event) LOG_(pLogger, event, simplelogger::DEBUG) 233 | #define LOG_INFO(pLogger, event) LOG_(pLogger, event, simplelogger::INFO) 234 | #define LOG_WARN(pLogger, event) LOG_(pLogger, event, simplelogger::WARN) 235 | #define LOG_ERROR(pLogger, event) LOG_(pLogger, event, simplelogger::ERR) 236 | 237 | 238 | #endif // LOGGER_H 239 | -------------------------------------------------------------------------------- /interpPlugin.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | 4 | dim3 cuda_gridsize(unsigned int n){ 5 | unsigned int k = (n-1) / BLOCK + 1; 6 | unsigned int x = k; 7 | unsigned int y = 1; 8 | if(x > 65535){ 9 | x = ceil(sqrt(k)); 10 | y = (n-1)/(x*BLOCK) + 1; 11 | } 12 | dim3 d = {x, y, 1}; 13 | return d; 14 | } 15 | 16 | /* nearest neighbor upsampling used in darknet*/ 17 | __global__ void upsample_gpu(int N, const float *x, int w, int h, int c, int batch, int zoomFactor, float *out, const char* mode="nearest") 18 | { 19 | int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; 20 | if(i >= N) return; 21 | int out_index = i; 22 | int out_w = i%(w*zoomFactor); 23 | i = i/(w*zoomFactor); 24 | int out_h = i%(h*zoomFactor); 25 | i = i/(h*zoomFactor); 26 | int _c = i%c; 27 | i = i/_c; 28 | int _b = i%batch; 29 | int in_w = out_w/zoomFactor; 30 | int in_h = out_h/zoomFactor; 31 | int in_offset = _b*c*w*h + _c*w*h; 32 | int in_index00 = in_offset + in_h*w + in_w; 33 | if(mode == "bilinear"){ 34 | int in_index01 = (in_w+1 > w) ? in_index00 : (in_index00 + 1); 35 | int in_index10 = (in_h+1 > h) ? in_index00 : (in_index00 + w); 36 | int in_index11 = (in_index01 == in_index10) ? 
in_index00 : (in_index10 + 1); 37 | 38 | float u = (float)(out_h % zoomFactor)/zoomFactor; 39 | float v = (float)(out_w % zoomFactor)/zoomFactor; 40 | out[out_index] = (1-u)*(1-v)*x[in_index00] + \ 41 | (1-u)*v*x[in_index01] + \ 42 | u*(1-v)*x[in_index10] + \ 43 | u*v*x[in_index11]; 44 | } 45 | else if(mode == "nearest"){ 46 | out[out_index] = x[in_index00]; 47 | } 48 | } 49 | 50 | void interp_gpu(const float *x, int w, int h, int c, int batch, int zoomFactor, float *out, cudaStream_t stream) 51 | { 52 | int outSize = w*zoomFactor*h*zoomFactor*c*batch; 53 | upsample_gpu<<>>(outSize, x, w, h, c, batch, zoomFactor, out); 54 | } 55 | -------------------------------------------------------------------------------- /interpPlugin.h: -------------------------------------------------------------------------------- 1 | #ifndef INTERP_PLUGIN_H 2 | #define INTERP_PLUGIN_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include "NvInfer.h" 9 | #include "NvCaffeParser.h" 10 | #include "NvInferPlugin.h" 11 | #include 12 | 13 | using namespace nvinfer1; 14 | using namespace nvcaffeparser1; 15 | using namespace plugin; 16 | 17 | #define BLOCK 512 18 | #define ZOOM 2 // upsample *2 19 | 20 | void interp_gpu(const float *x, int w, int h, int c, int batch, int zoomFactor, float *out, cudaStream_t stream); 21 | 22 | template 23 | class Interp : public IPlugin 24 | { 25 | public: 26 | Interp() {} 27 | Interp(const void* buffer, size_t size) 28 | { 29 | // assert(size == sizeof(mInputSize)); 30 | // mInputSize = *reinterpret_cast(buffer); 31 | assert(size == sizeof(mInputDims)); 32 | mInputDims = *reinterpret_cast(buffer); 33 | } 34 | ~Interp() {} 35 | 36 | // @ when creating the network 37 | int getNbOutputs() const override 38 | { 39 | return 1; 40 | } 41 | Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override 42 | { 43 | assert(nbInputDims == 1); 44 | assert(index == 0); 45 | assert(inputs[index].nbDims == 3); 46 | 47 | mOutputDims = DimsCHW(inputs[index].d[0], inputs[index].d[1] * zoomFactor, inputs[index].d[2] * zoomFactor); 48 | if (0) { 49 | std::cout << "IPlugin input dim = [" << inputs[index].d[0] << ", " << inputs[index].d[1] 50 | << ", " << inputs[index].d[2] << "]" << std::endl; 51 | std::cout << "IPlugin output dim = [" << mOutputDims.d[0] << ", " << mOutputDims.d[1] 52 | << ", " << mOutputDims.d[2] << "]" << std::endl; 53 | } 54 | return mOutputDims; 55 | } 56 | 57 | // @ when building the engine 58 | void configure(const Dims* inputs, int nbInputs, const Dims* outputs, int nbOutputs, int maxBatchSize) override 59 | { 60 | assert(1 == nbInputs && 1 == nbOutputs); 61 | mInputDims = inputs[0]; 62 | mInputSize = inputs[0].d[0] * inputs[0].d[1] * inputs[0].d[2] * sizeof(float); 63 | // mOutputSize = outputs[0].d[0] * outputs[0].d[1] * outputs[0].d[2] * sizeof(float); 64 | } 65 | size_t getWorkspaceSize(int) const override 66 | { 67 | return 0; 68 | } 69 | 70 | // @ when serializing the engine 71 | size_t getSerializationSize() override 72 | { 73 | return sizeof(mInputDims); 74 | } 75 | void serialize(void* buffer) override 76 | { 77 | // *reinterpret_cast(buffer) = mInputSize; 78 | *reinterpret_cast(buffer) = mInputDims; 79 | } 80 | 81 | // @ when deserializing && executing the engine(at runtime) 82 | int initialize() override 83 | { 84 | return 0; 85 | } 86 | void terminate() override 87 | { 88 | } 89 | int enqueue(int batchSize, const void*const *inputs, void** outputs, void*, cudaStream_t stream) override 90 | { 91 | // TODO: why inputs idx 0? 
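// (Answer to the TODO above: this plugin is configured with exactly one input -- configure() asserts 1 == nbInputs -- so inputs[0] is the layer's single feature map; likewise getNbOutputs() returns 1, so outputs[0] is its single output.)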
92 | interp_gpu((const float*)inputs[0], mInputDims.d[2], mInputDims.d[1], mInputDims.d[0], batchSize, zoomFactor, (float *)outputs[0], stream); // TODO: didnt serialize mInputDims, can we use it? in that case, i serialized mInputDims, instead of mInputSize. 93 | return 0; 94 | } 95 | 96 | protected: 97 | Dims mInputDims; //CHW 98 | Dims mOutputDims; 99 | size_t mInputSize; 100 | // size_t mOutputSize; 101 | }; 102 | 103 | 104 | class PluginFactory : public nvinfer1::IPluginFactory, public nvcaffeparser1::IPluginFactory 105 | { 106 | public: 107 | // @ when building the engine 108 | // caffe parser plugin implementation 109 | bool isPlugin(const char* layerName) override 110 | { 111 | return !(strcmp(layerName, "Interp85") && strcmp(layerName, "Interp97")); 112 | } 113 | virtual nvinfer1::IPlugin* createPlugin(const char* layerName, const nvinfer1::Weights* weights, int nbWeights) override 114 | { 115 | assert(isPlugin(layerName)); 116 | if (!strcmp(layerName, "Interp85")) 117 | { 118 | assert(layerName != "Interp85"); // debug_ 119 | assert(mPluginInterp85.get() == nullptr); 120 | assert(nbWeights == 0 && weights == nullptr); 121 | mPluginInterp85 = std::unique_ptr>(new Interp()); 122 | return mPluginInterp85.get(); 123 | } 124 | else if (!strcmp(layerName, "Interp97")) 125 | { 126 | assert(layerName != "Interp97"); // debug_ 127 | assert(mPluginInterp97.get() == nullptr); 128 | assert(nbWeights == 0 && weights == nullptr); 129 | mPluginInterp97 = std::unique_ptr>(new Interp()); 130 | return mPluginInterp97.get(); 131 | } 132 | else 133 | { 134 | assert(0); 135 | return nullptr; 136 | } 137 | } 138 | 139 | // @ at runtime 140 | IPlugin* createPlugin(const char* layerName, const void* serialData, size_t serialLength) override 141 | { 142 | assert(isPlugin(layerName)); 143 | if (!strcmp(layerName, "Interp85")) 144 | { 145 | assert(mPluginInterp85.get() == nullptr); 146 | mPluginInterp85 = std::unique_ptr>(new Interp(serialData, serialLength)); 147 | return mPluginInterp85.get(); 148 | } 149 | else if (!strcmp(layerName, "Interp97")) 150 | { 151 | assert(mPluginInterp97.get() == nullptr); 152 | mPluginInterp97 = std::unique_ptr>(new Interp(serialData, serialLength)); 153 | return mPluginInterp97.get(); 154 | } 155 | else 156 | { 157 | assert(0); 158 | return nullptr; 159 | } 160 | } 161 | 162 | void destroyPlugin() 163 | { 164 | //mPluginInterp97.release(); mPluginInterp97 = nullptr; 165 | //mPluginInterp85.release(); mPluginInterp85 = nullptr; 166 | } 167 | 168 | std::unique_ptr> mPluginInterp85{ nullptr }; 169 | std::unique_ptr> mPluginInterp97{ nullptr }; 170 | }; 171 | 172 | #endif 173 | -------------------------------------------------------------------------------- /nvUtils.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 1993-2016 NVIDIA Corporation. All rights reserved. 3 | * 4 | * NOTICE TO USER: 5 | * 6 | * This source code is subject to NVIDIA ownership rights under U.S. and 7 | * international Copyright laws. 8 | * 9 | * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE 10 | * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR 11 | * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH 12 | * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF 13 | * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
14 | * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, 15 | * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS 16 | * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE 17 | * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE 18 | * OR PERFORMANCE OF THIS SOURCE CODE. 19 | * 20 | * U.S. Government End Users. This source code is a "commercial item" as 21 | * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of 22 | * "commercial computer software" and "commercial computer software 23 | * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) 24 | * and is provided to the U.S. Government only as a commercial end item. 25 | * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through 26 | * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the 27 | * source code with only those rights set forth herein. 28 | */ 29 | 30 | #ifndef NV_CODEC_UTILS_H 31 | #define NV_CODEC_UTILS_H 32 | 33 | #pragma once 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | 42 | extern simplelogger::Logger *logger; 43 | 44 | #ifdef _WIN32 45 | #ifndef STRCASECMP 46 | #define STRCASECMP _stricmp 47 | #endif 48 | #ifndef STRNCASECMP 49 | #define STRNCASECMP _strnicmp 50 | #endif 51 | #ifndef STRCPY 52 | #define STRCPY(sFilePath, nLength, sPath) strcpy_s(sFilePath, nLength, sPath) 53 | #endif 54 | 55 | #ifndef FOPEN 56 | #define FOPEN(fHandle,filename,mode) fopen_s(&fHandle, filename, mode) 57 | #endif 58 | #ifndef FOPEN_FAIL 59 | #define FOPEN_FAIL(result) (result != 0) 60 | #endif 61 | #ifndef SSCANF 62 | #define SSCANF sscanf_s 63 | #endif 64 | #else 65 | #include 66 | #include 67 | 68 | #ifndef STRCASECMP 69 | #define STRCASECMP strcasecmp 70 | #endif 71 | #ifndef STRNCASECMP 72 | #define STRNCASECMP strncasecmp 73 | #endif 74 | #ifndef STRCPY 75 | #define STRCPY(sFilePath, nLength, sPath) strcpy(sFilePath, sPath) 76 | #endif 77 | 78 | #ifndef FOPEN 79 | #define FOPEN(fHandle,filename,mode) (fHandle = fopen(filename, mode)) 80 | #endif 81 | #ifndef FOPEN_FAIL 82 | #define FOPEN_FAIL(result) (result == NULL) 83 | #endif 84 | #ifndef SSCANF 85 | #define SSCANF sscanf 86 | #endif 87 | #endif 88 | 89 | 90 | #ifdef __cuda_cuda_h__ 91 | inline bool CHECK_(CUresult e, int iLine, const char *szFile) { 92 | if (e != CUDA_SUCCESS) { 93 | LOG_ERROR(logger, "CUDA error " << e << " at line " << iLine << " in file " << szFile); 94 | return false; 95 | } 96 | return true; 97 | } 98 | #endif 99 | 100 | #ifdef __CUDA_RUNTIME_H__ 101 | inline bool CHECK_(cudaError_t e, int iLine, const char *szFile) { 102 | if (e != cudaSuccess) { 103 | LOG_ERROR(logger, "CUDA runtime error " << e << " at line " << iLine << " in file " << szFile); 104 | return false; 105 | } 106 | return true; 107 | } 108 | #endif 109 | 110 | #ifdef _NV_ENCODEAPI_H_ 111 | inline bool CHECK_(NVENCSTATUS e, int iLine, const char *szFile) { 112 | if (e != NV_ENC_SUCCESS) { 113 | LOG_ERROR(logger, "NVENC error " << e << " at line " << iLine << " in file " << szFile); 114 | return false; 115 | } 116 | return true; 117 | } 118 | #endif 119 | 120 | #ifdef _WINERROR_ 121 | inline bool CHECK_(HRESULT e, int iLine, const char *szFile) { 122 | if (e != S_OK) { 123 | LOG_ERROR(logger, "HRESULT error " << e << " at line " << iLine << " in file " << szFile); 124 | return false; 125 | } 126 | return true; 127 | } 128 | #endif 129 | 130 | #if defined(__gl_h_) || defined(__GL_H__) 131 | inline bool 
CHECK_(GLenum e, int iLine, const char *szFile) { 132 | if (e != 0) { 133 | LOG_ERROR(logger, "GLenum error " << e << " at line " << iLine << " in file " << szFile); 134 | return false; 135 | } 136 | return true; 137 | } 138 | #endif 139 | 140 | #define ck(call) CHECK_(call, __LINE__, __FILE__) 141 | /* 142 | */ 143 | 144 | #ifdef _WIN32 145 | #include 146 | #else 147 | #include 148 | inline int _getch( ) { 149 | struct termios oldt, newt; 150 | int ch; 151 | tcgetattr( STDIN_FILENO, &oldt ); 152 | newt = oldt; 153 | newt.c_lflag &= ~( ICANON | ECHO ); 154 | tcsetattr( STDIN_FILENO, TCSANOW, &newt ); 155 | ch = getchar(); 156 | tcsetattr( STDIN_FILENO, TCSANOW, &oldt ); 157 | return ch; 158 | } 159 | #define _stricmp strcasecmp 160 | #endif 161 | 162 | class BufferedFileReader { 163 | public: 164 | BufferedFileReader(const char *szFileName) { 165 | struct stat st; 166 | 167 | if (stat(szFileName, &st) != 0) { 168 | return; 169 | } 170 | 171 | nSize = st.st_size; 172 | pBuf = new uint8_t[nSize]; 173 | if (!pBuf) { 174 | LOG_ERROR(logger, "Failed to allocate memory in BufferedReader"); 175 | return; 176 | } 177 | 178 | FILE *fp = fopen(szFileName, "rb"); 179 | int nRead = fread(pBuf, 1, nSize, fp); 180 | fclose(fp); 181 | 182 | assert(nRead == nSize); 183 | } 184 | ~BufferedFileReader() { 185 | if (pBuf) { 186 | delete[] pBuf; 187 | } 188 | } 189 | bool GetBuffer(uint8_t **ppBuf, int *pnSize) { 190 | if (!pBuf) { 191 | return false; 192 | } 193 | 194 | *ppBuf = pBuf; 195 | *pnSize = nSize; 196 | return true; 197 | } 198 | 199 | private: 200 | uint8_t *pBuf = NULL; 201 | int nSize = 0; 202 | }; 203 | 204 | /* 205 | class YuvConverter { 206 | public: 207 | YuvConverter(int nWidth, int nHeight) : nWidth(nWidth), nHeight(nHeight) { 208 | pu = new uint8_t[nWidth * nHeight / 4]; 209 | } 210 | ~YuvConverter() { 211 | delete pu; 212 | } 213 | void I420ToNv12(uint8_t *pFrame, int nPitch = 0) { 214 | if (nPitch == 0) { 215 | nPitch = nWidth; 216 | } 217 | uint8_t *puv = pFrame + nPitch * nHeight; 218 | if (nPitch == nWidth) { 219 | memcpy(pu, puv, nWidth * nHeight / 4); 220 | } else { 221 | for (int i = 0; i < nHeight / 2; i++) { 222 | memcpy(pu + nWidth / 2 * i, puv + nPitch / 2 * i, nWidth / 2); 223 | } 224 | } 225 | uint8_t *pv = puv + (nPitch / 2) * (nHeight / 2); 226 | for (int y = 0; y < nHeight / 2; y++) { 227 | for (int x = 0; x < nWidth / 2; x++) { 228 | puv[y * nPitch + x * 2] = pu[y * nWidth / 2 + x]; 229 | puv[y * nPitch + x * 2 + 1] = pv[y * nPitch / 2 + x]; 230 | } 231 | } 232 | } 233 | 234 | private: 235 | uint8_t *pu; 236 | int nWidth, nHeight; 237 | }; 238 | */ 239 | class StopWatch { 240 | public: 241 | void Start() { 242 | t0 = std::chrono::high_resolution_clock::now(); 243 | } 244 | double Stop() { 245 | return std::chrono::duration_cast(std::chrono::high_resolution_clock::now().time_since_epoch() - t0.time_since_epoch()).count() / 1.0e9; 246 | } 247 | 248 | private: 249 | std::chrono::time_point t0; 250 | }; 251 | /* 252 | class StopWatchNew { 253 | public: 254 | void Start() { 255 | //t0 = std::chrono::high_resolution_clock::now(); 256 | gettimeofday(&t0, NULL); 257 | } 258 | double Stop() { 259 | struct timeval t1; 260 | gettimeofday(&t1, NULL); 261 | return (t1.tv_sec - t0.tv_sec) + (t1.tv_usec - t0.tv_usec)/1000000; 262 | } 263 | 264 | private: 265 | struct timeval t0; 266 | };*/ 267 | 268 | #endif // NV_CODEC_UTILS_H 269 | -------------------------------------------------------------------------------- /predictions_fp32.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangjinsong3/YOLO-V3-Acceleration/384746bb6d8f61c8def70bbc0b5e04b98c60356e/predictions_fp32.jpg -------------------------------------------------------------------------------- /preproc_yolov3.h: -------------------------------------------------------------------------------- 1 | #ifndef YOLO_PREPROC_H 2 | #define YOLO_PREPROC_H 3 | 4 | #include 5 | #include 6 | #include "opencv2/highgui/highgui_c.h" 7 | #include "opencv2/imgproc/imgproc_c.h" 8 | #include "opencv2/core/version.hpp" 9 | #if CV_MAJOR_VERSION == 3 10 | #include "opencv2/videoio/videoio_c.h" 11 | #endif 12 | 13 | typedef struct { 14 | int w; 15 | int h; 16 | int c; 17 | float *data; 18 | } image; 19 | 20 | image make_empty_image(int w, int h, int c) 21 | { 22 | image out; 23 | out.data = 0; 24 | out.h = h; 25 | out.w = w; 26 | out.c = c; 27 | return out; 28 | } 29 | 30 | image make_image(int w, int h, int c) 31 | { 32 | image out = make_empty_image(w,h,c); 33 | out.data = (float*)calloc(h*w*c, sizeof(float)); 34 | return out; 35 | } 36 | void free_image(image m) 37 | { 38 | if(m.data){ 39 | free(m.data); 40 | } 41 | } 42 | void fill_image(image m, float s) 43 | { 44 | int i; 45 | for(i = 0; i < m.h*m.w*m.c; ++i) m.data[i] = s; 46 | } 47 | 48 | float get_pixel(image m, int x, int y, int c) 49 | { 50 | assert(x < m.w && y < m.h && c < m.c); 51 | return m.data[c*m.h*m.w + y*m.w + x]; 52 | } 53 | 54 | void set_pixel(image m, int x, int y, int c, float val) 55 | { 56 | if (x < 0 || y < 0 || c < 0 || x >= m.w || y >= m.h || c >= m.c) return; 57 | assert(x < m.w && y < m.h && c < m.c); 58 | m.data[c*m.h*m.w + y*m.w + x] = val; 59 | } 60 | void add_pixel(image m, int x, int y, int c, float val) 61 | { 62 | assert(x < m.w && y < m.h && c < m.c); 63 | m.data[c*m.h*m.w + y*m.w + x] += val; 64 | } 65 | void embed_image(image source, image dest, int dx, int dy) 66 | { 67 | int x,y,k; 68 | for(k = 0; k < source.c; ++k){ 69 | for(y = 0; y < source.h; ++y){ 70 | for(x = 0; x < source.w; ++x){ 71 | float val = get_pixel(source, x,y,k); 72 | set_pixel(dest, dx+x, dy+y, k, val); 73 | } 74 | } 75 | } 76 | } 77 | 78 | void ipl_into_image(IplImage* src, image im) 79 | { 80 | unsigned char *data = (unsigned char *)src->imageData; 81 | int h = src->height; 82 | int w = src->width; 83 | int c = src->nChannels; 84 | int step = src->widthStep; 85 | int i, j, k; 86 | 87 | for(i = 0; i < h; ++i){ 88 | for(k= 0; k < c; ++k){ 89 | for(j = 0; j < w; ++j){ 90 | im.data[k*w*h + i*w + j] = data[i*step + j*c + k]/255.; 91 | } 92 | } 93 | } 94 | } 95 | 96 | image ipl_to_image(IplImage* src) 97 | { 98 | // ross 99 | if (0 == src) { 100 | printf("file %s, line %d, src == 0\n", __FILE__, __LINE__); 101 | exit(0); 102 | } 103 | int h = src->height; 104 | int w = src->width; 105 | int c = src->nChannels; 106 | image out = make_image(w, h, c); 107 | ipl_into_image(src, out); 108 | return out; 109 | } 110 | 111 | void rgbgr_image(image im) 112 | { 113 | int i; 114 | for(i = 0; i < im.w*im.h; ++i){ 115 | float swap = im.data[i]; 116 | im.data[i] = im.data[i+im.w*im.h*2]; 117 | im.data[i+im.w*im.h*2] = swap; 118 | } 119 | } 120 | 121 | image load_image_cv(char *filename, int channels) 122 | { 123 | IplImage* src = 0; 124 | int flag = -1; 125 | if (channels == 0) flag = -1; 126 | else if (channels == 1) flag = 0; 127 | else if (channels == 3) flag = 1; 128 | else { 129 | fprintf(stderr, "OpenCV can't force load with %d channels\n", channels); 130 | } 131 | 
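/* With the legacy OpenCV C API used here, the cvLoadImage flags are
 * CV_LOAD_IMAGE_UNCHANGED (-1), CV_LOAD_IMAGE_GRAYSCALE (0) and CV_LOAD_IMAGE_COLOR (1),
 * which is why channels == 0/1/3 select flag = -1/0/1 above; any other channel count only
 * prints the warning and falls through with flag still -1 (load unchanged). */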
132 | if( (src = (IplImage*)cvLoadImage(filename, flag)) == NULL ) 133 | { 134 | fprintf(stderr, "Cannot load image \"%s\"\n", filename); 135 | exit(0); 136 | } 137 | image out = ipl_to_image(src); 138 | cvReleaseImage(&src); 139 | rgbgr_image(out); 140 | return out; 141 | } 142 | 143 | image resize_image(image im, int w, int h) 144 | { 145 | image resized = make_image(w, h, im.c); 146 | image part = make_image(w, im.h, im.c); 147 | int r, c, k; 148 | float w_scale = (float)(im.w - 1) / (w - 1); 149 | float h_scale = (float)(im.h - 1) / (h - 1); 150 | for(k = 0; k < im.c; ++k){ 151 | for(r = 0; r < im.h; ++r){ 152 | for(c = 0; c < w; ++c){ 153 | float val = 0; 154 | if(c == w-1 || im.w == 1){ 155 | val = get_pixel(im, im.w-1, r, k); 156 | } else { 157 | float sx = c*w_scale; 158 | int ix = (int) sx; 159 | float dx = sx - ix; 160 | val = (1 - dx) * get_pixel(im, ix, r, k) + dx * get_pixel(im, ix+1, r, k); 161 | } 162 | set_pixel(part, c, r, k, val); 163 | } 164 | } 165 | } 166 | for(k = 0; k < im.c; ++k){ 167 | for(r = 0; r < h; ++r){ 168 | float sy = r*h_scale; 169 | int iy = (int) sy; 170 | float dy = sy - iy; 171 | for(c = 0; c < w; ++c){ 172 | float val = (1-dy) * get_pixel(part, c, iy, k); 173 | set_pixel(resized, c, r, k, val); 174 | } 175 | if(r == h-1 || im.h == 1) continue; 176 | for(c = 0; c < w; ++c){ 177 | float val = dy * get_pixel(part, c, iy+1, k); 178 | add_pixel(resized, c, r, k, val); 179 | } 180 | } 181 | } 182 | 183 | free_image(part); 184 | return resized; 185 | } 186 | 187 | image load_image(char *filename, int w, int h, int c) 188 | { 189 | image out = load_image_cv(filename, c); 190 | 191 | if((h && w) && (h != out.h || w != out.w)){ 192 | image resized = resize_image(out, w, h); 193 | free_image(out); 194 | out = resized; 195 | } 196 | return out; 197 | } 198 | 199 | 200 | image load_image_color(char *filename, int w, int h) 201 | { 202 | return load_image(filename, w, h, 3); 203 | } 204 | 205 | image copy_image(image p) 206 | { 207 | image copy = p; 208 | copy.data = (float*)calloc(p.h*p.w*p.c, sizeof(float)); 209 | memcpy(copy.data, p.data, p.h*p.w*p.c*sizeof(float)); 210 | return copy; 211 | } 212 | 213 | void save_image_jpg(image p, const char *name) 214 | { 215 | image copy = copy_image(p); 216 | if(p.c == 3) rgbgr_image(copy); 217 | int x,y,k; 218 | 219 | char buff[256]; 220 | sprintf(buff, "%s.jpg", name); 221 | 222 | IplImage *disp = cvCreateImage(cvSize(p.w,p.h), IPL_DEPTH_8U, p.c); 223 | int step = disp->widthStep; 224 | for(y = 0; y < p.h; ++y){ 225 | for(x = 0; x < p.w; ++x){ 226 | for(k= 0; k < p.c; ++k){ 227 | disp->imageData[y*step + x*p.c + k] = (unsigned char)(get_pixel(copy,x,y,k)*255); 228 | } 229 | } 230 | } 231 | cvSaveImage(buff, disp,0); 232 | cvReleaseImage(&disp); 233 | free_image(copy); 234 | } 235 | 236 | void save_image(image im, const char *name) 237 | { 238 | save_image_jpg(im, name); 239 | } 240 | 241 | 242 | image letterbox_image(image im, int w, int h) 243 | { 244 | int new_w = im.w; 245 | int new_h = im.h; 246 | if (((float)w/im.w) < ((float)h/im.h)) { 247 | new_w = w; 248 | new_h = (im.h * w)/im.w; 249 | } else { 250 | new_h = h; 251 | new_w = (im.w * h)/im.h; 252 | } 253 | image resized = resize_image(im, new_w, new_h); 254 | image boxed = make_image(w, h, im.c); 255 | fill_image(boxed, .5); 256 | embed_image(resized, boxed, (w-new_w)/2, (h-new_h)/2); 257 | free_image(resized); 258 | return boxed; 259 | } 260 | 261 | #endif 262 | -------------------------------------------------------------------------------- /regionLayer.cu: 
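A quick worked example of the letterbox_image() arithmetic above; the 640x480 source size and 416x416 network size are illustrative values, not taken from the sources.

// letterboxing a 640x480 image into a 416x416 network input:
//   416.f/640 = 0.65 < 416.f/480 = 0.867, so the width branch is taken:
//   new_w = 416, new_h = (480*416)/640 = 312
// the 416x312 resize is embedded at ((416-416)/2, (416-312)/2) = (0, 52),
// leaving 52 rows of the 0.5-filled canvas above and below the picture
int im_w = 640, im_h = 480, w = 416, h = 416;   // hypothetical sizes
int new_w, new_h;
if (((float)w/im_w) < ((float)h/im_h)) { new_w = w; new_h = (im_h * w)/im_w; }   // -> 416 x 312
else                                   { new_h = h; new_w = (im_w * h)/im_h; }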
-------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | /** \brief kernel for softmax 5 | * - n is the number of classes (included the background) 6 | * 7 | * - The CPU implementation is 8 | * for b in batch: 9 | * for g in groups: 10 | * softmax(input + b*batchOffset + g*groupOffset, n, temp, stride, output + b*batchOffset + g*groupOffset) 11 | * 12 | * - The GPU implementation put the two for-loop into parallel. 13 | * 14 | * - nthdsPerCTA: the max number of threads per block. 15 | * - Each thread will in charge of one point softmax for all classes. 16 | * - Total number of threads: batch * groups 17 | * 18 | * - TODO: using warp shuffle instead of loop in one thread. 19 | */ 20 | template 21 | __launch_bounds__(nthdsPerCTA) 22 | __global__ void softmaxKernel(const float * input, 23 | const int n, 24 | const int batch, 25 | const int batchOffset, 26 | const int groups, 27 | const int groupOffset, 28 | const int stride, 29 | const float temp, 30 | float * output) 31 | { 32 | int id = blockIdx.x * nthdsPerCTA + threadIdx.x; 33 | 34 | // per batch, per group 35 | if (id < batch * groups) 36 | { 37 | int b = id / groups; 38 | int g = id % groups; 39 | float sum = 0.; 40 | float largest = -FLT_MAX; 41 | int offset = b*batchOffset + g*groupOffset; 42 | for (int i = 0; i < n; ++i) 43 | { 44 | float val = input[i*stride + offset]; 45 | largest = (val > largest) ? val : largest; 46 | } 47 | for (int i = 0; i < n; ++i) 48 | { 49 | float e = exp(input[i*stride + offset]/temp - largest/temp); // bound score in (-inf,0], and denominator fractor in (0,1]. 50 | sum += e; 51 | output[i*stride + offset] = e; 52 | } 53 | for (int i = 0; i < n; ++i) 54 | output[i*stride + offset] /= sum; 55 | } 56 | } 57 | 58 | 59 | /** 60 | * \brief Sigmoid function 61 | * 62 | * "__launch_bounds__" ensures the universality of kernel 63 | */ 64 | template 65 | __launch_bounds__(nthdsPerCTA) 66 | __global__ void activateKernel(float * data, 67 | const int range) 68 | { 69 | int i = blockIdx.x * nthdsPerCTA + threadIdx.x; 70 | if (i < range) 71 | data[i] = 1. / (1. + exp(-data[i])); 72 | } 73 | 74 | /** 75 | * \brief region layer of YOLOv3 76 | * Includes activation and softmax. 77 | * - num: # bounding box per location 78 | * 79 | * If we integrated into tensorRT, we can use input and output are different memory. 80 | * If it is standalone GPU code (in main.cpp), we can use input and output the same buffer. 81 | * 82 | * Note: The elements in YOLOv3 83 | * * 4*nCells coords, 84 | * * nCells conf, 85 | * * classes*nCells classes 86 | * e.g. 87 | * * nCells for 0 class (background) 88 | * * nCells for 1 class 89 | * * ... 90 | */ 91 | void regionLayer_gpu( 92 | const int batch, 93 | const int C, 94 | const int nCells, 95 | const int num, 96 | const int coords, 97 | const int classes, 98 | const float * input, 99 | float * output, 100 | cudaStream_t stream) 101 | { 102 | const int blockSize = 256; 103 | const int gridSize_Act1 = (2*nCells + blockSize - 1) / blockSize; // x, y 104 | const int gridSize_Act2 = (nCells + blockSize - 1) / blockSize; // conf 105 | const int gridSize_Softmax = (nCells + blockSize - 1) / blockSize; // classes 106 | // for YOLOv3, the output of final layer is C*nCells, in which, C includes all the conf, coord, and claesses. 
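    // Worked example with illustrative numbers (VOC model, not taken verbatim from the sources):
    // classes = 20, coords = 4 and num = 3 anchors per cell, so each anchor occupies
    // (coords + 1 + classes) = 25 channels and C = num * 25 = 75. With a 416x416 input the
    // three YOLOv3 output grids are 13x13, 26x26 and 52x52, i.e. nCells = 169 + 676 + 2704 = 3549
    // after concatenation, so one image holds C * nCells = 75 * 3549 = 266175 floats
    // (904995 for COCO's 80 classes) -- the same bounds noted in reorgOutputKernel below.
    // For these sizes, gridSize_Act1 = (2*3549 + 255)/256 = 28 blocks per launch.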
107 | 108 | #ifdef REGION_IN_TRT 109 | // TRT, input and output are diff buffer 110 | ck(cudaMemcpy((void*)output, (void*)input, batch*C*nCells*sizeof(float), cudaMemcpyDeviceToDevice)); 111 | #endif 112 | // else input and output can be same buffer 113 | 114 | for (int b = 0; b < batch; ++b) { 115 | for (int n = 0; n < num; ++n) { 116 | // activate on (x,y) 117 | int index = b*C*nCells // per batch 118 | + n*nCells*(coords+classes+1); // coords, classes and confidence 119 | activateKernel 120 | <<>> 121 | (output + index, 2*nCells); 122 | 123 | // activate on probes on conf 124 | index = b*C*nCells 125 | + n*nCells*(coords+classes+1) 126 | + 4*nCells; // skip coords 127 | activateKernel 128 | <<>> 129 | (output + index, nCells); 130 | 131 | // softmax for all classes 132 | index = b*C*nCells 133 | + n*nCells*(coords+classes+1) 134 | + 5*nCells; // skip conf 135 | softmaxKernel 136 | <<>> 137 | (input + index, // input: skip loc, conf 138 | classes, // n: #classes 139 | batch*num, // batch: batch * #bound_box 140 | (C*nCells/num), // batchOffset: number of bounding_box in total 141 | nCells, // groups 142 | 1, // groupOffset 143 | nCells, // stride 144 | 1.f, // temp 145 | output + index); // output 146 | } 147 | } 148 | } 149 | 150 | #define nOutputLayer 3 151 | template 152 | __launch_bounds__(nthdsPerCTA) 153 | __global__ void reorgOutputKernel( 154 | const int nBatch, 155 | const int nClasses, 156 | const int nBboxesPerLoc, 157 | const int coords, 158 | const int l0_w, 159 | const int l0_h, 160 | const int nCells, 161 | float* dpData_unordered[], 162 | float* dpData) 163 | { 164 | long i = blockIdx.x * nthdsPerCTA + threadIdx.x; 165 | const int bboxMemLen = (nClasses + coords + 1) * nCells; 166 | const int batchMemLen = nBboxesPerLoc * bboxMemLen; 167 | const long range = nBatch * batchMemLen; 168 | if (i < range) // voc<266175 coco<904995 wrt. 416*416 input 169 | { 170 | int b = i / batchMemLen; 171 | int bboxIdx = (i % batchMemLen) / bboxMemLen; 172 | int channelIdx = ((i % batchMemLen) % bboxMemLen) / nCells; 173 | int locIdx = (i % batchMemLen) % nCells; 174 | int locLayer, cnt_offset = 1+2*2+4*4; 175 | for(int j = nOutputLayer-1; j >= 0; --j){ 176 | cnt_offset -= (1<= cnt_offset*l0_w*l0_h){ 178 | locLayer = j; 179 | break; 180 | } 181 | } 182 | dpData[i] = dpData_unordered[locLayer]\ 183 | [b*nBboxesPerLoc*(nClasses+coords+1)*(1< 206 | <<>> 207 | (nBatch, nClasses, nBboxesPerLoc, coords, l0_w, l0_h, nCells, dpData_unordered, dpData); 208 | 209 | } 210 | -------------------------------------------------------------------------------- /regionLayer.cu~: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | /** \brief kernel for softmax 5 | * - n is the number of classes (included the background) 6 | * 7 | * - The CPU implementation is 8 | * for b in batch: 9 | * for g in groups: 10 | * softmax(input + b*batchOffset + g*groupOffset, n, temp, stride, output + b*batchOffset + g*groupOffset) 11 | * 12 | * - The GPU implementation put the two for-loop into parallel. 13 | * 14 | * - nthdsPerCTA: the max number of threads per block. 15 | * - Each thread will in charge of one point softmax for all classes. 16 | * - Total number of threads: batch * groups 17 | * 18 | * - TODO: using warp shuffle instead of loop in one thread. 
19 | */ 20 | template 21 | __launch_bounds__(nthdsPerCTA) 22 | __global__ void softmaxKernel(const float * input, 23 | const int n, 24 | const int batch, 25 | const int batchOffset, 26 | const int groups, 27 | const int groupOffset, 28 | const int stride, 29 | const float temp, 30 | float * output) 31 | { 32 | int id = blockIdx.x * nthdsPerCTA + threadIdx.x; 33 | 34 | // per batch, per group 35 | if (id < batch * groups) 36 | { 37 | int b = id / groups; 38 | int g = id % groups; 39 | float sum = 0.; 40 | float largest = -FLT_MAX; 41 | int offset = b*batchOffset + g*groupOffset; 42 | for (int i = 0; i < n; ++i) 43 | { 44 | float val = input[i*stride + offset]; 45 | largest = (val > largest) ? val : largest; 46 | } 47 | for (int i = 0; i < n; ++i) 48 | { 49 | float e = exp(input[i*stride + offset]/temp - largest/temp); // bound score in (-inf,0], and denominator fractor in (0,1]. 50 | sum += e; 51 | output[i*stride + offset] = e; 52 | } 53 | for (int i = 0; i < n; ++i) 54 | output[i*stride + offset] /= sum; 55 | } 56 | } 57 | 58 | 59 | /** 60 | * \brief Sigmoid function 61 | * 62 | * "__launch_bounds__" ensures the universality of kernel 63 | */ 64 | template 65 | __launch_bounds__(nthdsPerCTA) 66 | __global__ void activateKernel(float * data, 67 | const int range) 68 | { 69 | int i = blockIdx.x * nthdsPerCTA + threadIdx.x; 70 | if (i < range) 71 | data[i] = 1. / (1. + exp(-data[i])); 72 | } 73 | 74 | /** 75 | * \brief region layer of YOLOv3 76 | * Includes activation and softmax. 77 | * - num: # bounding box per location 78 | * 79 | * If we integrated into tensorRT, we can use input and output are different memory. 80 | * If it is standalone GPU code (in main.cpp), we can use input and output the same buffer. 81 | * 82 | * Note: The elements in YOLOv2 83 | * * 4*nCells coords, 84 | * * nCells conf, 85 | * * classes*nCells classes 86 | * e.g. 87 | * * nCells for 0 class (background) 88 | * * nCells for 1 class 89 | * * ... 90 | */ 91 | void regionLayer_gpu( 92 | const int batch, 93 | const int C, 94 | const int nCells, 95 | const int num, 96 | const int coords, 97 | const int classes, 98 | const float * input, 99 | float * output, 100 | cudaStream_t stream) 101 | { 102 | const int blockSize = 256; 103 | const int gridSize_Act1 = (2*nCells + blockSize - 1) / blockSize; // x, y 104 | const int gridSize_Act2 = (nCells + blockSize - 1) / blockSize; // conf 105 | const int gridSize_Softmax = (nCells + blockSize - 1) / blockSize; // classes 106 | // for YOLOv3, the output of final layer is C*nCells, in which, C includes all the conf, coord, and claesses. 
107 | 108 | #ifdef REGION_IN_TRT 109 | // TRT, input and output are diff buffer 110 | ck(cudaMemcpy((void*)output, (void*)input, batch*C*nCells*sizeof(float), cudaMemcpyDeviceToDevice)); 111 | #endif 112 | // else input and output can be same buffer 113 | 114 | for (int b = 0; b < batch; ++b) { 115 | for (int n = 0; n < num; ++n) { 116 | // activate on (x,y) 117 | int index = b*C*nCells // per batch 118 | + n*nCells*(coords+classes+1); // coords, classes and confidence 119 | activateKernel 120 | <<>> 121 | (output + index, 2*nCells); 122 | 123 | // activate on probes on conf 124 | index = b*C*nCells 125 | + n*nCells*(coords+classes+1) 126 | + 4*nCells; // skip coords 127 | activateKernel 128 | <<>> 129 | (output + index, nCells); 130 | 131 | // softmax for all classes 132 | index = b*C*nCells 133 | + n*nCells*(coords+classes+1) 134 | + 5*nCells; // skip conf 135 | softmaxKernel 136 | <<>> 137 | (input + index, // input: skip loc, conf 138 | classes, // n: #classes 139 | batch*num, // batch: batch * #bound_box 140 | (C*nCells/num), // batchOffset: number of bounding_box in total 141 | nCells, // groups 142 | 1, // groupOffset 143 | nCells, // stride 144 | 1.f, // temp 145 | output + index); // output 146 | } 147 | } 148 | } 149 | 150 | #define nOutputLayer 3 151 | template 152 | __launch_bounds__(nthdsPerCTA) 153 | __global__ void reorgOutputKernel( 154 | const int nBatch, 155 | const int nClasses, 156 | const int nBboxesPerLoc, 157 | const int coords, 158 | const int l0_w, 159 | const int l0_h, 160 | const int nCells, 161 | float* dpData_unordered[], 162 | float* dpData) 163 | { 164 | long i = blockIdx.x * nthdsPerCTA + threadIdx.x; 165 | const int bboxMemLen = (nClasses + coords + 1) * nCells; 166 | const int batchMemLen = nBboxesPerLoc * bboxMemLen; 167 | const long range = nBatch * batchMemLen; 168 | if (i < range) // voc<266175 coco<904995 wrt. 
416*416 input 169 | { 170 | int b = i / batchMemLen; 171 | int bboxIdx = (i % batchMemLen) / bboxMemLen; 172 | int channelIdx = ((i % batchMemLen) % bboxMemLen) / nCells; 173 | int locIdx = (i % batchMemLen) % nCells; 174 | int locLayer, cnt_offset = 1+2*2+4*4; 175 | for(int j = nOutputLayer-1; j >= 0; --j){ 176 | cnt_offset -= (1<= cnt_offset*l0_w*l0_h){ 178 | locLayer = j; 179 | break; 180 | } 181 | } 182 | dpData[i] = dpData_unordered[locLayer]\ 183 | [b*nBboxesPerLoc*(nClasses+coords+1)*(1< 206 | <<>> 207 | (nBatch, nClasses, nBboxesPerLoc, coords, l0_w, l0_h, nCells, dpData_unordered, dpData); 208 | 209 | } 210 | -------------------------------------------------------------------------------- /regionLayer.h: -------------------------------------------------------------------------------- 1 | #ifndef REGION_LAYER_H_ 2 | #define REGION_LAYER_H_ 3 | 4 | #include "nvUtils.h" 5 | 6 | class regionParams{ 7 | public: 8 | int classes; // number of class 9 | int n; // number of bbox per location 10 | int coords; // number of coords (4) 11 | int w; // w (darknet) 12 | int h; // h (darknet), in total, we have w*h*n bbox 13 | int outputs; // outputs (darknet), output dimension of previous layer, 14 | 15 | bool softmax; // 1 for softmax process 16 | int background; // background index 17 | }; 18 | 19 | typedef struct{ 20 | float x, y, w, h; 21 | } box; 22 | 23 | void regionLayer_gpu(const int batch, 24 | const int C, 25 | const int nCells, 26 | const int num, 27 | const int coords, 28 | const int classes, 29 | const float * input, 30 | float * output, 31 | cudaStream_t stream); 32 | 33 | void reorgOutput_gpu(const int nBatch, 34 | const int nClasses, 35 | const int nBboxesPerLoc, 36 | const int coords, 37 | const int l0_w, 38 | const int l0_h, 39 | const int nCells, 40 | float* dpData_unordered[], 41 | float* dpData, 42 | const long nData, 43 | cudaStream_t stream); 44 | #endif 45 | -------------------------------------------------------------------------------- /regionLayer.h~: -------------------------------------------------------------------------------- 1 | #ifndef REGION_LAYER_H_ 2 | #define REGION_LAYER_H_ 3 | 4 | #include "nvUtils.h" 5 | 6 | class regionParams{ 7 | public: 8 | int classes; // number of class 9 | int n; // number of bbox per location 10 | int coords; // number of coords (4) 11 | int w; // w (darknet) 12 | int h; // h (darknet), in total, we have w*h*n bbox 13 | int outputs; // outputs (darknet), output dimension of previous layer, 14 | 15 | bool softmax; // 1 for softmax process 16 | int background; // background index 17 | }; 18 | 19 | typedef struct{ 20 | float x, y, w, h; 21 | } box; 22 | 23 | void regionLayer_gpu(const int batch, 24 | const int C, 25 | const int nCells, 26 | const int num, 27 | const int coords, 28 | const int classes, 29 | const float * input, 30 | float * output, 31 | cudaStream_t stream); 32 | 33 | void reorgOutput_gpu(const int nBatch, 34 | const int nClasses, 35 | const int nBboxesPerLoc, 36 | const int coords, 37 | const int l0_w, 38 | const int l0_h, 39 | const int nCells, 40 | float* __constant__ dpData_unordered[], 41 | float* dpData, 42 | const long nData, 43 | cudaStream_t stream); 44 | #endif 45 | -------------------------------------------------------------------------------- /results/calc_mAP.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | from voc_eval import voc_eval 4 | 5 | 6 | names = ['aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 7 | 'bus', 'car', 
'cat', 'chair', 'cow', 8 | 'diningtable', 'dog', 'horse', 'motorbike', 'person', 9 | 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor'] 10 | 11 | iou_threshold = float(sys.argv[1]) 12 | print 'IOU threshold %.5f' % iou_threshold 13 | 14 | mAP = [] 15 | for name in names: 16 | recall, precision, ap = voc_eval( 17 | # change this to your results file 18 | './comp4_det_test_{}.txt', 19 | # change these 2 to your voc dataset 20 | '/home/weisong/_yolov3/YOLOv3-darknet/data_voc/VOCdevkit/VOC2007/Annotations/{}.xml', 21 | '/home/weisong/_yolov3/YOLOv3-darknet/data_voc/VOCdevkit/VOC2007/ImageSets/Main/test.txt', 22 | name, 23 | './cache/', 24 | iou_threshold) 25 | 26 | print "%-15s %.5f" % (name, ap) 27 | mAP.append(ap) 28 | 29 | ret = (float)(sum(mAP) / len(mAP)) 30 | print 'mAP = %.5f' % ret 31 | exit(ret) 32 | -------------------------------------------------------------------------------- /results/calc_mAP.py~: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | from voc_eval import voc_eval 4 | 5 | 6 | names = ['aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 7 | 'bus', 'car', 'cat', 'chair', 'cow', 8 | 'diningtable', 'dog', 'horse', 'motorbike', 'person', 9 | 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor'] 10 | 11 | iou_threshold = float(sys.argv[1]) 12 | print 'IOU threshold %.5f' % iou_threshold 13 | 14 | mAP = [] 15 | for name in names: 16 | recall, precision, ap = voc_eval( 17 | # change this to your results file 18 | './comp4_det_test_{}.txt', 19 | # change these 2 to your voc dataset 20 | '/home/weisong/_yolov3/YOLOv3-darknet/data_voc/VOCdevkit/VOC2007/Annotations/{}.xml', 21 | '/home/weisong/_yolov3/YOLOv3-darknet/data_voc/VOCdevkit/VOC2007/ImageSets/Main/test.txt', 22 | name, 23 | './cache/', 24 | iou_threshold) 25 | 26 | print "%-15s %.5f" % (name, ap) 27 | mAP.append(ap) 28 | 29 | print 'mAP = %.5f' % (sum(mAP) / len(mAP)) 30 | exit((sum(mAP) / len(mAP))) 31 | -------------------------------------------------------------------------------- /results/mAP.csv: -------------------------------------------------------------------------------- 1 | 0.450000 0.003000 0.737443 2 | 0.050000 0.005000 0.685275 3 | 0.050000 0.010000 0.679230 4 | 0.050000 0.015000 0.670850 5 | 0.050000 0.020000 0.671227 6 | 0.050000 0.025000 0.666309 7 | 0.050000 0.030000 0.661274 8 | 0.100000 0.005000 0.700229 9 | 0.100000 0.010000 0.694647 10 | 0.100000 0.015000 0.688362 11 | 0.100000 0.020000 0.683568 12 | 0.100000 0.025000 0.678220 13 | 0.100000 0.030000 0.675738 14 | 0.150000 0.005000 0.713384 15 | 0.150000 0.010000 0.706828 16 | 0.150000 0.015000 0.699336 17 | 0.150000 0.020000 0.697449 18 | 0.150000 0.025000 0.692435 19 | 0.150000 0.030000 0.686543 20 | 0.200000 0.005000 0.715730 21 | 0.200000 0.010000 0.711944 22 | 0.200000 0.015000 0.705454 23 | 0.200000 0.020000 0.701342 24 | 0.200000 0.025000 0.697500 25 | 0.200000 0.030000 0.694639 26 | 0.250000 0.005000 0.725145 27 | 0.250000 0.010000 0.718843 28 | 0.250000 0.015000 0.711384 29 | 0.250000 0.020000 0.706877 30 | 0.250000 0.025000 0.703751 31 | 0.250000 0.030000 0.700477 32 | 0.300000 0.005000 0.724348 33 | 0.300000 0.010000 0.720032 34 | 0.300000 0.015000 0.710854 35 | 0.300000 0.020000 0.711754 36 | 0.300000 0.025000 0.702736 37 | 0.300000 0.030000 0.704244 38 | 0.350000 0.005000 0.726985 39 | 0.350000 0.010000 0.721257 40 | 0.350000 0.015000 0.715500 41 | 0.350000 0.020000 0.711058 42 | 0.350000 0.025000 0.703841 43 | 0.350000 0.030000 0.704462 44 | 0.400000 0.005000 
0.727567 45 | 0.400000 0.010000 0.719089 46 | 0.400000 0.015000 0.718993 47 | 0.400000 0.020000 0.708048 48 | 0.400000 0.025000 0.707436 49 | 0.400000 0.030000 0.704881 50 | 0.400000 0.001000 0.744281 51 | 0.400000 0.002000 0.740347 52 | 0.400000 0.003000 0.736656 53 | 0.400000 0.004000 0.731862 54 | 0.400000 0.005000 0.730218 55 | 0.450000 0.001000 0.745715 56 | 0.450000 0.002000 0.741461 57 | 0.450000 0.003000 0.738510 58 | 0.450000 0.004000 0.732984 59 | 0.450000 0.005000 0.733175 60 | 0.500000 0.001000 0.745760 61 | 0.500000 0.002000 0.741433 62 | 0.500000 0.003000 0.739151 63 | 0.500000 0.004000 0.731776 64 | 0.500000 0.005000 0.732461 65 | 0.550000 0.001000 0.741887 66 | 0.550000 0.002000 0.735486 67 | 0.550000 0.003000 0.735518 68 | 0.550000 0.004000 0.732146 69 | 0.550000 0.005000 0.730386 70 | 0.600000 0.001000 0.736716 71 | 0.600000 0.002000 0.732227 72 | 0.600000 0.003000 0.729687 73 | 0.600000 0.004000 0.723691 74 | 0.600000 0.005000 0.725624 75 | 0.650000 0.001000 0.722332 76 | 0.650000 0.002000 0.719295 77 | 0.650000 0.003000 0.716194 78 | 0.650000 0.004000 0.712005 79 | 0.650000 0.005000 0.713273 80 | 0.700000 0.001000 0.697123 81 | 0.700000 0.002000 0.692214 82 | 0.700000 0.003000 0.694202 83 | 0.700000 0.004000 0.688393 84 | 0.700000 0.005000 0.690033 85 | -------------------------------------------------------------------------------- /results/mAP.csv~: -------------------------------------------------------------------------------- 1 | 0.450000 0.003000 0.737443 2 | 0.000000 0.000000 0.0000000.000000 0.000000 0.0000000.000000 0.000000 99.0000000.000000 0.000000 0.0000000.000000 0.000000 0.00000047.000000 0.000000 0.0000000.000000 0.000000 0.0000000.000000 114.000000 0.0000000.000000 0.000000 0.0000000.000000 0.000000 0.0000000.000000 0.000000 0.00000046.000000 0.000000 0.000000 3 | 0.000000 0.005000 0.0000000.000000 0.000000 0.0000000.000000 0.000000 99.0000000.000000 0.000000 0.0000000.000000 0.000000 0.00000047.000000 0.000000 0.0000000.000000 0.000000 0.0000000.000000 114.000000 0.0000000.000000 0.000000 0.0000000.000000 0.000000 0.0000000.000000 0.000000 0.00000046.000000 0.000000 0.000000 4 | 0.000000 0.010000 0.0000000.000000 0.000000 0.0000000.000000 0.000000 99.0000000.000000 0.000000 0.0000000.000000 0.000000 0.00000047.000000 0.000000 0.0000000.000000 0.000000 0.0000000.000000 114.000000 0.0000000.000000 0.000000 0.0000000.000000 0.000000 0.0000000.000000 0.000000 0.00000046.000000 0.000000 0.000000 5 | 0.050000 0.005000 0.685275 6 | 0.050000 0.010000 0.679230 7 | -------------------------------------------------------------------------------- /results/voc_eval.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast/er R-CNN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Bharath Hariharan 5 | # -------------------------------------------------------- 6 | 7 | import xml.etree.ElementTree as ET 8 | import os 9 | import cPickle 10 | import numpy as np 11 | 12 | def parse_rec(filename): 13 | """ Parse a PASCAL VOC xml file """ 14 | tree = ET.parse(filename) 15 | objects = [] 16 | for obj in tree.findall('object'): 17 | obj_struct = {} 18 | obj_struct['name'] = obj.find('name').text 19 | obj_struct['pose'] = obj.find('pose').text 20 | obj_struct['truncated'] = int(obj.find('truncated').text) 21 | obj_struct['difficult'] = int(obj.find('difficult').text) 22 | bbox = obj.find('bndbox') 23 | obj_struct['bbox'] = 
[int(bbox.find('xmin').text), 24 | int(bbox.find('ymin').text), 25 | int(bbox.find('xmax').text), 26 | int(bbox.find('ymax').text)] 27 | objects.append(obj_struct) 28 | 29 | return objects 30 | 31 | def voc_ap(rec, prec, use_07_metric=False): 32 | """ ap = voc_ap(rec, prec, [use_07_metric]) 33 | Compute VOC AP given precision and recall. 34 | If use_07_metric is true, uses the 35 | VOC 07 11 point method (default:False). 36 | """ 37 | if use_07_metric: 38 | # 11 point metric 39 | ap = 0. 40 | for t in np.arange(0., 1.1, 0.1): 41 | if np.sum(rec >= t) == 0: 42 | p = 0 43 | else: 44 | p = np.max(prec[rec >= t]) 45 | ap = ap + p / 11. 46 | else: 47 | # correct AP calculation 48 | # first append sentinel values at the end 49 | mrec = np.concatenate(([0.], rec, [1.])) 50 | mpre = np.concatenate(([0.], prec, [0.])) 51 | 52 | # compute the precision envelope 53 | for i in range(mpre.size - 1, 0, -1): 54 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 55 | 56 | # to calculate area under PR curve, look for points 57 | # where X axis (recall) changes value 58 | i = np.where(mrec[1:] != mrec[:-1])[0] 59 | 60 | # and sum (\Delta recall) * prec 61 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 62 | return ap 63 | 64 | def voc_eval(detpath, 65 | annopath, 66 | imagesetfile, 67 | classname, 68 | cachedir, 69 | ovthresh=0.5, 70 | use_07_metric=False): 71 | """rec, prec, ap = voc_eval(detpath, 72 | annopath, 73 | imagesetfile, 74 | classname, 75 | [ovthresh], 76 | [use_07_metric]) 77 | 78 | Top level function that does the PASCAL VOC evaluation. 79 | 80 | detpath: Path to detections 81 | detpath.format(classname) should produce the detection results file. 82 | annopath: Path to annotations 83 | annopath.format(imagename) should be the xml annotations file. 84 | imagesetfile: Text file containing the list of images, one image per line. 
85 | classname: Category name (duh) 86 | cachedir: Directory for caching the annotations 87 | [ovthresh]: Overlap threshold (default = 0.5) 88 | [use_07_metric]: Whether to use VOC07's 11 point AP computation 89 | (default False) 90 | """ 91 | # assumes detections are in detpath.format(classname) 92 | # assumes annotations are in annopath.format(imagename) 93 | # assumes imagesetfile is a text file with each line an image name 94 | # cachedir caches the annotations in a pickle file 95 | 96 | # first load gt 97 | if not os.path.isdir(cachedir): 98 | os.mkdir(cachedir) 99 | cachefile = os.path.join(cachedir, 'annots.pkl') 100 | # read list of images 101 | with open(imagesetfile, 'r') as f: 102 | lines = f.readlines() 103 | imagenames = [x.strip() for x in lines] 104 | 105 | if not os.path.isfile(cachefile): 106 | # load annots 107 | recs = {} 108 | for i, imagename in enumerate(imagenames): 109 | recs[imagename] = parse_rec(annopath.format(imagename)) 110 | if i % 100 == 0: 111 | print 'Reading annotation for {:d}/{:d}'.format( 112 | i + 1, len(imagenames)) 113 | # save 114 | print 'Saving cached annotations to {:s}'.format(cachefile) 115 | with open(cachefile, 'w') as f: 116 | cPickle.dump(recs, f) 117 | else: 118 | # load 119 | with open(cachefile, 'r') as f: 120 | recs = cPickle.load(f) 121 | 122 | # extract gt objects for this class 123 | class_recs = {} 124 | npos = 0 125 | for imagename in imagenames: 126 | R = [obj for obj in recs[imagename] if obj['name'] == classname] 127 | bbox = np.array([x['bbox'] for x in R]) 128 | difficult = np.array([x['difficult'] for x in R]).astype(np.bool) 129 | det = [False] * len(R) 130 | npos = npos + sum(~difficult) 131 | class_recs[imagename] = {'bbox': bbox, 132 | 'difficult': difficult, 133 | 'det': det} 134 | 135 | # read dets 136 | detfile = detpath.format(classname) 137 | with open(detfile, 'r') as f: 138 | lines = f.readlines() 139 | 140 | splitlines = [x.strip().split(' ') for x in lines] 141 | image_ids = [x[0] for x in splitlines] 142 | confidence = np.array([float(x[1]) for x in splitlines]) 143 | BB = np.array([[float(z) for z in x[2:]] for x in splitlines]) 144 | 145 | # sort by confidence 146 | sorted_ind = np.argsort(-confidence) 147 | sorted_scores = np.sort(-confidence) 148 | BB = BB[sorted_ind, :] 149 | image_ids = [image_ids[x] for x in sorted_ind] 150 | 151 | # go down dets and mark TPs and FPs 152 | nd = len(image_ids) 153 | tp = np.zeros(nd) 154 | fp = np.zeros(nd) 155 | for d in range(nd): 156 | R = class_recs[image_ids[d]] 157 | bb = BB[d, :].astype(float) 158 | ovmax = -np.inf 159 | BBGT = R['bbox'].astype(float) 160 | 161 | if BBGT.size > 0: 162 | # compute overlaps 163 | # intersection 164 | ixmin = np.maximum(BBGT[:, 0], bb[0]) 165 | iymin = np.maximum(BBGT[:, 1], bb[1]) 166 | ixmax = np.minimum(BBGT[:, 2], bb[2]) 167 | iymax = np.minimum(BBGT[:, 3], bb[3]) 168 | iw = np.maximum(ixmax - ixmin + 1., 0.) 169 | ih = np.maximum(iymax - iymin + 1., 0.) 170 | inters = iw * ih 171 | 172 | # union 173 | uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) + 174 | (BBGT[:, 2] - BBGT[:, 0] + 1.) * 175 | (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters) 176 | 177 | overlaps = inters / uni 178 | ovmax = np.max(overlaps) 179 | jmax = np.argmax(overlaps) 180 | 181 | if ovmax > ovthresh: 182 | if not R['difficult'][jmax]: 183 | if not R['det'][jmax]: 184 | tp[d] = 1. 185 | R['det'][jmax] = 1 186 | else: 187 | fp[d] = 1. 188 | else: 189 | fp[d] = 1. 
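    # Marking logic above: a detection is a true positive only when its best-overlapping
    # ground-truth box for this class has IoU > ovthresh, is not flagged 'difficult', and has
    # not already been claimed by a higher-scoring detection; an IoU <= ovthresh or a repeat
    # match to an already-claimed box is a false positive, while matches to 'difficult' boxes
    # are ignored entirely (neither tp nor fp), which is also why npos excludes difficult objects.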
190 | 191 | # compute precision recall 192 | fp = np.cumsum(fp) 193 | tp = np.cumsum(tp) 194 | rec = tp / float(npos) 195 | # avoid divide by zero in case the first detection matches a difficult 196 | # ground truth 197 | prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) 198 | ap = voc_ap(rec, prec, use_07_metric) 199 | 200 | return rec, prec, ap 201 | -------------------------------------------------------------------------------- /results/voc_eval.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangjinsong3/YOLO-V3-Acceleration/384746bb6d8f61c8def70bbc0b5e04b98c60356e/results/voc_eval.pyc -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | #DEBUG="gdb --args " 4 | DEBUG="cuda-gdb --args " 5 | #DEBUG="ddd --debugger cuda-gdb --args " 6 | #DEBUG="cgdb -d cuda-gdb --args " 7 | #DEBUG="cuda-memcheck " 8 | 9 | MODEL="./data/model/yolov3-voc-relu.caffemodel" 10 | DEPLOY="./data/model/yolov3-voc-relu.prototxt" 11 | # CALIBRATION="./data/model/CalibrationTable" 12 | SYNSET="./data/model/voc.names" 13 | IMAGELIST="./data/images/test.txt" 14 | 15 | DEV_ID=$1 16 | NMS=0.45 # $2 17 | CONF=0.001 # $3 18 | MODE=0 # 0 fp32, 1 fp16, 2 int8 19 | BATCH_SIZE=1 20 | N_ITERS=1 21 | 22 | # Add this argument for INT8 inference. 23 | # Note that only pascal GPU support INT8, like NVIDIA Tesla P4, P40 24 | 25 | 26 | if [ ${MODE} -eq 2 ] 27 | then 28 | ${DEBUG} ./bin/runYOLOv3 -devID=${DEV_ID} \ 29 | -batchSize=${BATCH_SIZE} \ 30 | -nIters=${N_ITERS} \ 31 | -deployFile=${DEPLOY} \ 32 | -modelFile=${MODEL} \ 33 | -synsetFile=${SYNSET} \ 34 | -cali=${CALIBRATION} \ 35 | -imageFile=${IMAGELIST} \ 36 | -nmsThreshold=${NMS} \ 37 | -confThreshold=${CONF} 38 | #2>&1 | tee ./log/log.txt 39 | else 40 | ${DEBUG} ./bin/runYOLOv3 -devID=${DEV_ID} \ 41 | -batchSize=${BATCH_SIZE} \ 42 | -nIters=${N_ITERS} \ 43 | -deployFile=${DEPLOY} \ 44 | -modelFile=${MODEL} \ 45 | -synsetFile=${SYNSET} \ 46 | -imageFile=${IMAGELIST} \ 47 | -nmsThreshold=${NMS} \ 48 | -confThreshold=${CONF} 49 | #2>&1 | tee ./log/log.txt 50 | fi 51 | -------------------------------------------------------------------------------- /run.sh~: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | #DEBUG="gdb --args " 4 | DEBUG="cuda-gdb --args " 5 | #DEBUG="ddd --debugger cuda-gdb --args " 6 | #DEBUG="cgdb -d cuda-gdb --args " 7 | #DEBUG="cuda-memcheck " 8 | 9 | MODEL="./data/model/yolov3-voc-relu.caffemodel" 10 | DEPLOY="./data/model/yolov3-voc-relu.prototxt" 11 | # CALIBRATION="./data/model/CalibrationTable" 12 | SYNSET="./data/model/voc.names" 13 | IMAGELIST="./data/images/test.txt" 14 | 15 | DEV_ID=$1 16 | NMS=0.45 # $2 17 | CONF=0.001 # $3 18 | MODE=0 # 0 fp32, 1 fp16, 2 int8 19 | BATCH_SIZE=1 20 | N_ITERS=1 21 | 22 | # Add this argument for INT8 inference. 
23 | # Note that only pascal GPU support INT8, like NVIDIA Tesla P4, P40 24 | 25 | 26 | if [ ${MODE} -eq 2 ] 27 | then 28 | ${DEBUG} ./bin/runYOLOv3 -devID=${DEV_ID} \ 29 | -batchSize=${BATCH_SIZE} \ 30 | -nIters=${N_ITERS} \ 31 | -deployFile=${DEPLOY} \ 32 | -modelFile=${MODEL} \ 33 | -synsetFile=${SYNSET} \ 34 | -cali=${CALIBRATION} \ 35 | -imageFile=${IMAGELIST} \ 36 | -nmsThreshold=${NMS} \ 37 | -confThreshold=${CONF} 38 | #2>&1 | tee ./log/log.txt 39 | else 40 | ${DEBUG} ./bin/runYOLOv3 -devID=${DEV_ID} \ 41 | -batchSize=${BATCH_SIZE} \ 42 | -nIters=${N_ITERS} \ 43 | -deployFile=${DEPLOY} \ 44 | -modelFile=${MODEL} \ 45 | -synsetFile=${SYNSET} \ 46 | -imageFile=${IMAGELIST} \ 47 | -nmsThreshold=${NMS} \ 48 | -confThreshold=${CONF} 49 | #2>&1 | tee ./log/log.txt 50 | fi 51 | -------------------------------------------------------------------------------- /src/bboxParser.h: -------------------------------------------------------------------------------- 1 | #ifndef BBOX_PARSER_H 2 | #define BBOX_PARSER_H 3 | 4 | #include 5 | // cub for sort 6 | #include "regionLayer.h" 7 | 8 | void sortScoresPerImage_gpu( 9 | const int nBatch, 10 | const int nItemsPerImage, 11 | void * unsorted_scores, 12 | void * unsorted_bbox_indices, 13 | void * sorted_scores, 14 | void * sorted_bbox_indices, 15 | void * workspace, 16 | const size_t maxSizeofWorkspaceInByte, 17 | cudaStream_t stream); 18 | 19 | void splitOutputData_gpu( 20 | const int nBatch, // batch 21 | const int nClasses, 22 | const int nBboxesPerLoc, // #box 23 | const int coords, // x,y,w,h 24 | const int l0_w, 25 | const int l0_h, 26 | const int nCells, 27 | const bool background, // use background conf or not 28 | const bool only_objectness, // no class conf 29 | const float thres, 30 | const float* predictions, 31 | const float* biases, 32 | float* probes, 33 | box* bboxes, 34 | cudaStream_t stream); 35 | 36 | 37 | void correct_region_boxes_gpu( 38 | const int nBatch, // batch 39 | const int nClasses, 40 | const int nBboxesPerLoc, // #box 41 | const int nCells, 42 | const int image_w, 43 | const int image_h, 44 | const int net_input_w, 45 | const int net_input_h, 46 | box* bboxes, 47 | cudaStream_t stream); 48 | 49 | 50 | void sortScoresPerClass_gpu( 51 | const int nBatch, 52 | const int nClasses, 53 | const int nBboxesPerLoc, 54 | const void * probes, 55 | void * sorted_boxIdx, 56 | void * workspace, 57 | const size_t maxSizeofWorkspaceInByte, 58 | cudaStream_t stream); 59 | 60 | 61 | void allClassNMS_gpu( 62 | const int nBatch, //batch 63 | const int nClasses, 64 | const int nBboxesPerLoc, 65 | const int nCells, 66 | const float nms_threshold, 67 | void * bboxes, 68 | void * probes, 69 | void * afterNMS_probes, 70 | void * indexes, 71 | void * afterNMS_indexes, 72 | cudaStream_t stream); 73 | 74 | 75 | size_t getWorkspaceSizeInByte( 76 | const int nBatch, 77 | const int nClasses, 78 | const int nBboxesPerLoc, 79 | const int nCells); 80 | 81 | #endif 82 | -------------------------------------------------------------------------------- /src/bboxParser.h~: -------------------------------------------------------------------------------- 1 | #ifndef BBOX_PARSER_H 2 | #define BBOX_PARSER_H 3 | 4 | #include 5 | // cub for sort 6 | #include "regionLayer.h" 7 | 8 | void sortScoresPerImage_gpu( 9 | const int nBatch, 10 | const int nItemsPerImage, 11 | void * unsorted_scores, 12 | void * unsorted_bbox_indices, 13 | void * sorted_scores, 14 | void * sorted_bbox_indices, 15 | void * workspace, 16 | const size_t maxSizeofWorkspaceInByte, 17 | 
cudaStream_t stream); 18 | 19 | void splitOutputData_gpu( 20 | const int nBatch, // batch 21 | const int nClasses, 22 | const int nBboxesPerLoc, // #box 23 | const int coords, // x,y,w,h 24 | const int l0_w, 25 | const int l0_h, 26 | const int nCells, 27 | const bool background, // use background conf or not 28 | const bool only_objectness, // no class conf 29 | const float thres, 30 | const float* predictions, 31 | const float* biases, 32 | float* probes, 33 | box* bboxes, 34 | cudaStream_t stream); 35 | 36 | 37 | void correct_region_boxes_gpu( 38 | const int nBatch, // batch 39 | const int nClasses, 40 | const int nBboxesPerLoc, // #box 41 | const int nCells, 42 | const int image_w, 43 | const int image_h, 44 | const int net_input_w, 45 | const int net_input_h, 46 | box* bboxes, 47 | cudaStream_t stream); 48 | 49 | 50 | void sortScoresPerClass_gpu( 51 | const int nBatch, 52 | const int nClasses, 53 | const int nBboxesPerLoc, 54 | const void * probes, 55 | void * sorted_boxIdx, 56 | void * workspace, 57 | const size_t maxSizeofWorkspaceInByte, 58 | cudaStream_t stream); 59 | 60 | 61 | void allClassNMS_gpu( 62 | const int nBatch, //batch 63 | const int nClasses, 64 | const int nBboxesPerLoc, 65 | const int w, 66 | const int h, 67 | const float nms_threshold, 68 | void * bboxes, 69 | void * probes, 70 | void * afterNMS_probes, 71 | void * indexes, 72 | void * afterNMS_indexes, 73 | cudaStream_t stream); 74 | 75 | 76 | size_t getWorkspaceSizeInByte( 77 | const int nBatch, 78 | const int nClasses, 79 | const int nBboxesPerLoc, 80 | const int w, 81 | const int h); 82 | 83 | #endif 84 | -------------------------------------------------------------------------------- /src/classifier.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 1993-2016 NVIDIA Corporation. All rights reserved. 3 | * 4 | * NOTICE TO USER: 5 | * 6 | * This source code is subject to NVIDIA ownership rights under U.S. and 7 | * international Copyright laws. 8 | * 9 | * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE 10 | * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR 11 | * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH 12 | * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF 13 | * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 14 | * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, 15 | * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS 16 | * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE 17 | * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE 18 | * OR PERFORMANCE OF THIS SOURCE CODE. 19 | * 20 | * U.S. Government End Users. This source code is a "commercial item" as 21 | * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of 22 | * "commercial computer software" and "commercial computer software 23 | * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) 24 | * and is provided to the U.S. Government only as a commercial end item. 25 | * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through 26 | * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the 27 | * source code with only those rights set forth herein. 
28 | */ 29 | 30 | #ifndef CLASSIFIER_H 31 | #define CLASSIFIER_H 32 | 33 | #include 34 | #include "NvInfer.h" 35 | #include 36 | 37 | using namespace nvinfer1; 38 | 39 | typedef struct INFER_OUTPUT_PARAMS_ { 40 | int nBatchSize_; 41 | std::vector vpInferResults_; 42 | std::vector vnLens_; 43 | std::vector vOutputDims_; 44 | } INFER_OUTPUT_PARAMS; 45 | 46 | class IClassifier { 47 | public: 48 | virtual void setInputData(float *pBGR, 49 | const int nWidth, 50 | const int nHeight, 51 | const int nBatchSize) = 0; 52 | 53 | virtual void forward(INFER_OUTPUT_PARAMS *) = 0; 54 | 55 | virtual int getInferWidth() const = 0; 56 | 57 | virtual int getInferHeight() const = 0; 58 | 59 | virtual std::vector getMeanValues() const = 0; 60 | 61 | protected: 62 | virtual ~IClassifier() {} 63 | }; 64 | 65 | #endif 66 | 67 | 68 | -------------------------------------------------------------------------------- /src/common.cu: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | 3 | // alignptr 4 | int8_t * alignPtr(int8_t * ptr, uintptr_t to) 5 | { 6 | uintptr_t addr = (uintptr_t)ptr; 7 | if (addr % to) { 8 | addr += to - addr % to; 9 | } 10 | return (int8_t *)addr; 11 | } 12 | 13 | // calc next ptr (consider alignment) 14 | int8_t * nextWorkspacePtr(int8_t * ptr, uintptr_t previousWorkspaceSize) 15 | { 16 | uintptr_t addr = (uintptr_t) ptr; 17 | addr += previousWorkspaceSize; 18 | return alignPtr((int8_t *)addr, CUDA_MEM_ALIGN); 19 | } 20 | 21 | 22 | template 23 | __launch_bounds__ (nthds_per_cta) 24 | __global__ void setUniformOffsets_kernel( 25 | const int num_segments, 26 | const int offset, 27 | int * d_offsets) 28 | { 29 | const int idx = blockIdx.x * nthds_per_cta + threadIdx.x; 30 | if (idx <= num_segments){ 31 | d_offsets[idx] = idx * offset; 32 | } 33 | } 34 | 35 | void setUniformOffsets( 36 | const int num_segments, 37 | const int offset, 38 | int * d_offsets, 39 | cudaStream_t stream) 40 | { 41 | const int blockSize = 32; 42 | const int gridSize = (num_segments + 1 + blockSize - 1) / blockSize; 43 | setUniformOffsets_kernel 44 | <<>> 45 | (num_segments, offset, d_offsets); 46 | } 47 | -------------------------------------------------------------------------------- /src/common.h: -------------------------------------------------------------------------------- 1 | #ifndef COMMON_H_ 2 | #define COMMON_H_ 3 | 4 | #include 5 | 6 | #define CUDA_MEM_ALIGN 256 7 | 8 | // alignptr 9 | int8_t * alignPtr(int8_t * ptr, uintptr_t to); 10 | 11 | int8_t * nextWorkspacePtr(int8_t * ptr, uintptr_t previousWorkspaceSize); 12 | 13 | void setUniformOffsets(const int num_segments, const int offset, int * d_offsets, cudaStream_t stream); 14 | 15 | /** 16 | * Determine the usage of temporary memory for cub sort 17 | * The cub::DeviceSegmentedRadixSort can be used for batched (segmented) sort. 
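 * The usual cub calling pattern applies: a first call with a NULL temporary-storage pointer
 * (as in the body below) only writes the required size into temp_storage_bytes; the caller
 * then allocates that many bytes of device memory and calls the same routine again with the
 * real buffer to perform the actual segmented sort.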
18 | */ 19 | template 20 | size_t cubSortPairsWorkspaceSize(int num_items, int num_segments) 21 | { 22 | size_t temp_storage_bytes = 0; 23 | cub::DeviceSegmentedRadixSort::SortPairsDescending( 24 | (void *)NULL, temp_storage_bytes, 25 | (const KeyT *)NULL, (KeyT *)NULL, 26 | (const ValueT *)NULL, (ValueT *)NULL, 27 | num_items, // # items 28 | num_segments, // # segments 29 | (const int *)NULL, (const int *)NULL); 30 | return temp_storage_bytes; 31 | } 32 | 33 | #endif 34 | -------------------------------------------------------------------------------- /src/common.h~: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangjinsong3/YOLO-V3-Acceleration/384746bb6d8f61c8def70bbc0b5e04b98c60356e/src/common.h~ -------------------------------------------------------------------------------- /src/draw.h: -------------------------------------------------------------------------------- 1 | #include "preproc_yolov3.h" 2 | #include "regionLayer.h" 3 | #include "bboxParser.h" 4 | 5 | void draw_box(image a, int x1, int y1, int x2, int y2, float r, float g, float b) 6 | { 7 | int i; 8 | if(x1 < 0) x1 = 0; 9 | if(x1 >= a.w) x1 = a.w-1; 10 | if(x2 < 0) x2 = 0; 11 | if(x2 >= a.w) x2 = a.w-1; 12 | 13 | if(y1 < 0) y1 = 0; 14 | if(y1 >= a.h) y1 = a.h-1; 15 | if(y2 < 0) y2 = 0; 16 | if(y2 >= a.h) y2 = a.h-1; 17 | 18 | for(i = x1; i <= x2; ++i){ 19 | a.data[i + y1*a.w + 0*a.w*a.h] = r; 20 | a.data[i + y2*a.w + 0*a.w*a.h] = r; 21 | 22 | a.data[i + y1*a.w + 1*a.w*a.h] = g; 23 | a.data[i + y2*a.w + 1*a.w*a.h] = g; 24 | 25 | a.data[i + y1*a.w + 2*a.w*a.h] = b; 26 | a.data[i + y2*a.w + 2*a.w*a.h] = b; 27 | } 28 | for(i = y1; i <= y2; ++i){ 29 | a.data[x1 + i*a.w + 0*a.w*a.h] = r; 30 | a.data[x2 + i*a.w + 0*a.w*a.h] = r; 31 | 32 | a.data[x1 + i*a.w + 1*a.w*a.h] = g; 33 | a.data[x2 + i*a.w + 1*a.w*a.h] = g; 34 | 35 | a.data[x1 + i*a.w + 2*a.w*a.h] = b; 36 | a.data[x2 + i*a.w + 2*a.w*a.h] = b; 37 | } 38 | } 39 | 40 | 41 | 42 | void draw_box_width(image a, int x1, int y1, int x2, int y2, int w) 43 | { 44 | int i; 45 | for(i = 0; i < w; ++i){ 46 | draw_box(a, x1+i, y1+i, x2-i, y2-i, 255, 0, 0); 47 | } 48 | } 49 | 50 | 51 | void draw_detections(image im, 52 | int batchIdx, 53 | float thresh, 54 | box *boxes, 55 | float *probs, 56 | int * indexes, 57 | int sizeOfClass, 58 | int sizeOfBatch) 59 | { 60 | int n = batchIdx; 61 | // int sizeOfClass = l.n * l.h * l.w; 62 | // int sizeOfBatch = l.classes * sizeOfClass; 63 | 64 | int count = 0; 65 | for(int i = 0; i < sizeOfBatch; ++i){ 66 | int id = n * sizeOfBatch + i; 67 | int indexes_idx = indexes[id]; 68 | 69 | if (probs[id] > thresh){ 70 | int category = (indexes_idx % sizeOfBatch) / sizeOfClass; 71 | int boxId = indexes_idx % sizeOfClass; 72 | 73 | int width = im.h * .006; 74 | box b = boxes[boxId]; 75 | 76 | int left = (b.x-b.w/2.)*im.w; 77 | int right = (b.x+b.w/2.)*im.w; 78 | int top = (b.y-b.h/2.)*im.h; 79 | int bot = (b.y+b.h/2.)*im.h; 80 | 81 | if(left < 0) left = 0; 82 | if(right > im.w-1) right = im.w-1; 83 | if(top < 0) top = 0; 84 | if(bot > im.h-1) bot = im.h-1; 85 | 86 | draw_box_width(im, left, top, right, bot, width); 87 | } 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /src/draw.h~: -------------------------------------------------------------------------------- 1 | #include "preproc_yolov2.h" 2 | #include "regionLayer.h" 3 | #include "bboxParser.h" 4 | 5 | void draw_box(image a, int x1, int y1, int x2, int y2, float r, float g, float b) 6 | 
{ 7 | int i; 8 | if(x1 < 0) x1 = 0; 9 | if(x1 >= a.w) x1 = a.w-1; 10 | if(x2 < 0) x2 = 0; 11 | if(x2 >= a.w) x2 = a.w-1; 12 | 13 | if(y1 < 0) y1 = 0; 14 | if(y1 >= a.h) y1 = a.h-1; 15 | if(y2 < 0) y2 = 0; 16 | if(y2 >= a.h) y2 = a.h-1; 17 | 18 | for(i = x1; i <= x2; ++i){ 19 | a.data[i + y1*a.w + 0*a.w*a.h] = r; 20 | a.data[i + y2*a.w + 0*a.w*a.h] = r; 21 | 22 | a.data[i + y1*a.w + 1*a.w*a.h] = g; 23 | a.data[i + y2*a.w + 1*a.w*a.h] = g; 24 | 25 | a.data[i + y1*a.w + 2*a.w*a.h] = b; 26 | a.data[i + y2*a.w + 2*a.w*a.h] = b; 27 | } 28 | for(i = y1; i <= y2; ++i){ 29 | a.data[x1 + i*a.w + 0*a.w*a.h] = r; 30 | a.data[x2 + i*a.w + 0*a.w*a.h] = r; 31 | 32 | a.data[x1 + i*a.w + 1*a.w*a.h] = g; 33 | a.data[x2 + i*a.w + 1*a.w*a.h] = g; 34 | 35 | a.data[x1 + i*a.w + 2*a.w*a.h] = b; 36 | a.data[x2 + i*a.w + 2*a.w*a.h] = b; 37 | } 38 | } 39 | 40 | 41 | 42 | void draw_box_width(image a, int x1, int y1, int x2, int y2, int w) 43 | { 44 | int i; 45 | for(i = 0; i < w; ++i){ 46 | draw_box(a, x1+i, y1+i, x2-i, y2-i, 255, 0, 0); 47 | } 48 | } 49 | 50 | 51 | void draw_detections(image im, 52 | int batchIdx, 53 | float thresh, 54 | box *boxes, 55 | float *probs, 56 | int * indexes, 57 | int sizeOfClass, 58 | int sizeOfBatch) 59 | { 60 | int n = batchIdx; 61 | // int sizeOfClass = l.n * l.h * l.w; 62 | // int sizeOfBatch = l.classes * sizeOfClass; 63 | 64 | int count = 0; 65 | for(int i = 0; i < sizeOfBatch; ++i){ 66 | int id = n * sizeOfBatch + i; 67 | int indexes_idx = indexes[id]; 68 | 69 | if (probs[id] > thresh){ 70 | int category = (indexes_idx % sizeOfBatch) / sizeOfClass; 71 | int boxId = indexes_idx % sizeOfClass; 72 | 73 | int width = im.h * .006; 74 | box b = boxes[boxId]; 75 | 76 | int left = (b.x-b.w/2.)*im.w; 77 | int right = (b.x+b.w/2.)*im.w; 78 | int top = (b.y-b.h/2.)*im.h; 79 | int bot = (b.y+b.h/2.)*im.h; 80 | 81 | if(left < 0) left = 0; 82 | if(right > im.w-1) right = im.w-1; 83 | if(top < 0) top = 0; 84 | if(bot > im.h-1) bot = im.h-1; 85 | 86 | draw_box_width(im, left, top, right, bot, width); 87 | } 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /src/interpPlugin.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | 4 | dim3 cuda_gridsize(unsigned int n){ 5 | unsigned int k = (n-1) / BLOCK + 1; 6 | unsigned int x = k; 7 | unsigned int y = 1; 8 | if(x > 65535){ 9 | x = ceil(sqrt(k)); 10 | y = (n-1)/(x*BLOCK) + 1; 11 | } 12 | dim3 d = {x, y, 1}; 13 | return d; 14 | } 15 | 16 | /* nearest neighbor upsampling used in darknet*/ 17 | __global__ void upsample_gpu(int N, const float *x, int w, int h, int c, int batch, int zoomFactor, float *out, const char* mode="nearest") 18 | { 19 | int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; 20 | if(i >= N) return; 21 | int out_index = i; 22 | int out_w = i%(w*zoomFactor); 23 | i = i/(w*zoomFactor); 24 | int out_h = i%(h*zoomFactor); 25 | i = i/(h*zoomFactor); 26 | int _c = i%c; 27 | i = i/_c; 28 | int _b = i%batch; 29 | int in_w = out_w/zoomFactor; 30 | int in_h = out_h/zoomFactor; 31 | int in_offset = _b*c*w*h + _c*w*h; 32 | int in_index00 = in_offset + in_h*w + in_w; 33 | if(mode == "bilinear"){ 34 | int in_index01 = (in_w+1 > w) ? in_index00 : (in_index00 + 1); 35 | int in_index10 = (in_h+1 > h) ? in_index00 : (in_index00 + w); 36 | int in_index11 = (in_index01 == in_index10) ? 
in_index00 : (in_index10 + 1); 37 | 38 | float u = (float)(out_h % zoomFactor)/zoomFactor; 39 | float v = (float)(out_w % zoomFactor)/zoomFactor; 40 | out[out_index] = (1-u)*(1-v)*x[in_index00] + \ 41 | (1-u)*v*x[in_index01] + \ 42 | u*(1-v)*x[in_index10] + \ 43 | u*v*x[in_index11]; 44 | } 45 | else if(mode == "nearest"){ 46 | out[out_index] = x[in_index00]; 47 | } 48 | } 49 | 50 | void interp_gpu(const float *x, int w, int h, int c, int batch, int zoomFactor, float *out, cudaStream_t stream) 51 | { 52 | int outSize = w*zoomFactor*h*zoomFactor*c*batch; 53 | upsample_gpu<<>>(outSize, x, w, h, c, batch, zoomFactor, out); 54 | } 55 | -------------------------------------------------------------------------------- /src/interpPlugin.h: -------------------------------------------------------------------------------- 1 | #ifndef INTERP_PLUGIN_H 2 | #define INTERP_PLUGIN_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include "NvInfer.h" 9 | #include "NvCaffeParser.h" 10 | #include "NvInferPlugin.h" 11 | #include 12 | 13 | using namespace nvinfer1; 14 | using namespace nvcaffeparser1; 15 | using namespace plugin; 16 | 17 | #define BLOCK 512 18 | #define ZOOM 2 // upsample *2 19 | 20 | void interp_gpu(const float *x, int w, int h, int c, int batch, int zoomFactor, float *out, cudaStream_t stream); 21 | 22 | template 23 | class Interp : public IPlugin 24 | { 25 | public: 26 | Interp() {} 27 | Interp(const void* buffer, size_t size) 28 | { 29 | // assert(size == sizeof(mInputSize)); 30 | // mInputSize = *reinterpret_cast(buffer); 31 | assert(size == sizeof(mInputDims)); 32 | mInputDims = *reinterpret_cast(buffer); 33 | } 34 | ~Interp() {} 35 | 36 | // @ when creating the network 37 | int getNbOutputs() const override 38 | { 39 | return 1; 40 | } 41 | Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override 42 | { 43 | assert(nbInputDims == 1); 44 | assert(index == 0); 45 | assert(inputs[index].nbDims == 3); 46 | 47 | mOutputDims = DimsCHW(inputs[index].d[0], inputs[index].d[1] * zoomFactor, inputs[index].d[2] * zoomFactor); 48 | if (0) { 49 | std::cout << "IPlugin input dim = [" << inputs[index].d[0] << ", " << inputs[index].d[1] 50 | << ", " << inputs[index].d[2] << "]" << std::endl; 51 | std::cout << "IPlugin output dim = [" << mOutputDims.d[0] << ", " << mOutputDims.d[1] 52 | << ", " << mOutputDims.d[2] << "]" << std::endl; 53 | } 54 | return mOutputDims; 55 | } 56 | 57 | // @ when building the engine 58 | void configure(const Dims* inputs, int nbInputs, const Dims* outputs, int nbOutputs, int maxBatchSize) override 59 | { 60 | assert(1 == nbInputs && 1 == nbOutputs); 61 | mInputDims = inputs[0]; 62 | mInputSize = inputs[0].d[0] * inputs[0].d[1] * inputs[0].d[2] * sizeof(float); 63 | // mOutputSize = outputs[0].d[0] * outputs[0].d[1] * outputs[0].d[2] * sizeof(float); 64 | } 65 | size_t getWorkspaceSize(int) const override 66 | { 67 | return 0; 68 | } 69 | 70 | // @ when serializing the engine 71 | size_t getSerializationSize() override 72 | { 73 | return sizeof(mInputDims); 74 | } 75 | void serialize(void* buffer) override 76 | { 77 | // *reinterpret_cast(buffer) = mInputSize; 78 | *reinterpret_cast(buffer) = mInputDims; 79 | } 80 | 81 | // @ when deserializing && executing the engine(at runtime) 82 | int initialize() override 83 | { 84 | return 0; 85 | } 86 | void terminate() override 87 | { 88 | } 89 | int enqueue(int batchSize, const void*const *inputs, void** outputs, void*, cudaStream_t stream) override 90 | { 91 | // TODO: why inputs idx 0? 
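        // inputs[0] is the only blob TensorRT passes to this plugin: configure() asserts
        // nbInputs == 1 for the Interp layer. mInputDims is restored by the (buffer, size)
        // constructor when the engine is deserialized, so its extents are valid here at runtime.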
92 | interp_gpu((const float*)inputs[0], mInputDims.d[2], mInputDims.d[1], mInputDims.d[0], batchSize, zoomFactor, (float *)outputs[0], stream); // TODO: didnt serialize mInputDims, can we use it? in that case, i serialized mInputDims, instead of mInputSize. 93 | return 0; 94 | } 95 | 96 | protected: 97 | Dims mInputDims; //CHW 98 | Dims mOutputDims; 99 | size_t mInputSize; 100 | // size_t mOutputSize; 101 | }; 102 | 103 | 104 | class PluginFactory : public nvinfer1::IPluginFactory, public nvcaffeparser1::IPluginFactory 105 | { 106 | public: 107 | // @ when building the engine 108 | // caffe parser plugin implementation 109 | bool isPlugin(const char* layerName) override 110 | { 111 | return !(strcmp(layerName, "Interp85") && strcmp(layerName, "Interp97")); 112 | } 113 | virtual nvinfer1::IPlugin* createPlugin(const char* layerName, const nvinfer1::Weights* weights, int nbWeights) override 114 | { 115 | assert(isPlugin(layerName)); 116 | if (!strcmp(layerName, "Interp85")) 117 | { 118 | assert(layerName != "Interp85"); // debug_ 119 | assert(mPluginInterp85.get() == nullptr); 120 | assert(nbWeights == 0 && weights == nullptr); 121 | mPluginInterp85 = std::unique_ptr>(new Interp()); 122 | return mPluginInterp85.get(); 123 | } 124 | else if (!strcmp(layerName, "Interp97")) 125 | { 126 | assert(layerName != "Interp97"); // debug_ 127 | assert(mPluginInterp97.get() == nullptr); 128 | assert(nbWeights == 0 && weights == nullptr); 129 | mPluginInterp97 = std::unique_ptr>(new Interp()); 130 | return mPluginInterp97.get(); 131 | } 132 | else 133 | { 134 | assert(0); 135 | return nullptr; 136 | } 137 | } 138 | 139 | // @ at runtime 140 | IPlugin* createPlugin(const char* layerName, const void* serialData, size_t serialLength) override 141 | { 142 | assert(isPlugin(layerName)); 143 | if (!strcmp(layerName, "Interp85")) 144 | { 145 | assert(mPluginInterp85.get() == nullptr); 146 | mPluginInterp85 = std::unique_ptr>(new Interp(serialData, serialLength)); 147 | return mPluginInterp85.get(); 148 | } 149 | else if (!strcmp(layerName, "Interp97")) 150 | { 151 | assert(mPluginInterp97.get() == nullptr); 152 | mPluginInterp97 = std::unique_ptr>(new Interp(serialData, serialLength)); 153 | return mPluginInterp97.get(); 154 | } 155 | else 156 | { 157 | assert(0); 158 | return nullptr; 159 | } 160 | } 161 | 162 | void destroyPlugin() 163 | { 164 | //mPluginInterp97.release(); mPluginInterp97 = nullptr; 165 | //mPluginInterp85.release(); mPluginInterp85 = nullptr; 166 | } 167 | 168 | std::unique_ptr> mPluginInterp85{ nullptr }; 169 | std::unique_ptr> mPluginInterp97{ nullptr }; 170 | }; 171 | 172 | #endif 173 | -------------------------------------------------------------------------------- /src/preproc_yolov3.h: -------------------------------------------------------------------------------- 1 | #ifndef YOLO_PREPROC_H 2 | #define YOLO_PREPROC_H 3 | 4 | #include 5 | #include 6 | #include "opencv2/highgui/highgui_c.h" 7 | #include "opencv2/imgproc/imgproc_c.h" 8 | #include "opencv2/core/version.hpp" 9 | #if CV_MAJOR_VERSION == 3 10 | #include "opencv2/videoio/videoio_c.h" 11 | #endif 12 | 13 | typedef struct { 14 | int w; 15 | int h; 16 | int c; 17 | float *data; 18 | } image; 19 | 20 | image make_empty_image(int w, int h, int c) 21 | { 22 | image out; 23 | out.data = 0; 24 | out.h = h; 25 | out.w = w; 26 | out.c = c; 27 | return out; 28 | } 29 | 30 | image make_image(int w, int h, int c) 31 | { 32 | image out = make_empty_image(w,h,c); 33 | out.data = (float*)calloc(h*w*c, sizeof(float)); 34 | return out; 35 | 
} 36 | void free_image(image m) 37 | { 38 | if(m.data){ 39 | free(m.data); 40 | } 41 | } 42 | void fill_image(image m, float s) 43 | { 44 | int i; 45 | for(i = 0; i < m.h*m.w*m.c; ++i) m.data[i] = s; 46 | } 47 | 48 | float get_pixel(image m, int x, int y, int c) 49 | { 50 | assert(x < m.w && y < m.h && c < m.c); 51 | return m.data[c*m.h*m.w + y*m.w + x]; 52 | } 53 | 54 | void set_pixel(image m, int x, int y, int c, float val) 55 | { 56 | if (x < 0 || y < 0 || c < 0 || x >= m.w || y >= m.h || c >= m.c) return; 57 | assert(x < m.w && y < m.h && c < m.c); 58 | m.data[c*m.h*m.w + y*m.w + x] = val; 59 | } 60 | void add_pixel(image m, int x, int y, int c, float val) 61 | { 62 | assert(x < m.w && y < m.h && c < m.c); 63 | m.data[c*m.h*m.w + y*m.w + x] += val; 64 | } 65 | void embed_image(image source, image dest, int dx, int dy) 66 | { 67 | int x,y,k; 68 | for(k = 0; k < source.c; ++k){ 69 | for(y = 0; y < source.h; ++y){ 70 | for(x = 0; x < source.w; ++x){ 71 | float val = get_pixel(source, x,y,k); 72 | set_pixel(dest, dx+x, dy+y, k, val); 73 | } 74 | } 75 | } 76 | } 77 | 78 | void ipl_into_image(IplImage* src, image im) 79 | { 80 | unsigned char *data = (unsigned char *)src->imageData; 81 | int h = src->height; 82 | int w = src->width; 83 | int c = src->nChannels; 84 | int step = src->widthStep; 85 | int i, j, k; 86 | 87 | for(i = 0; i < h; ++i){ 88 | for(k= 0; k < c; ++k){ 89 | for(j = 0; j < w; ++j){ 90 | im.data[k*w*h + i*w + j] = data[i*step + j*c + k]/255.; 91 | } 92 | } 93 | } 94 | } 95 | 96 | image ipl_to_image(IplImage* src) 97 | { 98 | // ross 99 | if (0 == src) { 100 | printf("file %s, line %d, src == 0\n", __FILE__, __LINE__); 101 | exit(0); 102 | } 103 | int h = src->height; 104 | int w = src->width; 105 | int c = src->nChannels; 106 | image out = make_image(w, h, c); 107 | ipl_into_image(src, out); 108 | return out; 109 | } 110 | 111 | void rgbgr_image(image im) 112 | { 113 | int i; 114 | for(i = 0; i < im.w*im.h; ++i){ 115 | float swap = im.data[i]; 116 | im.data[i] = im.data[i+im.w*im.h*2]; 117 | im.data[i+im.w*im.h*2] = swap; 118 | } 119 | } 120 | 121 | image load_image_cv(char *filename, int channels) 122 | { 123 | IplImage* src = 0; 124 | int flag = -1; 125 | if (channels == 0) flag = -1; 126 | else if (channels == 1) flag = 0; 127 | else if (channels == 3) flag = 1; 128 | else { 129 | fprintf(stderr, "OpenCV can't force load with %d channels\n", channels); 130 | } 131 | 132 | if( (src = (IplImage*)cvLoadImage(filename, flag)) == NULL ) 133 | { 134 | fprintf(stderr, "Cannot load image \"%s\"\n", filename); 135 | exit(0); 136 | } 137 | image out = ipl_to_image(src); 138 | cvReleaseImage(&src); 139 | rgbgr_image(out); 140 | return out; 141 | } 142 | 143 | image resize_image(image im, int w, int h) 144 | { 145 | image resized = make_image(w, h, im.c); 146 | image part = make_image(w, im.h, im.c); 147 | int r, c, k; 148 | float w_scale = (float)(im.w - 1) / (w - 1); 149 | float h_scale = (float)(im.h - 1) / (h - 1); 150 | for(k = 0; k < im.c; ++k){ 151 | for(r = 0; r < im.h; ++r){ 152 | for(c = 0; c < w; ++c){ 153 | float val = 0; 154 | if(c == w-1 || im.w == 1){ 155 | val = get_pixel(im, im.w-1, r, k); 156 | } else { 157 | float sx = c*w_scale; 158 | int ix = (int) sx; 159 | float dx = sx - ix; 160 | val = (1 - dx) * get_pixel(im, ix, r, k) + dx * get_pixel(im, ix+1, r, k); 161 | } 162 | set_pixel(part, c, r, k, val); 163 | } 164 | } 165 | } 166 | for(k = 0; k < im.c; ++k){ 167 | for(r = 0; r < h; ++r){ 168 | float sy = r*h_scale; 169 | int iy = (int) sy; 170 | float dy = sy - 
iy; 171 | for(c = 0; c < w; ++c){ 172 | float val = (1-dy) * get_pixel(part, c, iy, k); 173 | set_pixel(resized, c, r, k, val); 174 | } 175 | if(r == h-1 || im.h == 1) continue; 176 | for(c = 0; c < w; ++c){ 177 | float val = dy * get_pixel(part, c, iy+1, k); 178 | add_pixel(resized, c, r, k, val); 179 | } 180 | } 181 | } 182 | 183 | free_image(part); 184 | return resized; 185 | } 186 | 187 | image load_image(char *filename, int w, int h, int c) 188 | { 189 | image out = load_image_cv(filename, c); 190 | 191 | if((h && w) && (h != out.h || w != out.w)){ 192 | image resized = resize_image(out, w, h); 193 | free_image(out); 194 | out = resized; 195 | } 196 | return out; 197 | } 198 | 199 | 200 | image load_image_color(char *filename, int w, int h) 201 | { 202 | return load_image(filename, w, h, 3); 203 | } 204 | 205 | image copy_image(image p) 206 | { 207 | image copy = p; 208 | copy.data = (float*)calloc(p.h*p.w*p.c, sizeof(float)); 209 | memcpy(copy.data, p.data, p.h*p.w*p.c*sizeof(float)); 210 | return copy; 211 | } 212 | 213 | void save_image_jpg(image p, const char *name) 214 | { 215 | image copy = copy_image(p); 216 | if(p.c == 3) rgbgr_image(copy); 217 | int x,y,k; 218 | 219 | char buff[256]; 220 | sprintf(buff, "%s.jpg", name); 221 | 222 | IplImage *disp = cvCreateImage(cvSize(p.w,p.h), IPL_DEPTH_8U, p.c); 223 | int step = disp->widthStep; 224 | for(y = 0; y < p.h; ++y){ 225 | for(x = 0; x < p.w; ++x){ 226 | for(k= 0; k < p.c; ++k){ 227 | disp->imageData[y*step + x*p.c + k] = (unsigned char)(get_pixel(copy,x,y,k)*255); 228 | } 229 | } 230 | } 231 | cvSaveImage(buff, disp,0); 232 | cvReleaseImage(&disp); 233 | free_image(copy); 234 | } 235 | 236 | void save_image(image im, const char *name) 237 | { 238 | save_image_jpg(im, name); 239 | } 240 | 241 | 242 | image letterbox_image(image im, int w, int h) 243 | { 244 | int new_w = im.w; 245 | int new_h = im.h; 246 | if (((float)w/im.w) < ((float)h/im.h)) { 247 | new_w = w; 248 | new_h = (im.h * w)/im.w; 249 | } else { 250 | new_h = h; 251 | new_w = (im.w * h)/im.h; 252 | } 253 | image resized = resize_image(im, new_w, new_h); 254 | image boxed = make_image(w, h, im.c); 255 | fill_image(boxed, .5); 256 | embed_image(resized, boxed, (w-new_w)/2, (h-new_h)/2); 257 | free_image(resized); 258 | return boxed; 259 | } 260 | 261 | #endif 262 | -------------------------------------------------------------------------------- /src/regionLayer.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | /** \brief kernel for softmax 5 | * - n is the number of classes (included the background) 6 | * 7 | * - The CPU implementation is 8 | * for b in batch: 9 | * for g in groups: 10 | * softmax(input + b*batchOffset + g*groupOffset, n, temp, stride, output + b*batchOffset + g*groupOffset) 11 | * 12 | * - The GPU implementation put the two for-loop into parallel. 13 | * 14 | * - nthdsPerCTA: the max number of threads per block. 15 | * - Each thread will in charge of one point softmax for all classes. 16 | * - Total number of threads: batch * groups 17 | * 18 | * - TODO: using warp shuffle instead of loop in one thread. 
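 * - Concretely, each output element is the numerically stable softmax with temperature:
 *     output[i] = exp((input[i] - largest)/temp) / sum_j exp((input[j] - largest)/temp)
 *   e.g. with n = 3, temp = 1 and inputs {1, 2, 3}: largest = 3, e = {0.135, 0.368, 1.0},
 *   sum = 1.503 and output = {0.090, 0.245, 0.665} (illustrative numbers only).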
19 | */ 20 | template <int nthdsPerCTA> 21 | __launch_bounds__(nthdsPerCTA) 22 | __global__ void softmaxKernel(const float * input, 23 | const int n, 24 | const int batch, 25 | const int batchOffset, 26 | const int groups, 27 | const int groupOffset, 28 | const int stride, 29 | const float temp, 30 | float * output) 31 | { 32 | int id = blockIdx.x * nthdsPerCTA + threadIdx.x; 33 | 34 | // per batch, per group 35 | if (id < batch * groups) 36 | { 37 | int b = id / groups; 38 | int g = id % groups; 39 | float sum = 0.; 40 | float largest = -FLT_MAX; 41 | int offset = b*batchOffset + g*groupOffset; 42 | for (int i = 0; i < n; ++i) 43 | { 44 | float val = input[i*stride + offset]; 45 | largest = (val > largest) ? val : largest; 46 | } 47 | for (int i = 0; i < n; ++i) 48 | { 49 | float e = exp(input[i*stride + offset]/temp - largest/temp); // subtracting the max bounds each exponent in (-inf,0], so every denominator factor is in (0,1]. 50 | sum += e; 51 | output[i*stride + offset] = e; 52 | } 53 | for (int i = 0; i < n; ++i) 54 | output[i*stride + offset] /= sum; 55 | } 56 | } 57 | 58 | 59 | /** 60 | * \brief Sigmoid function 61 | * 62 | * "__launch_bounds__" declares the max threads per block so the compiler can optimize the kernel for any block size up to nthdsPerCTA 63 | */ 64 | template <int nthdsPerCTA> 65 | __launch_bounds__(nthdsPerCTA) 66 | __global__ void activateKernel(float * data, 67 | const int range) 68 | { 69 | int i = blockIdx.x * nthdsPerCTA + threadIdx.x; 70 | if (i < range) 71 | data[i] = 1. / (1. + exp(-data[i])); 72 | } 73 | 74 | /** 75 | * \brief region layer of YOLOv3 76 | * Includes activation and softmax. 77 | * - num: # bounding boxes per location 78 | * 79 | * If integrated into TensorRT, input and output are different buffers. 80 | * If used as standalone GPU code (in main.cpp), input and output can be the same buffer. 81 | * 82 | * Note: The elements in YOLOv3 83 | * * 4*nCells coords, 84 | * * nCells conf, 85 | * * classes*nCells classes 86 | * e.g. 87 | * * nCells for class 0 (background) 88 | * * nCells for class 1 89 | * * ... 90 | */ 91 | void regionLayer_gpu( 92 | const int batch, 93 | const int C, 94 | const int nCells, 95 | const int num, 96 | const int coords, 97 | const int classes, 98 | const float * input, 99 | float * output, 100 | cudaStream_t stream) 101 | { 102 | const int blockSize = 256; 103 | const int gridSize_Act1 = (2*nCells + blockSize - 1) / blockSize; // x, y 104 | const int gridSize_Act2 = (nCells + blockSize - 1) / blockSize; // conf 105 | const int gridSize_Softmax = (nCells + blockSize - 1) / blockSize; // classes 106 | // for YOLOv3, the output of the final layer is C*nCells, in which C includes all the conf, coord, and class elements.
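// Worked example of the layout (assuming a VOC-trained model: classes = 20, coords = 4,
// num = 3 anchor boxes per cell): C = num*(coords+classes+1) = 75, so each image occupies
// C*nCells floats. A bbox block is laid out as [x | y | w | h | conf | 20 class planes],
// each plane nCells long, which is why the offsets below are +0 for (x,y), +4*nCells for
// conf and +5*nCells for the first class. (A host-side reference sketch of this routine
// is appended at the end of this listing.)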
107 | 108 | #ifdef REGION_IN_TRT 109 | // in TRT, input and output are different buffers 110 | ck(cudaMemcpy((void*)output, (void*)input, batch*C*nCells*sizeof(float), cudaMemcpyDeviceToDevice)); 111 | #endif 112 | // else input and output can be the same buffer 113 | 114 | for (int b = 0; b < batch; ++b) { 115 | for (int n = 0; n < num; ++n) { 116 | // activate on (x,y) 117 | int index = b*C*nCells // per batch 118 | + n*nCells*(coords+classes+1); // coords, classes and confidence 119 | activateKernel<blockSize> 120 | <<<gridSize_Act1, blockSize, 0, stream>>> 121 | (output + index, 2*nCells); 122 | 123 | // activate on conf probs 124 | index = b*C*nCells 125 | + n*nCells*(coords+classes+1) 126 | + 4*nCells; // skip coords 127 | activateKernel<blockSize> 128 | <<<gridSize_Act2, blockSize, 0, stream>>> 129 | (output + index, nCells); 130 | 131 | // softmax for all classes 132 | index = b*C*nCells 133 | + n*nCells*(coords+classes+1) 134 | + 5*nCells; // skip conf 135 | softmaxKernel<blockSize> 136 | <<<gridSize_Softmax, blockSize, 0, stream>>> 137 | (input + index, // input: skip loc, conf 138 | classes, // n: #classes 139 | batch*num, // batch: batch * #bboxes per location 140 | (C*nCells/num), // batchOffset: size of one bbox block, (coords+classes+1)*nCells 141 | nCells, // groups 142 | 1, // groupOffset 143 | nCells, // stride 144 | 1.f, // temp 145 | output + index); // output 146 | } 147 | } 148 | } 149 | 150 | #define nOutputLayer 3 151 | template <int nthdsPerCTA> 152 | __launch_bounds__(nthdsPerCTA) 153 | __global__ void reorgOutputKernel( 154 | const int nBatch, 155 | const int nClasses, 156 | const int nBboxesPerLoc, 157 | const int coords, 158 | const int l0_w, 159 | const int l0_h, 160 | const int nCells, 161 | float* dpData_unordered[], 162 | float* dpData) 163 | { 164 | long i = blockIdx.x * nthdsPerCTA + threadIdx.x; 165 | const int bboxMemLen = (nClasses + coords + 1) * nCells; 166 | const int batchMemLen = nBboxesPerLoc * bboxMemLen; 167 | const long range = nBatch * batchMemLen; 168 | if (i < range) // voc<266175 coco<904995 wrt. 416*416 input 169 | { 170 | int b = i / batchMemLen; 171 | int bboxIdx = (i % batchMemLen) / bboxMemLen; 172 | int channelIdx = ((i % batchMemLen) % bboxMemLen) / nCells; 173 | int locIdx = (i % batchMemLen) % nCells; 174 | int locLayer, cnt_offset = 1+2*2+4*4; 175 | for(int j = nOutputLayer-1; j >= 0; --j){ 176 | cnt_offset -= (1<<j*2); 177 | if(locIdx >= cnt_offset*l0_w*l0_h){ 178 | locLayer = j; 179 | break; 180 | } 181 | } 182 | dpData[i] = dpData_unordered[locLayer]\ 183 | [b*nBboxesPerLoc*(nClasses+coords+1)*(1< 206 | <<>> 207 | (nBatch, nClasses, nBboxesPerLoc, coords, l0_w, l0_h, nCells, dpData_unordered, dpData); 208 | 209 | } 210 | -------------------------------------------------------------------------------- /src/regionLayer.cu~: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | /** \brief kernel for softmax 5 | * - n is the number of classes (included the background) 6 | * 7 | * - The CPU implementation is 8 | * for b in batch: 9 | * for g in groups: 10 | * softmax(input + b*batchOffset + g*groupOffset, n, temp, stride, output + b*batchOffset + g*groupOffset) 11 | * 12 | * - The GPU implementation put the two for-loop into parallel. 13 | * 14 | * - nthdsPerCTA: the max number of threads per block. 15 | * - Each thread will in charge of one point softmax for all classes. 16 | * - Total number of threads: batch * groups 17 | * 18 | * - TODO: using warp shuffle instead of loop in one thread.
19 | */ 20 | template 21 | __launch_bounds__(nthdsPerCTA) 22 | __global__ void softmaxKernel(const float * input, 23 | const int n, 24 | const int batch, 25 | const int batchOffset, 26 | const int groups, 27 | const int groupOffset, 28 | const int stride, 29 | const float temp, 30 | float * output) 31 | { 32 | int id = blockIdx.x * nthdsPerCTA + threadIdx.x; 33 | 34 | // per batch, per group 35 | if (id < batch * groups) 36 | { 37 | int b = id / groups; 38 | int g = id % groups; 39 | float sum = 0.; 40 | float largest = -FLT_MAX; 41 | int offset = b*batchOffset + g*groupOffset; 42 | for (int i = 0; i < n; ++i) 43 | { 44 | float val = input[i*stride + offset]; 45 | largest = (val > largest) ? val : largest; 46 | } 47 | for (int i = 0; i < n; ++i) 48 | { 49 | float e = exp(input[i*stride + offset]/temp - largest/temp); // bound score in (-inf,0], and denominator fractor in (0,1]. 50 | sum += e; 51 | output[i*stride + offset] = e; 52 | } 53 | for (int i = 0; i < n; ++i) 54 | output[i*stride + offset] /= sum; 55 | } 56 | } 57 | 58 | 59 | /** 60 | * \brief Sigmoid function 61 | * 62 | * "__launch_bounds__" ensures the universality of kernel 63 | */ 64 | template 65 | __launch_bounds__(nthdsPerCTA) 66 | __global__ void activateKernel(float * data, 67 | const int range) 68 | { 69 | int i = blockIdx.x * nthdsPerCTA + threadIdx.x; 70 | if (i < range) 71 | data[i] = 1. / (1. + exp(-data[i])); 72 | } 73 | 74 | /** 75 | * \brief region layer of YOLOv3 76 | * Includes activation and softmax. 77 | * - num: # bounding box per location 78 | * 79 | * If we integrated into tensorRT, we can use input and output are different memory. 80 | * If it is standalone GPU code (in main.cpp), we can use input and output the same buffer. 81 | * 82 | * Note: The elements in YOLOv2 83 | * * 4*nCells coords, 84 | * * nCells conf, 85 | * * classes*nCells classes 86 | * e.g. 87 | * * nCells for 0 class (background) 88 | * * nCells for 1 class 89 | * * ... 90 | */ 91 | void regionLayer_gpu( 92 | const int batch, 93 | const int C, 94 | const int nCells, 95 | const int num, 96 | const int coords, 97 | const int classes, 98 | const float * input, 99 | float * output, 100 | cudaStream_t stream) 101 | { 102 | const int blockSize = 256; 103 | const int gridSize_Act1 = (2*nCells + blockSize - 1) / blockSize; // x, y 104 | const int gridSize_Act2 = (nCells + blockSize - 1) / blockSize; // conf 105 | const int gridSize_Softmax = (nCells + blockSize - 1) / blockSize; // classes 106 | // for YOLOv3, the output of final layer is C*nCells, in which, C includes all the conf, coord, and claesses. 
107 | 108 | #ifdef REGION_IN_TRT 109 | // TRT, input and output are diff buffer 110 | ck(cudaMemcpy((void*)output, (void*)input, batch*C*nCells*sizeof(float), cudaMemcpyDeviceToDevice)); 111 | #endif 112 | // else input and output can be same buffer 113 | 114 | for (int b = 0; b < batch; ++b) { 115 | for (int n = 0; n < num; ++n) { 116 | // activate on (x,y) 117 | int index = b*C*nCells // per batch 118 | + n*nCells*(coords+classes+1); // coords, classes and confidence 119 | activateKernel 120 | <<>> 121 | (output + index, 2*nCells); 122 | 123 | // activate on probes on conf 124 | index = b*C*nCells 125 | + n*nCells*(coords+classes+1) 126 | + 4*nCells; // skip coords 127 | activateKernel 128 | <<>> 129 | (output + index, nCells); 130 | 131 | // softmax for all classes 132 | index = b*C*nCells 133 | + n*nCells*(coords+classes+1) 134 | + 5*nCells; // skip conf 135 | softmaxKernel 136 | <<>> 137 | (input + index, // input: skip loc, conf 138 | classes, // n: #classes 139 | batch*num, // batch: batch * #bound_box 140 | (C*nCells/num), // batchOffset: number of bounding_box in total 141 | nCells, // groups 142 | 1, // groupOffset 143 | nCells, // stride 144 | 1.f, // temp 145 | output + index); // output 146 | } 147 | } 148 | } 149 | 150 | #define nOutputLayer 3 151 | template 152 | __launch_bounds__(nthdsPerCTA) 153 | __global__ void reorgOutputKernel( 154 | const int nBatch, 155 | const int nClasses, 156 | const int nBboxesPerLoc, 157 | const int coords, 158 | const int l0_w, 159 | const int l0_h, 160 | const int nCells, 161 | float* dpData_unordered[], 162 | float* dpData) 163 | { 164 | long i = blockIdx.x * nthdsPerCTA + threadIdx.x; 165 | const int bboxMemLen = (nClasses + coords + 1) * nCells; 166 | const int batchMemLen = nBboxesPerLoc * bboxMemLen; 167 | const long range = nBatch * batchMemLen; 168 | if (i < range) // voc<266175 coco<904995 wrt. 
416*416 input 169 | { 170 | int b = i / batchMemLen; 171 | int bboxIdx = (i % batchMemLen) / bboxMemLen; 172 | int channelIdx = ((i % batchMemLen) % bboxMemLen) / nCells; 173 | int locIdx = (i % batchMemLen) % nCells; 174 | int locLayer, cnt_offset = 1+2*2+4*4; 175 | for(int j = nOutputLayer-1; j >= 0; --j){ 176 | cnt_offset -= (1<= cnt_offset*l0_w*l0_h){ 178 | locLayer = j; 179 | break; 180 | } 181 | } 182 | dpData[i] = dpData_unordered[locLayer]\ 183 | [b*nBboxesPerLoc*(nClasses+coords+1)*(1< 206 | <<>> 207 | (nBatch, nClasses, nBboxesPerLoc, coords, l0_w, l0_h, nCells, dpData_unordered, dpData); 208 | 209 | } 210 | -------------------------------------------------------------------------------- /src/regionLayer.h: -------------------------------------------------------------------------------- 1 | #ifndef REGION_LAYER_H_ 2 | #define REGION_LAYER_H_ 3 | 4 | #include "nvUtils.h" 5 | 6 | class regionParams{ 7 | public: 8 | int classes; // number of class 9 | int n; // number of bbox per location 10 | int coords; // number of coords (4) 11 | int w; // w (darknet) 12 | int h; // h (darknet), in total, we have w*h*n bbox 13 | int outputs; // outputs (darknet), output dimension of previous layer, 14 | 15 | bool softmax; // 1 for softmax process 16 | int background; // background index 17 | }; 18 | 19 | typedef struct{ 20 | float x, y, w, h; 21 | } box; 22 | 23 | void regionLayer_gpu(const int batch, 24 | const int C, 25 | const int nCells, 26 | const int num, 27 | const int coords, 28 | const int classes, 29 | const float * input, 30 | float * output, 31 | cudaStream_t stream); 32 | 33 | void reorgOutput_gpu(const int nBatch, 34 | const int nClasses, 35 | const int nBboxesPerLoc, 36 | const int coords, 37 | const int l0_w, 38 | const int l0_h, 39 | const int nCells, 40 | float* dpData_unordered[], 41 | float* dpData, 42 | const long nData, 43 | cudaStream_t stream); 44 | #endif 45 | -------------------------------------------------------------------------------- /src/regionLayer.h~: -------------------------------------------------------------------------------- 1 | #ifndef REGION_LAYER_H_ 2 | #define REGION_LAYER_H_ 3 | 4 | #include "nvUtils.h" 5 | 6 | class regionParams{ 7 | public: 8 | int classes; // number of class 9 | int n; // number of bbox per location 10 | int coords; // number of coords (4) 11 | int w; // w (darknet) 12 | int h; // h (darknet), in total, we have w*h*n bbox 13 | int outputs; // outputs (darknet), output dimension of previous layer, 14 | 15 | bool softmax; // 1 for softmax process 16 | int background; // background index 17 | }; 18 | 19 | typedef struct{ 20 | float x, y, w, h; 21 | } box; 22 | 23 | void regionLayer_gpu(const int batch, 24 | const int C, 25 | const int nCells, 26 | const int num, 27 | const int coords, 28 | const int classes, 29 | const float * input, 30 | float * output, 31 | cudaStream_t stream); 32 | 33 | void reorgOutput_gpu(const int nBatch, 34 | const int nClasses, 35 | const int nBboxesPerLoc, 36 | const int coords, 37 | const int l0_w, 38 | const int l0_h, 39 | const int nCells, 40 | float* __constant__ dpData_unordered[], 41 | float* dpData, 42 | const long nData, 43 | cudaStream_t stream); 44 | #endif 45 | -------------------------------------------------------------------------------- /src/tensorRTClassifier.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 1993-2016 NVIDIA Corporation. All rights reserved. 
3 | * 4 | * NOTICE TO USER: 5 | * 6 | * This source code is subject to NVIDIA ownership rights under U.S. and 7 | * international Copyright laws. 8 | * 9 | * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE 10 | * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR 11 | * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH 12 | * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF 13 | * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 14 | * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, 15 | * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS 16 | * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE 17 | * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE 18 | * OR PERFORMANCE OF THIS SOURCE CODE. 19 | * 20 | * U.S. Government End Users. This source code is a "commercial item" as 21 | * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of 22 | * "commercial computer software" and "commercial computer software 23 | * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) 24 | * and is provided to the U.S. Government only as a commercial end item. 25 | * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through 26 | * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the 27 | * source code with only those rights set forth herein. 28 | */ 29 | 30 | #ifndef TENSORRT_CLASSIFIER_H 31 | #define TENSORRT_CLASSIFIER_H 32 | 33 | #include 34 | #include 35 | #include 36 | #include 37 | 38 | #include "NvInfer.h" 39 | #include "NvCaffeParser.h" 40 | #include "NvInferPlugin.h" 41 | #include "logger.h" 42 | #include "classifier.h" 43 | 44 | using namespace nvinfer1; 45 | using namespace nvcaffeparser1; 46 | using namespace plugin; 47 | 48 | static const int MAX_BUFFERS_ = 10; 49 | 50 | 51 | // Logger for GIE info/warning/errors 52 | class Logger : public ILogger 53 | { 54 | void log(Severity severity, const char* msg) override 55 | { 56 | // suppress info-level messages 57 | if (severity != Severity::kINFO) 58 | std::cout << msg << std::endl; 59 | } 60 | }; 61 | 62 | class Int8Calibrator : public IInt8EntropyCalibrator 63 | { 64 | public: 65 | Int8Calibrator(std::string calibrationTableFile) 66 | : calibrationTableFile_(calibrationTableFile) {} 67 | 68 | ~Int8Calibrator() {} 69 | 70 | int getBatchSize() const override { 71 | return 0; 72 | } 73 | 74 | bool getBatch(void* bindings[], const char* names[], int nbBindings) override 75 | { 76 | return false; 77 | } 78 | 79 | const void* readCalibrationCache(size_t& length) override 80 | { 81 | vCalibrationCache_.clear(); 82 | std::ifstream input(calibrationTableFile_.c_str(), std::ios::binary); 83 | input >> std::noskipws; 84 | if (input.good()) 85 | std::copy(std::istream_iterator(input), std::istream_iterator(), std::back_inserter(vCalibrationCache_)); 86 | 87 | length = vCalibrationCache_.size(); 88 | return length ? &vCalibrationCache_[0] : nullptr; 89 | } 90 | 91 | void writeCalibrationCache(const void* cache, size_t length) override 92 | { 93 | std::cout << "writeCalibrationCache is called!" 
<< std::endl; 94 | } 95 | 96 | private: 97 | std::string calibrationTableFile_; 98 | std::vector vCalibrationCache_; 99 | }; 100 | 101 | class TensorRTClassifier : public IClassifier { 102 | public: 103 | TensorRTClassifier(const char *deployFile, // caffe prototxt file 104 | const char *modelFile, // trained caffe model 105 | const char *meanFile, // mean file 106 | const std::string& inputs, 107 | const std::vector& outputs, 108 | const int maxBatchSize, 109 | const int devID, 110 | nvcaffeparser1::IPluginFactory* pPluginFactory = nullptr, 111 | std::string table = std::string()); 112 | 113 | ~TensorRTClassifier(); 114 | 115 | void caffeToTensorRTModel(const char *deployFile, 116 | const char *modelFile, 117 | ICaffeParser *parser); 118 | void initInfer(); 119 | 120 | // override 121 | void setInputData(float *pBGR, 122 | const int nWidth, 123 | const int nHeight, 124 | const int nBatchSize) override; 125 | 126 | void forward(INFER_OUTPUT_PARAMS *) override; 127 | 128 | int getInferWidth() const override; 129 | 130 | int getInferHeight() const override; 131 | 132 | std::vector getMeanValues() const override; 133 | 134 | private: 135 | int devID_; 136 | int maxBatchSize_; 137 | 138 | // tensorRT params 139 | ICudaEngine *pEngine_ = nullptr; 140 | ICaffeParser *pCaffeParser_ = nullptr; 141 | IBinaryProtoBlob *pMeanBlob_ = nullptr; 142 | IExecutionContext *pContext_ = nullptr; 143 | 144 | std::string inputBlobName_; 145 | std::vector vOutputBlobNames_; 146 | nvcaffeparser1::IPluginFactory* pPluginFactory_{ nullptr }; // factory for plugin layers 147 | Int8Calibrator *pCalibrator_{ nullptr }; 148 | std::string calibrationTable_; 149 | 150 | int nInputs_; 151 | int inputIndex_; 152 | DimsCHW inputDim_; 153 | size_t inputSize_; 154 | 155 | int nOutputs_; 156 | std::vector vOutputIndexs_; 157 | std::vector vOutputDims_; 158 | std::vector vOutputSizes_; 159 | 160 | void *apBuffers_[MAX_BUFFERS_]; // input and output buffer 161 | std::vector vMeanValues_{0.f, 0.f, 0.f}; 162 | 163 | // tensorRT logger 164 | Logger logger_; 165 | }; 166 | 167 | 168 | #endif // TENSORRT_CLASSIFIER_H 169 | -------------------------------------------------------------------------------- /tensorRTClassifier.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 1993-2016 NVIDIA Corporation. All rights reserved. 3 | * 4 | * NOTICE TO USER: 5 | * 6 | * This source code is subject to NVIDIA ownership rights under U.S. and 7 | * international Copyright laws. 8 | * 9 | * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE 10 | * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR 11 | * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH 12 | * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF 13 | * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 14 | * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, 15 | * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS 16 | * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE 17 | * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE 18 | * OR PERFORMANCE OF THIS SOURCE CODE. 19 | * 20 | * U.S. Government End Users. This source code is a "commercial item" as 21 | * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of 22 | * "commercial computer software" and "commercial computer software 23 | * documentation" as such terms are used in 48 C.F.R. 
12.212 (SEPT 1995) 24 | * and is provided to the U.S. Government only as a commercial end item. 25 | * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through 26 | * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the 27 | * source code with only those rights set forth herein. 28 | */ 29 | 30 | #ifndef TENSORRT_CLASSIFIER_H 31 | #define TENSORRT_CLASSIFIER_H 32 | 33 | #include 34 | #include 35 | #include 36 | #include 37 | 38 | #include "NvInfer.h" 39 | #include "NvCaffeParser.h" 40 | #include "NvInferPlugin.h" 41 | #include "logger.h" 42 | #include "classifier.h" 43 | 44 | using namespace nvinfer1; 45 | using namespace nvcaffeparser1; 46 | using namespace plugin; 47 | 48 | static const int MAX_BUFFERS_ = 10; 49 | 50 | 51 | // Logger for GIE info/warning/errors 52 | class Logger : public ILogger 53 | { 54 | void log(Severity severity, const char* msg) override 55 | { 56 | // suppress info-level messages 57 | if (severity != Severity::kINFO) 58 | std::cout << msg << std::endl; 59 | } 60 | }; 61 | 62 | class Int8Calibrator : public IInt8EntropyCalibrator 63 | { 64 | public: 65 | Int8Calibrator(std::string calibrationTableFile) 66 | : calibrationTableFile_(calibrationTableFile) {} 67 | 68 | ~Int8Calibrator() {} 69 | 70 | int getBatchSize() const override { 71 | return 0; 72 | } 73 | 74 | bool getBatch(void* bindings[], const char* names[], int nbBindings) override 75 | { 76 | return false; 77 | } 78 | 79 | const void* readCalibrationCache(size_t& length) override 80 | { 81 | vCalibrationCache_.clear(); 82 | std::ifstream input(calibrationTableFile_.c_str(), std::ios::binary); 83 | input >> std::noskipws; 84 | if (input.good()) 85 | std::copy(std::istream_iterator(input), std::istream_iterator(), std::back_inserter(vCalibrationCache_)); 86 | 87 | length = vCalibrationCache_.size(); 88 | return length ? &vCalibrationCache_[0] : nullptr; 89 | } 90 | 91 | void writeCalibrationCache(const void* cache, size_t length) override 92 | { 93 | std::cout << "writeCalibrationCache is called!" 
<< std::endl; 94 | } 95 | 96 | private: 97 | std::string calibrationTableFile_; 98 | std::vector vCalibrationCache_; 99 | }; 100 | 101 | class TensorRTClassifier : public IClassifier { 102 | public: 103 | TensorRTClassifier(const char *deployFile, // caffe prototxt file 104 | const char *modelFile, // trained caffe model 105 | const char *meanFile, // mean file 106 | const std::string& inputs, 107 | const std::vector& outputs, 108 | const int maxBatchSize, 109 | const int devID, 110 | nvcaffeparser1::IPluginFactory* pPluginFactory = nullptr, 111 | std::string table = std::string()); 112 | 113 | ~TensorRTClassifier(); 114 | 115 | void caffeToTensorRTModel(const char *deployFile, 116 | const char *modelFile, 117 | ICaffeParser *parser); 118 | void initInfer(); 119 | 120 | // override 121 | void setInputData(float *pBGR, 122 | const int nWidth, 123 | const int nHeight, 124 | const int nBatchSize) override; 125 | 126 | void forward(INFER_OUTPUT_PARAMS *) override; 127 | 128 | int getInferWidth() const override; 129 | 130 | int getInferHeight() const override; 131 | 132 | std::vector getMeanValues() const override; 133 | 134 | private: 135 | int devID_; 136 | int maxBatchSize_; 137 | 138 | // tensorRT params 139 | ICudaEngine *pEngine_ = nullptr; 140 | ICaffeParser *pCaffeParser_ = nullptr; 141 | IBinaryProtoBlob *pMeanBlob_ = nullptr; 142 | IExecutionContext *pContext_ = nullptr; 143 | 144 | std::string inputBlobName_; 145 | std::vector vOutputBlobNames_; 146 | nvcaffeparser1::IPluginFactory* pPluginFactory_{ nullptr }; // factory for plugin layers 147 | Int8Calibrator *pCalibrator_{ nullptr }; 148 | std::string calibrationTable_; 149 | 150 | int nInputs_; 151 | int inputIndex_; 152 | DimsCHW inputDim_; 153 | size_t inputSize_; 154 | 155 | int nOutputs_; 156 | std::vector vOutputIndexs_; 157 | std::vector vOutputDims_; 158 | std::vector vOutputSizes_; 159 | 160 | void *apBuffers_[MAX_BUFFERS_]; // input and output buffer 161 | std::vector vMeanValues_{0.f, 0.f, 0.f}; 162 | 163 | // tensorRT logger 164 | Logger logger_; 165 | }; 166 | 167 | 168 | #endif // TENSORRT_CLASSIFIER_H 169 | -------------------------------------------------------------------------------- /test.py~: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | c = float(sys.argv[1]) 4 | mAP = [1., c] 5 | ret = (float)(sum(mAP) / len(mAP)) 6 | print 'mAP = %.5f' % ret 7 | exit((sum(mAP) / len(mAP))) 8 | -------------------------------------------------------------------------------- /test.sh~: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | result=test.txt 4 | a=0.45 5 | b=0.003 6 | mAP=`python test.py 3.0 2>&1 1>/dev/null` 7 | echo $( printf '%f %f %f' ${a} ${b} ${mAP}) >> ${result} 8 | cat ${result} 9 | --------------------------------------------------------------------------------
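For reference, below is a minimal host-side sketch of the region-layer math from src/regionLayer.cu (sigmoid on x, y and conf, numerically stable softmax over the class planes). It is illustrative code written against the layout described above, not a file from this repository; the function names are made up and it omits the reorg step.

#include <algorithm>
#include <cmath>
#include <vector>

// CPU mirror of regionLayer_gpu(): data holds `batch` images of C*nCells floats each,
// with C = num*(coords + classes + 1) and per-bbox layout [x|y|w|h|conf|class planes].
static void softmaxStride(float* p, int n, int stride, float temp)
{
    float largest = p[0];
    for (int i = 1; i < n; ++i) largest = std::max(largest, p[i * stride]);
    float sum = 0.f;
    for (int i = 0; i < n; ++i) {
        p[i * stride] = std::exp((p[i * stride] - largest) / temp);
        sum += p[i * stride];
    }
    for (int i = 0; i < n; ++i) p[i * stride] /= sum;
}

static float sigmoidf(float x) { return 1.f / (1.f + std::exp(-x)); }

void regionLayerReference(std::vector<float>& data, int batch, int nCells,
                          int num, int coords, int classes)
{
    const int bboxLen = (coords + classes + 1) * nCells;  // one bbox block
    const int C = num * (coords + classes + 1);
    for (int b = 0; b < batch; ++b)
        for (int n = 0; n < num; ++n) {
            float* box = data.data() + (long)b * C * nCells + (long)n * bboxLen;
            for (int i = 0; i < 2 * nCells; ++i)                // x, y
                box[i] = sigmoidf(box[i]);
            for (int i = 0; i < nCells; ++i)                    // conf
                box[4 * nCells + i] = sigmoidf(box[4 * nCells + i]);
            for (int cell = 0; cell < nCells; ++cell)           // softmax over classes, stride nCells
                softmaxStride(box + 5 * nCells + cell, classes, nCells, 1.f);
        }
}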