├── Makefile ├── Makefile~ ├── README.md ├── bboxParser.cu ├── bboxParser.cu~ ├── bboxParser.h ├── bboxParser.h~ ├── classifier.h ├── common.cu ├── common.h ├── common.h~ ├── draw.h ├── draw.h~ ├── gridSearchParam.sh ├── gridSearchParam.sh~ ├── include ├── bitmap_image.hpp ├── cub │ ├── agent │ │ ├── agent_histogram.cuh │ │ ├── agent_radix_sort_downsweep.cuh │ │ ├── agent_radix_sort_upsweep.cuh │ │ ├── agent_reduce.cuh │ │ ├── agent_reduce_by_key.cuh │ │ ├── agent_rle.cuh │ │ ├── agent_scan.cuh │ │ ├── agent_segment_fixup.cuh │ │ ├── agent_select_if.cuh │ │ ├── agent_spmv_orig.cuh │ │ └── single_pass_scan_operators.cuh │ ├── block │ │ ├── block_adjacent_difference.cuh │ │ ├── block_discontinuity.cuh │ │ ├── block_exchange.cuh │ │ ├── block_histogram.cuh │ │ ├── block_load.cuh │ │ ├── block_radix_rank.cuh │ │ ├── block_radix_sort.cuh │ │ ├── block_raking_layout.cuh │ │ ├── block_reduce.cuh │ │ ├── block_scan.cuh │ │ ├── block_shuffle.cuh │ │ ├── block_store.cuh │ │ └── specializations │ │ │ ├── block_histogram_atomic.cuh │ │ │ ├── block_histogram_sort.cuh │ │ │ ├── block_reduce_raking.cuh │ │ │ ├── block_reduce_raking_commutative_only.cuh │ │ │ ├── block_reduce_warp_reductions.cuh │ │ │ ├── block_scan_raking.cuh │ │ │ ├── block_scan_warp_scans.cuh │ │ │ ├── block_scan_warp_scans2.cuh │ │ │ └── block_scan_warp_scans3.cuh │ ├── cub.cuh │ ├── device │ │ ├── device_histogram.cuh │ │ ├── device_partition.cuh │ │ ├── device_radix_sort.cuh │ │ ├── device_reduce.cuh │ │ ├── device_run_length_encode.cuh │ │ ├── device_scan.cuh │ │ ├── device_segmented_radix_sort.cuh │ │ ├── device_segmented_reduce.cuh │ │ ├── device_select.cuh │ │ ├── device_spmv.cuh │ │ └── dispatch │ │ │ ├── dispatch_histogram.cuh │ │ │ ├── dispatch_radix_sort.cuh │ │ │ ├── dispatch_reduce.cuh │ │ │ ├── dispatch_reduce_by_key.cuh │ │ │ ├── dispatch_rle.cuh │ │ │ ├── dispatch_scan.cuh │ │ │ ├── dispatch_select_if.cuh │ │ │ └── dispatch_spmv_orig.cuh │ ├── grid │ │ ├── grid_barrier.cuh │ │ ├── grid_even_share.cuh │ │ ├── grid_mapping.cuh │ │ └── grid_queue.cuh │ ├── host │ │ └── mutex.cuh │ ├── iterator │ │ ├── arg_index_input_iterator.cuh │ │ ├── cache_modified_input_iterator.cuh │ │ ├── cache_modified_output_iterator.cuh │ │ ├── constant_input_iterator.cuh │ │ ├── counting_input_iterator.cuh │ │ ├── discard_output_iterator.cuh │ │ ├── tex_obj_input_iterator.cuh │ │ ├── tex_ref_input_iterator.cuh │ │ └── transform_input_iterator.cuh │ ├── thread │ │ ├── thread_load.cuh │ │ ├── thread_operators.cuh │ │ ├── thread_reduce.cuh │ │ ├── thread_scan.cuh │ │ ├── thread_search.cuh │ │ └── thread_store.cuh │ ├── util_allocator.cuh │ ├── util_arch.cuh │ ├── util_debug.cuh │ ├── util_device.cuh │ ├── util_macro.cuh │ ├── util_namespace.cuh │ ├── util_ptx.cuh │ ├── util_type.cuh │ └── warp │ │ ├── specializations │ │ ├── warp_reduce_shfl.cuh │ │ ├── warp_reduce_smem.cuh │ │ ├── warp_scan_shfl.cuh │ │ └── warp_scan_smem.cuh │ │ ├── warp_reduce.cuh │ │ └── warp_scan.cuh ├── helper_cuda.h ├── helper_string.h └── logger.h ├── interpPlugin.cu ├── interpPlugin.h ├── main.cpp ├── main.cpp~ ├── nvUtils.h ├── predictions_fp32.jpg ├── preproc_yolov3.h ├── procInferOutput.h ├── procInferOutput.h~ ├── regionLayer.cu ├── regionLayer.cu~ ├── regionLayer.h ├── regionLayer.h~ ├── results ├── calc_mAP.py ├── calc_mAP.py~ ├── mAP.csv ├── mAP.csv~ ├── voc_eval.py └── voc_eval.pyc ├── run.sh ├── run.sh~ ├── src ├── bboxParser.cu ├── bboxParser.cu~ ├── bboxParser.h ├── bboxParser.h~ ├── classifier.h ├── common.cu ├── common.h ├── common.h~ ├── draw.h ├── draw.h~ 
├── interpPlugin.cu ├── interpPlugin.h ├── main.cpp ├── main.cpp~ ├── nvUtils.h ├── preproc_yolov3.h ├── procInferOutput.h ├── procInferOutput.h~ ├── regionLayer.cu ├── regionLayer.cu~ ├── regionLayer.h ├── regionLayer.h~ ├── tags ├── tensorRTClassifier.cpp └── tensorRTClassifier.h ├── tags ├── tensorRTClassifier.cpp ├── tensorRTClassifier.h ├── test.py~ └── test.sh~ /Makefile: -------------------------------------------------------------------------------- 1 | DEBUG := 1 2 | NVPROFILER := 0 3 | 4 | #-DVOC # model trained on VOC 5 | #-Dcal_mAP # calculate mAP 6 | #-DPRINT_LOG # print the prediction result 7 | #-DVISULIZATION # draw boxes on the image && save 8 | CUSTOM_MICRO := -DVOC -DPRINT_LOG # -DVISULIZATION 9 | 10 | GCC := g++ 11 | CCFLAGS := -m64 -std=c++11 -O3 $(CUSTOM_MICRO) 12 | NVCC := nvcc 13 | # Choose your arch for fast compilation, 14 | # sm_60 and sm_61 are for pascal gpu, 15 | # sm_30 and sm_35 are for Tesla K40 gpu 16 | NVCC_FLAGS := -gencode arch=compute_60,code=compute_60 \ 17 | -gencode arch=compute_61,code=compute_61 18 | ifeq ($(DEBUG), 1) 19 | CCFLAGS += -g 20 | NVCC_FLAGS += -G 21 | endif 22 | ifeq ($(NVPROFILER), 1) 23 | NVCC_FLAGS += -lineinfo 24 | endif 25 | NVCC_FLAGS += $(CCFLAGS) 26 | 27 | TENSORRT_VERSION := 212GA 28 | SRC_PATH := ./src 29 | INC_PATH := ./include 30 | 31 | TENSORRT_INC_PATH := ./tensorRT_$(TENSORRT_VERSION)/include 32 | TENSORRT_LIB_PATH := ./tensorRT_$(TENSORRT_VERSION)/lib 33 | 34 | INCLUDES := -I$(SRC_PATH) -I$(INC_PATH) -I$(TENSORRT_INC_PATH) -I/usr/local/cuda/include -I/usr/local/include 35 | 36 | LDPATH := -L/usr/local/lib -L/usr/lib -L$(TENSORRT_LIB_PATH) -L/usr/local/cuda/lib64 -Wl,-rpath,$(TENSORRT_LIB_PATH) 37 | LDFLAGS := $(LDPATH) -ldl -lcudart -lcudnn -lnvinfer -lnvcaffe_parser $(shell pkg-config opencv --libs) 38 | 39 | OBJ_PATH := ./bin/obj 40 | BIN_PATH := ./bin 41 | EXE_FILE := runYOLOv3 42 | 43 | all: build 44 | 45 | build: $(BIN_PATH)/$(EXE_FILE) 46 | 47 | $(OBJ_PATH)/tensorRTClassifier.o: $(SRC_PATH)/tensorRTClassifier.cpp 48 | $(GCC) $(CCFLAGS) $(INCLUDES) -o $@ -c $< 49 | 50 | $(OBJ_PATH)/main.o: $(SRC_PATH)/main.cpp 51 | $(GCC) $(CCFLAGS) $(INCLUDES) -o $@ -c $< 52 | 53 | $(OBJ_PATH)/interpPlugin.o: $(SRC_PATH)/interpPlugin.cu 54 | $(NVCC) $(NVCC_FLAGS) $(INCLUDES) -o $@ -c $< 55 | 56 | $(OBJ_PATH)/bboxParser.o: $(SRC_PATH)/bboxParser.cu 57 | $(NVCC) $(NVCC_FLAGS) $(INCLUDES) -o $@ -c $< 58 | 59 | $(OBJ_PATH)/regionLayer.o: $(SRC_PATH)/regionLayer.cu 60 | $(NVCC) $(NVCC_FLAGS) $(INCLUDES) -o $@ -c $< 61 | 62 | $(OBJ_PATH)/common.o: $(SRC_PATH)/common.cu 63 | $(NVCC) $(NVCC_FLAGS) $(INCLUDES) -o $@ -c $< 64 | 65 | $(BIN_PATH)/$(EXE_FILE): $(OBJ_PATH)/tensorRTClassifier.o $(OBJ_PATH)/main.o $(OBJ_PATH)/interpPlugin.o $(OBJ_PATH)/bboxParser.o $(OBJ_PATH)/regionLayer.o $(OBJ_PATH)/common.o 66 | $(GCC) $+ $(CCFLAGS) $(LDFLAGS) -o $@ 67 | 68 | clean: 69 | rm -rf $(OBJ_PATH)/* $(BIN_PATH)/$(EXE_FILE) 70 | -------------------------------------------------------------------------------- /Makefile~: -------------------------------------------------------------------------------- 1 | DEBUG := 0 2 | NVPROFILER := 1 3 | 4 | #-DVOC # model trained on VOC 5 | #-Dcal_mAP # calculate mAP 6 | #-DPRINT_LOG # print the prediction result 7 | #-DVISULIZATION # draw boxes on the image && save 8 | CUSTOM_MICRO := -DVOC # -DPRINT_LOG -DVISULIZATION 9 | 10 | GCC := g++ 11 | CCFLAGS := -m64 -std=c++11 -O3 $(CUSTOM_MICRO) 12 | NVCC := nvcc 13 | # Choose your arch for fast compilation, 14 | # sm_60 and sm_61 are for pascal gpu, 15 | # sm_30 and 
sm_35 are for Tesla K40 gpu 16 | NVCC_FLAGS := -gencode arch=compute_60,code=compute_60 \ 17 | -gencode arch=compute_61,code=compute_61 18 | ifeq ($(DEBUG), 1) 19 | CCFLAGS += -g 20 | NVCC_FLAGS += -G 21 | endif 22 | ifeq ($(NVPROFILER), 1) 23 | NVCC_FLAGS += -lineinfo 24 | endif 25 | NVCC_FLAGS += $(CCFLAGS) 26 | 27 | TENSORRT_VERSION := 212GA 28 | SRC_PATH := ./src 29 | INC_PATH := ./include 30 | 31 | TENSORRT_INC_PATH := ./tensorRT_$(TENSORRT_VERSION)/include 32 | TENSORRT_LIB_PATH := ./tensorRT_$(TENSORRT_VERSION)/lib 33 | 34 | INCLUDES := -I$(SRC_PATH) -I$(INC_PATH) -I$(TENSORRT_INC_PATH) -I/usr/local/cuda/include -I/usr/local/include 35 | 36 | LDPATH := -L/usr/local/lib -L/usr/lib -L$(TENSORRT_LIB_PATH) -L/usr/local/cuda/lib64 -Wl,-rpath,$(TENSORRT_LIB_PATH) 37 | LDFLAGS := $(LDPATH) -ldl -lcudart -lcudnn -lnvinfer -lnvcaffe_parser $(shell pkg-config opencv --libs) 38 | 39 | OBJ_PATH := ./bin/obj 40 | BIN_PATH := ./bin 41 | EXE_FILE := runYOLOv3 42 | 43 | all: build 44 | 45 | build: $(BIN_PATH)/$(EXE_FILE) 46 | 47 | $(OBJ_PATH)/tensorRTClassifier.o: $(SRC_PATH)/tensorRTClassifier.cpp 48 | $(GCC) $(CCFLAGS) $(INCLUDES) -o $@ -c $< 49 | 50 | $(OBJ_PATH)/main.o: $(SRC_PATH)/main.cpp 51 | $(GCC) $(CCFLAGS) $(INCLUDES) -o $@ -c $< 52 | 53 | $(OBJ_PATH)/interpPlugin.o: $(SRC_PATH)/interpPlugin.cu 54 | $(NVCC) $(NVCC_FLAGS) $(INCLUDES) -o $@ -c $< 55 | 56 | $(OBJ_PATH)/bboxParser.o: $(SRC_PATH)/bboxParser.cu 57 | $(NVCC) $(NVCC_FLAGS) $(INCLUDES) -o $@ -c $< 58 | 59 | $(OBJ_PATH)/regionLayer.o: $(SRC_PATH)/regionLayer.cu 60 | $(NVCC) $(NVCC_FLAGS) $(INCLUDES) -o $@ -c $< 61 | 62 | $(OBJ_PATH)/common.o: $(SRC_PATH)/common.cu 63 | $(NVCC) $(NVCC_FLAGS) $(INCLUDES) -o $@ -c $< 64 | 65 | $(BIN_PATH)/$(EXE_FILE): $(OBJ_PATH)/tensorRTClassifier.o $(OBJ_PATH)/main.o $(OBJ_PATH)/interpPlugin.o $(OBJ_PATH)/bboxParser.o $(OBJ_PATH)/regionLayer.o $(OBJ_PATH)/common.o 66 | $(GCC) $+ $(CCFLAGS) $(LDFLAGS) -o $@ 67 | 68 | clean: 69 | rm -rf $(OBJ_PATH)/* $(BIN_PATH)/$(EXE_FILE) 70 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # YOLO-V3-Acceleration 2 | Using TensorRT to implement and accelerate YOLO v3. Multi-scale inference and NMS are included. The speedup reaches about 3x compared with the original darknet. 3 | Model: 4 | /data/model 5 | 6 | Image: 7 | /data/images 8 | 9 | Build the sample: 10 | $ make -j 11 | 12 | Run the sample: 13 | $ ./run.sh 14 | 15 | 16 | Plugin 17 | =========================================== 18 | 19 | 1. Upsample layer with nearest-neighbour interpolation (Interp85, Interp97). 20 | 21 | 22 | Bounding box parser 23 | =========================================== 24 | 25 | * solution 1 (used): launch reorgOutputKernel to fuse the 3 output layers into a single-layer form (at the cost of an extra copy), then run the parser and NMS; see the sketches below. 26 | 27 | * solution 2 (to be implemented): run the parser on every output layer separately, then collect all bboxes for NMS; this also costs copy time during collection. 28 | 29 | * solution 3 (to be implemented): allocate temporary GPU memory for a (float**) variable referring to the 3 output layers, then run the parser and NMS as if on ONE layer using the index relation; however, this fake-ONE layer still needs 3 kernel launches.
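For illustration, here is a minimal CUDA sketch of a nearest-neighbour upsample like the one the plugin performs. This is not the code from interpPlugin.cu; the kernel name, the NCHW layout and the integer scale factor are assumptions made for the sketch.

```cuda
// Hypothetical nearest-neighbour upsample for one image (assumed NCHW layout,
// integer scale factor); the real TensorRT plugin in interpPlugin.cu may differ.
__global__ void upsampleNearestKernel(const float* src, float* dst,
                                      int C, int H, int W, int scale)
{
    const int outH = H * scale, outW = W * scale;
    const int total = C * outH * outW;
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= total) return;

    const int ox = idx % outW;            // output x
    const int oy = (idx / outW) % outH;   // output y
    const int c  = idx / (outW * outH);   // channel
    // Each output pixel copies the source pixel it maps onto.
    dst[idx] = src[(c * H + oy / scale) * W + ox / scale];
}
```

And a minimal sketch of the fusion step in solution 1: copy the three per-scale outputs into one contiguous buffer per image so that the parser and NMS can treat them as a single layer. The kernel/helper names, the flat element counts and the launch configuration below are assumptions for illustration, not the actual reorgOutputKernel from bboxParser.cu.

```cuda
#include <cuda_runtime.h>

// Copy one scale's output into its slice of the fused per-image buffer.
__global__ void fuseOutputsKernel(const float* src, float* dst,
                                  int nPerImageSrc,  // elements per image in this scale
                                  int nPerImageDst,  // elements per image in the fused buffer
                                  int dstOffset,     // where this scale starts in the fused buffer
                                  int nBatch)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= nBatch * nPerImageSrc) return;
    const int b = idx / nPerImageSrc;   // image index within the batch
    const int i = idx % nPerImageSrc;   // element index within this scale
    dst[b * nPerImageDst + dstOffset + i] = src[idx];
}

// One launch per scale, all on the inference stream; the extra copy is the
// cost mentioned in solution 1.
void fuseYoloOutputs(const float* const srcs[3], const int nPerImageSrc[3],
                     float* dst, int nBatch, cudaStream_t stream)
{
    const int nPerImageDst = nPerImageSrc[0] + nPerImageSrc[1] + nPerImageSrc[2];
    int offset = 0;
    for (int s = 0; s < 3; ++s) {
        const int total   = nBatch * nPerImageSrc[s];
        const int threads = 256;
        const int blocks  = (total + threads - 1) / threads;
        fuseOutputsKernel<<<blocks, threads, 0, stream>>>(
            srcs[s], dst, nPerImageSrc[s], nPerImageDst, offset, nBatch);
        offset += nPerImageSrc[s];
    }
}
```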
30 | 31 | -------------------------------------------------------------------------------- /bboxParser.h: -------------------------------------------------------------------------------- 1 | #ifndef BBOX_PARSER_H 2 | #define BBOX_PARSER_H 3 | 4 | #include 5 | // cub for sort 6 | #include "regionLayer.h" 7 | 8 | void sortScoresPerImage_gpu( 9 | const int nBatch, 10 | const int nItemsPerImage, 11 | void * unsorted_scores, 12 | void * unsorted_bbox_indices, 13 | void * sorted_scores, 14 | void * sorted_bbox_indices, 15 | void * workspace, 16 | const size_t maxSizeofWorkspaceInByte, 17 | cudaStream_t stream); 18 | 19 | void splitOutputData_gpu( 20 | const int nBatch, // batch 21 | const int nClasses, 22 | const int nBboxesPerLoc, // #box 23 | const int coords, // x,y,w,h 24 | const int l0_w, 25 | const int l0_h, 26 | const int nCells, 27 | const bool background, // use background conf or not 28 | const bool only_objectness, // no class conf 29 | const float thres, 30 | const float* predictions, 31 | const float* biases, 32 | float* probes, 33 | box* bboxes, 34 | cudaStream_t stream); 35 | 36 | 37 | void correct_region_boxes_gpu( 38 | const int nBatch, // batch 39 | const int nClasses, 40 | const int nBboxesPerLoc, // #box 41 | const int nCells, 42 | const int image_w, 43 | const int image_h, 44 | const int net_input_w, 45 | const int net_input_h, 46 | box* bboxes, 47 | cudaStream_t stream); 48 | 49 | 50 | void sortScoresPerClass_gpu( 51 | const int nBatch, 52 | const int nClasses, 53 | const int nBboxesPerLoc, 54 | const void * probes, 55 | void * sorted_boxIdx, 56 | void * workspace, 57 | const size_t maxSizeofWorkspaceInByte, 58 | cudaStream_t stream); 59 | 60 | 61 | void allClassNMS_gpu( 62 | const int nBatch, //batch 63 | const int nClasses, 64 | const int nBboxesPerLoc, 65 | const int nCells, 66 | const float nms_threshold, 67 | void * bboxes, 68 | void * probes, 69 | void * afterNMS_probes, 70 | void * indexes, 71 | void * afterNMS_indexes, 72 | cudaStream_t stream); 73 | 74 | 75 | size_t getWorkspaceSizeInByte( 76 | const int nBatch, 77 | const int nClasses, 78 | const int nBboxesPerLoc, 79 | const int nCells); 80 | 81 | #endif 82 | -------------------------------------------------------------------------------- /bboxParser.h~: -------------------------------------------------------------------------------- 1 | #ifndef BBOX_PARSER_H 2 | #define BBOX_PARSER_H 3 | 4 | #include 5 | // cub for sort 6 | #include "regionLayer.h" 7 | 8 | void sortScoresPerImage_gpu( 9 | const int nBatch, 10 | const int nItemsPerImage, 11 | void * unsorted_scores, 12 | void * unsorted_bbox_indices, 13 | void * sorted_scores, 14 | void * sorted_bbox_indices, 15 | void * workspace, 16 | const size_t maxSizeofWorkspaceInByte, 17 | cudaStream_t stream); 18 | 19 | void splitOutputData_gpu( 20 | const int nBatch, // batch 21 | const int nClasses, 22 | const int nBboxesPerLoc, // #box 23 | const int coords, // x,y,w,h 24 | const int l0_w, 25 | const int l0_h, 26 | const int nCells, 27 | const bool background, // use background conf or not 28 | const bool only_objectness, // no class conf 29 | const float thres, 30 | const float* predictions, 31 | const float* biases, 32 | float* probes, 33 | box* bboxes, 34 | cudaStream_t stream); 35 | 36 | 37 | void correct_region_boxes_gpu( 38 | const int nBatch, // batch 39 | const int nClasses, 40 | const int nBboxesPerLoc, // #box 41 | const int nCells, 42 | const int image_w, 43 | const int image_h, 44 | const int net_input_w, 45 | const int net_input_h, 46 | box* bboxes, 47 | 
cudaStream_t stream); 48 | 49 | 50 | void sortScoresPerClass_gpu( 51 | const int nBatch, 52 | const int nClasses, 53 | const int nBboxesPerLoc, 54 | const void * probes, 55 | void * sorted_boxIdx, 56 | void * workspace, 57 | const size_t maxSizeofWorkspaceInByte, 58 | cudaStream_t stream); 59 | 60 | 61 | void allClassNMS_gpu( 62 | const int nBatch, //batch 63 | const int nClasses, 64 | const int nBboxesPerLoc, 65 | const int w, 66 | const int h, 67 | const float nms_threshold, 68 | void * bboxes, 69 | void * probes, 70 | void * afterNMS_probes, 71 | void * indexes, 72 | void * afterNMS_indexes, 73 | cudaStream_t stream); 74 | 75 | 76 | size_t getWorkspaceSizeInByte( 77 | const int nBatch, 78 | const int nClasses, 79 | const int nBboxesPerLoc, 80 | const int w, 81 | const int h); 82 | 83 | #endif 84 | -------------------------------------------------------------------------------- /classifier.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 1993-2016 NVIDIA Corporation. All rights reserved. 3 | * 4 | * NOTICE TO USER: 5 | * 6 | * This source code is subject to NVIDIA ownership rights under U.S. and 7 | * international Copyright laws. 8 | * 9 | * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE 10 | * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR 11 | * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH 12 | * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF 13 | * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 14 | * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, 15 | * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS 16 | * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE 17 | * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE 18 | * OR PERFORMANCE OF THIS SOURCE CODE. 19 | * 20 | * U.S. Government End Users. This source code is a "commercial item" as 21 | * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of 22 | * "commercial computer software" and "commercial computer software 23 | * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) 24 | * and is provided to the U.S. Government only as a commercial end item. 25 | * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through 26 | * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the 27 | * source code with only those rights set forth herein. 
28 | */ 29 | 30 | #ifndef CLASSIFIER_H 31 | #define CLASSIFIER_H 32 | 33 | #include 34 | #include "NvInfer.h" 35 | #include 36 | 37 | using namespace nvinfer1; 38 | 39 | typedef struct INFER_OUTPUT_PARAMS_ { 40 | int nBatchSize_; 41 | std::vector vpInferResults_; 42 | std::vector vnLens_; 43 | std::vector vOutputDims_; 44 | } INFER_OUTPUT_PARAMS; 45 | 46 | class IClassifier { 47 | public: 48 | virtual void setInputData(float *pBGR, 49 | const int nWidth, 50 | const int nHeight, 51 | const int nBatchSize) = 0; 52 | 53 | virtual void forward(INFER_OUTPUT_PARAMS *) = 0; 54 | 55 | virtual int getInferWidth() const = 0; 56 | 57 | virtual int getInferHeight() const = 0; 58 | 59 | virtual std::vector getMeanValues() const = 0; 60 | 61 | protected: 62 | virtual ~IClassifier() {} 63 | }; 64 | 65 | #endif 66 | 67 | 68 | -------------------------------------------------------------------------------- /common.cu: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | 3 | // alignptr 4 | int8_t * alignPtr(int8_t * ptr, uintptr_t to) 5 | { 6 | uintptr_t addr = (uintptr_t)ptr; 7 | if (addr % to) { 8 | addr += to - addr % to; 9 | } 10 | return (int8_t *)addr; 11 | } 12 | 13 | // calc next ptr (consider alignment) 14 | int8_t * nextWorkspacePtr(int8_t * ptr, uintptr_t previousWorkspaceSize) 15 | { 16 | uintptr_t addr = (uintptr_t) ptr; 17 | addr += previousWorkspaceSize; 18 | return alignPtr((int8_t *)addr, CUDA_MEM_ALIGN); 19 | } 20 | 21 | 22 | template 23 | __launch_bounds__ (nthds_per_cta) 24 | __global__ void setUniformOffsets_kernel( 25 | const int num_segments, 26 | const int offset, 27 | int * d_offsets) 28 | { 29 | const int idx = blockIdx.x * nthds_per_cta + threadIdx.x; 30 | if (idx <= num_segments){ 31 | d_offsets[idx] = idx * offset; 32 | } 33 | } 34 | 35 | void setUniformOffsets( 36 | const int num_segments, 37 | const int offset, 38 | int * d_offsets, 39 | cudaStream_t stream) 40 | { 41 | const int blockSize = 32; 42 | const int gridSize = (num_segments + 1 + blockSize - 1) / blockSize; 43 | setUniformOffsets_kernel 44 | <<>> 45 | (num_segments, offset, d_offsets); 46 | } 47 | -------------------------------------------------------------------------------- /common.h: -------------------------------------------------------------------------------- 1 | #ifndef COMMON_H_ 2 | #define COMMON_H_ 3 | 4 | #include 5 | 6 | #define CUDA_MEM_ALIGN 256 7 | 8 | // alignptr 9 | int8_t * alignPtr(int8_t * ptr, uintptr_t to); 10 | 11 | int8_t * nextWorkspacePtr(int8_t * ptr, uintptr_t previousWorkspaceSize); 12 | 13 | void setUniformOffsets(const int num_segments, const int offset, int * d_offsets, cudaStream_t stream); 14 | 15 | /** 16 | * Determine the usage of temporary memory for cub sort 17 | * The cub::DeviceSegmentedRadixSort can be used for batched (segmented) sort. 
18 | */ 19 | template 20 | size_t cubSortPairsWorkspaceSize(int num_items, int num_segments) 21 | { 22 | size_t temp_storage_bytes = 0; 23 | cub::DeviceSegmentedRadixSort::SortPairsDescending( 24 | (void *)NULL, temp_storage_bytes, 25 | (const KeyT *)NULL, (KeyT *)NULL, 26 | (const ValueT *)NULL, (ValueT *)NULL, 27 | num_items, // # items 28 | num_segments, // # segments 29 | (const int *)NULL, (const int *)NULL); 30 | return temp_storage_bytes; 31 | } 32 | 33 | #endif 34 | -------------------------------------------------------------------------------- /common.h~: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangjinsong3/YOLO-V3-Acceleration/384746bb6d8f61c8def70bbc0b5e04b98c60356e/common.h~ -------------------------------------------------------------------------------- /draw.h: -------------------------------------------------------------------------------- 1 | #include "preproc_yolov3.h" 2 | #include "regionLayer.h" 3 | #include "bboxParser.h" 4 | 5 | void draw_box(image a, int x1, int y1, int x2, int y2, float r, float g, float b) 6 | { 7 | int i; 8 | if(x1 < 0) x1 = 0; 9 | if(x1 >= a.w) x1 = a.w-1; 10 | if(x2 < 0) x2 = 0; 11 | if(x2 >= a.w) x2 = a.w-1; 12 | 13 | if(y1 < 0) y1 = 0; 14 | if(y1 >= a.h) y1 = a.h-1; 15 | if(y2 < 0) y2 = 0; 16 | if(y2 >= a.h) y2 = a.h-1; 17 | 18 | for(i = x1; i <= x2; ++i){ 19 | a.data[i + y1*a.w + 0*a.w*a.h] = r; 20 | a.data[i + y2*a.w + 0*a.w*a.h] = r; 21 | 22 | a.data[i + y1*a.w + 1*a.w*a.h] = g; 23 | a.data[i + y2*a.w + 1*a.w*a.h] = g; 24 | 25 | a.data[i + y1*a.w + 2*a.w*a.h] = b; 26 | a.data[i + y2*a.w + 2*a.w*a.h] = b; 27 | } 28 | for(i = y1; i <= y2; ++i){ 29 | a.data[x1 + i*a.w + 0*a.w*a.h] = r; 30 | a.data[x2 + i*a.w + 0*a.w*a.h] = r; 31 | 32 | a.data[x1 + i*a.w + 1*a.w*a.h] = g; 33 | a.data[x2 + i*a.w + 1*a.w*a.h] = g; 34 | 35 | a.data[x1 + i*a.w + 2*a.w*a.h] = b; 36 | a.data[x2 + i*a.w + 2*a.w*a.h] = b; 37 | } 38 | } 39 | 40 | 41 | 42 | void draw_box_width(image a, int x1, int y1, int x2, int y2, int w) 43 | { 44 | int i; 45 | for(i = 0; i < w; ++i){ 46 | draw_box(a, x1+i, y1+i, x2-i, y2-i, 255, 0, 0); 47 | } 48 | } 49 | 50 | 51 | void draw_detections(image im, 52 | int batchIdx, 53 | float thresh, 54 | box *boxes, 55 | float *probs, 56 | int * indexes, 57 | int sizeOfClass, 58 | int sizeOfBatch) 59 | { 60 | int n = batchIdx; 61 | // int sizeOfClass = l.n * l.h * l.w; 62 | // int sizeOfBatch = l.classes * sizeOfClass; 63 | 64 | int count = 0; 65 | for(int i = 0; i < sizeOfBatch; ++i){ 66 | int id = n * sizeOfBatch + i; 67 | int indexes_idx = indexes[id]; 68 | 69 | if (probs[id] > thresh){ 70 | int category = (indexes_idx % sizeOfBatch) / sizeOfClass; 71 | int boxId = indexes_idx % sizeOfClass; 72 | 73 | int width = im.h * .006; 74 | box b = boxes[boxId]; 75 | 76 | int left = (b.x-b.w/2.)*im.w; 77 | int right = (b.x+b.w/2.)*im.w; 78 | int top = (b.y-b.h/2.)*im.h; 79 | int bot = (b.y+b.h/2.)*im.h; 80 | 81 | if(left < 0) left = 0; 82 | if(right > im.w-1) right = im.w-1; 83 | if(top < 0) top = 0; 84 | if(bot > im.h-1) bot = im.h-1; 85 | 86 | draw_box_width(im, left, top, right, bot, width); 87 | } 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /draw.h~: -------------------------------------------------------------------------------- 1 | #include "preproc_yolov2.h" 2 | #include "regionLayer.h" 3 | #include "bboxParser.h" 4 | 5 | void draw_box(image a, int x1, int y1, int x2, int y2, float r, float g, float b) 6 | { 7 | int i; 8 
| if(x1 < 0) x1 = 0; 9 | if(x1 >= a.w) x1 = a.w-1; 10 | if(x2 < 0) x2 = 0; 11 | if(x2 >= a.w) x2 = a.w-1; 12 | 13 | if(y1 < 0) y1 = 0; 14 | if(y1 >= a.h) y1 = a.h-1; 15 | if(y2 < 0) y2 = 0; 16 | if(y2 >= a.h) y2 = a.h-1; 17 | 18 | for(i = x1; i <= x2; ++i){ 19 | a.data[i + y1*a.w + 0*a.w*a.h] = r; 20 | a.data[i + y2*a.w + 0*a.w*a.h] = r; 21 | 22 | a.data[i + y1*a.w + 1*a.w*a.h] = g; 23 | a.data[i + y2*a.w + 1*a.w*a.h] = g; 24 | 25 | a.data[i + y1*a.w + 2*a.w*a.h] = b; 26 | a.data[i + y2*a.w + 2*a.w*a.h] = b; 27 | } 28 | for(i = y1; i <= y2; ++i){ 29 | a.data[x1 + i*a.w + 0*a.w*a.h] = r; 30 | a.data[x2 + i*a.w + 0*a.w*a.h] = r; 31 | 32 | a.data[x1 + i*a.w + 1*a.w*a.h] = g; 33 | a.data[x2 + i*a.w + 1*a.w*a.h] = g; 34 | 35 | a.data[x1 + i*a.w + 2*a.w*a.h] = b; 36 | a.data[x2 + i*a.w + 2*a.w*a.h] = b; 37 | } 38 | } 39 | 40 | 41 | 42 | void draw_box_width(image a, int x1, int y1, int x2, int y2, int w) 43 | { 44 | int i; 45 | for(i = 0; i < w; ++i){ 46 | draw_box(a, x1+i, y1+i, x2-i, y2-i, 255, 0, 0); 47 | } 48 | } 49 | 50 | 51 | void draw_detections(image im, 52 | int batchIdx, 53 | float thresh, 54 | box *boxes, 55 | float *probs, 56 | int * indexes, 57 | int sizeOfClass, 58 | int sizeOfBatch) 59 | { 60 | int n = batchIdx; 61 | // int sizeOfClass = l.n * l.h * l.w; 62 | // int sizeOfBatch = l.classes * sizeOfClass; 63 | 64 | int count = 0; 65 | for(int i = 0; i < sizeOfBatch; ++i){ 66 | int id = n * sizeOfBatch + i; 67 | int indexes_idx = indexes[id]; 68 | 69 | if (probs[id] > thresh){ 70 | int category = (indexes_idx % sizeOfBatch) / sizeOfClass; 71 | int boxId = indexes_idx % sizeOfClass; 72 | 73 | int width = im.h * .006; 74 | box b = boxes[boxId]; 75 | 76 | int left = (b.x-b.w/2.)*im.w; 77 | int right = (b.x+b.w/2.)*im.w; 78 | int top = (b.y-b.h/2.)*im.h; 79 | int bot = (b.y+b.h/2.)*im.h; 80 | 81 | if(left < 0) left = 0; 82 | if(right > im.w-1) right = im.w-1; 83 | if(top < 0) top = 0; 84 | if(bot > im.h-1) bot = im.h-1; 85 | 86 | draw_box_width(im, left, top, right, bot, width); 87 | } 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /gridSearchParam.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | result=mAP.csv 4 | mAP=0.0 5 | for nms in `seq 0.4 0.05 0.7` 6 | do 7 | for conf in `seq 0.001 0.001 0.005` 8 | do 9 | ./run.sh 0 ${nms} ${conf} 10 | cd results/ 11 | mAP=`python calc_mAP.py 0.5 2>&1 1>/dev/null` 12 | echo $( printf '%f %f %f' ${nms} ${conf} ${mAP} ) >> ${result} 13 | cat ${result} 14 | rm -rf cache comp4_det_test_* 15 | cd ../ 16 | done 17 | done 18 | -------------------------------------------------------------------------------- /gridSearchParam.sh~: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | result=mAP.csv 4 | mAP=0.0 5 | for nms in `seq 0.05 0.05 0.4` 6 | do 7 | for conf in `seq 0.005 0.005 0.03` 8 | do 9 | ./run.sh 0 ${nms} ${conf} 10 | cd results/ 11 | mAP=`python calc_mAP.py 0.5 2>&1 1>/dev/null` 12 | echo $( printf '%f %f %f' ${nms} ${conf} ${mAP} ) >> ${result} 13 | cat ${result} 14 | rm -rf cache comp4_det_test_* 15 | cd ../ 16 | done 17 | done 18 | -------------------------------------------------------------------------------- /include/cub/block/block_raking_layout.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 
3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * cub::BlockRakingLayout provides a conflict-free shared memory layout abstraction for warp-raking across thread block data. 32 | */ 33 | 34 | 35 | #pragma once 36 | 37 | #include "../util_macro.cuh" 38 | #include "../util_arch.cuh" 39 | #include "../util_type.cuh" 40 | #include "../util_namespace.cuh" 41 | 42 | /// Optional outer namespace(s) 43 | CUB_NS_PREFIX 44 | 45 | /// CUB namespace 46 | namespace cub { 47 | 48 | /** 49 | * \brief BlockRakingLayout provides a conflict-free shared memory layout abstraction for 1D raking across thread block data. ![](raking.png) 50 | * \ingroup BlockModule 51 | * 52 | * \par Overview 53 | * This type facilitates a shared memory usage pattern where a block of CUDA 54 | * threads places elements into shared memory and then reduces the active 55 | * parallelism to one "raking" warp of threads for serially aggregating consecutive 56 | * sequences of shared items. Padding is inserted to eliminate bank conflicts 57 | * (for most data types). 58 | * 59 | * \tparam T The data type to be exchanged. 60 | * \tparam BLOCK_THREADS The thread block size in threads. 
61 | * \tparam PTX_ARCH [optional] \ptxversion 62 | */ 63 | template < 64 | typename T, 65 | int BLOCK_THREADS, 66 | int PTX_ARCH = CUB_PTX_ARCH> 67 | struct BlockRakingLayout 68 | { 69 | //--------------------------------------------------------------------- 70 | // Constants and type definitions 71 | //--------------------------------------------------------------------- 72 | 73 | enum 74 | { 75 | /// The total number of elements that need to be cooperatively reduced 76 | SHARED_ELEMENTS = BLOCK_THREADS, 77 | 78 | /// Maximum number of warp-synchronous raking threads 79 | MAX_RAKING_THREADS = CUB_MIN(BLOCK_THREADS, CUB_WARP_THREADS(PTX_ARCH)), 80 | 81 | /// Number of raking elements per warp-synchronous raking thread (rounded up) 82 | SEGMENT_LENGTH = (SHARED_ELEMENTS + MAX_RAKING_THREADS - 1) / MAX_RAKING_THREADS, 83 | 84 | /// Never use a raking thread that will have no valid data (e.g., when BLOCK_THREADS is 62 and SEGMENT_LENGTH is 2, we should only use 31 raking threads) 85 | RAKING_THREADS = (SHARED_ELEMENTS + SEGMENT_LENGTH - 1) / SEGMENT_LENGTH, 86 | 87 | /// Whether we will have bank conflicts (technically we should find out if the GCD is > 1) 88 | HAS_CONFLICTS = (CUB_SMEM_BANKS(PTX_ARCH) % SEGMENT_LENGTH == 0), 89 | 90 | /// Degree of bank conflicts (e.g., 4-way) 91 | CONFLICT_DEGREE = (HAS_CONFLICTS) ? 92 | (MAX_RAKING_THREADS * SEGMENT_LENGTH) / CUB_SMEM_BANKS(PTX_ARCH) : 93 | 1, 94 | 95 | /// Pad each segment length with one element if segment length is not relatively prime to warp size and can't be optimized as a vector load 96 | USE_SEGMENT_PADDING = ((SEGMENT_LENGTH & 1) == 0) && (SEGMENT_LENGTH > 2), 97 | 98 | /// Total number of elements in the raking grid 99 | GRID_ELEMENTS = RAKING_THREADS * (SEGMENT_LENGTH + USE_SEGMENT_PADDING), 100 | 101 | /// Whether or not we need bounds checking during raking (the number of reduction elements is not a multiple of the number of raking threads) 102 | UNGUARDED = (SHARED_ELEMENTS % RAKING_THREADS == 0), 103 | }; 104 | 105 | 106 | /** 107 | * \brief Shared memory storage type 108 | */ 109 | struct __align__(16) _TempStorage 110 | { 111 | T buff[BlockRakingLayout::GRID_ELEMENTS]; 112 | }; 113 | 114 | /// Alias wrapper allowing storage to be unioned 115 | struct TempStorage : Uninitialized<_TempStorage> {}; 116 | 117 | 118 | /** 119 | * \brief Returns the location for the calling thread to place data into the grid 120 | */ 121 | static __device__ __forceinline__ T* PlacementPtr( 122 | TempStorage &temp_storage, 123 | unsigned int linear_tid) 124 | { 125 | // Offset for partial 126 | unsigned int offset = linear_tid; 127 | 128 | // Add in one padding element for every segment 129 | if (USE_SEGMENT_PADDING > 0) 130 | { 131 | offset += offset / SEGMENT_LENGTH; 132 | } 133 | 134 | // Incorporating a block of padding partials every shared memory segment 135 | return temp_storage.Alias().buff + offset; 136 | } 137 | 138 | 139 | /** 140 | * \brief Returns the location for the calling thread to begin sequential raking 141 | */ 142 | static __device__ __forceinline__ T* RakingPtr( 143 | TempStorage &temp_storage, 144 | unsigned int linear_tid) 145 | { 146 | return temp_storage.Alias().buff + (linear_tid * (SEGMENT_LENGTH + USE_SEGMENT_PADDING)); 147 | } 148 | }; 149 | 150 | } // CUB namespace 151 | CUB_NS_POSTFIX // Optional outer namespace(s) 152 | 153 | -------------------------------------------------------------------------------- /include/cub/block/specializations/block_histogram_atomic.cuh: 
-------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * The cub::BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../../util_namespace.cuh" 37 | 38 | /// Optional outer namespace(s) 39 | CUB_NS_PREFIX 40 | 41 | /// CUB namespace 42 | namespace cub { 43 | 44 | 45 | /** 46 | * \brief The BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. 
47 | */ 48 | template 49 | struct BlockHistogramAtomic 50 | { 51 | /// Shared memory storage layout type 52 | struct TempStorage {}; 53 | 54 | 55 | /// Constructor 56 | __device__ __forceinline__ BlockHistogramAtomic( 57 | TempStorage &temp_storage) 58 | {} 59 | 60 | 61 | /// Composite data onto an existing histogram 62 | template < 63 | typename T, 64 | typename CounterT, 65 | int ITEMS_PER_THREAD> 66 | __device__ __forceinline__ void Composite( 67 | T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram 68 | CounterT histogram[BINS]) ///< [out] Reference to shared/device-accessible memory histogram 69 | { 70 | // Update histogram 71 | #pragma unroll 72 | for (int i = 0; i < ITEMS_PER_THREAD; ++i) 73 | { 74 | atomicAdd(histogram + items[i], 1); 75 | } 76 | } 77 | 78 | }; 79 | 80 | } // CUB namespace 81 | CUB_NS_POSTFIX // Optional outer namespace(s) 82 | 83 | -------------------------------------------------------------------------------- /include/cub/cub.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * CUB umbrella include file 32 | */ 33 | 34 | #pragma once 35 | 36 | 37 | // Block 38 | #include "block/block_histogram.cuh" 39 | #include "block/block_discontinuity.cuh" 40 | #include "block/block_exchange.cuh" 41 | #include "block/block_load.cuh" 42 | #include "block/block_radix_rank.cuh" 43 | #include "block/block_radix_sort.cuh" 44 | #include "block/block_reduce.cuh" 45 | #include "block/block_scan.cuh" 46 | #include "block/block_store.cuh" 47 | //#include "block/block_shift.cuh" 48 | 49 | // Device 50 | #include "device/device_histogram.cuh" 51 | #include "device/device_partition.cuh" 52 | #include "device/device_radix_sort.cuh" 53 | #include "device/device_reduce.cuh" 54 | #include "device/device_run_length_encode.cuh" 55 | #include "device/device_scan.cuh" 56 | #include "device/device_segmented_radix_sort.cuh" 57 | #include "device/device_segmented_reduce.cuh" 58 | #include "device/device_select.cuh" 59 | #include "device/device_spmv.cuh" 60 | 61 | // Grid 62 | //#include "grid/grid_barrier.cuh" 63 | #include "grid/grid_even_share.cuh" 64 | #include "grid/grid_mapping.cuh" 65 | #include "grid/grid_queue.cuh" 66 | 67 | // Thread 68 | #include "thread/thread_load.cuh" 69 | #include "thread/thread_operators.cuh" 70 | #include "thread/thread_reduce.cuh" 71 | #include "thread/thread_scan.cuh" 72 | #include "thread/thread_store.cuh" 73 | 74 | // Warp 75 | #include "warp/warp_reduce.cuh" 76 | #include "warp/warp_scan.cuh" 77 | 78 | // Iterator 79 | #include "iterator/arg_index_input_iterator.cuh" 80 | #include "iterator/cache_modified_input_iterator.cuh" 81 | #include "iterator/cache_modified_output_iterator.cuh" 82 | #include "iterator/constant_input_iterator.cuh" 83 | #include "iterator/counting_input_iterator.cuh" 84 | #include "iterator/tex_obj_input_iterator.cuh" 85 | #include "iterator/tex_ref_input_iterator.cuh" 86 | #include "iterator/transform_input_iterator.cuh" 87 | 88 | // Util 89 | #include "util_arch.cuh" 90 | #include "util_debug.cuh" 91 | #include "util_device.cuh" 92 | #include "util_macro.cuh" 93 | #include "util_ptx.cuh" 94 | #include "util_type.cuh" 95 | 96 | -------------------------------------------------------------------------------- /include/cub/grid/grid_barrier.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 
15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * cub::GridBarrier implements a software global barrier among thread blocks within a CUDA grid 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../util_debug.cuh" 37 | #include "../util_namespace.cuh" 38 | #include "../thread/thread_load.cuh" 39 | 40 | /// Optional outer namespace(s) 41 | CUB_NS_PREFIX 42 | 43 | /// CUB namespace 44 | namespace cub { 45 | 46 | 47 | /** 48 | * \addtogroup GridModule 49 | * @{ 50 | */ 51 | 52 | 53 | /** 54 | * \brief GridBarrier implements a software global barrier among thread blocks within a CUDA grid 55 | */ 56 | class GridBarrier 57 | { 58 | protected : 59 | 60 | typedef unsigned int SyncFlag; 61 | 62 | // Counters in global device memory 63 | SyncFlag* d_sync; 64 | 65 | public: 66 | 67 | /** 68 | * Constructor 69 | */ 70 | GridBarrier() : d_sync(NULL) {} 71 | 72 | 73 | /** 74 | * Synchronize 75 | */ 76 | __device__ __forceinline__ void Sync() const 77 | { 78 | volatile SyncFlag *d_vol_sync = d_sync; 79 | 80 | // Threadfence and syncthreads to make sure global writes are visible before 81 | // thread-0 reports in with its sync counter 82 | __threadfence(); 83 | CTA_SYNC(); 84 | 85 | if (blockIdx.x == 0) 86 | { 87 | // Report in ourselves 88 | if (threadIdx.x == 0) 89 | { 90 | d_vol_sync[blockIdx.x] = 1; 91 | } 92 | 93 | CTA_SYNC(); 94 | 95 | // Wait for everyone else to report in 96 | for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x) 97 | { 98 | while (ThreadLoad(d_sync + peer_block) == 0) 99 | { 100 | __threadfence_block(); 101 | } 102 | } 103 | 104 | CTA_SYNC(); 105 | 106 | // Let everyone know it's safe to proceed 107 | for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x) 108 | { 109 | d_vol_sync[peer_block] = 0; 110 | } 111 | } 112 | else 113 | { 114 | if (threadIdx.x == 0) 115 | { 116 | // Report in 117 | d_vol_sync[blockIdx.x] = 1; 118 | 119 | // Wait for acknowledgment 120 | while (ThreadLoad(d_sync + blockIdx.x) == 1) 121 | { 122 | __threadfence_block(); 123 | } 124 | } 125 | 126 | CTA_SYNC(); 127 | } 128 | } 129 | }; 130 | 131 | 132 | /** 133 | * \brief GridBarrierLifetime extends GridBarrier to provide lifetime management of the temporary device storage needed for cooperation. 134 | * 135 | * Uses RAII for lifetime, i.e., device resources are reclaimed when 136 | * the destructor is called. 
137 | */ 138 | class GridBarrierLifetime : public GridBarrier 139 | { 140 | protected: 141 | 142 | // Number of bytes backed by d_sync 143 | size_t sync_bytes; 144 | 145 | public: 146 | 147 | /** 148 | * Constructor 149 | */ 150 | GridBarrierLifetime() : GridBarrier(), sync_bytes(0) {} 151 | 152 | 153 | /** 154 | * DeviceFrees and resets the progress counters 155 | */ 156 | cudaError_t HostReset() 157 | { 158 | cudaError_t retval = cudaSuccess; 159 | if (d_sync) 160 | { 161 | CubDebug(retval = cudaFree(d_sync)); 162 | d_sync = NULL; 163 | } 164 | sync_bytes = 0; 165 | return retval; 166 | } 167 | 168 | 169 | /** 170 | * Destructor 171 | */ 172 | virtual ~GridBarrierLifetime() 173 | { 174 | HostReset(); 175 | } 176 | 177 | 178 | /** 179 | * Sets up the progress counters for the next kernel launch (lazily 180 | * allocating and initializing them if necessary) 181 | */ 182 | cudaError_t Setup(int sweep_grid_size) 183 | { 184 | cudaError_t retval = cudaSuccess; 185 | do { 186 | size_t new_sync_bytes = sweep_grid_size * sizeof(SyncFlag); 187 | if (new_sync_bytes > sync_bytes) 188 | { 189 | if (d_sync) 190 | { 191 | if (CubDebug(retval = cudaFree(d_sync))) break; 192 | } 193 | 194 | sync_bytes = new_sync_bytes; 195 | 196 | // Allocate and initialize to zero 197 | if (CubDebug(retval = cudaMalloc((void**) &d_sync, sync_bytes))) break; 198 | if (CubDebug(retval = cudaMemset(d_sync, 0, new_sync_bytes))) break; 199 | } 200 | } while (0); 201 | 202 | return retval; 203 | } 204 | }; 205 | 206 | 207 | /** @} */ // end group GridModule 208 | 209 | } // CUB namespace 210 | CUB_NS_POSTFIX // Optional outer namespace(s) 211 | 212 | -------------------------------------------------------------------------------- /include/cub/grid/grid_mapping.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks. 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../util_namespace.cuh" 37 | 38 | /// Optional outer namespace(s) 39 | CUB_NS_PREFIX 40 | 41 | /// CUB namespace 42 | namespace cub { 43 | 44 | 45 | /** 46 | * \addtogroup GridModule 47 | * @{ 48 | */ 49 | 50 | 51 | /****************************************************************************** 52 | * Mapping policies 53 | *****************************************************************************/ 54 | 55 | 56 | /** 57 | * \brief cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks. 58 | */ 59 | enum GridMappingStrategy 60 | { 61 | /** 62 | * \brief An a "raking" access pattern in which each thread block is 63 | * assigned a consecutive sequence of input tiles 64 | * 65 | * \par Overview 66 | * The input is evenly partitioned into \p p segments, where \p p is 67 | * constant and corresponds loosely to the number of thread blocks that may 68 | * actively reside on the target device. Each segment is comprised of 69 | * consecutive tiles, where a tile is a small, constant-sized unit of input 70 | * to be processed to completion before the thread block terminates or 71 | * obtains more work. The kernel invokes \p p thread blocks, each 72 | * of which iteratively consumes a segment of n/p elements 73 | * in tile-size increments. 74 | */ 75 | GRID_MAPPING_RAKE, 76 | 77 | /** 78 | * \brief An a "strip mining" access pattern in which the input tiles assigned 79 | * to each thread block are separated by a stride equal to the the extent of 80 | * the grid. 81 | * 82 | * \par Overview 83 | * The input is evenly partitioned into \p p sets, where \p p is 84 | * constant and corresponds loosely to the number of thread blocks that may 85 | * actively reside on the target device. Each set is comprised of 86 | * data tiles separated by stride \p tiles, where a tile is a small, 87 | * constant-sized unit of input to be processed to completion before the 88 | * thread block terminates or obtains more work. The kernel invokes \p p 89 | * thread blocks, each of which iteratively consumes a segment of 90 | * n/p elements in tile-size increments. 91 | */ 92 | GRID_MAPPING_STRIP_MINE, 93 | 94 | /** 95 | * \brief A dynamic "queue-based" strategy for assigning input tiles to thread blocks. 96 | * 97 | * \par Overview 98 | * The input is treated as a queue to be dynamically consumed by a grid of 99 | * thread blocks. Work is atomically dequeued in tiles, where a tile is a 100 | * unit of input to be processed to completion before the thread block 101 | * terminates or obtains more work. 
The grid size \p p is constant, 102 | * loosely corresponding to the number of thread blocks that may actively 103 | * reside on the target device. 104 | */ 105 | GRID_MAPPING_DYNAMIC, 106 | }; 107 | 108 | 109 | /** @} */ // end group GridModule 110 | 111 | } // CUB namespace 112 | CUB_NS_POSTFIX // Optional outer namespace(s) 113 | 114 | -------------------------------------------------------------------------------- /include/cub/host/mutex.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Simple portable mutex 32 | */ 33 | 34 | 35 | #pragma once 36 | 37 | #if (__cplusplus > 199711L) || (defined(_MSC_VER) && _MSC_VER >= 1800) 38 | #include 39 | #else 40 | #if defined(_WIN32) || defined(_WIN64) 41 | #include 42 | 43 | #define WIN32_LEAN_AND_MEAN 44 | #define NOMINMAX 45 | #include 46 | #undef WIN32_LEAN_AND_MEAN 47 | #undef NOMINMAX 48 | 49 | /** 50 | * Compiler read/write barrier 51 | */ 52 | #pragma intrinsic(_ReadWriteBarrier) 53 | 54 | #endif 55 | #endif 56 | 57 | #include "../util_namespace.cuh" 58 | 59 | 60 | /// Optional outer namespace(s) 61 | CUB_NS_PREFIX 62 | 63 | /// CUB namespace 64 | namespace cub { 65 | 66 | 67 | /** 68 | * Simple portable mutex 69 | * - Wraps std::mutex when compiled with C++11 or newer (supported on all platforms) 70 | * - Uses GNU/Windows spinlock mechanisms for pre C++11 (supported on x86/x64 when compiled with cl.exe or g++) 71 | */ 72 | struct Mutex 73 | { 74 | #if (__cplusplus > 199711L) || (defined(_MSC_VER) && _MSC_VER >= 1800) 75 | 76 | std::mutex mtx; 77 | 78 | void Lock() 79 | { 80 | mtx.lock(); 81 | } 82 | 83 | void Unlock() 84 | { 85 | mtx.unlock(); 86 | } 87 | 88 | void TryLock() 89 | { 90 | mtx.try_lock(); 91 | } 92 | 93 | #else //__cplusplus > 199711L 94 | 95 | #if defined(_MSC_VER) 96 | 97 | // Microsoft VC++ 98 | typedef long Spinlock; 99 | 100 | #else 101 | 102 | // GNU g++ 103 | typedef int Spinlock; 104 | 105 | /** 106 | * Compiler read/write barrier 107 | */ 108 | __forceinline__ void _ReadWriteBarrier() 109 | { 110 | __sync_synchronize(); 111 | } 112 | 113 | /** 114 | * Atomic exchange 115 | */ 116 | __forceinline__ long _InterlockedExchange(volatile int * const Target, const int Value) 117 | { 118 | // NOTE: __sync_lock_test_and_set would be an acquire barrier, so we force a full barrier 119 | _ReadWriteBarrier(); 120 | return __sync_lock_test_and_set(Target, Value); 121 | } 122 | 123 | /** 124 | * Pause instruction to prevent excess processor bus usage 125 | */ 126 | __forceinline__ void YieldProcessor() 127 | { 128 | } 129 | 130 | #endif // defined(_MSC_VER) 131 | 132 | /// Lock member 133 | volatile Spinlock lock; 134 | 135 | /** 136 | * Constructor 137 | */ 138 | Mutex() : lock(0) {} 139 | 140 | /** 141 | * Return when the specified spinlock has been acquired 142 | */ 143 | __forceinline__ void Lock() 144 | { 145 | while (1) 146 | { 147 | if (!_InterlockedExchange(&lock, 1)) return; 148 | while (lock) YieldProcessor(); 149 | } 150 | } 151 | 152 | 153 | /** 154 | * Release the specified spinlock 155 | */ 156 | __forceinline__ void Unlock() 157 | { 158 | _ReadWriteBarrier(); 159 | lock = 0; 160 | } 161 | 162 | #endif // __cplusplus > 199711L 163 | 164 | }; 165 | 166 | 167 | 168 | 169 | } // CUB namespace 170 | CUB_NS_POSTFIX // Optional outer namespace(s) 171 | 172 | -------------------------------------------------------------------------------- /include/cub/iterator/discard_output_iterator.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Random-access iterator types 32 | */ 33 | 34 | #pragma once 35 | 36 | #include 37 | #include 38 | 39 | #include "../util_namespace.cuh" 40 | #include "../util_macro.cuh" 41 | 42 | #if (THRUST_VERSION >= 100700) 43 | // This iterator is compatible with Thrust API 1.7 and newer 44 | #include 45 | #include 46 | #endif // THRUST_VERSION 47 | 48 | 49 | /// Optional outer namespace(s) 50 | CUB_NS_PREFIX 51 | 52 | /// CUB namespace 53 | namespace cub { 54 | 55 | 56 | /** 57 | * \addtogroup UtilIterator 58 | * @{ 59 | */ 60 | 61 | 62 | /** 63 | * \brief A discard iterator 64 | */ 65 | template 66 | class DiscardOutputIterator 67 | { 68 | public: 69 | 70 | // Required iterator traits 71 | typedef DiscardOutputIterator self_type; ///< My own type 72 | typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another 73 | typedef void value_type; ///< The type of the element the iterator can point to 74 | typedef void pointer; ///< The type of a pointer to an element the iterator can point to 75 | typedef void reference; ///< The type of a reference to an element the iterator can point to 76 | 77 | #if (THRUST_VERSION >= 100700) 78 | // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods 79 | typedef typename thrust::detail::iterator_facade_category< 80 | thrust::any_system_tag, 81 | thrust::random_access_traversal_tag, 82 | value_type, 83 | reference 84 | >::type iterator_category; ///< The iterator category 85 | #else 86 | typedef std::random_access_iterator_tag iterator_category; ///< The iterator category 87 | #endif // THRUST_VERSION 88 | 89 | private: 90 | 91 | OffsetT offset; 92 | 93 | #if defined(_WIN32) || !defined(_WIN64) 94 | // Workaround for win32 parameter-passing bug (ulonglong2 argmin DeviceReduce) 95 | OffsetT pad[CUB_MAX(1, (16 / sizeof(OffsetT) - 1))]; 
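// Descriptive note (added): together with `offset` above, this pad appears to round the object up to at least 16 bytes, so the iterator is passed by value with a consistent size regardless of how small OffsetT is.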
96 | #endif 97 | 98 | public: 99 | 100 | /// Constructor 101 | __host__ __device__ __forceinline__ DiscardOutputIterator( 102 | OffsetT offset = 0) ///< Base offset 103 | : 104 | offset(offset) 105 | {} 106 | 107 | /// Postfix increment 108 | __host__ __device__ __forceinline__ self_type operator++(int) 109 | { 110 | self_type retval = *this; 111 | offset++; 112 | return retval; 113 | } 114 | 115 | /// Prefix increment 116 | __host__ __device__ __forceinline__ self_type operator++() 117 | { 118 | offset++; 119 | return *this; 120 | } 121 | 122 | /// Indirection 123 | __host__ __device__ __forceinline__ self_type& operator*() 124 | { 125 | // return self reference, which can be assigned to anything 126 | return *this; 127 | } 128 | 129 | /// Addition 130 | template 131 | __host__ __device__ __forceinline__ self_type operator+(Distance n) const 132 | { 133 | self_type retval(offset + n); 134 | return retval; 135 | } 136 | 137 | /// Addition assignment 138 | template 139 | __host__ __device__ __forceinline__ self_type& operator+=(Distance n) 140 | { 141 | offset += n; 142 | return *this; 143 | } 144 | 145 | /// Subtraction 146 | template 147 | __host__ __device__ __forceinline__ self_type operator-(Distance n) const 148 | { 149 | self_type retval(offset - n); 150 | return retval; 151 | } 152 | 153 | /// Subtraction assignment 154 | template 155 | __host__ __device__ __forceinline__ self_type& operator-=(Distance n) 156 | { 157 | offset -= n; 158 | return *this; 159 | } 160 | 161 | /// Distance 162 | __host__ __device__ __forceinline__ difference_type operator-(self_type other) const 163 | { 164 | return offset - other.offset; 165 | } 166 | 167 | /// Array subscript 168 | template 169 | __host__ __device__ __forceinline__ self_type& operator[](Distance n) 170 | { 171 | // return self reference, which can be assigned to anything 172 | return *this; 173 | } 174 | 175 | /// Structure dereference 176 | __host__ __device__ __forceinline__ pointer operator->() 177 | { 178 | return; 179 | } 180 | 181 | /// Assignment to self (no-op) 182 | __host__ __device__ __forceinline__ void operator=(self_type const& other) 183 | { 184 | offset = other.offset; 185 | } 186 | 187 | /// Assignment to anything else (no-op) 188 | template 189 | __host__ __device__ __forceinline__ void operator=(T const&) 190 | {} 191 | 192 | /// Cast to void* operator 193 | __host__ __device__ __forceinline__ operator void*() const { return NULL; } 194 | 195 | /// Equal to 196 | __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) 197 | { 198 | return (offset == rhs.offset); 199 | } 200 | 201 | /// Not equal to 202 | __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) 203 | { 204 | return (offset != rhs.offset); 205 | } 206 | 207 | /// ostream operator 208 | friend std::ostream& operator<<(std::ostream& os, const self_type& itr) 209 | { 210 | os << "[" << itr.offset << "]"; 211 | return os; 212 | } 213 | 214 | }; 215 | 216 | 217 | /** @} */ // end group UtilIterator 218 | 219 | } // CUB namespace 220 | CUB_NS_POSTFIX // Optional outer namespace(s) 221 | -------------------------------------------------------------------------------- /include/cub/thread/thread_reduce.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Thread utilities for sequential reduction over statically-sized array types 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../thread/thread_operators.cuh" 37 | #include "../util_namespace.cuh" 38 | 39 | /// Optional outer namespace(s) 40 | CUB_NS_PREFIX 41 | 42 | /// CUB namespace 43 | namespace cub { 44 | 45 | /// Internal namespace (to prevent ADL mishaps between static functions when mixing different CUB installations) 46 | namespace internal { 47 | 48 | /** 49 | * Sequential reduction over statically-sized array types 50 | */ 51 | template < 52 | int LENGTH, 53 | typename T, 54 | typename ReductionOp> 55 | __device__ __forceinline__ T ThreadReduce( 56 | T* input, ///< [in] Input array 57 | ReductionOp reduction_op, ///< [in] Binary reduction operator 58 | T prefix, ///< [in] Prefix to seed reduction with 59 | Int2Type /*length*/) 60 | { 61 | T retval = prefix; 62 | 63 | #pragma unroll 64 | for (int i = 0; i < LENGTH; ++i) 65 | retval = reduction_op(retval, input[i]); 66 | 67 | return retval; 68 | } 69 | 70 | 71 | /** 72 | * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned. 73 | * 74 | * \tparam LENGTH LengthT of input array 75 | * \tparam T [inferred] The data type to be reduced. 
76 | * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) 77 | */ 78 | template < 79 | int LENGTH, 80 | typename T, 81 | typename ReductionOp> 82 | __device__ __forceinline__ T ThreadReduce( 83 | T* input, ///< [in] Input array 84 | ReductionOp reduction_op, ///< [in] Binary reduction operator 85 | T prefix) ///< [in] Prefix to seed reduction with 86 | { 87 | return ThreadReduce(input, reduction_op, prefix, Int2Type()); 88 | } 89 | 90 | 91 | /** 92 | * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array. The aggregate is returned. 93 | * 94 | * \tparam LENGTH LengthT of input array 95 | * \tparam T [inferred] The data type to be reduced. 96 | * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) 97 | */ 98 | template < 99 | int LENGTH, 100 | typename T, 101 | typename ReductionOp> 102 | __device__ __forceinline__ T ThreadReduce( 103 | T* input, ///< [in] Input array 104 | ReductionOp reduction_op) ///< [in] Binary reduction operator 105 | { 106 | T prefix = input[0]; 107 | return ThreadReduce(input + 1, reduction_op, prefix); 108 | } 109 | 110 | 111 | /** 112 | * \brief Perform a sequential reduction over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned. 113 | * 114 | * \tparam LENGTH [inferred] LengthT of \p input array 115 | * \tparam T [inferred] The data type to be reduced. 116 | * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) 117 | */ 118 | template < 119 | int LENGTH, 120 | typename T, 121 | typename ReductionOp> 122 | __device__ __forceinline__ T ThreadReduce( 123 | T (&input)[LENGTH], ///< [in] Input array 124 | ReductionOp reduction_op, ///< [in] Binary reduction operator 125 | T prefix) ///< [in] Prefix to seed reduction with 126 | { 127 | return ThreadReduce(input, reduction_op, prefix, Int2Type()); 128 | } 129 | 130 | 131 | /** 132 | * \brief Serial reduction with the specified operator 133 | * 134 | * \tparam LENGTH [inferred] LengthT of \p input array 135 | * \tparam T [inferred] The data type to be reduced. 136 | * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) 137 | */ 138 | template < 139 | int LENGTH, 140 | typename T, 141 | typename ReductionOp> 142 | __device__ __forceinline__ T ThreadReduce( 143 | T (&input)[LENGTH], ///< [in] Input array 144 | ReductionOp reduction_op) ///< [in] Binary reduction operator 145 | { 146 | return ThreadReduce((T*) input, reduction_op); 147 | } 148 | 149 | 150 | } // internal namespace 151 | } // CUB namespace 152 | CUB_NS_POSTFIX // Optional outer namespace(s) 153 | -------------------------------------------------------------------------------- /include/cub/thread/thread_search.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 
9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Thread utilities for sequential search 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../util_namespace.cuh" 37 | 38 | /// Optional outer namespace(s) 39 | CUB_NS_PREFIX 40 | 41 | /// CUB namespace 42 | namespace cub { 43 | 44 | 45 | /** 46 | * Computes the begin offsets into A and B for the specific diagonal 47 | */ 48 | template < 49 | typename AIteratorT, 50 | typename BIteratorT, 51 | typename OffsetT, 52 | typename CoordinateT> 53 | __host__ __device__ __forceinline__ void MergePathSearch( 54 | OffsetT diagonal, 55 | AIteratorT a, 56 | BIteratorT b, 57 | OffsetT a_len, 58 | OffsetT b_len, 59 | CoordinateT& path_coordinate) 60 | { 61 | /// The value type of the input iterator 62 | typedef typename std::iterator_traits::value_type T; 63 | 64 | OffsetT split_min = CUB_MAX(diagonal - b_len, 0); 65 | OffsetT split_max = CUB_MIN(diagonal, a_len); 66 | 67 | while (split_min < split_max) 68 | { 69 | OffsetT split_pivot = (split_min + split_max) >> 1; 70 | if (a[split_pivot] <= b[diagonal - split_pivot - 1]) 71 | { 72 | // Move candidate split range up A, down B 73 | split_min = split_pivot + 1; 74 | } 75 | else 76 | { 77 | // Move candidate split range up B, down A 78 | split_max = split_pivot; 79 | } 80 | } 81 | 82 | path_coordinate.x = CUB_MIN(split_min, a_len); 83 | path_coordinate.y = diagonal - split_min; 84 | } 85 | 86 | 87 | 88 | /** 89 | * \brief Returns the offset of the first value within \p input which does not compare less than \p val 90 | */ 91 | template < 92 | typename InputIteratorT, 93 | typename OffsetT, 94 | typename T> 95 | __device__ __forceinline__ OffsetT LowerBound( 96 | InputIteratorT input, ///< [in] Input sequence 97 | OffsetT num_items, ///< [in] Input sequence length 98 | T val) ///< [in] Search key 99 | { 100 | OffsetT retval = 0; 101 | while (num_items > 0) 102 | { 103 | OffsetT half = num_items >> 1; 104 | if (input[retval + half] < val) 105 | { 106 | retval = retval + (half + 1); 107 | num_items = num_items - (half + 1); 108 | } 109 | else 110 | { 111 | num_items = half; 112 | } 113 | } 114 | 115 | return retval; 116 | } 117 | 118 | 119 | /** 120 | * \brief Returns the offset of the first value within \p 
input which compares greater than \p val 121 | */ 122 | template < 123 | typename InputIteratorT, 124 | typename OffsetT, 125 | typename T> 126 | __device__ __forceinline__ OffsetT UpperBound( 127 | InputIteratorT input, ///< [in] Input sequence 128 | OffsetT num_items, ///< [in] Input sequence length 129 | T val) ///< [in] Search key 130 | { 131 | OffsetT retval = 0; 132 | while (num_items > 0) 133 | { 134 | OffsetT half = num_items >> 1; 135 | if (val < input[retval + half]) 136 | { 137 | num_items = half; 138 | } 139 | else 140 | { 141 | retval = retval + (half + 1); 142 | num_items = num_items - (half + 1); 143 | } 144 | } 145 | 146 | return retval; 147 | } 148 | 149 | 150 | 151 | 152 | 153 | } // CUB namespace 154 | CUB_NS_POSTFIX // Optional outer namespace(s) 155 | -------------------------------------------------------------------------------- /include/cub/util_arch.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Static architectural properties by SM version. 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "util_namespace.cuh" 37 | 38 | /// Optional outer namespace(s) 39 | CUB_NS_PREFIX 40 | 41 | /// CUB namespace 42 | namespace cub { 43 | 44 | #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document 45 | 46 | #if (__CUDACC_VER_MAJOR__ >= 9) && !defined(CUB_USE_COOPERATIVE_GROUPS) 47 | #define CUB_USE_COOPERATIVE_GROUPS 48 | #endif 49 | 50 | /// CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host pass). 
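/// For example, a device compilation pass targeting compute_61 sees CUB_PTX_ARCH == 610, while the host pass sees 0.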
51 | #ifndef CUB_PTX_ARCH 52 | #ifndef __CUDA_ARCH__ 53 | #define CUB_PTX_ARCH 0 54 | #else 55 | #define CUB_PTX_ARCH __CUDA_ARCH__ 56 | #endif 57 | #endif 58 | 59 | 60 | /// Whether or not the source targeted by the active compiler pass is allowed to invoke device kernels or methods from the CUDA runtime API. 61 | #ifndef CUB_RUNTIME_FUNCTION 62 | #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__>= 350 && defined(__CUDACC_RDC__)) 63 | #define CUB_RUNTIME_ENABLED 64 | #define CUB_RUNTIME_FUNCTION __host__ __device__ 65 | #else 66 | #define CUB_RUNTIME_FUNCTION __host__ 67 | #endif 68 | #endif 69 | 70 | 71 | /// Number of threads per warp 72 | #ifndef CUB_LOG_WARP_THREADS 73 | #define CUB_LOG_WARP_THREADS(arch) \ 74 | (5) 75 | #define CUB_WARP_THREADS(arch) \ 76 | (1 << CUB_LOG_WARP_THREADS(arch)) 77 | 78 | #define CUB_PTX_WARP_THREADS CUB_WARP_THREADS(CUB_PTX_ARCH) 79 | #define CUB_PTX_LOG_WARP_THREADS CUB_LOG_WARP_THREADS(CUB_PTX_ARCH) 80 | #endif 81 | 82 | 83 | /// Number of smem banks 84 | #ifndef CUB_LOG_SMEM_BANKS 85 | #define CUB_LOG_SMEM_BANKS(arch) \ 86 | ((arch >= 200) ? \ 87 | (5) : \ 88 | (4)) 89 | #define CUB_SMEM_BANKS(arch) \ 90 | (1 << CUB_LOG_SMEM_BANKS(arch)) 91 | 92 | #define CUB_PTX_LOG_SMEM_BANKS CUB_LOG_SMEM_BANKS(CUB_PTX_ARCH) 93 | #define CUB_PTX_SMEM_BANKS CUB_SMEM_BANKS(CUB_PTX_ARCH) 94 | #endif 95 | 96 | 97 | /// Oversubscription factor 98 | #ifndef CUB_SUBSCRIPTION_FACTOR 99 | #define CUB_SUBSCRIPTION_FACTOR(arch) \ 100 | ((arch >= 300) ? \ 101 | (5) : \ 102 | ((arch >= 200) ? \ 103 | (3) : \ 104 | (10))) 105 | #define CUB_PTX_SUBSCRIPTION_FACTOR CUB_SUBSCRIPTION_FACTOR(CUB_PTX_ARCH) 106 | #endif 107 | 108 | 109 | /// Prefer padding overhead vs X-way conflicts greater than this threshold 110 | #ifndef CUB_PREFER_CONFLICT_OVER_PADDING 111 | #define CUB_PREFER_CONFLICT_OVER_PADDING(arch) \ 112 | ((arch >= 300) ? \ 113 | (1) : \ 114 | (4)) 115 | #define CUB_PTX_PREFER_CONFLICT_OVER_PADDING CUB_PREFER_CONFLICT_OVER_PADDING(CUB_PTX_ARCH) 116 | #endif 117 | 118 | 119 | /// Scale down the number of threads to keep same amount of scratch storage as the nominal configuration for 4B data. Minimum of two warps. 120 | #ifndef CUB_SCALED_BLOCK_THREADS 121 | #define CUB_SCALED_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH) \ 122 | (CUB_MIN( \ 123 | NOMINAL_4B_BLOCK_THREADS, \ 124 | CUB_WARP_THREADS(PTX_ARCH) * CUB_MAX( \ 125 | 2, \ 126 | (NOMINAL_4B_BLOCK_THREADS / CUB_WARP_THREADS(PTX_ARCH)) * 4 / sizeof(T)))) 127 | #endif 128 | 129 | /// Scale down number of items per thread to keep the same amount of register storage as the nominal configuration for 4B data. Minimum 1 item per thread 130 | #ifndef CUB_SCALED_ITEMS_PER_THREAD 131 | #define CUB_SCALED_ITEMS_PER_THREAD(NOMINAL_4B_ITEMS_PER_THREAD, NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH) \ 132 | CUB_MAX( \ 133 | 1, \ 134 | (sizeof(T) < 4) ? 
\ 135 | ((NOMINAL_4B_ITEMS_PER_THREAD * NOMINAL_4B_BLOCK_THREADS * 4) / CUB_MAX(4, sizeof(T))) / CUB_SCALED_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH) / 2 : \ 136 | ((NOMINAL_4B_ITEMS_PER_THREAD * NOMINAL_4B_BLOCK_THREADS * 4) / CUB_MAX(4, sizeof(T))) / CUB_SCALED_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH)) 137 | #endif 138 | 139 | /// Define both nominal threads-per-block and items-per-thread 140 | #ifndef CUB_SCALED_GRANULARITIES 141 | #define CUB_SCALED_GRANULARITIES(NOMINAL_4B_BLOCK_THREADS, NOMINAL_4B_ITEMS_PER_THREAD, T) \ 142 | CUB_SCALED_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, 200), \ 143 | CUB_SCALED_ITEMS_PER_THREAD(NOMINAL_4B_ITEMS_PER_THREAD, NOMINAL_4B_BLOCK_THREADS, T, 200) 144 | #endif 145 | 146 | 147 | 148 | #endif // Do not document 149 | 150 | } // CUB namespace 151 | CUB_NS_POSTFIX // Optional outer namespace(s) 152 | -------------------------------------------------------------------------------- /include/cub/util_debug.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Error and event logging routines. 32 | * 33 | * The following macros definitions are supported: 34 | * - \p CUB_LOG. Simple event messages are printed to \p stdout. 
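 * - \p CubDebug / \p CubDebugExit. Wrappers around cub::Debug() defined below; when \p CUB_STDERR is defined they print CUDA error messages (with source context) to \p stderr.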
35 | */ 36 | 37 | #pragma once 38 | 39 | #include 40 | #include "util_namespace.cuh" 41 | #include "util_arch.cuh" 42 | 43 | /// Optional outer namespace(s) 44 | CUB_NS_PREFIX 45 | 46 | /// CUB namespace 47 | namespace cub { 48 | 49 | 50 | /** 51 | * \addtogroup UtilMgmt 52 | * @{ 53 | */ 54 | 55 | 56 | /// CUB error reporting macro (prints error messages to stderr) 57 | #if (defined(DEBUG) || defined(_DEBUG)) && !defined(CUB_STDERR) 58 | #define CUB_STDERR 59 | #endif 60 | 61 | 62 | 63 | /** 64 | * \brief %If \p CUB_STDERR is defined and \p error is not \p cudaSuccess, the corresponding error message is printed to \p stderr (or \p stdout in device code) along with the supplied source context. 65 | * 66 | * \return The CUDA error. 67 | */ 68 | __host__ __device__ __forceinline__ cudaError_t Debug( 69 | cudaError_t error, 70 | const char* filename, 71 | int line) 72 | { 73 | (void)filename; 74 | (void)line; 75 | #ifdef CUB_STDERR 76 | if (error) 77 | { 78 | #if (CUB_PTX_ARCH == 0) 79 | fprintf(stderr, "CUDA error %d [%s, %d]: %s\n", error, filename, line, cudaGetErrorString(error)); 80 | fflush(stderr); 81 | #elif (CUB_PTX_ARCH >= 200) 82 | printf("CUDA error %d [block (%d,%d,%d) thread (%d,%d,%d), %s, %d]\n", error, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, filename, line); 83 | #endif 84 | } 85 | #endif 86 | return error; 87 | } 88 | 89 | 90 | /** 91 | * \brief Debug macro 92 | */ 93 | #ifndef CubDebug 94 | #define CubDebug(e) cub::Debug((cudaError_t) (e), __FILE__, __LINE__) 95 | #endif 96 | 97 | 98 | /** 99 | * \brief Debug macro with exit 100 | */ 101 | #ifndef CubDebugExit 102 | #define CubDebugExit(e) if (cub::Debug((cudaError_t) (e), __FILE__, __LINE__)) { exit(1); } 103 | #endif 104 | 105 | 106 | /** 107 | * \brief Log macro for printf statements. 108 | */ 109 | #if !defined(_CubLog) 110 | #if !(defined(__clang__) && defined(__CUDA__)) 111 | #if (CUB_PTX_ARCH == 0) 112 | #define _CubLog(format, ...) printf(format,__VA_ARGS__); 113 | #elif (CUB_PTX_ARCH >= 200) 114 | #define _CubLog(format, ...) printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, __VA_ARGS__); 115 | #endif 116 | #else 117 | // XXX shameless hack for clang around variadic printf... 118 | // Compilies w/o supplying -std=c++11 but shows warning, 119 | // so we sielence them :) 120 | #pragma clang diagnostic ignored "-Wc++11-extensions" 121 | #pragma clang diagnostic ignored "-Wunnamed-type-template-args" 122 | template 123 | inline __host__ __device__ void va_printf(char const* format, Args const&... args) 124 | { 125 | #ifdef __CUDA_ARCH__ 126 | printf(format, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, args...); 127 | #else 128 | printf(format, args...); 129 | #endif 130 | } 131 | #ifndef __CUDA_ARCH__ 132 | #define _CubLog(format, ...) va_printf(format,__VA_ARGS__); 133 | #else 134 | #define _CubLog(format, ...) 
va_printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, __VA_ARGS__); 135 | #endif 136 | #endif 137 | #endif 138 | 139 | 140 | 141 | 142 | /** @} */ // end group UtilMgmt 143 | 144 | } // CUB namespace 145 | CUB_NS_POSTFIX // Optional outer namespace(s) 146 | -------------------------------------------------------------------------------- /include/cub/util_macro.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /****************************************************************************** 30 | * Common C/C++ macro utilities 31 | ******************************************************************************/ 32 | 33 | #pragma once 34 | 35 | #include "util_namespace.cuh" 36 | 37 | /// Optional outer namespace(s) 38 | CUB_NS_PREFIX 39 | 40 | /// CUB namespace 41 | namespace cub { 42 | 43 | 44 | /** 45 | * \addtogroup UtilModule 46 | * @{ 47 | */ 48 | 49 | #ifndef CUB_ALIGN 50 | #if defined(_WIN32) || defined(_WIN64) 51 | /// Align struct 52 | #define CUB_ALIGN(bytes) __declspec(align(32)) 53 | #else 54 | /// Align struct 55 | #define CUB_ALIGN(bytes) __attribute__((aligned(bytes))) 56 | #endif 57 | #endif 58 | 59 | #ifndef CUB_MAX 60 | /// Select maximum(a, b) 61 | #define CUB_MAX(a, b) (((b) > (a)) ? (b) : (a)) 62 | #endif 63 | 64 | #ifndef CUB_MIN 65 | /// Select minimum(a, b) 66 | #define CUB_MIN(a, b) (((b) < (a)) ? 
(b) : (a)) 67 | #endif 68 | 69 | #ifndef CUB_QUOTIENT_FLOOR 70 | /// Quotient of x/y rounded down to nearest integer 71 | #define CUB_QUOTIENT_FLOOR(x, y) ((x) / (y)) 72 | #endif 73 | 74 | #ifndef CUB_QUOTIENT_CEILING 75 | /// Quotient of x/y rounded up to nearest integer 76 | #define CUB_QUOTIENT_CEILING(x, y) (((x) + (y) - 1) / (y)) 77 | #endif 78 | 79 | #ifndef CUB_ROUND_UP_NEAREST 80 | /// x rounded up to the nearest multiple of y 81 | #define CUB_ROUND_UP_NEAREST(x, y) ((((x) + (y) - 1) / (y)) * y) 82 | #endif 83 | 84 | #ifndef CUB_ROUND_DOWN_NEAREST 85 | /// x rounded down to the nearest multiple of y 86 | #define CUB_ROUND_DOWN_NEAREST(x, y) (((x) / (y)) * y) 87 | #endif 88 | 89 | 90 | #ifndef CUB_STATIC_ASSERT 91 | #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document 92 | #define CUB_CAT_(a, b) a ## b 93 | #define CUB_CAT(a, b) CUB_CAT_(a, b) 94 | #endif // DOXYGEN_SHOULD_SKIP_THIS 95 | 96 | /// Static assert 97 | #define CUB_STATIC_ASSERT(cond, msg) typedef int CUB_CAT(cub_static_assert, __LINE__)[(cond) ? 1 : -1] 98 | #endif 99 | 100 | /** @} */ // end group UtilModule 101 | 102 | } // CUB namespace 103 | CUB_NS_POSTFIX // Optional outer namespace(s) 104 | -------------------------------------------------------------------------------- /include/cub/util_namespace.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Place-holder for prefixing the cub namespace 32 | */ 33 | 34 | #pragma once 35 | 36 | // For example: 37 | //#define CUB_NS_PREFIX namespace thrust{ namespace detail { 38 | //#define CUB_NS_POSTFIX } } 39 | 40 | #ifndef CUB_NS_PREFIX 41 | #define CUB_NS_PREFIX 42 | #endif 43 | 44 | #ifndef CUB_NS_POSTFIX 45 | #define CUB_NS_POSTFIX 46 | #endif 47 | -------------------------------------------------------------------------------- /include/logger.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 1993-2016 NVIDIA Corporation. All rights reserved. 3 | * 4 | * NOTICE TO USER: 5 | * 6 | * This source code is subject to NVIDIA ownership rights under U.S. and 7 | * international Copyright laws. 8 | * 9 | * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE 10 | * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR 11 | * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH 12 | * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF 13 | * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 14 | * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, 15 | * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS 16 | * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE 17 | * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE 18 | * OR PERFORMANCE OF THIS SOURCE CODE. 19 | * 20 | * U.S. Government End Users. This source code is a "commercial item" as 21 | * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of 22 | * "commercial computer software" and "commercial computer software 23 | * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) 24 | * and is provided to the U.S. Government only as a commercial end item. 25 | * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through 26 | * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the 27 | * source code with only those rights set forth herein. 28 | */ 29 | 30 | #ifndef LOGGER_H 31 | #define LOGGER_H 32 | 33 | #pragma once 34 | 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | 42 | #ifdef _WIN32 43 | #include 44 | #include 45 | 46 | #pragma comment(lib, "ws2_32.lib") 47 | #else 48 | #include 49 | #include 50 | #include 51 | #include 52 | #define SOCKET int 53 | #define INVALID_SOCKET -1 54 | #endif 55 | 56 | namespace simplelogger{ 57 | 58 | enum LogLevel { 59 | TRACE, 60 | DEBUG, 61 | INFO, 62 | WARN, 63 | ERR 64 | }; 65 | 66 | class Logger { 67 | public: 68 | Logger(LogLevel level, bool bPrintTimeStamp) : level(level), bPrintTimeStamp(bPrintTimeStamp) {} 69 | virtual ~Logger() {} 70 | virtual std::ostream& GetStream() = 0; 71 | virtual void FlushStream() {} 72 | bool ShouldLogFor(LogLevel l) { 73 | return l >= level; 74 | } 75 | char* GetLead(LogLevel l, const char *szFile, int nLine, const char *szFunc) { 76 | if (l < TRACE || l > ERR) { 77 | sprintf(szLead, "[?????] 
"); 78 | return szLead; 79 | } 80 | const char *szLevels[] = {"TRACE", "DEBUG", "INFO", "WARN", "ERROR"}; 81 | if (bPrintTimeStamp) { 82 | time_t t = time(NULL); 83 | struct tm *ptm = localtime(&t); 84 | sprintf(szLead, "[%-5s][%02d:%02d:%02d] ", 85 | szLevels[l], ptm->tm_hour, ptm->tm_min, ptm->tm_sec); 86 | } else { 87 | sprintf(szLead, "[%-5s] ", szLevels[l]); 88 | } 89 | return szLead; 90 | } 91 | void EnterCriticalSection() { 92 | mtx.lock(); 93 | } 94 | void LeaveCriticalSection() { 95 | mtx.unlock(); 96 | } 97 | private: 98 | LogLevel level; 99 | char szLead[80]; 100 | bool bPrintTimeStamp; 101 | std::mutex mtx; 102 | }; 103 | 104 | class LoggerFactory { 105 | public: 106 | static Logger* CreateFileLogger(std::string strFilePath, 107 | LogLevel level = DEBUG, bool bPrintTimeStamp = true) { 108 | return new FileLogger(strFilePath, level, bPrintTimeStamp); 109 | } 110 | static Logger* CreateConsoleLogger(LogLevel level = DEBUG, 111 | bool bPrintTimeStamp = true) { 112 | return new ConsoleLogger(level, bPrintTimeStamp); 113 | } 114 | static Logger* CreateUdpLogger(char *szHost, unsigned uPort, LogLevel level = DEBUG, 115 | bool bPrintTimeStamp = true) { 116 | return new UdpLogger(szHost, uPort, level, bPrintTimeStamp); 117 | } 118 | private: 119 | LoggerFactory() {} 120 | 121 | class FileLogger : public Logger { 122 | public: 123 | FileLogger(std::string strFilePath, LogLevel level, bool bPrintTimeStamp) 124 | : Logger(level, bPrintTimeStamp) { 125 | pFileOut = new std::ofstream(); 126 | pFileOut->open(strFilePath.c_str()); 127 | } 128 | ~FileLogger() { 129 | pFileOut->close(); 130 | } 131 | std::ostream& GetStream() { 132 | return *pFileOut; 133 | } 134 | private: 135 | std::ofstream *pFileOut; 136 | }; 137 | 138 | class ConsoleLogger : public Logger { 139 | public: 140 | ConsoleLogger(LogLevel level, bool bPrintTimeStamp) 141 | : Logger(level, bPrintTimeStamp) {} 142 | std::ostream& GetStream() { 143 | return std::cout; 144 | } 145 | }; 146 | 147 | class UdpLogger : public Logger { 148 | private: 149 | class UdpOstream : public std::ostream { 150 | public: 151 | UdpOstream(char *szHost, unsigned short uPort) : std::ostream(&sb), socket(INVALID_SOCKET){ 152 | #ifdef _WIN32 153 | WSADATA w; 154 | if (WSAStartup(0x0101, &w) != 0) { 155 | fprintf(stderr, "WSAStartup() failed.\n"); 156 | return; 157 | } 158 | #endif 159 | socket = ::socket(AF_INET, SOCK_DGRAM, 0); 160 | if (socket == INVALID_SOCKET) { 161 | #ifdef _WIN32 162 | WSACleanup(); 163 | #endif 164 | fprintf(stderr, "socket() failed.\n"); 165 | return; 166 | } 167 | #ifdef _WIN32 168 | unsigned int b1, b2, b3, b4; 169 | sscanf(szHost, "%u.%u.%u.%u", &b1, &b2, &b3, &b4); 170 | struct in_addr addr = {(unsigned char)b1, (unsigned char)b2, (unsigned char)b3, (unsigned char)b4}; 171 | #else 172 | struct in_addr addr = {inet_addr(szHost)}; 173 | #endif 174 | struct sockaddr_in s = {AF_INET, htons(uPort), addr}; 175 | server = s; 176 | } 177 | virtual ~UdpOstream() { 178 | if (socket == INVALID_SOCKET) { 179 | return; 180 | } 181 | #ifdef _WIN32 182 | closesocket(socket); 183 | WSACleanup(); 184 | #else 185 | close(socket); 186 | #endif 187 | } 188 | void Flush() { 189 | if (sendto(socket, sb.str().c_str(), (int)sb.str().length() + 1, 190 | 0, (struct sockaddr *)&server, (int)sizeof(sockaddr_in)) == -1) { 191 | fprintf(stderr, "sendto() failed.\n"); 192 | } 193 | sb.str(""); 194 | } 195 | 196 | private: 197 | std::stringbuf sb; 198 | SOCKET socket; 199 | struct sockaddr_in server; 200 | }; 201 | public: 202 | UdpLogger(char *szHost, 
unsigned uPort, LogLevel level, bool bPrintTimeStamp) 203 | : Logger(level, bPrintTimeStamp), udpOut(szHost, (unsigned short)uPort) {} 204 | UdpOstream& GetStream() { 205 | return udpOut; 206 | } 207 | virtual void FlushStream() { 208 | udpOut.Flush(); 209 | } 210 | private: 211 | UdpOstream udpOut; 212 | }; 213 | }; 214 | 215 | } 216 | 217 | #define LOG_(pLogger, event, level) \ 218 | do { \ 219 | if (!pLogger || !pLogger->ShouldLogFor(level)) { \ 220 | break; \ 221 | } \ 222 | pLogger->EnterCriticalSection(); \ 223 | pLogger->GetStream() \ 224 | << pLogger->GetLead(level, __FILE__, __LINE__, \ 225 | __FUNCTION__) \ 226 | << event << std::endl; \ 227 | pLogger->FlushStream(); \ 228 | pLogger->LeaveCriticalSection(); \ 229 | } while (0); 230 | 231 | #define LOG_TRACE(pLogger, event) LOG_(pLogger, event, simplelogger::TRACE) 232 | #define LOG_DEBUG(pLogger, event) LOG_(pLogger, event, simplelogger::DEBUG) 233 | #define LOG_INFO(pLogger, event) LOG_(pLogger, event, simplelogger::INFO) 234 | #define LOG_WARN(pLogger, event) LOG_(pLogger, event, simplelogger::WARN) 235 | #define LOG_ERROR(pLogger, event) LOG_(pLogger, event, simplelogger::ERR) 236 | 237 | 238 | #endif // LOGGER_H 239 | -------------------------------------------------------------------------------- /interpPlugin.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | 4 | dim3 cuda_gridsize(unsigned int n){ 5 | unsigned int k = (n-1) / BLOCK + 1; 6 | unsigned int x = k; 7 | unsigned int y = 1; 8 | if(x > 65535){ 9 | x = ceil(sqrt(k)); 10 | y = (n-1)/(x*BLOCK) + 1; 11 | } 12 | dim3 d = {x, y, 1}; 13 | return d; 14 | } 15 | 16 | /* nearest neighbor upsampling used in darknet*/ 17 | __global__ void upsample_gpu(int N, const float *x, int w, int h, int c, int batch, int zoomFactor, float *out, const char* mode="nearest") 18 | { 19 | int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; 20 | if(i >= N) return; 21 | int out_index = i; 22 | int out_w = i%(w*zoomFactor); 23 | i = i/(w*zoomFactor); 24 | int out_h = i%(h*zoomFactor); 25 | i = i/(h*zoomFactor); 26 | int _c = i%c; 27 | i = i/_c; 28 | int _b = i%batch; 29 | int in_w = out_w/zoomFactor; 30 | int in_h = out_h/zoomFactor; 31 | int in_offset = _b*c*w*h + _c*w*h; 32 | int in_index00 = in_offset + in_h*w + in_w; 33 | if(mode == "bilinear"){ 34 | int in_index01 = (in_w+1 > w) ? in_index00 : (in_index00 + 1); 35 | int in_index10 = (in_h+1 > h) ? in_index00 : (in_index00 + w); 36 | int in_index11 = (in_index01 == in_index10) ? 
in_index00 : (in_index10 + 1); 37 | 38 | float u = (float)(out_h % zoomFactor)/zoomFactor; 39 | float v = (float)(out_w % zoomFactor)/zoomFactor; 40 | out[out_index] = (1-u)*(1-v)*x[in_index00] + \ 41 | (1-u)*v*x[in_index01] + \ 42 | u*(1-v)*x[in_index10] + \ 43 | u*v*x[in_index11]; 44 | } 45 | else if(mode == "nearest"){ 46 | out[out_index] = x[in_index00]; 47 | } 48 | } 49 | 50 | void interp_gpu(const float *x, int w, int h, int c, int batch, int zoomFactor, float *out, cudaStream_t stream) 51 | { 52 | int outSize = w*zoomFactor*h*zoomFactor*c*batch; 53 | upsample_gpu<<>>(outSize, x, w, h, c, batch, zoomFactor, out); 54 | } 55 | -------------------------------------------------------------------------------- /interpPlugin.h: -------------------------------------------------------------------------------- 1 | #ifndef INTERP_PLUGIN_H 2 | #define INTERP_PLUGIN_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include "NvInfer.h" 9 | #include "NvCaffeParser.h" 10 | #include "NvInferPlugin.h" 11 | #include 12 | 13 | using namespace nvinfer1; 14 | using namespace nvcaffeparser1; 15 | using namespace plugin; 16 | 17 | #define BLOCK 512 18 | #define ZOOM 2 // upsample *2 19 | 20 | void interp_gpu(const float *x, int w, int h, int c, int batch, int zoomFactor, float *out, cudaStream_t stream); 21 | 22 | template 23 | class Interp : public IPlugin 24 | { 25 | public: 26 | Interp() {} 27 | Interp(const void* buffer, size_t size) 28 | { 29 | // assert(size == sizeof(mInputSize)); 30 | // mInputSize = *reinterpret_cast(buffer); 31 | assert(size == sizeof(mInputDims)); 32 | mInputDims = *reinterpret_cast(buffer); 33 | } 34 | ~Interp() {} 35 | 36 | // @ when creating the network 37 | int getNbOutputs() const override 38 | { 39 | return 1; 40 | } 41 | Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override 42 | { 43 | assert(nbInputDims == 1); 44 | assert(index == 0); 45 | assert(inputs[index].nbDims == 3); 46 | 47 | mOutputDims = DimsCHW(inputs[index].d[0], inputs[index].d[1] * zoomFactor, inputs[index].d[2] * zoomFactor); 48 | if (0) { 49 | std::cout << "IPlugin input dim = [" << inputs[index].d[0] << ", " << inputs[index].d[1] 50 | << ", " << inputs[index].d[2] << "]" << std::endl; 51 | std::cout << "IPlugin output dim = [" << mOutputDims.d[0] << ", " << mOutputDims.d[1] 52 | << ", " << mOutputDims.d[2] << "]" << std::endl; 53 | } 54 | return mOutputDims; 55 | } 56 | 57 | // @ when building the engine 58 | void configure(const Dims* inputs, int nbInputs, const Dims* outputs, int nbOutputs, int maxBatchSize) override 59 | { 60 | assert(1 == nbInputs && 1 == nbOutputs); 61 | mInputDims = inputs[0]; 62 | mInputSize = inputs[0].d[0] * inputs[0].d[1] * inputs[0].d[2] * sizeof(float); 63 | // mOutputSize = outputs[0].d[0] * outputs[0].d[1] * outputs[0].d[2] * sizeof(float); 64 | } 65 | size_t getWorkspaceSize(int) const override 66 | { 67 | return 0; 68 | } 69 | 70 | // @ when serializing the engine 71 | size_t getSerializationSize() override 72 | { 73 | return sizeof(mInputDims); 74 | } 75 | void serialize(void* buffer) override 76 | { 77 | // *reinterpret_cast(buffer) = mInputSize; 78 | *reinterpret_cast(buffer) = mInputDims; 79 | } 80 | 81 | // @ when deserializing && executing the engine(at runtime) 82 | int initialize() override 83 | { 84 | return 0; 85 | } 86 | void terminate() override 87 | { 88 | } 89 | int enqueue(int batchSize, const void*const *inputs, void** outputs, void*, cudaStream_t stream) override 90 | { 91 | // TODO: why inputs idx 0? 
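// (Answer to the TODO above: this plugin is configured with exactly one input -- configure() asserts 1 == nbInputs -- so inputs[0] is the layer's single feature map; likewise getNbOutputs() returns 1, so outputs[0] is its single output.)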
92 | interp_gpu((const float*)inputs[0], mInputDims.d[2], mInputDims.d[1], mInputDims.d[0], batchSize, zoomFactor, (float *)outputs[0], stream); // TODO: didnt serialize mInputDims, can we use it? in that case, i serialized mInputDims, instead of mInputSize. 93 | return 0; 94 | } 95 | 96 | protected: 97 | Dims mInputDims; //CHW 98 | Dims mOutputDims; 99 | size_t mInputSize; 100 | // size_t mOutputSize; 101 | }; 102 | 103 | 104 | class PluginFactory : public nvinfer1::IPluginFactory, public nvcaffeparser1::IPluginFactory 105 | { 106 | public: 107 | // @ when building the engine 108 | // caffe parser plugin implementation 109 | bool isPlugin(const char* layerName) override 110 | { 111 | return !(strcmp(layerName, "Interp85") && strcmp(layerName, "Interp97")); 112 | } 113 | virtual nvinfer1::IPlugin* createPlugin(const char* layerName, const nvinfer1::Weights* weights, int nbWeights) override 114 | { 115 | assert(isPlugin(layerName)); 116 | if (!strcmp(layerName, "Interp85")) 117 | { 118 | assert(layerName != "Interp85"); // debug_ 119 | assert(mPluginInterp85.get() == nullptr); 120 | assert(nbWeights == 0 && weights == nullptr); 121 | mPluginInterp85 = std::unique_ptr>(new Interp()); 122 | return mPluginInterp85.get(); 123 | } 124 | else if (!strcmp(layerName, "Interp97")) 125 | { 126 | assert(layerName != "Interp97"); // debug_ 127 | assert(mPluginInterp97.get() == nullptr); 128 | assert(nbWeights == 0 && weights == nullptr); 129 | mPluginInterp97 = std::unique_ptr>(new Interp()); 130 | return mPluginInterp97.get(); 131 | } 132 | else 133 | { 134 | assert(0); 135 | return nullptr; 136 | } 137 | } 138 | 139 | // @ at runtime 140 | IPlugin* createPlugin(const char* layerName, const void* serialData, size_t serialLength) override 141 | { 142 | assert(isPlugin(layerName)); 143 | if (!strcmp(layerName, "Interp85")) 144 | { 145 | assert(mPluginInterp85.get() == nullptr); 146 | mPluginInterp85 = std::unique_ptr>(new Interp(serialData, serialLength)); 147 | return mPluginInterp85.get(); 148 | } 149 | else if (!strcmp(layerName, "Interp97")) 150 | { 151 | assert(mPluginInterp97.get() == nullptr); 152 | mPluginInterp97 = std::unique_ptr>(new Interp(serialData, serialLength)); 153 | return mPluginInterp97.get(); 154 | } 155 | else 156 | { 157 | assert(0); 158 | return nullptr; 159 | } 160 | } 161 | 162 | void destroyPlugin() 163 | { 164 | //mPluginInterp97.release(); mPluginInterp97 = nullptr; 165 | //mPluginInterp85.release(); mPluginInterp85 = nullptr; 166 | } 167 | 168 | std::unique_ptr> mPluginInterp85{ nullptr }; 169 | std::unique_ptr> mPluginInterp97{ nullptr }; 170 | }; 171 | 172 | #endif 173 | -------------------------------------------------------------------------------- /nvUtils.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 1993-2016 NVIDIA Corporation. All rights reserved. 3 | * 4 | * NOTICE TO USER: 5 | * 6 | * This source code is subject to NVIDIA ownership rights under U.S. and 7 | * international Copyright laws. 8 | * 9 | * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE 10 | * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR 11 | * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH 12 | * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF 13 | * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
14 | * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, 15 | * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS 16 | * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE 17 | * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE 18 | * OR PERFORMANCE OF THIS SOURCE CODE. 19 | * 20 | * U.S. Government End Users. This source code is a "commercial item" as 21 | * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of 22 | * "commercial computer software" and "commercial computer software 23 | * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) 24 | * and is provided to the U.S. Government only as a commercial end item. 25 | * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through 26 | * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the 27 | * source code with only those rights set forth herein. 28 | */ 29 | 30 | #ifndef NV_CODEC_UTILS_H 31 | #define NV_CODEC_UTILS_H 32 | 33 | #pragma once 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | 42 | extern simplelogger::Logger *logger; 43 | 44 | #ifdef _WIN32 45 | #ifndef STRCASECMP 46 | #define STRCASECMP _stricmp 47 | #endif 48 | #ifndef STRNCASECMP 49 | #define STRNCASECMP _strnicmp 50 | #endif 51 | #ifndef STRCPY 52 | #define STRCPY(sFilePath, nLength, sPath) strcpy_s(sFilePath, nLength, sPath) 53 | #endif 54 | 55 | #ifndef FOPEN 56 | #define FOPEN(fHandle,filename,mode) fopen_s(&fHandle, filename, mode) 57 | #endif 58 | #ifndef FOPEN_FAIL 59 | #define FOPEN_FAIL(result) (result != 0) 60 | #endif 61 | #ifndef SSCANF 62 | #define SSCANF sscanf_s 63 | #endif 64 | #else 65 | #include 66 | #include 67 | 68 | #ifndef STRCASECMP 69 | #define STRCASECMP strcasecmp 70 | #endif 71 | #ifndef STRNCASECMP 72 | #define STRNCASECMP strncasecmp 73 | #endif 74 | #ifndef STRCPY 75 | #define STRCPY(sFilePath, nLength, sPath) strcpy(sFilePath, sPath) 76 | #endif 77 | 78 | #ifndef FOPEN 79 | #define FOPEN(fHandle,filename,mode) (fHandle = fopen(filename, mode)) 80 | #endif 81 | #ifndef FOPEN_FAIL 82 | #define FOPEN_FAIL(result) (result == NULL) 83 | #endif 84 | #ifndef SSCANF 85 | #define SSCANF sscanf 86 | #endif 87 | #endif 88 | 89 | 90 | #ifdef __cuda_cuda_h__ 91 | inline bool CHECK_(CUresult e, int iLine, const char *szFile) { 92 | if (e != CUDA_SUCCESS) { 93 | LOG_ERROR(logger, "CUDA error " << e << " at line " << iLine << " in file " << szFile); 94 | return false; 95 | } 96 | return true; 97 | } 98 | #endif 99 | 100 | #ifdef __CUDA_RUNTIME_H__ 101 | inline bool CHECK_(cudaError_t e, int iLine, const char *szFile) { 102 | if (e != cudaSuccess) { 103 | LOG_ERROR(logger, "CUDA runtime error " << e << " at line " << iLine << " in file " << szFile); 104 | return false; 105 | } 106 | return true; 107 | } 108 | #endif 109 | 110 | #ifdef _NV_ENCODEAPI_H_ 111 | inline bool CHECK_(NVENCSTATUS e, int iLine, const char *szFile) { 112 | if (e != NV_ENC_SUCCESS) { 113 | LOG_ERROR(logger, "NVENC error " << e << " at line " << iLine << " in file " << szFile); 114 | return false; 115 | } 116 | return true; 117 | } 118 | #endif 119 | 120 | #ifdef _WINERROR_ 121 | inline bool CHECK_(HRESULT e, int iLine, const char *szFile) { 122 | if (e != S_OK) { 123 | LOG_ERROR(logger, "HRESULT error " << e << " at line " << iLine << " in file " << szFile); 124 | return false; 125 | } 126 | return true; 127 | } 128 | #endif 129 | 130 | #if defined(__gl_h_) || defined(__GL_H__) 131 | inline bool 
CHECK_(GLenum e, int iLine, const char *szFile) { 132 | if (e != 0) { 133 | LOG_ERROR(logger, "GLenum error " << e << " at line " << iLine << " in file " << szFile); 134 | return false; 135 | } 136 | return true; 137 | } 138 | #endif 139 | 140 | #define ck(call) CHECK_(call, __LINE__, __FILE__) 141 | /* 142 | */ 143 | 144 | #ifdef _WIN32 145 | #include 146 | #else 147 | #include 148 | inline int _getch( ) { 149 | struct termios oldt, newt; 150 | int ch; 151 | tcgetattr( STDIN_FILENO, &oldt ); 152 | newt = oldt; 153 | newt.c_lflag &= ~( ICANON | ECHO ); 154 | tcsetattr( STDIN_FILENO, TCSANOW, &newt ); 155 | ch = getchar(); 156 | tcsetattr( STDIN_FILENO, TCSANOW, &oldt ); 157 | return ch; 158 | } 159 | #define _stricmp strcasecmp 160 | #endif 161 | 162 | class BufferedFileReader { 163 | public: 164 | BufferedFileReader(const char *szFileName) { 165 | struct stat st; 166 | 167 | if (stat(szFileName, &st) != 0) { 168 | return; 169 | } 170 | 171 | nSize = st.st_size; 172 | pBuf = new uint8_t[nSize]; 173 | if (!pBuf) { 174 | LOG_ERROR(logger, "Failed to allocate memory in BufferedReader"); 175 | return; 176 | } 177 | 178 | FILE *fp = fopen(szFileName, "rb"); 179 | int nRead = fread(pBuf, 1, nSize, fp); 180 | fclose(fp); 181 | 182 | assert(nRead == nSize); 183 | } 184 | ~BufferedFileReader() { 185 | if (pBuf) { 186 | delete[] pBuf; 187 | } 188 | } 189 | bool GetBuffer(uint8_t **ppBuf, int *pnSize) { 190 | if (!pBuf) { 191 | return false; 192 | } 193 | 194 | *ppBuf = pBuf; 195 | *pnSize = nSize; 196 | return true; 197 | } 198 | 199 | private: 200 | uint8_t *pBuf = NULL; 201 | int nSize = 0; 202 | }; 203 | 204 | /* 205 | class YuvConverter { 206 | public: 207 | YuvConverter(int nWidth, int nHeight) : nWidth(nWidth), nHeight(nHeight) { 208 | pu = new uint8_t[nWidth * nHeight / 4]; 209 | } 210 | ~YuvConverter() { 211 | delete pu; 212 | } 213 | void I420ToNv12(uint8_t *pFrame, int nPitch = 0) { 214 | if (nPitch == 0) { 215 | nPitch = nWidth; 216 | } 217 | uint8_t *puv = pFrame + nPitch * nHeight; 218 | if (nPitch == nWidth) { 219 | memcpy(pu, puv, nWidth * nHeight / 4); 220 | } else { 221 | for (int i = 0; i < nHeight / 2; i++) { 222 | memcpy(pu + nWidth / 2 * i, puv + nPitch / 2 * i, nWidth / 2); 223 | } 224 | } 225 | uint8_t *pv = puv + (nPitch / 2) * (nHeight / 2); 226 | for (int y = 0; y < nHeight / 2; y++) { 227 | for (int x = 0; x < nWidth / 2; x++) { 228 | puv[y * nPitch + x * 2] = pu[y * nWidth / 2 + x]; 229 | puv[y * nPitch + x * 2 + 1] = pv[y * nPitch / 2 + x]; 230 | } 231 | } 232 | } 233 | 234 | private: 235 | uint8_t *pu; 236 | int nWidth, nHeight; 237 | }; 238 | */ 239 | class StopWatch { 240 | public: 241 | void Start() { 242 | t0 = std::chrono::high_resolution_clock::now(); 243 | } 244 | double Stop() { 245 | return std::chrono::duration_cast(std::chrono::high_resolution_clock::now().time_since_epoch() - t0.time_since_epoch()).count() / 1.0e9; 246 | } 247 | 248 | private: 249 | std::chrono::time_point t0; 250 | }; 251 | /* 252 | class StopWatchNew { 253 | public: 254 | void Start() { 255 | //t0 = std::chrono::high_resolution_clock::now(); 256 | gettimeofday(&t0, NULL); 257 | } 258 | double Stop() { 259 | struct timeval t1; 260 | gettimeofday(&t1, NULL); 261 | return (t1.tv_sec - t0.tv_sec) + (t1.tv_usec - t0.tv_usec)/1000000; 262 | } 263 | 264 | private: 265 | struct timeval t0; 266 | };*/ 267 | 268 | #endif // NV_CODEC_UTILS_H 269 | -------------------------------------------------------------------------------- /predictions_fp32.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangjinsong3/YOLO-V3-Acceleration/384746bb6d8f61c8def70bbc0b5e04b98c60356e/predictions_fp32.jpg -------------------------------------------------------------------------------- /preproc_yolov3.h: -------------------------------------------------------------------------------- 1 | #ifndef YOLO_PREPROC_H 2 | #define YOLO_PREPROC_H 3 | 4 | #include 5 | #include 6 | #include "opencv2/highgui/highgui_c.h" 7 | #include "opencv2/imgproc/imgproc_c.h" 8 | #include "opencv2/core/version.hpp" 9 | #if CV_MAJOR_VERSION == 3 10 | #include "opencv2/videoio/videoio_c.h" 11 | #endif 12 | 13 | typedef struct { 14 | int w; 15 | int h; 16 | int c; 17 | float *data; 18 | } image; 19 | 20 | image make_empty_image(int w, int h, int c) 21 | { 22 | image out; 23 | out.data = 0; 24 | out.h = h; 25 | out.w = w; 26 | out.c = c; 27 | return out; 28 | } 29 | 30 | image make_image(int w, int h, int c) 31 | { 32 | image out = make_empty_image(w,h,c); 33 | out.data = (float*)calloc(h*w*c, sizeof(float)); 34 | return out; 35 | } 36 | void free_image(image m) 37 | { 38 | if(m.data){ 39 | free(m.data); 40 | } 41 | } 42 | void fill_image(image m, float s) 43 | { 44 | int i; 45 | for(i = 0; i < m.h*m.w*m.c; ++i) m.data[i] = s; 46 | } 47 | 48 | float get_pixel(image m, int x, int y, int c) 49 | { 50 | assert(x < m.w && y < m.h && c < m.c); 51 | return m.data[c*m.h*m.w + y*m.w + x]; 52 | } 53 | 54 | void set_pixel(image m, int x, int y, int c, float val) 55 | { 56 | if (x < 0 || y < 0 || c < 0 || x >= m.w || y >= m.h || c >= m.c) return; 57 | assert(x < m.w && y < m.h && c < m.c); 58 | m.data[c*m.h*m.w + y*m.w + x] = val; 59 | } 60 | void add_pixel(image m, int x, int y, int c, float val) 61 | { 62 | assert(x < m.w && y < m.h && c < m.c); 63 | m.data[c*m.h*m.w + y*m.w + x] += val; 64 | } 65 | void embed_image(image source, image dest, int dx, int dy) 66 | { 67 | int x,y,k; 68 | for(k = 0; k < source.c; ++k){ 69 | for(y = 0; y < source.h; ++y){ 70 | for(x = 0; x < source.w; ++x){ 71 | float val = get_pixel(source, x,y,k); 72 | set_pixel(dest, dx+x, dy+y, k, val); 73 | } 74 | } 75 | } 76 | } 77 | 78 | void ipl_into_image(IplImage* src, image im) 79 | { 80 | unsigned char *data = (unsigned char *)src->imageData; 81 | int h = src->height; 82 | int w = src->width; 83 | int c = src->nChannels; 84 | int step = src->widthStep; 85 | int i, j, k; 86 | 87 | for(i = 0; i < h; ++i){ 88 | for(k= 0; k < c; ++k){ 89 | for(j = 0; j < w; ++j){ 90 | im.data[k*w*h + i*w + j] = data[i*step + j*c + k]/255.; 91 | } 92 | } 93 | } 94 | } 95 | 96 | image ipl_to_image(IplImage* src) 97 | { 98 | // ross 99 | if (0 == src) { 100 | printf("file %s, line %d, src == 0\n", __FILE__, __LINE__); 101 | exit(0); 102 | } 103 | int h = src->height; 104 | int w = src->width; 105 | int c = src->nChannels; 106 | image out = make_image(w, h, c); 107 | ipl_into_image(src, out); 108 | return out; 109 | } 110 | 111 | void rgbgr_image(image im) 112 | { 113 | int i; 114 | for(i = 0; i < im.w*im.h; ++i){ 115 | float swap = im.data[i]; 116 | im.data[i] = im.data[i+im.w*im.h*2]; 117 | im.data[i+im.w*im.h*2] = swap; 118 | } 119 | } 120 | 121 | image load_image_cv(char *filename, int channels) 122 | { 123 | IplImage* src = 0; 124 | int flag = -1; 125 | if (channels == 0) flag = -1; 126 | else if (channels == 1) flag = 0; 127 | else if (channels == 3) flag = 1; 128 | else { 129 | fprintf(stderr, "OpenCV can't force load with %d channels\n", channels); 130 | } 131 | 
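/* With the legacy OpenCV C API used here, the cvLoadImage flags are
 * CV_LOAD_IMAGE_UNCHANGED (-1), CV_LOAD_IMAGE_GRAYSCALE (0) and CV_LOAD_IMAGE_COLOR (1),
 * which is why channels == 0/1/3 select flag = -1/0/1 above; any other channel count only
 * prints the warning and falls through with flag still -1 (load unchanged). */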
132 | if( (src = (IplImage*)cvLoadImage(filename, flag)) == NULL ) 133 | { 134 | fprintf(stderr, "Cannot load image \"%s\"\n", filename); 135 | exit(0); 136 | } 137 | image out = ipl_to_image(src); 138 | cvReleaseImage(&src); 139 | rgbgr_image(out); 140 | return out; 141 | } 142 | 143 | image resize_image(image im, int w, int h) 144 | { 145 | image resized = make_image(w, h, im.c); 146 | image part = make_image(w, im.h, im.c); 147 | int r, c, k; 148 | float w_scale = (float)(im.w - 1) / (w - 1); 149 | float h_scale = (float)(im.h - 1) / (h - 1); 150 | for(k = 0; k < im.c; ++k){ 151 | for(r = 0; r < im.h; ++r){ 152 | for(c = 0; c < w; ++c){ 153 | float val = 0; 154 | if(c == w-1 || im.w == 1){ 155 | val = get_pixel(im, im.w-1, r, k); 156 | } else { 157 | float sx = c*w_scale; 158 | int ix = (int) sx; 159 | float dx = sx - ix; 160 | val = (1 - dx) * get_pixel(im, ix, r, k) + dx * get_pixel(im, ix+1, r, k); 161 | } 162 | set_pixel(part, c, r, k, val); 163 | } 164 | } 165 | } 166 | for(k = 0; k < im.c; ++k){ 167 | for(r = 0; r < h; ++r){ 168 | float sy = r*h_scale; 169 | int iy = (int) sy; 170 | float dy = sy - iy; 171 | for(c = 0; c < w; ++c){ 172 | float val = (1-dy) * get_pixel(part, c, iy, k); 173 | set_pixel(resized, c, r, k, val); 174 | } 175 | if(r == h-1 || im.h == 1) continue; 176 | for(c = 0; c < w; ++c){ 177 | float val = dy * get_pixel(part, c, iy+1, k); 178 | add_pixel(resized, c, r, k, val); 179 | } 180 | } 181 | } 182 | 183 | free_image(part); 184 | return resized; 185 | } 186 | 187 | image load_image(char *filename, int w, int h, int c) 188 | { 189 | image out = load_image_cv(filename, c); 190 | 191 | if((h && w) && (h != out.h || w != out.w)){ 192 | image resized = resize_image(out, w, h); 193 | free_image(out); 194 | out = resized; 195 | } 196 | return out; 197 | } 198 | 199 | 200 | image load_image_color(char *filename, int w, int h) 201 | { 202 | return load_image(filename, w, h, 3); 203 | } 204 | 205 | image copy_image(image p) 206 | { 207 | image copy = p; 208 | copy.data = (float*)calloc(p.h*p.w*p.c, sizeof(float)); 209 | memcpy(copy.data, p.data, p.h*p.w*p.c*sizeof(float)); 210 | return copy; 211 | } 212 | 213 | void save_image_jpg(image p, const char *name) 214 | { 215 | image copy = copy_image(p); 216 | if(p.c == 3) rgbgr_image(copy); 217 | int x,y,k; 218 | 219 | char buff[256]; 220 | sprintf(buff, "%s.jpg", name); 221 | 222 | IplImage *disp = cvCreateImage(cvSize(p.w,p.h), IPL_DEPTH_8U, p.c); 223 | int step = disp->widthStep; 224 | for(y = 0; y < p.h; ++y){ 225 | for(x = 0; x < p.w; ++x){ 226 | for(k= 0; k < p.c; ++k){ 227 | disp->imageData[y*step + x*p.c + k] = (unsigned char)(get_pixel(copy,x,y,k)*255); 228 | } 229 | } 230 | } 231 | cvSaveImage(buff, disp,0); 232 | cvReleaseImage(&disp); 233 | free_image(copy); 234 | } 235 | 236 | void save_image(image im, const char *name) 237 | { 238 | save_image_jpg(im, name); 239 | } 240 | 241 | 242 | image letterbox_image(image im, int w, int h) 243 | { 244 | int new_w = im.w; 245 | int new_h = im.h; 246 | if (((float)w/im.w) < ((float)h/im.h)) { 247 | new_w = w; 248 | new_h = (im.h * w)/im.w; 249 | } else { 250 | new_h = h; 251 | new_w = (im.w * h)/im.h; 252 | } 253 | image resized = resize_image(im, new_w, new_h); 254 | image boxed = make_image(w, h, im.c); 255 | fill_image(boxed, .5); 256 | embed_image(resized, boxed, (w-new_w)/2, (h-new_h)/2); 257 | free_image(resized); 258 | return boxed; 259 | } 260 | 261 | #endif 262 | -------------------------------------------------------------------------------- /regionLayer.cu: 
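A quick worked example of the letterbox_image() arithmetic above; the 640x480 source size and 416x416 network size are illustrative values, not taken from the sources.

// letterboxing a 640x480 image into a 416x416 network input:
//   416.f/640 = 0.65 < 416.f/480 = 0.867, so the width branch is taken:
//   new_w = 416, new_h = (480*416)/640 = 312
// the 416x312 resize is embedded at ((416-416)/2, (416-312)/2) = (0, 52),
// leaving 52 rows of the 0.5-filled canvas above and below the picture
int im_w = 640, im_h = 480, w = 416, h = 416;   // hypothetical sizes
int new_w, new_h;
if (((float)w/im_w) < ((float)h/im_h)) { new_w = w; new_h = (im_h * w)/im_w; }   // -> 416 x 312
else                                   { new_h = h; new_w = (im_w * h)/im_h; }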
-------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | /** \brief kernel for softmax 5 | * - n is the number of classes (included the background) 6 | * 7 | * - The CPU implementation is 8 | * for b in batch: 9 | * for g in groups: 10 | * softmax(input + b*batchOffset + g*groupOffset, n, temp, stride, output + b*batchOffset + g*groupOffset) 11 | * 12 | * - The GPU implementation put the two for-loop into parallel. 13 | * 14 | * - nthdsPerCTA: the max number of threads per block. 15 | * - Each thread will in charge of one point softmax for all classes. 16 | * - Total number of threads: batch * groups 17 | * 18 | * - TODO: using warp shuffle instead of loop in one thread. 19 | */ 20 | template 21 | __launch_bounds__(nthdsPerCTA) 22 | __global__ void softmaxKernel(const float * input, 23 | const int n, 24 | const int batch, 25 | const int batchOffset, 26 | const int groups, 27 | const int groupOffset, 28 | const int stride, 29 | const float temp, 30 | float * output) 31 | { 32 | int id = blockIdx.x * nthdsPerCTA + threadIdx.x; 33 | 34 | // per batch, per group 35 | if (id < batch * groups) 36 | { 37 | int b = id / groups; 38 | int g = id % groups; 39 | float sum = 0.; 40 | float largest = -FLT_MAX; 41 | int offset = b*batchOffset + g*groupOffset; 42 | for (int i = 0; i < n; ++i) 43 | { 44 | float val = input[i*stride + offset]; 45 | largest = (val > largest) ? val : largest; 46 | } 47 | for (int i = 0; i < n; ++i) 48 | { 49 | float e = exp(input[i*stride + offset]/temp - largest/temp); // bound score in (-inf,0], and denominator fractor in (0,1]. 50 | sum += e; 51 | output[i*stride + offset] = e; 52 | } 53 | for (int i = 0; i < n; ++i) 54 | output[i*stride + offset] /= sum; 55 | } 56 | } 57 | 58 | 59 | /** 60 | * \brief Sigmoid function 61 | * 62 | * "__launch_bounds__" ensures the universality of kernel 63 | */ 64 | template 65 | __launch_bounds__(nthdsPerCTA) 66 | __global__ void activateKernel(float * data, 67 | const int range) 68 | { 69 | int i = blockIdx.x * nthdsPerCTA + threadIdx.x; 70 | if (i < range) 71 | data[i] = 1. / (1. + exp(-data[i])); 72 | } 73 | 74 | /** 75 | * \brief region layer of YOLOv3 76 | * Includes activation and softmax. 77 | * - num: # bounding box per location 78 | * 79 | * If we integrated into tensorRT, we can use input and output are different memory. 80 | * If it is standalone GPU code (in main.cpp), we can use input and output the same buffer. 81 | * 82 | * Note: The elements in YOLOv3 83 | * * 4*nCells coords, 84 | * * nCells conf, 85 | * * classes*nCells classes 86 | * e.g. 87 | * * nCells for 0 class (background) 88 | * * nCells for 1 class 89 | * * ... 90 | */ 91 | void regionLayer_gpu( 92 | const int batch, 93 | const int C, 94 | const int nCells, 95 | const int num, 96 | const int coords, 97 | const int classes, 98 | const float * input, 99 | float * output, 100 | cudaStream_t stream) 101 | { 102 | const int blockSize = 256; 103 | const int gridSize_Act1 = (2*nCells + blockSize - 1) / blockSize; // x, y 104 | const int gridSize_Act2 = (nCells + blockSize - 1) / blockSize; // conf 105 | const int gridSize_Softmax = (nCells + blockSize - 1) / blockSize; // classes 106 | // for YOLOv3, the output of final layer is C*nCells, in which, C includes all the conf, coord, and claesses. 
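    // Worked example with illustrative numbers (VOC model, not taken verbatim from the sources):
    // classes = 20, coords = 4 and num = 3 anchors per cell, so each anchor occupies
    // (coords + 1 + classes) = 25 channels and C = num * 25 = 75. With a 416x416 input the
    // three YOLOv3 output grids are 13x13, 26x26 and 52x52, i.e. nCells = 169 + 676 + 2704 = 3549
    // after concatenation, so one image holds C * nCells = 75 * 3549 = 266175 floats
    // (904995 for COCO's 80 classes) -- the same bounds noted in reorgOutputKernel below.
    // For these sizes, gridSize_Act1 = (2*3549 + 255)/256 = 28 blocks per launch.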
107 | 108 | #ifdef REGION_IN_TRT 109 | // TRT, input and output are diff buffer 110 | ck(cudaMemcpy((void*)output, (void*)input, batch*C*nCells*sizeof(float), cudaMemcpyDeviceToDevice)); 111 | #endif 112 | // else input and output can be same buffer 113 | 114 | for (int b = 0; b < batch; ++b) { 115 | for (int n = 0; n < num; ++n) { 116 | // activate on (x,y) 117 | int index = b*C*nCells // per batch 118 | + n*nCells*(coords+classes+1); // coords, classes and confidence 119 | activateKernel 120 | <<>> 121 | (output + index, 2*nCells); 122 | 123 | // activate on probes on conf 124 | index = b*C*nCells 125 | + n*nCells*(coords+classes+1) 126 | + 4*nCells; // skip coords 127 | activateKernel 128 | <<>> 129 | (output + index, nCells); 130 | 131 | // softmax for all classes 132 | index = b*C*nCells 133 | + n*nCells*(coords+classes+1) 134 | + 5*nCells; // skip conf 135 | softmaxKernel 136 | <<>> 137 | (input + index, // input: skip loc, conf 138 | classes, // n: #classes 139 | batch*num, // batch: batch * #bound_box 140 | (C*nCells/num), // batchOffset: number of bounding_box in total 141 | nCells, // groups 142 | 1, // groupOffset 143 | nCells, // stride 144 | 1.f, // temp 145 | output + index); // output 146 | } 147 | } 148 | } 149 | 150 | #define nOutputLayer 3 151 | template 152 | __launch_bounds__(nthdsPerCTA) 153 | __global__ void reorgOutputKernel( 154 | const int nBatch, 155 | const int nClasses, 156 | const int nBboxesPerLoc, 157 | const int coords, 158 | const int l0_w, 159 | const int l0_h, 160 | const int nCells, 161 | float* dpData_unordered[], 162 | float* dpData) 163 | { 164 | long i = blockIdx.x * nthdsPerCTA + threadIdx.x; 165 | const int bboxMemLen = (nClasses + coords + 1) * nCells; 166 | const int batchMemLen = nBboxesPerLoc * bboxMemLen; 167 | const long range = nBatch * batchMemLen; 168 | if (i < range) // voc<266175 coco<904995 wrt. 416*416 input 169 | { 170 | int b = i / batchMemLen; 171 | int bboxIdx = (i % batchMemLen) / bboxMemLen; 172 | int channelIdx = ((i % batchMemLen) % bboxMemLen) / nCells; 173 | int locIdx = (i % batchMemLen) % nCells; 174 | int locLayer, cnt_offset = 1+2*2+4*4; 175 | for(int j = nOutputLayer-1; j >= 0; --j){ 176 | cnt_offset -= (1<= cnt_offset*l0_w*l0_h){ 178 | locLayer = j; 179 | break; 180 | } 181 | } 182 | dpData[i] = dpData_unordered[locLayer]\ 183 | [b*nBboxesPerLoc*(nClasses+coords+1)*(1< 206 | <<>> 207 | (nBatch, nClasses, nBboxesPerLoc, coords, l0_w, l0_h, nCells, dpData_unordered, dpData); 208 | 209 | } 210 | -------------------------------------------------------------------------------- /regionLayer.cu~: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | /** \brief kernel for softmax 5 | * - n is the number of classes (included the background) 6 | * 7 | * - The CPU implementation is 8 | * for b in batch: 9 | * for g in groups: 10 | * softmax(input + b*batchOffset + g*groupOffset, n, temp, stride, output + b*batchOffset + g*groupOffset) 11 | * 12 | * - The GPU implementation put the two for-loop into parallel. 13 | * 14 | * - nthdsPerCTA: the max number of threads per block. 15 | * - Each thread will in charge of one point softmax for all classes. 16 | * - Total number of threads: batch * groups 17 | * 18 | * - TODO: using warp shuffle instead of loop in one thread. 
19 | */ 20 | template 21 | __launch_bounds__(nthdsPerCTA) 22 | __global__ void softmaxKernel(const float * input, 23 | const int n, 24 | const int batch, 25 | const int batchOffset, 26 | const int groups, 27 | const int groupOffset, 28 | const int stride, 29 | const float temp, 30 | float * output) 31 | { 32 | int id = blockIdx.x * nthdsPerCTA + threadIdx.x; 33 | 34 | // per batch, per group 35 | if (id < batch * groups) 36 | { 37 | int b = id / groups; 38 | int g = id % groups; 39 | float sum = 0.; 40 | float largest = -FLT_MAX; 41 | int offset = b*batchOffset + g*groupOffset; 42 | for (int i = 0; i < n; ++i) 43 | { 44 | float val = input[i*stride + offset]; 45 | largest = (val > largest) ? val : largest; 46 | } 47 | for (int i = 0; i < n; ++i) 48 | { 49 | float e = exp(input[i*stride + offset]/temp - largest/temp); // bound score in (-inf,0], and denominator fractor in (0,1]. 50 | sum += e; 51 | output[i*stride + offset] = e; 52 | } 53 | for (int i = 0; i < n; ++i) 54 | output[i*stride + offset] /= sum; 55 | } 56 | } 57 | 58 | 59 | /** 60 | * \brief Sigmoid function 61 | * 62 | * "__launch_bounds__" ensures the universality of kernel 63 | */ 64 | template 65 | __launch_bounds__(nthdsPerCTA) 66 | __global__ void activateKernel(float * data, 67 | const int range) 68 | { 69 | int i = blockIdx.x * nthdsPerCTA + threadIdx.x; 70 | if (i < range) 71 | data[i] = 1. / (1. + exp(-data[i])); 72 | } 73 | 74 | /** 75 | * \brief region layer of YOLOv3 76 | * Includes activation and softmax. 77 | * - num: # bounding box per location 78 | * 79 | * If we integrated into tensorRT, we can use input and output are different memory. 80 | * If it is standalone GPU code (in main.cpp), we can use input and output the same buffer. 81 | * 82 | * Note: The elements in YOLOv2 83 | * * 4*nCells coords, 84 | * * nCells conf, 85 | * * classes*nCells classes 86 | * e.g. 87 | * * nCells for 0 class (background) 88 | * * nCells for 1 class 89 | * * ... 90 | */ 91 | void regionLayer_gpu( 92 | const int batch, 93 | const int C, 94 | const int nCells, 95 | const int num, 96 | const int coords, 97 | const int classes, 98 | const float * input, 99 | float * output, 100 | cudaStream_t stream) 101 | { 102 | const int blockSize = 256; 103 | const int gridSize_Act1 = (2*nCells + blockSize - 1) / blockSize; // x, y 104 | const int gridSize_Act2 = (nCells + blockSize - 1) / blockSize; // conf 105 | const int gridSize_Softmax = (nCells + blockSize - 1) / blockSize; // classes 106 | // for YOLOv3, the output of final layer is C*nCells, in which, C includes all the conf, coord, and claesses. 
107 | 108 | #ifdef REGION_IN_TRT 109 | // TRT, input and output are diff buffer 110 | ck(cudaMemcpy((void*)output, (void*)input, batch*C*nCells*sizeof(float), cudaMemcpyDeviceToDevice)); 111 | #endif 112 | // else input and output can be same buffer 113 | 114 | for (int b = 0; b < batch; ++b) { 115 | for (int n = 0; n < num; ++n) { 116 | // activate on (x,y) 117 | int index = b*C*nCells // per batch 118 | + n*nCells*(coords+classes+1); // coords, classes and confidence 119 | activateKernel 120 | <<>> 121 | (output + index, 2*nCells); 122 | 123 | // activate on probes on conf 124 | index = b*C*nCells 125 | + n*nCells*(coords+classes+1) 126 | + 4*nCells; // skip coords 127 | activateKernel 128 | <<>> 129 | (output + index, nCells); 130 | 131 | // softmax for all classes 132 | index = b*C*nCells 133 | + n*nCells*(coords+classes+1) 134 | + 5*nCells; // skip conf 135 | softmaxKernel 136 | <<>> 137 | (input + index, // input: skip loc, conf 138 | classes, // n: #classes 139 | batch*num, // batch: batch * #bound_box 140 | (C*nCells/num), // batchOffset: number of bounding_box in total 141 | nCells, // groups 142 | 1, // groupOffset 143 | nCells, // stride 144 | 1.f, // temp 145 | output + index); // output 146 | } 147 | } 148 | } 149 | 150 | #define nOutputLayer 3 151 | template 152 | __launch_bounds__(nthdsPerCTA) 153 | __global__ void reorgOutputKernel( 154 | const int nBatch, 155 | const int nClasses, 156 | const int nBboxesPerLoc, 157 | const int coords, 158 | const int l0_w, 159 | const int l0_h, 160 | const int nCells, 161 | float* dpData_unordered[], 162 | float* dpData) 163 | { 164 | long i = blockIdx.x * nthdsPerCTA + threadIdx.x; 165 | const int bboxMemLen = (nClasses + coords + 1) * nCells; 166 | const int batchMemLen = nBboxesPerLoc * bboxMemLen; 167 | const long range = nBatch * batchMemLen; 168 | if (i < range) // voc<266175 coco<904995 wrt. 
416*416 input 169 | { 170 | int b = i / batchMemLen; 171 | int bboxIdx = (i % batchMemLen) / bboxMemLen; 172 | int channelIdx = ((i % batchMemLen) % bboxMemLen) / nCells; 173 | int locIdx = (i % batchMemLen) % nCells; 174 | int locLayer, cnt_offset = 1+2*2+4*4; 175 | for(int j = nOutputLayer-1; j >= 0; --j){ 176 | cnt_offset -= (1<= cnt_offset*l0_w*l0_h){ 178 | locLayer = j; 179 | break; 180 | } 181 | } 182 | dpData[i] = dpData_unordered[locLayer]\ 183 | [b*nBboxesPerLoc*(nClasses+coords+1)*(1< 206 | <<>> 207 | (nBatch, nClasses, nBboxesPerLoc, coords, l0_w, l0_h, nCells, dpData_unordered, dpData); 208 | 209 | } 210 | -------------------------------------------------------------------------------- /regionLayer.h: -------------------------------------------------------------------------------- 1 | #ifndef REGION_LAYER_H_ 2 | #define REGION_LAYER_H_ 3 | 4 | #include "nvUtils.h" 5 | 6 | class regionParams{ 7 | public: 8 | int classes; // number of class 9 | int n; // number of bbox per location 10 | int coords; // number of coords (4) 11 | int w; // w (darknet) 12 | int h; // h (darknet), in total, we have w*h*n bbox 13 | int outputs; // outputs (darknet), output dimension of previous layer, 14 | 15 | bool softmax; // 1 for softmax process 16 | int background; // background index 17 | }; 18 | 19 | typedef struct{ 20 | float x, y, w, h; 21 | } box; 22 | 23 | void regionLayer_gpu(const int batch, 24 | const int C, 25 | const int nCells, 26 | const int num, 27 | const int coords, 28 | const int classes, 29 | const float * input, 30 | float * output, 31 | cudaStream_t stream); 32 | 33 | void reorgOutput_gpu(const int nBatch, 34 | const int nClasses, 35 | const int nBboxesPerLoc, 36 | const int coords, 37 | const int l0_w, 38 | const int l0_h, 39 | const int nCells, 40 | float* dpData_unordered[], 41 | float* dpData, 42 | const long nData, 43 | cudaStream_t stream); 44 | #endif 45 | -------------------------------------------------------------------------------- /regionLayer.h~: -------------------------------------------------------------------------------- 1 | #ifndef REGION_LAYER_H_ 2 | #define REGION_LAYER_H_ 3 | 4 | #include "nvUtils.h" 5 | 6 | class regionParams{ 7 | public: 8 | int classes; // number of class 9 | int n; // number of bbox per location 10 | int coords; // number of coords (4) 11 | int w; // w (darknet) 12 | int h; // h (darknet), in total, we have w*h*n bbox 13 | int outputs; // outputs (darknet), output dimension of previous layer, 14 | 15 | bool softmax; // 1 for softmax process 16 | int background; // background index 17 | }; 18 | 19 | typedef struct{ 20 | float x, y, w, h; 21 | } box; 22 | 23 | void regionLayer_gpu(const int batch, 24 | const int C, 25 | const int nCells, 26 | const int num, 27 | const int coords, 28 | const int classes, 29 | const float * input, 30 | float * output, 31 | cudaStream_t stream); 32 | 33 | void reorgOutput_gpu(const int nBatch, 34 | const int nClasses, 35 | const int nBboxesPerLoc, 36 | const int coords, 37 | const int l0_w, 38 | const int l0_h, 39 | const int nCells, 40 | float* __constant__ dpData_unordered[], 41 | float* dpData, 42 | const long nData, 43 | cudaStream_t stream); 44 | #endif 45 | -------------------------------------------------------------------------------- /results/calc_mAP.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | from voc_eval import voc_eval 4 | 5 | 6 | names = ['aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 7 | 'bus', 'car', 
'cat', 'chair', 'cow', 8 | 'diningtable', 'dog', 'horse', 'motorbike', 'person', 9 | 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor'] 10 | 11 | iou_threshold = float(sys.argv[1]) 12 | print 'IOU threshold %.5f' % iou_threshold 13 | 14 | mAP = [] 15 | for name in names: 16 | recall, precision, ap = voc_eval( 17 | # change this to your results file 18 | './comp4_det_test_{}.txt', 19 | # change these 2 to your voc dataset 20 | '/home/weisong/_yolov3/YOLOv3-darknet/data_voc/VOCdevkit/VOC2007/Annotations/{}.xml', 21 | '/home/weisong/_yolov3/YOLOv3-darknet/data_voc/VOCdevkit/VOC2007/ImageSets/Main/test.txt', 22 | name, 23 | './cache/', 24 | iou_threshold) 25 | 26 | print "%-15s %.5f" % (name, ap) 27 | mAP.append(ap) 28 | 29 | ret = (float)(sum(mAP) / len(mAP)) 30 | print 'mAP = %.5f' % ret 31 | exit(ret) 32 | -------------------------------------------------------------------------------- /results/calc_mAP.py~: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | from voc_eval import voc_eval 4 | 5 | 6 | names = ['aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 7 | 'bus', 'car', 'cat', 'chair', 'cow', 8 | 'diningtable', 'dog', 'horse', 'motorbike', 'person', 9 | 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor'] 10 | 11 | iou_threshold = float(sys.argv[1]) 12 | print 'IOU threshold %.5f' % iou_threshold 13 | 14 | mAP = [] 15 | for name in names: 16 | recall, precision, ap = voc_eval( 17 | # change this to your results file 18 | './comp4_det_test_{}.txt', 19 | # change these 2 to your voc dataset 20 | '/home/weisong/_yolov3/YOLOv3-darknet/data_voc/VOCdevkit/VOC2007/Annotations/{}.xml', 21 | '/home/weisong/_yolov3/YOLOv3-darknet/data_voc/VOCdevkit/VOC2007/ImageSets/Main/test.txt', 22 | name, 23 | './cache/', 24 | iou_threshold) 25 | 26 | print "%-15s %.5f" % (name, ap) 27 | mAP.append(ap) 28 | 29 | print 'mAP = %.5f' % (sum(mAP) / len(mAP)) 30 | exit((sum(mAP) / len(mAP))) 31 | -------------------------------------------------------------------------------- /results/mAP.csv: -------------------------------------------------------------------------------- 1 | 0.450000 0.003000 0.737443 2 | 0.050000 0.005000 0.685275 3 | 0.050000 0.010000 0.679230 4 | 0.050000 0.015000 0.670850 5 | 0.050000 0.020000 0.671227 6 | 0.050000 0.025000 0.666309 7 | 0.050000 0.030000 0.661274 8 | 0.100000 0.005000 0.700229 9 | 0.100000 0.010000 0.694647 10 | 0.100000 0.015000 0.688362 11 | 0.100000 0.020000 0.683568 12 | 0.100000 0.025000 0.678220 13 | 0.100000 0.030000 0.675738 14 | 0.150000 0.005000 0.713384 15 | 0.150000 0.010000 0.706828 16 | 0.150000 0.015000 0.699336 17 | 0.150000 0.020000 0.697449 18 | 0.150000 0.025000 0.692435 19 | 0.150000 0.030000 0.686543 20 | 0.200000 0.005000 0.715730 21 | 0.200000 0.010000 0.711944 22 | 0.200000 0.015000 0.705454 23 | 0.200000 0.020000 0.701342 24 | 0.200000 0.025000 0.697500 25 | 0.200000 0.030000 0.694639 26 | 0.250000 0.005000 0.725145 27 | 0.250000 0.010000 0.718843 28 | 0.250000 0.015000 0.711384 29 | 0.250000 0.020000 0.706877 30 | 0.250000 0.025000 0.703751 31 | 0.250000 0.030000 0.700477 32 | 0.300000 0.005000 0.724348 33 | 0.300000 0.010000 0.720032 34 | 0.300000 0.015000 0.710854 35 | 0.300000 0.020000 0.711754 36 | 0.300000 0.025000 0.702736 37 | 0.300000 0.030000 0.704244 38 | 0.350000 0.005000 0.726985 39 | 0.350000 0.010000 0.721257 40 | 0.350000 0.015000 0.715500 41 | 0.350000 0.020000 0.711058 42 | 0.350000 0.025000 0.703841 43 | 0.350000 0.030000 0.704462 44 | 0.400000 0.005000 
0.727567 45 | 0.400000 0.010000 0.719089 46 | 0.400000 0.015000 0.718993 47 | 0.400000 0.020000 0.708048 48 | 0.400000 0.025000 0.707436 49 | 0.400000 0.030000 0.704881 50 | 0.400000 0.001000 0.744281 51 | 0.400000 0.002000 0.740347 52 | 0.400000 0.003000 0.736656 53 | 0.400000 0.004000 0.731862 54 | 0.400000 0.005000 0.730218 55 | 0.450000 0.001000 0.745715 56 | 0.450000 0.002000 0.741461 57 | 0.450000 0.003000 0.738510 58 | 0.450000 0.004000 0.732984 59 | 0.450000 0.005000 0.733175 60 | 0.500000 0.001000 0.745760 61 | 0.500000 0.002000 0.741433 62 | 0.500000 0.003000 0.739151 63 | 0.500000 0.004000 0.731776 64 | 0.500000 0.005000 0.732461 65 | 0.550000 0.001000 0.741887 66 | 0.550000 0.002000 0.735486 67 | 0.550000 0.003000 0.735518 68 | 0.550000 0.004000 0.732146 69 | 0.550000 0.005000 0.730386 70 | 0.600000 0.001000 0.736716 71 | 0.600000 0.002000 0.732227 72 | 0.600000 0.003000 0.729687 73 | 0.600000 0.004000 0.723691 74 | 0.600000 0.005000 0.725624 75 | 0.650000 0.001000 0.722332 76 | 0.650000 0.002000 0.719295 77 | 0.650000 0.003000 0.716194 78 | 0.650000 0.004000 0.712005 79 | 0.650000 0.005000 0.713273 80 | 0.700000 0.001000 0.697123 81 | 0.700000 0.002000 0.692214 82 | 0.700000 0.003000 0.694202 83 | 0.700000 0.004000 0.688393 84 | 0.700000 0.005000 0.690033 85 | -------------------------------------------------------------------------------- /results/mAP.csv~: -------------------------------------------------------------------------------- 1 | 0.450000 0.003000 0.737443 2 | 0.000000 0.000000 0.0000000.000000 0.000000 0.0000000.000000 0.000000 99.0000000.000000 0.000000 0.0000000.000000 0.000000 0.00000047.000000 0.000000 0.0000000.000000 0.000000 0.0000000.000000 114.000000 0.0000000.000000 0.000000 0.0000000.000000 0.000000 0.0000000.000000 0.000000 0.00000046.000000 0.000000 0.000000 3 | 0.000000 0.005000 0.0000000.000000 0.000000 0.0000000.000000 0.000000 99.0000000.000000 0.000000 0.0000000.000000 0.000000 0.00000047.000000 0.000000 0.0000000.000000 0.000000 0.0000000.000000 114.000000 0.0000000.000000 0.000000 0.0000000.000000 0.000000 0.0000000.000000 0.000000 0.00000046.000000 0.000000 0.000000 4 | 0.000000 0.010000 0.0000000.000000 0.000000 0.0000000.000000 0.000000 99.0000000.000000 0.000000 0.0000000.000000 0.000000 0.00000047.000000 0.000000 0.0000000.000000 0.000000 0.0000000.000000 114.000000 0.0000000.000000 0.000000 0.0000000.000000 0.000000 0.0000000.000000 0.000000 0.00000046.000000 0.000000 0.000000 5 | 0.050000 0.005000 0.685275 6 | 0.050000 0.010000 0.679230 7 | -------------------------------------------------------------------------------- /results/voc_eval.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast/er R-CNN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Bharath Hariharan 5 | # -------------------------------------------------------- 6 | 7 | import xml.etree.ElementTree as ET 8 | import os 9 | import cPickle 10 | import numpy as np 11 | 12 | def parse_rec(filename): 13 | """ Parse a PASCAL VOC xml file """ 14 | tree = ET.parse(filename) 15 | objects = [] 16 | for obj in tree.findall('object'): 17 | obj_struct = {} 18 | obj_struct['name'] = obj.find('name').text 19 | obj_struct['pose'] = obj.find('pose').text 20 | obj_struct['truncated'] = int(obj.find('truncated').text) 21 | obj_struct['difficult'] = int(obj.find('difficult').text) 22 | bbox = obj.find('bndbox') 23 | obj_struct['bbox'] = 
[int(bbox.find('xmin').text), 24 | int(bbox.find('ymin').text), 25 | int(bbox.find('xmax').text), 26 | int(bbox.find('ymax').text)] 27 | objects.append(obj_struct) 28 | 29 | return objects 30 | 31 | def voc_ap(rec, prec, use_07_metric=False): 32 | """ ap = voc_ap(rec, prec, [use_07_metric]) 33 | Compute VOC AP given precision and recall. 34 | If use_07_metric is true, uses the 35 | VOC 07 11 point method (default:False). 36 | """ 37 | if use_07_metric: 38 | # 11 point metric 39 | ap = 0. 40 | for t in np.arange(0., 1.1, 0.1): 41 | if np.sum(rec >= t) == 0: 42 | p = 0 43 | else: 44 | p = np.max(prec[rec >= t]) 45 | ap = ap + p / 11. 46 | else: 47 | # correct AP calculation 48 | # first append sentinel values at the end 49 | mrec = np.concatenate(([0.], rec, [1.])) 50 | mpre = np.concatenate(([0.], prec, [0.])) 51 | 52 | # compute the precision envelope 53 | for i in range(mpre.size - 1, 0, -1): 54 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 55 | 56 | # to calculate area under PR curve, look for points 57 | # where X axis (recall) changes value 58 | i = np.where(mrec[1:] != mrec[:-1])[0] 59 | 60 | # and sum (\Delta recall) * prec 61 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 62 | return ap 63 | 64 | def voc_eval(detpath, 65 | annopath, 66 | imagesetfile, 67 | classname, 68 | cachedir, 69 | ovthresh=0.5, 70 | use_07_metric=False): 71 | """rec, prec, ap = voc_eval(detpath, 72 | annopath, 73 | imagesetfile, 74 | classname, 75 | [ovthresh], 76 | [use_07_metric]) 77 | 78 | Top level function that does the PASCAL VOC evaluation. 79 | 80 | detpath: Path to detections 81 | detpath.format(classname) should produce the detection results file. 82 | annopath: Path to annotations 83 | annopath.format(imagename) should be the xml annotations file. 84 | imagesetfile: Text file containing the list of images, one image per line. 
85 | classname: Category name (duh) 86 | cachedir: Directory for caching the annotations 87 | [ovthresh]: Overlap threshold (default = 0.5) 88 | [use_07_metric]: Whether to use VOC07's 11 point AP computation 89 | (default False) 90 | """ 91 | # assumes detections are in detpath.format(classname) 92 | # assumes annotations are in annopath.format(imagename) 93 | # assumes imagesetfile is a text file with each line an image name 94 | # cachedir caches the annotations in a pickle file 95 | 96 | # first load gt 97 | if not os.path.isdir(cachedir): 98 | os.mkdir(cachedir) 99 | cachefile = os.path.join(cachedir, 'annots.pkl') 100 | # read list of images 101 | with open(imagesetfile, 'r') as f: 102 | lines = f.readlines() 103 | imagenames = [x.strip() for x in lines] 104 | 105 | if not os.path.isfile(cachefile): 106 | # load annots 107 | recs = {} 108 | for i, imagename in enumerate(imagenames): 109 | recs[imagename] = parse_rec(annopath.format(imagename)) 110 | if i % 100 == 0: 111 | print 'Reading annotation for {:d}/{:d}'.format( 112 | i + 1, len(imagenames)) 113 | # save 114 | print 'Saving cached annotations to {:s}'.format(cachefile) 115 | with open(cachefile, 'w') as f: 116 | cPickle.dump(recs, f) 117 | else: 118 | # load 119 | with open(cachefile, 'r') as f: 120 | recs = cPickle.load(f) 121 | 122 | # extract gt objects for this class 123 | class_recs = {} 124 | npos = 0 125 | for imagename in imagenames: 126 | R = [obj for obj in recs[imagename] if obj['name'] == classname] 127 | bbox = np.array([x['bbox'] for x in R]) 128 | difficult = np.array([x['difficult'] for x in R]).astype(np.bool) 129 | det = [False] * len(R) 130 | npos = npos + sum(~difficult) 131 | class_recs[imagename] = {'bbox': bbox, 132 | 'difficult': difficult, 133 | 'det': det} 134 | 135 | # read dets 136 | detfile = detpath.format(classname) 137 | with open(detfile, 'r') as f: 138 | lines = f.readlines() 139 | 140 | splitlines = [x.strip().split(' ') for x in lines] 141 | image_ids = [x[0] for x in splitlines] 142 | confidence = np.array([float(x[1]) for x in splitlines]) 143 | BB = np.array([[float(z) for z in x[2:]] for x in splitlines]) 144 | 145 | # sort by confidence 146 | sorted_ind = np.argsort(-confidence) 147 | sorted_scores = np.sort(-confidence) 148 | BB = BB[sorted_ind, :] 149 | image_ids = [image_ids[x] for x in sorted_ind] 150 | 151 | # go down dets and mark TPs and FPs 152 | nd = len(image_ids) 153 | tp = np.zeros(nd) 154 | fp = np.zeros(nd) 155 | for d in range(nd): 156 | R = class_recs[image_ids[d]] 157 | bb = BB[d, :].astype(float) 158 | ovmax = -np.inf 159 | BBGT = R['bbox'].astype(float) 160 | 161 | if BBGT.size > 0: 162 | # compute overlaps 163 | # intersection 164 | ixmin = np.maximum(BBGT[:, 0], bb[0]) 165 | iymin = np.maximum(BBGT[:, 1], bb[1]) 166 | ixmax = np.minimum(BBGT[:, 2], bb[2]) 167 | iymax = np.minimum(BBGT[:, 3], bb[3]) 168 | iw = np.maximum(ixmax - ixmin + 1., 0.) 169 | ih = np.maximum(iymax - iymin + 1., 0.) 170 | inters = iw * ih 171 | 172 | # union 173 | uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) + 174 | (BBGT[:, 2] - BBGT[:, 0] + 1.) * 175 | (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters) 176 | 177 | overlaps = inters / uni 178 | ovmax = np.max(overlaps) 179 | jmax = np.argmax(overlaps) 180 | 181 | if ovmax > ovthresh: 182 | if not R['difficult'][jmax]: 183 | if not R['det'][jmax]: 184 | tp[d] = 1. 185 | R['det'][jmax] = 1 186 | else: 187 | fp[d] = 1. 188 | else: 189 | fp[d] = 1. 
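    # Marking logic above: a detection is a true positive only when its best-overlapping
    # ground-truth box for this class has IoU > ovthresh, is not flagged 'difficult', and has
    # not already been claimed by a higher-scoring detection; an IoU <= ovthresh or a repeat
    # match to an already-claimed box is a false positive, while matches to 'difficult' boxes
    # are ignored entirely (neither tp nor fp), which is also why npos excludes difficult objects.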
190 | 191 | # compute precision recall 192 | fp = np.cumsum(fp) 193 | tp = np.cumsum(tp) 194 | rec = tp / float(npos) 195 | # avoid divide by zero in case the first detection matches a difficult 196 | # ground truth 197 | prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) 198 | ap = voc_ap(rec, prec, use_07_metric) 199 | 200 | return rec, prec, ap 201 | -------------------------------------------------------------------------------- /results/voc_eval.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangjinsong3/YOLO-V3-Acceleration/384746bb6d8f61c8def70bbc0b5e04b98c60356e/results/voc_eval.pyc -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | #DEBUG="gdb --args " 4 | DEBUG="cuda-gdb --args " 5 | #DEBUG="ddd --debugger cuda-gdb --args " 6 | #DEBUG="cgdb -d cuda-gdb --args " 7 | #DEBUG="cuda-memcheck " 8 | 9 | MODEL="./data/model/yolov3-voc-relu.caffemodel" 10 | DEPLOY="./data/model/yolov3-voc-relu.prototxt" 11 | # CALIBRATION="./data/model/CalibrationTable" 12 | SYNSET="./data/model/voc.names" 13 | IMAGELIST="./data/images/test.txt" 14 | 15 | DEV_ID=$1 16 | NMS=0.45 # $2 17 | CONF=0.001 # $3 18 | MODE=0 # 0 fp32, 1 fp16, 2 int8 19 | BATCH_SIZE=1 20 | N_ITERS=1 21 | 22 | # Add this argument for INT8 inference. 23 | # Note that only pascal GPU support INT8, like NVIDIA Tesla P4, P40 24 | 25 | 26 | if [ ${MODE} -eq 2 ] 27 | then 28 | ${DEBUG} ./bin/runYOLOv3 -devID=${DEV_ID} \ 29 | -batchSize=${BATCH_SIZE} \ 30 | -nIters=${N_ITERS} \ 31 | -deployFile=${DEPLOY} \ 32 | -modelFile=${MODEL} \ 33 | -synsetFile=${SYNSET} \ 34 | -cali=${CALIBRATION} \ 35 | -imageFile=${IMAGELIST} \ 36 | -nmsThreshold=${NMS} \ 37 | -confThreshold=${CONF} 38 | #2>&1 | tee ./log/log.txt 39 | else 40 | ${DEBUG} ./bin/runYOLOv3 -devID=${DEV_ID} \ 41 | -batchSize=${BATCH_SIZE} \ 42 | -nIters=${N_ITERS} \ 43 | -deployFile=${DEPLOY} \ 44 | -modelFile=${MODEL} \ 45 | -synsetFile=${SYNSET} \ 46 | -imageFile=${IMAGELIST} \ 47 | -nmsThreshold=${NMS} \ 48 | -confThreshold=${CONF} 49 | #2>&1 | tee ./log/log.txt 50 | fi 51 | -------------------------------------------------------------------------------- /run.sh~: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | #DEBUG="gdb --args " 4 | DEBUG="cuda-gdb --args " 5 | #DEBUG="ddd --debugger cuda-gdb --args " 6 | #DEBUG="cgdb -d cuda-gdb --args " 7 | #DEBUG="cuda-memcheck " 8 | 9 | MODEL="./data/model/yolov3-voc-relu.caffemodel" 10 | DEPLOY="./data/model/yolov3-voc-relu.prototxt" 11 | # CALIBRATION="./data/model/CalibrationTable" 12 | SYNSET="./data/model/voc.names" 13 | IMAGELIST="./data/images/test.txt" 14 | 15 | DEV_ID=$1 16 | NMS=0.45 # $2 17 | CONF=0.001 # $3 18 | MODE=0 # 0 fp32, 1 fp16, 2 int8 19 | BATCH_SIZE=1 20 | N_ITERS=1 21 | 22 | # Add this argument for INT8 inference. 
23 | # Note that only pascal GPU support INT8, like NVIDIA Tesla P4, P40 24 | 25 | 26 | if [ ${MODE} -eq 2 ] 27 | then 28 | ${DEBUG} ./bin/runYOLOv3 -devID=${DEV_ID} \ 29 | -batchSize=${BATCH_SIZE} \ 30 | -nIters=${N_ITERS} \ 31 | -deployFile=${DEPLOY} \ 32 | -modelFile=${MODEL} \ 33 | -synsetFile=${SYNSET} \ 34 | -cali=${CALIBRATION} \ 35 | -imageFile=${IMAGELIST} \ 36 | -nmsThreshold=${NMS} \ 37 | -confThreshold=${CONF} 38 | #2>&1 | tee ./log/log.txt 39 | else 40 | ${DEBUG} ./bin/runYOLOv3 -devID=${DEV_ID} \ 41 | -batchSize=${BATCH_SIZE} \ 42 | -nIters=${N_ITERS} \ 43 | -deployFile=${DEPLOY} \ 44 | -modelFile=${MODEL} \ 45 | -synsetFile=${SYNSET} \ 46 | -imageFile=${IMAGELIST} \ 47 | -nmsThreshold=${NMS} \ 48 | -confThreshold=${CONF} 49 | #2>&1 | tee ./log/log.txt 50 | fi 51 | -------------------------------------------------------------------------------- /src/bboxParser.h: -------------------------------------------------------------------------------- 1 | #ifndef BBOX_PARSER_H 2 | #define BBOX_PARSER_H 3 | 4 | #include 5 | // cub for sort 6 | #include "regionLayer.h" 7 | 8 | void sortScoresPerImage_gpu( 9 | const int nBatch, 10 | const int nItemsPerImage, 11 | void * unsorted_scores, 12 | void * unsorted_bbox_indices, 13 | void * sorted_scores, 14 | void * sorted_bbox_indices, 15 | void * workspace, 16 | const size_t maxSizeofWorkspaceInByte, 17 | cudaStream_t stream); 18 | 19 | void splitOutputData_gpu( 20 | const int nBatch, // batch 21 | const int nClasses, 22 | const int nBboxesPerLoc, // #box 23 | const int coords, // x,y,w,h 24 | const int l0_w, 25 | const int l0_h, 26 | const int nCells, 27 | const bool background, // use background conf or not 28 | const bool only_objectness, // no class conf 29 | const float thres, 30 | const float* predictions, 31 | const float* biases, 32 | float* probes, 33 | box* bboxes, 34 | cudaStream_t stream); 35 | 36 | 37 | void correct_region_boxes_gpu( 38 | const int nBatch, // batch 39 | const int nClasses, 40 | const int nBboxesPerLoc, // #box 41 | const int nCells, 42 | const int image_w, 43 | const int image_h, 44 | const int net_input_w, 45 | const int net_input_h, 46 | box* bboxes, 47 | cudaStream_t stream); 48 | 49 | 50 | void sortScoresPerClass_gpu( 51 | const int nBatch, 52 | const int nClasses, 53 | const int nBboxesPerLoc, 54 | const void * probes, 55 | void * sorted_boxIdx, 56 | void * workspace, 57 | const size_t maxSizeofWorkspaceInByte, 58 | cudaStream_t stream); 59 | 60 | 61 | void allClassNMS_gpu( 62 | const int nBatch, //batch 63 | const int nClasses, 64 | const int nBboxesPerLoc, 65 | const int nCells, 66 | const float nms_threshold, 67 | void * bboxes, 68 | void * probes, 69 | void * afterNMS_probes, 70 | void * indexes, 71 | void * afterNMS_indexes, 72 | cudaStream_t stream); 73 | 74 | 75 | size_t getWorkspaceSizeInByte( 76 | const int nBatch, 77 | const int nClasses, 78 | const int nBboxesPerLoc, 79 | const int nCells); 80 | 81 | #endif 82 | -------------------------------------------------------------------------------- /src/bboxParser.h~: -------------------------------------------------------------------------------- 1 | #ifndef BBOX_PARSER_H 2 | #define BBOX_PARSER_H 3 | 4 | #include 5 | // cub for sort 6 | #include "regionLayer.h" 7 | 8 | void sortScoresPerImage_gpu( 9 | const int nBatch, 10 | const int nItemsPerImage, 11 | void * unsorted_scores, 12 | void * unsorted_bbox_indices, 13 | void * sorted_scores, 14 | void * sorted_bbox_indices, 15 | void * workspace, 16 | const size_t maxSizeofWorkspaceInByte, 17 | 
cudaStream_t stream); 18 | 19 | void splitOutputData_gpu( 20 | const int nBatch, // batch 21 | const int nClasses, 22 | const int nBboxesPerLoc, // #box 23 | const int coords, // x,y,w,h 24 | const int l0_w, 25 | const int l0_h, 26 | const int nCells, 27 | const bool background, // use background conf or not 28 | const bool only_objectness, // no class conf 29 | const float thres, 30 | const float* predictions, 31 | const float* biases, 32 | float* probes, 33 | box* bboxes, 34 | cudaStream_t stream); 35 | 36 | 37 | void correct_region_boxes_gpu( 38 | const int nBatch, // batch 39 | const int nClasses, 40 | const int nBboxesPerLoc, // #box 41 | const int nCells, 42 | const int image_w, 43 | const int image_h, 44 | const int net_input_w, 45 | const int net_input_h, 46 | box* bboxes, 47 | cudaStream_t stream); 48 | 49 | 50 | void sortScoresPerClass_gpu( 51 | const int nBatch, 52 | const int nClasses, 53 | const int nBboxesPerLoc, 54 | const void * probes, 55 | void * sorted_boxIdx, 56 | void * workspace, 57 | const size_t maxSizeofWorkspaceInByte, 58 | cudaStream_t stream); 59 | 60 | 61 | void allClassNMS_gpu( 62 | const int nBatch, //batch 63 | const int nClasses, 64 | const int nBboxesPerLoc, 65 | const int w, 66 | const int h, 67 | const float nms_threshold, 68 | void * bboxes, 69 | void * probes, 70 | void * afterNMS_probes, 71 | void * indexes, 72 | void * afterNMS_indexes, 73 | cudaStream_t stream); 74 | 75 | 76 | size_t getWorkspaceSizeInByte( 77 | const int nBatch, 78 | const int nClasses, 79 | const int nBboxesPerLoc, 80 | const int w, 81 | const int h); 82 | 83 | #endif 84 | -------------------------------------------------------------------------------- /src/classifier.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 1993-2016 NVIDIA Corporation. All rights reserved. 3 | * 4 | * NOTICE TO USER: 5 | * 6 | * This source code is subject to NVIDIA ownership rights under U.S. and 7 | * international Copyright laws. 8 | * 9 | * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE 10 | * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR 11 | * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH 12 | * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF 13 | * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 14 | * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, 15 | * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS 16 | * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE 17 | * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE 18 | * OR PERFORMANCE OF THIS SOURCE CODE. 19 | * 20 | * U.S. Government End Users. This source code is a "commercial item" as 21 | * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of 22 | * "commercial computer software" and "commercial computer software 23 | * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) 24 | * and is provided to the U.S. Government only as a commercial end item. 25 | * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through 26 | * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the 27 | * source code with only those rights set forth herein. 
28 | */ 29 | 30 | #ifndef CLASSIFIER_H 31 | #define CLASSIFIER_H 32 | 33 | #include 34 | #include "NvInfer.h" 35 | #include 36 | 37 | using namespace nvinfer1; 38 | 39 | typedef struct INFER_OUTPUT_PARAMS_ { 40 | int nBatchSize_; 41 | std::vector vpInferResults_; 42 | std::vector vnLens_; 43 | std::vector vOutputDims_; 44 | } INFER_OUTPUT_PARAMS; 45 | 46 | class IClassifier { 47 | public: 48 | virtual void setInputData(float *pBGR, 49 | const int nWidth, 50 | const int nHeight, 51 | const int nBatchSize) = 0; 52 | 53 | virtual void forward(INFER_OUTPUT_PARAMS *) = 0; 54 | 55 | virtual int getInferWidth() const = 0; 56 | 57 | virtual int getInferHeight() const = 0; 58 | 59 | virtual std::vector getMeanValues() const = 0; 60 | 61 | protected: 62 | virtual ~IClassifier() {} 63 | }; 64 | 65 | #endif 66 | 67 | 68 | -------------------------------------------------------------------------------- /src/common.cu: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | 3 | // alignptr 4 | int8_t * alignPtr(int8_t * ptr, uintptr_t to) 5 | { 6 | uintptr_t addr = (uintptr_t)ptr; 7 | if (addr % to) { 8 | addr += to - addr % to; 9 | } 10 | return (int8_t *)addr; 11 | } 12 | 13 | // calc next ptr (consider alignment) 14 | int8_t * nextWorkspacePtr(int8_t * ptr, uintptr_t previousWorkspaceSize) 15 | { 16 | uintptr_t addr = (uintptr_t) ptr; 17 | addr += previousWorkspaceSize; 18 | return alignPtr((int8_t *)addr, CUDA_MEM_ALIGN); 19 | } 20 | 21 | 22 | template 23 | __launch_bounds__ (nthds_per_cta) 24 | __global__ void setUniformOffsets_kernel( 25 | const int num_segments, 26 | const int offset, 27 | int * d_offsets) 28 | { 29 | const int idx = blockIdx.x * nthds_per_cta + threadIdx.x; 30 | if (idx <= num_segments){ 31 | d_offsets[idx] = idx * offset; 32 | } 33 | } 34 | 35 | void setUniformOffsets( 36 | const int num_segments, 37 | const int offset, 38 | int * d_offsets, 39 | cudaStream_t stream) 40 | { 41 | const int blockSize = 32; 42 | const int gridSize = (num_segments + 1 + blockSize - 1) / blockSize; 43 | setUniformOffsets_kernel 44 | <<>> 45 | (num_segments, offset, d_offsets); 46 | } 47 | -------------------------------------------------------------------------------- /src/common.h: -------------------------------------------------------------------------------- 1 | #ifndef COMMON_H_ 2 | #define COMMON_H_ 3 | 4 | #include 5 | 6 | #define CUDA_MEM_ALIGN 256 7 | 8 | // alignptr 9 | int8_t * alignPtr(int8_t * ptr, uintptr_t to); 10 | 11 | int8_t * nextWorkspacePtr(int8_t * ptr, uintptr_t previousWorkspaceSize); 12 | 13 | void setUniformOffsets(const int num_segments, const int offset, int * d_offsets, cudaStream_t stream); 14 | 15 | /** 16 | * Determine the usage of temporary memory for cub sort 17 | * The cub::DeviceSegmentedRadixSort can be used for batched (segmented) sort. 
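 * The usual cub calling pattern applies: a first call with a NULL temporary-storage pointer
 * (as in the body below) only writes the required size into temp_storage_bytes; the caller
 * then allocates that many bytes of device memory and calls the same routine again with the
 * real buffer to perform the actual segmented sort.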
18 | */ 19 | template 20 | size_t cubSortPairsWorkspaceSize(int num_items, int num_segments) 21 | { 22 | size_t temp_storage_bytes = 0; 23 | cub::DeviceSegmentedRadixSort::SortPairsDescending( 24 | (void *)NULL, temp_storage_bytes, 25 | (const KeyT *)NULL, (KeyT *)NULL, 26 | (const ValueT *)NULL, (ValueT *)NULL, 27 | num_items, // # items 28 | num_segments, // # segments 29 | (const int *)NULL, (const int *)NULL); 30 | return temp_storage_bytes; 31 | } 32 | 33 | #endif 34 | -------------------------------------------------------------------------------- /src/common.h~: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangjinsong3/YOLO-V3-Acceleration/384746bb6d8f61c8def70bbc0b5e04b98c60356e/src/common.h~ -------------------------------------------------------------------------------- /src/draw.h: -------------------------------------------------------------------------------- 1 | #include "preproc_yolov3.h" 2 | #include "regionLayer.h" 3 | #include "bboxParser.h" 4 | 5 | void draw_box(image a, int x1, int y1, int x2, int y2, float r, float g, float b) 6 | { 7 | int i; 8 | if(x1 < 0) x1 = 0; 9 | if(x1 >= a.w) x1 = a.w-1; 10 | if(x2 < 0) x2 = 0; 11 | if(x2 >= a.w) x2 = a.w-1; 12 | 13 | if(y1 < 0) y1 = 0; 14 | if(y1 >= a.h) y1 = a.h-1; 15 | if(y2 < 0) y2 = 0; 16 | if(y2 >= a.h) y2 = a.h-1; 17 | 18 | for(i = x1; i <= x2; ++i){ 19 | a.data[i + y1*a.w + 0*a.w*a.h] = r; 20 | a.data[i + y2*a.w + 0*a.w*a.h] = r; 21 | 22 | a.data[i + y1*a.w + 1*a.w*a.h] = g; 23 | a.data[i + y2*a.w + 1*a.w*a.h] = g; 24 | 25 | a.data[i + y1*a.w + 2*a.w*a.h] = b; 26 | a.data[i + y2*a.w + 2*a.w*a.h] = b; 27 | } 28 | for(i = y1; i <= y2; ++i){ 29 | a.data[x1 + i*a.w + 0*a.w*a.h] = r; 30 | a.data[x2 + i*a.w + 0*a.w*a.h] = r; 31 | 32 | a.data[x1 + i*a.w + 1*a.w*a.h] = g; 33 | a.data[x2 + i*a.w + 1*a.w*a.h] = g; 34 | 35 | a.data[x1 + i*a.w + 2*a.w*a.h] = b; 36 | a.data[x2 + i*a.w + 2*a.w*a.h] = b; 37 | } 38 | } 39 | 40 | 41 | 42 | void draw_box_width(image a, int x1, int y1, int x2, int y2, int w) 43 | { 44 | int i; 45 | for(i = 0; i < w; ++i){ 46 | draw_box(a, x1+i, y1+i, x2-i, y2-i, 255, 0, 0); 47 | } 48 | } 49 | 50 | 51 | void draw_detections(image im, 52 | int batchIdx, 53 | float thresh, 54 | box *boxes, 55 | float *probs, 56 | int * indexes, 57 | int sizeOfClass, 58 | int sizeOfBatch) 59 | { 60 | int n = batchIdx; 61 | // int sizeOfClass = l.n * l.h * l.w; 62 | // int sizeOfBatch = l.classes * sizeOfClass; 63 | 64 | int count = 0; 65 | for(int i = 0; i < sizeOfBatch; ++i){ 66 | int id = n * sizeOfBatch + i; 67 | int indexes_idx = indexes[id]; 68 | 69 | if (probs[id] > thresh){ 70 | int category = (indexes_idx % sizeOfBatch) / sizeOfClass; 71 | int boxId = indexes_idx % sizeOfClass; 72 | 73 | int width = im.h * .006; 74 | box b = boxes[boxId]; 75 | 76 | int left = (b.x-b.w/2.)*im.w; 77 | int right = (b.x+b.w/2.)*im.w; 78 | int top = (b.y-b.h/2.)*im.h; 79 | int bot = (b.y+b.h/2.)*im.h; 80 | 81 | if(left < 0) left = 0; 82 | if(right > im.w-1) right = im.w-1; 83 | if(top < 0) top = 0; 84 | if(bot > im.h-1) bot = im.h-1; 85 | 86 | draw_box_width(im, left, top, right, bot, width); 87 | } 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /src/draw.h~: -------------------------------------------------------------------------------- 1 | #include "preproc_yolov2.h" 2 | #include "regionLayer.h" 3 | #include "bboxParser.h" 4 | 5 | void draw_box(image a, int x1, int y1, int x2, int y2, float r, float g, float b) 6 | 
{ 7 | int i; 8 | if(x1 < 0) x1 = 0; 9 | if(x1 >= a.w) x1 = a.w-1; 10 | if(x2 < 0) x2 = 0; 11 | if(x2 >= a.w) x2 = a.w-1; 12 | 13 | if(y1 < 0) y1 = 0; 14 | if(y1 >= a.h) y1 = a.h-1; 15 | if(y2 < 0) y2 = 0; 16 | if(y2 >= a.h) y2 = a.h-1; 17 | 18 | for(i = x1; i <= x2; ++i){ 19 | a.data[i + y1*a.w + 0*a.w*a.h] = r; 20 | a.data[i + y2*a.w + 0*a.w*a.h] = r; 21 | 22 | a.data[i + y1*a.w + 1*a.w*a.h] = g; 23 | a.data[i + y2*a.w + 1*a.w*a.h] = g; 24 | 25 | a.data[i + y1*a.w + 2*a.w*a.h] = b; 26 | a.data[i + y2*a.w + 2*a.w*a.h] = b; 27 | } 28 | for(i = y1; i <= y2; ++i){ 29 | a.data[x1 + i*a.w + 0*a.w*a.h] = r; 30 | a.data[x2 + i*a.w + 0*a.w*a.h] = r; 31 | 32 | a.data[x1 + i*a.w + 1*a.w*a.h] = g; 33 | a.data[x2 + i*a.w + 1*a.w*a.h] = g; 34 | 35 | a.data[x1 + i*a.w + 2*a.w*a.h] = b; 36 | a.data[x2 + i*a.w + 2*a.w*a.h] = b; 37 | } 38 | } 39 | 40 | 41 | 42 | void draw_box_width(image a, int x1, int y1, int x2, int y2, int w) 43 | { 44 | int i; 45 | for(i = 0; i < w; ++i){ 46 | draw_box(a, x1+i, y1+i, x2-i, y2-i, 255, 0, 0); 47 | } 48 | } 49 | 50 | 51 | void draw_detections(image im, 52 | int batchIdx, 53 | float thresh, 54 | box *boxes, 55 | float *probs, 56 | int * indexes, 57 | int sizeOfClass, 58 | int sizeOfBatch) 59 | { 60 | int n = batchIdx; 61 | // int sizeOfClass = l.n * l.h * l.w; 62 | // int sizeOfBatch = l.classes * sizeOfClass; 63 | 64 | int count = 0; 65 | for(int i = 0; i < sizeOfBatch; ++i){ 66 | int id = n * sizeOfBatch + i; 67 | int indexes_idx = indexes[id]; 68 | 69 | if (probs[id] > thresh){ 70 | int category = (indexes_idx % sizeOfBatch) / sizeOfClass; 71 | int boxId = indexes_idx % sizeOfClass; 72 | 73 | int width = im.h * .006; 74 | box b = boxes[boxId]; 75 | 76 | int left = (b.x-b.w/2.)*im.w; 77 | int right = (b.x+b.w/2.)*im.w; 78 | int top = (b.y-b.h/2.)*im.h; 79 | int bot = (b.y+b.h/2.)*im.h; 80 | 81 | if(left < 0) left = 0; 82 | if(right > im.w-1) right = im.w-1; 83 | if(top < 0) top = 0; 84 | if(bot > im.h-1) bot = im.h-1; 85 | 86 | draw_box_width(im, left, top, right, bot, width); 87 | } 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /src/interpPlugin.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | 4 | dim3 cuda_gridsize(unsigned int n){ 5 | unsigned int k = (n-1) / BLOCK + 1; 6 | unsigned int x = k; 7 | unsigned int y = 1; 8 | if(x > 65535){ 9 | x = ceil(sqrt(k)); 10 | y = (n-1)/(x*BLOCK) + 1; 11 | } 12 | dim3 d = {x, y, 1}; 13 | return d; 14 | } 15 | 16 | /* nearest neighbor upsampling used in darknet*/ 17 | __global__ void upsample_gpu(int N, const float *x, int w, int h, int c, int batch, int zoomFactor, float *out, const char* mode="nearest") 18 | { 19 | int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; 20 | if(i >= N) return; 21 | int out_index = i; 22 | int out_w = i%(w*zoomFactor); 23 | i = i/(w*zoomFactor); 24 | int out_h = i%(h*zoomFactor); 25 | i = i/(h*zoomFactor); 26 | int _c = i%c; 27 | i = i/_c; 28 | int _b = i%batch; 29 | int in_w = out_w/zoomFactor; 30 | int in_h = out_h/zoomFactor; 31 | int in_offset = _b*c*w*h + _c*w*h; 32 | int in_index00 = in_offset + in_h*w + in_w; 33 | if(mode == "bilinear"){ 34 | int in_index01 = (in_w+1 > w) ? in_index00 : (in_index00 + 1); 35 | int in_index10 = (in_h+1 > h) ? in_index00 : (in_index00 + w); 36 | int in_index11 = (in_index01 == in_index10) ? 
in_index00 : (in_index10 + 1); 37 | 38 | float u = (float)(out_h % zoomFactor)/zoomFactor; 39 | float v = (float)(out_w % zoomFactor)/zoomFactor; 40 | out[out_index] = (1-u)*(1-v)*x[in_index00] + \ 41 | (1-u)*v*x[in_index01] + \ 42 | u*(1-v)*x[in_index10] + \ 43 | u*v*x[in_index11]; 44 | } 45 | else if(mode == "nearest"){ 46 | out[out_index] = x[in_index00]; 47 | } 48 | } 49 | 50 | void interp_gpu(const float *x, int w, int h, int c, int batch, int zoomFactor, float *out, cudaStream_t stream) 51 | { 52 | int outSize = w*zoomFactor*h*zoomFactor*c*batch; 53 | upsample_gpu<<>>(outSize, x, w, h, c, batch, zoomFactor, out); 54 | } 55 | -------------------------------------------------------------------------------- /src/interpPlugin.h: -------------------------------------------------------------------------------- 1 | #ifndef INTERP_PLUGIN_H 2 | #define INTERP_PLUGIN_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include "NvInfer.h" 9 | #include "NvCaffeParser.h" 10 | #include "NvInferPlugin.h" 11 | #include 12 | 13 | using namespace nvinfer1; 14 | using namespace nvcaffeparser1; 15 | using namespace plugin; 16 | 17 | #define BLOCK 512 18 | #define ZOOM 2 // upsample *2 19 | 20 | void interp_gpu(const float *x, int w, int h, int c, int batch, int zoomFactor, float *out, cudaStream_t stream); 21 | 22 | template 23 | class Interp : public IPlugin 24 | { 25 | public: 26 | Interp() {} 27 | Interp(const void* buffer, size_t size) 28 | { 29 | // assert(size == sizeof(mInputSize)); 30 | // mInputSize = *reinterpret_cast(buffer); 31 | assert(size == sizeof(mInputDims)); 32 | mInputDims = *reinterpret_cast(buffer); 33 | } 34 | ~Interp() {} 35 | 36 | // @ when creating the network 37 | int getNbOutputs() const override 38 | { 39 | return 1; 40 | } 41 | Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override 42 | { 43 | assert(nbInputDims == 1); 44 | assert(index == 0); 45 | assert(inputs[index].nbDims == 3); 46 | 47 | mOutputDims = DimsCHW(inputs[index].d[0], inputs[index].d[1] * zoomFactor, inputs[index].d[2] * zoomFactor); 48 | if (0) { 49 | std::cout << "IPlugin input dim = [" << inputs[index].d[0] << ", " << inputs[index].d[1] 50 | << ", " << inputs[index].d[2] << "]" << std::endl; 51 | std::cout << "IPlugin output dim = [" << mOutputDims.d[0] << ", " << mOutputDims.d[1] 52 | << ", " << mOutputDims.d[2] << "]" << std::endl; 53 | } 54 | return mOutputDims; 55 | } 56 | 57 | // @ when building the engine 58 | void configure(const Dims* inputs, int nbInputs, const Dims* outputs, int nbOutputs, int maxBatchSize) override 59 | { 60 | assert(1 == nbInputs && 1 == nbOutputs); 61 | mInputDims = inputs[0]; 62 | mInputSize = inputs[0].d[0] * inputs[0].d[1] * inputs[0].d[2] * sizeof(float); 63 | // mOutputSize = outputs[0].d[0] * outputs[0].d[1] * outputs[0].d[2] * sizeof(float); 64 | } 65 | size_t getWorkspaceSize(int) const override 66 | { 67 | return 0; 68 | } 69 | 70 | // @ when serializing the engine 71 | size_t getSerializationSize() override 72 | { 73 | return sizeof(mInputDims); 74 | } 75 | void serialize(void* buffer) override 76 | { 77 | // *reinterpret_cast(buffer) = mInputSize; 78 | *reinterpret_cast(buffer) = mInputDims; 79 | } 80 | 81 | // @ when deserializing && executing the engine(at runtime) 82 | int initialize() override 83 | { 84 | return 0; 85 | } 86 | void terminate() override 87 | { 88 | } 89 | int enqueue(int batchSize, const void*const *inputs, void** outputs, void*, cudaStream_t stream) override 90 | { 91 | // TODO: why inputs idx 0? 
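        // inputs[0] is the only blob TensorRT passes to this plugin: configure() asserts
        // nbInputs == 1 for the Interp layer. mInputDims is restored by the (buffer, size)
        // constructor when the engine is deserialized, so its extents are valid here at runtime.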
92 | interp_gpu((const float*)inputs[0], mInputDims.d[2], mInputDims.d[1], mInputDims.d[0], batchSize, zoomFactor, (float *)outputs[0], stream); // TODO: didnt serialize mInputDims, can we use it? in that case, i serialized mInputDims, instead of mInputSize. 93 | return 0; 94 | } 95 | 96 | protected: 97 | Dims mInputDims; //CHW 98 | Dims mOutputDims; 99 | size_t mInputSize; 100 | // size_t mOutputSize; 101 | }; 102 | 103 | 104 | class PluginFactory : public nvinfer1::IPluginFactory, public nvcaffeparser1::IPluginFactory 105 | { 106 | public: 107 | // @ when building the engine 108 | // caffe parser plugin implementation 109 | bool isPlugin(const char* layerName) override 110 | { 111 | return !(strcmp(layerName, "Interp85") && strcmp(layerName, "Interp97")); 112 | } 113 | virtual nvinfer1::IPlugin* createPlugin(const char* layerName, const nvinfer1::Weights* weights, int nbWeights) override 114 | { 115 | assert(isPlugin(layerName)); 116 | if (!strcmp(layerName, "Interp85")) 117 | { 118 | assert(layerName != "Interp85"); // debug_ 119 | assert(mPluginInterp85.get() == nullptr); 120 | assert(nbWeights == 0 && weights == nullptr); 121 | mPluginInterp85 = std::unique_ptr>(new Interp()); 122 | return mPluginInterp85.get(); 123 | } 124 | else if (!strcmp(layerName, "Interp97")) 125 | { 126 | assert(layerName != "Interp97"); // debug_ 127 | assert(mPluginInterp97.get() == nullptr); 128 | assert(nbWeights == 0 && weights == nullptr); 129 | mPluginInterp97 = std::unique_ptr>(new Interp()); 130 | return mPluginInterp97.get(); 131 | } 132 | else 133 | { 134 | assert(0); 135 | return nullptr; 136 | } 137 | } 138 | 139 | // @ at runtime 140 | IPlugin* createPlugin(const char* layerName, const void* serialData, size_t serialLength) override 141 | { 142 | assert(isPlugin(layerName)); 143 | if (!strcmp(layerName, "Interp85")) 144 | { 145 | assert(mPluginInterp85.get() == nullptr); 146 | mPluginInterp85 = std::unique_ptr>(new Interp(serialData, serialLength)); 147 | return mPluginInterp85.get(); 148 | } 149 | else if (!strcmp(layerName, "Interp97")) 150 | { 151 | assert(mPluginInterp97.get() == nullptr); 152 | mPluginInterp97 = std::unique_ptr>(new Interp(serialData, serialLength)); 153 | return mPluginInterp97.get(); 154 | } 155 | else 156 | { 157 | assert(0); 158 | return nullptr; 159 | } 160 | } 161 | 162 | void destroyPlugin() 163 | { 164 | //mPluginInterp97.release(); mPluginInterp97 = nullptr; 165 | //mPluginInterp85.release(); mPluginInterp85 = nullptr; 166 | } 167 | 168 | std::unique_ptr> mPluginInterp85{ nullptr }; 169 | std::unique_ptr> mPluginInterp97{ nullptr }; 170 | }; 171 | 172 | #endif 173 | -------------------------------------------------------------------------------- /src/preproc_yolov3.h: -------------------------------------------------------------------------------- 1 | #ifndef YOLO_PREPROC_H 2 | #define YOLO_PREPROC_H 3 | 4 | #include 5 | #include 6 | #include "opencv2/highgui/highgui_c.h" 7 | #include "opencv2/imgproc/imgproc_c.h" 8 | #include "opencv2/core/version.hpp" 9 | #if CV_MAJOR_VERSION == 3 10 | #include "opencv2/videoio/videoio_c.h" 11 | #endif 12 | 13 | typedef struct { 14 | int w; 15 | int h; 16 | int c; 17 | float *data; 18 | } image; 19 | 20 | image make_empty_image(int w, int h, int c) 21 | { 22 | image out; 23 | out.data = 0; 24 | out.h = h; 25 | out.w = w; 26 | out.c = c; 27 | return out; 28 | } 29 | 30 | image make_image(int w, int h, int c) 31 | { 32 | image out = make_empty_image(w,h,c); 33 | out.data = (float*)calloc(h*w*c, sizeof(float)); 34 | return out; 35 | 
} 36 | void free_image(image m) 37 | { 38 | if(m.data){ 39 | free(m.data); 40 | } 41 | } 42 | void fill_image(image m, float s) 43 | { 44 | int i; 45 | for(i = 0; i < m.h*m.w*m.c; ++i) m.data[i] = s; 46 | } 47 | 48 | float get_pixel(image m, int x, int y, int c) 49 | { 50 | assert(x < m.w && y < m.h && c < m.c); 51 | return m.data[c*m.h*m.w + y*m.w + x]; 52 | } 53 | 54 | void set_pixel(image m, int x, int y, int c, float val) 55 | { 56 | if (x < 0 || y < 0 || c < 0 || x >= m.w || y >= m.h || c >= m.c) return; 57 | assert(x < m.w && y < m.h && c < m.c); 58 | m.data[c*m.h*m.w + y*m.w + x] = val; 59 | } 60 | void add_pixel(image m, int x, int y, int c, float val) 61 | { 62 | assert(x < m.w && y < m.h && c < m.c); 63 | m.data[c*m.h*m.w + y*m.w + x] += val; 64 | } 65 | void embed_image(image source, image dest, int dx, int dy) 66 | { 67 | int x,y,k; 68 | for(k = 0; k < source.c; ++k){ 69 | for(y = 0; y < source.h; ++y){ 70 | for(x = 0; x < source.w; ++x){ 71 | float val = get_pixel(source, x,y,k); 72 | set_pixel(dest, dx+x, dy+y, k, val); 73 | } 74 | } 75 | } 76 | } 77 | 78 | void ipl_into_image(IplImage* src, image im) 79 | { 80 | unsigned char *data = (unsigned char *)src->imageData; 81 | int h = src->height; 82 | int w = src->width; 83 | int c = src->nChannels; 84 | int step = src->widthStep; 85 | int i, j, k; 86 | 87 | for(i = 0; i < h; ++i){ 88 | for(k= 0; k < c; ++k){ 89 | for(j = 0; j < w; ++j){ 90 | im.data[k*w*h + i*w + j] = data[i*step + j*c + k]/255.; 91 | } 92 | } 93 | } 94 | } 95 | 96 | image ipl_to_image(IplImage* src) 97 | { 98 | // ross 99 | if (0 == src) { 100 | printf("file %s, line %d, src == 0\n", __FILE__, __LINE__); 101 | exit(0); 102 | } 103 | int h = src->height; 104 | int w = src->width; 105 | int c = src->nChannels; 106 | image out = make_image(w, h, c); 107 | ipl_into_image(src, out); 108 | return out; 109 | } 110 | 111 | void rgbgr_image(image im) 112 | { 113 | int i; 114 | for(i = 0; i < im.w*im.h; ++i){ 115 | float swap = im.data[i]; 116 | im.data[i] = im.data[i+im.w*im.h*2]; 117 | im.data[i+im.w*im.h*2] = swap; 118 | } 119 | } 120 | 121 | image load_image_cv(char *filename, int channels) 122 | { 123 | IplImage* src = 0; 124 | int flag = -1; 125 | if (channels == 0) flag = -1; 126 | else if (channels == 1) flag = 0; 127 | else if (channels == 3) flag = 1; 128 | else { 129 | fprintf(stderr, "OpenCV can't force load with %d channels\n", channels); 130 | } 131 | 132 | if( (src = (IplImage*)cvLoadImage(filename, flag)) == NULL ) 133 | { 134 | fprintf(stderr, "Cannot load image \"%s\"\n", filename); 135 | exit(0); 136 | } 137 | image out = ipl_to_image(src); 138 | cvReleaseImage(&src); 139 | rgbgr_image(out); 140 | return out; 141 | } 142 | 143 | image resize_image(image im, int w, int h) 144 | { 145 | image resized = make_image(w, h, im.c); 146 | image part = make_image(w, im.h, im.c); 147 | int r, c, k; 148 | float w_scale = (float)(im.w - 1) / (w - 1); 149 | float h_scale = (float)(im.h - 1) / (h - 1); 150 | for(k = 0; k < im.c; ++k){ 151 | for(r = 0; r < im.h; ++r){ 152 | for(c = 0; c < w; ++c){ 153 | float val = 0; 154 | if(c == w-1 || im.w == 1){ 155 | val = get_pixel(im, im.w-1, r, k); 156 | } else { 157 | float sx = c*w_scale; 158 | int ix = (int) sx; 159 | float dx = sx - ix; 160 | val = (1 - dx) * get_pixel(im, ix, r, k) + dx * get_pixel(im, ix+1, r, k); 161 | } 162 | set_pixel(part, c, r, k, val); 163 | } 164 | } 165 | } 166 | for(k = 0; k < im.c; ++k){ 167 | for(r = 0; r < h; ++r){ 168 | float sy = r*h_scale; 169 | int iy = (int) sy; 170 | float dy = sy - 
iy; 171 | for(c = 0; c < w; ++c){ 172 | float val = (1-dy) * get_pixel(part, c, iy, k); 173 | set_pixel(resized, c, r, k, val); 174 | } 175 | if(r == h-1 || im.h == 1) continue; 176 | for(c = 0; c < w; ++c){ 177 | float val = dy * get_pixel(part, c, iy+1, k); 178 | add_pixel(resized, c, r, k, val); 179 | } 180 | } 181 | } 182 | 183 | free_image(part); 184 | return resized; 185 | } 186 | 187 | image load_image(char *filename, int w, int h, int c) 188 | { 189 | image out = load_image_cv(filename, c); 190 | 191 | if((h && w) && (h != out.h || w != out.w)){ 192 | image resized = resize_image(out, w, h); 193 | free_image(out); 194 | out = resized; 195 | } 196 | return out; 197 | } 198 | 199 | 200 | image load_image_color(char *filename, int w, int h) 201 | { 202 | return load_image(filename, w, h, 3); 203 | } 204 | 205 | image copy_image(image p) 206 | { 207 | image copy = p; 208 | copy.data = (float*)calloc(p.h*p.w*p.c, sizeof(float)); 209 | memcpy(copy.data, p.data, p.h*p.w*p.c*sizeof(float)); 210 | return copy; 211 | } 212 | 213 | void save_image_jpg(image p, const char *name) 214 | { 215 | image copy = copy_image(p); 216 | if(p.c == 3) rgbgr_image(copy); 217 | int x,y,k; 218 | 219 | char buff[256]; 220 | sprintf(buff, "%s.jpg", name); 221 | 222 | IplImage *disp = cvCreateImage(cvSize(p.w,p.h), IPL_DEPTH_8U, p.c); 223 | int step = disp->widthStep; 224 | for(y = 0; y < p.h; ++y){ 225 | for(x = 0; x < p.w; ++x){ 226 | for(k= 0; k < p.c; ++k){ 227 | disp->imageData[y*step + x*p.c + k] = (unsigned char)(get_pixel(copy,x,y,k)*255); 228 | } 229 | } 230 | } 231 | cvSaveImage(buff, disp,0); 232 | cvReleaseImage(&disp); 233 | free_image(copy); 234 | } 235 | 236 | void save_image(image im, const char *name) 237 | { 238 | save_image_jpg(im, name); 239 | } 240 | 241 | 242 | image letterbox_image(image im, int w, int h) 243 | { 244 | int new_w = im.w; 245 | int new_h = im.h; 246 | if (((float)w/im.w) < ((float)h/im.h)) { 247 | new_w = w; 248 | new_h = (im.h * w)/im.w; 249 | } else { 250 | new_h = h; 251 | new_w = (im.w * h)/im.h; 252 | } 253 | image resized = resize_image(im, new_w, new_h); 254 | image boxed = make_image(w, h, im.c); 255 | fill_image(boxed, .5); 256 | embed_image(resized, boxed, (w-new_w)/2, (h-new_h)/2); 257 | free_image(resized); 258 | return boxed; 259 | } 260 | 261 | #endif 262 | -------------------------------------------------------------------------------- /src/regionLayer.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | /** \brief kernel for softmax 5 | * - n is the number of classes (included the background) 6 | * 7 | * - The CPU implementation is 8 | * for b in batch: 9 | * for g in groups: 10 | * softmax(input + b*batchOffset + g*groupOffset, n, temp, stride, output + b*batchOffset + g*groupOffset) 11 | * 12 | * - The GPU implementation put the two for-loop into parallel. 13 | * 14 | * - nthdsPerCTA: the max number of threads per block. 15 | * - Each thread will in charge of one point softmax for all classes. 16 | * - Total number of threads: batch * groups 17 | * 18 | * - TODO: using warp shuffle instead of loop in one thread. 
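 * - Concretely, each output element is the numerically stable softmax with temperature:
 *     output[i] = exp((input[i] - largest)/temp) / sum_j exp((input[j] - largest)/temp)
 *   e.g. with n = 3, temp = 1 and inputs {1, 2, 3}: largest = 3, e = {0.135, 0.368, 1.0},
 *   sum = 1.503 and output = {0.090, 0.245, 0.665} (illustrative numbers only).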
19 | */ 20 | template <int nthdsPerCTA> 21 | __launch_bounds__(nthdsPerCTA) 22 | __global__ void softmaxKernel(const float * input, 23 | const int n, 24 | const int batch, 25 | const int batchOffset, 26 | const int groups, 27 | const int groupOffset, 28 | const int stride, 29 | const float temp, 30 | float * output) 31 | { 32 | int id = blockIdx.x * nthdsPerCTA + threadIdx.x; 33 | 34 | // per batch, per group 35 | if (id < batch * groups) 36 | { 37 | int b = id / groups; 38 | int g = id % groups; 39 | float sum = 0.; 40 | float largest = -FLT_MAX; 41 | int offset = b*batchOffset + g*groupOffset; 42 | for (int i = 0; i < n; ++i) 43 | { 44 | float val = input[i*stride + offset]; 45 | largest = (val > largest) ? val : largest; 46 | } 47 | for (int i = 0; i < n; ++i) 48 | { 49 | float e = exp(input[i*stride + offset]/temp - largest/temp); // subtracting the max bounds each exponent in (-inf,0], so every denominator factor is in (0,1]. 50 | sum += e; 51 | output[i*stride + offset] = e; 52 | } 53 | for (int i = 0; i < n; ++i) 54 | output[i*stride + offset] /= sum; 55 | } 56 | } 57 | 58 | 59 | /** 60 | * \brief Sigmoid function 61 | * 62 | * "__launch_bounds__" declares the max threads per block so the compiler can optimize the kernel for any block size up to nthdsPerCTA 63 | */ 64 | template <int nthdsPerCTA> 65 | __launch_bounds__(nthdsPerCTA) 66 | __global__ void activateKernel(float * data, 67 | const int range) 68 | { 69 | int i = blockIdx.x * nthdsPerCTA + threadIdx.x; 70 | if (i < range) 71 | data[i] = 1. / (1. + exp(-data[i])); 72 | } 73 | 74 | /** 75 | * \brief region layer of YOLOv3 76 | * Includes activation and softmax. 77 | * - num: # bounding boxes per location 78 | * 79 | * If integrated into TensorRT, input and output are different buffers. 80 | * If used as standalone GPU code (in main.cpp), input and output can be the same buffer. 81 | * 82 | * Note: The elements in YOLOv3 83 | * * 4*nCells coords, 84 | * * nCells conf, 85 | * * classes*nCells classes 86 | * e.g. 87 | * * nCells for class 0 (background) 88 | * * nCells for class 1 89 | * * ... 90 | */ 91 | void regionLayer_gpu( 92 | const int batch, 93 | const int C, 94 | const int nCells, 95 | const int num, 96 | const int coords, 97 | const int classes, 98 | const float * input, 99 | float * output, 100 | cudaStream_t stream) 101 | { 102 | const int blockSize = 256; 103 | const int gridSize_Act1 = (2*nCells + blockSize - 1) / blockSize; // x, y 104 | const int gridSize_Act2 = (nCells + blockSize - 1) / blockSize; // conf 105 | const int gridSize_Softmax = (nCells + blockSize - 1) / blockSize; // classes 106 | // for YOLOv3, the output of the final layer is C*nCells, in which C includes all the conf, coord, and class elements.
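// Worked example of the layout (assuming a VOC-trained model: classes = 20, coords = 4,
// num = 3 anchor boxes per cell): C = num*(coords+classes+1) = 75, so each image occupies
// C*nCells floats. A bbox block is laid out as [x | y | w | h | conf | 20 class planes],
// each plane nCells long, which is why the offsets below are +0 for (x,y), +4*nCells for
// conf and +5*nCells for the first class. (A host-side reference sketch of this routine
// is appended at the end of this listing.)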
107 | 108 | #ifdef REGION_IN_TRT 109 | // in TRT, input and output are different buffers 110 | ck(cudaMemcpy((void*)output, (void*)input, batch*C*nCells*sizeof(float), cudaMemcpyDeviceToDevice)); 111 | #endif 112 | // else input and output can be the same buffer 113 | 114 | for (int b = 0; b < batch; ++b) { 115 | for (int n = 0; n < num; ++n) { 116 | // activate on (x,y) 117 | int index = b*C*nCells // per batch 118 | + n*nCells*(coords+classes+1); // coords, classes and confidence 119 | activateKernel<blockSize> 120 | <<<gridSize_Act1, blockSize, 0, stream>>> 121 | (output + index, 2*nCells); 122 | 123 | // activate on conf probs 124 | index = b*C*nCells 125 | + n*nCells*(coords+classes+1) 126 | + 4*nCells; // skip coords 127 | activateKernel<blockSize> 128 | <<<gridSize_Act2, blockSize, 0, stream>>> 129 | (output + index, nCells); 130 | 131 | // softmax for all classes 132 | index = b*C*nCells 133 | + n*nCells*(coords+classes+1) 134 | + 5*nCells; // skip conf 135 | softmaxKernel<blockSize> 136 | <<<gridSize_Softmax, blockSize, 0, stream>>> 137 | (input + index, // input: skip loc, conf 138 | classes, // n: #classes 139 | batch*num, // batch: batch * #bboxes per location 140 | (C*nCells/num), // batchOffset: size of one bbox block, (coords+classes+1)*nCells 141 | nCells, // groups 142 | 1, // groupOffset 143 | nCells, // stride 144 | 1.f, // temp 145 | output + index); // output 146 | } 147 | } 148 | } 149 | 150 | #define nOutputLayer 3 151 | template <int nthdsPerCTA> 152 | __launch_bounds__(nthdsPerCTA) 153 | __global__ void reorgOutputKernel( 154 | const int nBatch, 155 | const int nClasses, 156 | const int nBboxesPerLoc, 157 | const int coords, 158 | const int l0_w, 159 | const int l0_h, 160 | const int nCells, 161 | float* dpData_unordered[], 162 | float* dpData) 163 | { 164 | long i = blockIdx.x * nthdsPerCTA + threadIdx.x; 165 | const int bboxMemLen = (nClasses + coords + 1) * nCells; 166 | const int batchMemLen = nBboxesPerLoc * bboxMemLen; 167 | const long range = nBatch * batchMemLen; 168 | if (i < range) // voc<266175 coco<904995 wrt. 416*416 input 169 | { 170 | int b = i / batchMemLen; 171 | int bboxIdx = (i % batchMemLen) / bboxMemLen; 172 | int channelIdx = ((i % batchMemLen) % bboxMemLen) / nCells; 173 | int locIdx = (i % batchMemLen) % nCells; 174 | int locLayer, cnt_offset = 1+2*2+4*4; 175 | for(int j = nOutputLayer-1; j >= 0; --j){ 176 | cnt_offset -= (1<<j*2); 177 | if(locIdx >= cnt_offset*l0_w*l0_h){ 178 | locLayer = j; 179 | break; 180 | } 181 | } 182 | dpData[i] = dpData_unordered[locLayer]\ 183 | [b*nBboxesPerLoc*(nClasses+coords+1)*(1< 206 | <<>> 207 | (nBatch, nClasses, nBboxesPerLoc, coords, l0_w, l0_h, nCells, dpData_unordered, dpData); 208 | 209 | } 210 | -------------------------------------------------------------------------------- /src/regionLayer.cu~: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | /** \brief kernel for softmax 5 | * - n is the number of classes (included the background) 6 | * 7 | * - The CPU implementation is 8 | * for b in batch: 9 | * for g in groups: 10 | * softmax(input + b*batchOffset + g*groupOffset, n, temp, stride, output + b*batchOffset + g*groupOffset) 11 | * 12 | * - The GPU implementation put the two for-loop into parallel. 13 | * 14 | * - nthdsPerCTA: the max number of threads per block. 15 | * - Each thread will in charge of one point softmax for all classes. 16 | * - Total number of threads: batch * groups 17 | * 18 | * - TODO: using warp shuffle instead of loop in one thread.
19 | */ 20 | template 21 | __launch_bounds__(nthdsPerCTA) 22 | __global__ void softmaxKernel(const float * input, 23 | const int n, 24 | const int batch, 25 | const int batchOffset, 26 | const int groups, 27 | const int groupOffset, 28 | const int stride, 29 | const float temp, 30 | float * output) 31 | { 32 | int id = blockIdx.x * nthdsPerCTA + threadIdx.x; 33 | 34 | // per batch, per group 35 | if (id < batch * groups) 36 | { 37 | int b = id / groups; 38 | int g = id % groups; 39 | float sum = 0.; 40 | float largest = -FLT_MAX; 41 | int offset = b*batchOffset + g*groupOffset; 42 | for (int i = 0; i < n; ++i) 43 | { 44 | float val = input[i*stride + offset]; 45 | largest = (val > largest) ? val : largest; 46 | } 47 | for (int i = 0; i < n; ++i) 48 | { 49 | float e = exp(input[i*stride + offset]/temp - largest/temp); // bound score in (-inf,0], and denominator fractor in (0,1]. 50 | sum += e; 51 | output[i*stride + offset] = e; 52 | } 53 | for (int i = 0; i < n; ++i) 54 | output[i*stride + offset] /= sum; 55 | } 56 | } 57 | 58 | 59 | /** 60 | * \brief Sigmoid function 61 | * 62 | * "__launch_bounds__" ensures the universality of kernel 63 | */ 64 | template 65 | __launch_bounds__(nthdsPerCTA) 66 | __global__ void activateKernel(float * data, 67 | const int range) 68 | { 69 | int i = blockIdx.x * nthdsPerCTA + threadIdx.x; 70 | if (i < range) 71 | data[i] = 1. / (1. + exp(-data[i])); 72 | } 73 | 74 | /** 75 | * \brief region layer of YOLOv3 76 | * Includes activation and softmax. 77 | * - num: # bounding box per location 78 | * 79 | * If we integrated into tensorRT, we can use input and output are different memory. 80 | * If it is standalone GPU code (in main.cpp), we can use input and output the same buffer. 81 | * 82 | * Note: The elements in YOLOv2 83 | * * 4*nCells coords, 84 | * * nCells conf, 85 | * * classes*nCells classes 86 | * e.g. 87 | * * nCells for 0 class (background) 88 | * * nCells for 1 class 89 | * * ... 90 | */ 91 | void regionLayer_gpu( 92 | const int batch, 93 | const int C, 94 | const int nCells, 95 | const int num, 96 | const int coords, 97 | const int classes, 98 | const float * input, 99 | float * output, 100 | cudaStream_t stream) 101 | { 102 | const int blockSize = 256; 103 | const int gridSize_Act1 = (2*nCells + blockSize - 1) / blockSize; // x, y 104 | const int gridSize_Act2 = (nCells + blockSize - 1) / blockSize; // conf 105 | const int gridSize_Softmax = (nCells + blockSize - 1) / blockSize; // classes 106 | // for YOLOv3, the output of final layer is C*nCells, in which, C includes all the conf, coord, and claesses. 
107 | 108 | #ifdef REGION_IN_TRT 109 | // TRT, input and output are diff buffer 110 | ck(cudaMemcpy((void*)output, (void*)input, batch*C*nCells*sizeof(float), cudaMemcpyDeviceToDevice)); 111 | #endif 112 | // else input and output can be same buffer 113 | 114 | for (int b = 0; b < batch; ++b) { 115 | for (int n = 0; n < num; ++n) { 116 | // activate on (x,y) 117 | int index = b*C*nCells // per batch 118 | + n*nCells*(coords+classes+1); // coords, classes and confidence 119 | activateKernel 120 | <<>> 121 | (output + index, 2*nCells); 122 | 123 | // activate on probes on conf 124 | index = b*C*nCells 125 | + n*nCells*(coords+classes+1) 126 | + 4*nCells; // skip coords 127 | activateKernel 128 | <<>> 129 | (output + index, nCells); 130 | 131 | // softmax for all classes 132 | index = b*C*nCells 133 | + n*nCells*(coords+classes+1) 134 | + 5*nCells; // skip conf 135 | softmaxKernel 136 | <<>> 137 | (input + index, // input: skip loc, conf 138 | classes, // n: #classes 139 | batch*num, // batch: batch * #bound_box 140 | (C*nCells/num), // batchOffset: number of bounding_box in total 141 | nCells, // groups 142 | 1, // groupOffset 143 | nCells, // stride 144 | 1.f, // temp 145 | output + index); // output 146 | } 147 | } 148 | } 149 | 150 | #define nOutputLayer 3 151 | template 152 | __launch_bounds__(nthdsPerCTA) 153 | __global__ void reorgOutputKernel( 154 | const int nBatch, 155 | const int nClasses, 156 | const int nBboxesPerLoc, 157 | const int coords, 158 | const int l0_w, 159 | const int l0_h, 160 | const int nCells, 161 | float* dpData_unordered[], 162 | float* dpData) 163 | { 164 | long i = blockIdx.x * nthdsPerCTA + threadIdx.x; 165 | const int bboxMemLen = (nClasses + coords + 1) * nCells; 166 | const int batchMemLen = nBboxesPerLoc * bboxMemLen; 167 | const long range = nBatch * batchMemLen; 168 | if (i < range) // voc<266175 coco<904995 wrt. 
416*416 input 169 | { 170 | int b = i / batchMemLen; 171 | int bboxIdx = (i % batchMemLen) / bboxMemLen; 172 | int channelIdx = ((i % batchMemLen) % bboxMemLen) / nCells; 173 | int locIdx = (i % batchMemLen) % nCells; 174 | int locLayer, cnt_offset = 1+2*2+4*4; 175 | for(int j = nOutputLayer-1; j >= 0; --j){ 176 | cnt_offset -= (1<= cnt_offset*l0_w*l0_h){ 178 | locLayer = j; 179 | break; 180 | } 181 | } 182 | dpData[i] = dpData_unordered[locLayer]\ 183 | [b*nBboxesPerLoc*(nClasses+coords+1)*(1< 206 | <<>> 207 | (nBatch, nClasses, nBboxesPerLoc, coords, l0_w, l0_h, nCells, dpData_unordered, dpData); 208 | 209 | } 210 | -------------------------------------------------------------------------------- /src/regionLayer.h: -------------------------------------------------------------------------------- 1 | #ifndef REGION_LAYER_H_ 2 | #define REGION_LAYER_H_ 3 | 4 | #include "nvUtils.h" 5 | 6 | class regionParams{ 7 | public: 8 | int classes; // number of class 9 | int n; // number of bbox per location 10 | int coords; // number of coords (4) 11 | int w; // w (darknet) 12 | int h; // h (darknet), in total, we have w*h*n bbox 13 | int outputs; // outputs (darknet), output dimension of previous layer, 14 | 15 | bool softmax; // 1 for softmax process 16 | int background; // background index 17 | }; 18 | 19 | typedef struct{ 20 | float x, y, w, h; 21 | } box; 22 | 23 | void regionLayer_gpu(const int batch, 24 | const int C, 25 | const int nCells, 26 | const int num, 27 | const int coords, 28 | const int classes, 29 | const float * input, 30 | float * output, 31 | cudaStream_t stream); 32 | 33 | void reorgOutput_gpu(const int nBatch, 34 | const int nClasses, 35 | const int nBboxesPerLoc, 36 | const int coords, 37 | const int l0_w, 38 | const int l0_h, 39 | const int nCells, 40 | float* dpData_unordered[], 41 | float* dpData, 42 | const long nData, 43 | cudaStream_t stream); 44 | #endif 45 | -------------------------------------------------------------------------------- /src/regionLayer.h~: -------------------------------------------------------------------------------- 1 | #ifndef REGION_LAYER_H_ 2 | #define REGION_LAYER_H_ 3 | 4 | #include "nvUtils.h" 5 | 6 | class regionParams{ 7 | public: 8 | int classes; // number of class 9 | int n; // number of bbox per location 10 | int coords; // number of coords (4) 11 | int w; // w (darknet) 12 | int h; // h (darknet), in total, we have w*h*n bbox 13 | int outputs; // outputs (darknet), output dimension of previous layer, 14 | 15 | bool softmax; // 1 for softmax process 16 | int background; // background index 17 | }; 18 | 19 | typedef struct{ 20 | float x, y, w, h; 21 | } box; 22 | 23 | void regionLayer_gpu(const int batch, 24 | const int C, 25 | const int nCells, 26 | const int num, 27 | const int coords, 28 | const int classes, 29 | const float * input, 30 | float * output, 31 | cudaStream_t stream); 32 | 33 | void reorgOutput_gpu(const int nBatch, 34 | const int nClasses, 35 | const int nBboxesPerLoc, 36 | const int coords, 37 | const int l0_w, 38 | const int l0_h, 39 | const int nCells, 40 | float* __constant__ dpData_unordered[], 41 | float* dpData, 42 | const long nData, 43 | cudaStream_t stream); 44 | #endif 45 | -------------------------------------------------------------------------------- /src/tensorRTClassifier.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 1993-2016 NVIDIA Corporation. All rights reserved. 
3 | * 4 | * NOTICE TO USER: 5 | * 6 | * This source code is subject to NVIDIA ownership rights under U.S. and 7 | * international Copyright laws. 8 | * 9 | * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE 10 | * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR 11 | * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH 12 | * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF 13 | * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 14 | * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, 15 | * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS 16 | * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE 17 | * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE 18 | * OR PERFORMANCE OF THIS SOURCE CODE. 19 | * 20 | * U.S. Government End Users. This source code is a "commercial item" as 21 | * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of 22 | * "commercial computer software" and "commercial computer software 23 | * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) 24 | * and is provided to the U.S. Government only as a commercial end item. 25 | * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through 26 | * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the 27 | * source code with only those rights set forth herein. 28 | */ 29 | 30 | #ifndef TENSORRT_CLASSIFIER_H 31 | #define TENSORRT_CLASSIFIER_H 32 | 33 | #include 34 | #include 35 | #include 36 | #include 37 | 38 | #include "NvInfer.h" 39 | #include "NvCaffeParser.h" 40 | #include "NvInferPlugin.h" 41 | #include "logger.h" 42 | #include "classifier.h" 43 | 44 | using namespace nvinfer1; 45 | using namespace nvcaffeparser1; 46 | using namespace plugin; 47 | 48 | static const int MAX_BUFFERS_ = 10; 49 | 50 | 51 | // Logger for GIE info/warning/errors 52 | class Logger : public ILogger 53 | { 54 | void log(Severity severity, const char* msg) override 55 | { 56 | // suppress info-level messages 57 | if (severity != Severity::kINFO) 58 | std::cout << msg << std::endl; 59 | } 60 | }; 61 | 62 | class Int8Calibrator : public IInt8EntropyCalibrator 63 | { 64 | public: 65 | Int8Calibrator(std::string calibrationTableFile) 66 | : calibrationTableFile_(calibrationTableFile) {} 67 | 68 | ~Int8Calibrator() {} 69 | 70 | int getBatchSize() const override { 71 | return 0; 72 | } 73 | 74 | bool getBatch(void* bindings[], const char* names[], int nbBindings) override 75 | { 76 | return false; 77 | } 78 | 79 | const void* readCalibrationCache(size_t& length) override 80 | { 81 | vCalibrationCache_.clear(); 82 | std::ifstream input(calibrationTableFile_.c_str(), std::ios::binary); 83 | input >> std::noskipws; 84 | if (input.good()) 85 | std::copy(std::istream_iterator(input), std::istream_iterator(), std::back_inserter(vCalibrationCache_)); 86 | 87 | length = vCalibrationCache_.size(); 88 | return length ? &vCalibrationCache_[0] : nullptr; 89 | } 90 | 91 | void writeCalibrationCache(const void* cache, size_t length) override 92 | { 93 | std::cout << "writeCalibrationCache is called!" 
<< std::endl; 94 | } 95 | 96 | private: 97 | std::string calibrationTableFile_; 98 | std::vector vCalibrationCache_; 99 | }; 100 | 101 | class TensorRTClassifier : public IClassifier { 102 | public: 103 | TensorRTClassifier(const char *deployFile, // caffe prototxt file 104 | const char *modelFile, // trained caffe model 105 | const char *meanFile, // mean file 106 | const std::string& inputs, 107 | const std::vector& outputs, 108 | const int maxBatchSize, 109 | const int devID, 110 | nvcaffeparser1::IPluginFactory* pPluginFactory = nullptr, 111 | std::string table = std::string()); 112 | 113 | ~TensorRTClassifier(); 114 | 115 | void caffeToTensorRTModel(const char *deployFile, 116 | const char *modelFile, 117 | ICaffeParser *parser); 118 | void initInfer(); 119 | 120 | // override 121 | void setInputData(float *pBGR, 122 | const int nWidth, 123 | const int nHeight, 124 | const int nBatchSize) override; 125 | 126 | void forward(INFER_OUTPUT_PARAMS *) override; 127 | 128 | int getInferWidth() const override; 129 | 130 | int getInferHeight() const override; 131 | 132 | std::vector getMeanValues() const override; 133 | 134 | private: 135 | int devID_; 136 | int maxBatchSize_; 137 | 138 | // tensorRT params 139 | ICudaEngine *pEngine_ = nullptr; 140 | ICaffeParser *pCaffeParser_ = nullptr; 141 | IBinaryProtoBlob *pMeanBlob_ = nullptr; 142 | IExecutionContext *pContext_ = nullptr; 143 | 144 | std::string inputBlobName_; 145 | std::vector vOutputBlobNames_; 146 | nvcaffeparser1::IPluginFactory* pPluginFactory_{ nullptr }; // factory for plugin layers 147 | Int8Calibrator *pCalibrator_{ nullptr }; 148 | std::string calibrationTable_; 149 | 150 | int nInputs_; 151 | int inputIndex_; 152 | DimsCHW inputDim_; 153 | size_t inputSize_; 154 | 155 | int nOutputs_; 156 | std::vector vOutputIndexs_; 157 | std::vector vOutputDims_; 158 | std::vector vOutputSizes_; 159 | 160 | void *apBuffers_[MAX_BUFFERS_]; // input and output buffer 161 | std::vector vMeanValues_{0.f, 0.f, 0.f}; 162 | 163 | // tensorRT logger 164 | Logger logger_; 165 | }; 166 | 167 | 168 | #endif // TENSORRT_CLASSIFIER_H 169 | -------------------------------------------------------------------------------- /tensorRTClassifier.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 1993-2016 NVIDIA Corporation. All rights reserved. 3 | * 4 | * NOTICE TO USER: 5 | * 6 | * This source code is subject to NVIDIA ownership rights under U.S. and 7 | * international Copyright laws. 8 | * 9 | * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE 10 | * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR 11 | * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH 12 | * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF 13 | * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 14 | * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, 15 | * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS 16 | * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE 17 | * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE 18 | * OR PERFORMANCE OF THIS SOURCE CODE. 19 | * 20 | * U.S. Government End Users. This source code is a "commercial item" as 21 | * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of 22 | * "commercial computer software" and "commercial computer software 23 | * documentation" as such terms are used in 48 C.F.R. 
12.212 (SEPT 1995) 24 | * and is provided to the U.S. Government only as a commercial end item. 25 | * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through 26 | * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the 27 | * source code with only those rights set forth herein. 28 | */ 29 | 30 | #ifndef TENSORRT_CLASSIFIER_H 31 | #define TENSORRT_CLASSIFIER_H 32 | 33 | #include 34 | #include 35 | #include 36 | #include 37 | 38 | #include "NvInfer.h" 39 | #include "NvCaffeParser.h" 40 | #include "NvInferPlugin.h" 41 | #include "logger.h" 42 | #include "classifier.h" 43 | 44 | using namespace nvinfer1; 45 | using namespace nvcaffeparser1; 46 | using namespace plugin; 47 | 48 | static const int MAX_BUFFERS_ = 10; 49 | 50 | 51 | // Logger for GIE info/warning/errors 52 | class Logger : public ILogger 53 | { 54 | void log(Severity severity, const char* msg) override 55 | { 56 | // suppress info-level messages 57 | if (severity != Severity::kINFO) 58 | std::cout << msg << std::endl; 59 | } 60 | }; 61 | 62 | class Int8Calibrator : public IInt8EntropyCalibrator 63 | { 64 | public: 65 | Int8Calibrator(std::string calibrationTableFile) 66 | : calibrationTableFile_(calibrationTableFile) {} 67 | 68 | ~Int8Calibrator() {} 69 | 70 | int getBatchSize() const override { 71 | return 0; 72 | } 73 | 74 | bool getBatch(void* bindings[], const char* names[], int nbBindings) override 75 | { 76 | return false; 77 | } 78 | 79 | const void* readCalibrationCache(size_t& length) override 80 | { 81 | vCalibrationCache_.clear(); 82 | std::ifstream input(calibrationTableFile_.c_str(), std::ios::binary); 83 | input >> std::noskipws; 84 | if (input.good()) 85 | std::copy(std::istream_iterator(input), std::istream_iterator(), std::back_inserter(vCalibrationCache_)); 86 | 87 | length = vCalibrationCache_.size(); 88 | return length ? &vCalibrationCache_[0] : nullptr; 89 | } 90 | 91 | void writeCalibrationCache(const void* cache, size_t length) override 92 | { 93 | std::cout << "writeCalibrationCache is called!" 
<< std::endl; 94 | } 95 | 96 | private: 97 | std::string calibrationTableFile_; 98 | std::vector vCalibrationCache_; 99 | }; 100 | 101 | class TensorRTClassifier : public IClassifier { 102 | public: 103 | TensorRTClassifier(const char *deployFile, // caffe prototxt file 104 | const char *modelFile, // trained caffe model 105 | const char *meanFile, // mean file 106 | const std::string& inputs, 107 | const std::vector& outputs, 108 | const int maxBatchSize, 109 | const int devID, 110 | nvcaffeparser1::IPluginFactory* pPluginFactory = nullptr, 111 | std::string table = std::string()); 112 | 113 | ~TensorRTClassifier(); 114 | 115 | void caffeToTensorRTModel(const char *deployFile, 116 | const char *modelFile, 117 | ICaffeParser *parser); 118 | void initInfer(); 119 | 120 | // override 121 | void setInputData(float *pBGR, 122 | const int nWidth, 123 | const int nHeight, 124 | const int nBatchSize) override; 125 | 126 | void forward(INFER_OUTPUT_PARAMS *) override; 127 | 128 | int getInferWidth() const override; 129 | 130 | int getInferHeight() const override; 131 | 132 | std::vector getMeanValues() const override; 133 | 134 | private: 135 | int devID_; 136 | int maxBatchSize_; 137 | 138 | // tensorRT params 139 | ICudaEngine *pEngine_ = nullptr; 140 | ICaffeParser *pCaffeParser_ = nullptr; 141 | IBinaryProtoBlob *pMeanBlob_ = nullptr; 142 | IExecutionContext *pContext_ = nullptr; 143 | 144 | std::string inputBlobName_; 145 | std::vector vOutputBlobNames_; 146 | nvcaffeparser1::IPluginFactory* pPluginFactory_{ nullptr }; // factory for plugin layers 147 | Int8Calibrator *pCalibrator_{ nullptr }; 148 | std::string calibrationTable_; 149 | 150 | int nInputs_; 151 | int inputIndex_; 152 | DimsCHW inputDim_; 153 | size_t inputSize_; 154 | 155 | int nOutputs_; 156 | std::vector vOutputIndexs_; 157 | std::vector vOutputDims_; 158 | std::vector vOutputSizes_; 159 | 160 | void *apBuffers_[MAX_BUFFERS_]; // input and output buffer 161 | std::vector vMeanValues_{0.f, 0.f, 0.f}; 162 | 163 | // tensorRT logger 164 | Logger logger_; 165 | }; 166 | 167 | 168 | #endif // TENSORRT_CLASSIFIER_H 169 | -------------------------------------------------------------------------------- /test.py~: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | c = float(sys.argv[1]) 4 | mAP = [1., c] 5 | ret = (float)(sum(mAP) / len(mAP)) 6 | print 'mAP = %.5f' % ret 7 | exit((sum(mAP) / len(mAP))) 8 | -------------------------------------------------------------------------------- /test.sh~: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | result=test.txt 4 | a=0.45 5 | b=0.003 6 | mAP=`python test.py 3.0 2>&1 1>/dev/null` 7 | echo $( printf '%f %f %f' ${a} ${b} ${mAP}) >> ${result} 8 | cat ${result} 9 | --------------------------------------------------------------------------------
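For reference, below is a minimal host-side sketch of the region-layer math from src/regionLayer.cu (sigmoid on x, y and conf, numerically stable softmax over the class planes). It is illustrative code written against the layout described above, not a file from this repository; the function names are made up and it omits the reorg step.

#include <algorithm>
#include <cmath>
#include <vector>

// CPU mirror of regionLayer_gpu(): data holds `batch` images of C*nCells floats each,
// with C = num*(coords + classes + 1) and per-bbox layout [x|y|w|h|conf|class planes].
static void softmaxStride(float* p, int n, int stride, float temp)
{
    float largest = p[0];
    for (int i = 1; i < n; ++i) largest = std::max(largest, p[i * stride]);
    float sum = 0.f;
    for (int i = 0; i < n; ++i) {
        p[i * stride] = std::exp((p[i * stride] - largest) / temp);
        sum += p[i * stride];
    }
    for (int i = 0; i < n; ++i) p[i * stride] /= sum;
}

static float sigmoidf(float x) { return 1.f / (1.f + std::exp(-x)); }

void regionLayerReference(std::vector<float>& data, int batch, int nCells,
                          int num, int coords, int classes)
{
    const int bboxLen = (coords + classes + 1) * nCells;  // one bbox block
    const int C = num * (coords + classes + 1);
    for (int b = 0; b < batch; ++b)
        for (int n = 0; n < num; ++n) {
            float* box = data.data() + (long)b * C * nCells + (long)n * bboxLen;
            for (int i = 0; i < 2 * nCells; ++i)                // x, y
                box[i] = sigmoidf(box[i]);
            for (int i = 0; i < nCells; ++i)                    // conf
                box[4 * nCells + i] = sigmoidf(box[4 * nCells + i]);
            for (int cell = 0; cell < nCells; ++cell)           // softmax over classes, stride nCells
                softmaxStride(box + 5 * nCells + cell, classes, nCells, 1.f);
        }
}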