├── README.md
├── cython_util
│   ├── bbox_overlap.pyx
│   ├── bbox_transform_inv.pyx
│   ├── bool_anchors_inside_image.pyx
│   ├── nms.pyx
│   ├── remove_extraboxes.pyx
│   ├── setup.py
│   └── setup.sh
├── pretrain
│   ├── base_vgg16.py
│   ├── vgg16.py
│   └── vgg16_vehicle.py
├── rcnn
│   ├── base_rpn.py
│   ├── proposal_layer.py
│   ├── rcnn.py
│   └── rcnn_vehicle.py
├── rpn
│   ├── __init__.py
│   ├── rpn.py
│   └── rpn_vehicle.py
└── util
    ├── __init__.py
    ├── bbox_transform.py
    ├── data_util.py
    ├── generate_anchors.py
    ├── input_kitti.py
    ├── model.py
    ├── network_util.py
    ├── parse_xml.py
    └── train.py

/README.md:
--------------------------------------------------------------------------------
 1 | # Fast_RCNN_tensorflow
 2 | Implementation of Faster R-CNN in TensorFlow (in development)
 3 | 
 4 | ・Complete
 5 | Load images from the KITTI object detection dataset
 6 | Preprocessing for network input
 7 | RPN (Region Proposal Network)
 8 | Proposal layer (converts RPN outputs to ROIs)
 9 | 
10 | ・ToDo
11 | Trainer for RCNN
12 | 
13 | ```
14 | # Prepare the KITTI dataset
15 | http://www.cvlibs.net/datasets/kitti/eval_object.php
16 | 
17 | # Compile the Cython files
18 | cd cython_util
19 | ./setup.sh
20 | 
21 | # Train the RPN
22 | cd rpn
23 | python rpn.py
24 | ```
25 | 
26 | # ROI Pooling
27 | The ROI pooling layer comes from this repository:
28 | https://github.com/deepsense-io/roi-pooling
--------------------------------------------------------------------------------
/cython_util/bbox_overlap.pyx:
--------------------------------------------------------------------------------
  1 | cimport cython
  2 | import numpy as np
  3 | cimport numpy as np
  4 | 
  5 | from libc.math cimport log
  6 | 
  7 | DTYPE = np.float
  8 | ctypedef np.float_t DTYPE_t
  9 | 
 10 | DTYPE_int = np.int
 11 | ctypedef np.int_t DTYPE_int_t
 12 | 
 13 | def bbox_overlaps(
 14 |         np.ndarray[DTYPE_t, ndim=4] anchors,
 15 |         np.ndarray[DTYPE_int_t, ndim=3] is_inside,
 16 |         object gt_boxes):
 17 |     """
 18 |     Parameters
 19 |     ----------
 20 |     anchors: (Batch_Size, K, A, 4) ndarray of float
 21 |     is_inside: (Batch_Size, K, A) ndarray of int
 22 |     gt_boxes: (Batch, G, 4) ndarray of float
 23 |     Returns
 24 |     -------
 25 |     """
 26 |     cdef unsigned int Batch_Size = anchors.shape[0]
 27 |     cdef unsigned int K = anchors.shape[1]
 28 |     cdef unsigned int A = anchors.shape[2]
 29 |     cdef unsigned int G
 30 |     cdef np.ndarray[DTYPE_t, ndim=4] overlaps
 31 |     cdef np.ndarray[DTYPE_int_t, ndim=3] true_index = np.zeros((Batch_Size, K, A), dtype=DTYPE_int)
 32 |     cdef np.ndarray[DTYPE_int_t, ndim=3] false_index = np.zeros((Batch_Size, K, A), dtype=DTYPE_int)
 33 |     cdef DTYPE_t iw, ih, box_area
 34 |     cdef DTYPE_t ua
 35 |     cdef DTYPE_t max_overlap
 36 |     cdef DTYPE_t ex_width, ex_height, ex_center_x, ex_center_y, gt_width, gt_height, gt_center_x, gt_center_y
 37 |     cdef unsigned int k, a, b, g, max_k, max_a, max_g
 38 | 
 39 |     max_g = 0
 40 |     for b in range(Batch_Size):
 41 |         if max_g < gt_boxes[b].shape[0]:
 42 |             max_g = gt_boxes[b].shape[0]
 43 | 
 44 |     overlaps = np.zeros((Batch_Size, K, A, max_g))
 45 | 
 46 |     for b in range(Batch_Size):
 47 |         G = gt_boxes[b].shape[0]
 48 |         for g in range(G):
 49 |             box_area = (
 50 |                 (gt_boxes[b][g, 2] - gt_boxes[b][g, 0] + 1) *
 51 |                 (gt_boxes[b][g, 3] - gt_boxes[b][g, 1] + 1)
 52 |             )
 53 |             max_overlap = 0
 54 |             max_k = 0
 55 |             max_a = 0
 56 |             for k in range(K):
 57 |                 for a in range(A):
 58 |                     if is_inside[b, k, a] == 1:
 59 |                         iw = (
 60 |                             min(anchors[b, k, a, 2], gt_boxes[b][g, 2]) -
 61 |                             max(anchors[b, k, a, 0], gt_boxes[b][g, 0]) + 1
 62 |                         )
 63 |                         if iw > 0:
 64 |                             ih = (
 65 |                                 min(anchors[b, k, a, 3], gt_boxes[b][g, 3]) -
 66 |                                 max(anchors[b, k, a, 1], gt_boxes[b][g, 1]) + 1
 67 |                             )
 68 |                             if ih > 0:
 69 |                                 ua = float(
 70 |                                     (anchors[b, k, a, 2] - anchors[b, k, a, 0] + 1) *
 71 |                                     (anchors[b, k, a, 3] - anchors[b, k, a, 1] + 1) +
 72 |                                     box_area - iw * ih
 73 |                                 )
 74 |                                 overlaps[b, k, a, g] = iw * ih / ua
 75 |                                 if max_overlap < ((iw * ih / ua)):
 76 |                                     max_overlap = iw * ih / ua
 77 |                                     max_k = k
 78 |                                     max_a = a
 79 |             true_index[b, max_k, max_a] = 1
 80 | 
 81 | 
 82 |         for k in range(K):
 83 |             for a in range(A):
 84 |                 if is_inside[b, k, a] == 1:
 85 |                     max_overlap = 0
 86 |                     max_g = 0
 87 |                     for g in range(G):
 88 |                         if overlaps[b, k, a, g] > 0:
 89 |                             if max_overlap < (overlaps[b, k, a, g]):
 90 |                                 max_overlap = overlaps[b, k, a, g]
 91 |                                 max_g = g
 92 |                     if max_overlap > 0.7:
 93 |                         true_index[b, k, a] = 1
 94 |                     else:
 95 |                         if max_overlap <= 0.3:
 96 |                             false_index[b, k, a] = 1
 97 | 
 98 |                     if true_index[b, k, a] == 1:
 99 |                         ex_width = anchors[b, k, a, 2] - anchors[b, k, a, 0] + 1
100 |                         ex_height = anchors[b, k, a, 3] - anchors[b, k, a, 1] + 1
101 |                         ex_center_x = anchors[b, k, a, 0] + ex_width / 2.0
102 |                         ex_center_y = anchors[b, k, a, 1] + ex_height / 2.0
103 |                         gt_width = gt_boxes[b][max_g, 2] - gt_boxes[b][max_g, 0] + 1
104 |                         gt_height = gt_boxes[b][max_g, 3] - gt_boxes[b][max_g, 1] + 1
105 |                         gt_center_x = gt_boxes[b][max_g, 0] + gt_width / 2.0
106 |                         gt_center_y = gt_boxes[b][max_g, 1] + gt_height / 2.0
107 | 
108 |                         anchors[b, k, a, 0] = (gt_center_x - ex_center_x) / (ex_width)
109 |                         anchors[b, k, a, 1] = (gt_center_y - ex_center_y) / (ex_height)
110 |                         anchors[b, k, a, 2] = log(gt_width / (ex_width))
111 |                         anchors[b, k, a, 3] = log(gt_height / (ex_height))
112 |     return anchors, true_index, false_index
113 | 
--------------------------------------------------------------------------------
/cython_util/bbox_transform_inv.pyx:
--------------------------------------------------------------------------------
 1 | cimport cython
 2 | import numpy as np
 3 | cimport numpy as np
 4 | 
 5 | from libc.math cimport exp
 6 | 
 7 | 
 8 | DTYPE = np.float
 9 | ctypedef np.float_t DTYPE_t
10 | 
11 | def bbox_transform_inv_clip(
12 |         np.ndarray[DTYPE_t, ndim=3] anchors,
13 |         np.ndarray[DTYPE_t, ndim=3] rpn_bbox,
14 |         unsigned int image_width,
15 |         unsigned int image_height):
16 |     """
17 |     Parameters
18 |     ----------
19 |     anchors: (Batch_Size, K*A, 4) ndarray of float
20 |     rpn_bbox: (Batch_Size, K*A, 4) ndarray of float
21 |     Returns: anchors, decoded and clipped to the image (modified in place)
22 |     """
23 |     cdef unsigned int B = anchors.shape[0]
24 |     cdef unsigned int KA = anchors.shape[1]
25 |     cdef DTYPE_t ex_width, ex_height, ex_center_x, ex_center_y, gt_width, gt_height, gt_center_x, gt_center_y
26 |     cdef unsigned int ka, b
27 | 
28 |     for b in range(B):
29 |         for ka in range(KA):
30 |             ex_width = anchors[b, ka, 2] - anchors[b, ka, 0] + 1
31 |             ex_height = anchors[b, ka, 3] - anchors[b, ka, 1] + 1
32 |             ex_center_x = anchors[b, ka, 0] + ex_width / 2.0
33 |             ex_center_y = anchors[b, ka, 1] + ex_height / 2.0
34 | 
35 |             pred_center_x = rpn_bbox[b, ka, 0] * ex_width + ex_center_x
36 |             pred_center_y = rpn_bbox[b, ka, 1] * ex_height + ex_center_y
37 |             pred_width = exp(rpn_bbox[b, ka, 2]) * ex_width
38 |             pred_height = exp(rpn_bbox[b, ka, 3]) * ex_height
39 | 
40 |             anchors[b, ka, 0] = max(pred_center_x - pred_width / 2.0, 0)
41 |             anchors[b, ka, 1] = max(pred_center_y - pred_height / 2.0, 0)
42 |             anchors[b, ka, 2] = min(pred_center_x + pred_width / 2.0, image_width-1)
43 |             anchors[b, ka, 3] = min(pred_center_y + pred_height / 2.0, image_height-1)
44 | 
45 | 
46 |     return anchors
47 | 
--------------------------------------------------------------------------------
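Note: the two kernels above form the encode/decode pair for anchor regression. `bbox_overlaps` writes the targets (dx, dy, log dw, log dh) into the positive anchors, and `bbox_transform_inv_clip` applies predicted deltas to anchors and clips the result to the image. A vectorized NumPy sketch of the same pair is handy for spot-checking the Cython loops on small inputs; the helper names below are illustrative only (they are not part of this repository) and assume (x1, y1, x2, y2) boxes with the same "+1" width/height convention as the kernels above.

```
import numpy as np

def encode_boxes(anchors, gt):
    # Mirrors the target encoding at the end of bbox_overlaps:
    # (dx, dy, log(dw), log(dh)) of each ground-truth box relative to its anchor.
    aw = anchors[:, 2] - anchors[:, 0] + 1
    ah = anchors[:, 3] - anchors[:, 1] + 1
    ax = anchors[:, 0] + aw / 2.0
    ay = anchors[:, 1] + ah / 2.0
    gw = gt[:, 2] - gt[:, 0] + 1
    gh = gt[:, 3] - gt[:, 1] + 1
    gx = gt[:, 0] + gw / 2.0
    gy = gt[:, 1] + gh / 2.0
    return np.stack([(gx - ax) / aw, (gy - ay) / ah,
                     np.log(gw / aw), np.log(gh / ah)], axis=1)

def decode_boxes(anchors, deltas, image_width, image_height):
    # Mirrors bbox_transform_inv_clip for a single image: apply the deltas,
    # convert back to corner form, and clip to the image bounds.
    aw = anchors[:, 2] - anchors[:, 0] + 1
    ah = anchors[:, 3] - anchors[:, 1] + 1
    ax = anchors[:, 0] + aw / 2.0
    ay = anchors[:, 1] + ah / 2.0
    cx = deltas[:, 0] * aw + ax
    cy = deltas[:, 1] * ah + ay
    w = np.exp(deltas[:, 2]) * aw
    h = np.exp(deltas[:, 3]) * ah
    boxes = np.stack([cx - w / 2.0, cy - h / 2.0,
                      cx + w / 2.0, cy + h / 2.0], axis=1)
    boxes[:, 0::2] = np.clip(boxes[:, 0::2], 0, image_width - 1)
    boxes[:, 1::2] = np.clip(boxes[:, 1::2], 0, image_height - 1)
    return boxes
```

For example, `decode_boxes(a, encode_boxes(a, g), W, H)` should reproduce `g` up to the "+1" corner convention and clipping, which gives a quick regression test against the Cython versions.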
/cython_util/bool_anchors_inside_image.pyx: -------------------------------------------------------------------------------- 1 | cimport cython 2 | import numpy as np 3 | cimport numpy as np 4 | 5 | DTYPE = np.float 6 | ctypedef np.float_t DTYPE_t 7 | 8 | DTYPE_int = np.int 9 | ctypedef np.int_t DTYPE_int_t 10 | 11 | def batch_inside_image( 12 | np.ndarray[DTYPE_t, ndim=4] boxes, 13 | unsigned int width, 14 | unsigned int height): 15 | """ 16 | Parameters 17 | ---------- 18 | boxes: (B, K, A, 4) ndarray of float 19 | width: width of input images 20 | height: height of input images 21 | Returns 22 | ------- 23 | is_inside: (B, N, K) ndarray of overlap between boxes and query_boxes 24 | """ 25 | cdef unsigned int B = boxes.shape[0] 26 | cdef unsigned int K = boxes.shape[1] 27 | cdef unsigned int A = boxes.shape[2] 28 | cdef np.ndarray[DTYPE_int_t, ndim=3] is_inside = np.zeros((B, K, A), dtype=DTYPE_int) 29 | cdef unsigned int k, a, b 30 | for b in range(B): 31 | for k in range(K): 32 | for a in range(A): 33 | if boxes[b, k, a, 0] >= 0: 34 | if boxes[b, k, a, 1] >= 0: 35 | if boxes[b, k, a, 2] < width: 36 | if boxes[b, k, a, 3] < height: 37 | is_inside[b, k, a] = 1 38 | return is_inside 39 | 40 | def inside_image( 41 | np.ndarray[DTYPE_t, ndim=3] boxes, 42 | unsigned int width, 43 | unsigned int height): 44 | """ 45 | Parameters 46 | ---------- 47 | boxes: (K, A, 4) ndarray of float 48 | width: width of input images 49 | height: height of input images 50 | Returns 51 | ------- 52 | is_inside: (N, K) ndarray of overlap between boxes and query_boxes 53 | """ 54 | cdef unsigned int K = boxes.shape[0] 55 | cdef unsigned int A = boxes.shape[1] 56 | cdef np.ndarray[DTYPE_int_t, ndim=2] is_inside = np.zeros((K, A), dtype=DTYPE_int) 57 | cdef unsigned int k, a 58 | for k in range(K): 59 | for a in range(A): 60 | if boxes[k, a, 0] >= 0: 61 | if boxes[k, a, 1] >= 0: 62 | if boxes[k, a, 2] < width: 63 | if boxes[k, a, 3] < height: 64 | is_inside[k, a] = 1 65 | return is_inside 66 | -------------------------------------------------------------------------------- /cython_util/nms.pyx: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | cimport cython 8 | import numpy as np 9 | cimport numpy as np 10 | 11 | cdef inline np.float32_t max(np.float32_t a, np.float32_t b): 12 | return a if a >= b else b 13 | 14 | cdef inline np.float32_t min(np.float32_t a, np.float32_t b): 15 | return a if a <= b else b 16 | 17 | cdef np.ndarray[np.int_t, ndim=1] cpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh): 18 | cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0] 19 | cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1] 20 | cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2] 21 | cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3] 22 | cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4] 23 | 24 | cdef np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1) 25 | cdef np.ndarray[np.int_t, ndim=1] order = scores.argsort()[::-1] 26 | 27 | cdef int ndets = dets.shape[0] 28 | cdef np.ndarray[np.int_t, ndim=1] suppressed = \ 29 | np.zeros((ndets), dtype=np.int) 30 | 31 | # nominal indices 32 | cdef int _i, _j 33 | # sorted indices 34 | cdef int i, j 35 | # temp variables for box i's 
(the box currently under consideration) 36 | cdef np.float32_t ix1, iy1, ix2, iy2, iarea 37 | # variables for computing overlap with box j (lower scoring box) 38 | cdef np.float32_t xx1, yy1, xx2, yy2 39 | cdef np.float32_t w, h 40 | cdef np.float32_t inter, ovr 41 | 42 | keep = [] 43 | for _i in range(ndets): 44 | i = order[_i] 45 | if suppressed[i] == 1: 46 | continue 47 | keep.append(i) 48 | ix1 = x1[i] 49 | iy1 = y1[i] 50 | ix2 = x2[i] 51 | iy2 = y2[i] 52 | iarea = areas[i] 53 | for _j in range(_i + 1, ndets): 54 | j = order[_j] 55 | if suppressed[j] == 1: 56 | continue 57 | xx1 = max(ix1, x1[j]) 58 | yy1 = max(iy1, y1[j]) 59 | xx2 = min(ix2, x2[j]) 60 | yy2 = min(iy2, y2[j]) 61 | w = max(0.0, xx2 - xx1 + 1) 62 | h = max(0.0, yy2 - yy1 + 1) 63 | inter = w * h 64 | ovr = inter / (iarea + areas[j] - inter) 65 | if ovr >= thresh: 66 | suppressed[j] = 1 67 | 68 | return keep 69 | -------------------------------------------------------------------------------- /cython_util/remove_extraboxes.pyx: -------------------------------------------------------------------------------- 1 | cimport cython 2 | import numpy as np 3 | cimport numpy as np 4 | 5 | DTYPE_int = np.int 6 | ctypedef np.int_t DTYPE_int_t 7 | 8 | def remove_extraboxes( 9 | np.ndarray[DTYPE_int_t, ndim=1] array1, 10 | np.ndarray[DTYPE_int_t, ndim=1] array2, 11 | np.ndarray[DTYPE_int_t, ndim=1] select, 12 | np.ndarray[DTYPE_int_t, ndim=1] batch): 13 | """ 14 | Parameters 15 | ---------- 16 | array1: (A) ndarray of int 17 | array2: (A) ndarray of int 18 | select: (B) ndarray of int 19 | Returns 20 | ------- 21 | extract_array1 : (64) ndarray of index of remove boxes 22 | extract_array2 : (64) ndarray of index of remove boxes 23 | """ 24 | cdef unsigned int remove_size = select.shape[0] 25 | cdef np.ndarray[DTYPE_int_t, ndim=1] extract_array1 = np.zeros((remove_size), dtype=DTYPE_int) 26 | cdef np.ndarray[DTYPE_int_t, ndim=1] extract_array2 = np.zeros((remove_size), dtype=DTYPE_int) 27 | cdef unsigned int rs 28 | 29 | for rs in range(remove_size): 30 | extract_array1[rs] = array1[select[rs]] 31 | extract_array2[rs] = array2[select[rs]] 32 | return batch, extract_array1, extract_array2 33 | -------------------------------------------------------------------------------- /cython_util/setup.py: -------------------------------------------------------------------------------- 1 | #python setup.py build_ext --inplace 2 | # -------------------------------------------------------- 3 | # Fast R-CNN 4 | # Copyright (c) 2015 Microsoft 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # Written by Ross Girshick 7 | # -------------------------------------------------------- 8 | 9 | import os 10 | from os.path import join as pjoin 11 | import numpy as np 12 | from distutils.core import setup 13 | from distutils.extension import Extension 14 | from Cython.Distutils import build_ext 15 | 16 | def find_in_path(name, path): 17 | "Find a file in a search path" 18 | for dir in path.split(os.pathsep): 19 | binpath = pjoin(dir, name) 20 | if os.path.exists(binpath): 21 | return os.path.abspath(binpath) 22 | return None 23 | 24 | def locate_cuda(): 25 | """Locate the CUDA environment on the system 26 | Returns a dict with keys 'home', 'nvcc', 'include', and 'lib64' 27 | and values giving the absolute path to each directory. 28 | Starts by looking for the CUDAHOME env variable. If not found, everything 29 | is based on finding 'nvcc' in the PATH. 
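    Returns None if the CUDA environment cannot be located.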
30 | """ 31 | 32 | # first check if the CUDAHOME env variable is in use 33 | if 'CUDAHOME' in os.environ: 34 | home = os.environ['CUDAHOME'] 35 | nvcc = pjoin(home, 'bin', 'nvcc') 36 | else: 37 | # otherwise, search the PATH for NVCC 38 | default_path = pjoin(os.sep, 'usr', 'local', 'cuda', 'bin') 39 | nvcc = find_in_path('nvcc', os.environ['PATH'] + os.pathsep + default_path) 40 | if nvcc is None: 41 | return None; 42 | home = os.path.dirname(os.path.dirname(nvcc)) 43 | 44 | cudaconfig = {'home':home, 'nvcc':nvcc, 45 | 'include': pjoin(home, 'include'), 46 | 'lib64': pjoin(home, 'lib64')} 47 | for k, v in cudaconfig.iteritems(): 48 | if not os.path.exists(v): 49 | return None; 50 | 51 | return cudaconfig 52 | 53 | CUDA = locate_cuda() 54 | 55 | # Obtain the numpy include directory. This logic works across numpy versions. 56 | try: 57 | numpy_include = np.get_include() 58 | except AttributeError: 59 | numpy_include = np.get_numpy_include() 60 | 61 | def customize_compiler_for_nvcc(self): 62 | """inject deep into distutils to customize how the dispatch 63 | to gcc/nvcc works. 64 | If you subclass UnixCCompiler, it's not trivial to get your subclass 65 | injected in, and still have the right customizations (i.e. 66 | distutils.sysconfig.customize_compiler) run on it. So instead of going 67 | the OO route, I have this. Note, it's kindof like a wierd functional 68 | subclassing going on.""" 69 | 70 | # tell the compiler it can processes .cu 71 | self.src_extensions.append('.cu') 72 | 73 | # save references to the default compiler_so and _comple methods 74 | default_compiler_so = self.compiler_so 75 | super = self._compile 76 | 77 | # now redefine the _compile method. This gets executed for each 78 | # object but distutils doesn't have the ability to change compilers 79 | # based on source extension: we add it. 
80 | def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts): 81 | print extra_postargs 82 | if os.path.splitext(src)[1] == '.cu': 83 | # use the cuda for .cu files 84 | self.set_executable('compiler_so', CUDA['nvcc']) 85 | # use only a subset of the extra_postargs, which are 1-1 translated 86 | # from the extra_compile_args in the Extension class 87 | postargs = extra_postargs['nvcc'] 88 | else: 89 | postargs = extra_postargs['gcc'] 90 | 91 | super(obj, src, ext, cc_args, postargs, pp_opts) 92 | # reset the default compiler_so, which we might have changed for cuda 93 | self.compiler_so = default_compiler_so 94 | 95 | # inject our redefined _compile method into the class 96 | self._compile = _compile 97 | 98 | 99 | # run the customize_compiler 100 | class custom_build_ext(build_ext): 101 | def build_extensions(self): 102 | customize_compiler_for_nvcc(self.compiler) 103 | build_ext.build_extensions(self) 104 | 105 | ext_modules = [ 106 | Extension( 107 | "nms", 108 | ["nms.pyx"], 109 | extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]}, 110 | include_dirs = [numpy_include] 111 | ), 112 | Extension( 113 | "bbox_transform_inv", 114 | ["bbox_transform_inv.pyx"], 115 | extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]}, 116 | include_dirs = [numpy_include] 117 | ), 118 | Extension( 119 | "bbox_overlap", 120 | ["bbox_overlap.pyx"], 121 | extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]}, 122 | include_dirs = [numpy_include] 123 | ), 124 | Extension( 125 | "bool_anchors_inside_image", 126 | ["bool_anchors_inside_image.pyx"], 127 | extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]}, 128 | include_dirs = [numpy_include] 129 | ), 130 | Extension( 131 | "remove_extraboxes", 132 | ["remove_extraboxes.pyx"], 133 | extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]}, 134 | include_dirs = [numpy_include] 135 | ), 136 | ] 137 | 138 | # if CUDA: 139 | # ext_modules.append( 140 | # Extension('nms.gpu_nms', 141 | # ['nms/nms_kernel.cu', 'nms/gpu_nms.pyx'], 142 | # library_dirs=[CUDA['lib64']], 143 | # libraries=['cudart'], 144 | # language='c++', 145 | # runtime_library_dirs=[CUDA['lib64']], 146 | # # this syntax is specific to this build system 147 | # # we're only going to use certain compiler args with nvcc and not with gcc 148 | # # the implementation of this trick is in customize_compiler() below 149 | # extra_compile_args={'gcc': ["-Wno-unused-function"], 150 | # 'nvcc': ['-arch=sm_35', 151 | # '--ptxas-options=-v', 152 | # '-c', 153 | # '--compiler-options', 154 | # "'-fPIC'"]}, 155 | # include_dirs = [numpy_include, CUDA['include']] 156 | # ) 157 | # ) 158 | 159 | setup( 160 | ext_modules=ext_modules, 161 | cmdclass={'build_ext': custom_build_ext}, 162 | ) 163 | -------------------------------------------------------------------------------- /cython_util/setup.sh: -------------------------------------------------------------------------------- 1 | python setup.py build_ext --inplace 2 | 3 | -------------------------------------------------------------------------------- /pretrain/base_vgg16.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import os 3 | 4 | import numpy as np 5 | import tensorflow as tf 6 | import time 7 | 8 | VGG_MEAN = [103.939, 116.779, 123.68] 9 | 10 | class Vgg16: 11 | def __init__(self, vgg16_npy_path=None): 12 | if vgg16_npy_path is None: 13 | path = inspect.getfile(Vgg16) 14 | path = os.path.abspath(os.path.join(path, os.pardir)) 15 | path = 
os.path.join(path, "vgg16.npy") 16 | vgg16_npy_path = path 17 | print(path) 18 | 19 | self.data_dict = np.load(vgg16_npy_path, encoding='latin1').item() 20 | print("npy file loaded") 21 | 22 | def build_model(self, bgr): 23 | """ 24 | load variable from npy to build the VGG 25 | :param rgb: rgb image [batch, height, width, 3] values scaled [0, 1] 26 | """ 27 | start_time = time.time() 28 | print("build model started") 29 | # rgb_scaled = rgb * 1.0 30 | 31 | # Convert RGB to BGR 32 | # red, green, blue = tf.split(axis=3, num_or_size_splits=3, value=rgb_scaled) 33 | # bgr = tf.concat(axis=3, values=[ 34 | # blue, 35 | # green, 36 | # red, 37 | # ]) 38 | 39 | self.conv1_1 = self.conv_layer(bgr, "conv1_1") 40 | self.conv1_2 = self.conv_layer(self.conv1_1, "conv1_2") 41 | self.pool1 = self.max_pool(self.conv1_2, 'pool1') 42 | 43 | self.conv2_1 = self.conv_layer(self.pool1, "conv2_1") 44 | self.conv2_2 = self.conv_layer(self.conv2_1, "conv2_2") 45 | self.pool2 = self.max_pool(self.conv2_2, 'pool2') 46 | 47 | self.conv3_1 = self.conv_layer(self.pool2, "conv3_1") 48 | self.conv3_2 = self.conv_layer(self.conv3_1, "conv3_2") 49 | self.conv3_3 = self.conv_layer(self.conv3_2, "conv3_3") 50 | self.pool3 = self.max_pool(self.conv3_3, 'pool3') 51 | 52 | self.conv4_1 = self.conv_layer(self.pool3, "conv4_1") 53 | self.conv4_2 = self.conv_layer(self.conv4_1, "conv4_2") 54 | self.conv4_3 = self.conv_layer(self.conv4_2, "conv4_3") 55 | self.pool4 = self.max_pool(self.conv4_3, 'pool4') 56 | 57 | self.conv5_1 = self.conv_layer(self.pool4, "conv5_1") 58 | self.conv5_2 = self.conv_layer(self.conv5_1, "conv5_2") 59 | self.conv5_3 = self.conv_layer(self.conv5_2, "conv5_3") 60 | 61 | self.data_dict = None 62 | print(("build model finished: %ds" % (time.time() - start_time))) 63 | 64 | def avg_pool(self, bottom, name): 65 | return tf.nn.avg_pool(bottom, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME', name=name) 66 | 67 | def max_pool(self, bottom, name): 68 | return tf.nn.max_pool(bottom, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME', name=name) 69 | 70 | def conv_layer(self, bottom, name): 71 | with tf.variable_scope(name): 72 | filt = self.get_conv_filter(name) 73 | 74 | conv = tf.nn.conv2d(bottom, filt, [1, 1, 1, 1], padding='SAME') 75 | 76 | conv_biases = self.get_bias(name) 77 | bias = tf.nn.bias_add(conv, conv_biases) 78 | 79 | relu = tf.nn.relu(bias) 80 | return relu 81 | 82 | def fc_layer(self, bottom, name): 83 | with tf.variable_scope(name): 84 | shape = bottom.get_shape().as_list() 85 | dim = 1 86 | for d in shape[1:]: 87 | dim *= d 88 | x = tf.reshape(bottom, [-1, dim]) 89 | 90 | weights = self.get_fc_weight(name) 91 | biases = self.get_bias(name) 92 | 93 | # Fully connected layer. Note that the '+' operation automatically 94 | # broadcasts the biases. 
95 | fc = tf.nn.bias_add(tf.matmul(x, weights), biases) 96 | 97 | return fc 98 | 99 | def get_conv_filter(self, name): 100 | return tf.Variable(self.data_dict[name][0], name="filter") 101 | 102 | def get_bias(self, name): 103 | return tf.Variable(self.data_dict[name][1], name="biases") 104 | 105 | def get_fc_weight(self, name): 106 | return tf.Variable(self.data_dict[name][0], name="weights") 107 | -------------------------------------------------------------------------------- /pretrain/vgg16.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import os 3 | 4 | import numpy as np 5 | import tensorflow as tf 6 | import time 7 | 8 | VGG_MEAN = [103.939, 116.779, 123.68] 9 | 10 | 11 | class Vgg16: 12 | def __init__(self, vgg16_npy_path=None): 13 | if vgg16_npy_path is None: 14 | path = inspect.getfile(Vgg16) 15 | path = os.path.abspath(os.path.join(path, os.pardir)) 16 | path = os.path.join(path, "vgg16.npy") 17 | vgg16_npy_path = path 18 | print(path) 19 | 20 | self.data_dict = np.load(vgg16_npy_path, encoding='latin1').item() 21 | print("npy file loaded") 22 | 23 | def build_model(self, rgb): 24 | """ 25 | load variable from npy to build the VGG 26 | :param rgb: rgb image [batch, height, width, 3] values scaled [0, 1] 27 | """ 28 | 29 | start_time = time.time() 30 | print("build model started") 31 | rgb_scaled = rgb * 255.0 32 | 33 | # Convert RGB to BGR 34 | red, green, blue = tf.split(axis=3, num_or_size_splits=3, value=rgb_scaled) 35 | assert red.get_shape().as_list()[1:] == [224, 224, 1] 36 | assert green.get_shape().as_list()[1:] == [224, 224, 1] 37 | assert blue.get_shape().as_list()[1:] == [224, 224, 1] 38 | bgr = tf.concat(axis=3, values=[ 39 | blue - VGG_MEAN[0], 40 | green - VGG_MEAN[1], 41 | red - VGG_MEAN[2], 42 | ]) 43 | assert bgr.get_shape().as_list()[1:] == [224, 224, 3] 44 | 45 | self.conv1_1 = self.conv_layer(bgr, "conv1_1") 46 | self.conv1_2 = self.conv_layer(self.conv1_1, "conv1_2") 47 | self.pool1 = self.max_pool(self.conv1_2, 'pool1') 48 | 49 | self.conv2_1 = self.conv_layer(self.pool1, "conv2_1") 50 | self.conv2_2 = self.conv_layer(self.conv2_1, "conv2_2") 51 | self.pool2 = self.max_pool(self.conv2_2, 'pool2') 52 | 53 | self.conv3_1 = self.conv_layer(self.pool2, "conv3_1") 54 | self.conv3_2 = self.conv_layer(self.conv3_1, "conv3_2") 55 | self.conv3_3 = self.conv_layer(self.conv3_2, "conv3_3") 56 | self.pool3 = self.max_pool(self.conv3_3, 'pool3') 57 | 58 | self.conv4_1 = self.conv_layer(self.pool3, "conv4_1") 59 | self.conv4_2 = self.conv_layer(self.conv4_1, "conv4_2") 60 | self.conv4_3 = self.conv_layer(self.conv4_2, "conv4_3") 61 | self.pool4 = self.max_pool(self.conv4_3, 'pool4') 62 | 63 | self.conv5_1 = self.conv_layer(self.pool4, "conv5_1") 64 | self.conv5_2 = self.conv_layer(self.conv5_1, "conv5_2") 65 | self.conv5_3 = self.conv_layer(self.conv5_2, "conv5_3") 66 | self.pool5 = self.max_pool(self.conv5_3, 'pool5') 67 | 68 | self.fc6 = self.fc_layer(self.pool5, "fc6") 69 | assert self.fc6.get_shape().as_list()[1:] == [4096] 70 | self.relu6 = tf.nn.relu(self.fc6) 71 | 72 | self.fc7 = self.fc_layer(self.relu6, "fc7") 73 | self.relu7 = tf.nn.relu(self.fc7) 74 | 75 | self.fc8 = self.fc_layer(self.relu7, "fc8") 76 | 77 | self.prob = tf.nn.softmax(self.fc8, name="prob") 78 | 79 | self.data_dict = None 80 | print(("build model finished: %ds" % (time.time() - start_time))) 81 | 82 | def avg_pool(self, bottom, name): 83 | return tf.nn.avg_pool(bottom, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME', 
name=name) 84 | 85 | def max_pool(self, bottom, name): 86 | return tf.nn.max_pool(bottom, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME', name=name) 87 | 88 | def conv_layer(self, bottom, name): 89 | with tf.variable_scope(name): 90 | filt = self.get_conv_filter(name) 91 | 92 | conv = tf.nn.conv2d(bottom, filt, [1, 1, 1, 1], padding='SAME') 93 | 94 | conv_biases = self.get_bias(name) 95 | bias = tf.nn.bias_add(conv, conv_biases) 96 | 97 | relu = tf.nn.relu(bias) 98 | return relu 99 | 100 | def fc_layer(self, bottom, name): 101 | with tf.variable_scope(name): 102 | shape = bottom.get_shape().as_list() 103 | dim = 1 104 | for d in shape[1:]: 105 | dim *= d 106 | x = tf.reshape(bottom, [-1, dim]) 107 | 108 | weights = self.get_fc_weight(name) 109 | biases = self.get_bias(name) 110 | 111 | # Fully connected layer. Note that the '+' operation automatically 112 | # broadcasts the biases. 113 | fc = tf.nn.bias_add(tf.matmul(x, weights), biases) 114 | 115 | return fc 116 | 117 | def get_conv_filter(self, name): 118 | return tf.constant(self.data_dict[name][0], name="filter") 119 | 120 | def get_bias(self, name): 121 | return tf.constant(self.data_dict[name][1], name="biases") 122 | 123 | def get_fc_weight(self, name): 124 | return tf.constant(self.data_dict[name][0], name="weights") 125 | -------------------------------------------------------------------------------- /pretrain/vgg16_vehicle.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import os 3 | 4 | import numpy as np 5 | import tensorflow as tf 6 | import time 7 | 8 | VGG_MEAN = [103.939, 116.779, 123.68] 9 | 10 | class Vgg16: 11 | def __init__(self, vgg16_npy_path=None): 12 | if vgg16_npy_path is None: 13 | path = inspect.getfile(Vgg16) 14 | path = os.path.abspath(os.path.join(path, os.pardir)) 15 | path = os.path.join(path, "vgg16.npy") 16 | vgg16_npy_path = path 17 | print(path) 18 | 19 | self.data_dict = np.load(vgg16_npy_path, encoding='latin1').item() 20 | print("npy file loaded") 21 | 22 | def build_model(self, bgr): 23 | """ 24 | load variable from npy to build the VGG 25 | :param rgb: rgb image [batch, height, width, 3] values scaled [0, 1] 26 | """ 27 | start_time = time.time() 28 | print("build model started") 29 | # rgb_scaled = rgb * 1.0 30 | 31 | # Convert RGB to BGR 32 | # red, green, blue = tf.split(axis=3, num_or_size_splits=3, value=rgb_scaled) 33 | # bgr = tf.concat(axis=3, values=[ 34 | # blue, 35 | # green, 36 | # red, 37 | # ]) 38 | 39 | self.conv1_1 = self.conv_layer(bgr, "conv1_1", training=False) 40 | self.conv1_2 = self.conv_layer(self.conv1_1, "conv1_2", training=False) 41 | self.pool1 = self.max_pool(self.conv1_2, 'pool1') 42 | 43 | self.conv2_1 = self.conv_layer(self.pool1, "conv2_1", training=False) 44 | self.conv2_2 = self.conv_layer(self.conv2_1, "conv2_2", training=False) 45 | self.pool2 = self.max_pool(self.conv2_2, 'pool2') 46 | 47 | self.conv3_1 = self.conv_layer(self.pool2, "conv3_1") 48 | self.conv3_2 = self.conv_layer(self.conv3_1, "conv3_2") 49 | self.conv3_3 = self.conv_layer(self.conv3_2, "conv3_3") 50 | self.pool3 = self.max_pool(self.conv3_3, 'pool3') 51 | 52 | self.conv4_1 = self.conv_layer(self.pool3, "conv4_1") 53 | self.conv4_2 = self.conv_layer(self.conv4_1, "conv4_2") 54 | self.conv4_3 = self.conv_layer(self.conv4_2, "conv4_3") 55 | 56 | self.data_dict = None 57 | print(("build model finished: %ds" % (time.time() - start_time))) 58 | 59 | def avg_pool(self, bottom, name): 60 | return tf.nn.avg_pool(bottom, ksize=[1, 2, 2, 
1], strides=[1, 2, 2, 1], padding='SAME', name=name) 61 | 62 | def max_pool(self, bottom, name): 63 | return tf.nn.max_pool(bottom, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME', name=name) 64 | 65 | def conv_layer(self, bottom, name, training=True): 66 | with tf.variable_scope(name): 67 | filt = self.get_conv_filter(name, training=training) 68 | 69 | conv = tf.nn.conv2d(bottom, filt, [1, 1, 1, 1], padding='SAME') 70 | 71 | conv_biases = self.get_bias(name, training=training) 72 | bias = tf.nn.bias_add(conv, conv_biases) 73 | 74 | relu = tf.nn.relu(bias) 75 | return relu 76 | 77 | def fc_layer(self, bottom, name): 78 | with tf.variable_scope(name): 79 | shape = bottom.get_shape().as_list() 80 | dim = 1 81 | for d in shape[1:]: 82 | dim *= d 83 | x = tf.reshape(bottom, [-1, dim]) 84 | 85 | weights = self.get_fc_weight(name) 86 | biases = self.get_bias(name) 87 | 88 | # Fully connected layer. Note that the '+' operation automatically 89 | # broadcasts the biases. 90 | fc = tf.nn.bias_add(tf.matmul(x, weights), biases) 91 | 92 | return fc 93 | 94 | def get_conv_filter(self, name, training=True): 95 | return tf.Variable(self.data_dict[name][0], name="filter", trainable=training) 96 | 97 | def get_bias(self, name, training=True): 98 | return tf.Variable(self.data_dict[name][1], name="biases", trainable=training) 99 | 100 | def get_fc_weight(self, name): 101 | return tf.Variable(self.data_dict[name][0], name="weights") 102 | -------------------------------------------------------------------------------- /rcnn/base_rpn.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import os 3 | 4 | import numpy as np 5 | import tensorflow as tf 6 | import time 7 | 8 | VGG_MEAN = [103.939, 116.779, 123.68] 9 | 10 | class RPN: 11 | def __init__(self, vgg16_npy_path=None): 12 | if vgg16_npy_path is None: 13 | path = inspect.getfile(Vgg16) 14 | path = os.path.abspath(os.path.join(path, os.pardir)) 15 | path = os.path.join(path, "vgg16.npy") 16 | vgg16_npy_path = path 17 | print(path) 18 | 19 | self.data_dict = np.load(vgg16_npy_path, encoding='latin1').item() 20 | print("npy file loaded") 21 | 22 | def build_model(self, bgr): 23 | """ 24 | load variable from npy to build the VGG 25 | :param rgb: rgb image [batch, height, width, 3] values scaled [0, 1] 26 | """ 27 | start_time = time.time() 28 | print("build model started") 29 | 30 | self.conv1_1 = self.conv_layer(bgr, "conv1_1") 31 | self.conv1_2 = self.conv_layer(self.conv1_1, "conv1_2") 32 | self.pool1 = self.max_pool(self.conv1_2, 'pool1') 33 | 34 | self.conv2_1 = self.conv_layer(self.pool1, "conv2_1") 35 | self.conv2_2 = self.conv_layer(self.conv2_1, "conv2_2") 36 | self.pool2 = self.max_pool(self.conv2_2, 'pool2') 37 | 38 | self.conv3_1 = self.conv_layer(self.pool2, "conv3_1") 39 | self.conv3_2 = self.conv_layer(self.conv3_1, "conv3_2") 40 | self.conv3_3 = self.conv_layer(self.conv3_2, "conv3_3") 41 | self.pool3 = self.max_pool(self.conv3_3, 'pool3') 42 | 43 | self.conv4_1 = self.conv_layer(self.pool3, "conv4_1") 44 | self.conv4_2 = self.conv_layer(self.conv4_1, "conv4_2") 45 | self.conv4_3 = self.conv_layer(self.conv4_2, "conv4_3") 46 | 47 | self.data_dict = None 48 | print(("build model finished: %ds" % (time.time() - start_time))) 49 | 50 | def avg_pool(self, bottom, name): 51 | return tf.nn.avg_pool(bottom, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME', name=name) 52 | 53 | def max_pool(self, bottom, name): 54 | return tf.nn.max_pool(bottom, ksize=[1, 2, 2, 1], strides=[1, 
2, 2, 1], padding='SAME', name=name) 55 | 56 | def conv_layer(self, bottom, name): 57 | with tf.variable_scope(name): 58 | filt = self.get_conv_filter(name) 59 | 60 | conv = tf.nn.conv2d(bottom, filt, [1, 1, 1, 1], padding='SAME') 61 | 62 | conv_biases = self.get_bias(name) 63 | bias = tf.nn.bias_add(conv, conv_biases) 64 | 65 | relu = tf.nn.relu(bias) 66 | return relu 67 | 68 | def fc_layer(self, bottom, name): 69 | with tf.variable_scope(name): 70 | shape = bottom.get_shape().as_list() 71 | dim = 1 72 | for d in shape[1:]: 73 | dim *= d 74 | x = tf.reshape(bottom, [-1, dim]) 75 | 76 | weights = self.get_fc_weight(name) 77 | biases = self.get_bias(name) 78 | 79 | # Fully connected layer. Note that the '+' operation automatically 80 | # broadcasts the biases. 81 | fc = tf.nn.bias_add(tf.matmul(x, weights), biases) 82 | 83 | return fc 84 | 85 | def get_conv_filter(self, name): 86 | return tf.Variable(self.data_dict[name][0], name="filter") 87 | 88 | def get_bias(self, name): 89 | return tf.Variable(self.data_dict[name][1], name="biases") 90 | 91 | def get_fc_weight(self, name): 92 | return tf.Variable(self.data_dict[name][0], name="weights") 93 | -------------------------------------------------------------------------------- /rcnn/proposal_layer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | def rpn_to_roi(): 4 | pass 5 | -------------------------------------------------------------------------------- /rcnn/rcnn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | sys.path.append("../") 5 | sys.path.append("../util") 6 | sys.path.append("../cython_util") 7 | sys.path.append("../pretrain") 8 | import glob 9 | import cv2 10 | import numpy as np 11 | # from vgg16 import vgg16 12 | from input_kitti import * 13 | from data_util import * 14 | from parse_xml import parseXML 15 | from vgg16_vehicle import Vgg16 as Vgg 16 | import tensorflow as tf 17 | from network_util import * 18 | from bbox_overlap import bbox_overlaps 19 | from remove_extraboxes import remove_extraboxes 20 | from bool_anchors_inside_image import batch_inside_image 21 | from generate_anchors import generate_anchors 22 | 23 | """ 24 | ・collect dataset of cars 25 | ・Preprocessing BBOX and Label for training 26 | ・try roi_pooling layer 27 | ・Extract ROI using mitmul tools 28 | ・NMS 29 | """ 30 | 31 | """Flow of Fast RCNN 32 | ############################################################################### 33 | In this state, Create Input Images and ROI Labels 34 | 35 | 1. input batch images and GroundTruth BBox from datasets *folder name, batch size 36 | Image shape is [batch size, width, height, channel], tf.float32, vgg normalized, bgr 37 | Bounding Box shape is [batch size, center_x, center_y, width, height] 38 | 39 | 2. get candicate bounding box from images. 40 | 41 | # Implemented 42 | 3. resize input images to input size *size of resize if needed. 43 | if this operation was done, you should adjust bounding box according to it. 44 | Both of Candicate and GroundTruth Bounding Boxes. 45 | In thesis, Image size is in [600, 1000] 46 | In this Implemention, input image has dynamic shape between [600, 1000] 47 | 48 | 4. convert candicate bounding box to ROI label. 49 | 50 | 5. calculate IOU between ROI label and GroundTruth label. 51 | IOU is Intersection Over Union. 52 | 53 | 6. Select Bounding Box from IOU. 
54 | IOU > 0.5 is correct label, IOU = [0.1 0.5) is a false label(background). 55 | Correct Label is 25%, BackGround Label is 75%. 56 | Number of Label is 128, Batch Size is 2, so each image has 64 ROIs 57 | 58 | ############################################################################### 59 | In this stage, Calculate Loss 60 | 61 | 7. Input data to ROI Pooling Layer is Conv5_3 Feature Map and ROIs 62 | Input shape is Feature map (batch, width, height, 512), ROIs (Num of ROIs, 5) 63 | ROIs, ex:) [0, left, height, right, bottom]. First Element is the index of batch 64 | 65 | 8. Through ROI Pooling Layer, Output Shape is [Num of ROIs, 7, 7, 512] 66 | 67 | 9. Reshape it to [Num of ROIs, -1], and then connect to Fully Connected Layer. 68 | 69 | 10.Output Layer has two section, one is class prediction, the other is its bounding box prediction. 70 | class prediction shape is [Num of ROIs, Num of Class + 1] 71 | bounding box prediction shape is [Num of ROIs, 4 * (Num of Class + 1)] 72 | 73 | 11.Loss Function 74 | Regularize bounding box value [center_x, center_y, w, h] into 75 | [(GroundTruth x - pred_x) / pred_w, (GroundTruth y - pred_y) / pred_h, log(GroundTruth w / pred_w), log(GroundTruth h / pred_h)] 76 | Class prediction is by softmax with loss. 77 | Bounding Box prediction is by smooth_L1 loss 78 | ############################################################################### 79 | In this stage, Describe Datasets. 80 | 1. PASCAL VOC2007 81 | 2. KITTI Datasets 82 | 3. Udacity Datasets 83 | """ 84 | 85 | def create_optimizer(all_loss, lr=0.001, var_list=None): 86 | opt = tf.train.AdamOptimizer(lr) 87 | if var_list is None: 88 | return opt.minimize(all_loss) 89 | optimizer = opt.minimize(all_loss, var_list=var_list) 90 | return optimizer 91 | 92 | class RPN_ExtendedLayer(object): 93 | def __init__(self): 94 | pass 95 | 96 | def build_model(self, input_layer, use_batchnorm=False, is_training=True, activation=tf.nn.relu, anchors=1): 97 | self.rpn_conv = convBNLayer(input_layer, use_batchnorm, is_training, 512, 512, 3, 1, name="conv_rpn", activation=activation) 98 | # shape is [Batch, 2(bg/fg) * 9(anchors=3scale*3aspect ratio)] 99 | self.rpn_cls = convBNLayer(self.rpn_conv, use_batchnorm, is_training, 512, anchors*2, 1, 1, name="rpn_cls", activation=activation) 100 | rpn_shape = self.rpn_cls.get_shape().as_list() 101 | rpn_shape = tf.shape(self.rpn_cls) 102 | self.rpn_cls = tf.reshape(self.rpn_cls, [rpn_shape[0], rpn_shape[1], rpn_shape[2], anchors, 2]) 103 | self.rpn_cls = tf.nn.softmax(self.rpn_cls, dim=-1)[:, :, :, :, 0] 104 | self.rpn_cls = tf.reshape(self.rpn_cls, [rpn_shape[0], rpn_shape[1]*rpn_shape[2]*anchors]) # for loss 105 | # shape is [Batch, 4(x, y, w, h) * 9(anchors=3scale*3aspect ratio)] 106 | self.rpn_bbox = convBNLayer(self.rpn_conv, use_batchnorm, is_training, 512, anchors*4, 1, 1, name="rpn_bbox", activation=activation) 107 | self.rpn_bbox = tf.reshape(self.rpn_bbox, [rpn_shape[0], rpn_shape[1]*rpn_shape[2]*anchors, 4]) 108 | 109 | class VGG(object): 110 | def __init__(self): 111 | pass 112 | 113 | def build_model(self, input_layer, activation=tf.nn.relu, anchors=1): 114 | self.conv1_1 = convLayer(images, 3, 64, 3, 1, activation=activation, name="conv1_1") 115 | self.conv1_2 = convLayer(self.conv1_1, 64, 64, 3, 1, activation=activation, name="conv1_2") 116 | self.pool1 = maxpool2d(self.conv1_2, kernel=2, stride=2, name="pool1") 117 | 118 | self.conv2_1 = convLayer(self.pool1, 64, 128, 3, 1, activation=activation, name="conv2_1") 119 | self.conv2_2 = convLayer(self.conv2_1, 
128, 128, 3, 1, activation=activation, name="conv2_2") 120 | self.pool2 = maxpool2d(self.conv2_2, kernel=2, stride=2, name="pool2") 121 | 122 | self.conv3_1 = convLayer(self.pool2, 128, 256, 3, 1, activation=activation, name="conv3_1") 123 | self.conv3_2 = convLayer(self.conv3_1, 256, 256, 3, 1, activation=activation, name="conv3_2") 124 | self.conv3_3 = convLayer(self.conv3_2, 256, 256, 3, 1, activation=activation, name="conv3_3") 125 | self.pool3 = maxpool2d(self.conv3_3, kernel=2, stride=2, name="pool3") 126 | 127 | self.conv4_1 = convLayer(self.pool2, 256, 512, 3, 1, activation=activation, name="conv4_1") 128 | self.conv4_2 = convLayer(self.conv4_1, 512, 512, 3, 1, activation=activation, name="conv4_2") 129 | self.conv4_3 = convLayer(self.conv4_2, 512, 512, 3, 1, activation=activation, name="conv4_3") 130 | self.pool4 = maxpool2d(self.conv4_3, kernel=2, stride=2, name="pool4") 131 | 132 | self.conv5_1 = convLayer(self.pool2, 512, 512, 3, 1, activation=activation, name="conv5_1") 133 | self.conv5_2 = convLayer(self.conv5_1, 512, 512, 3, 1, activation=activation, name="conv5_2") 134 | self.conv5_3 = convLayer(self.conv5_2, 512, 512, 3, 1, activation=activation, name="conv5_3") 135 | 136 | def propose_for_rois(rpn_cls, rpn_bbox, gt_labels, feat_stride, scales, ratios, feature_shape, image_size, num_of_rois=128): 137 | """ 138 | **rpn_modelから、実際の大きさまでスケールさせる** 139 | 1. 小さなbounding boxを排除(feature_stride * roi size?) 140 | 2. scoreから6000個を抽出 141 | 3. NMSをかけて、300個以下まで候補を絞る 142 | ここまでが物体候補領域の抽出 143 | ーーーーーーーーーーーーーーーーーーーーーーーーーーーーーーーーーーーーーーーーーーー 144 | 4. gt_boxesと候補領域でoverlapsを計算する 145 | overlapsが0.5以上ならGroundTruth, [0.1, 0.5)ならFalseであるとする  * ここまでBatchでよい 146 | ここの計算でReshapeされたROI, 正解Class Label, 正解Regression Label, そのindex番号の計算が行われる 147 | 5. rpn_modelをclass label[?]とregression label[?, 4]にReshapeし、indexで値を取ってくる 148 | 149 | input 150 | 1. Pred class Label 151 | 2. Pred regression Label 152 | 3. GroundTruth class Label 153 | 4. GroundTruth regression Label 154 | 155 | output 156 | 1. 候補領域の計算されたROI(batch number, x, y, w, h), 数は[?] 157 | 2. 候補領域の正解Class Label(batch number, 2) car or not 158 | 3. 候補領域の正解Regression Label(batch number, 4) x, y, w, h 159 |   これも事前に正規化しておく必要があります 160 | 161 | ここではBack Propは計算されない 162 | indexのみ計算される indexのOutputのShapeは、[?] 
163 | ROIs[index]で、これが次の層に伝搬される 164 | """ 165 | image_size = images.shape[1:3] 166 | width = feature_shape[0] 167 | height = feature_shape[1] 168 | batch_size = gt_labels.shape[0] 169 | A = scales.shape[0] * len(ratios) 170 | K = width * height 171 | 172 | center_x = np.arange(0, height) * feat_stride 173 | center_y = np.arange(0, width) * feat_stride 174 | center_x, center_y = np.meshgrid(center_x, center_y) 175 | centers = np.zeros((batch_size, width*height, 4)) 176 | centers[:] = np.vstack((center_x.ravel(), center_y.ravel(), 177 | center_x.ravel(), center_y.ravel())).transpose() 178 | anchors = np.zeros((batch_size, A, 4)) 179 | anchors = generate_anchors(scales=scales, ratios=ratios) # Shape is [A, 4] 180 | anchors = centers.reshape(batch_size, K, 1, 4) + anchors # [Batch, K, A, 4] 181 | # gt_labels: Shape is [Batch, G, 4] 182 | # rpn_bbox: Shape is [Batch, K*A, 4] 183 | # rpn_cls: Shape is [Batch, K*A] 184 | # rois: Shape is [Num of ROIs, 5] 5 is [batch index, left, top, right, bottom] 185 | # gt_cls: Shape is [Num of ROIs, 2] 0 is GroundTruth, 1 is otherwise 186 | # gt_boxes: Shape is [Num of ROIs, 4] Value is Normalized by proposal target lay 187 | 188 | # Convert anchors into proposals via bbox transformations 189 | # clip predicted boxes to image 190 | # proposals: Shape is [Batch, K*A, 4] 191 | # scores: Shape is [Batch, K*A] 192 | # anchors: Shape is [Batch, K*A, 4] 193 | anchors = bbox_transform_inv_clip(anchors, rpn_bbox, image_size[1], image_height[0]) 194 | for bs in range(batch_size): 195 | keep = _filter_boxes(anchors[bs], min_size) 196 | proposals = anchors[bs, keep] 197 | scores = rpn_cls[bs, keep] 198 | order = scores.ravel().argsort()[-6000:] 199 | proposals = proposals[order] 200 | scores = scores[order] 201 | keep = nms(np.hstack((proposals, scores)), 0.7) 202 | if post_nms_topN > 0: 203 | keep = keep[:300] 204 | proposals = proposals[keep, :] 205 | scores = scores[keep] 206 | 207 | # Sample ROIs 208 | #ここから128枚(64枚: fg 16, bg 48) 209 | computed_gt_boxes, true_index, false_index = bbox_overlaps( 210 | proposals, 211 | scores, 212 | gt_labels) 213 | # for i in range(batch_size): 214 | true_where = np.where(true_index == 1) 215 | num_true = len(true_where[0]) 216 | 217 | if num_true > 16: 218 | select = np.random.choice(num_true, num_true - 16, replace=False) 219 | num_true = 16 220 | batch = np.ones((select.shape[0]), dtype=np.int) * bs 221 | true_where = remove_extraboxes(true_where[0], select, batch) 222 | true_index[true_where] = 0 223 | 224 | false_where = np.where(false_index[i] == 1) 225 | num_false = len(false_where[0]) 226 | select = np.random.choice(num_false, num_false - (64-num_true), replace=False) 227 | batch = np.ones((select.shape[0]), dtype=np.int) * bs 228 | false_where = remove_extraboxes(false_where[0], select, batch) 229 | false_index[false_where] = 0 230 | batch_inds.append(keep.shape[0]) 231 | 232 | 233 | true_index = None 234 | false_index = None 235 | final_index = None 236 | # TODO Concatenate true_index and false_index 237 | proposals = proposals[final_index] 238 | gt_cls = true_index 239 | gt_cls[bs, true_index, 0] = 1 240 | gt_cls[bs, false_index, 1] = 1 241 | gt_boxes[bs, true_index] = computed_gt_boxes[true_index] 242 | rois[bs] = (proposals[true_index] / 4).astype(np.int32) 243 | return rois, gt_cls, gt_boxes 244 | 245 | 246 | def _filter_boxes(boxes, min_size): 247 | """Remove all boxes with any side smaller than min_size.""" 248 | ws = boxes[:, 2] - boxes[:, 0] + 1 249 | hs = boxes[:, 3] - boxes[:, 1] + 1 250 | keep = np.where((ws >= 
min_size) & (hs >= min_size))[0] 251 | return keep 252 | 253 | def proposal_target_layer(self, feature_map, rpn_model, gt_labels, feat_stride, scales, ratios, feature_shape, images, num_of_rois=num_of_rois, feat_stride=16, name=""): 254 | """ 255 | gt_labels: Shape is [Batch, Num of GroundTruth Num, 4] 256 | rois: Shape is [Num of ROIs, 5] 5 is [batch index, left, top, right, bottom] 257 | gt_cls: Shape is [Num of ROIs, 2] 0 is GroundTruth, 1 is otherwise 258 | gt_boxes: Shape is [Num of ROIs, 4] Value is Normalized by proposal target layer 259 | 260 | Gradient will not deliver to RPN Layer 261 | """ 262 | 263 | with tf.variable_scope(name): 264 | rois, gt_cls, gt_boxes = tf.py_func(propose_for_rois, \ 265 | [rpn_model.rpn_cls, rpn_model.rpn_bbox, gt_labels, feat_stride, scales, ratios, feature_shape, images],[tf.int8,tf.float32,tf.float32]) 266 | 267 | rois = tf.convert_to_tensor(rois, name="rois") 268 | gt_cls = tf.convert_to_tensor(gt_cls, name="gt_cls") 269 | gt_boxes = tf.convert_to_tensor(gt_boxes, name="gt_boxes") 270 | return rois, gt_cls, gt_boxes 271 | 272 | class FAST_RCNN(object): 273 | def __init__(self, roi_size): 274 | self.roi_size = roi_size 275 | 276 | def build_model(self, feature_map, rois, rpn_model, activation=tf.nn.relu): 277 | # input_layer shape is [Batch, K, A, ] 278 | self.roi_layer = roi_pooling(feature_map, rois, self.roi_size[0], self.roi_size[1]) 279 | # input_shape [num_of_rois, channel, roi size, roi size] 280 | self.pool_5 = tf.reshape(roi_layer, [-1, self.roi_size[0]*self.roi_size[1]*512]) 281 | self.fc6 = vgg_fully(self.pool_5, [self.roi_size[0]*self.roi_size[1]*512, 4096], name="fc6", is_training=is_training) 282 | self.fc7 = vgg_fully(self.fc6, [4096, 4096], name="fc7") 283 | self.fc8 = vgg_fully(self.fc7, [4096, 6], name="fc8") 284 | # output shape [num_of_rois, 2] 285 | self.obj_class = tf.nn.softmax(self.fc8[:, :2], dim=-1) 286 | # output shape [num_of_rois, 8] 287 | self.bbox_regression = self.fc8[:, 2:] 288 | 289 | def rpn(sess, vggpath=None, image_shape=(300, 300), \ 290 | is_training=None, use_batchnorm=False, activation=tf.nn.relu, anchors=9): 291 | images = tf.placeholder(tf.float32, [None, None, None, 3]) 292 | phase_train = tf.placeholder(tf.bool, name="phase_traing") if is_training else None 293 | 294 | vgg = VGG() 295 | vgg.build_model(images) 296 | with tf.variable_scope("rpn_model"): 297 | rpn = RPN_ExtendedLayer() 298 | rpn.build_model(vgg.conv5_3, use_batchnorm=use_batchnorm, is_training=is_training, activation=activation, anchors=anchors) 299 | 300 | if is_training: 301 | rcnn_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="rpn_model") 302 | sess.run(tf.variables_initializer(rcnn_vars)) 303 | return vgg.conv5_3, rpn, images, phase_train 304 | 305 | def fast_rcnn(sess, feature_map, rpn_model, gt_labels, roi_size=(7, 7), \ 306 | is_training=None, use_batchnorm=False, activation=tf.nn.relu, num_of_rois=128): 307 | """Model Definition of Fast RCNN 308 | In thesis, Roi Size is (7, 7), channel is 512 309 | """ 310 | with tf.variable_scope("fast_rcnn"): 311 | # gt_labels: Shape is [Batch, Num of GroundTruth Num, 4] 312 | # rois: Shape is [Num of ROIs, 5] 5 is [batch index, left, top, right, bottom] 313 | # gt_cls: Shape is [Num of ROIs, 2] 0 is GroundTruth, 1 is otherwise 314 | # gt_boxes: Shape is [Num of ROIs, 4] Value is Normalized by proposal target layer 315 | rois, gt_cls, gt_boxes = proposal_target_layer(feature_map, rpn_model, gt_labels, num_of_rois=num_of_rois) 316 | rcnn = FAST_RCNN(roi_size) 317 | 
rcnn.build_model(feature_map, rois) 318 | 319 | if is_training: 320 | rcnn_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="fast_rcnn") 321 | sess.run(tf.variables_initializer(rcnn_vars)) 322 | 323 | return rcnn, rcnn_vars 324 | 325 | def train_rcnn(batch_size, image_dir, label_dir, epoch=101, lr=0.01, feature_shape=(64, 19), \ 326 | is_training=True, use_batchnorm=False, activation=tf.nn.relu, \ 327 | scales=np.array([5, 8, 12, 16, 32]), ratios=[0.3, 0.5, 0.8, 1], feature_stride=16): 328 | import time 329 | training_epochs = epoch 330 | 331 | with tf.Session() as sess: 332 | vgg_featuremap, rpn_model, images, phase_train = rpn(sess, vggpath=vggpath, is_training=False, roi_size=(7, 7), \ 333 | use_batchnorm=use_batchnorm, activation=activation, anchors=scales.shape[0]*len(ratios)) 334 | saver = tf.train.Saver() 335 | new_saver = tf.train.import_meta_graph("../rpn/rpn_model40.ckpt.meta") 336 | last_model = "../rpn/rpn_model40.ckpt" 337 | saver.restore(sess, last_model) 338 | 339 | rcnn_model, rcnn_vars = fast_rcnn(sess, vgg_featuremap, rpn_model, roi_size=roi_size, activation=activation) 340 | 341 | total_loss, cls_loss, bbox_loss, true_obj_loss, false_obj_loss, g_bboxes, true_index, false_index = rpn_loss(rcnn_model.rcnn_cls, rcnn_model.rcnn_bbox) 342 | # Only Training RCNN Layer 343 | optimizer = create_optimizer(total_loss, lr=lr, var_list=rcnn_vars) 344 | 345 | init = tf.global_variables_initializer() 346 | sess.run(init) 347 | 348 | image_pathlist, label_pathlist = get_pathlist(image_dir, label_dir) 349 | for epoch in range(training_epochs): 350 | for batch_images, batch_labels in generator__Image_and_label(image_pathlist, label_pathlist, batch_size=batch_size): 351 | start = time.time() 352 | candicate_anchors, batch_true_index, batch_false_index = create_Labels_For_Loss(batch_labels, feat_stride=feature_stride, \ 353 | feature_shape=(batch_images.shape[1]//feature_stride +1, batch_images.shape[2]//feature_stride), \ 354 | scales=scales, ratios=ratios, image_size=batch_images.shape[1:3]) 355 | print "batch time", time.time() - start 356 | print batch_true_index[batch_true_index==1].shape 357 | print batch_false_index[batch_false_index==1].shape 358 | 359 | sess.run(optimizer, feed_dict={images:batch_images, g_bboxes: candicate_anchors, true_index:batch_true_index, false_index:batch_false_index}) 360 | tl, cl, bl, tol, fol = sess.run([total_loss, cls_loss, bbox_loss, true_obj_loss, false_obj_loss], feed_dict={images:batch_images, g_bboxes: candicate_anchors, true_index:batch_true_index, false_index:batch_false_index}) 361 | print("Epoch:", '%04d' % (epoch+1), "total loss=", "{:.9f}".format(tl)) 362 | print("Epoch:", '%04d' % (epoch+1), "closs loss=", "{:.9f}".format(cl)) 363 | print("Epoch:", '%04d' % (epoch+1), "bbox loss=", "{:.9f}".format(bl)) 364 | print("Epoch:", '%04d' % (epoch+1), "true loss=", "{:.9f}".format(tol)) 365 | print("Epoch:", '%04d' % (epoch+1), "false loss=", "{:.9f}".format(fol)) 366 | print("Optimization Finished") 367 | 368 | def smooth_L1(x): 369 | l2 = 0.5 * (x**2.0) 370 | l1 = tf.abs(x) - 0.5 371 | 372 | condition = tf.less(tf.abs(x), 1.0) 373 | loss = tf.where(condition, l2, l1) 374 | return loss 375 | 376 | def rpn_loss(rpn_cls, rpn_bbox): 377 | """Calculate Class Loss and Bounding Regression Loss. 378 | 379 | # Args: 380 | obj_class: Prediction of object class. Shape is [ROIs*Batch_Size, 2] 381 | bbox_regression: Prediction of bounding box. 
Shape is [ROIs*Batch_Size, 4] 382 | """ 383 | rpn_shape = rpn_cls.get_shape().as_list() 384 | g_bbox = tf.placeholder(tf.float32, [rpn_shape[0], rpn_shape[1], rpn_shape[2], 4]) 385 | true_index = tf.placeholder(tf.float32, [rpn_shape[0], rpn_shape[1], rpn_shape[2]]) 386 | false_index = tf.placeholder(tf.float32, [rpn_shape[0], rpn_shape[1], rpn_shape[2]]) 387 | elosion = 0.00001 388 | true_obj_loss = -tf.reduce_sum(tf.multiply(tf.log(rpn_cls[:, :, :, 0]+elosion), true_index)) 389 | false_obj_loss = -tf.reduce_sum(tf.multiply(tf.log(rpn_cls[:, :, :, 1]+elosion), false_index)) 390 | obj_loss = tf.add(true_obj_loss, false_obj_loss) 391 | cls_loss = tf.div(obj_loss, 16) # L(cls) / N(cls) N=batch size 392 | 393 | bbox_loss = smooth_L1(tf.subtract(rpn_bbox, g_bbox)) 394 | bbox_loss = tf.reduce_sum(tf.multiply(tf.reduce_sum(bbox_loss, 3), true_index)) 395 | bbox_loss = tf.multiply(tf.div(bbox_loss, 1197), 100) # rpn_shape[1]*rpn_shape[2] 396 | # bbox_loss = bbox_loss / rpn_shape[1] 397 | 398 | total_loss = tf.add(cls_loss, bbox_loss) 399 | return total_loss, cls_loss, bbox_loss, true_obj_loss, false_obj_loss, g_bbox, true_index, false_index 400 | 401 | def create_Labels_For_Loss(gt_boxes, feat_stride=16, feature_shape=(64, 19), \ 402 | scales=np.array([8, 16, 32]), ratios=[0.5, 0.8, 1], \ 403 | image_size=(300, 1000)): 404 | """This Function is processed before network input 405 | Number of Candicate Anchors is Feature Map width * heights 406 | Number of Predicted Anchors is Batch Num * Feature Map Width * Heights * 9 407 | """ 408 | width = feature_shape[0] 409 | height = feature_shape[1] 410 | batch_size = gt_boxes.shape[0] 411 | # shifts is the all candicate anchors(prediction of bounding boxes) 412 | center_x = np.arange(0, height) * feat_stride 413 | center_y = np.arange(0, width) * feat_stride 414 | center_x, center_y = np.meshgrid(center_x, center_y) 415 | # Shape is [Batch, Width*Height, 4] 416 | centers = np.zeros((batch_size, width*height, 4)) 417 | centers[:] = np.vstack((center_x.ravel(), center_y.ravel(), 418 | center_x.ravel(), center_y.ravel())).transpose() 419 | A = scales.shape[0] * len(ratios) 420 | K = width * height # width * height 421 | anchors = np.zeros((batch_size, A, 4)) 422 | anchors = generate_anchors(scales=scales, ratios=ratios) # Shape is [A, 4] 423 | 424 | candicate_anchors = centers.reshape(batch_size, K, 1, 4) + anchors # [Batch, K, A, 4] 425 | 426 | # shape is [B, K, A] 427 | is_inside = batch_inside_image(candicate_anchors, image_size[1], image_size[0]) 428 | 429 | # candicate_anchors: Shape is [Batch, K, A, 4] 430 | # gt_boxes: Shape is [Batch, G, 4] 431 | # true_index: Shape is [Batch, K, A] 432 | # false_index: Shape is [Batch, K, A] 433 | candicate_anchors, true_index, false_index = bbox_overlaps( 434 | np.ascontiguousarray(candicate_anchors, dtype=np.float), 435 | is_inside, 436 | gt_boxes) 437 | 438 | for i in range(batch_size): 439 | true_where = np.where(true_index[i] == 1) 440 | num_true = len(true_where[0]) 441 | 442 | if num_true > 64: 443 | select = np.random.choice(num_true, num_true - 64, replace=False) 444 | num_true = 64 445 | batch = np.ones((select.shape[0]), dtype=np.int) * i 446 | true_where = remove_extraboxes(true_where[0], true_where[1], select, batch) 447 | true_index[true_where] = 0 448 | 449 | false_where = np.where(false_index[i] == 1) 450 | num_false = len(false_where[0]) 451 | select = np.random.choice(num_false, num_false - (128-num_true), replace=False) 452 | batch = np.ones((select.shape[0]), dtype=np.int) * i 453 | false_where = 
remove_extraboxes(false_where[0], false_where[1], select, batch) 454 | false_index[false_where] = 0 455 | 456 | return candicate_anchors, true_index, false_index 457 | 458 | if __name__ == '__main__': 459 | import matplotlib.pyplot as plt 460 | -------------------------------------------------------------------------------- /rcnn/rcnn_vehicle.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import glob 5 | import cv2 6 | import dlib 7 | import numpy as np 8 | # from vgg16 import vgg16 9 | from input_kitti import * 10 | from util import * 11 | from parse_xml import parseXML 12 | from base_vgg16 import Vgg16 13 | import tensorflow as tf 14 | # from utility.image.data_augmentation.flip import Flip 15 | sys.path.append("/Users/tsujiyuuki/env_python/code/my_code/Data_Augmentation") 16 | 17 | """ 18 | ・collect dataset of cars 19 | ・Preprocessing BBOX and Label for training 20 | ・try roi_pooling layer 21 | ・Extract ROI using mitmul tools 22 | ・NMS 23 | """ 24 | 25 | """Flow of Fast RCNN 26 | ############################################################################### 27 | In this state, Create Input Images and ROI Labels 28 | 29 | 1. input batch images and GroundTruth BBox from datasets *folder name, batch size 30 | Image shape is [batch size, width, height, channel], tf.float32, vgg normalized, bgr 31 | Bounding Box shape is [batch size, center_x, center_y, width, height] 32 | 33 | 2. get candicate bounding box from images. 34 | 35 | # Implemented 36 | 3. resize input images to input size *size of resize if needed. 37 | if this operation was done, you should adjust bounding box according to it. 38 | Both of Candicate and GroundTruth Bounding Boxes. 39 | In thesis, Image size is in [600, 1000] 40 | In this Implemention, input image has dynamic shape between [600, 1000] 41 | 42 | 4. convert candicate bounding box to ROI label. 43 | 44 | 5. calculate IOU between ROI label and GroundTruth label. 45 | IOU is Intersection Over Union. 46 | 47 | 6. Select Bounding Box from IOU. 48 | IOU > 0.5 is correct label, IOU = [0.1 0.5) is a false label(background). 49 | Correct Label is 25%, BackGround Label is 75%. 50 | Number of Label is 128, Batch Size is 2, so each image has 64 ROIs 51 | 52 | ############################################################################### 53 | In this stage, Calculate Loss 54 | 55 | 7. Input data to ROI Pooling Layer is Conv5_3 Feature Map and ROIs 56 | Input shape is Feature map (batch, width, height, 512), ROIs (Num of ROIs, 5) 57 | ROIs, ex:) [0, left, height, right, bottom]. First Element is the index of batch 58 | 59 | 8. Through ROI Pooling Layer, Output Shape is [Num of ROIs, 7, 7, 512] 60 | 61 | 9. Reshape it to [Num of ROIs, -1], and then connect to Fully Connected Layer. 62 | 63 | 10.Output Layer has two section, one is class prediction, the other is its bounding box prediction. 64 | class prediction shape is [Num of ROIs, Num of Class + 1] 65 | bounding box prediction shape is [Num of ROIs, 4 * (Num of Class + 1)] 66 | 67 | 11.Loss Function 68 | Regularize bounding box value [center_x, center_y, w, h] into 69 | [(GroundTruth x - pred_x) / pred_w, (GroundTruth y - pred_y) / pred_h, log(GroundTruth w / pred_w), log(GroundTruth h / pred_h)] 70 | Class prediction is by softmax with loss. 71 | Bounding Box prediction is by smooth_L1 loss 72 | ############################################################################### 73 | In this stage, Describe Datasets. 74 | 1. 
PASCAL VOC2007 75 | 2. KITTI Datasets 76 | 3. Udacity Datasets 77 | """ 78 | 79 | def create_optimizer(all_loss, lr=0.001, var_list=None): 80 | opt = tf.train.AdamOptimizer(lr) 81 | if var_list is None: 82 | return opt.minimize(all_loss) 83 | optimizer = opt.minimize(all_loss, var_list=var_list) 84 | return optimizer 85 | 86 | class RPN_ExtendedLayer(object): 87 | def __init__(self): 88 | pass 89 | 90 | def build_model(self, input_layer, use_batchnorm=False, is_training=True, activation=tf.nn.relu, anchors=1): 91 | self.rpn_conv = convBNLayer(input_layer, use_batchnorm, is_training, 512, 512, 3, 1, name="conv_rpn", activation=activation) 92 | # shape is [Batch, 2(bg/fg) * 9(anchors=3scale*3aspect ratio)] 93 | self.rpn_cls = convBNLayer(self.rpn_conv, use_batchnorm, is_training, 512, anchors*2, 1, 1, name="rpn_cls", activation=activation) 94 | rpn_shape = self.rpn_cls.get_shape().as_list() 95 | rpn_shape = tf.shape(self.rpn_cls) 96 | self.rpn_cls = tf.reshape(self.rpn_cls, [rpn_shape[0], rpn_shape[1], rpn_shape[2], anchors, 2]) 97 | self.rpn_cls = tf.nn.softmax(self.rpn_cls, dim=-1)[:, :, :, :, 0] 98 | self.rpn_cls = tf.reshape(self.rpn_cls, [rpn_shape[0], rpn_shape[1]*rpn_shape[2]*anchors]) 99 | # shape is [Batch, 4(x, y, w, h) * 9(anchors=3scale*3aspect ratio)] 100 | self.rpn_bbox = convBNLayer(self.rpn_conv, use_batchnorm, is_training, 512, anchors*4, 1, 1, name="rpn_bbox", activation=activation) 101 | self.rpn_bbox = tf.reshape(self.rpn_bbox, [rpn_shape[0], rpn_shape[1]*rpn_shape[2]*anchors, 4]) 102 | 103 | class RPN(object): 104 | def __init__(self): 105 | pass 106 | 107 | def build_model(self, input_layer, activation=tf.nn.relu, anchors=1): 108 | self.conv1_1 = convLayer(images, 3, 64, 3, 1, activation=activation, name="conv1_1") 109 | self.conv1_2 = convLayer(self.conv1_1, 64, 64, 3, 1, activation=activation, name="conv1_2") 110 | self.pool1 = maxpool2d(self.conv1_2, kernel=2, stride=2, name="pool1") 111 | 112 | self.conv2_1 = convLayer(self.pool1, 64, 128, 3, 1, activation=activation, name="conv2_1") 113 | self.conv2_2 = convLayer(self.conv2_1, 128, 128, 3, 1, activation=activation, name="conv2_2") 114 | self.pool2 = maxpool2d(self.conv2_2, kernel=2, stride=2, name="pool2") 115 | 116 | self.conv3_1 = convLayer(self.pool2, 128, 256, 3, 1, activation=activation, name="conv3_1") 117 | self.conv3_2 = convLayer(self.conv3_1, 256, 256, 3, 1, activation=activation, name="conv3_2") 118 | self.conv3_3 = convLayer(self.conv3_2, 256, 256, 3, 1, activation=activation, name="conv3_3") 119 | self.pool3 = maxpool2d(self.conv3_3, kernel=2, stride=2, name="pool3") 120 | 121 | self.conv4_1 = convLayer(self.pool2, 256, 512, 3, 1, activation=activation, name="conv4_1") 122 | self.conv4_2 = convLayer(self.conv4_1, 512, 512, 3, 1, activation=activation, name="conv4_2") 123 | self.conv4_3 = convLayer(self.conv4_2, 512, 512, 3, 1, activation=activation, name="conv4_3") 124 | 125 | class FAST_RCNN(object): 126 | def __init__(self): 127 | pass 128 | 129 | def build_model(self, input_layer, use_batchnorm=False, is_training=True, activation=tf.nn.relu, anchors=1): 130 | self.conv1_1 = convBNLayer(images, False, is_training, activatioin=tf.nn.relu, name="conv1_1") 131 | self.conv1_2 = convBNLayer() 132 | 133 | def rpn(sess, vggpath=None, image_shape=(300, 300), \ 134 | is_training=None, use_batchnorm=False, activation=tf.nn.relu, anchors=9): 135 | images = tf.placeholder(tf.float32, [None, None, None, 3]) 136 | phase_train = tf.placeholder(tf.bool, name="phase_traing") if is_training else None 137 | 138 | vgg = 
Vgg(vgg16_npy_path=vggpath) 139 | vgg.build_model(images) 140 | rpn = RPN_ExtendedLayer() 141 | rpn.build_model(vgg.conv5_3, use_batchnorm=use_batchnorm, is_training=is_training, activation=activation, anchors=anchors) 142 | return vgg.conv5_3, rpn, images, phase_train 143 | 144 | def fast_rcnn(sess, model, rois, roi_size=(7, 7), image_shape=(300, 300), \ 145 | is_training=None, use_batchnorm=False, activation=tf.nn.relu, num_of_rois=128): 146 | """Model Definition of Fast RCNN 147 | In thesis, Roi Size is (7, 7), channel is 512 148 | """ 149 | with tf.variable_scope("fast_rcnn"): 150 | # roi shape [Num of ROIs, X, Y, W, H] 151 | roi_layer = roi_pooling(model, rois, roi_size[0], roi_size[1]) 152 | # input_shape [num_of_rois, channel, roi size, roi size] 153 | pool_5 = tf.reshape(roi_layer, [num_of_rois, roi_size[0]*roi_size[1]*512]) 154 | fc6 = fully_connected(pool_5, [roi_size[0]*roi_size[1]*512, 4096], name="fc6", is_training=is_training) 155 | fc7 = fully_connected(fc6, [4096, 4096], name="fc7", is_training=is_training) 156 | # output shape [num_of_rois, 2] 157 | obj_class = tf.nn.softmax(fully_connected(fc7, [4096, 2], name="fc_class", activation=None, use_batchnorm=None), dim=-1) 158 | # output shape [num_of_rois, 8] 159 | bbox_regression = fully_connected(fc7, [4096, 8], name="fc_bbox", activation=None, use_batchnorm=None) 160 | 161 | def train_rpn(batch_size, image_dir, label_dir, epoch=101, lr=0.01, feature_shape=(64, 19), \ 162 | vggpath="../pretrain/vgg16.npy", use_batchnorm=False, activation=tf.nn.relu, \ 163 | scales=np.array([5, 8, 12, 16, 32]), ratios=[0.3, 0.5, 0.8, 1], feature_stride=16): 164 | import time 165 | training_epochs = epoch 166 | 167 | with tf.Session() as sess: 168 | vgg_featuremap, rpn_model, images, phase_train = rpn(sess, vggpath=vggpath, is_training=False, \ 169 | use_batchnorm=use_batchnorm, activation=activation, anchors=scales.shape[0]*len(ratios)) 170 | saver = tf.train.Saver() 171 | new_saver = tf.train.import_meta_graph("../rpn/rpn_model40.ckpt.meta") 172 | last_model = "../rpn/rpn_model40.ckpt" 173 | saver.restore(sess, last_model) 174 | 175 | with tf.variable_scope("fast-rcnn"): 176 | rcnn_model = fast_rcnn(vgg_featuremap, rpn_model, activation=activation) 177 | 178 | if is_training: 179 | rcnn_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="fast-rcnn") 180 | sess.run(tf.variables_initializer(rcnn_vars)) 181 | 182 | total_loss, cls_loss, bbox_loss, true_obj_loss, false_obj_loss, g_bboxes, true_index, false_index = rpn_loss(rcnn_model.rcnn_cls, rcnn_model.rcnn_bbox) 183 | # Only Training RCNN Layer 184 | optimizer = create_optimizer(total_loss, lr=lr, var_list=rcnn_vars) 185 | 186 | init = tf.global_variables_initializer() 187 | sess.run(init) 188 | 189 | image_pathlist, label_pathlist = get_pathlist(image_dir, label_dir) 190 | for epoch in range(training_epochs): 191 | for batch_images, batch_labels in generator__Image_and_label(image_pathlist, label_pathlist, batch_size=batch_size): 192 | start = time.time() 193 | candicate_anchors, batch_true_index, batch_false_index = create_Labels_For_Loss(batch_labels, feat_stride=feature_stride, \ 194 | feature_shape=(batch_images.shape[1]//feature_stride +1, batch_images.shape[2]//feature_stride), \ 195 | scales=scales, ratios=ratios, image_size=batch_images.shape[1:3]) 196 | print "batch time", time.time() - start 197 | print batch_true_index[batch_true_index==1].shape 198 | print batch_false_index[batch_false_index==1].shape 199 | 200 | sess.run(optimizer, feed_dict={images:batch_images, 
g_bboxes: candicate_anchors, true_index:batch_true_index, false_index:batch_false_index}) 201 | tl, cl, bl, tol, fol = sess.run([total_loss, cls_loss, bbox_loss, true_obj_loss, false_obj_loss], feed_dict={images:batch_images, g_bboxes: candicate_anchors, true_index:batch_true_index, false_index:batch_false_index}) 202 | print("Epoch:", '%04d' % (epoch+1), "total loss=", "{:.9f}".format(tl)) 203 | print("Epoch:", '%04d' % (epoch+1), "closs loss=", "{:.9f}".format(cl)) 204 | print("Epoch:", '%04d' % (epoch+1), "bbox loss=", "{:.9f}".format(bl)) 205 | print("Epoch:", '%04d' % (epoch+1), "true loss=", "{:.9f}".format(tol)) 206 | print("Epoch:", '%04d' % (epoch+1), "false loss=", "{:.9f}".format(fol)) 207 | print("Optimization Finished") 208 | 209 | def smooth_L1(x): 210 | l2 = 0.5 * (x**2.0) 211 | l1 = tf.abs(x) - 0.5 212 | 213 | condition = tf.less(tf.abs(x), 1.0) 214 | loss = tf.where(condition, l2, l1) 215 | return loss 216 | 217 | def rpn_loss(rpn_cls, rpn_bbox): 218 | """Calculate Class Loss and Bounding Regression Loss. 219 | 220 | # Args: 221 | obj_class: Prediction of object class. Shape is [ROIs*Batch_Size, 2] 222 | bbox_regression: Prediction of bounding box. Shape is [ROIs*Batch_Size, 4] 223 | """ 224 | rpn_shape = rpn_cls.get_shape().as_list() 225 | g_bbox = tf.placeholder(tf.float32, [rpn_shape[0], rpn_shape[1], rpn_shape[2], 4]) 226 | true_index = tf.placeholder(tf.float32, [rpn_shape[0], rpn_shape[1], rpn_shape[2]]) 227 | false_index = tf.placeholder(tf.float32, [rpn_shape[0], rpn_shape[1], rpn_shape[2]]) 228 | elosion = 0.00001 229 | true_obj_loss = -tf.reduce_sum(tf.multiply(tf.log(rpn_cls[:, :, :, 0]+elosion), true_index)) 230 | false_obj_loss = -tf.reduce_sum(tf.multiply(tf.log(rpn_cls[:, :, :, 1]+elosion), false_index)) 231 | obj_loss = tf.add(true_obj_loss, false_obj_loss) 232 | cls_loss = tf.div(obj_loss, 16) # L(cls) / N(cls) N=batch size 233 | 234 | bbox_loss = smooth_L1(tf.subtract(rpn_bbox, g_bbox)) 235 | bbox_loss = tf.reduce_sum(tf.multiply(tf.reduce_sum(bbox_loss, 3), true_index)) 236 | bbox_loss = tf.multiply(tf.div(bbox_loss, 1197), 100) # rpn_shape[1]*rpn_shape[2] 237 | # bbox_loss = bbox_loss / rpn_shape[1] 238 | 239 | total_loss = tf.add(cls_loss, bbox_loss) 240 | return total_loss, cls_loss, bbox_loss, true_obj_loss, false_obj_loss, g_bbox, true_index, false_index 241 | 242 | def create_Labels_For_Loss(gt_boxes, feat_stride=16, feature_shape=(64, 19), \ 243 | scales=np.array([8, 16, 32]), ratios=[0.5, 0.8, 1], \ 244 | image_size=(300, 1000)): 245 | """This Function is processed before network input 246 | Number of Candicate Anchors is Feature Map width * heights 247 | Number of Predicted Anchors is Batch Num * Feature Map Width * Heights * 9 248 | """ 249 | width = feature_shape[0] 250 | height = feature_shape[1] 251 | batch_size = gt_boxes.shape[0] 252 | # shifts is the all candicate anchors(prediction of bounding boxes) 253 | center_x = np.arange(0, height) * feat_stride 254 | center_y = np.arange(0, width) * feat_stride 255 | center_x, center_y = np.meshgrid(center_x, center_y) 256 | # Shape is [Batch, Width*Height, 4] 257 | centers = np.zeros((batch_size, width*height, 4)) 258 | centers[:] = np.vstack((center_x.ravel(), center_y.ravel(), 259 | center_x.ravel(), center_y.ravel())).transpose() 260 | A = scales.shape[0] * len(ratios) 261 | K = width * height # width * height 262 | anchors = np.zeros((batch_size, A, 4)) 263 | anchors = generate_anchors(scales=scales, ratios=ratios) # Shape is [A, 4] 264 | 265 | candicate_anchors = centers.reshape(batch_size, 
K, 1, 4) + anchors # [Batch, K, A, 4] 266 | 267 | # shape is [B, K, A] 268 | is_inside = batch_inside_image(candicate_anchors, image_size[1], image_size[0]) 269 | 270 | # candicate_anchors: Shape is [Batch, K, A, 4] 271 | # gt_boxes: Shape is [Batch, G, 4] 272 | # true_index: Shape is [Batch, K, A] 273 | # false_index: Shape is [Batch, K, A] 274 | candicate_anchors, true_index, false_index = bbox_overlaps( 275 | np.ascontiguousarray(candicate_anchors, dtype=np.float), 276 | is_inside, 277 | gt_boxes) 278 | 279 | for i in range(batch_size): 280 | true_where = np.where(true_index[i] == 1) 281 | num_true = len(true_where[0]) 282 | 283 | if num_true > 64: 284 | select = np.random.choice(num_true, num_true - 64, replace=False) 285 | num_true = 64 286 | batch = np.ones((select.shape[0]), dtype=np.int) * i 287 | true_where = remove_extraboxes(true_where[0], true_where[1], select, batch) 288 | true_index[true_where] = 0 289 | 290 | false_where = np.where(false_index[i] == 1) 291 | num_false = len(false_where[0]) 292 | select = np.random.choice(num_false, num_false - (128-num_true), replace=False) 293 | batch = np.ones((select.shape[0]), dtype=np.int) * i 294 | false_where = remove_extraboxes(false_where[0], false_where[1], select, batch) 295 | false_index[false_where] = 0 296 | 297 | return candicate_anchors, true_index, false_index 298 | 299 | if __name__ == '__main__': 300 | import sys 301 | import matplotlib.pyplot as plt 302 | from PIL import Image as im 303 | sys.path.append('/home/katou01/code/grid/DataAugmentation') 304 | # from resize import resize 305 | 306 | image_dir = "/home/katou01/download/training/image_2/*.png" 307 | label_dir = "/home/katou01/download/training/label_2/*.txt" 308 | get_Image_Roi_All(image_dir, label_dir, 80) 309 | # 310 | # image = im.open("./test_images/test1.jpg") 311 | # image = np.array(image, dtype=np.float32) 312 | # new_image = image[np.newaxis, :] 313 | # batch_image = np.vstack((new_image, new_image)) 314 | # batch_image = resize(batch_image, size=(300, 300)) 315 | # 316 | # with tf.Session() as sess: 317 | # model = ssd_model(sess, batch_image, activation=None, atrous=False, rate=1, implement_atrous=False) 318 | # print(vars(model)) 319 | # # tf.summary.scalar('model', model) 320 | -------------------------------------------------------------------------------- /rpn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yukitsuji/Faster_RCNN_tensorflow/765c729eaf03cb401ad308a289ec7d8c2bfca474/rpn/__init__.py -------------------------------------------------------------------------------- /rpn/rpn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | sys.path.append("../") 5 | sys.path.append("../util") 6 | sys.path.append("../cython_util") 7 | sys.path.append("../pretrain") 8 | import glob 9 | import cv2 10 | import numpy as np 11 | # from vgg16 import vgg16 12 | from input_kitti import * 13 | from data_util import * 14 | from parse_xml import parseXML 15 | from base_vgg16 import Vgg16 as Vgg 16 | import tensorflow as tf 17 | from network_util import * 18 | from bbox_overlap import bbox_overlaps 19 | from remove_extraboxes import remove_extraboxes 20 | from bool_anchors_inside_image import batch_inside_image 21 | from generate_anchors import generate_anchors 22 | # from utility.image.data_augmentation.flip import Flip 23 | # sys.path.append("/Users/tsujiyuuki/env_python/code/my_code/Data_Augmentation") 
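# Illustrative sketch (hypothetical helper, for exposition only): the
# create_Labels_For_Loss() defined further down in this file broadcasts the
# base anchors from generate_anchors() over every feature-map cell to build
# the [Batch, K, A, 4] candidate boxes. The helper below isolates that
# shifting step for a single image, using only `np` and `generate_anchors`,
# both of which are imported above; its name and default arguments are
# assumptions made for this example, not part of the training code.
def _example_shifted_anchors(feature_shape=(4, 3), feat_stride=16,
                             scales=np.array([8, 16, 32]),
                             ratios=[0.5, 0.8, 1]):
    """Return candidate anchors of shape [K, A, 4] for a single image,
    where K = feature_shape[0] * feature_shape[1] and A = len(scales) * len(ratios).
    """
    width, height = feature_shape
    # One (x1, y1, x2, y2) shift per feature-map cell, exactly as in
    # create_Labels_For_Loss() below.
    shift_x = np.arange(0, height) * feat_stride
    shift_y = np.arange(0, width) * feat_stride
    shift_x, shift_y = np.meshgrid(shift_x, shift_y)
    shifts = np.vstack((shift_x.ravel(), shift_y.ravel(),
                        shift_x.ravel(), shift_y.ravel())).transpose()  # [K, 4]
    base_anchors = generate_anchors(scales=scales, ratios=ratios)       # [A, 4]
    # Broadcasting [K, 1, 4] + [A, 4] -> [K, A, 4]; create_Labels_For_Loss()
    # does the same with an extra leading batch axis.
    return shifts[:, np.newaxis, :] + base_anchors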
24 | 25 | 26 | """Flow of Fast RCNN 27 | ############################################################################### 28 | In this state, Create Input Images and ROI Labels 29 | 30 | 1. input batch images and GroundTruth BBox from datasets *folder name, batch size 31 | Image shape is [batch size, width, height, channel], tf.float32, vgg normalized, bgr 32 | Bounding Box shape is [batch size, center_x, center_y, width, height] 33 | 34 | 2. get candicate bounding box from images. 35 | 36 | # Implemented 37 | 3. resize input images to input size *size of resize if needed. 38 | if this operation was done, you should adjust bounding box according to it. 39 | Both of Candicate and GroundTruth Bounding Boxes. 40 | In thesis, Image size is in [600, 1000] 41 | In this Implemention, input image has dynamic shape between [600, 1000] 42 | 43 | 4. convert candicate bounding box to ROI label. 44 | 45 | 5. calculate IOU between ROI label and GroundTruth label. 46 | IOU is Intersection Over Union. 47 | 48 | 6. Select Bounding Box from IOU. 49 | IOU > 0.5 is correct label, IOU = [0.1 0.5) is a false label(background). 50 | Correct Label is 25%, BackGround Label is 75%. 51 | Number of Label is 128, Batch Size is 2, so each image has 64 ROIs 52 | 53 | ############################################################################### 54 | In this stage, Calculate Loss 55 | 56 | 7. Input data to ROI Pooling Layer is Conv5_3 Feature Map and ROIs 57 | Input shape is Feature map (batch, width, height, 512), ROIs (Num of ROIs, 5) 58 | ROIs, ex:) [0, left, height, right, bottom]. First Element is the index of batch 59 | 60 | 8. Through ROI Pooling Layer, Output Shape is [Num of ROIs, 7, 7, 512] 61 | 62 | 9. Reshape it to [Num of ROIs, -1], and then connect to Fully Connected Layer. 63 | 64 | 10.Output Layer has two section, one is class prediction, the other is its bounding box prediction. 65 | class prediction shape is [Num of ROIs, Num of Class + 1] 66 | bounding box prediction shape is [Num of ROIs, 4 * (Num of Class + 1)] 67 | 68 | 11.Loss Function 69 | Regularize bounding box value [center_x, center_y, w, h] into 70 | [(GroundTruth x - pred_x) / pred_w, (GroundTruth y - pred_y) / pred_h, log(GroundTruth w / pred_w), log(GroundTruth h / pred_h)] 71 | Class prediction is by softmax with loss. 72 | Bounding Box prediction is by smooth_L1 loss 73 | ############################################################################### 74 | In this stage, Describe Datasets. 75 | 1. PASCAL VOC2007 76 | 2. KITTI Datasets 77 | 3. 
Udacity Datasets 78 | """ 79 | 80 | # TODO: datasetsを丸ごとメモリに展開できるか。Generatorを用いるか。 81 | 82 | 83 | def create_optimizer(all_loss, lr=0.001): 84 | opt = tf.train.AdamOptimizer(lr) 85 | optimizer = opt.minimize(all_loss) 86 | return optimizer 87 | 88 | class RPN_ExtendedLayer(object): 89 | def __init__(self): 90 | pass 91 | 92 | def build_model(self, input_layer, use_batchnorm=False, is_training=True, atrous=False, \ 93 | rate=1, activation=tf.nn.relu, implement_atrous=False, lr_mult=1, anchors=1): 94 | self.rpn_conv = convBNLayer(input_layer, use_batchnorm, is_training, 512, 512, 3, 1, name="conv_rpn", activation=activation) 95 | # shape is [Batch, 2(bg/fg) * 9(anchors=3scale*3aspect ratio)] 96 | self.rpn_cls = convBNLayer(self.rpn_conv, use_batchnorm, is_training, 512, anchors*2, 1, 1, name="rpn_cls", activation=activation) 97 | rpn_shape = self.rpn_cls.get_shape().as_list() 98 | rpn_shape = tf.shape(self.rpn_cls) 99 | self.rpn_cls = tf.reshape(self.rpn_cls, [rpn_shape[0], rpn_shape[1], rpn_shape[2], anchors, 2]) 100 | self.rpn_cls = tf.nn.softmax(self.rpn_cls, dim=-1) 101 | self.rpn_cls = tf.reshape(self.rpn_cls, [rpn_shape[0], rpn_shape[1]*rpn_shape[2], anchors, 2]) 102 | # shape is [Batch, 4(x, y, w, h) * 9(anchors=3scale*3aspect ratio)] 103 | self.rpn_bbox = convBNLayer(self.rpn_conv, use_batchnorm, is_training, 512, anchors*4, 1, 1, name="rpn_bbox", activation=activation) 104 | self.rpn_bbox = tf.reshape(self.rpn_bbox, [rpn_shape[0], rpn_shape[1]*rpn_shape[2], anchors, 4]) 105 | 106 | def rpn(sess, vggpath=None, image_shape=(300, 300), \ 107 | is_training=None, use_batchnorm=False, activation=tf.nn.relu, anchors=9): 108 | images = tf.placeholder(tf.float32, [None, None, None, 3]) 109 | phase_train = tf.placeholder(tf.bool, name="phase_traing") if is_training else None 110 | 111 | vgg = Vgg(vgg16_npy_path=vggpath) 112 | vgg.build_model(images) 113 | 114 | with tf.variable_scope("rpn_model") as scope: 115 | rpn_model = RPN_ExtendedLayer() 116 | rpn_model.build_model(vgg.conv5_3, use_batchnorm=use_batchnorm, \ 117 | is_training=phase_train, activation=activation, anchors=anchors) 118 | 119 | if is_training: 120 | initialized_var = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="rpn_model") 121 | sess.run(tf.variables_initializer(initialized_var)) 122 | return rpn_model, images, phase_train 123 | 124 | def smooth_L1(x): 125 | l2 = 0.5 * (x**2.0) 126 | l1 = tf.abs(x) - 0.5 127 | 128 | condition = tf.less(tf.abs(x), 1.0) 129 | loss = tf.where(condition, l2, l1) 130 | return loss 131 | 132 | def rpn_loss(rpn_cls, rpn_bbox): 133 | """Calculate Class Loss and Bounding Regression Loss. 134 | 135 | # Args: 136 | obj_class: Prediction of object class. Shape is [ROIs*Batch_Size, 2] 137 | bbox_regression: Prediction of bounding box. 
Shape is [ROIs*Batch_Size, 4] 138 | """ 139 | rpn_shape = rpn_cls.get_shape().as_list() 140 | g_bbox = tf.placeholder(tf.float32, [rpn_shape[0], rpn_shape[1], rpn_shape[2], 4]) 141 | true_index = tf.placeholder(tf.float32, [rpn_shape[0], rpn_shape[1], rpn_shape[2]]) 142 | false_index = tf.placeholder(tf.float32, [rpn_shape[0], rpn_shape[1], rpn_shape[2]]) 143 | elosion = 0.00001 144 | true_obj_loss = -tf.reduce_sum(tf.multiply(tf.log(rpn_cls[:, :, :, 0]+elosion), true_index)) 145 | false_obj_loss = -tf.reduce_sum(tf.multiply(tf.log(rpn_cls[:, :, :, 1]+elosion), false_index)) 146 | obj_loss = tf.add(true_obj_loss, false_obj_loss) 147 | cls_loss = tf.div(obj_loss, 16) # L(cls) / N(cls) N=batch size 148 | 149 | bbox_loss = smooth_L1(tf.subtract(rpn_bbox, g_bbox)) 150 | bbox_loss = tf.reduce_sum(tf.multiply(tf.reduce_sum(bbox_loss, 3), true_index)) 151 | bbox_loss = tf.multiply(tf.div(bbox_loss, 1197), 100) # rpn_shape[1]*rpn_shape[2] 152 | # bbox_loss = bbox_loss / rpn_shape[1] 153 | 154 | total_loss = tf.add(cls_loss, bbox_loss) 155 | return total_loss, cls_loss, bbox_loss, true_obj_loss, false_obj_loss, g_bbox, true_index, false_index 156 | 157 | 158 | def create_Labels_For_Loss(gt_boxes, feat_stride=16, feature_shape=(64, 19), \ 159 | scales=np.array([8, 16, 32]), ratios=[0.5, 0.8, 1], \ 160 | image_size=(300, 1000)): 161 | """This Function is processed before network input 162 | Number of Candicate Anchors is Feature Map width * heights 163 | Number of Predicted Anchors is Batch Num * Feature Map Width * Heights * 9 164 | """ 165 | width = feature_shape[0] 166 | height = feature_shape[1] 167 | batch_size = gt_boxes.shape[0] 168 | # shifts is the all candicate anchors(prediction of bounding boxes) 169 | center_x = np.arange(0, height) * feat_stride 170 | center_y = np.arange(0, width) * feat_stride 171 | center_x, center_y = np.meshgrid(center_x, center_y) 172 | # Shape is [Batch, Width*Height, 4] 173 | centers = np.zeros((batch_size, width*height, 4)) 174 | centers[:] = np.vstack((center_x.ravel(), center_y.ravel(), 175 | center_x.ravel(), center_y.ravel())).transpose() 176 | A = scales.shape[0] * len(ratios) 177 | K = width * height # width * height 178 | anchors = np.zeros((batch_size, A, 4)) 179 | anchors = generate_anchors(scales=scales, ratios=ratios) # Shape is [A, 4] 180 | 181 | candicate_anchors = centers.reshape(batch_size, K, 1, 4) + anchors # [Batch, K, A, 4] 182 | 183 | # shape is [B, K, A] 184 | is_inside = batch_inside_image(candicate_anchors, image_size[1], image_size[0]) 185 | 186 | # candicate_anchors: Shape is [Batch, K, A, 4] 187 | # gt_boxes: Shape is [Batch, G, 4] 188 | # true_index: Shape is [Batch, K, A] 189 | # false_index: Shape is [Batch, K, A] 190 | candicate_anchors, true_index, false_index = bbox_overlaps( 191 | np.ascontiguousarray(candicate_anchors, dtype=np.float), 192 | is_inside, 193 | gt_boxes) 194 | 195 | for i in range(batch_size): 196 | true_where = np.where(true_index[i] == 1) 197 | num_true = len(true_where[0]) 198 | 199 | if num_true > 64: 200 | select = np.random.choice(num_true, num_true - 64, replace=False) 201 | num_true = 64 202 | batch = np.ones((select.shape[0]), dtype=np.int) * i 203 | true_where = remove_extraboxes(true_where[0], true_where[1], select, batch) 204 | true_index[true_where] = 0 205 | 206 | false_where = np.where(false_index[i] == 1) 207 | num_false = len(false_where[0]) 208 | select = np.random.choice(num_false, num_false - (128-num_true), replace=False) 209 | batch = np.ones((select.shape[0]), dtype=np.int) * i 210 | 
false_where = remove_extraboxes(false_where[0], false_where[1], select, batch) 211 | false_index[false_where] = 0 212 | 213 | return candicate_anchors, true_index, false_index 214 | 215 | def train_rpn(batch_size, image_dir, label_dir, epoch=101, lr=0.01, feature_shape=(64, 19), \ 216 | vggpath="../pretrain/vgg16.npy", use_batchnorm=False, activation=tf.nn.relu, \ 217 | scales=np.array([5, 8, 12, 16, 32]), ratios=[0.3, 0.5, 0.8, 1], feature_stride=16): 218 | import time 219 | training_epochs = epoch 220 | 221 | with tf.Session() as sess: 222 | model, images, phase_train = rpn(sess, vggpath=vggpath, is_training=True, \ 223 | use_batchnorm=use_batchnorm, activation=activation, anchors=scales.shape[0]*len(ratios)) 224 | total_loss, cls_loss, bbox_loss, true_obj_loss, false_obj_loss, g_bboxes, true_index, false_index = rpn_loss(model.rpn_cls, model.rpn_bbox) 225 | optimizer = create_optimizer(total_loss, lr=lr) 226 | init = tf.global_variables_initializer() 227 | sess.run(init) 228 | 229 | image_pathlist, label_pathlist = get_pathlist(image_dir, label_dir) 230 | for epoch in range(training_epochs): 231 | for batch_images, batch_labels in generator__Image_and_label(image_pathlist, label_pathlist, batch_size=batch_size): 232 | start = time.time() 233 | candicate_anchors, batch_true_index, batch_false_index = create_Labels_For_Loss(batch_labels, feat_stride=feature_stride, feature_shape=(batch_images.shape[1]//feature_stride +1, batch_images.shape[2]//feature_stride+1), \ 234 | scales=scales, ratios=ratios, image_size=batch_images.shape[1:3]) 235 | print "batch time", time.time() - start 236 | print batch_true_index[batch_true_index==1].shape 237 | print batch_false_index[batch_false_index==1].shape 238 | 239 | sess.run(optimizer, feed_dict={images:batch_images, g_bboxes: candicate_anchors, true_index:batch_true_index, false_index:batch_false_index}) 240 | tl, cl, bl, tol, fol = sess.run([total_loss, cls_loss, bbox_loss, true_obj_loss, false_obj_loss], feed_dict={images:batch_images, g_bboxes: candicate_anchors, true_index:batch_true_index, false_index:batch_false_index}) 241 | print("Epoch:", '%04d' % (epoch+1), "total loss=", "{:.9f}".format(tl)) 242 | print("Epoch:", '%04d' % (epoch+1), "closs loss=", "{:.9f}".format(cl)) 243 | print("Epoch:", '%04d' % (epoch+1), "bbox loss=", "{:.9f}".format(bl)) 244 | print("Epoch:", '%04d' % (epoch+1), "true loss=", "{:.9f}".format(tol)) 245 | print("Epoch:", '%04d' % (epoch+1), "false loss=", "{:.9f}".format(fol)) 246 | print("Optimization Finished") 247 | 248 | if __name__ == '__main__': 249 | import sys 250 | import matplotlib.pyplot as plt 251 | from PIL import Image as im 252 | sys.path.append('/home/katou01/code/grid/DataAugmentation') 253 | # from resize import resize 254 | 255 | image_dir = "/home/katou01/download/training/image_2/*.png" 256 | label_dir = "/home/katou01/download/training/label_2/*.txt" 257 | # import time 258 | train_rpn(6, image_dir, label_dir, epoch=20, lr=0.001, \ 259 | scales=np.array([2, 4, 6, 8, 10]), ratios=[0.4, 0.6, 0.8, 1.0]) 260 | # image_pathlist, label_pathlist = get_pathlist(image_dir, label_dir) 261 | # for images, labels in generator__Image_and_label(image_pathlist, label_pathlist, batch_size=32): 262 | # start = time.time() 263 | # # images, labels = get_ALL_Image(image_pathlist, label_pathlist) 264 | # candicate_anchors, true_index, false_index = create_Labels_For_Loss(labels, feat_stride=16, feature_shape=(64, 19), \ 265 | # scales=np.array([5, 8, 12, 16, 32]), ratios=[0.3, 0.5, 0.8, 1], \ 266 | # image_size=(302, 
1000)) 267 | # print "batch time", time.time() - start 268 | # print candicate_anchors.shape, true_index.shape, false_index.shape 269 | # # images, labels = get_ALL_Image(image_pathlist, label_pathlist) 270 | # candicate_anchors, true_index, false_index = create_Labels_For_Loss(labels, feat_stride=16, feature_shape=(64, 19), \ 271 | # scales=np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 32]), ratios=[0.1, 0.2, 0.3, 0.5, 0.8, 1, 1.2], \ 272 | # image_size=(302, 1000)) 273 | -------------------------------------------------------------------------------- /rpn/rpn_vehicle.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | sys.path.append("../") 5 | sys.path.append("../util") 6 | sys.path.append("../cython_util") 7 | sys.path.append("../pretrain") 8 | import glob 9 | import cv2 10 | import numpy as np 11 | # from vgg16 import vgg16 12 | from input_kitti import * 13 | from data_util import * 14 | from parse_xml import parseXML 15 | from vgg16_vehicle import Vgg16 as Vgg 16 | import tensorflow as tf 17 | from network_util import * 18 | from bbox_overlap import bbox_overlaps 19 | from remove_extraboxes import remove_extraboxes 20 | from bool_anchors_inside_image import batch_inside_image 21 | from generate_anchors import generate_anchors 22 | # from utility.image.data_augmentation.flip import Flip 23 | # sys.path.append("/Users/tsujiyuuki/env_python/code/my_code/Data_Augmentation") 24 | 25 | 26 | """Flow of Fast RCNN 27 | ############################################################################### 28 | In this state, Create Input Images and ROI Labels 29 | 30 | 1. input batch images and GroundTruth BBox from datasets *folder name, batch size 31 | Image shape is [batch size, width, height, channel], tf.float32, vgg normalized, bgr 32 | Bounding Box shape is [batch size, center_x, center_y, width, height] 33 | 34 | 2. get candicate bounding box from images. 35 | 36 | # Implemented 37 | 3. resize input images to input size *size of resize if needed. 38 | if this operation was done, you should adjust bounding box according to it. 39 | Both of Candicate and GroundTruth Bounding Boxes. 40 | In thesis, Image size is in [600, 1000] 41 | In this Implemention, input image has dynamic shape between [600, 1000] 42 | 43 | 4. convert candicate bounding box to ROI label. 44 | 45 | 5. calculate IOU between ROI label and GroundTruth label. 46 | IOU is Intersection Over Union. 47 | 48 | 6. Select Bounding Box from IOU. 49 | IOU > 0.5 is correct label, IOU = [0.1 0.5) is a false label(background). 50 | Correct Label is 25%, BackGround Label is 75%. 51 | Number of Label is 128, Batch Size is 2, so each image has 64 ROIs 52 | 53 | ############################################################################### 54 | In this stage, Calculate Loss 55 | 56 | 7. Input data to ROI Pooling Layer is Conv5_3 Feature Map and ROIs 57 | Input shape is Feature map (batch, width, height, 512), ROIs (Num of ROIs, 5) 58 | ROIs, ex:) [0, left, height, right, bottom]. First Element is the index of batch 59 | 60 | 8. Through ROI Pooling Layer, Output Shape is [Num of ROIs, 7, 7, 512] 61 | 62 | 9. Reshape it to [Num of ROIs, -1], and then connect to Fully Connected Layer. 63 | 64 | 10.Output Layer has two section, one is class prediction, the other is its bounding box prediction. 
65 | class prediction shape is [Num of ROIs, Num of Class + 1] 66 | bounding box prediction shape is [Num of ROIs, 4 * (Num of Class + 1)] 67 | 68 | 11.Loss Function 69 | Regularize bounding box value [center_x, center_y, w, h] into 70 | [(GroundTruth x - pred_x) / pred_w, (GroundTruth y - pred_y) / pred_h, log(GroundTruth w / pred_w), log(GroundTruth h / pred_h)] 71 | Class prediction is by softmax with loss. 72 | Bounding Box prediction is by smooth_L1 loss 73 | ############################################################################### 74 | In this stage, Describe Datasets. 75 | 1. PASCAL VOC2007 76 | 2. KITTI Datasets 77 | 3. Udacity Datasets 78 | """ 79 | 80 | # TODO: datasetsを丸ごとメモリに展開できるか。Generatorを用いるか。 81 | 82 | 83 | def create_optimizer(all_loss, lr=0.001): 84 | opt = tf.train.AdamOptimizer(lr) 85 | optimizer = opt.minimize(all_loss) 86 | return optimizer 87 | 88 | class RPN_ExtendedLayer(object): 89 | def __init__(self): 90 | pass 91 | 92 | def build_model(self, input_layer, use_batchnorm=False, is_training=True, atrous=False, \ 93 | rate=1, activation=tf.nn.relu, implement_atrous=False, anchors=1): 94 | self.rpn_conv = convBNLayer(input_layer, use_batchnorm, is_training, 512, 512, 3, 1, name="conv_rpn", activation=activation) 95 | # shape is [Batch, 2(bg/fg) * 9(anchors=3scale*3aspect ratio)] 96 | self.rpn_cls = convBNLayer(self.rpn_conv, False, is_training, 512, anchors*2, 1, 1, name="rpn_cls", activation=None) 97 | rpn_shape = self.rpn_cls.get_shape().as_list() 98 | rpn_shape = tf.shape(self.rpn_cls) 99 | self.rpn_cls = tf.reshape(self.rpn_cls, [rpn_shape[0], rpn_shape[1], rpn_shape[2], anchors, 2]) 100 | self.rpn_cls = tf.nn.softmax(self.rpn_cls, dim=-1) 101 | self.rpn_cls = tf.reshape(self.rpn_cls, [rpn_shape[0], rpn_shape[1]*rpn_shape[2], anchors, 2]) 102 | # shape is [Batch, 4(x, y, w, h) * 9(anchors=3scale*3aspect ratio)] 103 | self.rpn_bbox = convBNLayer(self.rpn_conv, use_batchnorm, is_training, 512, anchors*4, 1, 1, name="rpn_bbox", activation=None) 104 | self.rpn_bbox = tf.reshape(self.rpn_bbox, [rpn_shape[0], rpn_shape[1]*rpn_shape[2], anchors, 4]) 105 | 106 | def rpn(sess, vggpath=None, image_shape=(300, 300), \ 107 | is_training=None, use_batchnorm=False, activation=tf.nn.relu, anchors=9): 108 | images = tf.placeholder(tf.float32, [None, None, None, 3]) 109 | phase_train = tf.placeholder(tf.bool, name="phase_traing") if is_training else None 110 | 111 | vgg = Vgg(vgg16_npy_path=vggpath) 112 | vgg.build_model(images) 113 | 114 | with tf.variable_scope("rpn_model"): 115 | rpn_model = RPN_ExtendedLayer() 116 | rpn_model.build_model(vgg.conv4_3, use_batchnorm=use_batchnorm, \ 117 | is_training=phase_train, activation=activation, anchors=anchors) 118 | 119 | if is_training: 120 | initialized_var = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="rpn_model") 121 | sess.run(tf.variables_initializer(initialized_var)) 122 | return rpn_model, images, phase_train 123 | 124 | def smooth_L1(x): 125 | l2 = 0.5 * (x**2.0) 126 | l1 = tf.abs(x) - 0.5 127 | 128 | condition = tf.less(tf.abs(x), 1.0) 129 | loss = tf.where(condition, l2, l1) 130 | return loss 131 | 132 | def rpn_loss(rpn_cls, rpn_bbox): 133 | """Calculate Class Loss and Bounding Regression Loss. 134 | 135 | # Args: 136 | obj_class: Prediction of object class. Shape is [ROIs*Batch_Size, 2] 137 | bbox_regression: Prediction of bounding box. 
Shape is [ROIs*Batch_Size, 4] 138 | """ 139 | rpn_shape = rpn_cls.get_shape().as_list() 140 | g_bbox = tf.placeholder(tf.float32, [rpn_shape[0], rpn_shape[1], rpn_shape[2], 4]) 141 | true_index = tf.placeholder(tf.float32, [rpn_shape[0], rpn_shape[1], rpn_shape[2]]) 142 | false_index = tf.placeholder(tf.float32, [rpn_shape[0], rpn_shape[1], rpn_shape[2]]) 143 | elosion = 0.00001 144 | true_obj_loss = -tf.reduce_sum(tf.multiply(tf.log(rpn_cls[:, :, :, 0]+elosion), true_index)) 145 | false_obj_loss = -tf.reduce_sum(tf.multiply(tf.log(rpn_cls[:, :, :, 1]+elosion), false_index)) 146 | obj_loss = tf.add(true_obj_loss, false_obj_loss) 147 | cls_loss = tf.div(obj_loss, 16) # L(cls) / N(cls) N=batch size 148 | 149 | bbox_loss = smooth_L1(tf.subtract(rpn_bbox, g_bbox)) 150 | bbox_loss = tf.reduce_sum(tf.multiply(tf.reduce_sum(bbox_loss, 3), true_index)) 151 | bbox_loss = tf.multiply(tf.div(bbox_loss, 1197), 100) # rpn_shape[1]*rpn_shape[2] 152 | # bbox_loss = bbox_loss / rpn_shape[1] 153 | 154 | total_loss = tf.add(cls_loss, bbox_loss) 155 | return total_loss, cls_loss, bbox_loss, true_obj_loss, false_obj_loss, g_bbox, true_index, false_index 156 | 157 | 158 | def create_Labels_For_Loss(gt_boxes, feat_stride=16, feature_shape=(64, 19), \ 159 | scales=np.array([8, 16, 32]), ratios=[0.5, 0.8, 1], \ 160 | image_size=(300, 1000)): 161 | """This Function is processed before network input 162 | Number of Candicate Anchors is Feature Map width * heights 163 | Number of Predicted Anchors is Batch Num * Feature Map Width * Heights * 9 164 | """ 165 | width = feature_shape[0] 166 | height = feature_shape[1] 167 | batch_size = gt_boxes.shape[0] 168 | # shifts is the all candicate anchors(prediction of bounding boxes) 169 | center_x = np.arange(0, height) * feat_stride 170 | center_y = np.arange(0, width) * feat_stride 171 | center_x, center_y = np.meshgrid(center_x, center_y) 172 | # Shape is [Batch, Width*Height, 4] 173 | centers = np.zeros((batch_size, width*height, 4)) 174 | centers[:] = np.vstack((center_x.ravel(), center_y.ravel(), 175 | center_x.ravel(), center_y.ravel())).transpose() 176 | A = scales.shape[0] * len(ratios) 177 | K = width * height # width * height 178 | anchors = np.zeros((batch_size, A, 4)) 179 | anchors = generate_anchors(scales=scales, ratios=ratios) # Shape is [A, 4] 180 | 181 | candicate_anchors = centers.reshape(batch_size, K, 1, 4) + anchors # [Batch, K, A, 4] 182 | 183 | # shape is [B, K, A] 184 | is_inside = batch_inside_image(candicate_anchors, image_size[1], image_size[0]) 185 | 186 | # candicate_anchors: Shape is [Batch, K, A, 4] 187 | # gt_boxes: Shape is [Batch, G, 4] 188 | # true_index: Shape is [Batch, K, A] 189 | # false_index: Shape is [Batch, K, A] 190 | candicate_anchors, true_index, false_index = bbox_overlaps( 191 | np.ascontiguousarray(candicate_anchors, dtype=np.float), 192 | is_inside, 193 | gt_boxes) 194 | 195 | for i in range(batch_size): 196 | true_where = np.where(true_index[i] == 1) 197 | num_true = len(true_where[0]) 198 | 199 | if num_true > 64: 200 | select = np.random.choice(num_true, num_true - 64, replace=False) 201 | num_true = 64 202 | batch = np.ones((select.shape[0]), dtype=np.int) * i 203 | true_where = remove_extraboxes(true_where[0], true_where[1], select, batch) 204 | true_index[true_where] = 0 205 | 206 | false_where = np.where(false_index[i] == 1) 207 | num_false = len(false_where[0]) 208 | select = np.random.choice(num_false, num_false - (128-num_true), replace=False) 209 | batch = np.ones((select.shape[0]), dtype=np.int) * i 210 | 
false_where = remove_extraboxes(false_where[0], false_where[1], select, batch) 211 | false_index[false_where] = 0 212 | 213 | return candicate_anchors, true_index, false_index 214 | 215 | def train_rpn(batch_size, image_dir, label_dir, epoch=101, lr=0.01, feature_shape=(64, 19), \ 216 | vggpath="../pretrain/vgg16.npy", use_batchnorm=False, activation=tf.nn.relu, \ 217 | scales=np.array([5, 8, 12, 16, 32]), ratios=[0.3, 0.5, 0.8, 1], feature_stride=16): 218 | import time 219 | training_epochs = epoch 220 | 221 | with tf.Session() as sess: 222 | model, images, phase_train = rpn(sess, vggpath=vggpath, is_training=True, \ 223 | use_batchnorm=use_batchnorm, activation=activation, anchors=scales.shape[0]*len(ratios)) 224 | saver = tf.train.Saver() 225 | total_loss, cls_loss, bbox_loss, true_obj_loss, false_obj_loss, g_bboxes, true_index, false_index = rpn_loss(model.rpn_cls, model.rpn_bbox) 226 | optimizer = create_optimizer(total_loss, lr=lr) 227 | init = tf.global_variables_initializer() 228 | sess.run(init) 229 | 230 | image_pathlist, label_pathlist = get_pathlist(image_dir, label_dir) 231 | for epoch in range(training_epochs): 232 | for batch_images, batch_labels in generator__Image_and_label(image_pathlist, label_pathlist, batch_size=batch_size): 233 | start = time.time() 234 | candicate_anchors, batch_true_index, batch_false_index = create_Labels_For_Loss(batch_labels, feat_stride=feature_stride, \ 235 | feature_shape=(batch_images.shape[1]//feature_stride +1, batch_images.shape[2]//feature_stride), \ 236 | scales=scales, ratios=ratios, image_size=batch_images.shape[1:3]) 237 | print "batch time", time.time() - start 238 | print batch_true_index[batch_true_index==1].shape 239 | print batch_false_index[batch_false_index==1].shape 240 | 241 | sess.run(optimizer, feed_dict={images:batch_images, g_bboxes: candicate_anchors, true_index:batch_true_index, false_index:batch_false_index}) 242 | tl, cl, bl, tol, fol = sess.run([total_loss, cls_loss, bbox_loss, true_obj_loss, false_obj_loss], feed_dict={images:batch_images, g_bboxes: candicate_anchors, true_index:batch_true_index, false_index:batch_false_index}) 243 | print("Epoch:", '%04d' % (epoch+1), "total loss=", "{:.9f}".format(tl)) 244 | print("Epoch:", '%04d' % (epoch+1), "closs loss=", "{:.9f}".format(cl)) 245 | print("Epoch:", '%04d' % (epoch+1), "bbox loss=", "{:.9f}".format(bl)) 246 | print("Epoch:", '%04d' % (epoch+1), "true loss=", "{:.9f}".format(tol)) 247 | print("Epoch:", '%04d' % (epoch+1), "false loss=", "{:.9f}".format(fol)) 248 | if (epoch != 0) and ((epoch+1) % 10 == 0): 249 | print "Save epoch " + str(epoch) 250 | saver.save(sess, "rpn_model" + str(epoch) + ".ckpt") 251 | print("Optimization Finished") 252 | 253 | if __name__ == '__main__': 254 | import sys 255 | import matplotlib.pyplot as plt 256 | from PIL import Image as im 257 | sys.path.append('/home/katou01/code/grid/DataAugmentation') 258 | # from resize import resize 259 | 260 | image_dir = "/home/katou01/download/training/image_2/*.png" 261 | label_dir = "/home/katou01/download/training/label_2/*.txt" 262 | # import time 263 | train_rpn(4, image_dir, label_dir, epoch=41, lr=0.001, use_batchnorm=True, \ 264 | scales=np.array([6, 8, 10, 12, 14, 16, 20, 32]), ratios=[0.4, 0.6, 0.8, 1.0], feature_stride=8) 265 | # image_pathlist, label_pathlist = get_pathlist(image_dir, label_dir) 266 | # for images, labels in generator__Image_and_label(image_pathlist, label_pathlist, batch_size=32): 267 | # start = time.time() 268 | # # images, labels = get_ALL_Image(image_pathlist, 
label_pathlist) 269 | # candicate_anchors, true_index, false_index = create_Labels_For_Loss(labels, feat_stride=16, feature_shape=(64, 19), \ 270 | # scales=np.array([5, 8, 12, 16, 32]), ratios=[0.3, 0.5, 0.8, 1], \ 271 | # image_size=(302, 1000)) 272 | # print "batch time", time.time() - start 273 | # print candicate_anchors.shape, true_index.shape, false_index.shape 274 | # # images, labels = get_ALL_Image(image_pathlist, label_pathlist) 275 | # candicate_anchors, true_index, false_index = create_Labels_For_Loss(labels, feat_stride=16, feature_shape=(64, 19), \ 276 | # scales=np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 32]), ratios=[0.1, 0.2, 0.3, 0.5, 0.8, 1, 1.2], \ 277 | # image_size=(302, 1000)) 278 | -------------------------------------------------------------------------------- /util/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yukitsuji/Faster_RCNN_tensorflow/765c729eaf03cb401ad308a289ec7d8c2bfca474/util/__init__.py -------------------------------------------------------------------------------- /util/bbox_transform.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | 10 | def create_bbox_regression_label(p_bboxes, g_bboxes): 11 | """Create Label for Bounding Box Regression Label 12 | # Args: 13 | p_bboxes: Predicted Bounging Box. Shape is [ROIs, 5]. 14 | [0, left, top, right, bottom] 15 | g_bboxes: GroundTruth Bounding Box. Shape is [ROIs, 5]. 16 | [0, left, top, right, bottom] 17 | # Returns: 18 | regression_label: Regression Label of Bounding Boxes. 
19 | Shape is [ROIs, 4] 20 | """ 21 | p_width = p_bboxes[:, 2] - p_bboxes[:, 0] + 1 22 | p_height = p_bboxes[:, 3] - p_bboxes[:, 1] + 1 23 | 24 | 25 | ex_ctr_x = ex_rois[:, 0] + 0.5 * ex_widths 26 | ex_ctr_y = ex_rois[:, 1] + 0.5 * ex_heights 27 | 28 | gt_widths = gt_rois[:, 2] - gt_rois[:, 0] + 1.0 29 | gt_heights = gt_rois[:, 3] - gt_rois[:, 1] + 1.0 30 | gt_ctr_x = gt_rois[:, 0] + 0.5 * gt_widths 31 | gt_ctr_y = gt_rois[:, 1] + 0.5 * gt_heights 32 | 33 | targets_dx = (gt_ctr_x - ex_ctr_x) / ex_widths 34 | targets_dy = (gt_ctr_y - ex_ctr_y) / ex_heights 35 | targets_dw = np.log(gt_widths / ex_widths) 36 | targets_dh = np.log(gt_heights / ex_heights) 37 | 38 | targets = np.vstack( 39 | (targets_dx, targets_dy, targets_dw, targets_dh)).transpose() 40 | return targets 41 | 42 | def bbox_transform_inv(boxes, deltas): 43 | """Convert network output to Bounding Boxes 44 | """ 45 | if boxes.shape[0] == 0: 46 | return np.zeros((0, deltas.shape[1]), dtype=deltas.dtype) 47 | 48 | boxes = boxes.astype(deltas.dtype, copy=False) 49 | widths = boxes[:, 2] - boxes[:, 0] + 1.0 50 | heights = boxes[:, 3] - boxes[:, 1] + 1.0 51 | ctr_x = boxes[:, 0] + 0.5 * widths 52 | ctr_y = boxes[:, 1] + 0.5 * heights 53 | 54 | dx = deltas[:, 0::4] 55 | dy = deltas[:, 1::4] 56 | dw = deltas[:, 2::4] 57 | dh = deltas[:, 3::4] 58 | 59 | pred_ctr_x = dx * widths[:, np.newaxis] + ctr_x[:, np.newaxis] 60 | pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis] 61 | pred_w = np.exp(dw) * widths[:, np.newaxis] 62 | pred_h = np.exp(dh) * heights[:, np.newaxis] 63 | 64 | pred_boxes = np.zeros(deltas.shape, dtype=deltas.dtype) 65 | # x1 66 | pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w 67 | # y1 68 | pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h 69 | # x2 70 | pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w 71 | # y2 72 | pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h 73 | 74 | return pred_boxes 75 | 76 | def clip_boxes(boxes, im_shape): 77 | """ 78 | Clip boxes to image boundaries. 79 | """ 80 | 81 | # x1 >= 0 82 | boxes[:, 0::4] = np.maximum(np.minimum(boxes[:, 0::4], im_shape[1] - 1), 0) 83 | # y1 >= 0 84 | boxes[:, 1::4] = np.maximum(np.minimum(boxes[:, 1::4], im_shape[0] - 1), 0) 85 | # x2 < im_shape[1] 86 | boxes[:, 2::4] = np.maximum(np.minimum(boxes[:, 2::4], im_shape[1] - 1), 0) 87 | # y2 < im_shape[0] 88 | boxes[:, 3::4] = np.maximum(np.minimum(boxes[:, 3::4], im_shape[0] - 1), 0) 89 | return boxes 90 | -------------------------------------------------------------------------------- /util/data_util.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import glob 5 | import cv2 6 | import numpy as np 7 | # from vgg16 import vgg16 8 | # from input_kitti import * 9 | from parse_xml import parseXML 10 | from bbox_transform import * 11 | from base_vgg16 import Vgg16 12 | import tensorflow as tf 13 | from bbox_overlap import bbox_overlaps 14 | 15 | def create_labels(resized_images, resize_scales, feature_scale=1./16): 16 | """create labels for classification and regression 17 | 1. get bbox from resized images 18 | 2. from bbox, create input labels for regression 19 | 3. get GroundTruth Bounding Boxes 20 | 4. calculate IOU for training 21 | 5. divide labels into training sets and trush 22 | 6. 
23 | """ 24 | return labels 25 | 26 | def create_rois(labels, feature_scale=1./16): 27 | """create rois from labels""" 28 | return rois 29 | 30 | def nms(): 31 | return bboxes 32 | 33 | def process(image_dir, label_dir, num_of_rois, batch_size, min_size): 34 | # model Definition 35 | # loss function 36 | dataset_img_list, dataset_pred_bbox_list, g_bboxes, get_Image_Roi_All(image_dir, label_dir, min_size) 37 | # batch_imgs, batch_rois, batch_g_bboxes = select_inputs_from_datasets(dataset_img_list, dataset_pred_bbox_list, g_bboxes, batch_size) 38 | for batch_imgs, batch_rois, batch_g_bboxes in select_inputs_from_datasets(dataset_img_list, dataset_pred_bbox_list, g_bboxes, batch_size): 39 | pass 40 | # training 41 | # test 42 | # validation 43 | 44 | def get_Image_Roi_All(image_dir, label_dir, min_size): 45 | """Get Images and ROIs of All Datasets. 46 | # Args: 47 | image_dir (str): path of image directory. 48 | label_dir (str): path of label's xml directory. 49 | num_of_rois(int): Number of ROIs in a image. 50 | # Returns: 51 | images (list): ndarray Images of datasets. 52 | pred_bboxes(ndarray): rescaled bbox Label [0, x, y, w, h] 53 | """ 54 | # 車が含まれている画像のみラベルと一緒に読み込む 55 | image_pathlist = 0 #load_for_detection(label_dir) 56 | g_bboxes = 0 #load_for_detection(label_dir) #TODO: [Datasets, x, y, w, h] 57 | dataset_img_list = [] # len(dataset_img_list) == Number of Datasets Images 58 | dataset_pred_bbox_list = [] # len(dataset_pred_bbox_list) == Number of (num_of_rois * num of images) 59 | # Preprocess Ground Truth ROIs. shape is [Num of ROIs * batch_size, x, y, w, h, 0, 1] 60 | g_bboxes = [] 61 | # shape is [batch_channel, x, y, w, h] 62 | image_pathlist = glob.glob(image_dir) 63 | label_pathlist = glob.glob(label_dir) 64 | image_pathlist.sort() 65 | label_pathlist.sort() 66 | 67 | for index, (image_path, label_path) in enumerate(zip(image_pathlist, label_pathlist)): 68 | if index == 10: 69 | break 70 | img = cv2.imread(image_path) 71 | label = read_label_from_txt(label_path) 72 | if label is None: 73 | continue 74 | # ここでは、IOUを計算していないので、予測のbounding boxは絞らない 75 | # なので、数多くのbounding boxが存在していることになるが、メモリが許す限り確保する 76 | p_bbox_candicate = pred_bboxes(img, min_size, index) 77 | img, im_scale = preprocess_imgs(img) 78 | p_bbox_candicate = unique_bboxes(p_bbox_candicate, im_scale, feature_scale=1./16) 79 | overlaps = bbox_overlaps(p_bbox_candicate[:, 1:], label) 80 | print label 81 | print p_bbox_candicate[0] 82 | print overlaps[overlaps > 0.5] 83 | print overlaps.shape 84 | print 85 | dataset_img_list.append(img) 86 | dataset_pred_bbox_list.append(p_bbox_candicate) 87 | g_bboxes.append(label) 88 | 89 | dataset_pred_bbox_list = np.array(dataset_pred_bbox_list) 90 | g_bboxes = np.array(g_bboxes) 91 | print dataset_img_list[1].shape, dataset_pred_bbox_list[0].shape, g_bboxes[0].shape 92 | print dataset_pred_bbox_list[1].shape 93 | print dataset_pred_bbox_list[2].shape 94 | g_bboxes = create_bbox_regression_label(dataset_pred_bbox_list, g_bboxes) 95 | return np.array(dataset_img_list), np.array(dataset_pred_bbox_list), g_bboxes 96 | 97 | 98 | def select_inputs_from_datasets(dataset_img_list, dataset_pred_bbox_list, g_bboxes, batch_size): 99 | """ 100 | # Args: 101 | dataset_img_list (ndarray): ndarray Images in datasets. 
102 | dataset_pred_bbox_list(ndarray): rescaled bbox Label [0, x, y, w, h] 103 | shape is [batch, num_of_rois, 5] 104 | g_bboxes (ndarray): GroundTruth Bounding Box with Class Label 105 | shape is [batch, 6*max_label_num] 106 | label is [x, y, w, h, car, background] 107 | batch_size (int): batch size for training 108 | # Returns: 109 | batch_imgs (ndarray): input batch images for Network. Shape is [Batch Size, shape] 110 | batch_p_bboxes(ndarray): input ROIs for Network. Shape is [Num of ROIs*Batch size] 111 | batch_g_bboxes(ndarray): input GroundTruth Bounding Box for Network. 112 | Shape is [Num of ROIs*Batch Size] 113 | """ 114 | perm = np.random.permutation(len(dataset_img_list)) 115 | batches = [perm[i * batch_size:(i + 1) * batch_size] \ 116 | for i in range(len(dataset_img_list) // batch_size)] 117 | for batch in batches: 118 | batch_imgs = dataset_img_list[batch] 119 | batch_p_bboxes = dataset_pred_bbox_list[batch] 120 | batch_g_bboxes = g_bboxes[batch] 121 | # この時点でbatch_p_bboxes, g_bboxesは、batch毎にListでまとめられていそう? #TODO 122 | # TODO: Batch毎にLabelの形にする。それをcalculate IOUに入れて、最終的な形をvstackすれば全体のLabelが得られる 123 | 124 | # Flip Conversion 125 | # batch_imgs, batch_p_bboxes, batch_g_bboxes = flip_conversion(batch_imgs, batch_p_bboxes, batch_g_bboxes) 126 | batch_imgs = convert_imgslist_to_ndarray(batch_imgs) 127 | # calculate IOU between pred_roi_candicate, ground truth bounding box 128 | # この時点でbatch_g_bboxesはLabelの形になっていると想定 129 | batch_p_bboxes, batch_g_bboxes = calculate_IOU(batch_p_bboxes, batch_g_bboxes) 130 | yield batch_imgs, batch_rois, batch_g_bboxes 131 | 132 | def convert_pred_bbox_to_roi(batch_bbox, feature_scale=1./16): 133 | pass 134 | 135 | def calculate_IOU(batch_roi, batch_g_bboxes, fg_thres=0.5, bg_thres_max=0.5, bg_thres_min=0.1): 136 | """各画像の全ての車のラベルに対して、IOUを計算する 137 | そのために、batch_roi, batch_g_bboxesをforループで回し、 138 | """ 139 | area = batch_g_bboxes[:, 3] * batch_g_bboxes[: 4] 140 | w = np.maximum(batch_roi[:, 0], batch_g_bboxes[:, 0]) - np.minimum(batch_roi[:, 1], batch_g_bboxes[:, 1]) 141 | w_id = np.where(w > 0)[0] 142 | h = np.minimum(batch_roi[w_id][:, 0], batch_g_bboxes[w_id][:, 0]) - np.minimum(batch_roi[w_id][:, 1], batch_g_bboxes[w_id][:, 1]) 143 | h_id = np.where(h > 0)[0] 144 | IOU = float(w[w_id][h_id] * h[w_id][h_id]) / area[w_id][h_id] 145 | fg_rois = np.where(IOU >= fg_thres)[0] 146 | bg_rois1 = np.where(IOU < bg_thres_max)[0] 147 | bg_rois2 = np.where(IOU[bg_rois] >= bg_thres_min)[0] 148 | fg_index = w_id[h_id][fg_rois] 149 | bg_index = w_id[h_id][bg_rois1][bg_rois2] 150 | index = np.hstack((fg_index, bg_index)) 151 | return batch_rois[index], batch_g_bboxes[index] 152 | 153 | def convert_imgslist_to_ndarray(images): 154 | """Convert a list of images into a network input. 155 | Assumes images are already prepared (means subtracted, BGR order, ...). 156 | 157 | In this stage, the shape of images are different 158 | """ 159 | max_shape = np.array([im.shape for im in images]).max(axis=0) 160 | num_images = len(images) 161 | blob = np.zeros((num_images, max_shape[0], max_shape[1], 3), 162 | dtype=np.float32) 163 | for i in xrange(num_images): 164 | im = images[i] 165 | blob[i, 0:im.shape[0], 0:im.shape[1], :] = im 166 | return blob 167 | 168 | def flip_conversion(batch_imgs, batch_rois, batch_g_bboxes, batch_size): 169 | return batch_imgs, batch_rois, batch_g_bboxes 170 | 171 | def preprocess_imgs(im, pixel_means=np.array([103.939, 116.779, 123.68]), target_size=600, max_size=1000): 172 | """Mean subtract and scale an image for use in a blob. 
173 | If you want to Data Augmentation, please edit this function 174 | """ 175 | im = im.astype(np.float32, copy=False) 176 | # if np.random.randint(2): 177 | # im = im[:, ::-1] 178 | im -= pixel_means 179 | im_shape = im.shape 180 | im_size_min = np.min(im_shape[0:2]) 181 | im_size_max = np.max(im_shape[0:2]) 182 | im_scale = float(target_size) / float(im_size_min) 183 | # Prevent the biggest axis from being more than MAX_SIZE 184 | if np.round(im_scale * im_size_max) > max_size: 185 | im_scale = float(max_size) / float(im_size_max) 186 | im = cv2.resize(im, None, None, fx=im_scale, fy=im_scale, 187 | interpolation=cv2.INTER_LINEAR) 188 | return im, im_scale 189 | 190 | def data_generator(imgs, rois, labels): 191 | """data generator for network inputs""" 192 | yield batch_x, batch_rois, batch_labels 193 | 194 | def unique_bboxes(rects, im_scale, feature_scale=1./16): 195 | """Get Bounding Box from Original Image. 196 | 197 | # Args: 198 | orig_img (ndarray): original image. 3 dimensional array. 199 | min_size (tuple): minimum size of bounding box. 200 | feature_scale(float): scale of feature map. 2 ** (num of pooling layer) 201 | 202 | """ 203 | rects *= im_scale 204 | v = np.array([1, 1e3, 1e6, 1e9, 1e12]) 205 | hashes = np.round(rects * feature_scale).dot(v) 206 | _, index, inv_index = np.unique(hashes, return_index=True, 207 | return_inverse=True) 208 | rects = rects[index, :] 209 | return rects 210 | 211 | def pred_bboxes(orig_img, min_size, index): 212 | rects = [] 213 | dlib.find_candidate_object_locations(orig_img, rects, min_size=min_size) 214 | rects = [[0, d.left(), d.top(), d.right(), d.bottom()] for d in rects] 215 | rects = np.asarray(rects, dtype=np.float64) 216 | return rects 217 | -------------------------------------------------------------------------------- /util/generate_anchors.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Faster R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick and Sean Bell 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | 10 | # Verify that we compute the same anchors as Shaoqing's matlab implementation: 11 | # 12 | # >> load output/rpn_cachedir/faster_rcnn_VOC2007_ZF_stage1_rpn/anchors.mat 13 | # >> anchors 14 | # 15 | # anchors = 16 | # 17 | # -83 -39 100 56 18 | # -175 -87 192 104 19 | # -359 -183 376 200 20 | # -55 -55 72 72 21 | # -119 -119 136 136 22 | # -247 -247 264 264 23 | # -35 -79 52 96 24 | # -79 -167 96 184 25 | # -167 -343 184 360 26 | 27 | #array([[ -83., -39., 100., 56.], 28 | # [-175., -87., 192., 104.], 29 | # [-359., -183., 376., 200.], 30 | # [ -55., -55., 72., 72.], 31 | # [-119., -119., 136., 136.], 32 | # [-247., -247., 264., 264.], 33 | # [ -35., -79., 52., 96.], 34 | # [ -79., -167., 96., 184.], 35 | # [-167., -343., 184., 360.]]) 36 | 37 | def generate_anchors(base_size=16, ratios=[0.5, 1, 2], 38 | scales=2**np.arange(3, 6)): 39 | """ 40 | Generate anchor (reference) windows by enumerating aspect ratios X 41 | scales wrt a reference (0, 0, 15, 15) window. 
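    Example (added for illustration; uses the defaults base_size=16,
    ratios=[0.5, 1, 2] and scales=2**np.arange(3, 6)):

        >>> anchors = generate_anchors()
        >>> anchors.shape        # len(ratios) * len(scales) anchors
        (9, 4)

    Each row is one reference window (x1, y1, x2, y2); the expected values
    for these defaults are listed in the comment block above.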
42 | """ 43 | 44 | base_anchor = np.array([1, 1, base_size, base_size]) - 1 45 | ratio_anchors = _ratio_enum(base_anchor, ratios) 46 | anchors = np.vstack([_scale_enum(ratio_anchors[i, :], scales) 47 | for i in xrange(ratio_anchors.shape[0])]) 48 | return anchors 49 | 50 | def _whctrs(anchor): 51 | """ 52 | Return width, height, x center, and y center for an anchor (window). 53 | """ 54 | 55 | w = anchor[2] - anchor[0] + 1 56 | h = anchor[3] - anchor[1] + 1 57 | x_ctr = anchor[0] + 0.5 * (w - 1) 58 | y_ctr = anchor[1] + 0.5 * (h - 1) 59 | return w, h, x_ctr, y_ctr 60 | 61 | def _mkanchors(ws, hs, x_ctr, y_ctr): 62 | """ 63 | Given a vector of widths (ws) and heights (hs) around a center 64 | (x_ctr, y_ctr), output a set of anchors (windows). 65 | """ 66 | 67 | ws = ws[:, np.newaxis] 68 | hs = hs[:, np.newaxis] 69 | anchors = np.hstack((x_ctr - 0.5 * (ws - 1), 70 | y_ctr - 0.5 * (hs - 1), 71 | x_ctr + 0.5 * (ws - 1), 72 | y_ctr + 0.5 * (hs - 1))) 73 | return anchors 74 | 75 | def _ratio_enum(anchor, ratios): 76 | """ 77 | Enumerate a set of anchors for each aspect ratio wrt an anchor. 78 | """ 79 | 80 | w, h, x_ctr, y_ctr = _whctrs(anchor) 81 | size = w * h 82 | size_ratios = size / ratios 83 | ws = np.round(np.sqrt(size_ratios)) 84 | hs = np.round(ws * ratios) 85 | anchors = _mkanchors(ws, hs, x_ctr, y_ctr) 86 | return anchors 87 | 88 | def _scale_enum(anchor, scales): 89 | """ 90 | Enumerate a set of anchors for each scale wrt an anchor. 91 | """ 92 | 93 | w, h, x_ctr, y_ctr = _whctrs(anchor) 94 | ws = w * scales 95 | hs = h * scales 96 | anchors = _mkanchors(ws, hs, x_ctr, y_ctr) 97 | return anchors 98 | 99 | if __name__ == '__main__': 100 | import time 101 | t = time.time() 102 | a = generate_anchors() 103 | print time.time() - t 104 | print a 105 | from IPython import embed; embed() 106 | -------------------------------------------------------------------------------- /util/input_kitti.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | import os 6 | import numpy as np 7 | import cv2 8 | import glob 9 | import math 10 | from parse_xml import parseXML 11 | from data_util import * 12 | import matplotlib.pyplot as plt 13 | 14 | def read_label_from_txt(label_path): 15 | """From label text file, Read bounding box 16 | Each text file corresponds to one image. 17 | 18 | # Returns: 19 | bounding_box(list): List of Bounding Boxes in one image 20 | """ 21 | bounding_box = [] 22 | with open(label_path, "r") as f: 23 | labels = f.read().split("\n") 24 | for label in labels: 25 | label = label.split(" ") 26 | if label[0] == ("Car" or "Van"): # or "Truck" 27 | bounding_box.append(label[4:8]) 28 | 29 | if bounding_box: 30 | return np.array(bounding_box, dtype=np.float64) 31 | else: 32 | return None 33 | 34 | def select_inputs_from_datasets(dataset_img_list, g_boxes, batch_size): 35 | """ 36 | # Args: 37 | dataset_img_list (ndarray): ndarray Images in datasets. 38 | g_boxes (ndarray): GroundTruth Bounding Box with Class Label 39 | shape is [batch, 6*max_label_num] 40 | label is [x, y, w, h] 41 | batch_size (int): batch size for training 42 | # Returns: 43 | batch_imgs (ndarray): input batch images for Network. Shape is [Batch Size, shape] 44 | batch_g_boxes(ndarray): input GroundTruth Bounding Box for Network. 
45 |                               Shape is [Num of ROIs*Batch Size]
46 |     """
47 |     perm = np.random.permutation(len(dataset_img_list))
48 |     batches = [perm[i * batch_size:(i + 1) * batch_size] \
49 |                for i in range(len(dataset_img_list) // batch_size)]
50 |     for batch in batches:
51 |         batch_imgs = dataset_img_list[batch]
52 |         batch_g_boxes = g_boxes[batch]
53 |         # Flip Conversion
54 |         # batch_imgs, batch_p_bboxes, batch_g_boxes = flip_conversion(batch_imgs, batch_p_bboxes, batch_g_boxes)
55 |         batch_imgs, batch_g_boxes = convert_imgslist_to_ndarray(batch_imgs, batch_g_boxes)
56 |         yield batch_imgs, batch_g_boxes
57 | 
58 | # def convert_imgslist_to_ndarray(images, batch_g_boxes):
59 | #     """Convert a list of images into a network input.
60 | #     Assumes images are already prepared (means subtracted, BGR order, ...).
61 | #
62 | #     In this stage, the shape of images are different
63 | #     """
64 | #     max_shape = np.array([im.shape for im in images]).max(axis=0)
65 | #     num_images = len(images)
66 | #     blob = np.zeros((num_images, max_shape[0], max_shape[1], 3),
67 | #                     dtype=np.float32)
68 | #     for i in xrange(num_images):
69 | #         if np.random.randint(2):
70 | #             blob[i, 0:im.shape[0], 0:im.shape[1], :] = images[i][:, ::-1]
71 | #             batch_g_boxes[i][:, 0] -= (max_shape[1]-1)
72 | #             batch_g_boxes[i][:, 1] -= (max_shape[1]-1)
73 | #             batch_g_boxes[i][:, 2] -= (max_shape[1]-1)
74 | #             batch_g_boxes[i][:, 3] -= (max_shape[1]-1)
75 | #         else:
76 | #             blob[i, 0:im.shape[0], 0:im.shape[1], :] = images[i]
77 | #     return blob, batch_g_boxes
78 | 
79 | def get_pathlist(image_dir, label_dir):
80 |     image_pathlist = 0 #load_for_detection(label_dir)
81 |     dataset_img_list = [] # len(dataset_img_list) == Number of Datasets Images
82 |     # Preprocess Ground Truth ROIs. shape is [Num of ROIs * batch_size, x, y, w, h, 0, 1]
83 |     g_bboxes = []
84 |     # shape is [batch_channel, x, y, w, h]
85 |     image_pathlist = glob.glob(image_dir)
86 |     label_pathlist = glob.glob(label_dir)
87 |     image_pathlist.sort()
88 |     label_pathlist.sort()
89 |     return np.array(image_pathlist), np.array(label_pathlist)
90 | 
91 | def generator__Image_and_label(image_pathlist, label_pathlist, batch_size=32):
92 |     """Get Images and ROIs of All Datasets.
93 |     # Args:
94 |         image_pathlist (ndarray): paths of image files.
95 |         label_pathlist (ndarray): paths of label text files.
96 |         batch_size(int): Batch Size for network input.
97 |     # Returns:
98 |         images (list): ndarray Images of datasets.
99 |         g_bboxes(ndarray): rescaled bbox Label. Shape is [Batch, ?, 4] (x, y, w, h)
100 |     """
101 |     iter_num = image_pathlist.shape[0] // batch_size
102 |     for it in range(iter_num):
103 |         dataset_img_list = [] # len(dataset_img_list) == Number of Datasets Images
104 |         g_bboxes = []
105 |         for (image_path, label_path) in zip(image_pathlist[it*batch_size:(it+1)*batch_size], label_pathlist[it*batch_size:(it+1)*batch_size]):
106 |             img = cv2.imread(image_path)
107 |             label = read_label_from_txt(label_path)
108 |             if label is None:
109 |                 continue
110 |             img, im_scale = preprocess_imgs(img)
111 |             dataset_img_list.append(img)
112 |             g_bboxes.append(label)
113 |         dataset_img_list = convert_imgslist_to_ndarray(dataset_img_list)
114 |         yield np.array(dataset_img_list), np.array(g_bboxes)
115 | 
116 | def get_ALL_Image(image_dir, label_dir):
117 |     """Get Images and ROIs of All Datasets.
118 |     # Args:
119 |         image_dir (str): path of image directory.
120 |         label_dir (str): path of label directory.
121 |         num_of_rois(int): Number of ROIs in an image.
122 |     # Returns:
123 |         images (list): ndarray Images of datasets.
124 |         g_bboxes(ndarray): rescaled bbox Label. Shape is [Batch, ?, 4] (x, y, w, h)
125 |     """
126 |     import time
127 |     start = time.time()
128 |     # Load only the images that contain cars, together with their labels
129 |     image_pathlist = 0 #load_for_detection(label_dir)
130 |     dataset_img_list = [] # len(dataset_img_list) == Number of Datasets Images
131 |     # Preprocess Ground Truth ROIs. shape is [Num of ROIs * batch_size, x, y, w, h, 0, 1]
132 |     g_bboxes = []
133 |     # shape is [batch_channel, x, y, w, h]
134 |     image_pathlist = glob.glob(image_dir)
135 |     label_pathlist = glob.glob(label_dir)
136 |     image_pathlist.sort()
137 |     label_pathlist.sort()
138 | 
139 |     for (image_path, label_path) in zip(image_pathlist, label_pathlist):
140 |         img = cv2.imread(image_path)
141 |         label = read_label_from_txt(label_path)
142 |         if label is None:
143 |             continue
144 |         img, im_scale = preprocess_imgs(img)
145 |         dataset_img_list.append(img)
146 |         g_bboxes.append(label)
147 | 
148 |     print time.time() - start
149 |     return np.array(dataset_img_list), np.array(g_bboxes)
150 | 
--------------------------------------------------------------------------------
/util/model.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | import sys
4 | import glob
5 | import cv2
6 | import dlib
7 | import numpy as np
8 | # from vgg16 import vgg16
9 | from input_kitti import *
10 | from util import *
11 | from parse_xml import parseXML
12 | from base_vgg16 import Vgg16
13 | import tensorflow as tf
14 | from network_util import *
15 | # from utility.image.data_augmentation.flip import Flip
16 | sys.path.append("/Users/tsujiyuuki/env_python/code/my_code/Data_Augmentation")
17 | 
18 | """
19 | ・collect dataset of cars
20 | ・Preprocessing BBOX and Label for training
21 | ・try roi_pooling layer
22 | ・Extract ROI using mitmul tools
23 | ・NMS
24 | """
25 | 
26 | """Flow of Faster RCNN
27 | ###############################################################################
28 | In this stage, create input images and ROI labels
29 | 
30 | 1. input batch images and GroundTruth BBox from datasets *folder name, batch size
31 |    Image shape is [batch size, width, height, channel], tf.float32, vgg normalized, bgr
32 |    Bounding Box shape is [batch size, center_x, center_y, width, height]
33 | 
34 | 2. get candidate bounding boxes from images.
35 | 
36 |    # Implemented
37 | 3. resize input images to the input size *size of resize if needed.
38 |    If this is done, the bounding boxes must be adjusted accordingly,
39 |    for both candidate and GroundTruth bounding boxes.
40 |    In the paper, the image size is in [600, 1000].
41 |    In this implementation, the input image has a dynamic shape within [600, 1000].
42 | 
43 | 4. convert candidate bounding boxes to ROI labels.
44 | 
45 | 5. calculate IOU between ROI labels and GroundTruth labels.
46 |    IOU is Intersection Over Union.
47 | 
48 | 6. Select bounding boxes by IOU.
49 |    IOU > 0.5 is a correct label, IOU in [0.1, 0.5) is a false label (background).
50 |    Correct labels are 25%, background labels are 75%.
51 |    The number of labels is 128 and the batch size is 2, so each image has 64 ROIs.
52 | 
53 | ###############################################################################
54 | In this stage, Calculate Loss
55 | 
56 | 7. Input data to the ROI Pooling Layer is the Conv5_3 feature map and the ROIs.
57 |    Input shape is Feature map (batch, width, height, 512), ROIs (Num of ROIs, 5)
58 |    ROIs, e.g. [0, left, top, right, bottom]. The first element is the batch index.
59 | 
60 | 8.
   Through the ROI Pooling Layer, the output shape is [Num of ROIs, 7, 7, 512]
61 | 
62 | 9. Reshape it to [Num of ROIs, -1], and then connect to Fully Connected Layers.
63 | 
64 | 10. The output layer has two sections: one is the class prediction, the other is its bounding box prediction.
65 |    class prediction shape is [Num of ROIs, Num of Class + 1]
66 |    bounding box prediction shape is [Num of ROIs, 4 * (Num of Class + 1)]
67 | 
68 | 11. Loss Function
69 |    Encode the bounding box value [center_x, center_y, w, h] into
70 |    [(GroundTruth x - pred_x) / pred_w, (GroundTruth y - pred_y) / pred_h, log(GroundTruth w / pred_w), log(GroundTruth h / pred_h)]
71 |    Class prediction uses softmax with cross-entropy loss.
72 |    Bounding box prediction uses smooth L1 loss.
###############################################################################
74 | In this stage, describe the datasets.
75 | 1. PASCAL VOC2007
76 | 2. KITTI Datasets
77 | 3. Udacity Datasets
78 | """
79 | 
80 | # TODO: decide whether the whole dataset can be loaded into memory at once, or whether a generator should be used.
81 | def loss(obj_class, bbox_regression, g_obj_class, g_bbox_regression):
82 |     """Calculate Class Loss and Bounding Regression Loss.
83 | 
84 |     # Args:
85 |         obj_class: Prediction of object class. Shape is [ROIs*Batch_Size, 2]
86 |         bbox_regression: Prediction of bounding box. Shape is [ROIs*Batch_Size, 4]
87 |     """
88 |     # Minimal sketch (assumption, not a final implementation): cross-entropy on the
    |     # softmax class output and smooth L1 on the box regression (step 11 above).
    |     cls_loss = tf.reduce_mean(-tf.reduce_sum(g_obj_class * tf.log(obj_class + 1e-8), axis=1))
    |     diff = tf.abs(g_bbox_regression - bbox_regression)
    |     smooth_l1 = tf.where(diff < 1.0, 0.5 * tf.square(diff), diff - 0.5)
    |     bbox_loss = tf.reduce_mean(tf.reduce_sum(smooth_l1, axis=1))
    |     return cls_loss + bbox_loss
89 | 
90 | def fast_rcnn(sess, rois, roi_size=(7, 7), vggpath=None, image_shape=(300, 300), \
91 |               is_training=None, use_batchnorm=False, activation=tf.nn.relu, num_of_rois=128):
92 |     """Model Definition of Fast RCNN
93 |     In thesis, Roi Size is (7, 7), channel is 512
94 |     """
95 |     images = tf.placeholder(tf.float32, [None, image_shape[0], image_shape[1], 3])  # placeholder input (assumed) so the VGG graph can be built
97 | 
98 |     vgg = Vgg16(vgg16_npy_path=vggpath)
99 |     vgg.build_model(images)
100 |     feature_map = vgg.conv5_3 # (batch, kernel, kernel, channel)
101 | 
102 |     with tf.variable_scope("fast_rcnn"):
103 |         # roi shape [Num of ROIs, X, Y, W, H]
104 |         roi_layer = roi_pooling(feature_map, rois, roi_size[0], roi_size[1])
105 |         # input_shape [num_of_rois, channel, roi size, roi size]
106 |         pool_5 = tf.reshape(roi_layer, [num_of_rois, roi_size[0]*roi_size[1]*512])
107 |         fc6 = fully_connected(pool_5, [roi_size[0]*roi_size[1]*512, 4096], name="fc6", is_training=is_training)
108 |         fc7 = fully_connected(fc6, [4096, 4096], name="fc7", is_training=is_training)
109 |         # output shape [num_of_rois, 2]
110 |         obj_class = tf.nn.softmax(fully_connected(fc7, [4096, 2], name="fc_class", activation=None, use_batchnorm=None), dim=-1)
111 |         # output shape [num_of_rois, 8]
112 |         bbox_regression = fully_connected(fc7, [4096, 8], name="fc_bbox", activation=None, use_batchnorm=None)
    |     # return both heads so a trainer can build the loss (added; assumed)
    |     return obj_class, bbox_regression
113 | 
114 | 
115 | class ExtendedLayer(object):
116 |     def __init__(self):
117 |         pass
118 | 
119 |     def build_model(self, input_layer, use_batchnorm=False, is_training=True, atrous=False, \
120 |                     rate=1, activation=tf.nn.relu, implement_atrous=False, lr_mult=1):
121 |         if implement_atrous:
122 |             if atrous:
123 |                 self.pool_5 = maxpool2d(input_layer, kernel=3, stride=1, name="pool5", padding="SAME")
124 |             else:
125 |                 self.pool_5 = maxpool2d(input_layer, kernel=2, stride=2, name="pool5", padding="SAME") #TODO: padding is valid or same
126 | 
127 |             kernel_size = 3
128 |             if atrous:
129 |                 rate *= 6
130 |                 # pad = int(((kernel_size + (rate - 1) * (kernel_size - 1)) - 1) / 2)
131 |                 self.conv_6 = convBNLayer(self.pool_5, use_batchnorm, is_training, 512, 1024, kernel_size, 1, \
132 |                                           name="conv_6", activation=tf.nn.relu, atrous=True,
rate=rate) 133 | else: 134 | rate *= 3 135 | # pad = int(((kernel_size + (rate - 1) * (kernel_size - 1)) - 1) / 2) 136 | self.conv_6 = convBNLayer(self.pool_5, use_batchnorm, is_training, 512, 1024, kernel_size, 1, \ 137 | name="conv_6", activation=tf.nn.relu, atrous=True, rate=rate) 138 | else: 139 | self.pool_5 = maxpool2d(input_layer, kernel=3, stride=1, name="pool5", padding="SAME") 140 | self.conv_6 = convBNLayer(self.pool_5, use_batchnorm, is_training, 512, 1024, 3, 1, \ 141 | name="conv_6", activation=tf.nn.relu, atrous=False, rate=rate) 142 | 143 | self.conv_7 = convBNLayer(self.conv_6, use_batchnorm, is_training, 1024, 1024, 1, 1, name="conv_7", activation=activation) 144 | self.conv_8_1 = convBNLayer(self.conv_7, use_batchnorm, is_training, 1024, 256, 1, 1, name="conv_8_1", activation=activation) 145 | self.conv_8_2 = convBNLayer(self.conv_8_1, use_batchnorm, is_training, 256, 512, 3, 2, name="conv_8_2", activation=activation) 146 | self.conv_9_1 = convBNLayer(self.conv_8_2, use_batchnorm, is_training, 512, 128, 1, 1, name="conv_9_1", activation=activation) 147 | self.conv_9_2 = convBNLayer(self.conv_9_1, use_batchnorm, is_training, 128, 256, 3, 2, name="conv_9_2", activation=activation) 148 | self.conv_10_1 = convBNLayer(self.conv_9_2, use_batchnorm, is_training, 256, 128, 1, 1, name="conv_10_1", activation=activation) 149 | self.conv_10_2 = convBNLayer(self.conv_10_1, use_batchnorm, is_training, 128, 256, 3, 1, name="conv_10_2", activation=activation, padding="VALID") 150 | self.conv_11_1 = convBNLayer(self.conv_10_2, use_batchnorm, is_training, 256, 128, 1, 1, name="conv_11_1", activation=activation) 151 | self.conv_11_2 = convBNLayer(self.conv_11_1, use_batchnorm, is_training, 128, 256, 3, 1, name="conv_11_2", activation=activation, padding="VALID") 152 | 153 | def ssd_model(sess, vggpath=None, image_shape=(300, 300), \ 154 | is_training=None, use_batchnorm=False, activation=tf.nn.relu, \ 155 | num_classes=0, normalization=[], atrous=False, rate=1, implement_atrous=False): 156 | """ 157 | 1. input RGB images and labels 158 | 2. edit images like [-1, image_shape[0], image_shape[1], 3] 159 | 3. Create Annotate Layer? 160 | 4. input x into Vgg16 architecture(pretrained) 161 | 5. 
162 | """ 163 | images = tf.placeholder(tf.float32, [None, image_shape[0], image_shape[1], 3]) 164 | vgg = Vgg(vgg16_npy_path=vggpath) 165 | vgg.build_model(images) 166 | 167 | with tf.variable_scope("extended_model") as scope: 168 | phase_train = tf.placeholder(tf.bool, name="phase_traing") if is_training else None 169 | extended_model = ExtendedLayer() 170 | extended_model.build_model(vgg.conv5_3, use_batchnorm=use_batchnorm, atrous=atrous, rate=rate, \ 171 | is_training=phase_train, activation=activation, lr_mult=1, implement_atrous=implement_atrous) 172 | 173 | # with tf.variable_scope("multibox_layer"): 174 | # from_layers = [vgg.conv4_3, extended_model.conv_7, extended_model.conv_8_2, 175 | # extended_model.conv_9_2, extended_model.conv_10_2, extended_model.conv_11_2] 176 | # multibox_layer = MultiboxLayer() 177 | # multibox_layer.build_model(from_layers, num_classes=0, normalization=normalization) 178 | # 179 | initialized_var = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="extended_model") 180 | sess.run(tf.variables_initializer(initialized_var)) 181 | 182 | return extended_model 183 | 184 | class MultiboxLayer(object): 185 | def __init__(self): 186 | pass 187 | 188 | # TODO: validate this is correct or not 189 | def l2_normalization(self, input_layer, scale=20): 190 | return tf.nn.l2_normalize(input_layer, dim) * scale 191 | 192 | def createMultiBoxHead(self, from_layers, num_classes=0, normalizations=[], \ 193 | use_batchnorm=False, is_training=None, activation=None, \ 194 | kernel_size=3, prior_boxes=[], kernel_sizes=[]): 195 | """ 196 | # Args: 197 | from_layers(list) : list of input layers 198 | num_classes(int) : num of label's classes that this architecture detects 199 | normalizations(list): list of scale for normalizations 200 | if value <= 0, not apply normalization to the specified layer 201 | """ 202 | assert num_classes > 0, "num of label's class must be positive number" 203 | if normalizations: 204 | assert len(from_layers) == len(normalizations), "from_layers and normalizations should have same length" 205 | 206 | num_list = len(from_layers) 207 | for index, kernel_size, layer, norm in zip(range(num_list), kernel_sizes, from_layers, normalizations): 208 | input_layer = layer 209 | with tf.variable_scope("layer" + str(index+1)): 210 | if norm > 0: 211 | scale = tf.get_variable("scale", trainable=True, initializer=tf.constant(norm))#initialize = norm 212 | input_layer = self.l2_normalization(input_layer, scale) 213 | 214 | # create location prediction layer 215 | loc_output_dim = 4 * prior_num # (center_x, center_y, width, height) 216 | location_layer = convBNLayer(input_layer, use_batchnorm, is_training, input_layer.get_shape()[0], loc_output_dim, kernel_size, 1, name="loc_layer", activation=activation) 217 | # from shape : (batch, from_kernel, from_kernel, loc_output_dim) 218 | # to : (batch, ) 219 | location_pred = tf.reshape(location_layer, [-1, ]) 220 | 221 | # create confidence prediction layer 222 | conf_output_dim = num_classes * prior_num 223 | confidence_layer = convBNLayer(input_layer, use_batchnorm, is_training, input_layer.get_shape()[0], conf_output_dim, kernel_size, 1, name="conf_layer", activation=activation) 224 | confidence_pred = tf.reshape(confidence_pred, [-1, ]) 225 | 226 | # Flatten each output 227 | 228 | # append result of each results 229 | 230 | return None 231 | 232 | if __name__ == '__main__': 233 | import sys 234 | import matplotlib.pyplot as plt 235 | from PIL import Image as im 236 | 
sys.path.append('/home/katou01/code/grid/DataAugmentation') 237 | # from resize import resize 238 | 239 | image_dir = "/home/katou01/download/training/image_2/*.png" 240 | label_dir = "/home/katou01/download/training/label_2/*.txt" 241 | get_Image_Roi_All(image_dir, label_dir, 80) 242 | # 243 | # image = im.open("./test_images/test1.jpg") 244 | # image = np.array(image, dtype=np.float32) 245 | # new_image = image[np.newaxis, :] 246 | # batch_image = np.vstack((new_image, new_image)) 247 | # batch_image = resize(batch_image, size=(300, 300)) 248 | # 249 | # with tf.Session() as sess: 250 | # model = ssd_model(sess, batch_image, activation=None, atrous=False, rate=1, implement_atrous=False) 251 | # print(vars(model)) 252 | # # tf.summary.scalar('model', model) 253 | -------------------------------------------------------------------------------- /util/network_util.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import numpy as np 5 | from base_vgg16 import Vgg16 6 | import tensorflow as tf 7 | 8 | def fully_connected(input_layer, shape, name="", is_training=True, use_batchnorm=True, activation=tf.nn.relu): 9 | with tf.variable_scope("fully" + name): 10 | kernel = tf.get_variable("weights", shape=shape, \ 11 | dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=0.01)) 12 | fully = tf.matmul(input_layer, kernel) 13 | if activation: 14 | fully = activation(fully) 15 | if use_batchnorm: 16 | fully = batch_norm(fully, is_training) 17 | return fully 18 | 19 | def vgg_fully(input_layer, shape, name="", activation=tf.nn.relu): 20 | with tf.variable_scope(name): 21 | kernel = tf.get_variable("weights", shape=shape, \ 22 | dtype=tf.float32, initializer=tf.truncated_normal_initializer(stddev=0.01)) 23 | b = tf.get_variable("biases", shape=[shape[1]], initializer=tf.constant_initializer(0.0)) 24 | fully = tf.matmul(input_layer, kernel) 25 | fully = tf.nn.bias_add(fully, b) 26 | if activation: 27 | fully = activation(fully) 28 | return fully 29 | 30 | def batch_norm(inputs, phase_train, decay=0.9, eps=1e-5): 31 | """Batch Normalization 32 | 33 | Args: 34 | inputs: input data(Batch size) from last layer 35 | phase_train: when you test, please set phase_train "None" 36 | Returns: 37 | output for next layer 38 | """ 39 | gamma = tf.get_variable("gamma", shape=inputs.get_shape()[-1], dtype=tf.float32, initializer=tf.constant_initializer(1.0)) 40 | beta = tf.get_variable("beta", shape=inputs.get_shape()[-1], dtype=tf.float32, initializer=tf.constant_initializer(0.0)) 41 | pop_mean = tf.get_variable("pop_mean", trainable=False, shape=inputs.get_shape()[-1], dtype=tf.float32, initializer=tf.constant_initializer(0.0)) 42 | pop_var = tf.get_variable("pop_var", trainable=False, shape=inputs.get_shape()[-1], dtype=tf.float32, initializer=tf.constant_initializer(1.0)) 43 | axes = range(len(inputs.get_shape()) - 1) 44 | 45 | if phase_train != None: 46 | batch_mean, batch_var = tf.nn.moments(inputs, axes) 47 | train_mean = tf.assign(pop_mean, pop_mean * decay + batch_mean*(1 - decay)) 48 | train_var = tf.assign(pop_var, pop_var * decay + batch_var * (1 - decay)) 49 | with tf.control_dependencies([train_mean, train_var]): 50 | return tf.nn.batch_normalization(inputs, batch_mean, batch_var, beta, gamma, eps) 51 | else: 52 | return tf.nn.batch_normalization(inputs, pop_mean, pop_var, beta, gamma, eps) 53 | 54 | def convBNLayer(input_layer, use_batchnorm, is_training, input_dim, output_dim, \ 55 | kernel_size, stride, 
activation=tf.nn.relu, padding="SAME", name=""): 56 | with tf.variable_scope("convBN" + name): 57 | w = tf.get_variable("weights", \ 58 | shape=[kernel_size, kernel_size, input_dim, output_dim], initializer=tf.contrib.layers.xavier_initializer()) 59 | 60 | conv = tf.nn.conv2d(input_layer, w, strides=[1, stride, stride, 1], padding=padding) 61 | 62 | if use_batchnorm: 63 | if activation != None: 64 | conv = activation(conv, name="activation") 65 | bn = batch_norm(conv, is_training) 66 | return bn 67 | 68 | b = tf.get_variable("bias", shape=[output_dim], initializer=tf.constant_initializer(0.0)) 69 | bias = tf.nn.bias_add(conv, b) 70 | if activation is not None: 71 | return activation(bias, name="activation") 72 | return bias 73 | 74 | 75 | def get_fc_weight(self, name): 76 | return tf.Variable(self.data_dict[name][0], name="weights") 77 | 78 | def convLayer(input_layer, input_dim, output_dim, \ 79 | kernel_size, stride, activation=tf.nn.relu, padding="SAME", name=""): 80 | with tf.variable_scope(name): 81 | w = tf.get_variable("filter", \ 82 | shape=[kernel_size, kernel_size, input_dim, output_dim], initializer=tf.contrib.layers.xavier_initializer()) 83 | 84 | conv = tf.nn.conv2d(input_layer, w, strides=[1, stride, stride, 1], padding=padding) 85 | 86 | b = tf.get_variable("biases", shape=[output_dim], initializer=tf.constant_initializer(0.0)) 87 | bias = tf.nn.bias_add(conv, b) 88 | if activation is not None: 89 | return activation(bias, name="activation") 90 | return bias 91 | 92 | def maxpool2d(x, kernel=2, stride=2, name="", padding="SAME"): 93 | """define max pooling layer""" 94 | with tf.variable_scope(name): 95 | return tf.nn.max_pool( 96 | x, 97 | ksize = [1, kernel, kernel, 1], 98 | strides = [1, stride, stride, 1], 99 | padding=padding) 100 | -------------------------------------------------------------------------------- /util/parse_xml.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | parse XML files containing tracklet info for kitti data base (raw data section) 4 | (http://cvlibs.net/datasets/kitti/raw_data.php) 5 | 6 | No guarantees that this code is correct, usage is at your own risk! 7 | 8 | created by Christian Herdtweck, Max Planck Institute for Biological Cybernetics 9 | (christian.herdtweck@tuebingen.mpg.de) 10 | 11 | requires numpy! 
12 | 13 | example usage: 14 | import parseTrackletXML as xmlParser 15 | kittiDir = '/path/to/kitti/data' 16 | drive = '2011_09_26_drive_0001' 17 | xmlParser.example(kittiDir, drive) 18 | or simply on command line: 19 | python parseTrackletXML.py 20 | """ 21 | 22 | # Version History: 23 | # 4/7/12 Christian Herdtweck: seems to work with a few random test xml tracklet files; 24 | # converts file contents to ElementTree and then to list of Tracklet objects; 25 | # Tracklet objects have str and iter functions 26 | # 5/7/12 ch: added constants for state, occlusion, truncation and added consistency checks 27 | # 30/1/14 ch: create example function from example code 28 | 29 | from sys import argv as cmdLineArgs 30 | from xml.etree.ElementTree import ElementTree 31 | import numpy as np 32 | import itertools 33 | from warnings import warn 34 | 35 | STATE_UNSET = 0 36 | STATE_INTERP = 1 37 | STATE_LABELED = 2 38 | stateFromText = {'0':STATE_UNSET, '1':STATE_INTERP, '2':STATE_LABELED} 39 | 40 | OCC_UNSET = 255 # -1 as uint8 41 | OCC_VISIBLE = 0 42 | OCC_PARTLY = 1 43 | OCC_FULLY = 2 44 | occFromText = {'-1':OCC_UNSET, '0':OCC_VISIBLE, '1':OCC_PARTLY, '2':OCC_FULLY} 45 | 46 | TRUNC_UNSET = 255 # -1 as uint8, but in xml files the value '99' is used! 47 | TRUNC_IN_IMAGE = 0 48 | TRUNC_TRUNCATED = 1 49 | TRUNC_OUT_IMAGE = 2 50 | TRUNC_BEHIND_IMAGE = 3 51 | truncFromText = {'99':TRUNC_UNSET, '0':TRUNC_IN_IMAGE, '1':TRUNC_TRUNCATED, \ 52 | '2':TRUNC_OUT_IMAGE, '3': TRUNC_BEHIND_IMAGE} 53 | 54 | 55 | class Tracklet(object): 56 | """ representation an annotated object track 57 | 58 | Tracklets are created in function parseXML and can most conveniently used as follows: 59 | 60 | for trackletObj in parseXML(trackletFile): 61 | for translation, rotation, state, occlusion, truncation, amtOcclusion, amtBorders, absoluteFrameNumber in trackletObj: 62 | your code here 63 | #end: for all frames 64 | #end: for all tracklets 65 | 66 | absoluteFrameNumber is in range [firstFrame, firstFrame+nFrames[ 67 | amtOcclusion and amtBorders could be None 68 | 69 | You can of course also directly access the fields objType (string), size (len-3 ndarray), firstFrame/nFrames (int), 70 | trans/rots (nFrames x 3 float ndarrays), states/truncs (len-nFrames uint8 ndarrays), occs (nFrames x 2 uint8 ndarray), 71 | and for some tracklets amtOccs (nFrames x 2 float ndarray) and amtBorders (nFrames x 3 float ndarray). 
The last two 72 | can be None if the xml file did not include these fields in poses 73 | """ 74 | 75 | objectType = None 76 | size = None # len-3 float array: (height, width, length) 77 | firstFrame = None 78 | trans = None # n x 3 float array (x,y,z) 79 | rots = None # n x 3 float array (x,y,z) 80 | states = None # len-n uint8 array of states 81 | occs = None # n x 2 uint8 array (occlusion, occlusion_kf) 82 | truncs = None # len-n uint8 array of truncation 83 | amtOccs = None # None or (n x 2) float array (amt_occlusion, amt_occlusion_kf) 84 | amtBorders = None # None (n x 3) float array (amt_border_l / _r / _kf) 85 | nFrames = None 86 | 87 | def __init__(self): 88 | """create Tracklet with no info set """ 89 | self.size = np.nan*np.ones(3, dtype=float) 90 | 91 | def __str__(self): 92 | """ return human-readable string representation of tracklet object 93 | 94 | called implicitly in 95 | print trackletObj 96 | or in 97 | text = str(trackletObj) 98 | """ 99 | return '[Tracklet over {0} frames for {1}]'.format(self.nFrames, self.objectType) 100 | 101 | def __iter__(self): 102 | """ returns an iterator that yields tuple of all the available data for each frame 103 | 104 | called whenever code iterates over a tracklet object, e.g. in 105 | for translation, rotation, state, occlusion, truncation, amtOcclusion, amtBorders, absoluteFrameNumber in trackletObj: 106 | ...do something ... 107 | or 108 | trackDataIter = iter(trackletObj) 109 | """ 110 | if self.amtOccs is None: 111 | return itertools.izip(self.trans, self.rots, self.states, self.occs, self.truncs, \ 112 | itertools.repeat(None), itertools.repeat(None), xrange(self.firstFrame, self.firstFrame+self.nFrames)) 113 | else: 114 | return itertools.izip(self.trans, self.rots, self.states, self.occs, self.truncs, \ 115 | self.amtOccs, self.amtBorders, xrange(self.firstFrame, self.firstFrame+self.nFrames)) 116 | #end: class Tracklet 117 | 118 | 119 | def parseXML(trackletFile): 120 | r""" parse tracklet xml file and convert results to list of Tracklet objects 121 | 122 | :param trackletFile: name of a tracklet xml file 123 | :returns: list of Tracklet objects read from xml file 124 | """ 125 | 126 | # convert tracklet XML data to a tree structure 127 | eTree = ElementTree() 128 | print 'parsing tracklet file', trackletFile 129 | with open(trackletFile) as f: 130 | eTree.parse(f) 131 | 132 | # now convert output to list of Tracklet objects 133 | trackletsElem = eTree.find('tracklets') 134 | tracklets = [] 135 | trackletIdx = 0 136 | nTracklets = None 137 | for trackletElem in trackletsElem: 138 | #print 'track:', trackletElem.tag 139 | if trackletElem.tag == 'count': 140 | nTracklets = int(trackletElem.text) 141 | print 'file contains', nTracklets, 'tracklets' 142 | elif trackletElem.tag == 'item_version': 143 | pass 144 | elif trackletElem.tag == 'item': 145 | #print 'tracklet {0} of {1}'.format(trackletIdx, nTracklets) 146 | # a tracklet 147 | newTrack = Tracklet() 148 | isFinished = False 149 | hasAmt = False 150 | frameIdx = None 151 | for info in trackletElem: 152 | #print 'trackInfo:', info.tag 153 | if isFinished: 154 | raise ValueError('more info on element after finished!') 155 | if info.tag == 'objectType': 156 | newTrack.objectType = info.text 157 | elif info.tag == 'h': 158 | newTrack.size[0] = float(info.text) 159 | elif info.tag == 'w': 160 | newTrack.size[1] = float(info.text) 161 | elif info.tag == 'l': 162 | newTrack.size[2] = float(info.text) 163 | elif info.tag == 'first_frame': 164 | newTrack.firstFrame = int(info.text) 165 
| elif info.tag == 'poses': 166 | # this info is the possibly long list of poses 167 | for pose in info: 168 | #print 'trackInfoPose:', pose.tag 169 | if pose.tag == 'count': # this should come before the others 170 | if newTrack.nFrames is not None: 171 | raise ValueError('there are several pose lists for a single track!') 172 | elif frameIdx is not None: 173 | raise ValueError('?!') 174 | newTrack.nFrames = int(pose.text) 175 | newTrack.trans = np.nan * np.ones((newTrack.nFrames, 3), dtype=float) 176 | newTrack.rots = np.nan * np.ones((newTrack.nFrames, 3), dtype=float) 177 | newTrack.states = np.nan * np.ones(newTrack.nFrames, dtype='uint8') 178 | newTrack.occs = np.nan * np.ones((newTrack.nFrames, 2), dtype='uint8') 179 | newTrack.truncs = np.nan * np.ones(newTrack.nFrames, dtype='uint8') 180 | newTrack.amtOccs = np.nan * np.ones((newTrack.nFrames, 2), dtype=float) 181 | newTrack.amtBorders = np.nan * np.ones((newTrack.nFrames, 3), dtype=float) 182 | frameIdx = 0 183 | elif pose.tag == 'item_version': 184 | pass 185 | elif pose.tag == 'item': 186 | # pose in one frame 187 | if frameIdx is None: 188 | raise ValueError('pose item came before number of poses!') 189 | for poseInfo in pose: 190 | #print 'trackInfoPoseInfo:', poseInfo.tag 191 | if poseInfo.tag == 'tx': 192 | newTrack.trans[frameIdx, 0] = float(poseInfo.text) 193 | elif poseInfo.tag == 'ty': 194 | newTrack.trans[frameIdx, 1] = float(poseInfo.text) 195 | elif poseInfo.tag == 'tz': 196 | newTrack.trans[frameIdx, 2] = float(poseInfo.text) 197 | elif poseInfo.tag == 'rx': 198 | newTrack.rots[frameIdx, 0] = float(poseInfo.text) 199 | elif poseInfo.tag == 'ry': 200 | newTrack.rots[frameIdx, 1] = float(poseInfo.text) 201 | elif poseInfo.tag == 'rz': 202 | newTrack.rots[frameIdx, 2] = float(poseInfo.text) 203 | elif poseInfo.tag == 'state': 204 | newTrack.states[frameIdx] = stateFromText[poseInfo.text] 205 | elif poseInfo.tag == 'occlusion': 206 | newTrack.occs[frameIdx, 0] = occFromText[poseInfo.text] 207 | elif poseInfo.tag == 'occlusion_kf': 208 | newTrack.occs[frameIdx, 1] = occFromText[poseInfo.text] 209 | elif poseInfo.tag == 'truncation': 210 | newTrack.truncs[frameIdx] = truncFromText[poseInfo.text] 211 | elif poseInfo.tag == 'amt_occlusion': 212 | newTrack.amtOccs[frameIdx,0] = float(poseInfo.text) 213 | hasAmt = True 214 | elif poseInfo.tag == 'amt_occlusion_kf': 215 | newTrack.amtOccs[frameIdx,1] = float(poseInfo.text) 216 | hasAmt = True 217 | elif poseInfo.tag == 'amt_border_l': 218 | newTrack.amtBorders[frameIdx,0] = float(poseInfo.text) 219 | hasAmt = True 220 | elif poseInfo.tag == 'amt_border_r': 221 | newTrack.amtBorders[frameIdx,1] = float(poseInfo.text) 222 | hasAmt = True 223 | elif poseInfo.tag == 'amt_border_kf': 224 | newTrack.amtBorders[frameIdx,2] = float(poseInfo.text) 225 | hasAmt = True 226 | else: 227 | raise ValueError('unexpected tag in poses item: {0}!'.format(poseInfo.tag)) 228 | frameIdx += 1 229 | else: 230 | raise ValueError('unexpected pose info: {0}!'.format(pose.tag)) 231 | elif info.tag == 'finished': 232 | isFinished = True 233 | else: 234 | raise ValueError('unexpected tag in tracklets: {0}!'.format(info.tag)) 235 | #end: for all fields in current tracklet 236 | 237 | # some final consistency checks on new tracklet 238 | if not isFinished: 239 | warn('tracklet {0} was not finished!'.format(trackletIdx)) 240 | if newTrack.nFrames is None: 241 | warn('tracklet {0} contains no information!'.format(trackletIdx)) 242 | elif frameIdx != newTrack.nFrames: 243 | warn('tracklet {0} is supposed to 
have {1} frames, but perser found {1}!'.format(\ 244 | trackletIdx, newTrack.nFrames, frameIdx)) 245 | if np.abs(newTrack.rots[:,:2]).sum() > 1e-16: 246 | warn('track contains rotation other than yaw!') 247 | 248 | # if amtOccs / amtBorders are not set, set them to None 249 | if not hasAmt: 250 | newTrack.amtOccs = None 251 | newTrack.amtBorders = None 252 | 253 | # add new tracklet to list 254 | tracklets.append(newTrack) 255 | trackletIdx += 1 256 | 257 | else: 258 | raise ValueError('unexpected tracklet info') 259 | #end: for tracklet list items 260 | 261 | print 'loaded', trackletIdx, 'tracklets' 262 | 263 | # final consistency check 264 | if trackletIdx != nTracklets: 265 | warn('according to xml information the file has {0} tracklets, but parser found {1}!'.format(nTracklets, trackletIdx)) 266 | 267 | return tracklets 268 | #end: function parseXML 269 | 270 | 271 | def example(kittiDir=None, drive=None): 272 | 273 | from os.path import join, expanduser 274 | import readline # makes raw_input behave more fancy 275 | # from xmlParser import parseXML, TRUNC_IN_IMAGE, TRUNC_TRUNCATED 276 | 277 | DEFAULT_DRIVE = '2011_09_26_drive_0001' 278 | twoPi = 2.*np.pi 279 | 280 | # get dir names 281 | if kittiDir is None: 282 | kittiDir = expanduser(raw_input('please enter kitti base dir (e.g. ~/path/to/kitti): ').strip()) 283 | if drive is None: 284 | drive = raw_input('please enter drive name (default {0}): '.format(DEFAULT_DRIVE)).strip() 285 | if len(drive) == 0: 286 | drive = DEFAULT_DRIVE 287 | 288 | # read tracklets from file 289 | myTrackletFile = join(kittiDir, drive, 'tracklet_labels.xml') 290 | tracklets = parseXML(myTrackletFile) 291 | 292 | # loop over tracklets 293 | for iTracklet, tracklet in enumerate(tracklets): 294 | print 'tracklet {0: 3d}: {1}'.format(iTracklet, tracklet) 295 | 296 | # this part is inspired by kitti object development kit matlab code: computeBox3D 297 | h,w,l = tracklet.size 298 | trackletBox = np.array([ # in velodyne coordinates around zero point and without orientation yet\ 299 | [-l/2, -l/2, l/2, l/2, -l/2, -l/2, l/2, l/2], \ 300 | [ w/2, -w/2, -w/2, w/2, w/2, -w/2, -w/2, w/2], \ 301 | [ 0.0, 0.0, 0.0, 0.0, h, h, h, h]]) 302 | 303 | # loop over all data in tracklet 304 | for translation, rotation, state, occlusion, truncation, amtOcclusion, amtBorders, absoluteFrameNumber \ 305 | in tracklet: 306 | 307 | # determine if object is in the image; otherwise continue 308 | if truncation not in (TRUNC_IN_IMAGE, TRUNC_TRUNCATED): 309 | continue 310 | 311 | # re-create 3D bounding box in velodyne coordinate system 312 | yaw = rotation[2] # other rotations are 0 in all xml files I checked 313 | assert np.abs(rotation[:2]).sum() == 0, 'object rotations other than yaw given!' 314 | rotMat = np.array([\ 315 | [np.cos(yaw), -np.sin(yaw), 0.0], \ 316 | [np.sin(yaw), np.cos(yaw), 0.0], \ 317 | [ 0.0, 0.0, 1.0]]) 318 | cornerPosInVelo = np.dot(rotMat, trackletBox) + np.tile(translation, (8,1)).T 319 | 320 | # calc yaw as seen from the camera (i.e. 0 degree = facing away from cam), as opposed to 321 | # car-centered yaw (i.e. 0 degree = same orientation as car). 322 | # makes quite a difference for objects in periphery! 
323 | # Result is in [0, 2pi] 324 | x, y, z = translation 325 | yawVisual = ( yaw - np.arctan2(y, x) ) % twoPi 326 | 327 | #end: for all frames in track 328 | #end: for all tracks 329 | #end: function example 330 | 331 | # when somebody runs this file as a script: 332 | # run example if no arg or only 'example' was given as arg 333 | # otherwise run parseXML 334 | if __name__ == "__main__": 335 | # cmdLineArgs[0] is 'parseTrackletXML.py' 336 | if len(cmdLineArgs) < 2: 337 | example() 338 | elif (len(cmdLineArgs) == 2) and (cmdLineArgs[1] == 'example'): 339 | example() 340 | else: 341 | parseXML(*cmdLineArgs[1:]) 342 | 343 | # (created using vim - the world's best text editor) 344 | 345 | -------------------------------------------------------------------------------- /util/train.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yukitsuji/Faster_RCNN_tensorflow/765c729eaf03cb401ad308a289ec7d8c2bfca474/util/train.py --------------------------------------------------------------------------------
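
The training entry point `util/train.py` is only linked above, not included. As a rough, assumed sketch (not the contents of that file), this is how the KITTI input helpers defined in `util/input_kitti.py` could be driven to produce network inputs; the paths and batch size are placeholders:

```
# Hypothetical usage sketch of the KITTI input pipeline (paths are placeholders).
from input_kitti import get_pathlist, generator__Image_and_label

image_dir = "/path/to/KITTI/training/image_2/*.png"
label_dir = "/path/to/KITTI/training/label_2/*.txt"

image_paths, label_paths = get_pathlist(image_dir, label_dir)
for batch_imgs, batch_g_bboxes in generator__Image_and_label(image_paths, label_paths, batch_size=2):
    # batch_imgs: mean-subtracted, rescaled images (see the preprocessing helper in util/data_util.py)
    # batch_g_bboxes: per-image ndarray of [left, top, right, bottom] boxes for "Car"/"Van" objects
    # a training step (for example the RPN trainer in rpn/rpn.py) would consume these here
    pass
```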