├── keras_frcnn ├── __init__.py ├── config.py ├── losses.py ├── data_augment.py ├── simple_parser.py ├── pascal_voc_parser.py ├── FixedBatchNormalization.py ├── roi_helpers.py ├── RoiPoolingConv.py ├── resnet.py └── data_generators.py ├── requirements.txt ├── .gitignore ├── README.md ├── train_frcnn.py ├── test_frcnn.py └── LICENSE /keras_frcnn/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | h5py==2.6.0 2 | Keras==1.2.1 3 | numpy==1.12.0 4 | cv2==1.0 5 | -------------------------------------------------------------------------------- /keras_frcnn/config.py: -------------------------------------------------------------------------------- 1 | from keras import backend as K 2 | 3 | class Config: 4 | 5 | def __init__(self): 6 | # setting for data augmentation 7 | self.use_horizontal_flips = True 8 | self.use_vertical_flips = False 9 | self.scale_augment = False 10 | self.random_rotate = False 11 | self.random_rotate_scale = 15. 12 | 13 | # anchor box scales 14 | self.anchor_box_scales = [128, 256, 512] 15 | 16 | # anchor box ratios 17 | self.anchor_box_ratios = [[1, 1], [1, 2], [2, 1]] 18 | 19 | # size to resize the smallest side of the image 20 | self.im_size = 600 21 | 22 | # number of ROIs at once 23 | self.num_rois = 2 24 | 25 | # stride at the RPN (this depends on the network configuration) 26 | self.rpn_stride = 16 27 | 28 | self.balanced_classes = True 29 | 30 | #location of pretrained weights for the base network 31 | if K.image_dim_ordering() == 'th': 32 | self.base_net_weights = 'resnet50_weights_th_dim_ordering_th_kernels_notop.h5' 33 | else: 34 | self.base_net_weights = 'resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5' 35 | 36 | 37 | self.model_path = 'model_frcnn.hdf5' 38 | -------------------------------------------------------------------------------- /keras_frcnn/losses.py: -------------------------------------------------------------------------------- 1 | from keras import backend as K 2 | from keras.objectives import categorical_crossentropy 3 | 4 | 5 | lambda_rpn_regr = 10.0 6 | lambda_rpn_class = 1.0 7 | 8 | lambda_cls_regr = 10.0 9 | lambda_cls_class = 1.0 10 | 11 | 12 | def rpn_loss_regr(num_anchors): 13 | def rpn_loss_regr_fixed_num(y_true, y_pred): 14 | x = y_true[:, 4 * num_anchors:, :, :] - y_pred 15 | x_abs = K.abs(x) 16 | x_bool = K.lesser_equal(x_abs, 1.0) 17 | return lambda_rpn_regr * K.sum( 18 | y_true[:, :4 * num_anchors, :, :] * (x_bool * (0.5 * x * x) + (1 - x_bool) * (x_abs - 0.5))) / 256. 
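# The return expression above is the smooth-L1 (Huber-style) regression loss used by Faster R-CNN.
# y_true[:, :4 * num_anchors] multiplies the loss and so acts as a per-anchor mask selecting which
# anchors contribute, while y_true[:, 4 * num_anchors:] holds the regression targets, so with
# x = target - prediction the per-element loss is:
#   smooth_L1(x) = 0.5 * x^2     if |x| <= 1
#                  |x| - 0.5     otherwise
# For example, x = 0.5 falls on the quadratic branch and costs 0.125, while x = 2.0 falls on the
# linear branch and costs 1.5, so large errors grow only linearly. The division by 256 presumably
# normalises by the number of anchors sampled per image, and lambda_rpn_regr weights this term
# relative to the classification loss.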
19 | return rpn_loss_regr_fixed_num 20 | 21 | 22 | def rpn_loss_cls(num_anchors): 23 | def rpn_loss_cls_fixed_num(y_true, y_pred): 24 | return lambda_rpn_class * K.sum(y_true[:, :num_anchors, :, :] * K.binary_crossentropy(y_pred[:, :, :, :], y_true[:, num_anchors:, :, :])) / 256.0 25 | return rpn_loss_cls_fixed_num 26 | 27 | 28 | def class_loss_regr(num_rois): 29 | def class_loss_regr_fixed_num(y_true, y_pred): 30 | x = y_true[:, :, 4:] - y_pred 31 | x_abs = K.abs(x) 32 | x_bool = K.lesser_equal(x_abs, 1.0) 33 | return lambda_cls_regr * K.sum(y_true[:, :, :4] * (x_bool * (0.5 * x * x) + (1 - x_bool) * (x_abs - 0.5))) / num_rois 34 | return class_loss_regr_fixed_num 35 | 36 | 37 | def class_loss_cls(y_true, y_pred): 38 | return lambda_cls_class * categorical_crossentropy(y_true, y_pred) 39 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # keras-frcnn 2 | Keras implementation of Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks 3 | 4 | CURRENT STATUS: 5 | - only resnet50 architecture is currently supported 6 | - weights for theano backend coming shortly 7 | 8 | USAGE: 9 | - train_frcnn.py can be used to train a model. To train on Pascal VOC data, simply do: 10 | python train_frcnn.py /path/to/pascalvoc/ 11 | - the Pascal VOC data set (images and annotations for bounding boxes around the classified objects) can be obtained from: http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar 12 | 13 | - simple_parser.py provides an alternative way to input data, using a text file. 
Simply provide a text file, with each 14 | line containing: 15 | 16 | `filepath,x1,y1,x2,y2,class_name` 17 | 18 | For example: 19 | 20 | /data/imgs/img_001.jpg,837,346,981,456,cow 21 | /data/imgs/img_002.jpg,215,312,279,391,cat 22 | 23 | - test_frcnn.py can be used to perform inference, given pretrained weights. Specify a path to the folder containing 24 | images: 25 | python test_frcnn.py /path/to/imgs/ 26 | 27 | NOTES: 28 | config.py contains all settings for the train or test run. The default settings match those in the original Faster-RCNN 29 | paper. The anchor box sizes are [128, 256, 512] and the ratios are [1:1, 1:2, 2:1]. 30 | 31 | Example output: 32 | 33 | ![ex1](http://i.imgur.com/UtGXhtd.jpg) 34 | ![ex2](http://i.imgur.com/Szf78o2.jpg) 35 | ![ex3](http://i.imgur.com/OjVXTbn.jpg) 36 | ![ex4](http://i.imgur.com/9Fbe2Ow.jpg) 37 | -------------------------------------------------------------------------------- /keras_frcnn/data_augment.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import copy 4 | 5 | def augment(img_data, config, augment=True): 6 | assert 'filepath' in img_data 7 | assert 'bboxes' in img_data 8 | assert 'width' in img_data 9 | assert 'height' in img_data 10 | 11 | img_data_aug = copy.deepcopy(img_data) 12 | 13 | img = cv2.imread(img_data_aug['filepath']) 14 | 15 | if augment: 16 | rows, cols = img.shape[:2] 17 | 18 | if config.use_horizontal_flips and np.random.randint(0, 2) == 0: 19 | img = cv2.flip(img, 1) 20 | for bbox in img_data_aug['bboxes']: 21 | x1 = bbox['x1'] 22 | x2 = bbox['x2'] 23 | bbox['x2'] = cols - x1 24 | bbox['x1'] = cols - x2 25 | 26 | if config.use_vertical_flips and np.random.randint(0, 2) == 0: 27 | img = cv2.flip(img, 0) 28 | for bbox in img_data_aug['bboxes']: 29 | y1 = bbox['y1'] 30 | y2 = bbox['y2'] 31 | bbox['y2'] = rows - y1 32 | bbox['y1'] = rows - y2 33 | 34 | 35 | if config.random_rotate: 36 | M = cv2.getRotationMatrix2D((cols/2, rows/2), np.random.randint(-config.random_rotate_scale, config.random_rotate_scale), 1) 37 | img = cv2.warpAffine(img, M, (cols, rows), flags=cv2.INTER_CUBIC, borderMode= cv2.BORDER_REPLICATE) 38 | for bbox in img_data_aug['bboxes']: 39 | K = np.array([[bbox['x1'],bbox['y1']],[bbox['x2'],bbox['y2']],[bbox['x1'],bbox['y2']],[bbox['x2'],bbox['y1']]]) 40 | K = cv2.transform(K.reshape(4,1,2),M)[:,0,:] 41 | 42 | (x1, y1) = np.min(K, axis=0) 43 | (x2, y2) = np.max(K, axis=0) 44 | 45 | bbox['x1'] = x1 46 | bbox['x2'] = x2 47 | bbox['y1'] = y1 48 | bbox['y2'] = y2 49 | 50 | return img_data_aug, img 51 | -------------------------------------------------------------------------------- /keras_frcnn/simple_parser.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | 4 | def get_data(input_path): 5 | found_bg = False 6 | all_imgs = {} 7 | 8 | classes_count = {} 9 | 10 | class_mapping = {} 11 | 12 | visualise = True 13 | 14 | with open(input_path,'r') as f: 15 | 16 | print('Parsing annotation files') 17 | 18 | for line in f: 19 | line_split = line.strip().split(',') 20 | (filename,x1,y1,x2,y2,class_name) = line_split 21 | 22 | if class_name not in classes_count: 23 | classes_count[class_name] = 1 24 | else: 25 | classes_count[class_name] += 1 26 | 27 | if class_name not in class_mapping: 28 | if class_name == 'bg' and found_bg == False: 29 | print('Found class name with special name bg. 
Will be treated as a background region (this is usually for hard negative mining).') 30 | found_bg = True 31 | class_mapping[class_name] = len(class_mapping) 32 | 33 | if filename not in all_imgs: 34 | all_imgs[filename] = {} 35 | 36 | img = cv2.imread(filename) 37 | (rows,cols) = img.shape[:2] 38 | all_imgs[filename]['filepath'] = filename 39 | all_imgs[filename]['width'] = cols 40 | all_imgs[filename]['height'] = rows 41 | all_imgs[filename]['bboxes'] = [] 42 | if np.random.randint(0,6) > 0: 43 | all_imgs[filename]['imageset'] = 'trainval' 44 | else: 45 | all_imgs[filename]['imageset'] = 'test' 46 | 47 | all_imgs[filename]['bboxes'].append({'class': class_name, 'x1': int(x1), 'x2': int(x2), 'y1': int(y1), 'y2': int(y2)}) 48 | 49 | 50 | all_data = [] 51 | for key in all_imgs: 52 | all_data.append(all_imgs[key]) 53 | 54 | # make sure the bg class is last in the list 55 | if found_bg: 56 | if class_mapping['bg'] != len(class_mapping) - 1: 57 | key_to_switch = [key for key in class_mapping.keys() if class_mapping[key] == len(class_mapping)-1][0] 58 | val_to_switch = class_mapping['bg'] 59 | class_mapping['bg'] = len(class_mapping) - 1 60 | class_mapping[key_to_switch] = val_to_switch 61 | 62 | return all_data,classes_count,class_mapping 63 | 64 | 65 | -------------------------------------------------------------------------------- /keras_frcnn/pascal_voc_parser.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import xml.etree.ElementTree as ET 4 | 5 | def get_data(input_path): 6 | all_imgs = [] 7 | 8 | classes_count = {} 9 | 10 | class_mapping = {} 11 | 12 | visualise = False 13 | 14 | data_paths = [os.path.join(input_path,s) for s in ['VOC2007', 'VOC2012']] 15 | 16 | 17 | print('Parsing annotation files') 18 | 19 | for data_path in data_paths: 20 | 21 | annot_path = os.path.join(data_path, 'Annotations') 22 | imgs_path = os.path.join(data_path, 'JPEGImages') 23 | imgsets_path_trainval = os.path.join(data_path, 'ImageSets','Main','trainval.txt') 24 | imgsets_path_test = os.path.join(data_path, 'ImageSets','Main','test.txt') 25 | 26 | trainval_files = [] 27 | test_files = [] 28 | try: 29 | with open(imgsets_path_trainval) as f: 30 | for line in f: 31 | trainval_files.append(line.strip() + '.jpg') 32 | with open(imgsets_path_test) as f: 33 | for line in f: 34 | test_files.append(line.strip() + '.jpg') 35 | except Exception as e: 36 | print(e) 37 | 38 | 39 | annots = [os.path.join(annot_path, s) for s in os.listdir(annot_path)] 40 | idx = 0 41 | for annot in annots: 42 | try: 43 | idx += 1 44 | 45 | et = ET.parse(annot) 46 | element = et.getroot() 47 | 48 | element_objs = element.findall('object') 49 | element_filename = element.find('filename').text 50 | element_width = int(element.find('size').find('width').text) 51 | element_height = int(element.find('size').find('height').text) 52 | 53 | if len(element_objs) > 0: 54 | annotation_data = {'filepath': os.path.join(imgs_path, element_filename), 'width': element_width, 55 | 'height': element_height, 'bboxes': []} 56 | if element_filename in trainval_files: 57 | annotation_data['imageset'] = 'trainval' 58 | elif element_filename in test_files: 59 | annotation_data['imageset'] = 'test' 60 | else: 61 | annotation_data['imageset'] = 'test' 62 | 63 | for element_obj in element_objs: 64 | class_name = element_obj.find('name').text 65 | if class_name not in classes_count: 66 | classes_count[class_name] = 1 67 | else: 68 | classes_count[class_name] += 1 69 | 70 | if class_name not in 
class_mapping: 71 | class_mapping[class_name] = len(class_mapping) 72 | 73 | obj_bbox = element_obj.find('bndbox') 74 | x1 = int(round(float(obj_bbox.find('xmin').text))) 75 | y1 = int(round(float(obj_bbox.find('ymin').text))) 76 | x2 = int(round(float(obj_bbox.find('xmax').text))) 77 | y2 = int(round(float(obj_bbox.find('ymax').text))) 78 | annotation_data['bboxes'].append( 79 | {'class': class_name, 'x1': x1, 'x2': x2, 'y1': y1, 'y2': y2}) 80 | 81 | all_imgs.append(annotation_data) 82 | 83 | if visualise: 84 | img = cv2.imread(annotation_data['filepath']) 85 | for bbox in annotation_data['bboxes']: 86 | cv2.rectangle(img, (bbox['x1'], bbox['y1']), (bbox[ 87 | 'x2'], bbox['y2']), (0, 0, 255)) 88 | cv2.imshow('img', img) 89 | cv2.waitKey(0) 90 | 91 | except Exception as e: 92 | print(e) 93 | continue 94 | return all_imgs,classes_count,class_mapping 95 | -------------------------------------------------------------------------------- /train_frcnn.py: -------------------------------------------------------------------------------- 1 | import random 2 | import pprint 3 | import sys 4 | import json 5 | from keras_frcnn import config 6 | 7 | sys.setrecursionlimit(40000) 8 | 9 | C = config.Config() 10 | C.num_rois = 8 11 | 12 | 13 | 14 | from keras_frcnn.pascal_voc_parser import get_data 15 | #from keras_frcnn.simple_parser import get_data 16 | 17 | all_imgs,classes_count,class_mapping = get_data(sys.argv[1]) 18 | 19 | 20 | if 'bg' not in classes_count: 21 | classes_count['bg'] = 0 22 | class_mapping['bg'] = len(class_mapping) 23 | 24 | with open('classes.json', 'w') as class_data_json: 25 | json.dump(class_mapping, class_data_json) 26 | 27 | inv_map = {v: k for k, v in class_mapping.iteritems()} 28 | 29 | pprint.pprint(classes_count) 30 | 31 | random.shuffle(all_imgs) 32 | 33 | num_imgs = len(all_imgs) 34 | 35 | train_imgs = [s for s in all_imgs if s['imageset'] == 'trainval'] 36 | val_imgs = [s for s in all_imgs if s['imageset'] == 'test'] 37 | 38 | print('Num train samples {}'.format(len(train_imgs))) 39 | print('Num val samples {}'.format(len(val_imgs))) 40 | 41 | 42 | from keras_frcnn import data_generators 43 | 44 | data_gen_train = data_generators.get_anchor_gt(train_imgs,class_mapping,classes_count,C,mode='train') 45 | data_gen_val = data_generators.get_anchor_gt(val_imgs,class_mapping,classes_count,C,mode='train') 46 | 47 | from keras_frcnn import resnet as nn 48 | from keras import backend as K 49 | from keras.optimizers import Adam, SGD 50 | from keras.layers import Input 51 | from keras.callbacks import ModelCheckpoint 52 | from keras.models import Model 53 | from keras.callbacks import EarlyStopping, ModelCheckpoint 54 | from keras_frcnn import losses 55 | 56 | if K.image_dim_ordering() == 'th': 57 | input_shape_img = (3, None, None) 58 | else: 59 | input_shape_img = (None, None, 3) 60 | 61 | img_input = Input(shape=input_shape_img) 62 | 63 | roi_input = Input(shape=(C.num_rois, 4)) 64 | 65 | # define the base network (resnet here, can be VGG, Inception, etc) 66 | shared_layers = nn.nn_base(img_input) 67 | 68 | # define the RPN, built on the base layers 69 | num_anchors = len(C.anchor_box_scales) * len(C.anchor_box_ratios) 70 | rpn = nn.rpn(shared_layers,num_anchors) 71 | 72 | # the classifier is build on top of the base layers + the ROI pooling layer + extra layers 73 | classifier = nn.classifier(shared_layers, roi_input, C.num_rois, nb_classes=len(classes_count)) 74 | 75 | # define the full model 76 | model = Model([img_input, roi_input], rpn + classifier) 77 | 78 | try: 79 | 
print('loading weights from {}'.format(C.base_net_weights)) 80 | model.load_weights(C.base_net_weights, by_name=True) 81 | except: 82 | print('Could not load pretrained model weights') 83 | 84 | optimizer = Adam(1e-6) 85 | model.compile(optimizer=optimizer, loss=[losses.rpn_loss_cls(num_anchors), losses.rpn_loss_regr(num_anchors), losses.class_loss_cls, losses.class_loss_regr(C.num_rois)]) 86 | 87 | 88 | nb_epochs = 50 89 | 90 | callbacks = [EarlyStopping(monitor='val_loss', patience=2, verbose=0), 91 | ModelCheckpoint(C.model_path, monitor='val_loss', save_best_only=True, verbose=0)] 92 | train_samples_per_epoch = 2000 #len(train_imgs) 93 | nb_val_samples = 500 # len(val_imgs), 94 | 95 | print('Starting training') 96 | 97 | model.fit_generator(data_gen_train, samples_per_epoch=train_samples_per_epoch, nb_epoch= nb_epochs, validation_data=data_gen_val, nb_val_samples=nb_val_samples, callbacks=callbacks, max_q_size=10, nb_worker=1) 98 | 99 | -------------------------------------------------------------------------------- /keras_frcnn/FixedBatchNormalization.py: -------------------------------------------------------------------------------- 1 | from keras.engine import Layer, InputSpec 2 | from keras import initializations, regularizers 3 | from keras import backend as K 4 | 5 | 6 | class FixedBatchNormalization(Layer): 7 | 8 | def __init__(self, epsilon=1e-3, axis=-1, 9 | weights=None, beta_init='zero', gamma_init='one', 10 | gamma_regularizer=None, beta_regularizer=None, **kwargs): 11 | self.supports_masking = True 12 | self.beta_init = initializations.get(beta_init) 13 | self.gamma_init = initializations.get(gamma_init) 14 | self.epsilon = epsilon 15 | self.axis = axis 16 | self.gamma_regularizer = regularizers.get(gamma_regularizer) 17 | self.beta_regularizer = regularizers.get(beta_regularizer) 18 | self.initial_weights = weights 19 | super(FixedBatchNormalization, self).__init__(**kwargs) 20 | 21 | def build(self, input_shape): 22 | self.input_spec = [InputSpec(shape=input_shape)] 23 | shape = (input_shape[self.axis],) 24 | 25 | self.gamma = self.add_weight(shape, 26 | initializer=self.gamma_init, 27 | regularizer=self.gamma_regularizer, 28 | name='{}_gamma'.format(self.name), 29 | trainable=False) 30 | self.beta = self.add_weight(shape, 31 | initializer=self.beta_init, 32 | regularizer=self.beta_regularizer, 33 | name='{}_beta'.format(self.name), 34 | trainable=False) 35 | self.running_mean = self.add_weight(shape, initializer='zero', 36 | name='{}_running_mean'.format(self.name), 37 | trainable=False) 38 | self.running_std = self.add_weight(shape, initializer='one', 39 | name='{}_running_std'.format(self.name), 40 | trainable=False) 41 | 42 | if self.initial_weights is not None: 43 | self.set_weights(self.initial_weights) 44 | del self.initial_weights 45 | self.built = True 46 | 47 | def call(self, x, mask=None): 48 | 49 | assert self.built, 'Layer must be built before being called' 50 | input_shape = K.int_shape(x) 51 | 52 | reduction_axes = list(range(len(input_shape))) 53 | del reduction_axes[self.axis] 54 | broadcast_shape = [1] * len(input_shape) 55 | broadcast_shape[self.axis] = input_shape[self.axis] 56 | 57 | if sorted(reduction_axes) == range(K.ndim(x))[:-1]: 58 | x_normed_running = K.batch_normalization( 59 | x, self.running_mean, self.running_std, 60 | self.beta, self.gamma, 61 | epsilon=self.epsilon) 62 | else: 63 | # need broadcasting 64 | broadcast_running_mean = K.reshape(self.running_mean, broadcast_shape) 65 | broadcast_running_std = K.reshape(self.running_std,
broadcast_shape) 66 | broadcast_beta = K.reshape(self.beta, broadcast_shape) 67 | broadcast_gamma = K.reshape(self.gamma, broadcast_shape) 68 | x_normed_running = K.batch_normalization( 69 | x, broadcast_running_mean, broadcast_running_std, 70 | broadcast_beta, broadcast_gamma, 71 | epsilon=self.epsilon) 72 | 73 | return x_normed_running 74 | 75 | def get_config(self): 76 | config = {'epsilon': self.epsilon, 77 | 'axis': self.axis, 78 | 'gamma_regularizer': self.gamma_regularizer.get_config() if self.gamma_regularizer else None, 79 | 'beta_regularizer': self.beta_regularizer.get_config() if self.beta_regularizer else None} 80 | base_config = super(FixedBatchNormalization, self).get_config() 81 | return dict(list(base_config.items()) + list(config.items())) -------------------------------------------------------------------------------- /keras_frcnn/roi_helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pdb 3 | import math 4 | 5 | def non_max_suppression_fast(boxes, probs, overlapThresh = 0.95): 6 | # if there are no boxes, return an empty list 7 | if len(boxes) == 0: 8 | return [] 9 | 10 | # grab the coordinates of the bounding boxes 11 | x1 = boxes[:,0] 12 | y1 = boxes[:,1] 13 | x2 = boxes[:,2] 14 | y2 = boxes[:,3] 15 | 16 | np.testing.assert_array_less(x1,x2) 17 | np.testing.assert_array_less(y1,y2) 18 | 19 | # if the bounding boxes are integers, convert them to floats -- 20 | # this is important since we'll be doing a bunch of divisions 21 | if boxes.dtype.kind == "i": 22 | boxes = boxes.astype("float") 23 | 24 | # initialize the list of picked indexes 25 | pick = [] 26 | 27 | # sort the bounding boxes 28 | idxs = np.argsort(probs) 29 | 30 | # keep looping while some indexes still remain in the indexes 31 | # list 32 | while len(idxs) > 0: 33 | # grab the last index in the indexes list and add the 34 | # index value to the list of picked indexes 35 | last = len(idxs) - 1 36 | i = idxs[last] 37 | pick.append(i) 38 | 39 | # find the intersection 40 | 41 | xx1_int = np.maximum(x1[i], x1[idxs[:last]]) 42 | yy1_int = np.maximum(y1[i], y1[idxs[:last]]) 43 | xx2_int = np.minimum(x2[i], x2[idxs[:last]]) 44 | yy2_int = np.minimum(y2[i], y2[idxs[:last]]) 45 | 46 | # find the union 47 | xx1_un = np.minimum(x1[i], x1[idxs[:last]]) 48 | yy1_un = np.minimum(y1[i], y1[idxs[:last]]) 49 | xx2_un = np.maximum(x2[i], x2[idxs[:last]]) 50 | yy2_un = np.maximum(y2[i], y2[idxs[:last]]) 51 | 52 | # compute the width and height of the bounding box 53 | ww_int = xx2_int - xx1_int 54 | hh_int = yy2_int - yy1_int 55 | 56 | ww_un = xx2_un - xx1_un 57 | hh_un = yy2_un - yy1_un 58 | 59 | ww_un = np.maximum(0,ww_un) 60 | hh_un = np.maximum(0,hh_un) 61 | 62 | # compute the ratio of overlap 63 | overlap = (ww_int*hh_int)/(ww_un*hh_un + 1e-9) 64 | 65 | # delete all indexes from the index list that have an overlap greater than the threshold 66 | idxs = np.delete(idxs, np.concatenate(([last], 67 | np.where(overlap > overlapThresh)[0]))) 68 | 69 | if len(pick) >= 300: 70 | break 71 | 72 | # return only the bounding boxes that were picked using the 73 | # integer data type 74 | boxes = boxes[pick].astype("int") 75 | probs = probs[pick] 76 | return boxes, probs 77 | 78 | def rpn_to_roi(rpn_layer, regr_layer, C, use_regr = True): 79 | anchor_sizes = C.anchor_box_scales 80 | anchor_ratios = C.anchor_box_ratios 81 | assert len(anchor_sizes) * len(anchor_ratios) == rpn_layer.shape[1] 82 | assert len(anchor_sizes) * len(anchor_ratios) * 4 == regr_layer.shape[1] 83 | 84 | assert rpn_layer.shape[0]
== 1 85 | 86 | all_boxes = [] 87 | all_probs = [] 88 | 89 | (rows,cols) = rpn_layer.shape[2:] 90 | curr_layer = 0 91 | for anchor_size in anchor_sizes: 92 | for anchor_ratio in anchor_ratios: 93 | anchor_x = (anchor_size * anchor_ratio[0])/C.rpn_stride 94 | anchor_y = (anchor_size * anchor_ratio[1])/C.rpn_stride 95 | 96 | rpn = rpn_layer[0,curr_layer,:,:] 97 | regr = regr_layer[0,4 * curr_layer:4 * curr_layer + 4,:,:] 98 | 99 | for jy in xrange(rows): 100 | for ix in xrange(cols): 101 | if rpn[jy,ix] > 0.0: 102 | (tx, ty, tw, th) = regr[:,jy,ix] 103 | 104 | x1 = ix - anchor_x/2 105 | y1 = jy - anchor_y/2 106 | 107 | w = anchor_x 108 | h = anchor_y 109 | 110 | if use_regr: 111 | x1 = tx * (anchor_x) + x1 112 | y1 = ty * (anchor_y) + y1 113 | 114 | w = math.exp(tw) * (anchor_x) 115 | h = math.exp(th) * (anchor_y) 116 | 117 | # if w/h is less than 7, we cannot pool 118 | w = max(7,w) 119 | h = max(7,h) 120 | 121 | x2 = x1 + w 122 | y2 = y1 + h 123 | 124 | # box must start inside image 125 | x1 = max(x1,0) 126 | y1 = max(y1,0) 127 | 128 | #box must end inside image 129 | x2 = min(x2,cols-1) 130 | y2 = min(y2,rows-1) 131 | 132 | if x2 - x1 < 7: 133 | continue 134 | if y2 - y1 < 7: 135 | continue 136 | 137 | all_boxes.append((x1,y1,x2,y2)) 138 | 139 | all_probs.append(rpn[jy,ix]) 140 | 141 | curr_layer += 1 142 | 143 | all_boxes = np.array(all_boxes) 144 | all_probs = np.array(all_probs) 145 | 146 | return non_max_suppression_fast(all_boxes,all_probs)[0] -------------------------------------------------------------------------------- /keras_frcnn/RoiPoolingConv.py: -------------------------------------------------------------------------------- 1 | from keras.engine.topology import Layer 2 | import keras.backend as K 3 | 4 | class RoiPoolingConv(Layer): 5 | '''ROI pooling layer for 2D inputs. 6 | See Spatial Pyramid Pooling in Deep Convolutional Networks for Visual Recognition, 7 | K. He, X. Zhang, S. Ren, J. Sun 8 | # Arguments 9 | pool_size: int 10 | Size of pooling region to use. pool_size = 7 will result in a 7x7 region. 11 | num_rois: number of regions of interest to be used 12 | # Input shape 13 | list of two 4D tensors [X_img,X_roi] with shape: 14 | X_img: 15 | `(1, channels, rows, cols)` if dim_ordering='th' 16 | or 4D tensor with shape: 17 | `(1, rows, cols, channels)` if dim_ordering='tf'. 
18 | X_roi: 19 | `(1,num_rois,4)` list of rois, with ordering (x,y,w,h) 20 | # Output shape 21 | 3D tensor with shape: 22 | `(1, num_rois, channels, pool_size, pool_size)` 23 | ''' 24 | def __init__(self, pool_size, num_rois, **kwargs): 25 | 26 | self.dim_ordering = K.image_dim_ordering() 27 | assert self.dim_ordering in {'tf', 'th'}, 'dim_ordering must be in {tf, th}' 28 | 29 | self.pool_size = pool_size 30 | self.num_rois = num_rois 31 | 32 | super(RoiPoolingConv, self).__init__(**kwargs) 33 | 34 | def build(self, input_shape): 35 | if self.dim_ordering == 'th': 36 | self.nb_channels = input_shape[0][1] 37 | elif self.dim_ordering == 'tf': 38 | self.nb_channels = input_shape[0][3] 39 | 40 | def get_output_shape_for(self, input_shape): 41 | return (None, self.num_rois, self.nb_channels, self.pool_size, self.pool_size) 42 | 43 | def call(self, x, mask=None): 44 | 45 | assert(len(x) == 2) 46 | 47 | img = x[0] 48 | rois = x[1] 49 | 50 | input_shape = K.shape(img) 51 | 52 | outputs = [] 53 | 54 | for roi_idx in range(self.num_rois): 55 | 56 | x = rois[0,roi_idx,0] 57 | y = rois[0,roi_idx,1] 58 | w = rois[0,roi_idx,2] 59 | h = rois[0,roi_idx,3] 60 | 61 | row_length = w / self.pool_size 62 | col_length = h / self.pool_size 63 | 64 | num_pool_regions = self.pool_size 65 | 66 | if self.dim_ordering == 'th': 67 | for jy in range(num_pool_regions): 68 | for ix in range(num_pool_regions): 69 | x1 = x + ix * row_length 70 | x2 = x1 + row_length 71 | y1 = y + jy * col_length 72 | y2 = y1 + col_length 73 | 74 | x1 = K.cast(K.round(x1), 'int32') 75 | x2 = K.cast(K.round(x2), 'int32') 76 | y1 = K.cast(K.round(y1), 'int32') 77 | y2 = K.cast(K.round(y2), 'int32') 78 | 79 | new_shape = [input_shape[0], input_shape[1], 80 | y2 - y1, x2 - x1] 81 | x_crop = img[:, :, y1:y2, x1:x2] 82 | xm = K.reshape(x_crop, new_shape) 83 | pooled_val = K.max(xm, axis=(2, 3)) 84 | outputs.append(pooled_val) 85 | 86 | elif self.dim_ordering == 'tf': 87 | for jy in range(num_pool_regions): 88 | for ix in range(num_pool_regions): 89 | x1 = x + ix * row_length 90 | x2 = x1 + row_length 91 | y1 = y + jy * col_length 92 | y2 = y1 + col_length 93 | 94 | x1 = K.cast(K.round(x1), 'int32') 95 | x2 = K.cast(K.round(x2), 'int32') 96 | y1 = K.cast(K.round(y1), 'int32') 97 | y2 = K.cast(K.round(y2), 'int32') 98 | 99 | new_shape = [input_shape[0], y2 - y1, 100 | x2 - x1, input_shape[3]] 101 | x_crop = img[:, y1:y2, x1:x2, :] 102 | xm = K.reshape(x_crop, new_shape) 103 | pooled_val = K.max(xm, axis=(1, 2)) 104 | outputs.append(pooled_val) 105 | 106 | final_output = K.concatenate(outputs,axis = 0) 107 | final_output = K.reshape(final_output,(1,self.num_rois, self.pool_size, self.pool_size, self.nb_channels)) 108 | final_output = K.permute_dimensions(final_output,(0, 1, 4, 2, 3)) 109 | 110 | 111 | return final_output -------------------------------------------------------------------------------- /test_frcnn.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import xml.etree.ElementTree as ET 4 | import math 5 | import pprint 6 | import pdb 7 | import cv2 8 | import json 9 | import numpy as np 10 | import sys 11 | from keras_frcnn import config 12 | 13 | sys.setrecursionlimit(40000) 14 | C = config.Config() 15 | C.use_horizontal_flips = False 16 | C.use_vertical_flips = False 17 | 18 | def format_img(img): 19 | img_min_side = 600.0 20 | (height,width,_) = img.shape 21 | 22 | if width <= height: 23 | f = img_min_side/width 24 | new_height = int(f * height) 25 | new_width = 600 
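# Both branches of this if/else rescale the image so that its shorter side becomes img_min_side
# (600 px) while preserving the aspect ratio; the hard-coded 600 simply repeats the img_min_side
# value defined above.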
26 | else: 27 | f = img_min_side/height 28 | new_width = int(f * width) 29 | new_height = 600 30 | img = cv2.resize(img,(new_width,new_height),interpolation = cv2.INTER_CUBIC) 31 | img = np.transpose(img,(2,0,1)).astype(np.float32) 32 | img = np.expand_dims(img, axis=0) 33 | img -= 127.5 34 | return img 35 | 36 | with open('classes.json', 'r') as class_data_json: 37 | class_mapping = json.load(class_data_json) 38 | 39 | if 'bg' not in class_mapping: 40 | class_mapping['bg'] = len(class_mapping) 41 | 42 | class_mapping = {v: k for k, v in class_mapping.iteritems()} 43 | 44 | class_to_color = {class_mapping[v]:np.random.randint(0,255,3) for v in class_mapping} 45 | num_rois = 4 46 | 47 | import keras_frcnn.resnet as nn 48 | from keras import backend as K 49 | from keras.layers import Input 50 | from keras.models import Model 51 | from keras_frcnn import roi_helpers 52 | 53 | if K.image_dim_ordering() == 'th': 54 | input_shape_img = (3, None, None) 55 | input_shape_features = (1024, None, None) 56 | else: 57 | input_shape_img = (None, None, 3) 58 | input_shape_features = (None, None, 1024) 59 | 60 | 61 | img_input = Input(shape=input_shape_img) 62 | 63 | feature_map_input = Input(shape=input_shape_features) 64 | 65 | roi_input = Input(shape=(num_rois, 4)) 66 | 67 | # define the base network (resnet here, can be VGG, Inception, etc) 68 | shared_layers = nn.nn_base(img_input) 69 | 70 | # define the RPN, built on the base layers 71 | num_anchors = len(C.anchor_box_scales) * len(C.anchor_box_ratios) 72 | rpn = nn.rpn(shared_layers,num_anchors) 73 | 74 | # classifier, uses base layers + proposals 75 | print(class_mapping) 76 | 77 | classifier = nn.classifier(feature_map_input,roi_input,num_rois,nb_classes=len(class_mapping)) 78 | 79 | model_rpn = Model(img_input,rpn + [shared_layers]) 80 | model_classifier = Model([feature_map_input,roi_input],classifier) 81 | 82 | model_rpn.load_weights('model_frcnn.hdf5', by_name=True) 83 | model_classifier.load_weights('model_frcnn.hdf5', by_name=True) 84 | 85 | model_rpn.compile(optimizer='sgd',loss='mse') 86 | model_classifier.compile(optimizer='sgd',loss='mse') 87 | 88 | all_imgs = [] 89 | 90 | classes = {} 91 | 92 | visualise = True 93 | 94 | print('Parsing annotation files') 95 | img_path = sys.argv[1] 96 | bufsize = 0 97 | 98 | for idx,img_name in enumerate(sorted(os.listdir(img_path))): 99 | print(img_name) 100 | filepath = os.path.join(img_path,img_name) 101 | img = cv2.imread(filepath) 102 | X = format_img(img) 103 | img_scaled = (np.transpose(X[0,:,:,:],(1,2,0)) + 127.5).astype('uint8') 104 | # get the feature maps and output from the RPN 105 | [Y1,Y2,F] = model_rpn.predict(X) 106 | 107 | R = roi_helpers.rpn_to_roi(Y1,Y2,C) 108 | 109 | # convert from (x1,y1,x2,y2) to (x,y,w,h) 110 | R[:,2] = R[:,2] - R[:,0] 111 | R[:,3] = R[:,3] - R[:,1] 112 | 113 | # apply the spatial pyramid pooling to the proposed regions 114 | bboxes = {} 115 | probs = {} 116 | for jk in range(R.shape[0]//num_rois + 1): 117 | ROIs = np.expand_dims(R[num_rois*jk:num_rois*(jk+1),:],axis=0) 118 | if ROIs.shape[1] == 0: 119 | break 120 | 121 | if jk == R.shape[0]//num_rois: 122 | #pad R 123 | curr_shape = ROIs.shape 124 | target_shape = (curr_shape[0],num_rois,curr_shape[2]) 125 | ROIs_padded = np.zeros(target_shape).astype(ROIs.dtype) 126 | ROIs_padded[:,:curr_shape[1],:] = ROIs 127 | ROIs_padded[0,curr_shape[1]:,:] = ROIs[0,0,:] 128 | ROIs = ROIs_padded 129 | 130 | [P_cls,P_regr] = model_classifier.predict([F,ROIs]) 131 | 132 | for ii in range(P_cls.shape[1]): 133 | 134 | if 
np.max(P_cls[0,ii,:]) < 0.8 or np.argmax(P_cls[0,ii,:]) == (P_cls.shape[2] - 1): 135 | continue 136 | 137 | cls_name = class_mapping[np.argmax(P_cls[0,ii,:])] 138 | 139 | if cls_name not in bboxes: 140 | bboxes[cls_name] = [] 141 | probs[cls_name] = [] 142 | (x,y,w,h) = ROIs[0,ii,:] 143 | 144 | bboxes[cls_name].append([16*x,16*y,16*(x+w),16*(y+h)]) 145 | probs[cls_name].append(np.max(P_cls[0,ii,:])) 146 | 147 | all_dets = {} 148 | 149 | for key in bboxes: 150 | print(key) 151 | print(len(bboxes[key])) 152 | bbox = np.array(bboxes[key]) 153 | 154 | new_boxes, new_probs = roi_helpers.non_max_suppression_fast(bbox, np.array(probs[key]), overlapThresh = 0.5) 155 | for jk in range(new_boxes.shape[0]): 156 | (x1,y1,x2,y2) = new_boxes[jk,:] 157 | cv2.rectangle(img_scaled,(x1,y1),(x2,y2),class_to_color[key],1) 158 | 159 | textLabel = '{}:{}'.format(key,int(100*new_probs[jk])) 160 | if key not in all_dets: 161 | all_dets[key] = 100*new_probs[jk] 162 | else: 163 | all_dets[key] = max(all_dets[key],100*new_probs[jk]) 164 | 165 | (retval,baseLine) = cv2.getTextSize(textLabel,cv2.FONT_HERSHEY_COMPLEX,1,1) 166 | textOrg = (x1,y1+20) 167 | 168 | cv2.rectangle(img_scaled,(textOrg[0] - 5,textOrg[1]+baseLine - 5),(textOrg[0]+retval[0] + 5,textOrg[1]-retval[1] - 5),(0,0,0),2) 169 | cv2.rectangle(img_scaled,(textOrg[0] - 5,textOrg[1]+baseLine - 5),(textOrg[0]+retval[0] + 5,textOrg[1]-retval[1] - 5),(255,255,255),-1) 170 | cv2.putText(img_scaled,textLabel,textOrg,cv2.FONT_HERSHEY_SIMPLEX,1,(0,0,0),1) 171 | 172 | cv2.imshow('img',img_scaled) 173 | cv2.waitKey(0) 174 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /keras_frcnn/resnet.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | '''ResNet50 model for Keras. 3 | # Reference: 4 | - [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) 5 | Adapted from code contributed by BigMoyan. 
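In this repository the network serves as the Faster R-CNN backbone: nn_base() below builds the shared convolutional trunk up to stage 4 (the feature map consumed by the RPN and by ROI pooling), while the stage-5 blocks are applied TimeDistributed over the pooled ROIs in classifier_layers().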
6 | ''' 7 | from __future__ import print_function 8 | from __future__ import absolute_import 9 | 10 | from keras.layers import merge, Input 11 | from keras.layers import Dense, Activation, Flatten 12 | from keras.layers import Convolution2D, MaxPooling2D, ZeroPadding2D, AveragePooling2D, TimeDistributed 13 | from keras import backend as K 14 | 15 | from keras_frcnn.RoiPoolingConv import RoiPoolingConv 16 | from keras_frcnn.FixedBatchNormalization import FixedBatchNormalization 17 | 18 | import h5py 19 | 20 | bn_mode = 0 21 | 22 | def identity_block(input_tensor, kernel_size, filters, stage, block, trainable=True): 23 | 24 | '''The identity_block is the block that has no conv layer at shortcut 25 | # Arguments 26 | input_tensor: input tensor 27 | kernel_size: defualt 3, the kernel size of middle conv layer at main path 28 | filters: list of integers, the nb_filters of 3 conv layer at main path 29 | stage: integer, current stage label, used for generating layer names 30 | block: 'a','b'..., current block label, used for generating layer names 31 | ''' 32 | nb_filter1, nb_filter2, nb_filter3 = filters 33 | if K.image_dim_ordering() == 'tf': 34 | bn_axis = 3 35 | else: 36 | bn_axis = 1 37 | conv_name_base = 'res' + str(stage) + block + '_branch' 38 | bn_name_base = 'bn' + str(stage) + block + '_branch' 39 | 40 | x = Convolution2D(nb_filter1, 1, 1, name=conv_name_base + '2a', trainable=trainable)(input_tensor) 41 | x = FixedBatchNormalization(trainable=False,axis=bn_axis, name=bn_name_base + '2a')(x) 42 | x = Activation('relu')(x) 43 | 44 | x = Convolution2D(nb_filter2, kernel_size, kernel_size, border_mode='same', name=conv_name_base + '2b', trainable=trainable)(x) 45 | x = FixedBatchNormalization(trainable=False,axis=bn_axis, name=bn_name_base + '2b')(x) 46 | x = Activation('relu')(x) 47 | 48 | x = Convolution2D(nb_filter3, 1, 1, name=conv_name_base + '2c', trainable=trainable)(x) 49 | x = FixedBatchNormalization(trainable=False,axis=bn_axis, name=bn_name_base + '2c')(x) 50 | 51 | x = merge([x, input_tensor], mode='sum') 52 | x = Activation('relu')(x) 53 | return x 54 | 55 | 56 | def identity_block_td(input_tensor, kernel_size, filters, stage, block, trainable=True): 57 | '''The identity_block is the block that has no conv layer at shortcut 58 | # Arguments 59 | input_tensor: input tensor 60 | kernel_size: defualt 3, the kernel size of middle conv layer at main path 61 | filters: list of integers, the nb_filters of 3 conv layer at main path 62 | stage: integer, current stage label, used for generating layer names 63 | block: 'a','b'..., current block label, used for generating layer names 64 | ''' 65 | nb_filter1, nb_filter2, nb_filter3 = filters 66 | if K.image_dim_ordering() == 'tf': 67 | bn_axis = 3 68 | else: 69 | bn_axis = 1 70 | 71 | conv_name_base = 'res' + str(stage) + block + '_branch' 72 | bn_name_base = 'bn' + str(stage) + block + '_branch' 73 | 74 | 75 | x = TimeDistributed(Convolution2D(nb_filter1, 1, 1, trainable=trainable, init='normal'), name=conv_name_base + '2a')(input_tensor) 76 | x = TimeDistributed(FixedBatchNormalization(trainable=False,axis=bn_axis), name=bn_name_base + '2a')(x) 77 | 78 | x = Activation('relu')(x) 79 | 80 | x = TimeDistributed(Convolution2D(nb_filter2, kernel_size, kernel_size, trainable=trainable, init='normal',border_mode='same'), name=conv_name_base + '2b')(x) 81 | x = TimeDistributed(FixedBatchNormalization(trainable=False,axis=bn_axis), name=bn_name_base + '2b')(x) 82 | 83 | x = Activation('relu')(x) 84 | 85 | x = 
TimeDistributed(Convolution2D(nb_filter3, 1, 1, trainable=trainable, init='normal'), name=conv_name_base + '2c')(x) 86 | x = TimeDistributed(FixedBatchNormalization(trainable=False,axis=bn_axis), name=bn_name_base + '2c')(x) 87 | 88 | 89 | x = merge([x, input_tensor], mode='sum') 90 | x = Activation('relu')(x) 91 | 92 | return x 93 | 94 | def conv_block(input_tensor, kernel_size, filters, stage, block, strides=(2, 2), trainable=True): 95 | '''conv_block is the block that has a conv layer at shortcut 96 | # Arguments 97 | input_tensor: input tensor 98 | kernel_size: defualt 3, the kernel size of middle conv layer at main path 99 | filters: list of integers, the nb_filters of 3 conv layer at main path 100 | stage: integer, current stage label, used for generating layer names 101 | block: 'a','b'..., current block label, used for generating layer names 102 | Note that from stage 3, the first conv layer at main path is with subsample=(2,2) 103 | And the shortcut should have subsample=(2,2) as well 104 | ''' 105 | nb_filter1, nb_filter2, nb_filter3 = filters 106 | if K.image_dim_ordering() == 'tf': 107 | bn_axis = 3 108 | else: 109 | bn_axis = 1 110 | conv_name_base = 'res' + str(stage) + block + '_branch' 111 | bn_name_base = 'bn' + str(stage) + block + '_branch' 112 | 113 | x = Convolution2D(nb_filter1, 1, 1, subsample=strides, name=conv_name_base + '2a', trainable=trainable)(input_tensor) 114 | x = FixedBatchNormalization(trainable=False,axis=bn_axis, name=bn_name_base + '2a')(x) 115 | x = Activation('relu')(x) 116 | 117 | x = Convolution2D(nb_filter2, kernel_size, kernel_size, border_mode='same', name=conv_name_base + '2b', trainable=trainable)(x) 118 | x = FixedBatchNormalization(trainable=False,axis=bn_axis, name=bn_name_base + '2b')(x) 119 | x = Activation('relu')(x) 120 | 121 | x = Convolution2D(nb_filter3, 1, 1, name=conv_name_base + '2c', trainable=trainable)(x) 122 | x = FixedBatchNormalization(trainable=False,axis=bn_axis, name=bn_name_base + '2c')(x) 123 | 124 | shortcut = Convolution2D(nb_filter3, 1, 1, subsample=strides, name=conv_name_base + '1', trainable=trainable)(input_tensor) 125 | shortcut = FixedBatchNormalization(trainable=False,axis=bn_axis, name=bn_name_base + '1')(shortcut) 126 | 127 | x = merge([x, shortcut], mode='sum') 128 | x = Activation('relu')(x) 129 | return x 130 | 131 | def conv_block_td(input_tensor, kernel_size, filters, stage, block, strides=(2, 2), trainable=True): 132 | '''conv_block is the block that has a conv layer at shortcut 133 | # Arguments 134 | input_tensor: input tensor 135 | kernel_size: defualt 3, the kernel size of middle conv layer at main path 136 | filters: list of integers, the nb_filters of 3 conv layer at main path 137 | stage: integer, current stage label, used for generating layer names 138 | block: 'a','b'..., current block label, used for generating layer names 139 | Note that from stage 3, the first conv layer at main path is with subsample=(2,2) 140 | And the shortcut should have subsample=(2,2) as well 141 | ''' 142 | nb_filter1, nb_filter2, nb_filter3 = filters 143 | if K.image_dim_ordering() == 'tf': 144 | bn_axis = 3 145 | else: 146 | bn_axis = 1 147 | 148 | conv_name_base = 'res' + str(stage) + block + '_branch' 149 | bn_name_base = 'bn' + str(stage) + block + '_branch' 150 | 151 | x = TimeDistributed(Convolution2D(nb_filter1, 1, 1, subsample=strides, trainable=trainable, init='normal'), name=conv_name_base + '2a')(input_tensor) 152 | x = TimeDistributed(FixedBatchNormalization(trainable=False,axis=bn_axis), 
name=bn_name_base + '2a')(x) 153 | 154 | x = Activation('relu')(x) 155 | 156 | x = TimeDistributed(Convolution2D(nb_filter2, kernel_size, kernel_size, border_mode='same', trainable=trainable, init='normal'), name=conv_name_base + '2b')(x) 157 | x = TimeDistributed(FixedBatchNormalization(trainable=False,axis=bn_axis), name=bn_name_base + '2b')(x) 158 | 159 | x = Activation('relu')(x) 160 | 161 | x = TimeDistributed(Convolution2D(nb_filter3, 1, 1, init='normal'), name=conv_name_base + '2c', trainable=trainable)(x) 162 | x = TimeDistributed(FixedBatchNormalization(trainable=False,axis=bn_axis), name=bn_name_base + '2c')(x) 163 | 164 | shortcut = TimeDistributed(Convolution2D(nb_filter3, 1, 1, subsample=strides, trainable=trainable, init='normal'), name=conv_name_base + '1')(input_tensor) 165 | shortcut = TimeDistributed(FixedBatchNormalization(trainable=False,axis=bn_axis), name=bn_name_base + '1')(shortcut) 166 | 167 | 168 | x = merge([x, shortcut], mode='sum') 169 | x = Activation('relu')(x) 170 | return x 171 | 172 | def nn_base(input_tensor=None, trainable = False): 173 | 174 | # Determine proper input shape 175 | if K.image_dim_ordering() == 'th': 176 | input_shape = (3, None, None) 177 | else: 178 | input_shape = (None, None, 3) 179 | 180 | if input_tensor is None: 181 | img_input = Input(shape=input_shape) 182 | else: 183 | if not K.is_keras_tensor(input_tensor): 184 | img_input = Input(tensor=input_tensor, shape=input_shape) 185 | else: 186 | img_input = input_tensor 187 | 188 | if K.image_dim_ordering() == 'tf': 189 | bn_axis = 3 190 | else: 191 | bn_axis = 1 192 | 193 | x = ZeroPadding2D((3, 3))(img_input) 194 | 195 | x = Convolution2D(64, 7, 7, subsample=(2, 2), name='conv1', trainable = trainable)(x) 196 | x = FixedBatchNormalization(trainable=False,axis=bn_axis, name='bn_conv1')(x) 197 | x = Activation('relu')(x) 198 | x = MaxPooling2D((3, 3), strides=(2, 2))(x) 199 | 200 | x = conv_block(x, 3, [64, 64, 256], stage=2, block='a', strides=(1, 1), trainable = trainable) 201 | x = identity_block(x, 3, [64, 64, 256], stage=2, block='b', trainable = trainable) 202 | x = identity_block(x, 3, [64, 64, 256], stage=2, block='c', trainable = trainable) 203 | 204 | x = conv_block(x, 3, [128, 128, 512], stage=3, block='a', trainable = trainable) 205 | x = identity_block(x, 3, [128, 128, 512], stage=3, block='b', trainable = trainable) 206 | x = identity_block(x, 3, [128, 128, 512], stage=3, block='c', trainable = trainable) 207 | x = identity_block(x, 3, [128, 128, 512], stage=3, block='d', trainable = trainable) 208 | 209 | x = conv_block(x, 3, [256, 256, 1024], stage=4, block='a', trainable = trainable) 210 | x = identity_block(x, 3, [256, 256, 1024], stage=4, block='b', trainable = trainable) 211 | x = identity_block(x, 3, [256, 256, 1024], stage=4, block='c', trainable = trainable) 212 | x = identity_block(x, 3, [256, 256, 1024], stage=4, block='d', trainable = trainable) 213 | x = identity_block(x, 3, [256, 256, 1024], stage=4, block='e', trainable = trainable) 214 | x = identity_block(x, 3, [256, 256, 1024], stage=4, block='f', trainable = trainable) 215 | 216 | return x 217 | 218 | def classifier_layers(x, trainable=False): 219 | x = conv_block_td(x, 3, [512, 512, 2048], stage=5, block='a', strides=(1, 1), trainable=trainable) 220 | x = identity_block_td(x, 3, [512, 512, 2048], stage=5, block='b', trainable=trainable) 221 | x = identity_block_td(x, 3, [512, 512, 2048], stage=5, block='c', trainable=True) 222 | 223 | x = TimeDistributed(AveragePooling2D((7, 7)), name='avg_pool')(x) 224 
| 225 | return x 226 | 227 | def rpn(base_layers,num_anchors): 228 | 229 | x = Convolution2D(512, 3, 3, border_mode = 'same', activation='relu', init='normal',name='rpn_conv1')(base_layers) 230 | 231 | x_class = Convolution2D(num_anchors, 1, 1, activation='sigmoid', init='normal',name='rpn_out_class')(x) 232 | x_regr = Convolution2D(num_anchors * 4, 1, 1, activation='linear', init='normal',name='rpn_out_regr')(x) 233 | 234 | return [x_class,x_regr] 235 | 236 | def classifier(base_layers,input_rois,num_rois,nb_classes = 21): 237 | 238 | pooling_regions = 7 239 | 240 | out_roi_pool = RoiPoolingConv(pooling_regions, num_rois)([base_layers,input_rois]) 241 | 242 | out = classifier_layers(out_roi_pool) 243 | out = TimeDistributed(Flatten(),name='td_flatten')(out) 244 | out_class = TimeDistributed(Dense(nb_classes, activation='softmax'), name='dense_class_{}'.format(nb_classes))(out) 245 | out_regr = TimeDistributed(Dense(4, activation='linear'), name='dense_regr')(out) 246 | 247 | 248 | return [out_class,out_regr] 249 | -------------------------------------------------------------------------------- /keras_frcnn/data_generators.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cv2 3 | import random 4 | import copy 5 | import data_augment 6 | import threading 7 | import itertools 8 | #import numba 9 | 10 | 11 | 12 | def get_img_output_length(width, height): 13 | def get_output_length(input_length): 14 | # zero_pad 15 | input_length += 6 16 | # apply 4 strided convolutions 17 | filter_sizes = [7, 3, 1, 1] 18 | stride = 2 19 | for filter_size in filter_sizes: 20 | input_length = (input_length - filter_size + stride) // stride 21 | return input_length 22 | 23 | return get_output_length(width), get_output_length(height) 24 | 25 | #@numba.jit 26 | def union(au, bu): 27 | x = min(au[0], bu[0]) 28 | y = min(au[1], bu[1]) 29 | w = max(au[2], bu[2]) - x 30 | h = max(au[3], bu[3]) - y 31 | return x, y, w, h 32 | 33 | #@numba.jit 34 | def intersection(ai, bi): 35 | x = max(ai[0], bi[0]) 36 | y = max(ai[1], bi[1]) 37 | w = min(ai[2], bi[2]) - x 38 | h = min(ai[3], bi[3]) - y 39 | if w < 0 or h < 0: 40 | return 0, 0, 0, 0 41 | return x, y, w, h 42 | 43 | #@numba.jit 44 | def iou(a, b): 45 | # a and b should be (x1,y1,x2,y2) 46 | assert a[0] < a[2] 47 | assert a[1] < a[3] 48 | assert b[0] < b[2] 49 | assert b[1] < b[3] 50 | 51 | i = intersection(a, b) 52 | u = union(a, b) 53 | 54 | area_i = i[2] * i[3] 55 | area_u = u[2] * u[3] 56 | return float(area_i) / float(area_u) 57 | 58 | 59 | def get_new_img_size(width, height, img_min_side=600): 60 | if width <= height: 61 | f = float(img_min_side) / width 62 | resized_height = int(f * height) 63 | resized_width = img_min_side 64 | else: 65 | f = float(img_min_side) / height 66 | resized_width = int(f * width) 67 | resized_height = img_min_side 68 | 69 | return resized_width, resized_height 70 | 71 | 72 | 73 | 74 | class SampleSelector: 75 | def __init__(self, class_count): 76 | # ignore classes that have zero samples 77 | self.classes = [b for b in class_count.keys() if class_count[b] != 0] 78 | self.class_cycle = itertools.cycle(self.classes) 79 | 80 | def skip_sample_for_balanced_class(self, img_data): 81 | 82 | curr_class = self.class_cycle.next() 83 | class_in_img = False 84 | 85 | for bbox in img_data['bboxes']: 86 | 87 | cls_name = bbox['class'] 88 | 89 | if cls_name == curr_class: 90 | class_in_img = True 91 | break 92 | 93 | if class_in_img: 94 | return False 95 | else: 96 | return True 97 
| 98 | 99 | def calcY(C, class_mapping, img_data, width, height, resized_width, resized_height): 100 | downscale = float(C.rpn_stride) 101 | anchor_sizes = C.anchor_box_scales 102 | anchor_ratios = C.anchor_box_ratios 103 | num_anchors = len(anchor_sizes) * len(anchor_ratios) 104 | 105 | # calculate the output map size based on the network architecture 106 | (output_width, output_height) = get_img_output_length(resized_width, resized_height) 107 | 108 | n_anchratios = len(anchor_ratios) 109 | 110 | # initialise empty output objectives 111 | y_rpn_overlap = np.zeros((output_height, output_width, num_anchors)).astype(int) 112 | y_is_box_valid = np.zeros((output_height, output_width, num_anchors)).astype(int) 113 | y_rpn_regr = np.zeros((output_height, output_width, num_anchors * 4)) 114 | 115 | num_bboxes = len(img_data['bboxes']) 116 | 117 | num_anchors_for_bbox = np.zeros(num_bboxes).astype(int) 118 | best_anchor_for_bbox = -1*np.ones((num_bboxes, 4)).astype(int) 119 | best_iou_for_bbox = np.zeros(num_bboxes) 120 | best_x_for_bbox = np.zeros((num_bboxes, 4)) 121 | best_dx_for_bbox = np.zeros((num_bboxes, 4)) 122 | 123 | pos_samples = [] 124 | cls_samples = [] 125 | cls_regr_samples = [] 126 | neg_samples = [] 127 | 128 | # get the GT box coordinates, and resize to account for image resizing 129 | gta = np.zeros((num_bboxes, 4)) 130 | for bbox_num, bbox in enumerate(img_data['bboxes']): 131 | # get the GT box coordinates, and resize to account for image resizing 132 | gta[bbox_num, 0] = bbox['x1'] * (resized_width / float(width)) 133 | gta[bbox_num, 1] = bbox['x2'] * (resized_width / float(width)) 134 | gta[bbox_num, 2] = bbox['y1'] * (resized_height / float(height)) 135 | gta[bbox_num, 3] = bbox['y2'] * (resized_height / float(height)) 136 | 137 | 138 | for anchor_size_idx in xrange(len(anchor_sizes)): 139 | for anchor_ratio_idx in xrange(n_anchratios): 140 | anchor_x = anchor_sizes[anchor_size_idx] * anchor_ratios[anchor_ratio_idx][0] 141 | anchor_y = anchor_sizes[anchor_size_idx] * anchor_ratios[anchor_ratio_idx][1] 142 | 143 | for ix in xrange(output_width): 144 | # x-coordinates of the current anchor box 145 | x1_anc = downscale * (ix + 0.5) - anchor_x / 2 146 | x2_anc = downscale * (ix + 0.5) + anchor_x / 2 147 | 148 | # ignore boxes that go across image boundaries 149 | if x1_anc < 0 or x2_anc > resized_width: 150 | continue 151 | 152 | for jy in xrange(output_height): 153 | 154 | # y-coordinates of the current anchor box 155 | y1_anc = downscale * (jy + 0.5) - anchor_y / 2 156 | y2_anc = downscale * (jy + 0.5) + anchor_y / 2 157 | 158 | # ignore boxes that go across image boundaries 159 | if y1_anc < 0 or y2_anc > resized_height: 160 | continue 161 | 162 | # bbox_type indicates whether an anchor should be a target 163 | bbox_type = 'neg' 164 | 165 | # this is the best IOU for the (x,y) coord and the current anchor 166 | # note that this is different from the best IOU for a GT bbox 167 | best_iou_for_loc = 0.0 168 | 169 | for bbox_num in xrange(num_bboxes): 170 | 171 | # get IOU of the current GT box and the current anchor box 172 | curr_iou = iou([gta[bbox_num, 0], gta[bbox_num, 2], gta[bbox_num, 1], gta[bbox_num, 3]], [x1_anc, y1_anc, x2_anc, y2_anc]) 173 | # calculate the regression targets if they will be needed 174 | if curr_iou > best_iou_for_bbox[bbox_num] or curr_iou > 0.5: 175 | tx = (gta[bbox_num, 0] - x1_anc) / (x2_anc - x1_anc) 176 | ty = (gta[bbox_num, 2] - y1_anc) / (y2_anc - y1_anc) 177 | #calculate log of tw and th later 178 | tw = 1.0*(gta[bbox_num, 1] - 
gta[bbox_num, 0]) / (x2_anc - x1_anc)
179 | th = 1.0*(gta[bbox_num, 3] - gta[bbox_num, 2]) / (y2_anc - y1_anc)
180 | 
181 | if img_data['bboxes'][bbox_num]['class'] != 'bg':
182 | 
183 | # all GT boxes should be mapped to an anchor box, so we keep track of which anchor box was best
184 | if curr_iou > best_iou_for_bbox[bbox_num]:
185 | best_anchor_for_bbox[bbox_num] = [jy, ix, anchor_ratio_idx, anchor_size_idx]
186 | best_iou_for_bbox[bbox_num] = curr_iou
187 | best_x_for_bbox[bbox_num] = [x1_anc, x2_anc, y1_anc, y2_anc]
188 | best_dx_for_bbox[bbox_num] = [tx, ty, tw, th]
189 | 
190 | # we set the anchor to positive if the IOU is >0.7 (it does not matter if there was another better box, it just indicates overlap)
191 | if curr_iou > 0.7:
192 | bbox_type = 'pos'
193 | num_anchors_for_bbox[bbox_num] += 1
194 | # we update the regression layer target if this IOU is the best for the current (x,y) and anchor position
195 | if curr_iou > best_iou_for_loc:
196 | best_iou_for_loc = curr_iou
197 | best_regr = (tx, ty, tw, th)
198 | 
199 | # if the IOU is >0.3 and <0.7, it is ambiguous and not included in the objective
200 | if 0.3 < curr_iou < 0.7:
201 | # gray zone between neg and pos
202 | if bbox_type != 'pos':
203 | bbox_type = 'neutral'
204 | 
205 | 
206 | # we also make a list of anchor boxes that can be used as positive or negative targets for the classification network
207 | if curr_iou < 0.1:
208 | # negative sample, we don't store this since it's probably very 'easy'
209 | pass
210 | elif curr_iou < 0.5:
211 | # sample which partially overlaps a GT box. We store this since we expect it to be a rather difficult background sample
212 | neg_samples.append((int(x1_anc / downscale), int(y1_anc / downscale),
213 | int((x2_anc - x1_anc) / downscale),
214 | int((y2_anc - y1_anc) / downscale)))
215 | else:
216 | # a positive sample: there is sufficient overlap
217 | pos_samples.append((int(x1_anc / downscale), int(y1_anc / downscale), int((x2_anc - x1_anc) / downscale), int((y2_anc - y1_anc) / downscale)))
218 | cls_samples.append(img_data['bboxes'][bbox_num]['class'])
219 | cls_regr_samples.append([tx,ty,tw,th])
220 | 
221 | # turn on or off outputs depending on IOUs
222 | if bbox_type == 'neg':
223 | y_is_box_valid[jy, ix, anchor_ratio_idx + n_anchratios * anchor_size_idx] = 1
224 | y_rpn_overlap[jy, ix, anchor_ratio_idx + n_anchratios * anchor_size_idx] = 0
225 | elif bbox_type == 'neutral':
226 | y_is_box_valid[jy, ix, anchor_ratio_idx + n_anchratios * anchor_size_idx] = 0
227 | y_rpn_overlap[jy, ix, anchor_ratio_idx + n_anchratios * anchor_size_idx] = 0
228 | else:
229 | y_is_box_valid[jy, ix, anchor_ratio_idx + n_anchratios * anchor_size_idx] = 1
230 | y_rpn_overlap[jy, ix, anchor_ratio_idx + n_anchratios * anchor_size_idx] = 1
231 | start = 4 * anchor_ratio_idx + 4 * n_anchratios * anchor_size_idx
232 | y_rpn_regr[jy, ix, start:start+2 ] = best_regr[0:2]
233 | y_rpn_regr[jy, ix, start+2:start+4 ] = np.log(best_regr[2:])
234 | 
235 | 
236 | # check that there is at least one labeled region in the image
237 | if len(pos_samples) == 0:
238 | return None, None, None, None, None
239 | 
240 | # we ensure that every bbox has at least one positive RPN region
241 | for idx in xrange(num_anchors_for_bbox.shape[0]):
242 | if num_anchors_for_bbox[idx] == 0:
243 | # no box with an IOU greater than zero ...
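# if no anchor overlapped this GT box at all, best_anchor_for_bbox still holds the -1 sentinel and
# the box is skipped below; otherwise the single best-overlapping anchor recorded earlier is promoted
# to a valid, positive RPN target and its regression targets are copied into y_rpn_regr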
244 | if best_anchor_for_bbox[idx, 0] == -1:
245 | continue
246 | y_is_box_valid[
247 | best_anchor_for_bbox[idx,0], best_anchor_for_bbox[idx,1], best_anchor_for_bbox[idx,2] + n_anchratios *
248 | best_anchor_for_bbox[idx,3]] = 1
249 | y_rpn_overlap[
250 | best_anchor_for_bbox[idx,0], best_anchor_for_bbox[idx,1], best_anchor_for_bbox[idx,2] + n_anchratios *
251 | best_anchor_for_bbox[idx,3]] = 1
252 | start = 4 * best_anchor_for_bbox[idx,2] + 4 * n_anchratios * best_anchor_for_bbox[idx,3] + 0
253 | y_rpn_regr[
254 | best_anchor_for_bbox[idx,0], best_anchor_for_bbox[idx,1], start:start+2] = best_dx_for_bbox[idx, 0:2]
255 | y_rpn_regr[
256 | best_anchor_for_bbox[idx,0], best_anchor_for_bbox[idx,1], start+2:start+4] = np.log(best_dx_for_bbox[idx, 2:4])
257 | 
258 | 
259 | y_rpn_overlap = np.transpose(y_rpn_overlap, (2, 0, 1))
260 | y_rpn_overlap = np.expand_dims(y_rpn_overlap, axis=0)
261 | 
262 | y_is_box_valid = np.transpose(y_is_box_valid, (2, 0, 1))
263 | y_is_box_valid = np.expand_dims(y_is_box_valid, axis=0)
264 | 
265 | y_rpn_regr = np.transpose(y_rpn_regr, (2, 0, 1))
266 | y_rpn_regr = np.expand_dims(y_rpn_regr, axis=0)
267 | 
268 | pos_locs = np.where(np.logical_and(y_rpn_overlap[0, :, :, :] == 1, y_is_box_valid[0, :, :, :] == 1))
269 | neg_locs = np.where(np.logical_and(y_rpn_overlap[0, :, :, :] == 0, y_is_box_valid[0, :, :, :] == 1))
270 | 
271 | pos_samples = np.array(pos_samples)
272 | neg_samples = np.array(neg_samples)
273 | cls_samples = np.array(cls_samples)
274 | cls_regr_samples = np.array(cls_regr_samples)
275 | 
276 | # randomly sample some positive and negative ROIs from the list
277 | target_pos_samples = C.num_rois / 2
278 | 
279 | if pos_samples.shape[0] > target_pos_samples:
280 | val_locs = random.sample(range(pos_samples.shape[0]), target_pos_samples)
281 | valid_pos_samples = pos_samples[val_locs, :]
282 | valid_cls_samples = cls_samples[val_locs]
283 | valid_regr_samples = cls_regr_samples[val_locs,:]
284 | else:
285 | valid_pos_samples = pos_samples
286 | valid_cls_samples = cls_samples
287 | valid_regr_samples = cls_regr_samples
288 | 
289 | val_locs = random.sample(range(neg_samples.shape[0]), C.num_rois - valid_cls_samples.shape[0])
290 | valid_neg_samples = neg_samples[val_locs, :]
291 | 
292 | valid_regr_samples[:, 2:] = np.log(valid_regr_samples[:, 2:])
293 | 
294 | x_rois = np.expand_dims(np.concatenate([valid_pos_samples, valid_neg_samples]), axis=0)
295 | 
296 | y_class_num = np.zeros((x_rois.shape[1], len(class_mapping)))
297 | # regr has 8 values: 4 for the on/off mask, 4 for the (tx, ty, tw, th) regression targets
298 | y_class_regr = np.zeros((x_rois.shape[1], 2*4))
299 | 
300 | for i in xrange(x_rois.shape[1]):
301 | if i < valid_cls_samples.shape[0]:
302 | class_num = class_mapping[valid_cls_samples[i]]
303 | y_class_num[i, class_num] = 1
304 | else:
305 | y_class_num[i, -1] = 1
306 | # NB: we only set y_class_regr to positive here if the sample is not from the bg class
307 | if y_class_num[i, -1] != 1:
308 | y_class_regr[i, :4] = 1 # set value to 1 if the sample is positive
309 | y_class_regr[i,4:] = valid_regr_samples[i,:]
310 | 
311 | 
312 | y_class_num = np.expand_dims(y_class_num, axis=0)
313 | y_class_regr = np.expand_dims(y_class_regr, axis=0)
314 | 
315 | num_pos = len(pos_locs[0])
316 | 
317 | # the RPN has many more negative than positive regions, so we cap the positives at 128 and then turn off enough negative regions to keep at most 256 regions in total
318 | 
319 | if len(pos_locs[0]) > 128:
320 | val_locs = random.sample(range(len(pos_locs[0])), len(pos_locs[0]) - 128)
321 | y_is_box_valid[0, pos_locs[0][val_locs], pos_locs[1][val_locs], pos_locs[2][val_locs]] = 0
322 | num_pos = 128
323 | 
324 | if len(neg_locs[0]) + num_pos > 256:
325 | val_locs = random.sample(range(len(neg_locs[0])), len(neg_locs[0]) + num_pos - 256)
326 | y_is_box_valid[0, neg_locs[0][val_locs], neg_locs[1][val_locs], neg_locs[2][val_locs]] = 0
327 | 
328 | y_rpn_cls = np.concatenate([y_is_box_valid, y_rpn_overlap], axis=1)
329 | y_rpn_regr = np.concatenate([np.repeat(y_rpn_overlap, 4, axis=1), y_rpn_regr], axis=1)
330 | 
331 | return x_rois, y_rpn_cls, y_rpn_regr, y_class_num, y_class_regr
332 | 
333 | 
334 | class threadsafe_iter:
335 | """Takes an iterator/generator and makes it thread-safe by
336 | serializing calls to the `next` method of the given iterator/generator.
337 | """
338 | def __init__(self, it):
339 | self.it = it
340 | self.lock = threading.Lock()
341 | 
342 | def __iter__(self):
343 | return self
344 | 
345 | def next(self):
346 | with self.lock:
347 | return self.it.next()
348 | 
349 | 
350 | def threadsafe_generator(f):
351 | """A decorator that takes a generator function and makes it thread-safe.
352 | """
353 | def g(*a, **kw):
354 | return threadsafe_iter(f(*a, **kw))
355 | return g
356 | 
357 | #@threadsafe_generator
358 | def get_anchor_gt(all_img_data, class_mapping, class_count, C, mode='train'):
359 | downscale = float(C.rpn_stride)
360 | 
361 | anchor_sizes = C.anchor_box_scales
362 | anchor_ratios = C.anchor_box_ratios
363 | 
364 | num_anchors = len(anchor_sizes) * len(anchor_ratios)
365 | 
366 | sample_selector = SampleSelector(class_count)
367 | 
368 | while True:
369 | if mode=='train':
370 | random.shuffle(all_img_data)
371 | 
372 | for img_data in all_img_data:
373 | 
374 | if C.balanced_classes and sample_selector.skip_sample_for_balanced_class(img_data):
375 | continue
376 | 
377 | # read in image, and optionally add augmentation
378 | 
379 | if mode=='train':
380 | img_data_aug, x_img = data_augment.augment(img_data, C, augment=True)
381 | else:
382 | img_data_aug, x_img = data_augment.augment(img_data, C, augment=False)
383 | 
384 | (width, height) = (img_data_aug['width'], img_data_aug['height'])
385 | (rows, cols, _) = x_img.shape
386 | 
387 | assert cols == width
388 | assert rows == height
389 | 
390 | # get image dimensions for resizing
391 | (resized_width, resized_height) = get_new_img_size(width, height, C.im_size)
392 | 
393 | # resize the image so that its smallest side has length C.im_size (600px by default)
394 | x_img = cv2.resize(x_img, (resized_width, resized_height), interpolation=cv2.INTER_CUBIC)
395 | 
396 | # calculate the output map size based on the network architecture
397 | (output_width, output_height) = get_img_output_length(resized_width, resized_height)
398 | 
399 | x_rois, y_rpn_cls, y_rpn_regr, y_class_num, y_class_regr = calcY(C, class_mapping, img_data_aug, width, height, resized_width, resized_height)
400 | if x_rois is None:
401 | continue
402 | 
403 | x_img = np.transpose(x_img, (2, 0, 1))
404 | x_img = np.expand_dims(x_img, axis=0).astype('float32')
405 | 
406 | # Zero-center by mean pixel
407 | x_img[:, 0, :, :] -= 103.939
408 | x_img[:, 1, :, :] -= 116.779
409 | x_img[:, 2, :, :] -= 123.68
410 | 
411 | yield [x_img, x_rois], [y_rpn_cls, y_rpn_regr, y_class_num, y_class_regr]
412 | 
413 | 
414 | 
415 | 
--------------------------------------------------------------------------------
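For orientation, here is a minimal sketch of how the components above (nn_base, rpn, classifier, the losses module and Config) could be wired into a single trainable Keras model. It is illustrative only and is not taken from train_frcnn.py; it assumes the Keras 1.2.1 API pinned in requirements.txt, and the ROI input shape, learning rate and variable names are assumptions.

from keras import backend as K
from keras.layers import Input
from keras.models import Model
from keras.optimizers import Adam

from keras_frcnn import config, losses
from keras_frcnn import resnet as nn

C = config.Config()
num_anchors = len(C.anchor_box_scales) * len(C.anchor_box_ratios)  # 3 scales * 3 ratios = 9

# image input follows the same dim-ordering convention as nn_base
if K.image_dim_ordering() == 'th':
    img_input = Input(shape=(3, None, None))
else:
    img_input = Input(shape=(None, None, 3))
roi_input = Input(shape=(C.num_rois, 4))  # assumed shape: one (x, y, w, h) row per ROI

shared_layers = nn.nn_base(img_input, trainable=True)      # shared resnet50 feature extractor
rpn_class, rpn_regr = nn.rpn(shared_layers, num_anchors)   # RPN objectness + box regression heads
out_class, out_regr = nn.classifier(shared_layers, roi_input, C.num_rois, nb_classes=21)

model = Model([img_input, roi_input], [rpn_class, rpn_regr, out_class, out_regr])
model.compile(optimizer=Adam(lr=1e-5),
              loss=[losses.rpn_loss_cls(num_anchors),
                    losses.rpn_loss_regr(num_anchors),
                    losses.class_loss_cls,
                    losses.class_loss_regr(C.num_rois)])

The four outputs are ordered to match what data_generators.get_anchor_gt yields ([x_img, x_rois], [y_rpn_cls, y_rpn_regr, y_class_num, y_class_regr]), so the model could be trained with fit_generator once one of the parsers has produced all_img_data, class_mapping and class_count.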
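Below is a small worked example of the overlap and regression-target computations that calcY performs for every anchor / ground-truth pair. The box coordinates are made up; the thresholds quoted in the comments are the ones hard-coded above (0.7/0.3 for the RPN labels, 0.5/0.1 for the classifier ROIs).

from keras_frcnn.data_generators import iou

gt_box = (100, 100, 300, 300)   # ground-truth box as (x1, y1, x2, y2): 200 x 200 px
anchor = (150, 150, 350, 350)   # candidate anchor box, also 200 x 200 px

overlap = iou(gt_box, anchor)
# intersection = 150 * 150 = 22500; the union() helper returns the enclosing box (250 * 250 = 62500),
# so iou() evaluates to 22500 / 62500 = 0.36 -- slightly below the textbook
# intersection / (area_a + area_b - intersection) = 22500 / 57500, which is roughly 0.39.
# At 0.36 this anchor would be labelled 'neutral' for the RPN (between 0.3 and 0.7) and
# kept as a hard-negative ROI for the classifier branch (between 0.1 and 0.5).

# regression targets for this pair, following the same formulas used in calcY
x1_gt, y1_gt, x2_gt, y2_gt = gt_box
x1_anc, y1_anc, x2_anc, y2_anc = anchor
tx = (x1_gt - x1_anc) / float(x2_anc - x1_anc)   # (100 - 150) / 200 = -0.25
ty = (y1_gt - y1_anc) / float(y2_anc - y1_anc)   # -0.25
tw = (x2_gt - x1_gt) / float(x2_anc - x1_anc)    # 200 / 200 = 1.0 (np.log is applied later -> 0.0)
th = (y2_gt - y1_gt) / float(y2_anc - y1_anc)    # 1.0 (np.log -> 0.0)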