├── .gitignore ├── LICENSE ├── README.md ├── keras_frcnn ├── FixedBatchNormalization.py ├── RoiPoolingConv.py ├── __init__.py ├── config.py ├── data_augment.py ├── data_generators.py ├── losses.py ├── pascal_voc_parser.py ├── resnet.py ├── roi_helpers.py └── simple_parser.py ├── measure_map.py ├── requirements.txt ├── test_frcnn.py └── train_frcnn.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # keras-frcnn 2 | Keras implementation of Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks. 3 | 4 | 5 | USAGE: 6 | - Both theano and tensorflow backends are supported. However, compile times are very high in theano, so tensorflow is highly recommended. 7 | - `train_frcnn.py` can be used to train a model. To train on Pascal VOC data, simply do: 8 | `python train_frcnn.py -p /path/to/pascalvoc/`. 9 | - The Pascal VOC data set (images and annotations for bounding boxes around the classified objects) can be obtained from: http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar 10 | - simple_parser.py provides an alternative way to input data, using a text file. Simply provide a text file, with each 11 | line containing: 12 | 13 | `filepath,x1,y1,x2,y2,class_name` 14 | 15 | For example: 16 | 17 | /data/imgs/img_001.jpg,837,346,981,456,cow 18 | 19 | /data/imgs/img_002.jpg,215,312,279,391,cat 20 | 21 | The classes will be inferred from the file. To use the simple parser instead of the default pascal voc style parser, 22 | use the command line option `-o simple`. For example `python train_frcnn.py -o simple -p my_data.txt`. A short sketch for generating such a file is included at the end of this README. 23 | 24 | - Running `train_frcnn.py` will write weights to disk to an hdf5 file, as well as all the settings of the training run to a `pickle` file. These 25 | settings can then be loaded by `test_frcnn.py` for any testing. 26 | 27 | - test_frcnn.py can be used to perform inference, given pretrained weights and a config file. Specify a path to the folder containing 28 | images: 29 | `python test_frcnn.py -p /path/to/test_data/` 30 | - Data augmentation can be applied by specifying `--hf` for horizontal flips, `--vf` for vertical flips and `--rot` for 90-degree rotations. 31 | 32 | 33 | 34 | NOTES: 35 | - config.py contains all settings for the train or test run. The default settings match those in the original Faster-RCNN 36 | paper. The anchor box sizes are [128, 256, 512] and the ratios are [1:1, 1:2, 2:1]. 37 | - The theano backend by default uses a 7x7 pooling region, instead of 14x14 as in the frcnn paper. This cuts down compiling time slightly. 38 | - The tensorflow backend performs a resize on the pooling region, instead of max pooling. This is much more efficient and has little impact on results. 39 | 40 | 41 | Example output: 42 | 43 | ![ex1](http://i.imgur.com/7Lmb2RC.png) 44 | ![ex2](http://i.imgur.com/h58kCIV.png) 45 | ![ex3](http://i.imgur.com/EbvGBaG.png) 46 | ![ex4](http://i.imgur.com/i5UAgLb.png) 47 | 48 | ISSUES: 49 | 50 | - If you get this error: 51 | `ValueError: There is a negative shape in the graph!` 52 | then update keras to the newest version. 53 | 54 | - Make sure to use `python2`, not `python3`. If you get this error: 55 | `TypeError: unorderable types: dict() < dict()`, you are using python3. 56 | 57 | - If you run out of memory, try reducing the number of ROIs that are processed simultaneously. Try passing a lower `-n` to `train_frcnn.py`. Alternatively, try reducing the image size from the default value of 600 (this setting is found in `config.py`).
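SIMPLE PARSER EXAMPLE:

The annotation file expected by the simple parser can be produced with a few lines of Python. The sketch below is only an illustration: the file name `my_data.txt` and the two box records are the example rows from this README, with (x1,y1) the top-left and (x2,y2) the bottom-right corner in pixel coordinates of the original image.

```python
# Write a simple-parser annotation file: one "filepath,x1,y1,x2,y2,class_name" line per box.
# The records below are the example rows from this README; replace them with your own data.
boxes = [
    ('/data/imgs/img_001.jpg', 837, 346, 981, 456, 'cow'),
    ('/data/imgs/img_002.jpg', 215, 312, 279, 391, 'cat'),
]

with open('my_data.txt', 'w') as f:
    for filepath, x1, y1, x2, y2, class_name in boxes:
        f.write('{},{},{},{},{},{}\n'.format(filepath, x1, y1, x2, y2, class_name))
```

Training can then be started with `python train_frcnn.py -o simple -p my_data.txt`, as described above.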
58 | -------------------------------------------------------------------------------- /keras_frcnn/FixedBatchNormalization.py: -------------------------------------------------------------------------------- 1 | from keras.engine import Layer, InputSpec 2 | from keras import initializers, regularizers 3 | from keras import backend as K 4 | 5 | 6 | class FixedBatchNormalization(Layer): 7 | 8 | def __init__(self, epsilon=1e-3, axis=-1, 9 | weights=None, beta_init='zero', gamma_init='one', 10 | gamma_regularizer=None, beta_regularizer=None, **kwargs): 11 | 12 | self.supports_masking = True 13 | self.beta_init = initializers.get(beta_init) 14 | self.gamma_init = initializers.get(gamma_init) 15 | self.epsilon = epsilon 16 | self.axis = axis 17 | self.gamma_regularizer = regularizers.get(gamma_regularizer) 18 | self.beta_regularizer = regularizers.get(beta_regularizer) 19 | self.initial_weights = weights 20 | super(FixedBatchNormalization, self).__init__(**kwargs) 21 | 22 | def build(self, input_shape): 23 | self.input_spec = [InputSpec(shape=input_shape)] 24 | shape = (input_shape[self.axis],) 25 | 26 | self.gamma = self.add_weight(shape, 27 | initializer=self.gamma_init, 28 | regularizer=self.gamma_regularizer, 29 | name='{}_gamma'.format(self.name), 30 | trainable=False) 31 | self.beta = self.add_weight(shape, 32 | initializer=self.beta_init, 33 | regularizer=self.beta_regularizer, 34 | name='{}_beta'.format(self.name), 35 | trainable=False) 36 | self.running_mean = self.add_weight(shape, initializer='zero', 37 | name='{}_running_mean'.format(self.name), 38 | trainable=False) 39 | self.running_std = self.add_weight(shape, initializer='one', 40 | name='{}_running_std'.format(self.name), 41 | trainable=False) 42 | 43 | if self.initial_weights is not None: 44 | self.set_weights(self.initial_weights) 45 | del self.initial_weights 46 | 47 | self.built = True 48 | 49 | def call(self, x, mask=None): 50 | 51 | assert self.built, 'Layer must be built before being called' 52 | input_shape = K.int_shape(x) 53 | 54 | reduction_axes = list(range(len(input_shape))) 55 | del reduction_axes[self.axis] 56 | broadcast_shape = [1] * len(input_shape) 57 | broadcast_shape[self.axis] = input_shape[self.axis] 58 | 59 | if sorted(reduction_axes) == range(K.ndim(x))[:-1]: 60 | x_normed = K.batch_normalization( 61 | x, self.running_mean, self.running_std, 62 | self.beta, self.gamma, 63 | epsilon=self.epsilon) 64 | else: 65 | # need broadcasting 66 | broadcast_running_mean = K.reshape(self.running_mean, broadcast_shape) 67 | broadcast_running_std = K.reshape(self.running_std, broadcast_shape) 68 | broadcast_beta = K.reshape(self.beta, broadcast_shape) 69 | broadcast_gamma = K.reshape(self.gamma, broadcast_shape) 70 | x_normed = K.batch_normalization( 71 | x, broadcast_running_mean, broadcast_running_std, 72 | broadcast_beta, broadcast_gamma, 73 | epsilon=self.epsilon) 74 | 75 | return x_normed 76 | 77 | def get_config(self): 78 | config = {'epsilon': self.epsilon, 79 | 'axis': self.axis, 80 | 'gamma_regularizer': self.gamma_regularizer.get_config() if self.gamma_regularizer else None, 81 | 'beta_regularizer': self.beta_regularizer.get_config() if self.beta_regularizer else None} 82 | base_config = super(FixedBatchNormalization, self).get_config() 83 | return dict(list(base_config.items()) + list(config.items())) -------------------------------------------------------------------------------- /keras_frcnn/RoiPoolingConv.py: -------------------------------------------------------------------------------- 1 | from 
keras.engine.topology import Layer 2 | import keras.backend as K 3 | 4 | if K.backend() == 'tensorflow': 5 | import tensorflow as tf 6 | 7 | class RoiPoolingConv(Layer): 8 | '''ROI pooling layer for 2D inputs. 9 | See Spatial Pyramid Pooling in Deep Convolutional Networks for Visual Recognition, 10 | K. He, X. Zhang, S. Ren, J. Sun 11 | # Arguments 12 | pool_size: int 13 | Size of pooling region to use. pool_size = 7 will result in a 7x7 region. 14 | num_rois: number of regions of interest to be used 15 | # Input shape 16 | list of two 4D tensors [X_img,X_roi] with shape: 17 | X_img: 18 | `(1, channels, rows, cols)` if dim_ordering='th' 19 | or 4D tensor with shape: 20 | `(1, rows, cols, channels)` if dim_ordering='tf'. 21 | X_roi: 22 | `(1,num_rois,4)` list of rois, with ordering (x,y,w,h) 23 | # Output shape 24 | 3D tensor with shape: 25 | `(1, num_rois, channels, pool_size, pool_size)` 26 | ''' 27 | def __init__(self, pool_size, num_rois, **kwargs): 28 | 29 | self.dim_ordering = K.image_dim_ordering() 30 | assert self.dim_ordering in {'tf', 'th'}, 'dim_ordering must be in {tf, th}' 31 | 32 | self.pool_size = pool_size 33 | self.num_rois = num_rois 34 | 35 | super(RoiPoolingConv, self).__init__(**kwargs) 36 | 37 | def build(self, input_shape): 38 | if self.dim_ordering == 'th': 39 | self.nb_channels = input_shape[0][1] 40 | elif self.dim_ordering == 'tf': 41 | self.nb_channels = input_shape[0][3] 42 | 43 | def compute_output_shape(self, input_shape): 44 | if self.dim_ordering == 'th': 45 | return None, self.num_rois, self.nb_channels, self.pool_size, self.pool_size 46 | else: 47 | return None, self.num_rois, self.pool_size, self.pool_size, self.nb_channels 48 | 49 | def call(self, x, mask=None): 50 | 51 | assert(len(x) == 2) 52 | 53 | img = x[0] 54 | rois = x[1] 55 | 56 | input_shape = K.shape(img) 57 | 58 | outputs = [] 59 | 60 | for roi_idx in range(self.num_rois): 61 | 62 | x = rois[0, roi_idx, 0] 63 | y = rois[0, roi_idx, 1] 64 | w = rois[0, roi_idx, 2] 65 | h = rois[0, roi_idx, 3] 66 | 67 | row_length = w / float(self.pool_size) 68 | col_length = h / float(self.pool_size) 69 | 70 | num_pool_regions = self.pool_size 71 | 72 | #NOTE: the RoiPooling implementation differs between theano and tensorflow due to the lack of a resize op 73 | # in theano. 
The theano implementation is much less efficient and leads to long compile times 74 | 75 | if self.dim_ordering == 'th': 76 | for jy in range(num_pool_regions): 77 | for ix in range(num_pool_regions): 78 | x1 = x + ix * row_length 79 | x2 = x1 + row_length 80 | y1 = y + jy * col_length 81 | y2 = y1 + col_length 82 | 83 | x1 = K.cast(x1, 'int32') 84 | x2 = K.cast(x2, 'int32') 85 | y1 = K.cast(y1, 'int32') 86 | y2 = K.cast(y2, 'int32') 87 | 88 | x2 = x1 + K.maximum(1,x2-x1) 89 | y2 = y1 + K.maximum(1,y2-y1) 90 | 91 | new_shape = [input_shape[0], input_shape[1], 92 | y2 - y1, x2 - x1] 93 | 94 | x_crop = img[:, :, y1:y2, x1:x2] 95 | xm = K.reshape(x_crop, new_shape) 96 | pooled_val = K.max(xm, axis=(2, 3)) 97 | outputs.append(pooled_val) 98 | 99 | elif self.dim_ordering == 'tf': 100 | x = K.cast(x, 'int32') 101 | y = K.cast(y, 'int32') 102 | w = K.cast(w, 'int32') 103 | h = K.cast(h, 'int32') 104 | 105 | rs = tf.image.resize_images(img[:, y:y+h, x:x+w, :], (self.pool_size, self.pool_size)) 106 | outputs.append(rs) 107 | 108 | final_output = K.concatenate(outputs, axis=0) 109 | final_output = K.reshape(final_output, (1, self.num_rois, self.pool_size, self.pool_size, self.nb_channels)) 110 | 111 | if self.dim_ordering == 'th': 112 | final_output = K.permute_dimensions(final_output, (0, 1, 4, 2, 3)) 113 | else: 114 | final_output = K.permute_dimensions(final_output, (0, 1, 2, 3, 4)) 115 | 116 | return final_output 117 | -------------------------------------------------------------------------------- /keras_frcnn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/akshaylamba/FasterRCNN_KERAS/0f284263bb13ef57e6167008c04a75687a26f9d7/keras_frcnn/__init__.py -------------------------------------------------------------------------------- /keras_frcnn/config.py: -------------------------------------------------------------------------------- 1 | from keras import backend as K 2 | 3 | 4 | class Config: 5 | 6 | def __init__(self): 7 | 8 | self.verbose = True 9 | 10 | # setting for data augmentation 11 | self.use_horizontal_flips = False 12 | self.use_vertical_flips = False 13 | self.rot_90 = False 14 | 15 | # anchor box scales 16 | self.anchor_box_scales = [128, 256, 512] 17 | 18 | # anchor box ratios 19 | self.anchor_box_ratios = [[1, 1], [1, 2], [2, 1]] 20 | 21 | # size to resize the smallest side of the image 22 | self.im_size = 600 23 | 24 | # image channel-wise mean to subtract 25 | self.img_channel_mean = [103.939, 116.779, 123.68] 26 | self.img_scaling_factor = 1.0 27 | 28 | # number of ROIs at once 29 | self.num_rois = 4 30 | 31 | # stride at the RPN (this depends on the network configuration) 32 | self.rpn_stride = 16 33 | 34 | self.balanced_classes = False 35 | 36 | # scaling the stdev 37 | self.std_scaling = 4.0 38 | self.classifier_regr_std = [8.0, 8.0, 4.0, 4.0] 39 | 40 | # overlaps for RPN 41 | self.rpn_min_overlap = 0.3 42 | self.rpn_max_overlap = 0.7 43 | 44 | # overlaps for classifier ROIs 45 | self.classifier_min_overlap = 0.1 46 | self.classifier_max_overlap = 0.5 47 | 48 | # placeholder for the class mapping, automatically generated by the parser 49 | self.class_mapping = None 50 | 51 | #location of pretrained weights for the base network 52 | # weight files can be found at: 53 | # https://github.com/fchollet/deep-learning-models/releases/download/v0.2/resnet50_weights_th_dim_ordering_th_kernels_notop.h5 54 | # 
https://github.com/fchollet/deep-learning-models/releases/download/v0.2/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5 55 | if K.image_dim_ordering() == 'th': 56 | self.base_net_weights = 'resnet50_weights_th_dim_ordering_th_kernels_notop.h5' 57 | else: 58 | self.base_net_weights = 'resnet50_weights_tf_dim_ordering_tf_kernels.h5' 59 | 60 | self.model_path = 'model_frcnn.hdf5' 61 | -------------------------------------------------------------------------------- /keras_frcnn/data_augment.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import copy 4 | 5 | 6 | def augment(img_data, config, augment=True): 7 | assert 'filepath' in img_data 8 | assert 'bboxes' in img_data 9 | assert 'width' in img_data 10 | assert 'height' in img_data 11 | 12 | img_data_aug = copy.deepcopy(img_data) 13 | 14 | img = cv2.imread(img_data_aug['filepath']) 15 | 16 | if augment: 17 | rows, cols = img.shape[:2] 18 | 19 | if config.use_horizontal_flips and np.random.randint(0, 2) == 0: 20 | img = cv2.flip(img, 1) 21 | for bbox in img_data_aug['bboxes']: 22 | x1 = bbox['x1'] 23 | x2 = bbox['x2'] 24 | bbox['x2'] = cols - x1 25 | bbox['x1'] = cols - x2 26 | 27 | if config.use_vertical_flips and np.random.randint(0, 2) == 0: 28 | img = cv2.flip(img, 0) 29 | for bbox in img_data_aug['bboxes']: 30 | y1 = bbox['y1'] 31 | y2 = bbox['y2'] 32 | bbox['y2'] = rows - y1 33 | bbox['y1'] = rows - y2 34 | 35 | if config.rot_90: 36 | angle = np.random.choice([0,90,180,270],1)[0] 37 | if angle == 270: 38 | img = np.transpose(img, (1,0,2)) 39 | img = cv2.flip(img, 0) 40 | elif angle == 180: 41 | img = cv2.flip(img, -1) 42 | elif angle == 90: 43 | img = np.transpose(img, (1,0,2)) 44 | img = cv2.flip(img, 1) 45 | elif angle == 0: 46 | pass 47 | 48 | for bbox in img_data_aug['bboxes']: 49 | x1 = bbox['x1'] 50 | x2 = bbox['x2'] 51 | y1 = bbox['y1'] 52 | y2 = bbox['y2'] 53 | if angle == 270: 54 | bbox['x1'] = y1 55 | bbox['x2'] = y2 56 | bbox['y1'] = cols - x2 57 | bbox['y2'] = cols - x1 58 | elif angle == 180: 59 | bbox['x2'] = cols - x1 60 | bbox['x1'] = cols - x2 61 | bbox['y2'] = rows - y1 62 | bbox['y1'] = rows - y2 63 | elif angle == 90: 64 | bbox['x1'] = rows - y2 65 | bbox['x2'] = rows - y1 66 | bbox['y1'] = x1 67 | bbox['y2'] = x2 68 | elif angle == 0: 69 | pass 70 | 71 | img_data_aug['width'] = img.shape[1] 72 | img_data_aug['height'] = img.shape[0] 73 | return img_data_aug, img 74 | -------------------------------------------------------------------------------- /keras_frcnn/data_generators.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cv2 3 | import random 4 | import copy 5 | import data_augment 6 | import roi_helpers 7 | import threading 8 | import itertools 9 | 10 | random.seed(0) 11 | 12 | def get_img_output_length(width, height): 13 | def get_output_length(input_length): 14 | # zero_pad 15 | input_length += 6 16 | # apply 4 strided convolutions 17 | filter_sizes = [7, 3, 1, 1] 18 | stride = 2 19 | for filter_size in filter_sizes: 20 | input_length = (input_length - filter_size + stride) // stride 21 | return input_length 22 | 23 | return get_output_length(width), get_output_length(height) 24 | 25 | def union(au, bu): 26 | x = min(au[0], bu[0]) 27 | y = min(au[1], bu[1]) 28 | w = max(au[2], bu[2]) - x 29 | h = max(au[3], bu[3]) - y 30 | return x, y, w, h 31 | 32 | def intersection(ai, bi): 33 | x = max(ai[0], bi[0]) 34 | y = max(ai[1], bi[1]) 35 | w = min(ai[2], bi[2]) - x 
36 | h = min(ai[3], bi[3]) - y 37 | if w < 0 or h < 0: 38 | return 0, 0, 0, 0 39 | return x, y, w, h 40 | 41 | def iou(a, b): 42 | # a and b should be (x1,y1,x2,y2) 43 | 44 | if a[0] >= a[2] or a[1] >= a[3] or b[0] >= b[2] or b[1] >= b[3]: 45 | return 0.0 46 | 47 | i = intersection(a, b) 48 | u = union(a, b) 49 | 50 | area_i = i[2] * i[3] 51 | area_u = u[2] * u[3] 52 | return float(area_i) / float(area_u) 53 | 54 | 55 | def get_new_img_size(width, height, img_min_side=600): 56 | if width <= height: 57 | f = float(img_min_side) / width 58 | resized_height = int(f * height) 59 | resized_width = img_min_side 60 | else: 61 | f = float(img_min_side) / height 62 | resized_width = int(f * width) 63 | resized_height = img_min_side 64 | 65 | return resized_width, resized_height 66 | 67 | 68 | 69 | 70 | class SampleSelector: 71 | def __init__(self, class_count): 72 | # ignore classes that have zero samples 73 | self.classes = [b for b in class_count.keys() if class_count[b] > 0] 74 | self.class_cycle = itertools.cycle(self.classes) 75 | self.curr_class = self.class_cycle.next() 76 | 77 | def skip_sample_for_balanced_class(self, img_data): 78 | 79 | class_in_img = False 80 | 81 | for bbox in img_data['bboxes']: 82 | 83 | cls_name = bbox['class'] 84 | 85 | if cls_name == self.curr_class: 86 | class_in_img = True 87 | self.curr_class = self.class_cycle.next() 88 | break 89 | 90 | if class_in_img: 91 | return False 92 | else: 93 | return True 94 | 95 | 96 | def calc_rpn(C, img_data, width, height, resized_width, resized_height): 97 | 98 | downscale = float(C.rpn_stride) 99 | anchor_sizes = C.anchor_box_scales 100 | anchor_ratios = C.anchor_box_ratios 101 | num_anchors = len(anchor_sizes) * len(anchor_ratios) 102 | 103 | # calculate the output map size based on the network architecture 104 | (output_width, output_height) = get_img_output_length(resized_width, resized_height) 105 | 106 | n_anchratios = len(anchor_ratios) 107 | 108 | # initialise empty output objectives 109 | y_rpn_overlap = np.zeros((output_height, output_width, num_anchors)) 110 | y_is_box_valid = np.zeros((output_height, output_width, num_anchors)) 111 | y_rpn_regr = np.zeros((output_height, output_width, num_anchors * 4)) 112 | 113 | num_bboxes = len(img_data['bboxes']) 114 | 115 | num_anchors_for_bbox = np.zeros(num_bboxes).astype(int) 116 | best_anchor_for_bbox = -1*np.ones((num_bboxes, 4)).astype(int) 117 | best_iou_for_bbox = np.zeros(num_bboxes).astype(np.float32) 118 | best_x_for_bbox = np.zeros((num_bboxes, 4)).astype(int) 119 | best_dx_for_bbox = np.zeros((num_bboxes, 4)).astype(np.float32) 120 | 121 | # get the GT box coordinates, and resize to account for image resizing 122 | gta = np.zeros((num_bboxes, 4)) 123 | for bbox_num, bbox in enumerate(img_data['bboxes']): 124 | # get the GT box coordinates, and resize to account for image resizing 125 | gta[bbox_num, 0] = bbox['x1'] * (resized_width / float(width)) 126 | gta[bbox_num, 1] = bbox['x2'] * (resized_width / float(width)) 127 | gta[bbox_num, 2] = bbox['y1'] * (resized_height / float(height)) 128 | gta[bbox_num, 3] = bbox['y2'] * (resized_height / float(height)) 129 | 130 | # rpn ground truth 131 | 132 | for anchor_size_idx in xrange(len(anchor_sizes)): 133 | for anchor_ratio_idx in xrange(n_anchratios): 134 | anchor_x = anchor_sizes[anchor_size_idx] * anchor_ratios[anchor_ratio_idx][0] 135 | anchor_y = anchor_sizes[anchor_size_idx] * anchor_ratios[anchor_ratio_idx][1] 136 | 137 | for ix in xrange(output_width): 138 | # x-coordinates of the current anchor box 139 | x1_anc 
= downscale * (ix + 0.5) - anchor_x / 2 140 | x2_anc = downscale * (ix + 0.5) + anchor_x / 2 141 | 142 | # ignore boxes that go across image boundaries 143 | if x1_anc < 0 or x2_anc > resized_width: 144 | continue 145 | 146 | for jy in xrange(output_height): 147 | 148 | # y-coordinates of the current anchor box 149 | y1_anc = downscale * (jy + 0.5) - anchor_y / 2 150 | y2_anc = downscale * (jy + 0.5) + anchor_y / 2 151 | 152 | # ignore boxes that go across image boundaries 153 | if y1_anc < 0 or y2_anc > resized_height: 154 | continue 155 | 156 | # bbox_type indicates whether an anchor should be a target 157 | bbox_type = 'neg' 158 | 159 | # this is the best IOU for the (x,y) coord and the current anchor 160 | # note that this is different from the best IOU for a GT bbox 161 | best_iou_for_loc = 0.0 162 | 163 | for bbox_num in xrange(num_bboxes): 164 | 165 | # get IOU of the current GT box and the current anchor box 166 | curr_iou = iou([gta[bbox_num, 0], gta[bbox_num, 2], gta[bbox_num, 1], gta[bbox_num, 3]], [x1_anc, y1_anc, x2_anc, y2_anc]) 167 | # calculate the regression targets if they will be needed 168 | if curr_iou > best_iou_for_bbox[bbox_num] or curr_iou > C.rpn_max_overlap: 169 | cx = (gta[bbox_num, 0] + gta[bbox_num, 1]) / 2.0 170 | cy = (gta[bbox_num, 2] + gta[bbox_num, 3]) / 2.0 171 | cxa = (x1_anc + x2_anc)/2.0 172 | cya = (y1_anc + y2_anc)/2.0 173 | 174 | tx = (cx - cxa) / (x2_anc - x1_anc) 175 | ty = (cy - cya) / (y2_anc - y1_anc) 176 | tw = np.log((gta[bbox_num, 1] - gta[bbox_num, 0]) / (x2_anc - x1_anc)) 177 | th = np.log((gta[bbox_num, 3] - gta[bbox_num, 2]) / (y2_anc - y1_anc)) 178 | 179 | if img_data['bboxes'][bbox_num]['class'] != 'bg': 180 | 181 | # all GT boxes should be mapped to an anchor box, so we keep track of which anchor box was best 182 | if curr_iou > best_iou_for_bbox[bbox_num]: 183 | best_anchor_for_bbox[bbox_num] = [jy, ix, anchor_ratio_idx, anchor_size_idx] 184 | best_iou_for_bbox[bbox_num] = curr_iou 185 | best_x_for_bbox[bbox_num,:] = [x1_anc, x2_anc, y1_anc, y2_anc] 186 | best_dx_for_bbox[bbox_num,:] = [tx, ty, tw, th] 187 | 188 | # we set the anchor to positive if the IOU is >0.7 (it does not matter if there was another better box, it just indicates overlap) 189 | if curr_iou > C.rpn_max_overlap: 190 | bbox_type = 'pos' 191 | num_anchors_for_bbox[bbox_num] += 1 192 | # we update the regression layer target if this IOU is the best for the current (x,y) and anchor position 193 | if curr_iou > best_iou_for_loc: 194 | best_iou_for_loc = curr_iou 195 | best_regr = (tx, ty, tw, th) 196 | 197 | # if the IOU is >0.3 and <0.7, it is ambiguous and no included in the objective 198 | if C.rpn_min_overlap < curr_iou < C.rpn_max_overlap: 199 | # gray zone between neg and pos 200 | if bbox_type != 'pos': 201 | bbox_type = 'neutral' 202 | 203 | # turn on or off outputs depending on IOUs 204 | if bbox_type == 'neg': 205 | y_is_box_valid[jy, ix, anchor_ratio_idx + n_anchratios * anchor_size_idx] = 1 206 | y_rpn_overlap[jy, ix, anchor_ratio_idx + n_anchratios * anchor_size_idx] = 0 207 | elif bbox_type == 'neutral': 208 | y_is_box_valid[jy, ix, anchor_ratio_idx + n_anchratios * anchor_size_idx] = 0 209 | y_rpn_overlap[jy, ix, anchor_ratio_idx + n_anchratios * anchor_size_idx] = 0 210 | elif bbox_type == 'pos': 211 | y_is_box_valid[jy, ix, anchor_ratio_idx + n_anchratios * anchor_size_idx] = 1 212 | y_rpn_overlap[jy, ix, anchor_ratio_idx + n_anchratios * anchor_size_idx] = 1 213 | start = 4 * (anchor_ratio_idx + n_anchratios * anchor_size_idx) 214 | y_rpn_regr[jy, ix, 
start:start+4] = best_regr 215 | 216 | # we ensure that every bbox has at least one positive RPN region 217 | 218 | for idx in xrange(num_anchors_for_bbox.shape[0]): 219 | if num_anchors_for_bbox[idx] == 0: 220 | # no box with an IOU greater than zero ... 221 | if best_anchor_for_bbox[idx, 0] == -1: 222 | continue 223 | y_is_box_valid[ 224 | best_anchor_for_bbox[idx,0], best_anchor_for_bbox[idx,1], best_anchor_for_bbox[idx,2] + n_anchratios * 225 | best_anchor_for_bbox[idx,3]] = 1 226 | y_rpn_overlap[ 227 | best_anchor_for_bbox[idx,0], best_anchor_for_bbox[idx,1], best_anchor_for_bbox[idx,2] + n_anchratios * 228 | best_anchor_for_bbox[idx,3]] = 1 229 | start = 4 * (best_anchor_for_bbox[idx,2] + n_anchratios * best_anchor_for_bbox[idx,3]) 230 | y_rpn_regr[ 231 | best_anchor_for_bbox[idx,0], best_anchor_for_bbox[idx,1], start:start+4] = best_dx_for_bbox[idx, :] 232 | 233 | y_rpn_overlap = np.transpose(y_rpn_overlap, (2, 0, 1)) 234 | y_rpn_overlap = np.expand_dims(y_rpn_overlap, axis=0) 235 | 236 | y_is_box_valid = np.transpose(y_is_box_valid, (2, 0, 1)) 237 | y_is_box_valid = np.expand_dims(y_is_box_valid, axis=0) 238 | 239 | y_rpn_regr = np.transpose(y_rpn_regr, (2, 0, 1)) 240 | y_rpn_regr = np.expand_dims(y_rpn_regr, axis=0) 241 | 242 | pos_locs = np.where(np.logical_and(y_rpn_overlap[0, :, :, :] == 1, y_is_box_valid[0, :, :, :] == 1)) 243 | neg_locs = np.where(np.logical_and(y_rpn_overlap[0, :, :, :] == 0, y_is_box_valid[0, :, :, :] == 1)) 244 | 245 | num_pos = len(pos_locs[0]) 246 | 247 | # one issue is that the RPN has many more negative than positive regions, so we turn off some of the negative 248 | # regions. We also limit it to 256 regions. 249 | num_regions = 256 250 | 251 | if len(pos_locs[0]) > num_regions/2: 252 | val_locs = random.sample(range(len(pos_locs[0])), len(pos_locs[0]) - num_regions/2) 253 | y_is_box_valid[0, pos_locs[0][val_locs], pos_locs[1][val_locs], pos_locs[2][val_locs]] = 0 254 | num_pos = num_regions/2 255 | 256 | if len(neg_locs[0]) + num_pos > num_regions: 257 | val_locs = random.sample(range(len(neg_locs[0])), len(neg_locs[0]) - num_pos) 258 | y_is_box_valid[0, neg_locs[0][val_locs], neg_locs[1][val_locs], neg_locs[2][val_locs]] = 0 259 | 260 | y_rpn_cls = np.concatenate([y_is_box_valid, y_rpn_overlap], axis=1) 261 | y_rpn_regr = np.concatenate([np.repeat(y_rpn_overlap, 4, axis=1), y_rpn_regr], axis=1) 262 | 263 | return np.copy(y_rpn_cls), np.copy(y_rpn_regr) 264 | 265 | 266 | class threadsafe_iter: 267 | """Takes an iterator/generator and makes it thread-safe by 268 | serializing call to the `next` method of given iterator/generator. 269 | """ 270 | def __init__(self, it): 271 | self.it = it 272 | self.lock = threading.Lock() 273 | 274 | def __iter__(self): 275 | return self 276 | 277 | def next(self): 278 | with self.lock: 279 | return self.it.next() 280 | 281 | 282 | def threadsafe_generator(f): 283 | """A decorator that takes a generator function and makes it thread-safe. 
284 | """ 285 | def g(*a, **kw): 286 | return threadsafe_iter(f(*a, **kw)) 287 | return g 288 | 289 | def get_anchor_gt(all_img_data, class_count, C, backend, mode='train'): 290 | 291 | all_img_data = sorted(all_img_data) 292 | 293 | sample_selector = SampleSelector(class_count) 294 | 295 | while True: 296 | if mode == 'train': 297 | random.shuffle(all_img_data) 298 | 299 | for img_data in all_img_data: 300 | try: 301 | 302 | if C.balanced_classes and sample_selector.skip_sample_for_balanced_class(img_data): 303 | continue 304 | 305 | # read in image, and optionally add augmentation 306 | 307 | if mode == 'train': 308 | img_data_aug, x_img = data_augment.augment(img_data, C, augment=True) 309 | else: 310 | img_data_aug, x_img = data_augment.augment(img_data, C, augment=False) 311 | 312 | (width, height) = (img_data_aug['width'], img_data_aug['height']) 313 | (rows, cols, _) = x_img.shape 314 | 315 | assert cols == width 316 | assert rows == height 317 | 318 | # get image dimensions for resizing 319 | (resized_width, resized_height) = get_new_img_size(width, height, C.im_size) 320 | 321 | # resize the image so that smalles side is length = 600px 322 | x_img = cv2.resize(x_img, (resized_width, resized_height), interpolation=cv2.INTER_CUBIC) 323 | 324 | try: 325 | y_rpn_cls, y_rpn_regr = calc_rpn(C, img_data_aug, width, height, resized_width, resized_height) 326 | except: 327 | continue 328 | 329 | # Zero-center by mean pixel, and preprocess image 330 | 331 | x_img = x_img[:,:, (2, 1, 0)] # BGR -> RGB 332 | x_img = x_img.astype(np.float32) 333 | x_img[:, :, 0] -= C.img_channel_mean[0] 334 | x_img[:, :, 1] -= C.img_channel_mean[1] 335 | x_img[:, :, 2] -= C.img_channel_mean[2] 336 | x_img /= C.img_scaling_factor 337 | 338 | x_img = np.transpose(x_img, (2, 0, 1)) 339 | x_img = np.expand_dims(x_img, axis=0) 340 | 341 | y_rpn_regr[:, y_rpn_regr.shape[1]/2:, :, :] *= C.std_scaling 342 | 343 | if backend == 'tf': 344 | x_img = np.transpose(x_img, (0, 2, 3, 1)) 345 | y_rpn_cls = np.transpose(y_rpn_cls, (0, 2, 3, 1)) 346 | y_rpn_regr = np.transpose(y_rpn_regr, (0, 2, 3, 1)) 347 | 348 | yield np.copy(x_img), [np.copy(y_rpn_cls), np.copy(y_rpn_regr)], img_data_aug 349 | 350 | except Exception as e: 351 | print(e) 352 | continue 353 | -------------------------------------------------------------------------------- /keras_frcnn/losses.py: -------------------------------------------------------------------------------- 1 | from keras import backend as K 2 | from keras.objectives import categorical_crossentropy 3 | 4 | if K.image_dim_ordering() == 'tf': 5 | import tensorflow as tf 6 | 7 | lambda_rpn_regr = 1.0 8 | lambda_rpn_class = 1.0 9 | 10 | lambda_cls_regr = 1.0 11 | lambda_cls_class = 1.0 12 | 13 | epsilon = 1e-4 14 | 15 | 16 | def rpn_loss_regr(num_anchors): 17 | def rpn_loss_regr_fixed_num(y_true, y_pred): 18 | if K.image_dim_ordering() == 'th': 19 | x = y_true[:, 4 * num_anchors:, :, :] - y_pred 20 | x_abs = K.abs(x) 21 | x_bool = K.less_equal(x_abs, 1.0) 22 | return lambda_rpn_regr * K.sum( 23 | y_true[:, :4 * num_anchors, :, :] * (x_bool * (0.5 * x * x) + (1 - x_bool) * (x_abs - 0.5))) / K.sum(epsilon + y_true[:, :4 * num_anchors, :, :]) 24 | else: 25 | x = y_true[:, :, :, 4 * num_anchors:] - y_pred 26 | x_abs = K.abs(x) 27 | x_bool = K.cast(K.less_equal(x_abs, 1.0), tf.float32) 28 | 29 | return lambda_rpn_regr * K.sum( 30 | y_true[:, :, :, :4 * num_anchors] * (x_bool * (0.5 * x * x) + (1 - x_bool) * (x_abs - 0.5))) / K.sum(epsilon + y_true[:, :, :, :4 * num_anchors]) 31 | 32 | return 
rpn_loss_regr_fixed_num 33 | 34 | 35 | def rpn_loss_cls(num_anchors): 36 | def rpn_loss_cls_fixed_num(y_true, y_pred): 37 | if K.image_dim_ordering() == 'tf': 38 | return lambda_rpn_class * K.sum(y_true[:, :, :, :num_anchors] * K.binary_crossentropy(y_pred[:, :, :, :], y_true[:, :, :, num_anchors:])) / K.sum(epsilon + y_true[:, :, :, :num_anchors]) 39 | else: 40 | return lambda_rpn_class * K.sum(y_true[:, :num_anchors, :, :] * K.binary_crossentropy(y_pred[:, :, :, :], y_true[:, num_anchors:, :, :])) / K.sum(epsilon + y_true[:, :num_anchors, :, :]) 41 | 42 | return rpn_loss_cls_fixed_num 43 | 44 | 45 | def class_loss_regr(num_classes): 46 | def class_loss_regr_fixed_num(y_true, y_pred): 47 | x = y_true[:, :, 4*num_classes:] - y_pred 48 | x_abs = K.abs(x) 49 | x_bool = K.cast(K.less_equal(x_abs, 1.0), 'float32') 50 | return lambda_cls_regr * K.sum(y_true[:, :, :4*num_classes] * (x_bool * (0.5 * x * x) + (1 - x_bool) * (x_abs - 0.5))) / K.sum(epsilon + y_true[:, :, :4*num_classes]) 51 | return class_loss_regr_fixed_num 52 | 53 | 54 | def class_loss_cls(y_true, y_pred): 55 | return lambda_cls_class * K.mean(categorical_crossentropy(y_true[0, :, :], y_pred[0, :, :])) 56 | -------------------------------------------------------------------------------- /keras_frcnn/pascal_voc_parser.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import xml.etree.ElementTree as ET 4 | import numpy as np 5 | def get_data(input_path): 6 | all_imgs = [] 7 | 8 | classes_count = {} 9 | 10 | class_mapping = {} 11 | 12 | visualise = False 13 | 14 | data_paths = [os.path.join(input_path,s) for s in ['VOC2007', 'VOC2012']] 15 | 16 | 17 | print('Parsing annotation files') 18 | 19 | for data_path in data_paths: 20 | 21 | annot_path = os.path.join(data_path, 'Annotations') 22 | imgs_path = os.path.join(data_path, 'JPEGImages') 23 | imgsets_path_trainval = os.path.join(data_path, 'ImageSets','Main','trainval.txt') 24 | imgsets_path_test = os.path.join(data_path, 'ImageSets','Main','test.txt') 25 | 26 | trainval_files = [] 27 | test_files = [] 28 | try: 29 | with open(imgsets_path_trainval) as f: 30 | for line in f: 31 | trainval_files.append(line.strip() + '.jpg') 32 | except Exception as e: 33 | print(e) 34 | 35 | try: 36 | with open(imgsets_path_test) as f: 37 | for line in f: 38 | test_files.append(line.strip() + '.jpg') 39 | except Exception as e: 40 | if data_path[-7:] == 'VOC2012': 41 | # this is expected, most pascal voc distibutions dont have the test.txt file 42 | pass 43 | else: 44 | print(e) 45 | 46 | annots = [os.path.join(annot_path, s) for s in os.listdir(annot_path)] 47 | idx = 0 48 | for annot in annots: 49 | try: 50 | idx += 1 51 | 52 | et = ET.parse(annot) 53 | element = et.getroot() 54 | 55 | element_objs = element.findall('object') 56 | element_filename = element.find('filename').text 57 | element_width = int(element.find('size').find('width').text) 58 | element_height = int(element.find('size').find('height').text) 59 | 60 | if len(element_objs) > 0: 61 | annotation_data = {'filepath': os.path.join(imgs_path, element_filename), 'width': element_width, 62 | 'height': element_height, 'bboxes': []} 63 | 64 | if element_filename in trainval_files: 65 | annotation_data['imageset'] = 'trainval' 66 | elif element_filename in test_files: 67 | annotation_data['imageset'] = 'test' 68 | else: 69 | annotation_data['imageset'] = 'trainval' 70 | 71 | for element_obj in element_objs: 72 | class_name = element_obj.find('name').text 73 | if class_name 
not in classes_count: 74 | classes_count[class_name] = 1 75 | else: 76 | classes_count[class_name] += 1 77 | 78 | if class_name not in class_mapping: 79 | class_mapping[class_name] = len(class_mapping) 80 | 81 | obj_bbox = element_obj.find('bndbox') 82 | x1 = int(round(float(obj_bbox.find('xmin').text))) 83 | y1 = int(round(float(obj_bbox.find('ymin').text))) 84 | x2 = int(round(float(obj_bbox.find('xmax').text))) 85 | y2 = int(round(float(obj_bbox.find('ymax').text))) 86 | difficulty = int(element_obj.find('difficult').text) == 1 87 | annotation_data['bboxes'].append( 88 | {'class': class_name, 'x1': x1, 'x2': x2, 'y1': y1, 'y2': y2, 'difficult': difficulty}) 89 | all_imgs.append(annotation_data) 90 | 91 | if visualise: 92 | img = cv2.imread(annotation_data['filepath']) 93 | for bbox in annotation_data['bboxes']: 94 | cv2.rectangle(img, (bbox['x1'], bbox['y1']), (bbox[ 95 | 'x2'], bbox['y2']), (0, 0, 255)) 96 | cv2.imshow('img', img) 97 | cv2.waitKey(0) 98 | 99 | except Exception as e: 100 | print(e) 101 | continue 102 | return all_imgs, classes_count, class_mapping 103 | -------------------------------------------------------------------------------- /keras_frcnn/resnet.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | '''ResNet50 model for Keras. 3 | # Reference: 4 | - [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) 5 | Adapted from code contributed by BigMoyan. 6 | ''' 7 | 8 | from __future__ import print_function 9 | from __future__ import absolute_import 10 | 11 | from keras.layers import Input, Add, Dense, Activation, Flatten, Convolution2D, MaxPooling2D, ZeroPadding2D, \ 12 | AveragePooling2D, TimeDistributed 13 | 14 | from keras import backend as K 15 | 16 | from keras_frcnn.RoiPoolingConv import RoiPoolingConv 17 | from keras_frcnn.FixedBatchNormalization import FixedBatchNormalization 18 | 19 | def identity_block(input_tensor, kernel_size, filters, stage, block, trainable=True): 20 | 21 | nb_filter1, nb_filter2, nb_filter3 = filters 22 | 23 | if K.image_dim_ordering() == 'tf': 24 | bn_axis = 3 25 | else: 26 | bn_axis = 1 27 | 28 | conv_name_base = 'res' + str(stage) + block + '_branch' 29 | bn_name_base = 'bn' + str(stage) + block + '_branch' 30 | 31 | x = Convolution2D(nb_filter1, (1, 1), name=conv_name_base + '2a', trainable=trainable)(input_tensor) 32 | x = FixedBatchNormalization(axis=bn_axis, name=bn_name_base + '2a')(x) 33 | x = Activation('relu')(x) 34 | 35 | x = Convolution2D(nb_filter2, (kernel_size, kernel_size), padding='same', name=conv_name_base + '2b', trainable=trainable)(x) 36 | x = FixedBatchNormalization(axis=bn_axis, name=bn_name_base + '2b')(x) 37 | x = Activation('relu')(x) 38 | 39 | x = Convolution2D(nb_filter3, (1, 1), name=conv_name_base + '2c', trainable=trainable)(x) 40 | x = FixedBatchNormalization(axis=bn_axis, name=bn_name_base + '2c')(x) 41 | 42 | x = Add()([x, input_tensor]) 43 | x = Activation('relu')(x) 44 | return x 45 | 46 | 47 | def identity_block_td(input_tensor, kernel_size, filters, stage, block, trainable=True): 48 | 49 | # identity block time distributed 50 | 51 | nb_filter1, nb_filter2, nb_filter3 = filters 52 | if K.image_dim_ordering() == 'tf': 53 | bn_axis = 3 54 | else: 55 | bn_axis = 1 56 | 57 | conv_name_base = 'res' + str(stage) + block + '_branch' 58 | bn_name_base = 'bn' + str(stage) + block + '_branch' 59 | 60 | x = TimeDistributed(Convolution2D(nb_filter1, (1, 1), trainable=trainable, kernel_initializer='normal'), 
name=conv_name_base + '2a')(input_tensor) 61 | x = TimeDistributed(FixedBatchNormalization(axis=bn_axis), name=bn_name_base + '2a')(x) 62 | x = Activation('relu')(x) 63 | 64 | x = TimeDistributed(Convolution2D(nb_filter2, (kernel_size, kernel_size), trainable=trainable, kernel_initializer='normal',padding='same'), name=conv_name_base + '2b')(x) 65 | x = TimeDistributed(FixedBatchNormalization(axis=bn_axis), name=bn_name_base + '2b')(x) 66 | x = Activation('relu')(x) 67 | 68 | x = TimeDistributed(Convolution2D(nb_filter3, (1, 1), trainable=trainable, kernel_initializer='normal'), name=conv_name_base + '2c')(x) 69 | x = TimeDistributed(FixedBatchNormalization(axis=bn_axis), name=bn_name_base + '2c')(x) 70 | 71 | x = Add()([x, input_tensor]) 72 | x = Activation('relu')(x) 73 | 74 | return x 75 | 76 | def conv_block(input_tensor, kernel_size, filters, stage, block, strides=(2, 2), trainable=True): 77 | 78 | nb_filter1, nb_filter2, nb_filter3 = filters 79 | if K.image_dim_ordering() == 'tf': 80 | bn_axis = 3 81 | else: 82 | bn_axis = 1 83 | 84 | conv_name_base = 'res' + str(stage) + block + '_branch' 85 | bn_name_base = 'bn' + str(stage) + block + '_branch' 86 | 87 | x = Convolution2D(nb_filter1, (1, 1), strides=strides, name=conv_name_base + '2a', trainable=trainable)(input_tensor) 88 | x = FixedBatchNormalization(axis=bn_axis, name=bn_name_base + '2a')(x) 89 | x = Activation('relu')(x) 90 | 91 | x = Convolution2D(nb_filter2, (kernel_size, kernel_size), padding='same', name=conv_name_base + '2b', trainable=trainable)(x) 92 | x = FixedBatchNormalization(axis=bn_axis, name=bn_name_base + '2b')(x) 93 | x = Activation('relu')(x) 94 | 95 | x = Convolution2D(nb_filter3, (1, 1), name=conv_name_base + '2c', trainable=trainable)(x) 96 | x = FixedBatchNormalization(axis=bn_axis, name=bn_name_base + '2c')(x) 97 | 98 | shortcut = Convolution2D(nb_filter3, (1, 1), strides=strides, name=conv_name_base + '1', trainable=trainable)(input_tensor) 99 | shortcut = FixedBatchNormalization(axis=bn_axis, name=bn_name_base + '1')(shortcut) 100 | 101 | x = Add()([x, shortcut]) 102 | x = Activation('relu')(x) 103 | return x 104 | 105 | 106 | def conv_block_td(input_tensor, kernel_size, filters, stage, block, input_shape, strides=(2, 2), trainable=True): 107 | 108 | # conv block time distributed 109 | 110 | nb_filter1, nb_filter2, nb_filter3 = filters 111 | if K.image_dim_ordering() == 'tf': 112 | bn_axis = 3 113 | else: 114 | bn_axis = 1 115 | 116 | conv_name_base = 'res' + str(stage) + block + '_branch' 117 | bn_name_base = 'bn' + str(stage) + block + '_branch' 118 | 119 | x = TimeDistributed(Convolution2D(nb_filter1, (1, 1), strides=strides, trainable=trainable, kernel_initializer='normal'), input_shape=input_shape, name=conv_name_base + '2a')(input_tensor) 120 | x = TimeDistributed(FixedBatchNormalization(axis=bn_axis), name=bn_name_base + '2a')(x) 121 | x = Activation('relu')(x) 122 | 123 | x = TimeDistributed(Convolution2D(nb_filter2, (kernel_size, kernel_size), padding='same', trainable=trainable, kernel_initializer='normal'), name=conv_name_base + '2b')(x) 124 | x = TimeDistributed(FixedBatchNormalization(axis=bn_axis), name=bn_name_base + '2b')(x) 125 | x = Activation('relu')(x) 126 | 127 | x = TimeDistributed(Convolution2D(nb_filter3, (1, 1), kernel_initializer='normal'), name=conv_name_base + '2c', trainable=trainable)(x) 128 | x = TimeDistributed(FixedBatchNormalization(axis=bn_axis), name=bn_name_base + '2c')(x) 129 | 130 | shortcut = TimeDistributed(Convolution2D(nb_filter3, (1, 1), strides=strides, 
trainable=trainable, kernel_initializer='normal'), name=conv_name_base + '1')(input_tensor) 131 | shortcut = TimeDistributed(FixedBatchNormalization(axis=bn_axis), name=bn_name_base + '1')(shortcut) 132 | 133 | x = Add()([x, shortcut]) 134 | x = Activation('relu')(x) 135 | return x 136 | 137 | def nn_base(input_tensor=None, trainable=False): 138 | 139 | # Determine proper input shape 140 | if K.image_dim_ordering() == 'th': 141 | input_shape = (3, None, None) 142 | else: 143 | input_shape = (None, None, 3) 144 | 145 | if input_tensor is None: 146 | img_input = Input(shape=input_shape) 147 | else: 148 | if not K.is_keras_tensor(input_tensor): 149 | img_input = Input(tensor=input_tensor, shape=input_shape) 150 | else: 151 | img_input = input_tensor 152 | 153 | if K.image_dim_ordering() == 'tf': 154 | bn_axis = 3 155 | else: 156 | bn_axis = 1 157 | 158 | x = ZeroPadding2D((3, 3))(img_input) 159 | 160 | x = Convolution2D(64, (7, 7), strides=(2, 2), name='conv1', trainable = trainable)(x) 161 | x = FixedBatchNormalization(axis=bn_axis, name='bn_conv1')(x) 162 | x = Activation('relu')(x) 163 | x = MaxPooling2D((3, 3), strides=(2, 2))(x) 164 | 165 | x = conv_block(x, 3, [64, 64, 256], stage=2, block='a', strides=(1, 1), trainable = trainable) 166 | x = identity_block(x, 3, [64, 64, 256], stage=2, block='b', trainable = trainable) 167 | x = identity_block(x, 3, [64, 64, 256], stage=2, block='c', trainable = trainable) 168 | 169 | x = conv_block(x, 3, [128, 128, 512], stage=3, block='a', trainable = trainable) 170 | x = identity_block(x, 3, [128, 128, 512], stage=3, block='b', trainable = trainable) 171 | x = identity_block(x, 3, [128, 128, 512], stage=3, block='c', trainable = trainable) 172 | x = identity_block(x, 3, [128, 128, 512], stage=3, block='d', trainable = trainable) 173 | 174 | x = conv_block(x, 3, [256, 256, 1024], stage=4, block='a', trainable = trainable) 175 | x = identity_block(x, 3, [256, 256, 1024], stage=4, block='b', trainable = trainable) 176 | x = identity_block(x, 3, [256, 256, 1024], stage=4, block='c', trainable = trainable) 177 | x = identity_block(x, 3, [256, 256, 1024], stage=4, block='d', trainable = trainable) 178 | x = identity_block(x, 3, [256, 256, 1024], stage=4, block='e', trainable = trainable) 179 | x = identity_block(x, 3, [256, 256, 1024], stage=4, block='f', trainable = trainable) 180 | 181 | return x 182 | 183 | 184 | def classifier_layers(x, input_shape, trainable=False): 185 | 186 | # compile times on theano tend to be very high, so we use smaller ROI pooling regions to workaround 187 | # (hence a smaller stride in the region that follows the ROI pool) 188 | if K.backend() == 'tensorflow': 189 | x = conv_block_td(x, 3, [512, 512, 2048], stage=5, block='a', input_shape=input_shape, strides=(2, 2), trainable=trainable) 190 | elif K.backend() == 'theano': 191 | x = conv_block_td(x, 3, [512, 512, 2048], stage=5, block='a', input_shape=input_shape, strides=(1, 1), trainable=trainable) 192 | 193 | x = identity_block_td(x, 3, [512, 512, 2048], stage=5, block='b', trainable=trainable) 194 | x = identity_block_td(x, 3, [512, 512, 2048], stage=5, block='c', trainable=trainable) 195 | x = TimeDistributed(AveragePooling2D((7, 7)), name='avg_pool')(x) 196 | 197 | return x 198 | 199 | 200 | def rpn(base_layers,num_anchors): 201 | 202 | x = Convolution2D(512, (3, 3), padding='same', activation='relu', kernel_initializer='normal', name='rpn_conv1')(base_layers) 203 | 204 | x_class = Convolution2D(num_anchors, (1, 1), activation='sigmoid', kernel_initializer='uniform', 
name='rpn_out_class')(x) 205 | x_regr = Convolution2D(num_anchors * 4, (1, 1), activation='linear', kernel_initializer='zero', name='rpn_out_regress')(x) 206 | 207 | return [x_class, x_regr, base_layers] 208 | 209 | def classifier(base_layers, input_rois, num_rois, nb_classes = 21, trainable=False): 210 | 211 | # compile times on theano tend to be very high, so we use smaller ROI pooling regions to workaround 212 | 213 | if K.backend() == 'tensorflow': 214 | pooling_regions = 14 215 | input_shape = (num_rois,14,14,1024) 216 | elif K.backend() == 'theano': 217 | pooling_regions = 7 218 | input_shape = (num_rois,1024,7,7) 219 | 220 | out_roi_pool = RoiPoolingConv(pooling_regions, num_rois)([base_layers, input_rois]) 221 | out = classifier_layers(out_roi_pool, input_shape=input_shape, trainable=True) 222 | 223 | out = TimeDistributed(Flatten())(out) 224 | 225 | out_class = TimeDistributed(Dense(nb_classes, activation='softmax', kernel_initializer='zero'), name='dense_class_{}'.format(nb_classes))(out) 226 | # note: no regression target for bg class 227 | out_regr = TimeDistributed(Dense(4 * (nb_classes-1), activation='linear', kernel_initializer='zero'), name='dense_regress_{}'.format(nb_classes))(out) 228 | return [out_class, out_regr] 229 | 230 | -------------------------------------------------------------------------------- /keras_frcnn/roi_helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pdb 3 | import math 4 | import data_generators 5 | import copy 6 | 7 | 8 | def calc_iou(R, img_data, C, class_mapping): 9 | 10 | bboxes = img_data['bboxes'] 11 | (width, height) = (img_data['width'], img_data['height']) 12 | # get image dimensions for resizing 13 | (resized_width, resized_height) = data_generators.get_new_img_size(width, height, C.im_size) 14 | 15 | gta = np.zeros((len(bboxes), 4)) 16 | 17 | for bbox_num, bbox in enumerate(bboxes): 18 | # get the GT box coordinates, and resize to account for image resizing 19 | gta[bbox_num, 0] = int(round(bbox['x1'] * (resized_width / float(width))/C.rpn_stride)) 20 | gta[bbox_num, 1] = int(round(bbox['x2'] * (resized_width / float(width))/C.rpn_stride)) 21 | gta[bbox_num, 2] = int(round(bbox['y1'] * (resized_height / float(height))/C.rpn_stride)) 22 | gta[bbox_num, 3] = int(round(bbox['y2'] * (resized_height / float(height))/C.rpn_stride)) 23 | 24 | x_roi = [] 25 | y_class_num = [] 26 | y_class_regr_coords = [] 27 | y_class_regr_label = [] 28 | 29 | for ix in range(R.shape[0]): 30 | (x1, y1, x2, y2) = R[ix, :] 31 | x1 = int(round(x1)) 32 | y1 = int(round(y1)) 33 | x2 = int(round(x2)) 34 | y2 = int(round(y2)) 35 | 36 | best_iou = 0.0 37 | best_bbox = -1 38 | for bbox_num in range(len(bboxes)): 39 | curr_iou = data_generators.iou([gta[bbox_num, 0], gta[bbox_num, 2], gta[bbox_num, 1], gta[bbox_num, 3]], [x1, y1, x2, y2]) 40 | if curr_iou > best_iou: 41 | best_iou = curr_iou 42 | best_bbox = bbox_num 43 | 44 | if best_iou < C.classifier_min_overlap: 45 | continue 46 | else: 47 | w = x2 - x1 48 | h = y2 - y1 49 | x_roi.append([x1, y1, w, h]) 50 | 51 | if C.classifier_min_overlap <= best_iou < C.classifier_max_overlap: 52 | # hard negative example 53 | cls_name = 'bg' 54 | elif C.classifier_max_overlap <= best_iou: 55 | cls_name = bboxes[best_bbox]['class'] 56 | cxg = (gta[best_bbox, 0] + gta[best_bbox, 1]) / 2.0 57 | cyg = (gta[best_bbox, 2] + gta[best_bbox, 3]) / 2.0 58 | 59 | cx = x1 + w / 2.0 60 | cy = y1 + h / 2.0 61 | 62 | tx = (cxg - cx) / float(w) 63 | ty = (cyg - cy) / 
float(h) 64 | tw = np.log((gta[best_bbox, 1] - gta[best_bbox, 0]) / float(w)) 65 | th = np.log((gta[best_bbox, 3] - gta[best_bbox, 2]) / float(h)) 66 | else: 67 | print('roi = {}'.format(best_iou)) 68 | raise RuntimeError 69 | 70 | class_num = class_mapping[cls_name] 71 | class_label = len(class_mapping) * [0] 72 | class_label[class_num] = 1 73 | y_class_num.append(copy.deepcopy(class_label)) 74 | coords = [0] * 4 * (len(class_mapping) - 1) 75 | labels = [0] * 4 * (len(class_mapping) - 1) 76 | if cls_name != 'bg': 77 | label_pos = 4 * class_num 78 | sx, sy, sw, sh = C.classifier_regr_std 79 | coords[label_pos:4+label_pos] = [sx*tx, sy*ty, sw*tw, sh*th] 80 | labels[label_pos:4+label_pos] = [1, 1, 1, 1] 81 | y_class_regr_coords.append(copy.deepcopy(coords)) 82 | y_class_regr_label.append(copy.deepcopy(labels)) 83 | else: 84 | y_class_regr_coords.append(copy.deepcopy(coords)) 85 | y_class_regr_label.append(copy.deepcopy(labels)) 86 | 87 | if len(x_roi) == 0: 88 | return None, None, None 89 | 90 | X = np.array(x_roi) 91 | Y1 = np.array(y_class_num) 92 | Y2 = np.concatenate([np.array(y_class_regr_label),np.array(y_class_regr_coords)],axis=1) 93 | 94 | return np.expand_dims(X, axis=0), np.expand_dims(Y1, axis=0), np.expand_dims(Y2, axis=0) 95 | 96 | def apply_regr(x, y, w, h, tx, ty, tw, th): 97 | try: 98 | cx = x + w/2. 99 | cy = y + h/2. 100 | cx1 = tx * w + cx 101 | cy1 = ty * h + cy 102 | w1 = math.exp(tw) * w 103 | h1 = math.exp(th) * h 104 | x1 = cx1 - w1/2. 105 | y1 = cy1 - h1/2. 106 | x1 = int(round(x1)) 107 | y1 = int(round(y1)) 108 | w1 = int(round(w1)) 109 | h1 = int(round(h1)) 110 | 111 | return x1, y1, w1, h1 112 | 113 | except ValueError: 114 | return x, y, w, h 115 | except OverflowError: 116 | return x, y, w, h 117 | except Exception as e: 118 | print(e) 119 | return x, y, w, h 120 | 121 | def apply_regr_np(X, T): 122 | try: 123 | x = X[0, :, :] 124 | y = X[1, :, :] 125 | w = X[2, :, :] 126 | h = X[3, :, :] 127 | 128 | tx = T[0, :, :] 129 | ty = T[1, :, :] 130 | tw = T[2, :, :] 131 | th = T[3, :, :] 132 | 133 | cx = x + w/2. 134 | cy = y + h/2. 135 | cx1 = tx * w + cx 136 | cy1 = ty * h + cy 137 | 138 | w1 = np.exp(tw) * w 139 | h1 = np.exp(th) * h 140 | x1 = cx1 - w1/2. 141 | y1 = cy1 - h1/2. 
142 | 143 | x1 = np.round(x1) 144 | y1 = np.round(y1) 145 | w1 = np.round(w1) 146 | h1 = np.round(h1) 147 | return np.stack([x1, y1, w1, h1]) 148 | except Exception as e: 149 | print(e) 150 | return X 151 | 152 | def non_max_suppression_fast(boxes, probs, overlap_thresh=0.9, max_boxes=300): 153 | # code used from here: http://www.pyimagesearch.com/2015/02/16/faster-non-maximum-suppression-python/ 154 | # if there are no boxes, return an empty list 155 | if len(boxes) == 0: 156 | return [] 157 | 158 | # grab the coordinates of the bounding boxes 159 | x1 = boxes[:, 0] 160 | y1 = boxes[:, 1] 161 | x2 = boxes[:, 2] 162 | y2 = boxes[:, 3] 163 | 164 | np.testing.assert_array_less(x1, x2) 165 | np.testing.assert_array_less(y1, y2) 166 | 167 | # if the bounding boxes integers, convert them to floats -- 168 | # this is important since we'll be doing a bunch of divisions 169 | if boxes.dtype.kind == "i": 170 | boxes = boxes.astype("float") 171 | 172 | # initialize the list of picked indexes 173 | pick = [] 174 | 175 | # sort the bounding boxes 176 | idxs = np.argsort(probs) 177 | 178 | # keep looping while some indexes still remain in the indexes 179 | # list 180 | while len(idxs) > 0: 181 | # grab the last index in the indexes list and add the 182 | # index value to the list of picked indexes 183 | last = len(idxs) - 1 184 | i = idxs[last] 185 | pick.append(i) 186 | 187 | # find the intersection 188 | 189 | xx1_int = np.maximum(x1[i], x1[idxs[:last]]) 190 | yy1_int = np.maximum(y1[i], y1[idxs[:last]]) 191 | xx2_int = np.minimum(x2[i], x2[idxs[:last]]) 192 | yy2_int = np.minimum(y2[i], y2[idxs[:last]]) 193 | 194 | # find the union 195 | xx1_un = np.minimum(x1[i], x1[idxs[:last]]) 196 | yy1_un = np.minimum(y1[i], y1[idxs[:last]]) 197 | xx2_un = np.maximum(x2[i], x2[idxs[:last]]) 198 | yy2_un = np.maximum(y2[i], y2[idxs[:last]]) 199 | 200 | # compute the width and height of the bounding box 201 | ww_int = np.maximum(0, xx2_int - xx1_int) 202 | hh_int = np.maximum(0, yy2_int - yy1_int) 203 | 204 | ww_un = np.maximum(0, xx2_un - xx1_un) 205 | hh_un = np.maximum(0, yy2_un - yy1_un) 206 | 207 | # compute the ratio of overlap 208 | overlap = (ww_int*hh_int)/(ww_un*hh_un + 1e-9) 209 | 210 | # delete all indexes from the index list that have 211 | idxs = np.delete(idxs, np.concatenate(([last], 212 | np.where(overlap > overlap_thresh)[0]))) 213 | 214 | if len(pick) >= max_boxes: 215 | break 216 | 217 | # return only the bounding boxes that were picked using the integer data type 218 | boxes = boxes[pick].astype("int") 219 | probs = probs[pick] 220 | return boxes, probs 221 | 222 | import time 223 | def rpn_to_roi(rpn_layer, regr_layer, C, dim_ordering, use_regr=True, max_boxes=300,overlap_thresh=0.9): 224 | 225 | regr_layer = regr_layer / C.std_scaling 226 | 227 | anchor_sizes = C.anchor_box_scales 228 | anchor_ratios = C.anchor_box_ratios 229 | 230 | assert rpn_layer.shape[0] == 1 231 | 232 | if dim_ordering == 'th': 233 | (rows,cols) = rpn_layer.shape[2:] 234 | 235 | elif dim_ordering == 'tf': 236 | (rows, cols) = rpn_layer.shape[1:3] 237 | 238 | curr_layer = 0 239 | if dim_ordering == 'tf': 240 | A = np.zeros((4, rpn_layer.shape[1], rpn_layer.shape[2], rpn_layer.shape[3])) 241 | elif dim_ordering == 'th': 242 | A = np.zeros((4, rpn_layer.shape[2], rpn_layer.shape[3], rpn_layer.shape[1])) 243 | 244 | for anchor_size in anchor_sizes: 245 | for anchor_ratio in anchor_ratios: 246 | 247 | anchor_x = (anchor_size * anchor_ratio[0])/C.rpn_stride 248 | anchor_y = (anchor_size * anchor_ratio[1])/C.rpn_stride 249 | 
if dim_ordering == 'th': 250 | regr = regr_layer[0, 4 * curr_layer:4 * curr_layer + 4, :, :] 251 | else: 252 | regr = regr_layer[0, :, :, 4 * curr_layer:4 * curr_layer + 4] 253 | regr = np.transpose(regr, (2, 0, 1)) 254 | 255 | X, Y = np.meshgrid(np.arange(cols),np. arange(rows)) 256 | 257 | A[0, :, :, curr_layer] = X - anchor_x/2 258 | A[1, :, :, curr_layer] = Y - anchor_y/2 259 | A[2, :, :, curr_layer] = anchor_x 260 | A[3, :, :, curr_layer] = anchor_y 261 | 262 | if use_regr: 263 | A[:, :, :, curr_layer] = apply_regr_np(A[:, :, :, curr_layer], regr) 264 | 265 | A[2, :, :, curr_layer] = np.maximum(1, A[2, :, :, curr_layer]) 266 | A[3, :, :, curr_layer] = np.maximum(1, A[3, :, :, curr_layer]) 267 | A[2, :, :, curr_layer] += A[0, :, :, curr_layer] 268 | A[3, :, :, curr_layer] += A[1, :, :, curr_layer] 269 | 270 | A[0, :, :, curr_layer] = np.maximum(0, A[0, :, :, curr_layer]) 271 | A[1, :, :, curr_layer] = np.maximum(0, A[1, :, :, curr_layer]) 272 | A[2, :, :, curr_layer] = np.minimum(cols-1, A[2, :, :, curr_layer]) 273 | A[3, :, :, curr_layer] = np.minimum(rows-1, A[3, :, :, curr_layer]) 274 | 275 | curr_layer += 1 276 | 277 | all_boxes = np.reshape(A.transpose((0, 3, 1,2)), (4, -1)).transpose((1, 0)) 278 | all_probs = rpn_layer.transpose((0, 3, 1, 2)).reshape((-1)) 279 | 280 | x1 = all_boxes[:, 0] 281 | y1 = all_boxes[:, 1] 282 | x2 = all_boxes[:, 2] 283 | y2 = all_boxes[:, 3] 284 | 285 | idxs = np.where((x1 - x2 >= 0) | (y1 - y2 >= 0)) 286 | 287 | all_boxes = np.delete(all_boxes, idxs, 0) 288 | all_probs = np.delete(all_probs, idxs, 0) 289 | 290 | result = non_max_suppression_fast(all_boxes, all_probs, overlap_thresh=overlap_thresh, max_boxes=max_boxes)[0] 291 | 292 | return result 293 | -------------------------------------------------------------------------------- /keras_frcnn/simple_parser.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | 4 | def get_data(input_path): 5 | found_bg = False 6 | all_imgs = {} 7 | 8 | classes_count = {} 9 | 10 | class_mapping = {} 11 | 12 | visualise = True 13 | 14 | with open(input_path,'r') as f: 15 | 16 | print('Parsing annotation files') 17 | 18 | for line in f: 19 | line_split = line.strip().split(',') 20 | (filename,x1,y1,x2,y2,class_name) = line_split 21 | 22 | if class_name not in classes_count: 23 | classes_count[class_name] = 1 24 | else: 25 | classes_count[class_name] += 1 26 | 27 | if class_name not in class_mapping: 28 | if class_name == 'bg' and found_bg == False: 29 | print('Found class name with special name bg. 
Will be treated as a background region (this is usually for hard negative mining).') 30 | found_bg = True 31 | class_mapping[class_name] = len(class_mapping) 32 | 33 | if filename not in all_imgs: 34 | all_imgs[filename] = {} 35 | 36 | img = cv2.imread(filename) 37 | (rows,cols) = img.shape[:2] 38 | all_imgs[filename]['filepath'] = filename 39 | all_imgs[filename]['width'] = cols 40 | all_imgs[filename]['height'] = rows 41 | all_imgs[filename]['bboxes'] = [] 42 | if np.random.randint(0,6) > 0: 43 | all_imgs[filename]['imageset'] = 'trainval' 44 | else: 45 | all_imgs[filename]['imageset'] = 'test' 46 | 47 | all_imgs[filename]['bboxes'].append({'class': class_name, 'x1': int(x1), 'x2': int(x2), 'y1': int(y1), 'y2': int(y2)}) 48 | 49 | 50 | all_data = [] 51 | for key in all_imgs: 52 | all_data.append(all_imgs[key]) 53 | 54 | # make sure the bg class is last in the list 55 | if found_bg: 56 | if class_mapping['bg'] != len(class_mapping) - 1: 57 | key_to_switch = [key for key in class_mapping.keys() if class_mapping[key] == len(class_mapping)-1][0] 58 | val_to_switch = class_mapping['bg'] 59 | class_mapping['bg'] = len(class_mapping) - 1 60 | class_mapping[key_to_switch] = val_to_switch 61 | 62 | return all_data, classes_count, class_mapping 63 | 64 | 65 | -------------------------------------------------------------------------------- /measure_map.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import numpy as np 4 | import sys 5 | import pickle 6 | from optparse import OptionParser 7 | import time 8 | from keras_frcnn import config 9 | import keras_frcnn.resnet as nn 10 | from keras import backend as K 11 | from keras.layers import Input 12 | from keras.models import Model 13 | from keras_frcnn import roi_helpers 14 | from keras_frcnn import data_generators 15 | from sklearn.metrics import average_precision_score 16 | 17 | 18 | def get_map(pred, gt, f): 19 | T = {} 20 | P = {} 21 | fx, fy = f 22 | 23 | for bbox in gt: 24 | bbox['bbox_matched'] = False 25 | 26 | pred_probs = np.array([s['prob'] for s in pred]) 27 | box_idx_sorted_by_prob = np.argsort(pred_probs)[::-1] 28 | 29 | for box_idx in box_idx_sorted_by_prob: 30 | pred_box = pred[box_idx] 31 | pred_class = pred_box['class'] 32 | pred_x1 = pred_box['x1'] 33 | pred_x2 = pred_box['x2'] 34 | pred_y1 = pred_box['y1'] 35 | pred_y2 = pred_box['y2'] 36 | pred_prob = pred_box['prob'] 37 | if pred_class not in P: 38 | P[pred_class] = [] 39 | T[pred_class] = [] 40 | P[pred_class].append(pred_prob) 41 | found_match = False 42 | 43 | for gt_box in gt: 44 | gt_class = gt_box['class'] 45 | gt_x1 = gt_box['x1']/fx 46 | gt_x2 = gt_box['x2']/fx 47 | gt_y1 = gt_box['y1']/fy 48 | gt_y2 = gt_box['y2']/fy 49 | gt_seen = gt_box['bbox_matched'] 50 | if gt_class != pred_class: 51 | continue 52 | if gt_seen: 53 | continue 54 | iou = data_generators.iou((pred_x1, pred_y1, pred_x2, pred_y2), (gt_x1, gt_y1, gt_x2, gt_y2)) 55 | if iou >= 0.5: 56 | found_match = True 57 | gt_box['bbox_matched'] = True 58 | break 59 | else: 60 | continue 61 | 62 | T[pred_class].append(int(found_match)) 63 | 64 | for gt_box in gt: 65 | if not gt_box['bbox_matched'] and not gt_box['difficult']: 66 | if gt_box['class'] not in P: 67 | P[gt_box['class']] = [] 68 | T[gt_box['class']] = [] 69 | 70 | T[gt_box['class']].append(1) 71 | P[gt_box['class']].append(0) 72 | 73 | #import pdb 74 | #pdb.set_trace() 75 | return T, P 76 | 77 | sys.setrecursionlimit(40000) 78 | 79 | parser = OptionParser() 80 | 81 | 
parser.add_option("-p", "--path", dest="test_path", help="Path to test data.") 82 | parser.add_option("-n", "--num_rois", dest="num_rois", 83 | help="Number of ROIs per iteration. Higher means more memory use.", default=32) 84 | parser.add_option("--config_filename", dest="config_filename", help= 85 | "Location to read the metadata related to the training (generated when training).", 86 | default="config.pickle") 87 | parser.add_option("-o", "--parser", dest="parser", help="Parser to use. One of simple or pascal_voc", 88 | default="pascal_voc"), 89 | 90 | (options, args) = parser.parse_args() 91 | 92 | if not options.test_path: # if filename is not given 93 | parser.error('Error: path to test data must be specified. Pass --path to command line') 94 | 95 | 96 | if options.parser == 'pascal_voc': 97 | from keras_frcnn.pascal_voc_parser import get_data 98 | elif options.parser == 'simple': 99 | from keras_frcnn.simple_parser import get_data 100 | else: 101 | raise ValueError("Command line option parser must be one of 'pascal_voc' or 'simple'") 102 | 103 | config_output_filename = options.config_filename 104 | 105 | with open(config_output_filename, 'r') as f_in: 106 | C = pickle.load(f_in) 107 | 108 | # turn off any data augmentation at test time 109 | C.use_horizontal_flips = False 110 | C.use_vertical_flips = False 111 | C.rot_90 = False 112 | 113 | img_path = options.test_path 114 | 115 | 116 | def format_img(img, C): 117 | img_min_side = float(C.im_size) 118 | (height,width,_) = img.shape 119 | 120 | if width <= height: 121 | f = img_min_side/width 122 | new_height = int(f * height) 123 | new_width = int(img_min_side) 124 | else: 125 | f = img_min_side/height 126 | new_width = int(f * width) 127 | new_height = int(img_min_side) 128 | fx = width/float(new_width) 129 | fy = height/float(new_height) 130 | img = cv2.resize(img, (new_width, new_height), interpolation=cv2.INTER_CUBIC) 131 | img = img[:, :, (2, 1, 0)] 132 | img = img.astype(np.float32) 133 | img[:, :, 0] -= C.img_channel_mean[0] 134 | img[:, :, 1] -= C.img_channel_mean[1] 135 | img[:, :, 2] -= C.img_channel_mean[2] 136 | img /= C.img_scaling_factor 137 | img = np.transpose(img, (2, 0, 1)) 138 | img = np.expand_dims(img, axis=0) 139 | return img, fx, fy 140 | 141 | 142 | class_mapping = C.class_mapping 143 | 144 | if 'bg' not in class_mapping: 145 | class_mapping['bg'] = len(class_mapping) 146 | 147 | class_mapping = {v: k for k, v in class_mapping.iteritems()} 148 | print(class_mapping) 149 | class_to_color = {class_mapping[v]: np.random.randint(0, 255, 3) for v in class_mapping} 150 | C.num_rois = int(options.num_rois) 151 | 152 | if K.image_dim_ordering() == 'th': 153 | input_shape_img = (3, None, None) 154 | input_shape_features = (1024, None, None) 155 | else: 156 | input_shape_img = (None, None, 3) 157 | input_shape_features = (None, None, 1024) 158 | 159 | 160 | img_input = Input(shape=input_shape_img) 161 | roi_input = Input(shape=(C.num_rois, 4)) 162 | feature_map_input = Input(shape=input_shape_features) 163 | 164 | # define the base network (resnet here, can be VGG, Inception, etc) 165 | shared_layers = nn.nn_base(img_input, trainable=True) 166 | 167 | # define the RPN, built on the base layers 168 | num_anchors = len(C.anchor_box_scales) * len(C.anchor_box_ratios) 169 | rpn_layers = nn.rpn(shared_layers, num_anchors) 170 | 171 | classifier = nn.classifier(feature_map_input, roi_input, C.num_rois, nb_classes=len(class_mapping), trainable=True) 172 | 173 | model_rpn = Model(img_input, rpn_layers) 174 | 
model_classifier_only = Model([feature_map_input, roi_input], classifier) 175 | 176 | model_classifier = Model([feature_map_input, roi_input], classifier) 177 | 178 | model_rpn.load_weights(C.model_path, by_name=True) 179 | model_classifier.load_weights(C.model_path, by_name=True) 180 | 181 | model_rpn.compile(optimizer='sgd', loss='mse') 182 | model_classifier.compile(optimizer='sgd', loss='mse') 183 | 184 | all_imgs, _, _ = get_data(options.test_path) 185 | test_imgs = [s for s in all_imgs if s['imageset'] == 'test'] 186 | 187 | 188 | T = {} 189 | P = {} 190 | for idx, img_data in enumerate(test_imgs): 191 | print('{}/{}'.format(idx,len(test_imgs))) 192 | st = time.time() 193 | filepath = img_data['filepath'] 194 | 195 | img = cv2.imread(filepath) 196 | 197 | X, fx, fy = format_img(img, C) 198 | 199 | if K.image_dim_ordering() == 'tf': 200 | X = np.transpose(X, (0, 2, 3, 1)) 201 | 202 | # get the feature maps and output from the RPN 203 | [Y1, Y2, F] = model_rpn.predict(X) 204 | 205 | R = roi_helpers.rpn_to_roi(Y1, Y2, C, K.image_dim_ordering(), overlap_thresh=0.7) 206 | 207 | # convert from (x1,y1,x2,y2) to (x,y,w,h) 208 | R[:, 2] -= R[:, 0] 209 | R[:, 3] -= R[:, 1] 210 | 211 | # apply the spatial pyramid pooling to the proposed regions 212 | bboxes = {} 213 | probs = {} 214 | 215 | for jk in range(R.shape[0] // C.num_rois + 1): 216 | ROIs = np.expand_dims(R[C.num_rois * jk:C.num_rois * (jk + 1), :], axis=0) 217 | if ROIs.shape[1] == 0: 218 | break 219 | 220 | if jk == R.shape[0] // C.num_rois: 221 | # pad R 222 | curr_shape = ROIs.shape 223 | target_shape = (curr_shape[0], C.num_rois, curr_shape[2]) 224 | ROIs_padded = np.zeros(target_shape).astype(ROIs.dtype) 225 | ROIs_padded[:, :curr_shape[1], :] = ROIs 226 | ROIs_padded[0, curr_shape[1]:, :] = ROIs[0, 0, :] 227 | ROIs = ROIs_padded 228 | 229 | [P_cls, P_regr] = model_classifier_only.predict([F, ROIs]) 230 | 231 | for ii in range(P_cls.shape[1]): 232 | 233 | if np.argmax(P_cls[0, ii, :]) == (P_cls.shape[2] - 1): 234 | continue 235 | 236 | cls_name = class_mapping[np.argmax(P_cls[0, ii, :])] 237 | 238 | if cls_name not in bboxes: 239 | bboxes[cls_name] = [] 240 | probs[cls_name] = [] 241 | 242 | (x, y, w, h) = ROIs[0, ii, :] 243 | 244 | cls_num = np.argmax(P_cls[0, ii, :]) 245 | try: 246 | (tx, ty, tw, th) = P_regr[0, ii, 4 * cls_num:4 * (cls_num + 1)] 247 | tx /= C.classifier_regr_std[0] 248 | ty /= C.classifier_regr_std[1] 249 | tw /= C.classifier_regr_std[2] 250 | th /= C.classifier_regr_std[3] 251 | x, y, w, h = roi_helpers.apply_regr(x, y, w, h, tx, ty, tw, th) 252 | except: 253 | pass 254 | bboxes[cls_name].append([16 * x, 16 * y, 16 * (x + w), 16 * (y + h)]) 255 | probs[cls_name].append(np.max(P_cls[0, ii, :])) 256 | 257 | all_dets = [] 258 | 259 | for key in bboxes: 260 | bbox = np.array(bboxes[key]) 261 | 262 | new_boxes, new_probs = roi_helpers.non_max_suppression_fast(bbox, np.array(probs[key]), overlap_thresh=0.5) 263 | for jk in range(new_boxes.shape[0]): 264 | (x1, y1, x2, y2) = new_boxes[jk, :] 265 | det = {'x1': x1, 'x2': x2, 'y1': y1, 'y2': y2, 'class': key, 'prob': new_probs[jk]} 266 | all_dets.append(det) 267 | 268 | 269 | print('Elapsed time = {}'.format(time.time() - st)) 270 | t, p = get_map(all_dets, img_data['bboxes'], (fx, fy)) 271 | for key in t.keys(): 272 | if key not in T: 273 | T[key] = [] 274 | P[key] = [] 275 | T[key].extend(t[key]) 276 | P[key].extend(p[key]) 277 | all_aps = [] 278 | for key in T.keys(): 279 | ap = average_precision_score(T[key], P[key]) 280 | print('{} AP: {}'.format(key, ap)) 281 | 
all_aps.append(ap) 282 | print('mAP = {}'.format(np.mean(np.array(all_aps)))) 283 | #print(T) 284 | #print(P) -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | h5py 2 | Keras==2.0.3 3 | numpy 4 | opencv-python 5 | sklearn 6 | -------------------------------------------------------------------------------- /test_frcnn.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import numpy as np 4 | import sys 5 | import pickle 6 | from optparse import OptionParser 7 | import time 8 | from keras_frcnn import config 9 | import keras_frcnn.resnet as nn 10 | from keras import backend as K 11 | from keras.layers import Input 12 | from keras.models import Model 13 | from keras_frcnn import roi_helpers 14 | 15 | sys.setrecursionlimit(40000) 16 | 17 | parser = OptionParser() 18 | 19 | parser.add_option("-p", "--path", dest="test_path", help="Path to test data.") 20 | parser.add_option("-n", "--num_rois", dest="num_rois", 21 | help="Number of ROIs per iteration. Higher means more memory use.", default=32) 22 | parser.add_option("--config_filename", dest="config_filename", help= 23 | "Location to read the metadata related to the training (generated when training).", 24 | default="config.pickle") 25 | 26 | (options, args) = parser.parse_args() 27 | 28 | if not options.test_path: # if filename is not given 29 | parser.error('Error: path to test data must be specified. Pass --path to command line') 30 | 31 | 32 | config_output_filename = options.config_filename 33 | 34 | with open(config_output_filename, 'r') as f_in: 35 | C = pickle.load(f_in) 36 | 37 | # turn off any data augmentation at test time 38 | C.use_horizontal_flips = False 39 | C.use_vertical_flips = False 40 | C.rot_90 = False 41 | 42 | img_path = options.test_path 43 | 44 | 45 | def format_img(img, C): 46 | img_min_side = float(C.im_size) 47 | (height,width,_) = img.shape 48 | 49 | if width <= height: 50 | f = img_min_side/width 51 | new_height = int(f * height) 52 | new_width = int(img_min_side) 53 | else: 54 | f = img_min_side/height 55 | new_width = int(f * width) 56 | new_height = int(img_min_side) 57 | img = cv2.resize(img, (new_width, new_height), interpolation=cv2.INTER_CUBIC) 58 | img = img[:, :, (2, 1, 0)] 59 | img = img.astype(np.float32) 60 | img[:, :, 0] -= C.img_channel_mean[0] 61 | img[:, :, 1] -= C.img_channel_mean[1] 62 | img[:, :, 2] -= C.img_channel_mean[2] 63 | img /= C.img_scaling_factor 64 | img = np.transpose(img, (2, 0, 1)) 65 | img = np.expand_dims(img, axis=0) 66 | return img 67 | 68 | 69 | class_mapping = C.class_mapping 70 | 71 | if 'bg' not in class_mapping: 72 | class_mapping['bg'] = len(class_mapping) 73 | 74 | class_mapping = {v: k for k, v in class_mapping.iteritems()} 75 | print(class_mapping) 76 | class_to_color = {class_mapping[v]: np.random.randint(0, 255, 3) for v in class_mapping} 77 | C.num_rois = int(options.num_rois) 78 | 79 | if K.image_dim_ordering() == 'th': 80 | input_shape_img = (3, None, None) 81 | input_shape_features = (1024, None, None) 82 | else: 83 | input_shape_img = (None, None, 3) 84 | input_shape_features = (None, None, 1024) 85 | 86 | 87 | img_input = Input(shape=input_shape_img) 88 | roi_input = Input(shape=(C.num_rois, 4)) 89 | feature_map_input = Input(shape=input_shape_features) 90 | 91 | # define the base network (resnet here, can be VGG, Inception, etc) 92 | shared_layers = 
nn.nn_base(img_input, trainable=True) 93 | 94 | # define the RPN, built on the base layers 95 | num_anchors = len(C.anchor_box_scales) * len(C.anchor_box_ratios) 96 | rpn_layers = nn.rpn(shared_layers, num_anchors) 97 | 98 | classifier = nn.classifier(feature_map_input, roi_input, C.num_rois, nb_classes=len(class_mapping), trainable=True) 99 | 100 | model_rpn = Model(img_input, rpn_layers) 101 | model_classifier_only = Model([feature_map_input, roi_input], classifier) 102 | 103 | model_classifier = Model([feature_map_input, roi_input], classifier) 104 | 105 | model_rpn.load_weights(C.model_path, by_name=True) 106 | model_classifier.load_weights(C.model_path, by_name=True) 107 | 108 | model_rpn.compile(optimizer='sgd', loss='mse') 109 | model_classifier.compile(optimizer='sgd', loss='mse') 110 | 111 | all_imgs = [] 112 | 113 | classes = {} 114 | 115 | bbox_threshold = 0.8 116 | 117 | visualise = True 118 | 119 | for idx, img_name in enumerate(sorted(os.listdir(img_path))): 120 | if not img_name.lower().endswith(('.bmp', '.jpeg', '.jpg', '.png', '.tif', '.tiff')): 121 | continue 122 | print(img_name) 123 | st = time.time() 124 | filepath = os.path.join(img_path,img_name) 125 | 126 | img = cv2.imread(filepath) 127 | 128 | X = format_img(img, C) 129 | 130 | img_scaled = np.transpose(X.copy()[0, (2, 1, 0), :, :], (1, 2, 0)).copy() 131 | img_scaled[:, :, 0] += 123.68 132 | img_scaled[:, :, 1] += 116.779 133 | img_scaled[:, :, 2] += 103.939 134 | 135 | img_scaled = img_scaled.astype(np.uint8) 136 | 137 | if K.image_dim_ordering() == 'tf': 138 | X = np.transpose(X, (0, 2, 3, 1)) 139 | 140 | # get the feature maps and output from the RPN 141 | [Y1, Y2, F] = model_rpn.predict(X) 142 | 143 | 144 | R = roi_helpers.rpn_to_roi(Y1, Y2, C, K.image_dim_ordering(), overlap_thresh=0.7) 145 | 146 | # convert from (x1,y1,x2,y2) to (x,y,w,h) 147 | R[:, 2] -= R[:, 0] 148 | R[:, 3] -= R[:, 1] 149 | 150 | # apply the spatial pyramid pooling to the proposed regions 151 | bboxes = {} 152 | probs = {} 153 | 154 | for jk in range(R.shape[0]//C.num_rois + 1): 155 | ROIs = np.expand_dims(R[C.num_rois*jk:C.num_rois*(jk+1), :], axis=0) 156 | if ROIs.shape[1] == 0: 157 | break 158 | 159 | if jk == R.shape[0]//C.num_rois: 160 | #pad R 161 | curr_shape = ROIs.shape 162 | target_shape = (curr_shape[0],C.num_rois,curr_shape[2]) 163 | ROIs_padded = np.zeros(target_shape).astype(ROIs.dtype) 164 | ROIs_padded[:, :curr_shape[1], :] = ROIs 165 | ROIs_padded[0, curr_shape[1]:, :] = ROIs[0, 0, :] 166 | ROIs = ROIs_padded 167 | 168 | [P_cls, P_regr] = model_classifier_only.predict([F, ROIs]) 169 | 170 | for ii in range(P_cls.shape[1]): 171 | 172 | if np.max(P_cls[0, ii, :]) < bbox_threshold or np.argmax(P_cls[0, ii, :]) == (P_cls.shape[2] - 1): 173 | continue 174 | 175 | cls_name = class_mapping[np.argmax(P_cls[0, ii, :])] 176 | 177 | if cls_name not in bboxes: 178 | bboxes[cls_name] = [] 179 | probs[cls_name] = [] 180 | 181 | (x, y, w, h) = ROIs[0, ii, :] 182 | 183 | cls_num = np.argmax(P_cls[0, ii, :]) 184 | try: 185 | (tx, ty, tw, th) = P_regr[0, ii, 4*cls_num:4*(cls_num+1)] 186 | tx /= C.classifier_regr_std[0] 187 | ty /= C.classifier_regr_std[1] 188 | tw /= C.classifier_regr_std[2] 189 | th /= C.classifier_regr_std[3] 190 | x, y, w, h = roi_helpers.apply_regr(x, y, w, h, tx, ty, tw, th) 191 | except: 192 | pass 193 | bboxes[cls_name].append([16*x, 16*y, 16*(x+w), 16*(y+h)]) 194 | probs[cls_name].append(np.max(P_cls[0, ii, :])) 195 | 196 | all_dets = [] 197 | 198 | for key in bboxes: 199 | bbox = np.array(bboxes[key]) 200 | 201 | 
new_boxes, new_probs = roi_helpers.non_max_suppression_fast(bbox, np.array(probs[key]), overlap_thresh=0.5) 202 | for jk in range(new_boxes.shape[0]): 203 | (x1, y1, x2, y2) = new_boxes[jk,:] 204 | 205 | cv2.rectangle(img_scaled,(x1, y1), (x2, y2), class_to_color[key],2) 206 | 207 | textLabel = '{}: {}'.format(key,int(100*new_probs[jk])) 208 | all_dets.append((key,100*new_probs[jk])) 209 | 210 | (retval,baseLine) = cv2.getTextSize(textLabel,cv2.FONT_HERSHEY_COMPLEX,1,1) 211 | textOrg = (x1, y1) 212 | 213 | cv2.rectangle(img_scaled, (textOrg[0] - 5, textOrg[1]+baseLine - 5), (textOrg[0]+retval[0] + 5, textOrg[1]-retval[1] - 5), (0, 0, 0), 2) 214 | cv2.rectangle(img_scaled, (textOrg[0] - 5,textOrg[1]+baseLine - 5), (textOrg[0]+retval[0] + 5, textOrg[1]-retval[1] - 5), (255, 255, 255), -1) 215 | cv2.putText(img_scaled, textLabel, textOrg, cv2.FONT_HERSHEY_DUPLEX, 1, (0, 0, 0), 1) 216 | print('Elapsed time = {}'.format(time.time() - st)) 217 | cv2.imshow('img', img_scaled) 218 | cv2.waitKey(0) 219 | #cv2.imwrite('./imgs/{}.png'.format(idx),img_scaled) 220 | print(all_dets) 221 | -------------------------------------------------------------------------------- /train_frcnn.py: -------------------------------------------------------------------------------- 1 | import random 2 | import pprint 3 | import sys 4 | import time 5 | import numpy as np 6 | from optparse import OptionParser 7 | import pickle 8 | 9 | from keras import backend as K 10 | from keras.optimizers import Adam 11 | from keras.layers import Input 12 | from keras.models import Model 13 | from keras_frcnn import config, data_generators 14 | from keras_frcnn import losses as losses 15 | from keras_frcnn import resnet as nn 16 | import keras_frcnn.roi_helpers as roi_helpers 17 | from keras.utils import generic_utils 18 | 19 | sys.setrecursionlimit(40000) 20 | 21 | parser = OptionParser() 22 | 23 | parser.add_option("-p", "--path", dest="train_path", help="Path to training data.") 24 | parser.add_option("-o", "--parser", dest="parser", help="Parser to use. One of simple or pascal_voc", 25 | default="pascal_voc"), 26 | parser.add_option("-n", "--num_rois", dest="num_rois", 27 | help="Number of ROIs per iteration. Higher means more memory use.", default=32) 28 | parser.add_option("--hf", dest="horizontal_flips", help="Augment with horizontal flips in training. (Default=false).", action="store_true", default=False) 29 | parser.add_option("--vf", dest="vertical_flips", help="Augment with vertical flips in training. (Default=false).", action="store_true", default=False) 30 | parser.add_option("--rot", "--rot_90", dest="rot_90", help="Augment with 90 degree rotations in training. (Default=false).", 31 | action="store_true", default=False) 32 | parser.add_option("--num_epochs", dest="num_epochs", help="Number of epochs.", default=2000) 33 | parser.add_option("--config_filename", dest="config_filename", help= 34 | "Location to store all the metadata related to the training (to be used when testing).", 35 | default="config.pickle") 36 | parser.add_option("--output_weight_path", dest="output_weight_path", help="Output path for weights.", default='./model_frcnn.hdf5') 37 | parser.add_option("--input_weight_path", dest="input_weight_path", help="Input path for weights. If not specified, will try to load default weights provided by keras.") 38 | 39 | (options, args) = parser.parse_args() 40 | 41 | if not options.train_path: # if filename is not given 42 | parser.error('Error: path to training data must be specified.
Pass --path to command line') 43 | 44 | if options.parser == 'pascal_voc': 45 | from keras_frcnn.pascal_voc_parser import get_data 46 | elif options.parser == 'simple': 47 | from keras_frcnn.simple_parser import get_data 48 | else: 49 | raise ValueError("Command line option parser must be one of 'pascal_voc' or 'simple'") 50 | 51 | # pass the settings from the command line, and persist them in the config object 52 | C = config.Config() 53 | 54 | C.num_rois = int(options.num_rois) 55 | C.use_horizontal_flips = bool(options.horizontal_flips) 56 | C.use_vertical_flips = bool(options.vertical_flips) 57 | C.rot_90 = bool(options.rot_90) 58 | 59 | C.model_path = options.output_weight_path 60 | 61 | if options.input_weight_path: 62 | C.base_net_weights = options.input_weight_path 63 | 64 | all_imgs, classes_count, class_mapping = get_data(options.train_path) 65 | 66 | if 'bg' not in classes_count: 67 | classes_count['bg'] = 0 68 | class_mapping['bg'] = len(class_mapping) 69 | 70 | C.class_mapping = class_mapping 71 | 72 | inv_map = {v: k for k, v in class_mapping.iteritems()} 73 | 74 | print('Training images per class:') 75 | pprint.pprint(classes_count) 76 | print('Num classes (including bg) = {}'.format(len(classes_count))) 77 | 78 | config_output_filename = options.config_filename 79 | 80 | with open(config_output_filename, 'w') as config_f: 81 | pickle.dump(C,config_f) 82 | print('Config has been written to {}, and can be loaded when testing to ensure correct results'.format(config_output_filename)) 83 | 84 | random.shuffle(all_imgs) 85 | 86 | num_imgs = len(all_imgs) 87 | 88 | train_imgs = [s for s in all_imgs if s['imageset'] == 'trainval'] 89 | val_imgs = [s for s in all_imgs if s['imageset'] == 'test'] 90 | 91 | print('Num train samples {}'.format(len(train_imgs))) 92 | print('Num val samples {}'.format(len(val_imgs))) 93 | 94 | 95 | data_gen_train = data_generators.get_anchor_gt(train_imgs, classes_count, C, K.image_dim_ordering(), mode='train') 96 | data_gen_val = data_generators.get_anchor_gt(val_imgs, classes_count, C, K.image_dim_ordering(), mode='val') 97 | 98 | if K.image_dim_ordering() == 'th': 99 | input_shape_img = (3, None, None) 100 | else: 101 | input_shape_img = (None, None, 3) 102 | 103 | img_input = Input(shape=input_shape_img) 104 | roi_input = Input(shape=(C.num_rois, 4)) 105 | 106 | # define the base network (resnet here, can be VGG, Inception, etc) 107 | shared_layers = nn.nn_base(img_input, trainable=True) 108 | 109 | # define the RPN, built on the base layers 110 | num_anchors = len(C.anchor_box_scales) * len(C.anchor_box_ratios) 111 | rpn = nn.rpn(shared_layers, num_anchors) 112 | 113 | classifier = nn.classifier(shared_layers, roi_input, C.num_rois, nb_classes=len(classes_count), trainable=True) 114 | 115 | model_rpn = Model(img_input, rpn[:2]) 116 | model_classifier = Model([img_input, roi_input], classifier) 117 | 118 | # this is a model that holds both the RPN and the classifier, used to load/save weights for the models 119 | model_all = Model([img_input, roi_input], rpn[:2] + classifier) 120 | 121 | try: 122 | print('loading weights from {}'.format(C.base_net_weights)) 123 | model_rpn.load_weights(C.base_net_weights, by_name=True) 124 | model_classifier.load_weights(C.base_net_weights, by_name=True) 125 | except: 126 | print('Could not load pretrained model weights. 
Weights can be found at {} and {}'.format( 127 | 'https://github.com/fchollet/deep-learning-models/releases/download/v0.2/resnet50_weights_th_dim_ordering_th_kernels_notop.h5', 128 | 'https://github.com/fchollet/deep-learning-models/releases/download/v0.2/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5' 129 | )) 130 | 131 | optimizer = Adam(lr=1e-4) 132 | optimizer_classifier = Adam(lr=1e-4) 133 | model_rpn.compile(optimizer=optimizer, loss=[losses.rpn_loss_cls(num_anchors), losses.rpn_loss_regr(num_anchors)]) 134 | model_classifier.compile(optimizer=optimizer_classifier, loss=[losses.class_loss_cls, losses.class_loss_regr(len(classes_count)-1)], metrics={'dense_class_{}'.format(len(classes_count)): 'accuracy'}) 135 | model_all.compile(optimizer='sgd', loss='mae') 136 | 137 | epoch_length = 1000 138 | num_epochs = int(options.num_epochs) 139 | iter_num = 0 140 | 141 | losses = np.zeros((epoch_length, 5)) 142 | rpn_accuracy_rpn_monitor = [] 143 | rpn_accuracy_for_epoch = [] 144 | start_time = time.time() 145 | 146 | best_loss = np.Inf 147 | 148 | class_mapping_inv = {v: k for k, v in class_mapping.iteritems()} 149 | print('Starting training') 150 | 151 | 152 | for epoch_num in range(num_epochs): 153 | 154 | progbar = generic_utils.Progbar(epoch_length) 155 | print('Epoch {}/{}'.format(epoch_num + 1, num_epochs)) 156 | 157 | while True: 158 | try: 159 | if len(rpn_accuracy_rpn_monitor) == epoch_length and C.verbose: 160 | mean_overlapping_bboxes = float(sum(rpn_accuracy_rpn_monitor))/len(rpn_accuracy_rpn_monitor) 161 | rpn_accuracy_rpn_monitor = [] 162 | print('Average number of overlapping bounding boxes from RPN = {} for {} previous iterations'.format(mean_overlapping_bboxes, epoch_length)) 163 | if mean_overlapping_bboxes == 0: 164 | print('RPN is not producing bounding boxes that overlap the ground truth boxes. 
Check RPN settings or keep training.') 165 | 166 | X, Y, img_data = data_gen_train.next() 167 | 168 | loss_rpn = model_rpn.train_on_batch(X, Y) 169 | 170 | P_rpn = model_rpn.predict_on_batch(X) 171 | 172 | R = roi_helpers.rpn_to_roi(P_rpn[0], P_rpn[1], C, K.image_dim_ordering(), use_regr=True, overlap_thresh=0.7, max_boxes=300) 173 | 174 | # note: calc_iou converts from (x1,y1,x2,y2) to (x,y,w,h) format 175 | X2, Y1, Y2 = roi_helpers.calc_iou(R, img_data, C, class_mapping) 176 | 177 | if X2 is None: 178 | rpn_accuracy_rpn_monitor.append(0) 179 | rpn_accuracy_for_epoch.append(0) 180 | continue 181 | 182 | neg_samples = np.where(Y1[0, :, -1] == 1) 183 | pos_samples = np.where(Y1[0, :, -1] == 0) 184 | 185 | if len(neg_samples[0]) > 0: 186 | neg_samples = neg_samples[0] 187 | else: 188 | neg_samples = [] 189 | 190 | if len(pos_samples[0]) > 0: 191 | pos_samples = pos_samples[0] 192 | else: 193 | pos_samples = [] 194 | 195 | rpn_accuracy_rpn_monitor.append(len(pos_samples)) 196 | rpn_accuracy_for_epoch.append((len(pos_samples))) 197 | 198 | if C.num_rois > 1: 199 | if len(pos_samples) < C.num_rois/2: 200 | selected_pos_samples = pos_samples.tolist() 201 | else: 202 | selected_pos_samples = np.random.choice(pos_samples, C.num_rois/2, replace=False).tolist() 203 | try: 204 | selected_neg_samples = np.random.choice(neg_samples, C.num_rois - len(selected_pos_samples), replace=False).tolist() 205 | except: 206 | selected_neg_samples = np.random.choice(neg_samples, C.num_rois - len(selected_pos_samples), replace=True).tolist() 207 | 208 | sel_samples = selected_pos_samples + selected_neg_samples 209 | else: 210 | # in the extreme case where num_rois = 1, we pick a random pos or neg sample (kept as a one-element list so the ROI input stays 3D) 211 | selected_pos_samples = pos_samples.tolist() 212 | selected_neg_samples = neg_samples.tolist() 213 | if np.random.randint(0, 2): 214 | sel_samples = [random.choice(neg_samples)] 215 | else: 216 | sel_samples = [random.choice(pos_samples)] 217 | 218 | loss_class = model_classifier.train_on_batch([X, X2[:, sel_samples, :]], [Y1[:, sel_samples, :], Y2[:, sel_samples, :]]) 219 | 220 | losses[iter_num, 0] = loss_rpn[1] 221 | losses[iter_num, 1] = loss_rpn[2] 222 | 223 | losses[iter_num, 2] = loss_class[1] 224 | losses[iter_num, 3] = loss_class[2] 225 | losses[iter_num, 4] = loss_class[3] 226 | 227 | iter_num += 1 228 | 229 | progbar.update(iter_num, [('rpn_cls', np.mean(losses[:iter_num, 0])), ('rpn_regr', np.mean(losses[:iter_num, 1])), 230 | ('detector_cls', np.mean(losses[:iter_num, 2])), ('detector_regr', np.mean(losses[:iter_num, 3]))]) 231 | 232 | if iter_num == epoch_length: 233 | loss_rpn_cls = np.mean(losses[:, 0]) 234 | loss_rpn_regr = np.mean(losses[:, 1]) 235 | loss_class_cls = np.mean(losses[:, 2]) 236 | loss_class_regr = np.mean(losses[:, 3]) 237 | class_acc = np.mean(losses[:, 4]) 238 | 239 | mean_overlapping_bboxes = float(sum(rpn_accuracy_for_epoch)) / len(rpn_accuracy_for_epoch) 240 | rpn_accuracy_for_epoch = [] 241 | 242 | if C.verbose: 243 | print('Mean number of bounding boxes from RPN overlapping ground truth boxes: {}'.format(mean_overlapping_bboxes)) 244 | print('Classifier accuracy for bounding boxes from RPN: {}'.format(class_acc)) 245 | print('Loss RPN classifier: {}'.format(loss_rpn_cls)) 246 | print('Loss RPN regression: {}'.format(loss_rpn_regr)) 247 | print('Loss Detector classifier: {}'.format(loss_class_cls)) 248 | print('Loss Detector regression: {}'.format(loss_class_regr)) 249 | print('Elapsed time: {}'.format(time.time() - start_time)) 250 | 251 | curr_loss = loss_rpn_cls + loss_rpn_regr +
loss_class_cls + loss_class_regr 252 | iter_num = 0 253 | start_time = time.time() 254 | 255 | if curr_loss < best_loss: 256 | if C.verbose: 257 | print('Total loss decreased from {} to {}, saving weights'.format(best_loss,curr_loss)) 258 | best_loss = curr_loss 259 | model_all.save_weights(C.model_path) 260 | 261 | break 262 | 263 | except Exception as e: 264 | print('Exception: {}'.format(e)) 265 | continue 266 | 267 | print('Training complete, exiting.') 268 | --------------------------------------------------------------------------------
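A note on the box-regression targets used throughout the code above: keras_frcnn/roi_helpers.py encodes each ground-truth box relative to a proposal as (tx, ty, tw, th) inside calc_iou, and inverts that encoding in apply_regr / apply_regr_np. The stand-alone sketch below only illustrates that round trip on plain (x1, y1, x2, y2) boxes; it is not part of the repository, the function names are made up for this example, and the rpn_stride scaling and classifier_regr_std normalisation applied in calc_iou are deliberately omitted.

import numpy as np

def encode_box(gt_box, roi):
    # (tx, ty, tw, th) targets for a proposal, mirroring the arithmetic in calc_iou
    gx1, gy1, gx2, gy2 = gt_box
    x1, y1, x2, y2 = roi
    w, h = x2 - x1, y2 - y1
    cx, cy = x1 + w / 2.0, y1 + h / 2.0
    cxg, cyg = (gx1 + gx2) / 2.0, (gy1 + gy2) / 2.0
    tx = (cxg - cx) / float(w)
    ty = (cyg - cy) / float(h)
    tw = np.log((gx2 - gx1) / float(w))
    th = np.log((gy2 - gy1) / float(h))
    return tx, ty, tw, th

def decode_box(roi, targets):
    # inverse transform as in apply_regr, returned here as float corner coordinates
    x1, y1, x2, y2 = roi
    tx, ty, tw, th = targets
    w, h = x2 - x1, y2 - y1
    cx, cy = x1 + w / 2.0, y1 + h / 2.0
    cx1, cy1 = tx * w + cx, ty * h + cy
    w1, h1 = np.exp(tw) * w, np.exp(th) * h
    return cx1 - w1 / 2.0, cy1 - h1 / 2.0, cx1 + w1 / 2.0, cy1 + h1 / 2.0

# round trip: decoding the encoded targets recovers the ground-truth box
roi = (10.0, 20.0, 50.0, 80.0)
gt = (12.0, 18.0, 44.0, 90.0)
print(decode_box(roi, encode_box(gt, roi)))  # ~ (12.0, 18.0, 44.0, 90.0)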