├── .gitattributes ├── .github └── stale.yml ├── .gitignore ├── CONTRIBUTING.md ├── ISSUE_TEMPLATE.md ├── LICENSE.txt ├── README.md ├── __init__.py ├── bounding_box_utils ├── __init__.py └── bounding_box_utils.py ├── data_generator ├── __init__.py ├── data_augmentation_chain_constant_input_size.py ├── data_augmentation_chain_original_ssd.py ├── data_augmentation_chain_satellite.py ├── data_augmentation_chain_variable_input_size.py ├── object_detection_2d_data_generator.py ├── object_detection_2d_geometric_ops.py ├── object_detection_2d_image_boxes_validation_utils.py ├── object_detection_2d_misc_utils.py ├── object_detection_2d_patch_sampling_ops.py └── object_detection_2d_photometric_ops.py ├── eval_utils ├── __init__.py ├── average_precision_evaluator.py └── coco_utils.py ├── examples ├── fish-bike.jpg ├── fish_bike.jpg ├── ssd300_pascalVOC_pred_01.png ├── ssd300_pascalVOC_pred_02.png ├── ssd300_pascalVOC_pred_03.png ├── ssd300_pascalVOC_pred_04.png ├── ssd300_pascalVOC_pred_05.png ├── ssd300_pascalVOC_pred_06.png ├── ssd300_pascalVOC_pred_07.png ├── ssd300_pascalVOC_pred_08.png ├── ssd300_pascalVOC_pred_09.png ├── ssd7_udacity_traffic_pred_01.png ├── ssd7_udacity_traffic_pred_02.png ├── ssd7_udacity_traffic_pred_03.png ├── ssd7_udacity_traffic_pred_04.png ├── ssd7_udacity_traffic_pred_05.png ├── trained_ssd300_pascalVOC2007_test_pred_01.png ├── trained_ssd300_pascalVOC2007_test_pred_01_no_gt.png ├── trained_ssd300_pascalVOC2007_test_pred_02.png ├── trained_ssd300_pascalVOC2007_test_pred_02_no_gt.png ├── trained_ssd300_pascalVOC2007_test_pred_03.png ├── trained_ssd300_pascalVOC2007_test_pred_03_no_gt.png ├── trained_ssd300_pascalVOC2007_test_pred_04.png ├── trained_ssd300_pascalVOC2007_test_pred_04_no_gt.png ├── trained_ssd300_pascalVOC2007_test_pred_05.png ├── trained_ssd300_pascalVOC2007_test_pred_05_no_gt.png └── trained_ssd300_pascalVOC2007_test_pred_06.png ├── keras_layers ├── __init__.py ├── keras_layer_AnchorBoxes.py ├── keras_layer_DecodeDetections.py ├── keras_layer_DecodeDetectionsFast.py └── keras_layer_L2Normalization.py ├── keras_loss_function ├── __init__.py └── keras_ssd_loss.py ├── misc_utils ├── __init__.py └── tensor_sampling_utils.py ├── models ├── __init__.py ├── keras_ssd300.py ├── keras_ssd512.py └── keras_ssd7.py ├── ssd300_evaluation.ipynb ├── ssd300_evaluation_COCO.ipynb ├── ssd300_inference.ipynb ├── ssd300_training.ipynb ├── ssd512_inference.ipynb ├── ssd7_training.ipynb ├── ssd_encoder_decoder ├── __init__.py ├── matching_utils.py ├── ssd_input_encoder.py └── ssd_output_decoder.py ├── training_summaries ├── ssd300_pascal_07+12_loss_history.png └── ssd300_pascal_07+12_training_summary.md └── weight_sampling_tutorial.ipynb /.gitattributes: -------------------------------------------------------------------------------- 1 | *.ipynb linguist-language=Python 2 | -------------------------------------------------------------------------------- /.github/stale.yml: -------------------------------------------------------------------------------- 1 | # Configuration for probot-stale - https://github.com/probot/stale 2 | 3 | # Number of days of inactivity before an Issue or Pull Request becomes stale 4 | daysUntilStale: 7 5 | # Number of days of inactivity before a stale Issue or Pull Request is closed 6 | daysUntilClose: 7 7 | # Issues or Pull Requests with these labels will never be considered stale. 
Set to `[]` to disable 8 | exemptLabels: 9 | - pinned 10 | - security 11 | - "[Status] Maybe Later" 12 | # Label to use when marking as stale 13 | staleLabel: stale 14 | # Comment to post when marking as stale. Set to `false` to disable 15 | markComment: > 16 | This issue has been automatically marked as stale because it has not had 17 | recent activity. It will be closed if no further activity occurs. Thank you 18 | for your contributions. 19 | # Comment to post when removing the stale label. Set to `false` to disable 20 | unmarkComment: false 21 | # Comment to post when closing a stale Issue or Pull Request. Set to `false` to disable 22 | closeComment: false 23 | # Limit to only `issues` or `pulls` 24 | # only: issues 25 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *,cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | .ipynb_checkpoints/ 73 | 74 | # pyenv 75 | .python-version 76 | 77 | # celery beat schedule file 78 | celerybeat-schedule 79 | 80 | # SageMath parsed files 81 | *.sage.py 82 | 83 | # dotenv 84 | .env 85 | 86 | # virtualenv 87 | .venv 88 | venv/ 89 | ENV/ 90 | 91 | # Spyder project settings 92 | .spyderproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # Ignore any files and directories that begin with the word "local" 98 | local* 99 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | --- 3 | 4 | Contributions to this repository are welcome, but before you create a pull request, consider the following guidelines: 5 | 6 | 1. The To-do list in the README of this repository defines the main topics for which contributions are welcome. If you want to contribute, ideally contribute to one of the topics listed there. 7 | 2. If you'd like to contribute features that are not mentioned on the to-do list in the README, make sure to explain why your proposed change adds value, i.e. what relevant use case it solves. The benefit of any new feature will be compared against the cost of maintaining it and your contribution will be accepted or rejected based on this trade-off. 8 | 3. One pull request should be about one specific feature or improvement, i.e.
it should not contain multiple unrelated changes. If you want to contribute multiple features and/or improvements, create a separate pull request for every individual feature or improvement. 9 | 4. When you create a pull request, make sure to explain properly 10 | * why your proposed change adds value, i.e. what problem or use case it solves, 11 | * all the API changes it will introduce, if any, 12 | * all behavioral changes in any existing parts of the project it will introduce, if any. 13 | 5. This should go without saying, but you are responsible for updating any parts of the code or the tutorial notebooks that are affected by your introduced changes. 14 | 6. Any submitted code must conform to the coding standards and style of this repository. There is no formal guide for coding standards and style, but here are a few things to note: 15 | * Any new modules, classes or functions must provide proper docstrings unless they are trivial. These docstrings must have sections for Arguments, Returns, and Raises (if applicable). For every argument of a function, the docstring must explain precisely what the argument does, what data type it expects, whether or not it is optional, and any requirements for the range of values it expects. The same goes for the returns. Use existing docstrings as templates. 16 | * Naming: 17 | * `ClassNames` consist of capitalized words without underscores. 18 | * `module_names.py` consist of lower case words connected with underscores. 19 | * `function_names` consist of lower case words connected with underscores. 20 | * `variable_names` consist of lower case words connected with underscores. 21 | * All module, class, function, and variable names must be descriptive in order to meet the goal that all code should always be as self-explanatory as possible. A longer and descriptive name is always preferable to a shorter and non-descriptive name. Abbreviations are generally to be avoided unless the full words would really make the name too long. 22 | * More in-line comments are better than fewer in-line comments and all comments should be precise and succinct. 23 | -------------------------------------------------------------------------------- /ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ### If you open a GitHub issue, here is the policy: 2 | 3 | Your issue must be about one of the following: 4 | 5 | 1. a bug, 6 | 2. a feature request, 7 | 3. a documentation issue, or 8 | 4. a question that is **specific to this SSD implementation**. 9 | 10 | You will only get help if you adhere to the following guidelines: 11 | 12 | * Before you open an issue, search the open **and closed** issues first. Your problem/question might already have been solved/answered before. 13 | * If you're getting unexpected behavior from code I wrote, open an issue and I'll try to help. If you're getting unexpected behavior from code **you** wrote, you'll have to fix it yourself. E.g. if you made a ton of changes to the code or the tutorials and now it doesn't work anymore, that's your own problem. I don't want to spend my time debugging your code. 14 | * Make sure you're using the latest master. If you're 30 commits behind and have a problem, the only answer you'll likely get is to pull the latest master and try again. 15 | * Read the documentation. All of it. If the answer to your problem/question can be found in the documentation, you might not get an answer, because, seriously, you could really have figured this out yourself.
16 | * If you're asking a question, it must be specific to this SSD implementation. General deep learning or object detection questions will likely get closed without an answer. E.g. a question like "How do I get the mAP of an SSD for my own dataset?" has nothing to do with this particular SSD implementation, because computing the mAP works the same way for any object detection model. You should ask such a question in an appropriate forum or on the [Data Science section of StackOverflow](https://datascience.stackexchange.com/) instead. 17 | * If you get an error: 18 | * Provide the full stack trace of the error you're getting, not just the error message itself. 19 | * Make sure any code you post is properly formatted as such. 20 | * Provide any useful information about your environment, e.g.: 21 | * Operating System 22 | * Which commit of this repository you're on 23 | * Keras version 24 | * TensorFlow version 25 | * Provide a minimal reproducible example, i.e. post code and explain clearly how you ended up with this error. 26 | * Provide any useful information about your specific use case and parameters: 27 | * What model are you trying to use/train? 28 | * Describe the dataset you're using. 29 | * List the values of any parameters you changed that might be relevant. 30 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright 2018 Pierluigi Ferrari. 2 | 3 | Apache License 4 | Version 2.0, January 2004 5 | http://www.apache.org/licenses/ 6 | 7 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 8 | 9 | 1. Definitions. 10 | 11 | "License" shall mean the terms and conditions for use, reproduction, 12 | and distribution as defined by Sections 1 through 9 of this document. 13 | 14 | "Licensor" shall mean the copyright owner or entity authorized by 15 | the copyright owner that is granting the License. 16 | 17 | "Legal Entity" shall mean the union of the acting entity and all 18 | other entities that control, are controlled by, or are under common 19 | control with that entity. For the purposes of this definition, 20 | "control" means (i) the power, direct or indirect, to cause the 21 | direction or management of such entity, whether by contract or 22 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 23 | outstanding shares, or (iii) beneficial ownership of such entity. 24 | 25 | "You" (or "Your") shall mean an individual or Legal Entity 26 | exercising permissions granted by this License. 27 | 28 | "Source" form shall mean the preferred form for making modifications, 29 | including but not limited to software source code, documentation 30 | source, and configuration files. 31 | 32 | "Object" form shall mean any form resulting from mechanical 33 | transformation or translation of a Source form, including but 34 | not limited to compiled object code, generated documentation, 35 | and conversions to other media types. 36 | 37 | "Work" shall mean the work of authorship, whether in Source or 38 | Object form, made available under the License, as indicated by a 39 | copyright notice that is included in or attached to the work 40 | (an example is provided in the Appendix below). 
41 | 42 | "Derivative Works" shall mean any work, whether in Source or Object 43 | form, that is based on (or derived from) the Work and for which the 44 | editorial revisions, annotations, elaborations, or other modifications 45 | represent, as a whole, an original work of authorship. For the purposes 46 | of this License, Derivative Works shall not include works that remain 47 | separable from, or merely link (or bind by name) to the interfaces of, 48 | the Work and Derivative Works thereof. 49 | 50 | "Contribution" shall mean any work of authorship, including 51 | the original version of the Work and any modifications or additions 52 | to that Work or Derivative Works thereof, that is intentionally 53 | submitted to Licensor for inclusion in the Work by the copyright owner 54 | or by an individual or Legal Entity authorized to submit on behalf of 55 | the copyright owner. For the purposes of this definition, "submitted" 56 | means any form of electronic, verbal, or written communication sent 57 | to the Licensor or its representatives, including but not limited to 58 | communication on electronic mailing lists, source code control systems, 59 | and issue tracking systems that are managed by, or on behalf of, the 60 | Licensor for the purpose of discussing and improving the Work, but 61 | excluding communication that is conspicuously marked or otherwise 62 | designated in writing by the copyright owner as "Not a Contribution." 63 | 64 | "Contributor" shall mean Licensor and any individual or Legal Entity 65 | on behalf of whom a Contribution has been received by Licensor and 66 | subsequently incorporated within the Work. 67 | 68 | 2. Grant of Copyright License. Subject to the terms and conditions of 69 | this License, each Contributor hereby grants to You a perpetual, 70 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 71 | copyright license to reproduce, prepare Derivative Works of, 72 | publicly display, publicly perform, sublicense, and distribute the 73 | Work and such Derivative Works in Source or Object form. 74 | 75 | 3. Grant of Patent License. Subject to the terms and conditions of 76 | this License, each Contributor hereby grants to You a perpetual, 77 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 78 | (except as stated in this section) patent license to make, have made, 79 | use, offer to sell, sell, import, and otherwise transfer the Work, 80 | where such license applies only to those patent claims licensable 81 | by such Contributor that are necessarily infringed by their 82 | Contribution(s) alone or by combination of their Contribution(s) 83 | with the Work to which such Contribution(s) was submitted. If You 84 | institute patent litigation against any entity (including a 85 | cross-claim or counterclaim in a lawsuit) alleging that the Work 86 | or a Contribution incorporated within the Work constitutes direct 87 | or contributory patent infringement, then any patent licenses 88 | granted to You under this License for that Work shall terminate 89 | as of the date such litigation is filed. 90 | 91 | 4. Redistribution. 
You may reproduce and distribute copies of the 92 | Work or Derivative Works thereof in any medium, with or without 93 | modifications, and in Source or Object form, provided that You 94 | meet the following conditions: 95 | 96 | (a) You must give any other recipients of the Work or 97 | Derivative Works a copy of this License; and 98 | 99 | (b) You must cause any modified files to carry prominent notices 100 | stating that You changed the files; and 101 | 102 | (c) You must retain, in the Source form of any Derivative Works 103 | that You distribute, all copyright, patent, trademark, and 104 | attribution notices from the Source form of the Work, 105 | excluding those notices that do not pertain to any part of 106 | the Derivative Works; and 107 | 108 | (d) If the Work includes a "NOTICE" text file as part of its 109 | distribution, then any Derivative Works that You distribute must 110 | include a readable copy of the attribution notices contained 111 | within such NOTICE file, excluding those notices that do not 112 | pertain to any part of the Derivative Works, in at least one 113 | of the following places: within a NOTICE text file distributed 114 | as part of the Derivative Works; within the Source form or 115 | documentation, if provided along with the Derivative Works; or, 116 | within a display generated by the Derivative Works, if and 117 | wherever such third-party notices normally appear. The contents 118 | of the NOTICE file are for informational purposes only and 119 | do not modify the License. You may add Your own attribution 120 | notices within Derivative Works that You distribute, alongside 121 | or as an addendum to the NOTICE text from the Work, provided 122 | that such additional attribution notices cannot be construed 123 | as modifying the License. 124 | 125 | You may add Your own copyright statement to Your modifications and 126 | may provide additional or different license terms and conditions 127 | for use, reproduction, or distribution of Your modifications, or 128 | for any such Derivative Works as a whole, provided Your use, 129 | reproduction, and distribution of the Work otherwise complies with 130 | the conditions stated in this License. 131 | 132 | 5. Submission of Contributions. Unless You explicitly state otherwise, 133 | any Contribution intentionally submitted for inclusion in the Work 134 | by You to the Licensor shall be under the terms and conditions of 135 | this License, without any additional terms or conditions. 136 | Notwithstanding the above, nothing herein shall supersede or modify 137 | the terms of any separate license agreement you may have executed 138 | with Licensor regarding such Contributions. 139 | 140 | 6. Trademarks. This License does not grant permission to use the trade 141 | names, trademarks, service marks, or product names of the Licensor, 142 | except as required for reasonable and customary use in describing the 143 | origin of the Work and reproducing the content of the NOTICE file. 144 | 145 | 7. Disclaimer of Warranty. Unless required by applicable law or 146 | agreed to in writing, Licensor provides the Work (and each 147 | Contributor provides its Contributions) on an "AS IS" BASIS, 148 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 149 | implied, including, without limitation, any warranties or conditions 150 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 151 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 152 | appropriateness of using or redistributing the Work and assume any 153 | risks associated with Your exercise of permissions under this License. 154 | 155 | 8. Limitation of Liability. In no event and under no legal theory, 156 | whether in tort (including negligence), contract, or otherwise, 157 | unless required by applicable law (such as deliberate and grossly 158 | negligent acts) or agreed to in writing, shall any Contributor be 159 | liable to You for damages, including any direct, indirect, special, 160 | incidental, or consequential damages of any character arising as a 161 | result of this License or out of the use or inability to use the 162 | Work (including but not limited to damages for loss of goodwill, 163 | work stoppage, computer failure or malfunction, or any and all 164 | other commercial damages or losses), even if such Contributor 165 | has been advised of the possibility of such damages. 166 | 167 | 9. Accepting Warranty or Additional Liability. While redistributing 168 | the Work or Derivative Works thereof, You may choose to offer, 169 | and charge a fee for, acceptance of support, warranty, indemnity, 170 | or other liability obligations and/or rights consistent with this 171 | License. However, in accepting such obligations, You may act only 172 | on Your own behalf and on Your sole responsibility, not on behalf 173 | of any other Contributor, and only if You agree to indemnify, 174 | defend, and hold each Contributor harmless for any liability 175 | incurred by, or claims asserted against, such Contributor by reason 176 | of your accepting any such warranty or additional liability. 177 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pierluigiferrari/ssd_keras/3ac9adaf3889f1020d74b0eeefea281d5e82f353/__init__.py -------------------------------------------------------------------------------- /bounding_box_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pierluigiferrari/ssd_keras/3ac9adaf3889f1020d74b0eeefea281d5e82f353/bounding_box_utils/__init__.py -------------------------------------------------------------------------------- /data_generator/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pierluigiferrari/ssd_keras/3ac9adaf3889f1020d74b0eeefea281d5e82f353/data_generator/__init__.py -------------------------------------------------------------------------------- /data_generator/data_augmentation_chain_constant_input_size.py: -------------------------------------------------------------------------------- 1 | ''' 2 | A data augmentation chain suitable for constant-size input images. 3 | 4 | Copyright (C) 2018 Pierluigi Ferrari 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | ''' 18 | 19 | from __future__ import division 20 | import numpy as np 21 | 22 | from data_generator.object_detection_2d_photometric_ops import ConvertColor, ConvertDataType, ConvertTo3Channels, RandomBrightness, RandomContrast, RandomHue, RandomSaturation 23 | from data_generator.object_detection_2d_geometric_ops import RandomFlip, RandomTranslate, RandomScale 24 | from data_generator.object_detection_2d_image_boxes_validation_utils import BoundGenerator, BoxFilter, ImageValidator 25 | 26 | class DataAugmentationConstantInputSize: 27 | ''' 28 | Applies a chain of photometric and geometric image transformations. For documentation, please refer 29 | to the documentation of the individual transformations involved. 30 | 31 | Important: This augmentation chain is suitable for constant-size images only. 32 | ''' 33 | 34 | def __init__(self, 35 | random_brightness=(-48, 48, 0.5), 36 | random_contrast=(0.5, 1.8, 0.5), 37 | random_saturation=(0.5, 1.8, 0.5), 38 | random_hue=(18, 0.5), 39 | random_flip=0.5, 40 | random_translate=((0.03,0.5), (0.03,0.5), 0.5), 41 | random_scale=(0.5, 2.0, 0.5), 42 | n_trials_max=3, 43 | clip_boxes=True, 44 | overlap_criterion='area', 45 | bounds_box_filter=(0.3, 1.0), 46 | bounds_validator=(0.5, 1.0), 47 | n_boxes_min=1, 48 | background=(0,0,0), 49 | labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}): 50 | 51 | if (random_scale[0] >= 1) or (random_scale[1] <= 1): 52 | raise ValueError("This sequence of transformations only makes sense if the minimum scaling factor is <1 and the maximum scaling factor is >1.") 53 | 54 | self.n_trials_max = n_trials_max 55 | self.clip_boxes = clip_boxes 56 | self.overlap_criterion = overlap_criterion 57 | self.bounds_box_filter = bounds_box_filter 58 | self.bounds_validator = bounds_validator 59 | self.n_boxes_min = n_boxes_min 60 | self.background = background 61 | self.labels_format = labels_format 62 | 63 | # Determines which boxes are kept in an image after the transformations have been applied. 64 | self.box_filter = BoxFilter(check_overlap=True, 65 | check_min_area=True, 66 | check_degenerate=True, 67 | overlap_criterion=self.overlap_criterion, 68 | overlap_bounds=self.bounds_box_filter, 69 | min_area=16, 70 | labels_format=self.labels_format) 71 | 72 | # Determines whether the result of the transformations is a valid training image. 73 | self.image_validator = ImageValidator(overlap_criterion=self.overlap_criterion, 74 | bounds=self.bounds_validator, 75 | n_boxes_min=self.n_boxes_min, 76 | labels_format=self.labels_format) 77 | 78 | # Utility distortions 79 | self.convert_RGB_to_HSV = ConvertColor(current='RGB', to='HSV') 80 | self.convert_HSV_to_RGB = ConvertColor(current='HSV', to='RGB') 81 | self.convert_to_float32 = ConvertDataType(to='float32') 82 | self.convert_to_uint8 = ConvertDataType(to='uint8') 83 | self.convert_to_3_channels = ConvertTo3Channels() # Make sure all images end up having 3 channels. 
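        # A note on the ordering in the sequences defined further down: the random photometric ops add to or scale pixel values, which could wrap around in uint8 arithmetic, so they are applied to float32 images, while the RGB<->HSV conversions are performed on uint8 images so that the channel values stay within OpenCV's uint8 value ranges. This is why `sequence1` and `sequence2` below repeatedly alternate between `convert_to_float32` and `convert_to_uint8`.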
84 | 85 | # Photometric transformations 86 | self.random_brightness = RandomBrightness(lower=random_brightness[0], upper=random_brightness[1], prob=random_brightness[2]) 87 | self.random_contrast = RandomContrast(lower=random_contrast[0], upper=random_contrast[1], prob=random_contrast[2]) 88 | self.random_saturation = RandomSaturation(lower=random_saturation[0], upper=random_saturation[1], prob=random_saturation[2]) 89 | self.random_hue = RandomHue(max_delta=random_hue[0], prob=random_hue[1]) 90 | 91 | # Geometric transformations 92 | self.random_flip = RandomFlip(dim='horizontal', prob=random_flip, labels_format=self.labels_format) 93 | self.random_translate = RandomTranslate(dy_minmax=random_translate[0], 94 | dx_minmax=random_translate[1], 95 | prob=random_translate[2], 96 | clip_boxes=self.clip_boxes, 97 | box_filter=self.box_filter, 98 | image_validator=self.image_validator, 99 | n_trials_max=self.n_trials_max, 100 | background=self.background, 101 | labels_format=self.labels_format) 102 | self.random_zoom_in = RandomScale(min_factor=1.0, 103 | max_factor=random_scale[1], 104 | prob=random_scale[2], 105 | clip_boxes=self.clip_boxes, 106 | box_filter=self.box_filter, 107 | image_validator=self.image_validator, 108 | n_trials_max=self.n_trials_max, 109 | background=self.background, 110 | labels_format=self.labels_format) 111 | self.random_zoom_out = RandomScale(min_factor=random_scale[0], 112 | max_factor=1.0, 113 | prob=random_scale[2], 114 | clip_boxes=self.clip_boxes, 115 | box_filter=self.box_filter, 116 | image_validator=self.image_validator, 117 | n_trials_max=self.n_trials_max, 118 | background=self.background, 119 | labels_format=self.labels_format) 120 | 121 | # If we zoom in, do translation before scaling. 122 | self.sequence1 = [self.convert_to_3_channels, 123 | self.convert_to_float32, 124 | self.random_brightness, 125 | self.random_contrast, 126 | self.convert_to_uint8, 127 | self.convert_RGB_to_HSV, 128 | self.convert_to_float32, 129 | self.random_saturation, 130 | self.random_hue, 131 | self.convert_to_uint8, 132 | self.convert_HSV_to_RGB, 133 | self.random_translate, 134 | self.random_zoom_in, 135 | self.random_flip] 136 | 137 | # If we zoom out, do scaling before translation. 138 | self.sequence2 = [self.convert_to_3_channels, 139 | self.convert_to_float32, 140 | self.random_brightness, 141 | self.convert_to_uint8, 142 | self.convert_RGB_to_HSV, 143 | self.convert_to_float32, 144 | self.random_saturation, 145 | self.random_hue, 146 | self.convert_to_uint8, 147 | self.convert_HSV_to_RGB, 148 | self.convert_to_float32, 149 | self.random_contrast, 150 | self.convert_to_uint8, 151 | self.random_zoom_out, 152 | self.random_translate, 153 | self.random_flip] 154 | 155 | def __call__(self, image, labels=None): 156 | 157 | self.random_translate.labels_format = self.labels_format 158 | self.random_zoom_in.labels_format = self.labels_format 159 | self.random_zoom_out.labels_format = self.labels_format 160 | self.random_flip.labels_format = self.labels_format 161 | 162 | # Choose sequence 1 with probability 0.5. 163 | if np.random.choice(2): 164 | 165 | if not (labels is None): 166 | for transform in self.sequence1: 167 | image, labels = transform(image, labels) 168 | return image, labels 169 | else: 170 | for transform in self.sequence1: 171 | image = transform(image) 172 | return image 173 | # Choose sequence 2 with probability 0.5. 
174 | else: 175 | 176 | if not (labels is None): 177 | for transform in self.sequence2: 178 | image, labels = transform(image, labels) 179 | return image, labels 180 | else: 181 | for transform in self.sequence2: 182 | image = transform(image) 183 | return image 184 | -------------------------------------------------------------------------------- /data_generator/data_augmentation_chain_original_ssd.py: -------------------------------------------------------------------------------- 1 | ''' 2 | The data augmentation operations of the original SSD implementation. 3 | 4 | Copyright (C) 2018 Pierluigi Ferrari 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | ''' 18 | 19 | from __future__ import division 20 | import numpy as np 21 | import cv2 22 | import inspect 23 | 24 | from data_generator.object_detection_2d_photometric_ops import ConvertColor, ConvertDataType, ConvertTo3Channels, RandomBrightness, RandomContrast, RandomHue, RandomSaturation, RandomChannelSwap 25 | from data_generator.object_detection_2d_patch_sampling_ops import PatchCoordinateGenerator, RandomPatch, RandomPatchInf 26 | from data_generator.object_detection_2d_geometric_ops import ResizeRandomInterp, RandomFlip 27 | from data_generator.object_detection_2d_image_boxes_validation_utils import BoundGenerator, BoxFilter, ImageValidator 28 | 29 | class SSDRandomCrop: 30 | ''' 31 | Performs the same random crops as defined by the `batch_sampler` instructions 32 | of the original Caffe implementation of SSD. A description of this random cropping 33 | strategy can also be found in the data augmentation section of the paper: 34 | https://arxiv.org/abs/1512.02325 35 | ''' 36 | 37 | def __init__(self, labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}): 38 | ''' 39 | Arguments: 40 | labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels 41 | of an image contains which bounding box coordinate. The dictionary maps at least the keywords 42 | 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array. 43 | ''' 44 | 45 | self.labels_format = labels_format 46 | 47 | # This randomly samples one of the lower IoU bounds defined 48 | # by the `sample_space` every time it is called. 49 | self.bound_generator = BoundGenerator(sample_space=((None, None), 50 | (0.1, None), 51 | (0.3, None), 52 | (0.5, None), 53 | (0.7, None), 54 | (0.9, None)), 55 | weights=None) 56 | 57 | # Produces coordinates for candidate patches such that the height 58 | # and width of the patches are between 0.3 and 1.0 of the height 59 | # and width of the respective image and the aspect ratio of the 60 | # patches is between 0.5 and 2.0. 61 | self.patch_coord_generator = PatchCoordinateGenerator(must_match='h_w', 62 | min_scale=0.3, 63 | max_scale=1.0, 64 | scale_uniformly=False, 65 | min_aspect_ratio = 0.5, 66 | max_aspect_ratio = 2.0) 67 | 68 | # Filters out boxes whose center point does not lie within the 69 | # chosen patches. 
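        # For example (hypothetical numbers): a ground truth box with corners (xmin, ymin, xmax, ymax) = (10, 10, 110, 60) survives a sampled patch of (0, 0, 80, 80), because its center point (60, 35) lies inside the patch, even though the box itself is partially cut off by it.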
70 | self.box_filter = BoxFilter(check_overlap=True, 71 | check_min_area=False, 72 | check_degenerate=False, 73 | overlap_criterion='center_point', 74 | labels_format=self.labels_format) 75 | 76 | # Determines whether a given patch is considered a valid patch. 77 | # Defines a patch to be valid if at least one ground truth bounding box 78 | # (n_boxes_min == 1) has an IoU overlap with the patch that 79 | # meets the requirements defined by `bound_generator`. 80 | self.image_validator = ImageValidator(overlap_criterion='iou', 81 | n_boxes_min=1, 82 | labels_format=self.labels_format, 83 | border_pixels='half') 84 | 85 | # Performs crops according to the parameters set in the objects above. 86 | # Runs until either a valid patch is found or the original input image 87 | # is returned unaltered. Runs a maximum of 50 trials to find a valid 88 | # patch for each new sampled IoU threshold. Every 50 trials, the original 89 | # image is returned as is with probability (1 - prob) = 0.143. 90 | self.random_crop = RandomPatchInf(patch_coord_generator=self.patch_coord_generator, 91 | box_filter=self.box_filter, 92 | image_validator=self.image_validator, 93 | bound_generator=self.bound_generator, 94 | n_trials_max=50, 95 | clip_boxes=True, 96 | prob=0.857, 97 | labels_format=self.labels_format) 98 | 99 | def __call__(self, image, labels=None, return_inverter=False): 100 | self.random_crop.labels_format = self.labels_format 101 | return self.random_crop(image, labels, return_inverter) 102 | 103 | class SSDExpand: 104 | ''' 105 | Performs the random image expansion as defined by the `train_transform_param` instructions 106 | of the original Caffe implementation of SSD. A description of this expansion strategy 107 | can also be found in section 3.6 ("Data Augmentation for Small Object Accuracy") of the paper: 108 | https://arxiv.org/abs/1512.02325 109 | ''' 110 | 111 | def __init__(self, background=(123, 117, 104), labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}): 112 | ''' 113 | Arguments: 114 | background (list/tuple, optional): A 3-tuple specifying the RGB color value of the 115 | background pixels of the translated images. 116 | labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels 117 | of an image contains which bounding box coordinate. The dictionary maps at least the keywords 118 | 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array. 119 | ''' 120 | 121 | self.labels_format = labels_format 122 | 123 | # Generate coordinates for patches that are between 1.0 and 4.0 times 124 | # the size of the input image in both spatial dimensions. 125 | self.patch_coord_generator = PatchCoordinateGenerator(must_match='h_w', 126 | min_scale=1.0, 127 | max_scale=4.0, 128 | scale_uniformly=True) 129 | 130 | # With probability 0.5, place the input image randomly on a canvas filled with 131 | # mean color values according to the parameters set above. With probability 0.5, 132 | # return the input image unaltered. 
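        # Since the canvas can measure up to four times the input image in each spatial dimension, the original image content can end up covering as little as 1/16 of the output area, which is what produces the additional small-object training examples referenced in the class docstring above.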
self.expand = RandomPatch(patch_coord_generator=self.patch_coord_generator, 134 | box_filter=None, 135 | image_validator=None, 136 | n_trials_max=1, 137 | clip_boxes=False, 138 | prob=0.5, 139 | background=background, 140 | labels_format=self.labels_format) 141 | 142 | def __call__(self, image, labels=None, return_inverter=False): 143 | self.expand.labels_format = self.labels_format 144 | return self.expand(image, labels, return_inverter) 145 | 146 | class SSDPhotometricDistortions: 147 | ''' 148 | Performs the photometric distortions defined by the `train_transform_param` instructions 149 | of the original Caffe implementation of SSD. 150 | ''' 151 | 152 | def __init__(self): 153 | 154 | self.convert_RGB_to_HSV = ConvertColor(current='RGB', to='HSV') 155 | self.convert_HSV_to_RGB = ConvertColor(current='HSV', to='RGB') 156 | self.convert_to_float32 = ConvertDataType(to='float32') 157 | self.convert_to_uint8 = ConvertDataType(to='uint8') 158 | self.convert_to_3_channels = ConvertTo3Channels() 159 | self.random_brightness = RandomBrightness(lower=-32, upper=32, prob=0.5) 160 | self.random_contrast = RandomContrast(lower=0.5, upper=1.5, prob=0.5) 161 | self.random_saturation = RandomSaturation(lower=0.5, upper=1.5, prob=0.5) 162 | self.random_hue = RandomHue(max_delta=18, prob=0.5) 163 | self.random_channel_swap = RandomChannelSwap(prob=0.0) 164 | 165 | self.sequence1 = [self.convert_to_3_channels, 166 | self.convert_to_float32, 167 | self.random_brightness, 168 | self.random_contrast, 169 | self.convert_to_uint8, 170 | self.convert_RGB_to_HSV, 171 | self.convert_to_float32, 172 | self.random_saturation, 173 | self.random_hue, 174 | self.convert_to_uint8, 175 | self.convert_HSV_to_RGB, 176 | self.random_channel_swap] 177 | 178 | self.sequence2 = [self.convert_to_3_channels, 179 | self.convert_to_float32, 180 | self.random_brightness, 181 | self.convert_to_uint8, 182 | self.convert_RGB_to_HSV, 183 | self.convert_to_float32, 184 | self.random_saturation, 185 | self.random_hue, 186 | self.convert_to_uint8, 187 | self.convert_HSV_to_RGB, 188 | self.convert_to_float32, 189 | self.random_contrast, 190 | self.convert_to_uint8, 191 | self.random_channel_swap] 192 | 193 | def __call__(self, image, labels): 194 | 195 | # Choose sequence 1 with probability 0.5. 196 | if np.random.choice(2): 197 | 198 | for transform in self.sequence1: 199 | image, labels = transform(image, labels) 200 | return image, labels 201 | # Choose sequence 2 with probability 0.5. 202 | else: 203 | 204 | for transform in self.sequence2: 205 | image, labels = transform(image, labels) 206 | return image, labels 207 | 208 | class SSDDataAugmentation: 209 | ''' 210 | Reproduces the data augmentation pipeline used in the training of the original 211 | Caffe implementation of SSD. 212 | ''' 213 | 214 | def __init__(self, 215 | img_height=300, 216 | img_width=300, 217 | background=(123, 117, 104), 218 | labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}): 219 | ''' 220 | Arguments: 221 | img_height (int): The desired height of the output images in pixels. 222 | img_width (int): The desired width of the output images in pixels. 223 | background (list/tuple, optional): A 3-tuple specifying the RGB color value of the 224 | background pixels of the translated images. 225 | labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels 226 | of an image contains which bounding box coordinate.
The dictionary maps at least the keywords 227 | 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within last axis of the labels array. 228 | ''' 229 | 230 | self.labels_format = labels_format 231 | 232 | self.photometric_distortions = SSDPhotometricDistortions() 233 | self.expand = SSDExpand(background=background, labels_format=self.labels_format) 234 | self.random_crop = SSDRandomCrop(labels_format=self.labels_format) 235 | self.random_flip = RandomFlip(dim='horizontal', prob=0.5, labels_format=self.labels_format) 236 | 237 | # This box filter makes sure that the resized images don't contain any degenerate boxes. 238 | # Resizing the images could lead the boxes to become smaller. For boxes that are already 239 | # pretty small, that might result in boxes with height and/or width zero, which we obviously 240 | # cannot allow. 241 | self.box_filter = BoxFilter(check_overlap=False, 242 | check_min_area=False, 243 | check_degenerate=True, 244 | labels_format=self.labels_format) 245 | 246 | self.resize = ResizeRandomInterp(height=img_height, 247 | width=img_width, 248 | interpolation_modes=[cv2.INTER_NEAREST, 249 | cv2.INTER_LINEAR, 250 | cv2.INTER_CUBIC, 251 | cv2.INTER_AREA, 252 | cv2.INTER_LANCZOS4], 253 | box_filter=self.box_filter, 254 | labels_format=self.labels_format) 255 | 256 | self.sequence = [self.photometric_distortions, 257 | self.expand, 258 | self.random_crop, 259 | self.random_flip, 260 | self.resize] 261 | 262 | def __call__(self, image, labels, return_inverter=False): 263 | self.expand.labels_format = self.labels_format 264 | self.random_crop.labels_format = self.labels_format 265 | self.random_flip.labels_format = self.labels_format 266 | self.resize.labels_format = self.labels_format 267 | 268 | inverters = [] 269 | 270 | for transform in self.sequence: 271 | if return_inverter and ('return_inverter' in inspect.signature(transform).parameters): 272 | image, labels, inverter = transform(image, labels, return_inverter=True) 273 | inverters.append(inverter) 274 | else: 275 | image, labels = transform(image, labels) 276 | 277 | if return_inverter: 278 | return image, labels, inverters[::-1] 279 | else: 280 | return image, labels 281 | -------------------------------------------------------------------------------- /data_generator/data_augmentation_chain_satellite.py: -------------------------------------------------------------------------------- 1 | ''' 2 | A data augmentation pipeline for datasets in bird's eye view, i.e. where there is 3 | no "up" or "down" in the images. 4 | 5 | Copyright (C) 2018 Pierluigi Ferrari 6 | 7 | Licensed under the Apache License, Version 2.0 (the "License"); 8 | you may not use this file except in compliance with the License. 9 | You may obtain a copy of the License at 10 | 11 | http://www.apache.org/licenses/LICENSE-2.0 12 | 13 | Unless required by applicable law or agreed to in writing, software 14 | distributed under the License is distributed on an "AS IS" BASIS, 15 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | See the License for the specific language governing permissions and 17 | limitations under the License.
18 | ''' 19 | 20 | from __future__ import division 21 | import numpy as np 22 | 23 | from data_generator.object_detection_2d_photometric_ops import ConvertColor, ConvertDataType, ConvertTo3Channels, RandomBrightness, RandomContrast, RandomHue, RandomSaturation 24 | from data_generator.object_detection_2d_geometric_ops import Resize, RandomFlip, RandomRotate 25 | from data_generator.object_detection_2d_patch_sampling_ops import PatchCoordinateGenerator, RandomPatch 26 | from data_generator.object_detection_2d_image_boxes_validation_utils import BoxFilter, ImageValidator 27 | 28 | class DataAugmentationSatellite: 29 | ''' 30 | A data augmentation pipeline for datasets in bird's eye view, i.e. where there is 31 | no "up" or "down" in the images. 32 | 33 | Applies a chain of photometric and geometric image transformations. For documentation, please refer 34 | to the documentation of the individual transformations involved. 35 | ''' 36 | 37 | def __init__(self, 38 | resize_height, 39 | resize_width, 40 | random_brightness=(-48, 48, 0.5), 41 | random_contrast=(0.5, 1.8, 0.5), 42 | random_saturation=(0.5, 1.8, 0.5), 43 | random_hue=(18, 0.5), 44 | random_flip=0.5, 45 | random_rotate=([90, 180, 270], 0.5), 46 | min_scale=0.3, 47 | max_scale=2.0, 48 | min_aspect_ratio = 0.8, 49 | max_aspect_ratio = 1.25, 50 | n_trials_max=3, 51 | clip_boxes=True, 52 | overlap_criterion='area', 53 | bounds_box_filter=(0.3, 1.0), 54 | bounds_validator=(0.5, 1.0), 55 | n_boxes_min=1, 56 | background=(0,0,0), 57 | labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}): 58 | 59 | self.n_trials_max = n_trials_max 60 | self.clip_boxes = clip_boxes 61 | self.overlap_criterion = overlap_criterion 62 | self.bounds_box_filter = bounds_box_filter 63 | self.bounds_validator = bounds_validator 64 | self.n_boxes_min = n_boxes_min 65 | self.background = background 66 | self.labels_format = labels_format 67 | 68 | # Determines which boxes are kept in an image after the transformations have been applied. 69 | self.box_filter_patch = BoxFilter(check_overlap=True, 70 | check_min_area=False, 71 | check_degenerate=False, 72 | overlap_criterion=self.overlap_criterion, 73 | overlap_bounds=self.bounds_box_filter, 74 | labels_format=self.labels_format) 75 | 76 | self.box_filter_resize = BoxFilter(check_overlap=False, 77 | check_min_area=True, 78 | check_degenerate=True, 79 | min_area=16, 80 | labels_format=self.labels_format) 81 | 82 | # Determines whether the result of the transformations is a valid training image. 83 | self.image_validator = ImageValidator(overlap_criterion=self.overlap_criterion, 84 | bounds=self.bounds_validator, 85 | n_boxes_min=self.n_boxes_min, 86 | labels_format=self.labels_format) 87 | 88 | # Utility transformations 89 | self.convert_to_3_channels = ConvertTo3Channels() # Make sure all images end up having 3 channels. 
90 | self.convert_RGB_to_HSV = ConvertColor(current='RGB', to='HSV') 91 | self.convert_HSV_to_RGB = ConvertColor(current='HSV', to='RGB') 92 | self.convert_to_float32 = ConvertDataType(to='float32') 93 | self.convert_to_uint8 = ConvertDataType(to='uint8') 94 | self.resize = Resize(height=resize_height, 95 | width=resize_width, 96 | box_filter=self.box_filter_resize, 97 | labels_format=self.labels_format) 98 | 99 | # Photometric transformations 100 | self.random_brightness = RandomBrightness(lower=random_brightness[0], upper=random_brightness[1], prob=random_brightness[2]) 101 | self.random_contrast = RandomContrast(lower=random_contrast[0], upper=random_contrast[1], prob=random_contrast[2]) 102 | self.random_saturation = RandomSaturation(lower=random_saturation[0], upper=random_saturation[1], prob=random_saturation[2]) 103 | self.random_hue = RandomHue(max_delta=random_hue[0], prob=random_hue[1]) 104 | 105 | # Geometric transformations 106 | self.random_horizontal_flip = RandomFlip(dim='horizontal', prob=random_flip, labels_format=self.labels_format) 107 | self.random_vertical_flip = RandomFlip(dim='vertical', prob=random_flip, labels_format=self.labels_format) 108 | self.random_rotate = RandomRotate(angles=random_rotate[0], prob=random_rotate[1], labels_format=self.labels_format) 109 | self.patch_coord_generator = PatchCoordinateGenerator(must_match='w_ar', 110 | min_scale=min_scale, 111 | max_scale=max_scale, 112 | scale_uniformly=False, 113 | min_aspect_ratio = min_aspect_ratio, 114 | max_aspect_ratio = max_aspect_ratio) 115 | self.random_patch = RandomPatch(patch_coord_generator=self.patch_coord_generator, 116 | box_filter=self.box_filter_patch, 117 | image_validator=self.image_validator, 118 | n_trials_max=self.n_trials_max, 119 | clip_boxes=self.clip_boxes, 120 | prob=1.0, 121 | can_fail=False, 122 | labels_format=self.labels_format) 123 | 124 | # Define the processing chain. 125 | self.transformations = [self.convert_to_3_channels, 126 | self.convert_to_float32, 127 | self.random_brightness, 128 | self.random_contrast, 129 | self.convert_to_uint8, 130 | self.convert_RGB_to_HSV, 131 | self.convert_to_float32, 132 | self.random_saturation, 133 | self.random_hue, 134 | self.convert_to_uint8, 135 | self.convert_HSV_to_RGB, 136 | self.random_horizontal_flip, 137 | self.random_vertical_flip, 138 | self.random_rotate, 139 | self.random_patch, 140 | self.resize] 141 | 142 | def __call__(self, image, labels=None): 143 | 144 | self.random_patch.labels_format = self.labels_format 145 | self.random_horizontal_flip.labels_format = self.labels_format 146 | self.random_vertical_flip.labels_format = self.labels_format 147 | self.random_rotate.labels_format = self.labels_format 148 | self.resize.labels_format = self.labels_format 149 | 150 | if not (labels is None): 151 | for transform in self.transformations: 152 | image, labels = transform(image, labels) 153 | return image, labels 154 | else: 155 | for transform in self.transformations: 156 | image = transform(image) 157 | return image 158 | -------------------------------------------------------------------------------- /data_generator/data_augmentation_chain_variable_input_size.py: -------------------------------------------------------------------------------- 1 | ''' 2 | A data augmentation pipeline suitable for variable-size images that produces effects 3 | that are similar (but not identical) to those of the original SSD data augmentation 4 | pipeline while being faster.
5 | 6 | Copyright (C) 2018 Pierluigi Ferrari 7 | 8 | Licensed under the Apache License, Version 2.0 (the "License"); 9 | you may not use this file except in compliance with the License. 10 | You may obtain a copy of the License at 11 | 12 | http://www.apache.org/licenses/LICENSE-2.0 13 | 14 | Unless required by applicable law or agreed to in writing, software 15 | distributed under the License is distributed on an "AS IS" BASIS, 16 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | See the License for the specific language governing permissions and 18 | limitations under the License. 19 | ''' 20 | 21 | from __future__ import division 22 | import numpy as np 23 | 24 | from data_generator.object_detection_2d_photometric_ops import ConvertColor, ConvertDataType, ConvertTo3Channels, RandomBrightness, RandomContrast, RandomHue, RandomSaturation 25 | from data_generator.object_detection_2d_geometric_ops import Resize, RandomFlip 26 | from data_generator.object_detection_2d_patch_sampling_ops import PatchCoordinateGenerator, RandomPatch 27 | from data_generator.object_detection_2d_image_boxes_validation_utils import BoxFilter, ImageValidator 28 | 29 | class DataAugmentationVariableInputSize: 30 | ''' 31 | A data augmentation pipeline suitable for variable-size images that produces effects 32 | that are similar (but not identical!) to those of the original SSD data augmentation 33 | pipeline while being faster. 34 | 35 | Applies a chain of photometric and geometric image transformations. For documentation, please refer 36 | to the documentation of the individual transformations involved. 37 | ''' 38 | 39 | def __init__(self, 40 | resize_height, 41 | resize_width, 42 | random_brightness=(-48, 48, 0.5), 43 | random_contrast=(0.5, 1.8, 0.5), 44 | random_saturation=(0.5, 1.8, 0.5), 45 | random_hue=(18, 0.5), 46 | random_flip=0.5, 47 | min_scale=0.3, 48 | max_scale=2.0, 49 | min_aspect_ratio = 0.5, 50 | max_aspect_ratio = 2.0, 51 | n_trials_max=3, 52 | clip_boxes=True, 53 | overlap_criterion='area', 54 | bounds_box_filter=(0.3, 1.0), 55 | bounds_validator=(0.5, 1.0), 56 | n_boxes_min=1, 57 | background=(0,0,0), 58 | labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}): 59 | 60 | self.n_trials_max = n_trials_max 61 | self.clip_boxes = clip_boxes 62 | self.overlap_criterion = overlap_criterion 63 | self.bounds_box_filter = bounds_box_filter 64 | self.bounds_validator = bounds_validator 65 | self.n_boxes_min = n_boxes_min 66 | self.background = background 67 | self.labels_format = labels_format 68 | 69 | # Determines which boxes are kept in an image after the transformations have been applied. 70 | self.box_filter_patch = BoxFilter(check_overlap=True, 71 | check_min_area=False, 72 | check_degenerate=False, 73 | overlap_criterion=self.overlap_criterion, 74 | overlap_bounds=self.bounds_box_filter, 75 | labels_format=self.labels_format) 76 | 77 | self.box_filter_resize = BoxFilter(check_overlap=False, 78 | check_min_area=True, 79 | check_degenerate=True, 80 | min_area=16, 81 | labels_format=self.labels_format) 82 | 83 | # Determines whether the result of the transformations is a valid training image. 84 | self.image_validator = ImageValidator(overlap_criterion=self.overlap_criterion, 85 | bounds=self.bounds_validator, 86 | n_boxes_min=self.n_boxes_min, 87 | labels_format=self.labels_format) 88 | 89 | # Utility transformations 90 | self.convert_to_3_channels = ConvertTo3Channels() # Make sure all images end up having 3 channels. 
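        # Note that `self.resize`, defined just below, runs as the last step of the processing chain, so images of arbitrary input size all leave the pipeline at the fixed size (resize_height, resize_width).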
91 | self.convert_RGB_to_HSV = ConvertColor(current='RGB', to='HSV') 92 | self.convert_HSV_to_RGB = ConvertColor(current='HSV', to='RGB') 93 | self.convert_to_float32 = ConvertDataType(to='float32') 94 | self.convert_to_uint8 = ConvertDataType(to='uint8') 95 | self.resize = Resize(height=resize_height, 96 | width=resize_width, 97 | box_filter=self.box_filter_resize, 98 | labels_format=self.labels_format) 99 | 100 | # Photometric transformations 101 | self.random_brightness = RandomBrightness(lower=random_brightness[0], upper=random_brightness[1], prob=random_brightness[2]) 102 | self.random_contrast = RandomContrast(lower=random_contrast[0], upper=random_contrast[1], prob=random_contrast[2]) 103 | self.random_saturation = RandomSaturation(lower=random_saturation[0], upper=random_saturation[1], prob=random_saturation[2]) 104 | self.random_hue = RandomHue(max_delta=random_hue[0], prob=random_hue[1]) 105 | 106 | # Geometric transformations 107 | self.random_flip = RandomFlip(dim='horizontal', prob=random_flip, labels_format=self.labels_format) 108 | self.patch_coord_generator = PatchCoordinateGenerator(must_match='w_ar', 109 | min_scale=min_scale, 110 | max_scale=max_scale, 111 | scale_uniformly=False, 112 | min_aspect_ratio = min_aspect_ratio, 113 | max_aspect_ratio = max_aspect_ratio) 114 | self.random_patch = RandomPatch(patch_coord_generator=self.patch_coord_generator, 115 | box_filter=self.box_filter_patch, 116 | image_validator=self.image_validator, 117 | n_trials_max=self.n_trials_max, 118 | clip_boxes=self.clip_boxes, 119 | prob=1.0, 120 | can_fail=False, 121 | labels_format=self.labels_format) 122 | 123 | # Define the processing chain 124 | self.transformations = [self.convert_to_3_channels, 125 | self.convert_to_float32, 126 | self.random_brightness, 127 | self.random_contrast, 128 | self.convert_to_uint8, 129 | self.convert_RGB_to_HSV, 130 | self.convert_to_float32, 131 | self.random_saturation, 132 | self.random_hue, 133 | self.convert_to_uint8, 134 | self.convert_HSV_to_RGB, 135 | self.random_patch, 136 | self.random_flip, 137 | self.resize] 138 | 139 | def __call__(self, image, labels=None): 140 | 141 | self.random_patch.labels_format = self.labels_format 142 | self.random_flip.labels_format = self.labels_format 143 | self.resize.labels_format = self.labels_format 144 | 145 | if not (labels is None): 146 | for transform in self.transformations: 147 | image, labels = transform(image, labels) 148 | return image, labels 149 | else: 150 | for transform in self.transformations: 151 | image = transform(image) 152 | return image 153 | -------------------------------------------------------------------------------- /data_generator/object_detection_2d_image_boxes_validation_utils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Utilities for 2D object detection related to answering the following questions: 3 | 1. Given an image size and bounding boxes, which bounding boxes meet certain 4 | requirements with respect to the image size? 5 | 2. Given an image size and bounding boxes, is an image of that size valid with 6 | respect to the bounding boxes according to certain requirements? 7 | 8 | Copyright (C) 2018 Pierluigi Ferrari 9 | 10 | Licensed under the Apache License, Version 2.0 (the "License"); 11 | you may not use this file except in compliance with the License.
12 | You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, software 17 | distributed under the License is distributed on an "AS IS" BASIS, 18 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | See the License for the specific language governing permissions and 20 | limitations under the License. 21 | ''' 22 | 23 | from __future__ import division 24 | import numpy as np 25 | 26 | from bounding_box_utils.bounding_box_utils import iou 27 | 28 | class BoundGenerator: 29 | ''' 30 | Generates pairs of floating point values that represent lower and upper bounds 31 | from a given sample space. 32 | ''' 33 | def __init__(self, 34 | sample_space=((0.1, None), 35 | (0.3, None), 36 | (0.5, None), 37 | (0.7, None), 38 | (0.9, None), 39 | (None, None)), 40 | weights=None): 41 | ''' 42 | Arguments: 43 | sample_space (list or tuple): A list, tuple, or array-like object of shape 44 | `(n, 2)` that contains `n` samples to choose from, where each sample 45 | is a 2-tuple of scalars and/or `None` values. 46 | weights (list or tuple, optional): A list or tuple representing the distribution 47 | over the sample space. If `None`, a uniform distribution will be assumed. 48 | ''' 49 | 50 | if (not (weights is None)) and len(weights) != len(sample_space): 51 | raise ValueError("`weights` must either be `None` for uniform distribution or have the same length as `sample_space`.") 52 | 53 | self.sample_space = [] 54 | for bound_pair in sample_space: 55 | if len(bound_pair) != 2: 56 | raise ValueError("All elements of the sample space must be 2-tuples.") 57 | bound_pair = list(bound_pair) 58 | if bound_pair[0] is None: bound_pair[0] = 0.0 59 | if bound_pair[1] is None: bound_pair[1] = 1.0 60 | if bound_pair[0] > bound_pair[1]: 61 | raise ValueError("For all sample space elements, the lower bound cannot be greater than the upper bound.") 62 | self.sample_space.append(bound_pair) 63 | 64 | self.sample_space_size = len(self.sample_space) 65 | 66 | if weights is None: 67 | self.weights = [1.0/self.sample_space_size] * self.sample_space_size 68 | else: 69 | self.weights = weights 70 | 71 | def __call__(self): 72 | ''' 73 | Returns: 74 | An item of the sample space, i.e. a 2-tuple of scalars. 75 | ''' 76 | i = np.random.choice(self.sample_space_size, p=self.weights) 77 | return self.sample_space[i] 78 | 79 | class BoxFilter: 80 | ''' 81 | Returns all bounding boxes that are valid with respect to the defined criteria. 82 | ''' 83 | 84 | def __init__(self, 85 | check_overlap=True, 86 | check_min_area=True, 87 | check_degenerate=True, 88 | overlap_criterion='center_point', 89 | overlap_bounds=(0.3, 1.0), 90 | min_area=16, 91 | labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4}, 92 | border_pixels='half'): 93 | ''' 94 | Arguments: 95 | check_overlap (bool, optional): Whether or not to enforce the overlap requirements defined by 96 | `overlap_criterion` and `overlap_bounds`. Sometimes you might want to use the box filter only 97 | to enforce a certain minimum area for all boxes (see the next argument); in such cases you can 98 | turn the overlap requirements off. 99 | check_min_area (bool, optional): Whether or not to enforce the minimum area requirement defined 100 | by `min_area`. If `True`, any boxes that have an area (in pixels) that is smaller than `min_area` 101 | will be removed from the labels of an image.
Bounding boxes below a certain area aren't useful
102 |                 training examples. An object that takes up only, say, 5 pixels in an image is probably not
103 |                 recognizable anymore, neither for a human, nor for an object detection model. It makes sense
104 |                 to remove such boxes.
105 |             check_degenerate (bool, optional): Whether or not to check for and remove degenerate bounding boxes.
106 |                 Degenerate bounding boxes are boxes that have `xmax <= xmin` and/or `ymax <= ymin`. In particular,
107 |                 boxes with a width and/or height of zero are degenerate. It is obviously important to filter out
108 |                 such boxes, so you should only set this option to `False` if you are certain that degenerate
109 |                 boxes are not possible in your data and processing chain.
110 |             overlap_criterion (str, optional): Can be either of 'center_point', 'iou', or 'area'. Determines
111 |                 which boxes are considered valid with respect to a given image. If set to 'center_point',
112 |                 a given bounding box is considered valid if its center point lies within the image.
113 |                 If set to 'area', a given bounding box is considered valid if the quotient of its intersection
114 |                 area with the image and its own area is within the given `overlap_bounds`. If set to 'iou', a given
115 |                 bounding box is considered valid if its IoU with the image is within the given `overlap_bounds`.
116 |             overlap_bounds (list or BoundGenerator, optional): Only relevant if `overlap_criterion` is 'area' or 'iou'.
117 |                 Determines the lower and upper bounds for `overlap_criterion`. Can be either a 2-tuple of scalars
118 |                 representing a lower bound and an upper bound, or a `BoundGenerator` object, which provides
119 |                 the possibility to generate bounds randomly.
120 |             min_area (int, optional): Only relevant if `check_min_area` is `True`. Defines the minimum area in
121 |                 pixels that a bounding box must have in order to be valid. Boxes with an area smaller than this
122 |                 will be removed.
123 |             labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels
124 |                 of an image contains which bounding box coordinate. The dictionary maps at least the keywords
125 |                 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within the last axis of the labels array.
126 |             border_pixels (str, optional): How to treat the border pixels of the bounding boxes.
127 |                 Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong
128 |                 to the boxes. If 'exclude', the border pixels do not belong to the boxes.
129 |                 If 'half', then one of each of the two horizontal and vertical borders belongs
130 |                 to the boxes, but not the other.
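
        A minimal usage sketch (hypothetical values; `labels` is an array with one
        row per box in the format defined by `labels_format`):

            box_filter = BoxFilter(check_overlap=True,
                                   check_min_area=True,
                                   overlap_criterion='center_point',
                                   min_area=16)
            valid_labels = box_filter(labels, image_height=300, image_width=300)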
131 |         '''
132 |         if not isinstance(overlap_bounds, (list, tuple, BoundGenerator)):
133 |             raise ValueError("`overlap_bounds` must be either a 2-tuple of scalars or a `BoundGenerator` object.")
134 |         if isinstance(overlap_bounds, (list, tuple)) and (overlap_bounds[0] > overlap_bounds[1]):
135 |             raise ValueError("The lower bound must not be greater than the upper bound.")
136 |         if not (overlap_criterion in {'iou', 'area', 'center_point'}):
137 |             raise ValueError("`overlap_criterion` must be one of 'iou', 'area', or 'center_point'.")
138 |         self.overlap_criterion = overlap_criterion
139 |         self.overlap_bounds = overlap_bounds
140 |         self.min_area = min_area
141 |         self.check_overlap = check_overlap
142 |         self.check_min_area = check_min_area
143 |         self.check_degenerate = check_degenerate
144 |         self.labels_format = labels_format
145 |         self.border_pixels = border_pixels
146 | 
147 |     def __call__(self,
148 |                  labels,
149 |                  image_height=None,
150 |                  image_width=None):
151 |         '''
152 |         Arguments:
153 |             labels (array): The labels to be filtered. This is an array with shape `(m,n)`, where
154 |                 `m` is the number of bounding boxes and `n` is the number of elements that defines
155 |                 each bounding box (box coordinates, class ID, etc.). The box coordinates are expected
156 |                 to be in the image's coordinate system.
157 |             image_height (int): Only relevant if `check_overlap == True`. The height of the image
158 |                 (in pixels) to compare the box coordinates to.
159 |             image_width (int): Only relevant if `check_overlap == True`. The width of the image (in pixels) to compare
160 |                 the box coordinates to.
161 | 
162 |         Returns:
163 |             An array containing the labels of all boxes that are valid.
164 |         '''
165 | 
166 |         labels = np.copy(labels)
167 | 
168 |         xmin = self.labels_format['xmin']
169 |         ymin = self.labels_format['ymin']
170 |         xmax = self.labels_format['xmax']
171 |         ymax = self.labels_format['ymax']
172 | 
173 |         # Record the boxes that pass all checks here.
174 |         requirements_met = np.ones(shape=labels.shape[0], dtype=bool) # `bool` rather than the deprecated `np.bool`.
175 | 
176 |         if self.check_degenerate:
177 | 
178 |             non_degenerate = (labels[:,xmax] > labels[:,xmin]) * (labels[:,ymax] > labels[:,ymin])
179 |             requirements_met *= non_degenerate
180 | 
181 |         if self.check_min_area:
182 | 
183 |             min_area_met = (labels[:,xmax] - labels[:,xmin]) * (labels[:,ymax] - labels[:,ymin]) >= self.min_area
184 |             requirements_met *= min_area_met
185 | 
186 |         if self.check_overlap:
187 | 
188 |             # Get the lower and upper bounds.
189 |             if isinstance(self.overlap_bounds, BoundGenerator):
190 |                 lower, upper = self.overlap_bounds()
191 |             else:
192 |                 lower, upper = self.overlap_bounds
193 | 
194 |             # Compute which boxes are valid.
195 | 
196 |             if self.overlap_criterion == 'iou':
197 |                 # Compute the patch coordinates.
198 |                 image_coords = np.array([0, 0, image_width, image_height])
199 |                 # Compute the IoU between the patch and all of the ground truth boxes.
200 |                 image_boxes_iou = iou(image_coords, labels[:, [xmin, ymin, xmax, ymax]], coords='corners', mode='element-wise', border_pixels=self.border_pixels)
201 |                 requirements_met *= (image_boxes_iou > lower) * (image_boxes_iou <= upper)
202 | 
203 |             elif self.overlap_criterion == 'area':
204 |                 if self.border_pixels == 'half':
205 |                     d = 0
206 |                 elif self.border_pixels == 'include':
207 |                     d = 1 # If border pixels are supposed to belong to the bounding boxes, we have to add one pixel to any difference `xmax - xmin` or `ymax - ymin`.
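                    # (For example, a box with `xmin == 10` and `xmax == 19` covers the ten pixel
                    # columns 10 through 19, so its width is `19 - 10 + 1 = 10` in this convention.)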
208 |                 elif self.border_pixels == 'exclude':
209 |                     d = -1 # If border pixels are not supposed to belong to the bounding boxes, we have to subtract one pixel from any difference `xmax - xmin` or `ymax - ymin`.
210 |                 # Compute the areas of the boxes.
211 |                 box_areas = (labels[:,xmax] - labels[:,xmin] + d) * (labels[:,ymax] - labels[:,ymin] + d)
212 |                 # Compute the intersection area between the patch and all of the ground truth boxes.
213 |                 clipped_boxes = np.copy(labels)
214 |                 clipped_boxes[:,[ymin,ymax]] = np.clip(labels[:,[ymin,ymax]], a_min=0, a_max=image_height-1)
215 |                 clipped_boxes[:,[xmin,xmax]] = np.clip(labels[:,[xmin,xmax]], a_min=0, a_max=image_width-1)
216 |                 intersection_areas = (clipped_boxes[:,xmax] - clipped_boxes[:,xmin] + d) * (clipped_boxes[:,ymax] - clipped_boxes[:,ymin] + d) # `d` accounts for the border pixel convention chosen above.
217 |                 # Check which boxes meet the overlap requirements.
218 |                 if lower == 0.0:
219 |                     mask_lower = intersection_areas > lower * box_areas # If `lower == 0`, we want to make sure that boxes with area 0 don't count, hence the ">" sign instead of the ">=" sign.
220 |                 else:
221 |                     mask_lower = intersection_areas >= lower * box_areas # Especially for the case `lower == 1` we want the ">=" sign, otherwise no boxes would count at all.
222 |                 mask_upper = intersection_areas <= upper * box_areas
223 |                 requirements_met *= mask_lower * mask_upper
224 | 
225 |             elif self.overlap_criterion == 'center_point':
226 |                 # Compute the center points of the boxes.
227 |                 cy = (labels[:,ymin] + labels[:,ymax]) / 2
228 |                 cx = (labels[:,xmin] + labels[:,xmax]) / 2
229 |                 # Check which of the boxes have center points within the cropped patch and remove those that don't.
230 |                 requirements_met *= (cy >= 0.0) * (cy <= image_height-1) * (cx >= 0.0) * (cx <= image_width-1)
231 | 
232 |         return labels[requirements_met]
233 | 
234 | class ImageValidator:
235 |     '''
236 |     Returns `True` if a given minimum number of bounding boxes meets given overlap
237 |     requirements with an image of a given height and width.
238 |     '''
239 | 
240 |     def __init__(self,
241 |                  overlap_criterion='center_point',
242 |                  bounds=(0.3, 1.0),
243 |                  n_boxes_min=1,
244 |                  labels_format={'class_id': 0, 'xmin': 1, 'ymin': 2, 'xmax': 3, 'ymax': 4},
245 |                  border_pixels='half'):
246 |         '''
247 |         Arguments:
248 |             overlap_criterion (str, optional): Can be either of 'center_point', 'iou', or 'area'. Determines
249 |                 which boxes are considered valid with respect to a given image. If set to 'center_point',
250 |                 a given bounding box is considered valid if its center point lies within the image.
251 |                 If set to 'area', a given bounding box is considered valid if the quotient of its intersection
252 |                 area with the image and its own area is within `lower` and `upper`. If set to 'iou', a given
253 |                 bounding box is considered valid if its IoU with the image is within `lower` and `upper`.
254 |             bounds (list or BoundGenerator, optional): Only relevant if `overlap_criterion` is 'area' or 'iou'.
255 |                 Determines the lower and upper bounds for `overlap_criterion`. Can be either a 2-tuple of scalars
256 |                 representing a lower bound and an upper bound, or a `BoundGenerator` object, which provides
257 |                 the possibility to generate bounds randomly.
258 |             n_boxes_min (int or str, optional): Either a positive integer or the string 'all'.
259 |                 Determines the minimum number of boxes that must meet the `overlap_criterion` with respect to
260 |                 an image of the given height and width in order for the image to be a valid image.
261 |                 If set to 'all', an image is considered valid if all given boxes meet the `overlap_criterion`.
262 |             labels_format (dict, optional): A dictionary that defines which index in the last axis of the labels
263 |                 of an image contains which bounding box coordinate. The dictionary maps at least the keywords
264 |                 'xmin', 'ymin', 'xmax', and 'ymax' to their respective indices within the last axis of the labels array.
265 |             border_pixels (str, optional): How to treat the border pixels of the bounding boxes.
266 |                 Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong
267 |                 to the boxes. If 'exclude', the border pixels do not belong to the boxes.
268 |                 If 'half', then one of each of the two horizontal and vertical borders belongs
269 |                 to the boxes, but not the other.
270 |         '''
271 |         if not ((isinstance(n_boxes_min, int) and n_boxes_min > 0) or n_boxes_min == 'all'):
272 |             raise ValueError("`n_boxes_min` must be a positive integer or 'all'.")
273 |         self.overlap_criterion = overlap_criterion
274 |         self.bounds = bounds
275 |         self.n_boxes_min = n_boxes_min
276 |         self.labels_format = labels_format
277 |         self.border_pixels = border_pixels
278 |         self.box_filter = BoxFilter(check_overlap=True,
279 |                                     check_min_area=False,
280 |                                     check_degenerate=False,
281 |                                     overlap_criterion=self.overlap_criterion,
282 |                                     overlap_bounds=self.bounds,
283 |                                     labels_format=self.labels_format,
284 |                                     border_pixels=self.border_pixels)
285 | 
286 |     def __call__(self,
287 |                  labels,
288 |                  image_height,
289 |                  image_width):
290 |         '''
291 |         Arguments:
292 |             labels (array): The labels to be tested. The box coordinates are expected
293 |                 to be in the image's coordinate system.
294 |             image_height (int): The height of the image to compare the box coordinates to.
295 |             image_width (int): The width of the image to compare the box coordinates to.
296 | 
297 |         Returns:
298 |             A boolean indicating whether an image of the given height and width is
299 |             valid with respect to the given bounding boxes.
300 |         '''
301 | 
302 |         self.box_filter.overlap_bounds = self.bounds
303 |         self.box_filter.labels_format = self.labels_format
304 | 
305 |         # Get all boxes that meet the overlap requirements.
306 |         valid_labels = self.box_filter(labels=labels,
307 |                                        image_height=image_height,
308 |                                        image_width=image_width)
309 | 
310 |         # Check whether enough boxes meet the requirements.
311 |         if isinstance(self.n_boxes_min, int):
312 |             # The image is valid if at least `self.n_boxes_min` ground truth boxes meet the requirements.
313 |             if len(valid_labels) >= self.n_boxes_min:
314 |                 return True
315 |             else:
316 |                 return False
317 |         elif self.n_boxes_min == 'all':
318 |             # The image is valid if all ground truth boxes meet the requirements.
319 |             if len(valid_labels) == len(labels):
320 |                 return True
321 |             else:
322 |                 return False
323 | 
--------------------------------------------------------------------------------
/data_generator/object_detection_2d_misc_utils.py:
--------------------------------------------------------------------------------
1 | '''
2 | Miscellaneous data generator utilities.
3 | 
4 | Copyright (C) 2018 Pierluigi Ferrari
5 | 
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 | 
10 |     http://www.apache.org/licenses/LICENSE-2.0
11 | 
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | '''
18 | 
19 | from __future__ import division
20 | import numpy as np
21 | 
22 | def apply_inverse_transforms(y_pred_decoded, inverse_transforms):
23 |     '''
24 |     Takes a list or Numpy array of decoded predictions and applies a given list of
25 |     transforms to them. The list of inverse transforms would usually contain the
26 |     inverter functions that are returned by some of the image transformations that
27 |     come with this data generator. This function would normally be used to transform
28 |     predictions that were made on a transformed image back to the original image.
29 | 
30 |     Arguments:
31 |         y_pred_decoded (list or array): Either a list of length `batch_size` that
32 |             contains Numpy arrays that contain the predictions for each batch item
33 |             or a Numpy array. If this is a list of Numpy arrays, the arrays would
34 |             usually have the shape `(num_predictions, 6)`, where `num_predictions`
35 |             is different for each batch item. If this is a Numpy array, it would
36 |             usually have the shape `(batch_size, num_predictions, 6)`. The last axis
37 |             would usually contain the class ID, confidence score, and four bounding
38 |             box coordinates for each prediction.
39 |         inverse_transforms (list): A nested list of length `batch_size` that contains
40 |             for each batch item a list of functions that take one argument (one element
41 |             of `y_pred_decoded` if it is a list or one slice along the first axis of
42 |             `y_pred_decoded` if it is an array) and return an output of the same shape
43 |             and data type.
44 | 
45 |     Returns:
46 |         The transformed predictions, which have the same structure as `y_pred_decoded`.
47 |     '''
48 | 
49 |     if isinstance(y_pred_decoded, list):
50 | 
51 |         y_pred_decoded_inv = []
52 | 
53 |         for i in range(len(y_pred_decoded)):
54 |             y_pred_decoded_inv.append(np.copy(y_pred_decoded[i]))
55 |             if y_pred_decoded_inv[i].size > 0: # If there are any predictions for this batch item.
56 |                 for inverter in inverse_transforms[i]:
57 |                     if not (inverter is None):
58 |                         y_pred_decoded_inv[i] = inverter(y_pred_decoded_inv[i])
59 | 
60 |     elif isinstance(y_pred_decoded, np.ndarray):
61 | 
62 |         y_pred_decoded_inv = np.copy(y_pred_decoded)
63 | 
64 |         for i in range(len(y_pred_decoded)):
65 |             if y_pred_decoded_inv[i].size > 0: # If there are any predictions for this batch item.
66 |                 for inverter in inverse_transforms[i]:
67 |                     if not (inverter is None):
68 |                         y_pred_decoded_inv[i] = inverter(y_pred_decoded_inv[i])
69 | 
70 |     else:
71 |         raise ValueError("`y_pred_decoded` must be either a list or a Numpy array.")
72 | 
73 |     return y_pred_decoded_inv
74 | 
--------------------------------------------------------------------------------
/data_generator/object_detection_2d_photometric_ops.py:
--------------------------------------------------------------------------------
1 | '''
2 | Various photometric image transformations, both deterministic and probabilistic.
3 | 
4 | Copyright (C) 2018 Pierluigi Ferrari
5 | 
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 | 
10 |     http://www.apache.org/licenses/LICENSE-2.0
11 | 
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | '''
18 | 
19 | from __future__ import division
20 | import numpy as np
21 | import cv2
22 | 
23 | class ConvertColor:
24 |     '''
25 |     Converts images between RGB, HSV and grayscale color spaces. This is just a wrapper
26 |     around `cv2.cvtColor()`.
27 |     '''
28 |     def __init__(self, current='RGB', to='HSV', keep_3ch=True):
29 |         '''
30 |         Arguments:
31 |             current (str, optional): The current color space of the images. Can be
32 |                 one of 'RGB' and 'HSV'.
33 |             to (str, optional): The target color space of the images. Can be one of
34 |                 'RGB', 'HSV', and 'GRAY'.
35 |             keep_3ch (bool, optional): Only relevant if `to == 'GRAY'`.
36 |                 If `True`, the resulting grayscale images will have three channels.
37 |         '''
38 |         if not ((current in {'RGB', 'HSV'}) and (to in {'RGB', 'HSV', 'GRAY'})):
39 |             raise NotImplementedError("Only conversions between 'RGB', 'HSV', and 'GRAY' are supported.")
40 |         self.current = current
41 |         self.to = to
42 |         self.keep_3ch = keep_3ch
43 | 
44 |     def __call__(self, image, labels=None):
45 |         if self.current == 'RGB' and self.to == 'HSV':
46 |             image = cv2.cvtColor(image, cv2.COLOR_RGB2HSV)
47 |         elif self.current == 'RGB' and self.to == 'GRAY':
48 |             image = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
49 |             if self.keep_3ch:
50 |                 image = np.stack([image] * 3, axis=-1)
51 |         elif self.current == 'HSV' and self.to == 'RGB':
52 |             image = cv2.cvtColor(image, cv2.COLOR_HSV2RGB)
53 |         elif self.current == 'HSV' and self.to == 'GRAY':
54 |             image = cv2.cvtColor(cv2.cvtColor(image, cv2.COLOR_HSV2RGB), cv2.COLOR_RGB2GRAY) # OpenCV has no direct HSV-to-grayscale conversion code, so convert via RGB.
55 |             if self.keep_3ch:
56 |                 image = np.stack([image] * 3, axis=-1)
57 |         if labels is None:
58 |             return image
59 |         else:
60 |             return image, labels
61 | 
62 | class ConvertDataType:
63 |     '''
64 |     Converts images represented as Numpy arrays between `uint8` and `float32`.
65 |     Serves as a helper for certain photometric distortions. This is just a wrapper
66 |     around `np.ndarray.astype()`.
67 |     '''
68 |     def __init__(self, to='uint8'):
69 |         '''
70 |         Arguments:
71 |             to (string, optional): To which datatype to convert the input images.
72 |                 Can be either 'uint8' or 'float32'.
73 |         '''
74 |         if not (to == 'uint8' or to == 'float32'):
75 |             raise ValueError("`to` can be either of 'uint8' or 'float32'.")
76 |         self.to = to
77 | 
78 |     def __call__(self, image, labels=None):
79 |         if self.to == 'uint8':
80 |             image = np.round(image, decimals=0).astype(np.uint8)
81 |         else:
82 |             image = image.astype(np.float32)
83 |         if labels is None:
84 |             return image
85 |         else:
86 |             return image, labels
87 | 
88 | class ConvertTo3Channels:
89 |     '''
90 |     Converts 1-channel and 4-channel images to 3-channel images. Does nothing to images that
91 |     already have 3 channels. In the case of 4-channel images, the fourth channel will be
92 |     discarded.
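
    A minimal usage sketch (hypothetical array, assuming Numpy is imported as `np`):

        convert_to_3_channels = ConvertTo3Channels()
        gray_image = np.zeros((300, 300), dtype=np.uint8) # A single-channel image.
        rgb_image = convert_to_3_channels(gray_image)     # Now has shape (300, 300, 3).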
93 | ''' 94 | def __init__(self): 95 | pass 96 | 97 | def __call__(self, image, labels=None): 98 | if image.ndim == 2: 99 | image = np.stack([image] * 3, axis=-1) 100 | elif image.ndim == 3: 101 | if image.shape[2] == 1: 102 | image = np.concatenate([image] * 3, axis=-1) 103 | elif image.shape[2] == 4: 104 | image = image[:,:,:3] 105 | if labels is None: 106 | return image 107 | else: 108 | return image, labels 109 | 110 | class Hue: 111 | ''' 112 | Changes the hue of HSV images. 113 | 114 | Important: 115 | - Expects HSV input. 116 | - Expects input array to be of `dtype` `float`. 117 | ''' 118 | def __init__(self, delta): 119 | ''' 120 | Arguments: 121 | delta (int): An integer in the closed interval `[-180, 180]` that determines the hue change, where 122 | a change by integer `delta` means a change by `2 * delta` degrees. Read up on the HSV color format 123 | if you need more information. 124 | ''' 125 | if not (-180 <= delta <= 180): raise ValueError("`delta` must be in the closed interval `[-180, 180]`.") 126 | self.delta = delta 127 | 128 | def __call__(self, image, labels=None): 129 | image[:, :, 0] = (image[:, :, 0] + self.delta) % 180.0 130 | if labels is None: 131 | return image 132 | else: 133 | return image, labels 134 | 135 | class RandomHue: 136 | ''' 137 | Randomly changes the hue of HSV images. 138 | 139 | Important: 140 | - Expects HSV input. 141 | - Expects input array to be of `dtype` `float`. 142 | ''' 143 | def __init__(self, max_delta=18, prob=0.5): 144 | ''' 145 | Arguments: 146 | max_delta (int): An integer in the closed interval `[0, 180]` that determines the maximal absolute 147 | hue change. 148 | prob (float, optional): `(1 - prob)` determines the probability with which the original, 149 | unaltered image is returned. 150 | ''' 151 | if not (0 <= max_delta <= 180): raise ValueError("`max_delta` must be in the closed interval `[0, 180]`.") 152 | self.max_delta = max_delta 153 | self.prob = prob 154 | self.change_hue = Hue(delta=0) 155 | 156 | def __call__(self, image, labels=None): 157 | p = np.random.uniform(0,1) 158 | if p >= (1.0-self.prob): 159 | self.change_hue.delta = np.random.uniform(-self.max_delta, self.max_delta) 160 | return self.change_hue(image, labels) 161 | elif labels is None: 162 | return image 163 | else: 164 | return image, labels 165 | 166 | class Saturation: 167 | ''' 168 | Changes the saturation of HSV images. 169 | 170 | Important: 171 | - Expects HSV input. 172 | - Expects input array to be of `dtype` `float`. 173 | ''' 174 | def __init__(self, factor): 175 | ''' 176 | Arguments: 177 | factor (float): A float greater than zero that determines saturation change, where 178 | values less than one result in less saturation and values greater than one result 179 | in more saturation. 180 | ''' 181 | if factor <= 0.0: raise ValueError("It must be `factor > 0`.") 182 | self.factor = factor 183 | 184 | def __call__(self, image, labels=None): 185 | image[:,:,1] = np.clip(image[:,:,1] * self.factor, 0, 255) 186 | if labels is None: 187 | return image 188 | else: 189 | return image, labels 190 | 191 | class RandomSaturation: 192 | ''' 193 | Randomly changes the saturation of HSV images. 194 | 195 | Important: 196 | - Expects HSV input. 197 | - Expects input array to be of `dtype` `float`. 198 | ''' 199 | def __init__(self, lower=0.3, upper=2.0, prob=0.5): 200 | ''' 201 | Arguments: 202 | lower (float, optional): A float greater than zero, the lower bound for the random 203 | saturation change. 
204 | upper (float, optional): A float greater than zero, the upper bound for the random 205 | saturation change. Must be greater than `lower`. 206 | prob (float, optional): `(1 - prob)` determines the probability with which the original, 207 | unaltered image is returned. 208 | ''' 209 | if lower >= upper: raise ValueError("`upper` must be greater than `lower`.") 210 | self.lower = lower 211 | self.upper = upper 212 | self.prob = prob 213 | self.change_saturation = Saturation(factor=1.0) 214 | 215 | def __call__(self, image, labels=None): 216 | p = np.random.uniform(0,1) 217 | if p >= (1.0-self.prob): 218 | self.change_saturation.factor = np.random.uniform(self.lower, self.upper) 219 | return self.change_saturation(image, labels) 220 | elif labels is None: 221 | return image 222 | else: 223 | return image, labels 224 | 225 | class Brightness: 226 | ''' 227 | Changes the brightness of RGB images. 228 | 229 | Important: 230 | - Expects RGB input. 231 | - Expects input array to be of `dtype` `float`. 232 | ''' 233 | def __init__(self, delta): 234 | ''' 235 | Arguments: 236 | delta (int): An integer, the amount to add to or subtract from the intensity 237 | of every pixel. 238 | ''' 239 | self.delta = delta 240 | 241 | def __call__(self, image, labels=None): 242 | image = np.clip(image + self.delta, 0, 255) 243 | if labels is None: 244 | return image 245 | else: 246 | return image, labels 247 | 248 | class RandomBrightness: 249 | ''' 250 | Randomly changes the brightness of RGB images. 251 | 252 | Important: 253 | - Expects RGB input. 254 | - Expects input array to be of `dtype` `float`. 255 | ''' 256 | def __init__(self, lower=-84, upper=84, prob=0.5): 257 | ''' 258 | Arguments: 259 | lower (int, optional): An integer, the lower bound for the random brightness change. 260 | upper (int, optional): An integer, the upper bound for the random brightness change. 261 | Must be greater than `lower`. 262 | prob (float, optional): `(1 - prob)` determines the probability with which the original, 263 | unaltered image is returned. 264 | ''' 265 | if lower >= upper: raise ValueError("`upper` must be greater than `lower`.") 266 | self.lower = float(lower) 267 | self.upper = float(upper) 268 | self.prob = prob 269 | self.change_brightness = Brightness(delta=0) 270 | 271 | def __call__(self, image, labels=None): 272 | p = np.random.uniform(0,1) 273 | if p >= (1.0-self.prob): 274 | self.change_brightness.delta = np.random.uniform(self.lower, self.upper) 275 | return self.change_brightness(image, labels) 276 | elif labels is None: 277 | return image 278 | else: 279 | return image, labels 280 | 281 | class Contrast: 282 | ''' 283 | Changes the contrast of RGB images. 284 | 285 | Important: 286 | - Expects RGB input. 287 | - Expects input array to be of `dtype` `float`. 288 | ''' 289 | def __init__(self, factor): 290 | ''' 291 | Arguments: 292 | factor (float): A float greater than zero that determines contrast change, where 293 | values less than one result in less contrast and values greater than one result 294 | in more contrast. 295 | ''' 296 | if factor <= 0.0: raise ValueError("It must be `factor > 0`.") 297 | self.factor = factor 298 | 299 | def __call__(self, image, labels=None): 300 | image = np.clip(127.5 + self.factor * (image - 127.5), 0, 255) 301 | if labels is None: 302 | return image 303 | else: 304 | return image, labels 305 | 306 | class RandomContrast: 307 | ''' 308 | Randomly changes the contrast of RGB images. 309 | 310 | Important: 311 | - Expects RGB input. 
312 |     - Expects input array to be of `dtype` `float`.
313 |     '''
314 |     def __init__(self, lower=0.5, upper=1.5, prob=0.5):
315 |         '''
316 |         Arguments:
317 |             lower (float, optional): A float greater than zero, the lower bound for the random
318 |                 contrast change.
319 |             upper (float, optional): A float greater than zero, the upper bound for the random
320 |                 contrast change. Must be greater than `lower`.
321 |             prob (float, optional): `(1 - prob)` determines the probability with which the original,
322 |                 unaltered image is returned.
323 |         '''
324 |         if lower >= upper: raise ValueError("`upper` must be greater than `lower`.")
325 |         self.lower = lower
326 |         self.upper = upper
327 |         self.prob = prob
328 |         self.change_contrast = Contrast(factor=1.0)
329 | 
330 |     def __call__(self, image, labels=None):
331 |         p = np.random.uniform(0,1)
332 |         if p >= (1.0-self.prob):
333 |             self.change_contrast.factor = np.random.uniform(self.lower, self.upper)
334 |             return self.change_contrast(image, labels)
335 |         elif labels is None:
336 |             return image
337 |         else:
338 |             return image, labels
339 | 
340 | class Gamma:
341 |     '''
342 |     Changes the gamma value of RGB images.
343 | 
344 |     Important: Expects RGB input of `dtype` `uint8` (required by `cv2.LUT()`).
345 |     '''
346 |     def __init__(self, gamma):
347 |         '''
348 |         Arguments:
349 |             gamma (float): A float greater than zero that determines gamma change.
350 |         '''
351 |         if gamma <= 0.0: raise ValueError("It must be `gamma > 0`.")
352 |         self.gamma = gamma
353 |         self.gamma_inv = 1.0 / gamma
354 |         # Build a lookup table mapping the pixel values [0, 255] to
355 |         # their adjusted gamma values.
356 |         self.table = np.array([((i / 255.0) ** self.gamma_inv) * 255 for i in np.arange(0, 256)]).astype("uint8")
357 | 
358 |     def __call__(self, image, labels=None):
359 |         image = cv2.LUT(image, self.table) # `self.table`, not the undefined name `table`.
360 |         if labels is None:
361 |             return image
362 |         else:
363 |             return image, labels
364 | 
365 | class RandomGamma:
366 |     '''
367 |     Randomly changes the gamma value of RGB images.
368 | 
369 |     Important: Expects RGB input.
370 |     '''
371 |     def __init__(self, lower=0.25, upper=2.0, prob=0.5):
372 |         '''
373 |         Arguments:
374 |             lower (float, optional): A float greater than zero, the lower bound for the random
375 |                 gamma change.
376 |             upper (float, optional): A float greater than zero, the upper bound for the random
377 |                 gamma change. Must be greater than `lower`.
378 |             prob (float, optional): `(1 - prob)` determines the probability with which the original,
379 |                 unaltered image is returned.
380 |         '''
381 |         if lower >= upper: raise ValueError("`upper` must be greater than `lower`.")
382 |         self.lower = lower
383 |         self.upper = upper
384 |         self.prob = prob
385 | 
386 |     def __call__(self, image, labels=None):
387 |         p = np.random.uniform(0,1)
388 |         if p >= (1.0-self.prob):
389 |             gamma = np.random.uniform(self.lower, self.upper)
390 |             change_gamma = Gamma(gamma=gamma)
391 |             return change_gamma(image, labels)
392 |         elif labels is None:
393 |             return image
394 |         else:
395 |             return image, labels
396 | 
397 | class HistogramEqualization:
398 |     '''
399 |     Performs histogram equalization on HSV images.
400 | 
401 |     Important: Expects HSV input.
402 |     '''
403 |     def __init__(self):
404 |         pass
405 | 
406 |     def __call__(self, image, labels=None):
407 |         image[:,:,2] = cv2.equalizeHist(image[:,:,2])
408 |         if labels is None:
409 |             return image
410 |         else:
411 |             return image, labels
412 | 
413 | class RandomHistogramEqualization:
414 |     '''
415 |     Randomly performs histogram equalization on HSV images.
The randomness only refers
416 |     to whether or not the equalization is performed.
417 | 
418 |     Important: Expects HSV input.
419 |     '''
420 |     def __init__(self, prob=0.5):
421 |         '''
422 |         Arguments:
423 |             prob (float, optional): `(1 - prob)` determines the probability with which the original,
424 |                 unaltered image is returned.
425 |         '''
426 |         self.prob = prob
427 |         self.equalize = HistogramEqualization()
428 | 
429 |     def __call__(self, image, labels=None):
430 |         p = np.random.uniform(0,1)
431 |         if p >= (1.0-self.prob):
432 |             return self.equalize(image, labels)
433 |         elif labels is None:
434 |             return image
435 |         else:
436 |             return image, labels
437 | 
438 | class ChannelSwap:
439 |     '''
440 |     Swaps the channels of images.
441 |     '''
442 |     def __init__(self, order):
443 |         '''
444 |         Arguments:
445 |             order (tuple): A tuple of integers that defines the desired channel order
446 |                 of the input images after the channel swap.
447 |         '''
448 |         self.order = order
449 | 
450 |     def __call__(self, image, labels=None):
451 |         image = image[:,:,self.order]
452 |         if labels is None:
453 |             return image
454 |         else:
455 |             return image, labels
456 | 
457 | class RandomChannelSwap:
458 |     '''
459 |     Randomly swaps the channels of RGB images.
460 | 
461 |     Important: Expects RGB input.
462 |     '''
463 |     def __init__(self, prob=0.5):
464 |         '''
465 |         Arguments:
466 |             prob (float, optional): `(1 - prob)` determines the probability with which the original,
467 |                 unaltered image is returned.
468 |         '''
469 |         self.prob = prob
470 |         # All possible permutations of the three image channels except the original order.
471 |         self.permutations = ((0, 2, 1),
472 |                              (1, 0, 2), (1, 2, 0),
473 |                              (2, 0, 1), (2, 1, 0))
474 |         self.swap_channels = ChannelSwap(order=(0, 1, 2))
475 | 
476 |     def __call__(self, image, labels=None):
477 |         p = np.random.uniform(0,1)
478 |         if p >= (1.0-self.prob):
479 |             i = np.random.randint(5) # Pick one of the five permutations that differ from the original channel order.
480 |             self.swap_channels.order = self.permutations[i]
481 |             return self.swap_channels(image, labels)
482 |         elif labels is None:
483 |             return image
484 |         else:
485 |             return image, labels
486 | 
--------------------------------------------------------------------------------
/eval_utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pierluigiferrari/ssd_keras/3ac9adaf3889f1020d74b0eeefea281d5e82f353/eval_utils/__init__.py
--------------------------------------------------------------------------------
/eval_utils/coco_utils.py:
--------------------------------------------------------------------------------
1 | '''
2 | A few utilities that are useful when working with the MS COCO datasets.
3 | 
4 | Copyright (C) 2018 Pierluigi Ferrari
5 | 
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 | 
10 |     http://www.apache.org/licenses/LICENSE-2.0
11 | 
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | ''' 18 | 19 | import json 20 | from tqdm import trange 21 | from math import ceil 22 | import sys 23 | 24 | from data_generator.object_detection_2d_geometric_ops import Resize 25 | from data_generator.object_detection_2d_patch_sampling_ops import RandomPadFixedAR 26 | from data_generator.object_detection_2d_photometric_ops import ConvertTo3Channels 27 | from ssd_encoder_decoder.ssd_output_decoder import decode_detections 28 | from data_generator.object_detection_2d_misc_utils import apply_inverse_transforms 29 | 30 | def get_coco_category_maps(annotations_file): 31 | ''' 32 | Builds dictionaries that map between MS COCO category IDs, transformed category IDs, and category names. 33 | The original MS COCO category IDs are not consecutive unfortunately: The 80 category IDs are spread 34 | across the integers 1 through 90 with some integers skipped. Since we usually use a one-hot 35 | class representation in neural networks, we need to map these non-consecutive original COCO category 36 | IDs (let's call them 'cats') to consecutive category IDs (let's call them 'classes'). 37 | 38 | Arguments: 39 | annotations_file (str): The filepath to any MS COCO annotations JSON file. 40 | 41 | Returns: 42 | 1) cats_to_classes: A dictionary that maps between the original (keys) and the transformed category IDs (values). 43 | 2) classes_to_cats: A dictionary that maps between the transformed (keys) and the original category IDs (values). 44 | 3) cats_to_names: A dictionary that maps between original category IDs (keys) and the respective category names (values). 45 | 4) classes_to_names: A list of the category names (values) with their indices representing the transformed IDs. 46 | ''' 47 | with open(annotations_file, 'r') as f: 48 | annotations = json.load(f) 49 | cats_to_classes = {} 50 | classes_to_cats = {} 51 | cats_to_names = {} 52 | classes_to_names = [] 53 | classes_to_names.append('background') # Need to add the background class first so that the indexing is right. 54 | for i, cat in enumerate(annotations['categories']): 55 | cats_to_classes[cat['id']] = i + 1 56 | classes_to_cats[i + 1] = cat['id'] 57 | cats_to_names[cat['id']] = cat['name'] 58 | classes_to_names.append(cat['name']) 59 | 60 | return cats_to_classes, classes_to_cats, cats_to_names, classes_to_names 61 | 62 | def predict_all_to_json(out_file, 63 | model, 64 | img_height, 65 | img_width, 66 | classes_to_cats, 67 | data_generator, 68 | batch_size, 69 | data_generator_mode='resize', 70 | model_mode='training', 71 | confidence_thresh=0.01, 72 | iou_threshold=0.45, 73 | top_k=200, 74 | pred_coords='centroids', 75 | normalize_coords=True): 76 | ''' 77 | Runs detection predictions over the whole dataset given a model and saves them in a JSON file 78 | in the MS COCO detection results format. 79 | 80 | Arguments: 81 | out_file (str): The file name (full path) under which to save the results JSON file. 82 | model (Keras model): A Keras SSD model object. 83 | img_height (int): The input image height for the model. 84 | img_width (int): The input image width for the model. 85 | classes_to_cats (dict): A dictionary that maps the consecutive class IDs predicted by the model 86 | to the non-consecutive original MS COCO category IDs. 87 | data_generator (DataGenerator): A `DataGenerator` object with the evaluation dataset. 88 | batch_size (int): The batch size for the evaluation. 89 | data_generator_mode (str, optional): Either of 'resize' or 'pad'. If 'resize', the input images will 90 | be resized (i.e. 
warped) to `(img_height, img_width)`. This mode does not preserve the aspect ratios of the images.
91 |             If 'pad', the input images will be first padded so that they have the aspect ratio defined by `img_height`
92 |             and `img_width` and then resized to `(img_height, img_width)`. This mode preserves the aspect ratios of the images.
93 |         model_mode (str, optional): The mode in which the model was created, i.e. 'training', 'inference' or 'inference_fast'.
94 |             This is needed in order to know whether the model output is already decoded or still needs to be decoded. Refer to
95 |             the model documentation for the meaning of the individual modes.
96 |         confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence in a specific
97 |             positive class in order to be considered for the non-maximum suppression stage for the respective class.
98 |             A lower value will result in a larger part of the selection process being done by the non-maximum suppression
99 |             stage, while a larger value will result in a larger part of the selection process happening in the confidence
100 |             thresholding stage.
101 |         iou_threshold (float, optional): A float in [0,1]. All boxes with a Jaccard similarity of greater than `iou_threshold`
102 |             with a locally maximal box will be removed from the set of predictions for a given class, where 'maximal' refers
103 |             to the box score.
104 |         top_k (int, optional): The number of highest scoring predictions to be kept for each batch item after the
105 |             non-maximum suppression stage. Defaults to 200, following the paper.
106 |         pred_coords (str, optional): The box coordinate format that the model outputs. Can be either 'centroids'
107 |             for the format `(cx, cy, w, h)` (box center coordinates, width, and height), 'minmax' for the format
108 |             `(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`.
109 |         normalize_coords (bool, optional): Set to `True` if the model outputs relative coordinates (i.e. coordinates in [0,1])
110 |             and you wish to transform these relative coordinates back to absolute coordinates. If the model outputs
111 |             relative coordinates, but you do not want to convert them back to absolute coordinates, set this to `False`.
112 |             Do not set this to `True` if the model already outputs absolute coordinates, as that would result in incorrect
113 |             coordinates. Requires `img_height` and `img_width` if set to `True`.
114 | 
115 |     Returns:
116 |         None.
117 |     '''
118 | 
119 |     convert_to_3_channels = ConvertTo3Channels()
120 |     resize = Resize(height=img_height,width=img_width)
121 |     if data_generator_mode == 'resize':
122 |         transformations = [convert_to_3_channels,
123 |                            resize]
124 |     elif data_generator_mode == 'pad':
125 |         random_pad = RandomPadFixedAR(patch_aspect_ratio=img_width/img_height, clip_boxes=False)
126 |         transformations = [convert_to_3_channels,
127 |                            random_pad,
128 |                            resize]
129 |     else:
130 |         raise ValueError("Unexpected argument value: `data_generator_mode` can be either of 'resize' or 'pad', but received '{}'.".format(data_generator_mode))
131 | 
132 |     # Set the generator parameters.
133 |     generator = data_generator.generate(batch_size=batch_size,
134 |                                         shuffle=False,
135 |                                         transformations=transformations,
136 |                                         label_encoder=None,
137 |                                         returns={'processed_images',
138 |                                                  'image_ids',
139 |                                                  'inverse_transform'},
140 |                                         keep_images_without_gt=True)
141 |     # Put the results in this list.
142 |     results = []
143 |     # Compute the number of batches to iterate over the entire dataset.
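    # (Using `ceil` ensures that a final batch smaller than `batch_size` is still processed:
    # e.g. a hypothetical 4952 images at `batch_size == 32` yield ceil(4952/32) = 155 batches.)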
144 |     n_images = data_generator.get_dataset_size()
145 |     print("Number of images in the evaluation dataset: {}".format(n_images))
146 |     n_batches = int(ceil(n_images / batch_size))
147 |     # Loop over all batches.
148 |     tr = trange(n_batches, file=sys.stdout)
149 |     tr.set_description('Producing results file')
150 |     for i in tr:
151 |         # Generate batch.
152 |         batch_X, batch_image_ids, batch_inverse_transforms = next(generator)
153 |         # Predict.
154 |         y_pred = model.predict(batch_X)
155 |         # If the model was created in 'training' mode, the raw predictions need to
156 |         # be decoded and filtered, otherwise that's already taken care of.
157 |         if model_mode == 'training':
158 |             # Decode.
159 |             y_pred = decode_detections(y_pred,
160 |                                        confidence_thresh=confidence_thresh,
161 |                                        iou_threshold=iou_threshold,
162 |                                        top_k=top_k,
163 |                                        input_coords=pred_coords,
164 |                                        normalize_coords=normalize_coords,
165 |                                        img_height=img_height,
166 |                                        img_width=img_width)
167 |         else:
168 |             # Filter out the all-zeros dummy elements of `y_pred`.
169 |             y_pred_filtered = []
170 |             for j in range(len(y_pred)): # `j`, so as not to shadow the batch index `i`.
171 |                 y_pred_filtered.append(y_pred[j][y_pred[j,:,0] != 0])
172 |             y_pred = y_pred_filtered
173 |         # Convert the predicted box coordinates for the original images.
174 |         y_pred = apply_inverse_transforms(y_pred, batch_inverse_transforms)
175 | 
176 |         # Convert each predicted box into the results format.
177 |         for k, batch_item in enumerate(y_pred):
178 |             for box in batch_item:
179 |                 class_id = box[0]
180 |                 # Transform the consecutive class IDs back to the original COCO category IDs.
181 |                 cat_id = classes_to_cats[class_id]
182 |                 # Round the box coordinates to reduce the JSON file size.
183 |                 xmin = float(round(box[2], 1))
184 |                 ymin = float(round(box[3], 1))
185 |                 xmax = float(round(box[4], 1))
186 |                 ymax = float(round(box[5], 1))
187 |                 width = xmax - xmin
188 |                 height = ymax - ymin
189 |                 bbox = [xmin, ymin, width, height]
190 |                 result = {}
191 |                 result['image_id'] = batch_image_ids[k]
192 |                 result['category_id'] = cat_id
193 |                 result['score'] = float(round(box[1], 3))
194 |                 result['bbox'] = bbox
195 |                 results.append(result)
196 | 
197 |     with open(out_file, 'w') as f:
198 |         json.dump(results, f)
199 | 
200 |     print("Prediction results saved in '{}'".format(out_file))
201 | 
--------------------------------------------------------------------------------
/examples/fish-bike.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pierluigiferrari/ssd_keras/3ac9adaf3889f1020d74b0eeefea281d5e82f353/examples/fish-bike.jpg
--------------------------------------------------------------------------------
/examples/fish_bike.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pierluigiferrari/ssd_keras/3ac9adaf3889f1020d74b0eeefea281d5e82f353/examples/fish_bike.jpg
--------------------------------------------------------------------------------
/examples/ssd300_pascalVOC_pred_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pierluigiferrari/ssd_keras/3ac9adaf3889f1020d74b0eeefea281d5e82f353/examples/ssd300_pascalVOC_pred_01.png
--------------------------------------------------------------------------------
/examples/ssd300_pascalVOC_pred_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pierluigiferrari/ssd_keras/3ac9adaf3889f1020d74b0eeefea281d5e82f353/examples/ssd300_pascalVOC_pred_02.png -------------------------------------------------------------------------------- /examples/ssd300_pascalVOC_pred_03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pierluigiferrari/ssd_keras/3ac9adaf3889f1020d74b0eeefea281d5e82f353/examples/ssd300_pascalVOC_pred_03.png -------------------------------------------------------------------------------- /examples/ssd300_pascalVOC_pred_04.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pierluigiferrari/ssd_keras/3ac9adaf3889f1020d74b0eeefea281d5e82f353/examples/ssd300_pascalVOC_pred_04.png -------------------------------------------------------------------------------- /examples/ssd300_pascalVOC_pred_05.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pierluigiferrari/ssd_keras/3ac9adaf3889f1020d74b0eeefea281d5e82f353/examples/ssd300_pascalVOC_pred_05.png -------------------------------------------------------------------------------- /examples/ssd300_pascalVOC_pred_06.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pierluigiferrari/ssd_keras/3ac9adaf3889f1020d74b0eeefea281d5e82f353/examples/ssd300_pascalVOC_pred_06.png -------------------------------------------------------------------------------- /examples/ssd300_pascalVOC_pred_07.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pierluigiferrari/ssd_keras/3ac9adaf3889f1020d74b0eeefea281d5e82f353/examples/ssd300_pascalVOC_pred_07.png -------------------------------------------------------------------------------- /examples/ssd300_pascalVOC_pred_08.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pierluigiferrari/ssd_keras/3ac9adaf3889f1020d74b0eeefea281d5e82f353/examples/ssd300_pascalVOC_pred_08.png -------------------------------------------------------------------------------- /examples/ssd300_pascalVOC_pred_09.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pierluigiferrari/ssd_keras/3ac9adaf3889f1020d74b0eeefea281d5e82f353/examples/ssd300_pascalVOC_pred_09.png -------------------------------------------------------------------------------- /examples/ssd7_udacity_traffic_pred_01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pierluigiferrari/ssd_keras/3ac9adaf3889f1020d74b0eeefea281d5e82f353/examples/ssd7_udacity_traffic_pred_01.png -------------------------------------------------------------------------------- /examples/ssd7_udacity_traffic_pred_02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pierluigiferrari/ssd_keras/3ac9adaf3889f1020d74b0eeefea281d5e82f353/examples/ssd7_udacity_traffic_pred_02.png -------------------------------------------------------------------------------- /examples/ssd7_udacity_traffic_pred_03.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/pierluigiferrari/ssd_keras/3ac9adaf3889f1020d74b0eeefea281d5e82f353/examples/ssd7_udacity_traffic_pred_03.png -------------------------------------------------------------------------------- /examples/ssd7_udacity_traffic_pred_04.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pierluigiferrari/ssd_keras/3ac9adaf3889f1020d74b0eeefea281d5e82f353/examples/ssd7_udacity_traffic_pred_04.png -------------------------------------------------------------------------------- /examples/ssd7_udacity_traffic_pred_05.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pierluigiferrari/ssd_keras/3ac9adaf3889f1020d74b0eeefea281d5e82f353/examples/ssd7_udacity_traffic_pred_05.png -------------------------------------------------------------------------------- /examples/trained_ssd300_pascalVOC2007_test_pred_01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pierluigiferrari/ssd_keras/3ac9adaf3889f1020d74b0eeefea281d5e82f353/examples/trained_ssd300_pascalVOC2007_test_pred_01.png -------------------------------------------------------------------------------- /examples/trained_ssd300_pascalVOC2007_test_pred_01_no_gt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pierluigiferrari/ssd_keras/3ac9adaf3889f1020d74b0eeefea281d5e82f353/examples/trained_ssd300_pascalVOC2007_test_pred_01_no_gt.png -------------------------------------------------------------------------------- /examples/trained_ssd300_pascalVOC2007_test_pred_02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pierluigiferrari/ssd_keras/3ac9adaf3889f1020d74b0eeefea281d5e82f353/examples/trained_ssd300_pascalVOC2007_test_pred_02.png -------------------------------------------------------------------------------- /examples/trained_ssd300_pascalVOC2007_test_pred_02_no_gt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pierluigiferrari/ssd_keras/3ac9adaf3889f1020d74b0eeefea281d5e82f353/examples/trained_ssd300_pascalVOC2007_test_pred_02_no_gt.png -------------------------------------------------------------------------------- /examples/trained_ssd300_pascalVOC2007_test_pred_03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pierluigiferrari/ssd_keras/3ac9adaf3889f1020d74b0eeefea281d5e82f353/examples/trained_ssd300_pascalVOC2007_test_pred_03.png -------------------------------------------------------------------------------- /examples/trained_ssd300_pascalVOC2007_test_pred_03_no_gt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pierluigiferrari/ssd_keras/3ac9adaf3889f1020d74b0eeefea281d5e82f353/examples/trained_ssd300_pascalVOC2007_test_pred_03_no_gt.png -------------------------------------------------------------------------------- /examples/trained_ssd300_pascalVOC2007_test_pred_04.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pierluigiferrari/ssd_keras/3ac9adaf3889f1020d74b0eeefea281d5e82f353/examples/trained_ssd300_pascalVOC2007_test_pred_04.png 
-------------------------------------------------------------------------------- /examples/trained_ssd300_pascalVOC2007_test_pred_04_no_gt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pierluigiferrari/ssd_keras/3ac9adaf3889f1020d74b0eeefea281d5e82f353/examples/trained_ssd300_pascalVOC2007_test_pred_04_no_gt.png -------------------------------------------------------------------------------- /examples/trained_ssd300_pascalVOC2007_test_pred_05.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pierluigiferrari/ssd_keras/3ac9adaf3889f1020d74b0eeefea281d5e82f353/examples/trained_ssd300_pascalVOC2007_test_pred_05.png -------------------------------------------------------------------------------- /examples/trained_ssd300_pascalVOC2007_test_pred_05_no_gt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pierluigiferrari/ssd_keras/3ac9adaf3889f1020d74b0eeefea281d5e82f353/examples/trained_ssd300_pascalVOC2007_test_pred_05_no_gt.png -------------------------------------------------------------------------------- /examples/trained_ssd300_pascalVOC2007_test_pred_06.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pierluigiferrari/ssd_keras/3ac9adaf3889f1020d74b0eeefea281d5e82f353/examples/trained_ssd300_pascalVOC2007_test_pred_06.png -------------------------------------------------------------------------------- /keras_layers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pierluigiferrari/ssd_keras/3ac9adaf3889f1020d74b0eeefea281d5e82f353/keras_layers/__init__.py -------------------------------------------------------------------------------- /keras_layers/keras_layer_AnchorBoxes.py: -------------------------------------------------------------------------------- 1 | ''' 2 | A custom Keras layer to generate anchor boxes. 3 | 4 | Copyright (C) 2018 Pierluigi Ferrari 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | ''' 18 | 19 | from __future__ import division 20 | import numpy as np 21 | import keras.backend as K 22 | from keras.engine.topology import InputSpec 23 | from keras.engine.topology import Layer 24 | 25 | from bounding_box_utils.bounding_box_utils import convert_coordinates 26 | 27 | class AnchorBoxes(Layer): 28 | ''' 29 | A Keras layer to create an output tensor containing anchor box coordinates 30 | and variances based on the input tensor and the passed arguments. 31 | 32 | A set of 2D anchor boxes of different aspect ratios is created for each spatial unit of 33 | the input tensor. The number of anchor boxes created per unit depends on the arguments 34 | `aspect_ratios` and `two_boxes_for_ar1`, in the default case it is 4. The boxes 35 | are parameterized by the coordinate tuple `(xmin, xmax, ymin, ymax)`. 
36 | 
37 |     The logic implemented by this layer is identical to the logic in the module
38 |     `ssd_box_encode_decode_utils.py`.
39 | 
40 |     The purpose of having this layer in the network is to make the model self-sufficient
41 |     at inference time. Since the model is predicting offsets to the anchor boxes
42 |     (rather than predicting absolute box coordinates directly), one needs to know the anchor
43 |     box coordinates in order to construct the final prediction boxes from the predicted offsets.
44 |     If the model's output tensor did not contain the anchor box coordinates, the necessary
45 |     information to convert the predicted offsets back to absolute coordinates would be missing
46 |     in the model output. The reason why it is necessary to predict offsets to the anchor boxes
47 |     rather than to predict absolute box coordinates directly is explained in `README.md`.
48 | 
49 |     Input shape:
50 |         4D tensor of shape `(batch, channels, height, width)` if `dim_ordering = 'th'`
51 |         or `(batch, height, width, channels)` if `dim_ordering = 'tf'`.
52 | 
53 |     Output shape:
54 |         5D tensor of shape `(batch, height, width, n_boxes, 8)`. The last axis contains
55 |         the four anchor box coordinates and the four variance values for each box.
56 |     '''
57 | 
58 |     def __init__(self,
59 |                  img_height,
60 |                  img_width,
61 |                  this_scale,
62 |                  next_scale,
63 |                  aspect_ratios=[0.5, 1.0, 2.0],
64 |                  two_boxes_for_ar1=True,
65 |                  this_steps=None,
66 |                  this_offsets=None,
67 |                  clip_boxes=False,
68 |                  variances=[0.1, 0.1, 0.2, 0.2],
69 |                  coords='centroids',
70 |                  normalize_coords=False,
71 |                  **kwargs):
72 |         '''
73 |         All arguments need to be set to the same values as in the box encoding process, otherwise the behavior is undefined.
74 |         Some of these arguments are explained in more detail in the documentation of the `SSDBoxEncoder` class.
75 | 
76 |         Arguments:
77 |             img_height (int): The height of the input images.
78 |             img_width (int): The width of the input images.
79 |             this_scale (float): A float in [0, 1], the scaling factor for the size of the generated anchor boxes
80 |                 as a fraction of the shorter side of the input image.
81 |             next_scale (float): A float in [0, 1], the next larger scaling factor. Only relevant if
82 |                 `self.two_boxes_for_ar1 == True`.
83 |             aspect_ratios (list, optional): The list of aspect ratios for which default boxes are to be
84 |                 generated for this layer.
85 |             two_boxes_for_ar1 (bool, optional): Only relevant if `aspect_ratios` contains 1.
86 |                 If `True`, two default boxes will be generated for aspect ratio 1. The first will be generated
87 |                 using the scaling factor for the respective layer, the second one will be generated using the
88 |                 geometric mean of said scaling factor and the next bigger scaling factor.
89 |             clip_boxes (bool, optional): If `True`, clips the anchor box coordinates to stay within image boundaries.
90 |             variances (list, optional): A list of 4 floats >0. The anchor box offset for each coordinate will be divided by
91 |                 its respective variance value.
92 |             coords (str, optional): The box coordinate format to be used internally in the model (i.e. this is not the input format
93 |                 of the ground truth labels). Can be either 'centroids' for the format `(cx, cy, w, h)` (box center coordinates, width, and height),
94 |                 'corners' for the format `(xmin, ymin, xmax, ymax)`, or 'minmax' for the format `(xmin, xmax, ymin, ymax)`.
95 |             normalize_coords (bool, optional): Set to `True` if the model uses relative instead of absolute coordinates,
96 |                 i.e.
if the model predicts box coordinates within [0,1] instead of absolute coordinates. 97 | ''' 98 | if K.backend() != 'tensorflow': 99 | raise TypeError("This layer only supports TensorFlow at the moment, but you are using the {} backend.".format(K.backend())) 100 | 101 | if (this_scale < 0) or (next_scale < 0) or (this_scale > 1): 102 | raise ValueError("`this_scale` must be in [0, 1] and `next_scale` must be >0, but `this_scale` == {}, `next_scale` == {}".format(this_scale, next_scale)) 103 | 104 | if len(variances) != 4: 105 | raise ValueError("4 variance values must be passed, but {} values were received.".format(len(variances))) 106 | variances = np.array(variances) 107 | if np.any(variances <= 0): 108 | raise ValueError("All variances must be >0, but the variances given are {}".format(variances)) 109 | 110 | self.img_height = img_height 111 | self.img_width = img_width 112 | self.this_scale = this_scale 113 | self.next_scale = next_scale 114 | self.aspect_ratios = aspect_ratios 115 | self.two_boxes_for_ar1 = two_boxes_for_ar1 116 | self.this_steps = this_steps 117 | self.this_offsets = this_offsets 118 | self.clip_boxes = clip_boxes 119 | self.variances = variances 120 | self.coords = coords 121 | self.normalize_coords = normalize_coords 122 | # Compute the number of boxes per cell 123 | if (1 in aspect_ratios) and two_boxes_for_ar1: 124 | self.n_boxes = len(aspect_ratios) + 1 125 | else: 126 | self.n_boxes = len(aspect_ratios) 127 | super(AnchorBoxes, self).__init__(**kwargs) 128 | 129 | def build(self, input_shape): 130 | self.input_spec = [InputSpec(shape=input_shape)] 131 | super(AnchorBoxes, self).build(input_shape) 132 | 133 | def call(self, x, mask=None): 134 | ''' 135 | Return an anchor box tensor based on the shape of the input tensor. 136 | 137 | The logic implemented here is identical to the logic in the module `ssd_box_encode_decode_utils.py`. 138 | 139 | Note that this tensor does not participate in any graph computations at runtime. It is being created 140 | as a constant once during graph creation and is just being output along with the rest of the model output 141 | during runtime. Because of this, all logic is implemented as Numpy array operations and it is sufficient 142 | to convert the resulting Numpy array into a Keras tensor at the very end before outputting it. 143 | 144 | Arguments: 145 | x (tensor): 4D tensor of shape `(batch, channels, height, width)` if `dim_ordering = 'th'` 146 | or `(batch, height, width, channels)` if `dim_ordering = 'tf'`. The input for this 147 | layer must be the output of the localization predictor layer. 148 | ''' 149 | 150 | # Compute box width and height for each aspect ratio 151 | # The shorter side of the image will be used to compute `w` and `h` using `scale` and `aspect_ratios`. 152 | size = min(self.img_height, self.img_width) 153 | # Compute the box widths and heights for all aspect ratios 154 | wh_list = [] 155 | for ar in self.aspect_ratios: 156 | if (ar == 1): 157 | # Compute the regular anchor box for aspect ratio 1. 158 | box_height = box_width = self.this_scale * size 159 | wh_list.append((box_width, box_height)) 160 | if self.two_boxes_for_ar1: 161 | # Compute one slightly larger version using the geometric mean of this scale value and the next.
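# (Worked example with assumed values: for `this_scale=0.2`, `next_scale=0.37` and `size=300`, this extra box is sqrt(0.2 * 0.37) * 300 ≈ 81.6 pixels on each side, in between the 60-pixel box of this layer and the 111-pixel box of the next.)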
162 | box_height = box_width = np.sqrt(self.this_scale * self.next_scale) * size 163 | wh_list.append((box_width, box_height)) 164 | else: 165 | box_height = self.this_scale * size / np.sqrt(ar) 166 | box_width = self.this_scale * size * np.sqrt(ar) 167 | wh_list.append((box_width, box_height)) 168 | wh_list = np.array(wh_list) 169 | 170 | # We need the shape of the input tensor 171 | if K.image_dim_ordering() == 'tf': 172 | batch_size, feature_map_height, feature_map_width, feature_map_channels = x._keras_shape 173 | else: # Not yet relevant since TensorFlow is the only supported backend right now, but it can't harm to have this in here for the future 174 | batch_size, feature_map_channels, feature_map_height, feature_map_width = x._keras_shape 175 | 176 | # Compute the grid of box center points. They are identical for all aspect ratios. 177 | 178 | # Compute the step sizes, i.e. how far apart the anchor box center points will be vertically and horizontally. 179 | if (self.this_steps is None): 180 | step_height = self.img_height / feature_map_height 181 | step_width = self.img_width / feature_map_width 182 | else: 183 | if isinstance(self.this_steps, (list, tuple)) and (len(self.this_steps) == 2): 184 | step_height = self.this_steps[0] 185 | step_width = self.this_steps[1] 186 | elif isinstance(self.this_steps, (int, float)): 187 | step_height = self.this_steps 188 | step_width = self.this_steps 189 | # Compute the offsets, i.e. at what pixel values the first anchor box center point will be from the top and from the left of the image. 190 | if (self.this_offsets is None): 191 | offset_height = 0.5 192 | offset_width = 0.5 193 | else: 194 | if isinstance(self.this_offsets, (list, tuple)) and (len(self.this_offsets) == 2): 195 | offset_height = self.this_offsets[0] 196 | offset_width = self.this_offsets[1] 197 | elif isinstance(self.this_offsets, (int, float)): 198 | offset_height = self.this_offsets 199 | offset_width = self.this_offsets 200 | # Now that we have the offsets and step sizes, compute the grid of anchor box center points. 
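# (Illustration with assumed values: for `img_height=300` and `feature_map_height=10`, we get `step_height=30` and `offset_height=0.5`, so `cy` below becomes [15, 45, ..., 285], i.e. one center point in the middle of each 30-pixel cell.)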
201 | cy = np.linspace(offset_height * step_height, (offset_height + feature_map_height - 1) * step_height, feature_map_height) 202 | cx = np.linspace(offset_width * step_width, (offset_width + feature_map_width - 1) * step_width, feature_map_width) 203 | cx_grid, cy_grid = np.meshgrid(cx, cy) 204 | cx_grid = np.expand_dims(cx_grid, -1) # This is necessary for np.tile() to do what we want further down 205 | cy_grid = np.expand_dims(cy_grid, -1) # This is necessary for np.tile() to do what we want further down 206 | 207 | # Create a 4D tensor template of shape `(feature_map_height, feature_map_width, n_boxes, 4)` 208 | # where the last dimension will contain `(cx, cy, w, h)` 209 | boxes_tensor = np.zeros((feature_map_height, feature_map_width, self.n_boxes, 4)) 210 | 211 | boxes_tensor[:, :, :, 0] = np.tile(cx_grid, (1, 1, self.n_boxes)) # Set cx 212 | boxes_tensor[:, :, :, 1] = np.tile(cy_grid, (1, 1, self.n_boxes)) # Set cy 213 | boxes_tensor[:, :, :, 2] = wh_list[:, 0] # Set w 214 | boxes_tensor[:, :, :, 3] = wh_list[:, 1] # Set h 215 | 216 | # Convert `(cx, cy, w, h)` to `(xmin, ymin, xmax, ymax)` 217 | boxes_tensor = convert_coordinates(boxes_tensor, start_index=0, conversion='centroids2corners') 218 | 219 | # If `clip_boxes` is enabled, clip the coordinates to lie within the image boundaries 220 | if self.clip_boxes: 221 | x_coords = boxes_tensor[:,:,:,[0, 2]] 222 | x_coords[x_coords >= self.img_width] = self.img_width - 1 223 | x_coords[x_coords < 0] = 0 224 | boxes_tensor[:,:,:,[0, 2]] = x_coords 225 | y_coords = boxes_tensor[:,:,:,[1, 3]] 226 | y_coords[y_coords >= self.img_height] = self.img_height - 1 227 | y_coords[y_coords < 0] = 0 228 | boxes_tensor[:,:,:,[1, 3]] = y_coords 229 | 230 | # If `normalize_coords` is enabled, normalize the coordinates to be within [0,1] 231 | if self.normalize_coords: 232 | boxes_tensor[:, :, :, [0, 2]] /= self.img_width 233 | boxes_tensor[:, :, :, [1, 3]] /= self.img_height 234 | 235 | # TODO: Implement box limiting directly for `(cx, cy, w, h)` so that we don't have to unnecessarily convert back and forth. 236 | if self.coords == 'centroids': 237 | # Convert `(xmin, ymin, xmax, ymax)` back to `(cx, cy, w, h)`. 238 | boxes_tensor = convert_coordinates(boxes_tensor, start_index=0, conversion='corners2centroids', border_pixels='half') 239 | elif self.coords == 'minmax': 240 | # Convert `(xmin, ymin, xmax, ymax)` to `(xmin, xmax, ymin, ymax)`. 241 | boxes_tensor = convert_coordinates(boxes_tensor, start_index=0, conversion='corners2minmax', border_pixels='half') 242 | 243 | # Create a tensor to contain the variances and append it to `boxes_tensor`. This tensor has the same shape 244 | # as `boxes_tensor` and simply contains the same 4 variance values for every position in the last axis.
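# (E.g. with the default `variances=[0.1, 0.1, 0.2, 0.2]` and `coords='centroids'`, every box ends up as `[cx, cy, w, h, 0.1, 0.1, 0.2, 0.2]` along the last axis.)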
245 | variances_tensor = np.zeros_like(boxes_tensor) # Has shape `(feature_map_height, feature_map_width, n_boxes, 4)` 246 | variances_tensor += self.variances # Long live broadcasting 247 | # Now `boxes_tensor` becomes a tensor of shape `(feature_map_height, feature_map_width, n_boxes, 8)` 248 | boxes_tensor = np.concatenate((boxes_tensor, variances_tensor), axis=-1) 249 | 250 | # Now prepend one dimension to `boxes_tensor` to account for the batch size and tile it along the batch dimension. 251 | # The result will be a 5D tensor of shape `(batch_size, feature_map_height, feature_map_width, n_boxes, 8)` 252 | boxes_tensor = np.expand_dims(boxes_tensor, axis=0) 253 | boxes_tensor = K.tile(K.constant(boxes_tensor, dtype='float32'), (K.shape(x)[0], 1, 1, 1, 1)) 254 | 255 | return boxes_tensor 256 | 257 | def compute_output_shape(self, input_shape): 258 | if K.image_dim_ordering() == 'tf': 259 | batch_size, feature_map_height, feature_map_width, feature_map_channels = input_shape 260 | else: # Not yet relevant since TensorFlow is the only supported backend right now, but it can't harm to have this in here for the future 261 | batch_size, feature_map_channels, feature_map_height, feature_map_width = input_shape 262 | return (batch_size, feature_map_height, feature_map_width, self.n_boxes, 8) 263 | 264 | def get_config(self): 265 | config = { 266 | 'img_height': self.img_height, 267 | 'img_width': self.img_width, 268 | 'this_scale': self.this_scale, 269 | 'next_scale': self.next_scale, 270 | 'aspect_ratios': list(self.aspect_ratios), 271 | 'two_boxes_for_ar1': self.two_boxes_for_ar1, 272 | 'this_steps': self.this_steps, # Needed so that models built with custom step sizes can be reloaded from their config. 273 | 'this_offsets': self.this_offsets, # Likewise for custom offsets. 274 | 'clip_boxes': self.clip_boxes, 275 | 'variances': list(self.variances), 276 | 'coords': self.coords, 277 | 'normalize_coords': self.normalize_coords 278 | } 279 | base_config = super(AnchorBoxes, self).get_config() 280 | return dict(list(base_config.items()) + list(config.items())) 281 | -------------------------------------------------------------------------------- /keras_layers/keras_layer_DecodeDetections.py: -------------------------------------------------------------------------------- 1 | ''' 2 | A custom Keras layer to decode the raw SSD prediction output. Corresponds to the 3 | `DetectionOutput` layer type in the original Caffe implementation of SSD. 4 | 5 | Copyright (C) 2018 Pierluigi Ferrari 6 | 7 | Licensed under the Apache License, Version 2.0 (the "License"); 8 | you may not use this file except in compliance with the License. 9 | You may obtain a copy of the License at 10 | 11 | http://www.apache.org/licenses/LICENSE-2.0 12 | 13 | Unless required by applicable law or agreed to in writing, software 14 | distributed under the License is distributed on an "AS IS" BASIS, 15 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | See the License for the specific language governing permissions and 17 | limitations under the License. 18 | ''' 19 | 20 | from __future__ import division 21 | import numpy as np 22 | import tensorflow as tf 23 | import keras.backend as K 24 | from keras.engine.topology import InputSpec 25 | from keras.engine.topology import Layer 26 | 27 | class DecodeDetections(Layer): 28 | ''' 29 | A Keras layer to decode the raw SSD prediction output. 30 | 31 | Input shape: 32 | 3D tensor of shape `(batch_size, n_boxes, n_classes + 12)`. 33 | 34 | Output shape: 35 | 3D tensor of shape `(batch_size, top_k, 6)`.
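The last axis of the output holds `[class_id, confidence, xmin, ymin, xmax, ymax]` for each kept box; see `call()` below for details.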
36 | ''' 37 | 38 | def __init__(self, 39 | confidence_thresh=0.01, 40 | iou_threshold=0.45, 41 | top_k=200, 42 | nms_max_output_size=400, 43 | coords='centroids', 44 | normalize_coords=True, 45 | img_height=None, 46 | img_width=None, 47 | **kwargs): 48 | ''' 49 | All default argument values follow the Caffe implementation. 50 | 51 | Arguments: 52 | confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence in a specific 53 | positive class in order to be considered for the non-maximum suppression stage for the respective class. 54 | A lower value will result in a larger part of the selection process being done by the non-maximum suppression 55 | stage, while a larger value will result in a larger part of the selection process happening in the confidence 56 | thresholding stage. 57 | iou_threshold (float, optional): A float in [0,1]. All boxes with a Jaccard similarity of greater than `iou_threshold` 58 | with a locally maximal box will be removed from the set of predictions for a given class, where 'maximal' refers 59 | to the box score. 60 | top_k (int, optional): The number of highest scoring predictions to be kept for each batch item after the 61 | non-maximum suppression stage. 62 | nms_max_output_size (int, optional): The maximum number of predictions that will be left after performing non-maximum 63 | suppression. 64 | coords (str, optional): The box coordinate format that the model outputs. Must be 'centroids' 65 | i.e. the format `(cx, cy, w, h)` (box center coordinates, width, and height). Other coordinate formats are 66 | currently not supported. 67 | normalize_coords (bool, optional): Set to `True` if the model outputs relative coordinates (i.e. coordinates in [0,1]) 68 | and you wish to transform these relative coordinates back to absolute coordinates. If the model outputs 69 | relative coordinates, but you do not want to convert them back to absolute coordinates, set this to `False`. 70 | Do not set this to `True` if the model already outputs absolute coordinates, as that would result in incorrect 71 | coordinates. Requires `img_height` and `img_width` if set to `True`. 72 | img_height (int, optional): The height of the input images. Only needed if `normalize_coords` is `True`. 73 | img_width (int, optional): The width of the input images. Only needed if `normalize_coords` is `True`. 74 | ''' 75 | if K.backend() != 'tensorflow': 76 | raise TypeError("This layer only supports TensorFlow at the moment, but you are using the {} backend.".format(K.backend())) 77 | 78 | if normalize_coords and ((img_height is None) or (img_width is None)): 79 | raise ValueError("If relative box coordinates are supposed to be converted to absolute coordinates, the decoder needs the image size in order to decode the predictions, but `img_height == {}` and `img_width == {}`".format(img_height, img_width)) 80 | 81 | if coords != 'centroids': 82 | raise ValueError("The DetectionOutput layer currently only supports the 'centroids' coordinate format.") 83 | 84 | # We need these members for the config. 85 | self.confidence_thresh = confidence_thresh 86 | self.iou_threshold = iou_threshold 87 | self.top_k = top_k 88 | self.normalize_coords = normalize_coords 89 | self.img_height = img_height 90 | self.img_width = img_width 91 | self.coords = coords 92 | self.nms_max_output_size = nms_max_output_size 93 | 94 | # We need these members for TensorFlow. 
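# (These are stored as TensorFlow constants so that they can be consumed by the graph ops in `call()` below, e.g. `self.tf_normalize_coords` serves as the predicate of a `tf.cond()`.)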
95 | self.tf_confidence_thresh = tf.constant(self.confidence_thresh, name='confidence_thresh') 96 | self.tf_iou_threshold = tf.constant(self.iou_threshold, name='iou_threshold') 97 | self.tf_top_k = tf.constant(self.top_k, name='top_k') 98 | self.tf_normalize_coords = tf.constant(self.normalize_coords, name='normalize_coords') 99 | self.tf_img_height = tf.constant(self.img_height, dtype=tf.float32, name='img_height') 100 | self.tf_img_width = tf.constant(self.img_width, dtype=tf.float32, name='img_width') 101 | self.tf_nms_max_output_size = tf.constant(self.nms_max_output_size, name='nms_max_output_size') 102 | 103 | super(DecodeDetections, self).__init__(**kwargs) 104 | 105 | def build(self, input_shape): 106 | self.input_spec = [InputSpec(shape=input_shape)] 107 | super(DecodeDetections, self).build(input_shape) 108 | 109 | def call(self, y_pred, mask=None): 110 | ''' 111 | Returns: 112 | 3D tensor of shape `(batch_size, top_k, 6)`. The second axis is zero-padded 113 | to always yield `top_k` predictions per batch item. The last axis contains 114 | the coordinates for each predicted box in the format 115 | `[class_id, confidence, xmin, ymin, xmax, ymax]`. 116 | ''' 117 | 118 | ##################################################################################### 119 | # 1. Convert the box coordinates from predicted anchor box offsets to predicted 120 | # absolute coordinates 121 | ##################################################################################### 122 | 123 | # Convert anchor box offsets to image offsets. 124 | cx = y_pred[...,-12] * y_pred[...,-4] * y_pred[...,-6] + y_pred[...,-8] # cx = cx_pred * cx_variance * w_anchor + cx_anchor 125 | cy = y_pred[...,-11] * y_pred[...,-3] * y_pred[...,-5] + y_pred[...,-7] # cy = cy_pred * cy_variance * h_anchor + cy_anchor 126 | w = tf.exp(y_pred[...,-10] * y_pred[...,-2]) * y_pred[...,-6] # w = exp(w_pred * variance_w) * w_anchor 127 | h = tf.exp(y_pred[...,-9] * y_pred[...,-1]) * y_pred[...,-5] # h = exp(h_pred * variance_h) * h_anchor 128 | 129 | # Convert 'centroids' to 'corners'. 130 | xmin = cx - 0.5 * w 131 | ymin = cy - 0.5 * h 132 | xmax = cx + 0.5 * w 133 | ymax = cy + 0.5 * h 134 | 135 | # If the model predicts box coordinates relative to the image dimensions and they are supposed 136 | # to be converted back to absolute coordinates, do that. 137 | def normalized_coords(): 138 | xmin1 = tf.expand_dims(xmin * self.tf_img_width, axis=-1) 139 | ymin1 = tf.expand_dims(ymin * self.tf_img_height, axis=-1) 140 | xmax1 = tf.expand_dims(xmax * self.tf_img_width, axis=-1) 141 | ymax1 = tf.expand_dims(ymax * self.tf_img_height, axis=-1) 142 | return xmin1, ymin1, xmax1, ymax1 143 | def non_normalized_coords(): 144 | return tf.expand_dims(xmin, axis=-1), tf.expand_dims(ymin, axis=-1), tf.expand_dims(xmax, axis=-1), tf.expand_dims(ymax, axis=-1) 145 | 146 | xmin, ymin, xmax, ymax = tf.cond(self.tf_normalize_coords, normalized_coords, non_normalized_coords) 147 | 148 | # Concatenate the one-hot class confidences and the converted box coordinates to form the decoded predictions tensor. 149 | y_pred = tf.concat(values=[y_pred[...,:-12], xmin, ymin, xmax, ymax], axis=-1) 150 | 151 | ##################################################################################### 152 | # 2. Perform confidence thresholding, per-class non-maximum suppression, and 153 | # top-k filtering. 
154 | ##################################################################################### 155 | 156 | batch_size = tf.shape(y_pred)[0] # Output dtype: tf.int32 157 | n_boxes = tf.shape(y_pred)[1] 158 | n_classes = y_pred.shape[2] - 4 159 | class_indices = tf.range(1, n_classes) 160 | 161 | # Create a function that filters the predictions for the given batch item. Specifically, it performs: 162 | # - confidence thresholding 163 | # - non-maximum suppression (NMS) 164 | # - top-k filtering 165 | def filter_predictions(batch_item): 166 | 167 | # Create a function that filters the predictions for one single class. 168 | def filter_single_class(index): 169 | 170 | # From a tensor of shape (n_boxes, n_classes + 4 coordinates) extract 171 | # a tensor of shape (n_boxes, 1 + 4 coordinates) that contains the 172 | # confidence values for just one class, determined by `index`. 173 | confidences = tf.expand_dims(batch_item[..., index], axis=-1) 174 | class_id = tf.fill(dims=tf.shape(confidences), value=tf.to_float(index)) 175 | box_coordinates = batch_item[...,-4:] 176 | 177 | single_class = tf.concat([class_id, confidences, box_coordinates], axis=-1) 178 | 179 | # Apply confidence thresholding with respect to the class defined by `index`. 180 | threshold_met = single_class[:,1] > self.tf_confidence_thresh 181 | single_class = tf.boolean_mask(tensor=single_class, 182 | mask=threshold_met) 183 | 184 | # If any boxes made the threshold, perform NMS. 185 | def perform_nms(): 186 | scores = single_class[...,1] 187 | 188 | # `tf.image.non_max_suppression()` needs the box coordinates in the format `(ymin, xmin, ymax, xmax)`. 189 | xmin = tf.expand_dims(single_class[...,-4], axis=-1) 190 | ymin = tf.expand_dims(single_class[...,-3], axis=-1) 191 | xmax = tf.expand_dims(single_class[...,-2], axis=-1) 192 | ymax = tf.expand_dims(single_class[...,-1], axis=-1) 193 | boxes = tf.concat(values=[ymin, xmin, ymax, xmax], axis=-1) 194 | 195 | maxima_indices = tf.image.non_max_suppression(boxes=boxes, 196 | scores=scores, 197 | max_output_size=self.tf_nms_max_output_size, 198 | iou_threshold=self.iou_threshold, 199 | name='non_maximum_suppresion') 200 | maxima = tf.gather(params=single_class, 201 | indices=maxima_indices, 202 | axis=0) 203 | return maxima 204 | 205 | def no_confident_predictions(): 206 | return tf.constant(value=0.0, shape=(1,6)) 207 | 208 | single_class_nms = tf.cond(tf.equal(tf.size(single_class), 0), no_confident_predictions, perform_nms) 209 | 210 | # Make sure `single_class` is exactly `self.nms_max_output_size` elements long. 211 | padded_single_class = tf.pad(tensor=single_class_nms, 212 | paddings=[[0, self.tf_nms_max_output_size - tf.shape(single_class_nms)[0]], [0, 0]], 213 | mode='CONSTANT', 214 | constant_values=0.0) 215 | 216 | return padded_single_class 217 | 218 | # Iterate `filter_single_class()` over all class indices. 219 | filtered_single_classes = tf.map_fn(fn=lambda i: filter_single_class(i), 220 | elems=tf.range(1,n_classes), 221 | dtype=tf.float32, 222 | parallel_iterations=128, 223 | back_prop=False, 224 | swap_memory=False, 225 | infer_shape=True, 226 | name='loop_over_classes') 227 | 228 | # Concatenate the filtered results for all individual classes to one tensor. 229 | filtered_predictions = tf.reshape(tensor=filtered_single_classes, shape=(-1,6)) 230 | 231 | # Perform top-k filtering for this batch item or pad it in case there are 232 | # fewer than `self.top_k` boxes left at this point. Either way, produce a 233 | # tensor of length `self.top_k`.
By the time we return the final results tensor 234 | # for the whole batch, all batch items must have the same number of predicted 235 | # boxes so that the tensor dimensions are homogenous. If fewer than `self.top_k` 236 | # predictions are left after the filtering process above, we pad the missing 237 | # predictions with zeros as dummy entries. 238 | def top_k(): 239 | return tf.gather(params=filtered_predictions, 240 | indices=tf.nn.top_k(filtered_predictions[:, 1], k=self.tf_top_k, sorted=True).indices, 241 | axis=0) 242 | def pad_and_top_k(): 243 | padded_predictions = tf.pad(tensor=filtered_predictions, 244 | paddings=[[0, self.tf_top_k - tf.shape(filtered_predictions)[0]], [0, 0]], 245 | mode='CONSTANT', 246 | constant_values=0.0) 247 | return tf.gather(params=padded_predictions, 248 | indices=tf.nn.top_k(padded_predictions[:, 1], k=self.tf_top_k, sorted=True).indices, 249 | axis=0) 250 | 251 | top_k_boxes = tf.cond(tf.greater_equal(tf.shape(filtered_predictions)[0], self.tf_top_k), top_k, pad_and_top_k) 252 | 253 | return top_k_boxes 254 | 255 | # Iterate `filter_predictions()` over all batch items. 256 | output_tensor = tf.map_fn(fn=lambda x: filter_predictions(x), 257 | elems=y_pred, 258 | dtype=None, 259 | parallel_iterations=128, 260 | back_prop=False, 261 | swap_memory=False, 262 | infer_shape=True, 263 | name='loop_over_batch') 264 | 265 | return output_tensor 266 | 267 | def compute_output_shape(self, input_shape): 268 | batch_size, n_boxes, last_axis = input_shape 269 | return (batch_size, self.tf_top_k, 6) # Last axis: (class_ID, confidence, 4 box coordinates) 270 | 271 | def get_config(self): 272 | config = { 273 | 'confidence_thresh': self.confidence_thresh, 274 | 'iou_threshold': self.iou_threshold, 275 | 'top_k': self.top_k, 276 | 'nms_max_output_size': self.nms_max_output_size, 277 | 'coords': self.coords, 278 | 'normalize_coords': self.normalize_coords, 279 | 'img_height': self.img_height, 280 | 'img_width': self.img_width, 281 | } 282 | base_config = super(DecodeDetections, self).get_config() 283 | return dict(list(base_config.items()) + list(config.items())) 284 | -------------------------------------------------------------------------------- /keras_layers/keras_layer_DecodeDetectionsFast.py: -------------------------------------------------------------------------------- 1 | ''' 2 | A custom Keras layer to decode the raw SSD prediction output. This is a modified 3 | and more efficient version of the `DetectionOutput` layer type in the original Caffe 4 | implementation of SSD. For a faithful replication of the original layer, please 5 | refer to the `DecodeDetections` layer. 6 | 7 | Copyright (C) 2018 Pierluigi Ferrari 8 | 9 | Licensed under the Apache License, Version 2.0 (the "License"); 10 | you may not use this file except in compliance with the License. 11 | You may obtain a copy of the License at 12 | 13 | http://www.apache.org/licenses/LICENSE-2.0 14 | 15 | Unless required by applicable law or agreed to in writing, software 16 | distributed under the License is distributed on an "AS IS" BASIS, 17 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | See the License for the specific language governing permissions and 19 | limitations under the License. 
20 | ''' 21 | 22 | from __future__ import division 23 | import numpy as np 24 | import tensorflow as tf 25 | import keras.backend as K 26 | from keras.engine.topology import InputSpec 27 | from keras.engine.topology import Layer 28 | 29 | class DecodeDetectionsFast(Layer): 30 | ''' 31 | A Keras layer to decode the raw SSD prediction output. 32 | 33 | Input shape: 34 | 3D tensor of shape `(batch_size, n_boxes, n_classes + 12)`. 35 | 36 | Output shape: 37 | 3D tensor of shape `(batch_size, top_k, 6)`. 38 | ''' 39 | 40 | def __init__(self, 41 | confidence_thresh=0.01, 42 | iou_threshold=0.45, 43 | top_k=200, 44 | nms_max_output_size=400, 45 | coords='centroids', 46 | normalize_coords=True, 47 | img_height=None, 48 | img_width=None, 49 | **kwargs): 50 | ''' 51 | All default argument values follow the Caffe implementation. 52 | 53 | Arguments: 54 | confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence in a specific 55 | positive class in order to be considered for the non-maximum suppression stage for the respective class. 56 | A lower value will result in a larger part of the selection process being done by the non-maximum suppression 57 | stage, while a larger value will result in a larger part of the selection process happening in the confidence 58 | thresholding stage. 59 | iou_threshold (float, optional): A float in [0,1]. All boxes with a Jaccard similarity of greater than `iou_threshold` 60 | with a locally maximal box will be removed from the set of predictions for a given class, where 'maximal' refers 61 | to the box score. 62 | top_k (int, optional): The number of highest scoring predictions to be kept for each batch item after the 63 | non-maximum suppression stage. 64 | nms_max_output_size (int, optional): The maximum number of predictions that will be left after performing non-maximum 65 | suppression. 66 | coords (str, optional): The box coordinate format that the model outputs. Must be 'centroids' 67 | i.e. the format `(cx, cy, w, h)` (box center coordinates, width, and height). Other coordinate formats are 68 | currently not supported. 69 | normalize_coords (bool, optional): Set to `True` if the model outputs relative coordinates (i.e. coordinates in [0,1]) 70 | and you wish to transform these relative coordinates back to absolute coordinates. If the model outputs 71 | relative coordinates, but you do not want to convert them back to absolute coordinates, set this to `False`. 72 | Do not set this to `True` if the model already outputs absolute coordinates, as that would result in incorrect 73 | coordinates. Requires `img_height` and `img_width` if set to `True`. 74 | img_height (int, optional): The height of the input images. Only needed if `normalize_coords` is `True`. 75 | img_width (int, optional): The width of the input images. Only needed if `normalize_coords` is `True`. 
76 | ''' 77 | if K.backend() != 'tensorflow': 78 | raise TypeError("This layer only supports TensorFlow at the moment, but you are using the {} backend.".format(K.backend())) 79 | 80 | if normalize_coords and ((img_height is None) or (img_width is None)): 81 | raise ValueError("If relative box coordinates are supposed to be converted to absolute coordinates, the decoder needs the image size in order to decode the predictions, but `img_height == {}` and `img_width == {}`".format(img_height, img_width)) 82 | 83 | if coords != 'centroids': 84 | raise ValueError("The DetectionOutput layer currently only supports the 'centroids' coordinate format.") 85 | 86 | # We need these members for the config. 87 | self.confidence_thresh = confidence_thresh 88 | self.iou_threshold = iou_threshold 89 | self.top_k = top_k 90 | self.normalize_coords = normalize_coords 91 | self.img_height = img_height 92 | self.img_width = img_width 93 | self.coords = coords 94 | self.nms_max_output_size = nms_max_output_size 95 | 96 | # We need these members for TensorFlow. 97 | self.tf_confidence_thresh = tf.constant(self.confidence_thresh, name='confidence_thresh') 98 | self.tf_iou_threshold = tf.constant(self.iou_threshold, name='iou_threshold') 99 | self.tf_top_k = tf.constant(self.top_k, name='top_k') 100 | self.tf_normalize_coords = tf.constant(self.normalize_coords, name='normalize_coords') 101 | self.tf_img_height = tf.constant(self.img_height, dtype=tf.float32, name='img_height') 102 | self.tf_img_width = tf.constant(self.img_width, dtype=tf.float32, name='img_width') 103 | self.tf_nms_max_output_size = tf.constant(self.nms_max_output_size, name='nms_max_output_size') 104 | 105 | super(DecodeDetectionsFast, self).__init__(**kwargs) 106 | 107 | def build(self, input_shape): 108 | self.input_spec = [InputSpec(shape=input_shape)] 109 | super(DecodeDetectionsFast, self).build(input_shape) 110 | 111 | def call(self, y_pred, mask=None): 112 | ''' 113 | Returns: 114 | 3D tensor of shape `(batch_size, top_k, 6)`. The second axis is zero-padded 115 | to always yield `top_k` predictions per batch item. The last axis contains 116 | the coordinates for each predicted box in the format 117 | `[class_id, confidence, xmin, ymin, xmax, ymax]`. 118 | ''' 119 | 120 | ##################################################################################### 121 | # 1. Convert the box coordinates from predicted anchor box offsets to predicted 122 | # absolute coordinates 123 | ##################################################################################### 124 | 125 | # Extract the predicted class IDs as the indices of the highest confidence values. 126 | class_ids = tf.expand_dims(tf.to_float(tf.argmax(y_pred[...,:-12], axis=-1)), axis=-1) 127 | # Extract the confidences of the maximal classes. 128 | confidences = tf.reduce_max(y_pred[...,:-12], axis=-1, keep_dims=True) 129 | 130 | # Convert anchor box offsets to image offsets. 131 | cx = y_pred[...,-12] * y_pred[...,-4] * y_pred[...,-6] + y_pred[...,-8] # cx = cx_pred * cx_variance * w_anchor + cx_anchor 132 | cy = y_pred[...,-11] * y_pred[...,-3] * y_pred[...,-5] + y_pred[...,-7] # cy = cy_pred * cy_variance * h_anchor + cy_anchor 133 | w = tf.exp(y_pred[...,-10] * y_pred[...,-2]) * y_pred[...,-6] # w = exp(w_pred * variance_w) * w_anchor 134 | h = tf.exp(y_pred[...,-9] * y_pred[...,-1]) * y_pred[...,-5] # h = exp(h_pred * variance_h) * h_anchor 135 | 136 | # Convert 'centroids' to 'corners'. 
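# (I.e. the box center and size become top-left and bottom-right corners. For example, assuming cx=100, cy=60, w=40, h=20, this yields (xmin, ymin, xmax, ymax) = (80, 50, 120, 70).)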
137 | xmin = cx - 0.5 * w 138 | ymin = cy - 0.5 * h 139 | xmax = cx + 0.5 * w 140 | ymax = cy + 0.5 * h 141 | 142 | # If the model predicts box coordinates relative to the image dimensions and they are supposed 143 | # to be converted back to absolute coordinates, do that. 144 | def normalized_coords(): 145 | xmin1 = tf.expand_dims(xmin * self.tf_img_width, axis=-1) 146 | ymin1 = tf.expand_dims(ymin * self.tf_img_height, axis=-1) 147 | xmax1 = tf.expand_dims(xmax * self.tf_img_width, axis=-1) 148 | ymax1 = tf.expand_dims(ymax * self.tf_img_height, axis=-1) 149 | return xmin1, ymin1, xmax1, ymax1 150 | def non_normalized_coords(): 151 | return tf.expand_dims(xmin, axis=-1), tf.expand_dims(ymin, axis=-1), tf.expand_dims(xmax, axis=-1), tf.expand_dims(ymax, axis=-1) 152 | 153 | xmin, ymin, xmax, ymax = tf.cond(self.tf_normalize_coords, normalized_coords, non_normalized_coords) 154 | 155 | # Concatenate the one-hot class confidences and the converted box coordinates to form the decoded predictions tensor. 156 | y_pred = tf.concat(values=[class_ids, confidences, xmin, ymin, xmax, ymax], axis=-1) 157 | 158 | ##################################################################################### 159 | # 2. Perform confidence thresholding, non-maximum suppression, and top-k filtering. 160 | ##################################################################################### 161 | 162 | batch_size = tf.shape(y_pred)[0] # Output dtype: tf.int32 163 | n_boxes = tf.shape(y_pred)[1] 164 | n_classes = y_pred.shape[2] - 4 165 | class_indices = tf.range(1, n_classes) 166 | 167 | # Create a function that filters the predictions for the given batch item. Specifically, it performs: 168 | # - confidence thresholding 169 | # - non-maximum suppression (NMS) 170 | # - top-k filtering 171 | def filter_predictions(batch_item): 172 | 173 | # Keep only the non-background boxes. 174 | positive_boxes = tf.not_equal(batch_item[...,0], 0.0) 175 | predictions = tf.boolean_mask(tensor=batch_item, 176 | mask=positive_boxes) 177 | 178 | def perform_confidence_thresholding(): 179 | # Apply confidence thresholding. 180 | threshold_met = predictions[:,1] > self.tf_confidence_thresh 181 | return tf.boolean_mask(tensor=predictions, 182 | mask=threshold_met) 183 | def no_positive_boxes(): 184 | return tf.constant(value=0.0, shape=(1,6)) 185 | 186 | # If there are any positive predictions, perform confidence thresholding. 187 | predictions_conf_thresh = tf.cond(tf.equal(tf.size(predictions), 0), no_positive_boxes, perform_confidence_thresholding) 188 | 189 | def perform_nms(): 190 | scores = predictions_conf_thresh[...,1] 191 | 192 | # `tf.image.non_max_suppression()` needs the box coordinates in the format `(ymin, xmin, ymax, xmax)`. 
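# (Hence the four coordinate columns are sliced out below and re-concatenated in that order; the class ID and confidence in columns 0 and 1 are not passed to the NMS op, which only needs boxes and scores.)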
193 | xmin = tf.expand_dims(predictions_conf_thresh[...,-4], axis=-1) 194 | ymin = tf.expand_dims(predictions_conf_thresh[...,-3], axis=-1) 195 | xmax = tf.expand_dims(predictions_conf_thresh[...,-2], axis=-1) 196 | ymax = tf.expand_dims(predictions_conf_thresh[...,-1], axis=-1) 197 | boxes = tf.concat(values=[ymin, xmin, ymax, xmax], axis=-1) 198 | 199 | maxima_indices = tf.image.non_max_suppression(boxes=boxes, 200 | scores=scores, 201 | max_output_size=self.tf_nms_max_output_size, 202 | iou_threshold=self.iou_threshold, 203 | name='non_maximum_suppresion') 204 | maxima = tf.gather(params=predictions_conf_thresh, 205 | indices=maxima_indices, 206 | axis=0) 207 | return maxima 208 | def no_confident_predictions(): 209 | return tf.constant(value=0.0, shape=(1,6)) 210 | 211 | # If any boxes made the threshold, perform NMS. 212 | predictions_nms = tf.cond(tf.equal(tf.size(predictions_conf_thresh), 0), no_confident_predictions, perform_nms) 213 | 214 | # Perform top-k filtering for this batch item or pad it in case there are 215 | # fewer than `self.top_k` boxes left at this point. Either way, produce a 216 | # tensor of length `self.top_k`. By the time we return the final results tensor 217 | # for the whole batch, all batch items must have the same number of predicted 218 | # boxes so that the tensor dimensions are homogenous. If fewer than `self.top_k` 219 | # predictions are left after the filtering process above, we pad the missing 220 | # predictions with zeros as dummy entries. 221 | def top_k(): 222 | return tf.gather(params=predictions_nms, 223 | indices=tf.nn.top_k(predictions_nms[:, 1], k=self.tf_top_k, sorted=True).indices, 224 | axis=0) 225 | def pad_and_top_k(): 226 | padded_predictions = tf.pad(tensor=predictions_nms, 227 | paddings=[[0, self.tf_top_k - tf.shape(predictions_nms)[0]], [0, 0]], 228 | mode='CONSTANT', 229 | constant_values=0.0) 230 | return tf.gather(params=padded_predictions, 231 | indices=tf.nn.top_k(padded_predictions[:, 1], k=self.tf_top_k, sorted=True).indices, 232 | axis=0) 233 | 234 | top_k_boxes = tf.cond(tf.greater_equal(tf.shape(predictions_nms)[0], self.tf_top_k), top_k, pad_and_top_k) 235 | 236 | return top_k_boxes 237 | 238 | # Iterate `filter_predictions()` over all batch items. 239 | output_tensor = tf.map_fn(fn=lambda x: filter_predictions(x), 240 | elems=y_pred, 241 | dtype=None, 242 | parallel_iterations=128, 243 | back_prop=False, 244 | swap_memory=False, 245 | infer_shape=True, 246 | name='loop_over_batch') 247 | 248 | return output_tensor 249 | 250 | def compute_output_shape(self, input_shape): 251 | batch_size, n_boxes, last_axis = input_shape 252 | return (batch_size, self.tf_top_k, 6) # Last axis: (class_ID, confidence, 4 box coordinates) 253 | 254 | def get_config(self): 255 | config = { 256 | 'confidence_thresh': self.confidence_thresh, 257 | 'iou_threshold': self.iou_threshold, 258 | 'top_k': self.top_k, 259 | 'nms_max_output_size': self.nms_max_output_size, 260 | 'coords': self.coords, 261 | 'normalize_coords': self.normalize_coords, 262 | 'img_height': self.img_height, 263 | 'img_width': self.img_width, 264 | } 265 | base_config = super(DecodeDetectionsFast, self).get_config() 266 | return dict(list(base_config.items()) + list(config.items())) 267 | -------------------------------------------------------------------------------- /keras_layers/keras_layer_L2Normalization.py: -------------------------------------------------------------------------------- 1 | ''' 2 | A custom Keras layer to perform L2-normalization. 
3 | 4 | Copyright (C) 2018 Pierluigi Ferrari 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | ''' 18 | 19 | from __future__ import division 20 | import numpy as np 21 | import keras.backend as K 22 | from keras.engine.topology import InputSpec 23 | from keras.engine.topology import Layer 24 | 25 | class L2Normalization(Layer): 26 | ''' 27 | Performs L2 normalization on the input tensor with a learnable scaling parameter 28 | as described in the paper "Parsenet: Looking Wider to See Better" (see references) 29 | and as used in the original SSD model. 30 | 31 | Arguments: 32 | gamma_init (int): The initial scaling parameter. Defaults to 20 following the 33 | SSD paper. 34 | 35 | Input shape: 36 | 4D tensor of shape `(batch, channels, height, width)` if `dim_ordering = 'th'` 37 | or `(batch, height, width, channels)` if `dim_ordering = 'tf'`. 38 | 39 | Returns: 40 | The scaled tensor. Same shape as the input tensor. 41 | 42 | References: 43 | http://cs.unc.edu/~wliu/papers/parsenet.pdf 44 | ''' 45 | 46 | def __init__(self, gamma_init=20, **kwargs): 47 | if K.image_dim_ordering() == 'tf': 48 | self.axis = 3 49 | else: 50 | self.axis = 1 51 | self.gamma_init = gamma_init 52 | super(L2Normalization, self).__init__(**kwargs) 53 | 54 | def build(self, input_shape): 55 | self.input_spec = [InputSpec(shape=input_shape)] 56 | gamma = self.gamma_init * np.ones((input_shape[self.axis],)) 57 | self.gamma = K.variable(gamma, name='{}_gamma'.format(self.name)) 58 | self.trainable_weights = [self.gamma] 59 | super(L2Normalization, self).build(input_shape) 60 | 61 | def call(self, x, mask=None): 62 | output = K.l2_normalize(x, self.axis) 63 | return output * self.gamma 64 | 65 | def get_config(self): 66 | config = { 67 | 'gamma_init': self.gamma_init 68 | } 69 | base_config = super(L2Normalization, self).get_config() 70 | return dict(list(base_config.items()) + list(config.items())) 71 | -------------------------------------------------------------------------------- /keras_loss_function/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pierluigiferrari/ssd_keras/3ac9adaf3889f1020d74b0eeefea281d5e82f353/keras_loss_function/__init__.py -------------------------------------------------------------------------------- /keras_loss_function/keras_ssd_loss.py: -------------------------------------------------------------------------------- 1 | ''' 2 | The Keras-compatible loss function for the SSD model. Currently supports TensorFlow only. 3 | 4 | Copyright (C) 2018 Pierluigi Ferrari 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 
8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | ''' 18 | 19 | from __future__ import division 20 | import tensorflow as tf 21 | 22 | class SSDLoss: 23 | ''' 24 | The SSD loss, see https://arxiv.org/abs/1512.02325. 25 | ''' 26 | 27 | def __init__(self, 28 | neg_pos_ratio=3, 29 | n_neg_min=0, 30 | alpha=1.0): 31 | ''' 32 | Arguments: 33 | neg_pos_ratio (int, optional): The maximum ratio of negative (i.e. background) 34 | to positive ground truth boxes to include in the loss computation. 35 | There are no actual background ground truth boxes of course, but `y_true` 36 | contains anchor boxes labeled with the background class. Since 37 | the number of background boxes in `y_true` will usually exceed 38 | the number of positive boxes by far, it is necessary to balance 39 | their influence on the loss. Defaults to 3 following the paper. 40 | n_neg_min (int, optional): The minimum number of negative ground truth boxes to 41 | enter the loss computation *per batch*. This argument can be used to make 42 | sure that the model learns from a minimum number of negatives in batches 43 | in which there are very few, or even none at all, positive ground truth 44 | boxes. It defaults to 0 and if used, it should be set to a value that 45 | stands in reasonable proportion to the batch size used for training. 46 | alpha (float, optional): A factor to weight the localization loss in the 47 | computation of the total loss. Defaults to 1.0 following the paper. 48 | ''' 49 | self.neg_pos_ratio = neg_pos_ratio 50 | self.n_neg_min = n_neg_min 51 | self.alpha = alpha 52 | 53 | def smooth_L1_loss(self, y_true, y_pred): 54 | ''' 55 | Compute smooth L1 loss, see references. 56 | 57 | Arguments: 58 | y_true (nD tensor): A TensorFlow tensor of any shape containing the ground truth data. 59 | In this context, the expected tensor has shape `(batch_size, #boxes, 4)` and 60 | contains the ground truth bounding box coordinates, where the last dimension 61 | contains `(xmin, xmax, ymin, ymax)`. 62 | y_pred (nD tensor): A TensorFlow tensor of identical structure to `y_true` containing 63 | the predicted data, in this context the predicted bounding box coordinates. 64 | 65 | Returns: 66 | The smooth L1 loss, a nD-1 Tensorflow tensor. In this context a 2D tensor 67 | of shape (batch, n_boxes_total). 68 | 69 | References: 70 | https://arxiv.org/abs/1504.08083 71 | ''' 72 | absolute_loss = tf.abs(y_true - y_pred) 73 | square_loss = 0.5 * (y_true - y_pred)**2 74 | l1_loss = tf.where(tf.less(absolute_loss, 1.0), square_loss, absolute_loss - 0.5) 75 | return tf.reduce_sum(l1_loss, axis=-1) 76 | 77 | def log_loss(self, y_true, y_pred): 78 | ''' 79 | Compute the softmax log loss. 80 | 81 | Arguments: 82 | y_true (nD tensor): A TensorFlow tensor of any shape containing the ground truth data. 83 | In this context, the expected tensor has shape (batch_size, #boxes, #classes) 84 | and contains the ground truth bounding box categories. 85 | y_pred (nD tensor): A TensorFlow tensor of identical structure to `y_true` containing 86 | the predicted data, in this context the predicted bounding box categories. 
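Since `y_true` is one-hot, this amounts to the cross-entropy `-sum_k y_true[k] * log(y_pred[k])` per box, as implemented below.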
87 | 88 | Returns: 89 | The softmax log loss, a nD-1 Tensorflow tensor. In this context a 2D tensor 90 | of shape (batch, n_boxes_total). 91 | ''' 92 | # Make sure that `y_pred` doesn't contain any zeros (which would break the log function) 93 | y_pred = tf.maximum(y_pred, 1e-15) 94 | # Compute the log loss 95 | log_loss = -tf.reduce_sum(y_true * tf.log(y_pred), axis=-1) 96 | return log_loss 97 | 98 | def compute_loss(self, y_true, y_pred): 99 | ''' 100 | Compute the loss of the SSD model prediction against the ground truth. 101 | 102 | Arguments: 103 | y_true (array): A Numpy array of shape `(batch_size, #boxes, #classes + 12)`, 104 | where `#boxes` is the total number of boxes that the model predicts 105 | per image. Be careful to make sure that the index of each given 106 | box in `y_true` is the same as the index for the corresponding 107 | box in `y_pred`. The last axis must have length `#classes + 12` and contain 108 | `[classes one-hot encoded, 4 ground truth box coordinate offsets, 8 arbitrary entries]` 109 | in this order, including the background class. The last eight entries of the 110 | last axis are not used by this function and therefore their contents are 111 | irrelevant, they only exist so that `y_true` has the same shape as `y_pred`, 112 | where the last four entries of the last axis contain the anchor box 113 | coordinates, which are needed during inference. Important: Boxes that 114 | you want the cost function to ignore need to have a one-hot 115 | class vector of all zeros. 116 | y_pred (Keras tensor): The model prediction. The shape is identical 117 | to that of `y_true`, i.e. `(batch_size, #boxes, #classes + 12)`. 118 | The last axis must contain entries in the format 119 | `[classes one-hot encoded, 4 predicted box coordinate offsets, 8 arbitrary entries]`. 120 | 121 | Returns: 122 | A scalar, the total multitask loss for classification and localization. 123 | ''' 124 | self.neg_pos_ratio = tf.constant(self.neg_pos_ratio) 125 | self.n_neg_min = tf.constant(self.n_neg_min) 126 | self.alpha = tf.constant(self.alpha) 127 | 128 | batch_size = tf.shape(y_pred)[0] # Output dtype: tf.int32 129 | n_boxes = tf.shape(y_pred)[1] # Output dtype: tf.int32, note that `n_boxes` in this context denotes the total number of boxes per image, not the number of boxes per cell. 130 | 131 | # 1: Compute the losses for class and box predictions for every box. 132 | 133 | classification_loss = tf.to_float(self.log_loss(y_true[:,:,:-12], y_pred[:,:,:-12])) # Output shape: (batch_size, n_boxes) 134 | localization_loss = tf.to_float(self.smooth_L1_loss(y_true[:,:,-12:-8], y_pred[:,:,-12:-8])) # Output shape: (batch_size, n_boxes) 135 | 136 | # 2: Compute the classification losses for the positive and negative targets. 137 | 138 | # Create masks for the positive and negative ground truth classes. 139 | negatives = y_true[:,:,0] # Tensor of shape (batch_size, n_boxes) 140 | positives = tf.to_float(tf.reduce_max(y_true[:,:,1:-12], axis=-1)) # Tensor of shape (batch_size, n_boxes) 141 | 142 | # Count the number of positive boxes (classes 1 to n) in y_true across the whole batch. 143 | n_positive = tf.reduce_sum(positives) 144 | 145 | # Now mask all negative boxes and sum up the losses for the positive boxes PER batch item 146 | # (Keras loss functions must output one scalar loss value PER batch item, rather than just 147 | # one scalar for the entire batch, that's why we're not summing across all axes). 
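# (Illustration with made-up numbers: if one batch item has classification losses [0.2, 1.5, 0.7] for its three boxes and `positives` is [0., 1., 1.], then its positive class loss is 1.5 + 0.7 = 2.2; the background box contributes nothing here.)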
148 | pos_class_loss = tf.reduce_sum(classification_loss * positives, axis=-1) # Tensor of shape (batch_size,) 149 | 150 | # Compute the classification loss for the negative default boxes (if there are any). 151 | 152 | # First, compute the classification loss for all negative boxes. 153 | neg_class_loss_all = classification_loss * negatives # Tensor of shape (batch_size, n_boxes) 154 | n_neg_losses = tf.count_nonzero(neg_class_loss_all, dtype=tf.int32) # The number of non-zero loss entries in `neg_class_loss_all` 155 | # What's the point of `n_neg_losses`? For the next step, which will be to compute which negative boxes enter the classification 156 | # loss, we don't just want to know how many negative ground truth boxes there are, but for how many of those there actually is 157 | # a positive (i.e. non-zero) loss. This is necessary because `tf.nn.top_k()` in the function below will pick the top k boxes with 158 | # the highest losses no matter what, even if it receives a vector where all losses are zero. In the unlikely event that all negative 159 | # classification losses ARE actually zero though, this behavior might lead to `tf.nn.top_k()` returning the indices of positive 160 | # boxes, leading to an incorrect negative classification loss computation, and hence an incorrect overall loss computation. 161 | # We therefore need to make sure that `n_negative_keep`, which assumes the role of the `k` argument in `tf.nn.top_k()`, 162 | # is at most the number of negative boxes for which there is a positive classification loss. 163 | 164 | # Compute the number of negative examples we want to account for in the loss. 165 | # We'll keep at most `self.neg_pos_ratio` times the number of positives in `y_true`, but at least `self.n_neg_min` (unless `n_neg_losses` is smaller). 166 | n_negative_keep = tf.minimum(tf.maximum(self.neg_pos_ratio * tf.to_int32(n_positive), self.n_neg_min), n_neg_losses) 167 | 168 | # In the unlikely case when either (1) there are no negative ground truth boxes at all 169 | # or (2) the classification loss for all negative boxes is zero, return zero as the `neg_class_loss`. 170 | def f1(): 171 | return tf.zeros([batch_size]) 172 | # Otherwise compute the negative loss. 173 | def f2(): 174 | # Now we'll identify the top-k (where k == `n_negative_keep`) boxes with the highest confidence loss that 175 | # belong to the background class in the ground truth data. Note that this doesn't necessarily mean that the model 176 | # predicted the wrong class for those boxes, it just means that the loss for those boxes is the highest. 177 | 178 | # To do this, we reshape `neg_class_loss_all` to 1D... 179 | neg_class_loss_all_1D = tf.reshape(neg_class_loss_all, [-1]) # Tensor of shape (batch_size * n_boxes,) 180 | # ...and then we get the indices for the `n_negative_keep` boxes with the highest loss out of those... 181 | values, indices = tf.nn.top_k(neg_class_loss_all_1D, 182 | k=n_negative_keep, 183 | sorted=False) # We don't need them sorted. 184 | # ...and with these indices we'll create a mask...
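# (For example, assuming `batch_size * n_boxes == 6` and `indices == [1, 4]`, `tf.scatter_nd()` below yields the mask [0, 1, 0, 0, 1, 0], which is then reshaped back to shape `(batch_size, n_boxes)`.)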
185 | negatives_keep = tf.scatter_nd(indices=tf.expand_dims(indices, axis=1), 186 | updates=tf.ones_like(indices, dtype=tf.int32), 187 | shape=tf.shape(neg_class_loss_all_1D)) # Tensor of shape (batch_size * n_boxes,) 188 | negatives_keep = tf.to_float(tf.reshape(negatives_keep, [batch_size, n_boxes])) # Tensor of shape (batch_size, n_boxes) 189 | # ...and use it to keep only those boxes and mask all other classification losses 190 | neg_class_loss = tf.reduce_sum(classification_loss * negatives_keep, axis=-1) # Tensor of shape (batch_size,) 191 | return neg_class_loss 192 | 193 | neg_class_loss = tf.cond(tf.equal(n_neg_losses, tf.constant(0)), f1, f2) 194 | 195 | class_loss = pos_class_loss + neg_class_loss # Tensor of shape (batch_size,) 196 | 197 | # 3: Compute the localization loss for the positive targets. 198 | # We don't compute a localization loss for negative predicted boxes (obviously: there are no ground truth boxes they would correspond to). 199 | 200 | loc_loss = tf.reduce_sum(localization_loss * positives, axis=-1) # Tensor of shape (batch_size,) 201 | 202 | # 4: Compute the total loss. 203 | 204 | total_loss = (class_loss + self.alpha * loc_loss) / tf.maximum(1.0, n_positive) # In case `n_positive == 0` 205 | # Keras has the annoying habit of dividing the loss by the batch size, which sucks in our case 206 | # because the relevant criterion to average our loss over is the number of positive boxes in the batch 207 | # (by which we're dividing in the line above), not the batch size. So in order to revert Keras' averaging 208 | # over the batch size, we'll have to multiply by it. 209 | total_loss = total_loss * tf.to_float(batch_size) 210 | 211 | return total_loss 212 | -------------------------------------------------------------------------------- /misc_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pierluigiferrari/ssd_keras/3ac9adaf3889f1020d74b0eeefea281d5e82f353/misc_utils/__init__.py -------------------------------------------------------------------------------- /misc_utils/tensor_sampling_utils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Utilities that are useful to sub- or up-sample weights tensors. 3 | 4 | Copyright (C) 2018 Pierluigi Ferrari 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | ''' 18 | 19 | import numpy as np 20 | 21 | def sample_tensors(weights_list, sampling_instructions, axes=None, init=None, mean=0.0, stddev=0.005): 22 | ''' 23 | Can sub-sample and/or up-sample individual dimensions of the tensors in the given list 24 | of input tensors. 25 | 26 | It is possible to sub-sample some dimensions and up-sample other dimensions at the same time. 27 | 28 | The tensors in the list will be sampled consistently, i.e. for any given dimension that 29 | corresponds among all tensors in the list, the same elements will be picked for every tensor 30 | along that dimension. 
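(For example, if the last axis of a convolution kernel and the only axis of its bias vector correspond to each other, then whichever channel indices are kept for the kernel's last axis will also be the indices kept from the bias.)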
31 | 32 | For dimensions that are being sub-sampled, you can either provide a list of the indices 33 | that should be picked, or you can provide the number of elements to be sub-sampled, in which 34 | case the elements will be chosen at random. 35 | 36 | For dimensions that are being up-sampled, "filler" elements will be inserted at random 37 | positions along the respective dimension. These filler elements will be initialized either 38 | with zero or from a normal distribution with selectable mean and standard deviation. 39 | 40 | Arguments: 41 | weights_list (list): A list of Numpy arrays. Each array represents one of the tensors 42 | to be sampled. The tensor with the greatest number of dimensions must be the first 43 | element in the list. For example, in the case of the weights of a 2D convolutional 44 | layer, the kernel must be the first element in the list and the bias the second, 45 | not the other way around. For all tensors in the list after the first tensor, the 46 | lengths of each of their axes must be identical to the length of some axis of the 47 | first tensor. 48 | sampling_instructions (list): A list that contains the sampling instructions for each 49 | dimension of the first tensor. If the first tensor has `n` dimensions, then this 50 | must be a list of length `n`. That means sampling instructions for every dimension 51 | of the first tensor must still be given even if not all dimensions should be changed. 52 | The elements of this list can be either lists of integers or integers. If the sampling 53 | instruction for a given dimension is a list of integers, then these integers represent 54 | the indices of the elements of that dimension that will be sub-sampled. If the sampling 55 | instruction for a given dimension is an integer, then that number of elements will be 56 | sampled along said dimension. If the integer is greater than the number of elements 57 | of the input tensors in that dimension, that dimension will be up-sampled. If the integer 58 | is smaller than the number of elements of the input tensors in that dimension, that 59 | dimension will be sub-sampled. If the integer is equal to the number of elements 60 | of the input tensors in that dimension, that dimension will remain the same. 61 | axes (list, optional): Only relevant if `weights_list` contains more than one tensor. 62 | This list contains a list for each additional tensor in `weights_list` beyond the first. 63 | Each of these lists contains integers that determine to which axes of the first tensor 64 | the axes of the respective tensor correspond. For example, let the first tensor be a 65 | 4D tensor and the second tensor in the list be a 2D tensor. If the first element of 66 | `axes` is the list `[2,3]`, then that means that the two axes of the second tensor 67 | correspond to the last two axes of the first tensor, in the same order. The point of 68 | this list is for the program to know, if a given dimension of the first tensor is to 69 | be sub- or up-sampled, which dimensions of the other tensors in the list must be 70 | sub- or up-sampled accordingly. 71 | init (list, optional): Only relevant for up-sampling. Must be `None` or a list of strings 72 | that determines for each tensor in `weights_list` how the newly inserted values should 73 | be initialized. The possible values are 'gaussian' for initialization from a normal 74 | distribution with the selected mean and standard deviation (see the following two arguments), 75 | or 'zeros' for zero-initialization.
If `None`, all initializations default to
76 |             'gaussian'.
77 |         mean (float, optional): Only relevant for up-sampling. The mean of the values that will
78 |             be inserted into the tensors at random in the case of up-sampling.
79 |         stddev (float, optional): Only relevant for up-sampling. The standard deviation of the
80 |             values that will be inserted into the tensors at random in the case of up-sampling.
81 | 
82 |     Returns:
83 |         A list containing the sampled tensors in the same order in which they were given.
84 |     '''
85 | 
86 |     first_tensor = weights_list[0]
87 | 
88 |     if (not isinstance(sampling_instructions, (list, tuple))) or (len(sampling_instructions) != first_tensor.ndim):
89 |         raise ValueError("The sampling instructions must be a list whose length is the number of dimensions of the first tensor in `weights_list`.")
90 | 
91 |     if (not init is None) and len(init) != len(weights_list):
92 |         raise ValueError("`init` must either be `None` or a list of strings that has the same length as `weights_list`.")
93 | 
94 |     up_sample = [] # Store the dimensions along which we need to up-sample.
95 |     out_shape = [] # Store the shape of the output tensor here.
96 |     # Store two stages of the new (sub-sampled and/or up-sampled) weights tensors in the following two lists.
97 |     subsampled_weights_list = [] # Tensors after sub-sampling, but before up-sampling (if any).
98 |     upsampled_weights_list = [] # Sub-sampled tensors after up-sampling (if any), i.e. the final output tensors.
99 | 
100 |     # Create the slicing arrays from the sampling instructions.
101 |     sampling_slices = []
102 |     for i, sampling_inst in enumerate(sampling_instructions):
103 |         if isinstance(sampling_inst, (list, tuple)):
104 |             amax = np.amax(np.array(sampling_inst))
105 |             if amax >= first_tensor.shape[i]:
106 |                 raise ValueError("The sampling instructions for dimension {} contain index {}, which is out of range for that dimension.".format(i, amax))
107 |             sampling_slices.append(np.array(sampling_inst))
108 |             out_shape.append(len(sampling_inst))
109 |         elif isinstance(sampling_inst, int):
110 |             out_shape.append(sampling_inst)
111 |             if sampling_inst == first_tensor.shape[i]:
112 |                 # Nothing to sample here, we're keeping the original number of elements along this axis.
113 |                 sampling_slice = np.arange(sampling_inst)
114 |                 sampling_slices.append(sampling_slice)
115 |             elif sampling_inst < first_tensor.shape[i]:
116 |                 # We want to SUB-sample this dimension. Randomly pick `sampling_inst` elements from it.
117 |                 sampling_slice1 = np.array([0]) # We will always sample class 0, the background class.
118 |                 # Sample the rest of the classes.
119 |                 sampling_slice2 = np.sort(np.random.choice(np.arange(1, first_tensor.shape[i]), sampling_inst - 1, replace=False))
120 |                 sampling_slice = np.concatenate([sampling_slice1, sampling_slice2])
121 |                 sampling_slices.append(sampling_slice)
122 |             else:
123 |                 # We want to UP-sample. Pick all elements from this dimension.
124 |                 sampling_slice = np.arange(first_tensor.shape[i])
125 |                 sampling_slices.append(sampling_slice)
126 |                 up_sample.append(i)
127 |         else:
128 |             raise ValueError("Each element of the sampling instructions must be either an integer or a list/tuple of integers, but received `{}`.".format(type(sampling_inst)))
129 | 
130 |     # Process the first tensor.
131 |     subsampled_first_tensor = np.copy(first_tensor[np.ix_(*sampling_slices)])
132 |     subsampled_weights_list.append(subsampled_first_tensor)
133 | 
134 |     # Process the other tensors.
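    # (Hypothetical illustration, not from the original code: if `weights_list[0]` is a conv
    # kernel of shape (3, 3, 512, 324) and `weights_list[1]` is its bias of shape (324,), then
    # `axes=[[3]]` maps the bias's single axis to axis 3 of the kernel, so the bias is sliced
    # below with the same indices that were just used to sub-sample the kernel's last axis.)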
135 | if len(weights_list) > 1: 136 | for j in range(1, len(weights_list)): 137 | this_sampling_slices = [sampling_slices[i] for i in axes[j-1]] # Get the sampling slices for this tensor. 138 | subsampled_weights_list.append(np.copy(weights_list[j][np.ix_(*this_sampling_slices)])) 139 | 140 | if up_sample: 141 | # Take care of the dimensions that are to be up-sampled. 142 | 143 | out_shape = np.array(out_shape) 144 | 145 | # Process the first tensor. 146 | if init is None or init[0] == 'gaussian': 147 | upsampled_first_tensor = np.random.normal(loc=mean, scale=stddev, size=out_shape) 148 | elif init[0] == 'zeros': 149 | upsampled_first_tensor = np.zeros(out_shape) 150 | else: 151 | raise ValueError("Valid initializations are 'gaussian' and 'zeros', but received '{}'.".format(init[0])) 152 | # Pick the indices of the elements in `upsampled_first_tensor` that should be occupied by `subsampled_first_tensor`. 153 | up_sample_slices = [np.arange(k) for k in subsampled_first_tensor.shape] 154 | for i in up_sample: 155 | # Randomly select across which indices of this dimension to scatter the elements of `new_weights_tensor` in this dimension. 156 | up_sample_slice1 = np.array([0]) 157 | up_sample_slice2 = np.sort(np.random.choice(np.arange(1, upsampled_first_tensor.shape[i]), subsampled_first_tensor.shape[i] - 1, replace=False)) 158 | up_sample_slices[i] = np.concatenate([up_sample_slice1, up_sample_slice2]) 159 | upsampled_first_tensor[np.ix_(*up_sample_slices)] = subsampled_first_tensor 160 | upsampled_weights_list.append(upsampled_first_tensor) 161 | 162 | # Process the other tensors 163 | if len(weights_list) > 1: 164 | for j in range(1, len(weights_list)): 165 | if init is None or init[j] == 'gaussian': 166 | upsampled_tensor = np.random.normal(loc=mean, scale=stddev, size=out_shape[axes[j-1]]) 167 | elif init[j] == 'zeros': 168 | upsampled_tensor = np.zeros(out_shape[axes[j-1]]) 169 | else: 170 | raise ValueError("Valid initializations are 'gaussian' and 'zeros', but received '{}'.".format(init[j])) 171 | this_up_sample_slices = [up_sample_slices[i] for i in axes[j-1]] # Get the up-sampling slices for this tensor. 172 | upsampled_tensor[np.ix_(*this_up_sample_slices)] = subsampled_weights_list[j] 173 | upsampled_weights_list.append(upsampled_tensor) 174 | 175 | return upsampled_weights_list 176 | else: 177 | return subsampled_weights_list 178 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pierluigiferrari/ssd_keras/3ac9adaf3889f1020d74b0eeefea281d5e82f353/models/__init__.py -------------------------------------------------------------------------------- /ssd300_evaluation_COCO.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# SSD300 MS COCO Evaluation Tutorial\n", 8 | "\n", 9 | "This is a brief tutorial that goes over how to evaluate a trained SSD300 on one of the MS COCO datasets using the official MS COCO Python tools available here:\n", 10 | "\n", 11 | "https://github.com/cocodataset/cocoapi\n", 12 | "\n", 13 | "Follow the instructions in the GitHub repository above to install the `pycocotools`. 
Note that you will need to set the path to your local copy of the PythonAPI directory in the subsequent code cell.\n",
14 |     "\n",
15 |     "Of course the evaluation procedure described here is identical for SSD512; you just need to build a different model."
16 |    ]
17 |   },
18 |   {
19 |    "cell_type": "code",
20 |    "execution_count": null,
21 |    "metadata": {
22 |     "collapsed": true
23 |    },
24 |    "outputs": [],
25 |    "source": [
26 |     "from keras import backend as K\n",
27 |     "from keras.models import load_model\n",
28 |     "from keras.optimizers import Adam\n",
29 |     "from scipy.misc import imread\n",
30 |     "import numpy as np\n",
31 |     "from matplotlib import pyplot as plt\n",
32 |     "import sys\n",
33 |     "\n",
34 |     "# TODO: Specify the directory that contains the `pycocotools` here.\n",
35 |     "pycocotools_dir = '../cocoapi/PythonAPI/'\n",
36 |     "if pycocotools_dir not in sys.path:\n",
37 |     "    sys.path.insert(0, pycocotools_dir)\n",
38 |     "\n",
39 |     "from pycocotools.coco import COCO\n",
40 |     "from pycocotools.cocoeval import COCOeval\n",
41 |     "\n",
42 |     "from models.keras_ssd300 import ssd_300\n",
43 |     "from keras_loss_function.keras_ssd_loss import SSDLoss\n",
44 |     "from keras_layers.keras_layer_AnchorBoxes import AnchorBoxes\n",
45 |     "from keras_layers.keras_layer_DecodeDetections import DecodeDetections\n",
46 |     "from keras_layers.keras_layer_DecodeDetectionsFast import DecodeDetectionsFast\n",
47 |     "from keras_layers.keras_layer_L2Normalization import L2Normalization\n",
48 |     "from data_generator.object_detection_2d_data_generator import DataGenerator\n",
49 |     "from eval_utils.coco_utils import get_coco_category_maps, predict_all_to_json\n",
50 |     "\n",
51 |     "%matplotlib inline"
52 |    ]
53 |   },
54 |   {
55 |    "cell_type": "code",
56 |    "execution_count": 2,
57 |    "metadata": {
58 |     "collapsed": true
59 |    },
60 |    "outputs": [],
61 |    "source": [
62 |     "# Set the input image size for the model.\n",
63 |     "img_height = 300\n",
64 |     "img_width = 300"
65 |    ]
66 |   },
67 |   {
68 |    "cell_type": "markdown",
69 |    "metadata": {},
70 |    "source": [
71 |     "## 1. Load a trained SSD\n",
72 |     "\n",
73 |     "Either load a trained model or build a model and load trained weights into it. Since the HDF5 files I'm providing contain only the weights for the various SSD versions, not the complete models, you'll have to go with the latter option when using this implementation for the first time. You can then of course save the model and next time load the full model directly, without having to build it.\n",
74 |     "\n",
75 |     "You can find the download links to all the trained model weights in the README."
76 |    ]
77 |   },
78 |   {
79 |    "cell_type": "markdown",
80 |    "metadata": {},
81 |    "source": [
82 |     "### 1.1. 
Build the model and load trained weights into it" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 4, 88 | "metadata": { 89 | "collapsed": true 90 | }, 91 | "outputs": [], 92 | "source": [ 93 | "# 1: Build the Keras model\n", 94 | "\n", 95 | "K.clear_session() # Clear previous models from memory.\n", 96 | "\n", 97 | "model = ssd_300(image_size=(img_height, img_width, 3),\n", 98 | " n_classes=80,\n", 99 | " mode='inference',\n", 100 | " l2_regularization=0.0005,\n", 101 | " scales=[0.07, 0.15, 0.33, 0.51, 0.69, 0.87, 1.05], # The scales for Pascal VOC are [0.1, 0.2, 0.37, 0.54, 0.71, 0.88, 1.05]\n", 102 | " aspect_ratios_per_layer=[[1.0, 2.0, 0.5],\n", 103 | " [1.0, 2.0, 0.5, 3.0, 1.0/3.0],\n", 104 | " [1.0, 2.0, 0.5, 3.0, 1.0/3.0],\n", 105 | " [1.0, 2.0, 0.5, 3.0, 1.0/3.0],\n", 106 | " [1.0, 2.0, 0.5],\n", 107 | " [1.0, 2.0, 0.5]],\n", 108 | " two_boxes_for_ar1=True,\n", 109 | " steps=[8, 16, 32, 64, 100, 300],\n", 110 | " offsets=[0.5, 0.5, 0.5, 0.5, 0.5, 0.5],\n", 111 | " clip_boxes=False,\n", 112 | " variances=[0.1, 0.1, 0.2, 0.2],\n", 113 | " normalize_coords=True,\n", 114 | " subtract_mean=[123, 117, 104],\n", 115 | " swap_channels=[2, 1, 0],\n", 116 | " confidence_thresh=0.01,\n", 117 | " iou_threshold=0.45,\n", 118 | " top_k=200,\n", 119 | " nms_max_output_size=400)\n", 120 | "\n", 121 | "# 2: Load the trained weights into the model.\n", 122 | "\n", 123 | "# TODO: Set the path of the trained weights.\n", 124 | "weights_path = 'path/to/trained/weights/VGG_coco_SSD_300x300_iter_400000.h5'\n", 125 | "\n", 126 | "model.load_weights(weights_path, by_name=True)\n", 127 | "\n", 128 | "# 3: Compile the model so that Keras won't complain the next time you load it.\n", 129 | "\n", 130 | "adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)\n", 131 | "\n", 132 | "ssd_loss = SSDLoss(neg_pos_ratio=3, alpha=1.0)\n", 133 | "\n", 134 | "model.compile(optimizer=adam, loss=ssd_loss.compute_loss)" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": {}, 140 | "source": [ 141 | "Or" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": {}, 147 | "source": [ 148 | "### 1.2. Load a trained model" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": { 155 | "collapsed": true 156 | }, 157 | "outputs": [], 158 | "source": [ 159 | "# TODO: Set the path to the `.h5` file of the model to be loaded.\n", 160 | "model_path = 'path/to/trained/model.h5'\n", 161 | "\n", 162 | "# We need to create an SSDLoss object in order to pass that to the model loader.\n", 163 | "ssd_loss = SSDLoss(neg_pos_ratio=3, n_neg_min=0, alpha=1.0)\n", 164 | "\n", 165 | "K.clear_session() # Clear previous models from memory.\n", 166 | "\n", 167 | "model = load_model(model_path, custom_objects={'AnchorBoxes': AnchorBoxes,\n", 168 | " 'L2Normalization': L2Normalization,\n", 169 | " 'DecodeDetections': DecodeDetections,\n", 170 | " 'compute_loss': ssd_loss.compute_loss})" 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "metadata": {}, 176 | "source": [ 177 | "## 2. Create a data generator for the evaluation dataset\n", 178 | "\n", 179 | "Instantiate a `DataGenerator` that will serve the evaluation dataset during the prediction phase." 
180 |    ]
181 |   },
182 |   {
183 |    "cell_type": "code",
184 |    "execution_count": 5,
185 |    "metadata": {
186 |     "collapsed": true
187 |    },
188 |    "outputs": [],
189 |    "source": [
190 |     "dataset = DataGenerator()\n",
191 |     "\n",
192 |     "# TODO: Set the paths to the dataset here.\n",
193 |     "MS_COCO_dataset_images_dir = '../../datasets/MicrosoftCOCO/val2017/'\n",
194 |     "MS_COCO_dataset_annotations_filename = '../../datasets/MicrosoftCOCO/annotations/instances_val2017.json'\n",
195 |     "\n",
196 |     "dataset.parse_json(images_dirs=[MS_COCO_dataset_images_dir],\n",
197 |     "                   annotations_filenames=[MS_COCO_dataset_annotations_filename],\n",
198 |     "                   ground_truth_available=False, # It doesn't matter whether you set this `True` or `False` because the ground truth won't be used anyway, but the parsing goes faster if you don't load the ground truth.\n",
199 |     "                   include_classes='all',\n",
200 |     "                   ret=False)\n",
201 |     "\n",
202 |     "# We need the `classes_to_cats` dictionary. Read the documentation of this function to understand why.\n",
203 |     "cats_to_classes, classes_to_cats, cats_to_names, classes_to_names = get_coco_category_maps(MS_COCO_dataset_annotations_filename)"
204 |    ]
205 |   },
206 |   {
207 |    "cell_type": "markdown",
208 |    "metadata": {},
209 |    "source": [
210 |     "## 3. Run the predictions over the evaluation dataset\n",
211 |     "\n",
212 |     "Now that we have instantiated a model and a data generator to serve the dataset, we can make predictions on the entire dataset and save those predictions in a JSON file in the format in which COCOeval needs them for the evaluation.\n",
213 |     "\n",
214 |     "Read the documentation to learn what the arguments mean, but the arguments as preset below are the parameters that were used in the evaluation of the original Caffe models."
215 |    ]
216 |   },
217 |   {
218 |    "cell_type": "code",
219 |    "execution_count": 6,
220 |    "metadata": {
221 |     "collapsed": true
222 |    },
223 |    "outputs": [],
224 |    "source": [
225 |     "# TODO: Set the desired output file name and the batch size.\n",
226 |     "results_file = 'detections_val2017_ssd300_results.json'\n",
227 |     "batch_size = 20 # Ideally, choose a batch size that divides the number of images in the dataset."
228 |    ]
229 |   },
230 |   {
231 |    "cell_type": "code",
232 |    "execution_count": 7,
233 |    "metadata": {},
234 |    "outputs": [
235 |     {
236 |      "name": "stdout",
237 |      "output_type": "stream",
238 |      "text": [
239 |       "Number of images in the evaluation dataset: 5000\n",
240 |       "Producing results file: 100%|██████████| 250/250 [04:11<00:00, 1.05s/it]\n",
241 |       "Prediction results saved in 'detections_val2017_ssd300_results.json'\n"
242 |      ]
243 |     }
244 |    ],
245 |    "source": [
246 |     "predict_all_to_json(out_file=results_file,\n",
247 |     "                    model=model,\n",
248 |     "                    img_height=img_height,\n",
249 |     "                    img_width=img_width,\n",
250 |     "                    classes_to_cats=classes_to_cats,\n",
251 |     "                    data_generator=dataset,\n",
252 |     "                    batch_size=batch_size,\n",
253 |     "                    data_generator_mode='resize',\n",
254 |     "                    model_mode='inference',\n",
255 |     "                    confidence_thresh=0.01,\n",
256 |     "                    iou_threshold=0.45,\n",
257 |     "                    top_k=200,\n",
258 |     "                    normalize_coords=True)"
259 |    ]
260 |   },
261 |   {
262 |    "cell_type": "markdown",
263 |    "metadata": {},
264 |    "source": [
265 |     "## 4. Run the evaluation\n",
266 |     "\n",
267 |     "Now we'll load the JSON file containing all the predictions that we produced in the last step and feed it to `COCOeval`. Note that the evaluation may take a while."
268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 8, 273 | "metadata": {}, 274 | "outputs": [ 275 | { 276 | "name": "stdout", 277 | "output_type": "stream", 278 | "text": [ 279 | "loading annotations into memory...\n", 280 | "Done (t=0.46s)\n", 281 | "creating index...\n", 282 | "index created!\n", 283 | "Loading and preparing results...\n", 284 | "DONE (t=5.87s)\n", 285 | "creating index...\n", 286 | "index created!\n" 287 | ] 288 | } 289 | ], 290 | "source": [ 291 | "coco_gt = COCO(MS_COCO_dataset_annotations_filename)\n", 292 | "coco_dt = coco_gt.loadRes(results_file)\n", 293 | "image_ids = sorted(coco_gt.getImgIds())" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 9, 299 | "metadata": {}, 300 | "outputs": [ 301 | { 302 | "name": "stdout", 303 | "output_type": "stream", 304 | "text": [ 305 | "Running per image evaluation...\n", 306 | "Evaluate annotation type *bbox*\n", 307 | "DONE (t=64.15s).\n", 308 | "Accumulating evaluation results...\n", 309 | "DONE (t=10.58s).\n", 310 | " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.247\n", 311 | " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.424\n", 312 | " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.253\n", 313 | " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.059\n", 314 | " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.264\n", 315 | " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.414\n", 316 | " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.232\n", 317 | " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.341\n", 318 | " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.362\n", 319 | " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.102\n", 320 | " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.401\n", 321 | " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.577\n" 322 | ] 323 | } 324 | ], 325 | "source": [ 326 | "cocoEval = COCOeval(cocoGt=coco_gt,\n", 327 | " cocoDt=coco_dt,\n", 328 | " iouType='bbox')\n", 329 | "cocoEval.params.imgIds = image_ids\n", 330 | "cocoEval.evaluate()\n", 331 | "cocoEval.accumulate()\n", 332 | "cocoEval.summarize()" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": null, 338 | "metadata": { 339 | "collapsed": true 340 | }, 341 | "outputs": [], 342 | "source": [] 343 | } 344 | ], 345 | "metadata": { 346 | "kernelspec": { 347 | "display_name": "Python 3", 348 | "language": "python", 349 | "name": "python3" 350 | }, 351 | "language_info": { 352 | "codemirror_mode": { 353 | "name": "ipython", 354 | "version": 3 355 | }, 356 | "file_extension": ".py", 357 | "mimetype": "text/x-python", 358 | "name": "python", 359 | "nbconvert_exporter": "python", 360 | "pygments_lexer": "ipython3", 361 | "version": "3.5.3" 362 | } 363 | }, 364 | "nbformat": 4, 365 | "nbformat_minor": 2 366 | } 367 | -------------------------------------------------------------------------------- /ssd_encoder_decoder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pierluigiferrari/ssd_keras/3ac9adaf3889f1020d74b0eeefea281d5e82f353/ssd_encoder_decoder/__init__.py -------------------------------------------------------------------------------- /ssd_encoder_decoder/matching_utils.py: 
-------------------------------------------------------------------------------- 1 | ''' 2 | Utilities to match ground truth boxes to anchor boxes. 3 | 4 | Copyright (C) 2018 Pierluigi Ferrari 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | ''' 18 | 19 | from __future__ import division 20 | import numpy as np 21 | 22 | def match_bipartite_greedy(weight_matrix): 23 | ''' 24 | Returns a bipartite matching according to the given weight matrix. 25 | 26 | The algorithm works as follows: 27 | 28 | Let the first axis of `weight_matrix` represent ground truth boxes 29 | and the second axis anchor boxes. 30 | The ground truth box that has the greatest similarity with any 31 | anchor box will be matched first, then out of the remaining ground 32 | truth boxes, the ground truth box that has the greatest similarity 33 | with any of the remaining anchor boxes will be matched second, and 34 | so on. That is, the ground truth boxes will be matched in descending 35 | order by maximum similarity with any of the respectively remaining 36 | anchor boxes. 37 | The runtime complexity is O(m^2 * n), where `m` is the number of 38 | ground truth boxes and `n` is the number of anchor boxes. 39 | 40 | Arguments: 41 | weight_matrix (array): A 2D Numpy array that represents the weight matrix 42 | for the matching process. If `(m,n)` is the shape of the weight matrix, 43 | it must be `m <= n`. The weights can be integers or floating point 44 | numbers. The matching process will maximize, i.e. larger weights are 45 | preferred over smaller weights. 46 | 47 | Returns: 48 | A 1D Numpy array of length `weight_matrix.shape[0]` that represents 49 | the matched index along the second axis of `weight_matrix` for each index 50 | along the first axis. 51 | ''' 52 | 53 | weight_matrix = np.copy(weight_matrix) # We'll modify this array. 54 | num_ground_truth_boxes = weight_matrix.shape[0] 55 | all_gt_indices = list(range(num_ground_truth_boxes)) # Only relevant for fancy-indexing below. 56 | 57 | # This 1D array will contain for each ground truth box the index of 58 | # the matched anchor box. 59 | matches = np.zeros(num_ground_truth_boxes, dtype=np.int) 60 | 61 | # In each iteration of the loop below, exactly one ground truth box 62 | # will be matched to one anchor box. 63 | for _ in range(num_ground_truth_boxes): 64 | 65 | # Find the maximal anchor-ground truth pair in two steps: First, reduce 66 | # over the anchor boxes and then reduce over the ground truth boxes. 67 | anchor_indices = np.argmax(weight_matrix, axis=1) # Reduce along the anchor box axis. 68 | overlaps = weight_matrix[all_gt_indices, anchor_indices] 69 | ground_truth_index = np.argmax(overlaps) # Reduce along the ground truth box axis. 70 | anchor_index = anchor_indices[ground_truth_index] 71 | matches[ground_truth_index] = anchor_index # Set the match. 72 | 73 | # Set the row of the matched ground truth box and the column of the matched 74 | # anchor box to all zeros. 
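        # (For a hypothetical illustration with made-up numbers: if ground truth box 2 was just
        # matched to anchor box 1749, then row 2 and column 1749 of `weight_matrix` are zeroed
        # out in the two lines below.)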
This ensures that those boxes will not be matched again,
75 |         # because they will never be the best matches for any other boxes.
76 |         weight_matrix[ground_truth_index] = 0
77 |         weight_matrix[:,anchor_index] = 0
78 | 
79 |     return matches
80 | 
81 | def match_multi(weight_matrix, threshold):
82 |     '''
83 |     Matches all elements along the second axis of `weight_matrix` to their best
84 |     matches along the first axis, subject to the constraint that the weight of a match
85 |     must be greater than or equal to `threshold` in order to produce a match.
86 | 
87 |     If the weight matrix contains elements that should be ignored, the row or column
88 |     representing the respective element should be set to a value below `threshold`.
89 | 
90 |     Arguments:
91 |         weight_matrix (array): A 2D Numpy array that represents the weight matrix
92 |             for the matching process. If `(m,n)` is the shape of the weight matrix,
93 |             it must be `m <= n`. The weights can be integers or floating point
94 |             numbers. The matching process will maximize, i.e. larger weights are
95 |             preferred over smaller weights.
96 |         threshold (float): A float that represents the threshold (i.e. lower bound)
97 |             that must be met by a pair of elements to produce a match.
98 | 
99 |     Returns:
100 |         Two 1D Numpy arrays of equal length that represent the matched indices. The first
101 |         array contains the indices along the first axis of `weight_matrix`, the second array
102 |         contains the indices along the second axis.
103 |     '''
104 | 
105 |     num_anchor_boxes = weight_matrix.shape[1]
106 |     all_anchor_indices = list(range(num_anchor_boxes)) # Only relevant for fancy-indexing below.
107 | 
108 |     # Find the best ground truth match for every anchor box.
109 |     ground_truth_indices = np.argmax(weight_matrix, axis=0) # Array of shape (weight_matrix.shape[1],)
110 |     overlaps = weight_matrix[ground_truth_indices, all_anchor_indices] # Array of shape (weight_matrix.shape[1],)
111 | 
112 |     # Filter out the matches with a weight below the threshold.
113 |     anchor_indices_thresh_met = np.nonzero(overlaps >= threshold)[0]
114 |     gt_indices_thresh_met = ground_truth_indices[anchor_indices_thresh_met]
115 | 
116 |     return gt_indices_thresh_met, anchor_indices_thresh_met
117 | -------------------------------------------------------------------------------- /training_summaries/ssd300_pascal_07+12_loss_history.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pierluigiferrari/ssd_keras/3ac9adaf3889f1020d74b0eeefea281d5e82f353/training_summaries/ssd300_pascal_07+12_loss_history.png -------------------------------------------------------------------------------- /training_summaries/ssd300_pascal_07+12_training_summary.md: --------------------------------------------------------------------------------
1 | ## SSD300 Pascal VOC 07+12 Training Summary
2 | ---
3 | 
4 | This is a summary of the training of an SSD300 on the Pascal VOC 2007 `trainval` and 2012 `trainval` image sets using the same configuration as in the original Caffe implementation for that same model.
5 | 
6 | Since neither the SSD paper nor the GitHub repository of the original Caffe SSD implementation provides details on the training progress, only the final evaluation results, some may find the loss curves and intermediate mAP evaluation results provided here helpful for comparison with their own training.
7 | 
8 | What you see below are the training results of running the [`ssd300_training.ipynb`](../ssd300_training.ipynb) notebook as is, in which all parameters are already preset to replicate the training configuration of the original SSD300 "07+12" model. I made just one small change: I occasionally ran into `OOM` errors at batch size 32, so I trained with batch size 31.
9 | 
10 | An important note about the data shown below:
11 | 
12 | SGD is inherently unstable at the beginning of the training. Remember that the optimization is stochastic, i.e. if you start a fresh training ten times, the loss pattern over the first training steps can look different each time, and in the case of SGD, very different. One time the loss might decrease smoothly right from the start, which is what happened in my case below. Another time the loss might get temporarily stuck on a plateau very early on, so that nothing seems to be happening for a couple of hundred training steps. Yet another time the loss might blow up right at the start and become `NaN`. As long as the loss doesn't become `NaN`, the final convergence loss does not, in my experience, strongly depend on the loss progression in the very early phase of the training. In other words, even if the loss doesn't decrease as fast in the beginning, you will likely still end up with the same convergence loss; it will just take longer to get there. As a benchmark, after the first 1,000 training steps I've seen training loss values anywhere between roughly 10 and 15. The Adam optimizer doesn't suffer from this variability to the same extent and is evidently the superior optimizer, but since the original Caffe models were trained with SGD, I used SGD to reproduce the original results.
13 | 
14 | ### Training and Validation Loss
15 | 
16 | What you see below are the training and validation losses, recorded every 1,000 training steps. The validation loss is computed on the Pascal VOC 2007 `test` image set. In my case it took only around 105,000 instead of the expected 120,000 iterations for the validation loss to converge, but as explained above, it may well take longer. The drop you're seeing at 56,000 training steps is where I reduced the learning rate from 0.001 to 0.0001. The original learning rate schedule calls for this reduction only after 80,000 training steps, but since the loss decreased so quickly in the beginning in my case, I had to decrease the learning rate earlier. I reduced the learning rate to 0.00001 after 76,000 training steps and kept it constant from there.
17 | 
18 | ![loss_history](ssd300_pascal_07+12_loss_history.png)
19 | 
20 | ### Mean Average Precision
21 | 
22 | Here are the intermediate and final mAP values on Pascal VOC 2007 `test`, evaluated using the official Pascal VOCdevkit 2007 Matlab evaluation code. The table shows the best values after every 20,000 training steps. Once again, the progress may be slower depending on how the early phase of the training goes. In another training run that I started with the same configuration, I got an mAP of only 0.665 after the first 20,000 training steps. The full model after 102,000 training steps can be downloaded [here](https://drive.google.com/open?id=1-MYYaZbIHNPtI2zzklgVBAjssbP06BeA). 
23 | 24 | | | Steps | 20k | 40k | 60k | 80k | 100k | 102k | 25 | |-------------|-------|----------|----------|----------|----------|----------|----------| 26 | |aeroplane | AP | 0.6874 | 0.7401 | 0.7679 | 0.7827 | 0.7912 | 0.7904 | 27 | |bicycle | AP | 0.7786 | 0.8203 | 0.795 | 0.8436 | 0.8453 | 0.8466 | 28 | |bird | AP | 0.6855 | 0.6939 | 0.7191 | 0.7564 | 0.7655 | 0.7672 | 29 | |boat | AP | 0.5804 | 0.6173 | 0.6258 | 0.6866 | 0.6896 | 0.6952 | 30 | |bottle | AP | 0.3449 | 0.4288 | 0.453 | 0.4681 | 0.4896 | 0.4844 | 31 | |bus | AP | 0.7771 | 0.8332 | 0.8343 | 0.8525 | 0.8537 | 0.8554 | 32 | |car | AP | 0.8048 | 0.8435 | 0.8345 | 0.848 | 0.8546 | 0.8543 | 33 | |cat | AP | 0.852 | 0.7989 | 0.8551 | 0.8759 | 0.8727 | 0.8746 | 34 | |chair | AP | 0.5085 | 0.5548 | 0.5287 | 0.5873 | 0.5895 | 0.5911 | 35 | |cow | AP | 0.7359 | 0.7821 | 0.791 | 0.8278 | 0.8271 | 0.8243 | 36 | |diningtable | AP | 0.6805 | 0.7181 | 0.7502 | 0.7543 | 0.7733 | 0.7614 | 37 | |dog | AP | 0.8118 | 0.7898 | 0.8222 | 0.8546 | 0.8544 | 0.8552 | 38 | |horse | AP | 0.823 | 0.8501 | 0.8532 | 0.8586 | 0.8688 | 0.867 | 39 | |motorbike | AP | 0.7725 | 0.7935 | 0.8081 | 0.845 | 0.8471 | 0.8509 | 40 | |person | AP | 0.73 | 0.7514 | 0.7634 | 0.7851 | 0.7869 | 0.7862 | 41 | |pottedplant | AP | 0.4112 | 0.4335 | 0.4982 | 0.5051 | 0.5131 | 0.5182 | 42 | |sheep | AP | 0.6821 | 0.7324 | 0.7283 | 0.7717 | 0.7783 | 0.7799 | 43 | |sofa | AP | 0.7417 | 0.7824 | 0.7663 | 0.7928 | 0.7911 | 0.794 | 44 | |train | AP | 0.7942 | 0.8169 | 0.8326 | 0.867 | 0.862 | 0.8596 | 45 | |tvmonitor | AP | 0.725 | 0.7301 | 0.7259 | 0.7589 | 0.7649 | 0.7651 | 46 | | |**mAP**|**0.696** |**0.726** |**0.738** |**0.766** |**0.7709**|**0.7711**| 47 | --------------------------------------------------------------------------------