├── .gitignore
├── LICENSE
├── README.md
├── ThirdPartyNOtices.txt
├── experiments
│   └── relation_rcnn
│       ├── cfgs
│       │   ├── resnet_v1_101_coco_trainvalminus_rcnn_dcn_end2end_8epoch.yaml
│       │   ├── resnet_v1_101_coco_trainvalminus_rcnn_dcn_end2end_relation_8epoch.yaml
│       │   ├── resnet_v1_101_coco_trainvalminus_rcnn_dcn_end2end_relation_learn_nms_8epoch.yaml
│       │   ├── resnet_v1_101_coco_trainvalminus_rcnn_end2end_8epoch.yaml
│       │   ├── resnet_v1_101_coco_trainvalminus_rcnn_end2end_learn_nms_3epoch.yaml
│       │   ├── resnet_v1_101_coco_trainvalminus_rcnn_end2end_relation_8epoch.yaml
│       │   ├── resnet_v1_101_coco_trainvalminus_rcnn_end2end_relation_learn_nms_8epoch.yaml
│       │   ├── resnet_v1_101_coco_trainvalminus_rcnn_fpn_8epoch.yaml
│       │   ├── resnet_v1_101_coco_trainvalminus_rcnn_fpn_relation_8epoch.yaml
│       │   └── resnet_v1_101_coco_trainvalminus_rcnn_fpn_relation_learn_nms_8epoch.yaml
│       ├── rcnn_end2end_train_test.py
│       ├── rcnn_test.py
│       └── rcnn_train_test.py
├── init.sh
├── lib
│   ├── Makefile
│   ├── __init__.py
│   ├── bbox
│   │   ├── .gitignore
│   │   ├── __init__.py
│   │   ├── bbox.pyx
│   │   ├── bbox_regression.py
│   │   ├── bbox_transform.py
│   │   └── setup_linux.py
│   ├── dataset
│   │   ├── __init__.py
│   │   ├── coco.py
│   │   ├── imdb.py
│   │   └── pycocotools
│   │       ├── .gitignore
│   │       ├── UPSTREAM_REV
│   │       ├── __init__.py
│   │       ├── _mask.pyx
│   │       ├── coco.py
│   │       ├── cocoeval.py
│   │       ├── mask.py
│   │       ├── maskApi.c
│   │       ├── maskApi.h
│   │       └── setup_linux.py
│   ├── nms
│   │   ├── __init__.py
│   │   ├── cpu_nms.pyx
│   │   ├── gpu_nms.cu
│   │   ├── gpu_nms.hpp
│   │   ├── gpu_nms.pyx
│   │   ├── nms.py
│   │   ├── nms_kernel.cu
│   │   └── setup_linux.py
│   ├── rpn
│   │   ├── __init__.py
│   │   ├── generate_anchor.py
│   │   └── rpn.py
│   └── utils
│       ├── PrefetchingIter.py
│       ├── __init__.py
│       ├── create_logger.py
│       ├── image.py
│       ├── load_data.py
│       ├── load_model.py
│       ├── lr_scheduler.py
│       └── symbol.py
├── relation_rcnn
│   ├── __init__.py
│   ├── _init_paths.py
│   ├── config
│   │   ├── __init__.py
│   │   └── config.py
│   ├── core
│   │   ├── DataParallelExecutorGroup.py
│   │   ├── __init__.py
│   │   ├── callback.py
│   │   ├── loader.py
│   │   ├── metric.py
│   │   ├── module.py
│   │   ├── rcnn.py
│   │   └── tester.py
│   ├── function
│   │   ├── __init__.py
│   │   ├── test_rcnn.py
│   │   ├── test_rpn.py
│   │   ├── train_rcnn.py
│   │   └── train_rpn.py
│   ├── operator_cxx
│   │   ├── deformable_convolution-inl.h
│   │   ├── deformable_convolution.cc
│   │   ├── deformable_convolution.cu
│   │   ├── deformable_psroi_pooling-inl.h
│   │   ├── deformable_psroi_pooling.cc
│   │   ├── deformable_psroi_pooling.cu
│   │   ├── nn
│   │   │   ├── deformable_im2col.cuh
│   │   │   └── deformable_im2col.h
│   │   ├── psroi_pooling-inl.h
│   │   ├── psroi_pooling.cc
│   │   └── psroi_pooling.cu
│   ├── operator_py
│   │   ├── __init__.py
│   │   ├── box_annotator_ohem.py
│   │   ├── learn_nms.py
│   │   ├── monitor_op.py
│   │   ├── nms_multi_target.py
│   │   ├── proposal.py
│   │   └── proposal_target.py
│   ├── symbols
│   │   ├── __init__.py
│   │   ├── resnet_v1_101_rcnn.py
│   │   ├── resnet_v1_101_rcnn_attention_1024_pairwise_position_multi_head_16.py
│   │   ├── resnet_v1_101_rcnn_attention_1024_pairwise_position_multi_head_16_learn_nms.py
│   │   ├── resnet_v1_101_rcnn_base.py
│   │   ├── resnet_v1_101_rcnn_dcn.py
│   │   ├── resnet_v1_101_rcnn_dcn_attention_1024_pairwise_position_multi_head_16.py
│   │   ├── resnet_v1_101_rcnn_dcn_attention_1024_pairwise_position_multi_head_16_learn_nms.py
│   │   ├── resnet_v1_101_rcnn_fpn.py
│   │   ├── resnet_v1_101_rcnn_fpn_attention_1024_pairwise_position_multi_head_16.py
│   │   ├── resnet_v1_101_rcnn_fpn_attention_1024_pairwise_position_multi_head_16_learn_nms.py
│   │   ├── resnet_v1_101_rcnn_learn_nms_1024_attention_1024_pairwise_position_multi_head_16.py
│   │   └── resnet_v1_101_rcnn_learn_nms_base.py
│   ├── test.py
│   ├── train_end2end.py
│   └── train_rcnn.py
└── requirements.txt

/.gitignore:
--------------------------------------------------------------------------------
# IntelliJ project files
.idea
*.iml
out
gen

### Vim template
[._]*.s[a-w][a-z]
[._]s[a-w][a-z]
*.un~
Session.vim
.netrwhist
*~

### IPythonNotebook template
# Temporary data
.ipynb_checkpoints/

### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
#lib/
#lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover

# Translations
*.mo
*.pot

# Django stuff:
*.log

# Sphinx documentation
docs/_build/

# PyBuilder
target/

*.ipynb
*.params
*.json
.vscode/

lib/dataset/pycocotools/*.cpp
lib/nms/*.c
lib/nms/*.cpp

data
external
output
model

.db
cache
proposal
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2018 Microsoft

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/ThirdPartyNOtices.txt:
--------------------------------------------------------------------------------
************************************************************************

THIRD-PARTY SOFTWARE NOTICES AND INFORMATION

MXNet

Copyright (c) 2015-2016 by Contributors

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.


Fast R-CNN

Copyright (c) Microsoft Corporation

All rights reserved.

MIT License

Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the "Software"),
to deal in the Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.


Faster R-CNN

The MIT License (MIT)

Copyright (c) 2015 Microsoft Corporation

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.


Caffe

COPYRIGHT

All contributions by the University of California:
Copyright (c) 2014, 2015, The Regents of the University of California (Regents)
All rights reserved.

All other contributions:
Copyright (c) 2014, 2015, the respective contributors
All rights reserved.

Caffe uses a shared copyright model: each contributor holds copyright over
their contributions to Caffe. The project versioning records all such
contribution and copyright details. If a contributor wants to further mark
their specific copyright on a particular contribution, they should indicate
their copyright solely in the commit message of the change when it is
committed.

LICENSE

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

CONTRIBUTION AGREEMENT

By contributing to the BVLC/caffe repository through pull-request, comment,
or otherwise, the contributor releases their content to the
license and copyright terms herein.


MS COCO API

Copyright (c) 2014, Piotr Dollar and Tsung-Yi Lin
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

The views and conclusions contained in the software and documentation are those
of the authors and should not be interpreted as representing official policies,
either expressed or implied, of the FreeBSD Project.
--------------------------------------------------------------------------------
/experiments/relation_rcnn/cfgs/resnet_v1_101_coco_trainvalminus_rcnn_dcn_end2end_8epoch.yaml:
--------------------------------------------------------------------------------
---
MXNET_VERSION: "mxnet_v1.1.0"
output_path: "./output/rcnn/coco"
symbol: resnet_v1_101_rcnn_dcn
gpus: '0,1,2,3'
CLASS_AGNOSTIC: true
SCALES:
- 600
- 1000
default:
  frequent: 100
  kvstore: device
network:
  pretrained: "./model/pretrained_model/resnet_v1_101"
  pretrained_epoch: 0
  PIXEL_MEANS:
  - 103.06
  - 115.90
  - 123.15
  IMAGE_STRIDE: 0
  RCNN_FEAT_STRIDE: 16
  RPN_FEAT_STRIDE: 16
  FIXED_PARAMS:
  - conv1
  - bn_conv1
  - res2
  - bn2
  - gamma
  - beta
  FIXED_PARAMS_SHARED:
  - conv1
  - bn_conv1
  - res2
  - bn2
  - res3
  - bn3
  - res4
  - bn4
  - gamma
  - beta
  ANCHOR_RATIOS:
  - 0.5
  - 1
  - 2
  ANCHOR_SCALES:
  - 4
  - 8
  - 16
  - 32
  NUM_ANCHORS: 12
dataset:
  NUM_CLASSES: 81
  dataset: coco
  dataset_path: "./data/coco"
  image_set: train2014+valminusminival2014
  root_path: "./"
  test_image_set: minival2014
  proposal: rpn
TRAIN:
  lr: 0.0005
  lr_step: '5.33'
  warmup: false
  warmup_lr: 0.00005
  # typically we will use 8000 warmup step for single GPU for COCO
  warmup_step: 1000
  begin_epoch: 0
  end_epoch: 8
  model_prefix: 'rcnn_coco'
  # whether resume training
  RESUME: false
  # whether flip image
  FLIP: true
  # whether shuffle image
  SHUFFLE: true
  # whether use OHEM
  ENABLE_OHEM: true
  # size of images for each device, 2 for rcnn, 1 for rpn and e2e
  BATCH_IMAGES: 1
  # e2e changes behavior of anchor loader and metric
  END2END: true
  # group images with similar aspect ratio
  ASPECT_GROUPING: true
  # R-CNN
  # rcnn rois batch size
  BATCH_ROIS: -1
  BATCH_ROIS_OHEM: 128
  # rcnn rois sampling params
  FG_FRACTION: 0.25
  FG_THRESH: 0.5
  BG_THRESH_HI: 0.5
  BG_THRESH_LO: 0
  # rcnn bounding box regression params
  BBOX_REGRESSION_THRESH: 0.5
  BBOX_WEIGHTS:
  - 1.0
  - 1.0
  - 1.0
  - 1.0

  # RPN anchor loader
  # rpn anchors batch size
  RPN_BATCH_SIZE: 256
  # rpn anchors sampling params
  RPN_FG_FRACTION: 0.5
  RPN_POSITIVE_OVERLAP: 0.7
  RPN_NEGATIVE_OVERLAP: 0.3
  RPN_CLOBBER_POSITIVES: false
  # rpn bounding box regression params
  RPN_BBOX_WEIGHTS:
  - 1.0
  - 1.0
  - 1.0
  - 1.0
  RPN_POSITIVE_WEIGHT: -1.0
  # used for end2end training
  # RPN proposal
  CXX_PROPOSAL: false
  RPN_NMS_THRESH: 0.7
  RPN_PRE_NMS_TOP_N: 6000
  RPN_POST_NMS_TOP_N: 300
  RPN_MIN_SIZE: 0
  # approximate bounding box regression
  BBOX_NORMALIZATION_PRECOMPUTED: true
  BBOX_MEANS:
  - 0.0
  - 0.0
  - 0.0
  - 0.0
  BBOX_STDS:
  - 0.1
  - 0.1
  - 0.2
  - 0.2
TEST:
  # use rpn to generate proposal
  HAS_RPN: true
  # size of images for each device
  BATCH_IMAGES: 1
  # RPN proposal
  CXX_PROPOSAL: false
  RPN_NMS_THRESH: 0.7
  RPN_PRE_NMS_TOP_N: 6000
  RPN_POST_NMS_TOP_N: 300
  RPN_MIN_SIZE: 0
  # RPN generate proposal
  PROPOSAL_NMS_THRESH: 0.7
  PROPOSAL_PRE_NMS_TOP_N: 20000
  PROPOSAL_POST_NMS_TOP_N: 2000
  PROPOSAL_MIN_SIZE: 0
  # RCNN nms
  NMS: 0.6
  SOFTNMS: true
  test_epoch: 8
  max_per_image: 100

--------------------------------------------------------------------------------
/experiments/relation_rcnn/cfgs/resnet_v1_101_coco_trainvalminus_rcnn_dcn_end2end_relation_8epoch.yaml:
--------------------------------------------------------------------------------
---
MXNET_VERSION: "mxnet_v1.1.0"
output_path: "./output/rcnn/coco"
symbol: resnet_v1_101_rcnn_dcn_attention_1024_pairwise_position_multi_head_16
gpus: '0,1,2,3'
CLASS_AGNOSTIC: true
SCALES:
- 600
- 1000
default:
  frequent: 100
  kvstore: device
network:
  pretrained: "./model/pretrained_model/resnet_v1_101"
  pretrained_epoch: 0
  PIXEL_MEANS:
  - 103.06
  - 115.90
  - 123.15
  IMAGE_STRIDE: 0
  RCNN_FEAT_STRIDE: 16
  RPN_FEAT_STRIDE: 16
  FIXED_PARAMS:
  - conv1
  - bn_conv1
  - res2
  - bn2
  - gamma
  - beta
  FIXED_PARAMS_SHARED:
  - conv1
  - bn_conv1
  - res2
  - bn2
  - res3
  - bn3
  - res4
  - bn4
  - gamma
  - beta
  ANCHOR_RATIOS:
  - 0.5
  - 1
  - 2
  ANCHOR_SCALES:
  - 4
  - 8
  - 16
  - 32
  NUM_ANCHORS: 12
dataset:
  NUM_CLASSES: 81
  dataset: coco
  dataset_path: "./data/coco"
  image_set: train2014+valminusminival2014
  root_path: "./"
  test_image_set: minival2014
  proposal: rpn
TRAIN:
  lr: 0.0005
  lr_step: '5.33'
  warmup: false
  warmup_lr: 0.00005
  # typically we will use 8000 warmup step for single GPU for COCO
  warmup_step: 1000
  begin_epoch: 0
  end_epoch: 8
  model_prefix: 'rcnn_coco'
  # whether resume training
  RESUME: false
  # whether flip image
  FLIP: true
  # whether shuffle image
  SHUFFLE: true
  # whether use OHEM
  ENABLE_OHEM: true
  # size of images for each device, 2 for rcnn, 1 for rpn and e2e
  BATCH_IMAGES: 1
  # e2e changes behavior of anchor loader and metric
  END2END: true
  # group images with similar aspect ratio
  ASPECT_GROUPING: true
  # R-CNN
  # rcnn rois batch size
  BATCH_ROIS: -1
  BATCH_ROIS_OHEM: 128
  # rcnn rois sampling params
  FG_FRACTION: 0.25
  FG_THRESH: 0.5
  BG_THRESH_HI: 0.5
  BG_THRESH_LO: 0
  # rcnn bounding box regression params
  BBOX_REGRESSION_THRESH: 0.5
  BBOX_WEIGHTS:
  - 1.0
  - 1.0
  - 1.0
  - 1.0

  # RPN anchor loader
  # rpn anchors batch size
  RPN_BATCH_SIZE: 256
  # rpn anchors sampling params
  RPN_FG_FRACTION: 0.5
  RPN_POSITIVE_OVERLAP: 0.7
  RPN_NEGATIVE_OVERLAP: 0.3
  RPN_CLOBBER_POSITIVES: false
  # rpn bounding box regression params
  RPN_BBOX_WEIGHTS:
  - 1.0
  - 1.0
  - 1.0
  - 1.0
  RPN_POSITIVE_WEIGHT: -1.0
  # used for end2end training
  # RPN proposal
  CXX_PROPOSAL: false
  RPN_NMS_THRESH: 0.7
  RPN_PRE_NMS_TOP_N: 6000
  RPN_POST_NMS_TOP_N: 300
  RPN_MIN_SIZE: 0
  # approximate bounding box regression
  BBOX_NORMALIZATION_PRECOMPUTED: true
  BBOX_MEANS:
  - 0.0
  - 0.0
  - 0.0
  - 0.0
  BBOX_STDS:
  - 0.1
  - 0.1
  - 0.2
  - 0.2
TEST:
  # use rpn to generate proposal
  HAS_RPN: true
  # size of images for each device
  BATCH_IMAGES: 1
  # RPN proposal
  CXX_PROPOSAL: false
  RPN_NMS_THRESH: 0.7
  RPN_PRE_NMS_TOP_N: 6000
  RPN_POST_NMS_TOP_N: 300
  RPN_MIN_SIZE: 0
  # RPN generate proposal
  PROPOSAL_NMS_THRESH: 0.7
  PROPOSAL_PRE_NMS_TOP_N: 20000
  PROPOSAL_POST_NMS_TOP_N: 2000
  PROPOSAL_MIN_SIZE: 0
  # RCNN nms
  NMS: 0.3
  test_epoch: 8
  max_per_image: 100

--------------------------------------------------------------------------------
/experiments/relation_rcnn/cfgs/resnet_v1_101_coco_trainvalminus_rcnn_dcn_end2end_relation_learn_nms_8epoch.yaml:
--------------------------------------------------------------------------------
---
MXNET_VERSION: "mxnet_v1.1.0"
output_path: "./output/rcnn/coco"
symbol: resnet_v1_101_rcnn_dcn_attention_1024_pairwise_position_multi_head_16_learn_nms
gpus: '4,5,6,7'
CLASS_AGNOSTIC: true
SCALES:
- 600
- 1000
default:
  frequent: 100
  kvstore: device
network:
  pretrained: "./model/pretrained_model/resnet_v1_101"
  pretrained_epoch: 0
  PIXEL_MEANS:
  - 103.06
  - 115.90
  - 123.15
  IMAGE_STRIDE: 0
  RCNN_FEAT_STRIDE: 16
  RPN_FEAT_STRIDE: 16
  FIXED_PARAMS:
  - conv1
  - bn_conv1
  - res2
  - bn2
  - gamma
  - beta
  FIXED_PARAMS_SHARED:
  - conv1
  - bn_conv1
  - res2
  - bn2
  - res3
  - bn3
  - res4
  - bn4
  - gamma
  - beta
  ANCHOR_RATIOS:
  - 0.5
  - 1
  - 2
  ANCHOR_SCALES:
  - 4
  - 8
  - 16
  - 32
  NUM_ANCHORS: 12
  NMS_TARGET_THRESH: '0.5, 0.6, 0.7, 0.8, 0.9'
dataset:
  NUM_CLASSES: 81
  dataset: coco
  dataset_path: "./data/coco"
  image_set: train2014+valminusminival2014
  root_path: "./"
  test_image_set: minival2014
  proposal: rpn
TRAIN:
  lr: 0.0005
  lr_step: '5.33'
  warmup: false
  warmup_lr: 0.00005
  # typically we will use 8000 warmup step for single GPU for COCO
  warmup_step: 1000
  begin_epoch: 0
  end_epoch: 8
  model_prefix: 'rcnn_coco'
  # whether resume training
  RESUME: false
  # whether flip image
  FLIP: true
  # whether shuffle image
  SHUFFLE: true
  # whether use OHEM
  ENABLE_OHEM: true
  # size of images for each device, 2 for rcnn, 1 for rpn and e2e
  BATCH_IMAGES: 1
  # e2e changes behavior of anchor loader and metric
  END2END: true
  # group images with similar aspect ratio
  ASPECT_GROUPING: true
  # R-CNN
  # rcnn rois batch size
  BATCH_ROIS: -1
  BATCH_ROIS_OHEM: 128
  # rcnn rois sampling params
  FG_FRACTION: 0.25
  FG_THRESH: 0.5
  BG_THRESH_HI: 0.5
  BG_THRESH_LO: 0
  # rcnn bounding box regression params
  BBOX_REGRESSION_THRESH: 0.5
  BBOX_WEIGHTS:
  - 1.0
  - 1.0
  - 1.0
  - 1.0

  # RPN anchor loader
  # rpn anchors batch size
  RPN_BATCH_SIZE: 256
  # rpn anchors sampling params
  RPN_FG_FRACTION: 0.5
  RPN_POSITIVE_OVERLAP: 0.7
  RPN_NEGATIVE_OVERLAP: 0.3
  RPN_CLOBBER_POSITIVES: false
  # rpn bounding box regression params
  RPN_BBOX_WEIGHTS:
  - 1.0
  - 1.0
  - 1.0
  - 1.0
  RPN_POSITIVE_WEIGHT: -1.0
  # used for end2end training
  # RPN proposal
  CXX_PROPOSAL: false
  RPN_NMS_THRESH: 0.7
  RPN_PRE_NMS_TOP_N: 6000
  RPN_POST_NMS_TOP_N: 300
  RPN_MIN_SIZE: 0
  # approximate bounding box regression
  BBOX_NORMALIZATION_PRECOMPUTED: true
  BBOX_MEANS:
  - 0.0
  - 0.0
  - 0.0
  - 0.0
  BBOX_STDS:
  - 0.1
  - 0.1
  - 0.2
  - 0.2
  LEARN_NMS: true
  FIRST_N: 100
  JOINT_TRAINING: true
TEST:
  # use rpn to generate proposal
  HAS_RPN: true
  # size of images for each device
  BATCH_IMAGES: 1
  # RPN proposal
  CXX_PROPOSAL: false
  RPN_NMS_THRESH: 0.7
  RPN_PRE_NMS_TOP_N: 6000
  RPN_POST_NMS_TOP_N: 300
  RPN_MIN_SIZE: 0
  # RPN generate proposal
  PROPOSAL_NMS_THRESH: 0.7
  PROPOSAL_PRE_NMS_TOP_N: 20000
  PROPOSAL_POST_NMS_TOP_N: 2000
  PROPOSAL_MIN_SIZE: 0
  # RCNN nms
  NMS: 10.0
  SOFTNMS: true
  test_epoch: 8
  max_per_image: 100
  # Learn nms
  LEARN_NMS: true
  LEARN_NMS_CLASS_SCORE_TH: 0.01
  FIRST_N: 100

--------------------------------------------------------------------------------
/experiments/relation_rcnn/cfgs/resnet_v1_101_coco_trainvalminus_rcnn_end2end_8epoch.yaml:
--------------------------------------------------------------------------------
---
MXNET_VERSION: "mxnet_v1.1.0"
output_path: "./output/rcnn/coco"
symbol: resnet_v1_101_rcnn
gpus: '0,1,2,3'
CLASS_AGNOSTIC: true
SCALES:
- 600
- 1000
default:
  frequent: 100
  kvstore: device
network:
  pretrained: "./model/pretrained_model/resnet_v1_101"
  pretrained_epoch: 0
  PIXEL_MEANS:
  - 103.06
  - 115.90
  - 123.15
  IMAGE_STRIDE: 0
  RCNN_FEAT_STRIDE: 16
  RPN_FEAT_STRIDE: 16
  FIXED_PARAMS:
  - conv1
  - bn_conv1
  - res2
  - bn2
  - gamma
  - beta
  FIXED_PARAMS_SHARED:
  - conv1
  - bn_conv1
  - res2
  - bn2
  - res3
  - bn3
  - res4
  - bn4
  - gamma
  - beta
  ANCHOR_RATIOS:
  - 0.5
  - 1
  - 2
  ANCHOR_SCALES:
  - 4
  - 8
  - 16
  - 32
  NUM_ANCHORS: 12
dataset:
  NUM_CLASSES: 81
  dataset: coco
  dataset_path: "./data/coco"
  image_set: train2014+valminusminival2014
  #image_set: minival2014
  root_path: "./"
  test_image_set: minival2014
  proposal: rpn
TRAIN:
  lr: 0.0005
  lr_step: '5.33'
  warmup: false
  warmup_lr: 0.00005
  # typically we will use 8000 warmup step for single GPU for COCO
  warmup_step: 1000
  begin_epoch: 0
  end_epoch: 8
  model_prefix: 'rcnn_coco'
  # whether resume training
  RESUME: false
  # whether flip image
  FLIP: true
  # whether shuffle image
  SHUFFLE: true
  # whether use OHEM
  ENABLE_OHEM: true
  # size of images for each device, 2 for rcnn, 1 for rpn and e2e
  BATCH_IMAGES: 1
  # e2e changes behavior of anchor loader and metric
  END2END: true
  # group images with similar aspect ratio
  ASPECT_GROUPING: true
  # R-CNN
  # rcnn rois batch size
  BATCH_ROIS: -1
  BATCH_ROIS_OHEM: 128
  # rcnn rois sampling params
  FG_FRACTION: 0.25
  FG_THRESH: 0.5
  BG_THRESH_HI: 0.5
  BG_THRESH_LO: 0
  # rcnn bounding box regression params
  BBOX_REGRESSION_THRESH: 0.5
  BBOX_WEIGHTS:
  - 1.0
  - 1.0
  - 1.0
  - 1.0

  # RPN anchor loader
  # rpn anchors batch size
  RPN_BATCH_SIZE: 256
  # rpn anchors sampling params
  RPN_FG_FRACTION: 0.5
  RPN_POSITIVE_OVERLAP: 0.7
  RPN_NEGATIVE_OVERLAP: 0.3
  RPN_CLOBBER_POSITIVES: false
  # rpn bounding box regression params
  RPN_BBOX_WEIGHTS:
  - 1.0
  - 1.0
  - 1.0
  - 1.0
  RPN_POSITIVE_WEIGHT: -1.0
  # used for end2end training
  # RPN proposal
  CXX_PROPOSAL: false
  RPN_NMS_THRESH: 0.7
  RPN_PRE_NMS_TOP_N: 6000
  RPN_POST_NMS_TOP_N: 300
  RPN_MIN_SIZE: 0
  # approximate bounding box regression
  BBOX_NORMALIZATION_PRECOMPUTED: true
  BBOX_MEANS:
  - 0.0
  - 0.0
  - 0.0
  - 0.0
  BBOX_STDS:
  - 0.1
  - 0.1
  - 0.2
  - 0.2
TEST:
  # use rpn to generate proposal
  HAS_RPN: true
  # size of images for each device
  BATCH_IMAGES: 1
  # RPN proposal
  CXX_PROPOSAL: false
  RPN_NMS_THRESH: 0.7
  RPN_PRE_NMS_TOP_N: 6000
  RPN_POST_NMS_TOP_N: 300
  RPN_MIN_SIZE: 0
  # RPN generate proposal
  PROPOSAL_NMS_THRESH: 0.7
  PROPOSAL_PRE_NMS_TOP_N: 20000
  PROPOSAL_POST_NMS_TOP_N: 2000
  PROPOSAL_MIN_SIZE: 0
  # RCNN nms
  NMS: 0.6
  SOFTNMS: true
  test_epoch: 8
  max_per_image: 100

--------------------------------------------------------------------------------
/experiments/relation_rcnn/cfgs/resnet_v1_101_coco_trainvalminus_rcnn_end2end_learn_nms_3epoch.yaml:
--------------------------------------------------------------------------------
---
MXNET_VERSION: "mxnet_v1.1.0"
output_path: "./output/rcnn/coco"
symbol: resnet_v1_101_rcnn_learn_nms_1024_attention_1024_pairwise_position_multi_head_16
gpus: '0,1,2,3'
CLASS_AGNOSTIC: true
SCALES:
- 600
- 1000
default:
  frequent: 100
  kvstore: device
network:
  pretrained: "./model/pretrained_model/coco_resnet_v1_101_rcnn"
  pretrained_epoch: 8
  PIXEL_MEANS:
  - 103.06
  - 115.90
  - 123.15
  IMAGE_STRIDE: 0
  RCNN_FEAT_STRIDE: 16
  RPN_FEAT_STRIDE: 16
  FIXED_PARAMS:
  - conv1
  - bn_conv1
  - res2
  - bn2
  - res3
  - bn3
  - res4
  - bn4
  - gamma
  - beta
  - rpn_conv_3x3
  - res5
  - bn5
  - fc_new
  - conv_new_1
  - cls_score
  - bbox_pred
  FIXED_PARAMS_SHARED:
  - conv1
  - bn_conv1
  - res2
  - bn2
  - res3
  - bn3
  - res4
  - bn4
  - gamma
  - beta
  ANCHOR_RATIOS:
  - 0.5
  - 1
  - 2
  ANCHOR_SCALES:
  - 4
  - 8
  - 16
  - 32
  NUM_ANCHORS: 12
  NMS_TARGET_THRESH: '0.5, 0.6, 0.7, 0.8, 0.9'
dataset:
  NUM_CLASSES: 81
  dataset: coco
  dataset_path: "./data/coco"
  image_set: train2014+valminusminival2014
  root_path: "./"
  test_image_set: minival2014
  proposal: rpn
TRAIN:
  lr: 0.0005
  lr_step: '2.0'
  warmup: false
  warmup_lr: 0.00005
  # typically we will use 8000 warmup step for single GPU for COCO
  warmup_step: 1000
  begin_epoch: 0
  end_epoch: 3
  model_prefix: 'rcnn_coco'
  # whether resume training
  RESUME: false
  # whether flip image
  FLIP: true
  # whether shuffle image
  SHUFFLE: true
  # whether use OHEM
  ENABLE_OHEM: false
  # size of images for each device, 2 for rcnn, 1 for rpn and e2e
  BATCH_IMAGES: 1
  # e2e changes behavior of anchor loader and metric
  END2END: true
  # group images with similar aspect ratio
  ASPECT_GROUPING: true
  # R-CNN
  # rcnn rois batch size
  BATCH_ROIS: -1
  BATCH_ROIS_OHEM: 128
  # rcnn rois sampling params
  FG_FRACTION: 0.25
  FG_THRESH: 0.5
  BG_THRESH_HI: 0.5
  BG_THRESH_LO: 0
  # rcnn bounding box regression params
  BBOX_REGRESSION_THRESH: 0.5
  BBOX_WEIGHTS:
  - 1.0
  - 1.0
  - 1.0
  - 1.0

  # RPN anchor loader
  # rpn anchors batch size
  RPN_BATCH_SIZE: 256
  # rpn anchors sampling params
  RPN_FG_FRACTION: 0.5
  RPN_POSITIVE_OVERLAP: 0.7
  RPN_NEGATIVE_OVERLAP: 0.3
  RPN_CLOBBER_POSITIVES: false
  # rpn bounding box regression params
  RPN_BBOX_WEIGHTS:
  - 1.0
  - 1.0
  - 1.0
  - 1.0
  RPN_POSITIVE_WEIGHT: -1.0
  # used for end2end training
  # RPN proposal
  CXX_PROPOSAL: false
  RPN_NMS_THRESH: 0.7
  RPN_PRE_NMS_TOP_N: 6000
  RPN_POST_NMS_TOP_N: 300
  RPN_MIN_SIZE: 0
  # approximate bounding box regression
  BBOX_NORMALIZATION_PRECOMPUTED: true
  BBOX_MEANS:
  - 0.0
  - 0.0
  - 0.0
  - 0.0
  BBOX_STDS:
  - 0.1
  - 0.1
  - 0.2
  - 0.2
  LEARN_NMS: true
  FIRST_N: 100
TEST:
  # use rpn to generate proposal
  HAS_RPN: true
  # size of images for each device
  BATCH_IMAGES: 1
  # RPN proposal
  CXX_PROPOSAL: false
  RPN_NMS_THRESH: 0.7
  RPN_PRE_NMS_TOP_N: 6000
  RPN_POST_NMS_TOP_N: 300
  RPN_MIN_SIZE: 0
  # RPN generate proposal
  PROPOSAL_NMS_THRESH: 0.7
  PROPOSAL_PRE_NMS_TOP_N: 20000
  PROPOSAL_POST_NMS_TOP_N: 2000
  PROPOSAL_MIN_SIZE: 0
  # RCNN nms
  NMS: 10.0
  SOFTNMS: true
  test_epoch: 3
  max_per_image: 100
  # Learn nms
  LEARN_NMS: true
  LEARN_NMS_CLASS_SCORE_TH: 0.01
  FIRST_N: 100
--------------------------------------------------------------------------------
/experiments/relation_rcnn/cfgs/resnet_v1_101_coco_trainvalminus_rcnn_end2end_relation_8epoch.yaml:
--------------------------------------------------------------------------------
---
MXNET_VERSION: "mxnet_v1.1.0"
output_path: "./output/rcnn/coco"
symbol: resnet_v1_101_rcnn_attention_1024_pairwise_position_multi_head_16
gpus: '4,5,6,7'
CLASS_AGNOSTIC: true
SCALES:
- 600
- 1000
default:
  frequent: 100
  kvstore: device
network:
  pretrained: "./model/pretrained_model/resnet_v1_101"
  pretrained_epoch: 0
  PIXEL_MEANS:
  - 103.06
  - 115.90
  - 123.15
  IMAGE_STRIDE: 0
  RCNN_FEAT_STRIDE: 16
  RPN_FEAT_STRIDE: 16
  FIXED_PARAMS:
  - conv1
  - bn_conv1
  - res2
  - bn2
  - gamma
  - beta
  FIXED_PARAMS_SHARED:
  - conv1
  - bn_conv1
  - res2
  - bn2
  - res3
  - bn3
  - res4
  - bn4
  - gamma
  - beta
  ANCHOR_RATIOS:
  - 0.5
  - 1
  - 2
  ANCHOR_SCALES:
  - 4
  - 8
  - 16
  - 32
  NUM_ANCHORS: 12
dataset:
  NUM_CLASSES: 81
  dataset: coco
  dataset_path: "./data/coco"
  image_set: train2014+valminusminival2014
  root_path: "./"
  test_image_set: minival2014
  proposal: rpn
TRAIN:
  lr: 0.0005
  lr_step: '5.33'
  warmup: false
  warmup_lr: 0.00005
  # typically we will use 8000 warmup step for single GPU for COCO
  warmup_step: 1000
  begin_epoch: 0
  end_epoch: 8
  model_prefix: 'rcnn_coco'
  # whether resume training
  RESUME: false
  # whether flip image
  FLIP: true
  # whether shuffle image
  SHUFFLE: true
  # whether use OHEM
  ENABLE_OHEM: true
  # size of images for each device, 2 for rcnn, 1 for rpn and e2e
  BATCH_IMAGES: 1
  # e2e changes behavior of anchor loader and metric
  END2END: true
  # group images with similar aspect ratio
  ASPECT_GROUPING: true
  # R-CNN
  # rcnn rois batch size
  BATCH_ROIS: -1
  BATCH_ROIS_OHEM: 128
  # rcnn rois sampling params
  FG_FRACTION: 0.25
  FG_THRESH: 0.5
  BG_THRESH_HI: 0.5
  BG_THRESH_LO: 0
  # rcnn bounding box regression params
  BBOX_REGRESSION_THRESH: 0.5
  BBOX_WEIGHTS:
  - 1.0
  - 1.0
  - 1.0
  - 1.0

  # RPN anchor loader
  # rpn anchors batch size
  RPN_BATCH_SIZE: 256
  # rpn anchors sampling params
  RPN_FG_FRACTION: 0.5
  RPN_POSITIVE_OVERLAP: 0.7
  RPN_NEGATIVE_OVERLAP: 0.3
  RPN_CLOBBER_POSITIVES: false
  # rpn bounding box regression params
  RPN_BBOX_WEIGHTS:
  - 1.0
  - 1.0
  - 1.0
  - 1.0
  RPN_POSITIVE_WEIGHT: -1.0
  # used for end2end training
  # RPN proposal
  CXX_PROPOSAL: false
  RPN_NMS_THRESH: 0.7
  RPN_PRE_NMS_TOP_N: 6000
  RPN_POST_NMS_TOP_N: 300
  RPN_MIN_SIZE: 0
  # approximate bounding box regression
  BBOX_NORMALIZATION_PRECOMPUTED: true
  BBOX_MEANS:
  - 0.0
  - 0.0
  - 0.0
  - 0.0
  BBOX_STDS:
  - 0.1
  - 0.1
  - 0.2
  - 0.2
TEST:
  # use rpn to generate proposal
  HAS_RPN: true
  # size of images for each device
  BATCH_IMAGES: 1
  # RPN proposal
  CXX_PROPOSAL: false
  RPN_NMS_THRESH: 0.7
  RPN_PRE_NMS_TOP_N: 6000
  RPN_POST_NMS_TOP_N: 300
  RPN_MIN_SIZE: 0
  # RPN generate proposal
  PROPOSAL_NMS_THRESH: 0.7
  PROPOSAL_PRE_NMS_TOP_N: 20000
  PROPOSAL_POST_NMS_TOP_N: 2000
  PROPOSAL_MIN_SIZE: 0
  # RCNN nms
  NMS: 0.6
  SOFTNMS: true
  test_epoch: 7
  max_per_image: 100

--------------------------------------------------------------------------------
/experiments/relation_rcnn/cfgs/resnet_v1_101_coco_trainvalminus_rcnn_end2end_relation_learn_nms_8epoch.yaml:
--------------------------------------------------------------------------------
---
MXNET_VERSION: "mxnet_v1.1.0"
output_path: "./output/rcnn/coco"
symbol: resnet_v1_101_rcnn_attention_1024_pairwise_position_multi_head_16_learn_nms
gpus: '0,1,2,3'
CLASS_AGNOSTIC: true
SCALES:
- 600
- 1000
default:
  frequent: 100
  kvstore: device
network:
  pretrained: "./model/pretrained_model/resnet_v1_101"
  pretrained_epoch: 0
  PIXEL_MEANS:
  - 103.06
  - 115.90
  - 123.15
  IMAGE_STRIDE: 0
  RCNN_FEAT_STRIDE: 16
  RPN_FEAT_STRIDE: 16
  FIXED_PARAMS:
  - conv1
  - bn_conv1
  - res2
  - bn2
  - gamma
  - beta
  FIXED_PARAMS_SHARED:
  - conv1
  - bn_conv1
  - res2
  - bn2
  - res3
  - bn3
  - res4
  - bn4
  - gamma
  - beta
  ANCHOR_RATIOS:
  - 0.5
  - 1
  - 2
  ANCHOR_SCALES:
  - 4
  - 8
  - 16
  - 32
  NUM_ANCHORS: 12
  NMS_TARGET_THRESH: '0.5, 0.6, 0.7, 0.8, 0.9'
dataset:
  NUM_CLASSES: 81
  dataset: coco
  dataset_path: "./data/coco"
  image_set: train2014+valminusminival2014
  #image_set: minival2014
  root_path: "./"
  test_image_set: minival2014
  proposal: rpn
TRAIN:
  lr: 0.0005
  lr_step: '5.33'
  warmup: false
  warmup_lr: 0.00005
  # typically we will use 8000 warmup step for single GPU for COCO
  warmup_step: 1000
  begin_epoch: 0
  end_epoch: 8
  model_prefix: 'rcnn_coco'
  # whether resume training
  RESUME: false
  # whether flip image
  FLIP: true
  # whether shuffle image
  SHUFFLE: true
  # whether use OHEM
  ENABLE_OHEM: true
  # size of images for each device, 2 for rcnn, 1 for rpn and e2e
  BATCH_IMAGES: 1
  # e2e changes behavior of anchor loader and metric
  END2END: true
  # group images with similar aspect ratio
  ASPECT_GROUPING: true
  # R-CNN
  # rcnn rois batch size
  BATCH_ROIS: -1
  BATCH_ROIS_OHEM: 128
  # rcnn rois sampling params
  FG_FRACTION: 0.25
  FG_THRESH: 0.5
  BG_THRESH_HI: 0.5
  BG_THRESH_LO: 0
  # rcnn bounding box regression params
  BBOX_REGRESSION_THRESH: 0.5
  BBOX_WEIGHTS:
  - 1.0
  - 1.0
  - 1.0
  - 1.0

  # RPN anchor loader
  # rpn anchors batch size
  RPN_BATCH_SIZE: 256
  # rpn anchors sampling params
  RPN_FG_FRACTION: 0.5
  RPN_POSITIVE_OVERLAP: 0.7
  RPN_NEGATIVE_OVERLAP: 0.3
  RPN_CLOBBER_POSITIVES: false
  # rpn bounding box regression params
  RPN_BBOX_WEIGHTS:
  - 1.0
  - 1.0
  - 1.0
  - 1.0
  RPN_POSITIVE_WEIGHT: -1.0
  # used for end2end training
  # RPN proposal
  CXX_PROPOSAL: false
  RPN_NMS_THRESH: 0.7
  RPN_PRE_NMS_TOP_N: 6000
  RPN_POST_NMS_TOP_N: 300
  RPN_MIN_SIZE: 0
  # approximate bounding box regression
  BBOX_NORMALIZATION_PRECOMPUTED: true
  BBOX_MEANS:
  - 0.0
  - 0.0
  - 0.0
  - 0.0
  BBOX_STDS:
  - 0.1
  - 0.1
  - 0.2
  - 0.2
  LEARN_NMS: true
  FIRST_N: 100
  JOINT_TRAINING: true
TEST:
  # use rpn to generate proposal
  HAS_RPN: true
  # size of images for each device
  BATCH_IMAGES: 1
  # RPN proposal
  CXX_PROPOSAL: false
  RPN_NMS_THRESH: 0.7
  RPN_PRE_NMS_TOP_N: 6000
  RPN_POST_NMS_TOP_N: 300
  RPN_MIN_SIZE: 0
  # RPN generate proposal
  PROPOSAL_NMS_THRESH: 0.7
  PROPOSAL_PRE_NMS_TOP_N: 20000
  PROPOSAL_POST_NMS_TOP_N: 2000
  PROPOSAL_MIN_SIZE: 0
  # RCNN nms
  NMS: 10.0
  SOFTNMS: true
  test_epoch: 8
  max_per_image: 100
  # Learn nms
  LEARN_NMS: true
  LEARN_NMS_CLASS_SCORE_TH: 0.01
  FIRST_N: 100

--------------------------------------------------------------------------------
/experiments/relation_rcnn/cfgs/resnet_v1_101_coco_trainvalminus_rcnn_fpn_8epoch.yaml:
--------------------------------------------------------------------------------
---
MXNET_VERSION: "mxnet_v1.1.0"
output_path: "./output/rcnn/coco"
symbol: resnet_v1_101_rcnn_fpn
gpus: '4,5,6,7'
CLASS_AGNOSTIC: true
SCALES:
- 800
- 1000
default:
  frequent: 100
  kvstore: device
network:
  pretrained: "./model/pretrained_model/resnet_v1_101"
  pretrained_epoch: 0
  PIXEL_MEANS:
  - 103.06
  - 115.90
  - 123.15
  IMAGE_STRIDE: 32
  RCNN_FEAT_STRIDE: 16
  RPN_FEAT_STRIDE: 16
  FIXED_PARAMS:
  - conv1
  - bn_conv1
  - res2
  - bn2
  - gamma
  - beta
  FIXED_PARAMS_SHARED:
  - conv1
  - bn_conv1
  - res2
  - bn2
  - res3
  - bn3
  - res4
  - bn4
  - gamma
  - beta
  ANCHOR_RATIOS:
  - 0.5
  - 1
  - 2
  ANCHOR_SCALES:
  - 4
  - 8
  - 16
  - 32
  NUM_ANCHORS: 12
  ROIDispatch: true
  USE_NONGT_INDEX: false
dataset:
  NUM_CLASSES: 81
  dataset: coco
  dataset_path: "./data/coco"
  image_set: train2014+valminusminival2014
  # image_set: minival2014
  root_path: "./"
  test_image_set: minival2014
  proposal: rpn
  proposal_cache: "./proposal/resnet_v1_101_fpn"
TRAIN:
  lr: 0.00125
  lr_step: '5.33'
  warmup: false
  warmup_lr: 0.000125
  # typically we will use 8000 warmup step for single GPU for COCO
  warmup_step: 1000
  begin_epoch: 0
  end_epoch: 8
  model_prefix: 'rcnn_fpn_coco'
  # whether resume training
  RESUME: false
  # whether flip image
  FLIP: true
  # whether shuffle image
  SHUFFLE: true
  # whether use OHEM
  ENABLE_OHEM: true
  # size of images for each device, 2 for rcnn, 1 for rpn and e2e
  BATCH_IMAGES: 1
  # e2e changes behavior of anchor loader and metric
  END2END: false
  # group images with similar aspect ratio
  ASPECT_GROUPING: true
  # R-CNN
  # rcnn rois batch size
  TOP_ROIS: 1000
  BATCH_ROIS: -1
  BATCH_ROIS_OHEM: 512
  # rcnn rois sampling params
  FG_FRACTION: 0.25
  FG_THRESH: 0.5
  BG_THRESH_HI: 0.5
  BG_THRESH_LO: 0
  # rcnn bounding box regression params
  BBOX_REGRESSION_THRESH: 0.5
  BBOX_WEIGHTS:
  - 1.0
  - 1.0
  - 1.0
  - 1.0

  # RPN anchor loader
  # rpn anchors batch size
  RPN_BATCH_SIZE: 256
  # rpn anchors sampling params
  RPN_FG_FRACTION: 0.5
  RPN_POSITIVE_OVERLAP: 0.7
  RPN_NEGATIVE_OVERLAP: 0.3
  RPN_CLOBBER_POSITIVES: false
  # rpn bounding box regression params
  RPN_BBOX_WEIGHTS:
  - 1.0
  - 1.0
  - 1.0
  - 1.0
  RPN_POSITIVE_WEIGHT: -1.0
  # used for end2end training
  # RPN proposal
  CXX_PROPOSAL: false
  RPN_NMS_THRESH: 0.7
  RPN_PRE_NMS_TOP_N: 6000
  RPN_POST_NMS_TOP_N: 300
  RPN_MIN_SIZE: 0
  # approximate bounding box regression
  BBOX_NORMALIZATION_PRECOMPUTED: true
  BBOX_MEANS:
  - 0.0
  - 0.0
  - 0.0
  - 0.0
  BBOX_STDS:
  - 0.1
  - 0.1
  - 0.2
  - 0.2
TEST:
  # use rpn to generate proposal
  HAS_RPN: false
  # size of images for each device
  BATCH_IMAGES: 1
  # RPN proposal
  CXX_PROPOSAL: false
  RPN_NMS_THRESH: 0.7
  RPN_PRE_NMS_TOP_N: 6000
  RPN_POST_NMS_TOP_N: 300
  RPN_MIN_SIZE: 0
  # RPN generate proposal
  PROPOSAL_NMS_THRESH: 0.7
  PROPOSAL_PRE_NMS_TOP_N: 20000
  PROPOSAL_POST_NMS_TOP_N: 2000
  PROPOSAL_MIN_SIZE: 0
  # RCNN nms
  NMS: 0.6
  SOFTNMS: true
  test_epoch: 8
  max_per_image: 100
  TOP_ROIS: 1000

--------------------------------------------------------------------------------
/experiments/relation_rcnn/cfgs/resnet_v1_101_coco_trainvalminus_rcnn_fpn_relation_8epoch.yaml:
--------------------------------------------------------------------------------
---
MXNET_VERSION: "mxnet_v1.1.0"
output_path: "./output/rcnn/coco"
symbol: resnet_v1_101_rcnn_fpn_attention_1024_pairwise_position_multi_head_16
gpus: '0, 1, 2, 3'
CLASS_AGNOSTIC: true
SCALES:
- 800
- 1000
default:
  frequent: 100
  kvstore: device
network:
  pretrained: "./model/pretrained_model/resnet_v1_101"
  pretrained_epoch: 0
  PIXEL_MEANS:
  - 103.06
  - 115.90
  - 123.15
  IMAGE_STRIDE: 32
  RCNN_FEAT_STRIDE: 16
  RPN_FEAT_STRIDE: 16
  FIXED_PARAMS:
  - conv1
  - bn_conv1
  - res2
  - bn2
  - gamma
  - beta
  FIXED_PARAMS_SHARED:
  - conv1
  - bn_conv1
  - res2
  - bn2
  - res3
  - bn3
  - res4
  - bn4
  - gamma
  - beta
  ANCHOR_RATIOS:
  - 0.5
  - 1
  - 2
  ANCHOR_SCALES:
  - 4
  - 8
  - 16
  - 32
  NUM_ANCHORS: 12
  ROIDispatch: true
  USE_NONGT_INDEX: true
dataset:
  NUM_CLASSES: 81
  dataset: coco
  dataset_path: "./data/coco"
  image_set: train2014+valminusminival2014
  # image_set: minival2014
  root_path: "./"
  test_image_set: minival2014
  proposal: rpn
  proposal_cache: "./proposal/resnet_v1_101_fpn"
TRAIN:
  lr: 0.00125
  lr_step: '5.33'
  warmup: false
  warmup_lr: 0.000125
  # typically we will use 8000 warmup step for single GPU for COCO
  warmup_step: 1000
  begin_epoch: 0
  end_epoch: 8
  model_prefix: 'rcnn_fpn_coco'
  # whether resume training
  RESUME: false
  # whether flip image
  FLIP: true
  # whether shuffle image
  SHUFFLE: true
  # whether use OHEM
  ENABLE_OHEM: true
  # size of images for each device, 2 for rcnn, 1 for rpn and e2e
  BATCH_IMAGES: 1
  # e2e changes behavior of anchor loader and metric
  END2END: false
  # group images with similar aspect ratio
  ASPECT_GROUPING: true
  # R-CNN
  # rcnn rois batch size
  TOP_ROIS: 1000
  BATCH_ROIS: -1
  BATCH_ROIS_OHEM: 512
  # rcnn rois sampling params
  FG_FRACTION: 0.25
  FG_THRESH: 0.5
  BG_THRESH_HI: 0.5
  BG_THRESH_LO: 0
  # rcnn bounding box regression params
  BBOX_REGRESSION_THRESH: 0.5
  BBOX_WEIGHTS:
  - 1.0
  - 1.0
  - 1.0
  - 1.0

  # RPN anchor loader
  # rpn anchors batch size
  RPN_BATCH_SIZE: 256
  # rpn anchors sampling params
  RPN_FG_FRACTION: 0.5
  RPN_POSITIVE_OVERLAP: 0.7
  RPN_NEGATIVE_OVERLAP: 0.3
  RPN_CLOBBER_POSITIVES: false
  # rpn bounding box regression params
  RPN_BBOX_WEIGHTS:
  - 1.0
  - 1.0
  - 1.0
  - 1.0
  RPN_POSITIVE_WEIGHT: -1.0
  # used for end2end training
  # RPN proposal
  CXX_PROPOSAL: false
  RPN_NMS_THRESH: 0.7
  RPN_PRE_NMS_TOP_N: 6000
  RPN_POST_NMS_TOP_N: 300
  RPN_MIN_SIZE: 0
  # approximate bounding box regression
  BBOX_NORMALIZATION_PRECOMPUTED: true
  BBOX_MEANS:
  - 0.0
  - 0.0
  - 0.0
  - 0.0
  BBOX_STDS:
  - 0.1
  - 0.1
  - 0.2
  - 0.2
TEST:
  # use rpn to generate proposal
  HAS_RPN: false
  # size of images for each device
  BATCH_IMAGES: 1
  # RPN proposal
  CXX_PROPOSAL: false
  RPN_NMS_THRESH: 0.7
  RPN_PRE_NMS_TOP_N: 6000
  RPN_POST_NMS_TOP_N: 300
  RPN_MIN_SIZE: 0
  # RPN generate proposal
  PROPOSAL_NMS_THRESH: 0.7
  PROPOSAL_PRE_NMS_TOP_N: 20000
  PROPOSAL_POST_NMS_TOP_N: 2000
  PROPOSAL_MIN_SIZE: 0
  # RCNN nms
  NMS: 0.3
  test_epoch: 8
  max_per_image: 100
  TOP_ROIS: 1000

--------------------------------------------------------------------------------
/experiments/relation_rcnn/cfgs/resnet_v1_101_coco_trainvalminus_rcnn_fpn_relation_learn_nms_8epoch.yaml:
--------------------------------------------------------------------------------
---
MXNET_VERSION: "mxnet_v1.1.0"
output_path: "./output/rcnn/coco"
symbol: resnet_v1_101_rcnn_fpn_attention_1024_pairwise_position_multi_head_16_learn_nms
gpus: '4,5,6,7'
CLASS_AGNOSTIC: true
SCALES:
- 800
- 1000
default:
  frequent: 100
  kvstore: device
network:
  pretrained: "./model/pretrained_model/resnet_v1_101"
  pretrained_epoch: 0
  PIXEL_MEANS:
  - 103.06
  - 115.90
  - 123.15
  IMAGE_STRIDE: 32
  RCNN_FEAT_STRIDE: 16
  RPN_FEAT_STRIDE: 16
  FIXED_PARAMS:
  - conv1
  - bn_conv1
  - res2
  - bn2
  - gamma
  - beta
  FIXED_PARAMS_SHARED:
  - conv1
  - bn_conv1
  - res2
  - bn2
  - res3
  - bn3
  - res4
  - bn4
  - gamma
  - beta
  ANCHOR_RATIOS:
  - 0.5
  - 1
  - 2
  ANCHOR_SCALES:
  - 4
  - 8
  - 16
  - 32
  NUM_ANCHORS: 12
  ROIDispatch: true
  USE_NONGT_INDEX: true
  NMS_TARGET_THRESH: '0.5, 0.6, 0.7, 0.8, 0.9'
dataset:
  NUM_CLASSES: 81
  dataset: coco
  dataset_path: "./data/coco"
  image_set: train2014+valminusminival2014
  #image_set: minival2014
  root_path: "./"
  test_image_set: minival2014
  proposal: rpn
  proposal_cache: "./proposal/resnet_v1_101_fpn"
TRAIN:
  lr: 0.00125
  lr_step: '5.33'
  warmup: false
  warmup_lr: 0.000125
  # typically we will use 8000 warmup step for single GPU for COCO
  warmup_step: 1000
  begin_epoch: 0
  end_epoch: 8
  model_prefix: 'rcnn_fpn_coco'
  # whether resume training
  RESUME: false
  # whether flip image
  FLIP: true
  # whether shuffle image
  SHUFFLE: true
  # whether use OHEM
  ENABLE_OHEM: true
  # size of images for each device, 2 for rcnn, 1 for rpn and e2e
  BATCH_IMAGES: 1
  # e2e changes behavior of anchor loader and metric
  END2END: false
  # group images with similar aspect ratio
  ASPECT_GROUPING: true
  # R-CNN
  # rcnn rois batch size
  TOP_ROIS: 1000
  BATCH_ROIS: -1
  BATCH_ROIS_OHEM: 512
  # rcnn rois sampling params
  FG_FRACTION: 0.25
  FG_THRESH: 0.5
  BG_THRESH_HI: 0.5
  BG_THRESH_LO: 0
  # rcnn bounding box regression params
  BBOX_REGRESSION_THRESH: 0.5
  BBOX_WEIGHTS:
  - 1.0
  - 1.0
  - 1.0
  - 1.0

  # RPN anchor loader
  # rpn anchors batch size
  RPN_BATCH_SIZE: 256
  # rpn anchors sampling params
  RPN_FG_FRACTION: 0.5
  RPN_POSITIVE_OVERLAP: 0.7
  RPN_NEGATIVE_OVERLAP: 0.3
  RPN_CLOBBER_POSITIVES: false
  # rpn bounding box regression params
  RPN_BBOX_WEIGHTS:
  - 1.0
  - 1.0
  - 1.0
  - 1.0
  RPN_POSITIVE_WEIGHT: -1.0
  # used for end2end training
  # RPN proposal
  CXX_PROPOSAL: false
  RPN_NMS_THRESH: 0.7
  RPN_PRE_NMS_TOP_N: 6000
  RPN_POST_NMS_TOP_N: 300
  RPN_MIN_SIZE: 0
  # approximate bounding box regression
  BBOX_NORMALIZATION_PRECOMPUTED: true
  BBOX_MEANS:
  - 0.0
  - 0.0
  - 0.0
  - 0.0
  BBOX_STDS:
  - 0.1
  - 0.1
  - 0.2
  - 0.2
  LEARN_NMS: true
  FIRST_N: 150
  JOINT_TRAINING: true
TEST:
  # use rpn to generate proposal
  HAS_RPN: false
  # size of images for each device
  BATCH_IMAGES: 1
  # RPN proposal
  CXX_PROPOSAL: false
  RPN_NMS_THRESH: 0.7
  RPN_PRE_NMS_TOP_N: 6000
  RPN_POST_NMS_TOP_N: 300
  RPN_MIN_SIZE: 0
  # RPN generate proposal
  PROPOSAL_NMS_THRESH: 0.7
  PROPOSAL_PRE_NMS_TOP_N: 20000
  PROPOSAL_POST_NMS_TOP_N: 2000
  PROPOSAL_MIN_SIZE: 0
  # RCNN nms
  NMS: 10.0
  SOFTNMS: true
  test_epoch: 8
  max_per_image: 100
  # Learn nms
  LEARN_NMS: true
  LEARN_NMS_CLASS_SCORE_TH: 0.05
  FIRST_N: 150
  TOP_ROIS: 1000

--------------------------------------------------------------------------------
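A note on how these experiment files are consumed: each training/testing script receives one of the YAML files above and merges it over the default hyper-parameters (the repository's own loader lives in relation_rcnn/config/config.py). The sketch below is a minimal illustration of that merge pattern, assuming PyYAML and the easydict package are available; load_experiment_config and defaults are hypothetical names for this illustration, not part of this repository.

import yaml
from easydict import EasyDict as edict

def load_experiment_config(yaml_path, defaults):
    """Merge an experiment YAML (e.g. a file from experiments/relation_rcnn/cfgs)
    over a dict of default options, rejecting keys the defaults do not define."""
    with open(yaml_path) as f:
        exp = edict(yaml.safe_load(f))
    for key, value in exp.items():
        if key not in defaults:
            raise KeyError('unknown config key: {}'.format(key))
        if isinstance(value, dict) and isinstance(defaults[key], dict):
            defaults[key].update(value)   # merge nested sections such as TRAIN / TEST
        else:
            defaults[key] = value         # scalar override, e.g. symbol or gpus
    return defaults

Merging section-by-section rather than replacing the whole dict is what lets each YAML above stay short: it only has to name the values that differ from the shared defaults.
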
/experiments/relation_rcnn/rcnn_end2end_train_test.py:
--------------------------------------------------------------------------------
# --------------------------------------------------------
# Relation Networks for Object Detection
# Copyright (c) 2017 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Jiayuan Gu, Dazhi Cheng, Guodong Zhang
# --------------------------------------------------------

import os
import sys
os.environ['PYTHONUNBUFFERED'] = '1'
os.environ['MXNET_CUDNN_AUTOTUNE_DEFAULT'] = '0'
os.environ['MXNET_ENABLE_GPU_P2P'] = '0'
#os.environ['MXNET_ENGINE_TYPE'] = 'NaiveEngine'
this_dir = os.path.dirname(__file__)
sys.path.insert(0, os.path.join(this_dir, '..', '..', 'relation_rcnn'))

import train_end2end
import test

if __name__ == "__main__":
    train_end2end.main()
    test.main()

--------------------------------------------------------------------------------
/experiments/relation_rcnn/rcnn_test.py:
--------------------------------------------------------------------------------
# --------------------------------------------------------
# Relation Networks for Object Detection
# Copyright (c) 2017 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Jiayuan Gu, Dazhi Cheng, Guodong Zhang
# --------------------------------------------------------

import cv2
import os
import sys
os.environ['PYTHONUNBUFFERED'] = '1'
os.environ['MXNET_CUDNN_AUTOTUNE_DEFAULT'] = '0'
os.environ['MXNET_ENABLE_GPU_P2P'] = '0'
this_dir = os.path.dirname(__file__)
sys.path.insert(0, os.path.join(this_dir, '..', '..', 'relation_rcnn'))

import test

if __name__ == "__main__":
    test.main()
--------------------------------------------------------------------------------
/experiments/relation_rcnn/rcnn_train_test.py:
--------------------------------------------------------------------------------
# --------------------------------------------------------
# Relation Networks for Object Detection
# Copyright (c) 2017 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Jiayuan Gu, Dazhi Cheng, Guodong Zhang
# --------------------------------------------------------

import cv2
import os
import sys
os.environ['PYTHONUNBUFFERED'] = '1'
os.environ['MXNET_CUDNN_AUTOTUNE_DEFAULT'] = '0'
os.environ['MXNET_ENABLE_GPU_P2P'] = '0'
os.environ['MXNET_GPU_MEM_POOL_RESERVE'] = '10'
os.environ['MXNET_BACKWARD_DO_MIRROR'] = '1'
# os.environ['MXNET_ENGINE_TYPE'] = 'NaiveEngine'
this_dir = os.path.dirname(__file__)
sys.path.insert(0, os.path.join(this_dir, '..', '..', 'relation_rcnn'))

import train_rcnn
import test

if __name__ == "__main__":
    train_rcnn.main()
    test.main()

--------------------------------------------------------------------------------
/init.sh:
--------------------------------------------------------------------------------
#!/bin/bash

mkdir -p ./data
mkdir -p ./output
mkdir -p ./external/mxnet
mkdir -p ./model/pretrained_model

cd lib/bbox
python setup_linux.py build_ext --inplace
cd ../dataset/pycocotools
python setup_linux.py build_ext --inplace
cd ../../nms
python setup_linux.py build_ext --inplace
cd ../..
15 | -------------------------------------------------------------------------------- /lib/Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | cd nms/; python setup.py build_ext --inplace; rm -rf build; cd ../../ 3 | cd bbox/; python setup.py build_ext --inplace; rm -rf build; cd ../../ 4 | cd dataset/pycocotools/; python setup.py build_ext --inplace; rm -rf build; cd ../../ 5 | clean: 6 | cd nms/; rm *.so *.c *.cpp; cd ../../ 7 | cd bbox/; rm *.so *.c *.cpp; cd ../../ 8 | cd dataset/pycocotools/; rm *.so; cd ../../ 9 | -------------------------------------------------------------------------------- /lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/msracver/Relation-Networks-for-Object-Detection/e83e911d828e3c86624ce0aeb8d742d5ee67d5ba/lib/__init__.py -------------------------------------------------------------------------------- /lib/bbox/.gitignore: -------------------------------------------------------------------------------- 1 | *.c 2 | *.cpp -------------------------------------------------------------------------------- /lib/bbox/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/msracver/Relation-Networks-for-Object-Detection/e83e911d828e3c86624ce0aeb8d742d5ee67d5ba/lib/bbox/__init__.py -------------------------------------------------------------------------------- /lib/bbox/bbox.pyx: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Sergey Karayev 6 | # -------------------------------------------------------- 7 | 8 | cimport cython 9 | import numpy as np 10 | cimport numpy as np 11 | 12 | DTYPE = np.float 13 | ctypedef np.float_t DTYPE_t 14 | 15 | def bbox_overlaps_cython( 16 | np.ndarray[DTYPE_t, ndim=2] boxes, 17 | np.ndarray[DTYPE_t, ndim=2] query_boxes): 18 | """ 19 | Parameters 20 | ---------- 21 | boxes: (N, 4) ndarray of float 22 | query_boxes: (K, 4) ndarray of float 23 | Returns 24 | ------- 25 | overlaps: (N, K) ndarray of overlap between boxes and query_boxes 26 | """ 27 | cdef unsigned int N = boxes.shape[0] 28 | cdef unsigned int K = query_boxes.shape[0] 29 | cdef np.ndarray[DTYPE_t, ndim=2] overlaps = np.zeros((N, K), dtype=DTYPE) 30 | cdef DTYPE_t iw, ih, box_area 31 | cdef DTYPE_t ua 32 | cdef unsigned int k, n 33 | for k in range(K): 34 | box_area = ( 35 | (query_boxes[k, 2] - query_boxes[k, 0] + 1) * 36 | (query_boxes[k, 3] - query_boxes[k, 1] + 1) 37 | ) 38 | for n in range(N): 39 | iw = ( 40 | min(boxes[n, 2], query_boxes[k, 2]) - 41 | max(boxes[n, 0], query_boxes[k, 0]) + 1 42 | ) 43 | if iw > 0: 44 | ih = ( 45 | min(boxes[n, 3], query_boxes[k, 3]) - 46 | max(boxes[n, 1], query_boxes[k, 1]) + 1 47 | ) 48 | if ih > 0: 49 | ua = float( 50 | (boxes[n, 2] - boxes[n, 0] + 1) * 51 | (boxes[n, 3] - boxes[n, 1] + 1) + 52 | box_area - iw * ih 53 | ) 54 | overlaps[n, k] = iw * ih / ua 55 | return overlaps 56 | -------------------------------------------------------------------------------- /lib/bbox/bbox_regression.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Relation Networks for Object Detection 3 | # Copyright (c) 2017 
Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Modified by Dazhi Cheng, Jiayuan Gu, Yuwen Xiong 6 | # -------------------------------------------------------- 7 | # Based on: 8 | # py-faster-rcnn 9 | # Copyright (c) 2016 by Contributors 10 | # Licence under The MIT License 11 | # py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn) 12 | # -------------------------------------------------------- 13 | 14 | 15 | """ 16 | This file has functions about generating bounding box regression targets 17 | """ 18 | 19 | import numpy as np 20 | 21 | from bbox_transform import bbox_overlaps, bbox_transform 22 | 23 | 24 | def compute_bbox_regression_targets(rois, overlaps, labels, cfg): 25 | """ 26 | given rois, overlaps, gt labels, compute bounding box regression targets 27 | :param rois: roidb[i]['boxes'] k * 4 28 | :param overlaps: roidb[i]['max_overlaps'] k * 1 29 | :param labels: roidb[i]['max_classes'] k * 1 30 | :return: targets[i][class, dx, dy, dw, dh] k * 5 31 | """ 32 | # Ensure ROIs are floats 33 | rois = rois.astype(np.float, copy=False) 34 | 35 | # Sanity check 36 | if len(rois) != len(overlaps): 37 | print 'bbox regression: this should not happen' 38 | 39 | # Indices of ground-truth ROIs 40 | gt_inds = np.where(overlaps == 1)[0] 41 | if len(gt_inds) == 0: 42 | print 'something wrong : zero ground truth rois' 43 | # Indices of examples for which we try to make predictions 44 | ex_inds = np.where(overlaps >= cfg.TRAIN.BBOX_REGRESSION_THRESH)[0] 45 | # Get IoU overlap between each ex ROI and gt ROI 46 | ex_gt_overlaps = bbox_overlaps(rois[ex_inds, :], rois[gt_inds, :]) 47 | 48 | # Find which gt ROI each ex ROI has max overlap with: 49 | # this will be the ex ROI's gt target 50 | gt_assignment = ex_gt_overlaps.argmax(axis=1) 51 | gt_rois = rois[gt_inds[gt_assignment], :] 52 | ex_rois = rois[ex_inds, :] 53 | 54 | targets = np.zeros((rois.shape[0], 5), dtype=np.float32) 55 | targets[ex_inds, 0] = labels[ex_inds] 56 | targets[ex_inds, 1:] = bbox_transform(ex_rois, gt_rois) 57 | return targets 58 | 59 | 60 | def add_bbox_regression_targets(roidb, cfg): 61 | """ 62 | given roidb, add ['bbox_targets'] and normalize bounding box regression targets 63 | :param roidb: roidb to be processed. 
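:param cfg: config; TRAIN.BBOX_NORMALIZATION_PRECOMPUTED selects the fixed TRAIN.BBOX_MEANS / TRAIN.BBOX_STDS over empirically computed statistics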
64 | :return: means, stds of targets 65 | """ 66 | print 'add bounding box regression targets' 67 | assert len(roidb) > 0 68 | assert 'max_classes' in roidb[0] 69 | 70 | num_images = len(roidb) 71 | num_classes = 2 if cfg.CLASS_AGNOSTIC else roidb[0]['gt_overlaps'].shape[1] 72 | 73 | for im_i in range(num_images): 74 | rois = roidb[im_i]['boxes'] 75 | max_overlaps = roidb[im_i]['max_overlaps'] 76 | max_classes = roidb[im_i]['max_classes'] 77 | roidb[im_i]['bbox_targets'] = compute_bbox_regression_targets(rois, max_overlaps, max_classes, cfg) 78 | 79 | if cfg.TRAIN.BBOX_NORMALIZATION_PRECOMPUTED: 80 | # use fixed / precomputed means and stds instead of empirical values 81 | means = np.tile(np.array(cfg.TRAIN.BBOX_MEANS), (num_classes, 1)) 82 | stds = np.tile(np.array(cfg.TRAIN.BBOX_STDS), (num_classes, 1)) 83 | else: 84 | # compute mean, std values 85 | class_counts = np.zeros((num_classes, 1)) + 1e-14 86 | sums = np.zeros((num_classes, 4)) 87 | squared_sums = np.zeros((num_classes, 4)) 88 | for im_i in range(num_images): 89 | targets = roidb[im_i]['bbox_targets'] 90 | for cls in range(1, num_classes): 91 | cls_indexes = np.where(targets[:, 0] > 0)[0] if cfg.CLASS_AGNOSTIC else np.where(targets[:, 0] == cls)[0] 92 | if cls_indexes.size > 0: 93 | class_counts[cls] += cls_indexes.size 94 | sums[cls, :] += targets[cls_indexes, 1:].sum(axis=0) 95 | squared_sums[cls, :] += (targets[cls_indexes, 1:] ** 2).sum(axis=0) 96 | 97 | means = sums / class_counts 98 | # var(x) = E(x^2) - E(x)^2 99 | stds = np.sqrt(squared_sums / class_counts - means ** 2) 100 | 101 | print 'bbox target means:' 102 | print means 103 | print means[1:, :].mean(axis=0) # ignore bg class 104 | print 'bbox target stdevs:' 105 | print stds 106 | print stds[1:, :].mean(axis=0) # ignore bg class 107 | 108 | 109 | # normalize targets 110 | for im_i in range(num_images): 111 | targets = roidb[im_i]['bbox_targets'] 112 | for cls in range(1, num_classes): 113 | cls_indexes = np.where(targets[:, 0] > 0)[0] if cfg.CLASS_AGNOSTIC else np.where(targets[:, 0] == cls)[0] # [0] added: np.where returns a tuple; index as in the stats loop above 114 | roidb[im_i]['bbox_targets'][cls_indexes, 1:] -= means[cls, :] 115 | roidb[im_i]['bbox_targets'][cls_indexes, 1:] /= stds[cls, :] 116 | 117 | return means.ravel(), stds.ravel() 118 | 119 | 120 | def expand_bbox_regression_targets(bbox_targets_data, num_classes, cfg): 121 | """ 122 | expand from 5 to 4 * num_classes; only the right class has non-zero bbox regression targets 123 | :param bbox_targets_data: [k * 5] 124 | :param num_classes: number of classes 125 | :return: bbox target processed [k * 4 num_classes] 126 | bbox_weights ! only foreground boxes have bbox regression computation!
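Example (illustrative): with CLASS_AGNOSTIC off and a foreground RoI labeled cls = 3, its four targets land in columns 4*cls:4*cls+4 = 12:16 and bbox_weights[12:16] = cfg.TRAIN.BBOX_WEIGHTS; every other column, and every background RoI (cls = 0), stays zero.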
127 | """ 128 | classes = bbox_targets_data[:, 0] 129 | if cfg.CLASS_AGNOSTIC: 130 | num_classes = 2 131 | bbox_targets = np.zeros((classes.size, 4 * num_classes), dtype=np.float32) 132 | bbox_weights = np.zeros(bbox_targets.shape, dtype=np.float32) 133 | indexes = np.where(classes > 0)[0] 134 | for index in indexes: 135 | cls = classes[index] 136 | start = int(4 * 1 if cls > 0 else 0) if cfg.CLASS_AGNOSTIC else int(4 * cls) 137 | end = start + 4 138 | bbox_targets[index, start:end] = bbox_targets_data[index, 1:] 139 | bbox_weights[index, start:end] = cfg.TRAIN.BBOX_WEIGHTS 140 | return bbox_targets, bbox_weights 141 | 142 | -------------------------------------------------------------------------------- /lib/bbox/bbox_transform.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Relation Networks for Object Detection 3 | # Copyright (c) 2017 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Modified by Dazhi Cheng, Jiayuan Gu, Yuwen Xiong 6 | # -------------------------------------------------------- 7 | # Based on: 8 | # py-faster-rcnn 9 | # Copyright (c) 2016 by Contributors 10 | # Licence under The MIT License 11 | # py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn) 12 | # -------------------------------------------------------- 13 | 14 | import numpy as np 15 | from bbox import bbox_overlaps_cython 16 | 17 | 18 | def bbox_overlaps(boxes, query_boxes): 19 | return bbox_overlaps_cython(boxes, query_boxes) 20 | 21 | 22 | def bbox_overlaps_py(boxes, query_boxes): 23 | """ 24 | determine overlaps between boxes and query_boxes 25 | :param boxes: n * 4 bounding boxes 26 | :param query_boxes: k * 4 bounding boxes 27 | :return: overlaps: n * k overlaps 28 | """ 29 | n_ = boxes.shape[0] 30 | k_ = query_boxes.shape[0] 31 | overlaps = np.zeros((n_, k_), dtype=np.float) 32 | for k in range(k_): 33 | query_box_area = (query_boxes[k, 2] - query_boxes[k, 0] + 1) * (query_boxes[k, 3] - query_boxes[k, 1] + 1) 34 | for n in range(n_): 35 | iw = min(boxes[n, 2], query_boxes[k, 2]) - max(boxes[n, 0], query_boxes[k, 0]) + 1 36 | if iw > 0: 37 | ih = min(boxes[n, 3], query_boxes[k, 3]) - max(boxes[n, 1], query_boxes[k, 1]) + 1 38 | if ih > 0: 39 | box_area = (boxes[n, 2] - boxes[n, 0] + 1) * (boxes[n, 3] - boxes[n, 1] + 1) 40 | all_area = float(box_area + query_box_area - iw * ih) 41 | overlaps[n, k] = iw * ih / all_area 42 | return overlaps 43 | 44 | 45 | def clip_boxes(boxes, im_shape): 46 | """ 47 | Clip boxes to image boundaries. 48 | :param boxes: [N, 4* num_classes] 49 | :param im_shape: tuple of 2 50 | :return: [N, 4* num_classes] 51 | """ 52 | # x1 >= 0 53 | boxes[:, 0::4] = np.maximum(np.minimum(boxes[:, 0::4], im_shape[1] - 1), 0) 54 | # y1 >= 0 55 | boxes[:, 1::4] = np.maximum(np.minimum(boxes[:, 1::4], im_shape[0] - 1), 0) 56 | # x2 < im_shape[1] 57 | boxes[:, 2::4] = np.maximum(np.minimum(boxes[:, 2::4], im_shape[1] - 1), 0) 58 | # y2 < im_shape[0] 59 | boxes[:, 3::4] = np.maximum(np.minimum(boxes[:, 3::4], im_shape[0] - 1), 0) 60 | return boxes 61 | 62 | def filter_boxes(boxes, min_size): 63 | """ 64 | filter small boxes. 
65 | :param boxes: [N, 4* num_classes] 66 | :param min_size: 67 | :return: keep: 68 | """ 69 | ws = boxes[:, 2] - boxes[:, 0] + 1 70 | hs = boxes[:, 3] - boxes[:, 1] + 1 71 | keep = np.where((ws >= min_size) & (hs >= min_size))[0] 72 | return keep 73 | 74 | def nonlinear_transform(ex_rois, gt_rois): 75 | """ 76 | compute bounding box regression targets from ex_rois to gt_rois 77 | :param ex_rois: [N, 4] 78 | :param gt_rois: [N, 4] 79 | :return: [N, 4] 80 | """ 81 | assert ex_rois.shape[0] == gt_rois.shape[0], 'inconsistent rois number' 82 | 83 | ex_widths = ex_rois[:, 2] - ex_rois[:, 0] + 1.0 84 | ex_heights = ex_rois[:, 3] - ex_rois[:, 1] + 1.0 85 | ex_ctr_x = ex_rois[:, 0] + 0.5 * (ex_widths - 1.0) 86 | ex_ctr_y = ex_rois[:, 1] + 0.5 * (ex_heights - 1.0) 87 | 88 | gt_widths = gt_rois[:, 2] - gt_rois[:, 0] + 1.0 89 | gt_heights = gt_rois[:, 3] - gt_rois[:, 1] + 1.0 90 | gt_ctr_x = gt_rois[:, 0] + 0.5 * (gt_widths - 1.0) 91 | gt_ctr_y = gt_rois[:, 1] + 0.5 * (gt_heights - 1.0) 92 | 93 | targets_dx = (gt_ctr_x - ex_ctr_x) / (ex_widths + 1e-14) 94 | targets_dy = (gt_ctr_y - ex_ctr_y) / (ex_heights + 1e-14) 95 | targets_dw = np.log(gt_widths / ex_widths) 96 | targets_dh = np.log(gt_heights / ex_heights) 97 | 98 | targets = np.vstack( 99 | (targets_dx, targets_dy, targets_dw, targets_dh)).transpose() 100 | return targets 101 | 102 | 103 | def nonlinear_pred(boxes, box_deltas): 104 | """ 105 | Transform the set of class-agnostic boxes into class-specific boxes 106 | by applying the predicted offsets (box_deltas) 107 | :param boxes: !important [N 4] 108 | :param box_deltas: [N, 4 * num_classes] 109 | :return: [N 4 * num_classes] 110 | """ 111 | if boxes.shape[0] == 0: 112 | return np.zeros((0, box_deltas.shape[1])) 113 | 114 | boxes = boxes.astype(np.float, copy=False) 115 | widths = boxes[:, 2] - boxes[:, 0] + 1.0 116 | heights = boxes[:, 3] - boxes[:, 1] + 1.0 117 | ctr_x = boxes[:, 0] + 0.5 * (widths - 1.0) 118 | ctr_y = boxes[:, 1] + 0.5 * (heights - 1.0) 119 | 120 | dx = box_deltas[:, 0::4] 121 | dy = box_deltas[:, 1::4] 122 | dw = box_deltas[:, 2::4] 123 | dh = box_deltas[:, 3::4] 124 | 125 | pred_ctr_x = dx * widths[:, np.newaxis] + ctr_x[:, np.newaxis] 126 | pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis] 127 | pred_w = np.exp(dw) * widths[:, np.newaxis] 128 | pred_h = np.exp(dh) * heights[:, np.newaxis] 129 | 130 | pred_boxes = np.zeros(box_deltas.shape) 131 | # x1 132 | pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * (pred_w - 1.0) 133 | # y1 134 | pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * (pred_h - 1.0) 135 | # x2 136 | pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * (pred_w - 1.0) 137 | # y2 138 | pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * (pred_h - 1.0) 139 | 140 | return pred_boxes 141 | 142 | 143 | def iou_transform(ex_rois, gt_rois): 144 | """ return bbox targets, IoU loss uses gt_rois as gt """ 145 | assert ex_rois.shape[0] == gt_rois.shape[0], 'inconsistent rois number' 146 | return gt_rois 147 | 148 | 149 | def iou_pred(boxes, box_deltas): 150 | """ 151 | Transform the set of class-agnostic boxes into class-specific boxes 152 | by applying the predicted offsets (box_deltas) 153 | :param boxes: !important [N 4] 154 | :param box_deltas: [N, 4 * num_classes] 155 | :return: [N 4 * num_classes] 156 | """ 157 | if boxes.shape[0] == 0: 158 | return np.zeros((0, box_deltas.shape[1])) 159 | 160 | boxes = boxes.astype(np.float, copy=False) 161 | x1 = boxes[:, 0] 162 | y1 = boxes[:, 1] 163 | x2 = boxes[:, 2] 164 | y2 = boxes[:, 3] 165 | 166 | dx1 = box_deltas[:, 0::4] 167 | dy1 = 
box_deltas[:, 1::4] 168 | dx2 = box_deltas[:, 2::4] 169 | dy2 = box_deltas[:, 3::4] 170 | 171 | pred_boxes = np.zeros(box_deltas.shape) 172 | # x1 173 | pred_boxes[:, 0::4] = dx1 + x1[:, np.newaxis] 174 | # y1 175 | pred_boxes[:, 1::4] = dy1 + y1[:, np.newaxis] 176 | # x2 177 | pred_boxes[:, 2::4] = dx2 + x2[:, np.newaxis] 178 | # y2 179 | pred_boxes[:, 3::4] = dy2 + y2[:, np.newaxis] 180 | 181 | return pred_boxes 182 | 183 | 184 | # define bbox_transform and bbox_pred 185 | bbox_transform = nonlinear_transform 186 | bbox_pred = nonlinear_pred 187 | -------------------------------------------------------------------------------- /lib/bbox/setup_linux.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Relation Networks for Object Detection 3 | # Copyright (c) 2017 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Modified by Dazhi Cheng, Jiayuan Gu, Yuwen Xiong 6 | # -------------------------------------------------------- 7 | # Based on: 8 | # py-faster-rcnn 9 | # Copyright (c) 2016 by Contributors 10 | # Licence under The MIT License 11 | # py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn) 12 | # -------------------------------------------------------- 13 | 14 | import os 15 | from os.path import join as pjoin 16 | from setuptools import setup 17 | from distutils.extension import Extension 18 | from Cython.Distutils import build_ext 19 | import numpy as np 20 | 21 | # Obtain the numpy include directory. This logic works across numpy versions. 22 | try: 23 | numpy_include = np.get_include() 24 | except AttributeError: 25 | numpy_include = np.get_numpy_include() 26 | 27 | 28 | def customize_compiler_for_nvcc(self): 29 | """inject deep into distutils to customize how the dispatch 30 | to gcc/nvcc works. 31 | If you subclass UnixCCompiler, it's not trivial to get your subclass 32 | injected in, and still have the right customizations (i.e. 33 | distutils.sysconfig.customize_compiler) run on it. So instead of going 34 | the OO route, I have this. Note, it's kindof like a wierd functional 35 | subclassing going on.""" 36 | 37 | # tell the compiler it can processes .cu 38 | self.src_extensions.append('.cu') 39 | 40 | # save references to the default compiler_so and _comple methods 41 | default_compiler_so = self.compiler_so 42 | super = self._compile 43 | 44 | # now redefine the _compile method. This gets executed for each 45 | # object but distutils doesn't have the ability to change compilers 46 | # based on source extension: we add it. 
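# (Note: the bbox extension below passes only a 'gcc' list in extra_compile_args,
# so the nvcc branch in _compile is never taken here; the dict form is kept so the
# same build_ext customization also serves the CUDA modules under lib/nms.)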
47 | def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts): 48 | if os.path.splitext(src)[1] == '.cu': 49 | # use the cuda for .cu files 50 | self.set_executable('compiler_so', CUDA['nvcc']) 51 | # use only a subset of the extra_postargs, which are 1-1 translated 52 | # from the extra_compile_args in the Extension class 53 | postargs = extra_postargs['nvcc'] 54 | else: 55 | postargs = extra_postargs['gcc'] 56 | 57 | super(obj, src, ext, cc_args, postargs, pp_opts) 58 | # reset the default compiler_so, which we might have changed for cuda 59 | self.compiler_so = default_compiler_so 60 | 61 | # inject our redefined _compile method into the class 62 | self._compile = _compile 63 | 64 | 65 | # run the customize_compiler 66 | class custom_build_ext(build_ext): 67 | def build_extensions(self): 68 | customize_compiler_for_nvcc(self.compiler) 69 | build_ext.build_extensions(self) 70 | 71 | 72 | ext_modules = [ 73 | Extension( 74 | "bbox", 75 | ["bbox.pyx"], 76 | extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]}, 77 | include_dirs=[numpy_include] 78 | ), 79 | ] 80 | 81 | setup( 82 | name='bbox_cython', 83 | ext_modules=ext_modules, 84 | # inject our custom trigger 85 | cmdclass={'build_ext': custom_build_ext}, 86 | ) 87 | -------------------------------------------------------------------------------- /lib/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | from imdb import IMDB 2 | from coco import coco 3 | -------------------------------------------------------------------------------- /lib/dataset/pycocotools/.gitignore: -------------------------------------------------------------------------------- 1 | _mask.c 2 | -------------------------------------------------------------------------------- /lib/dataset/pycocotools/UPSTREAM_REV: -------------------------------------------------------------------------------- 1 | https://github.com/pdollar/coco/commit/3ac47c77ebd5a1ed4254a98b7fbf2ef4765a3574 2 | -------------------------------------------------------------------------------- /lib/dataset/pycocotools/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /lib/dataset/pycocotools/mask.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tsungyi' 2 | 3 | import _mask as _mask 4 | 5 | # Interface for manipulating masks stored in RLE format. 6 | # 7 | # RLE is a simple yet efficient format for storing binary masks. RLE 8 | # first divides a vector (or vectorized image) into a series of piecewise 9 | # constant regions and then for each piece simply stores the length of 10 | # that piece. For example, given M=[0 0 1 1 1 0 1] the RLE counts would 11 | # be [2 3 1 1], or for M=[1 1 1 1 1 1 0] the counts would be [0 6 1] 12 | # (note that the odd counts are always the numbers of zeros). Instead of 13 | # storing the counts directly, additional compression is achieved with a 14 | # variable bitrate representation based on a common scheme called LEB128. 15 | # 16 | # Compression is greatest given large piecewise constant regions. 17 | # Specifically, the size of the RLE is proportional to the number of 18 | # *boundaries* in M (or for an image the number of boundaries in the y 19 | # direction). Assuming fairly simple shapes, the RLE representation is 20 | # O(sqrt(n)) where n is number of pixels in the object. 
Hence space usage 21 | # is substantially lower, especially for large simple objects (large n). 22 | # 23 | # Many common operations on masks can be computed directly using the RLE 24 | # (without need for decoding). This includes computations such as area, 25 | # union, intersection, etc. All of these operations are linear in the 26 | # size of the RLE, in other words they are O(sqrt(n)) where n is the area 27 | # of the object. Computing these operations on the original mask is O(n). 28 | # Thus, using the RLE can result in substantial computational savings. 29 | # 30 | # The following API functions are defined: 31 | # encode - Encode binary masks using RLE. 32 | # decode - Decode binary masks encoded via RLE. 33 | # merge - Compute union or intersection of encoded masks. 34 | # iou - Compute intersection over union between masks. 35 | # area - Compute area of encoded masks. 36 | # toBbox - Get bounding boxes surrounding encoded masks. 37 | # frPyObjects - Convert polygon, bbox, and uncompressed RLE to encoded RLE mask. 38 | # 39 | # Usage: 40 | # Rs = encode( masks ) 41 | # masks = decode( Rs ) 42 | # R = merge( Rs, intersect=false ) 43 | # o = iou( dt, gt, iscrowd ) 44 | # a = area( Rs ) 45 | # bbs = toBbox( Rs ) 46 | # Rs = frPyObjects( [pyObjects], h, w ) 47 | # 48 | # In the API the following formats are used: 49 | # Rs - [dict] Run-length encoding of binary masks 50 | # R - dict Run-length encoding of binary mask 51 | # masks - [hxwxn] Binary mask(s) (must have type np.ndarray(dtype=uint8) in column-major order) 52 | # iscrowd - [nx1] list of np.ndarray. 1 indicates corresponding gt image has crowd region to ignore 53 | # bbs - [nx4] Bounding box(es) stored as [x y w h] 54 | # poly - Polygon stored as [[x1 y1 x2 y2...],[x1 y1 ...],...] (2D list) 55 | # dt,gt - May be either bounding boxes or encoded masks 56 | # Both poly and bbs are 0-indexed (bbox=[0 0 1 1] encloses first pixel). 57 | # 58 | # Finally, a note about the intersection over union (iou) computation. 59 | # The standard iou of a ground truth (gt) and detected (dt) object is 60 | # iou(gt,dt) = area(intersect(gt,dt)) / area(union(gt,dt)) 61 | # For "crowd" regions, we use a modified criteria. If a gt object is 62 | # marked as "iscrowd", we allow a dt to match any subregion of the gt. 63 | # Choosing gt' in the crowd gt that best matches the dt can be done using 64 | # gt'=intersect(dt,gt). Since by definition union(gt',dt)=dt, computing 65 | # iou(gt,dt,iscrowd) = iou(gt',dt) = area(intersect(gt,dt)) / area(dt) 66 | # For crowd gt regions we use this modified criteria above for the iou. 67 | # 68 | # To compile run "python setup.py build_ext --inplace" 69 | # Please do not contact us for help with compiling. 70 | # 71 | # Microsoft COCO Toolbox. version 2.0 72 | # Data, paper, and tutorials available at: http://mscoco.org/ 73 | # Code written by Piotr Dollar and Tsung-Yi Lin, 2015. 
74 | # Licensed under the Simplified BSD License [see coco/license.txt] 75 | 76 | encode = _mask.encode 77 | #decode = _mask.decode 78 | def decode(rleObjs): 79 | if type(rleObjs) == list: 80 | return _mask.decode(rleObjs) 81 | else: 82 | return _mask.decode([rleObjs])[:,:,0] 83 | iou = _mask.iou 84 | merge = _mask.merge 85 | area = _mask.area 86 | toBbox = _mask.toBbox 87 | frPyObjects = _mask.frPyObjects 88 | -------------------------------------------------------------------------------- /lib/dataset/pycocotools/maskApi.c: -------------------------------------------------------------------------------- 1 | /************************************************************************** 2 | * Microsoft COCO Toolbox. version 2.0 3 | * Data, paper, and tutorials available at: http://mscoco.org/ 4 | * Code written by Piotr Dollar and Tsung-Yi Lin, 2015. 5 | * Licensed under the Simplified BSD License [see coco/license.txt] 6 | **************************************************************************/ 7 | #include "maskApi.h" 8 | #include 9 | #include 10 | 11 | uint umin( uint a, uint b ) { return (ab) ? a : b; } 13 | 14 | void rleInit( RLE *R, siz h, siz w, siz m, uint *cnts ) { 15 | R->h=h; R->w=w; R->m=m; R->cnts=(m==0)?0:malloc(sizeof(uint)*m); 16 | if(cnts) for(siz j=0; jcnts[j]=cnts[j]; 17 | } 18 | 19 | void rleFree( RLE *R ) { 20 | free(R->cnts); R->cnts=0; 21 | } 22 | 23 | void rlesInit( RLE **R, siz n ) { 24 | *R = (RLE*) malloc(sizeof(RLE)*n); 25 | for(siz i=0; i0 ) { 61 | c=umin(ca,cb); cc+=c; ct=0; 62 | ca-=c; if(!ca && a0) { 83 | crowd=iscrowd!=NULL && iscrowd[g]; 84 | if(dt[d].h!=gt[g].h || dt[d].w!=gt[g].w) { o[g*m+d]=-1; continue; } 85 | siz ka, kb, a, b; uint c, ca, cb, ct, i, u; bool va, vb; 86 | ca=dt[d].cnts[0]; ka=dt[d].m; va=vb=0; 87 | cb=gt[g].cnts[0]; kb=gt[g].m; a=b=1; i=u=0; ct=1; 88 | while( ct>0 ) { 89 | c=umin(ca,cb); if(va||vb) { u+=c; if(va&&vb) i+=c; } ct=0; 90 | ca-=c; if(!ca && ad?1:c=dy && xs>xe) || (dxye); 151 | if(flip) { t=xs; xs=xe; xe=t; t=ys; ys=ye; ye=t; } 152 | s = dx>=dy ? (double)(ye-ys)/dx : (double)(xe-xs)/dy; 153 | if(dx>=dy) for( int d=0; d<=dx; d++ ) { 154 | t=flip?dx-d:d; u[m]=t+xs; v[m]=(int)(ys+s*t+.5); m++; 155 | } else for( int d=0; d<=dy; d++ ) { 156 | t=flip?dy-d:d; v[m]=t+ys; u[m]=(int)(xs+s*t+.5); m++; 157 | } 158 | } 159 | // get points along y-boundary and downsample 160 | free(x); free(y); k=m; m=0; double xd, yd; 161 | x=malloc(sizeof(int)*k); y=malloc(sizeof(int)*k); 162 | for( j=1; jw-1 ) continue; 165 | yd=(double)(v[j]h) yd=h; yd=ceil(yd); 167 | x[m]=(int) xd; y[m]=(int) yd; m++; 168 | } 169 | // compute rle encoding given y-boundary points 170 | k=m; a=malloc(sizeof(uint)*(k+1)); 171 | for( j=0; j0) b[m++]=a[j++]; else { 177 | j++; if(jm, p=0; long x; bool more; 184 | char *s=malloc(sizeof(char)*m*6); 185 | for( i=0; icnts[i]; if(i>2) x-=(long) R->cnts[i-2]; more=1; 187 | while( more ) { 188 | char c=x & 0x1f; x >>= 5; more=(c & 0x10) ? 
x!=-1 : x!=0; 189 | if(more) c |= 0x20; c+=48; s[p++]=c; 190 | } 191 | } 192 | s[p]=0; return s; 193 | } 194 | 195 | void rleFrString( RLE *R, char *s, siz h, siz w ) { 196 | siz m=0, p=0, k; long x; bool more; uint *cnts; 197 | while( s[m] ) m++; cnts=malloc(sizeof(uint)*m); m=0; 198 | while( s[p] ) { 199 | x=0; k=0; more=1; 200 | while( more ) { 201 | char c=s[p]-48; x |= (c & 0x1f) << 5*k; 202 | more = c & 0x20; p++; k++; 203 | if(!more && (c & 0x10)) x |= -1 << 5*k; 204 | } 205 | if(m>2) x+=(long) cnts[m-2]; cnts[m++]=(uint) x; 206 | } 207 | rleInit(R,h,w,m,cnts); free(cnts); 208 | } 209 | -------------------------------------------------------------------------------- /lib/dataset/pycocotools/maskApi.h: -------------------------------------------------------------------------------- 1 | /************************************************************************** 2 | * Microsoft COCO Toolbox. version 2.0 3 | * Data, paper, and tutorials available at: http://mscoco.org/ 4 | * Code written by Piotr Dollar and Tsung-Yi Lin, 2015. 5 | * Licensed under the Simplified BSD License [see coco/license.txt] 6 | **************************************************************************/ 7 | #pragma once 8 | #include <stdbool.h> 9 | 10 | typedef unsigned int uint; 11 | typedef unsigned long siz; 12 | typedef unsigned char byte; 13 | typedef double* BB; 14 | typedef struct { siz h, w, m; uint *cnts; } RLE; 15 | 16 | // Initialize/destroy RLE. 17 | void rleInit( RLE *R, siz h, siz w, siz m, uint *cnts ); 18 | void rleFree( RLE *R ); 19 | 20 | // Initialize/destroy RLE array. 21 | void rlesInit( RLE **R, siz n ); 22 | void rlesFree( RLE **R, siz n ); 23 | 24 | // Encode binary masks using RLE. 25 | void rleEncode( RLE *R, const byte *mask, siz h, siz w, siz n ); 26 | 27 | // Decode binary masks encoded via RLE. 28 | void rleDecode( const RLE *R, byte *mask, siz n ); 29 | 30 | // Compute union or intersection of encoded masks. 31 | void rleMerge( const RLE *R, RLE *M, siz n, bool intersect ); 32 | 33 | // Compute area of encoded masks. 34 | void rleArea( const RLE *R, siz n, uint *a ); 35 | 36 | // Compute intersection over union between masks. 37 | void rleIou( RLE *dt, RLE *gt, siz m, siz n, byte *iscrowd, double *o ); 38 | 39 | // Compute intersection over union between bounding boxes. 40 | void bbIou( BB dt, BB gt, siz m, siz n, byte *iscrowd, double *o ); 41 | 42 | // Get bounding boxes surrounding encoded masks. 43 | void rleToBbox( const RLE *R, BB bb, siz n ); 44 | 45 | // Convert bounding boxes to encoded masks. 46 | void rleFrBbox( RLE *R, const BB bb, siz h, siz w, siz n ); 47 | 48 | // Convert polygon to encoded mask. 49 | void rleFrPoly( RLE *R, const double *xy, siz k, siz h, siz w ); 50 | 51 | // Get compressed string representation of encoded mask. 52 | char* rleToString( const RLE *R ); 53 | 54 | // Convert from compressed string representation of encoded mask.
55 | void rleFrString( RLE *R, char *s, siz h, siz w ); 56 | -------------------------------------------------------------------------------- /lib/dataset/pycocotools/setup_linux.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | from Cython.Build import cythonize 3 | from distutils.extension import Extension 4 | import numpy as np 5 | 6 | # To compile and install locally run "python setup.py build_ext --inplace" 7 | # To install library to Python site-packages run "python setup.py build_ext install" 8 | 9 | ext_modules = [ 10 | Extension( 11 | '_mask', 12 | sources=['maskApi.c', '_mask.pyx'], 13 | include_dirs=[np.get_include()], 14 | extra_compile_args=['-Wno-cpp', '-Wno-unused-function', '-std=c99'], 15 | ) 16 | ] 17 | 18 | setup(name='pycocotools', 19 | ext_modules=cythonize(ext_modules) 20 | ) 21 | -------------------------------------------------------------------------------- /lib/nms/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/msracver/Relation-Networks-for-Object-Detection/e83e911d828e3c86624ce0aeb8d742d5ee67d5ba/lib/nms/__init__.py -------------------------------------------------------------------------------- /lib/nms/cpu_nms.pyx: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Deformable Convolutional Networks 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Modified from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn) 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | cimport numpy as np 10 | 11 | cdef inline np.float32_t max(np.float32_t a, np.float32_t b): 12 | return a if a >= b else b 13 | 14 | cdef inline np.float32_t min(np.float32_t a, np.float32_t b): 15 | return a if a <= b else b 16 | 17 | def cpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh): 18 | cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0] 19 | cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1] 20 | cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2] 21 | cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3] 22 | cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4] 23 | 24 | cdef np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1) 25 | cdef np.ndarray[np.int_t, ndim=1] order = scores.argsort()[::-1].astype('i') 26 | 27 | cdef int ndets = dets.shape[0] 28 | cdef np.ndarray[np.int_t, ndim=1] suppressed = \ 29 | np.zeros((ndets), dtype=np.int) 30 | 31 | # nominal indices 32 | cdef int _i, _j 33 | # sorted indices 34 | cdef int i, j 35 | # temp variables for box i's (the box currently under consideration) 36 | cdef np.float32_t ix1, iy1, ix2, iy2, iarea 37 | # variables for computing overlap with box j (lower scoring box) 38 | cdef np.float32_t xx1, yy1, xx2, yy2 39 | cdef np.float32_t w, h 40 | cdef np.float32_t inter, ovr 41 | 42 | keep = [] 43 | for _i in range(ndets): 44 | i = order[_i] 45 | if suppressed[i] == 1: 46 | continue 47 | keep.append(i) 48 | ix1 = x1[i] 49 | iy1 = y1[i] 50 | ix2 = x2[i] 51 | iy2 = y2[i] 52 | iarea = areas[i] 53 | for _j in range(_i + 1, ndets): 54 | j = order[_j] 55 | if suppressed[j] == 1: 56 | continue 57 | xx1 = max(ix1, x1[j]) 58 | yy1 = max(iy1, y1[j]) 59 | xx2 = min(ix2, x2[j]) 60 | yy2 = min(iy2, y2[j]) 61 | w = max(0.0, xx2 - xx1 + 1) 62 | h = max(0.0, yy2 - 
yy1 + 1) 63 | inter = w * h 64 | ovr = inter / (iarea + areas[j] - inter) 65 | if ovr >= thresh: 66 | suppressed[j] = 1 67 | 68 | return keep 69 | -------------------------------------------------------------------------------- /lib/nms/gpu_nms.hpp: -------------------------------------------------------------------------------- 1 | void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num, 2 | int boxes_dim, float nms_overlap_thresh, int device_id); 3 | -------------------------------------------------------------------------------- /lib/nms/gpu_nms.pyx: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Deformable Convolutional Networks 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Modified from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn) 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | cimport numpy as np 10 | 11 | assert sizeof(int) == sizeof(np.int32_t) 12 | 13 | cdef extern from "gpu_nms.hpp": 14 | void _nms(np.int32_t*, int*, np.float32_t*, int, int, float, int) 15 | 16 | def gpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh, 17 | np.int32_t device_id=0): 18 | cdef int boxes_num = dets.shape[0] 19 | cdef int boxes_dim = dets.shape[1] 20 | cdef int num_out 21 | cdef np.ndarray[np.int32_t, ndim=1] \ 22 | keep = np.zeros(boxes_num, dtype=np.int32) 23 | cdef np.ndarray[np.float32_t, ndim=1] \ 24 | scores = dets[:, 4] 25 | cdef np.ndarray[np.int32_t, ndim=1] \ 26 | order = scores.argsort()[::-1].astype(np.int32) 27 | cdef np.ndarray[np.float32_t, ndim=2] \ 28 | sorted_dets = dets[order, :] 29 | _nms(&keep[0], &num_out, &sorted_dets[0, 0], boxes_num, boxes_dim, thresh, device_id) 30 | keep = keep[:num_out] 31 | return list(order[keep]) 32 | -------------------------------------------------------------------------------- /lib/nms/nms.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Relation Networks for Object Detection 3 | # Copyright (c) 2017 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Modified by Dazhi Cheng, Jiayuan Gu, Yuwen Xiong 6 | # -------------------------------------------------------- 7 | # Based on: 8 | # py-faster-rcnn 9 | # Copyright (c) 2016 by Contributors 10 | # Licence under The MIT License 11 | # py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn) 12 | # -------------------------------------------------------- 13 | 14 | 15 | import numpy as np 16 | 17 | from cpu_nms import cpu_nms 18 | from gpu_nms import gpu_nms 19 | 20 | 21 | def py_nms_wrapper(thresh): 22 | def _nms(dets): 23 | return nms(dets, thresh) 24 | return _nms 25 | 26 | 27 | def py_softnms_wrapper(thresh, max_dets=-1): 28 | def _nms(dets): 29 | return soft_nms(dets, thresh, max_dets) 30 | return _nms 31 | 32 | 33 | def cpu_nms_wrapper(thresh): 34 | def _nms(dets): 35 | return cpu_nms(dets, thresh) 36 | return _nms 37 | 38 | 39 | def gpu_nms_wrapper(thresh, device_id): 40 | def _nms(dets): 41 | return gpu_nms(dets, thresh, device_id) 42 | return _nms 43 | 44 | 45 | def nms(dets, thresh): 46 | """ 47 | greedily select boxes with high confidence and overlap with current maximum <= thresh 48 | rule out overlap >= thresh 49 | :param dets: [[x1, y1, x2, y2 score]] 50 | :param thresh: retain overlap < thresh 51 | 
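Example (illustrative): dets = np.array([[0, 0, 10, 10, 0.9], [1, 1, 11, 11, 0.8]]) with thresh = 0.5 returns [0]; the lower-scoring box overlaps the kept one with IoU = 100 / 142 ~ 0.70 and is suppressed.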
:return: indexes to keep 52 | """ 53 | if dets.shape[0] == 0: 54 | return [] 55 | 56 | x1 = dets[:, 0] 57 | y1 = dets[:, 1] 58 | x2 = dets[:, 2] 59 | y2 = dets[:, 3] 60 | scores = dets[:, 4] 61 | 62 | areas = (x2 - x1 + 1) * (y2 - y1 + 1) 63 | order = scores.argsort()[::-1] 64 | 65 | keep = [] 66 | while order.size > 0: 67 | i = order[0] 68 | keep.append(i) 69 | xx1 = np.maximum(x1[i], x1[order[1:]]) 70 | yy1 = np.maximum(y1[i], y1[order[1:]]) 71 | xx2 = np.minimum(x2[i], x2[order[1:]]) 72 | yy2 = np.minimum(y2[i], y2[order[1:]]) 73 | 74 | w = np.maximum(0.0, xx2 - xx1 + 1) 75 | h = np.maximum(0.0, yy2 - yy1 + 1) 76 | inter = w * h 77 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 78 | 79 | inds = np.where(ovr <= thresh)[0] 80 | order = order[inds + 1] 81 | 82 | return keep 83 | 84 | 85 | def rescore(overlap, scores, thresh, type='gaussian'): 86 | assert overlap.shape[0] == scores.shape[0] 87 | if type == 'linear': 88 | inds = np.where(overlap >= thresh)[0] 89 | scores[inds] = scores[inds] * (1 - overlap[inds]) 90 | else: 91 | scores = scores * np.exp(- overlap**2 / thresh) 92 | 93 | return scores 94 | 95 | 96 | def soft_nms(dets, thresh, max_dets): 97 | if dets.shape[0] == 0: 98 | return np.zeros((0, 5)) 99 | 100 | x1 = dets[:, 0] 101 | y1 = dets[:, 1] 102 | x2 = dets[:, 2] 103 | y2 = dets[:, 3] 104 | scores = dets[:, 4] 105 | 106 | areas = (x2 - x1 + 1) * (y2 - y1 + 1) 107 | order = scores.argsort()[::-1] 108 | scores = scores[order] 109 | 110 | if max_dets == -1: 111 | max_dets = order.size 112 | 113 | keep = np.zeros(max_dets, dtype=np.intp) 114 | keep_cnt = 0 115 | 116 | while order.size > 0 and keep_cnt < max_dets: 117 | i = order[0] 118 | dets[i, 4] = scores[0] 119 | xx1 = np.maximum(x1[i], x1[order[1:]]) 120 | yy1 = np.maximum(y1[i], y1[order[1:]]) 121 | xx2 = np.minimum(x2[i], x2[order[1:]]) 122 | yy2 = np.minimum(y2[i], y2[order[1:]]) 123 | 124 | w = np.maximum(0.0, xx2 - xx1 + 1) 125 | h = np.maximum(0.0, yy2 - yy1 + 1) 126 | inter = w * h 127 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 128 | 129 | order = order[1:] 130 | scores = rescore(ovr, scores[1:], thresh) 131 | 132 | tmp = scores.argsort()[::-1] 133 | order = order[tmp] 134 | scores = scores[tmp] 135 | 136 | keep[keep_cnt] = i 137 | keep_cnt += 1 138 | 139 | keep = keep[:keep_cnt] 140 | dets = dets[keep, :] 141 | return dets 142 | -------------------------------------------------------------------------------- /lib/nms/nms_kernel.cu: -------------------------------------------------------------------------------- 1 | // ------------------------------------------------------------------ 2 | // Deformable Convolutional Networks 3 | // Copyright (c) 2015 Microsoft 4 | // Licensed under The MIT License 5 | // Modified from MATLAB Faster R-CNN (https://github.com/shaoqingren/faster_rcnn) 6 | // ------------------------------------------------------------------ 7 | 8 | #include "gpu_nms.hpp" 9 | #include <vector> 10 | #include <iostream> 11 | 12 | #define CUDA_CHECK(condition) \ 13 | /* Code block avoids redefinition of cudaError_t error */ \ 14 | do { \ 15 | cudaError_t error = condition; \ 16 | if (error != cudaSuccess) { \ 17 | std::cout << cudaGetErrorString(error) << std::endl; \ 18 | } \ 19 | } while (0) 20 | 21 | #define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0)) 22 | int const threadsPerBlock = sizeof(unsigned long long) * 8; 23 | 24 | __device__ inline float devIoU(float const * const a, float const * const b) { 25 | float left = max(a[0], b[0]), right = min(a[2], b[2]); 26 | float top = max(a[1], b[1]), bottom = min(a[3],
b[3]); 27 | float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f); 28 | float interS = width * height; 29 | float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1); 30 | float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1); 31 | return interS / (Sa + Sb - interS); 32 | } 33 | 34 | __global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh, 35 | const float *dev_boxes, unsigned long long *dev_mask) { 36 | const int row_start = blockIdx.y; 37 | const int col_start = blockIdx.x; 38 | 39 | // if (row_start > col_start) return; 40 | 41 | const int row_size = 42 | min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); 43 | const int col_size = 44 | min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); 45 | 46 | __shared__ float block_boxes[threadsPerBlock * 5]; 47 | if (threadIdx.x < col_size) { 48 | block_boxes[threadIdx.x * 5 + 0] = 49 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0]; 50 | block_boxes[threadIdx.x * 5 + 1] = 51 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1]; 52 | block_boxes[threadIdx.x * 5 + 2] = 53 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2]; 54 | block_boxes[threadIdx.x * 5 + 3] = 55 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3]; 56 | block_boxes[threadIdx.x * 5 + 4] = 57 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4]; 58 | } 59 | __syncthreads(); 60 | 61 | if (threadIdx.x < row_size) { 62 | const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; 63 | const float *cur_box = dev_boxes + cur_box_idx * 5; 64 | int i = 0; 65 | unsigned long long t = 0; 66 | int start = 0; 67 | if (row_start == col_start) { 68 | start = threadIdx.x + 1; 69 | } 70 | for (i = start; i < col_size; i++) { 71 | if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) { 72 | t |= 1ULL << i; 73 | } 74 | } 75 | const int col_blocks = DIVUP(n_boxes, threadsPerBlock); 76 | dev_mask[cur_box_idx * col_blocks + col_start] = t; 77 | } 78 | } 79 | 80 | void _set_device(int device_id) { 81 | int current_device; 82 | CUDA_CHECK(cudaGetDevice(¤t_device)); 83 | if (current_device == device_id) { 84 | return; 85 | } 86 | // The call to cudaSetDevice must come before any calls to Get, which 87 | // may perform initialization using the GPU. 
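// (Layout note on the suppression mask used below: dev_mask holds boxes_num rows of
// col_blocks 64-bit words; in row i, bit j of word b is set when box b*64+j overlaps
// box i above the threshold. The host loop in _nms ORs the rows of kept boxes into
// remv, so a box whose bit is already set there was suppressed by a better box.)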
88 | CUDA_CHECK(cudaSetDevice(device_id)); 89 | } 90 | 91 | void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num, 92 | int boxes_dim, float nms_overlap_thresh, int device_id) { 93 | _set_device(device_id); 94 | 95 | float* boxes_dev = NULL; 96 | unsigned long long* mask_dev = NULL; 97 | 98 | const int col_blocks = DIVUP(boxes_num, threadsPerBlock); 99 | 100 | CUDA_CHECK(cudaMalloc(&boxes_dev, 101 | boxes_num * boxes_dim * sizeof(float))); 102 | CUDA_CHECK(cudaMemcpy(boxes_dev, 103 | boxes_host, 104 | boxes_num * boxes_dim * sizeof(float), 105 | cudaMemcpyHostToDevice)); 106 | 107 | CUDA_CHECK(cudaMalloc(&mask_dev, 108 | boxes_num * col_blocks * sizeof(unsigned long long))); 109 | 110 | dim3 blocks(DIVUP(boxes_num, threadsPerBlock), 111 | DIVUP(boxes_num, threadsPerBlock)); 112 | dim3 threads(threadsPerBlock); 113 | nms_kernel<<<blocks, threads>>>(boxes_num, 114 | nms_overlap_thresh, 115 | boxes_dev, 116 | mask_dev); 117 | 118 | std::vector<unsigned long long> mask_host(boxes_num * col_blocks); 119 | CUDA_CHECK(cudaMemcpy(&mask_host[0], 120 | mask_dev, 121 | sizeof(unsigned long long) * boxes_num * col_blocks, 122 | cudaMemcpyDeviceToHost)); 123 | 124 | std::vector<unsigned long long> remv(col_blocks); 125 | memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks); 126 | 127 | int num_to_keep = 0; 128 | for (int i = 0; i < boxes_num; i++) { 129 | int nblock = i / threadsPerBlock; 130 | int inblock = i % threadsPerBlock; 131 | 132 | if (!(remv[nblock] & (1ULL << inblock))) { 133 | keep_out[num_to_keep++] = i; 134 | unsigned long long *p = &mask_host[0] + i * col_blocks; 135 | for (int j = nblock; j < col_blocks; j++) { 136 | remv[j] |= p[j]; 137 | } 138 | } 139 | } 140 | *num_out = num_to_keep; 141 | 142 | CUDA_CHECK(cudaFree(boxes_dev)); 143 | CUDA_CHECK(cudaFree(mask_dev)); 144 | } 145 | -------------------------------------------------------------------------------- /lib/nms/setup_linux.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Relation Networks for Object Detection 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Modified from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn) 6 | # -------------------------------------------------------- 7 | 8 | import os 9 | from os.path import join as pjoin 10 | from setuptools import setup 11 | from distutils.extension import Extension 12 | from Cython.Distutils import build_ext 13 | import numpy as np 14 | 15 | 16 | def find_in_path(name, path): 17 | "Find a file in a search path" 18 | # Adapted from 19 | # http://code.activestate.com/recipes/52224-find-a-file-given-a-search-path/ 20 | for dir in path.split(os.pathsep): 21 | binpath = pjoin(dir, name) 22 | if os.path.exists(binpath): 23 | return os.path.abspath(binpath) 24 | return None 25 | 26 | 27 | def locate_cuda(): 28 | """Locate the CUDA environment on the system 29 | Returns a dict with keys 'home', 'nvcc', 'include', and 'lib64' 30 | and values giving the absolute path to each directory. 31 | Starts by looking for the CUDAHOME env variable. If not found, everything 32 | is based on finding 'nvcc' in the PATH.
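For example (illustrative): with CUDAHOME=/usr/local/cuda it returns {'home': '/usr/local/cuda', 'nvcc': '/usr/local/cuda/bin/nvcc', 'include': '/usr/local/cuda/include', 'lib64': '/usr/local/cuda/lib64'}.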
33 | """ 34 | 35 | # first check if the CUDAHOME env variable is in use 36 | if 'CUDAHOME' in os.environ: 37 | home = os.environ['CUDAHOME'] 38 | nvcc = pjoin(home, 'bin', 'nvcc') 39 | else: 40 | # otherwise, search the PATH for NVCC 41 | default_path = pjoin(os.sep, 'usr', 'local', 'cuda', 'bin') 42 | nvcc = find_in_path('nvcc', os.environ['PATH'] + os.pathsep + default_path) 43 | if nvcc is None: 44 | raise EnvironmentError('The nvcc binary could not be ' 45 | 'located in your $PATH. Either add it to your path, or set $CUDAHOME') 46 | home = os.path.dirname(os.path.dirname(nvcc)) 47 | 48 | cudaconfig = {'home':home, 'nvcc':nvcc, 49 | 'include': pjoin(home, 'include'), 50 | 'lib64': pjoin(home, 'lib64')} 51 | for k, v in cudaconfig.iteritems(): 52 | if not os.path.exists(v): 53 | raise EnvironmentError('The CUDA %s path could not be located in %s' % (k, v)) 54 | 55 | return cudaconfig 56 | CUDA = locate_cuda() 57 | 58 | 59 | # Obtain the numpy include directory. This logic works across numpy versions. 60 | try: 61 | numpy_include = np.get_include() 62 | except AttributeError: 63 | numpy_include = np.get_numpy_include() 64 | 65 | 66 | def customize_compiler_for_nvcc(self): 67 | """inject deep into distutils to customize how the dispatch 68 | to gcc/nvcc works. 69 | If you subclass UnixCCompiler, it's not trivial to get your subclass 70 | injected in, and still have the right customizations (i.e. 71 | distutils.sysconfig.customize_compiler) run on it. So instead of going 72 | the OO route, I have this. Note, it's kindof like a wierd functional 73 | subclassing going on.""" 74 | 75 | # tell the compiler it can processes .cu 76 | self.src_extensions.append('.cu') 77 | 78 | # save references to the default compiler_so and _comple methods 79 | default_compiler_so = self.compiler_so 80 | super = self._compile 81 | 82 | # now redefine the _compile method. This gets executed for each 83 | # object but distutils doesn't have the ability to change compilers 84 | # based on source extension: we add it. 
85 | def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts): 86 | if os.path.splitext(src)[1] == '.cu': 87 | # use the cuda for .cu files 88 | self.set_executable('compiler_so', CUDA['nvcc']) 89 | # use only a subset of the extra_postargs, which are 1-1 translated 90 | # from the extra_compile_args in the Extension class 91 | postargs = extra_postargs['nvcc'] 92 | else: 93 | postargs = extra_postargs['gcc'] 94 | 95 | super(obj, src, ext, cc_args, postargs, pp_opts) 96 | # reset the default compiler_so, which we might have changed for cuda 97 | self.compiler_so = default_compiler_so 98 | 99 | # inject our redefined _compile method into the class 100 | self._compile = _compile 101 | 102 | 103 | # run the customize_compiler 104 | class custom_build_ext(build_ext): 105 | def build_extensions(self): 106 | customize_compiler_for_nvcc(self.compiler) 107 | build_ext.build_extensions(self) 108 | 109 | 110 | ext_modules = [ 111 | Extension( 112 | "cpu_nms", 113 | ["cpu_nms.pyx"], 114 | extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]}, 115 | include_dirs = [numpy_include] 116 | ), 117 | Extension('gpu_nms', 118 | ['nms_kernel.cu', 'gpu_nms.pyx'], 119 | library_dirs=[CUDA['lib64']], 120 | libraries=['cudart'], 121 | language='c++', 122 | runtime_library_dirs=[CUDA['lib64']], 123 | # this syntax is specific to this build system 124 | # we're only going to use certain compiler args with nvcc and not with 125 | # gcc the implementation of this trick is in customize_compiler() below 126 | extra_compile_args={'gcc': ["-Wno-unused-function"], 127 | 'nvcc': ['-arch=sm_35', 128 | '--ptxas-options=-v', 129 | '-c', 130 | '--compiler-options', 131 | "'-fPIC'"]}, 132 | include_dirs = [numpy_include, CUDA['include']] 133 | ), 134 | ] 135 | 136 | setup( 137 | name='nms', 138 | ext_modules=ext_modules, 139 | # inject our custom trigger 140 | cmdclass={'build_ext': custom_build_ext}, 141 | ) 142 | -------------------------------------------------------------------------------- /lib/rpn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/msracver/Relation-Networks-for-Object-Detection/e83e911d828e3c86624ce0aeb8d742d5ee67d5ba/lib/rpn/__init__.py -------------------------------------------------------------------------------- /lib/rpn/generate_anchor.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Relation Networks for Object Detection 3 | # Copyright (c) 2017 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Modified by Jiayuan Gu, Dazhi Cheng, Yuwen Xiong 6 | # -------------------------------------------------------- 7 | # Based on: 8 | # MX-RCNN 9 | # Copyright (c) 2016 by Contributors 10 | # Licence under The Apache 2.0 License 11 | # https://github.com/ijkguo/mx-rcnn/ 12 | # -------------------------------------------------------- 13 | 14 | 15 | """ 16 | Generate base anchors on index 0 17 | """ 18 | 19 | import numpy as np 20 | 21 | 22 | def generate_anchors(base_size=16, ratios=[0.5, 1, 2], 23 | scales=2 ** np.arange(3, 6)): 24 | """ 25 | Generate anchor (reference) windows by enumerating aspect ratios X 26 | scales wrt a reference (0, 0, 15, 15) window. 
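With the defaults this yields 9 anchors; for reference (values as in py-faster-rcnn), the first (ratio 0.5, scale 8) is [-84, -40, 99, 55] and the last (ratio 2, scale 32) is [-168, -344, 183, 359].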
27 | """ 28 | 29 | base_anchor = np.array([1, 1, base_size, base_size]) - 1 30 | ratio_anchors = _ratio_enum(base_anchor, ratios) 31 | anchors = np.vstack([_scale_enum(ratio_anchors[i, :], scales) 32 | for i in xrange(ratio_anchors.shape[0])]) 33 | return anchors 34 | 35 | 36 | def _whctrs(anchor): 37 | """ 38 | Return width, height, x center, and y center for an anchor (window). 39 | """ 40 | 41 | w = anchor[2] - anchor[0] + 1 42 | h = anchor[3] - anchor[1] + 1 43 | x_ctr = anchor[0] + 0.5 * (w - 1) 44 | y_ctr = anchor[1] + 0.5 * (h - 1) 45 | return w, h, x_ctr, y_ctr 46 | 47 | 48 | def _mkanchors(ws, hs, x_ctr, y_ctr): 49 | """ 50 | Given a vector of widths (ws) and heights (hs) around a center 51 | (x_ctr, y_ctr), output a set of anchors (windows). 52 | """ 53 | 54 | ws = ws[:, np.newaxis] 55 | hs = hs[:, np.newaxis] 56 | anchors = np.hstack((x_ctr - 0.5 * (ws - 1), 57 | y_ctr - 0.5 * (hs - 1), 58 | x_ctr + 0.5 * (ws - 1), 59 | y_ctr + 0.5 * (hs - 1))) 60 | return anchors 61 | 62 | 63 | def _ratio_enum(anchor, ratios): 64 | """ 65 | Enumerate a set of anchors for each aspect ratio wrt an anchor. 66 | """ 67 | 68 | w, h, x_ctr, y_ctr = _whctrs(anchor) 69 | size = w * h 70 | size_ratios = size / ratios 71 | ws = np.round(np.sqrt(size_ratios)) 72 | hs = np.round(ws * ratios) 73 | anchors = _mkanchors(ws, hs, x_ctr, y_ctr) 74 | return anchors 75 | 76 | 77 | def _scale_enum(anchor, scales): 78 | """ 79 | Enumerate a set of anchors for each scale wrt an anchor. 80 | """ 81 | 82 | w, h, x_ctr, y_ctr = _whctrs(anchor) 83 | ws = w * scales 84 | hs = h * scales 85 | anchors = _mkanchors(ws, hs, x_ctr, y_ctr) 86 | return anchors 87 | -------------------------------------------------------------------------------- /lib/utils/PrefetchingIter.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Relation Networks for Object Detection 3 | # Copyright (c) 2017 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Modified by Jiayuan Gu, Dazhi Cheng, Han Hu, Yuwen Xiong 6 | # -------------------------------------------------------- 7 | # Based on: 8 | # MX-RCNN 9 | # Copyright (c) 2016 by Contributors 10 | # Licence under The Apache 2.0 License 11 | # https://github.com/ijkguo/mx-rcnn/ 12 | # -------------------------------------------------------- 13 | 14 | import mxnet as mx 15 | from mxnet.io import DataDesc, DataBatch 16 | import threading 17 | 18 | 19 | class PrefetchingIterV2(mx.io.DataIter): 20 | """Base class for prefetching iterators. Takes one or more DataIters ( 21 | or any class with "reset" and "next" methods) and combine them with 22 | prefetching. For example: 23 | 24 | Parameters 25 | ---------- 26 | iters : DataIter or list of DataIter 27 | one or more DataIters (or any class with "reset" and "next" methods) 28 | rename_data : None or list of dict 29 | i-th element is a renaming map for i-th iter, in the form of 30 | {'original_name' : 'new_name'}. 
Should have one entry for each entry 31 | in iter[i].provide_data 32 | rename_label : None or list of dict 33 | Similar to rename_data 34 | 35 | Examples 36 | -------- 37 | iter = PrefetchingIter([NDArrayIter({'data': X1}), NDArrayIter({'data': X2})], 38 | rename_data=[{'data': 'data1'}, {'data': 'data2'}]) 39 | """ 40 | def __init__(self, iters, rename_data=None, rename_label=None, prefetch_n_iter=4): 41 | super(PrefetchingIterV2, self).__init__() 42 | if not isinstance(iters, list): 43 | iters = [iters] 44 | self.n_iter = len(iters) 45 | self.prefetch_n_iter = prefetch_n_iter 46 | assert self.n_iter == 1, "Our prefetching iter only support 1 DataIter" 47 | self.iters = iters 48 | self.rename_data = rename_data 49 | self.rename_label = rename_label 50 | self.batch_size = len(self.provide_data) * self.provide_data[0][0][1][0] 51 | self.data_ready = [threading.Event() for i in range(self.prefetch_n_iter)] 52 | self.data_taken = [threading.Event() for i in range(self.prefetch_n_iter)] 53 | 54 | self.cur_id = 0 55 | for e in self.data_taken: 56 | e.set() 57 | self.started = True 58 | self.current_batch = None 59 | self.next_batch = [[None for _ in range(self.n_iter)] for _ in range(self.prefetch_n_iter)] 60 | 61 | def prefetch_func(self, i): 62 | """Thread entry""" 63 | while True: 64 | self.data_taken[i].wait() 65 | if not self.started: 66 | break 67 | try: 68 | self.next_batch[i][0] = self.iters[0].next() 69 | except StopIteration: 70 | self.next_batch[i][0] = None 71 | self.data_taken[i].clear() 72 | self.data_ready[i].set() 73 | self.prefetch_threads = [threading.Thread(target=prefetch_func, args=[self, i]) \ 74 | for i in range(self.prefetch_n_iter)] 75 | for thread in self.prefetch_threads: 76 | thread.setDaemon(True) 77 | thread.start() 78 | 79 | def __del__(self): 80 | self.started = False 81 | for e in self.data_taken: 82 | e.set() 83 | for thread in self.prefetch_threads: 84 | thread.join() 85 | 86 | @property 87 | def provide_data(self): 88 | """The name and shape of data provided by this iterator""" 89 | if self.rename_data is None: 90 | return sum([i.provide_data for i in self.iters], []) 91 | else: 92 | return sum([[ 93 | DataDesc(r[x.name], x.shape, x.dtype) 94 | if isinstance(x, DataDesc) else DataDesc(*x) 95 | for x in i.provide_data 96 | ] for r, i in zip(self.rename_data, self.iters)], []) 97 | 98 | @property 99 | def provide_label(self): 100 | """The name and shape of label provided by this iterator""" 101 | if self.rename_label is None: 102 | return sum([i.provide_label for i in self.iters], []) 103 | else: 104 | return sum([[ 105 | DataDesc(r[x.name], x.shape, x.dtype) 106 | if isinstance(x, DataDesc) else DataDesc(*x) 107 | for x in i.provide_label 108 | ] for r, i in zip(self.rename_label, self.iters)], []) 109 | 110 | def reset(self): 111 | for e in self.data_ready: 112 | e.wait() 113 | for i in self.iters: 114 | i.reset() 115 | for e in self.data_ready: 116 | e.clear() 117 | for e in self.data_taken: 118 | e.set() 119 | 120 | def iter_next(self): 121 | self.data_ready[self.cur_id].wait() 122 | if self.next_batch[self.cur_id][0] is None: 123 | self.cur_id = (self.cur_id + 1) % self.prefetch_n_iter 124 | return False 125 | else: 126 | self.current_batch = self.next_batch[self.cur_id][0] 127 | self.data_ready[self.cur_id].clear() 128 | self.data_taken[self.cur_id].set() 129 | 130 | self.cur_id = (self.cur_id + 1) % self.prefetch_n_iter 131 | return True 132 | 133 | def next(self): 134 | if self.iter_next(): 135 | return self.current_batch 136 | else: 137 | raise 
StopIteration 138 | 139 | def getdata(self): 140 | return self.current_batch.data 141 | 142 | def getlabel(self): 143 | return self.current_batch.label 144 | 145 | def getindex(self): 146 | return self.current_batch.index 147 | 148 | def getpad(self): 149 | return self.current_batch.pad 150 | 151 | -------------------------------------------------------------------------------- /lib/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/msracver/Relation-Networks-for-Object-Detection/e83e911d828e3c86624ce0aeb8d742d5ee67d5ba/lib/utils/__init__.py -------------------------------------------------------------------------------- /lib/utils/create_logger.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Relation Networks for Object Detection 3 | # Copyright (c) 2017 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Bin Xiao 6 | # -------------------------------------------------------- 7 | 8 | 9 | import os 10 | import logging 11 | import time 12 | 13 | def create_logger(root_output_path, cfg, image_set): 14 | # set up logger 15 | if not os.path.exists(root_output_path): 16 | os.makedirs(root_output_path) 17 | assert os.path.exists(root_output_path), '{} does not exist'.format(root_output_path) 18 | 19 | cfg_name = os.path.basename(cfg).split('.')[0] 20 | config_output_path = os.path.join(root_output_path, '{}'.format(cfg_name)) 21 | if not os.path.exists(config_output_path): 22 | os.makedirs(config_output_path) 23 | 24 | image_sets = [iset for iset in image_set.split('+')] 25 | final_output_path = os.path.join(config_output_path, '{}'.format('_'.join(image_sets))) 26 | if not os.path.exists(final_output_path): 27 | os.makedirs(final_output_path) 28 | 29 | log_file = '{}_{}.log'.format(cfg_name, time.strftime('%Y-%m-%d-%H-%M')) 30 | head = '%(asctime)-15s %(message)s' 31 | logging.basicConfig(filename=os.path.join(final_output_path, log_file), format=head) 32 | logger = logging.getLogger() 33 | logger.setLevel(logging.INFO) 34 | 35 | return logger, final_output_path 36 | 37 | -------------------------------------------------------------------------------- /lib/utils/image.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Relation Networks for Object Detection 3 | # Copyright (c) 2017 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Yuwen Xiong 6 | # -------------------------------------------------------- 7 | 8 | 9 | import numpy as np 10 | import os 11 | import cv2 12 | import random 13 | from PIL import Image 14 | from bbox.bbox_transform import clip_boxes 15 | 16 | 17 | # TODO: These two functions should be merged with the individual data loaders 18 | def get_image(roidb, config): 19 | """ 20 | preprocess image and return processed roidb 21 | :param roidb: a list of roidb 22 | :return: list of img as in mxnet format 23 | roidb add new item['im_info'] 24 | 0 --- x (width, second dim of im) 25 | | 26 | y (height, first dim of im) 27 | """ 28 | num_images = len(roidb) 29 | processed_ims = [] 30 | processed_roidb = [] 31 | for i in range(num_images): 32 | roi_rec = roidb[i] 33 | assert os.path.exists(roi_rec['image']), '{} does not exist'.format(roi_rec['image']) 34 | im = cv2.imread(roi_rec['image'],
cv2.IMREAD_COLOR|cv2.IMREAD_IGNORE_ORIENTATION) 35 | if roidb[i]['flipped']: 36 | im = im[:, ::-1, :] 37 | new_rec = roi_rec.copy() 38 | scale_ind = random.randrange(len(config.SCALES)) 39 | target_size = config.SCALES[scale_ind][0] 40 | max_size = config.SCALES[scale_ind][1] 41 | im, im_scale = resize(im, target_size, max_size, stride=config.network.IMAGE_STRIDE) 42 | im_tensor = transform(im, config.network.PIXEL_MEANS) 43 | processed_ims.append(im_tensor) 44 | im_info = [im_tensor.shape[2], im_tensor.shape[3], im_scale] 45 | new_rec['boxes'] = clip_boxes(np.round(roi_rec['boxes'].copy() * im_scale), im_info[:2]) 46 | new_rec['im_info'] = im_info 47 | processed_roidb.append(new_rec) 48 | return processed_ims, processed_roidb 49 | 50 | 51 | def get_segmentation_image(segdb, config): 52 | """ 53 | preprocess image and return segdb 54 | :param segdb: a list of segdb 55 | :return: list of img as mxnet format 56 | """ 57 | num_images = len(segdb) 58 | assert num_images > 0, 'No images' 59 | processed_ims = [] 60 | processed_segdb = [] 61 | processed_seg_cls_gt = [] 62 | for i in range(num_images): 63 | seg_rec = segdb[i] 64 | assert os.path.exists(seg_rec['image']), '{} does not exist'.format(seg_rec['image']) 65 | im = np.array(cv2.imread(seg_rec['image'])) 66 | 67 | new_rec = seg_rec.copy() 68 | 69 | scale_ind = random.randrange(len(config.SCALES)) 70 | target_size = config.SCALES[scale_ind][0] 71 | max_size = config.SCALES[scale_ind][1] 72 | im, im_scale = resize(im, target_size, max_size, stride=config.network.IMAGE_STRIDE) 73 | im_tensor = transform(im, config.network.PIXEL_MEANS) 74 | im_info = [im_tensor.shape[2], im_tensor.shape[3], im_scale] 75 | new_rec['im_info'] = im_info 76 | 77 | seg_cls_gt = np.array(Image.open(seg_rec['seg_cls_path'])) 78 | seg_cls_gt, seg_cls_gt_scale = resize( 79 | seg_cls_gt, target_size, max_size, stride=config.network.IMAGE_STRIDE, interpolation=cv2.INTER_NEAREST) 80 | seg_cls_gt_tensor = transform_seg_gt(seg_cls_gt) 81 | 82 | processed_ims.append(im_tensor) 83 | processed_segdb.append(new_rec) 84 | processed_seg_cls_gt.append(seg_cls_gt_tensor) 85 | 86 | return processed_ims, processed_seg_cls_gt, processed_segdb 87 | 88 | def resize(im, target_size, max_size, stride=0, interpolation=cv2.INTER_LINEAR): 89 | """ 90 | only resize input image to target size and return scale 91 | :param im: BGR image input by opencv 92 | :param target_size: one dimensional size (the short side) 93 | :param max_size: one dimensional max size (the long side) 94 | :param stride: if given, pad the image to a multiple of the designated stride 95 | :param interpolation: interpolation method used to resize the image 96 | :return: resized (and possibly padded) image and im_scale 97 | """ 98 | im_shape = im.shape 99 | im_size_min = np.min(im_shape[0:2]) 100 | im_size_max = np.max(im_shape[0:2]) 101 | im_scale = float(target_size) / float(im_size_min) 102 | # prevent bigger axis from being more than max_size: 103 | if np.round(im_scale * im_size_max) > max_size: 104 | im_scale = float(max_size) / float(im_size_max) 105 | im = cv2.resize(im, None, None, fx=im_scale, fy=im_scale, interpolation=interpolation) 106 | 107 | if stride == 0: 108 | return im, im_scale 109 | else: 110 | # pad to a multiple of stride 111 | im_height = int(np.ceil(im.shape[0] / float(stride)) * stride) 112 | im_width = int(np.ceil(im.shape[1] / float(stride)) * stride) 113 | im_channel = im.shape[2] 114 | padded_im = np.zeros((im_height, im_width, im_channel)) 115 | padded_im[:im.shape[0], :im.shape[1], :] = im 116 | return padded_im, im_scale 117 | 118 | def
transform(im, pixel_means): 119 | """ 120 | transform into mxnet tensor 121 | subtract pixel means and transform to the correct format 122 | :param im: [height, width, channel] in BGR 123 | :param pixel_means: [B, G, R pixel means] 124 | :return: [batch, channel, height, width] 125 | """ 126 | im_tensor = np.zeros((1, 3, im.shape[0], im.shape[1])) 127 | for i in range(3): 128 | im_tensor[0, i, :, :] = im[:, :, 2 - i] - pixel_means[2 - i] 129 | return im_tensor 130 | 131 | def transform_seg_gt(gt): 132 | """ 133 | transform segmentation gt image into mxnet tensor 134 | :param gt: [height, width, channel = 1] 135 | :return: [batch, channel = 1, height, width] 136 | """ 137 | gt_tensor = np.zeros((1, 1, gt.shape[0], gt.shape[1])) 138 | gt_tensor[0, 0, :, :] = gt[:, :] 139 | 140 | return gt_tensor 141 | 142 | def transform_inverse(im_tensor, pixel_means): 143 | """ 144 | transform from mxnet im_tensor to ordinary RGB image 145 | im_tensor is limited to one image 146 | :param im_tensor: [batch, channel, height, width] 147 | :param pixel_means: [B, G, R pixel means] 148 | :return: im [height, width, channel(RGB)] 149 | """ 150 | assert im_tensor.shape[0] == 1 151 | im_tensor = im_tensor.copy() 152 | # put channel back 153 | channel_swap = (0, 2, 3, 1) 154 | im_tensor = im_tensor.transpose(channel_swap) 155 | im = im_tensor[0] 156 | assert im.shape[2] == 3 157 | im += pixel_means[[2, 1, 0]] 158 | im = im.astype(np.uint8) 159 | return im 160 | 161 | def tensor_vstack(tensor_list, pad=0): 162 | """ 163 | vertically stack tensors 164 | :param tensor_list: list of tensor to be stacked vertically 165 | :param pad: label to pad with 166 | :return: tensor with max shape 167 | """ 168 | ndim = len(tensor_list[0].shape) 169 | dtype = tensor_list[0].dtype 170 | islice = tensor_list[0].shape[0] 171 | dimensions = [] 172 | first_dim = sum([tensor.shape[0] for tensor in tensor_list]) 173 | dimensions.append(first_dim) 174 | for dim in range(1, ndim): 175 | dimensions.append(max([tensor.shape[dim] for tensor in tensor_list])) 176 | if pad == 0: 177 | all_tensor = np.zeros(tuple(dimensions), dtype=dtype) 178 | elif pad == 1: 179 | all_tensor = np.ones(tuple(dimensions), dtype=dtype) 180 | else: 181 | all_tensor = np.full(tuple(dimensions), pad, dtype=dtype) 182 | if ndim == 1: 183 | for ind, tensor in enumerate(tensor_list): 184 | all_tensor[ind*islice:(ind+1)*islice] = tensor 185 | elif ndim == 2: 186 | for ind, tensor in enumerate(tensor_list): 187 | all_tensor[ind*islice:(ind+1)*islice, :tensor.shape[1]] = tensor 188 | elif ndim == 3: 189 | for ind, tensor in enumerate(tensor_list): 190 | all_tensor[ind*islice:(ind+1)*islice, :tensor.shape[1], :tensor.shape[2]] = tensor 191 | elif ndim == 4: 192 | for ind, tensor in enumerate(tensor_list): 193 | all_tensor[ind*islice:(ind+1)*islice, :tensor.shape[1], :tensor.shape[2], :tensor.shape[3]] = tensor 194 | else: 195 | raise Exception('Sorry, unimplemented.') 196 | return all_tensor 197 | -------------------------------------------------------------------------------- /lib/utils/load_data.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Relation Networks for Object Detection 3 | # Copyright (c) 2017 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Modified by Dazhi Cheng, Jiayuan Gu 6 | # Written by Yuwen Xiong 7 | # -------------------------------------------------------- 8 | 9 | 10 | import numpy as np 11 | from dataset import * 12 |
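# Usage sketch (illustrative; the dataset name and paths here are assumptions,
# not values taken from this repo's configs):
#   roidb = load_gt_roidb('coco', 'train2014', './data', './data/coco', flip=True)
#   roidb = filter_roidb(merge_roidb([roidb]), config)
# 'coco' must resolve to a dataset class brought in by `from dataset import *`
# above; merge_roidb and filter_roidb below are how the training entry points
# combine and prune the per-image records.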
13 | 14 | def load_gt_roidb(dataset_name, image_set_name, root_path, dataset_path, result_path=None, 15 | flip=False): 16 | """ load ground truth roidb """ 17 | imdb = eval(dataset_name)(image_set_name, root_path, dataset_path, result_path) 18 | roidb = imdb.gt_roidb() 19 | if flip: 20 | roidb = imdb.append_flipped_images(roidb) 21 | return roidb 22 | 23 | 24 | def load_proposal_roidb(dataset_name, image_set_name, root_path, dataset_path, result_path=None, rpn_path=None, 25 | proposal='rpn', append_gt=True, flip=False, top_roi=-1): 26 | """ load proposal roidb (append_gt when training) """ 27 | imdb = eval(dataset_name)(image_set_name, root_path, dataset_path, result_path, rpn_path) 28 | 29 | gt_roidb = imdb.gt_roidb() 30 | roidb = eval('imdb.' + proposal + '_roidb')(gt_roidb, append_gt, top_roi) 31 | if flip: 32 | roidb = imdb.append_flipped_images(roidb) 33 | return roidb 34 | 35 | 36 | 37 | def merge_roidb(roidbs): 38 | """ roidbs are lists; concatenate them together """ 39 | roidb = roidbs[0] 40 | for r in roidbs[1:]: 41 | roidb.extend(r) 42 | return roidb 43 | 44 | 45 | def filter_roidb(roidb, config): 46 | """ remove roidb entries without usable rois """ 47 | 48 | def is_valid(entry): 49 | """ valid images have at least 1 fg or bg roi """ 50 | 51 | if all(entry['gt_classes'] == 0): 52 | valid = False 53 | else: 54 | overlaps = entry['max_overlaps'] 55 | fg_inds = np.where(overlaps >= config.TRAIN.FG_THRESH)[0] 56 | bg_inds = np.where((overlaps < config.TRAIN.BG_THRESH_HI) & (overlaps >= config.TRAIN.BG_THRESH_LO))[0] 57 | valid = len(fg_inds) > 0 or len(bg_inds) > 0 58 | return valid 59 | 60 | num = len(roidb) 61 | filtered_roidb = [entry for entry in roidb if is_valid(entry)] 62 | num_after = len(filtered_roidb) 63 | print('filtered %d roidb entries: %d -> %d' % (num - num_after, num, num_after)) 64 | 65 | return filtered_roidb 66 | -------------------------------------------------------------------------------- /lib/utils/load_model.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Relation Networks for Object Detection 3 | # Copyright (c) 2017 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Yuwen Xiong 6 | # -------------------------------------------------------- 7 | 8 | 9 | import mxnet as mx 10 | 11 | 12 | def load_checkpoint(prefix, epoch): 13 | """ 14 | Load model checkpoint from file. 15 | :param prefix: Prefix of model name. 16 | :param epoch: Epoch number of model we would like to load. 17 | :return: (arg_params, aux_params) 18 | arg_params : dict of str to NDArray 19 | Model parameter, dict of name to NDArray of net's weights. 20 | aux_params : dict of str to NDArray 21 | Model parameter, dict of name to NDArray of net's auxiliary states.
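Example (illustrative; the prefix is hypothetical): load_checkpoint('model/rcnn_coco', 8)
reads 'model/rcnn_coco-0008.params', following the '%s-%04d.params' naming used below.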
22 | """ 23 | save_dict = mx.nd.load('%s-%04d.params' % (prefix, epoch)) 24 | arg_params = {} 25 | aux_params = {} 26 | for k, v in save_dict.items(): 27 | tp, name = k.split(':', 1) 28 | if tp == 'arg': 29 | arg_params[name] = v 30 | if tp == 'aux': 31 | aux_params[name] = v 32 | return arg_params, aux_params 33 | 34 | 35 | def convert_context(params, ctx): 36 | """ 37 | :param params: dict of str to NDArray 38 | :param ctx: the context to convert to 39 | :return: dict of str of NDArray with context ctx 40 | """ 41 | new_params = dict() 42 | for k, v in params.items(): 43 | new_params[k] = v.as_in_context(ctx) 44 | return new_params 45 | 46 | 47 | def load_param(prefix, epoch, convert=False, ctx=None, process=False): 48 | """ 49 | wrapper for load checkpoint 50 | :param prefix: Prefix of model name. 51 | :param epoch: Epoch number of model we would like to load. 52 | :param convert: reference model should be converted to GPU NDArray first 53 | :param ctx: if convert then ctx must be designated. 54 | :param process: model should drop any test 55 | :return: (arg_params, aux_params) 56 | """ 57 | arg_params, aux_params = load_checkpoint(prefix, epoch) 58 | if convert: 59 | if ctx is None: 60 | ctx = mx.cpu() 61 | arg_params = convert_context(arg_params, ctx) 62 | aux_params = convert_context(aux_params, ctx) 63 | if process: 64 | tests = [k for k in arg_params.keys() if '_test' in k] 65 | for test in tests: 66 | arg_params[test.replace('_test', '')] = arg_params.pop(test) 67 | return arg_params, aux_params 68 | -------------------------------------------------------------------------------- /lib/utils/lr_scheduler.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Relation Networks for Object Detection 3 | # Copyright (c) 2017 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Yuwen Xiong 6 | # -------------------------------------------------------- 7 | 8 | import logging 9 | from mxnet.lr_scheduler import LRScheduler 10 | 11 | class WarmupMultiFactorScheduler(LRScheduler): 12 | """Reduce learning rate in factor at steps specified in a list 13 | 14 | Assume the weight has been updated by n times, then the learning rate will 15 | be 16 | 17 | base_lr * factor^(sum((step/n)<=1)) # step is an array 18 | 19 | Parameters 20 | ---------- 21 | step: list of int 22 | schedule learning rate after n updates 23 | factor: float 24 | the factor for reducing the learning rate 25 | """ 26 | def __init__(self, step, factor=1, warmup=False, warmup_lr=0, warmup_step=0): 27 | super(WarmupMultiFactorScheduler, self).__init__() 28 | assert isinstance(step, list) and len(step) >= 1 29 | for i, _step in enumerate(step): 30 | if i != 0 and step[i] <= step[i-1]: 31 | raise ValueError("Schedule step must be an increasing integer list") 32 | if _step < 1: 33 | raise ValueError("Schedule step must be greater or equal than 1 round") 34 | if factor > 1.0: 35 | raise ValueError("Factor must be no more than 1 to make lr reduce") 36 | self.step = step 37 | self.cur_step_ind = 0 38 | self.factor = factor 39 | self.count = 0 40 | self.warmup = warmup 41 | self.warmup_lr = warmup_lr 42 | self.warmup_step = warmup_step 43 | 44 | def __call__(self, num_update): 45 | """ 46 | Call to schedule current learning rate 47 | 48 | Parameters 49 | ---------- 50 | num_update: int 51 | the maximal number of updates applied to a weight. 
52 | """ 53 | 54 | # NOTE: use while rather than if (for continuing training via load_epoch) 55 | if self.warmup and num_update < self.warmup_step: 56 | return self.warmup_lr 57 | while self.cur_step_ind <= len(self.step)-1: 58 | if num_update > self.step[self.cur_step_ind]: 59 | self.count = self.step[self.cur_step_ind] 60 | self.cur_step_ind += 1 61 | self.base_lr *= self.factor 62 | logging.info("Update[%d]: Change learning rate to %0.5e", 63 | num_update, self.base_lr) 64 | else: 65 | return self.base_lr 66 | return self.base_lr 67 | -------------------------------------------------------------------------------- /lib/utils/symbol.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Relation Networks for Object Detection 3 | # Copyright (c) 2017 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Modified by Dazhi Cheng, Jiayuan Gu 6 | # Written by Yuwen Xiong 7 | # -------------------------------------------------------- 8 | 9 | import numpy as np 10 | class Symbol: 11 | def __init__(self): 12 | self.arg_shape_dict = None 13 | self.out_shape_dict = None 14 | self.aux_shape_dict = None 15 | self.sym = None 16 | 17 | @property 18 | def symbol(self): 19 | return self.sym 20 | 21 | def get_symbol(self, cfg, is_train=True): 22 | """ 23 | return a generated symbol, it also need to be assigned to self.sym 24 | """ 25 | raise NotImplementedError() 26 | 27 | def init_weights(self, cfg, arg_params, aux_params): 28 | raise NotImplementedError() 29 | 30 | def get_msra_std(self, shape): 31 | fan_in = float(shape[1]) 32 | if len(shape) > 2: 33 | fan_in *= np.prod(shape[2:]) 34 | print(np.sqrt(2 / fan_in)) 35 | return np.sqrt(2 / fan_in) 36 | 37 | def infer_shape(self, data_shape_dict): 38 | # infer shape 39 | arg_shape, out_shape, aux_shape = self.sym.infer_shape(**data_shape_dict) 40 | self.arg_shape_dict = dict(zip(self.sym.list_arguments(), arg_shape)) 41 | self.out_shape_dict = dict(zip(self.sym.list_outputs(), out_shape)) 42 | self.aux_shape_dict = dict(zip(self.sym.list_auxiliary_states(), aux_shape)) 43 | 44 | def check_parameter_shapes(self, arg_params, aux_params, data_shape_dict, is_train=True): 45 | for k in self.sym.list_arguments(): 46 | if k in data_shape_dict or (False if is_train else 'label' in k): 47 | continue 48 | assert k in arg_params, k + ' not initialized' 49 | assert arg_params[k].shape == self.arg_shape_dict[k], \ 50 | 'shape inconsistent for ' + k + ' inferred ' + str(self.arg_shape_dict[k]) + ' provided ' + str( 51 | arg_params[k].shape) 52 | for k in self.sym.list_auxiliary_states(): 53 | assert k in aux_params, k + ' not initialized' 54 | assert aux_params[k].shape == self.aux_shape_dict[k], \ 55 | 'shape inconsistent for ' + k + ' inferred ' + str(self.aux_shape_dict[k]) + ' provided ' + str( 56 | aux_params[k].shape) 57 | -------------------------------------------------------------------------------- /relation_rcnn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/msracver/Relation-Networks-for-Object-Detection/e83e911d828e3c86624ce0aeb8d742d5ee67d5ba/relation_rcnn/__init__.py -------------------------------------------------------------------------------- /relation_rcnn/_init_paths.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Relation Networks for Object 
Detection 3 | # Copyright (c) 2017 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Jiayuan Gu, Dazhi Cheng 6 | # -------------------------------------------------------- 7 | 8 | import os.path as osp 9 | import sys 10 | 11 | def add_path(path): 12 | if path not in sys.path: 13 | sys.path.insert(0, path) 14 | 15 | this_dir = osp.dirname(__file__) 16 | 17 | lib_path = osp.join(this_dir, '..', 'lib') 18 | add_path(lib_path) 19 | -------------------------------------------------------------------------------- /relation_rcnn/config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/msracver/Relation-Networks-for-Object-Detection/e83e911d828e3c86624ce0aeb8d742d5ee67d5ba/relation_rcnn/config/__init__.py -------------------------------------------------------------------------------- /relation_rcnn/config/config.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Relation Networks for Object Detection 3 | # Copyright (c) 2017 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Modified by Dazhi Cheng, Jiayuan Gu, Yuwen Xiong, Bin Xiao 6 | # -------------------------------------------------------- 7 | # Based on: 8 | # py-faster-rcnn 9 | # Copyright (c) 2016 by Contributors 10 | # Licence under The MIT License 11 | # py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn) 12 | # -------------------------------------------------------- 13 | 14 | import yaml 15 | import numpy as np 16 | from easydict import EasyDict as edict 17 | 18 | config = edict() 19 | 20 | config.MXNET_VERSION = '' 21 | config.output_path = '' 22 | config.symbol = '' 23 | config.gpus = '' 24 | config.CLASS_AGNOSTIC = True 25 | config.SCALES = [(600, 1000)] # first is scale (the shorter side); second is max size 26 | 27 | # default training 28 | config.default = edict() 29 | config.default.frequent = 20 30 | config.default.kvstore = 'device' 31 | 32 | # network related params 33 | config.network = edict() 34 | config.network.pretrained = '' 35 | config.network.pretrained_epoch = 0 36 | config.network.PIXEL_MEANS = np.array([0, 0, 0]) 37 | config.network.IMAGE_STRIDE = 0 38 | config.network.RPN_FEAT_STRIDE = 16 39 | config.network.RCNN_FEAT_STRIDE = 16 40 | config.network.FIXED_PARAMS = ['gamma', 'beta'] 41 | config.network.FIXED_PARAMS_SHARED = ['gamma', 'beta'] 42 | config.network.ANCHOR_SCALES = (8, 16, 32) 43 | config.network.ANCHOR_RATIOS = (0.5, 1, 2) 44 | config.network.NUM_ANCHORS = len(config.network.ANCHOR_SCALES) * len(config.network.ANCHOR_RATIOS) 45 | config.network.ROIDispatch = False 46 | config.network.USE_NONGT_INDEX = False 47 | config.network.NMS_TARGET_THRESH = '0.5' 48 | 49 | # dataset related params 50 | config.dataset = edict() 51 | config.dataset.dataset = 'PascalVOC' 52 | config.dataset.image_set = '2007_trainval' 53 | config.dataset.test_image_set = '2007_test' 54 | config.dataset.root_path = './data' 55 | config.dataset.dataset_path = './data/VOCdevkit' 56 | config.dataset.NUM_CLASSES = 21 57 | 58 | 59 | config.TRAIN = edict() 60 | 61 | config.TRAIN.lr = 0 62 | config.TRAIN.lr_step = '' 63 | config.TRAIN.lr_factor = 0.1 64 | config.TRAIN.warmup = False 65 | config.TRAIN.warmup_lr = 0 66 | config.TRAIN.warmup_step = 0 67 | config.TRAIN.momentum = 0.9 68 | config.TRAIN.wd = 0.0005 69 | config.TRAIN.begin_epoch = 0 70 | config.TRAIN.end_epoch = 0 71 | 
config.TRAIN.model_prefix = '' 72 | config.TRAIN.rpn_loss_scale = 3.0 73 | config.TRAIN.nms_loss_scale = 1.0 74 | config.TRAIN.nms_pos_scale = 4.0 75 | 76 | config.TRAIN.ALTERNATE = edict() 77 | config.TRAIN.ALTERNATE.RPN_BATCH_IMAGES = 0 78 | config.TRAIN.FC_DROPOUT_RATIO = 0 79 | config.TRAIN.ATTENTION_DROPOUT_RATIO = 0 80 | config.TRAIN.ATTENTION_SCALE_METHOD = 0 81 | # whether resume training 82 | config.TRAIN.RESUME = False 83 | # whether flip image 84 | config.TRAIN.FLIP = True 85 | # whether shuffle image 86 | config.TRAIN.SHUFFLE = True 87 | # whether use OHEM 88 | config.TRAIN.ENABLE_OHEM = False 89 | # size of images for each device, 2 for rcnn, 1 for rpn and e2e 90 | config.TRAIN.BATCH_IMAGES = 2 91 | # e2e changes behavior of anchor loader and metric 92 | config.TRAIN.END2END = False 93 | # group images with similar aspect ratio 94 | config.TRAIN.ASPECT_GROUPING = True 95 | 96 | # R-CNN 97 | # rcnn rois batch size 98 | config.TRAIN.TOP_ROIS = -1 99 | config.TRAIN.BATCH_ROIS = 128 100 | config.TRAIN.BATCH_ROIS_OHEM = 128 101 | # rcnn rois sampling params 102 | config.TRAIN.FG_FRACTION = 0.25 103 | config.TRAIN.FG_THRESH = 0.5 104 | config.TRAIN.BG_THRESH_HI = 0.5 105 | config.TRAIN.BG_THRESH_LO = 0.0 106 | # rcnn bounding box regression params 107 | config.TRAIN.BBOX_REGRESSION_THRESH = 0.5 108 | config.TRAIN.BBOX_WEIGHTS = np.array([1.0, 1.0, 1.0, 1.0]) 109 | 110 | # RPN anchor loader 111 | # rpn anchors batch size 112 | config.TRAIN.RPN_BATCH_SIZE = 256 113 | # rpn anchors sampling params 114 | config.TRAIN.RPN_FG_FRACTION = 0.5 115 | config.TRAIN.RPN_POSITIVE_OVERLAP = 0.7 116 | config.TRAIN.RPN_NEGATIVE_OVERLAP = 0.3 117 | config.TRAIN.RPN_CLOBBER_POSITIVES = False 118 | # rpn bounding box regression params 119 | config.TRAIN.RPN_BBOX_WEIGHTS = (1.0, 1.0, 1.0, 1.0) 120 | config.TRAIN.RPN_POSITIVE_WEIGHT = -1.0 121 | 122 | # used for end2end training 123 | # RPN proposal 124 | config.TRAIN.CXX_PROPOSAL = True 125 | config.TRAIN.RPN_NMS_THRESH = 0.7 126 | config.TRAIN.RPN_PRE_NMS_TOP_N = 12000 127 | config.TRAIN.RPN_POST_NMS_TOP_N = 2000 128 | config.TRAIN.RPN_MIN_SIZE = config.network.RPN_FEAT_STRIDE 129 | # approximate bounding box regression 130 | config.TRAIN.BBOX_NORMALIZATION_PRECOMPUTED = False 131 | config.TRAIN.BBOX_MEANS = (0.0, 0.0, 0.0, 0.0) 132 | config.TRAIN.BBOX_STDS = (0.1, 0.1, 0.2, 0.2) 133 | # Learn NMS 134 | config.TRAIN.LEARN_NMS = False 135 | config.TRAIN.JOINT_TRAINING = False 136 | config.TRAIN.FIRST_N = 100 137 | 138 | config.TEST = edict() 139 | 140 | # R-CNN testing 141 | # use rpn to generate proposal 142 | config.TEST.HAS_RPN = False 143 | # size of images for each device 144 | config.TEST.BATCH_IMAGES = 1 145 | config.TEST.TOP_ROIS = 2000 146 | 147 | # RPN proposal 148 | config.TEST.CXX_PROPOSAL = True 149 | config.TEST.RPN_NMS_THRESH = 0.7 150 | config.TEST.RPN_PRE_NMS_TOP_N = 6000 151 | config.TEST.RPN_POST_NMS_TOP_N = 300 152 | config.TEST.RPN_MIN_SIZE = config.network.RPN_FEAT_STRIDE 153 | 154 | # RPN generate proposal 155 | config.TEST.PROPOSAL_NMS_THRESH = 0.7 156 | config.TEST.PROPOSAL_PRE_NMS_TOP_N = 20000 157 | config.TEST.PROPOSAL_POST_NMS_TOP_N = 2000 158 | config.TEST.PROPOSAL_MIN_SIZE = config.network.RPN_FEAT_STRIDE 159 | 160 | # whether to use softnms 161 | config.TEST.SOFTNMS = False 162 | # whether to use LEARN_NMS 163 | config.TEST.LEARN_NMS = False 164 | config.TEST.FIRST_N = 0 165 | config.TEST.MERGE_METHOD = -1 166 | # RCNN nms 167 | config.TEST.NMS = 0.3 168 | 169 | config.TEST.max_per_image = 300 170 | 171 | # Test Model 
Epoch 172 | config.TEST.test_epoch = 0 173 | # increasing this threshold will speed up the test-time learn-nms module, but may hurt performance 174 | config.TEST.LEARN_NMS_CLASS_SCORE_TH = 0.01 175 | 176 | 177 | def update_config(config_file): 178 | exp_config = None 179 | with open(config_file) as f: 180 | exp_config = edict(yaml.load(f)) 181 | for k, v in exp_config.items(): 182 | if k in config: 183 | if isinstance(v, dict): 184 | if k == 'TRAIN': 185 | if 'BBOX_WEIGHTS' in v: 186 | v['BBOX_WEIGHTS'] = np.array(v['BBOX_WEIGHTS']) 187 | elif k == 'network': 188 | if 'PIXEL_MEANS' in v: 189 | v['PIXEL_MEANS'] = np.array(v['PIXEL_MEANS']) 190 | for vk, vv in v.items(): 191 | config[k][vk] = vv 192 | else: 193 | if k == 'SCALES': 194 | config[k][0] = tuple(v) 195 | else: 196 | config[k] = v 197 | else: 198 | raise ValueError("key '{}' must exist in config.py".format(k)) 199 | -------------------------------------------------------------------------------- /relation_rcnn/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/msracver/Relation-Networks-for-Object-Detection/e83e911d828e3c86624ce0aeb8d742d5ee67d5ba/relation_rcnn/core/__init__.py -------------------------------------------------------------------------------- /relation_rcnn/core/callback.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Relation Networks for Object Detection 3 | # Copyright (c) 2017 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Modified by Dazhi Cheng, Jiayuan Gu, Yuwen Xiong 6 | # -------------------------------------------------------- 7 | # Based on: 8 | # MX-RCNN 9 | # Copyright (c) 2016 by Contributors 10 | # Licence under The Apache 2.0 License 11 | # https://github.com/ijkguo/mx-rcnn/ 12 | # -------------------------------------------------------- 13 | 14 | import time 15 | import logging 16 | import mxnet as mx 17 | 18 | 19 | class Speedometer(object): 20 | def __init__(self, batch_size, frequent=50): 21 | self.batch_size = batch_size 22 | self.frequent = frequent 23 | self.init = False 24 | self.tic = 0 25 | self.last_count = 0 26 | 27 | def __call__(self, param): 28 | """Callback to show speed.""" 29 | count = param.nbatch 30 | if self.last_count > count: 31 | self.init = False 32 | self.last_count = count 33 | 34 | if self.init: 35 | if count % self.frequent == 0: 36 | speed = self.frequent * self.batch_size / (time.time() - self.tic) 37 | s = '' 38 | if param.eval_metric is not None: 39 | name, value = param.eval_metric.get() 40 | s = "Epoch[%d] Batch [%d]\tSpeed: %.2f samples/sec\tTrain-" % (param.epoch, count, speed) 41 | for n, v in zip(name, value): 42 | s += "%s=%f,\t" % (n, v) 43 | else: 44 | s = "Iter[%d] Batch [%d]\tSpeed: %.2f samples/sec" % (param.epoch, count, speed) 45 | 46 | logging.info(s) 47 | print(s) 48 | self.tic = time.time() 49 | else: 50 | self.init = True 51 | self.tic = time.time() 52 | 53 | 54 | def do_checkpoint(prefix, means, stds): 55 | def _callback(iter_no, sym, arg, aux): 56 | arg['bbox_pred_weight_test'] = (arg['bbox_pred_weight'].T * mx.nd.array(stds)).T 57 | arg['bbox_pred_bias_test'] = arg['bbox_pred_bias'] * mx.nd.array(stds) + mx.nd.array(means) 58 | mx.model.save_checkpoint(prefix, iter_no + 1, sym, arg, aux) 59 | arg.pop('bbox_pred_weight_test') 60 | arg.pop('bbox_pred_bias_test') 61 | return _callback 62 |
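# Note: do_checkpoint saves extra 'bbox_pred_weight_test'/'bbox_pred_bias_test' entries
# with the bbox-regression normalization (stds, means) folded in, so a deployed model can
# emit un-normalized box deltas directly; the temporaries are popped afterwards so the
# training parameters stay untouched. A minimal usage sketch (the prefix is hypothetical;
# means/stds come from add_bbox_regression_targets, as in train_rcnn.py):
#   cb = do_checkpoint('output/rcnn_coco', means, stds)
#   mod.fit(train_data, epoch_end_callback=[cb], ...)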
-------------------------------------------------------------------------------- /relation_rcnn/function/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/msracver/Relation-Networks-for-Object-Detection/e83e911d828e3c86624ce0aeb8d742d5ee67d5ba/relation_rcnn/function/__init__.py -------------------------------------------------------------------------------- /relation_rcnn/function/test_rcnn.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Relation Networks for Object Detection 3 | # Copyright (c) 2017 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Modified by Jiayuan Gu, Dazhi Cheng, Yuwen Xiong 6 | # -------------------------------------------------------- 7 | # Based on: 8 | # MX-RCNN 9 | # Copyright (c) 2016 by Contributors 10 | # Licence under The Apache 2.0 License 11 | # https://github.com/ijkguo/mx-rcnn/ 12 | # -------------------------------------------------------- 13 | 14 | 15 | import argparse 16 | import pprint 17 | import logging 18 | import time 19 | import os 20 | import mxnet as mx 21 | 22 | from symbols import * 23 | from dataset import * 24 | from core.loader import TestLoader 25 | from core.tester import Predictor, pred_eval 26 | from utils.load_model import load_param 27 | 28 | 29 | def test_rcnn(cfg, dataset, image_set, root_path, dataset_path, 30 | ctx, prefix, epoch, 31 | vis, ignore_cache, shuffle, has_rpn, proposal, thresh, logger=None, output_path=None): 32 | if not logger: 33 | assert False, 'require a logger' 34 | 35 | # print cfg 36 | pprint.pprint(cfg) 37 | logger.info('testing cfg:{}\n'.format(pprint.pformat(cfg))) 38 | 39 | # load symbol and testing data 40 | if has_rpn: 41 | sym_instance = eval(cfg.symbol + '.' + cfg.symbol)() 42 | sym = sym_instance.get_symbol(cfg, is_train=False) 43 | imdb = eval(dataset)(image_set, root_path, dataset_path, result_path=output_path) 44 | roidb = imdb.gt_roidb() 45 | else: 46 | sym_instance = eval(cfg.symbol + '.' + cfg.symbol)() 47 | sym = sym_instance.get_symbol_rcnn(cfg, is_train=False) 48 | rpn_path = cfg.dataset.proposal_cache 49 | imdb = eval(dataset)(image_set, root_path, dataset_path, result_path=output_path, rpn_path=rpn_path) 50 | gt_roidb = imdb.gt_roidb() 51 | roidb = eval('imdb.' 
+ proposal + '_roidb')(gt_roidb, top_roi=cfg.TEST.TOP_ROIS) 52 | 53 | # get test data iter 54 | test_data = TestLoader(roidb, cfg, batch_size=len(ctx), shuffle=shuffle, has_rpn=has_rpn) 55 | 56 | # load model 57 | arg_params, aux_params = load_param(prefix, epoch, process=True) 58 | 59 | # infer shape 60 | data_shape_dict = dict(test_data.provide_data_single) 61 | #sym_instance.infer_shape(data_shape_dict) 62 | 63 | #sym_instance.check_parameter_shapes(arg_params, aux_params, data_shape_dict, is_train=False) 64 | 65 | # decide maximum shape 66 | data_names = [k[0] for k in test_data.provide_data_single] 67 | label_names = None 68 | #max_data_shape = [[('data', (1, 3, max([v[0] for v in cfg.SCALES]), max([v[1] for v in cfg.SCALES])))]] 69 | max_height = max([v[0] for v in cfg.SCALES]) 70 | max_width = max([v[1] for v in cfg.SCALES]) 71 | if cfg.network.IMAGE_STRIDE > 0: 72 | max_height = max_height + cfg.network.IMAGE_STRIDE - max_height%cfg.network.IMAGE_STRIDE 73 | max_width = max_width + cfg.network.IMAGE_STRIDE - max_width % cfg.network.IMAGE_STRIDE 74 | 75 | max_data_shape = [('data', (cfg.TRAIN.BATCH_IMAGES, 3, max_height, max_width))] 76 | 77 | if not has_rpn: 78 | #max_data_shape.append(('rois', (cfg.TEST.PROPOSAL_POST_NMS_TOP_N + 30, 5))) 79 | if cfg.network.ROIDispatch: 80 | max_data_shape.append(('rois_0', (1, cfg.TEST.PROPOSAL_POST_NMS_TOP_N/4, 5))) 81 | max_data_shape.append(('rois_1', (1, cfg.TEST.PROPOSAL_POST_NMS_TOP_N/4, 5))) 82 | max_data_shape.append(('rois_2', (1, cfg.TEST.PROPOSAL_POST_NMS_TOP_N/4, 5))) 83 | max_data_shape.append(('rois_3', (1, cfg.TEST.PROPOSAL_POST_NMS_TOP_N/4, 5))) 84 | 85 | max_data_shape = [max_data_shape] 86 | # create predictor 87 | #test_data.provide_label 88 | predictor = Predictor(sym, data_names, label_names, 89 | context=ctx, max_data_shapes=max_data_shape, 90 | provide_data=test_data.provide_data, provide_label=test_data.provide_label, 91 | arg_params=arg_params, aux_params=aux_params) 92 | 93 | # start detection 94 | pred_eval(predictor, test_data, imdb, cfg, vis=vis, ignore_cache=ignore_cache, thresh=thresh, logger=logger) 95 | 96 | -------------------------------------------------------------------------------- /relation_rcnn/function/test_rpn.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Relation Networks for Object Detection 3 | # Copyright (c) 2017 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Modified by Jiayuan Gu, Dazhi Cheng, Yuwen Xiong 6 | # -------------------------------------------------------- 7 | # Based on: 8 | # MX-RCNN 9 | # Copyright (c) 2016 by Contributors 10 | # Licence under The Apache 2.0 License 11 | # https://github.com/ijkguo/mx-rcnn/ 12 | # -------------------------------------------------------- 13 | 14 | import argparse 15 | import pprint 16 | import logging 17 | import mxnet as mx 18 | 19 | from symbols import * 20 | from dataset import * 21 | from core.loader import TestLoader 22 | from core.tester import Predictor, generate_proposals 23 | from utils.load_model import load_param 24 | 25 | 26 | def test_rpn(cfg, dataset, image_set, root_path, dataset_path, 27 | ctx, prefix, epoch, 28 | vis, shuffle, thresh, logger=None, output_path=None): 29 | # set up logger 30 | if not logger: 31 | logging.basicConfig() 32 | logger = logging.getLogger() 33 | logger.setLevel(logging.INFO) 34 | 35 | # rpn generate proposal cfg 36 | cfg.TEST.HAS_RPN = True 37 | 38 | # print cfg 39 | 
pprint.pprint(cfg) 40 | logger.info('testing rpn cfg:{}\n'.format(pprint.pformat(cfg))) 41 | 42 | # load symbol 43 | sym_instance = eval(cfg.symbol + '.' + cfg.symbol)() 44 | sym = sym_instance.get_symbol_rpn(cfg, is_train=False) 45 | 46 | # load dataset and prepare imdb for training 47 | imdb = eval(dataset)(image_set, root_path, dataset_path, result_path=output_path) 48 | roidb = imdb.gt_roidb() 49 | test_data = TestLoader(roidb, cfg, batch_size=len(ctx), shuffle=shuffle, has_rpn=True) 50 | 51 | # load model 52 | arg_params, aux_params = load_param(prefix, epoch) 53 | 54 | # infer shape 55 | data_shape_dict = dict(test_data.provide_data_single) 56 | sym_instance.infer_shape(data_shape_dict) 57 | 58 | # check parameters 59 | sym_instance.check_parameter_shapes(arg_params, aux_params, data_shape_dict, is_train=False) 60 | 61 | # decide maximum shape 62 | data_names = [k[0] for k in test_data.provide_data[0]] 63 | label_names = None if test_data.provide_label[0] is None else [k[0] for k in test_data.provide_label[0]] 64 | max_data_shape = [[('data', (1, 3, max([v[0] for v in cfg.SCALES]), max([v[1] for v in cfg.SCALES])))]] 65 | 66 | # create predictor 67 | predictor = Predictor(sym, data_names, label_names, 68 | context=ctx, max_data_shapes=max_data_shape, 69 | provide_data=test_data.provide_data, provide_label=test_data.provide_label, 70 | arg_params=arg_params, aux_params=aux_params) 71 | 72 | # start testing 73 | imdb_boxes = generate_proposals(predictor, test_data, imdb, cfg, vis=vis, thresh=thresh) 74 | 75 | all_log_info = imdb.evaluate_recall(roidb, candidate_boxes=imdb_boxes) 76 | logger.info(all_log_info) 77 | -------------------------------------------------------------------------------- /relation_rcnn/function/train_rcnn.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Relation Networks for Object Detection 3 | # Copyright (c) 2017 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Modified by Jiayuan Gu, Dazhi Cheng, Yuwen Xiong 6 | # -------------------------------------------------------- 7 | # Based on: 8 | # MX-RCNN 9 | # Copyright (c) 2016 by Contributors 10 | # Licence under The Apache 2.0 License 11 | # https://github.com/ijkguo/mx-rcnn/ 12 | # -------------------------------------------------------- 13 | 14 | import argparse 15 | import logging 16 | import pprint 17 | import os 18 | import mxnet as mx 19 | import numpy as np 20 | 21 | from symbols import * 22 | from core import callback, metric 23 | from core.loader import ROIIter 24 | from core.module import MutableModule 25 | from bbox.bbox_regression import add_bbox_regression_targets 26 | from utils.load_data import load_proposal_roidb, merge_roidb, filter_roidb 27 | from utils.load_model import load_param 28 | from utils.PrefetchingIter import PrefetchingIterV2 as PrefetchingIter 29 | from utils.lr_scheduler import WarmupMultiFactorScheduler 30 | 31 | 32 | def train_rcnn(cfg, dataset, image_set, root_path, dataset_path, 33 | frequent, kvstore, flip, shuffle, resume, 34 | ctx, pretrained, epoch, prefix, begin_epoch, end_epoch, 35 | train_shared, lr, lr_step, proposal, logger=None, output_path=None): 36 | mx.random.seed(0) 37 | np.random.seed(0) 38 | # set up logger 39 | if not logger: 40 | logging.basicConfig() 41 | logger = logging.getLogger() 42 | logger.setLevel(logging.INFO) 43 | 44 | # load symbol 45 | sym_instance = eval(cfg.symbol + '.' 
+ cfg.symbol)() 46 | sym = sym_instance.get_symbol_rcnn(cfg, is_train=True) 47 | 48 | # setup multi-gpu 49 | batch_size = len(ctx) 50 | input_batch_size = cfg.TRAIN.BATCH_IMAGES * batch_size 51 | 52 | # print cfg 53 | pprint.pprint(cfg) 54 | logger.info('training rcnn cfg:{}\n'.format(pprint.pformat(cfg))) 55 | 56 | rpn_path = cfg.dataset.proposal_cache 57 | # load dataset and prepare imdb for training 58 | image_sets = [iset for iset in image_set.split('+')] 59 | roidbs = [load_proposal_roidb(dataset, image_set, root_path, dataset_path, 60 | proposal=proposal, append_gt=True, flip=flip, result_path=output_path, 61 | rpn_path=rpn_path, top_roi=cfg.TRAIN.TOP_ROIS) 62 | for image_set in image_sets] 63 | roidb = merge_roidb(roidbs) 64 | roidb = filter_roidb(roidb, cfg) 65 | means, stds = add_bbox_regression_targets(roidb, cfg) 66 | 67 | # load training data 68 | train_data = ROIIter(roidb, cfg, batch_size=input_batch_size, shuffle=shuffle, 69 | ctx=ctx, aspect_grouping=cfg.TRAIN.ASPECT_GROUPING) 70 | 71 | # infer max shape 72 | max_height = max([v[0] for v in cfg.SCALES]) 73 | max_width = max([v[1] for v in cfg.SCALES]) 74 | padded_max_height = max_height + cfg.network.IMAGE_STRIDE - max_height % cfg.network.IMAGE_STRIDE 75 | padded_max_width = max_width + cfg.network.IMAGE_STRIDE - max_width % cfg.network.IMAGE_STRIDE 76 | 77 | max_data_shape = [('data', (cfg.TRAIN.BATCH_IMAGES, 3, padded_max_height, padded_max_width))] 78 | # infer shape 79 | data_shape_dict = dict(train_data.provide_data_single + train_data.provide_label_single) 80 | sym_instance.infer_shape(data_shape_dict) 81 | # print shape 82 | pprint.pprint(sym_instance.arg_shape_dict) 83 | logging.info(pprint.pformat(sym_instance.arg_shape_dict)) 84 | 85 | max_batch_roi = cfg.TRAIN.TOP_ROIS if cfg.TRAIN.BATCH_ROIS == -1 else cfg.TRAIN.BATCH_ROIS 86 | num_class = 2 if cfg.CLASS_AGNOSTIC else cfg.dataset.NUM_CLASSES 87 | max_label_shape = [('label', (cfg.TRAIN.BATCH_IMAGES, max_batch_roi)), 88 | ('bbox_target', (cfg.TRAIN.BATCH_IMAGES, max_batch_roi, num_class * 4)), 89 | ('bbox_weight', (cfg.TRAIN.BATCH_IMAGES, max_batch_roi, num_class * 4))] 90 | 91 | if cfg.network.USE_NONGT_INDEX: 92 | max_label_shape.append(('nongt_index', (2000,))) 93 | 94 | if cfg.network.ROIDispatch: 95 | max_data_shape.append(('rois_0', (cfg.TRAIN.BATCH_IMAGES, max_batch_roi / 4, 5))) 96 | max_data_shape.append(('rois_1', (cfg.TRAIN.BATCH_IMAGES, max_batch_roi / 4, 5))) 97 | max_data_shape.append(('rois_2', (cfg.TRAIN.BATCH_IMAGES, max_batch_roi / 4, 5))) 98 | max_data_shape.append(('rois_3', (cfg.TRAIN.BATCH_IMAGES, max_batch_roi / 4, 5))) 99 | else: 100 | max_data_shape.append(('rois', (cfg.TEST.PROPOSAL_POST_NMS_TOP_N + 30, 5))) 101 | 102 | #dot = mx.viz.plot_network(sym, node_attrs={'shape': 'rect', 'fixedsize': 'false'}) 103 | #dot.render(os.path.join('./output/rcnn/network_vis', cfg.symbol + cfg.TRAIN.model_prefix)) 104 | 105 | # load and initialize params 106 | if resume: 107 | print('continue training from ', begin_epoch) 108 | arg_params, aux_params = load_param(prefix, begin_epoch, convert=True) 109 | else: 110 | arg_params, aux_params = load_param(pretrained, epoch, convert=True) 111 | sym_instance.init_weight_rcnn(cfg, arg_params, aux_params) 112 | 113 | # check parameter shapes 114 | sym_instance.check_parameter_shapes(arg_params, aux_params, data_shape_dict) 115 | 116 | # prepare training 117 | # create solver 118 | data_names = [k[0] for k in train_data.provide_data_single] 119 | label_names = [k[0] for k in train_data.provide_label_single] 120 |
if train_shared: 121 | fixed_param_prefix = cfg.network.FIXED_PARAMS_SHARED 122 | else: 123 | fixed_param_prefix = cfg.network.FIXED_PARAMS 124 | 125 | if cfg.network.ROIDispatch: 126 | mod = MutableModule(sym, data_names=data_names, label_names=label_names, 127 | logger=logger, context=ctx, 128 | max_data_shapes=[max_data_shape for _ in range(batch_size)], 129 | max_label_shapes=[max_label_shape for _ in range(batch_size)], 130 | fixed_param_prefix=fixed_param_prefix) 131 | else: 132 | mod = MutableModule(sym, data_names=data_names, label_names=label_names, 133 | logger=logger, context=ctx, 134 | max_data_shapes=[max_data_shape for _ in range(batch_size)], 135 | max_label_shapes=[max_label_shape for _ in range(batch_size)], 136 | fixed_param_prefix=fixed_param_prefix) 137 | if cfg.TRAIN.RESUME: 138 | mod._preload_opt_states = '%s-%04d.states' % (prefix, begin_epoch) 139 | 140 | # decide training params 141 | # metric 142 | eval_metric = metric.RCNNAccMetric(cfg) 143 | cls_metric = metric.RCNNLogLossMetric(cfg) 144 | bbox_metric = metric.RCNNL1LossMetric(cfg) 145 | eval_metrics = mx.metric.CompositeEvalMetric() 146 | for child_metric in [eval_metric, cls_metric, bbox_metric]: 147 | eval_metrics.add(child_metric) 148 | if cfg.TRAIN.LEARN_NMS: 149 | eval_metrics.add(metric.NMSLossMetric(cfg, 'pos')) 150 | eval_metrics.add(metric.NMSLossMetric(cfg, 'neg')) 151 | eval_metrics.add(metric.NMSAccMetric(cfg)) 152 | # callback 153 | batch_end_callback = callback.Speedometer(train_data.batch_size, frequent=frequent) 154 | epoch_end_callback = [mx.callback.module_checkpoint(mod, prefix, period=1, save_optimizer_states=True), 155 | callback.do_checkpoint(prefix, means, stds)] 156 | # decide learning rate 157 | base_lr = lr 158 | lr_factor = cfg.TRAIN.lr_factor 159 | lr_epoch = [float(epoch) for epoch in lr_step.split(',')] 160 | lr_epoch_diff = [epoch - begin_epoch for epoch in lr_epoch if epoch > begin_epoch] 161 | lr = base_lr * (lr_factor ** (len(lr_epoch) - len(lr_epoch_diff))) 162 | lr_iters = [int(epoch * len(roidb) / batch_size) for epoch in lr_epoch_diff] 163 | print('lr', lr, 'lr_epoch_diff', lr_epoch_diff, 'lr_iters', lr_iters) 164 | lr_scheduler = WarmupMultiFactorScheduler(lr_iters, lr_factor, cfg.TRAIN.warmup, cfg.TRAIN.warmup_lr, 165 | cfg.TRAIN.warmup_step) 166 | # optimizer 167 | optimizer_params = {'momentum': cfg.TRAIN.momentum, 168 | 'wd': cfg.TRAIN.wd, 169 | 'learning_rate': lr, 170 | 'lr_scheduler': lr_scheduler, 171 | 'rescale_grad': 1.0, 172 | 'clip_gradient': None} 173 | 174 | # train 175 | 176 | if not isinstance(train_data, PrefetchingIter): 177 | train_data = PrefetchingIter(train_data) 178 | 179 | mod.fit(train_data, eval_metric=eval_metrics, epoch_end_callback=epoch_end_callback, 180 | batch_end_callback=batch_end_callback, kvstore=kvstore, 181 | optimizer='sgd', optimizer_params=optimizer_params, 182 | arg_params=arg_params, aux_params=aux_params, begin_epoch=begin_epoch, num_epoch=end_epoch) 183 | 184 | -------------------------------------------------------------------------------- /relation_rcnn/function/train_rpn.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Relation Networks for Object Detection 3 | # Copyright (c) 2017 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Modified by Jiayuan Gu, Dazhi Cheng, Yuwen Xiong 6 | # -------------------------------------------------------- 7 | # Based on: 8 | # MX-RCNN 9 | # Copyright (c) 2016 
by Contributors 10 | # Licence under The Apache 2.0 License 11 | # https://github.com/ijkguo/mx-rcnn/ 12 | # -------------------------------------------------------- 13 | 14 | import argparse 15 | import logging 16 | import pprint 17 | import mxnet as mx 18 | 19 | from symbols import * 20 | from core import callback, metric 21 | from core.loader import AnchorLoader 22 | from core.module import MutableModule 23 | from utils.load_data import load_gt_roidb, merge_roidb, filter_roidb 24 | from utils.load_model import load_param 25 | from utils.PrefetchingIter import PrefetchingIterV2 as PrefetchingIter 26 | from utils.lr_scheduler import WarmupMultiFactorScheduler 27 | 28 | 29 | def train_rpn(cfg, dataset, image_set, root_path, dataset_path, 30 | frequent, kvstore, flip, shuffle, resume, 31 | ctx, pretrained, epoch, prefix, begin_epoch, end_epoch, 32 | train_shared, lr, lr_step, logger=None, output_path=None): 33 | # set up logger 34 | if not logger: 35 | logging.basicConfig() 36 | logger = logging.getLogger() 37 | logger.setLevel(logging.INFO) 38 | 39 | # set up config 40 | cfg.TRAIN.BATCH_IMAGES = cfg.TRAIN.ALTERNATE.RPN_BATCH_IMAGES 41 | 42 | # load symbol 43 | sym_instance = eval(cfg.symbol + '.' + cfg.symbol)() 44 | sym = sym_instance.get_symbol_rpn(cfg, is_train=True) 45 | feat_sym = sym.get_internals()['rpn_cls_score_output'] 46 | 47 | # setup multi-gpu 48 | batch_size = len(ctx) 49 | input_batch_size = cfg.TRAIN.BATCH_IMAGES * batch_size 50 | 51 | # print cfg 52 | pprint.pprint(cfg) 53 | logger.info('training rpn cfg:{}\n'.format(pprint.pformat(cfg))) 54 | 55 | # load dataset and prepare imdb for training 56 | image_sets = [iset for iset in image_set.split('+')] 57 | roidbs = [load_gt_roidb(dataset, image_set, root_path, dataset_path, result_path=output_path, 58 | flip=flip) 59 | for image_set in image_sets] 60 | roidb = merge_roidb(roidbs) 61 | roidb = filter_roidb(roidb, cfg) 62 | 63 | # load training data 64 | train_data = AnchorLoader(feat_sym, roidb, cfg, batch_size=input_batch_size, shuffle=shuffle, 65 | ctx=ctx, feat_stride=cfg.network.RPN_FEAT_STRIDE, anchor_scales=cfg.network.ANCHOR_SCALES, 66 | anchor_ratios=cfg.network.ANCHOR_RATIOS, aspect_grouping=cfg.TRAIN.ASPECT_GROUPING) 67 | 68 | # infer max shape 69 | max_data_shape = [('data', (cfg.TRAIN.BATCH_IMAGES, 3, max([v[0] for v in cfg.SCALES]), max([v[1] for v in cfg.SCALES])))] 70 | max_data_shape, max_label_shape = train_data.infer_shape(max_data_shape) 71 | print('providing maximum shape', max_data_shape, max_label_shape) 72 | 73 | # infer shape 74 | data_shape_dict = dict(train_data.provide_data_single + train_data.provide_label_single) 75 | sym_instance.infer_shape(data_shape_dict) 76 | 77 | # load and initialize params 78 | if resume: 79 | print('continue training from ', begin_epoch) 80 | arg_params, aux_params = load_param(prefix, begin_epoch, convert=True) 81 | else: 82 | arg_params, aux_params = load_param(pretrained, epoch, convert=True) 83 | sym_instance.init_weight_rpn(cfg, arg_params, aux_params) 84 | 85 | # check parameter shapes 86 | sym_instance.check_parameter_shapes(arg_params, aux_params, data_shape_dict) 87 | 88 | # create solver 89 | data_names = [k[0] for k in train_data.provide_data_single] 90 | label_names = [k[0] for k in train_data.provide_label_single] 91 | if train_shared: 92 | fixed_param_prefix = cfg.network.FIXED_PARAMS_SHARED 93 | else: 94 | fixed_param_prefix = cfg.network.FIXED_PARAMS 95 | mod = MutableModule(sym, data_names=data_names, label_names=label_names, 96 | logger=logger, 
context=ctx, max_data_shapes=[max_data_shape for _ in xrange(batch_size)], 97 | max_label_shapes=[max_label_shape for _ in xrange(batch_size)], fixed_param_prefix=fixed_param_prefix) 98 | 99 | # decide training params 100 | # metric 101 | eval_metric = metric.RPNAccMetric() 102 | cls_metric = metric.RPNLogLossMetric() 103 | bbox_metric = metric.RPNL1LossMetric() 104 | eval_metrics = mx.metric.CompositeEvalMetric() 105 | for child_metric in [eval_metric, cls_metric, bbox_metric]: 106 | eval_metrics.add(child_metric) 107 | # callback 108 | batch_end_callback = callback.Speedometer(train_data.batch_size, frequent=frequent) 109 | # epoch_end_callback = mx.callback.do_checkpoint(prefix) 110 | epoch_end_callback = mx.callback.module_checkpoint(mod, prefix, period=1, save_optimizer_states=True) 111 | # decide learning rate 112 | base_lr = lr 113 | lr_factor = cfg.TRAIN.lr_factor 114 | lr_epoch = [int(epoch) for epoch in lr_step.split(',')] 115 | lr_epoch_diff = [epoch - begin_epoch for epoch in lr_epoch if epoch > begin_epoch] 116 | lr = base_lr * (lr_factor ** (len(lr_epoch) - len(lr_epoch_diff))) 117 | lr_iters = [int(epoch * len(roidb) / batch_size) for epoch in lr_epoch_diff] 118 | print('lr', lr, 'lr_epoch_diff', lr_epoch_diff, 'lr_iters', lr_iters) 119 | lr_scheduler = WarmupMultiFactorScheduler(lr_iters, lr_factor, cfg.TRAIN.warmup, cfg.TRAIN.warmup_lr, cfg.TRAIN.warmup_step) 120 | # optimizer 121 | optimizer_params = {'momentum': cfg.TRAIN.momentum, 122 | 'wd': cfg.TRAIN.wd, 123 | 'learning_rate': lr, 124 | 'lr_scheduler': lr_scheduler, 125 | 'rescale_grad': 1.0, 126 | 'clip_gradient': None} 127 | 128 | if not isinstance(train_data, PrefetchingIter): 129 | train_data = PrefetchingIter(train_data) 130 | 131 | # train 132 | mod.fit(train_data, eval_metric=eval_metrics, epoch_end_callback=epoch_end_callback, 133 | batch_end_callback=batch_end_callback, kvstore=kvstore, 134 | optimizer='sgd', optimizer_params=optimizer_params, 135 | arg_params=arg_params, aux_params=aux_params, begin_epoch=begin_epoch, num_epoch=end_epoch) 136 | 137 | -------------------------------------------------------------------------------- /relation_rcnn/operator_cxx/deformable_convolution.cc: -------------------------------------------------------------------------------- 1 | /*! 
2 | * Copyright (c) 2017 Microsoft 3 | * Licensed under The MIT License [see LICENSE for details] 4 | * \file deformable_convolution.cc 5 | * \brief 6 | * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai 7 | */ 8 | 9 | #include "./deformable_convolution-inl.h" 10 | 11 | namespace mxnet { 12 | namespace op { 13 | DMLC_REGISTER_PARAMETER(DeformableConvolutionParam); 14 | 15 | template<> 16 | Operator* CreateOp<cpu>(DeformableConvolutionParam param, int dtype, 17 | std::vector<TShape> *in_shape, 18 | std::vector<TShape> *out_shape, 19 | Context ctx) { 20 | Operator *op = NULL; 21 | MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { 22 | op = new DeformableConvolutionOp<cpu, DType>(param); 23 | }) 24 | return op; 25 | } 26 | 27 | // DO_BIND_DISPATCH comes from operator_common.h 28 | Operator *DeformableConvolutionProp::CreateOperatorEx(Context ctx, 29 | std::vector<TShape> *in_shape, 30 | std::vector<int> *in_type) const { 31 | std::vector<TShape> out_shape, aux_shape; 32 | std::vector<int> out_type, aux_type; 33 | CHECK(InferType(in_type, &out_type, &aux_type)); 34 | CHECK(InferShape(in_shape, &out_shape, &aux_shape)); 35 | DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0], in_shape, &out_shape, ctx); 36 | } 37 | 38 | MXNET_REGISTER_OP_PROPERTY(_contrib_DeformableConvolution, DeformableConvolutionProp) 39 | .describe(R"code(Compute *N*-D convolution on *(N+2)*-D input. 40 | 41 | In the 2-D convolution, given input data with shape *(batch_size, 42 | channel, height, width)*, the output is computed by 43 | 44 | .. math:: 45 | 46 | out[n,i,:,:] = bias[i] + \sum_{j=0}^{channel} data[n,j,:,:] \star 47 | weight[i,j,:,:] 48 | 49 | where :math:`\star` is the 2-D cross-correlation operator. 50 | 51 | For general 2-D convolution, the shapes are 52 | 53 | - **data**: *(batch_size, channel, height, width)* 54 | - **weight**: *(num_filter, channel, kernel[0], kernel[1])* 55 | - **bias**: *(num_filter,)* 56 | - **out**: *(batch_size, num_filter, out_height, out_width)*. 57 | 58 | Define:: 59 | 60 | f(x,k,p,s,d) = floor((x+2*p-d*(k-1)-1)/s)+1 61 | 62 | then we have:: 63 | 64 | out_height=f(height, kernel[0], pad[0], stride[0], dilate[0]) 65 | out_width=f(width, kernel[1], pad[1], stride[1], dilate[1]) 66 | 67 | If ``no_bias`` is set to be true, then the ``bias`` term is ignored. 68 | 69 | The default data ``layout`` is *NCHW*, namely *(batch_size, channel, height, 70 | width)*. We can choose other layouts such as *NHWC*. 71 | 72 | If ``num_group`` is larger than 1, denoted by *g*, then split the input ``data`` 73 | evenly into *g* parts along the channel axis, and also evenly split ``weight`` 74 | along the first dimension. Next compute the convolution on the *i*-th part of 75 | the data with the *i*-th weight part. The output is obtained by concatenating all 76 | the *g* results. 77 | 78 | Both ``weight`` and ``bias`` are learnable parameters. 79 | 80 | 81 | )code" ADD_FILELINE) 82 | .add_argument("data", "NDArray-or-Symbol", "Input data to the DeformableConvolutionOp.") 83 | .add_argument("offset", "NDArray-or-Symbol", "Input offset to the DeformableConvolutionOp.") 84 | .add_argument("weight", "NDArray-or-Symbol", "Weight matrix.") 85 | .add_argument("bias", "NDArray-or-Symbol", "Bias parameter.") 86 | .add_arguments(DeformableConvolutionParam::__FIELDS__()); 87 | 88 | } // namespace op 89 | } // namespace mxnet 90 | -------------------------------------------------------------------------------- /relation_rcnn/operator_cxx/deformable_convolution.cu: -------------------------------------------------------------------------------- 1 | /*!
2 | * Copyright (c) 2017 Microsoft 3 | * Licensed under The MIT License [see LICENSE for details] 4 | * \file deformable_convolution.cu 5 | * \brief 6 | * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai 7 | */ 8 | 9 | #include "./deformable_convolution-inl.h" 10 | #include <vector> 11 | 12 | namespace mxnet { 13 | namespace op { 14 | 15 | template<> 16 | Operator* CreateOp<gpu>(DeformableConvolutionParam param, int dtype, 17 | std::vector<TShape> *in_shape, 18 | std::vector<TShape> *out_shape, 19 | Context ctx) { 20 | Operator *op = NULL; 21 | MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { 22 | op = new DeformableConvolutionOp<gpu, DType>(param); 23 | }) 24 | return op; 25 | } 26 | 27 | } // namespace op 28 | } // namespace mxnet 29 | 30 | -------------------------------------------------------------------------------- /relation_rcnn/operator_cxx/deformable_psroi_pooling.cc: -------------------------------------------------------------------------------- 1 | /*! 2 | * Copyright (c) 2017 Microsoft 3 | * Licensed under The MIT License [see LICENSE for details] 4 | * \file deformable_psroi_pooling.cc 5 | * \brief 6 | * \author Yi Li, Guodong Zhang, Jifeng Dai 7 | */ 8 | #include "./deformable_psroi_pooling-inl.h" 9 | #include <mshadow/base.h> 10 | #include <mshadow/tensor.h> 11 | #include <mshadow/packet-inl.h> 12 | #include <mshadow/dot_engine-inl.h> 13 | #include <cassert> 14 | 15 | using std::max; 16 | using std::min; 17 | using std::floor; 18 | using std::ceil; 19 | 20 | namespace mshadow { 21 | template<typename DType> 22 | inline void DeformablePSROIPoolForward(const Tensor<cpu, 4, DType> &out, 23 | const Tensor<cpu, 4, DType> &data, 24 | const Tensor<cpu, 2, DType> &bbox, 25 | const Tensor<cpu, 4, DType> &trans, 26 | const Tensor<cpu, 4, DType> &top_count, 27 | const bool no_trans, 28 | const float spatial_scale, 29 | const int output_dim, 30 | const int group_size, 31 | const int pooled_size, 32 | const int part_size, 33 | const int sample_per_part, 34 | const float trans_std) { 35 | // NOT_IMPLEMENTED; 36 | return; 37 | } 38 | 39 | template<typename DType> 40 | inline void DeformablePSROIPoolBackwardAcc(const Tensor<cpu, 4, DType> &in_grad, 41 | const Tensor<cpu, 4, DType> &trans_grad, 42 | const Tensor<cpu, 4, DType> &out_grad, 43 | const Tensor<cpu, 4, DType> &data, 44 | const Tensor<cpu, 2, DType> &bbox, 45 | const Tensor<cpu, 4, DType> &trans, 46 | const Tensor<cpu, 4, DType> &top_count, 47 | const bool no_trans, 48 | const float spatial_scale, 49 | const int output_dim, 50 | const int group_size, 51 | const int pooled_size, 52 | const int part_size, 53 | const int sample_per_part, 54 | const float trans_std) { 55 | // NOT_IMPLEMENTED; 56 | return; 57 | } 58 | } // namespace mshadow 59 | 60 | namespace mxnet { 61 | namespace op { 62 | 63 | template<> 64 | Operator *CreateOp<cpu>(DeformablePSROIPoolingParam param, int dtype) { 65 | Operator* op = NULL; 66 | MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { 67 | op = new DeformablePSROIPoolingOp<cpu, DType>(param); 68 | }); 69 | return op; 70 | } 71 | 72 | Operator *DeformablePSROIPoolingProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape, 73 | std::vector<int> *in_type) const { 74 | std::vector<TShape> out_shape, aux_shape; 75 | std::vector<int> out_type, aux_type; 76 | CHECK(InferType(in_type, &out_type, &aux_type)); 77 | CHECK(InferShape(in_shape, &out_shape, &aux_shape)); 78 | DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0)); 79 | } 80 | 81 | DMLC_REGISTER_PARAMETER(DeformablePSROIPoolingParam); 82 | 83 | MXNET_REGISTER_OP_PROPERTY(_contrib_DeformablePSROIPooling, DeformablePSROIPoolingProp) 84 | .describe("Performs region-of-interest pooling on inputs. Resize bounding box coordinates by " 85 | "spatial_scale and crop input feature maps accordingly. The cropped feature maps are pooled " 86 | "by max pooling to a fixed size output indicated by pooled_size.
87 | "the number of region bounding boxes after DeformablePSROIPooling")
88 | .add_argument("data", "Symbol", "Input data to the pooling operator, a 4D feature map")
89 | .add_argument("rois", "Symbol", "Bounding box coordinates, a 2D array of "
90 | "[[batch_index, x1, y1, x2, y2]]. (x1, y1) and (x2, y2) are the top-left and bottom-right corners "
91 | "of the designated region of interest. batch_index indicates the index of the corresponding image "
92 | "in the input data")
93 | .add_argument("trans", "Symbol", "transformation parameter")
94 | .add_arguments(DeformablePSROIPoolingParam::__FIELDS__());
95 | } // namespace op
96 | } // namespace mxnet
97 |
--------------------------------------------------------------------------------
/relation_rcnn/operator_cxx/nn/deformable_im2col.h:
--------------------------------------------------------------------------------
1 | /*!
2 | ******************* BEGIN Caffe Copyright Notice and Disclaimer ****************
3 | *
4 | * COPYRIGHT
5 | *
6 | * All contributions by the University of California:
7 | * Copyright (c) 2014-2017 The Regents of the University of California (Regents)
8 | * All rights reserved.
9 | *
10 | * All other contributions:
11 | * Copyright (c) 2014-2017, the respective contributors
12 | * All rights reserved.
13 | *
14 | * Caffe uses a shared copyright model: each contributor holds copyright over
15 | * their contributions to Caffe. The project versioning records all such
16 | * contribution and copyright details. If a contributor wants to further mark
17 | * their specific copyright on a particular contribution, they should indicate
18 | * their copyright solely in the commit message of the change when it is
19 | * committed.
20 | *
21 | * LICENSE
22 | *
23 | * Redistribution and use in source and binary forms, with or without
24 | * modification, are permitted provided that the following conditions are met:
25 | *
26 | * 1. Redistributions of source code must retain the above copyright notice, this
27 | * list of conditions and the following disclaimer.
28 | * 2. Redistributions in binary form must reproduce the above copyright notice,
29 | * this list of conditions and the following disclaimer in the documentation
30 | * and/or other materials provided with the distribution.
31 | *
32 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
33 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
34 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
35 | * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
36 | * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
37 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
38 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
39 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
40 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
41 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
42 | *
43 | * CONTRIBUTION AGREEMENT
44 | *
45 | * By contributing to the BVLC/caffe repository through pull-request, comment,
46 | * or otherwise, the contributor releases their content to the
47 | * license and copyright terms herein.
48 | *
49 | ***************** END Caffe Copyright Notice and Disclaimer ********************
50 | *
51 | * Copyright (c) 2017 Microsoft
52 | * Licensed under The MIT License [see LICENSE for details]
53 | * \file deformable_im2col.h
54 | * \brief Function definitions of converting an image to
55 | * column matrix based on kernel, padding, dilation, and offset.
56 | * These functions are mainly used in deformable convolution operators.
57 | * \ref: https://arxiv.org/abs/1703.06211
58 | * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai
59 | */
60 |
61 | #ifndef MXNET_OPERATOR_CONTRIB_NN_DEFORMABLE_IM2COL_H_
62 | #define MXNET_OPERATOR_CONTRIB_NN_DEFORMABLE_IM2COL_H_
63 |
64 | #include <mxnet/base.h>
65 | #include <mxnet/operator.h>
66 | #include <cstring>
67 | #include <vector>
68 | #include "../../mxnet_op.h"
69 |
70 | namespace mxnet {
71 | namespace op {
72 |
73 | /*!\brief
74 |  * cpu function of deformable_im2col algorithm
75 |  * \param s device stream
76 |  * \param data_im pointer of an image (C, H, W, ...) in the image batch
77 |  * \param data_offset pointer of offset (C, H, W, ...) in the offset batch
78 |  * \param im_shape input image shape in dimensions (N, C, H, W,)
79 |  * \param col_shape column buffer shape (#channels, output_im_height, output_im_width, ...)
80 |  * \param kernel_shape kernel filter shape
81 |  * \param pad pad shape
82 |  * \param stride stride shape
83 |  * \param dilation dilation shape
84 |  * \param deformable_group #offset group that deformable convolution use
85 |  * \param data_col column buffer pointer
86 |  */
87 | template <typename DType>
88 | inline void deformable_im2col(mshadow::Stream<cpu>* s,
89 |   const DType* data_im, const DType* data_offset,
90 |   const TShape& im_shape, const TShape& col_shape, const TShape& kernel_shape,
91 |   const TShape& pad, const TShape& stride, const TShape& dilation,
92 |   const uint32_t deformable_group, DType* data_col) {
93 |   if (2 == kernel_shape.ndim()) {
94 |     LOG(FATAL) << "not implemented";
95 |   } else {
96 |     LOG(FATAL) << "not implemented";
97 |   }
98 | }
99 |
100 |
101 | /*!\brief
102 |  * cpu function of deformable_col2im algorithm
103 |  * \param s device stream
104 |  * \param data_col start pointer of the column buffer to be filled
105 |  * \param data_offset pointer of offset (C, H, W, ...) in the offset batch
106 |  * \param im_shape input image shape in dimensions (N, C, H, W,)
107 |  * \param col_shape column buffer shape
108 |  * \param kernel_shape kernel filter shape
109 |  * \param pad pad shape
110 |  * \param stride stride shape
111 |  * \param dilation dilation shape
112 |  * \param deformable_group #offset group that deformable convolution use
113 |  * \param grad_im pointer of an image (C, H, W, ...) in the image batch
114 |  */
115 | template <typename DType>
116 | inline void deformable_col2im(mshadow::Stream<cpu>* s,
117 |   const DType* data_col, const DType* data_offset,
118 |   const TShape& im_shape, const TShape& col_shape, const TShape& kernel_shape,
119 |   const TShape& pad, const TShape& stride,
120 |   const TShape& dilation, const uint32_t deformable_group,
121 |   DType* grad_im, OpReqType req) {
122 |   index_t num_spatial_axes = kernel_shape.ndim();
123 |   LOG(FATAL) << "not implemented";
124 | }
125 |
126 |
127 | /*!\brief
128 |  * cpu function of deformable_col2im_coord algorithm
129 |  * \param s device stream
130 |  * \param data_col start pointer of the column buffer to be filled
131 |  * \param data_im pointer of an image (C, H, W, ...) in the image batch
132 |  * \param data_offset pointer of offset (C, H, W, ...) in the offset batch
133 |  * \param im_shape input image shape in dimensions (N, C, H, W,)
134 |  * \param col_shape column buffer shape
135 |  * \param kernel_shape kernel filter shape
136 |  * \param pad pad shape
137 |  * \param stride stride shape
138 |  * \param dilation dilation shape
139 |  * \param deformable_group #offset group that deformable convolution use
140 |  * \param grad_offset pointer of the offset (C, H, W, ...) in the offset batch
141 |  */
142 |
143 | template <typename DType>
144 | inline void deformable_col2im_coord(mshadow::Stream<cpu>* s,
145 |   const DType* data_col, const DType* data_im, const DType* data_offset, const TShape& im_shape,
146 |   const TShape& col_shape, const TShape& kernel_shape,
147 |   const TShape& pad, const TShape& stride,
148 |   const TShape& dilation, const uint32_t deformable_group, DType* grad_offset, OpReqType req) {
149 |   LOG(FATAL) << "not implemented";
150 | }
151 |
152 | } // namespace op
153 | } // namespace mxnet
154 | #ifdef __CUDACC__
155 | #include "./deformable_im2col.cuh"
156 | #endif
157 | #endif // MXNET_OPERATOR_CONTRIB_NN_DEFORMABLE_IM2COL_H_
158 |
--------------------------------------------------------------------------------
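[Editor's note] The CPU functions above are stubs; the real kernels live in deformable_im2col.cuh. For orientation, the column buffer they describe follows the standard im2col layout: each output location becomes one column of C*k_h*k_w values (sampled at bilinearly interpolated offset positions in the deformable case). A minimal offset-free im2col sketch in numpy, just to pin down that layout (illustrative, not repository code):

import numpy as np

def im2col(img, k, stride=1):
    # img: (C, H, W) -> col: (C*k*k, H_out*W_out), one column per output pixel
    C, H, W = img.shape
    H_out = (H - k) // stride + 1
    W_out = (W - k) // stride + 1
    col = np.empty((C * k * k, H_out * W_out), dtype=img.dtype)
    for y in range(H_out):
        for x in range(W_out):
            patch = img[:, y * stride:y * stride + k, x * stride:x * stride + k]
            col[:, y * W_out + x] = patch.ravel()
    return col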
/relation_rcnn/operator_cxx/psroi_pooling.cc:
--------------------------------------------------------------------------------
1 | /*!
2 | * Copyright (c) 2017 by Contributors
3 | * Copyright (c) 2017 Microsoft
4 | * Licensed under The MIT License [see LICENSE for details]
5 | * \file psroi_pooling.cc
6 | * \brief psroi pooling operator
7 | * \author Yi Li, Tairui Chen, Guodong Zhang, Jifeng Dai
8 | */
9 | #include "./psroi_pooling-inl.h"
10 | #include <mshadow/base.h>
11 | #include <mshadow/tensor.h>
12 | #include <mshadow/packet-inl.h>
13 | #include <mshadow/dot_engine-inl.h>
14 | #include <cassert>
15 |
16 | using std::max;
17 | using std::min;
18 | using std::floor;
19 | using std::ceil;
20 |
21 | namespace mshadow {
22 | template<typename DType>
23 | inline void PSROIPoolForward(const Tensor<cpu, 4, DType> &out,
24 |                              const Tensor<cpu, 4, DType> &data,
25 |                              const Tensor<cpu, 2, DType> &bbox,
26 |                              const Tensor<cpu, 4, DType> &mapping_channel,
27 |                              const float spatial_scale_,
28 |                              const int output_dim_,
29 |                              const int group_size_) {
30 |   // NOT_IMPLEMENTED;
31 |   return;
32 | }
33 |
34 | template<typename DType>
35 | inline void PSROIPoolBackwardAcc(const Tensor<cpu, 4, DType> &in_grad,
36 |                                  const Tensor<cpu, 4, DType> &out_grad,
37 |                                  const Tensor<cpu, 2, DType> &bbox,
38 |                                  const Tensor<cpu, 4, DType> &mapping_channel,
39 |                                  const float spatial_scale_,
40 |                                  const int output_dim_) {
41 |   // NOT_IMPLEMENTED;
42 |   return;
43 | }
44 | } // namespace mshadow
45 |
46 | namespace mxnet {
47 | namespace op {
48 |
49 | template<>
50 | Operator *CreateOp<cpu>(PSROIPoolingParam param, int dtype) {
51 |   Operator* op = NULL;
52 |   MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
53 |     op = new PSROIPoolingOp<cpu, DType>(param);
54 |   });
55 |   return op;
56 | }
57 |
58 | Operator *PSROIPoolingProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
59 |                                              std::vector<int> *in_type) const {
60 |   std::vector<TShape> out_shape, aux_shape;
61 |   std::vector<int> out_type, aux_type;
62 |   CHECK(InferType(in_type, &out_type, &aux_type));
63 |   CHECK(InferShape(in_shape, &out_shape, &aux_shape));
64 |   DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0));
65 | }
66 |
67 | DMLC_REGISTER_PARAMETER(PSROIPoolingParam);
68 |
69 | MXNET_REGISTER_OP_PROPERTY(_contrib_PSROIPooling, PSROIPoolingProp)
70 | .describe("Performs region-of-interest pooling on inputs. Resize bounding box coordinates by "
71 | "spatial_scale and crop input feature maps accordingly. The cropped feature maps are pooled "
72 | "by max pooling to a fixed size output indicated by pooled_size. batch_size will change to "
73 | "the number of region bounding boxes after PSROIPooling")
74 | .add_argument("data", "Symbol", "Input data to the pooling operator, a 4D feature map")
75 | .add_argument("rois", "Symbol", "Bounding box coordinates, a 2D array of "
76 | "[[batch_index, x1, y1, x2, y2]]. (x1, y1) and (x2, y2) are the top-left and bottom-right corners "
77 | "of the designated region of interest. batch_index indicates the index of the corresponding image "
78 | "in the input data")
79 | .add_arguments(PSROIPoolingParam::__FIELDS__());
80 | } // namespace op
81 | } // namespace mxnet
--------------------------------------------------------------------------------
/relation_rcnn/operator_py/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/msracver/Relation-Networks-for-Object-Detection/e83e911d828e3c86624ce0aeb8d742d5ee67d5ba/relation_rcnn/operator_py/__init__.py
--------------------------------------------------------------------------------
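[Editor's note] The next file implements online hard example mining (OHEM) for the detection head: every ROI is scored by the sum of its classification and bbox-regression losses, and only the roi_per_img hardest ROIs keep their training targets. The selection rule in isolation, as a numpy sketch (function and variable names are illustrative, not from the repository):

import numpy as np

def ohem_select(per_roi_loss, labels, bbox_weights, roi_per_img):
    # keep the roi_per_img highest-loss rois; silence the rest
    drop = np.argsort(per_roi_loss)[::-1][roi_per_img:]
    labels = labels.copy()
    bbox_weights = bbox_weights.copy()
    labels[drop] = -1       # label -1 is ignored by the softmax loss
    bbox_weights[drop] = 0  # zero weight disables the smooth-L1 term
    return labels, bbox_weights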
/relation_rcnn/operator_py/box_annotator_ohem.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Relation Networks for Object Detection
3 | # Copyright (c) 2017 Microsoft
4 | # Licensed under The MIT License [see LICENSE for details]
5 | # Written by Yuwen Xiong
6 | # --------------------------------------------------------
7 |
8 | """
9 | Box Annotator OHEM Operator selects the hardest rois online and assigns ohem labels and bbox weights to them.
10 | """
11 |
12 | import mxnet as mx
13 | import numpy as np
14 | from distutils.util import strtobool
15 |
16 |
17 |
18 |
19 | class BoxAnnotatorOHEMOperator(mx.operator.CustomOp):
20 |     def __init__(self, num_classes, num_reg_classes, roi_per_img):
21 |         super(BoxAnnotatorOHEMOperator, self).__init__()
22 |         self._num_classes = num_classes
23 |         self._num_reg_classes = num_reg_classes
24 |         self._roi_per_img = roi_per_img
25 |
26 |     def forward(self, is_train, req, in_data, out_data, aux):
27 |
28 |         cls_score = in_data[0]
29 |         bbox_pred = in_data[1]
30 |         labels = in_data[2].asnumpy()
31 |         bbox_targets = in_data[3]
32 |         bbox_weights = in_data[4]
33 |
34 |         per_roi_loss_cls = mx.nd.SoftmaxActivation(cls_score) + 1e-14
35 |         per_roi_loss_cls = per_roi_loss_cls.asnumpy()
36 |         per_roi_loss_cls = per_roi_loss_cls[np.arange(per_roi_loss_cls.shape[0], dtype='int'), labels.astype('int')]
37 |         per_roi_loss_cls = -1 * np.log(per_roi_loss_cls)
38 |         per_roi_loss_cls = np.reshape(per_roi_loss_cls, newshape=(-1,))
39 |
40 |         per_roi_loss_bbox = bbox_weights * mx.nd.smooth_l1((bbox_pred - bbox_targets), scalar=1.0)
41 |         per_roi_loss_bbox = mx.nd.sum(per_roi_loss_bbox, axis=1).asnumpy()
42 |
43 |         top_k_per_roi_loss = np.argsort(per_roi_loss_cls + per_roi_loss_bbox)
44 |         labels_ohem = labels
45 |         labels_ohem[top_k_per_roi_loss[::-1][self._roi_per_img:]] = -1
46 |         bbox_weights_ohem = bbox_weights.asnumpy()
47 |         bbox_weights_ohem[top_k_per_roi_loss[::-1][self._roi_per_img:]] = 0
48 |
49 |         labels_ohem = mx.nd.array(labels_ohem)
50 |         bbox_weights_ohem = mx.nd.array(bbox_weights_ohem)
51 |
52 |         for ind, val in enumerate([labels_ohem, bbox_weights_ohem]):
53 |             self.assign(out_data[ind], req[ind], val)
54 |
55 |
56 |     def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
57 |         for i in range(len(in_grad)):
58 |             self.assign(in_grad[i], req[i], 0)
59 |
60 |
61 | @mx.operator.register('BoxAnnotatorOHEM')
62 | class BoxAnnotatorOHEMProp(mx.operator.CustomOpProp):
63 |     def __init__(self, num_classes, num_reg_classes, roi_per_img):
64 |         super(BoxAnnotatorOHEMProp, self).__init__(need_top_grad=False)
65 |         self._num_classes = int(num_classes)
66 |         self._num_reg_classes = int(num_reg_classes)
67 |         self._roi_per_img = int(roi_per_img)
68 |
69 |     def list_arguments(self):
70 |         return ['cls_score', 'bbox_pred', 'labels', 'bbox_targets', 'bbox_weights']
71 |
72 |     def list_outputs(self):
73 |         return ['labels_ohem', 'bbox_weights_ohem']
74 |
75 |     def infer_shape(self, in_shape):
76 |         labels_shape = in_shape[2]
77 |         bbox_weights_shape = in_shape[4]
78 |
79 |         return in_shape, \
80 |                [labels_shape, bbox_weights_shape]
81 |
82 |     def create_operator(self, ctx, shapes, dtypes):
83 |         return BoxAnnotatorOHEMOperator(self._num_classes, self._num_reg_classes, self._roi_per_img)
84 |
85 |     def declare_backward_dependency(self, out_grad, in_data, out_data):
86 |         return []
87 |
--------------------------------------------------------------------------------
/relation_rcnn/operator_py/monitor_op.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Relation Networks for Object Detection
3 | # Copyright (c) 2017 Microsoft
4 | # Licensed under The MIT License [see LICENSE for details]
5 | # Written by Jiayuan Gu, Dazhi Cheng
6 | # --------------------------------------------------------
7 | import mxnet as mx
8 | import numpy as np
9 | from distutils.util import strtobool
10 | from easydict import EasyDict as edict
11 | import cPickle
12 |
13 | DEBUG = False
14 |
15 |
16 | class MonitorOperator(mx.operator.CustomOp):
17 |     def __init__(self, nickname):
18 |         super(MonitorOperator, self).__init__()
19 |         self.nickname = nickname
20 |
21 |     def forward(self, is_train, req, in_data, out_data, aux):
22 |         self.assign(out_data[0], req[0], in_data[0])
23 |
24 |     def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
25 |         self.assign(in_grad[0], req[0], out_grad[0])
26 |
27 |
28 | @mx.operator.register('monitor')
29 | class MonitorProp(mx.operator.CustomOpProp):
30 |     def __init__(self, nickname):
31 |         super(MonitorProp, self).__init__(need_top_grad=False)
32 |         self.nickname = nickname
33 |
34 |     def list_arguments(self):
35 |         return ['input']
36 |
37 |     def list_outputs(self):
38 |         return ['output']
39 |
40 |     def infer_shape(self, in_shape):
41 |         output_shape = in_shape[0]
42 |         return [output_shape], [output_shape]
43 |
44 |     def create_operator(self, ctx, shapes, dtypes):
45 |         return MonitorOperator(self.nickname)
46 |
47 |     def declare_backward_dependency(self, out_grad, in_data, out_data):
48 |         return [out_grad[0]]
49 |
50 |
51 | def monitor_wrapper(sym_instance, name):
52 |     return mx.sym.Custom(input=sym_instance,
53 |                          op_type='monitor',
54 |                          nickname=name)
--------------------------------------------------------------------------------
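[Editor's note] monitor_wrapper above is an identity pass-through that tags a symbol so its intermediate values can be inspected. A sketch of how it might be dropped into a symbol definition; the fc_feat symbol and the import path are assumptions for illustration:

import mxnet as mx
from operator_py.monitor_op import monitor_wrapper

data = mx.sym.Variable('data')
fc_feat = mx.sym.FullyConnected(data=data, num_hidden=1024, name='fc_feat')
fc_feat = monitor_wrapper(fc_feat, 'fc_feat')  # identity in forward and backward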
/relation_rcnn/operator_py/nms_multi_target.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Relation Networks for Object Detection
3 | # Copyright (c) 2017 Microsoft
4 | # Licensed under The MIT License [see LICENSE for details]
5 | # Written by Jiayuan Gu, Dazhi Cheng
6 | # --------------------------------------------------------
7 | """
8 | Nms Multi-thresh Target Operator matches detection boxes against ground truth
9 | and assigns a positive NMS target at each overlap threshold.
10 | """
11 |
12 | import mxnet as mx
13 | import numpy as np
14 |
15 | from bbox.bbox_transform import bbox_overlaps
16 |
17 |
18 | class NmsMultiTargetOp(mx.operator.CustomOp):
19 |     def __init__(self, target_thresh):
20 |         super(NmsMultiTargetOp, self).__init__()
21 |         self._target_thresh = target_thresh
22 |         self._num_thresh = len(target_thresh)
23 |
24 |     def forward(self, is_train, req, in_data, out_data, aux):
25 |         # bbox, [first_n, num_fg_classes, 4]
26 |         bbox = in_data[0].asnumpy()
27 |         num_boxes = bbox.shape[0]
28 |         num_fg_classes = bbox.shape[1]
29 |         gt_box = in_data[1].asnumpy()
30 |         # score, [first_n, num_fg_classes]
31 |         score = in_data[2].asnumpy()
32 |
33 |         batch_image, num_gt, code_size = gt_box.shape
34 |         assert batch_image == 1, 'only support batch_image=1, but receive %d' % batch_image
35 |         assert code_size == 5, 'code_size of gt should be 5, but receive %d' % code_size
36 |         assert len(score.shape) == 2, 'shape of score is %d instead of 2.' % len(score.shape)
37 |         assert score.shape[1] == num_fg_classes, 'number of fg classes should be same for boxes and scores'
38 |
39 |         output_list = []
40 |         for cls_idx in range(0, num_fg_classes):
41 |             valid_gt_mask = (gt_box[0, :, -1].astype(np.int32) == (cls_idx + 1))
42 |             valid_gt_box = gt_box[0, valid_gt_mask, :]
43 |             num_valid_gt = len(valid_gt_box)
44 |
45 |             if num_valid_gt == 0:
46 |                 output = np.zeros(shape=(num_boxes, self._num_thresh), dtype=np.float32)
47 |                 output_list.append(output)
48 |             else:
49 |                 bbox_per_class = bbox[:, cls_idx, :]
50 |                 score_per_class = score[:, cls_idx:cls_idx + 1]
51 |                 overlap_mat = bbox_overlaps(bbox_per_class.astype(np.float),
52 |                                             valid_gt_box[:, :-1].astype(np.float))
53 |
54 |                 eye_matrix = np.eye(num_valid_gt)
55 |                 output_list_per_class = []
56 |
57 |                 for thresh in self._target_thresh:
58 |                     # following mAP metric
59 |                     overlap_mask = (overlap_mat > thresh)
60 |                     valid_bbox_indices = np.where(overlap_mask)[0]
61 |                     # require score to be 2-dim
62 |                     overlap_score = np.tile(score_per_class, (1, num_valid_gt))
63 |                     overlap_score *= overlap_mask
64 |                     max_overlap_indices = np.argmax(overlap_mat, axis=1)
65 |                     max_overlap_mask = eye_matrix[max_overlap_indices]
66 |                     overlap_score *= max_overlap_mask
67 |                     max_score_indices = np.argmax(overlap_score, axis=0)
68 |                     output = np.zeros((num_boxes,))
69 |                     output[np.intersect1d(max_score_indices, valid_bbox_indices)] = 1
70 |                     output_list_per_class.append(output)
71 |                 output_per_class = np.stack(output_list_per_class, axis=-1)
72 |                 output_list.append(output_per_class)
73 |         blob = np.stack(output_list, axis=1).astype(np.float32, copy=False)
74 |         self.assign(out_data[0], req[0], blob)
75 |
76 |     def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
77 |         self.assign(in_grad[0], req[0], 0)
78 |         self.assign(in_grad[1], req[1], 0)
79 |         self.assign(in_grad[2], req[2], 0)
80 |
81 |
82 | @mx.operator.register("nms_multi_target")
83 | class NmsMultiTargetProp(mx.operator.CustomOpProp):
84 |     def __init__(self, target_thresh):
85 |         super(NmsMultiTargetProp, self).__init__(need_top_grad=False)
86 |         self._target_thresh = np.fromstring(target_thresh[1:-1], dtype=float, sep=' ')
87 |         self._num_thresh = len(self._target_thresh)
88 |
89 |     def list_arguments(self):
90 |         return ['bbox', 'gt_bbox', 'score']
91 |
92 |     def list_outputs(self):
93 |         return ['nms_multi_target']
94 |
95 |     def infer_shape(self, in_shape):
96 |         bbox_shape = in_shape[0]
97 |         # gt_box_shape = in_shape[1]
98 |         score_shape = in_shape[2]
99 |
100 |         assert bbox_shape[0] == score_shape[0], 'ROI number should be same for bbox and score'
101 |
102 |         num_boxes = bbox_shape[0]
103 |         num_fg_classes = bbox_shape[1]
104 |         output_shape = (num_boxes, num_fg_classes, self._num_thresh)
105 |
106 |         return in_shape, [output_shape]
107 |
108 |     def create_operator(self, ctx, shapes, dtypes):
109 |         return NmsMultiTargetOp(self._target_thresh)
110 |
111 |     def declare_backward_dependency(self, out_grad, in_data, out_data):
112 |         return []
113 |
--------------------------------------------------------------------------------
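[Editor's note] The matching rule in NmsMultiTargetOp.forward is easiest to see on toy numbers: at a given threshold, a box becomes a positive target only if its IoU with some ground-truth box exceeds the threshold, that ground truth is the box's argmax match, and the box has the highest score among such candidates. A toy walk-through (editor's sketch, not repository code):

import numpy as np

overlap_mat = np.array([[0.9, 0.1],
                        [0.7, 0.2],
                        [0.0, 0.6]])   # IoU of 3 boxes against 2 ground truths
score = np.array([0.8, 0.9, 0.5])     # per-box scores for one class
thresh = 0.5

mask = overlap_mat > thresh                          # candidate (box, gt) pairs
best_gt = np.eye(2)[np.argmax(overlap_mat, axis=1)]  # each box's argmax gt
cand = score[:, None] * mask * best_gt               # scores of valid pairs
targets = np.zeros(3)
targets[np.argmax(cand, axis=0)] = 1                 # box 1 wins gt0, box 2 wins gt1
# (the real operator also guards against gts with no candidate above the threshold)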
/relation_rcnn/operator_py/proposal_target.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Relation Networks for Object Detection
3 | # Copyright (c) 2017 Microsoft
4 | # Licensed under The MIT License [see LICENSE for details]
5 | # Modified by Yuwen Xiong
6 | # --------------------------------------------------------
7 | # Based on:
8 | # MX-RCNN
9 | # Copyright (c) 2016 by Contributors
10 | # Licence under The Apache 2.0 License
11 | # https://github.com/ijkguo/mx-rcnn/
12 | # --------------------------------------------------------
13 |
14 | """
15 | Proposal Target Operator selects foreground and background roi and assigns label, bbox_transform to them.
16 | """
17 |
18 | import mxnet as mx
19 | import numpy as np
20 | from distutils.util import strtobool
21 | from easydict import EasyDict as edict
22 | import cPickle
23 |
24 |
25 | from core.rcnn import sample_rois, sample_rois_v2
26 |
27 | DEBUG = False
28 |
29 |
30 | class ProposalTargetOperator(mx.operator.CustomOp):
31 |     def __init__(self, num_classes, batch_images, batch_rois, cfg, fg_fraction):
32 |         super(ProposalTargetOperator, self).__init__()
33 |         self._num_classes = num_classes
34 |         self._batch_images = batch_images
35 |         self._batch_rois = batch_rois
36 |         self._cfg = cfg
37 |         self._fg_fraction = fg_fraction
38 |
39 |         if DEBUG:
40 |             self._count = 0
41 |             self._fg_num = 0
42 |             self._bg_num = 0
43 |
44 |     def forward(self, is_train, req, in_data, out_data, aux):
45 |         assert self._batch_rois == -1 or self._batch_rois % self._batch_images == 0, \
46 |             'batch_images {} must divide batch_rois {}'.format(self._batch_images, self._batch_rois)
47 |         all_rois = in_data[0].asnumpy()
48 |         gt_boxes = in_data[1].asnumpy()
49 |
50 |         if self._batch_rois == -1:
51 |             rois_per_image = all_rois.shape[0] + gt_boxes.shape[0]
52 |             fg_rois_per_image = rois_per_image
53 |         elif self._batch_rois == -2:
54 |             rois_per_image = all_rois.shape[0]
55 |             fg_rois_per_image = rois_per_image
56 |         elif self._batch_rois < -10:
57 |             rois_per_image = -self._batch_rois / self._batch_images
58 |             fg_rois_per_image = np.round(self._fg_fraction * rois_per_image).astype(int)
59 |         else:
60 |             rois_per_image = self._batch_rois / self._batch_images
61 |             fg_rois_per_image = np.round(self._fg_fraction * rois_per_image).astype(int)
62 |
63 |
64 |         # Include ground-truth boxes in the set of candidate rois
65 |         zeros = np.zeros((gt_boxes.shape[0], 1), dtype=gt_boxes.dtype)
66 |         if self._batch_rois >= -1:
67 |             all_rois = np.vstack((all_rois, np.hstack((zeros, gt_boxes[:, :-1]))))
68 |         # Sanity check: single batch only
69 |         assert np.all(all_rois[:, 0] == 0), 'Only single item batches are supported'
70 |
71 |         if self._batch_rois == -1 or self._batch_rois == -2:
72 |             #rois, labels, bbox_targets, bbox_weights = \
73 |             #    sample_rois(all_rois, fg_rois_per_image, rois_per_image, self._num_classes, self._cfg, gt_boxes=gt_boxes)
74 |             rois, labels, bbox_targets, bbox_weights = \
75 |                 sample_rois_v2(all_rois,
self._num_classes, self._cfg, gt_boxes=gt_boxes) 76 | else: 77 | rois, labels, bbox_targets, bbox_weights = \ 78 | sample_rois(all_rois, fg_rois_per_image, rois_per_image, self._num_classes, self._cfg, gt_boxes=gt_boxes) 79 | 80 | if DEBUG: 81 | print "labels=", labels 82 | print 'num fg: {}'.format((labels > 0).sum()) 83 | print 'num bg: {}'.format((labels == 0).sum()) 84 | self._count += 1 85 | self._fg_num += (labels > 0).sum() 86 | self._bg_num += (labels == 0).sum() 87 | print "self._count=", self._count 88 | print 'num fg avg: {}'.format(self._fg_num / self._count) 89 | print 'num bg avg: {}'.format(self._bg_num / self._count) 90 | print 'ratio: {:.3f}'.format(float(self._fg_num) / float(self._bg_num)) 91 | 92 | for ind, val in enumerate([rois, labels, bbox_targets, bbox_weights]): 93 | self.assign(out_data[ind], req[ind], val) 94 | 95 | def backward(self, req, out_grad, in_data, out_data, in_grad, aux): 96 | self.assign(in_grad[0], req[0], 0) 97 | self.assign(in_grad[1], req[1], 0) 98 | 99 | 100 | @mx.operator.register('proposal_target') 101 | class ProposalTargetProp(mx.operator.CustomOpProp): 102 | def __init__(self, num_classes, batch_images, batch_rois, cfg, fg_fraction='0.25'): 103 | super(ProposalTargetProp, self).__init__(need_top_grad=False) 104 | self._num_classes = int(num_classes) 105 | self._batch_images = int(batch_images) 106 | self._batch_rois = int(batch_rois) 107 | self._cfg = cPickle.loads(cfg) 108 | self._fg_fraction = float(fg_fraction) 109 | 110 | def list_arguments(self): 111 | return ['rois', 'gt_boxes'] 112 | 113 | def list_outputs(self): 114 | return ['rois_output', 'label', 'bbox_target', 'bbox_weight'] 115 | 116 | def infer_shape(self, in_shape): 117 | rpn_rois_shape = in_shape[0] 118 | gt_boxes_shape = in_shape[1] 119 | 120 | if self._batch_rois == -1: 121 | rois = rpn_rois_shape[0] + gt_boxes_shape[0] 122 | elif self._batch_rois == -2: 123 | rois = rpn_rois_shape[0] 124 | elif self._batch_rois < -10: 125 | rois = -self._batch_rois 126 | else: 127 | rois = self._batch_rois 128 | 129 | #rois = rpn_rois_shape[0] + gt_boxes_shape[0] if self._batch_rois == -1 else self._batch_rois 130 | 131 | output_rois_shape = (rois, 5) 132 | label_shape = (rois, ) 133 | bbox_target_shape = (rois, self._num_classes * 4) 134 | bbox_weight_shape = (rois, self._num_classes * 4) 135 | 136 | return [rpn_rois_shape, gt_boxes_shape], \ 137 | [output_rois_shape, label_shape, bbox_target_shape, bbox_weight_shape] 138 | 139 | def create_operator(self, ctx, shapes, dtypes): 140 | return ProposalTargetOperator(self._num_classes, self._batch_images, self._batch_rois, self._cfg, self._fg_fraction) 141 | 142 | def declare_backward_dependency(self, out_grad, in_data, out_data): 143 | return [] 144 | -------------------------------------------------------------------------------- /relation_rcnn/symbols/__init__.py: -------------------------------------------------------------------------------- 1 | import resnet_v1_101_rcnn_attention_1024_pairwise_position_multi_head_16 2 | import resnet_v1_101_rcnn_dcn_attention_1024_pairwise_position_multi_head_16 3 | import resnet_v1_101_rcnn_attention_1024_pairwise_position_multi_head_16_learn_nms 4 | import resnet_v1_101_rcnn_dcn_attention_1024_pairwise_position_multi_head_16_learn_nms 5 | import resnet_v1_101_rcnn_fpn_attention_1024_pairwise_position_multi_head_16 6 | import resnet_v1_101_rcnn_fpn_attention_1024_pairwise_position_multi_head_16_learn_nms 7 | import resnet_v1_101_rcnn_learn_nms_1024_attention_1024_pairwise_position_multi_head_16 8 | 
import resnet_v1_101_rcnn_dcn
9 | import resnet_v1_101_rcnn_fpn
10 | import resnet_v1_101_rcnn
11 |
--------------------------------------------------------------------------------
/relation_rcnn/test.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Relation Networks for Object Detection
3 | # Copyright (c) 2017 Microsoft
4 | # Licensed under The MIT License [see LICENSE for details]
5 | # Modified by Jiayuan Gu, Dazhi Cheng, Yuwen Xiong
6 | # --------------------------------------------------------
7 | # Based on:
8 | # MX-RCNN
9 | # Copyright (c) 2016 by Contributors
10 | # Licence under The Apache 2.0 License
11 | # https://github.com/ijkguo/mx-rcnn/
12 | # --------------------------------------------------------
13 |
14 | import _init_paths
15 | import argparse
16 | import os
17 | import sys
18 | import time
19 | import logging
20 | from config.config import config, update_config
21 |
22 |
23 | def parse_args():
24 |     parser = argparse.ArgumentParser(description='Test a Faster R-CNN network')
25 |     # general
26 |     parser.add_argument('--cfg', help='experiment configure file name', required=True, type=str)
27 |
28 |     args, rest = parser.parse_known_args()
29 |     update_config(args.cfg)
30 |
31 |     # rcnn
32 |     parser.add_argument('--vis', help='turn on visualization', action='store_true')
33 |     parser.add_argument('--ignore_cache', help='ignore cached results boxes', action='store_true')
34 |     parser.add_argument('--thresh', help='valid detection threshold', default=1e-3, type=float)
35 |     parser.add_argument('--shuffle', help='shuffle data on visualization', action='store_true')
36 |     parser.add_argument('--test_epoch', help='the epoch of the model to be tested', default=config.TEST.test_epoch, type=int)
37 |     # nms
38 |     parser.add_argument('--nms', help='params for nms or softnms', default=config.TEST.NMS, type=float)
39 |     parser.add_argument('--softnms', help='whether to enable softnms', default=config.TEST.SOFTNMS, action='store_true')
40 |     parser.add_argument('--naive_nms', help='whether to enable naive nms', default=False, action='store_true')
41 |     parser.add_argument('--first_n', help='first_n for learn nms or nms', default=config.TEST.FIRST_N, type=int)
42 |     parser.add_argument('--merge', help='merge method for learn nms', default=config.TEST.MERGE_METHOD, type=int)
43 |     parser.add_argument('--debug', help='whether to enable debug mode', default=False, action='store_true')
44 |     # dataset
45 |     parser.add_argument('--test_set', help='which set to be tested', default=config.dataset.test_image_set, type=str)
46 |     args, rest = parser.parse_known_args()
47 |     # update config
48 |     config.TEST.test_epoch = args.test_epoch
49 |     config.TEST.NMS = args.nms
50 |     config.TEST.SOFTNMS = args.softnms and (not args.naive_nms)
51 |     config.TEST.FIRST_N = args.first_n
52 |     config.TEST.MERGE_METHOD = args.merge
53 |     config.dataset.test_image_set = args.test_set
54 |     return args
55 |
56 |
57 | args = parse_args()
58 | curr_path = os.path.abspath(os.path.dirname(__file__))
59 | sys.path.insert(0, os.path.join(curr_path, '../external/mxnet', config.MXNET_VERSION))
60 |
61 | import mxnet as mx
62 | import numpy as np
63 | from function.test_rcnn import test_rcnn
64 | from utils.create_logger import create_logger
65 |
66 |
67 | def main():
68 |     ctx = [mx.gpu(int(i)) for i in config.gpus.split(',')]
69 |     print args
70 |     np.random.seed(0)
71 |     mx.random.seed(0)
72 |     logger, final_output_path = create_logger(config.output_path,
args.cfg, config.dataset.test_image_set) 73 | 74 | test_rcnn(config, config.dataset.dataset, config.dataset.test_image_set, config.dataset.root_path, config.dataset.dataset_path, 75 | ctx, os.path.join(final_output_path, '..', '_'.join([iset for iset in config.dataset.image_set.split('+')]), config.TRAIN.model_prefix), config.TEST.test_epoch, 76 | args.vis, args.ignore_cache, args.shuffle, config.TEST.HAS_RPN, config.dataset.proposal, args.thresh, logger=logger, output_path=final_output_path) 77 | 78 | if __name__ == '__main__': 79 | main() 80 | -------------------------------------------------------------------------------- /relation_rcnn/train_end2end.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Relation Networks for Object Detection 3 | # Copyright (c) 2017 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Modified by Jiayuan Gu, Dazhi Cheng, Yuwen Xiong 6 | # -------------------------------------------------------- 7 | # Based on: 8 | # MX-RCNN 9 | # Copyright (c) 2016 by Contributors 10 | # Licence under The Apache 2.0 License 11 | # https://github.com/ijkguo/mx-rcnn/ 12 | # -------------------------------------------------------- 13 | 14 | import _init_paths 15 | 16 | import time 17 | import argparse 18 | import logging 19 | import pprint 20 | import os 21 | import sys 22 | from config.config import config, update_config 23 | 24 | def parse_args(): 25 | parser = argparse.ArgumentParser(description='Train Faster-RCNN network') 26 | # general 27 | parser.add_argument('--cfg', help='experiment configure file name', required=True, type=str) 28 | 29 | args, rest = parser.parse_known_args() 30 | # update config 31 | update_config(args.cfg) 32 | 33 | # training 34 | parser.add_argument('--frequent', help='frequency of logging', default=config.default.frequent, type=int) 35 | args, rest = parser.parse_known_args() 36 | return args 37 | 38 | args = parse_args() 39 | curr_path = os.path.abspath(os.path.dirname(__file__)) 40 | sys.path.insert(0, os.path.join(curr_path, '../external/mxnet', config.MXNET_VERSION)) 41 | 42 | import shutil 43 | import numpy as np 44 | import mxnet as mx 45 | 46 | from symbols import * 47 | from core import callback, metric 48 | from core.loader import AnchorLoader 49 | from core.module import MutableModule 50 | from utils.create_logger import create_logger 51 | from utils.load_data import load_gt_roidb, merge_roidb, filter_roidb 52 | from utils.load_model import load_param 53 | from utils.PrefetchingIter import PrefetchingIterV2 as PrefetchingIter 54 | from utils.lr_scheduler import WarmupMultiFactorScheduler 55 | 56 | 57 | def train_net(args, ctx, pretrained, epoch, prefix, begin_epoch, end_epoch, lr, lr_step): 58 | np.random.seed(0) 59 | mx.random.seed(0) 60 | logger, final_output_path = create_logger(config.output_path, args.cfg, config.dataset.image_set) 61 | prefix = os.path.join(final_output_path, prefix) 62 | 63 | # load symbol 64 | shutil.copy2(os.path.join(curr_path, 'symbols', config.symbol + '.py'), final_output_path) 65 | sym_instance = eval(config.symbol + '.' 
+ config.symbol)() 66 | sym = sym_instance.get_symbol(config, is_train=True) 67 | feat_sym = sym.get_internals()['rpn_cls_score_output'] 68 | 69 | # setup multi-gpu 70 | batch_size = len(ctx) 71 | input_batch_size = config.TRAIN.BATCH_IMAGES * batch_size 72 | 73 | # print config 74 | pprint.pprint(config) 75 | logger.info('training config:{}\n'.format(pprint.pformat(config))) 76 | 77 | # load dataset and prepare imdb for training 78 | image_sets = [iset for iset in config.dataset.image_set.split('+')] 79 | roidbs = [load_gt_roidb(config.dataset.dataset, image_set, config.dataset.root_path, config.dataset.dataset_path, 80 | flip=config.TRAIN.FLIP) 81 | for image_set in image_sets] 82 | roidb = merge_roidb(roidbs) 83 | roidb = filter_roidb(roidb, config) 84 | 85 | # load training data 86 | train_data = AnchorLoader(feat_sym, roidb, config, batch_size=input_batch_size, shuffle=config.TRAIN.SHUFFLE, ctx=ctx, 87 | feat_stride=config.network.RPN_FEAT_STRIDE, anchor_scales=config.network.ANCHOR_SCALES, 88 | anchor_ratios=config.network.ANCHOR_RATIOS, aspect_grouping=config.TRAIN.ASPECT_GROUPING) 89 | 90 | # infer max shape 91 | max_data_shape = [('data', (config.TRAIN.BATCH_IMAGES, 3, max([v[0] for v in config.SCALES]), max([v[1] for v in config.SCALES])))] 92 | max_data_shape, max_label_shape = train_data.infer_shape(max_data_shape) 93 | max_data_shape.append(('gt_boxes', (config.TRAIN.BATCH_IMAGES, 100, 5))) 94 | print 'providing maximum shape', max_data_shape, max_label_shape 95 | 96 | data_shape_dict = dict(train_data.provide_data_single + train_data.provide_label_single) 97 | pprint.pprint(data_shape_dict) 98 | sym_instance.infer_shape(data_shape_dict) 99 | pprint.pprint(sym_instance.arg_shape_dict) 100 | logging.info(pprint.pformat(sym_instance.arg_shape_dict)) 101 | #dot = mx.viz.plot_network(sym, node_attrs={'shape': 'rect', 'fixedsize': 'false'}) 102 | #dot.render(os.path.join('./output/rcnn/network_vis', config.symbol + '_rcnn')) 103 | 104 | # load and initialize params 105 | if config.TRAIN.RESUME: 106 | print('continue training from ', begin_epoch) 107 | arg_params, aux_params = load_param(prefix, begin_epoch, convert=True) 108 | else: 109 | arg_params, aux_params = load_param(pretrained, epoch, convert=True) 110 | sym_instance.init_weight(config, arg_params, aux_params) 111 | 112 | # check parameter shapes 113 | sym_instance.check_parameter_shapes(arg_params, aux_params, data_shape_dict) 114 | 115 | # create solver 116 | fixed_param_prefix = config.network.FIXED_PARAMS 117 | data_names = [k[0] for k in train_data.provide_data_single] 118 | label_names = [k[0] for k in train_data.provide_label_single] 119 | 120 | mod = MutableModule(sym, data_names=data_names, label_names=label_names, 121 | logger=logger, context=ctx, max_data_shapes=[max_data_shape for _ in range(batch_size)], 122 | max_label_shapes=[max_label_shape for _ in range(batch_size)], fixed_param_prefix=fixed_param_prefix) 123 | 124 | if config.TRAIN.RESUME: 125 | mod._preload_opt_states = '%s-%04d.states'%(prefix, begin_epoch) 126 | 127 | # decide training params 128 | # metric 129 | eval_metric = metric.RCNNAccMetric(config) 130 | cls_metric = metric.RCNNLogLossMetric(config) 131 | bbox_metric = metric.RCNNL1LossMetric(config) 132 | eval_metrics = mx.metric.CompositeEvalMetric() 133 | # rpn_eval_metric, rpn_cls_metric, rpn_bbox_metric, eval_metric, cls_metric, bbox_metric 134 | if config.TRAIN.JOINT_TRAINING or (not config.TRAIN.LEARN_NMS): 135 | rpn_eval_metric = metric.RPNAccMetric() 136 | rpn_cls_metric = 
metric.RPNLogLossMetric() 137 | rpn_bbox_metric = metric.RPNL1LossMetric() 138 | for child_metric in [rpn_eval_metric, rpn_cls_metric, rpn_bbox_metric]: 139 | eval_metrics.add(child_metric) 140 | for child_metric in [eval_metric, cls_metric, bbox_metric]: 141 | eval_metrics.add(child_metric) 142 | if config.TRAIN.LEARN_NMS: 143 | eval_metrics.add(metric.NMSLossMetric(config, 'pos')) 144 | eval_metrics.add(metric.NMSLossMetric(config, 'neg')) 145 | eval_metrics.add(metric.NMSAccMetric(config)) 146 | 147 | # callback 148 | batch_end_callback = callback.Speedometer(train_data.batch_size, frequent=args.frequent) 149 | means = np.tile(np.array(config.TRAIN.BBOX_MEANS), 2 if config.CLASS_AGNOSTIC else config.dataset.NUM_CLASSES) 150 | stds = np.tile(np.array(config.TRAIN.BBOX_STDS), 2 if config.CLASS_AGNOSTIC else config.dataset.NUM_CLASSES) 151 | epoch_end_callback = [mx.callback.module_checkpoint(mod, prefix, period=1, save_optimizer_states=True), 152 | callback.do_checkpoint(prefix, means, stds)] 153 | # decide learning rate 154 | base_lr = lr 155 | lr_factor = config.TRAIN.lr_factor 156 | lr_epoch = [float(epoch) for epoch in lr_step.split(',')] 157 | lr_epoch_diff = [epoch - begin_epoch for epoch in lr_epoch if epoch > begin_epoch] 158 | lr = base_lr * (lr_factor ** (len(lr_epoch) - len(lr_epoch_diff))) 159 | lr_iters = [int(epoch * len(roidb) / batch_size) for epoch in lr_epoch_diff] 160 | print('lr', lr, 'lr_epoch_diff', lr_epoch_diff, 'lr_iters', lr_iters) 161 | lr_scheduler = WarmupMultiFactorScheduler(lr_iters, lr_factor, config.TRAIN.warmup, config.TRAIN.warmup_lr, config.TRAIN.warmup_step) 162 | # optimizer 163 | optimizer_params = {'momentum': config.TRAIN.momentum, 164 | 'wd': config.TRAIN.wd, 165 | 'learning_rate': lr, 166 | 'lr_scheduler': lr_scheduler, 167 | 'rescale_grad': 1.0, 168 | 'clip_gradient': None} 169 | 170 | if not isinstance(train_data, PrefetchingIter): 171 | train_data = PrefetchingIter(train_data) 172 | 173 | # train 174 | mod.fit(train_data, eval_metric=eval_metrics, epoch_end_callback=epoch_end_callback, 175 | batch_end_callback=batch_end_callback, kvstore=config.default.kvstore, 176 | optimizer='sgd', optimizer_params=optimizer_params, 177 | arg_params=arg_params, aux_params=aux_params, begin_epoch=begin_epoch, num_epoch=end_epoch) 178 | 179 | 180 | def main(): 181 | print('Called with argument:', args) 182 | ctx = [mx.gpu(int(i)) for i in config.gpus.split(',')] 183 | train_net(args, ctx, config.network.pretrained, config.network.pretrained_epoch, config.TRAIN.model_prefix, 184 | config.TRAIN.begin_epoch, config.TRAIN.end_epoch, config.TRAIN.lr, config.TRAIN.lr_step) 185 | 186 | if __name__ == '__main__': 187 | main() 188 | -------------------------------------------------------------------------------- /relation_rcnn/train_rcnn.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Relation Networks for Object Detection 3 | # Copyright (c) 2017 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Modified by Jiayuan Gu, Dazhi Cheng, Yuwen Xiong 6 | # -------------------------------------------------------- 7 | # Based on: 8 | # MX-RCNN 9 | # Copyright (c) 2016 by Contributors 10 | # Licence under The Apache 2.0 License 11 | # https://github.com/ijkguo/mx-rcnn/ 12 | # -------------------------------------------------------- 13 | 14 | import _init_paths 15 | 16 | import time 17 | import argparse 18 | import logging 19 | import pprint 20 | 
import os 21 | import sys 22 | from config.config import config, update_config 23 | 24 | def parse_args(): 25 | parser = argparse.ArgumentParser(description='Train Faster-RCNN network') 26 | # general 27 | parser.add_argument('--cfg', help='experiment configure file name', required=True, type=str) 28 | 29 | args, rest = parser.parse_known_args() 30 | # update config 31 | update_config(args.cfg) 32 | 33 | # training 34 | parser.add_argument('--frequent', help='frequency of logging', default=config.default.frequent, type=int) 35 | args = parser.parse_args() 36 | return args 37 | 38 | args = parse_args() 39 | curr_path = os.path.abspath(os.path.dirname(__file__)) 40 | sys.path.insert(0, os.path.join(curr_path, '../external/mxnet', config.MXNET_VERSION)) 41 | 42 | import shutil 43 | import numpy as np 44 | import mxnet as mx 45 | 46 | from function.train_rpn import train_rpn 47 | from function.test_rpn import test_rpn 48 | from function.train_rcnn import train_rcnn 49 | from utils.create_logger import create_logger 50 | 51 | 52 | def main(): 53 | print ('Called with argument:', args) 54 | ctx = [mx.gpu(int(i)) for i in config.gpus.split(',')] 55 | logger, output_path = create_logger(config.output_path, args.cfg, config.dataset.image_set) 56 | shutil.copy2(os.path.join(curr_path, 'symbols', config.symbol + '.py'), output_path) 57 | 58 | assert config.TRAIN.END2END == False 59 | prefix = os.path.join(output_path, config.TRAIN.model_prefix) 60 | logging.info('########## TRAIN rcnn WITH IMAGENET INIT AND RPN DETECTION') 61 | train_rcnn(config, config.dataset.dataset, config.dataset.image_set, config.dataset.root_path, config.dataset.dataset_path, 62 | args.frequent, config.default.kvstore, config.TRAIN.FLIP, config.TRAIN.SHUFFLE, config.TRAIN.RESUME, 63 | ctx, config.network.pretrained, config.network.pretrained_epoch, prefix, config.TRAIN.begin_epoch, 64 | config.TRAIN.end_epoch, train_shared=False, lr=config.TRAIN.lr, lr_step=config.TRAIN.lr_step, 65 | proposal=config.dataset.proposal, logger=logger, output_path=output_path) 66 | 67 | if __name__ == '__main__': 68 | main() 69 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Cython 2 | EasyDict 3 | opencv-python 4 | mxnet-cu80 5 | --------------------------------------------------------------------------------