├── .gitignore ├── .gitmodules ├── LICENSE ├── README.md ├── bua ├── __init__.py ├── caffe │ ├── __init__.py │ ├── config.py │ ├── dataloader │ │ ├── __init__.py │ │ ├── dataset_mapper.py │ │ ├── detection_utils.py │ │ └── transform_gen.py │ ├── modeling │ │ ├── __init__.py │ │ ├── backbone.py │ │ ├── box_regression.py │ │ ├── fast_rcnn.py │ │ ├── layers │ │ │ ├── csrc │ │ │ │ ├── __init__.py │ │ │ │ ├── nms │ │ │ │ │ ├── nms.cu │ │ │ │ │ ├── nms.h │ │ │ │ │ ├── nms_cpu.cpp │ │ │ │ │ ├── vision_cpu.h │ │ │ │ │ └── vision_cuda.h │ │ │ │ └── vision.cpp │ │ │ ├── nms.py │ │ │ └── wrappers.py │ │ ├── rcnn.py │ │ ├── roi_heads.py │ │ ├── rpn.py │ │ └── rpn_outputs.py │ └── postprocessing.py ├── d2 │ ├── __init__.py │ ├── config.py │ ├── dataloader │ │ ├── __init__.py │ │ ├── build_loader.py │ │ └── dataset_mapper.py │ └── modeling │ │ └── roi_heads.py └── visual_genome.py ├── configs ├── caffe │ ├── test-caffe-r101-fix36.yaml │ ├── test-caffe-r101.yaml │ └── test-caffe-r152.yaml └── d2 │ ├── base-d2.yaml │ ├── test-d2-X152.yaml │ ├── test-d2-r101.yaml │ ├── test-d2-r50.yaml │ ├── train-d2-r101.yaml │ └── train-d2-r50.yaml ├── datasets ├── demo │ ├── 000456.jpg │ ├── 000542.jpg │ ├── 001150.jpg │ ├── 001763.jpg │ ├── 004545.jpg │ ├── example_image.jpg │ ├── example_image1.png │ └── example_image2.png └── init ├── evaluation ├── __init__.py ├── attributes_vocab.txt ├── objects_vocab.txt ├── vg_eval.py └── vg_evaluation.py ├── extract_features.py ├── opts.py ├── setup.py ├── train_net.py └── utils ├── __init__.py ├── extract_d2features.py ├── extract_features_faster.py ├── extract_features_multigpu.py ├── extract_features_singlegpu.py ├── extract_utils.py ├── extractor.py ├── progress_bar.py ├── utils.py └── visualize.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | /bottom_up_attention.pytorch.egg-info/ 2 | __pycache__/ 3 | .ipynb_checkpoints/ 4 | /build/ 5 | /datasets/visual_genome/ 6 | /extract/ 7 | /output/ 8 | /output_caffe152/ 9 | *.pth 10 | *.pkl -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "detectron2"] 2 | path = detectron2 3 | url = https://github.com/facebookresearch/detectron2 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 
22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # bottom-up-attention.pytorch 2 | 3 | This repository contains a **PyTorch** reimplementation of the [bottom-up-attention](https://github.com/peteanderson80/bottom-up-attention) project based on *Caffe*. 4 | 5 | We use [Detectron2](https://github.com/facebookresearch/detectron2) as the backend to provide complete functionality, including training, testing and feature extraction. Furthermore, we migrate the pre-trained Caffe-based model from the original repository, which can extract **the same visual features** as the original model (with deviation < 0.01). 6 | 7 | Some example object and attribute predictions for salient image regions are illustrated below. The script to obtain the following visualizations can be found [here](utils/visualize.ipynb). 8 | 9 | ![example-image](datasets/demo/example_image.jpg?raw=true) 10 | 11 | ## Table of Contents 12 | 13 | 0. [Prerequisites](#Prerequisites) 14 | 1. [Training](#Training) 15 | 2. [Testing](#Testing) 16 | 3. [Feature Extraction](#Feature-Extraction) 17 | 4. [Pre-trained models](#Pre-trained-models) 18 | 19 | ## Prerequisites 20 | 21 | #### Requirements 22 | 23 | - [Python](https://www.python.org/downloads/) >= 3.6 24 | - [PyTorch](http://pytorch.org/) >= 1.4 25 | - [CUDA](https://developer.nvidia.com/cuda-toolkit) >= 9.2 and [cuDNN](https://developer.nvidia.com/cudnn) 26 | - [Apex](https://github.com/NVIDIA/apex.git) 27 | - [Detectron2](https://github.com/facebookresearch/detectron2) 28 | - [Ray](https://github.com/ray-project/ray) 29 | - [OpenCV](https://opencv.org/) 30 | - [Pycocotools](https://github.com/cocodataset/cocoapi) 31 | 32 | Note that most of the requirements above are needed for Detectron2. 33 | 34 | #### Installation 35 | 36 | 1. Clone the project including the required version (v0.2.1) of Detectron2. **Note that if you use another version, some strange problems may occur**. 37 | ```bash 38 | # clone the repository including Detectron2 (@be792b9) 39 | $ git clone --recursive https://github.com/MILVLG/bottom-up-attention.pytorch 40 | ``` 41 | 42 | 2. Install Detectron2: 43 | ```bash 44 | $ cd detectron2 45 | $ pip install -e . 46 | $ cd .. 47 | ``` 48 | **We recommend using Detectron2 v0.2.1 (@be792b9) as the backend for this project, which has been cloned in step 1. We believe a newer Detectron2 version is also compatible with this project unless its interface has been changed (we have tested v0.3 with PyTorch 1.5).** 49 | 50 | 3. Compile the remaining tools using the following script: 51 | 52 | ```bash 53 | # install apex 54 | $ git clone https://github.com/NVIDIA/apex.git 55 | $ cd apex 56 | $ python setup.py install 57 | $ cd .. 58 | # install the remaining modules 59 | $ python setup.py build develop 60 | $ pip install ray 61 | ``` 62 | 63 | #### Setup 64 | 65 | If you want to train or test the model, you need to download the images and annotation files of the Visual Genome (VG) dataset. **If you only need to extract visual features using the pre-trained model, you can skip this part**. 66 | 67 | The original VG images ([part1](https://cs.stanford.edu/people/rak248/VG_100K_2/images.zip) and [part2](https://cs.stanford.edu/people/rak248/VG_100K_2/images2.zip)) should be downloaded and unzipped into a single folder, which is then placed in the `datasets` folder.
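A minimal sketch of this download step is shown below (the URLs are the ones listed above; the target folder `datasets/visual_genome/images` follows the directory structure shown further down and may need to be adapted to your setup):

```bash
# Sketch only: download both image archives and merge them into a single folder.
mkdir -p datasets/visual_genome/images
wget https://cs.stanford.edu/people/rak248/VG_100K_2/images.zip
wget https://cs.stanford.edu/people/rak248/VG_100K_2/images2.zip
unzip -q images.zip -d vg_part1 && unzip -q images2.zip -d vg_part2
# Each archive unzips into its own subfolder; collect all images into one place.
find vg_part1 vg_part2 -name '*.jpg' -exec mv {} datasets/visual_genome/images/ \;
```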
68 | 69 | The annotation files generated by the original repository need to be transformed into the COCO format required by Detectron2. The preprocessed annotation files can be downloaded [here](https://awma1-my.sharepoint.com/:u:/g/personal/yuz_l0_tn/EWpiE_5PvBdKiKfCi0pBx_EB5ONo8D8XABUz7tWcnltCrw?download=1) and unzipped to the `datasets` folder. 70 | 71 | Finally, the `datasets` folder will have the following structure: 72 | 73 | ```angular2html 74 | |-- datasets 75 | |-- visual_genome 76 | | |-- images 77 | | | | |-- 1.jpg 78 | | | | |-- 2.jpg 79 | | | | |-- ... 80 | | | | |-- ... 81 | | |-- annotations 82 | | | |-- visual_genome_train.json 83 | | | |-- visual_genome_test.json 84 | | | |-- visual_genome_val.json 85 | ``` 86 | 87 | ## Training 88 | 89 | The following script will train a bottom-up-attention model on the `train` split of VG. 90 | 91 | ```bash 92 | $ python3 train_net.py --mode d2 \ 93 | --config-file configs/d2/train-d2-r101.yaml \ 94 | --resume 95 | ``` 96 | 97 | 1. `mode = 'd2'` refers to training a model with the Detectron2 backend, which is inspired by [grid-feats-vqa](https://github.com/facebookresearch/grid-feats-vqa/). We think it is unnecessary to train a new model using the `caffe` mode. The pre-trained Caffe models are provided for testing and feature extraction. 98 | 99 | 2. `config-file` refers to all the configurations of the model. 100 | 101 | 3. `resume` is a flag for resuming training from a specific checkpoint. 102 | 103 | ## Testing 104 | 105 | Given the trained model, the following script will test its performance on the `val` split of VG: 106 | 107 | ```bash 108 | $ python3 train_net.py --mode caffe \ 109 | --config-file configs/caffe/test-caffe-r101.yaml \ 110 | --eval-only 111 | ``` 112 | 113 | 1. `mode = {'caffe', 'd2'}` refers to the mode being used. For the model converted from Caffe, you need to use the `caffe` mode. For other models trained with Detectron2, you need to use the `d2` mode. 114 | 115 | 2. `config-file` refers to all the configurations of the model, which also include the path to the model weights. 116 | 117 | 3. `eval-only` is a flag that indicates the testing phase. 118 | 119 | ## Feature Extraction 120 | 121 | Given the trained model, the following script will extract the bottom-up-attention visual features. Both single-GPU and multi-GPU extraction are supported. 122 | 123 | ```bash 124 | $ python3 extract_features.py --mode caffe \ 125 | --num-cpus 32 --gpus '0,1,2,3' \ 126 | --extract-mode roi_feats \ 127 | --min-max-boxes '10,100' \ 128 | --config-file configs/caffe/test-caffe-r101.yaml \ 129 | --image-dir --bbox-dir --out-dir \ 130 | --fastmode 131 | ``` 132 | 133 | 1. `mode = {'caffe', 'd2'}` refers to the mode being used. For the model converted from Caffe, you need to use the `caffe` mode. For other models trained with Detectron2, you need to use the `d2` mode. `'caffe'` is the default value. **Note** that the `d2` mode needs to run with [Ray](https://github.com/ray-project/ray). 134 | 135 | 2. `num-cpus` refers to the number of CPU cores to use for accelerating the CPU computation. **0** stands for using all available CPUs and **1** is the default value. 136 | 137 | 3. `gpus` refers to the IDs of the GPUs to use. **'0'** is the default value. If more than one GPU is specified, for example **'0,1,2,3'**, the script will use the [Ray](https://github.com/ray-project/ray) library for parallelization. 138 | 139 | 4.
`config-file` refers to all the configurations of the model, which also include the path to the model weights. 140 | 141 | 5. `extract-mode` refers to the mode for feature extraction, one of {`roi_feats`, `bboxes`, `bbox_feats`}. 142 | 143 | 6. `min-max-boxes` refers to the min-and-max number of features (boxes) to be extracted. **Note** that the `d2` mode only supports setting the min-and-max numbers to `'100,100'` to get exactly 100 boxes per image; other values yield roughly 50~60 boxes per image. 144 | 145 | 7. `image-dir` refers to the input image directory. 146 | 147 | 8. `bbox-dir` refers to the pre-proposed bbox directory. It is only used when `extract-mode` is set to `'bbox_feats'`. 148 | 149 | 9. `out-dir` refers to the output feature directory. 150 | 151 | 10. `fastmode` enables a faster version (about `2x` faster on a workstation with 4 Titan-V GPUs and 32 CPU cores), at the expense of a potential memory leak if the computing capabilities of the GPUs and CPUs are mismatched. More details and some matched examples can be found [here](https://github.com/MILVLG/bottom-up-attention.pytorch/pull/41). 152 | 153 | 154 | 155 | Using the same pre-trained model, we also provide an alternative *two-stage* strategy for extracting visual features. This results in (slightly) more accurate bounding boxes and visual features, at the expense of extra time overhead: 156 | 157 | ```bash 158 | # extract bboxes only: 159 | $ python3 extract_features.py --mode caffe \ 160 | --num-cpus 32 --gpus '0,1,2,3' \ 161 | --extract-mode bboxes \ 162 | --config-file configs/caffe/test-caffe-r101.yaml \ 163 | --image-dir --out-dir --resume 164 | 165 | # extract visual features with the pre-extracted bboxes: 166 | $ python3 extract_features.py --mode caffe \ 167 | --num-cpus 32 --gpus '0,1,2,3' \ 168 | --extract-mode bbox_feats \ 169 | --config-file configs/caffe/test-caffe-r101.yaml \ 170 | --image-dir --bbox-dir --out-dir --resume 171 | 172 | ``` 173 | 174 | ## Pre-trained models 175 | 176 | We provide the following pre-trained models, including models trained in both the `caffe` and `d2` modes. 177 | 178 | For the models of the `caffe` mode, `R101-k36` and `R101-k10-100` refer to the [fix36 model](https://www.dropbox.com/s/2h4hmgcvpaewizu/resnet101_faster_rcnn_final_iter_320000.caffemodel?dl=1) and [dynamic 10-100 model](https://www.dropbox.com/s/5xethd2nxa8qrnq/resnet101_faster_rcnn_final.caffemodel?dl=1) provided in the original [bottom-up-attention](https://github.com/peteanderson80/bottom-up-attention) repository. We additionally provide an `R152` model which outperforms the two counterparts above. 179 | 180 | For the models of the `d2` mode, we follow the configurations and implementations of [grid-feats-vqa](https://github.com/facebookresearch/grid-feats-vqa/) and trained three models using the training script in this repository, namely `R50`, `R101` and `X152`.
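The table below lists the available checkpoints. As a usage sketch, a downloaded checkpoint can be evaluated with the testing command from above; note that the trailing `MODEL.WEIGHTS` override assumes `train_net.py` forwards extra key-value pairs to the config, as Detectron2's `default_argument_parser` does (otherwise, set `MODEL.WEIGHTS` directly in the yaml config), and the checkpoint path is only a placeholder:

```bash
# Sketch only: evaluate a downloaded caffe-mode checkpoint on the VG val split.
$ python3 train_net.py --mode caffe \
         --config-file configs/caffe/test-caffe-r101.yaml \
         --eval-only MODEL.WEIGHTS /path/to/downloaded_r101_model.pth
```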
181 | 182 | name | mode | objects mAP@0.5 |weighted objects mAP@0.5|download 183 | :-:|:-:|:-:|:-:|:-: 184 | [R101-k36](./configs/caffe/test-caffe-r101-fix36.yaml)|caffe|9.3|14.0|[model](https://awma1-my.sharepoint.com/:u:/g/personal/yuz_l0_tn/EUKhQ3hSRv9JrrW64qpNLSIBGoOjEGCkF8zvgBP9gKax-w?download=1) 185 | [R101-k10-100](./configs/caffe/test-caffe-r101.yaml)|caffe|10.2|15.1|[model](https://awma1-my.sharepoint.com/:u:/g/personal/yuz_l0_tn/EaXvCC3WjtlLvvEfLr3oa8UBLA21tcLh4L8YLbYXl6jgjg?download=1) 186 | [R152](./configs/caffe/test-caffe-r152.yaml)|caffe|**11.1**|15.7|[model](https://awma1-my.sharepoint.com/:u:/g/personal/yuz_l0_tn/ETDgy4bY0xpGgsu5tEMzgLcBQjAwpnkKkltNTtPVuMj4GQ?download=1) 187 | [R50](./configs/d2/test-d2-r50.yaml)|d2|8.2|14.9|[model](https://awma1-my.sharepoint.com/:u:/g/personal/yuz_l0_tn/EfYoinBHrFlKmKonocse8yEBXN-hyCHNygYqjxGpIBsPvQ?download=1) 188 | [R101](./configs/d2/test-d2-r101.yaml)|d2|9.2|15.9|[model](https://awma1-my.sharepoint.com/:u:/g/personal/yuz_l0_tn/EXXItFlOpHlNq81O1H_cPyoBXUPyXoHmIwPEudnTWKX4rQ?download=1) 189 | [X152](./configs/d2/test-d2-X152.yaml)|d2|10.7|**17.7**|[model](https://awma1-my.sharepoint.com/:u:/g/personal/yuz_l0_tn/EdLhYc39P8tBkEDVCDOrNV4BgPhz9M4iBq8oPw1iyVSlmg?download=1) 190 | 191 | 192 | ## License 193 | 194 | This project is released under the [Apache 2.0 license](LICENSE). 195 | 196 | ## Contact 197 | 198 | This repository is currently maintained by Zhou Yu ([@yuzcccc](https://github.com/yuzcccc)), Tongan Luo ([@Zoroaster97](https://github.com/Zoroaster97)), and Jing Li ([@J1mL3e_](https://github.com/JimLee4530)). 199 | 200 | ## Citation 201 | 202 | If this repository is helpful for your research or you want to refer the provided pretrained models, you could cite the work using the following BibTeX entry: 203 | 204 | ``` 205 | @misc{yu2020buapt, 206 | author = {Yu, Zhou and Li, Jing and Luo, Tongan and Yu, Jun}, 207 | title = {A PyTorch Implementation of Bottom-Up-Attention}, 208 | howpublished = {\url{https://github.com/MILVLG/bottom-up-attention.pytorch}}, 209 | year = {2020} 210 | } 211 | 212 | ``` 213 | -------------------------------------------------------------------------------- /bua/__init__.py: -------------------------------------------------------------------------------- 1 | from .d2 import add_attribute_config 2 | from .caffe import add_bottom_up_attention_config 3 | 4 | def add_config(args, cfg): 5 | if args.mode == "caffe": 6 | add_bottom_up_attention_config(cfg, True) 7 | elif args.mode == "d2": 8 | add_attribute_config(cfg) 9 | else: 10 | raise Exception("detection model not supported: {}".format(args.model)) 11 | from . import visual_genome -------------------------------------------------------------------------------- /bua/caffe/__init__.py: -------------------------------------------------------------------------------- 1 | from .config import add_bottom_up_attention_config 2 | from .modeling.backbone import build_bua_resnet_backbone 3 | from .modeling.rcnn import GeneralizedBUARCNN 4 | from .modeling.roi_heads import BUACaffeRes5ROIHeads 5 | from .modeling.rpn import StandardBUARPNHead, BUARPN -------------------------------------------------------------------------------- /bua/caffe/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 3 | 4 | from detectron2.config import CfgNode as CN 5 | 6 | 7 | def add_bottom_up_attention_config(cfg, caffe=False): 8 | """ 9 | Add config for tridentnet. 10 | """ 11 | _C = cfg 12 | 13 | _C.MODEL.BUA = CN() 14 | _C.MODEL.BUA.CAFFE = caffe 15 | _C.MODEL.BUA.RESNET_VERSION = 1 16 | _C.MODEL.BUA.ATTRIBUTE_ON = False 17 | _C.MODEL.BUA.EXTRACT_FEATS = False 18 | 19 | _C.MODEL.BUA.RPN = CN() 20 | # out_channels of conv for bottom-up-attentions RPN. 21 | _C.MODEL.BUA.RPN.CONV_OUT_CHANNELS = 512 22 | 23 | _C.MODEL.BUA.EXTRACTOR = CN() 24 | 25 | # EXTRACTOR.MODE {1: extract roi features, 2: extract bbox only ,3: extract roi features by gt_bbox} 26 | _C.MODEL.BUA.EXTRACTOR.MODE = 1 27 | 28 | # config of postprocessing in extractor 29 | _C.MODEL.BUA.EXTRACTOR.MIN_BOXES = 10 30 | _C.MODEL.BUA.EXTRACTOR.MAX_BOXES = 100 31 | _C.MODEL.BUA.EXTRACTOR.CONF_THRESH = 0.2 32 | _C.MODEL.BUA.EXTRACTOR.OUTPUT_DIR = ".output/" 33 | 34 | _C.MODEL.BUA.ATTRIBUTE = CN() 35 | _C.MODEL.BUA.ATTRIBUTE.NUM_CLASSES = 401 36 | -------------------------------------------------------------------------------- /bua/caffe/dataloader/__init__.py: -------------------------------------------------------------------------------- 1 | from .dataset_mapper import DatasetMapper 2 | 3 | __all__ = [k for k in globals().keys() if "builtin" not in k and not k.startswith("_")] -------------------------------------------------------------------------------- /bua/caffe/dataloader/dataset_mapper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | import copy 3 | import logging 4 | import numpy as np 5 | import torch 6 | import cv2 7 | 8 | from detectron2.data import detection_utils as utils 9 | from detectron2.data import transforms as T 10 | 11 | from .transform_gen import ResizeShortestEdge 12 | from .detection_utils import annotations_to_instances 13 | 14 | """ 15 | This file contains the default mapping that's applied to "dataset dicts". 16 | """ 17 | 18 | __all__ = ["DatasetMapper"] 19 | 20 | def build_transform_gen(cfg, is_train): 21 | """ 22 | Create a list of :class:`TransformGen` from config. 23 | Now it includes resizing and flipping. 24 | 25 | Returns: 26 | list[TransformGen] 27 | """ 28 | if is_train: 29 | min_size = cfg.INPUT.MIN_SIZE_TRAIN 30 | max_size = cfg.INPUT.MAX_SIZE_TRAIN 31 | else: 32 | min_size = cfg.INPUT.MIN_SIZE_TEST 33 | max_size = cfg.INPUT.MAX_SIZE_TEST 34 | 35 | logger = logging.getLogger(__name__) 36 | tfm_gens = [] 37 | tfm_gens.append(ResizeShortestEdge(min_size, max_size, cfg.MODEL.PIXEL_MEAN)) 38 | if is_train: 39 | logger.info("TransformGens used in training: " + str(tfm_gens)) 40 | return tfm_gens 41 | 42 | class DatasetMapper: 43 | """ 44 | A callable which takes a dataset dict in Detectron2 Dataset format, 45 | and map it into a format used by the model. 46 | 47 | This is the default callable to be used to map your dataset dict into training data. 48 | You may need to follow it to implement your own one for customized logic. 49 | 50 | The callable currently does the following: 51 | 1. Read the image from "file_name" 52 | 2. Applies cropping/geometric transforms to the image and annotations 53 | 3. 
Prepare data and annotations to Tensor and :class:`Instances` 54 | """ 55 | 56 | def __init__(self, cfg, is_train=True): 57 | if cfg.INPUT.CROP.ENABLED and is_train: 58 | self.crop_gen = T.RandomCrop(cfg.INPUT.CROP.TYPE, cfg.INPUT.CROP.SIZE) 59 | logging.getLogger(__name__).info("CropGen used in training: " + str(self.crop_gen)) 60 | else: 61 | self.crop_gen = None 62 | 63 | self.tfm_gens = build_transform_gen(cfg, is_train) 64 | 65 | # fmt: off 66 | self.img_format = cfg.INPUT.FORMAT 67 | self.mask_on = cfg.MODEL.MASK_ON 68 | self.mask_format = cfg.INPUT.MASK_FORMAT 69 | self.keypoint_on = cfg.MODEL.KEYPOINT_ON 70 | self.load_proposals = cfg.MODEL.LOAD_PROPOSALS 71 | # fmt: on 72 | if self.keypoint_on and is_train: 73 | # Flip only makes sense in training 74 | self.keypoint_hflip_indices = utils.create_keypoint_hflip_indices(cfg.DATASETS.TRAIN) 75 | else: 76 | self.keypoint_hflip_indices = None 77 | 78 | if self.load_proposals: 79 | self.min_box_side_len = cfg.MODEL.PROPOSAL_GENERATOR.MIN_SIZE 80 | self.proposal_topk = ( 81 | cfg.DATASETS.PRECOMPUTED_PROPOSAL_TOPK_TRAIN 82 | if is_train 83 | else cfg.DATASETS.PRECOMPUTED_PROPOSAL_TOPK_TEST 84 | ) 85 | self.is_train = is_train 86 | 87 | def __call__(self, dataset_dict): 88 | """ 89 | Args: 90 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 91 | 92 | Returns: 93 | dict: a format that builtin models in detectron2 accept 94 | """ 95 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 96 | # USER: Write your own image loading if it's not from a file 97 | # image = utils.read_image(dataset_dict["file_name"], format=self.img_format) 98 | image = cv2.imread(dataset_dict["file_name"]) 99 | h, w = image.shape[:2] 100 | # utils.check_image_size(dataset_dict, image) 101 | 102 | if "annotations" not in dataset_dict: 103 | image, transforms = T.apply_transform_gens( 104 | ([self.crop_gen] if self.crop_gen else []) + self.tfm_gens, image 105 | ) 106 | else: 107 | # Crop around an instance if there are instances in the image. 108 | # USER: Remove if you don't use cropping 109 | if self.crop_gen: 110 | crop_tfm = utils.gen_crop_transform_with_instance( 111 | self.crop_gen.get_crop_size(image.shape[:2]), 112 | image.shape[:2], 113 | np.random.choice(dataset_dict["annotations"]), 114 | ) 115 | image = crop_tfm.apply_image(image) 116 | image, transforms = T.apply_transform_gens(self.tfm_gens, image) 117 | if self.crop_gen: 118 | transforms = crop_tfm + transforms 119 | 120 | image_shape = image.shape[:2] # h, w 121 | 122 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, 123 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue. 124 | # Therefore it's important to use torch.Tensor. 125 | dataset_dict["image"] = torch.as_tensor(image.transpose(2, 0, 1).astype("float32")) 126 | dataset_dict["im_scale"] = float(image_shape[0])/ float(h) 127 | # Can use uint8 if it turns out to be slow some day 128 | 129 | # USER: Remove if you don't use pre-computed proposals. 130 | if self.load_proposals: 131 | utils.transform_proposals( 132 | dataset_dict, image_shape, transforms, self.min_box_side_len, self.proposal_topk 133 | ) 134 | 135 | if not self.is_train: 136 | dataset_dict.pop("annotations", None) 137 | dataset_dict.pop("sem_seg_file_name", None) 138 | return dataset_dict 139 | 140 | if "annotations" in dataset_dict: 141 | # USER: Modify this if you want to keep them for some reason. 
142 | for anno in dataset_dict["annotations"]: 143 | if not self.mask_on: 144 | anno.pop("segmentation", None) 145 | if not self.keypoint_on: 146 | anno.pop("keypoints", None) 147 | 148 | # USER: Implement additional transformations if you have other types of data 149 | annos = [ 150 | utils.transform_instance_annotations( 151 | obj, transforms, image_shape 152 | ) 153 | for obj in dataset_dict.pop("annotations") 154 | if obj.get("iscrowd", 0) == 0 155 | ] 156 | instances = annotations_to_instances( 157 | annos, image_shape, mask_format=self.mask_format 158 | ) 159 | # Create a tight bounding box from masks, useful when image is cropped 160 | if self.crop_gen and instances.has("gt_masks"): 161 | instances.gt_boxes = instances.gt_masks.get_bounding_boxes() 162 | dataset_dict["instances"] = utils.filter_empty_instances(instances) 163 | 164 | return dataset_dict 165 | -------------------------------------------------------------------------------- /bua/caffe/dataloader/detection_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 3 | 4 | """ 5 | Common data processing utilities that are used in a 6 | typical object detection data pipeline. 7 | """ 8 | import torch 9 | 10 | from detectron2.structures import ( 11 | Boxes, 12 | BoxMode, 13 | Instances, 14 | ) 15 | 16 | def transform_instance_annotations( 17 | annotation, transforms, image_size, *, keypoint_hflip_indices=None 18 | ): 19 | """ 20 | Apply transforms to box, segmentation and keypoints annotations of a single instance. 21 | 22 | It will use `transforms.apply_box` for the box, and 23 | `transforms.apply_coords` for segmentation polygons & keypoints. 24 | If you need anything more specially designed for each data structure, 25 | you'll need to implement your own version of this function or the transforms. 26 | 27 | Args: 28 | annotation (dict): dict of instance annotations for a single instance. 29 | It will be modified in-place. 30 | transforms (TransformList): 31 | image_size (tuple): the height, width of the transformed image 32 | keypoint_hflip_indices (ndarray[int]): see `create_keypoint_hflip_indices`. 33 | 34 | Returns: 35 | dict: 36 | the same input dict with fields "bbox", "segmentation", "keypoints" 37 | transformed according to `transforms`. 38 | The "bbox_mode" field will be set to XYXY_ABS. 39 | """ 40 | bbox = BoxMode.convert(annotation["bbox"], annotation["bbox_mode"], BoxMode.XYXY_ABS) 41 | # Note that bbox is 1d (per-instance bounding box) 42 | annotation["bbox"] = transforms.apply_box([bbox])[0] 43 | annotation["bbox_mode"] = BoxMode.XYXY_ABS 44 | 45 | if "attributes" in annotation: 46 | annotation["attributes"] = annotation["attributes"] 47 | 48 | return annotation 49 | 50 | def annotations_to_instances(annos, image_size, mask_format="polygon"): 51 | """ 52 | Create an :class:`Instances` object used by the models, 53 | from instance annotations in the dataset dict. 54 | 55 | Args: 56 | annos (list[dict]): a list of instance annotations in one image, each 57 | element for one instance. 58 | image_size (tuple): height, width 59 | 60 | Returns: 61 | Instances: 62 | It will contain fields "gt_boxes", "gt_classes", 63 | "gt_masks", "gt_keypoints", if they can be obtained from `annos`. 64 | This is the format that builtin models expect. 
65 | """ 66 | boxes = [BoxMode.convert(obj["bbox"], obj["bbox_mode"], BoxMode.XYXY_ABS) for obj in annos] 67 | target = Instances(image_size) 68 | boxes = target.gt_boxes = Boxes(boxes) 69 | boxes.clip(image_size) 70 | 71 | classes = [obj["category_id"] for obj in annos] 72 | classes = torch.tensor(classes, dtype=torch.int64) 73 | target.gt_classes = classes 74 | 75 | # attributes = [obj["attributes"] for obj in annos] 76 | attributes = [] 77 | for obj in annos: 78 | if "attributes" in obj.keys(): 79 | attributes.append(obj["attributes"]) 80 | else: 81 | attributes.append([-1]*16) 82 | attributes = torch.tensor(attributes, dtype=torch.int64) 83 | target.gt_attributes = attributes 84 | 85 | return target -------------------------------------------------------------------------------- /bua/caffe/dataloader/transform_gen.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import PIL.Image as Image 3 | import numpy as np 4 | from fvcore.transforms.transform import Transform 5 | from detectron2.data.transforms import TransformGen 6 | 7 | 8 | class ResizeTransform(Transform): 9 | """ 10 | Resize the image to a target size. 11 | """ 12 | 13 | def __init__(self, h, w, im_scale, pixel_mean): 14 | """ 15 | Args: 16 | h, w (int): original image size 17 | im_scale: im_scale of new_h/h or new_w/w 18 | """ 19 | # TODO decide on PIL vs opencv 20 | super().__init__() 21 | self._set_attributes(locals()) 22 | 23 | def apply_image(self, img): 24 | assert img.shape[:2] == (self.h, self.w) 25 | img_norm = img.astype(np.float32, copy=True) - np.asarray(self.pixel_mean) 26 | im = cv2.resize( 27 | img_norm, 28 | None, 29 | None, 30 | fx=self.im_scale, 31 | fy=self.im_scale, 32 | interpolation=cv2.INTER_LINEAR 33 | ) 34 | ret = np.asarray(im) 35 | return ret 36 | 37 | def apply_coords(self, coords): 38 | coords[:, 0] = coords[:, 0] * (self.im_scale) 39 | coords[:, 1] = coords[:, 1] * (self.im_scale) 40 | return coords 41 | 42 | def apply_segmentation(self, segmentation): 43 | segmentation = self.apply_image(segmentation, interp=Image.NEAREST) 44 | return segmentation 45 | 46 | 47 | class ResizeShortestEdge(TransformGen): 48 | """ 49 | Scale the shorter edge to the given size, with a limit of `max_size` on the longer edge. 50 | If `max_size` is reached, then downscale so that the longer edge does not exceed max_size. 51 | """ 52 | 53 | def __init__( 54 | self, min_size, max_size, pixel_mean): 55 | """ 56 | Args: 57 | min_size (int): minimum allowed smallest edge length. 58 | max_size (int): maximum allowed longest edge length. 
59 | """ 60 | super().__init__() 61 | self.min_size = min_size 62 | self.max_size = max_size 63 | self.pixel_mean = pixel_mean 64 | 65 | self._init(locals()) 66 | 67 | def get_transform(self, img): 68 | h, w = img.shape[:2] 69 | 70 | im_shape = img.shape 71 | im_size_min = np.min(im_shape[0:2]) 72 | im_size_max = np.max(im_shape[0:2]) 73 | 74 | im_scale = float(self.min_size if not type(self.min_size) is tuple else self.min_size[0]) / float(im_size_min) 75 | 76 | # Prevent the biggest axis from being more than max_size 77 | if np.round(im_scale * im_size_max) > self.max_size: 78 | im_scale = float(self.max_size) / float(im_size_max) 79 | 80 | return ResizeTransform(h, w, im_scale, self.pixel_mean) 81 | -------------------------------------------------------------------------------- /bua/caffe/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | from .backbone import build_bua_resnet_backbone 2 | from .rcnn import GeneralizedBUARCNN 3 | from .roi_heads import BUACaffeRes5ROIHeads 4 | from .rpn import StandardBUARPNHead, BUARPN 5 | -------------------------------------------------------------------------------- /bua/caffe/modeling/backbone.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | import fvcore.nn.weight_init as weight_init 3 | from torch import nn 4 | import torch.nn.functional as F 5 | 6 | from detectron2.layers import Conv2d, FrozenBatchNorm2d, get_norm, BatchNorm2d 7 | from detectron2.modeling import BACKBONE_REGISTRY, ResNet, make_stage 8 | from detectron2.modeling.backbone.resnet import BottleneckBlock, DeformBottleneckBlock, ResNetBlockBase 9 | 10 | from .layers.wrappers import Conv2dv2 11 | 12 | __all__ = ["BUABasicStem", "BUABasicStemv2", "build_bua_resnet_backbone"] 13 | 14 | class BUABasicStem(nn.Module): 15 | def __init__(self, in_channels=3, out_channels=64, norm="BN"): 16 | """ 17 | Args: 18 | norm (str or callable): a callable that takes the number of 19 | channels and return a `nn.Module`, or a pre-defined string 20 | (one of {"FrozenBN", "BN", "GN"}). 21 | """ 22 | super().__init__() 23 | self.conv1 = Conv2d( 24 | in_channels, 25 | out_channels, 26 | kernel_size=7, 27 | stride=2, 28 | padding=3, 29 | bias=False, 30 | norm=get_norm(norm, out_channels), 31 | ) 32 | weight_init.c2_msra_fill(self.conv1) 33 | 34 | def forward(self, x): 35 | x = self.conv1(x) 36 | x = F.relu_(x) 37 | x = F.max_pool2d(x, kernel_size=3, stride=2, padding=0, ceil_mode=True) 38 | return x 39 | 40 | @property 41 | def out_channels(self): 42 | return self.conv1.out_channels 43 | 44 | @property 45 | def stride(self): 46 | return 4 # = stride 2 conv -> stride 2 max pool 47 | 48 | class BUABasicStemv2(nn.Module): 49 | def __init__(self, in_channels=3, out_channels=64, norm="BN"): 50 | """ 51 | Args: 52 | norm (str or callable): a callable that takes the number of 53 | channels and return a `nn.Module`, or a pre-defined string 54 | (one of {"FrozenBN", "BN", "GN"}). 
55 | """ 56 | super().__init__() 57 | self.norm = BatchNorm2d(in_channels, eps=2e-5) 58 | self.conv1 = Conv2d( 59 | in_channels, 60 | out_channels, 61 | kernel_size=7, 62 | stride=2, 63 | padding=3, 64 | bias=False, 65 | norm=BatchNorm2d(out_channels, eps=2e-5), 66 | ) 67 | # weight_init.c2_msra_fill(self.norm) 68 | weight_init.c2_msra_fill(self.conv1) 69 | 70 | def forward(self, x): 71 | x = self.norm(x) 72 | x = self.conv1(x) 73 | x = F.relu_(x) 74 | x = F.max_pool2d(x, kernel_size=3, stride=2, padding=0, ceil_mode=True) 75 | return x 76 | 77 | @property 78 | def out_channels(self): 79 | return self.conv1.out_channels 80 | 81 | @property 82 | def stride(self): 83 | return 4 # = stride 2 conv -> stride 2 max pool 84 | 85 | @BACKBONE_REGISTRY.register() 86 | def build_bua_resnet_backbone(cfg, input_shape): 87 | """ 88 | Create a ResNet instance from config. 89 | 90 | Returns: 91 | ResNet: a :class:`ResNet` instance. 92 | """ 93 | # need registration of new blocks/stems? 94 | norm = cfg.MODEL.RESNETS.NORM 95 | if cfg.MODEL.BUA.RESNET_VERSION == 2: 96 | stem = BUABasicStemv2( 97 | in_channels=input_shape.channels, 98 | out_channels=cfg.MODEL.RESNETS.STEM_OUT_CHANNELS, 99 | ) 100 | else: 101 | stem = BUABasicStem( 102 | in_channels=input_shape.channels, 103 | out_channels=cfg.MODEL.RESNETS.STEM_OUT_CHANNELS, 104 | norm=norm, 105 | ) 106 | freeze_at = cfg.MODEL.BACKBONE.FREEZE_AT 107 | 108 | if freeze_at >= 1: 109 | for p in stem.parameters(): 110 | p.requires_grad = False 111 | stem = FrozenBatchNorm2d.convert_frozen_batchnorm(stem) 112 | 113 | # fmt: off 114 | out_features = cfg.MODEL.RESNETS.OUT_FEATURES 115 | depth = cfg.MODEL.RESNETS.DEPTH 116 | num_groups = cfg.MODEL.RESNETS.NUM_GROUPS 117 | width_per_group = cfg.MODEL.RESNETS.WIDTH_PER_GROUP 118 | bottleneck_channels = num_groups * width_per_group 119 | in_channels = cfg.MODEL.RESNETS.STEM_OUT_CHANNELS 120 | out_channels = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS 121 | stride_in_1x1 = cfg.MODEL.RESNETS.STRIDE_IN_1X1 122 | res5_dilation = cfg.MODEL.RESNETS.RES5_DILATION 123 | deform_on_per_stage = cfg.MODEL.RESNETS.DEFORM_ON_PER_STAGE 124 | deform_modulated = cfg.MODEL.RESNETS.DEFORM_MODULATED 125 | deform_num_groups = cfg.MODEL.RESNETS.DEFORM_NUM_GROUPS 126 | # fmt: on 127 | assert res5_dilation in {1, 2}, "res5_dilation cannot be {}.".format(res5_dilation) 128 | 129 | num_blocks_per_stage = {50: [3, 4, 6, 3], 101: [3, 4, 23, 3], 152: [3, 8, 36, 3]}[depth] 130 | 131 | stages = [] 132 | 133 | # Avoid creating variables without gradients 134 | # It consumes extra memory and may cause allreduce to fail 135 | out_stage_idx = [{"res2": 2, "res3": 3, "res4": 4, "res5": 5}[f] for f in out_features] 136 | max_stage_idx = max(out_stage_idx) 137 | for idx, stage_idx in enumerate(range(2, max_stage_idx + 1)): 138 | dilation = res5_dilation if stage_idx == 5 else 1 139 | first_stride = 1 if idx == 0 or (stage_idx == 5 and dilation == 2) else 2 140 | stage_kargs = { 141 | "num_blocks": num_blocks_per_stage[idx], 142 | "first_stride": first_stride, 143 | "in_channels": in_channels, 144 | "bottleneck_channels": bottleneck_channels, 145 | "out_channels": out_channels, 146 | "num_groups": num_groups, 147 | "norm": norm, 148 | "stride_in_1x1": stride_in_1x1, 149 | "dilation": dilation, 150 | } 151 | if deform_on_per_stage[idx]: 152 | stage_kargs["block_class"] = DeformBottleneckBlock 153 | stage_kargs["deform_modulated"] = deform_modulated 154 | stage_kargs["deform_num_groups"] = deform_num_groups 155 | else: 156 | stage_kargs["block_class"] = BottleneckBlock 
if cfg.MODEL.BUA.RESNET_VERSION == 1 else BottleneckBlockv2 157 | blocks = make_stage(**stage_kargs) 158 | in_channels = out_channels 159 | out_channels *= 2 160 | bottleneck_channels *= 2 161 | 162 | if freeze_at >= stage_idx: 163 | for block in blocks: 164 | block.freeze() 165 | stages.append(blocks) 166 | return ResNet(stem, stages, out_features=out_features) 167 | 168 | class BottleneckBlockv2(ResNetBlockBase): 169 | def __init__( 170 | self, 171 | in_channels, 172 | out_channels, 173 | *, 174 | bottleneck_channels, 175 | stride=1, 176 | num_groups=1, 177 | norm="BN", 178 | stride_in_1x1=False, 179 | dilation=1, 180 | ): 181 | """ 182 | Args: 183 | norm (str or callable): a callable that takes the number of 184 | channels and return a `nn.Module`, or a pre-defined string 185 | (one of {"FrozenBN", "BN", "GN"}). 186 | stride_in_1x1 (bool): when stride==2, whether to put stride in the 187 | first 1x1 convolution or the bottleneck 3x3 convolution. 188 | """ 189 | super().__init__(in_channels, out_channels, stride) 190 | 191 | if in_channels != out_channels: 192 | self.shortcut = Conv2dv2( 193 | in_channels, 194 | out_channels, 195 | kernel_size=1, 196 | stride=stride, 197 | bias=False, 198 | norm=None, 199 | ) 200 | else: 201 | self.shortcut = None 202 | 203 | # The original MSRA ResNet models have stride in the first 1x1 conv 204 | # The subsequent fb.torch.resnet and Caffe2 ResNe[X]t implementations have 205 | # stride in the 3x3 conv 206 | stride_1x1, stride_3x3 = (stride, 1) if stride_in_1x1 else (1, stride) 207 | 208 | self.conv1 = Conv2dv2( 209 | in_channels, 210 | bottleneck_channels, 211 | kernel_size=1, 212 | stride=stride_1x1, 213 | bias=False, 214 | norm=None, 215 | ) 216 | 217 | self.conv2 = Conv2dv2( 218 | bottleneck_channels, 219 | bottleneck_channels, 220 | kernel_size=3, 221 | stride=stride_3x3, 222 | padding=1 * dilation, 223 | bias=False, 224 | groups=num_groups, 225 | dilation=dilation, 226 | norm=BatchNorm2d(bottleneck_channels, eps=2e-5), 227 | activation=F.relu_, 228 | ) 229 | 230 | self.conv3 = Conv2dv2( 231 | bottleneck_channels, 232 | out_channels, 233 | kernel_size=1, 234 | bias=False, 235 | norm=BatchNorm2d(bottleneck_channels, eps=2e-5), 236 | activation=F.relu_, 237 | ) 238 | 239 | for layer in [self.conv1, self.conv2, self.conv3, self.shortcut]: 240 | if layer is not None: # shortcut can be None 241 | weight_init.c2_msra_fill(layer) 242 | 243 | self.norm = BatchNorm2d(in_channels, eps=2e-5) 244 | 245 | # Zero-initialize the last normalization in each residual branch, 246 | # so that at the beginning, the residual branch starts with zeros, 247 | # and each residual block behaves like an identity. 248 | # See Sec 5.1 in "Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour": 249 | # "For BN layers, the learnable scaling coefficient γ is initialized 250 | # to be 1, except for each residual block's last BN 251 | # where γ is initialized to be 0." 252 | 253 | # nn.init.constant_(self.conv3.norm.weight, 0) 254 | # TODO this somehow hurts performance when training GN models from scratch. 255 | # Add it as an option when we need to use this code to train a backbone. 
256 | 257 | def forward(self, x): 258 | x_2 = self.norm(x) 259 | x_2 = F.relu_(x_2) 260 | 261 | out = self.conv1(x_2) 262 | # out = F.relu_(out) 263 | 264 | out = self.conv2(out) 265 | # out = F.relu_(out) 266 | 267 | out = self.conv3(out) 268 | 269 | if self.shortcut is not None: 270 | shortcut = self.shortcut(x_2) 271 | else: 272 | shortcut = x 273 | 274 | out += shortcut 275 | # out = F.relu_(out) 276 | return out -------------------------------------------------------------------------------- /bua/caffe/modeling/box_regression.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | import math 3 | import torch 4 | from detectron2.structures import Boxes 5 | from typing import List, Tuple, Union 6 | 7 | # Value for clamping large dw and dh predictions. The heuristic is that we clamp 8 | # such that dw and dh are no larger than what would transform a 16px box into a 9 | # 1000px box (based on a small anchor, 16px, and a typical image size, 1000px). 10 | _DEFAULT_SCALE_CLAMP = math.log(1000.0 / 16) 11 | 12 | 13 | __all__ = ["BUABoxes", "BUABox2BoxTransform"] 14 | 15 | class BUABoxes(Boxes): 16 | """ 17 | This structure stores a list of boxes as a Nx4 torch.Tensor. 18 | It supports some common methods about boxes 19 | (`area`, `clip`, `nonempty`, etc), 20 | and also behaves like a Tensor 21 | (support indexing, `to(device)`, `.device`, and iteration over all boxes) 22 | 23 | Attributes: 24 | tensor: float matrix of Nx4. 25 | """ 26 | 27 | BoxSizeType = Union[List[int], Tuple[int, int]] 28 | def __init__(self, tensor: torch.Tensor): 29 | super().__init__(tensor) 30 | 31 | def clip(self, box_size: BoxSizeType) -> None: 32 | """ 33 | NOTE: In order to be the same as bottom-up-attention network, we have 34 | defined the new clip function. 35 | 36 | Clip (in place) the boxes by limiting x coordinates to the range [0, width] 37 | and y coordinates to the range [0, height]. 38 | 39 | Args: 40 | box_size (height, width): The clipping box's size. 41 | """ 42 | assert torch.isfinite(self.tensor).all(), "Box tensor contains infinite or NaN!" 43 | TO_REMOVE = 1 44 | h, w = box_size 45 | self.tensor[:, 0].clamp_(min=0, max=w - TO_REMOVE) 46 | self.tensor[:, 1].clamp_(min=0, max=h - TO_REMOVE) 47 | self.tensor[:, 2].clamp_(min=0, max=w - TO_REMOVE) 48 | self.tensor[:, 3].clamp_(min=0, max=h - TO_REMOVE) 49 | 50 | def nonempty(self, threshold: int = 0) -> torch.Tensor: 51 | """ 52 | NOTE: In order to be the same as bottom-up-attention network, we have 53 | defined the new nonempty function. 54 | 55 | Find boxes that are non-empty. 56 | A box is considered empty, if either of its side is no larger than threshold. 57 | 58 | Returns: 59 | Tensor: 60 | a binary vector which represents whether each box is empty 61 | (False) or non-empty (True). 62 | """ 63 | TO_REMOVE = 1 64 | box = self.tensor 65 | widths = box[:, 2] - box[:, 0] + TO_REMOVE 66 | heights = box[:, 3] - box[:, 1] + TO_REMOVE 67 | keep = (widths > threshold) & (heights > threshold) 68 | return keep 69 | 70 | def filter_boxes(self): 71 | box = self.tensor 72 | keep = (box[:, 3] > box[:, 1]) & (box[:, 2] > box[:, 0]) 73 | return keep 74 | 75 | def __getitem__(self, item: Union[int, slice, torch.BoolTensor]) -> "Boxes": 76 | """ 77 | Returns: 78 | BUABoxes: Create a new :class:`BUABoxes` by indexing. 79 | 80 | The following usage are allowed: 81 | 1. `new_boxes = boxes[3]`: return a `Boxes` which contains only one box. 82 | 2. 
`new_boxes = boxes[2:10]`: return a slice of boxes. 83 | 3. `new_boxes = boxes[vector]`, where vector is a torch.BoolTensor 84 | with `length = len(boxes)`. Nonzero elements in the vector will be selected. 85 | 86 | Note that the returned Boxes might share storage with this Boxes, 87 | subject to Pytorch's indexing semantics. 88 | """ 89 | if isinstance(item, int): 90 | return BUABoxes(self.tensor[item].view(1, -1)) 91 | b = self.tensor[item] 92 | assert b.dim() == 2, "Indexing on Boxes with {} failed to return a matrix!".format(item) 93 | return BUABoxes(b) 94 | 95 | class BUABox2BoxTransform(object): 96 | """ 97 | The box-to-box transform defined in R-CNN. The transformation is parameterized 98 | by 4 deltas: (dx, dy, dw, dh). The transformation scales the box's width and height 99 | by exp(dw), exp(dh) and shifts a box's center by the offset (dx * width, dy * height). 100 | """ 101 | 102 | def __init__(self, weights, scale_clamp=_DEFAULT_SCALE_CLAMP): 103 | """ 104 | Args: 105 | weights (4-element tuple): Scaling factors that are applied to the 106 | (dx, dy, dw, dh) deltas. In Fast R-CNN, these were originally set 107 | such that the deltas have unit variance; now they are treated as 108 | hyperparameters of the system. 109 | scale_clamp (float): When predicting deltas, the predicted box scaling 110 | factors (dw and dh) are clamped such that they are <= scale_clamp. 111 | """ 112 | self.weights = weights 113 | self.scale_clamp = scale_clamp 114 | 115 | def get_deltas(self, src_boxes, target_boxes): 116 | """ 117 | Get box regression transformation deltas (dx, dy, dw, dh) that can be used 118 | to transform the `src_boxes` into the `target_boxes`. That is, the relation 119 | ``target_boxes == self.apply_deltas(deltas, src_boxes)`` is true (unless 120 | any delta is too large and is clamped). 121 | 122 | Args: 123 | src_boxes (Tensor): source boxes, e.g., object proposals 124 | target_boxes (Tensor): target of the transformation, e.g., ground-truth 125 | boxes. 126 | """ 127 | assert isinstance(src_boxes, torch.Tensor), type(src_boxes) 128 | assert isinstance(target_boxes, torch.Tensor), type(target_boxes) 129 | 130 | TO_REMOVE = 1 # TODO remove 131 | src_widths = src_boxes[:, 2] - src_boxes[:, 0] + TO_REMOVE 132 | src_heights = src_boxes[:, 3] - src_boxes[:, 1] + TO_REMOVE 133 | src_ctr_x = src_boxes[:, 0] + 0.5 * src_widths 134 | src_ctr_y = src_boxes[:, 1] + 0.5 * src_heights 135 | 136 | target_widths = target_boxes[:, 2] - target_boxes[:, 0] + TO_REMOVE 137 | target_heights = target_boxes[:, 3] - target_boxes[:, 1] + TO_REMOVE 138 | target_ctr_x = target_boxes[:, 0] + 0.5 * target_widths 139 | target_ctr_y = target_boxes[:, 1] + 0.5 * target_heights 140 | 141 | wx, wy, ww, wh = self.weights 142 | dx = wx * (target_ctr_x - src_ctr_x) / src_widths 143 | dy = wy * (target_ctr_y - src_ctr_y) / src_heights 144 | dw = ww * torch.log(target_widths / src_widths) 145 | dh = wh * torch.log(target_heights / src_heights) 146 | 147 | deltas = torch.stack((dx, dy, dw, dh), dim=1) 148 | assert (src_widths > 0).all().item(), "Input boxes to Box2BoxTransform are not valid!" 149 | return deltas 150 | 151 | def apply_deltas(self, deltas, boxes): 152 | """ 153 | Apply transformation `deltas` (dx, dy, dw, dh) to `boxes`. 154 | 155 | Args: 156 | deltas (Tensor): transformation deltas of shape (N, k*4), where k >= 1. 157 | deltas[i] represents k potentially different class-specific 158 | box transformations for the single box boxes[i]. 
159 | boxes (Tensor): boxes to transform, of shape (N, 4) 160 | """ 161 | assert torch.isfinite(deltas).all().item(), "Box regression deltas become infinite or NaN!" 162 | boxes = boxes.to(deltas.dtype) 163 | 164 | TO_REMOVE = 1 # TODO remove 165 | widths = boxes[:, 2] - boxes[:, 0] + TO_REMOVE 166 | heights = boxes[:, 3] - boxes[:, 1] + TO_REMOVE 167 | ctr_x = boxes[:, 0] + 0.5 * widths 168 | ctr_y = boxes[:, 1] + 0.5 * heights 169 | 170 | wx, wy, ww, wh = self.weights 171 | dx = deltas[:, 0::4] / wx 172 | dy = deltas[:, 1::4] / wy 173 | dw = deltas[:, 2::4] / ww 174 | dh = deltas[:, 3::4] / wh 175 | 176 | # Prevent sending too large values into torch.exp() 177 | dw = torch.clamp(dw, max=self.scale_clamp) 178 | dh = torch.clamp(dh, max=self.scale_clamp) 179 | 180 | pred_ctr_x = dx * widths[:, None] + ctr_x[:, None] 181 | pred_ctr_y = dy * heights[:, None] + ctr_y[:, None] 182 | pred_w = torch.exp(dw) * widths[:, None] 183 | pred_h = torch.exp(dh) * heights[:, None] 184 | 185 | pred_boxes = torch.zeros_like(deltas) 186 | pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w # x1 187 | pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h # y1 188 | pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w # x2 189 | pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h # y2 190 | return pred_boxes -------------------------------------------------------------------------------- /bua/caffe/modeling/layers/csrc/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | from .nms import SwapAlign2Nat, swap_align2nat 3 | 4 | __all__ = [k for k in globals().keys() if not k.startswith("_")] -------------------------------------------------------------------------------- /bua/caffe/modeling/layers/csrc/nms/nms.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | 11 | int const threadsPerBlock = sizeof(unsigned long long) * 8; 12 | 13 | __device__ inline float devIoU(float const * const a, float const * const b) { 14 | float left = max(a[0], b[0]), right = min(a[2], b[2]); 15 | float top = max(a[1], b[1]), bottom = min(a[3], b[3]); 16 | float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f); 17 | float interS = width * height; 18 | float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1); 19 | float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1); 20 | return interS / (Sa + Sb - interS); 21 | } 22 | 23 | __global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh, 24 | const float *dev_boxes, unsigned long long *dev_mask) { 25 | const int row_start = blockIdx.y; 26 | const int col_start = blockIdx.x; 27 | 28 | // if (row_start > col_start) return; 29 | 30 | const int row_size = 31 | min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); 32 | const int col_size = 33 | min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); 34 | 35 | __shared__ float block_boxes[threadsPerBlock * 5]; 36 | if (threadIdx.x < col_size) { 37 | block_boxes[threadIdx.x * 5 + 0] = 38 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0]; 39 | block_boxes[threadIdx.x * 5 + 1] = 40 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1]; 41 | block_boxes[threadIdx.x * 5 + 2] = 42 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2]; 43 | block_boxes[threadIdx.x * 5 + 3] = 44 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3]; 45 | block_boxes[threadIdx.x * 5 + 4] = 46 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4]; 47 | } 48 | __syncthreads(); 49 | 50 | if (threadIdx.x < row_size) { 51 | const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; 52 | const float *cur_box = dev_boxes + cur_box_idx * 5; 53 | int i = 0; 54 | unsigned long long t = 0; 55 | int start = 0; 56 | if (row_start == col_start) { 57 | start = threadIdx.x + 1; 58 | } 59 | for (i = start; i < col_size; i++) { 60 | if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) { 61 | t |= 1ULL << i; 62 | } 63 | } 64 | const int col_blocks = THCCeilDiv(n_boxes, threadsPerBlock); 65 | dev_mask[cur_box_idx * col_blocks + col_start] = t; 66 | } 67 | } 68 | 69 | // boxes is a N x 5 tensor 70 | at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) { 71 | using scalar_t = float; 72 | AT_ASSERTM(boxes.type().is_cuda(), "boxes must be a CUDA tensor"); 73 | auto scores = boxes.select(1, 4); 74 | auto order_t = std::get<1>(scores.sort(0, /* descending=*/true)); 75 | auto boxes_sorted = boxes.index_select(0, order_t); 76 | 77 | int boxes_num = boxes.size(0); 78 | 79 | const int col_blocks = THCCeilDiv(boxes_num, threadsPerBlock); 80 | 81 | scalar_t* boxes_dev = boxes_sorted.data(); 82 | 83 | THCState *state = at::globalContext().lazyInitCUDA(); // TODO replace with getTHCState 84 | 85 | unsigned long long* mask_dev = NULL; 86 | //THCudaCheck(THCudaMalloc(state, (void**) &mask_dev, 87 | // boxes_num * col_blocks * sizeof(unsigned long long))); 88 | 89 | mask_dev = (unsigned long long*) THCudaMalloc(state, boxes_num * col_blocks * sizeof(unsigned long long)); 90 | 91 | dim3 blocks(THCCeilDiv(boxes_num, threadsPerBlock), 92 | THCCeilDiv(boxes_num, threadsPerBlock)); 93 | dim3 threads(threadsPerBlock); 94 | nms_kernel<<>>(boxes_num, 95 | nms_overlap_thresh, 96 | boxes_dev, 97 | mask_dev); 98 | 99 | std::vector 
mask_host(boxes_num * col_blocks); 100 | THCudaCheck(cudaMemcpy(&mask_host[0], 101 | mask_dev, 102 | sizeof(unsigned long long) * boxes_num * col_blocks, 103 | cudaMemcpyDeviceToHost)); 104 | 105 | std::vector remv(col_blocks); 106 | memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks); 107 | 108 | at::Tensor keep = at::empty({boxes_num}, boxes.options().dtype(at::kLong).device(at::kCPU)); 109 | int64_t* keep_out = keep.data(); 110 | 111 | int num_to_keep = 0; 112 | for (int i = 0; i < boxes_num; i++) { 113 | int nblock = i / threadsPerBlock; 114 | int inblock = i % threadsPerBlock; 115 | 116 | if (!(remv[nblock] & (1ULL << inblock))) { 117 | keep_out[num_to_keep++] = i; 118 | unsigned long long *p = &mask_host[0] + i * col_blocks; 119 | for (int j = nblock; j < col_blocks; j++) { 120 | remv[j] |= p[j]; 121 | } 122 | } 123 | } 124 | 125 | THCudaFree(state, mask_dev); 126 | // TODO improve this part 127 | return std::get<0>(order_t.index({ 128 | keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep).to( 129 | order_t.device(), keep.scalar_type()) 130 | }).sort(0, false)); 131 | } 132 | -------------------------------------------------------------------------------- /bua/caffe/modeling/layers/csrc/nms/nms.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | #pragma once 3 | #include "vision_cpu.h" 4 | 5 | #ifdef WITH_CUDA 6 | #include "vision_cuda.h" 7 | #endif 8 | 9 | 10 | at::Tensor nms(const at::Tensor& dets, 11 | const at::Tensor& scores, 12 | const float threshold) { 13 | 14 | if (dets.type().is_cuda()) { 15 | #ifdef WITH_CUDA 16 | // TODO raise error if not compiled with CUDA 17 | if (dets.numel() == 0) 18 | return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU)); 19 | auto b = at::cat({dets, scores.unsqueeze(1)}, 1); 20 | return nms_cuda(b, threshold); 21 | #else 22 | AT_ERROR("Not compiled with GPU support"); 23 | #endif 24 | } 25 | 26 | at::Tensor result = nms_cpu(dets, scores, threshold); 27 | return result; 28 | } 29 | -------------------------------------------------------------------------------- /bua/caffe/modeling/layers/csrc/nms/nms_cpu.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
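// Overview: a straightforward O(N^2) greedy NMS on the CPU. Boxes are visited in
// descending score order; each box that has not itself been suppressed suppresses every
// lower-scored box whose IoU with it is >= threshold, and the indices of the surviving
// boxes are returned as an int64 tensor.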
2 | #include "vision_cpu.h" 3 | 4 | 5 | template 6 | at::Tensor nms_cpu_kernel(const at::Tensor& dets, 7 | const at::Tensor& scores, 8 | const float threshold) { 9 | AT_ASSERTM(!dets.type().is_cuda(), "dets must be a CPU tensor"); 10 | AT_ASSERTM(!scores.type().is_cuda(), "scores must be a CPU tensor"); 11 | AT_ASSERTM(dets.type() == scores.type(), "dets should have the same type as scores"); 12 | 13 | if (dets.numel() == 0) { 14 | return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU)); 15 | } 16 | 17 | auto x1_t = dets.select(1, 0).contiguous(); 18 | auto y1_t = dets.select(1, 1).contiguous(); 19 | auto x2_t = dets.select(1, 2).contiguous(); 20 | auto y2_t = dets.select(1, 3).contiguous(); 21 | 22 | at::Tensor areas_t = (x2_t - x1_t + 1) * (y2_t - y1_t + 1); 23 | 24 | auto order_t = std::get<1>(scores.sort(0, /* descending=*/true)); 25 | 26 | auto ndets = dets.size(0); 27 | at::Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte).device(at::kCPU)); 28 | 29 | auto suppressed = suppressed_t.data(); 30 | auto order = order_t.data(); 31 | auto x1 = x1_t.data(); 32 | auto y1 = y1_t.data(); 33 | auto x2 = x2_t.data(); 34 | auto y2 = y2_t.data(); 35 | auto areas = areas_t.data(); 36 | 37 | for (int64_t _i = 0; _i < ndets; _i++) { 38 | auto i = order[_i]; 39 | if (suppressed[i] == 1) 40 | continue; 41 | auto ix1 = x1[i]; 42 | auto iy1 = y1[i]; 43 | auto ix2 = x2[i]; 44 | auto iy2 = y2[i]; 45 | auto iarea = areas[i]; 46 | 47 | for (int64_t _j = _i + 1; _j < ndets; _j++) { 48 | auto j = order[_j]; 49 | if (suppressed[j] == 1) 50 | continue; 51 | auto xx1 = std::max(ix1, x1[j]); 52 | auto yy1 = std::max(iy1, y1[j]); 53 | auto xx2 = std::min(ix2, x2[j]); 54 | auto yy2 = std::min(iy2, y2[j]); 55 | 56 | auto w = std::max(static_cast(0), xx2 - xx1 + 1); 57 | auto h = std::max(static_cast(0), yy2 - yy1 + 1); 58 | auto inter = w * h; 59 | auto ovr = inter / (iarea + areas[j] - inter); 60 | if (ovr >= threshold) 61 | suppressed[j] = 1; 62 | } 63 | } 64 | return at::nonzero(suppressed_t == 0).squeeze(1); 65 | } 66 | 67 | at::Tensor nms_cpu(const at::Tensor& dets, 68 | const at::Tensor& scores, 69 | const float threshold) { 70 | at::Tensor result; 71 | AT_DISPATCH_FLOATING_TYPES(dets.type(), "nms", [&] { 72 | result = nms_cpu_kernel(dets, scores, threshold); 73 | }); 74 | return result; 75 | } 76 | -------------------------------------------------------------------------------- /bua/caffe/modeling/layers/csrc/nms/vision_cpu.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | #pragma once 3 | #include 4 | 5 | 6 | at::Tensor ROIAlign_forward_cpu(const at::Tensor& input, 7 | const at::Tensor& rois, 8 | const float spatial_scale, 9 | const int pooled_height, 10 | const int pooled_width, 11 | const int sampling_ratio); 12 | 13 | 14 | at::Tensor nms_cpu(const at::Tensor& dets, 15 | const at::Tensor& scores, 16 | const float threshold); 17 | -------------------------------------------------------------------------------- /bua/caffe/modeling/layers/csrc/nms/vision_cuda.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
2 | #pragma once 3 | #include 4 | 5 | at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh); 6 | 7 | 8 | at::Tensor compute_flow_cuda(const at::Tensor& boxes, 9 | const int height, 10 | const int width); 11 | -------------------------------------------------------------------------------- /bua/caffe/modeling/layers/csrc/vision.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | 3 | #include 4 | #include "nms/nms.h" 5 | 6 | namespace bottom_up_attention { 7 | 8 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 9 | m.def("nms", &nms, "non-maximum suppression"); 10 | } 11 | 12 | } // namespace bottom_up_attention 13 | -------------------------------------------------------------------------------- /bua/caffe/modeling/layers/nms.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | # from ._utils import _C 3 | from bua.caffe.modeling import _C 4 | 5 | from apex import amp 6 | import torch 7 | 8 | # Only valid with fp32 inputs - give AMP the hint 9 | nms = amp.float_function(_C.nms) 10 | 11 | # nms.__doc__ = """ 12 | # This function performs Non-maximum suppresion""" 13 | 14 | # NOTE: In order to be consistent with bottom-up-attention, we nms core function from maskrcnn-benchmark 15 | 16 | def batched_nms(boxes, scores, idxs, iou_threshold): 17 | """ 18 | Same as torchvision.ops.boxes.batched_nms, but safer. 19 | """ 20 | assert boxes.shape[-1] == 4 21 | boxes = boxes.cpu() 22 | scores = scores.cpu() 23 | # TODO may need better strategy. 24 | # Investigate after having a fully-cuda NMS op. 25 | if len(boxes) < 40000: 26 | return box_ops_batched_nms(boxes, scores, idxs, iou_threshold) 27 | 28 | result_mask = scores.new_zeros(scores.size(), dtype=torch.bool) 29 | for id in torch.unique(idxs).cpu().tolist(): 30 | # if id == 0: 31 | # continue 32 | mask = (idxs == id).nonzero().view(-1) 33 | keep = nms(boxes[mask], scores[mask], iou_threshold) 34 | result_mask[mask[keep]] = True 35 | keep = result_mask.nonzero().view(-1) 36 | keep = keep[scores[keep].argsort(descending=True)] 37 | return keep 38 | 39 | def box_ops_batched_nms(boxes, scores, idxs, iou_threshold): 40 | """ 41 | Performs non-maximum suppression in a batched fashion. 42 | 43 | Each index value correspond to a category, and NMS 44 | will not be applied between elements of different categories. 45 | 46 | Parameters 47 | ---------- 48 | boxes : Tensor[N, 4] 49 | boxes where NMS will be performed. They 50 | are expected to be in (x1, y1, x2, y2) format 51 | scores : Tensor[N] 52 | scores for each one of the boxes 53 | idxs : Tensor[N] 54 | indices of the categories for each one of the boxes. 55 | iou_threshold : float 56 | discards all overlapping boxes 57 | with IoU < iou_threshold 58 | 59 | Returns 60 | ------- 61 | keep : Tensor 62 | int64 tensor with the indices of 63 | the elements that have been kept by NMS, sorted 64 | in decreasing order of scores 65 | """ 66 | if boxes.numel() == 0: 67 | return torch.empty((0,), dtype=torch.int64, device=boxes.device) 68 | # strategy: in order to perform NMS independently per class. 69 | # we add an offset to all the boxes. 
The offset is dependent 70 | # only on the class idx, and is large enough so that boxes 71 | # from different classes do not overlap 72 | max_coordinate = boxes.max() 73 | offsets = idxs.to(boxes) * (max_coordinate + 1) 74 | boxes_for_nms = boxes + offsets[:, None] 75 | keep = nms(boxes_for_nms, scores, iou_threshold) 76 | return keep -------------------------------------------------------------------------------- /bua/caffe/modeling/layers/wrappers.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch.nn.modules.utils import _ntuple 4 | 5 | class Conv2dv2(torch.nn.Conv2d): 6 | """ 7 | A wrapper around :class:`torch.nn.Conv2d` to support more features. 8 | """ 9 | 10 | def __init__(self, *args, **kwargs): 11 | """ 12 | Extra keyword arguments supported in addition to those in `torch.nn.Conv2d`: 13 | 14 | Args: 15 | norm (nn.Module, optional): a normalization layer 16 | activation (callable(Tensor) -> Tensor): a callable activation function 17 | 18 | It assumes that norm layer is used before activation. 19 | """ 20 | norm = kwargs.pop("norm", None) 21 | activation = kwargs.pop("activation", None) 22 | super().__init__(*args, **kwargs) 23 | 24 | self.norm = norm 25 | self.activation = activation 26 | 27 | def forward(self, x): 28 | if x.numel() == 0 and self.training: 29 | # https://github.com/pytorch/pytorch/issues/12013 30 | assert not isinstance( 31 | self.norm, torch.nn.SyncBatchNorm 32 | ), "SyncBatchNorm does not support empty inputs!" 33 | if self.norm is not None: 34 | x = self.norm(x) 35 | if self.activation is not None: 36 | x = self.activation(x) 37 | x = super().forward(x) 38 | return x -------------------------------------------------------------------------------- /bua/caffe/modeling/rcnn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | import logging, os 3 | import torch 4 | from torch import nn 5 | import torch.nn.functional as F 6 | 7 | from detectron2.structures import ImageList 8 | from detectron2.utils.logger import log_first_n 9 | 10 | from detectron2.modeling.backbone import build_backbone 11 | from detectron2.modeling.postprocessing import detector_postprocess 12 | from detectron2.modeling.proposal_generator import build_proposal_generator 13 | from detectron2.modeling.roi_heads import build_roi_heads 14 | from detectron2.modeling.meta_arch import META_ARCH_REGISTRY 15 | 16 | # from models.bua_caffe.postprocessing import extractor_postprocess 17 | #from utils import save_features 18 | 19 | __all__ = ["GeneralizedBUARCNN"] 20 | 21 | 22 | @META_ARCH_REGISTRY.register() 23 | class GeneralizedBUARCNN(nn.Module): 24 | """ 25 | Generalized R-CNN. Any models that contains the following three components: 26 | 1. Per-image feature extraction (aka backbone) 27 | 2. Region proposal generation 28 | 3. 
Per-region feature extraction and prediction 29 | """ 30 | 31 | def __init__(self, cfg): 32 | super().__init__() 33 | 34 | self.device = torch.device(cfg.MODEL.DEVICE) 35 | self.bua_caffe = cfg.MODEL.BUA.CAFFE 36 | self.resnet_version = cfg.MODEL.BUA.RESNET_VERSION 37 | self.backbone = build_backbone(cfg) 38 | self.in_features = cfg.MODEL.RPN.IN_FEATURES 39 | self.proposal_generator = build_proposal_generator(cfg, self.backbone.output_shape()) 40 | self.roi_heads = build_roi_heads(cfg, self.backbone.output_shape()) 41 | 42 | assert len(cfg.MODEL.PIXEL_MEAN) == len(cfg.MODEL.PIXEL_STD) 43 | self.extract_on = cfg.MODEL.BUA.EXTRACT_FEATS 44 | self.extractor = cfg.MODEL.BUA.EXTRACTOR 45 | self.to(self.device) 46 | 47 | def forward(self, batched_inputs): 48 | """ 49 | Args: 50 | batched_inputs: a list, batched outputs of :class:`DatasetMapper` . 51 | Each item in the list contains the inputs for one image. 52 | For now, each item in the list is a dict that contains: 53 | 54 | * image: Tensor, image in (C, H, W) format. 55 | * instances (optional): groundtruth :class:`Instances` 56 | * proposals (optional): :class:`Instances`, precomputed proposals. 57 | 58 | Other information that's included in the original dicts, such as: 59 | 60 | * "height", "width" (int): the output resolution of the model, used in inference. 61 | See :meth:`postprocess` for details. 62 | 63 | Returns: 64 | list[dict]: 65 | Each dict is the output for one input image. 66 | The dict contains one key "instances" whose value is a :class:`Instances`. 67 | The :class:`Instances` object has the following keys: 68 | "pred_boxes", "pred_classes", "scores", "pred_masks", "pred_keypoints" 69 | """ 70 | if not self.training: 71 | return self.inference(batched_inputs) 72 | 73 | images = self.preprocess_image(batched_inputs) 74 | if "instances" in batched_inputs[0]: 75 | gt_instances = [x["instances"].to(self.device) for x in batched_inputs] 76 | elif "targets" in batched_inputs[0]: 77 | log_first_n( 78 | logging.WARN, "'targets' in the model inputs is now renamed to 'instances'!", n=10 79 | ) 80 | gt_instances = [x["targets"].to(self.device) for x in batched_inputs] 81 | else: 82 | gt_instances = None 83 | 84 | features = self.backbone(images.tensor) 85 | 86 | if self.resnet_version == 2: 87 | for f in features: 88 | out = self.roi_heads.res5[0].norm(features[f]) 89 | features[f] = F.relu_(out) 90 | 91 | if self.proposal_generator: 92 | proposals, proposal_losses = self.proposal_generator(images, features, gt_instances) 93 | else: 94 | assert "proposals" in batched_inputs[0] 95 | proposals = [x["proposals"].to(self.device) for x in batched_inputs] 96 | proposal_losses = {} 97 | 98 | _, detector_losses = self.roi_heads(images, features, proposals, gt_instances) 99 | 100 | losses = {} 101 | losses.update(detector_losses) 102 | losses.update(proposal_losses) 103 | return losses 104 | 105 | def inference(self, batched_inputs, detected_instances=None, do_postprocess=True): 106 | """ 107 | Run inference on the given inputs. 108 | 109 | Args: 110 | batched_inputs (list[dict]): same as in :meth:`forward` 111 | detected_instances (None or list[Instances]): if not None, it 112 | contains an `Instances` object per image. The `Instances` 113 | object contains "pred_boxes" and "pred_classes" which are 114 | known boxes in the image. 115 | The inference will then skip the detection of bounding boxes, 116 | and only predict other per-ROI outputs. 117 | do_postprocess (bool): whether to apply post-processing on the outputs. 
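For reference, a minimal sketch of assembling `batched_inputs` by hand (illustrative only; the provided extract_features.py and train_net.py scripts build these dicts through the dataset mappers, which also take care of resizing and any pixel normalization):

    import torch

    # `model`: a GeneralizedBUARCNN built elsewhere, e.g. via detectron2's build_model(cfg)
    model.eval()                                   # eval mode makes forward() route to inference()
    h, w = 480, 640                                # original image size (illustrative)
    im_scale = 600.0 / min(h, w)                   # the provided configs use MIN_SIZE_TEST: 600
    image = torch.rand(3, int(h * im_scale), int(w * im_scale)) * 255.0  # stand-in for a real image
    inputs = [{"image": image, "im_scale": im_scale, "height": h, "width": w}]
    with torch.no_grad():
        outputs = model(inputs)                    # with EXTRACT_FEATS off: [{"instances": Instances}]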
118 | 119 | Returns: 120 | same as in :meth:`forward`. 121 | """ 122 | assert not self.training 123 | 124 | images = self.preprocess_image(batched_inputs) 125 | features = self.backbone(images.tensor) 126 | 127 | if self.resnet_version == 2: 128 | for f in features: 129 | out = self.roi_heads.res5[0].norm(features[f]) 130 | features[f] = F.relu_(out) 131 | 132 | if detected_instances is None: 133 | if self.proposal_generator: 134 | proposals, _ = self.proposal_generator(images, features, None) 135 | else: 136 | assert "proposals" in batched_inputs[0] 137 | proposals = [x["proposals"].to(self.device) for x in batched_inputs] 138 | 139 | if self.extract_on: 140 | return self.roi_heads(images, features, proposals, None) 141 | else: 142 | results, _ = self.roi_heads(images, features, proposals, None) 143 | else: 144 | detected_instances = [x.to(self.device) for x in detected_instances] 145 | results = self.roi_heads.forward_with_given_boxes(features, detected_instances) 146 | 147 | if do_postprocess: 148 | processed_results = [] 149 | for results_per_image, input_per_image, image_size in zip( 150 | results, batched_inputs, images.image_sizes 151 | ): 152 | height = input_per_image.get("height", image_size[0]) 153 | width = input_per_image.get("width", image_size[1]) 154 | if not self.bua_caffe: 155 | results_per_image = detector_postprocess(results_per_image, height, width) 156 | processed_results.append({"instances": results_per_image}) 157 | return processed_results 158 | else: 159 | return results 160 | 161 | def preprocess_image(self, batched_inputs): 162 | """ 163 | Normalize, pad and batch the input images. 164 | """ 165 | images = [x["image"].to(self.device) for x in batched_inputs] 166 | image_scales = [x["im_scale"] for x in batched_inputs] 167 | images = ImageList.from_tensors(images, self.backbone.size_divisibility) 168 | images.image_scales = image_scales 169 | return images 170 | -------------------------------------------------------------------------------- /bua/caffe/modeling/rpn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | from typing import Dict, List 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | 8 | from detectron2.modeling import RPN_HEAD_REGISTRY 9 | from detectron2.layers import ShapeSpec 10 | 11 | from detectron2.modeling.proposal_generator import build_rpn_head 12 | from detectron2.modeling.proposal_generator.build import PROPOSAL_GENERATOR_REGISTRY 13 | from detectron2.modeling.anchor_generator import build_anchor_generator 14 | from .box_regression import BUABox2BoxTransform 15 | from detectron2.modeling.matcher import Matcher 16 | from .rpn_outputs import BUARPNOutputs, find_top_bua_rpn_proposals 17 | 18 | import copy 19 | 20 | @RPN_HEAD_REGISTRY.register() 21 | class StandardBUARPNHead(nn.Module): 22 | """ 23 | RPN classification and regression heads. Uses a 3x3 conv to produce a shared 24 | hidden state from which one 1x1 conv predicts objectness logits for each anchor 25 | and a second 1x1 conv predicts bounding-box deltas specifying how to deform 26 | each anchor into an object proposal. 
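A shape sketch for a single feature level (numbers follow the caffe R-101 configs: the input is the 1024-channel res4 map, CONV_OUT_CHANNELS is 512, and four anchor sizes with the three default aspect ratios give 12 anchors per location; the doubled objectness channels appear to follow the original caffe RPN's background/foreground pair rather than detectron2's single logit per anchor):

    #   input feature map       : (N, 1024, H, W)
    #   conv + ReLU             : (N,  512, H, W)
    #   objectness_logits (1x1) : (N, 12 * 2, H, W)   two logits (bg/fg) per anchor
    #   anchor_deltas (1x1)     : (N, 12 * 4, H, W)   one (dx, dy, dw, dh) per anchor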
27 | """ 28 | 29 | def __init__(self, cfg, input_shape: List[ShapeSpec]): 30 | super().__init__() 31 | 32 | # Standard RPN is shared across levels: 33 | out_channels = cfg.MODEL.BUA.RPN.CONV_OUT_CHANNELS 34 | 35 | in_channels = [s.channels for s in input_shape] 36 | assert len(set(in_channels)) == 1, "Each level must have the same channel!" 37 | in_channels = in_channels[0] 38 | 39 | # RPNHead should take the same input as anchor generator 40 | # NOTE: it assumes that creating an anchor generator does not have unwanted side effect. 41 | anchor_generator = build_anchor_generator(cfg, input_shape) 42 | num_cell_anchors = anchor_generator.num_cell_anchors 43 | box_dim = anchor_generator.box_dim 44 | assert ( 45 | len(set(num_cell_anchors)) == 1 46 | ), "Each level must have the same number of cell anchors" 47 | num_cell_anchors = num_cell_anchors[0] 48 | 49 | # 3x3 conv for the hidden representation 50 | self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) 51 | # 1x1 conv for predicting objectness logits 52 | self.objectness_logits = nn.Conv2d(out_channels, num_cell_anchors * 2, kernel_size=1, stride=1) 53 | # 1x1 conv for predicting box2box transform deltas 54 | self.anchor_deltas = nn.Conv2d( 55 | out_channels, num_cell_anchors * box_dim, kernel_size=1, stride=1 56 | ) 57 | 58 | for l in [self.conv, self.objectness_logits, self.anchor_deltas]: 59 | nn.init.normal_(l.weight, std=0.01) 60 | nn.init.constant_(l.bias, 0) 61 | 62 | def forward(self, features): 63 | """ 64 | Args: 65 | features (list[Tensor]): list of feature maps 66 | """ 67 | pred_objectness_logits = [] 68 | pred_anchor_deltas = [] 69 | for x in features: 70 | t = F.relu(self.conv(x)) 71 | pred_objectness_logits.append(self.objectness_logits(t)) 72 | pred_anchor_deltas.append(self.anchor_deltas(t)) 73 | return pred_objectness_logits, pred_anchor_deltas 74 | 75 | @PROPOSAL_GENERATOR_REGISTRY.register() 76 | class BUARPN(nn.Module): 77 | """ 78 | Region Proposal Network, introduced by the Faster R-CNN paper. 
79 | """ 80 | 81 | def __init__(self, cfg, input_shape: Dict[str, ShapeSpec]): 82 | super().__init__() 83 | 84 | # fmt: off 85 | self.min_box_side_len = cfg.MODEL.PROPOSAL_GENERATOR.MIN_SIZE 86 | self.in_features = cfg.MODEL.RPN.IN_FEATURES 87 | self.nms_thresh = cfg.MODEL.RPN.NMS_THRESH 88 | self.batch_size_per_image = cfg.MODEL.RPN.BATCH_SIZE_PER_IMAGE 89 | self.positive_fraction = cfg.MODEL.RPN.POSITIVE_FRACTION 90 | self.smooth_l1_beta = cfg.MODEL.RPN.SMOOTH_L1_BETA 91 | self.loss_weight = cfg.MODEL.RPN.LOSS_WEIGHT 92 | # fmt: on 93 | 94 | # Map from self.training state to train/test settings 95 | self.pre_nms_topk = { 96 | True: cfg.MODEL.RPN.PRE_NMS_TOPK_TRAIN, 97 | False: cfg.MODEL.RPN.PRE_NMS_TOPK_TEST, 98 | } 99 | self.post_nms_topk = { 100 | True: cfg.MODEL.RPN.POST_NMS_TOPK_TRAIN, 101 | False: cfg.MODEL.RPN.POST_NMS_TOPK_TEST, 102 | } 103 | self.boundary_threshold = cfg.MODEL.RPN.BOUNDARY_THRESH 104 | 105 | self.anchor_generator = build_anchor_generator( 106 | cfg, [input_shape[f] for f in self.in_features] 107 | ) 108 | self.box2box_transform = BUABox2BoxTransform(weights=cfg.MODEL.RPN.BBOX_REG_WEIGHTS) 109 | self.anchor_matcher = Matcher( 110 | cfg.MODEL.RPN.IOU_THRESHOLDS, cfg.MODEL.RPN.IOU_LABELS, allow_low_quality_matches=True 111 | ) 112 | self.rpn_head = build_rpn_head(cfg, [input_shape[f] for f in self.in_features]) 113 | 114 | def forward(self, images, features, gt_instances=None): 115 | """ 116 | Args: 117 | images (ImageList): input images of length `N` 118 | features (dict[str: Tensor]): input data as a mapping from feature 119 | map name to tensor. Axis 0 represents the number of images `N` in 120 | the input data; axes 1-3 are channels, height, and width, which may 121 | vary between feature maps (e.g., if a feature pyramid is used). 122 | gt_instances (list[Instances], optional): a length `N` list of `Instances`s. 123 | Each `Instances` stores ground-truth instances for the corresponding image. 124 | 125 | Returns: 126 | proposals: list[Instances] or None 127 | loss: dict[Tensor] 128 | """ 129 | gt_boxes = [x.gt_boxes for x in gt_instances] if gt_instances is not None else None 130 | del gt_instances 131 | features = [features[f] for f in self.in_features] 132 | pred_objectness_logits, pred_anchor_deltas = self.rpn_head(features) 133 | anchors_in_image = self.anchor_generator(features) 134 | anchors = [copy.deepcopy(anchors_in_image) for _ in range(len(features[0]))] 135 | # TODO: The anchors only depend on the feature map shape; there's probably 136 | # an opportunity for some optimizations (e.g., caching anchors). 137 | outputs = BUARPNOutputs( 138 | self.box2box_transform, 139 | self.anchor_matcher, 140 | self.batch_size_per_image, 141 | self.positive_fraction, 142 | images, 143 | pred_objectness_logits, 144 | pred_anchor_deltas, 145 | anchors, 146 | self.boundary_threshold, 147 | gt_boxes, 148 | self.smooth_l1_beta, 149 | ) 150 | 151 | if self.training: 152 | losses = {k: v * self.loss_weight for k, v in outputs.losses().items()} 153 | else: 154 | losses = {} 155 | 156 | with torch.no_grad(): 157 | # Find the top proposals by applying NMS and removing boxes that 158 | # are too small. The proposals are treated as fixed for approximate 159 | # joint training with roi heads. This approach ignores the derivative 160 | # w.r.t. the proposal boxes’ coordinates that are also network 161 | # responses, so is approximate. 
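            # find_top_bua_rpn_proposals (rpn_outputs.py) broadly mirrors detectron2's
            # find_top_rpn_proposals: decode the predicted deltas into boxes, keep the
            # pre_nms_topk highest-scoring candidates per feature level, run NMS at
            # nms_thresh, drop boxes with a side shorter than min_box_side_len, and
            # return at most post_nms_topk proposals per image as Instances.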
162 | proposals = find_top_bua_rpn_proposals( 163 | outputs.predict_proposals(), 164 | outputs.predict_objectness_logits(), 165 | images, 166 | self.nms_thresh, 167 | self.pre_nms_topk[self.training], 168 | self.post_nms_topk[self.training], 169 | self.min_box_side_len, 170 | self.training, 171 | ) 172 | # For RPN-only models, the proposals are the final output and we return them in 173 | # high-to-low confidence order. 174 | # For end-to-end models, the RPN proposals are an intermediate state 175 | # and this sorting is actually not needed. But the cost is negligible. 176 | # inds = [p.objectness_logits.sort(descending=True)[1] for p in proposals] 177 | # proposals = [p[ind] for p, ind in zip(proposals, inds)] 178 | 179 | return proposals, losses -------------------------------------------------------------------------------- /bua/caffe/postprocessing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | 3 | import numpy as np 4 | import torch 5 | 6 | from detectron2.structures import Instances 7 | from modeling.layers.nms import nms # BC-compat 8 | 9 | def extractor_postprocess(boxes, scores, features_pooled, input_per_image, extractor): 10 | """ 11 | Resize the output instances. 12 | The input images are often resized when entering an object detector. 13 | As a result, we often need the outputs of the detector in a different 14 | resolution from its inputs. 15 | 16 | This function will resize the raw outputs of an R-CNN detector 17 | to produce outputs according to the desired output resolution. 18 | 19 | Args: 20 | results (Instances): the raw outputs from the detector. 21 | `results.image_size` contains the input image resolution the detector sees. 22 | This object might be modified in-place. 23 | output_height, output_width: the desired output resolution. 24 | 25 | Returns: 26 | Instances: the resized output from the model, based on the output resolution 27 | """ 28 | MIN_BOXES = extractor.MIN_BOXES 29 | MAX_BOXES = extractor.MAX_BOXES 30 | CONF_THRESH = extractor.CONF_THRESH 31 | 32 | cur_device = scores.device 33 | 34 | dets = boxes / input_per_image["im_scale"] 35 | 36 | max_conf = torch.zeros((scores.shape[0])).to(cur_device) 37 | 38 | for cls_ind in range(1, scores.shape[1]): 39 | cls_scores = scores[:, cls_ind] 40 | keep = nms(dets, cls_scores, 0.3) 41 | max_conf[keep] = torch.where(cls_scores[keep] > max_conf[keep], 42 | cls_scores[keep], 43 | max_conf[keep]) 44 | 45 | keep_boxes = torch.nonzero(max_conf >= CONF_THRESH).flatten() 46 | if len(keep_boxes) < MIN_BOXES: 47 | keep_boxes = torch.argsort(max_conf, descending=True)[:MIN_BOXES] 48 | elif len(keep_boxes) > MAX_BOXES: 49 | keep_boxes = torch.argsort(max_conf, descending=True)[:MAX_BOXES] 50 | # keep_boxes = torch.argsort(max_conf, descending=True)[:100] 51 | # feat_list.append(feats[i][keep_boxes]) 52 | image_feat = features_pooled[keep_boxes] 53 | image_bboxes = dets[keep_boxes] 54 | 55 | return image_feat, image_bboxes -------------------------------------------------------------------------------- /bua/d2/__init__.py: -------------------------------------------------------------------------------- 1 | from .dataloader.build_loader import ( 2 | build_detection_train_loader_with_attributes, 3 | build_detection_test_loader_with_attributes, 4 | ) 5 | from .modeling.roi_heads import AttributeRes5ROIHeads 6 | from .. 
import visual_genome 7 | from .config import add_attribute_config -------------------------------------------------------------------------------- /bua/d2/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 3 | 4 | from detectron2.config import CfgNode as CN 5 | 6 | """ 7 | config for mode detectron2 8 | """ 9 | 10 | def add_attribute_config(cfg): 11 | """ 12 | Add config for attribute prediction. 13 | """ 14 | # Whether to have attribute prediction 15 | cfg.MODEL.ATTRIBUTE_ON = False 16 | # Maximum number of attributes per foreground instance 17 | cfg.INPUT.MAX_ATTR_PER_INS = 16 18 | # ------------------------------------------------------------------------ # 19 | # Attribute Head 20 | # ----------------------------------------------------------------------- # 21 | cfg.MODEL.ROI_ATTRIBUTE_HEAD = CN() 22 | # Dimension for object class embedding, used in conjunction with 23 | # visual features to predict attributes 24 | cfg.MODEL.ROI_ATTRIBUTE_HEAD.OBJ_EMBED_DIM = 256 25 | # Dimension of the hidden fc layer of the input visual features 26 | cfg.MODEL.ROI_ATTRIBUTE_HEAD.FC_DIM = 512 27 | # Loss weight for attribute prediction, 0.2 is best per analysis 28 | cfg.MODEL.ROI_ATTRIBUTE_HEAD.LOSS_WEIGHT = 0.2 29 | # Number of classes for attributes 30 | cfg.MODEL.ROI_ATTRIBUTE_HEAD.NUM_CLASSES = 400 31 | 32 | """ 33 | Add config for box regression loss adjustment. 34 | """ 35 | # Loss weights for RPN box regression 36 | cfg.MODEL.RPN.BBOX_LOSS_WEIGHT = 1.0 37 | # Loss weights for R-CNN box regression 38 | cfg.MODEL.ROI_BOX_HEAD.BBOX_LOSS_WEIGHT = 1.0 39 | 40 | cfg.MODEL.EXTRACT_FEATS = False 41 | cfg.MODEL.EXTRACT_MODE = 1 42 | 43 | _C = cfg 44 | _C.MODEL.BUA = CN() 45 | _C.MODEL.BUA.EXTRACT_FEATS = False 46 | _C.MODEL.BUA.EXTRACTOR = CN() 47 | _C.MODEL.BUA.ATTRIBUTE_ON = False 48 | # _C.MODEL.BUA.EXTRACT_FEATS = False 49 | 50 | # EXTRACTOR.MODE {1: extract roi features, 2: extract bbox only ,3: extract roi features by gt_bbox} 51 | _C.MODEL.BUA.EXTRACTOR.MODE = 1 52 | 53 | # config of postprocessing in extractor 54 | _C.MODEL.BUA.EXTRACTOR.MIN_BOXES = 10 55 | _C.MODEL.BUA.EXTRACTOR.MAX_BOXES = 100 56 | _C.MODEL.BUA.EXTRACTOR.CONF_THRESH = 0.2 57 | _C.MODEL.BUA.EXTRACTOR.OUTPUT_DIR = ".output/" -------------------------------------------------------------------------------- /bua/d2/dataloader/__init__.py: -------------------------------------------------------------------------------- 1 | from .build_loader import ( 2 | build_detection_train_loader_with_attributes, 3 | build_detection_test_loader_with_attributes, 4 | ) 5 | from ... import visual_genome -------------------------------------------------------------------------------- /bua/d2/dataloader/build_loader.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 2 | import logging 3 | import operator 4 | import torch.utils.data 5 | 6 | from detectron2.utils.comm import get_world_size 7 | from detectron2.data import samplers 8 | from detectron2.data.build import get_detection_dataset_dicts, worker_init_reset_seed, trivial_batch_collator 9 | from detectron2.data.common import AspectRatioGroupedDataset, DatasetFromList, MapDataset 10 | 11 | from .dataset_mapper import AttributeDatasetMapper 12 | 13 | """ 14 | data_loader for mode detectron2 15 | """ 16 | 17 | def build_detection_train_loader_with_attributes(cfg, mapper=None): 18 | num_workers = get_world_size() 19 | images_per_batch = cfg.SOLVER.IMS_PER_BATCH 20 | assert ( 21 | images_per_batch % num_workers == 0 22 | ), "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number of workers ({}).".format( 23 | images_per_batch, num_workers 24 | ) 25 | assert ( 26 | images_per_batch >= num_workers 27 | ), "SOLVER.IMS_PER_BATCH ({}) must be larger than the number of workers ({}).".format( 28 | images_per_batch, num_workers 29 | ) 30 | images_per_worker = images_per_batch // num_workers 31 | # NOTE above is added 32 | 33 | dataset_dicts = get_detection_dataset_dicts( 34 | cfg.DATASETS.TRAIN, 35 | filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS, 36 | min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE 37 | if cfg.MODEL.KEYPOINT_ON 38 | else 0, 39 | proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN if cfg.MODEL.LOAD_PROPOSALS else None, 40 | ) 41 | dataset = DatasetFromList(dataset_dicts, copy=False) 42 | 43 | if mapper is None: 44 | mapper = AttributeDatasetMapper(cfg, True) 45 | dataset = MapDataset(dataset, mapper) 46 | 47 | sampler_name = cfg.DATALOADER.SAMPLER_TRAIN 48 | logger = logging.getLogger(__name__) 49 | logger.info("Using training sampler {}".format(sampler_name)) 50 | if sampler_name == "TrainingSampler": 51 | sampler = samplers.TrainingSampler(len(dataset)) 52 | elif sampler_name == "RepeatFactorTrainingSampler": 53 | sampler = samplers.RepeatFactorTrainingSampler( 54 | dataset_dicts, cfg.DATALOADER.REPEAT_THRESHOLD 55 | ) 56 | else: 57 | raise ValueError("Unknown training sampler: {}".format(sampler_name)) 58 | 59 | if cfg.DATALOADER.ASPECT_RATIO_GROUPING: 60 | data_loader = torch.utils.data.DataLoader( 61 | dataset, 62 | sampler=sampler, 63 | num_workers=cfg.DATALOADER.NUM_WORKERS, 64 | batch_sampler=None, 65 | collate_fn=operator.itemgetter(0), 66 | worker_init_fn=worker_init_reset_seed, 67 | ) 68 | data_loader = AspectRatioGroupedDataset(data_loader, images_per_worker) 69 | else: 70 | batch_sampler = torch.utils.data.sampler.BatchSampler( 71 | sampler, images_per_worker, drop_last=True 72 | ) 73 | data_loader = torch.utils.data.DataLoader( 74 | dataset, 75 | num_workers=cfg.DATALOADER.NUM_WORKERS, 76 | batch_sampler=batch_sampler, 77 | collate_fn=trivial_batch_collator, 78 | worker_init_fn=worker_init_reset_seed, 79 | ) 80 | 81 | return data_loader 82 | 83 | 84 | def build_detection_test_loader_with_attributes(cfg, dataset_name, mapper=None): 85 | dataset_dicts = get_detection_dataset_dicts( 86 | [dataset_name], 87 | filter_empty=False, 88 | proposal_files=[ 89 | cfg.DATASETS.PROPOSAL_FILES_TEST[list(cfg.DATASETS.TEST).index(dataset_name)] 90 | ] 91 | if cfg.MODEL.LOAD_PROPOSALS 92 | else None, 93 | ) 94 | 95 | dataset = DatasetFromList(dataset_dicts) 96 | if mapper is None: 97 | mapper = AttributeDatasetMapper(cfg, False) 98 | dataset = MapDataset(dataset, mapper) 99 | 100 | sampler = samplers.InferenceSampler(len(dataset)) 101 | batch_sampler = 
torch.utils.data.sampler.BatchSampler(sampler, 1, drop_last=False) 102 | 103 | data_loader = torch.utils.data.DataLoader( 104 | dataset, 105 | num_workers=cfg.DATALOADER.NUM_WORKERS, 106 | batch_sampler=batch_sampler, 107 | collate_fn=trivial_batch_collator, 108 | ) 109 | return data_loader -------------------------------------------------------------------------------- /bua/d2/dataloader/dataset_mapper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | import copy 3 | import logging 4 | import numpy as np 5 | import torch 6 | from fvcore.common.file_io import PathManager 7 | from PIL import Image 8 | 9 | from detectron2.data import detection_utils as utils 10 | from detectron2.data import transforms as T 11 | from detectron2.data import DatasetMapper 12 | from detectron2.structures import ( 13 | BitMasks, 14 | Boxes, 15 | BoxMode, 16 | Instances, 17 | Keypoints, 18 | PolygonMasks, 19 | polygons_to_bitmask, 20 | ) 21 | 22 | """ 23 | data mapper for mode detecrton2 24 | """ 25 | 26 | def annotations_to_instances_with_attributes(annos, 27 | image_size, 28 | mask_format="polygon", 29 | load_attributes=False, 30 | max_attr_per_ins=16): 31 | """ 32 | Extend the function annotations_to_instances() to support attributes 33 | """ 34 | boxes = [BoxMode.convert(obj["bbox"], obj["bbox_mode"], BoxMode.XYXY_ABS) for obj in annos] 35 | target = Instances(image_size) 36 | boxes = target.gt_boxes = Boxes(boxes) 37 | boxes.clip(image_size) 38 | 39 | classes = [obj["category_id"] for obj in annos] 40 | classes = torch.tensor(classes, dtype=torch.int64) 41 | target.gt_classes = classes 42 | 43 | if len(annos) and "segmentation" in annos[0]: 44 | segms = [obj["segmentation"] for obj in annos] 45 | if mask_format == "polygon": 46 | masks = PolygonMasks(segms) 47 | else: 48 | assert mask_format == "bitmask", mask_format 49 | masks = [] 50 | for segm in segms: 51 | if isinstance(segm, list): 52 | # polygon 53 | masks.append(polygons_to_bitmask(segm, *image_size)) 54 | elif isinstance(segm, dict): 55 | # COCO RLE 56 | masks.append(mask_util.decode(segm)) 57 | elif isinstance(segm, np.ndarray): 58 | assert segm.ndim == 2, "Expect segmentation of 2 dimensions, got {}.".format( 59 | segm.ndim 60 | ) 61 | # mask array 62 | masks.append(segm) 63 | else: 64 | raise ValueError( 65 | "Cannot convert segmentation of type '{}' to BitMasks!" 66 | "Supported types are: polygons as list[list[float] or ndarray]," 67 | " COCO-style RLE as a dict, or a full-image segmentation mask " 68 | "as a 2D ndarray.".format(type(segm)) 69 | ) 70 | masks = BitMasks( 71 | torch.stack([torch.from_numpy(np.ascontiguousarray(x)) for x in masks]) 72 | ) 73 | target.gt_masks = masks 74 | 75 | if len(annos) and "keypoints" in annos[0]: 76 | kpts = [obj.get("keypoints", []) for obj in annos] 77 | target.gt_keypoints = Keypoints(kpts) 78 | 79 | if len(annos) and load_attributes: 80 | attributes = -torch.ones((len(annos), max_attr_per_ins), dtype=torch.int64) 81 | for idx, anno in enumerate(annos): 82 | if "attribute_ids" in anno: 83 | for jdx, attr_id in enumerate(anno["attribute_ids"]): 84 | attributes[idx, jdx] = attr_id 85 | target.gt_attributes = attributes 86 | 87 | return target 88 | 89 | 90 | class AttributeDatasetMapper(DatasetMapper): 91 | """ 92 | Extend DatasetMapper to support attributes. 
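A small sketch of the attribute packing done by annotations_to_instances_with_attributes() above (values are illustrative; unused attribute slots are padded with -1 up to max_attr_per_ins):

    import torch
    from detectron2.structures import BoxMode
    from bua.d2.dataloader.dataset_mapper import annotations_to_instances_with_attributes

    annos = [{
        "bbox": [10.0, 20.0, 30.0, 40.0], "bbox_mode": BoxMode.XYWH_ABS,
        "category_id": 5, "attribute_ids": [3, 17],   # two attribute ids (illustrative)
    }]
    inst = annotations_to_instances_with_attributes(
        annos, (480, 640), load_attributes=True, max_attr_per_ins=16
    )
    # inst.gt_boxes      -> one box converted to XYXY: [10, 20, 40, 60]
    # inst.gt_classes    -> tensor([5])
    # inst.gt_attributes -> shape (1, 16): [3, 17, -1, -1, ...]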
93 | """ 94 | def __init__(self, cfg, is_train=True): 95 | super().__init__(cfg, is_train) 96 | 97 | # fmt: off 98 | self.attribute_on = cfg.MODEL.BUA.ATTRIBUTE_ON 99 | self.max_attr_per_ins = cfg.INPUT.MAX_ATTR_PER_INS 100 | # fmt: on 101 | # NOTE Added to fit d202 102 | if cfg.INPUT.CROP.ENABLED and is_train: 103 | self.crop_gen = T.RandomCrop(cfg.INPUT.CROP.TYPE, cfg.INPUT.CROP.SIZE) 104 | else: 105 | self.crop_gen = None 106 | 107 | self.tfm_gens = utils.build_transform_gen(cfg, is_train) 108 | self.load_proposals = cfg.MODEL.LOAD_PROPOSALS 109 | self.mask_on = cfg.MODEL.MASK_ON 110 | self.keypoint_on = cfg.MODEL.KEYPOINT_ON 111 | self.mask_format = cfg.INPUT.MASK_FORMAT 112 | # NOTE ok 113 | 114 | def __call__(self, dataset_dict): 115 | dataset_dict = copy.deepcopy(dataset_dict) 116 | # NOTE Added to fit d202 117 | image = utils.read_image(dataset_dict["file_name"], format=self.image_format) # image_format 118 | # image = utils.read_image(dataset_dict["file_name"], format=self.img_format) # image_format 119 | utils.check_image_size(dataset_dict, image) 120 | 121 | if "annotations" not in dataset_dict: 122 | image, transforms = T.apply_transform_gens( 123 | ([self.crop_gen] if self.crop_gen else []) + self.tfm_gens, image 124 | ) 125 | else: 126 | if self.crop_gen: 127 | crop_tfm = utils.gen_crop_transform_with_instance( 128 | self.crop_gen.get_crop_size(image.shape[:2]), 129 | image.shape[:2], 130 | np.random.choice(dataset_dict["annotations"]), 131 | ) 132 | image = crop_tfm.apply_image(image) 133 | image, transforms = T.apply_transform_gens(self.tfm_gens, image) 134 | if self.crop_gen: 135 | transforms = crop_tfm + transforms 136 | 137 | image_shape = image.shape[:2] 138 | dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) 139 | 140 | if self.load_proposals: 141 | utils.transform_proposals( 142 | dataset_dict, image_shape, transforms, self.min_box_side_len, self.proposal_topk 143 | ) 144 | 145 | if not self.is_train: 146 | dataset_dict.pop("annotations", None) 147 | dataset_dict.pop("sem_seg_file_name", None) 148 | return dataset_dict 149 | 150 | if "annotations" in dataset_dict: 151 | for anno in dataset_dict["annotations"]: 152 | if not self.mask_on: 153 | anno.pop("segmentation", None) 154 | if not self.keypoint_on: 155 | anno.pop("keypoints", None) 156 | if not self.attribute_on: 157 | anno.pop("attribute_ids") 158 | 159 | annos = [ 160 | utils.transform_instance_annotations( 161 | obj, transforms, image_shape, keypoint_hflip_indices=self.keypoint_hflip_indices 162 | ) 163 | for obj in dataset_dict.pop("annotations") 164 | if obj.get("iscrowd", 0) == 0 165 | ] 166 | instances = annotations_to_instances_with_attributes( 167 | annos, image_shape, mask_format=self.mask_format, 168 | load_attributes=self.attribute_on, max_attr_per_ins=self.max_attr_per_ins 169 | ) 170 | if self.crop_gen and instances.has("gt_masks"): 171 | instances.gt_boxes = instances.gt_masks.get_bounding_boxes() 172 | dataset_dict["instances"] = utils.filter_empty_instances(instances) 173 | 174 | if "sem_seg_file_name" in dataset_dict: 175 | with PathManager.open(dataset_dict.pop("sem_seg_file_name"), "rb") as f: 176 | sem_seg_gt = Image.open(f) 177 | sem_seg_gt = np.asarray(sem_seg_gt, dtype="uint8") 178 | sem_seg_gt = transforms.apply_segmentation(sem_seg_gt) 179 | sem_seg_gt = torch.as_tensor(sem_seg_gt.astype("long")) 180 | dataset_dict["sem_seg"] = sem_seg_gt 181 | return dataset_dict 182 | -------------------------------------------------------------------------------- 
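Putting the bua/d2 pieces above together, a minimal sketch of building the attribute-aware loaders (assumes the package and detectron2 are installed, the repository root is the working directory, and the Visual Genome data registered in bua/visual_genome.py below is set up under ./datasets; the r50 config choice is illustrative):

    from detectron2.config import get_cfg
    from bua.d2.config import add_attribute_config
    from bua.d2.dataloader import (
        build_detection_train_loader_with_attributes,
        build_detection_test_loader_with_attributes,
    )

    cfg = get_cfg()
    add_attribute_config(cfg)   # adds MODEL.ATTRIBUTE_ON, ROI_ATTRIBUTE_HEAD, MODEL.BUA.*, ...
    cfg.merge_from_file("configs/d2/train-d2-r50.yaml")
    cfg.freeze()

    train_loader = build_detection_train_loader_with_attributes(cfg)   # uses AttributeDatasetMapper
    test_loader = build_detection_test_loader_with_attributes(cfg, "visual_genome_test")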
/bua/visual_genome.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 3 | import contextlib 4 | import io 5 | import logging 6 | import os 7 | from fvcore.common.file_io import PathManager 8 | from fvcore.common.timer import Timer 9 | 10 | from detectron2.data import DatasetCatalog, MetadataCatalog 11 | from detectron2.structures import BoxMode 12 | 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | """ 17 | load json for mode detectron2 18 | """ 19 | 20 | def load_coco_with_attributes_json(json_file, 21 | image_root, 22 | dataset_name=None, 23 | extra_annotation_keys=None): 24 | """ 25 | Extend load_coco_json() with additional support for attributes 26 | """ 27 | from pycocotools.coco import COCO 28 | 29 | timer = Timer() 30 | json_file = PathManager.get_local_path(json_file) 31 | with contextlib.redirect_stdout(io.StringIO()): 32 | coco_api = COCO(json_file) 33 | if timer.seconds() > 1: 34 | logger.info("Loading {} takes {:.2f} seconds.".format(json_file, timer.seconds())) 35 | 36 | id_map = None 37 | if dataset_name is not None: 38 | meta = MetadataCatalog.get(dataset_name) 39 | cat_ids = sorted(coco_api.getCatIds()) 40 | cats = coco_api.loadCats(cat_ids) 41 | thing_classes = [c["name"] for c in sorted(cats, key=lambda x: x["id"])] 42 | meta.thing_classes = thing_classes 43 | if not (min(cat_ids) == 1 and max(cat_ids) == len(cat_ids)): 44 | if "coco" not in dataset_name: 45 | logger.warning( 46 | """ 47 | Category ids in annotations are not in [1, #categories]! We'll apply a mapping for you. 48 | """ 49 | ) 50 | id_map = {v: i for i, v in enumerate(cat_ids)} 51 | meta.thing_dataset_id_to_contiguous_id = id_map 52 | 53 | img_ids = sorted(coco_api.imgs.keys()) 54 | imgs = coco_api.loadImgs(img_ids) 55 | anns = [coco_api.imgToAnns[img_id] for img_id in img_ids] 56 | 57 | if "minival" not in json_file: 58 | ann_ids = [ann["id"] for anns_per_image in anns for ann in anns_per_image] 59 | assert len(set(ann_ids)) == len(ann_ids), "Annotation ids in '{}' are not unique!".format( 60 | json_file 61 | ) 62 | 63 | imgs_anns = list(zip(imgs, anns)) 64 | 65 | logger.info("Loaded {} images in COCO format from {}".format(len(imgs_anns), json_file)) 66 | 67 | dataset_dicts = [] 68 | 69 | ann_keys = ["iscrowd", "bbox", "keypoints", "category_id"] + (extra_annotation_keys or []) 70 | 71 | num_instances_without_valid_segmentation = 0 72 | 73 | for (img_dict, anno_dict_list) in imgs_anns: 74 | record = {} 75 | record["file_name"] = os.path.join(image_root, img_dict["file_name"]) 76 | record["height"] = img_dict["height"] 77 | record["width"] = img_dict["width"] 78 | image_id = record["image_id"] = img_dict["id"] 79 | 80 | objs = [] 81 | for anno in anno_dict_list: 82 | assert anno["image_id"] == image_id 83 | 84 | assert anno.get("ignore", 0) == 0, '"ignore" in COCO json file is not supported.' 
85 | 86 | obj = {key: anno[key] for key in ann_keys if key in anno} 87 | 88 | segm = anno.get("segmentation", None) 89 | if segm: 90 | if not isinstance(segm, dict): 91 | segm = [poly for poly in segm if len(poly) % 2 == 0 and len(poly) >= 6] 92 | if len(segm) == 0: 93 | num_instances_without_valid_segmentation += 1 94 | continue 95 | obj["segmentation"] = segm 96 | 97 | keypts = anno.get("keypoints", None) 98 | if keypts: 99 | for idx, v in enumerate(keypts): 100 | if idx % 3 != 2: 101 | keypts[idx] = v + 0.5 102 | obj["keypoints"] = keypts 103 | 104 | attrs = anno.get("attribute_ids", None) 105 | if attrs: # list[int] 106 | obj["attribute_ids"] = attrs 107 | 108 | attr = anno.get("attribute", None) 109 | if attr: 110 | # NOTE import from bua 111 | # obj["attributes"] = attr # 正常读入 112 | # obj["attribute_ids"] = attr # 正常读入 113 | # print(attr) 114 | max_attributes_per_ins = 16 115 | attributes = [-1 for _ in range(max_attributes_per_ins)] 116 | for idx, a in enumerate(attr): 117 | attributes[idx] = a - 1 # bua train、val的json中attr类别是1-400 118 | obj["attribute_ids"] = attributes 119 | 120 | obj["bbox_mode"] = BoxMode.XYWH_ABS 121 | if id_map: 122 | obj["category_id"] = id_map[obj["category_id"]] 123 | objs.append(obj) 124 | record["annotations"] = objs 125 | dataset_dicts.append(record) 126 | 127 | if num_instances_without_valid_segmentation > 0: 128 | logger.warning( 129 | "Filtered out {} instances without valid segmentation. " 130 | "There might be issues in your dataset generation process.".format( 131 | num_instances_without_valid_segmentation 132 | ) 133 | ) 134 | return dataset_dicts 135 | 136 | def register_coco_instances_with_attributes(name, metadata, json_file, image_root): 137 | DatasetCatalog.register(name, lambda: load_coco_with_attributes_json(json_file, 138 | image_root, 139 | name)) 140 | MetadataCatalog.get(name).set( 141 | json_file=json_file, image_root=image_root, evaluator_type="coco", **metadata 142 | ) 143 | # ==== Predefined splits for visual genome images =========== 144 | _PREDEFINED_SPLITS_VG = { 145 | "visual_genome_train": ("visual_genome/images", 146 | "visual_genome/annotations/visual_genome_train.json"), 147 | "visual_genome_val": ("visual_genome/images", 148 | "visual_genome/annotations/visual_genome_val.json"), 149 | "visual_genome_test": ("visual_genome/images", 150 | "visual_genome/annotations/visual_genome_test.json"), 151 | } 152 | 153 | def register_all_vg(root): 154 | for key, (image_root, json_file) in _PREDEFINED_SPLITS_VG.items(): 155 | register_coco_instances_with_attributes( 156 | key, 157 | {}, # no meta data 158 | os.path.join(root, json_file), 159 | os.path.join(root, image_root), 160 | ) 161 | 162 | # Register them all under "./datasets" 163 | _root = os.getenv("DETECTRON2_DATASETS", "datasets") 164 | register_all_vg(_root) -------------------------------------------------------------------------------- /configs/caffe/test-caffe-r101-fix36.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | WEIGHTS: "bua-caffe-frcn-r101-k36.pth" 3 | META_ARCHITECTURE: "GeneralizedBUARCNN" 4 | PIXEL_MEAN: [102.9801, 115.9465, 122.7717] 5 | ANCHOR_GENERATOR: 6 | SIZES: [[4, 8, 16, 32]] 7 | PROPOSAL_GENERATOR: 8 | NAME: "BUARPN" 9 | MIN_SIZE: 16 10 | BUA: 11 | ATTRIBUTE_ON: True 12 | EXTRACT_FEATS: False # auto True when extract feats 13 | RPN: 14 | CONV_OUT_CHANNELS: 512 15 | ATTRIBUTE: 16 | NUM_CLASSES: 401 17 | RESNETS: 18 | DEPTH: 101 19 | OUT_FEATURES: ["res4"] 20 | NORM: "BN" 21 | RES5_DILATION: 2 22 | 
BACKBONE: 23 | NAME: "build_bua_resnet_backbone" 24 | FREEZE_AT: 3 25 | RPN: 26 | HEAD_NAME: "StandardBUARPNHead" 27 | PRE_NMS_TOPK_TRAIN: 12000 28 | POST_NMS_TOPK_TRAIN: 2000 29 | POST_NMS_TOPK_TEST: 300 30 | PRE_NMS_TOPK_TEST: 6000 31 | BATCH_SIZE_PER_IMAGE: 64 32 | ROI_HEADS: 33 | NAME: "BUACaffeRes5ROIHeads" 34 | BATCH_SIZE_PER_IMAGE: 64 35 | SCORE_THRESH_TEST: -1.0 36 | NMS_THRESH_TEST: 0.3 37 | POSITIVE_FRACTION: 0.5 38 | NUM_CLASSES: 1601 39 | ROI_BOX_HEAD: 40 | POOLER_TYPE: "ROIPool" 41 | BBOX_REG_WEIGHTS: (1.0, 1.0, 1.0, 1.0) 42 | DATASETS: 43 | TRAIN: ("visual_genome_train",) 44 | TEST: ("visual_genome_val",) 45 | TEST: 46 | DETECTIONS_PER_IMAGE: 400 47 | DATALOADER: 48 | NUM_WORKERS: 1 49 | INPUT: 50 | MIN_SIZE_TRAIN: (600, ) 51 | MAX_SIZE_TRAIN: 1000 52 | MIN_SIZE_TEST: 600 53 | MAX_SIZE_TEST: 1000 -------------------------------------------------------------------------------- /configs/caffe/test-caffe-r101.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | WEIGHTS: "bua-caffe-frcn-r101-k10-100.pth" 3 | META_ARCHITECTURE: "GeneralizedBUARCNN" 4 | PIXEL_MEAN: [102.9801, 115.9465, 122.7717] 5 | ANCHOR_GENERATOR: 6 | SIZES: [[4, 8, 16, 32]] 7 | PROPOSAL_GENERATOR: 8 | NAME: "BUARPN" 9 | MIN_SIZE: 16 10 | BUA: 11 | ATTRIBUTE_ON: True 12 | EXTRACT_FEATS: False # auto True when extract feats 13 | RPN: 14 | CONV_OUT_CHANNELS: 512 15 | ATTRIBUTE: 16 | NUM_CLASSES: 401 17 | RESNETS: 18 | DEPTH: 101 19 | OUT_FEATURES: ["res4"] 20 | NORM: "BN" 21 | RES5_DILATION: 2 22 | BACKBONE: 23 | NAME: "build_bua_resnet_backbone" 24 | FREEZE_AT: 3 25 | RPN: 26 | HEAD_NAME: "StandardBUARPNHead" 27 | PRE_NMS_TOPK_TRAIN: 12000 28 | POST_NMS_TOPK_TRAIN: 2000 29 | POST_NMS_TOPK_TEST: 300 30 | PRE_NMS_TOPK_TEST: 6000 31 | BATCH_SIZE_PER_IMAGE: 64 32 | ROI_HEADS: 33 | NAME: "BUACaffeRes5ROIHeads" 34 | BATCH_SIZE_PER_IMAGE: 64 35 | SCORE_THRESH_TEST: -1.0 36 | NMS_THRESH_TEST: 0.3 37 | POSITIVE_FRACTION: 0.5 38 | NUM_CLASSES: 1601 39 | ROI_BOX_HEAD: 40 | POOLER_TYPE: "ROIPool" 41 | BBOX_REG_WEIGHTS: (1.0, 1.0, 1.0, 1.0) 42 | DATASETS: 43 | TRAIN: ("visual_genome_train",) 44 | TEST: ("visual_genome_val",) 45 | TEST: 46 | DETECTIONS_PER_IMAGE: 400 47 | DATALOADER: 48 | NUM_WORKERS: 1 49 | INPUT: 50 | MIN_SIZE_TRAIN: (600, ) 51 | MAX_SIZE_TRAIN: 1000 52 | MIN_SIZE_TEST: 600 53 | MAX_SIZE_TEST: 1000 54 | 55 | -------------------------------------------------------------------------------- /configs/caffe/test-caffe-r152.yaml: -------------------------------------------------------------------------------- 1 | OUTPUT_DIR: "./output_caffe152" 2 | MODEL: 3 | WEIGHTS: "bua-caffe-frcn-r152.pth" 4 | META_ARCHITECTURE: "GeneralizedBUARCNN" 5 | PIXEL_MEAN: [0, 0, 0] 6 | ANCHOR_GENERATOR: 7 | SIZES: [[4, 8, 16, 32]] 8 | PROPOSAL_GENERATOR: 9 | NAME: "BUARPN" 10 | MIN_SIZE: 16 11 | BUA: 12 | ATTRIBUTE_ON: True 13 | EXTRACT_FEATS: False # auto True when extract feats 14 | RESNET_VERSION: 2 15 | RPN: 16 | CONV_OUT_CHANNELS: 512 17 | EXTRACTOR: 18 | MIN_BOXES: 100 19 | MAX_BOXES: 100 20 | ATTRIBUTE: 21 | NUM_CLASSES: 401 22 | RESNETS: 23 | DEPTH: 152 24 | OUT_FEATURES: ["res4"] 25 | NORM: "BN" 26 | RES5_DILATION: 1 27 | STRIDE_IN_1X1: False 28 | BACKBONE: 29 | NAME: "build_bua_resnet_backbone" 30 | FREEZE_AT: 3 31 | RPN: 32 | HEAD_NAME: "StandardBUARPNHead" 33 | PRE_NMS_TOPK_TRAIN: 12000 34 | POST_NMS_TOPK_TRAIN: 2000 35 | POST_NMS_TOPK_TEST: 300 36 | PRE_NMS_TOPK_TEST: 6000 37 | BATCH_SIZE_PER_IMAGE: 64 38 | ROI_HEADS: 39 | NAME: "BUACaffeRes5ROIHeads" 40 | 
BATCH_SIZE_PER_IMAGE: 64 41 | SCORE_THRESH_TEST: -1.0 42 | NMS_THRESH_TEST: 0.3 43 | POSITIVE_FRACTION: 0.5 44 | NUM_CLASSES: 1601 45 | ROI_BOX_HEAD: 46 | POOLER_TYPE: "ROIPool" 47 | BBOX_REG_WEIGHTS: (1.0, 1.0, 1.0, 1.0) 48 | DATASETS: 49 | TRAIN: ("visual_genome_train",) 50 | TEST: ("visual_genome_val",) 51 | TEST: 52 | DETECTIONS_PER_IMAGE: 400 53 | DATALOADER: 54 | NUM_WORKERS: 1 55 | INPUT: 56 | MIN_SIZE_TRAIN: (600, ) 57 | MAX_SIZE_TRAIN: 1000 58 | MIN_SIZE_TEST: 600 59 | MAX_SIZE_TEST: 1000 60 | 61 | -------------------------------------------------------------------------------- /configs/d2/base-d2.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedRCNN" 3 | ATTRIBUTE_ON: True 4 | RPN: 5 | PRE_NMS_TOPK_TEST: 6000 6 | POST_NMS_TOPK_TEST: 1000 7 | SMOOTH_L1_BETA: 0.1111 8 | BOUNDARY_THRESH: 0 9 | ROI_HEADS: 10 | NAME: "AttributeRes5ROIHeads" 11 | NUM_CLASSES: 1600 12 | ROI_BOX_HEAD: 13 | POOLER_SAMPLING_RATIO: 2 14 | SMOOTH_L1_BETA: 1. 15 | DATASETS: 16 | TRAIN: ("visual_genome_train", "visual_genome_val") 17 | TEST: ("visual_genome_test",) 18 | SOLVER: 19 | IMS_PER_BATCH: 8 20 | BASE_LR: 0.01 21 | STEPS: (120000, 160000) 22 | MAX_ITER: 180000 23 | # IMS_PER_BATCH: 16 24 | # BASE_LR: 0.02 25 | # STEPS: (60000, 80000) 26 | # MAX_ITER: 90000 27 | INPUT: 28 | MIN_SIZE_TRAIN: (600,) 29 | MAX_SIZE_TRAIN: 1000 30 | MIN_SIZE_TEST: 600 31 | MAX_SIZE_TEST: 1000 32 | VERSION: 2 33 | -------------------------------------------------------------------------------- /configs/d2/test-d2-X152.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | WEIGHTS: "bua-d2-frcn-x152.pth" 3 | META_ARCHITECTURE: "GeneralizedRCNN" 4 | ATTRIBUTE_ON: True 5 | BUA: 6 | ATTRIBUTE_ON: True 7 | EXTRACTOR: 8 | MODE: 1 9 | MIN_BOXES: 10 10 | MAX_BOXES: 20 11 | CONF_THRESH: 0.4 12 | RESNETS: 13 | STRIDE_IN_1X1: False # this is a C2 model 14 | NUM_GROUPS: 32 15 | WIDTH_PER_GROUP: 8 16 | DEPTH: 152 17 | RPN: 18 | PRE_NMS_TOPK_TEST: 6000 19 | POST_NMS_TOPK_TEST: 1000 20 | SMOOTH_L1_BETA: 0.1111 21 | BOUNDARY_THRESH: 0 22 | ROI_HEADS: 23 | NAME: "AttributeRes5ROIHeads" 24 | NUM_CLASSES: 1600 25 | ROI_BOX_HEAD: 26 | NAME: "FastRCNNConvFCHead" 27 | NUM_FC: 2 28 | POOLER_RESOLUTION: 7 29 | POOLER_SAMPLING_RATIO: 2 30 | SMOOTH_L1_BETA: 1. 
31 | DATASETS: 32 | TRAIN: ("visual_genome_train", "visual_genome_val") 33 | TEST: ("visual_genome_test",) 34 | SOLVER: 35 | # IMS_PER_BATCH: 16 36 | # BASE_LR: 0.02 37 | # STEPS: (60000, 80000) 38 | # MAX_ITER: 90000 39 | IMS_PER_BATCH: 8 40 | BASE_LR: 0.01 41 | STEPS: (120000, 160000) 42 | MAX_ITER: 180000 43 | INPUT: 44 | MIN_SIZE_TRAIN: (600,) 45 | MAX_SIZE_TRAIN: 1000 46 | MIN_SIZE_TEST: 600 47 | MAX_SIZE_TEST: 1000 48 | VERSION: 2 -------------------------------------------------------------------------------- /configs/d2/test-d2-r101.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "base-d2.yaml" 2 | MODEL: 3 | WEIGHTS: "bua-d2-frcn-r101.pth" 4 | BUA: 5 | ATTRIBUTE_ON: True 6 | EXTRACTOR: 7 | MODE: 1 8 | MIN_BOXES: 10 9 | MAX_BOXES: 20 10 | CONF_THRESH: 0.4 11 | RESNETS: 12 | DEPTH: 101 -------------------------------------------------------------------------------- /configs/d2/test-d2-r50.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "base-d2.yaml" 2 | MODEL: 3 | WEIGHTS: "bua-d2-frcn-r50.pth" 4 | BUA: 5 | ATTRIBUTE_ON: True 6 | EXTRACTOR: 7 | MODE: 1 8 | MIN_BOXES: 10 9 | MAX_BOXES: 20 10 | CONF_THRESH: 0.4 11 | RESNETS: 12 | DEPTH: 50 -------------------------------------------------------------------------------- /configs/d2/train-d2-r101.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "base-d2.yaml" 2 | MODEL: 3 | WEIGHTS: "R-101.pkl" # the backbone weight is download from d2 at https://dl.fbaipublicfiles.com/detectron2/ImageNetPretrained/MSRA/R-101.pkl 4 | BUA: 5 | ATTRIBUTE_ON: True 6 | EXTRACTOR: 7 | MODE: 1 8 | MIN_BOXES: 10 9 | MAX_BOXES: 20 10 | CONF_THRESH: 0.4 11 | RESNETS: 12 | DEPTH: 101 -------------------------------------------------------------------------------- /configs/d2/train-d2-r50.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "base-d2.yaml" 2 | MODEL: 3 | WEIGHTS: "R-50.pkl" # the backbone weight is download from d2 at https://dl.fbaipublicfiles.com/detectron2/ImageNetPretrained/MSRA/R-50.pkl 4 | BUA: 5 | ATTRIBUTE_ON: True 6 | EXTRACTOR: 7 | MODE: 1 8 | MIN_BOXES: 10 9 | MAX_BOXES: 20 10 | CONF_THRESH: 0.4 11 | RESNETS: 12 | DEPTH: 50 -------------------------------------------------------------------------------- /datasets/demo/000456.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MILVLG/bottom-up-attention.pytorch/4dbce869ad17117ca9f1df55bc5604cdbcd47f59/datasets/demo/000456.jpg -------------------------------------------------------------------------------- /datasets/demo/000542.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MILVLG/bottom-up-attention.pytorch/4dbce869ad17117ca9f1df55bc5604cdbcd47f59/datasets/demo/000542.jpg -------------------------------------------------------------------------------- /datasets/demo/001150.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MILVLG/bottom-up-attention.pytorch/4dbce869ad17117ca9f1df55bc5604cdbcd47f59/datasets/demo/001150.jpg -------------------------------------------------------------------------------- /datasets/demo/001763.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/MILVLG/bottom-up-attention.pytorch/4dbce869ad17117ca9f1df55bc5604cdbcd47f59/datasets/demo/001763.jpg -------------------------------------------------------------------------------- /datasets/demo/004545.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MILVLG/bottom-up-attention.pytorch/4dbce869ad17117ca9f1df55bc5604cdbcd47f59/datasets/demo/004545.jpg -------------------------------------------------------------------------------- /datasets/demo/example_image.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MILVLG/bottom-up-attention.pytorch/4dbce869ad17117ca9f1df55bc5604cdbcd47f59/datasets/demo/example_image.jpg -------------------------------------------------------------------------------- /datasets/demo/example_image1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MILVLG/bottom-up-attention.pytorch/4dbce869ad17117ca9f1df55bc5604cdbcd47f59/datasets/demo/example_image1.png -------------------------------------------------------------------------------- /datasets/demo/example_image2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MILVLG/bottom-up-attention.pytorch/4dbce869ad17117ca9f1df55bc5604cdbcd47f59/datasets/demo/example_image2.png -------------------------------------------------------------------------------- /datasets/init: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MILVLG/bottom-up-attention.pytorch/4dbce869ad17117ca9f1df55bc5604cdbcd47f59/datasets/init -------------------------------------------------------------------------------- /evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from .vg_evaluation import VGEvaluator -------------------------------------------------------------------------------- /evaluation/attributes_vocab.txt: -------------------------------------------------------------------------------- 1 | gray,grey 2 | multi colored,multi-colored,multicolored 3 | double decker,double-decker 4 | unmade 5 | red 6 | camouflage 7 | blue 8 | white 9 | green 10 | pink 11 | yellow 12 | black 13 | ivory 14 | throwing 15 | orange 16 | spiky 17 | plaid 18 | purple 19 | soccer 20 | brake 21 | blonde 22 | american 23 | flat screen 24 | brown 25 | wooden 26 | performing 27 | pulled back 28 | windshield 29 | bald 30 | chocolate 31 | khaki 32 | apple 33 | blowing 34 | parked 35 | sticking out 36 | fluorescent 37 | glazed 38 | cooking 39 | brick 40 | home 41 | palm 42 | curly 43 | cheese 44 | crashing 45 | calm 46 | christmas 47 | gravel 48 | chain link,chainlink 49 | clear 50 | cloudy 51 | curled 52 | striped 53 | flying 54 | pine 55 | arched 56 | hardwood 57 | silver 58 | framed 59 | one way,oneway 60 | tall 61 | muscular 62 | skiing 63 | tiled 64 | bare 65 | surfing 66 | stuffed 67 | wii 68 | taking off 69 | sleeping 70 | jumping 71 | metal 72 | fire 73 | neon green 74 | soap 75 | park 76 | chalk 77 | license 78 | powdered 79 | up 80 | woven 81 | baby 82 | polar 83 | floppy 84 | toasted 85 | coffee 86 | potted 87 | wet 88 | tennis 89 | dry 90 | balding 91 | carpeted 92 | deep blue 93 | cardboard 94 | pointed 95 | sandy 96 | snow-covered,snow covered 97 | sheer 98 | wood 99 | swimming 100 | traffic 101 | crouching 102 | short 103 | melted 
104 | marble 105 | rock 106 | open 107 | paper 108 | stacked 109 | stainless 110 | cluttered 111 | dirt 112 | waving 113 | ripe 114 | salt 115 | rolling 116 | long 117 | clock 118 | maroon 119 | little 120 | triangle 121 | large 122 | sand 123 | fallen 124 | foamy 125 | stack 126 | sliced 127 | blond 128 | plain 129 | straw 130 | busy 131 | checkered 132 | extended 133 | stainless steel,stainless-steel 134 | stone 135 | rocky 136 | laying down 137 | grazing 138 | porcelain 139 | snowboarding 140 | stop 141 | leather 142 | gold 143 | cargo 144 | playing tennis 145 | winter 146 | walking 147 | roman 148 | peeled 149 | plastic 150 | colorful 151 | shining 152 | burnt 153 | messy 154 | tile 155 | cloudless 156 | glass 157 | smiling 158 | fruit 159 | overcast 160 | adult 161 | water 162 | round 163 | birthday 164 | dark 165 | snowy 166 | leafless 167 | young 168 | wicker 169 | skateboarding 170 | cooked 171 | huge 172 | dress 173 | wire 174 | cracked 175 | concrete 176 | laying 177 | grassy 178 | foggy 179 | fried 180 | slice 181 | batting 182 | mountain 183 | halved 184 | ski 185 | statue 186 | still 187 | octagonal 188 | side view 189 | sitting 190 | wavy 191 | floral 192 | running 193 | moving 194 | small 195 | door 196 | wine 197 | closed 198 | cement 199 | splashing 200 | empty 201 | eating 202 | skating 203 | playing 204 | old 205 | tan 206 | leafy 207 | down 208 | electrical 209 | manicured 210 | standing 211 | blurry 212 | choppy 213 | driving 214 | watching 215 | parking 216 | pointy 217 | covering 218 | for sale 219 | reflecting 220 | railroad 221 | golden brown 222 | steep 223 | granite 224 | roll 225 | train 226 | spotted 227 | fluffy 228 | bending 229 | tarmacked 230 | furry 231 | dirty 232 | hanging 233 | above 234 | half full 235 | bright 236 | chrome 237 | toilet paper 238 | squatting 239 | chopped 240 | flowing 241 | neon 242 | skate 243 | rusty 244 | male 245 | covered 246 | outstretched 247 | lit 248 | riding 249 | shirtless 250 | reaching 251 | baseball 252 | iron 253 | night 254 | speckled 255 | bright blue 256 | horizontal 257 | denim 258 | cake 259 | hazy 260 | chipped 261 | police 262 | off 263 | dead 264 | nike 265 | steamed 266 | beige 267 | brunette 268 | short sleeved 269 | laptop 270 | decorated 271 | sharp 272 | perched 273 | clay 274 | made 275 | mesh 276 | street 277 | burgundy 278 | bent 279 | rusted 280 | paved 281 | patterned 282 | painted 283 | flat 284 | landing 285 | light blue 286 | puffy 287 | shaggy 288 | resting 289 | overgrown 290 | bending over 291 | circular 292 | curved 293 | cast 294 | rainbow colored,rainbow 295 | lime green 296 | ceramic 297 | dried 298 | styrofoam 299 | long sleeved,long sleeve 300 | wispy 301 | ocean 302 | big 303 | teal 304 | oval 305 | greenish 306 | murky 307 | tomato 308 | letter 309 | bricked 310 | in air 311 | distant 312 | full 313 | opened 314 | looking 315 | power 316 | holding 317 | browned 318 | growing 319 | backwards 320 | clean 321 | racing 322 | grilled 323 | seasoned 324 | barefoot 325 | kneeling 326 | digital 327 | herd 328 | sliding 329 | recessed 330 | lying 331 | serving 332 | polka dot 333 | cut 334 | ornate 335 | piled 336 | steel 337 | muddy 338 | hilly 339 | raised 340 | hitting 341 | evergreen 342 | sunny 343 | wrist 344 | half 345 | blank 346 | numbered 347 | electric 348 | computer 349 | rolled 350 | whole 351 | lush 352 | daytime 353 | toilet 354 | pointing 355 | asphalt 356 | public 357 | alone 358 | posing 359 | bunch 360 | square 361 | safety 362 | wearing 363 | stripes 364 | bathroom 365 | 
reflective 366 | assorted 367 | swinging 368 | airborne 369 | dark blue 370 | grass 371 | burned 372 | telephone 373 | docked 374 | pile 375 | laughing 376 | brass 377 | rubber 378 | frosted 379 | hairy 380 | overhead 381 | glowing 382 | soda 383 | number 384 | talking 385 | barren 386 | shaved 387 | shiny 388 | rough 389 | written 390 | older 391 | thin 392 | decorative 393 | wrinkled 394 | peeling 395 | golden 396 | metallic 397 | back 398 | thick 399 | black and white 400 | leaning -------------------------------------------------------------------------------- /evaluation/vg_eval.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast/er R-CNN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Bharath Hariharan 5 | # -------------------------------------------------------- 6 | import numpy as np 7 | 8 | 9 | def vg_eval(detpath, 10 | gt_roidb, 11 | image_index, 12 | classindex, 13 | ovthresh=0.5, 14 | use_07_metric=False, 15 | eval_attributes=False): 16 | """rec, prec, ap, sorted_scores, npos = voc_eval( 17 | detpath, 18 | gt_roidb, 19 | image_index, 20 | classindex, 21 | [ovthresh], 22 | [use_07_metric]) 23 | Top level function that does the Visual Genome evaluation. 24 | detpath: Path to detections 25 | gt_roidb: List of ground truth structs. 26 | image_index: List of image ids. 27 | classindex: Category index 28 | [ovthresh]: Overlap threshold (default = 0.5) 29 | [use_07_metric]: Whether to use VOC07's 11 point AP computation 30 | (default False) 31 | """ 32 | # extract gt objects for this class 33 | class_recs = {} 34 | npos = 0 35 | for item, imagename in zip(gt_roidb, image_index): 36 | if eval_attributes: 37 | bbox = item['boxes'][np.where(np.any(item['gt_attributes'].toarray() == classindex, axis=1))[0], :] 38 | else: 39 | bbox = item['boxes'][np.where(item['gt_classes'] == classindex)[0], :] 40 | difficult = np.zeros((bbox.shape[0],)).astype(np.bool) 41 | det = [False] * bbox.shape[0] 42 | npos = npos + sum(~difficult) 43 | class_recs[str(imagename)] = {'bbox': bbox, 44 | 'difficult': difficult, 45 | 'det': det} 46 | if npos == 0: 47 | # No ground truth examples 48 | return 0, 0, 0, 0, npos 49 | 50 | # read dets 51 | with open(detpath, 'r') as f: 52 | lines = f.readlines() 53 | if len(lines) == 0: 54 | # No detection examples 55 | return 0, 0, 0, 0, npos 56 | 57 | splitlines = [x.strip().split(' ') for x in lines] 58 | image_ids = [x[0] for x in splitlines] 59 | confidence = np.array([float(x[1]) for x in splitlines]) 60 | BB = np.array([[float(z) for z in x[2:]] for x in splitlines]) 61 | 62 | # sort by confidence 63 | sorted_ind = np.argsort(-confidence) 64 | sorted_scores = -np.sort(-confidence) 65 | BB = BB[sorted_ind, :] 66 | image_ids = [image_ids[x] for x in sorted_ind] 67 | 68 | # go down dets and mark TPs and FPs 69 | nd = len(image_ids) 70 | tp = np.zeros(nd) 71 | fp = np.zeros(nd) 72 | for d in range(nd): 73 | if image_ids[d] not in class_recs: 74 | print(image_ids[d], detpath) 75 | continue 76 | R = class_recs[image_ids[d]] 77 | bb = BB[d, :].astype(float) 78 | ovmax = -np.inf 79 | BBGT = R['bbox'].astype(float) 80 | 81 | if BBGT.size > 0: 82 | # compute overlaps 83 | # intersection 84 | ixmin = np.maximum(BBGT[:, 0], bb[0]) 85 | iymin = np.maximum(BBGT[:, 1], bb[1]) 86 | ixmax = np.minimum(BBGT[:, 2], bb[2]) 87 | iymax = np.minimum(BBGT[:, 3], bb[3]) 88 | iw = np.maximum(ixmax - ixmin + 1., 0.) 
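            # NOTE: boxes are treated as inclusive pixel coordinates (VOC-style),
            # hence the +1. when converting corners to widths/heights here and in
            # the union term below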
89 | ih = np.maximum(iymax - iymin + 1., 0.) 90 | inters = iw * ih 91 | 92 | # union 93 | uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) + 94 | (BBGT[:, 2] - BBGT[:, 0] + 1.) * 95 | (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters) 96 | 97 | overlaps = inters / uni 98 | ovmax = np.max(overlaps) 99 | jmax = np.argmax(overlaps) 100 | 101 | if ovmax > ovthresh: 102 | if not R['difficult'][jmax]: 103 | if not R['det'][jmax]: 104 | tp[d] = 1. 105 | R['det'][jmax] = 1 106 | else: 107 | fp[d] = 1. 108 | else: 109 | fp[d] = 1. 110 | 111 | # compute precision recall 112 | fp = np.cumsum(fp) 113 | tp = np.cumsum(tp) 114 | rec = tp / float(npos) 115 | # avoid divide by zero in case the first detection matches a difficult 116 | # ground truth 117 | prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) 118 | ap = voc_ap(rec, prec, use_07_metric) 119 | 120 | return rec, prec, ap, sorted_scores, npos 121 | 122 | def voc_ap(rec, prec, use_07_metric=False): 123 | """ ap = voc_ap(rec, prec, [use_07_metric]) 124 | Compute VOC AP given precision and recall. 125 | If use_07_metric is true, uses the 126 | VOC 07 11 point method (default:False). 127 | """ 128 | if use_07_metric: 129 | # 11 point metric 130 | ap = 0. 131 | for t in np.arange(0., 1.1, 0.1): 132 | if np.sum(rec >= t) == 0: 133 | p = 0 134 | else: 135 | p = np.max(prec[rec >= t]) 136 | ap = ap + p / 11. 137 | else: 138 | # correct AP calculation 139 | # first append sentinel values at the end 140 | mrec = np.concatenate(([0.], rec, [1.])) 141 | mpre = np.concatenate(([0.], prec, [0.])) 142 | 143 | # compute the precision envelope 144 | for i in range(mpre.size - 1, 0, -1): 145 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 146 | 147 | # to calculate area under PR curve, look for points 148 | # where X axis (recall) changes value 149 | i = np.where(mrec[1:] != mrec[:-1])[0] 150 | 151 | # and sum (\Delta recall) * prec 152 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 153 | return ap -------------------------------------------------------------------------------- /evaluation/vg_evaluation.py: -------------------------------------------------------------------------------- 1 | import os, io 2 | import numpy as np 3 | 4 | import copy 5 | import torch 6 | import logging 7 | import pickle as cPickle 8 | import itertools 9 | import contextlib 10 | from pycocotools.coco import COCO 11 | from collections import OrderedDict 12 | from fvcore.common.file_io import PathManager 13 | 14 | import detectron2.utils.comm as comm 15 | from detectron2.data import MetadataCatalog 16 | from detectron2.evaluation.evaluator import DatasetEvaluator 17 | from detectron2.data.datasets.coco import convert_to_coco_json 18 | from detectron2.evaluation.coco_evaluation import instances_to_coco_json 19 | 20 | from .vg_eval import vg_eval 21 | 22 | class VGEvaluator(DatasetEvaluator): 23 | """ 24 | Evaluate object proposal, instance detection 25 | outputs using VG's metrics and APIs. 26 | """ 27 | def __init__(self, dataset_name, cfg, distributed, output_dir=None): 28 | """ 29 | Args: 30 | dataset_name (str): name of the dataset to be evaluated. 31 | It must have either the following corresponding metadata: 32 | 33 | "json_file": the path to the COCO format annotation 34 | 35 | Or it must be in detectron2's standard dataset format 36 | so it can be converted to COCO format automatically. 37 | cfg (CfgNode): config instance 38 | distributed (True): if True, will collect results from all ranks for evaluation. 39 | Otherwise, will evaluate the results in the current process. 
40 | output_dir (str): optional, an output directory to dump all 41 | results predicted on the dataset. The dump contains two files: 42 | 43 | 1. "instance_predictions.pth" a file in torch serialization 44 | format that contains all the raw original predictions. 45 | 2. "coco_instances_results.json" a json file in COCO's result 46 | format. 47 | """ 48 | self._tasks = self._tasks_from_config(cfg) 49 | self._distributed = distributed 50 | self._logger = logging.getLogger(__name__) 51 | self._cpu_device = torch.device("cpu") 52 | self._output_dir = output_dir 53 | 54 | self._metadata = MetadataCatalog.get(dataset_name) 55 | if not hasattr(self._metadata, "json_file"): 56 | self._logger.warning(f"json_file was not found in MetaDataCatalog for '{dataset_name}'") 57 | 58 | cache_path = os.path.join(output_dir, f"{dataset_name}_vg_format.json") 59 | self._metadata.json_file = cache_path 60 | convert_to_coco_json(dataset_name, cache_path) 61 | 62 | json_file = PathManager.get_local_path(self._metadata.json_file) 63 | with contextlib.redirect_stdout(io.StringIO()): 64 | self._coco_api = COCO(json_file) 65 | 66 | self._classes = ['__background__'] 67 | self._class_to_ind = {} 68 | self._class_to_ind[self._classes[0]] = 0 69 | with open(os.path.join('evaluation/objects_vocab.txt')) as f: 70 | count = 1 71 | for object in f.readlines(): 72 | names = [n.lower().strip() for n in object.split(',')] 73 | self._classes.append(names[0]) 74 | for n in names: 75 | self._class_to_ind[n] = count 76 | count += 1 77 | 78 | # Load attributes 79 | self._attributes = ['__no_attribute__'] 80 | self._attribute_to_ind = {} 81 | self._attribute_to_ind[self._attributes[0]] = 0 82 | with open(os.path.join('evaluation/attributes_vocab.txt')) as f: 83 | count = 1 84 | for att in f.readlines(): 85 | names = [n.lower().strip() for n in att.split(',')] 86 | self._attributes.append(names[0]) 87 | for n in names: 88 | self._attribute_to_ind[n] = count 89 | count += 1 90 | 91 | self.roidb, self.image_index = self.gt_roidb(self._coco_api) 92 | 93 | def _tasks_from_config(self, cfg): 94 | """ 95 | Returns: 96 | tuple[str]: tasks that can be evaluated under the given configuration. 97 | """ 98 | tasks = ("bbox",) 99 | if cfg.MODEL.MASK_ON: 100 | tasks = tasks + ("segm",) 101 | if cfg.MODEL.KEYPOINT_ON: 102 | tasks = tasks + ("keypoints",) 103 | return tasks 104 | 105 | def gt_roidb(self, dataset): 106 | roidb = [] 107 | image_index = dataset.imgToAnns.keys() 108 | for img_index in dataset.imgToAnns: 109 | tmp_dict = {} 110 | num_objs = len(dataset.imgToAnns[img_index]) 111 | bboxes = np.zeros((num_objs, 4), dtype=np.uint16) 112 | gt_attributes = np.zeros((num_objs, 16), dtype=np.int32) 113 | gt_classes = np.zeros((num_objs), dtype=np.int32) 114 | for ind, item in enumerate(dataset.imgToAnns[img_index]): 115 | bboxes[ind, :] = item['bbox'] 116 | gt_classes[ind] = item['category_id'] + 1 # NOTE 117 | for j, attr in enumerate(item['attribute_ids']): 118 | gt_attributes[ind, j] = attr 119 | bboxes[:, 2] = bboxes[:, 2] + bboxes[:, 0] 120 | bboxes[:, 3] = bboxes[:, 3] + bboxes[:, 1] 121 | tmp_dict['boxes'] = bboxes 122 | tmp_dict['gt_attributes'] = gt_attributes 123 | tmp_dict['gt_classes'] = gt_classes 124 | roidb.append(tmp_dict) 125 | return roidb, image_index 126 | 127 | def reset(self): 128 | self._predictions = [] 129 | 130 | def process(self, inputs, outputs): 131 | """ 132 | Args: 133 | inputs: the inputs to a COCO model (e.g., GeneralizedRCNN). 134 | It is a list of dict. 
Each dict corresponds to an image and 135 | contains keys like "height", "width", "file_name", "image_id". 136 | outputs: the outputs of a COCO model. It is a list of dicts with key 137 | "instances" that contains :class:`Instances`. 138 | """ 139 | for input, output in zip(inputs, outputs): 140 | prediction = {"image_id": input["image_id"]} 141 | 142 | # TODO this is ugly 143 | if "instances" in output: 144 | instances = output["instances"].to(self._cpu_device) 145 | prediction["boxes"] = instances.pred_boxes.tensor.numpy() 146 | prediction["labels"] = instances.pred_classes.numpy() 147 | prediction["scores"] = instances.scores.numpy() 148 | self._predictions.append(prediction) 149 | 150 | def evaluate(self): 151 | if self._distributed: 152 | comm.synchronize() 153 | self._predictions = comm.gather(self._predictions, dst=0) 154 | self._predictions = list(itertools.chain(*self._predictions)) 155 | 156 | if not comm.is_main_process(): 157 | return {} 158 | 159 | # self._predictions = torch.load(os.path.join(self._output_dir, "instances_predictions.pth")) 160 | 161 | if len(self._predictions) == 0: 162 | self._logger.warning("[VGEvaluator] Did not receive valid predictions.") 163 | return {} 164 | 165 | if self._output_dir: 166 | PathManager.mkdirs(self._output_dir) 167 | file_path = os.path.join(self._output_dir, "instances_predictions.pth") 168 | with PathManager.open(file_path, "wb") as f: 169 | torch.save(self._predictions, f) 170 | 171 | self._results = OrderedDict() 172 | self._eval_vg() 173 | # Copy so the caller can do whatever with results 174 | return copy.deepcopy(self._results) 175 | 176 | def _eval_vg(self): 177 | self.write_voc_results_file(self._predictions, output_dir=self._output_dir) 178 | self.do_python_eval(self._output_dir) 179 | 180 | def write_voc_results_file(self, predictions, output_dir): 181 | 182 | # preds = [] 183 | # for item in predictions: 184 | # pred = {} 185 | # pred['image_id'] = item['image_id'] 186 | # scores = item["scores"] 187 | # labels = item["labels"] 188 | # bbox = item["boxes"] 189 | # for ind, instance in enumerate(item['instances']): 190 | # scores[ind] = instance['score'] 191 | # labels[ind] = instance['category_id'] 192 | # bbox[ind, :] = instance['bbox'][:] 193 | # pred['scores'] = scores 194 | # pred['lables'] = labels 195 | # pred['bbox'] = bbox 196 | # preds.append(pred) 197 | 198 | for cls_ind, cls in enumerate(self._classes): 199 | if cls == '__background__': 200 | continue 201 | print('Writing "{}" vg result file'.format(cls)) 202 | filename = self.get_vg_results_file_template(output_dir).format(cls) 203 | with open(filename, 'wt') as f: 204 | for pred_ind, item in enumerate(predictions): 205 | scores = item["scores"] 206 | labels = item["labels"]+1 207 | bbox = item["boxes"] 208 | if cls_ind not in labels: 209 | continue 210 | dets = bbox[labels==cls_ind] 211 | scores = scores[labels==cls_ind] 212 | for k in range(dets.shape[0]): 213 | f.write('{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}\n'. 
214 | format(str(item["image_id"]), scores[k], 215 | dets[k, 0] + 1, dets[k, 1] + 1, 216 | dets[k, 2] + 1, dets[k, 3] + 1)) 217 | 218 | def get_vg_results_file_template(self, output_dir, pickle=True, eval_attributes = False): 219 | filename = 'detections_vg'+'_{:s}.txt' 220 | path = os.path.join(output_dir, filename) 221 | return path 222 | 223 | def do_python_eval(self, output_dir, pickle=True, eval_attributes = False): 224 | # We re-use parts of the pascal voc python code for visual genome 225 | aps = [] 226 | nposs = [] 227 | thresh = [] 228 | # The PASCAL VOC metric changed in 2010 229 | use_07_metric = False 230 | print('VOC07 metric? ' + ('Yes' if use_07_metric else 'No')) 231 | if not os.path.isdir(output_dir): 232 | os.mkdir(output_dir) 233 | # Load ground truth 234 | if eval_attributes: 235 | classes = self._attributes 236 | else: 237 | classes = self._classes 238 | for i, cls in enumerate(classes): 239 | if cls == '__background__' or cls == '__no_attribute__': 240 | continue 241 | filename = self.get_vg_results_file_template(output_dir).format(cls) 242 | rec, prec, ap, scores, npos = vg_eval( 243 | filename, self.roidb, self.image_index, i, ovthresh=0.5, 244 | use_07_metric=use_07_metric, eval_attributes=eval_attributes) 245 | 246 | # Determine per class detection thresholds that maximise f score 247 | if npos > 1 and not (type(prec) == int and type(rec) == int and prec+rec ==0): 248 | f = np.nan_to_num((prec * rec) / (prec + rec)) 249 | thresh += [scores[np.argmax(f)]] 250 | else: 251 | thresh += [0] 252 | aps += [ap] 253 | nposs += [float(npos)] 254 | print('AP for {} = {:.4f} (npos={:,})'.format(cls, ap, npos)) 255 | if pickle: 256 | with open(os.path.join(output_dir, cls + '_pr.pkl'), 'wb') as f: 257 | cPickle.dump({'rec': rec, 'prec': prec, 'ap': ap, 258 | 'scores': scores, 'npos': npos}, f) 259 | 260 | # Set thresh to mean for classes with poor results 261 | thresh = np.array(thresh) 262 | avg_thresh = np.mean(thresh[thresh != 0]) 263 | thresh[thresh == 0] = avg_thresh 264 | if eval_attributes: 265 | filename = 'attribute_thresholds_vg.txt' 266 | else: 267 | filename = 'object_thresholds_vg.txt' 268 | path = os.path.join(output_dir, filename) 269 | with open(path, 'wt') as f: 270 | for i, cls in enumerate(classes[1:]): 271 | f.write('{:s} {:.3f}\n'.format(cls, thresh[i])) 272 | 273 | weights = np.array(nposs) 274 | weights /= weights.sum() 275 | print('Mean AP = {:.4f}'.format(np.mean(aps))) 276 | print('Weighted Mean AP = {:.4f}'.format(np.average(aps, weights=weights))) 277 | print('Mean Detection Threshold = {:.3f}'.format(avg_thresh)) 278 | # print('~~~~~~~~') 279 | # print('Results:') 280 | # for ap, npos in zip(aps, nposs): 281 | # print('{:.3f}\t{:.3f}'.format(ap, npos)) 282 | # print('{:.3f}'.format(np.mean(aps))) 283 | # print('~~~~~~~~') 284 | # print('') 285 | # print('--------------------------------------------------------------') 286 | print('Results computed with the **unofficial** PASCAL VOC Python eval code.') 287 | print('--------------------------------------------------------------') 288 | -------------------------------------------------------------------------------- /extract_features.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | # pylint: disable=no-member 3 | """ 4 | TridentNet Training Script. 5 | 6 | This script is a simplified version of the training script in detectron2/tools. 
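Example usage (an illustrative sketch, not taken from the project docs; the paths
and flags below are assumptions based on the argument parser defined in main()):

    # extract RoI features for the demo images with the Caffe R-101 model
    python3 extract_features.py --mode caffe \
        --config-file configs/caffe/test-caffe-r101.yaml \
        --image-dir datasets/demo --out-dir features --gpus 0

One <image name>.npz file is written per image into --out-dir; a quick way to
inspect a result (the file name below is hypothetical):

    import numpy as np
    with np.load('features/000456.npz', allow_pickle=True) as feat:
        print(feat.files)  # list the arrays stored for this image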
7 | """ 8 | import argparse 9 | from ast import arg 10 | import os 11 | import sys 12 | import torch 13 | # import tqdm 14 | import cv2 15 | import numpy as np 16 | 17 | from utils.extract_d2features import extract_feat_d2_start 18 | sys.path.append('detectron2') 19 | 20 | import detectron2.utils.comm as comm 21 | from detectron2.checkpoint import DetectionCheckpointer 22 | from detectron2.data import build_detection_test_loader, build_detection_train_loader 23 | from detectron2.config import get_cfg 24 | from detectron2.engine import DefaultTrainer, default_setup, launch 25 | from detectron2.evaluation import COCOEvaluator, verify_results 26 | from detectron2.structures import Instances 27 | 28 | from utils.utils import mkdir, save_features 29 | from utils.extract_utils import get_image_blob, save_bbox, save_roi_features_by_bbox, save_roi_features 30 | from utils.progress_bar import ProgressBar 31 | from bua import add_config 32 | from bua.caffe.modeling.box_regression import BUABoxes 33 | from torch.nn import functional as F 34 | from detectron2.modeling import postprocessing 35 | from utils.extract_features_singlegpu import extract_feat_singlegpu_start 36 | from utils.extract_features_multigpu import extract_feat_multigpu_start 37 | from utils.extract_features_faster import extract_feat_faster_start 38 | 39 | def switch_extract_mode(mode): 40 | if mode == 'roi_feats': 41 | switch_cmd = ['MODEL.BUA.EXTRACTOR.MODE', 1] 42 | elif mode == 'bboxes': 43 | switch_cmd = ['MODEL.BUA.EXTRACTOR.MODE', 2] 44 | elif mode == 'bbox_feats': 45 | switch_cmd = ['MODEL.BUA.EXTRACTOR.MODE', 3, 'MODEL.PROPOSAL_GENERATOR.NAME', 'PrecomputedProposals'] 46 | else: 47 | print('Wrong extract mode! ') 48 | exit() 49 | return switch_cmd 50 | # ROI_HEADS: # Add to get 100 box or Delete it to get ~50 boxes 51 | # SCORE_THRESH_TEST: 0.0 52 | # NMS_THRESH_TEST: 0.3 53 | def set_min_max_boxes(min_max_boxes, mode): 54 | if min_max_boxes == 'min_max_default': 55 | return [] 56 | try: 57 | min_boxes = int(min_max_boxes.split(',')[0]) 58 | max_boxes = int(min_max_boxes.split(',')[1]) 59 | if mode == "caffe": 60 | pass 61 | elif mode == "d2": 62 | if min_boxes == 100 & max_boxes == 100: 63 | cmd = ['MODEL.BUA.EXTRACTOR.MIN_BOXES', min_boxes, 64 | 'MODEL.BUA.EXTRACTOR.MAX_BOXES', max_boxes, 65 | 'MODEL.ROI_HEADS.SCORE_THRESH_TEST', 0.0, 66 | 'MODEL.ROI_HEADS.NMS_THRESH_TEST', 0.3 ] 67 | return cmd 68 | else: 69 | raise Exception("detection mode not supported: {}".format(mode)) 70 | except: 71 | print('Illegal min-max boxes setting, using config default. ') 72 | return [] 73 | cmd = ['MODEL.BUA.EXTRACTOR.MIN_BOXES', min_boxes, 74 | 'MODEL.BUA.EXTRACTOR.MAX_BOXES', max_boxes] 75 | return cmd 76 | 77 | def setup(args): 78 | """ 79 | Create configs and perform basic setups. 
80 | """ 81 | cfg = get_cfg() 82 | add_config(args, cfg) 83 | cfg.merge_from_file(args.config_file) 84 | cfg.merge_from_list(args.opts) 85 | cfg.merge_from_list(['MODEL.BUA.EXTRACT_FEATS',True]) 86 | cfg.merge_from_list(switch_extract_mode(args.extract_mode)) 87 | cfg.merge_from_list(set_min_max_boxes(args.min_max_boxes, args.mode)) 88 | cfg.freeze() 89 | default_setup(cfg, args) 90 | return cfg 91 | 92 | def main(): 93 | parser = argparse.ArgumentParser(description="PyTorch Object Detection2 Inference") 94 | parser.add_argument( 95 | "--config-file", 96 | default="configs/caffe/test-caffe-r101.yaml", 97 | metavar="FILE", 98 | help="path to config file", 99 | ) 100 | 101 | parser.add_argument('--num-cpus', default=1, type=int, 102 | help='number of cpus to use for ray, 0 means no limit') 103 | 104 | parser.add_argument('--gpus', dest='gpu_id', help='GPU id(s) to use', 105 | default='0', type=str) 106 | 107 | parser.add_argument("--mode", default="caffe", type=str, help="'caffe' and 'd2' indicates \ 108 | 'use caffe model' and 'use detectron2 model'respectively") 109 | 110 | parser.add_argument('--extract-mode', default='roi_feats', type=str, 111 | help="'roi_feats', 'bboxes' and 'bbox_feats' indicates \ 112 | 'extract roi features directly', 'extract bboxes only' and \ 113 | 'extract roi features with pre-computed bboxes' respectively") 114 | 115 | parser.add_argument('--min-max-boxes', default='min_max_default', type=str, 116 | help='the number of min-max boxes of extractor') 117 | 118 | parser.add_argument('--out-dir', dest='output_dir', 119 | help='output directory for features', 120 | default="features") 121 | parser.add_argument('--image-dir', dest='image_dir', 122 | help='directory with images', 123 | default="image") 124 | parser.add_argument('--bbox-dir', dest='bbox_dir', 125 | help='directory with bbox', 126 | default="bbox") 127 | parser.add_argument("--fastmode", action="store_true", help="whether to use multi cpus to extract faster.",) 128 | 129 | parser.add_argument( 130 | "--resume", 131 | action="store_true", 132 | help="whether to attempt to resume from the checkpoint directory", 133 | ) 134 | parser.add_argument( 135 | "opts", 136 | help="Modify config options using the command-line", 137 | default=None, 138 | nargs=argparse.REMAINDER, 139 | ) 140 | 141 | args = parser.parse_args() 142 | 143 | cfg = setup(args) 144 | num_gpus = len(args.gpu_id.split(',')) 145 | print(args.mode) 146 | if args.mode == "caffe": 147 | if args.fastmode: # faster.py 148 | print("faster") 149 | extract_feat_faster_start(args,cfg) 150 | else: # multi or single 151 | if num_gpus == 1: # without ray 152 | print("single") 153 | extract_feat_singlegpu_start(args,cfg) 154 | else: # use ray to accelerate 155 | print("multi") 156 | extract_feat_multigpu_start(args,cfg) 157 | elif args.mode == "d2": 158 | print("d2 mode use ray") 159 | extract_feat_d2_start(args,cfg) 160 | else: 161 | raise Exception("detection model not supported: {}".format(args.model)) 162 | 163 | if __name__ == "__main__": 164 | main() 165 | -------------------------------------------------------------------------------- /opts.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | 4 | def parse_opt(): 5 | """ 6 | Create a parser with some common arguments used by detectron2 users. 
7 | 8 | Returns: 9 | argparse.ArgumentParser: 10 | """ 11 | parser = argparse.ArgumentParser(description="BottomUpAttention Training") 12 | parser.add_argument("--mode", default="caffe", type=str, help="'caffe' and 'd2' indicates \ 13 | 'use caffe model' and 'use detectron2 model'respectively") 14 | parser.add_argument("--config-file", default="", metavar="FILE", help="path to config file") 15 | parser.add_argument( 16 | "--resume", 17 | action="store_true", 18 | help="whether to attempt to resume from the checkpoint directory", 19 | ) 20 | parser.add_argument("--eval-only", action="store_true", help="perform evaluation only") 21 | parser.add_argument("--num-gpus", type=int, default=1, help="number of gpus *per machine*") 22 | parser.add_argument("--num-machines", type=int, default=1) 23 | parser.add_argument( 24 | "--machine-rank", type=int, default=0, help="the rank of this machine (unique per machine)" 25 | ) 26 | 27 | # PyTorch still may leave orphan processes in multi-gpu training. 28 | # Therefore we use a deterministic way to obtain port, 29 | # so that users are aware of orphan processes by seeing the port occupied. 30 | port = 2 ** 15 + 2 ** 14 + hash(os.getuid()) % 2 ** 14 31 | parser.add_argument("--dist-url", default="tcp://127.0.0.1:{}".format(port)) 32 | parser.add_argument( 33 | "opts", 34 | help="Modify config options using the command-line", 35 | default=None, 36 | nargs=argparse.REMAINDER, 37 | ) 38 | return parser 39 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 3 | 4 | import glob 5 | import os 6 | from setuptools import find_packages, setup 7 | import torch 8 | from torch.utils.cpp_extension import CUDA_HOME, CppExtension, CUDAExtension 9 | 10 | torch_ver = [int(x) for x in torch.__version__.split(".")[:2]] 11 | assert torch_ver >= [1, 3], "Requires PyTorch >= 1.3" 12 | 13 | 14 | def get_extensions(): 15 | this_dir = os.path.dirname(os.path.abspath(__file__)) 16 | extensions_dir = os.path.join(this_dir, "bua","caffe", "modeling","layers", "csrc") 17 | 18 | main_source = os.path.join(extensions_dir, "vision.cpp") 19 | sources = glob.glob(os.path.join(extensions_dir, "**", "*.cpp")) 20 | source_cuda = glob.glob(os.path.join(extensions_dir, "**", "*.cu")) + glob.glob( 21 | os.path.join(extensions_dir, "*.cu") 22 | ) 23 | 24 | sources = [main_source] + sources 25 | 26 | extension = CppExtension 27 | 28 | extra_compile_args = {"cxx": []} 29 | define_macros = [] 30 | 31 | if (torch.cuda.is_available() and CUDA_HOME is not None) or os.getenv("FORCE_CUDA", "0") == "1": 32 | extension = CUDAExtension 33 | sources += source_cuda 34 | define_macros += [("WITH_CUDA", None)] 35 | extra_compile_args["nvcc"] = [ 36 | "-DCUDA_HAS_FP16=1", 37 | "-D__CUDA_NO_HALF_OPERATORS__", 38 | "-D__CUDA_NO_HALF_CONVERSIONS__", 39 | "-D__CUDA_NO_HALF2_OPERATORS__", 40 | ] 41 | 42 | 43 | sources = [os.path.join(extensions_dir, s) for s in sources] 44 | 45 | include_dirs = [extensions_dir] 46 | 47 | ext_modules = [ 48 | extension( 49 | "bua.caffe.modeling._C", 50 | sources, 51 | include_dirs=include_dirs, 52 | define_macros=define_macros, 53 | extra_compile_args=extra_compile_args, 54 | ) 55 | ] 56 | 57 | return ext_modules 58 | 59 | 60 | setup( 61 | name="bottom-up-attention.pytorch", 62 | packages=find_packages(exclude=("configs", "tests")), 63 | python_requires=">=3.6", 64 
|     ext_modules=get_extensions(),
65 |     cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
66 | )
67 | 
-------------------------------------------------------------------------------- /train_net.py: --------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
2 | """
3 | TridentNet Training Script.
4 | 
5 | This script is a simplified version of the training script in detectron2/tools.
6 | """
7 | 
8 | import os
9 | import sys
10 | import time
11 | sys.path.append('detectron2')
12 | 
13 | import detectron2.utils.comm as comm
14 | from detectron2.checkpoint import DetectionCheckpointer
15 | from detectron2.data import build_detection_test_loader, build_detection_train_loader
16 | from detectron2.config import get_cfg
17 | from detectron2.engine import DefaultTrainer, default_setup, launch
18 | from detectron2.evaluation import COCOEvaluator, verify_results
19 | 
20 | from bua import add_config
21 | from bua.d2 import build_detection_test_loader_with_attributes, build_detection_train_loader_with_attributes
22 | from bua.caffe.dataloader import DatasetMapper
23 | from opts import parse_opt
24 | from evaluation import VGEvaluator
25 | 
26 | 
27 | class Trainer(DefaultTrainer):
28 |     def __init__(self, cfg):
29 |         super().__init__(cfg)
30 |         self.rpn_box_lw = cfg.MODEL.RPN.BBOX_LOSS_WEIGHT
31 |         self.rcnn_box_lw = cfg.MODEL.ROI_BOX_HEAD.BBOX_LOSS_WEIGHT
32 | 
33 |     @classmethod
34 |     def build_evaluator(cls, cfg, dataset_name, output_folder=None):
35 |         if output_folder is None:
36 |             output_folder = os.path.join(cfg.OUTPUT_DIR, "inference")
37 |         return VGEvaluator(dataset_name, cfg, True, output_folder)
38 | 
39 |     @classmethod
40 |     def build_test_loader(cls, cfg, dataset_name):
41 |         if cfg.MODE == "caffe":
42 |             return build_detection_test_loader(cfg, dataset_name, mapper=DatasetMapper(cfg, False))
43 |         elif cfg.MODE == "d2":
44 |             return build_detection_test_loader_with_attributes(cfg, dataset_name)
45 |         else:
46 |             raise Exception("detectron mode not supported: {}".format(cfg.MODE))
47 | 
48 | 
49 | 
50 |     @classmethod
51 |     def build_train_loader(cls, cfg):
52 |         if cfg.MODE == "caffe":
53 |             return build_detection_train_loader(cfg, mapper=DatasetMapper(cfg, True))
54 |         elif cfg.MODE == "d2":
55 |             return build_detection_train_loader_with_attributes(cfg)
56 |         else:
57 |             raise Exception("detectron mode not supported: {}".format(cfg.MODE))
58 | 
59 |     def run_step(self):
60 |         """
61 |         !!Hack!! for the run_step method in SimpleTrainer to adjust the loss
62 |         """
63 |         assert self.model.training, "[Trainer] model was changed to eval mode!"
64 |         start = time.perf_counter()
65 |         data = next(self._data_loader_iter)
66 |         data_time = time.perf_counter() - start
67 |         loss_dict = self.model(data)
68 |         # RPN box loss:
69 |         loss_dict["loss_rpn_loc"] *= self.rpn_box_lw
70 |         # R-CNN box loss:
71 |         loss_dict["loss_box_reg"] *= self.rcnn_box_lw
72 |         losses = sum(loss_dict.values())
73 |         self._detect_anomaly(losses, loss_dict)
74 | 
75 |         metrics_dict = loss_dict
76 |         metrics_dict["data_time"] = data_time
77 |         self._write_metrics(metrics_dict)
78 |         self.optimizer.zero_grad()
79 |         losses.backward()
80 |         self.optimizer.step()
81 | 
82 | def setup(args):
83 |     """
84 |     Create configs and perform basic setups.
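    Illustrative commands (assumed from the options defined in opts.py and the
    configs shipped under configs/, not copied from the project README):

        # train a detectron2-style model from an ImageNet-pretrained backbone
        python3 train_net.py --mode d2 --config-file configs/d2/train-d2-r50.yaml --num-gpus 4

        # evaluation only, loading the weights referenced by MODEL.WEIGHTS in the config
        python3 train_net.py --mode d2 --config-file configs/d2/test-d2-r50.yaml --num-gpus 4 --eval-only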
85 | """ 86 | cfg = get_cfg() 87 | add_config(args, cfg) 88 | cfg.merge_from_file(args.config_file) 89 | cfg.merge_from_list(args.opts) 90 | cfg.MODE = args.mode 91 | cfg.freeze() 92 | default_setup(cfg, args) 93 | return cfg 94 | 95 | 96 | def main(args): 97 | cfg = setup(args) 98 | 99 | if args.eval_only: 100 | model = Trainer.build_model(cfg) 101 | DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( 102 | cfg.MODEL.WEIGHTS, resume=args.resume 103 | ) 104 | res = Trainer.test(cfg, model) 105 | if comm.is_main_process(): 106 | verify_results(cfg, res) 107 | return res 108 | 109 | trainer = Trainer(cfg) 110 | trainer.resume_or_load(resume=args.resume) 111 | return trainer.train() 112 | 113 | 114 | if __name__ == "__main__": 115 | args = parse_opt().parse_args() 116 | print("Command Line Args:", args) 117 | launch( 118 | main, 119 | args.num_gpus, 120 | num_machines=args.num_machines, 121 | machine_rank=args.machine_rank, 122 | dist_url=args.dist_url, 123 | args=(args,), 124 | ) 125 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import save_features 2 | from .extract_features_faster import extract_feat_faster_start 3 | from .extract_features_multigpu import extract_feat_multigpu_start 4 | from .extract_features_singlegpu import extract_feat_singlegpu_start 5 | from .extract_d2features import extract_feat_d2_start -------------------------------------------------------------------------------- /utils/extract_features_faster.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | # pylint: disable=no-member 3 | """ 4 | TridentNet Training Script. 5 | 6 | This script is a simplified version of the training script in detectron2/tools. 7 | """ 8 | import argparse 9 | import os 10 | import sys 11 | import torch 12 | # import tqdm 13 | import cv2 14 | import numpy as np 15 | sys.path.append('detectron2') 16 | 17 | import detectron2.utils.comm as comm 18 | from detectron2.checkpoint import DetectionCheckpointer 19 | from detectron2.data import build_detection_test_loader, build_detection_train_loader 20 | from detectron2.config import get_cfg 21 | from detectron2.engine import DefaultTrainer, default_setup, launch 22 | from detectron2.evaluation import COCOEvaluator, verify_results 23 | from detectron2.structures import Instances 24 | 25 | from utils.utils import mkdir, save_features 26 | from utils.extract_utils import get_image_blob, save_bbox, save_roi_features_by_bbox, save_roi_features 27 | from utils.progress_bar import ProgressBar 28 | from bua import add_config 29 | from bua.caffe.modeling.box_regression import BUABoxes 30 | 31 | import ray 32 | from ray.actor import ActorHandle 33 | 34 | """ 35 | add ray to generate_npz 36 | """ 37 | def switch_extract_mode(mode): 38 | if mode == 'roi_feats': 39 | switch_cmd = ['MODEL.BUA.EXTRACTOR.MODE', 1] 40 | elif mode == 'bboxes': 41 | switch_cmd = ['MODEL.BUA.EXTRACTOR.MODE', 2] 42 | elif mode == 'bbox_feats': 43 | switch_cmd = ['MODEL.BUA.EXTRACTOR.MODE', 3, 'MODEL.PROPOSAL_GENERATOR.NAME', 'PrecomputedProposals'] 44 | else: 45 | print('Wrong extract mode! 
') 46 | exit() 47 | return switch_cmd 48 | 49 | def set_min_max_boxes(min_max_boxes): 50 | if min_max_boxes == 'min_max_default': 51 | return [] 52 | try: 53 | min_boxes = int(min_max_boxes.split(',')[0]) 54 | max_boxes = int(min_max_boxes.split(',')[1]) 55 | except: 56 | print('Illegal min-max boxes setting, using config default. ') 57 | return [] 58 | cmd = ['MODEL.BUA.EXTRACTOR.MIN_BOXES', min_boxes, 59 | 'MODEL.BUA.EXTRACTOR.MAX_BOXES', max_boxes] 60 | return cmd 61 | 62 | def setup(args): 63 | """ 64 | Create configs and perform basic setups. 65 | """ 66 | cfg = get_cfg() 67 | add_config(args, cfg) 68 | cfg.merge_from_file(args.config_file) 69 | cfg.merge_from_list(args.opts) 70 | cfg.merge_from_list(['MODEL.BUA.EXTRACT_FEATS',True]) 71 | cfg.merge_from_list(switch_extract_mode(args.extract_mode)) 72 | cfg.merge_from_list(set_min_max_boxes(args.min_max_boxes)) 73 | cfg.freeze() 74 | default_setup(cfg, args) 75 | return cfg 76 | 77 | @ray.remote 78 | def generate_npz(extract_mode, pba: ActorHandle, *args): 79 | if extract_mode == 1: 80 | save_roi_features(*args) 81 | elif extract_mode == 2: 82 | save_bbox(*args) 83 | elif extract_mode == 3: 84 | save_roi_features_by_bbox(*args) 85 | else: 86 | print('Invalid Extract Mode! ') 87 | pba.update.remote(1) 88 | 89 | @ray.remote(num_gpus=1) 90 | def extract_feat_faster(split_idx, img_list, cfg, args, actor: ActorHandle): 91 | num_images = len(img_list) 92 | print('Number of images on split{}: {}.'.format(split_idx, num_images)) 93 | 94 | model = DefaultTrainer.build_model(cfg) 95 | DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( 96 | cfg.MODEL.WEIGHTS, resume=args.resume 97 | ) 98 | model.eval() 99 | 100 | generate_npz_list = [] 101 | for im_file in (img_list): 102 | if os.path.exists(os.path.join(args.output_dir, im_file.split('.')[0]+'.npz')): 103 | actor.update.remote(1) 104 | continue 105 | im = cv2.imread(os.path.join(args.image_dir, im_file)) 106 | if im is None: 107 | print(os.path.join(args.image_dir, im_file), "is illegal!") 108 | actor.update.remote(1) 109 | continue 110 | dataset_dict = get_image_blob(im, cfg.MODEL.PIXEL_MEAN) 111 | # extract roi features 112 | if cfg.MODEL.BUA.EXTRACTOR.MODE == 1: 113 | attr_scores = None 114 | with torch.set_grad_enabled(False): 115 | if cfg.MODEL.BUA.ATTRIBUTE_ON: 116 | boxes, scores, features_pooled, attr_scores = model([dataset_dict]) 117 | else: 118 | boxes, scores, features_pooled = model([dataset_dict]) 119 | boxes = [box.tensor.cpu() for box in boxes] 120 | scores = [score.cpu() for score in scores] 121 | features_pooled = [feat.cpu() for feat in features_pooled] 122 | if not attr_scores is None: 123 | attr_scores = [attr_score.cpu() for attr_score in attr_scores] 124 | generate_npz_list.append(generate_npz.remote(1, actor, 125 | args, cfg, im_file, im, dataset_dict, 126 | boxes, scores, features_pooled, attr_scores)) 127 | # extract bbox only 128 | elif cfg.MODEL.BUA.EXTRACTOR.MODE == 2: 129 | with torch.set_grad_enabled(False): 130 | boxes, scores = model([dataset_dict]) 131 | boxes = [box.cpu() for box in boxes] 132 | scores = [score.cpu() for score in scores] 133 | generate_npz_list.append(generate_npz.remote(2, actor, 134 | args, cfg, im_file, im, dataset_dict, 135 | boxes, scores)) 136 | # extract roi features by bbox 137 | elif cfg.MODEL.BUA.EXTRACTOR.MODE == 3: 138 | if not os.path.exists(os.path.join(args.bbox_dir, im_file.split('.')[0]+'.npz')): 139 | actor.update.remote(1) 140 | continue 141 | bbox = torch.from_numpy(np.load(os.path.join(args.bbox_dir, 
im_file.split('.')[0]+'.npz'))['bbox']) * dataset_dict['im_scale'] 142 | proposals = Instances(dataset_dict['image'].shape[-2:]) 143 | proposals.proposal_boxes = BUABoxes(bbox) 144 | dataset_dict['proposals'] = proposals 145 | 146 | attr_scores = None 147 | with torch.set_grad_enabled(False): 148 | if cfg.MODEL.BUA.ATTRIBUTE_ON: 149 | boxes, scores, features_pooled, attr_scores = model([dataset_dict]) 150 | else: 151 | boxes, scores, features_pooled = model([dataset_dict]) 152 | boxes = [box.tensor.cpu() for box in boxes] 153 | scores = [score.cpu() for score in scores] 154 | features_pooled = [feat.cpu() for feat in features_pooled] 155 | if not attr_scores is None: 156 | attr_scores = [attr_score.data.cpu() for attr_score in attr_scores] 157 | generate_npz_list.append(generate_npz.remote(3, actor, 158 | args, cfg, im_file, im, dataset_dict, 159 | boxes, scores, features_pooled, attr_scores)) 160 | 161 | ray.get(generate_npz_list) 162 | 163 | 164 | def main(): 165 | parser = argparse.ArgumentParser(description="PyTorch Object Detection2 Inference") 166 | parser.add_argument( 167 | "--config-file", 168 | default="configs/caffe/test-caffe-r101.yaml", 169 | metavar="FILE", 170 | help="path to config file", 171 | ) 172 | 173 | parser.add_argument('--num-cpus', default=1, type=int, 174 | help='number of cpus to use for ray, 0 means no limit') 175 | 176 | parser.add_argument('--gpus', dest='gpu_id', help='GPU id(s) to use', 177 | default='0', type=str) 178 | 179 | parser.add_argument("--mode", default="caffe", type=str, help="'caffe' and 'd2' indicates \ 180 | 'use caffe model' and 'use detectron2 model'respectively") 181 | 182 | parser.add_argument('--extract-mode', default='roi_feats', type=str, 183 | help="'roi_feats', 'bboxes' and 'bbox_feats' indicates \ 184 | 'extract roi features directly', 'extract bboxes only' and \ 185 | 'extract roi features with pre-computed bboxes' respectively") 186 | 187 | parser.add_argument('--min-max-boxes', default='min_max_default', type=str, 188 | help='the number of min-max boxes of extractor') 189 | 190 | parser.add_argument('--out-dir', dest='output_dir', 191 | help='output directory for features', 192 | default="features") 193 | parser.add_argument('--image-dir', dest='image_dir', 194 | help='directory with images', 195 | default="image") 196 | parser.add_argument('--bbox-dir', dest='bbox_dir', 197 | help='directory with bbox', 198 | default="bbox") 199 | parser.add_argument( 200 | "--resume", 201 | action="store_true", 202 | help="whether to attempt to resume from the checkpoint directory", 203 | ) 204 | parser.add_argument( 205 | "opts", 206 | help="Modify config options using the command-line", 207 | default=None, 208 | nargs=argparse.REMAINDER, 209 | ) 210 | 211 | args = parser.parse_args() 212 | 213 | cfg = setup(args) 214 | extract_feat_faster_start(args,cfg) 215 | 216 | def extract_feat_faster_start(args,cfg): 217 | os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_id 218 | num_gpus = len(args.gpu_id.split(',')) 219 | 220 | MIN_BOXES = cfg.MODEL.BUA.EXTRACTOR.MIN_BOXES 221 | MAX_BOXES = cfg.MODEL.BUA.EXTRACTOR.MAX_BOXES 222 | CONF_THRESH = cfg.MODEL.BUA.EXTRACTOR.CONF_THRESH 223 | 224 | # Extract features. 
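    # Every file in args.image_dir is queued below; images whose <name>.npz already
    # exists in args.output_dir are skipped inside extract_feat_faster(). The list is
    # split round-robin across GPUs (imglist[i::num_gpus]), each split runs as one
    # ray task, and the ProgressBar actor counts images as they finish.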
225 | imglist = os.listdir(args.image_dir) 226 | num_images = len(imglist) 227 | print('Number of images: {}.'.format(num_images)) 228 | 229 | if args.num_cpus != 0: 230 | ray.init(num_cpus=args.num_cpus) 231 | else: 232 | ray.init() 233 | img_lists = [imglist[i::num_gpus] for i in range(num_gpus)] 234 | 235 | pb = ProgressBar(len(imglist)) 236 | actor = pb.actor 237 | 238 | print('Number of GPUs: {}.'.format(num_gpus)) 239 | extract_feat_list = [] 240 | for i in range(num_gpus): 241 | extract_feat_list.append(extract_feat_faster.remote(i, img_lists[i], cfg, args, actor)) 242 | 243 | pb.print_until_done() 244 | ray.get(extract_feat_list) 245 | ray.get(actor.get_counter.remote()) 246 | 247 | if __name__ == "__main__": 248 | main() -------------------------------------------------------------------------------- /utils/extract_features_multigpu.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | # pylint: disable=no-member 3 | """ 4 | TridentNet Training Script. 5 | 6 | This script is a simplified version of the training script in detectron2/tools. 7 | """ 8 | import argparse 9 | import os 10 | import sys 11 | import torch 12 | # import tqdm 13 | import cv2 14 | import numpy as np 15 | sys.path.append('detectron2') 16 | 17 | import detectron2.utils.comm as comm 18 | from detectron2.checkpoint import DetectionCheckpointer 19 | from detectron2.data import build_detection_test_loader, build_detection_train_loader 20 | from detectron2.config import get_cfg 21 | from detectron2.engine import DefaultTrainer, default_setup, launch 22 | from detectron2.evaluation import COCOEvaluator, verify_results 23 | from detectron2.structures import Instances 24 | 25 | from utils.utils import mkdir, save_features 26 | from utils.extract_utils import get_image_blob, save_bbox, save_roi_features_by_bbox, save_roi_features 27 | from utils.progress_bar import ProgressBar 28 | from bua import add_config 29 | from bua.caffe.modeling.box_regression import BUABoxes 30 | from torch.nn import functional as F 31 | from detectron2.modeling import postprocessing 32 | 33 | import ray 34 | from ray.actor import ActorHandle 35 | """ 36 | use ray to accelerate multi gpu 37 | """ 38 | def switch_extract_mode(mode): 39 | if mode == 'roi_feats': 40 | switch_cmd = ['MODEL.BUA.EXTRACTOR.MODE', 1] 41 | elif mode == 'bboxes': 42 | switch_cmd = ['MODEL.BUA.EXTRACTOR.MODE', 2] 43 | elif mode == 'bbox_feats': 44 | switch_cmd = ['MODEL.BUA.EXTRACTOR.MODE', 3, 'MODEL.PROPOSAL_GENERATOR.NAME', 'PrecomputedProposals'] 45 | else: 46 | print('Wrong extract mode! ') 47 | exit() 48 | return switch_cmd 49 | 50 | def set_min_max_boxes(min_max_boxes): 51 | if min_max_boxes == 'min_max_default': 52 | return [] 53 | try: 54 | min_boxes = int(min_max_boxes.split(',')[0]) 55 | max_boxes = int(min_max_boxes.split(',')[1]) 56 | except: 57 | print('Illegal min-max boxes setting, using config default. ') 58 | return [] 59 | cmd = ['MODEL.BUA.EXTRACTOR.MIN_BOXES', min_boxes, 60 | 'MODEL.BUA.EXTRACTOR.MAX_BOXES', max_boxes] 61 | return cmd 62 | 63 | def setup(args): 64 | """ 65 | Create configs and perform basic setups. 
66 | """ 67 | cfg = get_cfg() 68 | add_config(args, cfg) 69 | cfg.merge_from_file(args.config_file) 70 | cfg.merge_from_list(args.opts) 71 | cfg.merge_from_list(['MODEL.BUA.EXTRACT_FEATS',True]) 72 | cfg.merge_from_list(switch_extract_mode(args.extract_mode)) 73 | cfg.merge_from_list(set_min_max_boxes(args.min_max_boxes)) 74 | cfg.freeze() 75 | default_setup(cfg, args) 76 | return cfg 77 | 78 | def generate_npz(extract_mode, *args): 79 | if extract_mode == 1: 80 | save_roi_features(*args) 81 | elif extract_mode == 2: 82 | save_bbox(*args) 83 | elif extract_mode == 3: 84 | save_roi_features_by_bbox(*args) 85 | else: 86 | print('Invalid Extract Mode! ') 87 | 88 | @ray.remote(num_gpus=1) 89 | def extract_feat_multigpu(split_idx, img_list, cfg, args, actor: ActorHandle): # NOTE ray 90 | num_images = len(img_list) 91 | print('Number of images on split{}: {}.'.format(split_idx, num_images)) 92 | 93 | model = DefaultTrainer.build_model(cfg) 94 | DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( 95 | cfg.MODEL.WEIGHTS, resume=args.resume 96 | ) 97 | model.eval() 98 | 99 | for im_file in (img_list): 100 | if os.path.exists(os.path.join(args.output_dir, im_file.split('.')[0]+'.npz')): 101 | actor.update.remote(1) # NOTE ray 102 | continue 103 | im = cv2.imread(os.path.join(args.image_dir, im_file)) 104 | if im is None: 105 | print(os.path.join(args.image_dir, im_file), "is illegal!") 106 | actor.update.remote(1) # NOTE ray 107 | continue 108 | dataset_dict = get_image_blob(im, cfg.MODEL.PIXEL_MEAN) 109 | # extract roi features 110 | if cfg.MODEL.BUA.EXTRACTOR.MODE == 1: 111 | attr_scores = None 112 | with torch.set_grad_enabled(False): 113 | if cfg.MODEL.BUA.ATTRIBUTE_ON: 114 | boxes, scores, features_pooled, attr_scores = model([dataset_dict]) 115 | else: 116 | boxes, scores, features_pooled = model([dataset_dict]) 117 | boxes = [box.tensor.cpu() for box in boxes] 118 | scores = [score.cpu() for score in scores] 119 | features_pooled = [feat.cpu() for feat in features_pooled] 120 | if not attr_scores is None: 121 | attr_scores = [attr_score.cpu() for attr_score in attr_scores] 122 | generate_npz(1, 123 | args, cfg, im_file, im, dataset_dict, 124 | boxes, scores, features_pooled, attr_scores) 125 | # extract bbox only 126 | elif cfg.MODEL.BUA.EXTRACTOR.MODE == 2: 127 | with torch.set_grad_enabled(False): 128 | boxes, scores = model([dataset_dict]) 129 | boxes = [box.cpu() for box in boxes] 130 | scores = [score.cpu() for score in scores] 131 | generate_npz(2, 132 | args, cfg, im_file, im, dataset_dict, 133 | boxes, scores) 134 | # extract roi features by bbox 135 | elif cfg.MODEL.BUA.EXTRACTOR.MODE == 3: 136 | if not os.path.exists(os.path.join(args.bbox_dir, im_file.split('.')[0]+'.npz')): 137 | actor.update.remote(1) # NOTE ray 138 | continue 139 | bbox = torch.from_numpy(np.load(os.path.join(args.bbox_dir, im_file.split('.')[0]+'.npz'))['bbox']) * dataset_dict['im_scale'] 140 | proposals = Instances(dataset_dict['image'].shape[-2:]) 141 | proposals.proposal_boxes = BUABoxes(bbox) 142 | dataset_dict['proposals'] = proposals 143 | 144 | attr_scores = None 145 | with torch.set_grad_enabled(False): 146 | if cfg.MODEL.BUA.ATTRIBUTE_ON: 147 | boxes, scores, features_pooled, attr_scores = model([dataset_dict]) 148 | else: 149 | boxes, scores, features_pooled = model([dataset_dict]) 150 | boxes = [box.tensor.cpu() for box in boxes] 151 | scores = [score.cpu() for score in scores] 152 | features_pooled = [feat.cpu() for feat in features_pooled] 153 | if not attr_scores is None: 154 | 
attr_scores = [attr_score.data.cpu() for attr_score in attr_scores] 155 | generate_npz(3, 156 | args, cfg, im_file, im, dataset_dict, 157 | boxes, scores, features_pooled, attr_scores) 158 | 159 | actor.update.remote(1) # NOTE ray 160 | 161 | 162 | def main(): 163 | parser = argparse.ArgumentParser(description="PyTorch Object Detection2 Inference") 164 | parser.add_argument( 165 | "--config-file", 166 | default="configs/caffe/test-caffe-r101.yaml", 167 | metavar="FILE", 168 | help="path to config file", 169 | ) 170 | 171 | parser.add_argument('--num-cpus', default=1, type=int, 172 | help='number of cpus to use for ray, 0 means no limit') 173 | 174 | parser.add_argument('--gpus', dest='gpu_id', help='GPU id(s) to use', 175 | default='0', type=str) 176 | 177 | parser.add_argument("--mode", default="caffe", type=str, help="'caffe' and 'd2' indicates \ 178 | 'use caffe model' and 'use detectron2 model'respectively") 179 | 180 | parser.add_argument('--extract-mode', default='roi_feats', type=str, 181 | help="'roi_feats', 'bboxes' and 'bbox_feats' indicates \ 182 | 'extract roi features directly', 'extract bboxes only' and \ 183 | 'extract roi features with pre-computed bboxes' respectively") 184 | 185 | parser.add_argument('--min-max-boxes', default='min_max_default', type=str, 186 | help='the number of min-max boxes of extractor') 187 | 188 | parser.add_argument('--out-dir', dest='output_dir', 189 | help='output directory for features', 190 | default="features") 191 | parser.add_argument('--image-dir', dest='image_dir', 192 | help='directory with images', 193 | default="image") 194 | parser.add_argument('--bbox-dir', dest='bbox_dir', 195 | help='directory with bbox', 196 | default="bbox") 197 | parser.add_argument( 198 | "--resume", 199 | action="store_true", 200 | help="whether to attempt to resume from the checkpoint directory", 201 | ) 202 | parser.add_argument( 203 | "opts", 204 | help="Modify config options using the command-line", 205 | default=None, 206 | nargs=argparse.REMAINDER, 207 | ) 208 | 209 | args = parser.parse_args() 210 | 211 | cfg = setup(args) 212 | extract_feat_multigpu_start(args,cfg) 213 | 214 | def extract_feat_multigpu_start(args,cfg): 215 | os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_id 216 | num_gpus = len(args.gpu_id.split(',')) 217 | 218 | MIN_BOXES = cfg.MODEL.BUA.EXTRACTOR.MIN_BOXES 219 | MAX_BOXES = cfg.MODEL.BUA.EXTRACTOR.MAX_BOXES 220 | CONF_THRESH = cfg.MODEL.BUA.EXTRACTOR.CONF_THRESH 221 | 222 | # Extract features. 
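    # The images found in --image-dir are split round-robin across the visible
    # GPUs (imglist[i::num_gpus]); one Ray worker per GPU runs
    # extract_feat_multigpu, and the shared ProgressBarActor reports aggregate
    # progress on the driver until every image has been handled.
    #
    # Illustrative invocation (a sketch only; it assumes the repository root is
    # on PYTHONPATH and that the config file points at valid weights):
    #   python utils/extract_features_multigpu.py \
    #       --config-file configs/caffe/test-caffe-r101.yaml \
    #       --extract-mode roi_feats --min-max-boxes 10,100 \
    #       --image-dir datasets/demo --out-dir features --gpus 0,1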
223 | imglist = os.listdir(args.image_dir) 224 | num_images = len(imglist) 225 | print('Number of images: {}.'.format(num_images)) 226 | 227 | # ray 228 | if args.num_cpus != 0: 229 | ray.init(num_cpus=args.num_cpus) 230 | else: 231 | ray.init() 232 | img_lists = [imglist[i::num_gpus] for i in range(num_gpus)] 233 | 234 | # ray 235 | pb = ProgressBar(len(imglist)) 236 | actor = pb.actor 237 | 238 | print('Number of GPUs: {}.'.format(num_gpus)) 239 | # for i in range(num_gpus): 240 | # extract_feat(i, img_lists[i], cfg, args) 241 | 242 | extract_feat_list = [] 243 | for i in range(num_gpus): 244 | extract_feat_list.append(extract_feat_multigpu.remote(i, img_lists[i], cfg, args, actor)) 245 | 246 | pb.print_until_done() 247 | ray.get(extract_feat_list) 248 | ray.get(actor.get_counter.remote()) 249 | 250 | 251 | if __name__ == "__main__": 252 | main() 253 | -------------------------------------------------------------------------------- /utils/extract_features_singlegpu.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | # pylint: disable=no-member 3 | """ 4 | TridentNet Training Script. 5 | 6 | This script is a simplified version of the training script in detectron2/tools. 7 | """ 8 | import argparse 9 | import os 10 | import sys 11 | import torch 12 | # import tqdm 13 | import cv2 14 | import numpy as np 15 | sys.path.append('detectron2') 16 | 17 | import detectron2.utils.comm as comm 18 | from detectron2.checkpoint import DetectionCheckpointer 19 | from detectron2.data import build_detection_test_loader, build_detection_train_loader 20 | from detectron2.config import get_cfg 21 | from detectron2.engine import DefaultTrainer, default_setup, launch 22 | from detectron2.evaluation import COCOEvaluator, verify_results 23 | from detectron2.structures import Instances 24 | 25 | from utils.utils import mkdir, save_features 26 | from utils.extract_utils import get_image_blob, save_bbox, save_roi_features_by_bbox, save_roi_features 27 | from utils.progress_bar import ProgressBar 28 | from bua import add_config 29 | from bua.caffe.modeling.box_regression import BUABoxes 30 | from torch.nn import functional as F 31 | from detectron2.modeling import postprocessing 32 | 33 | def switch_extract_mode(mode): 34 | if mode == 'roi_feats': 35 | switch_cmd = ['MODEL.BUA.EXTRACTOR.MODE', 1] 36 | elif mode == 'bboxes': 37 | switch_cmd = ['MODEL.BUA.EXTRACTOR.MODE', 2] 38 | elif mode == 'bbox_feats': 39 | switch_cmd = ['MODEL.BUA.EXTRACTOR.MODE', 3, 'MODEL.PROPOSAL_GENERATOR.NAME', 'PrecomputedProposals'] 40 | else: 41 | print('Wrong extract mode! ') 42 | exit() 43 | return switch_cmd 44 | 45 | def set_min_max_boxes(min_max_boxes): 46 | if min_max_boxes == 'min_max_default': 47 | return [] 48 | try: 49 | min_boxes = int(min_max_boxes.split(',')[0]) 50 | max_boxes = int(min_max_boxes.split(',')[1]) 51 | except: 52 | print('Illegal min-max boxes setting, using config default. ') 53 | return [] 54 | cmd = ['MODEL.BUA.EXTRACTOR.MIN_BOXES', min_boxes, 55 | 'MODEL.BUA.EXTRACTOR.MAX_BOXES', max_boxes] 56 | return cmd 57 | 58 | def setup(args): 59 | """ 60 | Create configs and perform basic setups. 
61 | """ 62 | cfg = get_cfg() 63 | add_config(args, cfg) 64 | cfg.merge_from_file(args.config_file) 65 | cfg.merge_from_list(args.opts) 66 | cfg.merge_from_list(['MODEL.BUA.EXTRACT_FEATS',True]) 67 | cfg.merge_from_list(switch_extract_mode(args.extract_mode)) 68 | cfg.merge_from_list(set_min_max_boxes(args.min_max_boxes)) 69 | cfg.freeze() 70 | default_setup(cfg, args) 71 | return cfg 72 | 73 | def generate_npz(extract_mode, *args): 74 | if extract_mode == 1: 75 | save_roi_features(*args) 76 | elif extract_mode == 2: 77 | save_bbox(*args) 78 | elif extract_mode == 3: 79 | save_roi_features_by_bbox(*args) 80 | else: 81 | print('Invalid Extract Mode! ') 82 | 83 | def extract_feat_singlegpu(split_idx, img_list, cfg, args): 84 | num_images = len(img_list) 85 | print('Number of images on split{}: {}.'.format(split_idx, num_images)) 86 | 87 | model = DefaultTrainer.build_model(cfg) 88 | DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( 89 | cfg.MODEL.WEIGHTS, resume=args.resume 90 | ) 91 | model.eval() 92 | 93 | for im_file in (img_list): 94 | if os.path.exists(os.path.join(args.output_dir, im_file.split('.')[0]+'.npz')): 95 | continue 96 | im = cv2.imread(os.path.join(args.image_dir, im_file)) 97 | if im is None: 98 | print(os.path.join(args.image_dir, im_file), "is illegal!") 99 | continue 100 | dataset_dict = get_image_blob(im, cfg.MODEL.PIXEL_MEAN) 101 | # extract roi features 102 | if cfg.MODEL.BUA.EXTRACTOR.MODE == 1: 103 | attr_scores = None 104 | with torch.set_grad_enabled(False): 105 | if cfg.MODEL.BUA.ATTRIBUTE_ON: 106 | boxes, scores, features_pooled, attr_scores = model([dataset_dict]) # caffe mode 107 | else: 108 | boxes, scores, features_pooled = model([dataset_dict]) 109 | boxes = [box.tensor.cpu() for box in boxes] 110 | scores = [score.cpu() for score in scores] 111 | features_pooled = [feat.cpu() for feat in features_pooled] 112 | if not attr_scores is None: 113 | attr_scores = [attr_score.cpu() for attr_score in attr_scores] 114 | generate_npz(1, 115 | args, cfg, im_file, im, dataset_dict, 116 | boxes, scores, features_pooled, attr_scores) 117 | # extract bbox only 118 | elif cfg.MODEL.BUA.EXTRACTOR.MODE == 2: 119 | with torch.set_grad_enabled(False): 120 | boxes, scores = model([dataset_dict]) 121 | boxes = [box.cpu() for box in boxes] 122 | scores = [score.cpu() for score in scores] 123 | generate_npz(2, 124 | args, cfg, im_file, im, dataset_dict, 125 | boxes, scores) 126 | # extract roi features by bbox 127 | elif cfg.MODEL.BUA.EXTRACTOR.MODE == 3: 128 | if not os.path.exists(os.path.join(args.bbox_dir, im_file.split('.')[0]+'.npz')): 129 | continue 130 | bbox = torch.from_numpy(np.load(os.path.join(args.bbox_dir, im_file.split('.')[0]+'.npz'))['bbox']) * dataset_dict['im_scale'] 131 | proposals = Instances(dataset_dict['image'].shape[-2:]) 132 | proposals.proposal_boxes = BUABoxes(bbox) 133 | dataset_dict['proposals'] = proposals 134 | 135 | attr_scores = None 136 | with torch.set_grad_enabled(False): 137 | if cfg.MODEL.BUA.ATTRIBUTE_ON: 138 | boxes, scores, features_pooled, attr_scores = model([dataset_dict]) 139 | else: 140 | boxes, scores, features_pooled = model([dataset_dict]) 141 | boxes = [box.tensor.cpu() for box in boxes] 142 | scores = [score.cpu() for score in scores] 143 | features_pooled = [feat.cpu() for feat in features_pooled] 144 | if not attr_scores is None: 145 | attr_scores = [attr_score.data.cpu() for attr_score in attr_scores] 146 | generate_npz(3, 147 | args, cfg, im_file, im, dataset_dict, 148 | boxes, scores, features_pooled, 
attr_scores) 149 | 150 | 151 | def main(): 152 | parser = argparse.ArgumentParser(description="PyTorch Object Detection2 Inference") 153 | parser.add_argument( 154 | "--config-file", 155 | default="configs/caffe/test-caffe-r101.yaml", 156 | metavar="FILE", 157 | help="path to config file", 158 | ) 159 | 160 | parser.add_argument('--num-cpus', default=1, type=int, 161 | help='number of cpus to use for ray, 0 means no limit') 162 | 163 | parser.add_argument('--gpus', dest='gpu_id', help='GPU id(s) to use', 164 | default='0', type=str) 165 | 166 | parser.add_argument("--mode", default="caffe", type=str, help="'caffe' and 'd2' indicates \ 167 | 'use caffe model' and 'use detectron2 model'respectively") 168 | 169 | parser.add_argument('--extract-mode', default='roi_feats', type=str, 170 | help="'roi_feats', 'bboxes' and 'bbox_feats' indicates \ 171 | 'extract roi features directly', 'extract bboxes only' and \ 172 | 'extract roi features with pre-computed bboxes' respectively") 173 | 174 | parser.add_argument('--min-max-boxes', default='min_max_default', type=str, 175 | help='the number of min-max boxes of extractor') 176 | 177 | parser.add_argument('--out-dir', dest='output_dir', 178 | help='output directory for features', 179 | default="features") 180 | parser.add_argument('--image-dir', dest='image_dir', 181 | help='directory with images', 182 | default="image") 183 | parser.add_argument('--bbox-dir', dest='bbox_dir', 184 | help='directory with bbox', 185 | default="bbox") 186 | parser.add_argument( 187 | "--resume", 188 | action="store_true", 189 | help="whether to attempt to resume from the checkpoint directory", 190 | ) 191 | parser.add_argument( 192 | "opts", 193 | help="Modify config options using the command-line", 194 | default=None, 195 | nargs=argparse.REMAINDER, 196 | ) 197 | 198 | args = parser.parse_args() 199 | 200 | cfg = setup(args) 201 | extract_feat_singlegpu_start(args,cfg) 202 | 203 | def extract_feat_singlegpu_start(args,cfg): 204 | os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_id 205 | num_gpus = len(args.gpu_id.split(',')) 206 | 207 | MIN_BOXES = cfg.MODEL.BUA.EXTRACTOR.MIN_BOXES 208 | MAX_BOXES = cfg.MODEL.BUA.EXTRACTOR.MAX_BOXES 209 | CONF_THRESH = cfg.MODEL.BUA.EXTRACTOR.CONF_THRESH 210 | 211 | # Extract features. 212 | imglist = os.listdir(args.image_dir) 213 | num_images = len(imglist) 214 | print('Number of images: {}.'.format(num_images)) 215 | 216 | img_lists = [imglist[i::num_gpus] for i in range(num_gpus)] 217 | 218 | print('Number of GPUs: {}.'.format(num_gpus)) 219 | for i in range(num_gpus): 220 | extract_feat_singlegpu(i, img_lists[i], cfg, args) 221 | 222 | 223 | if __name__ == "__main__": 224 | main() 225 | -------------------------------------------------------------------------------- /utils/extract_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import cv2 4 | import os 5 | 6 | from bua.caffe.modeling.layers.nms import nms 7 | from bua.caffe.modeling.box_regression import BUABoxes 8 | 9 | PIXEL_MEANS = np.array([[[102.9801, 115.9465, 122.7717]]]) 10 | TEST_SCALES = (600,) 11 | TEST_MAX_SIZE = 1000 12 | 13 | def im_list_to_blob(ims): 14 | """Convert a list of images into a network input. 15 | 16 | Assumes images are already prepared (means subtracted, BGR order, ...). 
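    Returns a float32 blob of shape (num_images, max_height, max_width, 3);
    images smaller than the maximum spatial size in the list are zero-padded
    on the bottom and right.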
17 | """ 18 | max_shape = np.array([im.shape for im in ims]).max(axis=0) 19 | num_images = len(ims) 20 | blob = np.zeros((num_images, max_shape[0], max_shape[1], 3), 21 | dtype=np.float32) 22 | for i in range(num_images): 23 | im = ims[i] 24 | blob[i, 0:im.shape[0], 0:im.shape[1], :] = im 25 | 26 | return blob 27 | 28 | def get_image_blob(im, pixel_means): 29 | """Converts an image into a network input. 30 | Arguments: 31 | im (ndarray): a color image 32 | Returns: 33 | blob (ndarray): a data blob holding an image pyramid 34 | im_scale_factors (list): list of image scales (relative to im) used 35 | in the image pyramid 36 | """ 37 | pixel_means = np.array([[pixel_means]]) 38 | dataset_dict = {} 39 | im_orig = im.astype(np.float32, copy=True) 40 | im_orig -= pixel_means 41 | 42 | im_shape = im_orig.shape 43 | im_size_min = np.min(im_shape[0:2]) 44 | im_size_max = np.max(im_shape[0:2]) 45 | 46 | for target_size in TEST_SCALES: 47 | im_scale = float(target_size) / float(im_size_min) 48 | # Prevent the biggest axis from being more than MAX_SIZE 49 | if np.round(im_scale * im_size_max) > TEST_MAX_SIZE: 50 | im_scale = float(TEST_MAX_SIZE) / float(im_size_max) 51 | im = cv2.resize(im_orig, None, None, fx=im_scale, fy=im_scale, 52 | interpolation=cv2.INTER_LINEAR) 53 | 54 | dataset_dict["image"] = torch.from_numpy(im).permute(2, 0, 1) 55 | dataset_dict["im_scale"] = im_scale 56 | 57 | return dataset_dict 58 | 59 | 60 | def save_roi_features(args, cfg, im_file, im, dataset_dict, boxes, scores, features_pooled, attr_scores=None): 61 | MIN_BOXES = cfg.MODEL.BUA.EXTRACTOR.MIN_BOXES 62 | MAX_BOXES = cfg.MODEL.BUA.EXTRACTOR.MAX_BOXES 63 | CONF_THRESH = cfg.MODEL.BUA.EXTRACTOR.CONF_THRESH 64 | 65 | dets = boxes[0] / dataset_dict['im_scale'] 66 | scores = scores[0] 67 | feats = features_pooled[0] 68 | 69 | max_conf = torch.zeros((scores.shape[0])).to(scores.device) 70 | for cls_ind in range(1, scores.shape[1]): 71 | cls_scores = scores[:, cls_ind] 72 | keep = nms(dets, cls_scores, 0.3) 73 | max_conf[keep] = torch.where(cls_scores[keep] > max_conf[keep], 74 | cls_scores[keep], 75 | max_conf[keep]) 76 | 77 | keep_boxes = torch.nonzero(max_conf >= CONF_THRESH).flatten() 78 | if len(keep_boxes) < MIN_BOXES: 79 | keep_boxes = torch.argsort(max_conf, descending=True)[:MIN_BOXES] 80 | elif len(keep_boxes) > MAX_BOXES: 81 | keep_boxes = torch.argsort(max_conf, descending=True)[:MAX_BOXES] 82 | image_feat = feats[keep_boxes] 83 | image_bboxes = dets[keep_boxes] 84 | image_objects_conf = np.max(scores[keep_boxes].numpy()[:,1:], axis=1) 85 | image_objects = np.argmax(scores[keep_boxes].numpy()[:,1:], axis=1) 86 | if not attr_scores is None: 87 | attr_scores = attr_scores[0] 88 | image_attrs_conf = np.max(attr_scores[keep_boxes].numpy()[:,1:], axis=1) 89 | image_attrs = np.argmax(attr_scores[keep_boxes].numpy()[:,1:], axis=1) 90 | info = { 91 | 'image_id': im_file.split('.')[0], 92 | 'image_h': np.size(im, 0), 93 | 'image_w': np.size(im, 1), 94 | 'num_boxes': len(keep_boxes), 95 | 'objects_id': image_objects, 96 | 'objects_conf': image_objects_conf, 97 | 'attrs_id': image_attrs, 98 | 'attrs_conf': image_attrs_conf, 99 | } 100 | else: 101 | info = { 102 | 'image_id': im_file.split('.')[0], 103 | 'image_h': np.size(im, 0), 104 | 'image_w': np.size(im, 1), 105 | 'num_boxes': len(keep_boxes), 106 | 'objects_id': image_objects, 107 | 'objects_conf': image_objects_conf 108 | } 109 | 110 | output_file = os.path.join(args.output_dir, im_file.split('.')[0]) 111 | np.savez_compressed(output_file, x=image_feat, 
bbox=image_bboxes, num_bbox=len(keep_boxes), image_h=np.size(im, 0), image_w=np.size(im, 1), info=info) 112 | 113 | def save_bbox(args, cfg, im_file, im, dataset_dict, boxes, scores): 114 | MIN_BOXES = cfg.MODEL.BUA.EXTRACTOR.MIN_BOXES 115 | MAX_BOXES = cfg.MODEL.BUA.EXTRACTOR.MAX_BOXES 116 | CONF_THRESH = cfg.MODEL.BUA.EXTRACTOR.CONF_THRESH 117 | 118 | scores = scores[0] 119 | boxes = boxes[0] 120 | num_classes = scores.shape[1] 121 | boxes = BUABoxes(boxes.reshape(-1, 4)) 122 | boxes.clip((dataset_dict['image'].shape[1]/dataset_dict['im_scale'], dataset_dict['image'].shape[2]/dataset_dict['im_scale'])) 123 | boxes = boxes.tensor.view(-1, num_classes*4) # R x C x 4 124 | 125 | cls_boxes = torch.zeros((boxes.shape[0], 4)) 126 | for idx in range(boxes.shape[0]): 127 | cls_idx = torch.argmax(scores[idx, 1:]) + 1 128 | cls_boxes[idx, :] = boxes[idx, cls_idx * 4:(cls_idx + 1) * 4] 129 | 130 | max_conf = torch.zeros((scores.shape[0])).to(scores.device) 131 | for cls_ind in range(1, num_classes): 132 | cls_scores = scores[:, cls_ind] 133 | keep = nms(cls_boxes, cls_scores, 0.3) 134 | max_conf[keep] = torch.where(cls_scores[keep] > max_conf[keep], 135 | cls_scores[keep], 136 | max_conf[keep]) 137 | 138 | keep_boxes = torch.argsort(max_conf, descending=True)[:MAX_BOXES] 139 | image_bboxes = cls_boxes[keep_boxes] 140 | 141 | output_file = os.path.join(args.output_dir, im_file.split('.')[0]) 142 | np.savez_compressed(output_file, bbox=image_bboxes, num_bbox=len(keep_boxes), image_h=np.size(im, 0), image_w=np.size(im, 1)) 143 | 144 | def save_roi_features_by_bbox(args, cfg, im_file, im, dataset_dict, boxes, scores, features_pooled, attr_scores=None): 145 | MIN_BOXES = cfg.MODEL.BUA.EXTRACTOR.MIN_BOXES 146 | MAX_BOXES = cfg.MODEL.BUA.EXTRACTOR.MAX_BOXES 147 | CONF_THRESH = cfg.MODEL.BUA.EXTRACTOR.CONF_THRESH 148 | dets = boxes[0] / dataset_dict['im_scale'] 149 | scores = scores[0] 150 | feats = features_pooled[0] 151 | keep_boxes = [i for i in range(scores.shape[0])] 152 | 153 | image_feat = feats[keep_boxes] 154 | image_bboxes = dets[keep_boxes] 155 | image_objects_conf = np.max(scores[keep_boxes].numpy()[:,1:], axis=1) 156 | image_objects = np.argmax(scores[keep_boxes].numpy()[:,1:], axis=1) 157 | if not attr_scores is None: 158 | attr_scores = attr_scores[0] 159 | image_attrs_conf = np.max(attr_scores[keep_boxes].numpy()[:,1:], axis=1) 160 | image_attrs = np.argmax(attr_scores[keep_boxes].numpy()[:,1:], axis=1) 161 | info = { 162 | 'image_id': im_file.split('.')[0], 163 | 'image_h': np.size(im, 0), 164 | 'image_w': np.size(im, 1), 165 | 'num_boxes': len(keep_boxes), 166 | 'objects_id': image_objects, 167 | 'objects_conf': image_objects_conf, 168 | 'attrs_id': image_attrs, 169 | 'attrs_conf': image_attrs_conf, 170 | } 171 | else: 172 | info = { 173 | 'image_id': im_file.split('.')[0], 174 | 'image_h': np.size(im, 0), 175 | 'image_w': np.size(im, 1), 176 | 'num_boxes': len(keep_boxes), 177 | 'objects_id': image_objects, 178 | 'objects_conf': image_objects_conf 179 | } 180 | 181 | output_file = os.path.join(args.output_dir, im_file.split('.')[0]) 182 | np.savez_compressed(output_file, x=image_feat, bbox=image_bboxes, num_bbox=len(keep_boxes), image_h=np.size(im, 0), image_w=np.size(im, 1), info=info) 183 | -------------------------------------------------------------------------------- /utils/extractor.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import logging 3 | import time 4 | import torch 5 | 6 | from contextlib import contextmanager 
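# This module provides a timed inference loop (inference_on_dataset) and an
# inference_context manager that puts a model into eval mode for the duration
# of a `with` block and restores its previous training mode afterwards.
#
# Illustrative use (a sketch only; DefaultTrainer and build_detection_test_loader
# come from detectron2 and are not imported by this module, and the dataset
# name is a placeholder):
#
#   model = DefaultTrainer.build_model(cfg)
#   data_loader = build_detection_test_loader(cfg, "my_test_dataset")
#   inference_on_dataset(model, data_loader)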
7 | 8 | def inference_on_dataset(model, data_loader): 9 | """ 10 | Run model on the data_loader and extract the features with extractor. 11 | The model will be used in eval mode. 12 | 13 | Args: 14 | model (nn.Module): a module which accepts an object from 15 | `data_loader` and returns some outputs. It will be temporarily set to `eval` mode. 16 | 17 | If you wish to extract a model in `training` mode instead, you can 18 | wrap the given model and override its behavior of `.eval()` and `.train()`. 19 | data_loader: an iterable object with a length. 20 | The elements it generates will be the inputs to the model. 21 | evaluator (DatasetEvaluator): the evaluator to run. Use 22 | :class:`DatasetEvaluators([])` if you only want to benchmark, but 23 | don't want to do any evaluation. 24 | 25 | Returns: 26 | The return value of `evaluator.evaluate()` 27 | """ 28 | num_devices = torch.distributed.get_world_size() if torch.distributed.is_initialized() else 1 29 | logger = logging.getLogger(__name__) 30 | logger.info("Start inference on {} images".format(len(data_loader))) 31 | 32 | total = len(data_loader) # inference data loader must have a fixed length 33 | 34 | logging_interval = 50 35 | num_warmup = min(5, logging_interval - 1, total - 1) 36 | start_time = time.time() 37 | total_compute_time = 0 38 | with inference_context(model), torch.no_grad(): 39 | for idx, inputs in enumerate(data_loader): 40 | if idx == num_warmup: 41 | start_time = time.time() 42 | total_compute_time = 0 43 | 44 | start_compute_time = time.time() 45 | outputs = model(inputs) 46 | torch.cuda.synchronize() 47 | total_compute_time += time.time() - start_compute_time 48 | if (idx + 1) % logging_interval == 0: 49 | duration = time.time() - start_time 50 | seconds_per_img = duration / (idx + 1 - num_warmup) 51 | eta = datetime.timedelta( 52 | seconds=int(seconds_per_img * (total - num_warmup) - duration) 53 | ) 54 | logger.info( 55 | "Inference done {}/{}. {:.4f} s / img. ETA={}".format( 56 | idx + 1, total, seconds_per_img, str(eta) 57 | ) 58 | ) 59 | 60 | # Measure the time only for this worker (before the synchronization barrier) 61 | total_time = int(time.time() - start_time) 62 | total_time_str = str(datetime.timedelta(seconds=total_time)) 63 | # NOTE this format is parsed by grep 64 | logger.info( 65 | "Total inference time: {} ({:.6f} s / img per device, on {} devices)".format( 66 | total_time_str, total_time / (total - num_warmup), num_devices 67 | ) 68 | ) 69 | total_compute_time_str = str(datetime.timedelta(seconds=int(total_compute_time))) 70 | logger.info( 71 | "Total inference pure compute time: {} ({:.6f} s / img per device, on {} devices)".format( 72 | total_compute_time_str, total_compute_time / (total - num_warmup), num_devices 73 | ) 74 | ) 75 | 76 | @contextmanager 77 | def inference_context(model): 78 | """ 79 | A context where the model is temporarily changed to eval mode, 80 | and restored to previous mode afterwards. 81 | 82 | Args: 83 | model: a torch Module 84 | """ 85 | training_mode = model.training 86 | model.eval() 87 | yield 88 | model.train(training_mode) -------------------------------------------------------------------------------- /utils/progress_bar.py: -------------------------------------------------------------------------------- 1 | """ 2 | Progress Bar for Ray Actors (tqdm) 3 | ================================== 4 | 5 | Tracking progress of distributed tasks can be tricky. 
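A plain ``ray.get`` on a list of object refs blocks until every task has
finished, so nothing is reported while work is still in flight.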
6 | 7 | This script will demonstrate how to implement a simple 8 | progress bar for a Ray actor to track progress across various 9 | different distributed components. 10 | 11 | Original source: `Link `_ 12 | 13 | Setup: Dependencies 14 | ------------------- 15 | 16 | First, import some dependencies. 17 | """ 18 | 19 | # Inspiration: https://github.com/honnibal/spacy-ray/pull/ 20 | # 1/files#diff-7ede881ddc3e8456b320afb958362b2aR12-R45 21 | from asyncio import Event 22 | from typing import Tuple 23 | from time import sleep 24 | 25 | import ray 26 | # For typing purposes 27 | from ray.actor import ActorHandle 28 | from tqdm import tqdm 29 | 30 | import os 31 | 32 | ############################################################ 33 | # This is the Ray "actor" that can be called from anywhere to update 34 | # our progress. You'll be using the `update` method. Don't 35 | # instantiate this class yourself. Instead, 36 | # it's something that you'll get from a `ProgressBar`. 37 | 38 | 39 | @ray.remote 40 | class ProgressBarActor: 41 | counter: int 42 | delta: int 43 | event: Event 44 | 45 | def __init__(self) -> None: 46 | self.counter = 0 47 | self.delta = 0 48 | self.event = Event() 49 | 50 | def update(self, num_items_completed: int) -> None: 51 | """Updates the ProgressBar with the incremental 52 | number of items that were just completed. 53 | """ 54 | self.counter += num_items_completed 55 | self.delta += num_items_completed 56 | self.event.set() 57 | 58 | async def wait_for_update(self) -> Tuple[int, int]: 59 | """Blocking call. 60 | 61 | Waits until somebody calls `update`, then returns a tuple of 62 | the number of updates since the last call to 63 | `wait_for_update`, and the total number of completed items. 64 | """ 65 | await self.event.wait() 66 | self.event.clear() 67 | saved_delta = self.delta 68 | self.delta = 0 69 | return saved_delta, self.counter 70 | 71 | def get_counter(self) -> int: 72 | """ 73 | Returns the total number of complete items. 74 | """ 75 | return self.counter 76 | 77 | 78 | ###################################################################### 79 | # This is where the progress bar starts. You create one of these 80 | # on the head node, passing in the expected total number of items, 81 | # and an optional string description. 82 | # Pass along the `actor` reference to any remote task, 83 | # and if they complete ten 84 | # tasks, they'll call `actor.update.remote(10)`. 85 | 86 | # Back on the local node, once you launch your remote Ray tasks, call 87 | # `print_until_done`, which will feed everything back into a `tqdm` counter. 88 | 89 | 90 | class ProgressBar: 91 | progress_actor: ActorHandle 92 | total: int 93 | description: str 94 | pbar: tqdm 95 | 96 | def __init__(self, total: int, description: str = ""): 97 | # Ray actors don't seem to play nice with mypy, generating 98 | # a spurious warning for the following line, 99 | # which we need to suppress. The code is fine. 100 | self.progress_actor = ProgressBarActor.remote() # type: ignore 101 | self.total = total 102 | self.description = description 103 | 104 | @property 105 | def actor(self) -> ActorHandle: 106 | """Returns a reference to the remote `ProgressBarActor`. 107 | 108 | When you complete tasks, call `update` on the actor. 109 | """ 110 | return self.progress_actor 111 | 112 | def print_until_done(self) -> None: 113 | """Blocking call. 114 | 115 | Do this after starting a series of remote Ray tasks, to which you've 116 | passed the actor handle. Each of them calls `update` on the actor. 
117 | When the progress meter reaches 100%, this method returns. 118 | """ 119 | pbar = tqdm(desc=self.description, total=self.total) 120 | while True: 121 | delta, counter = ray.get(self.actor.wait_for_update.remote()) 122 | pbar.update(delta) 123 | if counter >= self.total: 124 | pbar.close() 125 | return 126 | 127 | 128 | ################################################################# 129 | # This is an example of a task that increments the progress bar. 130 | # Note that this is a Ray Task, but it could very well 131 | # be any generic Ray Actor. 132 | # 133 | @ray.remote(num_gpus=1) 134 | def sleep_then_increment(i: int, pba: ActorHandle) -> int: 135 | print('ray.get_gpu_ids():', ray.get_gpu_ids()) 136 | print('CUDA_VISIBLE_DEVICES:', os.environ['CUDA_VISIBLE_DEVICES']) 137 | sleep(i / 2.0) 138 | pba.update.remote(1) 139 | return i 140 | 141 | 142 | ################################################################# 143 | # Now you can run it and see what happens! 144 | # 145 | 146 | 147 | def run(): 148 | ray.init() 149 | num_ticks = 6 150 | pb = ProgressBar(num_ticks) 151 | actor = pb.actor 152 | # You can replace this with any arbitrary Ray task/actor. 153 | tasks_pre_launch = [ 154 | sleep_then_increment.remote(i, actor) for i in range(0, num_ticks) 155 | ] 156 | 157 | pb.print_until_done() 158 | tasks = ray.get(tasks_pre_launch) 159 | 160 | tasks == list(range(num_ticks)) 161 | num_ticks == ray.get(actor.get_counter.remote()) 162 | 163 | 164 | # run() 165 | -------------------------------------------------------------------------------- /utils/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import errno 3 | import numpy as np 4 | import torch 5 | 6 | from detectron2.structures import Instances 7 | from bua.caffe.modeling.layers.nms import nms 8 | 9 | def save_features(output_file, features, boxes=None): 10 | if boxes is None: 11 | res = features 12 | np.save(output_file, res) 13 | else: 14 | np.savez(output_file, x=features, bbox=boxes) 15 | 16 | def mkdir(path): 17 | try: 18 | os.makedirs(path) 19 | except OSError as e: 20 | if e.errno != errno.EEXIST: 21 | raise 22 | 23 | def extractor_postprocess(boxes, scores, features_pooled, input_per_image, extractor): 24 | """ 25 | Resize the output instances. 26 | The input images are often resized when entering an object detector. 27 | As a result, we often need the outputs of the detector in a different 28 | resolution from its inputs. 29 | 30 | This function will resize the raw outputs of an R-CNN detector 31 | to produce outputs according to the desired output resolution. 32 | 33 | Args: 34 | results (Instances): the raw outputs from the detector. 35 | `results.image_size` contains the input image resolution the detector sees. 36 | This object might be modified in-place. 37 | output_height, output_width: the desired output resolution. 
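        Note: the parameters listed above do not match this function's
        signature. It actually receives the raw model outputs `boxes`, `scores`
        and `features_pooled`, together with `input_per_image` (a dict that must
        provide "im_scale") and `extractor` (the config node supplying
        MIN_BOXES, MAX_BOXES and CONF_THRESH), and it returns the tuple
        `(image_feat, image_bboxes)` rather than an Instances object.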
38 | 39 | Returns: 40 | Instances: the resized output from the model, based on the output resolution 41 | """ 42 | MIN_BOXES = extractor.MIN_BOXES 43 | MAX_BOXES = extractor.MAX_BOXES 44 | CONF_THRESH = extractor.CONF_THRESH 45 | 46 | cur_device = scores.device 47 | 48 | dets = boxes / input_per_image["im_scale"] 49 | 50 | max_conf = torch.zeros((scores.shape[0])).to(cur_device) 51 | 52 | for cls_ind in range(1, scores.shape[1]): 53 | cls_scores = scores[:, cls_ind] 54 | keep = nms(dets, cls_scores, 0.3) 55 | max_conf[keep] = torch.where(cls_scores[keep] > max_conf[keep], 56 | cls_scores[keep], 57 | max_conf[keep]) 58 | 59 | keep_boxes = torch.nonzero(max_conf >= CONF_THRESH).flatten() 60 | if len(keep_boxes) < MIN_BOXES: 61 | keep_boxes = torch.argsort(max_conf, descending=True)[:MIN_BOXES] 62 | elif len(keep_boxes) > MAX_BOXES: 63 | keep_boxes = torch.argsort(max_conf, descending=True)[:MAX_BOXES] 64 | image_feat = features_pooled[keep_boxes] 65 | image_bboxes = dets[keep_boxes] 66 | 67 | return image_feat, image_bboxes --------------------------------------------------------------------------------