├── .gitignore ├── LICENSE ├── README.md ├── backbone ├── __init__.py ├── darknet19.py ├── darknet53.py ├── darknet_tiny.py ├── resnet.py └── weights │ └── README.md ├── data ├── __init__.py ├── coco2017.py ├── config.py ├── scripts │ ├── COCO2017.sh │ ├── VOC2007.sh │ └── VOC2012.sh └── voc0712.py ├── demo.py ├── eval.py ├── img_file └── darknet_tiny.png ├── models ├── __pycache__ │ ├── yolo_anchor.cpython-36.pyc │ ├── yolo_anchor_ms.cpython-36.pyc │ ├── yolo_fusion.cpython-36.pyc │ ├── yolo_kitti.cpython-36.pyc │ ├── yolo_light.cpython-36.pyc │ ├── yolo_mobile.cpython-36.pyc │ ├── yolo_msf.cpython-36.pyc │ ├── yolo_v1.cpython-36.pyc │ ├── yolo_v1_ms.cpython-36.pyc │ ├── yolo_v2.cpython-36.pyc │ └── yolo_v2.cpython-37.pyc ├── yolov2_d19.py ├── yolov2_r50.py ├── yolov3.py ├── yolov3_spp.py └── yolov3_tiny.py ├── test.py ├── tools.py ├── train.py ├── utils ├── __init__.py ├── augmentations.py ├── cocoapi_evaluator.py ├── com_paras_flops.py ├── distributed_utils.py ├── kmeans_anchor.py ├── modules.py └── vocapi_evaluator.py └── weights └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | *.pt 2 | *.pth 3 | *.txt 4 | *.pkl 5 | __pycache__ 6 | .vscode 7 | det_results -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # Update
2 | Recently, I have released a new YOLO project:
3 | 
4 | https://github.com/yjh0410/PyTorch_YOLO_Tutorial
5 | 
6 | In my new YOLO project, you can enjoy:
7 | - a new and stronger YOLOv1
8 | - a new and stronger YOLOv2
9 | - YOLOv3
10 | - YOLOv4
11 | - YOLOv5
12 | - YOLOv7
13 | - YOLOX
14 | - RTCDet
15 | 
16 | 
17 | # This project
18 | In this project, you can enjoy:
19 | - YOLOv2 with DarkNet-19
20 | - YOLOv2 with ResNet-50
21 | - YOLOv2Slim
22 | - YOLOv3
23 | - YOLOv3-Spp
24 | - YOLOv3-Tiny
25 | 
26 | 
27 | I just want to provide a good YOLO project for everyone who is interested in object detection.
28 | 
29 | # Weights
30 | Google Drive: https://drive.google.com/drive/folders/1T5hHyGICbFSdu6u2_vqvxn_puotvPsbd?usp=sharing
31 | 
32 | BaiDuYunDisk: https://pan.baidu.com/s/1tSylvzOVFReUAvaAxKRSwg
33 | Password: d266
34 | 
35 | You can download all my models from the above links.
36 | 
37 | # YOLOv2
38 | 
39 | ## YOLOv2 with DarkNet-19
40 | ### Tricks
41 | Tricks from the official paper:
42 | - [x] batch norm
43 | - [x] hi-res classifier
44 | - [x] convolutional
45 | - [x] anchor boxes
46 | - [x] new network
47 | - [x] dimension priors
48 | - [x] location prediction
49 | - [x] passthrough
50 | - [x] multi-scale
51 | - [x] hi-res detector
52 | 
53 | ## VOC2007
54 | 
| data | size | Original (darknet) | Ours (pytorch) 160epochs | Ours (pytorch) 250epochs |
|:----:|:----:|:------------------:|:------------------------:|:------------------------:|
| VOC07 test | 416 | 76.8 | 76.0 | 77.1 |
| VOC07 test | 544 | 78.6 | 77.0 | 78.1 |
60 | 61 | ## COCO 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 |
| model | data | AP | AP50 | AP75 | AP_S | AP_M | AP_L |
|:-----:|:----:|:--:|:----:|:----:|:----:|:----:|:----:|
| Original (darknet) | COCO test-dev | 21.6 | 44.0 | 19.2 | 5.0 | 22.4 | 35.5 |
| Ours (pytorch) | COCO test-dev | 26.8 | 46.6 | 26.8 | 5.8 | 27.4 | 45.2 |
| Ours (pytorch) | COCO eval | 26.6 | 46.0 | 26.7 | 5.9 | 27.8 | 47.1 |
72 | 
73 | 
74 | ## YOLOv2 with ResNet-50
75 | 
76 | I replaced DarkNet-19 with ResNet-50 and obtained better results on COCO val.
77 | 
| model | data | AP | AP50 | AP75 | AP_S | AP_M | AP_L |
|:-----:|:----:|:--:|:----:|:----:|:----:|:----:|:----:|
| Our YOLOv2-320 | COCO eval | 25.8 | 44.6 | 25.9 | 4.6 | 26.8 | 47.9 |
| Our YOLOv2-416 | COCO eval | 29.0 | 48.8 | 29.7 | 7.4 | 31.9 | 48.3 |
| Our YOLOv2-512 | COCO eval | 30.4 | 51.6 | 30.9 | 10.1 | 34.9 | 46.6 |
| Our YOLOv2-544 | COCO eval | 30.4 | 51.9 | 30.9 | 11.1 | 35.8 | 45.5 |
| Our YOLOv2-608 | COCO eval | 29.2 | 51.6 | 29.1 | 13.6 | 36.8 | 40.5 |
91 | 92 | # YOLOv3 93 | 94 | ## VOC2007 95 | 96 | 97 | 98 | 99 |
| data | size | Original (darknet) | Ours (pytorch) 250epochs |
|:----:|:----:|:------------------:|:------------------------:|
| VOC07 test | 416 | 80.25 | 81.4 |
100 | 
101 | ## COCO
102 | 
103 | Official YOLOv3:
104 | 
| model | data | AP | AP50 | AP75 | AP_S | AP_M | AP_L |
|:-----:|:----:|:--:|:----:|:----:|:----:|:----:|:----:|
| YOLOv3-320 | COCO test-dev | 28.2 | 51.5 | - | - | - | - |
| YOLOv3-416 | COCO test-dev | 31.0 | 55.3 | - | - | - | - |
| YOLOv3-608 | COCO test-dev | 33.0 | 57.0 | 34.4 | 18.3 | 35.4 | 41.9 |
114 | 115 | Our YOLOv3: 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 |
| model | data | AP | AP50 | AP75 | AP_S | AP_M | AP_L |
|:-----:|:----:|:--:|:----:|:----:|:----:|:----:|:----:|
| YOLOv3-320 | COCO test-dev | 33.1 | 54.1 | 34.5 | 12.1 | 34.5 | 49.6 |
| YOLOv3-416 | COCO test-dev | 36.0 | 57.4 | 37.0 | 16.3 | 37.5 | 51.1 |
| YOLOv3-608 | COCO test-dev | 37.6 | 59.4 | 39.9 | 20.4 | 39.9 | 48.2 |
126 | 
127 | # YOLOv3SPP
128 | ## COCO
129 | 
| model | data | AP | AP50 | AP75 | AP_S | AP_M | AP_L |
|:-----:|:----:|:--:|:----:|:----:|:----:|:----:|:----:|
| YOLOv3Spp-320 | COCO eval | 32.78 | 53.79 | 33.9 | 12.4 | 35.5 | 50.6 |
| YOLOv3Spp-416 | COCO eval | 35.66 | 57.09 | 37.4 | 16.8 | 38.1 | 50.7 |
| YOLOv3Spp-608 | COCO eval | 37.52 | 59.44 | 39.3 | 21.5 | 40.6 | 49.6 |
141 | 142 | # YOLOv3Tiny 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 |
| model | data | AP | AP50 | AP75 | AP_S | AP_M | AP_L |
|:-----:|:----:|:--:|:----:|:----:|:----:|:----:|:----:|
| (official) YOLOv3Tiny | COCO test-dev | - | 33.1 | - | - | - | - |
| (Our) YOLOv3Tiny | COCO val | 15.9 | 33.8 | 12.8 | 7.6 | 17.7 | 22.4 |
151 | 
152 | 
153 | # Installation
154 | - PyTorch (GPU) 1.1.0/1.2.0/1.3.0
155 | - Tensorboard 1.14
156 | - opencv-python, Python 3.6/3.7
157 | 
158 | # Dataset
159 | 
160 | ## VOC Dataset
161 | I copied the download scripts from the following excellent project:
162 | https://github.com/amdegroot/ssd.pytorch
163 | 
164 | I have uploaded VOC2007 and VOC2012 to BaiDuYunDisk, so researchers in China can download them from BaiDuYunDisk:
165 | 
166 | Link: https://pan.baidu.com/s/1tYPGCYGyC0wjpC97H-zzMQ
167 | 
168 | Password: 4la9
169 | 
170 | You will get a ```VOCdevkit.zip```; just unzip it and put it into ```data/```. After that, the paths to the VOC dataset are ```data/VOCdevkit/VOC2007``` and ```data/VOCdevkit/VOC2012```.
171 | 
172 | ### Download VOC2007 trainval & test
173 | 
174 | ```Shell
175 | # specify a directory for the dataset to be downloaded into, else the default is ~/data/
176 | sh data/scripts/VOC2007.sh #
177 | ```
178 | 
179 | ### Download VOC2012 trainval
180 | ```Shell
181 | # specify a directory for the dataset to be downloaded into, else the default is ~/data/
182 | sh data/scripts/VOC2012.sh #
183 | ```
184 | 
185 | ## MSCOCO Dataset
186 | I copied the download scripts from the following excellent project:
187 | https://github.com/DeNA/PyTorch_YOLOv3
188 | 
189 | ### Download MSCOCO 2017 dataset
190 | Just run ```sh data/scripts/COCO2017.sh```. You will get COCO train2017, val2017 and test2017.
191 | 
192 | 
193 | # Train
194 | ## VOC
195 | ```Shell
196 | python train.py -d voc --cuda -v [select a model] -hr -ms --ema
197 | ```
198 | 
199 | You can run ```python train.py -h``` to check all optional arguments.
200 | 
201 | ## COCO
202 | If you have only one GPU:
203 | ```Shell
204 | python train.py -d coco --cuda -v [select a model] -hr -ms --ema
205 | ```
206 | 
207 | If you have multiple GPUs (e.g., 8) and put 4 images on each GPU:
208 | ```Shell
209 | python -m torch.distributed.launch --nproc_per_node=8 train.py -d coco --cuda -v [select a model] -hr -ms --ema \
210 |         -dist \
211 |         --sybn \
212 |         --num_gpu 8 \
213 |         --batch_size 4
214 | ```
215 | 
216 | # Test
217 | ## VOC
218 | ```Shell
219 | python test.py -d voc --cuda -v [select a model] --trained_model [ Please input the path to model dir. ]
220 | ```
221 | 
222 | ## COCO
223 | ```Shell
224 | python test.py -d coco-val --cuda -v [select a model] --trained_model [ Please input the path to model dir. ]
225 | ```
226 | 
227 | 
228 | # Evaluation
229 | ## VOC
230 | ```Shell
231 | python eval.py -d voc --cuda -v [select a model] --train_model [ Please input the path to model dir. ]
232 | ```
233 | 
234 | ## COCO
235 | To run on COCO val:
236 | ```Shell
237 | python eval.py -d coco-val --cuda -v [select a model] --train_model [ Please input the path to model dir. ]
238 | ```
239 | 
240 | To run on COCO test-dev (make sure you have downloaded test2017):
241 | ```Shell
242 | python eval.py -d coco-test --cuda -v [select a model] --train_model [ Please input the path to model dir. ]
243 | ```
244 | You will get a .json file which can be submitted to the COCO test server for evaluation.
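
For `coco-val`, the detection json can also be scored locally with the COCO API instead of the test server. Below is a minimal sketch, assuming pycocotools is installed; both file paths are placeholders, so point them at your own annotation file and at the json produced by ```eval.py```:

```python
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval

# placeholder paths: val2017 ground-truth annotations and the detection json written by eval.py
coco_gt = COCO('data/COCO/annotations/instances_val2017.json')
coco_dt = coco_gt.loadRes('yolo_coco_detections.json')

# standard bbox evaluation: prints AP, AP50, AP75, AP_S, AP_M, AP_L
coco_eval = COCOeval(coco_gt, coco_dt, iouType='bbox')
coco_eval.evaluate()
coco_eval.accumulate()
coco_eval.summarize()
```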
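
As a quick sanity check after preparing the datasets (see the Dataset section above), the loaders in ```data/``` can be used on their own. A minimal sketch, assuming it is run from the repository root and that VOC2007 has been unpacked to ```data/VOCdevkit/``` as described:

```python
from data import VOCDetection, BaseTransform, VOC_CLASSES

# VOC2007 trainval split with a simple resize + normalize transform
dataset = VOCDetection(data_dir='data/VOCdevkit',
                       image_sets=[('2007', 'trainval')],
                       transform=BaseTransform(416))

# img: [3, 416, 416] float tensor; target: [N, 5] array of [xmin, ymin, xmax, ymax, cls_id] in normalized coords
img, target = dataset[0]
print(img.shape, len(target))
print('first object:', VOC_CLASSES[int(target[0][4])])
```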
245 | 
-------------------------------------------------------------------------------- /backbone/__init__.py: --------------------------------------------------------------------------------
1 | from .resnet import build_resnet
2 | from .darknet19 import build_darknet19
3 | from .darknet53 import build_darknet53
4 | from .darknet_tiny import build_darknet_tiny
5 | 
6 | 
7 | def build_backbone(model_name='resnet18', pretrained=False):
8 |     if 'resnet' in model_name:
9 |         backbone = build_resnet(model_name, pretrained)
10 | 
11 |     elif model_name == 'darknet19':
12 |         backbone = build_darknet19(pretrained)
13 | 
14 |     elif model_name == 'darknet53':
15 |         backbone = build_darknet53(pretrained)
16 | 
17 |     elif model_name == 'darknet_tiny':
18 |         backbone = build_darknet_tiny(pretrained)
19 | 
20 |     return backbone
21 | 
-------------------------------------------------------------------------------- /backbone/darknet19.py: --------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import os
4 | 
5 | 
6 | model_urls = {
7 |     "darknet19": "https://github.com/yjh0410/image_classification_pytorch/releases/download/weight/darknet19.pth",
8 | }
9 | 
10 | 
11 | __all__ = ['darknet19']
12 | 
13 | 
14 | class Conv_BN_LeakyReLU(nn.Module):
15 |     def __init__(self, in_channels, out_channels, ksize, padding=0, stride=1, dilation=1):
16 |         super(Conv_BN_LeakyReLU, self).__init__()
17 |         self.convs = nn.Sequential(
18 |             nn.Conv2d(in_channels, out_channels, ksize, padding=padding, stride=stride, dilation=dilation),
19 |             nn.BatchNorm2d(out_channels),
20 |             nn.LeakyReLU(0.1, inplace=True)
21 |         )
22 | 
23 |     def forward(self, x):
24 |         return self.convs(x)
25 | 
26 | 
27 | class DarkNet_19(nn.Module):
28 |     def __init__(self):
29 |         super(DarkNet_19, self).__init__()
30 |         # backbone network : DarkNet-19
31 |         # output : stride = 2, c = 32
32 |         self.conv_1 = nn.Sequential(
33 |             Conv_BN_LeakyReLU(3, 32, 3, 1),
34 |             nn.MaxPool2d((2,2), 2),
35 |         )
36 | 
37 |         # output : stride = 4, c = 64
38 |         self.conv_2 = nn.Sequential(
39 |             Conv_BN_LeakyReLU(32, 64, 3, 1),
40 |             nn.MaxPool2d((2,2), 2)
41 |         )
42 | 
43 |         # output : stride = 8, c = 128
44 |         self.conv_3 = nn.Sequential(
45 |             Conv_BN_LeakyReLU(64, 128, 3, 1),
46 |             Conv_BN_LeakyReLU(128, 64, 1),
47 |             Conv_BN_LeakyReLU(64, 128, 3, 1),
48 |             nn.MaxPool2d((2,2), 2)
49 |         )
50 | 
51 |         # output : stride = 8, c = 256
52 |         self.conv_4 = nn.Sequential(
53 |             Conv_BN_LeakyReLU(128, 256, 3, 1),
54 |             Conv_BN_LeakyReLU(256, 128, 1),
55 |             Conv_BN_LeakyReLU(128, 256, 3, 1),
56 |         )
57 | 
58 |         # output : stride = 16, c = 512
59 |         self.maxpool_4 = nn.MaxPool2d((2, 2), 2)
60 |         self.conv_5 = nn.Sequential(
61 |             Conv_BN_LeakyReLU(256, 512, 3, 1),
62 |             Conv_BN_LeakyReLU(512, 256, 1),
63 |             Conv_BN_LeakyReLU(256, 512, 3, 1),
64 |             Conv_BN_LeakyReLU(512, 256, 1),
65 |             Conv_BN_LeakyReLU(256, 512, 3, 1),
66 |         )
67 | 
68 |         # output : stride = 32, c = 1024
69 |         self.maxpool_5 = nn.MaxPool2d((2, 2), 2)
70 |         self.conv_6 = nn.Sequential(
71 |             Conv_BN_LeakyReLU(512, 1024, 3, 1),
72 |             Conv_BN_LeakyReLU(1024, 512, 1),
73 |             Conv_BN_LeakyReLU(512, 1024, 3, 1),
74 |             Conv_BN_LeakyReLU(1024, 512, 1),
75 |             Conv_BN_LeakyReLU(512, 1024, 3, 1)
76 |         )
77 | 
78 |     def forward(self, x):
79 |         c1 = self.conv_1(x)
80 |         c2 = self.conv_2(c1)
81 |         c3 = self.conv_3(c2)
82 |         c3 = self.conv_4(c3)
83 |         c4 = self.conv_5(self.maxpool_4(c3))
84 |         c5 = self.conv_6(self.maxpool_5(c4))
85 | 
86 |         output = {
87 |             'layer1': c3,
88 |             'layer2': c4,
89 |             'layer3': c5
90 |         }
91 | 
92 |         return output
93 | 
94 | 
95 | 
def build_darknet19(pretrained=False): 96 | # model 97 | model = DarkNet_19() 98 | 99 | # load weight 100 | if pretrained: 101 | print('Loading pretrained weight ...') 102 | url = model_urls['darknet19'] 103 | # checkpoint state dict 104 | checkpoint_state_dict = torch.hub.load_state_dict_from_url( 105 | url=url, map_location="cpu", check_hash=True) 106 | # model state dict 107 | model_state_dict = model.state_dict() 108 | # check 109 | for k in list(checkpoint_state_dict.keys()): 110 | if k in model_state_dict: 111 | shape_model = tuple(model_state_dict[k].shape) 112 | shape_checkpoint = tuple(checkpoint_state_dict[k].shape) 113 | if shape_model != shape_checkpoint: 114 | checkpoint_state_dict.pop(k) 115 | else: 116 | checkpoint_state_dict.pop(k) 117 | print(k) 118 | 119 | model.load_state_dict(checkpoint_state_dict) 120 | 121 | return model 122 | 123 | 124 | if __name__ == '__main__': 125 | import time 126 | net = build_darknet19(pretrained=True) 127 | x = torch.randn(1, 3, 224, 224) 128 | t0 = time.time() 129 | output = net(x) 130 | t1 = time.time() 131 | print('Time: ', t1 - t0) 132 | 133 | for k in output.keys(): 134 | print('{} : {}'.format(k, output[k].shape)) 135 | -------------------------------------------------------------------------------- /backbone/darknet53.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | model_urls = { 6 | "darknet53": "https://github.com/yjh0410/image_classification_pytorch/releases/download/weight/darknet53.pth", 7 | } 8 | 9 | 10 | __all__ = ['darknet53'] 11 | 12 | 13 | class Conv_BN_LeakyReLU(nn.Module): 14 | def __init__(self, in_channels, out_channels, ksize, padding=0, stride=1, dilation=1): 15 | super(Conv_BN_LeakyReLU, self).__init__() 16 | self.convs = nn.Sequential( 17 | nn.Conv2d(in_channels, out_channels, ksize, padding=padding, stride=stride, dilation=dilation), 18 | nn.BatchNorm2d(out_channels), 19 | nn.LeakyReLU(0.1, inplace=True) 20 | ) 21 | 22 | def forward(self, x): 23 | return self.convs(x) 24 | 25 | 26 | class ResBlock(nn.Module): 27 | def __init__(self, ch, nblocks=1): 28 | super().__init__() 29 | self.module_list = nn.ModuleList() 30 | for _ in range(nblocks): 31 | resblock_one = nn.Sequential( 32 | Conv_BN_LeakyReLU(ch, ch//2, 1), 33 | Conv_BN_LeakyReLU(ch//2, ch, 3, padding=1) 34 | ) 35 | self.module_list.append(resblock_one) 36 | 37 | def forward(self, x): 38 | for module in self.module_list: 39 | x = module(x) + x 40 | return x 41 | 42 | 43 | class DarkNet_53(nn.Module): 44 | """ 45 | DarkNet-53. 
46 | """ 47 | def __init__(self): 48 | super(DarkNet_53, self).__init__() 49 | # stride = 2 50 | self.layer_1 = nn.Sequential( 51 | Conv_BN_LeakyReLU(3, 32, 3, padding=1), 52 | Conv_BN_LeakyReLU(32, 64, 3, padding=1, stride=2), 53 | ResBlock(64, nblocks=1) 54 | ) 55 | # stride = 4 56 | self.layer_2 = nn.Sequential( 57 | Conv_BN_LeakyReLU(64, 128, 3, padding=1, stride=2), 58 | ResBlock(128, nblocks=2) 59 | ) 60 | # stride = 8 61 | self.layer_3 = nn.Sequential( 62 | Conv_BN_LeakyReLU(128, 256, 3, padding=1, stride=2), 63 | ResBlock(256, nblocks=8) 64 | ) 65 | # stride = 16 66 | self.layer_4 = nn.Sequential( 67 | Conv_BN_LeakyReLU(256, 512, 3, padding=1, stride=2), 68 | ResBlock(512, nblocks=8) 69 | ) 70 | # stride = 32 71 | self.layer_5 = nn.Sequential( 72 | Conv_BN_LeakyReLU(512, 1024, 3, padding=1, stride=2), 73 | ResBlock(1024, nblocks=4) 74 | ) 75 | 76 | 77 | def forward(self, x, targets=None): 78 | c1 = self.layer_1(x) 79 | c2 = self.layer_2(c1) 80 | c3 = self.layer_3(c2) 81 | c4 = self.layer_4(c3) 82 | c5 = self.layer_5(c4) 83 | 84 | output = { 85 | 'layer1': c3, 86 | 'layer2': c4, 87 | 'layer3': c5 88 | } 89 | 90 | return output 91 | 92 | 93 | def build_darknet53(pretrained=False): 94 | # model 95 | model = DarkNet_53() 96 | 97 | # load weight 98 | if pretrained: 99 | print('Loading pretrained weight ...') 100 | url = model_urls['darknet53'] 101 | # checkpoint state dict 102 | checkpoint_state_dict = torch.hub.load_state_dict_from_url( 103 | url=url, map_location="cpu", check_hash=True) 104 | # model state dict 105 | model_state_dict = model.state_dict() 106 | # check 107 | for k in list(checkpoint_state_dict.keys()): 108 | if k in model_state_dict: 109 | shape_model = tuple(model_state_dict[k].shape) 110 | shape_checkpoint = tuple(checkpoint_state_dict[k].shape) 111 | if shape_model != shape_checkpoint: 112 | checkpoint_state_dict.pop(k) 113 | else: 114 | checkpoint_state_dict.pop(k) 115 | print(k) 116 | 117 | model.load_state_dict(checkpoint_state_dict) 118 | 119 | return model 120 | 121 | 122 | if __name__ == '__main__': 123 | import time 124 | net = build_darknet53(pretrained=True) 125 | x = torch.randn(1, 3, 224, 224) 126 | t0 = time.time() 127 | output = net(x) 128 | t1 = time.time() 129 | print('Time: ', t1 - t0) 130 | 131 | for k in output.keys(): 132 | print('{} : {}'.format(k, output[k].shape)) 133 | -------------------------------------------------------------------------------- /backbone/darknet_tiny.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | model_urls = { 6 | "darknet_tiny": "https://github.com/yjh0410/image_classification_pytorch/releases/download/weight/darknet_tiny.pth", 7 | } 8 | 9 | 10 | __all__ = ['darknet_tiny'] 11 | 12 | 13 | class Conv_BN_LeakyReLU(nn.Module): 14 | def __init__(self, in_channels, out_channels, ksize, padding=0, stride=1, dilation=1): 15 | super(Conv_BN_LeakyReLU, self).__init__() 16 | self.convs = nn.Sequential( 17 | nn.Conv2d(in_channels, out_channels, ksize, padding=padding, stride=stride, dilation=dilation), 18 | nn.BatchNorm2d(out_channels), 19 | nn.LeakyReLU(0.1, inplace=True) 20 | ) 21 | 22 | def forward(self, x): 23 | return self.convs(x) 24 | 25 | 26 | class DarkNet_Tiny(nn.Module): 27 | def __init__(self): 28 | 29 | super(DarkNet_Tiny, self).__init__() 30 | # backbone network : DarkNet_Tiny 31 | self.conv_1 = Conv_BN_LeakyReLU(3, 16, 3, 1) 32 | self.maxpool_1 = nn.MaxPool2d((2, 2), 2) # stride = 2 33 | 34 | self.conv_2 = 
Conv_BN_LeakyReLU(16, 32, 3, 1) 35 | self.maxpool_2 = nn.MaxPool2d((2, 2), 2) # stride = 4 36 | 37 | self.conv_3 = Conv_BN_LeakyReLU(32, 64, 3, 1) 38 | self.maxpool_3 = nn.MaxPool2d((2, 2), 2) # stride = 8 39 | 40 | self.conv_4 = Conv_BN_LeakyReLU(64, 128, 3, 1) 41 | self.maxpool_4 = nn.MaxPool2d((2, 2), 2) # stride = 16 42 | 43 | self.conv_5 = Conv_BN_LeakyReLU(128, 256, 3, 1) 44 | self.maxpool_5 = nn.MaxPool2d((2, 2), 2) # stride = 32 45 | 46 | self.conv_6 = Conv_BN_LeakyReLU(256, 512, 3, 1) 47 | self.maxpool_6 = nn.Sequential( 48 | nn.ZeroPad2d((0, 1, 0, 1)), 49 | nn.MaxPool2d((2, 2), 1) # stride = 32 50 | ) 51 | 52 | self.conv_7 = Conv_BN_LeakyReLU(512, 1024, 3, 1) 53 | 54 | 55 | def forward(self, x): 56 | x = self.conv_1(x) 57 | c1 = self.maxpool_1(x) 58 | c1 = self.conv_2(c1) 59 | c2 = self.maxpool_2(c1) 60 | c2 = self.conv_3(c2) 61 | c3 = self.maxpool_3(c2) 62 | c3 = self.conv_4(c3) 63 | c4 = self.maxpool_4(c3) 64 | c4 = self.conv_5(c4) # stride = 16 65 | c5 = self.maxpool_5(c4) 66 | c5 = self.conv_6(c5) 67 | c5 = self.maxpool_6(c5) 68 | c5 = self.conv_7(c5) # stride = 32 69 | 70 | output = { 71 | 'layer1': c3, 72 | 'layer2': c4, 73 | 'layer3': c5 74 | } 75 | 76 | return output 77 | 78 | 79 | def build_darknet_tiny(pretrained=False): 80 | # model 81 | model = DarkNet_Tiny() 82 | 83 | # load weight 84 | if pretrained: 85 | print('Loading pretrained weight ...') 86 | url = model_urls['darknet_tiny'] 87 | # checkpoint state dict 88 | checkpoint_state_dict = torch.hub.load_state_dict_from_url( 89 | url=url, map_location="cpu", check_hash=True) 90 | # model state dict 91 | model_state_dict = model.state_dict() 92 | # check 93 | for k in list(checkpoint_state_dict.keys()): 94 | if k in model_state_dict: 95 | shape_model = tuple(model_state_dict[k].shape) 96 | shape_checkpoint = tuple(checkpoint_state_dict[k].shape) 97 | if shape_model != shape_checkpoint: 98 | checkpoint_state_dict.pop(k) 99 | else: 100 | checkpoint_state_dict.pop(k) 101 | print(k) 102 | 103 | model.load_state_dict(checkpoint_state_dict) 104 | 105 | return model 106 | 107 | 108 | if __name__ == '__main__': 109 | import time 110 | net = build_darknet_tiny(pretrained=True) 111 | x = torch.randn(1, 3, 224, 224) 112 | t0 = time.time() 113 | output = net(x) 114 | t1 = time.time() 115 | print('Time: ', t1 - t0) 116 | 117 | for k in output.keys(): 118 | print('{} : {}'.format(k, output[k].shape)) 119 | -------------------------------------------------------------------------------- /backbone/resnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.utils.model_zoo as model_zoo 4 | 5 | 6 | __all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 7 | 'resnet152'] 8 | 9 | 10 | model_urls = { 11 | 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth', 12 | 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth', 13 | 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth', 14 | 'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth', 15 | 'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth', 16 | } 17 | 18 | 19 | def conv3x3(in_planes, out_planes, stride=1): 20 | """3x3 convolution with padding""" 21 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, 22 | padding=1, bias=False) 23 | 24 | def conv1x1(in_planes, out_planes, stride=1): 25 | """1x1 convolution""" 26 | return nn.Conv2d(in_planes, out_planes, kernel_size=1, 
stride=stride, bias=False) 27 | 28 | class BasicBlock(nn.Module): 29 | expansion = 1 30 | 31 | def __init__(self, inplanes, planes, stride=1, downsample=None): 32 | super(BasicBlock, self).__init__() 33 | self.conv1 = conv3x3(inplanes, planes, stride) 34 | self.bn1 = nn.BatchNorm2d(planes) 35 | self.relu = nn.ReLU(inplace=True) 36 | self.conv2 = conv3x3(planes, planes) 37 | self.bn2 = nn.BatchNorm2d(planes) 38 | self.downsample = downsample 39 | self.stride = stride 40 | 41 | def forward(self, x): 42 | identity = x 43 | 44 | out = self.conv1(x) 45 | out = self.bn1(out) 46 | out = self.relu(out) 47 | 48 | out = self.conv2(out) 49 | out = self.bn2(out) 50 | 51 | if self.downsample is not None: 52 | identity = self.downsample(x) 53 | 54 | out += identity 55 | out = self.relu(out) 56 | 57 | return out 58 | 59 | class Bottleneck(nn.Module): 60 | expansion = 4 61 | 62 | def __init__(self, inplanes, planes, stride=1, downsample=None): 63 | super(Bottleneck, self).__init__() 64 | self.conv1 = conv1x1(inplanes, planes) 65 | self.bn1 = nn.BatchNorm2d(planes) 66 | self.conv2 = conv3x3(planes, planes, stride) 67 | self.bn2 = nn.BatchNorm2d(planes) 68 | self.conv3 = conv1x1(planes, planes * self.expansion) 69 | self.bn3 = nn.BatchNorm2d(planes * self.expansion) 70 | self.relu = nn.ReLU(inplace=True) 71 | self.downsample = downsample 72 | self.stride = stride 73 | 74 | def forward(self, x): 75 | identity = x 76 | 77 | out = self.conv1(x) 78 | out = self.bn1(out) 79 | out = self.relu(out) 80 | 81 | out = self.conv2(out) 82 | out = self.bn2(out) 83 | out = self.relu(out) 84 | 85 | out = self.conv3(out) 86 | out = self.bn3(out) 87 | 88 | if self.downsample is not None: 89 | identity = self.downsample(x) 90 | 91 | out += identity 92 | out = self.relu(out) 93 | 94 | return out 95 | 96 | class ResNet(nn.Module): 97 | 98 | def __init__(self, block, layers, zero_init_residual=False): 99 | super(ResNet, self).__init__() 100 | self.inplanes = 64 101 | self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, 102 | bias=False) 103 | self.bn1 = nn.BatchNorm2d(64) 104 | self.relu = nn.ReLU(inplace=True) 105 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 106 | self.layer1 = self._make_layer(block, 64, layers[0]) 107 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2) 108 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2) 109 | self.layer4 = self._make_layer(block, 512, layers[3], stride=2) 110 | 111 | for m in self.modules(): 112 | if isinstance(m, nn.Conv2d): 113 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 114 | elif isinstance(m, nn.BatchNorm2d): 115 | nn.init.constant_(m.weight, 1) 116 | nn.init.constant_(m.bias, 0) 117 | 118 | # Zero-initialize the last BN in each residual branch, 119 | # so that the residual branch starts with zeros, and each residual block behaves like an identity. 
120 | # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677 121 | if zero_init_residual: 122 | for m in self.modules(): 123 | if isinstance(m, Bottleneck): 124 | nn.init.constant_(m.bn3.weight, 0) 125 | elif isinstance(m, BasicBlock): 126 | nn.init.constant_(m.bn2.weight, 0) 127 | 128 | def _make_layer(self, block, planes, blocks, stride=1): 129 | downsample = None 130 | if stride != 1 or self.inplanes != planes * block.expansion: 131 | downsample = nn.Sequential( 132 | conv1x1(self.inplanes, planes * block.expansion, stride), 133 | nn.BatchNorm2d(planes * block.expansion), 134 | ) 135 | 136 | layers = [] 137 | layers.append(block(self.inplanes, planes, stride, downsample)) 138 | self.inplanes = planes * block.expansion 139 | for _ in range(1, blocks): 140 | layers.append(block(self.inplanes, planes)) 141 | 142 | return nn.Sequential(*layers) 143 | 144 | def forward(self, x): 145 | c1 = self.conv1(x) 146 | c1 = self.bn1(c1) 147 | c1 = self.relu(c1) 148 | c1 = self.maxpool(c1) 149 | 150 | c2 = self.layer1(c1) 151 | c3 = self.layer2(c2) 152 | c4 = self.layer3(c3) 153 | c5 = self.layer4(c4) 154 | 155 | output = { 156 | 'layer1': c3, 157 | 'layer2': c4, 158 | 'layer3': c5 159 | } 160 | 161 | return output 162 | 163 | 164 | def resnet18(pretrained=False, **kwargs): 165 | """Constructs a ResNet-18 model. 166 | 167 | Args: 168 | pretrained (bool): If True, returns a model pre-trained on ImageNet 169 | """ 170 | model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs) 171 | if pretrained: 172 | # strict = False as we don't need fc layer params. 173 | model.load_state_dict(model_zoo.load_url(model_urls['resnet18']), strict=False) 174 | return model 175 | 176 | def resnet34(pretrained=False, **kwargs): 177 | """Constructs a ResNet-34 model. 178 | 179 | Args: 180 | pretrained (bool): If True, returns a model pre-trained on ImageNet 181 | """ 182 | model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs) 183 | if pretrained: 184 | model.load_state_dict(model_zoo.load_url(model_urls['resnet34']), strict=False) 185 | return model 186 | 187 | def resnet50(pretrained=False, **kwargs): 188 | """Constructs a ResNet-50 model. 189 | 190 | Args: 191 | pretrained (bool): If True, returns a model pre-trained on ImageNet 192 | """ 193 | model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs) 194 | if pretrained: 195 | model.load_state_dict(model_zoo.load_url(model_urls['resnet50']), strict=False) 196 | return model 197 | 198 | def resnet101(pretrained=False, **kwargs): 199 | """Constructs a ResNet-101 model. 200 | 201 | Args: 202 | pretrained (bool): If True, returns a model pre-trained on ImageNet 203 | """ 204 | model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs) 205 | if pretrained: 206 | model.load_state_dict(model_zoo.load_url(model_urls['resnet101']), strict=False) 207 | return model 208 | 209 | def resnet152(pretrained=False, **kwargs): 210 | """Constructs a ResNet-152 model. 
211 | 212 | Args: 213 | pretrained (bool): If True, returns a model pre-trained on ImageNet 214 | """ 215 | model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs) 216 | if pretrained: 217 | model.load_state_dict(model_zoo.load_url(model_urls['resnet152'])) 218 | return model 219 | 220 | 221 | def build_resnet(model_name='resnet18', pretrained=False): 222 | 223 | if model_name == 'resnet18': 224 | model = resnet18(pretrained=pretrained) 225 | 226 | elif model_name == 'resnet34': 227 | model = resnet34(pretrained=pretrained) 228 | 229 | elif model_name == 'resnet50': 230 | model = resnet50(pretrained=pretrained) 231 | 232 | elif model_name == 'resnet101': 233 | model = resnet101(pretrained=pretrained) 234 | 235 | elif model_name == 'resnet152': 236 | model = resnet152(pretrained=pretrained) 237 | 238 | 239 | return model 240 | 241 | 242 | if __name__ == "__main__": 243 | import time 244 | 245 | model = build_resnet(model_name='resnet18', pretrained=True) 246 | x = torch.randn(1, 3, 224, 224) 247 | t0 = time.time() 248 | output = model(x) 249 | t1 = time.time() 250 | print('Time: ', t1 - t0) 251 | 252 | for k in output.keys(): 253 | print('{} : {}'.format(k, output[k].shape)) 254 | -------------------------------------------------------------------------------- /backbone/weights/README.md: -------------------------------------------------------------------------------- 1 | # darknet19, darknet53, darknet-tiny, darknet-light 2 | darknet-tiny is designed by myself. It is a very simple and lightweight backbone. 3 | 4 | darknet-light is same to the backbone used in official TinyYOLOv3. 5 | 6 | For researchers in China, you can download them from BaiduYunDisk: 7 | 8 | link:https://pan.baidu.com/s/1Rm87Fcj1RXZFmeTUrDWANA 9 | 10 | password:qgzn 11 | 12 | 13 | Also, you can download them from Google Drive: 14 | 15 | link: https://drive.google.com/drive/folders/15saMtvYiz3yfFNu5EnC7GSltEAvTImMB?usp=sharing 16 | -------------------------------------------------------------------------------- /data/__init__.py: -------------------------------------------------------------------------------- 1 | from .voc0712 import VOCDetection, VOCAnnotationTransform, VOC_CLASSES 2 | from .coco2017 import COCODataset, coco_class_labels, coco_class_index 3 | from .config import * 4 | import torch 5 | import cv2 6 | import numpy as np 7 | 8 | 9 | def detection_collate(batch): 10 | """Custom collate fn for dealing with batches of images that have a different 11 | number of associated object annotations (bounding boxes). 12 | 13 | Arguments: 14 | batch: (tuple) A tuple of tensor images and lists of annotations 15 | 16 | Return: 17 | A tuple containing: 18 | 1) (tensor) batch of images stacked on their 0 dim 19 | 2) (list of tensors) annotations for a given image are stacked on 20 | 0 dim 21 | """ 22 | targets = [] 23 | imgs = [] 24 | for sample in batch: 25 | imgs.append(sample[0]) 26 | targets.append(torch.FloatTensor(sample[1])) 27 | return torch.stack(imgs, 0), targets 28 | 29 | 30 | def base_transform(image, size, mean, std): 31 | x = cv2.resize(image, (size, size)).astype(np.float32) 32 | x /= 255. 
33 | x -= mean 34 | x /= std 35 | return x 36 | 37 | 38 | class BaseTransform: 39 | def __init__(self, size, mean=(0.406, 0.456, 0.485), std=(0.225, 0.224, 0.229)): 40 | self.size = size 41 | self.mean = np.array(mean, dtype=np.float32) 42 | self.std = np.array(std, dtype=np.float32) 43 | 44 | def __call__(self, image, boxes=None, labels=None): 45 | return base_transform(image, self.size, self.mean, self.std), boxes, labels 46 | -------------------------------------------------------------------------------- /data/coco2017.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import random 4 | 5 | import torch 6 | from torch.utils.data import Dataset 7 | import cv2 8 | from pycocotools.coco import COCO 9 | 10 | 11 | coco_class_labels = ('background', 12 | 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 13 | 'boat', 'traffic light', 'fire hydrant', 'street sign', 'stop sign', 14 | 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 15 | 'elephant', 'bear', 'zebra', 'giraffe', 'hat', 'backpack', 'umbrella', 16 | 'shoe', 'eye glasses', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 17 | 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 18 | 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'plate', 'wine glass', 19 | 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 20 | 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 21 | 'couch', 'potted plant', 'bed', 'mirror', 'dining table', 'window', 'desk', 22 | 'toilet', 'door', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 23 | 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'blender', 'book', 24 | 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush') 25 | 26 | coco_class_index = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 27 | 21, 22, 23, 24, 25, 27, 28, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 28 | 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 67, 29 | 70, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84, 85, 86, 87, 88, 89, 90] 30 | 31 | 32 | class COCODataset(Dataset): 33 | """ 34 | COCO dataset class. 35 | """ 36 | def __init__(self, 37 | data_dir=None, 38 | transform=None, 39 | json_file='instances_train2017.json', 40 | name='train2017'): 41 | """ 42 | COCO dataset initialization. Annotation data are read into memory by COCO API. 43 | Args: 44 | data_dir (str): dataset root directory 45 | json_file (str): COCO json file name 46 | name (str): COCO data name (e.g. 
'train2017' or 'val2017') 47 | img_size (int): target image size after pre-processing 48 | min_size (int): bounding boxes smaller than this are ignored 49 | debug (bool): if True, only one data id is selected from the dataset 50 | """ 51 | self.data_dir = data_dir 52 | self.json_file = json_file 53 | self.coco = COCO(os.path.join(self.data_dir, 'annotations', self.json_file)) 54 | self.ids = self.coco.getImgIds() 55 | self.class_ids = sorted(self.coco.getCatIds()) 56 | self.name = name 57 | self.transform = transform 58 | 59 | 60 | def __len__(self): 61 | return len(self.ids) 62 | 63 | 64 | def pull_image(self, index): 65 | id_ = self.ids[index] 66 | img_file = os.path.join(self.data_dir, self.name, 67 | '{:012}'.format(id_) + '.jpg') 68 | img = cv2.imread(img_file) 69 | 70 | if self.json_file == 'instances_val5k.json' and img is None: 71 | img_file = os.path.join(self.data_dir, 'train2017', 72 | '{:012}'.format(id_) + '.jpg') 73 | img = cv2.imread(img_file) 74 | 75 | return img, id_ 76 | 77 | 78 | def pull_anno(self, index): 79 | id_ = self.ids[index] 80 | 81 | anno_ids = self.coco.getAnnIds(imgIds=[int(id_)], iscrowd=None) 82 | annotations = self.coco.loadAnns(anno_ids) 83 | 84 | target = [] 85 | for anno in annotations: 86 | if 'bbox' in anno: 87 | xmin = np.max((0, anno['bbox'][0])) 88 | ymin = np.max((0, anno['bbox'][1])) 89 | xmax = xmin + anno['bbox'][2] 90 | ymax = ymin + anno['bbox'][3] 91 | 92 | if anno['area'] > 0 and xmax >= xmin and ymax >= ymin: 93 | label_ind = anno['category_id'] 94 | cls_id = self.class_ids.index(label_ind) 95 | 96 | target.append([xmin, ymin, xmax, ymax, cls_id]) # [xmin, ymin, xmax, ymax, label_ind] 97 | else: 98 | print('No bbox !!') 99 | return target 100 | 101 | 102 | def __getitem__(self, index): 103 | img, gt, h, w = self.pull_item(index) 104 | 105 | return img, gt 106 | 107 | 108 | def pull_item(self, index): 109 | id_ = self.ids[index] 110 | 111 | anno_ids = self.coco.getAnnIds(imgIds=[int(id_)], iscrowd=None) 112 | annotations = self.coco.loadAnns(anno_ids) 113 | 114 | # load an image 115 | img_file = os.path.join(self.data_dir, self.name, 116 | '{:012}'.format(id_) + '.jpg') 117 | img = cv2.imread(img_file) 118 | 119 | if self.json_file == 'instances_val5k.json' and img is None: 120 | img_file = os.path.join(self.data_dir, 'train2017', 121 | '{:012}'.format(id_) + '.jpg') 122 | img = cv2.imread(img_file) 123 | 124 | assert img is not None 125 | 126 | height, width, channels = img.shape 127 | 128 | # load a target 129 | target = [] 130 | for anno in annotations: 131 | if 'bbox' in anno and anno['area'] > 0: 132 | xmin = np.max((0, anno['bbox'][0])) 133 | ymin = np.max((0, anno['bbox'][1])) 134 | xmax = np.min((width - 1, xmin + np.max((0, anno['bbox'][2] - 1)))) 135 | ymax = np.min((height - 1, ymin + np.max((0, anno['bbox'][3] - 1)))) 136 | if xmax > xmin and ymax > ymin: 137 | label_ind = anno['category_id'] 138 | cls_id = self.class_ids.index(label_ind) 139 | xmin /= width 140 | ymin /= height 141 | xmax /= width 142 | ymax /= height 143 | 144 | target.append([xmin, ymin, xmax, ymax, cls_id]) # [xmin, ymin, xmax, ymax, label_ind] 145 | else: 146 | print('No bbox !!!') 147 | 148 | # check target 149 | if len(target) == 0: 150 | target = np.zeros([1, 5]) 151 | else: 152 | target = np.array(target) 153 | # transform 154 | if self.transform is not None: 155 | img, boxes, labels = self.transform(img, target[:, :4], target[:, 4]) 156 | # to rgb 157 | img = img[:, :, (2, 1, 0)] 158 | # to tensor 159 | img = torch.from_numpy(img).permute(2, 0, 
1).float() 160 | target = np.hstack((boxes, np.expand_dims(labels, axis=1))) 161 | 162 | return img, target, height, width 163 | 164 | 165 | if __name__ == "__main__": 166 | def base_transform(image, size, mean): 167 | x = cv2.resize(image, (size, size)).astype(np.float32) 168 | x -= mean 169 | x = x.astype(np.float32) 170 | return x 171 | 172 | class BaseTransform: 173 | def __init__(self, size, mean): 174 | self.size = size 175 | self.mean = np.array(mean, dtype=np.float32) 176 | 177 | def __call__(self, image, boxes=None, labels=None): 178 | return base_transform(image, self.size, self.mean), boxes, labels 179 | 180 | img_size = 640 181 | dataset = COCODataset( 182 | data_dir='/mnt/share/ssd2/dataset/COCO/', 183 | transform=BaseTransform(img_size, (0, 0, 0))) 184 | 185 | for i in range(1000): 186 | im, gt, h, w = dataset.pull_item(i) 187 | img = im.permute(1,2,0).numpy()[:, :, (2, 1, 0)].astype(np.uint8) 188 | img = img.copy() 189 | 190 | for box in gt: 191 | xmin, ymin, xmax, ymax, _ = box 192 | xmin *= img_size 193 | ymin *= img_size 194 | xmax *= img_size 195 | ymax *= img_size 196 | img = cv2.rectangle(img, (int(xmin), int(ymin)), (int(xmax), int(ymax)), (0,0,255), 2) 197 | cv2.imshow('gt', img) 198 | # cv2.imwrite(str(i)+'.jpg', img) 199 | cv2.waitKey(0) 200 | -------------------------------------------------------------------------------- /data/config.py: -------------------------------------------------------------------------------- 1 | # config.py 2 | 3 | # YOLOv2 with darknet-19 4 | yolov2_d19_cfg = { 5 | # network 6 | 'backbone': 'd19', 7 | # for multi-scale trick 8 | 'train_size': 640, 9 | 'val_size': 416, 10 | 'random_size_range': [10, 19], 11 | # anchor size 12 | 'anchor_size_voc': [[1.19, 1.98], [2.79, 4.59], [4.53, 8.92], [8.06, 5.29], [10.32, 10.65]], 13 | 'anchor_size_coco': [[0.53, 0.79], [1.71, 2.36], [2.89, 6.44], [6.33, 3.79], [9.03, 9.74]], 14 | # train 15 | 'lr_epoch': (150, 200), 16 | 'max_epoch': 250, 17 | 'ignore_thresh': 0.5 18 | } 19 | 20 | # YOLOv2 with resnet-50 21 | yolov2_r50_cfg = { 22 | # network 23 | 'backbone': 'r50', 24 | # for multi-scale trick 25 | 'train_size': 640, 26 | 'val_size': 416, 27 | 'random_size_range': [10, 19], 28 | # anchor size 29 | 'anchor_size_voc': [[1.19, 1.98], [2.79, 4.59], [4.53, 8.92], [8.06, 5.29], [10.32, 10.65]], 30 | 'anchor_size_coco': [[0.53, 0.79], [1.71, 2.36], [2.89, 6.44], [6.33, 3.79], [9.03, 9.74]], 31 | # train 32 | 'lr_epoch': (150, 200), 33 | 'max_epoch': 250, 34 | 'ignore_thresh': 0.5 35 | } 36 | 37 | # YOLOv3 / YOLOv3Spp 38 | yolov3_d53_cfg = { 39 | # network 40 | 'backbone': 'd53', 41 | # for multi-scale trick 42 | 'train_size': 640, 43 | 'val_size': 416, 44 | 'random_size_range': [10, 19], 45 | # anchor size 46 | 'anchor_size_voc': [[32.64, 47.68], [50.24, 108.16], [126.72, 96.32], 47 | [78.4, 201.92], [178.24, 178.56], [129.6, 294.72], 48 | [331.84, 194.56], [227.84, 325.76], [365.44, 358.72]], 49 | 'anchor_size_coco': [[12.48, 19.2], [31.36, 46.4],[46.4, 113.92], 50 | [97.28, 55.04], [133.12, 127.36], [79.04, 224.], 51 | [301.12, 150.4 ], [172.16, 285.76], [348.16, 341.12]], 52 | # train 53 | 'lr_epoch': (150, 200), 54 | 'max_epoch': 250, 55 | 'ignore_thresh': 0.5 56 | } 57 | 58 | # YOLOv3Tiny 59 | yolov3_tiny_cfg = { 60 | # network 61 | 'backbone': 'd-light', 62 | # for multi-scale trick 63 | 'train_size': 640, 64 | 'val_size': 416, 65 | 'random_size_range':[10, 19], 66 | # anchor size 67 | 'anchor_size_voc': [[34.01, 61.79], [86.94, 109.68], [93.49, 227.46], 68 | [246.38, 163.33], [178.68, 306.55], 
[344.89, 337.14]], 69 | 'anchor_size_coco': [[15.09, 23.25], [46.36, 61.47], [68.41, 161.84], 70 | [168.88, 93.59], [154.96, 257.45], [334.74, 302.47]], 71 | # train 72 | 'lr_epoch': (150, 200), 73 | 'max_epoch': 250, 74 | 'ignore_thresh': 0.5 75 | } 76 | -------------------------------------------------------------------------------- /data/scripts/COCO2017.sh: -------------------------------------------------------------------------------- 1 | mkdir COCO 2 | cd COCO 3 | 4 | wget http://images.cocodataset.org/zips/train2017.zip 5 | wget http://images.cocodataset.org/zips/val2017.zip 6 | wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip 7 | wget http://images.cocodataset.org/zips/test2017.zip 8 | wget http://images.cocodataset.org/annotations/image_info_test2017.zip  9 | 10 | unzip train2017.zip 11 | unzip val2017.zip 12 | unzip annotations_trainval2017.zip 13 | unzip test2017.zip 14 | unzip image_info_test2017.zip 15 | 16 | # rm -f train2017.zip 17 | # rm -f val2017.zip 18 | # rm -f annotations_trainval2017.zip 19 | # rm -f test2017.zip 20 | # rm -f image_info_test2017.zip 21 | -------------------------------------------------------------------------------- /data/scripts/VOC2007.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Ellis Brown 3 | 4 | start=`date +%s` 5 | 6 | # handle optional download dir 7 | if [ -z "$1" ] 8 | then 9 | # navigate to ~/data 10 | echo "navigating to ~/data/ ..." 11 | mkdir -p ~/data 12 | cd ~/data/ 13 | else 14 | # check if is valid directory 15 | if [ ! -d $1 ]; then 16 | echo $1 "is not a valid directory" 17 | exit 0 18 | fi 19 | echo "navigating to" $1 "..." 20 | cd $1 21 | fi 22 | 23 | echo "Downloading VOC2007 trainval ..." 24 | # Download the data. 25 | curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar 26 | echo "Downloading VOC2007 test data ..." 27 | curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar 28 | echo "Done downloading." 29 | 30 | # Extract data 31 | echo "Extracting trainval ..." 32 | tar -xvf VOCtrainval_06-Nov-2007.tar 33 | echo "Extracting test ..." 34 | tar -xvf VOCtest_06-Nov-2007.tar 35 | echo "removing tars ..." 36 | rm VOCtrainval_06-Nov-2007.tar 37 | rm VOCtest_06-Nov-2007.tar 38 | 39 | end=`date +%s` 40 | runtime=$((end-start)) 41 | 42 | echo "Completed in" $runtime "seconds" -------------------------------------------------------------------------------- /data/scripts/VOC2012.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Ellis Brown 3 | 4 | start=`date +%s` 5 | 6 | # handle optional download dir 7 | if [ -z "$1" ] 8 | then 9 | # navigate to ~/data 10 | echo "navigating to ~/data/ ..." 11 | mkdir -p ~/data 12 | cd ~/data/ 13 | else 14 | # check if is valid directory 15 | if [ ! -d $1 ]; then 16 | echo $1 "is not a valid directory" 17 | exit 0 18 | fi 19 | echo "navigating to" $1 "..." 20 | cd $1 21 | fi 22 | 23 | echo "Downloading VOC2012 trainval ..." 24 | # Download the data. 25 | curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar 26 | echo "Done downloading." 27 | 28 | 29 | # Extract data 30 | echo "Extracting trainval ..." 31 | tar -xvf VOCtrainval_11-May-2012.tar 32 | echo "removing tar ..." 
33 | rm VOCtrainval_11-May-2012.tar 34 | 35 | end=`date +%s` 36 | runtime=$((end-start)) 37 | 38 | echo "Completed in" $runtime "seconds" -------------------------------------------------------------------------------- /data/voc0712.py: -------------------------------------------------------------------------------- 1 | """VOC Dataset Classes 2 | 3 | Original author: Francisco Massa 4 | https://github.com/fmassa/vision/blob/voc_dataset/torchvision/datasets/voc.py 5 | 6 | Updated by: Ellis Brown, Max deGroot 7 | """ 8 | import os.path as osp 9 | import sys 10 | import torch 11 | import torch.utils.data as data 12 | import cv2 13 | import numpy as np 14 | import random 15 | import xml.etree.ElementTree as ET 16 | 17 | 18 | VOC_CLASSES = ( # always index 0 19 | 'aeroplane', 'bicycle', 'bird', 'boat', 20 | 'bottle', 'bus', 'car', 'cat', 'chair', 21 | 'cow', 'diningtable', 'dog', 'horse', 22 | 'motorbike', 'person', 'pottedplant', 23 | 'sheep', 'sofa', 'train', 'tvmonitor') 24 | 25 | 26 | class VOCAnnotationTransform(object): 27 | """Transforms a VOC annotation into a Tensor of bbox coords and label index 28 | Initilized with a dictionary lookup of classnames to indexes 29 | 30 | Arguments: 31 | class_to_ind (dict, optional): dictionary lookup of classnames -> indexes 32 | (default: alphabetic indexing of VOC's 20 classes) 33 | keep_difficult (bool, optional): keep difficult instances or not 34 | (default: False) 35 | height (int): height 36 | width (int): width 37 | """ 38 | 39 | def __init__(self, class_to_ind=None, keep_difficult=False): 40 | self.class_to_ind = class_to_ind or dict( 41 | zip(VOC_CLASSES, range(len(VOC_CLASSES)))) 42 | self.keep_difficult = keep_difficult 43 | 44 | def __call__(self, target, width, height): 45 | """ 46 | Arguments: 47 | target (annotation) : the target annotation to be made usable 48 | will be an ET.Element 49 | Returns: 50 | a list containing lists of bounding boxes [bbox coords, class name] 51 | """ 52 | res = [] 53 | for obj in target.iter('object'): 54 | difficult = int(obj.find('difficult').text) == 1 55 | if not self.keep_difficult and difficult: 56 | continue 57 | name = obj.find('name').text.lower().strip() 58 | bbox = obj.find('bndbox') 59 | 60 | pts = ['xmin', 'ymin', 'xmax', 'ymax'] 61 | bndbox = [] 62 | for i, pt in enumerate(pts): 63 | cur_pt = int(bbox.find(pt).text) - 1 64 | # scale height or width 65 | cur_pt = cur_pt / width if i % 2 == 0 else cur_pt / height 66 | bndbox.append(cur_pt) 67 | label_idx = self.class_to_ind[name] 68 | bndbox.append(label_idx) 69 | res += [bndbox] # [xmin, ymin, xmax, ymax, label_ind] 70 | # img_id = target.find('filename').text[:-4] 71 | 72 | return res # [[xmin, ymin, xmax, ymax, label_ind], ... ] 73 | 74 | 75 | class VOCDetection(data.Dataset): 76 | """VOC Detection Dataset Object 77 | 78 | input is image, target is annotation 79 | 80 | Arguments: 81 | root (string): filepath to VOCdevkit folder. 82 | image_set (string): imageset to use (eg. 
'train', 'val', 'test') 83 | transform (callable, optional): transformation to perform on the 84 | input image 85 | target_transform (callable, optional): transformation to perform on the 86 | target `annotation` 87 | (eg: take in caption string, return tensor of word indices) 88 | dataset_name (string, optional): which dataset to load 89 | (default: 'VOC2007') 90 | """ 91 | 92 | def __init__(self, 93 | data_dir=None, 94 | image_sets=[('2007', 'trainval'), ('2012', 'trainval')], 95 | transform=None, 96 | target_transform=VOCAnnotationTransform(), 97 | dataset_name='VOC0712'): 98 | self.root = data_dir 99 | self.image_set = image_sets 100 | self.transform = transform 101 | self.target_transform = target_transform 102 | self.name = dataset_name 103 | self._annopath = osp.join('%s', 'Annotations', '%s.xml') 104 | self._imgpath = osp.join('%s', 'JPEGImages', '%s.jpg') 105 | self.ids = list() 106 | for (year, name) in image_sets: 107 | rootpath = osp.join(self.root, 'VOC' + year) 108 | for line in open(osp.join(rootpath, 'ImageSets', 'Main', name + '.txt')): 109 | self.ids.append((rootpath, line.strip())) 110 | 111 | 112 | def __getitem__(self, index): 113 | im, gt, h, w = self.pull_item(index) 114 | 115 | return im, gt 116 | 117 | 118 | def __len__(self): 119 | return len(self.ids) 120 | 121 | 122 | def pull_item(self, index): 123 | # load an image 124 | img_id = self.ids[index] 125 | img = cv2.imread(self._imgpath % img_id) 126 | height, width, channels = img.shape 127 | 128 | # load a target 129 | target = ET.parse(self._annopath % img_id).getroot() 130 | if self.target_transform is not None: 131 | target = self.target_transform(target, width, height) 132 | 133 | # check target 134 | if len(target) == 0: 135 | target = np.zeros([1, 5]) 136 | else: 137 | target = np.array(target) 138 | # transform 139 | if self.transform is not None: 140 | img, boxes, labels = self.transform(img, target[:, :4], target[:, 4]) 141 | # to rgb 142 | img = img[:, :, (2, 1, 0)] 143 | # to tensor 144 | img = torch.from_numpy(img).permute(2, 0, 1).float() 145 | # target 146 | target = np.hstack((boxes, np.expand_dims(labels, axis=1))) 147 | 148 | return img, target, height, width 149 | 150 | 151 | def pull_image(self, index): 152 | '''Returns the original image object at index in PIL form 153 | 154 | Note: not using self.__getitem__(), as any transformations passed in 155 | could mess up this functionality. 156 | 157 | Argument: 158 | index (int): index of img to show 159 | Return: 160 | PIL img 161 | ''' 162 | img_id = self.ids[index] 163 | return cv2.imread(self._imgpath % img_id, cv2.IMREAD_COLOR), img_id 164 | 165 | 166 | def pull_anno(self, index): 167 | '''Returns the original annotation of image at index 168 | 169 | Note: not using self.__getitem__(), as any transformations passed in 170 | could mess up this functionality. 171 | 172 | Argument: 173 | index (int): index of img to get annotation of 174 | Return: 175 | list: [img_id, [(label, bbox coords),...]] 176 | eg: ('001718', [('dog', (96, 13, 438, 332))]) 177 | ''' 178 | img_id = self.ids[index] 179 | anno = ET.parse(self._annopath % img_id).getroot() 180 | gt = self.target_transform(anno, 1, 1) 181 | return img_id[1], gt 182 | 183 | 184 | def pull_tensor(self, index): 185 | '''Returns the original image at an index in tensor form 186 | 187 | Note: not using self.__getitem__(), as any transformations passed in 188 | could mess up this functionality. 
189 | 190 | Argument: 191 | index (int): index of img to show 192 | Return: 193 | tensorized version of img, squeezed 194 | ''' 195 | return torch.Tensor(self.pull_image(index)).unsqueeze_(0) 196 | 197 | 198 | if __name__ == "__main__": 199 | def base_transform(image, size, mean): 200 | x = cv2.resize(image, (size, size)).astype(np.float32) 201 | x -= mean 202 | x = x.astype(np.float32) 203 | return x 204 | 205 | class BaseTransform: 206 | def __init__(self, size, mean): 207 | self.size = size 208 | self.mean = np.array(mean, dtype=np.float32) 209 | 210 | def __call__(self, image, boxes=None, labels=None): 211 | return base_transform(image, self.size, self.mean), boxes, labels 212 | 213 | img_size = 640 214 | # dataset 215 | dataset = VOCDetection(data_dir='/mnt/share/ssd2/dataset/VOCdevkit/', 216 | image_sets=[('2007', 'trainval')], 217 | transform=BaseTransform(img_size, (0, 0, 0))) 218 | for i in range(1000): 219 | im, gt, h, w = dataset.pull_item(i) 220 | img = im.permute(1,2,0).numpy()[:, :, (2, 1, 0)].astype(np.uint8) 221 | img = img.copy() 222 | for box in gt: 223 | xmin, ymin, xmax, ymax, _ = box 224 | xmin *= img_size 225 | ymin *= img_size 226 | xmax *= img_size 227 | ymax *= img_size 228 | img = cv2.rectangle(img, (int(xmin), int(ymin)), (int(xmax), int(ymax)), (0,0,255), 2) 229 | cv2.imshow('gt', img) 230 | cv2.waitKey(0) 231 | -------------------------------------------------------------------------------- /demo.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import numpy as np 4 | import cv2 5 | import time 6 | import torch 7 | from data.coco2017 import coco_class_index, coco_class_labels 8 | from data import config, BaseTransform 9 | 10 | 11 | 12 | def parse_args(): 13 | parser = argparse.ArgumentParser(description='YOLO Demo Detection') 14 | # basic 15 | parser.add_argument('--mode', default='image', 16 | type=str, help='Use the data from image, video or camera') 17 | parser.add_argument('-size', '--input_size', default=416, type=int, 18 | help='input_size') 19 | parser.add_argument('--cuda', action='store_true', default=False, 20 | help='Use cuda') 21 | parser.add_argument('--path_to_img', default='data/demo/images/', 22 | type=str, help='The path to image files') 23 | parser.add_argument('--path_to_vid', default='data/demo/videos/', 24 | type=str, help='The path to video files') 25 | parser.add_argument('--path_to_save', default='det_results/', 26 | type=str, help='The path to save the detection results') 27 | parser.add_argument('-vs', '--visual_threshold', default=0.3, 28 | type=float, help='visual threshold') 29 | # model 30 | parser.add_argument('-v', '--version', default='yolo_v2', 31 | help='yolov2_d19, yolov2_r50, yolov2_slim, yolov3, yolov3_spp, yolov3_tiny') 32 | parser.add_argument('--conf_thresh', default=0.1, type=float, 33 | help='NMS threshold') 34 | parser.add_argument('--nms_thresh', default=0.45, type=float, 35 | help='NMS threshold') 36 | parser.add_argument('--trained_model', default='weights/', 37 | type=str, help='Trained state_dict file path to open') 38 | 39 | return parser.parse_args() 40 | 41 | 42 | def plot_bbox_labels(img, bbox, label, cls_color, test_scale=0.4): 43 | x1, y1, x2, y2 = bbox 44 | x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2) 45 | t_size = cv2.getTextSize(label, 0, fontScale=1, thickness=2)[0] 46 | # plot bbox 47 | cv2.rectangle(img, (x1, y1), (x2, y2), cls_color, 2) 48 | # plot title bbox 49 | cv2.rectangle(img, (x1, y1-t_size[1]), (int(x1 + t_size[0] 
* test_scale), y1), cls_color, -1) 50 | # put the test on the title bbox 51 | cv2.putText(img, label, (int(x1), int(y1 - 5)), 0, test_scale, (0, 0, 0), 1, lineType=cv2.LINE_AA) 52 | 53 | return img 54 | 55 | 56 | def visualize(img, bboxes, scores, cls_inds, class_colors, vis_thresh=0.3): 57 | ts = 0.4 58 | for i, bbox in enumerate(bboxes): 59 | if scores[i] > vis_thresh: 60 | cls_color = class_colors[int(cls_inds[i])] 61 | cls_id = coco_class_index[int(cls_inds[i])] 62 | mess = '%s: %.2f' % (coco_class_labels[cls_id], scores[i]) 63 | img = plot_bbox_labels(img, bbox, mess, cls_color, test_scale=ts) 64 | 65 | return img 66 | 67 | 68 | def detect(net, 69 | device, 70 | transform, 71 | vis_thresh, 72 | mode='image', 73 | path_to_img=None, 74 | path_to_vid=None, 75 | path_to_save=None): 76 | # class color 77 | class_colors = [(np.random.randint(255), 78 | np.random.randint(255), 79 | np.random.randint(255)) for _ in range(80)] 80 | save_path = os.path.join(path_to_save, mode) 81 | os.makedirs(save_path, exist_ok=True) 82 | 83 | # ------------------------- Camera ---------------------------- 84 | if mode == 'camera': 85 | print('use camera !!!') 86 | cap = cv2.VideoCapture(0, cv2.CAP_DSHOW) 87 | while True: 88 | ret, frame = cap.read() 89 | if ret: 90 | if cv2.waitKey(1) == ord('q'): 91 | break 92 | img_h, img_w = frame.shape[:2] 93 | scale = np.array([[img_w, img_h, img_w, img_h]]) 94 | 95 | # prepare 96 | x = torch.from_numpy(transform(frame)[0][:, :, ::-1]).permute(2, 0, 1) 97 | x = x.unsqueeze(0).to(device) 98 | # inference 99 | t0 = time.time() 100 | bboxes, scores, cls_inds = net(x) 101 | t1 = time.time() 102 | print("detection time used ", t1-t0, "s") 103 | 104 | # rescale 105 | bboxes *= scale 106 | 107 | frame_processed = visualize(img=frame, 108 | bboxes=bboxes, 109 | scores=scores, 110 | cls_inds=cls_inds, 111 | class_colors=class_colors, 112 | vis_thresh=vis_thresh) 113 | cv2.imshow('detection result', frame_processed) 114 | cv2.waitKey(1) 115 | else: 116 | break 117 | cap.release() 118 | cv2.destroyAllWindows() 119 | 120 | # ------------------------- Image ---------------------------- 121 | elif mode == 'image': 122 | for i, img_id in enumerate(os.listdir(path_to_img)): 123 | img = cv2.imread(path_to_img + '/' + img_id, cv2.IMREAD_COLOR) 124 | img_h, img_w = img.shape[:2] 125 | scale = np.array([[img_w, img_h, img_w, img_h]]) 126 | 127 | # prepare 128 | x = torch.from_numpy(transform(img)[0][:, :, ::-1]).permute(2, 0, 1) 129 | x = x.unsqueeze(0).to(device) 130 | # inference 131 | t0 = time.time() 132 | bboxes, scores, cls_inds = net(x) 133 | t1 = time.time() 134 | print("detection time used ", t1-t0, "s") 135 | 136 | # rescale 137 | bboxes *= scale 138 | 139 | img_processed = visualize(img=img, 140 | bboxes=bboxes, 141 | scores=scores, 142 | cls_inds=cls_inds, 143 | class_colors=class_colors, 144 | vis_thresh=vis_thresh) 145 | 146 | cv2.imshow('detection', img_processed) 147 | cv2.imwrite(os.path.join(save_path, str(i).zfill(6)+'.jpg'), img_processed) 148 | cv2.waitKey(0) 149 | 150 | # ------------------------- Video --------------------------- 151 | elif mode == 'video': 152 | video = cv2.VideoCapture(path_to_vid) 153 | fourcc = cv2.VideoWriter_fourcc(*'XVID') 154 | save_size = (640, 480) 155 | save_path = os.path.join(save_path, 'det.avi') 156 | fps = 15.0 157 | out = cv2.VideoWriter(save_path, fourcc, fps, save_size) 158 | 159 | while(True): 160 | ret, frame = video.read() 161 | 162 | if ret: 163 | # ------------------------- Detection --------------------------- 164 | img_h, 
img_w = frame.shape[:2] 165 | scale = np.array([[img_w, img_h, img_w, img_h]]) 166 | # prepare 167 | x = torch.from_numpy(transform(frame)[0][:, :, ::-1]).permute(2, 0, 1) 168 | x = x.unsqueeze(0).to(device) 169 | # inference 170 | t0 = time.time() 171 | bboxes, scores, cls_inds = net(x) 172 | t1 = time.time() 173 | print("detection time used ", t1-t0, "s") 174 | 175 | # rescale 176 | bboxes *= scale 177 | 178 | frame_processed = visualize(img=frame, 179 | bboxes=bboxes, 180 | scores=scores, 181 | cls_inds=cls_inds, 182 | class_colors=class_colors, 183 | vis_thresh=vis_thresh) 184 | 185 | frame_processed_resize = cv2.resize(frame_processed, save_size) 186 | out.write(frame_processed_resize) 187 | cv2.imshow('detection', frame_processed) 188 | cv2.waitKey(1) 189 | else: 190 | break 191 | video.release() 192 | out.release() 193 | cv2.destroyAllWindows() 194 | 195 | 196 | def run(): 197 | args = parse_args() 198 | 199 | # use cuda 200 | if args.cuda: 201 | device = torch.device("cuda") 202 | else: 203 | device = torch.device("cpu") 204 | 205 | # model 206 | model_name = args.version 207 | print('Model: ', model_name) 208 | 209 | # load model and config file 210 | if model_name == 'yolov2_d19': 211 | from models.yolov2_d19 import YOLOv2D19 as yolo_net 212 | cfg = config.yolov2_d19_cfg 213 | 214 | elif model_name == 'yolov2_r50': 215 | from models.yolov2_r50 import YOLOv2R50 as yolo_net 216 | cfg = config.yolov2_r50_cfg 217 | 218 | elif model_name == 'yolov2_slim': 219 | from models.yolov2_slim import YOLOv2Slim as yolo_net 220 | cfg = config.yolov2_slim_cfg 221 | 222 | elif model_name == 'yolov3': 223 | from models.yolov3 import YOLOv3 as yolo_net 224 | cfg = config.yolov3_d53_cfg 225 | 226 | elif model_name == 'yolov3_spp': 227 | from models.yolov3_spp import YOLOv3Spp as yolo_net 228 | cfg = config.yolov3_d53_cfg 229 | 230 | elif model_name == 'yolov3_tiny': 231 | from models.yolov3_tiny import YOLOv3tiny as yolo_net 232 | cfg = config.yolov3_tiny_cfg 233 | else: 234 | print('Unknown model name...') 235 | exit(0) 236 | 237 | input_size = [args.input_size, args.input_size] 238 | 239 | # build model 240 | anchor_size = cfg['anchor_size_coco'] 241 | net = yolo_net(device=device, 242 | input_size=input_size, 243 | num_classes=80, 244 | trainable=False, 245 | conf_thresh=args.conf_thresh, 246 | nms_thresh=args.nms_thresh, 247 | anchor_size=anchor_size) 248 | 249 | # load weight 250 | net.load_state_dict(torch.load(args.trained_model, map_location=device)) 251 | net.to(device).eval() 252 | print('Finished loading model!') 253 | 254 | # run 255 | detect(net=net, 256 | device=device, 257 | transform=BaseTransform(input_size), 258 | mode=args.mode, 259 | path_to_img=args.path_to_img, 260 | path_to_vid=args.path_to_vid, 261 | path_to_save=args.path_to_save, 262 | thresh=args.visual_threshold 263 | ) 264 | 265 | 266 | if __name__ == '__main__': 267 | run() 268 | -------------------------------------------------------------------------------- /eval.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import torch 4 | 5 | from utils.vocapi_evaluator import VOCAPIEvaluator 6 | from utils.cocoapi_evaluator import COCOAPIEvaluator 7 | from data import BaseTransform, config 8 | 9 | 10 | 11 | parser = argparse.ArgumentParser(description='YOLO Detector Evaluation') 12 | parser.add_argument('-v', '--version', default='yolo_v2', 13 | help='yolov2_d19, yolov2_r50, yolov2_slim, yolov3, yolov3_spp, yolov3_tiny') 14 | 
parser.add_argument('--trained_model', type=str, default='weights/', 15 | help='Trained state_dict file path to open') 16 | parser.add_argument('-size', '--input_size', default=416, type=int, 17 | help='input_size') 18 | parser.add_argument('--cuda', action='store_true', default=False, 19 | help='Use cuda') 20 | # dataset 21 | parser.add_argument('--root', default='/mnt/share/ssd2/dataset', 22 | help='data root') 23 | parser.add_argument('-d', '--dataset', default='coco-val', 24 | help='voc, coco-val, coco-test.') 25 | 26 | args = parser.parse_args() 27 | 28 | 29 | 30 | def voc_test(model, data_dir, device, input_size): 31 | evaluator = VOCAPIEvaluator(data_root=data_dir, 32 | img_size=input_size, 33 | device=device, 34 | transform=BaseTransform(input_size), 35 | display=True) 36 | 37 | # VOC evaluation 38 | evaluator.evaluate(model) 39 | 40 | 41 | def coco_test(model, data_dir, device, input_size, test=False): 42 | if test: 43 | # test-dev 44 | print('test on test-dev 2017') 45 | evaluator = COCOAPIEvaluator( 46 | data_dir=data_dir, 47 | img_size=input_size, 48 | device=device, 49 | testset=True, 50 | transform=BaseTransform(input_size) 51 | ) 52 | 53 | else: 54 | # eval 55 | evaluator = COCOAPIEvaluator( 56 | data_dir=data_dir, 57 | img_size=input_size, 58 | device=device, 59 | testset=False, 60 | transform=BaseTransform(input_size) 61 | ) 62 | 63 | # COCO evaluation 64 | evaluator.evaluate(model) 65 | 66 | 67 | if __name__ == '__main__': 68 | # dataset 69 | if args.dataset == 'voc': 70 | print('eval on voc ...') 71 | num_classes = 20 72 | data_dir = os.path.join(args.root, 'VOCdevkit') 73 | elif args.dataset == 'coco-val': 74 | print('eval on coco-val ...') 75 | num_classes = 80 76 | data_dir = os.path.join(args.root, 'COCO') 77 | elif args.dataset == 'coco-test': 78 | print('eval on coco-test-dev ...') 79 | num_classes = 80 80 | data_dir = os.path.join(args.root, 'COCO') 81 | else: 82 | print('unknow dataset !! 
we only support voc, coco-val, coco-test !!!') 83 | exit(0) 84 | 85 | # cuda 86 | if args.cuda: 87 | print('use cuda') 88 | torch.backends.cudnn.benchmark = True 89 | device = torch.device("cuda") 90 | else: 91 | device = torch.device("cpu") 92 | 93 | 94 | # model 95 | model_name = args.version 96 | print('Model: ', model_name) 97 | 98 | # load model and config file 99 | if model_name == 'yolov2_d19': 100 | from models.yolov2_d19 import YOLOv2D19 as yolo_net 101 | cfg = config.yolov2_d19_cfg 102 | 103 | elif model_name == 'yolov2_r50': 104 | from models.yolov2_r50 import YOLOv2R50 as yolo_net 105 | cfg = config.yolov2_r50_cfg 106 | 107 | elif model_name == 'yolov2_slim': 108 | from models.yolov2_slim import YOLOv2Slim as yolo_net 109 | cfg = config.yolov2_slim_cfg 110 | 111 | elif model_name == 'yolov3': 112 | from models.yolov3 import YOLOv3 as yolo_net 113 | cfg = config.yolov3_d53_cfg 114 | 115 | elif model_name == 'yolov3_spp': 116 | from models.yolov3_spp import YOLOv3Spp as yolo_net 117 | cfg = config.yolov3_d53_cfg 118 | 119 | elif model_name == 'yolov3_tiny': 120 | from models.yolov3_tiny import YOLOv3tiny as yolo_net 121 | cfg = config.yolov3_tiny_cfg 122 | else: 123 | print('Unknown model name...') 124 | exit(0) 125 | 126 | # input size 127 | input_size = args.input_size 128 | 129 | # build model 130 | anchor_size = cfg['anchor_size_voc'] if args.dataset == 'voc' else cfg['anchor_size_coco'] 131 | net = yolo_net(device=device, 132 | input_size=input_size, 133 | num_classes=num_classes, 134 | trainable=False, 135 | anchor_size=anchor_size) 136 | 137 | # load net 138 | net.load_state_dict(torch.load(args.trained_model, map_location='cuda')) 139 | net.eval() 140 | print('Finished loading model!') 141 | net = net.to(device) 142 | 143 | # evaluation 144 | with torch.no_grad(): 145 | if args.dataset == 'voc': 146 | voc_test(net, data_dir, device, input_size) 147 | elif args.dataset == 'coco-val': 148 | coco_test(net, data_dir, device, input_size, test=False) 149 | elif args.dataset == 'coco-test': 150 | coco_test(net, data_dir, device, input_size, test=True) 151 | -------------------------------------------------------------------------------- /img_file/darknet_tiny.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yjh0410/yolov2-yolov3_PyTorch/49dbf4bbcaba6bcf3f81de3ce0a85ec592f618ee/img_file/darknet_tiny.png -------------------------------------------------------------------------------- /models/__pycache__/yolo_anchor.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yjh0410/yolov2-yolov3_PyTorch/49dbf4bbcaba6bcf3f81de3ce0a85ec592f618ee/models/__pycache__/yolo_anchor.cpython-36.pyc -------------------------------------------------------------------------------- /models/__pycache__/yolo_anchor_ms.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yjh0410/yolov2-yolov3_PyTorch/49dbf4bbcaba6bcf3f81de3ce0a85ec592f618ee/models/__pycache__/yolo_anchor_ms.cpython-36.pyc -------------------------------------------------------------------------------- /models/__pycache__/yolo_fusion.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yjh0410/yolov2-yolov3_PyTorch/49dbf4bbcaba6bcf3f81de3ce0a85ec592f618ee/models/__pycache__/yolo_fusion.cpython-36.pyc 
-------------------------------------------------------------------------------- /models/__pycache__/yolo_kitti.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yjh0410/yolov2-yolov3_PyTorch/49dbf4bbcaba6bcf3f81de3ce0a85ec592f618ee/models/__pycache__/yolo_kitti.cpython-36.pyc -------------------------------------------------------------------------------- /models/__pycache__/yolo_light.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yjh0410/yolov2-yolov3_PyTorch/49dbf4bbcaba6bcf3f81de3ce0a85ec592f618ee/models/__pycache__/yolo_light.cpython-36.pyc -------------------------------------------------------------------------------- /models/__pycache__/yolo_mobile.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yjh0410/yolov2-yolov3_PyTorch/49dbf4bbcaba6bcf3f81de3ce0a85ec592f618ee/models/__pycache__/yolo_mobile.cpython-36.pyc -------------------------------------------------------------------------------- /models/__pycache__/yolo_msf.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yjh0410/yolov2-yolov3_PyTorch/49dbf4bbcaba6bcf3f81de3ce0a85ec592f618ee/models/__pycache__/yolo_msf.cpython-36.pyc -------------------------------------------------------------------------------- /models/__pycache__/yolo_v1.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yjh0410/yolov2-yolov3_PyTorch/49dbf4bbcaba6bcf3f81de3ce0a85ec592f618ee/models/__pycache__/yolo_v1.cpython-36.pyc -------------------------------------------------------------------------------- /models/__pycache__/yolo_v1_ms.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yjh0410/yolov2-yolov3_PyTorch/49dbf4bbcaba6bcf3f81de3ce0a85ec592f618ee/models/__pycache__/yolo_v1_ms.cpython-36.pyc -------------------------------------------------------------------------------- /models/__pycache__/yolo_v2.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yjh0410/yolov2-yolov3_PyTorch/49dbf4bbcaba6bcf3f81de3ce0a85ec592f618ee/models/__pycache__/yolo_v2.cpython-36.pyc -------------------------------------------------------------------------------- /models/__pycache__/yolo_v2.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yjh0410/yolov2-yolov3_PyTorch/49dbf4bbcaba6bcf3f81de3ce0a85ec592f618ee/models/__pycache__/yolo_v2.cpython-37.pyc -------------------------------------------------------------------------------- /models/yolov2_d19.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | from utils.modules import Conv, reorg_layer 5 | 6 | from backbone import build_backbone 7 | import tools 8 | 9 | 10 | class YOLOv2D19(nn.Module): 11 | def __init__(self, device, input_size=None, num_classes=20, trainable=False, conf_thresh=0.001, nms_thresh=0.5, anchor_size=None): 12 | super(YOLOv2D19, self).__init__() 13 | self.device = device 14 | self.input_size = input_size 15 | self.num_classes = num_classes 16 | self.trainable = 
trainable 17 | self.conf_thresh = conf_thresh 18 | self.nms_thresh = nms_thresh 19 | self.anchor_size = torch.tensor(anchor_size) 20 | self.num_anchors = len(anchor_size) 21 | self.stride = 32 22 | self.grid_cell, self.all_anchor_wh = self.create_grid(input_size) 23 | 24 | # backbone darknet-19 25 | self.backbone = build_backbone(model_name='darknet19', pretrained=trainable) 26 | 27 | # detection head 28 | self.convsets_1 = nn.Sequential( 29 | Conv(1024, 1024, k=3, p=1), 30 | Conv(1024, 1024, k=3, p=1) 31 | ) 32 | 33 | self.route_layer = Conv(512, 64, k=1) 34 | self.reorg = reorg_layer(stride=2) 35 | 36 | self.convsets_2 = Conv(1280, 1024, k=3, p=1) 37 | 38 | # prediction layer 39 | self.pred = nn.Conv2d(1024, self.num_anchors*(1 + 4 + self.num_classes), kernel_size=1) 40 | 41 | 42 | def create_grid(self, input_size): 43 | w, h = input_size, input_size 44 | # generate grid cells 45 | ws, hs = w // self.stride, h // self.stride 46 | grid_y, grid_x = torch.meshgrid([torch.arange(hs), torch.arange(ws)]) 47 | grid_xy = torch.stack([grid_x, grid_y], dim=-1).float() 48 | grid_xy = grid_xy.view(1, hs*ws, 1, 2).to(self.device) 49 | 50 | # generate anchor_wh tensor 51 | anchor_wh = self.anchor_size.repeat(hs*ws, 1, 1).unsqueeze(0).to(self.device) 52 | 53 | return grid_xy, anchor_wh 54 | 55 | 56 | def set_grid(self, input_size): 57 | self.input_size = input_size 58 | self.grid_cell, self.all_anchor_wh = self.create_grid(input_size) 59 | 60 | 61 | def decode_xywh(self, txtytwth_pred): 62 | """ 63 | Input: \n 64 | txtytwth_pred : [B, H*W, anchor_n, 4] \n 65 | Output: \n 66 | xywh_pred : [B, H*W*anchor_n, 4] \n 67 | """ 68 | B, HW, ab_n, _ = txtytwth_pred.size() 69 | # b_x = sigmoid(tx) + gride_x 70 | # b_y = sigmoid(ty) + gride_y 71 | xy_pred = torch.sigmoid(txtytwth_pred[..., :2]) + self.grid_cell 72 | # b_w = anchor_w * exp(tw) 73 | # b_h = anchor_h * exp(th) 74 | wh_pred = torch.exp(txtytwth_pred[..., 2:]) * self.all_anchor_wh 75 | # [B, H*W, anchor_n, 4] -> [B, H*W*anchor_n, 4] 76 | xywh_pred = torch.cat([xy_pred, wh_pred], -1).view(B, -1, 4) * self.stride 77 | 78 | return xywh_pred 79 | 80 | 81 | def decode_boxes(self, txtytwth_pred): 82 | """ 83 | Input: \n 84 | txtytwth_pred : [B, H*W, anchor_n, 4] \n 85 | Output: \n 86 | x1y1x2y2_pred : [B, H*W*anchor_n, 4] \n 87 | """ 88 | # txtytwth -> cxcywh 89 | xywh_pred = self.decode_xywh(txtytwth_pred) 90 | 91 | # cxcywh -> x1y1x2y2 92 | x1y1x2y2_pred = torch.zeros_like(xywh_pred) 93 | x1y1_pred = xywh_pred[..., :2] - xywh_pred[..., 2:] * 0.5 94 | x2y2_pred = xywh_pred[..., :2] + xywh_pred[..., 2:] * 0.5 95 | x1y1x2y2_pred = torch.cat([x1y1_pred, x2y2_pred], dim=-1) 96 | 97 | return x1y1x2y2_pred 98 | 99 | 100 | def nms(self, dets, scores): 101 | """"Pure Python NMS baseline.""" 102 | x1 = dets[:, 0] #xmin 103 | y1 = dets[:, 1] #ymin 104 | x2 = dets[:, 2] #xmax 105 | y2 = dets[:, 3] #ymax 106 | 107 | areas = (x2 - x1) * (y2 - y1) 108 | order = scores.argsort()[::-1] 109 | 110 | keep = [] 111 | while order.size > 0: 112 | i = order[0] 113 | keep.append(i) 114 | xx1 = np.maximum(x1[i], x1[order[1:]]) 115 | yy1 = np.maximum(y1[i], y1[order[1:]]) 116 | xx2 = np.minimum(x2[i], x2[order[1:]]) 117 | yy2 = np.minimum(y2[i], y2[order[1:]]) 118 | 119 | w = np.maximum(1e-10, xx2 - xx1) 120 | h = np.maximum(1e-10, yy2 - yy1) 121 | inter = w * h 122 | 123 | # Cross Area / (bbox + particular area - Cross Area) 124 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 125 | #reserve all the boundingbox whose ovr less than thresh 126 | inds = np.where(ovr <= 
self.nms_thresh)[0] 127 | order = order[inds + 1] 128 | 129 | return keep 130 | 131 | 132 | def postprocess(self, bboxes, scores): 133 | """ 134 | bboxes: (HxW, 4), bsize = 1 135 | scores: (HxW, num_classes), bsize = 1 136 | """ 137 | 138 | cls_inds = np.argmax(scores, axis=1) 139 | scores = scores[(np.arange(scores.shape[0]), cls_inds)] 140 | 141 | # threshold 142 | keep = np.where(scores >= self.conf_thresh) 143 | bboxes = bboxes[keep] 144 | scores = scores[keep] 145 | cls_inds = cls_inds[keep] 146 | 147 | # NMS 148 | keep = np.zeros(len(bboxes), dtype=np.int) 149 | for i in range(self.num_classes): 150 | inds = np.where(cls_inds == i)[0] 151 | if len(inds) == 0: 152 | continue 153 | c_bboxes = bboxes[inds] 154 | c_scores = scores[inds] 155 | c_keep = self.nms(c_bboxes, c_scores) 156 | keep[inds[c_keep]] = 1 157 | 158 | keep = np.where(keep > 0) 159 | bboxes = bboxes[keep] 160 | scores = scores[keep] 161 | cls_inds = cls_inds[keep] 162 | 163 | return bboxes, scores, cls_inds 164 | 165 | 166 | @ torch.no_grad() 167 | def inference(self, x): 168 | # backbone 169 | feats = self.backbone(x) 170 | 171 | # reorg layer 172 | p5 = self.convsets_1(feats['layer3']) 173 | p4 = self.reorg(self.route_layer(feats['layer2'])) 174 | p5 = torch.cat([p4, p5], dim=1) 175 | 176 | # head 177 | p5 = self.convsets_2(p5) 178 | 179 | # pred 180 | pred = self.pred(p5) 181 | 182 | B, abC, H, W = pred.size() 183 | 184 | # [B, num_anchor * C, H, W] -> [B, H, W, num_anchor * C] -> [B, H*W, num_anchor*C] 185 | pred = pred.permute(0, 2, 3, 1).contiguous().view(B, H*W, abC) 186 | 187 | # [B, H*W*num_anchor, 1] 188 | conf_pred = pred[:, :, :1 * self.num_anchors].contiguous().view(B, H*W*self.num_anchors, 1) 189 | # [B, H*W, num_anchor, num_cls] 190 | cls_pred = pred[:, :, 1 * self.num_anchors : (1 + self.num_classes) * self.num_anchors].contiguous().view(B, H*W*self.num_anchors, self.num_classes) 191 | # [B, H*W, num_anchor, 4] 192 | reg_pred = pred[:, :, (1 + self.num_classes) * self.num_anchors:].contiguous() 193 | # decode box 194 | reg_pred = reg_pred.view(B, H*W, self.num_anchors, 4) 195 | box_pred = self.decode_boxes(reg_pred) 196 | 197 | # batch size = 1 198 | conf_pred = conf_pred[0] 199 | cls_pred = cls_pred[0] 200 | box_pred = box_pred[0] 201 | 202 | # score 203 | scores = torch.sigmoid(conf_pred) * torch.softmax(cls_pred, dim=-1) 204 | 205 | # normalize bbox 206 | bboxes = torch.clamp(box_pred / self.input_size, 0., 1.) 
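        # Note: `scores` above is sigmoid(objectness) * softmax(class probabilities),
        # and `bboxes` has been clamped to [0, 1] after dividing by the input size,
        # so the caller rescales back to pixel coordinates (as demo.py in this repo does):
        #   scale = np.array([[img_w, img_h, img_w, img_h]])
        #   bboxes *= scale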
207 | 208 | # to cpu 209 | scores = scores.to('cpu').numpy() 210 | bboxes = bboxes.to('cpu').numpy() 211 | 212 | # post-process 213 | bboxes, scores, cls_inds = self.postprocess(bboxes, scores) 214 | 215 | return bboxes, scores, cls_inds 216 | 217 | 218 | def forward(self, x, target=None): 219 | if not self.trainable: 220 | return self.inference(x) 221 | else: 222 | # backbone 223 | feats = self.backbone(x) 224 | 225 | # reorg layer 226 | p5 = self.convsets_1(feats['layer3']) 227 | p4 = self.reorg(self.route_layer(feats['layer2'])) 228 | p5 = torch.cat([p4, p5], dim=1) 229 | 230 | # head 231 | p5 = self.convsets_2(p5) 232 | 233 | # pred 234 | pred = self.pred(p5) 235 | 236 | B, abC, H, W = pred.size() 237 | 238 | # [B, num_anchor * C, H, W] -> [B, H, W, num_anchor * C] -> [B, H*W, num_anchor*C] 239 | pred = pred.permute(0, 2, 3, 1).contiguous().view(B, H*W, abC) 240 | 241 | # [B, H*W*num_anchor, 1] 242 | conf_pred = pred[:, :, :1 * self.num_anchors].contiguous().view(B, H*W*self.num_anchors, 1) 243 | # [B, H*W, num_anchor, num_cls] 244 | cls_pred = pred[:, :, 1 * self.num_anchors : (1 + self.num_classes) * self.num_anchors].contiguous().view(B, H*W*self.num_anchors, self.num_classes) 245 | # [B, H*W, num_anchor, 4] 246 | reg_pred = pred[:, :, (1 + self.num_classes) * self.num_anchors:].contiguous() 247 | reg_pred = reg_pred.view(B, H*W, self.num_anchors, 4) 248 | 249 | # decode bbox 250 | x1y1x2y2_pred = (self.decode_boxes(reg_pred) / self.input_size).view(-1, 4) 251 | x1y1x2y2_gt = target[:, :, 7:].view(-1, 4) 252 | reg_pred = reg_pred.view(B, H*W*self.num_anchors, 4) 253 | 254 | # set conf target 255 | iou_pred = tools.iou_score(x1y1x2y2_pred, x1y1x2y2_gt).view(B, -1, 1) 256 | gt_conf = iou_pred.clone().detach() 257 | 258 | # [obj, cls, txtytwth, x1y1x2y2] -> [conf, obj, cls, txtytwth] 259 | target = torch.cat([gt_conf, target[:, :, :7]], dim=2) 260 | 261 | # loss 262 | ( 263 | conf_loss, 264 | cls_loss, 265 | bbox_loss, 266 | iou_loss 267 | ) = tools.loss(pred_conf=conf_pred, 268 | pred_cls=cls_pred, 269 | pred_txtytwth=reg_pred, 270 | pred_iou=iou_pred, 271 | label=target 272 | ) 273 | 274 | return conf_loss, cls_loss, bbox_loss, iou_loss 275 | -------------------------------------------------------------------------------- /models/yolov2_r50.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from utils.modules import Conv, reorg_layer 5 | from backbone import build_backbone 6 | import numpy as np 7 | import tools 8 | 9 | 10 | class YOLOv2R50(nn.Module): 11 | def __init__(self, device, input_size=None, num_classes=20, trainable=False, conf_thresh=0.001, nms_thresh=0.6, anchor_size=None, hr=False): 12 | super(YOLOv2R50, self).__init__() 13 | self.device = device 14 | self.input_size = input_size 15 | self.num_classes = num_classes 16 | self.trainable = trainable 17 | self.conf_thresh = conf_thresh 18 | self.nms_thresh = nms_thresh 19 | self.anchor_size = torch.tensor(anchor_size) 20 | self.num_anchors = len(anchor_size) 21 | self.stride = 32 22 | self.grid_cell, self.all_anchor_wh = self.create_grid(input_size) 23 | 24 | # backbone 25 | self.backbone = build_backbone(model_name='resnet50', pretrained=trainable) 26 | 27 | # head 28 | self.convsets_1 = nn.Sequential( 29 | Conv(2048, 1024, k=1), 30 | Conv(1024, 1024, k=3, p=1), 31 | Conv(1024, 1024, k=3, p=1) 32 | ) 33 | 34 | # reorg 35 | self.route_layer = Conv(1024, 128, k=1) 36 | self.reorg = reorg_layer(stride=2) 37 | 38 
| # head 39 | self.convsets_2 = Conv(1024+128*4, 1024, k=3, p=1) 40 | 41 | # pred 42 | self.pred = nn.Conv2d(1024, self.num_anchors*(1 + 4 + self.num_classes), 1) 43 | 44 | 45 | if self.trainable: 46 | # init bias 47 | self.init_bias() 48 | 49 | 50 | def init_bias(self): 51 | # init bias 52 | init_prob = 0.01 53 | bias_value = -torch.log(torch.tensor((1. - init_prob) / init_prob)) 54 | nn.init.constant_(self.pred.bias[..., :self.num_anchors], bias_value) 55 | 56 | 57 | def create_grid(self, input_size): 58 | w, h = input_size, input_size 59 | # generate grid cells 60 | ws, hs = w // self.stride, h // self.stride 61 | grid_y, grid_x = torch.meshgrid([torch.arange(hs), torch.arange(ws)]) 62 | grid_xy = torch.stack([grid_x, grid_y], dim=-1).float() 63 | grid_xy = grid_xy.view(1, hs*ws, 1, 2).to(self.device) 64 | 65 | # generate anchor_wh tensor 66 | anchor_wh = self.anchor_size.repeat(hs*ws, 1, 1).unsqueeze(0).to(self.device) 67 | 68 | 69 | return grid_xy, anchor_wh 70 | 71 | 72 | def set_grid(self, input_size): 73 | self.input_size = input_size 74 | self.grid_cell, self.all_anchor_wh = self.create_grid(input_size) 75 | 76 | 77 | def decode_xywh(self, txtytwth_pred): 78 | """ 79 | Input: \n 80 | txtytwth_pred : [B, H*W, anchor_n, 4] \n 81 | Output: \n 82 | xywh_pred : [B, H*W*anchor_n, 4] \n 83 | """ 84 | B, HW, ab_n, _ = txtytwth_pred.size() 85 | # b_x = sigmoid(tx) + gride_x 86 | # b_y = sigmoid(ty) + gride_y 87 | xy_pred = torch.sigmoid(txtytwth_pred[:, :, :, :2]) + self.grid_cell 88 | # b_w = anchor_w * exp(tw) 89 | # b_h = anchor_h * exp(th) 90 | wh_pred = torch.exp(txtytwth_pred[:, :, :, 2:]) * self.all_anchor_wh 91 | # [H*W, anchor_n, 4] -> [H*W*anchor_n, 4] 92 | xywh_pred = torch.cat([xy_pred, wh_pred], -1).view(B, -1, 4) * self.stride 93 | 94 | return xywh_pred 95 | 96 | 97 | def decode_boxes(self, txtytwth_pred): 98 | """ 99 | Input: \n 100 | txtytwth_pred : [B, H*W, anchor_n, 4] \n 101 | Output: \n 102 | x1y1x2y2_pred : [B, H*W*anchor_n, 4] \n 103 | """ 104 | # txtytwth -> cxcywh 105 | xywh_pred = self.decode_xywh(txtytwth_pred) 106 | 107 | # cxcywh -> x1y1x2y2 108 | x1y1x2y2_pred = torch.zeros_like(xywh_pred) 109 | x1y1_pred = xywh_pred[..., :2] - xywh_pred[..., 2:] * 0.5 110 | x2y2_pred = xywh_pred[..., :2] + xywh_pred[..., 2:] * 0.5 111 | x1y1x2y2_pred = torch.cat([x1y1_pred, x2y2_pred], dim=-1) 112 | 113 | return x1y1x2y2_pred 114 | 115 | 116 | def nms(self, dets, scores): 117 | """"Pure Python NMS baseline.""" 118 | x1 = dets[:, 0] #xmin 119 | y1 = dets[:, 1] #ymin 120 | x2 = dets[:, 2] #xmax 121 | y2 = dets[:, 3] #ymax 122 | 123 | areas = (x2 - x1) * (y2 - y1) 124 | order = scores.argsort()[::-1] 125 | 126 | keep = [] 127 | while order.size > 0: 128 | i = order[0] 129 | keep.append(i) 130 | xx1 = np.maximum(x1[i], x1[order[1:]]) 131 | yy1 = np.maximum(y1[i], y1[order[1:]]) 132 | xx2 = np.minimum(x2[i], x2[order[1:]]) 133 | yy2 = np.minimum(y2[i], y2[order[1:]]) 134 | 135 | w = np.maximum(1e-10, xx2 - xx1) 136 | h = np.maximum(1e-10, yy2 - yy1) 137 | inter = w * h 138 | 139 | # Cross Area / (bbox + particular area - Cross Area) 140 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 141 | #reserve all the boundingbox whose ovr less than thresh 142 | inds = np.where(ovr <= self.nms_thresh)[0] 143 | order = order[inds + 1] 144 | 145 | return keep 146 | 147 | 148 | def postprocess(self, bboxes, scores): 149 | """ 150 | bboxes: (HxW, 4), bsize = 1 151 | scores: (HxW, num_classes), bsize = 1 152 | """ 153 | 154 | cls_inds = np.argmax(scores, axis=1) 155 | scores = 
scores[(np.arange(scores.shape[0]), cls_inds)] 156 | 157 | # threshold 158 | keep = np.where(scores >= self.conf_thresh) 159 | bboxes = bboxes[keep] 160 | scores = scores[keep] 161 | cls_inds = cls_inds[keep] 162 | 163 | # NMS 164 | keep = np.zeros(len(bboxes), dtype=np.int) 165 | for i in range(self.num_classes): 166 | inds = np.where(cls_inds == i)[0] 167 | if len(inds) == 0: 168 | continue 169 | c_bboxes = bboxes[inds] 170 | c_scores = scores[inds] 171 | c_keep = self.nms(c_bboxes, c_scores) 172 | keep[inds[c_keep]] = 1 173 | 174 | keep = np.where(keep > 0) 175 | bboxes = bboxes[keep] 176 | scores = scores[keep] 177 | cls_inds = cls_inds[keep] 178 | 179 | return bboxes, scores, cls_inds 180 | 181 | 182 | @ torch.no_grad() 183 | def inference(self, x): 184 | # backbone 185 | feats = self.backbone(x) 186 | 187 | # reorg layer 188 | p5 = self.convsets_1(feats['layer3']) 189 | p4 = self.reorg(self.route_layer(feats['layer2'])) 190 | p5 = torch.cat([p4, p5], dim=1) 191 | 192 | # head 193 | p5 = self.convsets_2(p5) 194 | 195 | # pred 196 | pred = self.pred(p5) 197 | 198 | B, abC, H, W = pred.size() 199 | 200 | # [B, num_anchor * C, H, W] -> [B, H, W, num_anchor * C] -> [B, H*W, num_anchor*C] 201 | pred = pred.permute(0, 2, 3, 1).contiguous().view(B, H*W, abC) 202 | 203 | # [B, H*W*num_anchor, 1] 204 | conf_pred = pred[:, :, :1 * self.num_anchors].contiguous().view(B, H*W*self.num_anchors, 1) 205 | # [B, H*W, num_anchor, num_cls] 206 | cls_pred = pred[:, :, 1 * self.num_anchors : (1 + self.num_classes) * self.num_anchors].contiguous().view(B, H*W*self.num_anchors, self.num_classes) 207 | # [B, H*W, num_anchor, 4] 208 | reg_pred = pred[:, :, (1 + self.num_classes) * self.num_anchors:].contiguous() 209 | # decode box 210 | reg_pred = reg_pred.view(B, H*W, self.num_anchors, 4) 211 | box_pred = self.decode_boxes(reg_pred) 212 | 213 | # batch size = 1 214 | conf_pred = conf_pred[0] 215 | cls_pred = cls_pred[0] 216 | box_pred = box_pred[0] 217 | 218 | # score 219 | scores = torch.sigmoid(conf_pred) * torch.softmax(cls_pred, dim=-1) 220 | 221 | # normalize bbox 222 | bboxes = torch.clamp(box_pred / self.input_size, 0., 1.) 
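        # Note: self.postprocess below keeps the best class per box, applies the
        # confidence threshold, then runs class-wise NMS. It allocates its keep mask
        # with the deprecated alias `np.int` (removed in NumPy 1.24); the built-in
        # `int` is a drop-in replacement there:
        #   keep = np.zeros(len(bboxes), dtype=int)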
223 | 224 | # to cpu 225 | scores = scores.to('cpu').numpy() 226 | bboxes = bboxes.to('cpu').numpy() 227 | 228 | # post-process 229 | bboxes, scores, cls_inds = self.postprocess(bboxes, scores) 230 | 231 | return bboxes, scores, cls_inds 232 | 233 | 234 | def forward(self, x, target=None): 235 | if not self.trainable: 236 | return self.inference(x) 237 | else: 238 | # backbone 239 | feats = self.backbone(x) 240 | 241 | # reorg layer 242 | p5 = self.convsets_1(feats['layer3']) 243 | p4 = self.reorg(self.route_layer(feats['layer2'])) 244 | p5 = torch.cat([p4, p5], dim=1) 245 | 246 | # head 247 | p5 = self.convsets_2(p5) 248 | 249 | # pred 250 | pred = self.pred(p5) 251 | 252 | B, abC, H, W = pred.size() 253 | 254 | # [B, num_anchor * C, H, W] -> [B, H, W, num_anchor * C] -> [B, H*W, num_anchor*C] 255 | pred = pred.permute(0, 2, 3, 1).contiguous().view(B, H*W, abC) 256 | 257 | # [B, H*W*num_anchor, 1] 258 | conf_pred = pred[:, :, :1 * self.num_anchors].contiguous().view(B, H*W*self.num_anchors, 1) 259 | # [B, H*W, num_anchor, num_cls] 260 | cls_pred = pred[:, :, 1 * self.num_anchors : (1 + self.num_classes) * self.num_anchors].contiguous().view(B, H*W*self.num_anchors, self.num_classes) 261 | # [B, H*W, num_anchor, 4] 262 | reg_pred = pred[:, :, (1 + self.num_classes) * self.num_anchors:].contiguous() 263 | reg_pred = reg_pred.view(B, H*W, self.num_anchors, 4) 264 | 265 | # decode bbox 266 | x1y1x2y2_pred = (self.decode_boxes(reg_pred) / self.input_size).view(-1, 4) 267 | x1y1x2y2_gt = target[:, :, 7:].view(-1, 4) 268 | reg_pred = reg_pred.view(B, H*W*self.num_anchors, 4) 269 | 270 | # set conf target 271 | iou_pred = tools.iou_score(x1y1x2y2_pred, x1y1x2y2_gt).view(B, -1, 1) 272 | gt_conf = iou_pred.clone().detach() 273 | 274 | # [obj, cls, txtytwth, x1y1x2y2] -> [conf, obj, cls, txtytwth] 275 | target = torch.cat([gt_conf, target[:, :, :7]], dim=2) 276 | 277 | # loss 278 | ( 279 | conf_loss, 280 | cls_loss, 281 | bbox_loss, 282 | iou_loss 283 | ) = tools.loss(pred_conf=conf_pred, 284 | pred_cls=cls_pred, 285 | pred_txtytwth=reg_pred, 286 | pred_iou=iou_pred, 287 | label=target 288 | ) 289 | 290 | return conf_loss, cls_loss, bbox_loss, iou_loss 291 | -------------------------------------------------------------------------------- /models/yolov3.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from utils.modules import Conv 5 | from backbone import build_backbone 6 | import numpy as np 7 | import tools 8 | 9 | 10 | class YOLOv3(nn.Module): 11 | def __init__(self, 12 | device, 13 | input_size=None, 14 | num_classes=20, 15 | trainable=False, 16 | conf_thresh=0.001, 17 | nms_thresh=0.50, 18 | anchor_size=None): 19 | super(YOLOv3, self).__init__() 20 | self.device = device 21 | self.input_size = input_size 22 | self.num_classes = num_classes 23 | self.trainable = trainable 24 | self.conf_thresh = conf_thresh 25 | self.nms_thresh = nms_thresh 26 | self.topk = 3000 27 | self.stride = [8, 16, 32] 28 | self.anchor_size = torch.tensor(anchor_size).view(3, len(anchor_size) // 3, 2) 29 | self.num_anchors = self.anchor_size.size(1) 30 | 31 | self.grid_cell, self.stride_tensor, self.all_anchors_wh = self.create_grid(input_size) 32 | 33 | # backbone 34 | self.backbone = build_backbone(model_name='darknet53', pretrained=trainable) 35 | 36 | # s = 32 37 | self.conv_set_3 = nn.Sequential( 38 | Conv(1024, 512, k=1), 39 | Conv(512, 1024, k=3, p=1), 40 | Conv(1024, 512, k=1), 41 | Conv(512, 
1024, k=3, p=1), 42 | Conv(1024, 512, k=1) 43 | ) 44 | self.conv_1x1_3 = Conv(512, 256, k=1) 45 | self.extra_conv_3 = Conv(512, 1024, k=3, p=1) 46 | self.pred_3 = nn.Conv2d(1024, self.num_anchors*(1 + 4 + self.num_classes), kernel_size=1) 47 | 48 | # s = 16 49 | self.conv_set_2 = nn.Sequential( 50 | Conv(768, 256, k=1), 51 | Conv(256, 512, k=3, p=1), 52 | Conv(512, 256, k=1), 53 | Conv(256, 512, k=3, p=1), 54 | Conv(512, 256, k=1) 55 | ) 56 | self.conv_1x1_2 = Conv(256, 128, k=1) 57 | self.extra_conv_2 = Conv(256, 512, k=3, p=1) 58 | self.pred_2 = nn.Conv2d(512, self.num_anchors*(1 + 4 + self.num_classes), kernel_size=1) 59 | 60 | # s = 8 61 | self.conv_set_1 = nn.Sequential( 62 | Conv(384, 128, k=1), 63 | Conv(128, 256, k=3, p=1), 64 | Conv(256, 128, k=1), 65 | Conv(128, 256, k=3, p=1), 66 | Conv(256, 128, k=1) 67 | ) 68 | self.extra_conv_1 = Conv(128, 256, k=3, p=1) 69 | self.pred_1 = nn.Conv2d(256, self.num_anchors*(1 + 4 + self.num_classes), kernel_size=1) 70 | 71 | self.init_yolo() 72 | 73 | 74 | def init_yolo(self): 75 | # Init head 76 | init_prob = 0.01 77 | bias_value = -torch.log(torch.tensor((1. - init_prob) / init_prob)) 78 | # init obj&cls pred 79 | for pred in [self.pred_1, self.pred_2, self.pred_3]: 80 | nn.init.constant_(pred.bias[..., :self.num_anchors], bias_value) 81 | nn.init.constant_(pred.bias[..., self.num_anchors : (1 + self.num_classes) * self.num_anchors], bias_value) 82 | 83 | 84 | def create_grid(self, input_size): 85 | total_grid_xy = [] 86 | total_stride = [] 87 | total_anchor_wh = [] 88 | w, h = input_size, input_size 89 | for ind, s in enumerate(self.stride): 90 | # generate grid cells 91 | ws, hs = w // s, h // s 92 | grid_y, grid_x = torch.meshgrid([torch.arange(hs), torch.arange(ws)]) 93 | grid_xy = torch.stack([grid_x, grid_y], dim=-1).float() 94 | grid_xy = grid_xy.view(1, hs*ws, 1, 2) 95 | 96 | # generate stride tensor 97 | stride_tensor = torch.ones([1, hs*ws, self.num_anchors, 2]) * s 98 | 99 | # generate anchor_wh tensor 100 | anchor_wh = self.anchor_size[ind].repeat(hs*ws, 1, 1) 101 | 102 | total_grid_xy.append(grid_xy) 103 | total_stride.append(stride_tensor) 104 | total_anchor_wh.append(anchor_wh) 105 | 106 | total_grid_xy = torch.cat(total_grid_xy, dim=1).to(self.device) 107 | total_stride = torch.cat(total_stride, dim=1).to(self.device) 108 | total_anchor_wh = torch.cat(total_anchor_wh, dim=0).to(self.device).unsqueeze(0) 109 | 110 | return total_grid_xy, total_stride, total_anchor_wh 111 | 112 | 113 | def set_grid(self, input_size): 114 | self.input_size = input_size 115 | self.grid_cell, self.stride_tensor, self.all_anchors_wh = self.create_grid(input_size) 116 | 117 | 118 | def decode_xywh(self, txtytwth_pred): 119 | """ 120 | Input: 121 | txtytwth_pred : [B, H*W, anchor_n, 4] containing [tx, ty, tw, th] 122 | Output: 123 | xywh_pred : [B, H*W*anchor_n, 4] containing [x, y, w, h] 124 | """ 125 | # b_x = sigmoid(tx) + gride_x, b_y = sigmoid(ty) + gride_y 126 | B, HW, ab_n, _ = txtytwth_pred.size() 127 | c_xy_pred = (torch.sigmoid(txtytwth_pred[..., :2]) + self.grid_cell) * self.stride_tensor 128 | # b_w = anchor_w * exp(tw), b_h = anchor_h * exp(th) 129 | b_wh_pred = torch.exp(txtytwth_pred[..., 2:]) * self.all_anchors_wh 130 | # [B, H*W, anchor_n, 4] -> [B, H*W*anchor_n, 4] 131 | xywh_pred = torch.cat([c_xy_pred, b_wh_pred], -1).view(B, HW*ab_n, 4) 132 | 133 | return xywh_pred 134 | 135 | 136 | def decode_boxes(self, txtytwth_pred): 137 | """ 138 | Input: \n 139 | txtytwth_pred : [B, H*W, anchor_n, 4] \n 140 | Output: \n 141 | x1y1x2y2_pred : 
[B, H*W*anchor_n, 4] \n 142 | """ 143 | # txtytwth -> cxcywh 144 | xywh_pred = self.decode_xywh(txtytwth_pred) 145 | 146 | # cxcywh -> x1y1x2y2 147 | x1y1x2y2_pred = torch.zeros_like(xywh_pred) 148 | x1y1_pred = xywh_pred[..., :2] - xywh_pred[..., 2:] * 0.5 149 | x2y2_pred = xywh_pred[..., :2] + xywh_pred[..., 2:] * 0.5 150 | x1y1x2y2_pred = torch.cat([x1y1_pred, x2y2_pred], dim=-1) 151 | 152 | return x1y1x2y2_pred 153 | 154 | 155 | def nms(self, dets, scores): 156 | """"Pure Python NMS baseline.""" 157 | x1 = dets[:, 0] #xmin 158 | y1 = dets[:, 1] #ymin 159 | x2 = dets[:, 2] #xmax 160 | y2 = dets[:, 3] #ymax 161 | 162 | areas = (x2 - x1) * (y2 - y1) 163 | order = scores.argsort()[::-1] 164 | 165 | keep = [] 166 | while order.size > 0: 167 | i = order[0] 168 | keep.append(i) 169 | xx1 = np.maximum(x1[i], x1[order[1:]]) 170 | yy1 = np.maximum(y1[i], y1[order[1:]]) 171 | xx2 = np.minimum(x2[i], x2[order[1:]]) 172 | yy2 = np.minimum(y2[i], y2[order[1:]]) 173 | 174 | w = np.maximum(1e-10, xx2 - xx1) 175 | h = np.maximum(1e-10, yy2 - yy1) 176 | inter = w * h 177 | 178 | # Cross Area / (bbox + particular area - Cross Area) 179 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 180 | #reserve all the boundingbox whose ovr less than thresh 181 | inds = np.where(ovr <= self.nms_thresh)[0] 182 | order = order[inds + 1] 183 | 184 | return keep 185 | 186 | 187 | def postprocess(self, bboxes, scores): 188 | """ 189 | bboxes: (HxW, 4), bsize = 1 190 | scores: (HxW, num_classes), bsize = 1 191 | """ 192 | 193 | cls_inds = np.argmax(scores, axis=1) 194 | scores = scores[(np.arange(scores.shape[0]), cls_inds)] 195 | 196 | # threshold 197 | keep = np.where(scores >= self.conf_thresh) 198 | bboxes = bboxes[keep] 199 | scores = scores[keep] 200 | cls_inds = cls_inds[keep] 201 | 202 | # NMS 203 | keep = np.zeros(len(bboxes), dtype=np.int) 204 | for i in range(self.num_classes): 205 | inds = np.where(cls_inds == i)[0] 206 | if len(inds) == 0: 207 | continue 208 | c_bboxes = bboxes[inds] 209 | c_scores = scores[inds] 210 | c_keep = self.nms(c_bboxes, c_scores) 211 | keep[inds[c_keep]] = 1 212 | 213 | keep = np.where(keep > 0) 214 | bboxes = bboxes[keep] 215 | scores = scores[keep] 216 | cls_inds = cls_inds[keep] 217 | 218 | # topk 219 | scores_sorted, scores_sorted_inds = np.sort(scores), np.argsort(scores) 220 | topk_scores, topk_scores_inds = scores_sorted[:self.topk], scores_sorted_inds[:self.topk] 221 | topk_bboxes = bboxes[topk_scores_inds] 222 | topk_cls_inds = cls_inds[topk_scores_inds] 223 | 224 | return topk_bboxes, topk_scores, topk_cls_inds 225 | 226 | 227 | @torch.no_grad() 228 | def inference(self, x): 229 | B = x.size(0) 230 | # backbone 231 | feats = self.backbone(x) 232 | c3, c4, c5 = feats['layer1'], feats['layer2'], feats['layer3'] 233 | 234 | # FPN 235 | p5 = self.conv_set_3(c5) 236 | p5_up = F.interpolate(self.conv_1x1_3(p5), scale_factor=2.0, mode='bilinear', align_corners=True) 237 | 238 | p4 = torch.cat([c4, p5_up], 1) 239 | p4 = self.conv_set_2(p4) 240 | p4_up = F.interpolate(self.conv_1x1_2(p4), scale_factor=2.0, mode='bilinear', align_corners=True) 241 | 242 | p3 = torch.cat([c3, p4_up], 1) 243 | p3 = self.conv_set_1(p3) 244 | 245 | # head 246 | # s = 32 247 | p5 = self.extra_conv_3(p5) 248 | pred_3 = self.pred_3(p5) 249 | 250 | # s = 16 251 | p4 = self.extra_conv_2(p4) 252 | pred_2 = self.pred_2(p4) 253 | 254 | # s = 8 255 | p3 = self.extra_conv_1(p3) 256 | pred_1 = self.pred_1(p3) 257 | 258 | preds = [pred_1, pred_2, pred_3] 259 | total_conf_pred = [] 260 | total_cls_pred = [] 261 
| total_reg_pred = [] 262 | for pred in preds: 263 | C = pred.size(1) 264 | 265 | # [B, anchor_n * C, H, W] -> [B, H, W, anchor_n * C] -> [B, H*W, anchor_n*C] 266 | pred = pred.permute(0, 2, 3, 1).contiguous().view(B, -1, C) 267 | 268 | # [B, H*W*anchor_n, 1] 269 | conf_pred = pred[:, :, :1 * self.num_anchors].contiguous().view(B, -1, 1) 270 | # [B, H*W*anchor_n, num_cls] 271 | cls_pred = pred[:, :, 1 * self.num_anchors : (1 + self.num_classes) * self.num_anchors].contiguous().view(B, -1, self.num_classes) 272 | # [B, H*W*anchor_n, 4] 273 | reg_pred = pred[:, :, (1 + self.num_classes) * self.num_anchors:].contiguous() 274 | 275 | total_conf_pred.append(conf_pred) 276 | total_cls_pred.append(cls_pred) 277 | total_reg_pred.append(reg_pred) 278 | 279 | conf_pred = torch.cat(total_conf_pred, dim=1) 280 | cls_pred = torch.cat(total_cls_pred, dim=1) 281 | reg_pred = torch.cat(total_reg_pred, dim=1) 282 | # decode bbox 283 | reg_pred = reg_pred.view(B, -1, self.num_anchors, 4) 284 | box_pred = self.decode_boxes(reg_pred) 285 | 286 | # batch size = 1 287 | conf_pred = conf_pred[0] 288 | cls_pred = cls_pred[0] 289 | box_pred = box_pred[0] 290 | 291 | # score 292 | scores = torch.sigmoid(conf_pred) * torch.softmax(cls_pred, dim=-1) 293 | 294 | # normalize bbox 295 | bboxes = torch.clamp(box_pred / self.input_size, 0., 1.) 296 | 297 | # to cpu 298 | scores = scores.to('cpu').numpy() 299 | bboxes = bboxes.to('cpu').numpy() 300 | 301 | # post-process 302 | bboxes, scores, cls_inds = self.postprocess(bboxes, scores) 303 | 304 | return bboxes, scores, cls_inds 305 | 306 | 307 | def forward(self, x, target=None): 308 | if not self.trainable: 309 | return self.inference(x) 310 | else: 311 | # backbone 312 | B = x.size(0) 313 | # backbone 314 | feats = self.backbone(x) 315 | c3, c4, c5 = feats['layer1'], feats['layer2'], feats['layer3'] 316 | 317 | # FPN 318 | p5 = self.conv_set_3(c5) 319 | p5_up = F.interpolate(self.conv_1x1_3(p5), scale_factor=2.0, mode='bilinear', align_corners=True) 320 | 321 | p4 = torch.cat([c4, p5_up], 1) 322 | p4 = self.conv_set_2(p4) 323 | p4_up = F.interpolate(self.conv_1x1_2(p4), scale_factor=2.0, mode='bilinear', align_corners=True) 324 | 325 | p3 = torch.cat([c3, p4_up], 1) 326 | p3 = self.conv_set_1(p3) 327 | 328 | # head 329 | # s = 32 330 | p5 = self.extra_conv_3(p5) 331 | pred_3 = self.pred_3(p5) 332 | 333 | # s = 16 334 | p4 = self.extra_conv_2(p4) 335 | pred_2 = self.pred_2(p4) 336 | 337 | # s = 8 338 | p3 = self.extra_conv_1(p3) 339 | pred_1 = self.pred_1(p3) 340 | 341 | preds = [pred_1, pred_2, pred_3] 342 | total_conf_pred = [] 343 | total_cls_pred = [] 344 | total_reg_pred = [] 345 | for pred in preds: 346 | C = pred.size(1) 347 | 348 | # [B, anchor_n * C, H, W] -> [B, H, W, anchor_n * C] -> [B, H*W, anchor_n*C] 349 | pred = pred.permute(0, 2, 3, 1).contiguous().view(B, -1, C) 350 | 351 | # [B, H*W*anchor_n, 1] 352 | conf_pred = pred[:, :, :1 * self.num_anchors].contiguous().view(B, -1, 1) 353 | # [B, H*W*anchor_n, num_cls] 354 | cls_pred = pred[:, :, 1 * self.num_anchors : (1 + self.num_classes) * self.num_anchors].contiguous().view(B, -1, self.num_classes) 355 | # [B, H*W*anchor_n, 4] 356 | reg_pred = pred[:, :, (1 + self.num_classes) * self.num_anchors:].contiguous() 357 | 358 | total_conf_pred.append(conf_pred) 359 | total_cls_pred.append(cls_pred) 360 | total_reg_pred.append(reg_pred) 361 | 362 | conf_pred = torch.cat(total_conf_pred, dim=1) 363 | cls_pred = torch.cat(total_cls_pred, dim=1) 364 | reg_pred = torch.cat(total_reg_pred, dim=1) 365 | 366 | # decode 
bbox 367 | reg_pred = reg_pred.view(B, -1, self.num_anchors, 4) 368 | x1y1x2y2_pred = (self.decode_boxes(reg_pred) / self.input_size).view(-1, 4) 369 | reg_pred = reg_pred.view(B, -1, 4) 370 | x1y1x2y2_gt = target[:, :, 7:].view(-1, 4) 371 | 372 | # set conf target 373 | iou_pred = tools.iou_score(x1y1x2y2_pred, x1y1x2y2_gt).view(B, -1, 1) 374 | gt_conf = iou_pred.clone().detach() 375 | 376 | # [obj, cls, txtytwth, scale_weight, x1y1x2y2] -> [conf, obj, cls, txtytwth, scale_weight] 377 | target = torch.cat([gt_conf, target[:, :, :7]], dim=2) 378 | 379 | # loss 380 | ( 381 | conf_loss, 382 | cls_loss, 383 | bbox_loss, 384 | iou_loss 385 | ) = tools.loss(pred_conf=conf_pred, 386 | pred_cls=cls_pred, 387 | pred_txtytwth=reg_pred, 388 | pred_iou=iou_pred, 389 | label=target 390 | ) 391 | 392 | return conf_loss, cls_loss, bbox_loss, iou_loss 393 | -------------------------------------------------------------------------------- /models/yolov3_spp.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | 6 | from utils.modules import Conv, SPP 7 | from backbone import build_backbone 8 | import tools 9 | 10 | 11 | # YOLOv3 SPP 12 | class YOLOv3Spp(nn.Module): 13 | def __init__(self, 14 | device, 15 | input_size=None, 16 | num_classes=20, 17 | trainable=False, 18 | conf_thresh=0.001, 19 | nms_thresh=0.50, 20 | anchor_size=None): 21 | super(YOLOv3Spp, self).__init__() 22 | self.device = device 23 | self.input_size = input_size 24 | self.num_classes = num_classes 25 | self.trainable = trainable 26 | self.conf_thresh = conf_thresh 27 | self.nms_thresh = nms_thresh 28 | self.stride = [8, 16, 32] 29 | self.anchor_size = torch.tensor(anchor_size).view(3, len(anchor_size) // 3, 2) 30 | self.num_anchors = self.anchor_size.size(1) 31 | 32 | self.grid_cell, self.stride_tensor, self.all_anchors_wh = self.create_grid(input_size) 33 | 34 | # backbone 35 | self.backbone = build_backbone(model_name='darknet53', pretrained=trainable) 36 | 37 | # s = 32 38 | self.conv_set_3 = nn.Sequential( 39 | SPP(), 40 | Conv(1024*4, 512, k=1), 41 | Conv(512, 1024, k=3, p=1), 42 | Conv(1024, 512, k=1), 43 | Conv(512, 1024, k=3, p=1), 44 | Conv(1024, 512, k=1) 45 | ) 46 | self.conv_1x1_3 = Conv(512, 256, k=1) 47 | self.extra_conv_3 = Conv(512, 1024, k=3, p=1) 48 | self.pred_3 = nn.Conv2d(1024, self.num_anchors*(1 + 4 + self.num_classes), kernel_size=1) 49 | 50 | # s = 16 51 | self.conv_set_2 = nn.Sequential( 52 | Conv(768, 256, k=1), 53 | Conv(256, 512, k=3, p=1), 54 | Conv(512, 256, k=1), 55 | Conv(256, 512, k=3, p=1), 56 | Conv(512, 256, k=1) 57 | ) 58 | self.conv_1x1_2 = Conv(256, 128, k=1) 59 | self.extra_conv_2 = Conv(256, 512, k=3, p=1) 60 | self.pred_2 = nn.Conv2d(512, self.num_anchors*(1 + 4 + self.num_classes), kernel_size=1) 61 | 62 | # s = 8 63 | self.conv_set_1 = nn.Sequential( 64 | Conv(384, 128, k=1), 65 | Conv(128, 256, k=3, p=1), 66 | Conv(256, 128, k=1), 67 | Conv(128, 256, k=3, p=1), 68 | Conv(256, 128, k=1) 69 | ) 70 | self.extra_conv_1 = Conv(128, 256, k=3, p=1) 71 | self.pred_1 = nn.Conv2d(256, self.num_anchors*(1 + 4 + self.num_classes), kernel_size=1) 72 | 73 | 74 | self.init_yolo() 75 | 76 | 77 | def init_yolo(self): 78 | # Init head 79 | init_prob = 0.01 80 | bias_value = -torch.log(torch.tensor((1. 
- init_prob) / init_prob)) 81 | # init obj&cls pred 82 | for pred in [self.pred_1, self.pred_2, self.pred_3]: 83 | nn.init.constant_(pred.bias[..., :self.num_anchors], bias_value) 84 | nn.init.constant_(pred.bias[..., self.num_anchors : (1 + self.num_classes) * self.num_anchors], bias_value) 85 | 86 | 87 | def create_grid(self, input_size): 88 | total_grid_xy = [] 89 | total_stride = [] 90 | total_anchor_wh = [] 91 | w, h = input_size, input_size 92 | for ind, s in enumerate(self.stride): 93 | # generate grid cells 94 | ws, hs = w // s, h // s 95 | grid_y, grid_x = torch.meshgrid([torch.arange(hs), torch.arange(ws)]) 96 | grid_xy = torch.stack([grid_x, grid_y], dim=-1).float() 97 | grid_xy = grid_xy.view(1, hs*ws, 1, 2) 98 | 99 | # generate stride tensor 100 | stride_tensor = torch.ones([1, hs*ws, self.num_anchors, 2]) * s 101 | 102 | # generate anchor_wh tensor 103 | anchor_wh = self.anchor_size[ind].repeat(hs*ws, 1, 1) 104 | 105 | total_grid_xy.append(grid_xy) 106 | total_stride.append(stride_tensor) 107 | total_anchor_wh.append(anchor_wh) 108 | 109 | total_grid_xy = torch.cat(total_grid_xy, dim=1).to(self.device) 110 | total_stride = torch.cat(total_stride, dim=1).to(self.device) 111 | total_anchor_wh = torch.cat(total_anchor_wh, dim=0).to(self.device).unsqueeze(0) 112 | 113 | return total_grid_xy, total_stride, total_anchor_wh 114 | 115 | 116 | def set_grid(self, input_size): 117 | self.input_size = input_size 118 | self.grid_cell, self.stride_tensor, self.all_anchors_wh = self.create_grid(input_size) 119 | 120 | 121 | def decode_xywh(self, txtytwth_pred): 122 | """ 123 | Input: 124 | txtytwth_pred : [B, H*W, anchor_n, 4] containing [tx, ty, tw, th] 125 | Output: 126 | xywh_pred : [B, H*W*anchor_n, 4] containing [x, y, w, h] 127 | """ 128 | # b_x = sigmoid(tx) + gride_x, b_y = sigmoid(ty) + gride_y 129 | B, HW, ab_n, _ = txtytwth_pred.size() 130 | c_xy_pred = (torch.sigmoid(txtytwth_pred[:, :, :, :2]) + self.grid_cell) * self.stride_tensor 131 | # b_w = anchor_w * exp(tw), b_h = anchor_h * exp(th) 132 | b_wh_pred = torch.exp(txtytwth_pred[:, :, :, 2:]) * self.all_anchors_wh 133 | # [B, H*W, anchor_n, 4] -> [B, H*W*anchor_n, 4] 134 | xywh_pred = torch.cat([c_xy_pred, b_wh_pred], -1).view(B, HW*ab_n, 4) 135 | 136 | return xywh_pred 137 | 138 | 139 | def decode_boxes(self, txtytwth_pred): 140 | """ 141 | Input: \n 142 | txtytwth_pred : [B, H*W, anchor_n, 4] \n 143 | Output: \n 144 | x1y1x2y2_pred : [B, H*W*anchor_n, 4] \n 145 | """ 146 | # txtytwth -> cxcywh 147 | xywh_pred = self.decode_xywh(txtytwth_pred) 148 | 149 | # cxcywh -> x1y1x2y2 150 | x1y1x2y2_pred = torch.zeros_like(xywh_pred) 151 | x1y1_pred = xywh_pred[..., :2] - xywh_pred[..., 2:] * 0.5 152 | x2y2_pred = xywh_pred[..., :2] + xywh_pred[..., 2:] * 0.5 153 | x1y1x2y2_pred = torch.cat([x1y1_pred, x2y2_pred], dim=-1) 154 | 155 | return x1y1x2y2_pred 156 | 157 | 158 | def nms(self, dets, scores): 159 | """"Pure Python NMS baseline.""" 160 | x1 = dets[:, 0] #xmin 161 | y1 = dets[:, 1] #ymin 162 | x2 = dets[:, 2] #xmax 163 | y2 = dets[:, 3] #ymax 164 | 165 | areas = (x2 - x1) * (y2 - y1) 166 | order = scores.argsort()[::-1] 167 | 168 | keep = [] 169 | while order.size > 0: 170 | i = order[0] 171 | keep.append(i) 172 | xx1 = np.maximum(x1[i], x1[order[1:]]) 173 | yy1 = np.maximum(y1[i], y1[order[1:]]) 174 | xx2 = np.minimum(x2[i], x2[order[1:]]) 175 | yy2 = np.minimum(y2[i], y2[order[1:]]) 176 | 177 | w = np.maximum(1e-10, xx2 - xx1) 178 | h = np.maximum(1e-10, yy2 - yy1) 179 | inter = w * h 180 | 181 | # Cross Area / (bbox + 
particular area - Cross Area) 182 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 183 | #reserve all the boundingbox whose ovr less than thresh 184 | inds = np.where(ovr <= self.nms_thresh)[0] 185 | order = order[inds + 1] 186 | 187 | return keep 188 | 189 | 190 | def postprocess(self, bboxes, scores): 191 | """ 192 | bboxes: (HxW, 4), bsize = 1 193 | scores: (HxW, num_classes), bsize = 1 194 | """ 195 | 196 | cls_inds = np.argmax(scores, axis=1) 197 | scores = scores[(np.arange(scores.shape[0]), cls_inds)] 198 | 199 | # threshold 200 | keep = np.where(scores >= self.conf_thresh) 201 | bboxes = bboxes[keep] 202 | scores = scores[keep] 203 | cls_inds = cls_inds[keep] 204 | 205 | # NMS 206 | keep = np.zeros(len(bboxes), dtype=np.int) 207 | for i in range(self.num_classes): 208 | inds = np.where(cls_inds == i)[0] 209 | if len(inds) == 0: 210 | continue 211 | c_bboxes = bboxes[inds] 212 | c_scores = scores[inds] 213 | c_keep = self.nms(c_bboxes, c_scores) 214 | keep[inds[c_keep]] = 1 215 | 216 | keep = np.where(keep > 0) 217 | bboxes = bboxes[keep] 218 | scores = scores[keep] 219 | cls_inds = cls_inds[keep] 220 | 221 | return bboxes, scores, cls_inds 222 | 223 | 224 | @torch.no_grad() 225 | def inference(self, x): 226 | B = x.size(0) 227 | # backbone 228 | feats = self.backbone(x) 229 | c3, c4, c5 = feats['layer1'], feats['layer2'], feats['layer3'] 230 | 231 | # FPN 232 | p5 = self.conv_set_3(c5) 233 | p5_up = F.interpolate(self.conv_1x1_3(p5), scale_factor=2.0, mode='bilinear', align_corners=True) 234 | 235 | p4 = torch.cat([c4, p5_up], 1) 236 | p4 = self.conv_set_2(p4) 237 | p4_up = F.interpolate(self.conv_1x1_2(p4), scale_factor=2.0, mode='bilinear', align_corners=True) 238 | 239 | p3 = torch.cat([c3, p4_up], 1) 240 | p3 = self.conv_set_1(p3) 241 | 242 | # head 243 | # s = 32 244 | p5 = self.extra_conv_3(p5) 245 | pred_3 = self.pred_3(p5) 246 | 247 | # s = 16 248 | p4 = self.extra_conv_2(p4) 249 | pred_2 = self.pred_2(p4) 250 | 251 | # s = 8 252 | p3 = self.extra_conv_1(p3) 253 | pred_1 = self.pred_1(p3) 254 | 255 | preds = [pred_1, pred_2, pred_3] 256 | total_conf_pred = [] 257 | total_cls_pred = [] 258 | total_reg_pred = [] 259 | for pred in preds: 260 | C = pred.size(1) 261 | 262 | # [B, anchor_n * C, H, W] -> [B, H, W, anchor_n * C] -> [B, H*W, anchor_n*C] 263 | pred = pred.permute(0, 2, 3, 1).contiguous().view(B, -1, C) 264 | 265 | # [B, H*W*anchor_n, 1] 266 | conf_pred = pred[:, :, :1 * self.num_anchors].contiguous().view(B, -1, 1) 267 | # [B, H*W*anchor_n, num_cls] 268 | cls_pred = pred[:, :, 1 * self.num_anchors : (1 + self.num_classes) * self.num_anchors].contiguous().view(B, -1, self.num_classes) 269 | # [B, H*W*anchor_n, 4] 270 | reg_pred = pred[:, :, (1 + self.num_classes) * self.num_anchors:].contiguous() 271 | 272 | total_conf_pred.append(conf_pred) 273 | total_cls_pred.append(cls_pred) 274 | total_reg_pred.append(reg_pred) 275 | 276 | conf_pred = torch.cat(total_conf_pred, dim=1) 277 | cls_pred = torch.cat(total_cls_pred, dim=1) 278 | reg_pred = torch.cat(total_reg_pred, dim=1) 279 | # decode bbox 280 | reg_pred = reg_pred.view(B, -1, self.num_anchors, 4) 281 | box_pred = self.decode_boxes(reg_pred) 282 | 283 | # batch size = 1 284 | conf_pred = conf_pred[0] 285 | cls_pred = cls_pred[0] 286 | box_pred = box_pred[0] 287 | 288 | # score 289 | scores = torch.sigmoid(conf_pred) * torch.softmax(cls_pred, dim=-1) 290 | 291 | # normalize bbox 292 | bboxes = torch.clamp(box_pred / self.input_size, 0., 1.) 
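        # Annotation (explanatory comment added for this write-up; not part of the
        # original yolov3_spp.py): at this point `scores` holds the class-conditional
        # confidence per anchor, score[c] = sigmoid(objectness) * softmax(cls_logits)[c],
        # and `bboxes` are x1y1x2y2 coordinates clamped to [0, 1] relative to the square
        # network input. Callers recover pixel coordinates on the original image by
        # multiplying with its size, e.g. in test.py: bboxes *= np.array([[w, h, w, h]]).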
293 | 294 | # to cpu 295 | scores = scores.to('cpu').numpy() 296 | bboxes = bboxes.to('cpu').numpy() 297 | 298 | # post-process 299 | bboxes, scores, cls_inds = self.postprocess(bboxes, scores) 300 | 301 | return bboxes, scores, cls_inds 302 | 303 | 304 | def forward(self, x, target=None): 305 | if not self.trainable: 306 | return self.inference(x) 307 | else: 308 | # backbone 309 | B = x.size(0) 310 | # backbone 311 | feats = self.backbone(x) 312 | c3, c4, c5 = feats['layer1'], feats['layer2'], feats['layer3'] 313 | 314 | # FPN 315 | p5 = self.conv_set_3(c5) 316 | p5_up = F.interpolate(self.conv_1x1_3(p5), scale_factor=2.0, mode='bilinear', align_corners=True) 317 | 318 | p4 = torch.cat([c4, p5_up], 1) 319 | p4 = self.conv_set_2(p4) 320 | p4_up = F.interpolate(self.conv_1x1_2(p4), scale_factor=2.0, mode='bilinear', align_corners=True) 321 | 322 | p3 = torch.cat([c3, p4_up], 1) 323 | p3 = self.conv_set_1(p3) 324 | 325 | # head 326 | # s = 32 327 | p5 = self.extra_conv_3(p5) 328 | pred_3 = self.pred_3(p5) 329 | 330 | # s = 16 331 | p4 = self.extra_conv_2(p4) 332 | pred_2 = self.pred_2(p4) 333 | 334 | # s = 8 335 | p3 = self.extra_conv_1(p3) 336 | pred_1 = self.pred_1(p3) 337 | 338 | preds = [pred_1, pred_2, pred_3] 339 | total_conf_pred = [] 340 | total_cls_pred = [] 341 | total_reg_pred = [] 342 | for pred in preds: 343 | C = pred.size(1) 344 | 345 | # [B, anchor_n * C, H, W] -> [B, H, W, anchor_n * C] -> [B, H*W, anchor_n*C] 346 | pred = pred.permute(0, 2, 3, 1).contiguous().view(B, -1, C) 347 | 348 | # [B, H*W*anchor_n, 1] 349 | conf_pred = pred[:, :, :1 * self.num_anchors].contiguous().view(B, -1, 1) 350 | # [B, H*W*anchor_n, num_cls] 351 | cls_pred = pred[:, :, 1 * self.num_anchors : (1 + self.num_classes) * self.num_anchors].contiguous().view(B, -1, self.num_classes) 352 | # [B, H*W*anchor_n, 4] 353 | reg_pred = pred[:, :, (1 + self.num_classes) * self.num_anchors:].contiguous() 354 | 355 | total_conf_pred.append(conf_pred) 356 | total_cls_pred.append(cls_pred) 357 | total_reg_pred.append(reg_pred) 358 | 359 | conf_pred = torch.cat(total_conf_pred, dim=1) 360 | cls_pred = torch.cat(total_cls_pred, dim=1) 361 | reg_pred = torch.cat(total_reg_pred, dim=1) 362 | 363 | # decode bbox 364 | reg_pred = reg_pred.view(B, -1, self.num_anchors, 4) 365 | x1y1x2y2_pred = (self.decode_boxes(reg_pred) / self.input_size).view(-1, 4) 366 | reg_pred = reg_pred.view(B, -1, 4) 367 | x1y1x2y2_gt = target[:, :, 7:].view(-1, 4) 368 | 369 | # set conf target 370 | iou_pred = tools.iou_score(x1y1x2y2_pred, x1y1x2y2_gt).view(B, -1, 1) 371 | gt_conf = iou_pred.clone().detach() 372 | 373 | # [obj, cls, txtytwth, scale_weight, x1y1x2y2] -> [conf, obj, cls, txtytwth, scale_weight] 374 | target = torch.cat([gt_conf, target[:, :, :7]], dim=2) 375 | 376 | # loss 377 | ( 378 | conf_loss, 379 | cls_loss, 380 | bbox_loss, 381 | iou_loss 382 | ) = tools.loss(pred_conf=conf_pred, 383 | pred_cls=cls_pred, 384 | pred_txtytwth=reg_pred, 385 | pred_iou=iou_pred, 386 | label=target 387 | ) 388 | 389 | return conf_loss, cls_loss, bbox_loss, iou_loss 390 | -------------------------------------------------------------------------------- /models/yolov3_tiny.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | 6 | from utils.modules import Conv 7 | from backbone import build_backbone 8 | import tools 9 | 10 | 11 | # YOLOv3 Tiny 12 | class YOLOv3tiny(nn.Module): 13 | def __init__(self, device, 
input_size=None, num_classes=20, trainable=False, conf_thresh=0.01, nms_thresh=0.50, anchor_size=None, hr=False): 14 | super(YOLOv3tiny, self).__init__() 15 | self.device = device 16 | self.input_size = input_size 17 | self.num_classes = num_classes 18 | self.trainable = trainable 19 | self.conf_thresh = conf_thresh 20 | self.nms_thresh = nms_thresh 21 | self.stride = [16, 32] 22 | self.anchor_size = torch.tensor(anchor_size).view(2, len(anchor_size) // 2, 2) 23 | self.num_anchors = self.anchor_size.size(1) 24 | 25 | self.grid_cell, self.stride_tensor, self.all_anchors_wh = self.create_grid(input_size) 26 | 27 | # backbone 28 | self.backbone = build_backbone(model_name='darknet_tiny', pretrained=trainable) 29 | 30 | # s = 32 31 | self.conv_set_2 = Conv(1024, 256, k=3, p=1) 32 | 33 | self.conv_1x1_2 = Conv(256, 128, k=1) 34 | 35 | self.extra_conv_2 = Conv(256, 512, k=3, p=1) 36 | self.pred_2 = nn.Conv2d(512, self.num_anchors*(1 + 4 + self.num_classes), kernel_size=1) 37 | 38 | # s = 16 39 | self.conv_set_1 = Conv(384, 256, k=3, p=1) 40 | self.pred_1 = nn.Conv2d(256, self.num_anchors*(1 + 4 + self.num_classes), kernel_size=1) 41 | 42 | 43 | self.init_yolo() 44 | 45 | 46 | def init_yolo(self): 47 | # Init head 48 | init_prob = 0.01 49 | bias_value = -torch.log(torch.tensor((1. - init_prob) / init_prob)) 50 | # init obj&cls pred 51 | for pred in [self.pred_1, self.pred_2, self.pred_3]: 52 | nn.init.constant_(pred.bias[..., :self.num_anchors], bias_value) 53 | nn.init.constant_(pred.bias[..., self.num_anchors : (1 + self.num_classes) * self.num_anchors], bias_value) 54 | 55 | 56 | def create_grid(self, input_size): 57 | total_grid_xy = [] 58 | total_stride = [] 59 | total_anchor_wh = [] 60 | w, h = input_size, input_size 61 | for ind, s in enumerate(self.stride): 62 | # generate grid cells 63 | ws, hs = w // s, h // s 64 | grid_y, grid_x = torch.meshgrid([torch.arange(hs), torch.arange(ws)]) 65 | grid_xy = torch.stack([grid_x, grid_y], dim=-1).float() 66 | grid_xy = grid_xy.view(1, hs*ws, 1, 2) 67 | 68 | # generate stride tensor 69 | stride_tensor = torch.ones([1, hs*ws, self.num_anchors, 2]) * s 70 | 71 | # generate anchor_wh tensor 72 | anchor_wh = self.anchor_size[ind].repeat(hs*ws, 1, 1) 73 | 74 | total_grid_xy.append(grid_xy) 75 | total_stride.append(stride_tensor) 76 | total_anchor_wh.append(anchor_wh) 77 | 78 | total_grid_xy = torch.cat(total_grid_xy, dim=1).to(self.device) 79 | total_stride = torch.cat(total_stride, dim=1).to(self.device) 80 | total_anchor_wh = torch.cat(total_anchor_wh, dim=0).to(self.device).unsqueeze(0) 81 | 82 | return total_grid_xy, total_stride, total_anchor_wh 83 | 84 | 85 | def set_grid(self, input_size): 86 | self.input_size = input_size 87 | self.grid_cell, self.stride_tensor, self.all_anchors_wh = self.create_grid(input_size) 88 | 89 | 90 | def decode_xywh(self, txtytwth_pred): 91 | """ 92 | Input: 93 | txtytwth_pred : [B, H*W, anchor_n, 4] containing [tx, ty, tw, th] 94 | Output: 95 | xywh_pred : [B, H*W*anchor_n, 4] containing [x, y, w, h] 96 | """ 97 | # b_x = sigmoid(tx) + gride_x, b_y = sigmoid(ty) + gride_y 98 | B, HW, ab_n, _ = txtytwth_pred.size() 99 | c_xy_pred = (torch.sigmoid(txtytwth_pred[:, :, :, :2]) + self.grid_cell) * self.stride_tensor 100 | # b_w = anchor_w * exp(tw), b_h = anchor_h * exp(th) 101 | b_wh_pred = torch.exp(txtytwth_pred[:, :, :, 2:]) * self.all_anchors_wh 102 | # [B, H*W, anchor_n, 4] -> [B, H*W*anchor_n, 4] 103 | xywh_pred = torch.cat([c_xy_pred, b_wh_pred], -1).view(B, HW*ab_n, 4) 104 | 105 | return xywh_pred 106 | 107 | 108 
| def decode_boxes(self, txtytwth_pred): 109 | """ 110 | Input: \n 111 | txtytwth_pred : [B, H*W, anchor_n, 4] \n 112 | Output: \n 113 | x1y1x2y2_pred : [B, H*W*anchor_n, 4] \n 114 | """ 115 | # txtytwth -> cxcywh 116 | xywh_pred = self.decode_xywh(txtytwth_pred) 117 | 118 | # cxcywh -> x1y1x2y2 119 | x1y1x2y2_pred = torch.zeros_like(xywh_pred) 120 | x1y1_pred = xywh_pred[..., :2] - xywh_pred[..., 2:] * 0.5 121 | x2y2_pred = xywh_pred[..., :2] + xywh_pred[..., 2:] * 0.5 122 | x1y1x2y2_pred = torch.cat([x1y1_pred, x2y2_pred], dim=-1) 123 | 124 | return x1y1x2y2_pred 125 | 126 | 127 | def nms(self, dets, scores): 128 | """"Pure Python NMS baseline.""" 129 | x1 = dets[:, 0] #xmin 130 | y1 = dets[:, 1] #ymin 131 | x2 = dets[:, 2] #xmax 132 | y2 = dets[:, 3] #ymax 133 | 134 | areas = (x2 - x1) * (y2 - y1) 135 | order = scores.argsort()[::-1] 136 | 137 | keep = [] 138 | while order.size > 0: 139 | i = order[0] 140 | keep.append(i) 141 | xx1 = np.maximum(x1[i], x1[order[1:]]) 142 | yy1 = np.maximum(y1[i], y1[order[1:]]) 143 | xx2 = np.minimum(x2[i], x2[order[1:]]) 144 | yy2 = np.minimum(y2[i], y2[order[1:]]) 145 | 146 | w = np.maximum(1e-10, xx2 - xx1) 147 | h = np.maximum(1e-10, yy2 - yy1) 148 | inter = w * h 149 | 150 | # Cross Area / (bbox + particular area - Cross Area) 151 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 152 | #reserve all the boundingbox whose ovr less than thresh 153 | inds = np.where(ovr <= self.nms_thresh)[0] 154 | order = order[inds + 1] 155 | 156 | return keep 157 | 158 | 159 | def postprocess(self, bboxes, scores): 160 | """ 161 | bboxes: (HxW, 4), bsize = 1 162 | scores: (HxW, num_classes), bsize = 1 163 | """ 164 | 165 | cls_inds = np.argmax(scores, axis=1) 166 | scores = scores[(np.arange(scores.shape[0]), cls_inds)] 167 | 168 | # threshold 169 | keep = np.where(scores >= self.conf_thresh) 170 | bboxes = bboxes[keep] 171 | scores = scores[keep] 172 | cls_inds = cls_inds[keep] 173 | 174 | # NMS 175 | keep = np.zeros(len(bboxes), dtype=np.int) 176 | for i in range(self.num_classes): 177 | inds = np.where(cls_inds == i)[0] 178 | if len(inds) == 0: 179 | continue 180 | c_bboxes = bboxes[inds] 181 | c_scores = scores[inds] 182 | c_keep = self.nms(c_bboxes, c_scores) 183 | keep[inds[c_keep]] = 1 184 | 185 | keep = np.where(keep > 0) 186 | bboxes = bboxes[keep] 187 | scores = scores[keep] 188 | cls_inds = cls_inds[keep] 189 | 190 | return bboxes, scores, cls_inds 191 | 192 | 193 | @torch.no_grad() 194 | def inference(self, x): 195 | B = x.size(0) 196 | # backbone 197 | feats = self.backbone(x) 198 | c4, c5 = feats['layer2'], feats['layer3'] 199 | 200 | # FPN 201 | p5 = self.conv_set_2(c5) 202 | p5_up = F.interpolate(self.conv_1x1_2(p5), scale_factor=2.0, mode='bilinear', align_corners=True) 203 | 204 | p4 = torch.cat([c4, p5_up], dim=1) 205 | p4 = self.conv_set_1(p4) 206 | 207 | # head 208 | # s = 32 209 | p5 = self.extra_conv_2(p5) 210 | pred_2 = self.pred_2(p5) 211 | 212 | # s = 16 213 | pred_1 = self.pred_1(p4) 214 | 215 | 216 | preds = [pred_1, pred_2] 217 | total_conf_pred = [] 218 | total_cls_pred = [] 219 | total_reg_pred = [] 220 | for pred in preds: 221 | C = pred.size(1) 222 | 223 | # [B, anchor_n * C, H, W] -> [B, H, W, anchor_n * C] -> [B, H*W, anchor_n*C] 224 | pred = pred.permute(0, 2, 3, 1).contiguous().view(B, -1, C) 225 | 226 | # Divide prediction to obj_pred, xywh_pred and cls_pred 227 | # [B, H*W*anchor_n, 1] 228 | conf_pred = pred[:, :, :1 * self.num_anchors].contiguous().view(B, -1, 1) 229 | # [B, H*W*anchor_n, num_cls] 230 | cls_pred = pred[:, 
:, 1 * self.num_anchors : (1 + self.num_classes) * self.num_anchors].contiguous().view(B, -1, self.num_classes) 231 | # [B, H*W*anchor_n, 4] 232 | reg_pred = pred[:, :, (1 + self.num_classes) * self.num_anchors:].contiguous() 233 | 234 | total_conf_pred.append(conf_pred) 235 | total_cls_pred.append(cls_pred) 236 | total_reg_pred.append(reg_pred) 237 | 238 | conf_pred = torch.cat(total_conf_pred, dim=1) 239 | cls_pred = torch.cat(total_cls_pred, dim=1) 240 | reg_pred = torch.cat(total_reg_pred, dim=1) 241 | # decode bbox 242 | reg_pred = reg_pred.view(B, -1, self.num_anchors, 4) 243 | box_pred = self.decode_boxes(reg_pred) 244 | 245 | # batch size = 1 246 | conf_pred = conf_pred[0] 247 | cls_pred = cls_pred[0] 248 | box_pred = box_pred[0] 249 | 250 | # score 251 | scores = torch.sigmoid(conf_pred) * torch.softmax(cls_pred, dim=-1) 252 | 253 | # normalize bbox 254 | bboxes = torch.clamp(box_pred / self.input_size, 0., 1.) 255 | 256 | # to cpu 257 | scores = scores.to('cpu').numpy() 258 | bboxes = bboxes.to('cpu').numpy() 259 | 260 | # post-process 261 | bboxes, scores, cls_inds = self.postprocess(bboxes, scores) 262 | 263 | return bboxes, scores, cls_inds 264 | 265 | 266 | def forward(self, x, target=None): 267 | if not self.trainable: 268 | return self.inference(x) 269 | else: 270 | # backbone 271 | B = x.size(0) 272 | # backbone 273 | feats = self.backbone(x) 274 | c4, c5 = feats['layer2'], feats['layer3'] 275 | 276 | # FPN 277 | p5 = self.conv_set_2(c5) 278 | p5_up = F.interpolate(self.conv_1x1_2(p5), scale_factor=2.0, mode='bilinear', align_corners=True) 279 | 280 | p4 = torch.cat([c4, p5_up], dim=1) 281 | p4 = self.conv_set_1(p4) 282 | 283 | # head 284 | # s = 32 285 | p5 = self.extra_conv_2(p5) 286 | pred_2 = self.pred_2(p5) 287 | 288 | # s = 16 289 | pred_1 = self.pred_1(p4) 290 | 291 | preds = [pred_1, pred_2] 292 | total_conf_pred = [] 293 | total_cls_pred = [] 294 | total_reg_pred = [] 295 | for pred in preds: 296 | C = pred.size(1) 297 | 298 | # [B, anchor_n * C, H, W] -> [B, H, W, anchor_n * C] -> [B, H*W, anchor_n*C] 299 | pred = pred.permute(0, 2, 3, 1).contiguous().view(B, -1, C) 300 | 301 | # Divide prediction to obj_pred, xywh_pred and cls_pred 302 | # [B, H*W*anchor_n, 1] 303 | conf_pred = pred[:, :, :1 * self.num_anchors].contiguous().view(B, -1, 1) 304 | # [B, H*W*anchor_n, num_cls] 305 | cls_pred = pred[:, :, 1 * self.num_anchors : (1 + self.num_classes) * self.num_anchors].contiguous().view(B, -1, self.num_classes) 306 | # [B, H*W*anchor_n, 4] 307 | reg_pred = pred[:, :, (1 + self.num_classes) * self.num_anchors:].contiguous() 308 | 309 | total_conf_pred.append(conf_pred) 310 | total_cls_pred.append(cls_pred) 311 | total_reg_pred.append(reg_pred) 312 | 313 | conf_pred = torch.cat(total_conf_pred, dim=1) 314 | cls_pred = torch.cat(total_cls_pred, dim=1) 315 | reg_pred = torch.cat(total_reg_pred, dim=1) 316 | 317 | # decode bbox 318 | reg_pred = reg_pred.view(B, -1, self.num_anchors, 4) 319 | x1y1x2y2_pred = (self.decode_boxes(reg_pred) / self.input_size).view(-1, 4) 320 | reg_pred = reg_pred.view(B, -1, 4) 321 | x1y1x2y2_gt = target[:, :, 7:].view(-1, 4) 322 | 323 | # set conf target 324 | iou_pred = tools.iou_score(x1y1x2y2_pred, x1y1x2y2_gt).view(B, -1, 1) 325 | gt_conf = iou_pred.clone().detach() 326 | 327 | # [obj, cls, txtytwth, scale_weight, x1y1x2y2] -> [conf, obj, cls, txtytwth, scale_weight] 328 | target = torch.cat([gt_conf, target[:, :, :7]], dim=2) 329 | 330 | # loss 331 | ( 332 | conf_loss, 333 | cls_loss, 334 | bbox_loss, 335 | iou_loss 336 | ) = 
tools.loss(pred_conf=conf_pred, 337 | pred_cls=cls_pred, 338 | pred_txtytwth=reg_pred, 339 | pred_iou=iou_pred, 340 | label=target 341 | ) 342 | 343 | return conf_loss, cls_loss, bbox_loss, iou_loss 344 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import torch 4 | import torch.backends.cudnn as cudnn 5 | from data.voc0712 import VOC_CLASSES, VOCDetection 6 | from data.coco2017 import COCODataset, coco_class_index, coco_class_labels 7 | from data import config, BaseTransform 8 | import numpy as np 9 | import cv2 10 | import time 11 | 12 | 13 | parser = argparse.ArgumentParser(description='YOLO Detection') 14 | # basic 15 | parser.add_argument('-size', '--input_size', default=416, type=int, 16 | help='input_size') 17 | parser.add_argument('--cuda', action='store_true', default=False, 18 | help='use cuda.') 19 | # model 20 | parser.add_argument('-v', '--version', default='yolo_v2', 21 | help='yolov2_d19, yolov2_r50, yolov2_slim, yolov3, yolov3_spp, yolov3_tiny') 22 | parser.add_argument('--trained_model', default='weight/', 23 | type=str, help='Trained state_dict file path to open') 24 | parser.add_argument('--conf_thresh', default=0.1, type=float, 25 | help='Confidence threshold') 26 | parser.add_argument('--nms_thresh', default=0.50, type=float, 27 | help='NMS threshold') 28 | # dataset 29 | parser.add_argument('-root', '--data_root', default='/mnt/share/ssd2/dataset', 30 | help='dataset root') 31 | parser.add_argument('-d', '--dataset', default='voc', 32 | help='voc or coco') 33 | # visualize 34 | parser.add_argument('-vs', '--visual_threshold', default=0.25, type=float, 35 | help='Final confidence threshold') 36 | parser.add_argument('--show', action='store_true', default=False, 37 | help='show the visulization results.') 38 | 39 | 40 | args = parser.parse_args() 41 | 42 | 43 | def plot_bbox_labels(img, bbox, label=None, cls_color=None, text_scale=0.4): 44 | x1, y1, x2, y2 = bbox 45 | x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2) 46 | t_size = cv2.getTextSize(label, 0, fontScale=1, thickness=2)[0] 47 | # plot bbox 48 | cv2.rectangle(img, (x1, y1), (x2, y2), cls_color, 2) 49 | 50 | if label is not None: 51 | # plot title bbox 52 | cv2.rectangle(img, (x1, y1-t_size[1]), (int(x1 + t_size[0] * text_scale), y1), cls_color, -1) 53 | # put the test on the title bbox 54 | cv2.putText(img, label, (int(x1), int(y1 - 5)), 0, text_scale, (0, 0, 0), 1, lineType=cv2.LINE_AA) 55 | 56 | return img 57 | 58 | 59 | def visualize(img, 60 | bboxes, 61 | scores, 62 | cls_inds, 63 | vis_thresh, 64 | class_colors, 65 | class_names, 66 | class_indexs=None, 67 | dataset_name='voc'): 68 | ts = 0.4 69 | for i, bbox in enumerate(bboxes): 70 | if scores[i] > vis_thresh: 71 | cls_id = int(cls_inds[i]) 72 | if dataset_name == 'coco': 73 | cls_color = class_colors[cls_id] 74 | cls_id = class_indexs[cls_id] 75 | else: 76 | cls_color = class_colors[cls_id] 77 | 78 | if len(class_names) > 1: 79 | mess = '%s: %.2f' % (class_names[cls_id], scores[i]) 80 | else: 81 | cls_color = [255, 0, 0] 82 | mess = None 83 | img = plot_bbox_labels(img, bbox, mess, cls_color, text_scale=ts) 84 | 85 | return img 86 | 87 | 88 | def test(net, 89 | device, 90 | dataset, 91 | transform, 92 | vis_thresh, 93 | class_colors=None, 94 | class_names=None, 95 | class_indexs=None, 96 | dataset_name='voc'): 97 | 98 | num_images = len(dataset) 99 | save_path = 
os.path.join('det_results/', args.dataset, args.version) 100 | os.makedirs(save_path, exist_ok=True) 101 | 102 | for index in range(num_images): 103 | print('Testing image {:d}/{:d}....'.format(index+1, num_images)) 104 | image, _ = dataset.pull_image(index) 105 | h, w, _ = image.shape 106 | scale = np.array([[w, h, w, h]]) 107 | 108 | # to tensor 109 | x = torch.from_numpy(transform(image)[0][:, :, (2, 1, 0)]).permute(2, 0, 1) 110 | x = x.unsqueeze(0).to(device) 111 | 112 | t0 = time.time() 113 | # forward 114 | bboxes, scores, cls_inds = net(x) 115 | print("detection time used ", time.time() - t0, "s") 116 | 117 | # rescale 118 | bboxes *= scale 119 | 120 | # vis detection 121 | img_processed = visualize( 122 | img=image, 123 | bboxes=bboxes, 124 | scores=scores, 125 | cls_inds=cls_inds, 126 | vis_thresh=vis_thresh, 127 | class_colors=class_colors, 128 | class_names=class_names, 129 | class_indexs=class_indexs, 130 | dataset_name=dataset_name 131 | ) 132 | if args.show: 133 | cv2.imshow('detection', img_processed) 134 | cv2.waitKey(0) 135 | # save result 136 | cv2.imwrite(os.path.join(save_path, str(index).zfill(6) +'.jpg'), img_processed) 137 | 138 | 139 | if __name__ == '__main__': 140 | # cuda 141 | if args.cuda: 142 | print('use cuda') 143 | cudnn.benchmark = True 144 | device = torch.device("cuda") 145 | else: 146 | device = torch.device("cpu") 147 | 148 | # input size 149 | input_size = args.input_size 150 | 151 | # dataset 152 | if args.dataset == 'voc': 153 | print('test on voc ...') 154 | data_dir = os.path.join(args.data_root, 'VOCdevkit') 155 | class_names = VOC_CLASSES 156 | class_indexs = None 157 | num_classes = 20 158 | dataset = VOCDetection(root=data_dir, 159 | image_sets=[('2007', 'test')]) 160 | 161 | elif args.dataset == 'coco': 162 | print('test on coco-val ...') 163 | data_dir = os.path.join(args.data_root, 'COCO') 164 | class_names = coco_class_labels 165 | class_indexs = coco_class_index 166 | num_classes = 80 167 | dataset = COCODataset( 168 | data_dir=data_dir, 169 | json_file='instances_val2017.json', 170 | name='val2017') 171 | 172 | class_colors = [(np.random.randint(255), 173 | np.random.randint(255), 174 | np.random.randint(255)) for _ in range(num_classes)] 175 | 176 | # model 177 | model_name = args.version 178 | print('Model: ', model_name) 179 | 180 | # load model and config file 181 | if model_name == 'yolov2_d19': 182 | from models.yolov2_d19 import YOLOv2D19 as yolo_net 183 | cfg = config.yolov2_d19_cfg 184 | 185 | elif model_name == 'yolov2_r50': 186 | from models.yolov2_r50 import YOLOv2R50 as yolo_net 187 | cfg = config.yolov2_r50_cfg 188 | 189 | elif model_name == 'yolov3': 190 | from models.yolov3 import YOLOv3 as yolo_net 191 | cfg = config.yolov3_d53_cfg 192 | 193 | elif model_name == 'yolov3_spp': 194 | from models.yolov3_spp import YOLOv3Spp as yolo_net 195 | cfg = config.yolov3_d53_cfg 196 | 197 | elif model_name == 'yolov3_tiny': 198 | from models.yolov3_tiny import YOLOv3tiny as yolo_net 199 | cfg = config.yolov3_tiny_cfg 200 | else: 201 | print('Unknown model name...') 202 | exit(0) 203 | 204 | # build model 205 | anchor_size = cfg['anchor_size_voc'] if args.dataset == 'voc' else cfg['anchor_size_coco'] 206 | net = yolo_net(device=device, 207 | input_size=input_size, 208 | num_classes=num_classes, 209 | trainable=False, 210 | conf_thresh=args.conf_thresh, 211 | nms_thresh=args.nms_thresh, 212 | anchor_size=anchor_size) 213 | 214 | # load weight 215 | net.load_state_dict(torch.load(args.trained_model, map_location=device)) 216 | 
net.to(device).eval() 217 | print('Finished loading model!') 218 | 219 | # evaluation 220 | test(net=net, 221 | device=device, 222 | dataset=dataset, 223 | transform=BaseTransform(input_size), 224 | vis_thresh=args.visual_threshold, 225 | class_colors=class_colors, 226 | class_names=class_names, 227 | class_indexs=class_indexs, 228 | dataset_name=args.dataset 229 | ) 230 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yjh0410/yolov2-yolov3_PyTorch/49dbf4bbcaba6bcf3f81de3ce0a85ec592f618ee/utils/__init__.py -------------------------------------------------------------------------------- /utils/augmentations.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | from numpy import random 4 | 5 | 6 | def intersect(box_a, box_b): 7 | max_xy = np.minimum(box_a[:, 2:], box_b[2:]) 8 | min_xy = np.maximum(box_a[:, :2], box_b[:2]) 9 | inter = np.clip((max_xy - min_xy), a_min=0, a_max=np.inf) 10 | return inter[:, 0] * inter[:, 1] 11 | 12 | 13 | def jaccard_numpy(box_a, box_b): 14 | """Compute the jaccard overlap of two sets of boxes. The jaccard overlap 15 | is simply the intersection over union of two boxes. 16 | E.g.: 17 | A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B) 18 | Args: 19 | box_a: Multiple bounding boxes, Shape: [num_boxes,4] 20 | box_b: Single bounding box, Shape: [4] 21 | Return: 22 | jaccard overlap: Shape: [box_a.shape[0], box_a.shape[1]] 23 | """ 24 | inter = intersect(box_a, box_b) 25 | area_a = ((box_a[:, 2]-box_a[:, 0]) * 26 | (box_a[:, 3]-box_a[:, 1])) # [A,B] 27 | area_b = ((box_b[2]-box_b[0]) * 28 | (box_b[3]-box_b[1])) # [A,B] 29 | union = area_a + area_b - inter 30 | return inter / union # [A,B] 31 | 32 | 33 | class Compose(object): 34 | """Composes several augmentations together. 35 | Args: 36 | transforms (List[Transform]): list of transforms to compose. 37 | Example: 38 | >>> augmentations.Compose([ 39 | >>> transforms.CenterCrop(10), 40 | >>> transforms.ToTensor(), 41 | >>> ]) 42 | """ 43 | 44 | def __init__(self, transforms): 45 | self.transforms = transforms 46 | 47 | def __call__(self, img, boxes=None, labels=None): 48 | for t in self.transforms: 49 | img, boxes, labels = t(img, boxes, labels) 50 | return img, boxes, labels 51 | 52 | 53 | class ConvertFromInts(object): 54 | def __call__(self, image, boxes=None, labels=None): 55 | return image.astype(np.float32), boxes, labels 56 | 57 | 58 | class Normalize(object): 59 | def __init__(self, mean=None, std=None): 60 | self.mean = np.array(mean, dtype=np.float32) 61 | self.std = np.array(std, dtype=np.float32) 62 | 63 | def __call__(self, image, boxes=None, labels=None): 64 | image = image.astype(np.float32) 65 | image /= 255. 
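        # Annotation (explanatory comment added for this write-up; not part of the
        # original augmentations.py): pixels are first scaled from [0, 255] to [0, 1];
        # the channel-wise standardization below then uses the mean/std supplied by
        # SSDAugmentation / ColorAugmentation, i.e. the ImageNet statistics written in
        # BGR order, (0.406, 0.456, 0.485) and (0.225, 0.224, 0.229), since images are
        # loaded with cv2 and therefore arrive as BGR.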
66 | image -= self.mean 67 | image /= self.std 68 | 69 | return image, boxes, labels 70 | 71 | 72 | class ToAbsoluteCoords(object): 73 | def __call__(self, image, boxes=None, labels=None): 74 | height, width, channels = image.shape 75 | boxes[:, 0] *= width 76 | boxes[:, 2] *= width 77 | boxes[:, 1] *= height 78 | boxes[:, 3] *= height 79 | 80 | return image, boxes, labels 81 | 82 | 83 | class ToPercentCoords(object): 84 | def __call__(self, image, boxes=None, labels=None): 85 | height, width, channels = image.shape 86 | boxes[:, 0] /= width 87 | boxes[:, 2] /= width 88 | boxes[:, 1] /= height 89 | boxes[:, 3] /= height 90 | 91 | return image, boxes, labels 92 | 93 | 94 | class Resize(object): 95 | def __init__(self, size=416): 96 | self.size = size 97 | 98 | def __call__(self, image, boxes=None, labels=None): 99 | image = cv2.resize(image, (self.size, self.size)) 100 | return image, boxes, labels 101 | 102 | 103 | class RandomSaturation(object): 104 | def __init__(self, lower=0.5, upper=1.5): 105 | self.lower = lower 106 | self.upper = upper 107 | assert self.upper >= self.lower, "contrast upper must be >= lower." 108 | assert self.lower >= 0, "contrast lower must be non-negative." 109 | 110 | def __call__(self, image, boxes=None, labels=None): 111 | if random.randint(2): 112 | image[:, :, 1] *= random.uniform(self.lower, self.upper) 113 | 114 | return image, boxes, labels 115 | 116 | 117 | class RandomHue(object): 118 | def __init__(self, delta=18.0): 119 | assert delta >= 0.0 and delta <= 360.0 120 | self.delta = delta 121 | 122 | def __call__(self, image, boxes=None, labels=None): 123 | if random.randint(2): 124 | image[:, :, 0] += random.uniform(-self.delta, self.delta) 125 | image[:, :, 0][image[:, :, 0] > 360.0] -= 360.0 126 | image[:, :, 0][image[:, :, 0] < 0.0] += 360.0 127 | return image, boxes, labels 128 | 129 | 130 | class RandomLightingNoise(object): 131 | def __init__(self): 132 | self.perms = ((0, 1, 2), (0, 2, 1), 133 | (1, 0, 2), (1, 2, 0), 134 | (2, 0, 1), (2, 1, 0)) 135 | 136 | def __call__(self, image, boxes=None, labels=None): 137 | if random.randint(2): 138 | swap = self.perms[random.randint(len(self.perms))] 139 | shuffle = SwapChannels(swap) # shuffle channels 140 | image = shuffle(image) 141 | return image, boxes, labels 142 | 143 | 144 | class ConvertColor(object): 145 | def __init__(self, current='BGR', transform='HSV'): 146 | self.transform = transform 147 | self.current = current 148 | 149 | def __call__(self, image, boxes=None, labels=None): 150 | if self.current == 'BGR' and self.transform == 'HSV': 151 | image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV) 152 | elif self.current == 'HSV' and self.transform == 'BGR': 153 | image = cv2.cvtColor(image, cv2.COLOR_HSV2BGR) 154 | else: 155 | raise NotImplementedError 156 | return image, boxes, labels 157 | 158 | 159 | class RandomContrast(object): 160 | def __init__(self, lower=0.5, upper=1.5): 161 | self.lower = lower 162 | self.upper = upper 163 | assert self.upper >= self.lower, "contrast upper must be >= lower." 164 | assert self.lower >= 0, "contrast lower must be non-negative." 
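    # Annotation (explanatory comment added for this write-up; not part of the
    # original augmentations.py): with probability 0.5 the whole image is multiplied
    # by a single factor alpha ~ U(lower, upper), darkening it when alpha < 1 and
    # brightening it when alpha > 1. PhotometricDistort applies this jitter either
    # before the BGR->HSV conversion or after converting back to BGR, but never both
    # in the same call.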
165 | 166 | # expects float image 167 | def __call__(self, image, boxes=None, labels=None): 168 | if random.randint(2): 169 | alpha = random.uniform(self.lower, self.upper) 170 | image *= alpha 171 | return image, boxes, labels 172 | 173 | 174 | class RandomBrightness(object): 175 | def __init__(self, delta=32): 176 | assert delta >= 0.0 177 | assert delta <= 255.0 178 | self.delta = delta 179 | 180 | def __call__(self, image, boxes=None, labels=None): 181 | if random.randint(2): 182 | delta = random.uniform(-self.delta, self.delta) 183 | image += delta 184 | return image, boxes, labels 185 | 186 | 187 | class RandomSampleCrop(object): 188 | """Crop 189 | Arguments: 190 | img (Image): the image being input during training 191 | boxes (Tensor): the original bounding boxes in pt form 192 | labels (Tensor): the class labels for each bbox 193 | mode (float tuple): the min and max jaccard overlaps 194 | Return: 195 | (img, boxes, classes) 196 | img (Image): the cropped image 197 | boxes (Tensor): the adjusted bounding boxes in pt form 198 | labels (Tensor): the class labels for each bbox 199 | """ 200 | def __init__(self): 201 | self.sample_options = ( 202 | # using entire original input image 203 | None, 204 | # sample a patch s.t. MIN jaccard w/ obj in .1,.3,.4,.7,.9 205 | (0.1, None), 206 | (0.3, None), 207 | (0.7, None), 208 | (0.9, None), 209 | # randomly sample a patch 210 | (None, None), 211 | ) 212 | 213 | def __call__(self, image, boxes=None, labels=None): 214 | height, width, _ = image.shape 215 | while True: 216 | # randomly choose a mode 217 | sample_id = np.random.randint(len(self.sample_options)) 218 | mode = self.sample_options[sample_id] 219 | if mode is None: 220 | return image, boxes, labels 221 | 222 | min_iou, max_iou = mode 223 | if min_iou is None: 224 | min_iou = float('-inf') 225 | if max_iou is None: 226 | max_iou = float('inf') 227 | 228 | # max trails (50) 229 | for _ in range(50): 230 | current_image = image 231 | 232 | w = random.uniform(0.3 * width, width) 233 | h = random.uniform(0.3 * height, height) 234 | 235 | # aspect ratio constraint b/t .5 & 2 236 | if h / w < 0.5 or h / w > 2: 237 | continue 238 | 239 | left = random.uniform(width - w) 240 | top = random.uniform(height - h) 241 | 242 | # convert to integer rect x1,y1,x2,y2 243 | rect = np.array([int(left), int(top), int(left+w), int(top+h)]) 244 | 245 | # calculate IoU (jaccard overlap) b/t the cropped and gt boxes 246 | overlap = jaccard_numpy(boxes, rect) 247 | 248 | # is min and max overlap constraint satisfied? if not try again 249 | if overlap.min() < min_iou and max_iou < overlap.max(): 250 | continue 251 | 252 | # cut the crop from the image 253 | current_image = current_image[rect[1]:rect[3], rect[0]:rect[2], 254 | :] 255 | 256 | # keep overlap with gt box IF center in sampled patch 257 | centers = (boxes[:, :2] + boxes[:, 2:]) / 2.0 258 | 259 | # mask in all gt boxes that above and to the left of centers 260 | m1 = (rect[0] < centers[:, 0]) * (rect[1] < centers[:, 1]) 261 | 262 | # mask in all gt boxes that under and to the right of centers 263 | m2 = (rect[2] > centers[:, 0]) * (rect[3] > centers[:, 1]) 264 | 265 | # mask in that both m1 and m2 are true 266 | mask = m1 * m2 267 | 268 | # have any valid boxes? 
try again if not 269 | if not mask.any(): 270 | continue 271 | 272 | # take only matching gt boxes 273 | current_boxes = boxes[mask, :].copy() 274 | 275 | # take only matching gt labels 276 | current_labels = labels[mask] 277 | 278 | # should we use the box left and top corner or the crop's 279 | current_boxes[:, :2] = np.maximum(current_boxes[:, :2], 280 | rect[:2]) 281 | # adjust to crop (by substracting crop's left,top) 282 | current_boxes[:, :2] -= rect[:2] 283 | 284 | current_boxes[:, 2:] = np.minimum(current_boxes[:, 2:], 285 | rect[2:]) 286 | # adjust to crop (by substracting crop's left,top) 287 | current_boxes[:, 2:] -= rect[:2] 288 | 289 | return current_image, current_boxes, current_labels 290 | 291 | 292 | class RandomMirror(object): 293 | def __call__(self, image, boxes, classes): 294 | _, width, _ = image.shape 295 | if random.randint(2): 296 | image = image[:, ::-1] 297 | boxes = boxes.copy() 298 | boxes[:, 0::2] = width - boxes[:, 2::-2] 299 | return image, boxes, classes 300 | 301 | 302 | class SwapChannels(object): 303 | """Transforms a tensorized image by swapping the channels in the order 304 | specified in the swap tuple. 305 | Args: 306 | swaps (int triple): final order of channels 307 | eg: (2, 1, 0) 308 | """ 309 | 310 | def __init__(self, swaps): 311 | self.swaps = swaps 312 | 313 | def __call__(self, image): 314 | """ 315 | Args: 316 | image (Tensor): image tensor to be transformed 317 | Return: 318 | a tensor with channels swapped according to swap 319 | """ 320 | # if torch.is_tensor(image): 321 | # image = image.data.cpu().numpy() 322 | # else: 323 | # image = np.array(image) 324 | image = image[:, :, self.swaps] 325 | return image 326 | 327 | 328 | class PhotometricDistort(object): 329 | def __init__(self): 330 | self.pd = [ 331 | RandomContrast(), 332 | ConvertColor(transform='HSV'), 333 | RandomSaturation(), 334 | RandomHue(), 335 | ConvertColor(current='HSV', transform='BGR'), 336 | RandomContrast() 337 | ] 338 | self.rand_brightness = RandomBrightness() 339 | # self.rand_light_noise = RandomLightingNoise() 340 | 341 | def __call__(self, image, boxes, labels): 342 | im = image.copy() 343 | im, boxes, labels = self.rand_brightness(im, boxes, labels) 344 | if random.randint(2): 345 | distort = Compose(self.pd[:-1]) 346 | else: 347 | distort = Compose(self.pd[1:]) 348 | im, boxes, labels = distort(im, boxes, labels) 349 | return im, boxes, labels 350 | # return self.rand_light_noise(im, boxes, labels) 351 | 352 | 353 | class SSDAugmentation(object): 354 | def __init__(self, size=416, mean=(0.406, 0.456, 0.485), std=(0.225, 0.224, 0.229)): 355 | self.mean = mean 356 | self.size = size 357 | self.std = std 358 | self.augment = Compose([ 359 | ConvertFromInts(), 360 | ToAbsoluteCoords(), 361 | PhotometricDistort(), 362 | RandomSampleCrop(), 363 | RandomMirror(), 364 | ToPercentCoords(), 365 | Resize(self.size), 366 | Normalize(self.mean, self.std) 367 | ]) 368 | 369 | def __call__(self, img, boxes, labels): 370 | return self.augment(img, boxes, labels) 371 | 372 | 373 | class ColorAugmentation(object): 374 | def __init__(self, size=416, mean=(0.406, 0.456, 0.485), std=(0.225, 0.224, 0.229)): 375 | self.mean = mean 376 | self.size = size 377 | self.std = std 378 | self.augment = Compose([ 379 | ConvertFromInts(), 380 | ToAbsoluteCoords(), 381 | PhotometricDistort(), 382 | RandomMirror(), 383 | ToPercentCoords(), 384 | Resize(self.size), 385 | Normalize(self.mean, self.std) 386 | ]) 387 | 388 | def __call__(self, img, boxes, labels): 389 | return self.augment(img, 
boxes, labels) 390 | -------------------------------------------------------------------------------- /utils/cocoapi_evaluator.py: -------------------------------------------------------------------------------- 1 | import json 2 | import tempfile 3 | 4 | from pycocotools.cocoeval import COCOeval 5 | from torch.autograd import Variable 6 | 7 | from data.coco2017 import * 8 | from data import * 9 | 10 | 11 | class COCOAPIEvaluator(): 12 | """ 13 | COCO AP Evaluation class. 14 | All the data in the val2017 dataset are processed \ 15 | and evaluated by COCO API. 16 | """ 17 | def __init__(self, data_dir, img_size, device, testset=False, transform=None): 18 | """ 19 | Args: 20 | data_dir (str): dataset root directory 21 | img_size (int): image size after preprocess. images are resized \ 22 | to squares whose shape is (img_size, img_size). 23 | confthre (float): 24 | confidence threshold ranging from 0 to 1, \ 25 | which is defined in the config file. 26 | nmsthre (float): 27 | IoU threshold of non-max supression ranging from 0 to 1. 28 | """ 29 | self.testset = testset 30 | if self.testset: 31 | json_file='image_info_test-dev2017.json' 32 | name = 'test2017' 33 | else: 34 | json_file='instances_val2017.json' 35 | name='val2017' 36 | 37 | self.dataset = COCODataset(data_dir=data_dir, 38 | json_file=json_file, 39 | name=name) 40 | self.img_size = img_size 41 | self.transform = transform 42 | self.device = device 43 | 44 | self.map = 0. 45 | self.ap50_95 = 0. 46 | self.ap50 = 0. 47 | 48 | def evaluate(self, model): 49 | """ 50 | COCO average precision (AP) Evaluation. Iterate inference on the test dataset 51 | and the results are evaluated by COCO API. 52 | Args: 53 | model : model object 54 | Returns: 55 | ap50_95 (float) : calculated COCO AP for IoU=50:95 56 | ap50 (float) : calculated COCO AP for IoU=50 57 | """ 58 | model.eval() 59 | ids = [] 60 | data_dict = [] 61 | num_images = len(self.dataset) 62 | print('total number of images: %d' % (num_images)) 63 | 64 | # start testing 65 | for index in range(num_images): # all the data in val2017 66 | if index % 500 == 0: 67 | print('[Eval: %d / %d]'%(index, num_images)) 68 | 69 | img, id_ = self.dataset.pull_image(index) # load a batch 70 | if self.transform is not None: 71 | x = torch.from_numpy(self.transform(img)[0][:, :, (2, 1, 0)]).permute(2, 0, 1) 72 | x = x.unsqueeze(0).to(self.device) 73 | scale = np.array([[img.shape[1], img.shape[0], 74 | img.shape[1], img.shape[0]]]) 75 | 76 | id_ = int(id_) 77 | ids.append(id_) 78 | with torch.no_grad(): 79 | outputs = model(x) 80 | bboxes, scores, cls_inds = outputs 81 | bboxes *= scale 82 | for i, box in enumerate(bboxes): 83 | x1 = float(box[0]) 84 | y1 = float(box[1]) 85 | x2 = float(box[2]) 86 | y2 = float(box[3]) 87 | label = self.dataset.class_ids[int(cls_inds[i])] 88 | 89 | bbox = [x1, y1, x2 - x1, y2 - y1] 90 | score = float(scores[i]) # object score * class score 91 | A = {"image_id": id_, "category_id": label, "bbox": bbox, 92 | "score": score} # COCO json format 93 | data_dict.append(A) 94 | 95 | annType = ['segm', 'bbox', 'keypoints'] 96 | 97 | # Evaluate the Dt (detection) json comparing with the ground truth 98 | if len(data_dict) > 0: 99 | print('evaluating ......') 100 | cocoGt = self.dataset.coco 101 | # For test 102 | if self.testset: 103 | json.dump(data_dict, open('yolov2_2017.json', 'w')) 104 | cocoDt = cocoGt.loadRes('yolov2_2017.json') 105 | print('inference on test-dev is done !!') 106 | return -1, -1 107 | # For val 108 | else: 109 | _, tmp = tempfile.mkstemp() 110 | 
json.dump(data_dict, open(tmp, 'w')) 111 | cocoDt = cocoGt.loadRes(tmp) 112 | cocoEval = COCOeval(self.dataset.coco, cocoDt, annType[1]) 113 | cocoEval.params.imgIds = ids 114 | cocoEval.evaluate() 115 | cocoEval.accumulate() 116 | cocoEval.summarize() 117 | 118 | ap50_95, ap50 = cocoEval.stats[0], cocoEval.stats[1] 119 | print('ap50_95 : ', ap50_95) 120 | print('ap50 : ', ap50) 121 | self.map = ap50_95 122 | self.ap50_95 = ap50_95 123 | self.ap50 = ap50 124 | 125 | return ap50, ap50_95 126 | else: 127 | return 0, 0 128 | 129 | -------------------------------------------------------------------------------- /utils/com_paras_flops.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from thop import profile 3 | 4 | 5 | def FLOPs_and_Params(model, size, device): 6 | x = torch.randn(1, 3, size, size).to(device) 7 | model.trainable = False 8 | model.eval() 9 | 10 | flops, params = profile(model, inputs=(x, )) 11 | print('FLOPs : ', flops / 1e9, ' B') 12 | print('Params : ', params / 1e6, ' M') 13 | 14 | model.trainable = True 15 | model.train() 16 | 17 | 18 | if __name__ == "__main__": 19 | pass 20 | -------------------------------------------------------------------------------- /utils/distributed_utils.py: -------------------------------------------------------------------------------- 1 | # from github: https://github.com/ruinmessi/ASFF/blob/master/utils/distributed_util.py 2 | 3 | import torch 4 | import torch.distributed as dist 5 | import os 6 | import subprocess 7 | import pickle 8 | 9 | 10 | def all_gather(data): 11 | """ 12 | Run all_gather on arbitrary picklable data (not necessarily tensors) 13 | Args: 14 | data: any picklable object 15 | Returns: 16 | list[data]: list of data gathered from each rank 17 | """ 18 | world_size = get_world_size() 19 | if world_size == 1: 20 | return [data] 21 | 22 | # serialized to a Tensor 23 | buffer = pickle.dumps(data) 24 | storage = torch.ByteStorage.from_buffer(buffer) 25 | tensor = torch.ByteTensor(storage).to("cuda") 26 | 27 | # obtain Tensor size of each rank 28 | local_size = torch.tensor([tensor.numel()], device="cuda") 29 | size_list = [torch.tensor([0], device="cuda") for _ in range(world_size)] 30 | dist.all_gather(size_list, local_size) 31 | size_list = [int(size.item()) for size in size_list] 32 | max_size = max(size_list) 33 | 34 | # receiving Tensor from all ranks 35 | # we pad the tensor because torch all_gather does not support 36 | # gathering tensors of different shapes 37 | tensor_list = [] 38 | for _ in size_list: 39 | tensor_list.append(torch.empty((max_size,), dtype=torch.uint8, device="cuda")) 40 | if local_size != max_size: 41 | padding = torch.empty(size=(max_size - local_size,), dtype=torch.uint8, device="cuda") 42 | tensor = torch.cat((tensor, padding), dim=0) 43 | dist.all_gather(tensor_list, tensor) 44 | 45 | data_list = [] 46 | for size, tensor in zip(size_list, tensor_list): 47 | buffer = tensor.cpu().numpy().tobytes()[:size] 48 | data_list.append(pickle.loads(buffer)) 49 | 50 | return data_list 51 | 52 | 53 | def reduce_dict(input_dict, average=True): 54 | """ 55 | Args: 56 | input_dict (dict): all the values will be reduced 57 | average (bool): whether to do average or sum 58 | Reduce the values in the dictionary from all processes so that all processes 59 | have the averaged results. Returns a dict with the same fields as 60 | input_dict, after reduction. 
61 | """ 62 | world_size = get_world_size() 63 | if world_size < 2: 64 | return input_dict 65 | with torch.no_grad(): 66 | names = [] 67 | values = [] 68 | # sort the keys so that they are consistent across processes 69 | for k in sorted(input_dict.keys()): 70 | names.append(k) 71 | values.append(input_dict[k]) 72 | values = torch.stack(values, dim=0) 73 | dist.all_reduce(values) 74 | if average: 75 | values /= world_size 76 | reduced_dict = {k: v for k, v in zip(names, values)} 77 | return reduced_dict 78 | 79 | 80 | def get_sha(): 81 | cwd = os.path.dirname(os.path.abspath(__file__)) 82 | 83 | def _run(command): 84 | return subprocess.check_output(command, cwd=cwd).decode('ascii').strip() 85 | sha = 'N/A' 86 | diff = "clean" 87 | branch = 'N/A' 88 | try: 89 | sha = _run(['git', 'rev-parse', 'HEAD']) 90 | subprocess.check_output(['git', 'diff'], cwd=cwd) 91 | diff = _run(['git', 'diff-index', 'HEAD']) 92 | diff = "has uncommited changes" if diff else "clean" 93 | branch = _run(['git', 'rev-parse', '--abbrev-ref', 'HEAD']) 94 | except Exception: 95 | pass 96 | message = f"sha: {sha}, status: {diff}, branch: {branch}" 97 | return message 98 | 99 | 100 | def setup_for_distributed(is_master): 101 | """ 102 | This function disables printing when not in master process 103 | """ 104 | import builtins as __builtin__ 105 | builtin_print = __builtin__.print 106 | 107 | def print(*args, **kwargs): 108 | force = kwargs.pop('force', False) 109 | if is_master or force: 110 | builtin_print(*args, **kwargs) 111 | 112 | __builtin__.print = print 113 | 114 | 115 | def is_dist_avail_and_initialized(): 116 | if not dist.is_available(): 117 | return False 118 | if not dist.is_initialized(): 119 | return False 120 | return True 121 | 122 | 123 | def get_world_size(): 124 | if not is_dist_avail_and_initialized(): 125 | return 1 126 | return dist.get_world_size() 127 | 128 | 129 | def get_rank(): 130 | if not is_dist_avail_and_initialized(): 131 | return 0 132 | return dist.get_rank() 133 | 134 | 135 | def is_main_process(): 136 | return get_rank() == 0 137 | 138 | 139 | def save_on_master(*args, **kwargs): 140 | if is_main_process(): 141 | torch.save(*args, **kwargs) 142 | 143 | 144 | def init_distributed_mode(args): 145 | if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ: 146 | args.rank = int(os.environ["RANK"]) 147 | args.world_size = int(os.environ['WORLD_SIZE']) 148 | args.gpu = int(os.environ['LOCAL_RANK']) 149 | elif 'SLURM_PROCID' in os.environ: 150 | args.rank = int(os.environ['SLURM_PROCID']) 151 | args.gpu = args.rank % torch.cuda.device_count() 152 | else: 153 | print('Not using distributed mode') 154 | args.distributed = False 155 | return 156 | 157 | args.distributed = True 158 | 159 | torch.cuda.set_device(args.gpu) 160 | args.dist_backend = 'nccl' 161 | print('| distributed init (rank {}): {}'.format( 162 | args.rank, args.dist_url), flush=True) 163 | torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url, 164 | world_size=args.world_size, rank=args.rank) 165 | torch.distributed.barrier() 166 | setup_for_distributed(args.rank == 0) 167 | -------------------------------------------------------------------------------- /utils/kmeans_anchor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | import argparse 4 | import os 5 | import sys 6 | sys.path.append('..') 7 | 8 | from data.voc0712 import VOCDetection 9 | from data.coco2017 import COCODataset 10 | 11 | 12 | def parse_args(): 13 
| parser = argparse.ArgumentParser(description='kmeans for anchor box') 14 | 15 | parser.add_argument('-root', '--data_root', default='/mnt/share/ssd2/dataset', 16 | help='dataset root') 17 | parser.add_argument('-d', '--dataset', default='coco', 18 | help='coco, voc.') 19 | parser.add_argument('-na', '--num_anchorbox', default=9, type=int, 20 | help='number of anchor box.') 21 | parser.add_argument('-size', '--input_size', default=416, type=int, 22 | help='input size.') 23 | parser.add_argument('--scale', action='store_true', default=False, 24 | help='divide the sizes of anchor boxes by 32 .') 25 | return parser.parse_args() 26 | 27 | args = parse_args() 28 | 29 | 30 | class Box(): 31 | def __init__(self, x, y, w, h): 32 | self.x = x 33 | self.y = y 34 | self.w = w 35 | self.h = h 36 | 37 | 38 | def iou(box1, box2): 39 | x1, y1, w1, h1 = box1.x, box1.y, box1.w, box1.h 40 | x2, y2, w2, h2 = box2.x, box2.y, box2.w, box2.h 41 | 42 | S_1 = w1 * h1 43 | S_2 = w2 * h2 44 | 45 | xmin_1, ymin_1 = x1 - w1 / 2, y1 - h1 / 2 46 | xmax_1, ymax_1 = x1 + w1 / 2, y1 + h1 / 2 47 | xmin_2, ymin_2 = x2 - w2 / 2, y2 - h2 / 2 48 | xmax_2, ymax_2 = x2 + w2 / 2, y2 + h2 / 2 49 | 50 | I_w = min(xmax_1, xmax_2) - max(xmin_1, xmin_2) 51 | I_h = min(ymax_1, ymax_2) - max(ymin_1, ymin_2) 52 | if I_w < 0 or I_h < 0: 53 | return 0 54 | I = I_w * I_h 55 | 56 | IoU = I / (S_1 + S_2 - I) 57 | 58 | return IoU 59 | 60 | 61 | def init_centroids(boxes, n_anchors): 62 | """ 63 | We use kmeans++ to initialize centroids. 64 | """ 65 | centroids = [] 66 | boxes_num = len(boxes) 67 | 68 | centroid_index = int(np.random.choice(boxes_num, 1)[0]) 69 | centroids.append(boxes[centroid_index]) 70 | print(centroids[0].w,centroids[0].h) 71 | 72 | for centroid_index in range(0, n_anchors-1): 73 | sum_distance = 0 74 | distance_thresh = 0 75 | distance_list = [] 76 | cur_sum = 0 77 | 78 | for box in boxes: 79 | min_distance = 1 80 | for centroid_i, centroid in enumerate(centroids): 81 | distance = (1 - iou(box, centroid)) 82 | if distance < min_distance: 83 | min_distance = distance 84 | sum_distance += min_distance 85 | distance_list.append(min_distance) 86 | 87 | distance_thresh = sum_distance * np.random.random() 88 | 89 | for i in range(0, boxes_num): 90 | cur_sum += distance_list[i] 91 | if cur_sum > distance_thresh: 92 | centroids.append(boxes[i]) 93 | print(boxes[i].w, boxes[i].h) 94 | break 95 | return centroids 96 | 97 | 98 | def do_kmeans(n_anchors, boxes, centroids): 99 | loss = 0 100 | groups = [] 101 | new_centroids = [] 102 | # for box in centroids: 103 | # print('box: ', box.x, box.y, box.w, box.h) 104 | # exit() 105 | for i in range(n_anchors): 106 | groups.append([]) 107 | new_centroids.append(Box(0, 0, 0, 0)) 108 | 109 | for box in boxes: 110 | min_distance = 1 111 | group_index = 0 112 | for centroid_index, centroid in enumerate(centroids): 113 | distance = (1 - iou(box, centroid)) 114 | if distance < min_distance: 115 | min_distance = distance 116 | group_index = centroid_index 117 | groups[group_index].append(box) 118 | loss += min_distance 119 | new_centroids[group_index].w += box.w 120 | new_centroids[group_index].h += box.h 121 | 122 | for i in range(n_anchors): 123 | new_centroids[i].w /= max(len(groups[i]), 1) 124 | new_centroids[i].h /= max(len(groups[i]), 1) 125 | 126 | return new_centroids, groups, loss# / len(boxes) 127 | 128 | 129 | def anchor_box_kmeans(total_gt_boxes, n_anchors, loss_convergence, iters, plus=True): 130 | """ 131 | This function will use k-means to get appropriate anchor boxes for train 
dataset. 132 | Input: 133 | total_gt_boxes: 134 | n_anchor : int -> the number of anchor boxes. 135 | loss_convergence : float -> threshold of iterating convergence. 136 | iters: int -> the number of iterations for training kmeans. 137 | Output: anchor_boxes : list -> [[w1, h1], [w2, h2], ..., [wn, hn]]. 138 | """ 139 | boxes = total_gt_boxes 140 | centroids = [] 141 | if plus: 142 | centroids = init_centroids(boxes, n_anchors) 143 | else: 144 | total_indexs = range(len(boxes)) 145 | sample_indexs = random.sample(total_indexs, n_anchors) 146 | for i in sample_indexs: 147 | centroids.append(boxes[i]) 148 | 149 | # iterate k-means 150 | centroids, groups, old_loss = do_kmeans(n_anchors, boxes, centroids) 151 | iterations = 1 152 | while(True): 153 | centroids, groups, loss = do_kmeans(n_anchors, boxes, centroids) 154 | iterations += 1 155 | print("Loss = %f" % loss) 156 | if abs(old_loss - loss) < loss_convergence or iterations > iters: 157 | break 158 | old_loss = loss 159 | 160 | for centroid in centroids: 161 | print(centroid.w, centroid.h) 162 | 163 | print("k-means result : ") 164 | for centroid in centroids: 165 | if args.scale: 166 | print("w, h: ", round(centroid.w / 32., 2), round(centroid.h / 32., 2), 167 | "area: ", round(centroid.w / 32., 2) * round(centroid.h / 32., 2)) 168 | else: 169 | print("w, h: ", round(centroid.w, 2), round(centroid.h, 2), 170 | "area: ", round(centroid.w, 2) * round(centroid.h, 2)) 171 | 172 | return centroids 173 | 174 | 175 | if __name__ == "__main__": 176 | 177 | n_anchors = args.num_anchorbox 178 | img_size = args.img_size 179 | dataset = args.dataset 180 | 181 | loss_convergence = 1e-6 182 | iters_n = 1000 183 | 184 | dataset_voc = VOCDetection(data_dir=os.path.join(args.root, 'VOCdevkit'), 185 | img_size=img_size) 186 | 187 | dataset_coco = COCODataset(data_dir=os.path.join(args.root, 'COCO'), 188 | img_size=img_size) 189 | 190 | boxes = [] 191 | print("The dataset size: ", len(dataset)) 192 | print("Loading the dataset ...") 193 | # VOC 194 | for i in range(len(dataset_voc)): 195 | if i % 5000 == 0: 196 | print('Loading voc data [%d / %d]' % (i+1, len(dataset_voc))) 197 | 198 | # For VOC 199 | img, _ = dataset_voc.pull_image(i) 200 | w, h = img.shape[1], img.shape[0] 201 | _, annotation = dataset_voc.pull_anno(i) 202 | 203 | # prepare bbox datas 204 | for box_and_label in annotation: 205 | box = box_and_label[:-1] 206 | xmin, ymin, xmax, ymax = box 207 | bw = (xmax - xmin) / w * img_size 208 | bh = (ymax - ymin) / h * img_size 209 | # check bbox 210 | if bw < 1.0 or bh < 1.0: 211 | continue 212 | boxes.append(Box(0, 0, bw, bh)) 213 | 214 | # COCO 215 | for i in range(len(dataset_coco)): 216 | if i % 5000 == 0: 217 | print('Loading coco datat [%d / %d]' % (i+1, len(dataset_coco))) 218 | 219 | # For COCO 220 | img, _ = dataset_coco.pull_image(i) 221 | w, h = img.shape[1], img.shape[0] 222 | annotation = dataset_coco.pull_anno(i) 223 | 224 | # prepare bbox datas 225 | for box_and_label in annotation: 226 | box = box_and_label[:-1] 227 | xmin, ymin, xmax, ymax = box 228 | bw = (xmax - xmin) / w * img_size 229 | bh = (ymax - ymin) / h * img_size 230 | # check bbox 231 | if bw < 1.0 or bh < 1.0: 232 | continue 233 | boxes.append(Box(0, 0, bw, bh)) 234 | 235 | print("Number of all bboxes: ", len(boxes)) 236 | print("Start k-means !") 237 | centroids = anchor_box_kmeans(boxes, n_anchors, loss_convergence, iters_n, plus=True) 238 | -------------------------------------------------------------------------------- /utils/modules.py: 
-------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | from copy import deepcopy 5 | 6 | 7 | class Conv(nn.Module): 8 | def __init__(self, in_ch, out_ch, k=1, p=0, s=1, d=1, g=1, act=True): 9 | super(Conv, self).__init__() 10 | if act: 11 | self.convs = nn.Sequential( 12 | nn.Conv2d(in_ch, out_ch, k, stride=s, padding=p, dilation=d, groups=g), 13 | nn.BatchNorm2d(out_ch), 14 | nn.LeakyReLU(0.1, inplace=True) 15 | ) 16 | else: 17 | self.convs = nn.Sequential( 18 | nn.Conv2d(in_ch, out_ch, k, stride=s, padding=p, dilation=d, groups=g), 19 | nn.BatchNorm2d(out_ch) 20 | ) 21 | 22 | def forward(self, x): 23 | return self.convs(x) 24 | 25 | 26 | class UpSample(nn.Module): 27 | def __init__(self, size=None, scale_factor=None, mode='nearest', align_corner=None): 28 | super(UpSample, self).__init__() 29 | self.size = size 30 | self.scale_factor = scale_factor 31 | self.mode = mode 32 | self.align_corner = align_corner 33 | 34 | def forward(self, x): 35 | return torch.nn.functional.interpolate(x, size=self.size, scale_factor=self.scale_factor, 36 | mode=self.mode, align_corners=self.align_corner) 37 | 38 | 39 | class reorg_layer(nn.Module): 40 | def __init__(self, stride): 41 | super(reorg_layer, self).__init__() 42 | self.stride = stride 43 | 44 | def forward(self, x): 45 | batch_size, channels, height, width = x.size() 46 | _height, _width = height // self.stride, width // self.stride 47 | 48 | x = x.view(batch_size, channels, _height, self.stride, _width, self.stride).transpose(3, 4).contiguous() 49 | x = x.view(batch_size, channels, _height * _width, self.stride * self.stride).transpose(2, 3).contiguous() 50 | x = x.view(batch_size, channels, self.stride * self.stride, _height, _width).transpose(1, 2).contiguous() 51 | x = x.view(batch_size, -1, _height, _width) 52 | 53 | return x 54 | 55 | 56 | class SPP(nn.Module): 57 | """ 58 | Spatial Pyramid Pooling 59 | """ 60 | def __init__(self): 61 | super(SPP, self).__init__() 62 | 63 | def forward(self, x): 64 | x_1 = torch.nn.functional.max_pool2d(x, 5, stride=1, padding=2) 65 | x_2 = torch.nn.functional.max_pool2d(x, 9, stride=1, padding=4) 66 | x_3 = torch.nn.functional.max_pool2d(x, 13, stride=1, padding=6) 67 | x = torch.cat([x, x_1, x_2, x_3], dim=1) 68 | 69 | return x 70 | 71 | 72 | class ModelEMA(object): 73 | def __init__(self, model, decay=0.9999, updates=0): 74 | # create EMA 75 | self.ema = deepcopy(model).eval() 76 | self.updates = updates 77 | self.decay = lambda x: decay * (1 - math.exp(-x / 2000.)) 78 | for p in self.ema.parameters(): 79 | p.requires_grad_(False) 80 | 81 | def update(self, model): 82 | # Update EMA parameters 83 | with torch.no_grad(): 84 | self.updates += 1 85 | d = self.decay(self.updates) 86 | 87 | msd = model.state_dict() 88 | for k, v in self.ema.state_dict().items(): 89 | if v.dtype.is_floating_point: 90 | v *= d 91 | v += (1. 
- d) * msd[k].detach() 92 | -------------------------------------------------------------------------------- /utils/vocapi_evaluator.py: -------------------------------------------------------------------------------- 1 | """Adapted from: 2 | @longcw faster_rcnn_pytorch: https://github.com/longcw/faster_rcnn_pytorch 3 | @rbgirshick py-faster-rcnn https://github.com/rbgirshick/py-faster-rcnn 4 | Licensed under The MIT License [see LICENSE for details] 5 | """ 6 | 7 | from torch.autograd import Variable 8 | from data.voc0712 import VOCDetection, VOC_CLASSES 9 | import sys 10 | import os 11 | import time 12 | import numpy as np 13 | import pickle 14 | import xml.etree.ElementTree as ET 15 | 16 | 17 | class VOCAPIEvaluator(): 18 | """ VOC AP Evaluation class """ 19 | def __init__(self, data_root, img_size, device, transform, set_type='test', year='2007', display=False): 20 | self.data_root = data_root 21 | self.img_size = img_size 22 | self.device = device 23 | self.transform = transform 24 | self.labelmap = VOC_CLASSES 25 | self.set_type = set_type 26 | self.year = year 27 | self.display = display 28 | 29 | # path 30 | self.devkit_path = data_root + 'VOC' + year 31 | self.annopath = os.path.join(data_root, 'VOC2007', 'Annotations', '%s.xml') 32 | self.imgpath = os.path.join(data_root, 'VOC2007', 'JPEGImages', '%s.jpg') 33 | self.imgsetpath = os.path.join(data_root, 'VOC2007', 'ImageSets', 'Main', set_type+'.txt') 34 | self.output_dir = self.get_output_dir('voc_eval/', self.set_type) 35 | 36 | # dataset 37 | self.dataset = VOCDetection(data_dir=data_root, 38 | image_sets=[('2007', set_type)], 39 | transform=transform 40 | ) 41 | 42 | def evaluate(self, net): 43 | net.eval() 44 | num_images = len(self.dataset) 45 | # all detections are collected into: 46 | # all_boxes[cls][image] = N x 5 array of detections in 47 | # (x1, y1, x2, y2, score) 48 | self.all_boxes = [[[] for _ in range(num_images)] 49 | for _ in range(len(self.labelmap))] 50 | 51 | # timers 52 | det_file = os.path.join(self.output_dir, 'detections.pkl') 53 | 54 | for i in range(num_images): 55 | im, gt, h, w = self.dataset.pull_item(i) 56 | 57 | x = Variable(im.unsqueeze(0)).to(self.device) 58 | t0 = time.time() 59 | # forward 60 | bboxes, scores, cls_inds = net(x) 61 | detect_time = time.time() - t0 62 | scale = np.array([[w, h, w, h]]) 63 | bboxes *= scale 64 | 65 | for j in range(len(self.labelmap)): 66 | inds = np.where(cls_inds == j)[0] 67 | if len(inds) == 0: 68 | self.all_boxes[j][i] = np.empty([0, 5], dtype=np.float32) 69 | continue 70 | c_bboxes = bboxes[inds] 71 | c_scores = scores[inds] 72 | c_dets = np.hstack((c_bboxes, 73 | c_scores[:, np.newaxis])).astype(np.float32, 74 | copy=False) 75 | self.all_boxes[j][i] = c_dets 76 | 77 | if i % 500 == 0: 78 | print('im_detect: {:d}/{:d} {:.3f}s'.format(i + 1, num_images, detect_time)) 79 | 80 | with open(det_file, 'wb') as f: 81 | pickle.dump(self.all_boxes, f, pickle.HIGHEST_PROTOCOL) 82 | 83 | print('Evaluating detections') 84 | self.evaluate_detections(self.all_boxes) 85 | 86 | print('Mean AP: ', self.map) 87 | 88 | 89 | def parse_rec(self, filename): 90 | """ Parse a PASCAL VOC xml file """ 91 | tree = ET.parse(filename) 92 | objects = [] 93 | for obj in tree.findall('object'): 94 | obj_struct = {} 95 | obj_struct['name'] = obj.find('name').text 96 | obj_struct['pose'] = obj.find('pose').text 97 | obj_struct['truncated'] = int(obj.find('truncated').text) 98 | obj_struct['difficult'] = int(obj.find('difficult').text) 99 | bbox = obj.find('bndbox') 100 | obj_struct['bbox'] = 
[int(bbox.find('xmin').text), 101 | int(bbox.find('ymin').text), 102 | int(bbox.find('xmax').text), 103 | int(bbox.find('ymax').text)] 104 | objects.append(obj_struct) 105 | 106 | return objects 107 | 108 | 109 | def get_output_dir(self, name, phase): 110 | """Return the directory where experimental artifacts are placed. 111 | If the directory does not exist, it is created. 112 | A canonical path is built using the name from an imdb and a network 113 | (if not None). 114 | """ 115 | filedir = os.path.join(name, phase) 116 | if not os.path.exists(filedir): 117 | os.makedirs(filedir) 118 | return filedir 119 | 120 | 121 | def get_voc_results_file_template(self, cls): 122 | # VOCdevkit/VOC2007/results/det_test_aeroplane.txt 123 | filename = 'det_' + self.set_type + '_%s.txt' % (cls) 124 | filedir = os.path.join(self.devkit_path, 'results') 125 | if not os.path.exists(filedir): 126 | os.makedirs(filedir) 127 | path = os.path.join(filedir, filename) 128 | return path 129 | 130 | 131 | def write_voc_results_file(self, all_boxes): 132 | for cls_ind, cls in enumerate(self.labelmap): 133 | if self.display: 134 | print('Writing {:s} VOC results file'.format(cls)) 135 | filename = self.get_voc_results_file_template(cls) 136 | with open(filename, 'wt') as f: 137 | for im_ind, index in enumerate(self.dataset.ids): 138 | dets = all_boxes[cls_ind][im_ind] 139 | if dets == []: 140 | continue 141 | # the VOCdevkit expects 1-based indices 142 | for k in range(dets.shape[0]): 143 | f.write('{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}\n'. 144 | format(index[1], dets[k, -1], 145 | dets[k, 0] + 1, dets[k, 1] + 1, 146 | dets[k, 2] + 1, dets[k, 3] + 1)) 147 | 148 | 149 | def do_python_eval(self, use_07=True): 150 | cachedir = os.path.join(self.devkit_path, 'annotations_cache') 151 | aps = [] 152 | # The PASCAL VOC metric changed in 2010 153 | use_07_metric = use_07 154 | print('VOC07 metric? ' + ('Yes' if use_07_metric else 'No')) 155 | if not os.path.isdir(self.output_dir): 156 | os.mkdir(self.output_dir) 157 | for i, cls in enumerate(self.labelmap): 158 | filename = self.get_voc_results_file_template(cls) 159 | rec, prec, ap = self.voc_eval(detpath=filename, 160 | classname=cls, 161 | cachedir=cachedir, 162 | ovthresh=0.5, 163 | use_07_metric=use_07_metric 164 | ) 165 | aps += [ap] 166 | print('AP for {} = {:.4f}'.format(cls, ap)) 167 | with open(os.path.join(self.output_dir, cls + '_pr.pkl'), 'wb') as f: 168 | pickle.dump({'rec': rec, 'prec': prec, 'ap': ap}, f) 169 | if self.display: 170 | self.map = np.mean(aps) 171 | print('Mean AP = {:.4f}'.format(np.mean(aps))) 172 | print('~~~~~~~~') 173 | print('Results:') 174 | for ap in aps: 175 | print('{:.3f}'.format(ap)) 176 | print('{:.3f}'.format(np.mean(aps))) 177 | print('~~~~~~~~') 178 | print('') 179 | print('--------------------------------------------------------------') 180 | print('Results computed with the **unofficial** Python eval code.') 181 | print('Results should be very close to the official MATLAB eval code.') 182 | print('--------------------------------------------------------------') 183 | else: 184 | self.map = np.mean(aps) 185 | print('Mean AP = {:.4f}'.format(np.mean(aps))) 186 | 187 | 188 | def voc_ap(self, rec, prec, use_07_metric=True): 189 | """ ap = voc_ap(rec, prec, [use_07_metric]) 190 | Compute VOC AP given precision and recall. 191 | If use_07_metric is true, uses the 192 | VOC 07 11 point method (default:True). 193 | """ 194 | if use_07_metric: 195 | # 11 point metric 196 | ap = 0. 
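            # 11-point interpolation: for each recall threshold t in
            # {0.0, 0.1, ..., 1.0}, take the best precision among all
            # operating points with recall >= t, then average the 11 values
            # (the pre-2010 PASCAL VOC convention implemented below).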
197 | for t in np.arange(0., 1.1, 0.1): 198 | if np.sum(rec >= t) == 0: 199 | p = 0 200 | else: 201 | p = np.max(prec[rec >= t]) 202 | ap = ap + p / 11. 203 | else: 204 | # correct AP calculation 205 | # first append sentinel values at the end 206 | mrec = np.concatenate(([0.], rec, [1.])) 207 | mpre = np.concatenate(([0.], prec, [0.])) 208 | 209 | # compute the precision envelope 210 | for i in range(mpre.size - 1, 0, -1): 211 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 212 | 213 | # to calculate area under PR curve, look for points 214 | # where X axis (recall) changes value 215 | i = np.where(mrec[1:] != mrec[:-1])[0] 216 | 217 | # and sum (\Delta recall) * prec 218 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 219 | return ap 220 | 221 | 222 | def voc_eval(self, detpath, classname, cachedir, ovthresh=0.5, use_07_metric=True): 223 | if not os.path.isdir(cachedir): 224 | os.mkdir(cachedir) 225 | cachefile = os.path.join(cachedir, 'annots.pkl') 226 | # read list of images 227 | with open(self.imgsetpath, 'r') as f: 228 | lines = f.readlines() 229 | imagenames = [x.strip() for x in lines] 230 | if not os.path.isfile(cachefile): 231 | # load annots 232 | recs = {} 233 | for i, imagename in enumerate(imagenames): 234 | recs[imagename] = self.parse_rec(self.annopath % (imagename)) 235 | if i % 100 == 0 and self.display: 236 | print('Reading annotation for {:d}/{:d}'.format( 237 | i + 1, len(imagenames))) 238 | # save 239 | if self.display: 240 | print('Saving cached annotations to {:s}'.format(cachefile)) 241 | with open(cachefile, 'wb') as f: 242 | pickle.dump(recs, f) 243 | else: 244 | # load 245 | with open(cachefile, 'rb') as f: 246 | recs = pickle.load(f) 247 | 248 | # extract gt objects for this class 249 | class_recs = {} 250 | npos = 0 251 | for imagename in imagenames: 252 | R = [obj for obj in recs[imagename] if obj['name'] == classname] 253 | bbox = np.array([x['bbox'] for x in R]) 254 | difficult = np.array([x['difficult'] for x in R]).astype(np.bool) 255 | det = [False] * len(R) 256 | npos = npos + sum(~difficult) 257 | class_recs[imagename] = {'bbox': bbox, 258 | 'difficult': difficult, 259 | 'det': det} 260 | 261 | # read dets 262 | detfile = detpath.format(classname) 263 | with open(detfile, 'r') as f: 264 | lines = f.readlines() 265 | if any(lines) == 1: 266 | 267 | splitlines = [x.strip().split(' ') for x in lines] 268 | image_ids = [x[0] for x in splitlines] 269 | confidence = np.array([float(x[1]) for x in splitlines]) 270 | BB = np.array([[float(z) for z in x[2:]] for x in splitlines]) 271 | 272 | # sort by confidence 273 | sorted_ind = np.argsort(-confidence) 274 | sorted_scores = np.sort(-confidence) 275 | BB = BB[sorted_ind, :] 276 | image_ids = [image_ids[x] for x in sorted_ind] 277 | 278 | # go down dets and mark TPs and FPs 279 | nd = len(image_ids) 280 | tp = np.zeros(nd) 281 | fp = np.zeros(nd) 282 | for d in range(nd): 283 | R = class_recs[image_ids[d]] 284 | bb = BB[d, :].astype(float) 285 | ovmax = -np.inf 286 | BBGT = R['bbox'].astype(float) 287 | if BBGT.size > 0: 288 | # compute overlaps 289 | # intersection 290 | ixmin = np.maximum(BBGT[:, 0], bb[0]) 291 | iymin = np.maximum(BBGT[:, 1], bb[1]) 292 | ixmax = np.minimum(BBGT[:, 2], bb[2]) 293 | iymax = np.minimum(BBGT[:, 3], bb[3]) 294 | iw = np.maximum(ixmax - ixmin, 0.) 295 | ih = np.maximum(iymax - iymin, 0.) 
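                    # Clamping the intersection width/height at zero means
                    # non-overlapping ground-truth boxes contribute no area;
                    # the union below is area(det) + area(gt) - intersection,
                    # so overlaps holds one IoU per ground-truth box.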
296 | inters = iw * ih 297 | uni = ((bb[2] - bb[0]) * (bb[3] - bb[1]) + 298 | (BBGT[:, 2] - BBGT[:, 0]) * 299 | (BBGT[:, 3] - BBGT[:, 1]) - inters) 300 | overlaps = inters / uni 301 | ovmax = np.max(overlaps) 302 | jmax = np.argmax(overlaps) 303 | 304 | if ovmax > ovthresh: 305 | if not R['difficult'][jmax]: 306 | if not R['det'][jmax]: 307 | tp[d] = 1. 308 | R['det'][jmax] = 1 309 | else: 310 | fp[d] = 1. 311 | else: 312 | fp[d] = 1. 313 | 314 | # compute precision recall 315 | fp = np.cumsum(fp) 316 | tp = np.cumsum(tp) 317 | rec = tp / float(npos) 318 | # avoid divide by zero in case the first detection matches a difficult 319 | # ground truth 320 | prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) 321 | ap = self.voc_ap(rec, prec, use_07_metric) 322 | else: 323 | rec = -1. 324 | prec = -1. 325 | ap = -1. 326 | 327 | return rec, prec, ap 328 | 329 | 330 | def evaluate_detections(self, box_list): 331 | self.write_voc_results_file(box_list) 332 | self.do_python_eval() 333 | 334 | 335 | if __name__ == '__main__': 336 | pass -------------------------------------------------------------------------------- /weights/README.md: -------------------------------------------------------------------------------- 1 | # yolo-v2-v3 and tiny model 2 | Hi, guys ! 3 | 4 | For researchers in China, you can download them from BaiduYunDisk. 5 | There are 5 models including yolo-v2, yolo-v3, yolo_v3_spp, slim-yolo-v2 and tiny-yolo-v3. 6 | 7 | The link is as following: 8 | 9 | link: https://pan.baidu.com/s/1rnmM8HGFzE2NTv6AkljJdg 10 | 11 | password: 5c8h 12 | 13 | 14 | 15 | I will upload all models to googledrive. --------------------------------------------------------------------------------
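As a closing usage sketch: the helpers shown above are typically wired together as below. The import paths and class names come from the files in this dump (utils/modules.py, utils/vocapi_evaluator.py), but the model constructor, transform, dataloader, optimizer and data paths are placeholders, so treat this as an assumption-laden outline rather than the project's actual train.py.

import torch
from utils.modules import ModelEMA, reorg_layer, SPP
from utils.vocapi_evaluator import VOCAPIEvaluator

# reorg_layer trades spatial resolution for channels: (B, C, H, W) ->
# (B, C*s*s, H/s, W/s); SPP concatenates three max-pool scales with the
# input, multiplying the channel count by four.
x = torch.randn(1, 64, 26, 26)
print(reorg_layer(stride=2)(x).shape)   # torch.Size([1, 256, 13, 13])
print(SPP()(x).shape)                   # torch.Size([1, 256, 26, 26])

# Hypothetical training loop showing where ModelEMA and the VOC evaluator
# plug in; build_model, transform, dataloader, optimizer and max_epoch are
# placeholders, not names defined in this repository.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = build_model().to(device)        # placeholder constructor
ema = ModelEMA(model, decay=0.9999)

evaluator = VOCAPIEvaluator(data_root='data/VOCdevkit/',   # illustrative path
                            img_size=416,
                            device=device,
                            transform=transform,           # e.g. resize + normalize
                            set_type='test')

for epoch in range(max_epoch):
    for images, targets in dataloader:
        loss = model(images.to(device), targets)   # placeholder forward signature
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        ema.update(model)     # keep the shadow weights in sync after every step

    # evaluate the smoothed EMA weights rather than the raw ones
    evaluator.evaluate(ema.ema)
    print('epoch %d, mAP: %.4f' % (epoch, evaluator.map))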