├── .gitignore ├── LICENSE ├── README.md ├── backbone ├── __init__.py ├── darknet19.py ├── darknet53.py ├── darknet_tiny.py ├── resnet.py └── weights │ └── README.md ├── data ├── __init__.py ├── coco2017.py ├── config.py ├── scripts │ ├── COCO2017.sh │ ├── VOC2007.sh │ └── VOC2012.sh └── voc0712.py ├── demo.py ├── eval.py ├── img_file └── darknet_tiny.png ├── models ├── __pycache__ │ ├── yolo_anchor.cpython-36.pyc │ ├── yolo_anchor_ms.cpython-36.pyc │ ├── yolo_fusion.cpython-36.pyc │ ├── yolo_kitti.cpython-36.pyc │ ├── yolo_light.cpython-36.pyc │ ├── yolo_mobile.cpython-36.pyc │ ├── yolo_msf.cpython-36.pyc │ ├── yolo_v1.cpython-36.pyc │ ├── yolo_v1_ms.cpython-36.pyc │ ├── yolo_v2.cpython-36.pyc │ └── yolo_v2.cpython-37.pyc ├── yolov2_d19.py ├── yolov2_r50.py ├── yolov3.py ├── yolov3_spp.py └── yolov3_tiny.py ├── test.py ├── tools.py ├── train.py ├── utils ├── __init__.py ├── augmentations.py ├── cocoapi_evaluator.py ├── com_paras_flops.py ├── distributed_utils.py ├── kmeans_anchor.py ├── modules.py └── vocapi_evaluator.py └── weights └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | *.pt 2 | *.pth 3 | *.txt 4 | *.pkl 5 | __pycache__ 6 | .vscode 7 | det_results -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # Update
2 | Recently, I have released a new YOLO project:
3 | 
4 | https://github.com/yjh0410/PyTorch_YOLO_Tutorial
5 | 
6 | In my new YOLO project, you can enjoy:
7 | - a new and stronger YOLOv1
8 | - a new and stronger YOLOv2
9 | - YOLOv3
10 | - YOLOv4
11 | - YOLOv5
12 | - YOLOv7
13 | - YOLOX
14 | - RTCDet
15 | 
16 | 
17 | # This project
18 | In this project, you can enjoy:
19 | - YOLOv2 with DarkNet-19
20 | - YOLOv2 with ResNet-50
21 | - YOLOv2Slim
22 | - YOLOv3
23 | - YOLOv3-Spp
24 | - YOLOv3-Tiny
25 | 
26 | 
27 | I just want to provide a good YOLO project for everyone who is interested in object detection.
28 | 
29 | # Weights
30 | Google Drive: https://drive.google.com/drive/folders/1T5hHyGICbFSdu6u2_vqvxn_puotvPsbd?usp=sharing
31 | 
32 | BaiDuYunDisk: https://pan.baidu.com/s/1tSylvzOVFReUAvaAxKRSwg
33 | Password: d266
34 | 
35 | You can download all my models from the above links.
36 | 
37 | # YOLOv2
38 | 
39 | ## YOLOv2 with DarkNet-19
40 | ### Tricks
41 | Tricks from the official paper:
42 | - [x] batch norm
43 | - [x] hi-res classifier
44 | - [x] convolutional
45 | - [x] anchor boxes
46 | - [x] new network
47 | - [x] dimension priors
48 | - [x] location prediction
49 | - [x] passthrough
50 | - [x] multi-scale
51 | - [x] hi-res detector
52 | 
53 | ## VOC2007
54 | 
| data | size | Original (darknet) | Ours (pytorch) 160epochs | Ours (pytorch) 250epochs |
|:----:|:----:|:------------------:|:------------------------:|:------------------------:|
| VOC07 test | 416 | 76.8 | 76.0 | 77.1 |
| VOC07 test | 544 | 78.6 | 77.0 | 78.1 |
60 | 61 | ## COCO 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 |
| model | data | AP | AP50 | AP75 | AP_S | AP_M | AP_L |
|:-----:|:----:|:--:|:----:|:----:|:----:|:----:|:----:|
| Original (darknet) | COCO test-dev | 21.6 | 44.0 | 19.2 | 5.0 | 22.4 | 35.5 |
| Ours (pytorch) | COCO test-dev | 26.8 | 46.6 | 26.8 | 5.8 | 27.4 | 45.2 |
| Ours (pytorch) | COCO eval | 26.6 | 46.0 | 26.7 | 5.9 | 27.8 | 47.1 |
72 | 
73 | 
74 | ## YOLOv2 with ResNet-50
75 | 
76 | I replaced DarkNet-19 with ResNet-50 and obtained better results on COCO val.
77 | 
| model | data | AP | AP50 | AP75 | AP_S | AP_M | AP_L |
|:-----:|:----:|:--:|:----:|:----:|:----:|:----:|:----:|
| Our YOLOv2-320 | COCO eval | 25.8 | 44.6 | 25.9 | 4.6 | 26.8 | 47.9 |
| Our YOLOv2-416 | COCO eval | 29.0 | 48.8 | 29.7 | 7.4 | 31.9 | 48.3 |
| Our YOLOv2-512 | COCO eval | 30.4 | 51.6 | 30.9 | 10.1 | 34.9 | 46.6 |
| Our YOLOv2-544 | COCO eval | 30.4 | 51.9 | 30.9 | 11.1 | 35.8 | 45.5 |
| Our YOLOv2-608 | COCO eval | 29.2 | 51.6 | 29.1 | 13.6 | 36.8 | 40.5 |
91 | 92 | # YOLOv3 93 | 94 | ## VOC2007 95 | 96 | 97 | 98 | 99 |
| data | size | Original (darknet) | Ours (pytorch) 250epochs |
|:----:|:----:|:------------------:|:------------------------:|
| VOC07 test | 416 | 80.25 | 81.4 |
100 | 
101 | ## COCO
102 | 
103 | Official YOLOv3:
104 | 
| model | data | AP | AP50 | AP75 | AP_S | AP_M | AP_L |
|:-----:|:----:|:--:|:----:|:----:|:----:|:----:|:----:|
| YOLOv3-320 | COCO test-dev | 28.2 | 51.5 | - | - | - | - |
| YOLOv3-416 | COCO test-dev | 31.0 | 55.3 | - | - | - | - |
| YOLOv3-608 | COCO test-dev | 33.0 | 57.0 | 34.4 | 18.3 | 35.4 | 41.9 |
114 | 115 | Our YOLOv3: 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 |
| model | data | AP | AP50 | AP75 | AP_S | AP_M | AP_L |
|:-----:|:----:|:--:|:----:|:----:|:----:|:----:|:----:|
| YOLOv3-320 | COCO test-dev | 33.1 | 54.1 | 34.5 | 12.1 | 34.5 | 49.6 |
| YOLOv3-416 | COCO test-dev | 36.0 | 57.4 | 37.0 | 16.3 | 37.5 | 51.1 |
| YOLOv3-608 | COCO test-dev | 37.6 | 59.4 | 39.9 | 20.4 | 39.9 | 48.2 |
126 | 
127 | # YOLOv3SPP
128 | ## COCO
129 | 
| model | data | AP | AP50 | AP75 | AP_S | AP_M | AP_L |
|:-----:|:----:|:--:|:----:|:----:|:----:|:----:|:----:|
| YOLOv3Spp-320 | COCO eval | 32.78 | 53.79 | 33.9 | 12.4 | 35.5 | 50.6 |
| YOLOv3Spp-416 | COCO eval | 35.66 | 57.09 | 37.4 | 16.8 | 38.1 | 50.7 |
| YOLOv3Spp-608 | COCO eval | 37.52 | 59.44 | 39.3 | 21.5 | 40.6 | 49.6 |
141 | 142 | # YOLOv3Tiny 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 |
| model | data | AP | AP50 | AP75 | AP_S | AP_M | AP_L |
|:-----:|:----:|:--:|:----:|:----:|:----:|:----:|:----:|
| (official) YOLOv3Tiny | COCO test-dev | - | 33.1 | - | - | - | - |
| (Our) YOLOv3Tiny | COCO val | 15.9 | 33.8 | 12.8 | 7.6 | 17.7 | 22.4 |
151 | 
152 | 
153 | # Installation
154 | - PyTorch (GPU) 1.1.0/1.2.0/1.3.0
155 | - Tensorboard 1.14
156 | - opencv-python, Python 3.6/3.7
157 | 
158 | # Dataset
159 | 
160 | ## VOC Dataset
161 | I copied the download scripts from the following excellent project:
162 | https://github.com/amdegroot/ssd.pytorch
163 | 
164 | I have uploaded VOC2007 and VOC2012 to BaiDuYunDisk, so researchers in China can download them from BaiDuYunDisk:
165 | 
166 | Link: https://pan.baidu.com/s/1tYPGCYGyC0wjpC97H-zzMQ
167 | 
168 | Password: 4la9
169 | 
170 | You will get a ```VOCdevkit.zip```; just unzip it and put it into ```data/```. After that, the paths to the VOC dataset are ```data/VOCdevkit/VOC2007``` and ```data/VOCdevkit/VOC2012```.
171 | 
172 | ### Download VOC2007 trainval & test
173 | 
174 | ```Shell
175 | # specify a directory for the dataset to be downloaded into, else the default is ~/data/
176 | sh data/scripts/VOC2007.sh #
177 | ```
178 | 
179 | ### Download VOC2012 trainval
180 | ```Shell
181 | # specify a directory for the dataset to be downloaded into, else the default is ~/data/
182 | sh data/scripts/VOC2012.sh #
183 | ```
184 | 
185 | ## MSCOCO Dataset
186 | I copied the download scripts from the following excellent project:
187 | https://github.com/DeNA/PyTorch_YOLOv3
188 | 
189 | ### Download MSCOCO 2017 dataset
190 | Just run ```sh data/scripts/COCO2017.sh```. You will get COCO train2017, val2017 and test2017.
191 | 
192 | 
193 | # Train
194 | ## VOC
195 | ```Shell
196 | python train.py -d voc --cuda -v [select a model] -hr -ms --ema
197 | ```
198 | 
199 | You can run ```python train.py -h``` to check all optional arguments.
200 | 
201 | ## COCO
202 | If you have only one GPU:
203 | ```Shell
204 | python train.py -d coco --cuda -v [select a model] -hr -ms --ema
205 | ```
206 | 
207 | If you have multiple GPUs (e.g., 8) and put 4 images on each GPU:
208 | ```Shell
209 | python -m torch.distributed.launch --nproc_per_node=8 train.py -d coco --cuda -v [select a model] -hr -ms --ema \
210 |         -dist \
211 |         --sybn \
212 |         --num_gpu 8 \
213 |         --batch_size 4
214 | ```
215 | 
216 | # Test
217 | ## VOC
218 | ```Shell
219 | python test.py -d voc --cuda -v [select a model] --trained_model [ Please input the path to model dir. ]
220 | ```
221 | 
222 | ## COCO
223 | ```Shell
224 | python test.py -d coco-val --cuda -v [select a model] --trained_model [ Please input the path to model dir. ]
225 | ```
226 | 
227 | 
228 | # Evaluation
229 | ## VOC
230 | ```Shell
231 | python eval.py -d voc --cuda -v [select a model] --train_model [ Please input the path to model dir. ]
232 | ```
233 | 
234 | ## COCO
235 | To run on COCO val:
236 | ```Shell
237 | python eval.py -d coco-val --cuda -v [select a model] --train_model [ Please input the path to model dir. ]
238 | ```
239 | 
240 | To run on COCO test-dev (make sure you have downloaded test2017):
241 | ```Shell
242 | python eval.py -d coco-test --cuda -v [select a model] --train_model [ Please input the path to model dir. ]
243 | ```
244 | You will get a .json file which can be submitted to the COCO test server for evaluation.
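
For `coco-val`, the detection json can also be scored locally with the COCO API instead of the test server. Below is a minimal sketch, assuming pycocotools is installed; both file paths are placeholders, so point them at your own annotation file and at the json produced by ```eval.py```:

```python
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval

# placeholder paths: val2017 ground-truth annotations and the detection json written by eval.py
coco_gt = COCO('data/COCO/annotations/instances_val2017.json')
coco_dt = coco_gt.loadRes('yolo_coco_detections.json')

# standard bbox evaluation: prints AP, AP50, AP75, AP_S, AP_M, AP_L
coco_eval = COCOeval(coco_gt, coco_dt, iouType='bbox')
coco_eval.evaluate()
coco_eval.accumulate()
coco_eval.summarize()
```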
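
As a quick sanity check after preparing the datasets (see the Dataset section above), the loaders in ```data/``` can be used on their own. A minimal sketch, assuming it is run from the repository root and that VOC2007 has been unpacked to ```data/VOCdevkit/``` as described:

```python
from data import VOCDetection, BaseTransform, VOC_CLASSES

# VOC2007 trainval split with a simple resize + normalize transform
dataset = VOCDetection(data_dir='data/VOCdevkit',
                       image_sets=[('2007', 'trainval')],
                       transform=BaseTransform(416))

# img: [3, 416, 416] float tensor; target: [N, 5] array of [xmin, ymin, xmax, ymax, cls_id] in normalized coords
img, target = dataset[0]
print(img.shape, len(target))
print('first object:', VOC_CLASSES[int(target[0][4])])
```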
245 | 
-------------------------------------------------------------------------------- /backbone/__init__.py: --------------------------------------------------------------------------------
1 | from .resnet import build_resnet
2 | from .darknet19 import build_darknet19
3 | from .darknet53 import build_darknet53
4 | from .darknet_tiny import build_darknet_tiny
5 | 
6 | 
7 | def build_backbone(model_name='resnet18', pretrained=False):
8 |     if 'resnet' in model_name:
9 |         backbone = build_resnet(model_name, pretrained)
10 | 
11 |     elif model_name == 'darknet19':
12 |         backbone = build_darknet19(pretrained)
13 | 
14 |     elif model_name == 'darknet53':
15 |         backbone = build_darknet53(pretrained)
16 | 
17 |     elif model_name == 'darknet_tiny':
18 |         backbone = build_darknet_tiny(pretrained)
19 | 
20 |     return backbone
21 | 
-------------------------------------------------------------------------------- /backbone/darknet19.py: --------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import os
4 | 
5 | 
6 | model_urls = {
7 |     "darknet19": "https://github.com/yjh0410/image_classification_pytorch/releases/download/weight/darknet19.pth",
8 | }
9 | 
10 | 
11 | __all__ = ['darknet19']
12 | 
13 | 
14 | class Conv_BN_LeakyReLU(nn.Module):
15 |     def __init__(self, in_channels, out_channels, ksize, padding=0, stride=1, dilation=1):
16 |         super(Conv_BN_LeakyReLU, self).__init__()
17 |         self.convs = nn.Sequential(
18 |             nn.Conv2d(in_channels, out_channels, ksize, padding=padding, stride=stride, dilation=dilation),
19 |             nn.BatchNorm2d(out_channels),
20 |             nn.LeakyReLU(0.1, inplace=True)
21 |         )
22 | 
23 |     def forward(self, x):
24 |         return self.convs(x)
25 | 
26 | 
27 | class DarkNet_19(nn.Module):
28 |     def __init__(self):
29 |         super(DarkNet_19, self).__init__()
30 |         # backbone network : DarkNet-19
31 |         # output : stride = 2, c = 32
32 |         self.conv_1 = nn.Sequential(
33 |             Conv_BN_LeakyReLU(3, 32, 3, 1),
34 |             nn.MaxPool2d((2,2), 2),
35 |         )
36 | 
37 |         # output : stride = 4, c = 64
38 |         self.conv_2 = nn.Sequential(
39 |             Conv_BN_LeakyReLU(32, 64, 3, 1),
40 |             nn.MaxPool2d((2,2), 2)
41 |         )
42 | 
43 |         # output : stride = 8, c = 128
44 |         self.conv_3 = nn.Sequential(
45 |             Conv_BN_LeakyReLU(64, 128, 3, 1),
46 |             Conv_BN_LeakyReLU(128, 64, 1),
47 |             Conv_BN_LeakyReLU(64, 128, 3, 1),
48 |             nn.MaxPool2d((2,2), 2)
49 |         )
50 | 
51 |         # output : stride = 8, c = 256
52 |         self.conv_4 = nn.Sequential(
53 |             Conv_BN_LeakyReLU(128, 256, 3, 1),
54 |             Conv_BN_LeakyReLU(256, 128, 1),
55 |             Conv_BN_LeakyReLU(128, 256, 3, 1),
56 |         )
57 | 
58 |         # output : stride = 16, c = 512
59 |         self.maxpool_4 = nn.MaxPool2d((2, 2), 2)
60 |         self.conv_5 = nn.Sequential(
61 |             Conv_BN_LeakyReLU(256, 512, 3, 1),
62 |             Conv_BN_LeakyReLU(512, 256, 1),
63 |             Conv_BN_LeakyReLU(256, 512, 3, 1),
64 |             Conv_BN_LeakyReLU(512, 256, 1),
65 |             Conv_BN_LeakyReLU(256, 512, 3, 1),
66 |         )
67 | 
68 |         # output : stride = 32, c = 1024
69 |         self.maxpool_5 = nn.MaxPool2d((2, 2), 2)
70 |         self.conv_6 = nn.Sequential(
71 |             Conv_BN_LeakyReLU(512, 1024, 3, 1),
72 |             Conv_BN_LeakyReLU(1024, 512, 1),
73 |             Conv_BN_LeakyReLU(512, 1024, 3, 1),
74 |             Conv_BN_LeakyReLU(1024, 512, 1),
75 |             Conv_BN_LeakyReLU(512, 1024, 3, 1)
76 |         )
77 | 
78 |     def forward(self, x):
79 |         c1 = self.conv_1(x)
80 |         c2 = self.conv_2(c1)
81 |         c3 = self.conv_3(c2)
82 |         c3 = self.conv_4(c3)
83 |         c4 = self.conv_5(self.maxpool_4(c3))
84 |         c5 = self.conv_6(self.maxpool_5(c4))
85 | 
86 |         output = {
87 |             'layer1': c3,
88 |             'layer2': c4,
89 |             'layer3': c5
90 |         }
91 | 
92 |         return output
93 | 
94 | 
95 | 
def build_darknet19(pretrained=False): 96 | # model 97 | model = DarkNet_19() 98 | 99 | # load weight 100 | if pretrained: 101 | print('Loading pretrained weight ...') 102 | url = model_urls['darknet19'] 103 | # checkpoint state dict 104 | checkpoint_state_dict = torch.hub.load_state_dict_from_url( 105 | url=url, map_location="cpu", check_hash=True) 106 | # model state dict 107 | model_state_dict = model.state_dict() 108 | # check 109 | for k in list(checkpoint_state_dict.keys()): 110 | if k in model_state_dict: 111 | shape_model = tuple(model_state_dict[k].shape) 112 | shape_checkpoint = tuple(checkpoint_state_dict[k].shape) 113 | if shape_model != shape_checkpoint: 114 | checkpoint_state_dict.pop(k) 115 | else: 116 | checkpoint_state_dict.pop(k) 117 | print(k) 118 | 119 | model.load_state_dict(checkpoint_state_dict) 120 | 121 | return model 122 | 123 | 124 | if __name__ == '__main__': 125 | import time 126 | net = build_darknet19(pretrained=True) 127 | x = torch.randn(1, 3, 224, 224) 128 | t0 = time.time() 129 | output = net(x) 130 | t1 = time.time() 131 | print('Time: ', t1 - t0) 132 | 133 | for k in output.keys(): 134 | print('{} : {}'.format(k, output[k].shape)) 135 | -------------------------------------------------------------------------------- /backbone/darknet53.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | model_urls = { 6 | "darknet53": "https://github.com/yjh0410/image_classification_pytorch/releases/download/weight/darknet53.pth", 7 | } 8 | 9 | 10 | __all__ = ['darknet53'] 11 | 12 | 13 | class Conv_BN_LeakyReLU(nn.Module): 14 | def __init__(self, in_channels, out_channels, ksize, padding=0, stride=1, dilation=1): 15 | super(Conv_BN_LeakyReLU, self).__init__() 16 | self.convs = nn.Sequential( 17 | nn.Conv2d(in_channels, out_channels, ksize, padding=padding, stride=stride, dilation=dilation), 18 | nn.BatchNorm2d(out_channels), 19 | nn.LeakyReLU(0.1, inplace=True) 20 | ) 21 | 22 | def forward(self, x): 23 | return self.convs(x) 24 | 25 | 26 | class ResBlock(nn.Module): 27 | def __init__(self, ch, nblocks=1): 28 | super().__init__() 29 | self.module_list = nn.ModuleList() 30 | for _ in range(nblocks): 31 | resblock_one = nn.Sequential( 32 | Conv_BN_LeakyReLU(ch, ch//2, 1), 33 | Conv_BN_LeakyReLU(ch//2, ch, 3, padding=1) 34 | ) 35 | self.module_list.append(resblock_one) 36 | 37 | def forward(self, x): 38 | for module in self.module_list: 39 | x = module(x) + x 40 | return x 41 | 42 | 43 | class DarkNet_53(nn.Module): 44 | """ 45 | DarkNet-53. 
46 | """ 47 | def __init__(self): 48 | super(DarkNet_53, self).__init__() 49 | # stride = 2 50 | self.layer_1 = nn.Sequential( 51 | Conv_BN_LeakyReLU(3, 32, 3, padding=1), 52 | Conv_BN_LeakyReLU(32, 64, 3, padding=1, stride=2), 53 | ResBlock(64, nblocks=1) 54 | ) 55 | # stride = 4 56 | self.layer_2 = nn.Sequential( 57 | Conv_BN_LeakyReLU(64, 128, 3, padding=1, stride=2), 58 | ResBlock(128, nblocks=2) 59 | ) 60 | # stride = 8 61 | self.layer_3 = nn.Sequential( 62 | Conv_BN_LeakyReLU(128, 256, 3, padding=1, stride=2), 63 | ResBlock(256, nblocks=8) 64 | ) 65 | # stride = 16 66 | self.layer_4 = nn.Sequential( 67 | Conv_BN_LeakyReLU(256, 512, 3, padding=1, stride=2), 68 | ResBlock(512, nblocks=8) 69 | ) 70 | # stride = 32 71 | self.layer_5 = nn.Sequential( 72 | Conv_BN_LeakyReLU(512, 1024, 3, padding=1, stride=2), 73 | ResBlock(1024, nblocks=4) 74 | ) 75 | 76 | 77 | def forward(self, x, targets=None): 78 | c1 = self.layer_1(x) 79 | c2 = self.layer_2(c1) 80 | c3 = self.layer_3(c2) 81 | c4 = self.layer_4(c3) 82 | c5 = self.layer_5(c4) 83 | 84 | output = { 85 | 'layer1': c3, 86 | 'layer2': c4, 87 | 'layer3': c5 88 | } 89 | 90 | return output 91 | 92 | 93 | def build_darknet53(pretrained=False): 94 | # model 95 | model = DarkNet_53() 96 | 97 | # load weight 98 | if pretrained: 99 | print('Loading pretrained weight ...') 100 | url = model_urls['darknet53'] 101 | # checkpoint state dict 102 | checkpoint_state_dict = torch.hub.load_state_dict_from_url( 103 | url=url, map_location="cpu", check_hash=True) 104 | # model state dict 105 | model_state_dict = model.state_dict() 106 | # check 107 | for k in list(checkpoint_state_dict.keys()): 108 | if k in model_state_dict: 109 | shape_model = tuple(model_state_dict[k].shape) 110 | shape_checkpoint = tuple(checkpoint_state_dict[k].shape) 111 | if shape_model != shape_checkpoint: 112 | checkpoint_state_dict.pop(k) 113 | else: 114 | checkpoint_state_dict.pop(k) 115 | print(k) 116 | 117 | model.load_state_dict(checkpoint_state_dict) 118 | 119 | return model 120 | 121 | 122 | if __name__ == '__main__': 123 | import time 124 | net = build_darknet53(pretrained=True) 125 | x = torch.randn(1, 3, 224, 224) 126 | t0 = time.time() 127 | output = net(x) 128 | t1 = time.time() 129 | print('Time: ', t1 - t0) 130 | 131 | for k in output.keys(): 132 | print('{} : {}'.format(k, output[k].shape)) 133 | -------------------------------------------------------------------------------- /backbone/darknet_tiny.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | model_urls = { 6 | "darknet_tiny": "https://github.com/yjh0410/image_classification_pytorch/releases/download/weight/darknet_tiny.pth", 7 | } 8 | 9 | 10 | __all__ = ['darknet_tiny'] 11 | 12 | 13 | class Conv_BN_LeakyReLU(nn.Module): 14 | def __init__(self, in_channels, out_channels, ksize, padding=0, stride=1, dilation=1): 15 | super(Conv_BN_LeakyReLU, self).__init__() 16 | self.convs = nn.Sequential( 17 | nn.Conv2d(in_channels, out_channels, ksize, padding=padding, stride=stride, dilation=dilation), 18 | nn.BatchNorm2d(out_channels), 19 | nn.LeakyReLU(0.1, inplace=True) 20 | ) 21 | 22 | def forward(self, x): 23 | return self.convs(x) 24 | 25 | 26 | class DarkNet_Tiny(nn.Module): 27 | def __init__(self): 28 | 29 | super(DarkNet_Tiny, self).__init__() 30 | # backbone network : DarkNet_Tiny 31 | self.conv_1 = Conv_BN_LeakyReLU(3, 16, 3, 1) 32 | self.maxpool_1 = nn.MaxPool2d((2, 2), 2) # stride = 2 33 | 34 | self.conv_2 = 
Conv_BN_LeakyReLU(16, 32, 3, 1) 35 | self.maxpool_2 = nn.MaxPool2d((2, 2), 2) # stride = 4 36 | 37 | self.conv_3 = Conv_BN_LeakyReLU(32, 64, 3, 1) 38 | self.maxpool_3 = nn.MaxPool2d((2, 2), 2) # stride = 8 39 | 40 | self.conv_4 = Conv_BN_LeakyReLU(64, 128, 3, 1) 41 | self.maxpool_4 = nn.MaxPool2d((2, 2), 2) # stride = 16 42 | 43 | self.conv_5 = Conv_BN_LeakyReLU(128, 256, 3, 1) 44 | self.maxpool_5 = nn.MaxPool2d((2, 2), 2) # stride = 32 45 | 46 | self.conv_6 = Conv_BN_LeakyReLU(256, 512, 3, 1) 47 | self.maxpool_6 = nn.Sequential( 48 | nn.ZeroPad2d((0, 1, 0, 1)), 49 | nn.MaxPool2d((2, 2), 1) # stride = 32 50 | ) 51 | 52 | self.conv_7 = Conv_BN_LeakyReLU(512, 1024, 3, 1) 53 | 54 | 55 | def forward(self, x): 56 | x = self.conv_1(x) 57 | c1 = self.maxpool_1(x) 58 | c1 = self.conv_2(c1) 59 | c2 = self.maxpool_2(c1) 60 | c2 = self.conv_3(c2) 61 | c3 = self.maxpool_3(c2) 62 | c3 = self.conv_4(c3) 63 | c4 = self.maxpool_4(c3) 64 | c4 = self.conv_5(c4) # stride = 16 65 | c5 = self.maxpool_5(c4) 66 | c5 = self.conv_6(c5) 67 | c5 = self.maxpool_6(c5) 68 | c5 = self.conv_7(c5) # stride = 32 69 | 70 | output = { 71 | 'layer1': c3, 72 | 'layer2': c4, 73 | 'layer3': c5 74 | } 75 | 76 | return output 77 | 78 | 79 | def build_darknet_tiny(pretrained=False): 80 | # model 81 | model = DarkNet_Tiny() 82 | 83 | # load weight 84 | if pretrained: 85 | print('Loading pretrained weight ...') 86 | url = model_urls['darknet_tiny'] 87 | # checkpoint state dict 88 | checkpoint_state_dict = torch.hub.load_state_dict_from_url( 89 | url=url, map_location="cpu", check_hash=True) 90 | # model state dict 91 | model_state_dict = model.state_dict() 92 | # check 93 | for k in list(checkpoint_state_dict.keys()): 94 | if k in model_state_dict: 95 | shape_model = tuple(model_state_dict[k].shape) 96 | shape_checkpoint = tuple(checkpoint_state_dict[k].shape) 97 | if shape_model != shape_checkpoint: 98 | checkpoint_state_dict.pop(k) 99 | else: 100 | checkpoint_state_dict.pop(k) 101 | print(k) 102 | 103 | model.load_state_dict(checkpoint_state_dict) 104 | 105 | return model 106 | 107 | 108 | if __name__ == '__main__': 109 | import time 110 | net = build_darknet_tiny(pretrained=True) 111 | x = torch.randn(1, 3, 224, 224) 112 | t0 = time.time() 113 | output = net(x) 114 | t1 = time.time() 115 | print('Time: ', t1 - t0) 116 | 117 | for k in output.keys(): 118 | print('{} : {}'.format(k, output[k].shape)) 119 | -------------------------------------------------------------------------------- /backbone/resnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.utils.model_zoo as model_zoo 4 | 5 | 6 | __all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 7 | 'resnet152'] 8 | 9 | 10 | model_urls = { 11 | 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth', 12 | 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth', 13 | 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth', 14 | 'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth', 15 | 'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth', 16 | } 17 | 18 | 19 | def conv3x3(in_planes, out_planes, stride=1): 20 | """3x3 convolution with padding""" 21 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, 22 | padding=1, bias=False) 23 | 24 | def conv1x1(in_planes, out_planes, stride=1): 25 | """1x1 convolution""" 26 | return nn.Conv2d(in_planes, out_planes, kernel_size=1, 
stride=stride, bias=False) 27 | 28 | class BasicBlock(nn.Module): 29 | expansion = 1 30 | 31 | def __init__(self, inplanes, planes, stride=1, downsample=None): 32 | super(BasicBlock, self).__init__() 33 | self.conv1 = conv3x3(inplanes, planes, stride) 34 | self.bn1 = nn.BatchNorm2d(planes) 35 | self.relu = nn.ReLU(inplace=True) 36 | self.conv2 = conv3x3(planes, planes) 37 | self.bn2 = nn.BatchNorm2d(planes) 38 | self.downsample = downsample 39 | self.stride = stride 40 | 41 | def forward(self, x): 42 | identity = x 43 | 44 | out = self.conv1(x) 45 | out = self.bn1(out) 46 | out = self.relu(out) 47 | 48 | out = self.conv2(out) 49 | out = self.bn2(out) 50 | 51 | if self.downsample is not None: 52 | identity = self.downsample(x) 53 | 54 | out += identity 55 | out = self.relu(out) 56 | 57 | return out 58 | 59 | class Bottleneck(nn.Module): 60 | expansion = 4 61 | 62 | def __init__(self, inplanes, planes, stride=1, downsample=None): 63 | super(Bottleneck, self).__init__() 64 | self.conv1 = conv1x1(inplanes, planes) 65 | self.bn1 = nn.BatchNorm2d(planes) 66 | self.conv2 = conv3x3(planes, planes, stride) 67 | self.bn2 = nn.BatchNorm2d(planes) 68 | self.conv3 = conv1x1(planes, planes * self.expansion) 69 | self.bn3 = nn.BatchNorm2d(planes * self.expansion) 70 | self.relu = nn.ReLU(inplace=True) 71 | self.downsample = downsample 72 | self.stride = stride 73 | 74 | def forward(self, x): 75 | identity = x 76 | 77 | out = self.conv1(x) 78 | out = self.bn1(out) 79 | out = self.relu(out) 80 | 81 | out = self.conv2(out) 82 | out = self.bn2(out) 83 | out = self.relu(out) 84 | 85 | out = self.conv3(out) 86 | out = self.bn3(out) 87 | 88 | if self.downsample is not None: 89 | identity = self.downsample(x) 90 | 91 | out += identity 92 | out = self.relu(out) 93 | 94 | return out 95 | 96 | class ResNet(nn.Module): 97 | 98 | def __init__(self, block, layers, zero_init_residual=False): 99 | super(ResNet, self).__init__() 100 | self.inplanes = 64 101 | self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, 102 | bias=False) 103 | self.bn1 = nn.BatchNorm2d(64) 104 | self.relu = nn.ReLU(inplace=True) 105 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 106 | self.layer1 = self._make_layer(block, 64, layers[0]) 107 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2) 108 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2) 109 | self.layer4 = self._make_layer(block, 512, layers[3], stride=2) 110 | 111 | for m in self.modules(): 112 | if isinstance(m, nn.Conv2d): 113 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 114 | elif isinstance(m, nn.BatchNorm2d): 115 | nn.init.constant_(m.weight, 1) 116 | nn.init.constant_(m.bias, 0) 117 | 118 | # Zero-initialize the last BN in each residual branch, 119 | # so that the residual branch starts with zeros, and each residual block behaves like an identity. 
120 | # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677 121 | if zero_init_residual: 122 | for m in self.modules(): 123 | if isinstance(m, Bottleneck): 124 | nn.init.constant_(m.bn3.weight, 0) 125 | elif isinstance(m, BasicBlock): 126 | nn.init.constant_(m.bn2.weight, 0) 127 | 128 | def _make_layer(self, block, planes, blocks, stride=1): 129 | downsample = None 130 | if stride != 1 or self.inplanes != planes * block.expansion: 131 | downsample = nn.Sequential( 132 | conv1x1(self.inplanes, planes * block.expansion, stride), 133 | nn.BatchNorm2d(planes * block.expansion), 134 | ) 135 | 136 | layers = [] 137 | layers.append(block(self.inplanes, planes, stride, downsample)) 138 | self.inplanes = planes * block.expansion 139 | for _ in range(1, blocks): 140 | layers.append(block(self.inplanes, planes)) 141 | 142 | return nn.Sequential(*layers) 143 | 144 | def forward(self, x): 145 | c1 = self.conv1(x) 146 | c1 = self.bn1(c1) 147 | c1 = self.relu(c1) 148 | c1 = self.maxpool(c1) 149 | 150 | c2 = self.layer1(c1) 151 | c3 = self.layer2(c2) 152 | c4 = self.layer3(c3) 153 | c5 = self.layer4(c4) 154 | 155 | output = { 156 | 'layer1': c3, 157 | 'layer2': c4, 158 | 'layer3': c5 159 | } 160 | 161 | return output 162 | 163 | 164 | def resnet18(pretrained=False, **kwargs): 165 | """Constructs a ResNet-18 model. 166 | 167 | Args: 168 | pretrained (bool): If True, returns a model pre-trained on ImageNet 169 | """ 170 | model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs) 171 | if pretrained: 172 | # strict = False as we don't need fc layer params. 173 | model.load_state_dict(model_zoo.load_url(model_urls['resnet18']), strict=False) 174 | return model 175 | 176 | def resnet34(pretrained=False, **kwargs): 177 | """Constructs a ResNet-34 model. 178 | 179 | Args: 180 | pretrained (bool): If True, returns a model pre-trained on ImageNet 181 | """ 182 | model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs) 183 | if pretrained: 184 | model.load_state_dict(model_zoo.load_url(model_urls['resnet34']), strict=False) 185 | return model 186 | 187 | def resnet50(pretrained=False, **kwargs): 188 | """Constructs a ResNet-50 model. 189 | 190 | Args: 191 | pretrained (bool): If True, returns a model pre-trained on ImageNet 192 | """ 193 | model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs) 194 | if pretrained: 195 | model.load_state_dict(model_zoo.load_url(model_urls['resnet50']), strict=False) 196 | return model 197 | 198 | def resnet101(pretrained=False, **kwargs): 199 | """Constructs a ResNet-101 model. 200 | 201 | Args: 202 | pretrained (bool): If True, returns a model pre-trained on ImageNet 203 | """ 204 | model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs) 205 | if pretrained: 206 | model.load_state_dict(model_zoo.load_url(model_urls['resnet101']), strict=False) 207 | return model 208 | 209 | def resnet152(pretrained=False, **kwargs): 210 | """Constructs a ResNet-152 model. 
211 | 212 | Args: 213 | pretrained (bool): If True, returns a model pre-trained on ImageNet 214 | """ 215 | model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs) 216 | if pretrained: 217 | model.load_state_dict(model_zoo.load_url(model_urls['resnet152'])) 218 | return model 219 | 220 | 221 | def build_resnet(model_name='resnet18', pretrained=False): 222 | 223 | if model_name == 'resnet18': 224 | model = resnet18(pretrained=pretrained) 225 | 226 | elif model_name == 'resnet34': 227 | model = resnet34(pretrained=pretrained) 228 | 229 | elif model_name == 'resnet50': 230 | model = resnet50(pretrained=pretrained) 231 | 232 | elif model_name == 'resnet101': 233 | model = resnet101(pretrained=pretrained) 234 | 235 | elif model_name == 'resnet152': 236 | model = resnet152(pretrained=pretrained) 237 | 238 | 239 | return model 240 | 241 | 242 | if __name__ == "__main__": 243 | import time 244 | 245 | model = build_resnet(model_name='resnet18', pretrained=True) 246 | x = torch.randn(1, 3, 224, 224) 247 | t0 = time.time() 248 | output = model(x) 249 | t1 = time.time() 250 | print('Time: ', t1 - t0) 251 | 252 | for k in output.keys(): 253 | print('{} : {}'.format(k, output[k].shape)) 254 | -------------------------------------------------------------------------------- /backbone/weights/README.md: -------------------------------------------------------------------------------- 1 | # darknet19, darknet53, darknet-tiny, darknet-light 2 | darknet-tiny is designed by myself. It is a very simple and lightweight backbone. 3 | 4 | darknet-light is same to the backbone used in official TinyYOLOv3. 5 | 6 | For researchers in China, you can download them from BaiduYunDisk: 7 | 8 | link:https://pan.baidu.com/s/1Rm87Fcj1RXZFmeTUrDWANA 9 | 10 | password:qgzn 11 | 12 | 13 | Also, you can download them from Google Drive: 14 | 15 | link: https://drive.google.com/drive/folders/15saMtvYiz3yfFNu5EnC7GSltEAvTImMB?usp=sharing 16 | -------------------------------------------------------------------------------- /data/__init__.py: -------------------------------------------------------------------------------- 1 | from .voc0712 import VOCDetection, VOCAnnotationTransform, VOC_CLASSES 2 | from .coco2017 import COCODataset, coco_class_labels, coco_class_index 3 | from .config import * 4 | import torch 5 | import cv2 6 | import numpy as np 7 | 8 | 9 | def detection_collate(batch): 10 | """Custom collate fn for dealing with batches of images that have a different 11 | number of associated object annotations (bounding boxes). 12 | 13 | Arguments: 14 | batch: (tuple) A tuple of tensor images and lists of annotations 15 | 16 | Return: 17 | A tuple containing: 18 | 1) (tensor) batch of images stacked on their 0 dim 19 | 2) (list of tensors) annotations for a given image are stacked on 20 | 0 dim 21 | """ 22 | targets = [] 23 | imgs = [] 24 | for sample in batch: 25 | imgs.append(sample[0]) 26 | targets.append(torch.FloatTensor(sample[1])) 27 | return torch.stack(imgs, 0), targets 28 | 29 | 30 | def base_transform(image, size, mean, std): 31 | x = cv2.resize(image, (size, size)).astype(np.float32) 32 | x /= 255. 
33 | x -= mean 34 | x /= std 35 | return x 36 | 37 | 38 | class BaseTransform: 39 | def __init__(self, size, mean=(0.406, 0.456, 0.485), std=(0.225, 0.224, 0.229)): 40 | self.size = size 41 | self.mean = np.array(mean, dtype=np.float32) 42 | self.std = np.array(std, dtype=np.float32) 43 | 44 | def __call__(self, image, boxes=None, labels=None): 45 | return base_transform(image, self.size, self.mean, self.std), boxes, labels 46 | -------------------------------------------------------------------------------- /data/coco2017.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import random 4 | 5 | import torch 6 | from torch.utils.data import Dataset 7 | import cv2 8 | from pycocotools.coco import COCO 9 | 10 | 11 | coco_class_labels = ('background', 12 | 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 13 | 'boat', 'traffic light', 'fire hydrant', 'street sign', 'stop sign', 14 | 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 15 | 'elephant', 'bear', 'zebra', 'giraffe', 'hat', 'backpack', 'umbrella', 16 | 'shoe', 'eye glasses', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 17 | 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 18 | 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'plate', 'wine glass', 19 | 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 20 | 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 21 | 'couch', 'potted plant', 'bed', 'mirror', 'dining table', 'window', 'desk', 22 | 'toilet', 'door', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 23 | 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'blender', 'book', 24 | 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush') 25 | 26 | coco_class_index = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 27 | 21, 22, 23, 24, 25, 27, 28, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 28 | 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 67, 29 | 70, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84, 85, 86, 87, 88, 89, 90] 30 | 31 | 32 | class COCODataset(Dataset): 33 | """ 34 | COCO dataset class. 35 | """ 36 | def __init__(self, 37 | data_dir=None, 38 | transform=None, 39 | json_file='instances_train2017.json', 40 | name='train2017'): 41 | """ 42 | COCO dataset initialization. Annotation data are read into memory by COCO API. 43 | Args: 44 | data_dir (str): dataset root directory 45 | json_file (str): COCO json file name 46 | name (str): COCO data name (e.g. 
'train2017' or 'val2017') 47 | img_size (int): target image size after pre-processing 48 | min_size (int): bounding boxes smaller than this are ignored 49 | debug (bool): if True, only one data id is selected from the dataset 50 | """ 51 | self.data_dir = data_dir 52 | self.json_file = json_file 53 | self.coco = COCO(os.path.join(self.data_dir, 'annotations', self.json_file)) 54 | self.ids = self.coco.getImgIds() 55 | self.class_ids = sorted(self.coco.getCatIds()) 56 | self.name = name 57 | self.transform = transform 58 | 59 | 60 | def __len__(self): 61 | return len(self.ids) 62 | 63 | 64 | def pull_image(self, index): 65 | id_ = self.ids[index] 66 | img_file = os.path.join(self.data_dir, self.name, 67 | '{:012}'.format(id_) + '.jpg') 68 | img = cv2.imread(img_file) 69 | 70 | if self.json_file == 'instances_val5k.json' and img is None: 71 | img_file = os.path.join(self.data_dir, 'train2017', 72 | '{:012}'.format(id_) + '.jpg') 73 | img = cv2.imread(img_file) 74 | 75 | return img, id_ 76 | 77 | 78 | def pull_anno(self, index): 79 | id_ = self.ids[index] 80 | 81 | anno_ids = self.coco.getAnnIds(imgIds=[int(id_)], iscrowd=None) 82 | annotations = self.coco.loadAnns(anno_ids) 83 | 84 | target = [] 85 | for anno in annotations: 86 | if 'bbox' in anno: 87 | xmin = np.max((0, anno['bbox'][0])) 88 | ymin = np.max((0, anno['bbox'][1])) 89 | xmax = xmin + anno['bbox'][2] 90 | ymax = ymin + anno['bbox'][3] 91 | 92 | if anno['area'] > 0 and xmax >= xmin and ymax >= ymin: 93 | label_ind = anno['category_id'] 94 | cls_id = self.class_ids.index(label_ind) 95 | 96 | target.append([xmin, ymin, xmax, ymax, cls_id]) # [xmin, ymin, xmax, ymax, label_ind] 97 | else: 98 | print('No bbox !!') 99 | return target 100 | 101 | 102 | def __getitem__(self, index): 103 | img, gt, h, w = self.pull_item(index) 104 | 105 | return img, gt 106 | 107 | 108 | def pull_item(self, index): 109 | id_ = self.ids[index] 110 | 111 | anno_ids = self.coco.getAnnIds(imgIds=[int(id_)], iscrowd=None) 112 | annotations = self.coco.loadAnns(anno_ids) 113 | 114 | # load an image 115 | img_file = os.path.join(self.data_dir, self.name, 116 | '{:012}'.format(id_) + '.jpg') 117 | img = cv2.imread(img_file) 118 | 119 | if self.json_file == 'instances_val5k.json' and img is None: 120 | img_file = os.path.join(self.data_dir, 'train2017', 121 | '{:012}'.format(id_) + '.jpg') 122 | img = cv2.imread(img_file) 123 | 124 | assert img is not None 125 | 126 | height, width, channels = img.shape 127 | 128 | # load a target 129 | target = [] 130 | for anno in annotations: 131 | if 'bbox' in anno and anno['area'] > 0: 132 | xmin = np.max((0, anno['bbox'][0])) 133 | ymin = np.max((0, anno['bbox'][1])) 134 | xmax = np.min((width - 1, xmin + np.max((0, anno['bbox'][2] - 1)))) 135 | ymax = np.min((height - 1, ymin + np.max((0, anno['bbox'][3] - 1)))) 136 | if xmax > xmin and ymax > ymin: 137 | label_ind = anno['category_id'] 138 | cls_id = self.class_ids.index(label_ind) 139 | xmin /= width 140 | ymin /= height 141 | xmax /= width 142 | ymax /= height 143 | 144 | target.append([xmin, ymin, xmax, ymax, cls_id]) # [xmin, ymin, xmax, ymax, label_ind] 145 | else: 146 | print('No bbox !!!') 147 | 148 | # check target 149 | if len(target) == 0: 150 | target = np.zeros([1, 5]) 151 | else: 152 | target = np.array(target) 153 | # transform 154 | if self.transform is not None: 155 | img, boxes, labels = self.transform(img, target[:, :4], target[:, 4]) 156 | # to rgb 157 | img = img[:, :, (2, 1, 0)] 158 | # to tensor 159 | img = torch.from_numpy(img).permute(2, 0, 
1).float() 160 | target = np.hstack((boxes, np.expand_dims(labels, axis=1))) 161 | 162 | return img, target, height, width 163 | 164 | 165 | if __name__ == "__main__": 166 | def base_transform(image, size, mean): 167 | x = cv2.resize(image, (size, size)).astype(np.float32) 168 | x -= mean 169 | x = x.astype(np.float32) 170 | return x 171 | 172 | class BaseTransform: 173 | def __init__(self, size, mean): 174 | self.size = size 175 | self.mean = np.array(mean, dtype=np.float32) 176 | 177 | def __call__(self, image, boxes=None, labels=None): 178 | return base_transform(image, self.size, self.mean), boxes, labels 179 | 180 | img_size = 640 181 | dataset = COCODataset( 182 | data_dir='/mnt/share/ssd2/dataset/COCO/', 183 | transform=BaseTransform(img_size, (0, 0, 0))) 184 | 185 | for i in range(1000): 186 | im, gt, h, w = dataset.pull_item(i) 187 | img = im.permute(1,2,0).numpy()[:, :, (2, 1, 0)].astype(np.uint8) 188 | img = img.copy() 189 | 190 | for box in gt: 191 | xmin, ymin, xmax, ymax, _ = box 192 | xmin *= img_size 193 | ymin *= img_size 194 | xmax *= img_size 195 | ymax *= img_size 196 | img = cv2.rectangle(img, (int(xmin), int(ymin)), (int(xmax), int(ymax)), (0,0,255), 2) 197 | cv2.imshow('gt', img) 198 | # cv2.imwrite(str(i)+'.jpg', img) 199 | cv2.waitKey(0) 200 | -------------------------------------------------------------------------------- /data/config.py: -------------------------------------------------------------------------------- 1 | # config.py 2 | 3 | # YOLOv2 with darknet-19 4 | yolov2_d19_cfg = { 5 | # network 6 | 'backbone': 'd19', 7 | # for multi-scale trick 8 | 'train_size': 640, 9 | 'val_size': 416, 10 | 'random_size_range': [10, 19], 11 | # anchor size 12 | 'anchor_size_voc': [[1.19, 1.98], [2.79, 4.59], [4.53, 8.92], [8.06, 5.29], [10.32, 10.65]], 13 | 'anchor_size_coco': [[0.53, 0.79], [1.71, 2.36], [2.89, 6.44], [6.33, 3.79], [9.03, 9.74]], 14 | # train 15 | 'lr_epoch': (150, 200), 16 | 'max_epoch': 250, 17 | 'ignore_thresh': 0.5 18 | } 19 | 20 | # YOLOv2 with resnet-50 21 | yolov2_r50_cfg = { 22 | # network 23 | 'backbone': 'r50', 24 | # for multi-scale trick 25 | 'train_size': 640, 26 | 'val_size': 416, 27 | 'random_size_range': [10, 19], 28 | # anchor size 29 | 'anchor_size_voc': [[1.19, 1.98], [2.79, 4.59], [4.53, 8.92], [8.06, 5.29], [10.32, 10.65]], 30 | 'anchor_size_coco': [[0.53, 0.79], [1.71, 2.36], [2.89, 6.44], [6.33, 3.79], [9.03, 9.74]], 31 | # train 32 | 'lr_epoch': (150, 200), 33 | 'max_epoch': 250, 34 | 'ignore_thresh': 0.5 35 | } 36 | 37 | # YOLOv3 / YOLOv3Spp 38 | yolov3_d53_cfg = { 39 | # network 40 | 'backbone': 'd53', 41 | # for multi-scale trick 42 | 'train_size': 640, 43 | 'val_size': 416, 44 | 'random_size_range': [10, 19], 45 | # anchor size 46 | 'anchor_size_voc': [[32.64, 47.68], [50.24, 108.16], [126.72, 96.32], 47 | [78.4, 201.92], [178.24, 178.56], [129.6, 294.72], 48 | [331.84, 194.56], [227.84, 325.76], [365.44, 358.72]], 49 | 'anchor_size_coco': [[12.48, 19.2], [31.36, 46.4],[46.4, 113.92], 50 | [97.28, 55.04], [133.12, 127.36], [79.04, 224.], 51 | [301.12, 150.4 ], [172.16, 285.76], [348.16, 341.12]], 52 | # train 53 | 'lr_epoch': (150, 200), 54 | 'max_epoch': 250, 55 | 'ignore_thresh': 0.5 56 | } 57 | 58 | # YOLOv3Tiny 59 | yolov3_tiny_cfg = { 60 | # network 61 | 'backbone': 'd-light', 62 | # for multi-scale trick 63 | 'train_size': 640, 64 | 'val_size': 416, 65 | 'random_size_range':[10, 19], 66 | # anchor size 67 | 'anchor_size_voc': [[34.01, 61.79], [86.94, 109.68], [93.49, 227.46], 68 | [246.38, 163.33], [178.68, 306.55], 
[344.89, 337.14]], 69 | 'anchor_size_coco': [[15.09, 23.25], [46.36, 61.47], [68.41, 161.84], 70 | [168.88, 93.59], [154.96, 257.45], [334.74, 302.47]], 71 | # train 72 | 'lr_epoch': (150, 200), 73 | 'max_epoch': 250, 74 | 'ignore_thresh': 0.5 75 | } 76 | -------------------------------------------------------------------------------- /data/scripts/COCO2017.sh: -------------------------------------------------------------------------------- 1 | mkdir COCO 2 | cd COCO 3 | 4 | wget http://images.cocodataset.org/zips/train2017.zip 5 | wget http://images.cocodataset.org/zips/val2017.zip 6 | wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip 7 | wget http://images.cocodataset.org/zips/test2017.zip 8 | wget http://images.cocodataset.org/annotations/image_info_test2017.zip  9 | 10 | unzip train2017.zip 11 | unzip val2017.zip 12 | unzip annotations_trainval2017.zip 13 | unzip test2017.zip 14 | unzip image_info_test2017.zip 15 | 16 | # rm -f train2017.zip 17 | # rm -f val2017.zip 18 | # rm -f annotations_trainval2017.zip 19 | # rm -f test2017.zip 20 | # rm -f image_info_test2017.zip 21 | -------------------------------------------------------------------------------- /data/scripts/VOC2007.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Ellis Brown 3 | 4 | start=`date +%s` 5 | 6 | # handle optional download dir 7 | if [ -z "$1" ] 8 | then 9 | # navigate to ~/data 10 | echo "navigating to ~/data/ ..." 11 | mkdir -p ~/data 12 | cd ~/data/ 13 | else 14 | # check if is valid directory 15 | if [ ! -d $1 ]; then 16 | echo $1 "is not a valid directory" 17 | exit 0 18 | fi 19 | echo "navigating to" $1 "..." 20 | cd $1 21 | fi 22 | 23 | echo "Downloading VOC2007 trainval ..." 24 | # Download the data. 25 | curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar 26 | echo "Downloading VOC2007 test data ..." 27 | curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar 28 | echo "Done downloading." 29 | 30 | # Extract data 31 | echo "Extracting trainval ..." 32 | tar -xvf VOCtrainval_06-Nov-2007.tar 33 | echo "Extracting test ..." 34 | tar -xvf VOCtest_06-Nov-2007.tar 35 | echo "removing tars ..." 36 | rm VOCtrainval_06-Nov-2007.tar 37 | rm VOCtest_06-Nov-2007.tar 38 | 39 | end=`date +%s` 40 | runtime=$((end-start)) 41 | 42 | echo "Completed in" $runtime "seconds" -------------------------------------------------------------------------------- /data/scripts/VOC2012.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Ellis Brown 3 | 4 | start=`date +%s` 5 | 6 | # handle optional download dir 7 | if [ -z "$1" ] 8 | then 9 | # navigate to ~/data 10 | echo "navigating to ~/data/ ..." 11 | mkdir -p ~/data 12 | cd ~/data/ 13 | else 14 | # check if is valid directory 15 | if [ ! -d $1 ]; then 16 | echo $1 "is not a valid directory" 17 | exit 0 18 | fi 19 | echo "navigating to" $1 "..." 20 | cd $1 21 | fi 22 | 23 | echo "Downloading VOC2012 trainval ..." 24 | # Download the data. 25 | curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar 26 | echo "Done downloading." 27 | 28 | 29 | # Extract data 30 | echo "Extracting trainval ..." 31 | tar -xvf VOCtrainval_11-May-2012.tar 32 | echo "removing tar ..." 
33 | rm VOCtrainval_11-May-2012.tar 34 | 35 | end=`date +%s` 36 | runtime=$((end-start)) 37 | 38 | echo "Completed in" $runtime "seconds" -------------------------------------------------------------------------------- /data/voc0712.py: -------------------------------------------------------------------------------- 1 | """VOC Dataset Classes 2 | 3 | Original author: Francisco Massa 4 | https://github.com/fmassa/vision/blob/voc_dataset/torchvision/datasets/voc.py 5 | 6 | Updated by: Ellis Brown, Max deGroot 7 | """ 8 | import os.path as osp 9 | import sys 10 | import torch 11 | import torch.utils.data as data 12 | import cv2 13 | import numpy as np 14 | import random 15 | import xml.etree.ElementTree as ET 16 | 17 | 18 | VOC_CLASSES = ( # always index 0 19 | 'aeroplane', 'bicycle', 'bird', 'boat', 20 | 'bottle', 'bus', 'car', 'cat', 'chair', 21 | 'cow', 'diningtable', 'dog', 'horse', 22 | 'motorbike', 'person', 'pottedplant', 23 | 'sheep', 'sofa', 'train', 'tvmonitor') 24 | 25 | 26 | class VOCAnnotationTransform(object): 27 | """Transforms a VOC annotation into a Tensor of bbox coords and label index 28 | Initilized with a dictionary lookup of classnames to indexes 29 | 30 | Arguments: 31 | class_to_ind (dict, optional): dictionary lookup of classnames -> indexes 32 | (default: alphabetic indexing of VOC's 20 classes) 33 | keep_difficult (bool, optional): keep difficult instances or not 34 | (default: False) 35 | height (int): height 36 | width (int): width 37 | """ 38 | 39 | def __init__(self, class_to_ind=None, keep_difficult=False): 40 | self.class_to_ind = class_to_ind or dict( 41 | zip(VOC_CLASSES, range(len(VOC_CLASSES)))) 42 | self.keep_difficult = keep_difficult 43 | 44 | def __call__(self, target, width, height): 45 | """ 46 | Arguments: 47 | target (annotation) : the target annotation to be made usable 48 | will be an ET.Element 49 | Returns: 50 | a list containing lists of bounding boxes [bbox coords, class name] 51 | """ 52 | res = [] 53 | for obj in target.iter('object'): 54 | difficult = int(obj.find('difficult').text) == 1 55 | if not self.keep_difficult and difficult: 56 | continue 57 | name = obj.find('name').text.lower().strip() 58 | bbox = obj.find('bndbox') 59 | 60 | pts = ['xmin', 'ymin', 'xmax', 'ymax'] 61 | bndbox = [] 62 | for i, pt in enumerate(pts): 63 | cur_pt = int(bbox.find(pt).text) - 1 64 | # scale height or width 65 | cur_pt = cur_pt / width if i % 2 == 0 else cur_pt / height 66 | bndbox.append(cur_pt) 67 | label_idx = self.class_to_ind[name] 68 | bndbox.append(label_idx) 69 | res += [bndbox] # [xmin, ymin, xmax, ymax, label_ind] 70 | # img_id = target.find('filename').text[:-4] 71 | 72 | return res # [[xmin, ymin, xmax, ymax, label_ind], ... ] 73 | 74 | 75 | class VOCDetection(data.Dataset): 76 | """VOC Detection Dataset Object 77 | 78 | input is image, target is annotation 79 | 80 | Arguments: 81 | root (string): filepath to VOCdevkit folder. 82 | image_set (string): imageset to use (eg. 
'train', 'val', 'test') 83 | transform (callable, optional): transformation to perform on the 84 | input image 85 | target_transform (callable, optional): transformation to perform on the 86 | target `annotation` 87 | (eg: take in caption string, return tensor of word indices) 88 | dataset_name (string, optional): which dataset to load 89 | (default: 'VOC2007') 90 | """ 91 | 92 | def __init__(self, 93 | data_dir=None, 94 | image_sets=[('2007', 'trainval'), ('2012', 'trainval')], 95 | transform=None, 96 | target_transform=VOCAnnotationTransform(), 97 | dataset_name='VOC0712'): 98 | self.root = data_dir 99 | self.image_set = image_sets 100 | self.transform = transform 101 | self.target_transform = target_transform 102 | self.name = dataset_name 103 | self._annopath = osp.join('%s', 'Annotations', '%s.xml') 104 | self._imgpath = osp.join('%s', 'JPEGImages', '%s.jpg') 105 | self.ids = list() 106 | for (year, name) in image_sets: 107 | rootpath = osp.join(self.root, 'VOC' + year) 108 | for line in open(osp.join(rootpath, 'ImageSets', 'Main', name + '.txt')): 109 | self.ids.append((rootpath, line.strip())) 110 | 111 | 112 | def __getitem__(self, index): 113 | im, gt, h, w = self.pull_item(index) 114 | 115 | return im, gt 116 | 117 | 118 | def __len__(self): 119 | return len(self.ids) 120 | 121 | 122 | def pull_item(self, index): 123 | # load an image 124 | img_id = self.ids[index] 125 | img = cv2.imread(self._imgpath % img_id) 126 | height, width, channels = img.shape 127 | 128 | # load a target 129 | target = ET.parse(self._annopath % img_id).getroot() 130 | if self.target_transform is not None: 131 | target = self.target_transform(target, width, height) 132 | 133 | # check target 134 | if len(target) == 0: 135 | target = np.zeros([1, 5]) 136 | else: 137 | target = np.array(target) 138 | # transform 139 | if self.transform is not None: 140 | img, boxes, labels = self.transform(img, target[:, :4], target[:, 4]) 141 | # to rgb 142 | img = img[:, :, (2, 1, 0)] 143 | # to tensor 144 | img = torch.from_numpy(img).permute(2, 0, 1).float() 145 | # target 146 | target = np.hstack((boxes, np.expand_dims(labels, axis=1))) 147 | 148 | return img, target, height, width 149 | 150 | 151 | def pull_image(self, index): 152 | '''Returns the original image object at index in PIL form 153 | 154 | Note: not using self.__getitem__(), as any transformations passed in 155 | could mess up this functionality. 156 | 157 | Argument: 158 | index (int): index of img to show 159 | Return: 160 | PIL img 161 | ''' 162 | img_id = self.ids[index] 163 | return cv2.imread(self._imgpath % img_id, cv2.IMREAD_COLOR), img_id 164 | 165 | 166 | def pull_anno(self, index): 167 | '''Returns the original annotation of image at index 168 | 169 | Note: not using self.__getitem__(), as any transformations passed in 170 | could mess up this functionality. 171 | 172 | Argument: 173 | index (int): index of img to get annotation of 174 | Return: 175 | list: [img_id, [(label, bbox coords),...]] 176 | eg: ('001718', [('dog', (96, 13, 438, 332))]) 177 | ''' 178 | img_id = self.ids[index] 179 | anno = ET.parse(self._annopath % img_id).getroot() 180 | gt = self.target_transform(anno, 1, 1) 181 | return img_id[1], gt 182 | 183 | 184 | def pull_tensor(self, index): 185 | '''Returns the original image at an index in tensor form 186 | 187 | Note: not using self.__getitem__(), as any transformations passed in 188 | could mess up this functionality. 
189 | 190 | Argument: 191 | index (int): index of img to show 192 | Return: 193 | tensorized version of img, squeezed 194 | ''' 195 | return torch.Tensor(self.pull_image(index)).unsqueeze_(0) 196 | 197 | 198 | if __name__ == "__main__": 199 | def base_transform(image, size, mean): 200 | x = cv2.resize(image, (size, size)).astype(np.float32) 201 | x -= mean 202 | x = x.astype(np.float32) 203 | return x 204 | 205 | class BaseTransform: 206 | def __init__(self, size, mean): 207 | self.size = size 208 | self.mean = np.array(mean, dtype=np.float32) 209 | 210 | def __call__(self, image, boxes=None, labels=None): 211 | return base_transform(image, self.size, self.mean), boxes, labels 212 | 213 | img_size = 640 214 | # dataset 215 | dataset = VOCDetection(data_dir='/mnt/share/ssd2/dataset/VOCdevkit/', 216 | image_sets=[('2007', 'trainval')], 217 | transform=BaseTransform(img_size, (0, 0, 0))) 218 | for i in range(1000): 219 | im, gt, h, w = dataset.pull_item(i) 220 | img = im.permute(1,2,0).numpy()[:, :, (2, 1, 0)].astype(np.uint8) 221 | img = img.copy() 222 | for box in gt: 223 | xmin, ymin, xmax, ymax, _ = box 224 | xmin *= img_size 225 | ymin *= img_size 226 | xmax *= img_size 227 | ymax *= img_size 228 | img = cv2.rectangle(img, (int(xmin), int(ymin)), (int(xmax), int(ymax)), (0,0,255), 2) 229 | cv2.imshow('gt', img) 230 | cv2.waitKey(0) 231 | -------------------------------------------------------------------------------- /demo.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import numpy as np 4 | import cv2 5 | import time 6 | import torch 7 | from data.coco2017 import coco_class_index, coco_class_labels 8 | from data import config, BaseTransform 9 | 10 | 11 | 12 | def parse_args(): 13 | parser = argparse.ArgumentParser(description='YOLO Demo Detection') 14 | # basic 15 | parser.add_argument('--mode', default='image', 16 | type=str, help='Use the data from image, video or camera') 17 | parser.add_argument('-size', '--input_size', default=416, type=int, 18 | help='input_size') 19 | parser.add_argument('--cuda', action='store_true', default=False, 20 | help='Use cuda') 21 | parser.add_argument('--path_to_img', default='data/demo/images/', 22 | type=str, help='The path to image files') 23 | parser.add_argument('--path_to_vid', default='data/demo/videos/', 24 | type=str, help='The path to video files') 25 | parser.add_argument('--path_to_save', default='det_results/', 26 | type=str, help='The path to save the detection results') 27 | parser.add_argument('-vs', '--visual_threshold', default=0.3, 28 | type=float, help='visual threshold') 29 | # model 30 | parser.add_argument('-v', '--version', default='yolo_v2', 31 | help='yolov2_d19, yolov2_r50, yolov2_slim, yolov3, yolov3_spp, yolov3_tiny') 32 | parser.add_argument('--conf_thresh', default=0.1, type=float, 33 | help='NMS threshold') 34 | parser.add_argument('--nms_thresh', default=0.45, type=float, 35 | help='NMS threshold') 36 | parser.add_argument('--trained_model', default='weights/', 37 | type=str, help='Trained state_dict file path to open') 38 | 39 | return parser.parse_args() 40 | 41 | 42 | def plot_bbox_labels(img, bbox, label, cls_color, test_scale=0.4): 43 | x1, y1, x2, y2 = bbox 44 | x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2) 45 | t_size = cv2.getTextSize(label, 0, fontScale=1, thickness=2)[0] 46 | # plot bbox 47 | cv2.rectangle(img, (x1, y1), (x2, y2), cls_color, 2) 48 | # plot title bbox 49 | cv2.rectangle(img, (x1, y1-t_size[1]), (int(x1 + t_size[0] 
* test_scale), y1), cls_color, -1) 50 | # put the test on the title bbox 51 | cv2.putText(img, label, (int(x1), int(y1 - 5)), 0, test_scale, (0, 0, 0), 1, lineType=cv2.LINE_AA) 52 | 53 | return img 54 | 55 | 56 | def visualize(img, bboxes, scores, cls_inds, class_colors, vis_thresh=0.3): 57 | ts = 0.4 58 | for i, bbox in enumerate(bboxes): 59 | if scores[i] > vis_thresh: 60 | cls_color = class_colors[int(cls_inds[i])] 61 | cls_id = coco_class_index[int(cls_inds[i])] 62 | mess = '%s: %.2f' % (coco_class_labels[cls_id], scores[i]) 63 | img = plot_bbox_labels(img, bbox, mess, cls_color, test_scale=ts) 64 | 65 | return img 66 | 67 | 68 | def detect(net, 69 | device, 70 | transform, 71 | vis_thresh, 72 | mode='image', 73 | path_to_img=None, 74 | path_to_vid=None, 75 | path_to_save=None): 76 | # class color 77 | class_colors = [(np.random.randint(255), 78 | np.random.randint(255), 79 | np.random.randint(255)) for _ in range(80)] 80 | save_path = os.path.join(path_to_save, mode) 81 | os.makedirs(save_path, exist_ok=True) 82 | 83 | # ------------------------- Camera ---------------------------- 84 | if mode == 'camera': 85 | print('use camera !!!') 86 | cap = cv2.VideoCapture(0, cv2.CAP_DSHOW) 87 | while True: 88 | ret, frame = cap.read() 89 | if ret: 90 | if cv2.waitKey(1) == ord('q'): 91 | break 92 | img_h, img_w = frame.shape[:2] 93 | scale = np.array([[img_w, img_h, img_w, img_h]]) 94 | 95 | # prepare 96 | x = torch.from_numpy(transform(frame)[0][:, :, ::-1]).permute(2, 0, 1) 97 | x = x.unsqueeze(0).to(device) 98 | # inference 99 | t0 = time.time() 100 | bboxes, scores, cls_inds = net(x) 101 | t1 = time.time() 102 | print("detection time used ", t1-t0, "s") 103 | 104 | # rescale 105 | bboxes *= scale 106 | 107 | frame_processed = visualize(img=frame, 108 | bboxes=bboxes, 109 | scores=scores, 110 | cls_inds=cls_inds, 111 | class_colors=class_colors, 112 | vis_thresh=vis_thresh) 113 | cv2.imshow('detection result', frame_processed) 114 | cv2.waitKey(1) 115 | else: 116 | break 117 | cap.release() 118 | cv2.destroyAllWindows() 119 | 120 | # ------------------------- Image ---------------------------- 121 | elif mode == 'image': 122 | for i, img_id in enumerate(os.listdir(path_to_img)): 123 | img = cv2.imread(path_to_img + '/' + img_id, cv2.IMREAD_COLOR) 124 | img_h, img_w = img.shape[:2] 125 | scale = np.array([[img_w, img_h, img_w, img_h]]) 126 | 127 | # prepare 128 | x = torch.from_numpy(transform(img)[0][:, :, ::-1]).permute(2, 0, 1) 129 | x = x.unsqueeze(0).to(device) 130 | # inference 131 | t0 = time.time() 132 | bboxes, scores, cls_inds = net(x) 133 | t1 = time.time() 134 | print("detection time used ", t1-t0, "s") 135 | 136 | # rescale 137 | bboxes *= scale 138 | 139 | img_processed = visualize(img=img, 140 | bboxes=bboxes, 141 | scores=scores, 142 | cls_inds=cls_inds, 143 | class_colors=class_colors, 144 | vis_thresh=vis_thresh) 145 | 146 | cv2.imshow('detection', img_processed) 147 | cv2.imwrite(os.path.join(save_path, str(i).zfill(6)+'.jpg'), img_processed) 148 | cv2.waitKey(0) 149 | 150 | # ------------------------- Video --------------------------- 151 | elif mode == 'video': 152 | video = cv2.VideoCapture(path_to_vid) 153 | fourcc = cv2.VideoWriter_fourcc(*'XVID') 154 | save_size = (640, 480) 155 | save_path = os.path.join(save_path, 'det.avi') 156 | fps = 15.0 157 | out = cv2.VideoWriter(save_path, fourcc, fps, save_size) 158 | 159 | while(True): 160 | ret, frame = video.read() 161 | 162 | if ret: 163 | # ------------------------- Detection --------------------------- 164 | img_h, 
img_w = frame.shape[:2] 165 | scale = np.array([[img_w, img_h, img_w, img_h]]) 166 | # prepare 167 | x = torch.from_numpy(transform(frame)[0][:, :, ::-1]).permute(2, 0, 1) 168 | x = x.unsqueeze(0).to(device) 169 | # inference 170 | t0 = time.time() 171 | bboxes, scores, cls_inds = net(x) 172 | t1 = time.time() 173 | print("detection time used ", t1-t0, "s") 174 | 175 | # rescale 176 | bboxes *= scale 177 | 178 | frame_processed = visualize(img=frame, 179 | bboxes=bboxes, 180 | scores=scores, 181 | cls_inds=cls_inds, 182 | class_colors=class_colors, 183 | vis_thresh=vis_thresh) 184 | 185 | frame_processed_resize = cv2.resize(frame_processed, save_size) 186 | out.write(frame_processed_resize) 187 | cv2.imshow('detection', frame_processed) 188 | cv2.waitKey(1) 189 | else: 190 | break 191 | video.release() 192 | out.release() 193 | cv2.destroyAllWindows() 194 | 195 | 196 | def run(): 197 | args = parse_args() 198 | 199 | # use cuda 200 | if args.cuda: 201 | device = torch.device("cuda") 202 | else: 203 | device = torch.device("cpu") 204 | 205 | # model 206 | model_name = args.version 207 | print('Model: ', model_name) 208 | 209 | # load model and config file 210 | if model_name == 'yolov2_d19': 211 | from models.yolov2_d19 import YOLOv2D19 as yolo_net 212 | cfg = config.yolov2_d19_cfg 213 | 214 | elif model_name == 'yolov2_r50': 215 | from models.yolov2_r50 import YOLOv2R50 as yolo_net 216 | cfg = config.yolov2_r50_cfg 217 | 218 | elif model_name == 'yolov2_slim': 219 | from models.yolov2_slim import YOLOv2Slim as yolo_net 220 | cfg = config.yolov2_slim_cfg 221 | 222 | elif model_name == 'yolov3': 223 | from models.yolov3 import YOLOv3 as yolo_net 224 | cfg = config.yolov3_d53_cfg 225 | 226 | elif model_name == 'yolov3_spp': 227 | from models.yolov3_spp import YOLOv3Spp as yolo_net 228 | cfg = config.yolov3_d53_cfg 229 | 230 | elif model_name == 'yolov3_tiny': 231 | from models.yolov3_tiny import YOLOv3tiny as yolo_net 232 | cfg = config.yolov3_tiny_cfg 233 | else: 234 | print('Unknown model name...') 235 | exit(0) 236 | 237 | input_size = [args.input_size, args.input_size] 238 | 239 | # build model 240 | anchor_size = cfg['anchor_size_coco'] 241 | net = yolo_net(device=device, 242 | input_size=input_size, 243 | num_classes=80, 244 | trainable=False, 245 | conf_thresh=args.conf_thresh, 246 | nms_thresh=args.nms_thresh, 247 | anchor_size=anchor_size) 248 | 249 | # load weight 250 | net.load_state_dict(torch.load(args.trained_model, map_location=device)) 251 | net.to(device).eval() 252 | print('Finished loading model!') 253 | 254 | # run 255 | detect(net=net, 256 | device=device, 257 | transform=BaseTransform(input_size), 258 | mode=args.mode, 259 | path_to_img=args.path_to_img, 260 | path_to_vid=args.path_to_vid, 261 | path_to_save=args.path_to_save, 262 | thresh=args.visual_threshold 263 | ) 264 | 265 | 266 | if __name__ == '__main__': 267 | run() 268 | -------------------------------------------------------------------------------- /eval.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import torch 4 | 5 | from utils.vocapi_evaluator import VOCAPIEvaluator 6 | from utils.cocoapi_evaluator import COCOAPIEvaluator 7 | from data import BaseTransform, config 8 | 9 | 10 | 11 | parser = argparse.ArgumentParser(description='YOLO Detector Evaluation') 12 | parser.add_argument('-v', '--version', default='yolo_v2', 13 | help='yolov2_d19, yolov2_r50, yolov2_slim, yolov3, yolov3_spp, yolov3_tiny') 14 | 
parser.add_argument('--trained_model', type=str, default='weights/', 15 | help='Trained state_dict file path to open') 16 | parser.add_argument('-size', '--input_size', default=416, type=int, 17 | help='input_size') 18 | parser.add_argument('--cuda', action='store_true', default=False, 19 | help='Use cuda') 20 | # dataset 21 | parser.add_argument('--root', default='/mnt/share/ssd2/dataset', 22 | help='data root') 23 | parser.add_argument('-d', '--dataset', default='coco-val', 24 | help='voc, coco-val, coco-test.') 25 | 26 | args = parser.parse_args() 27 | 28 | 29 | 30 | def voc_test(model, data_dir, device, input_size): 31 | evaluator = VOCAPIEvaluator(data_root=data_dir, 32 | img_size=input_size, 33 | device=device, 34 | transform=BaseTransform(input_size), 35 | display=True) 36 | 37 | # VOC evaluation 38 | evaluator.evaluate(model) 39 | 40 | 41 | def coco_test(model, data_dir, device, input_size, test=False): 42 | if test: 43 | # test-dev 44 | print('test on test-dev 2017') 45 | evaluator = COCOAPIEvaluator( 46 | data_dir=data_dir, 47 | img_size=input_size, 48 | device=device, 49 | testset=True, 50 | transform=BaseTransform(input_size) 51 | ) 52 | 53 | else: 54 | # eval 55 | evaluator = COCOAPIEvaluator( 56 | data_dir=data_dir, 57 | img_size=input_size, 58 | device=device, 59 | testset=False, 60 | transform=BaseTransform(input_size) 61 | ) 62 | 63 | # COCO evaluation 64 | evaluator.evaluate(model) 65 | 66 | 67 | if __name__ == '__main__': 68 | # dataset 69 | if args.dataset == 'voc': 70 | print('eval on voc ...') 71 | num_classes = 20 72 | data_dir = os.path.join(args.root, 'VOCdevkit') 73 | elif args.dataset == 'coco-val': 74 | print('eval on coco-val ...') 75 | num_classes = 80 76 | data_dir = os.path.join(args.root, 'COCO') 77 | elif args.dataset == 'coco-test': 78 | print('eval on coco-test-dev ...') 79 | num_classes = 80 80 | data_dir = os.path.join(args.root, 'COCO') 81 | else: 82 | print('unknow dataset !! 
we only support voc, coco-val, coco-test !!!') 83 | exit(0) 84 | 85 | # cuda 86 | if args.cuda: 87 | print('use cuda') 88 | torch.backends.cudnn.benchmark = True 89 | device = torch.device("cuda") 90 | else: 91 | device = torch.device("cpu") 92 | 93 | 94 | # model 95 | model_name = args.version 96 | print('Model: ', model_name) 97 | 98 | # load model and config file 99 | if model_name == 'yolov2_d19': 100 | from models.yolov2_d19 import YOLOv2D19 as yolo_net 101 | cfg = config.yolov2_d19_cfg 102 | 103 | elif model_name == 'yolov2_r50': 104 | from models.yolov2_r50 import YOLOv2R50 as yolo_net 105 | cfg = config.yolov2_r50_cfg 106 | 107 | elif model_name == 'yolov2_slim': 108 | from models.yolov2_slim import YOLOv2Slim as yolo_net 109 | cfg = config.yolov2_slim_cfg 110 | 111 | elif model_name == 'yolov3': 112 | from models.yolov3 import YOLOv3 as yolo_net 113 | cfg = config.yolov3_d53_cfg 114 | 115 | elif model_name == 'yolov3_spp': 116 | from models.yolov3_spp import YOLOv3Spp as yolo_net 117 | cfg = config.yolov3_d53_cfg 118 | 119 | elif model_name == 'yolov3_tiny': 120 | from models.yolov3_tiny import YOLOv3tiny as yolo_net 121 | cfg = config.yolov3_tiny_cfg 122 | else: 123 | print('Unknown model name...') 124 | exit(0) 125 | 126 | # input size 127 | input_size = args.input_size 128 | 129 | # build model 130 | anchor_size = cfg['anchor_size_voc'] if args.dataset == 'voc' else cfg['anchor_size_coco'] 131 | net = yolo_net(device=device, 132 | input_size=input_size, 133 | num_classes=num_classes, 134 | trainable=False, 135 | anchor_size=anchor_size) 136 | 137 | # load net 138 | net.load_state_dict(torch.load(args.trained_model, map_location='cuda')) 139 | net.eval() 140 | print('Finished loading model!') 141 | net = net.to(device) 142 | 143 | # evaluation 144 | with torch.no_grad(): 145 | if args.dataset == 'voc': 146 | voc_test(net, data_dir, device, input_size) 147 | elif args.dataset == 'coco-val': 148 | coco_test(net, data_dir, device, input_size, test=False) 149 | elif args.dataset == 'coco-test': 150 | coco_test(net, data_dir, device, input_size, test=True) 151 | -------------------------------------------------------------------------------- /img_file/darknet_tiny.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yjh0410/yolov2-yolov3_PyTorch/49dbf4bbcaba6bcf3f81de3ce0a85ec592f618ee/img_file/darknet_tiny.png -------------------------------------------------------------------------------- /models/__pycache__/yolo_anchor.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yjh0410/yolov2-yolov3_PyTorch/49dbf4bbcaba6bcf3f81de3ce0a85ec592f618ee/models/__pycache__/yolo_anchor.cpython-36.pyc -------------------------------------------------------------------------------- /models/__pycache__/yolo_anchor_ms.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yjh0410/yolov2-yolov3_PyTorch/49dbf4bbcaba6bcf3f81de3ce0a85ec592f618ee/models/__pycache__/yolo_anchor_ms.cpython-36.pyc -------------------------------------------------------------------------------- /models/__pycache__/yolo_fusion.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yjh0410/yolov2-yolov3_PyTorch/49dbf4bbcaba6bcf3f81de3ce0a85ec592f618ee/models/__pycache__/yolo_fusion.cpython-36.pyc 
-------------------------------------------------------------------------------- /models/__pycache__/yolo_kitti.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yjh0410/yolov2-yolov3_PyTorch/49dbf4bbcaba6bcf3f81de3ce0a85ec592f618ee/models/__pycache__/yolo_kitti.cpython-36.pyc -------------------------------------------------------------------------------- /models/__pycache__/yolo_light.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yjh0410/yolov2-yolov3_PyTorch/49dbf4bbcaba6bcf3f81de3ce0a85ec592f618ee/models/__pycache__/yolo_light.cpython-36.pyc -------------------------------------------------------------------------------- /models/__pycache__/yolo_mobile.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yjh0410/yolov2-yolov3_PyTorch/49dbf4bbcaba6bcf3f81de3ce0a85ec592f618ee/models/__pycache__/yolo_mobile.cpython-36.pyc -------------------------------------------------------------------------------- /models/__pycache__/yolo_msf.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yjh0410/yolov2-yolov3_PyTorch/49dbf4bbcaba6bcf3f81de3ce0a85ec592f618ee/models/__pycache__/yolo_msf.cpython-36.pyc -------------------------------------------------------------------------------- /models/__pycache__/yolo_v1.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yjh0410/yolov2-yolov3_PyTorch/49dbf4bbcaba6bcf3f81de3ce0a85ec592f618ee/models/__pycache__/yolo_v1.cpython-36.pyc -------------------------------------------------------------------------------- /models/__pycache__/yolo_v1_ms.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yjh0410/yolov2-yolov3_PyTorch/49dbf4bbcaba6bcf3f81de3ce0a85ec592f618ee/models/__pycache__/yolo_v1_ms.cpython-36.pyc -------------------------------------------------------------------------------- /models/__pycache__/yolo_v2.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yjh0410/yolov2-yolov3_PyTorch/49dbf4bbcaba6bcf3f81de3ce0a85ec592f618ee/models/__pycache__/yolo_v2.cpython-36.pyc -------------------------------------------------------------------------------- /models/__pycache__/yolo_v2.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yjh0410/yolov2-yolov3_PyTorch/49dbf4bbcaba6bcf3f81de3ce0a85ec592f618ee/models/__pycache__/yolo_v2.cpython-37.pyc -------------------------------------------------------------------------------- /models/yolov2_d19.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | from utils.modules import Conv, reorg_layer 5 | 6 | from backbone import build_backbone 7 | import tools 8 | 9 | 10 | class YOLOv2D19(nn.Module): 11 | def __init__(self, device, input_size=None, num_classes=20, trainable=False, conf_thresh=0.001, nms_thresh=0.5, anchor_size=None): 12 | super(YOLOv2D19, self).__init__() 13 | self.device = device 14 | self.input_size = input_size 15 | self.num_classes = num_classes 16 | self.trainable = 
trainable 17 | self.conf_thresh = conf_thresh 18 | self.nms_thresh = nms_thresh 19 | self.anchor_size = torch.tensor(anchor_size) 20 | self.num_anchors = len(anchor_size) 21 | self.stride = 32 22 | self.grid_cell, self.all_anchor_wh = self.create_grid(input_size) 23 | 24 | # backbone darknet-19 25 | self.backbone = build_backbone(model_name='darknet19', pretrained=trainable) 26 | 27 | # detection head 28 | self.convsets_1 = nn.Sequential( 29 | Conv(1024, 1024, k=3, p=1), 30 | Conv(1024, 1024, k=3, p=1) 31 | ) 32 | 33 | self.route_layer = Conv(512, 64, k=1) 34 | self.reorg = reorg_layer(stride=2) 35 | 36 | self.convsets_2 = Conv(1280, 1024, k=3, p=1) 37 | 38 | # prediction layer 39 | self.pred = nn.Conv2d(1024, self.num_anchors*(1 + 4 + self.num_classes), kernel_size=1) 40 | 41 | 42 | def create_grid(self, input_size): 43 | w, h = input_size, input_size 44 | # generate grid cells 45 | ws, hs = w // self.stride, h // self.stride 46 | grid_y, grid_x = torch.meshgrid([torch.arange(hs), torch.arange(ws)]) 47 | grid_xy = torch.stack([grid_x, grid_y], dim=-1).float() 48 | grid_xy = grid_xy.view(1, hs*ws, 1, 2).to(self.device) 49 | 50 | # generate anchor_wh tensor 51 | anchor_wh = self.anchor_size.repeat(hs*ws, 1, 1).unsqueeze(0).to(self.device) 52 | 53 | return grid_xy, anchor_wh 54 | 55 | 56 | def set_grid(self, input_size): 57 | self.input_size = input_size 58 | self.grid_cell, self.all_anchor_wh = self.create_grid(input_size) 59 | 60 | 61 | def decode_xywh(self, txtytwth_pred): 62 | """ 63 | Input: \n 64 | txtytwth_pred : [B, H*W, anchor_n, 4] \n 65 | Output: \n 66 | xywh_pred : [B, H*W*anchor_n, 4] \n 67 | """ 68 | B, HW, ab_n, _ = txtytwth_pred.size() 69 | # b_x = sigmoid(tx) + gride_x 70 | # b_y = sigmoid(ty) + gride_y 71 | xy_pred = torch.sigmoid(txtytwth_pred[..., :2]) + self.grid_cell 72 | # b_w = anchor_w * exp(tw) 73 | # b_h = anchor_h * exp(th) 74 | wh_pred = torch.exp(txtytwth_pred[..., 2:]) * self.all_anchor_wh 75 | # [B, H*W, anchor_n, 4] -> [B, H*W*anchor_n, 4] 76 | xywh_pred = torch.cat([xy_pred, wh_pred], -1).view(B, -1, 4) * self.stride 77 | 78 | return xywh_pred 79 | 80 | 81 | def decode_boxes(self, txtytwth_pred): 82 | """ 83 | Input: \n 84 | txtytwth_pred : [B, H*W, anchor_n, 4] \n 85 | Output: \n 86 | x1y1x2y2_pred : [B, H*W*anchor_n, 4] \n 87 | """ 88 | # txtytwth -> cxcywh 89 | xywh_pred = self.decode_xywh(txtytwth_pred) 90 | 91 | # cxcywh -> x1y1x2y2 92 | x1y1x2y2_pred = torch.zeros_like(xywh_pred) 93 | x1y1_pred = xywh_pred[..., :2] - xywh_pred[..., 2:] * 0.5 94 | x2y2_pred = xywh_pred[..., :2] + xywh_pred[..., 2:] * 0.5 95 | x1y1x2y2_pred = torch.cat([x1y1_pred, x2y2_pred], dim=-1) 96 | 97 | return x1y1x2y2_pred 98 | 99 | 100 | def nms(self, dets, scores): 101 | """"Pure Python NMS baseline.""" 102 | x1 = dets[:, 0] #xmin 103 | y1 = dets[:, 1] #ymin 104 | x2 = dets[:, 2] #xmax 105 | y2 = dets[:, 3] #ymax 106 | 107 | areas = (x2 - x1) * (y2 - y1) 108 | order = scores.argsort()[::-1] 109 | 110 | keep = [] 111 | while order.size > 0: 112 | i = order[0] 113 | keep.append(i) 114 | xx1 = np.maximum(x1[i], x1[order[1:]]) 115 | yy1 = np.maximum(y1[i], y1[order[1:]]) 116 | xx2 = np.minimum(x2[i], x2[order[1:]]) 117 | yy2 = np.minimum(y2[i], y2[order[1:]]) 118 | 119 | w = np.maximum(1e-10, xx2 - xx1) 120 | h = np.maximum(1e-10, yy2 - yy1) 121 | inter = w * h 122 | 123 | # Cross Area / (bbox + particular area - Cross Area) 124 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 125 | #reserve all the boundingbox whose ovr less than thresh 126 | inds = np.where(ovr <= 
self.nms_thresh)[0] 127 | order = order[inds + 1] 128 | 129 | return keep 130 | 131 | 132 | def postprocess(self, bboxes, scores): 133 | """ 134 | bboxes: (HxW, 4), bsize = 1 135 | scores: (HxW, num_classes), bsize = 1 136 | """ 137 | 138 | cls_inds = np.argmax(scores, axis=1) 139 | scores = scores[(np.arange(scores.shape[0]), cls_inds)] 140 | 141 | # threshold 142 | keep = np.where(scores >= self.conf_thresh) 143 | bboxes = bboxes[keep] 144 | scores = scores[keep] 145 | cls_inds = cls_inds[keep] 146 | 147 | # NMS 148 | keep = np.zeros(len(bboxes), dtype=np.int) 149 | for i in range(self.num_classes): 150 | inds = np.where(cls_inds == i)[0] 151 | if len(inds) == 0: 152 | continue 153 | c_bboxes = bboxes[inds] 154 | c_scores = scores[inds] 155 | c_keep = self.nms(c_bboxes, c_scores) 156 | keep[inds[c_keep]] = 1 157 | 158 | keep = np.where(keep > 0) 159 | bboxes = bboxes[keep] 160 | scores = scores[keep] 161 | cls_inds = cls_inds[keep] 162 | 163 | return bboxes, scores, cls_inds 164 | 165 | 166 | @ torch.no_grad() 167 | def inference(self, x): 168 | # backbone 169 | feats = self.backbone(x) 170 | 171 | # reorg layer 172 | p5 = self.convsets_1(feats['layer3']) 173 | p4 = self.reorg(self.route_layer(feats['layer2'])) 174 | p5 = torch.cat([p4, p5], dim=1) 175 | 176 | # head 177 | p5 = self.convsets_2(p5) 178 | 179 | # pred 180 | pred = self.pred(p5) 181 | 182 | B, abC, H, W = pred.size() 183 | 184 | # [B, num_anchor * C, H, W] -> [B, H, W, num_anchor * C] -> [B, H*W, num_anchor*C] 185 | pred = pred.permute(0, 2, 3, 1).contiguous().view(B, H*W, abC) 186 | 187 | # [B, H*W*num_anchor, 1] 188 | conf_pred = pred[:, :, :1 * self.num_anchors].contiguous().view(B, H*W*self.num_anchors, 1) 189 | # [B, H*W, num_anchor, num_cls] 190 | cls_pred = pred[:, :, 1 * self.num_anchors : (1 + self.num_classes) * self.num_anchors].contiguous().view(B, H*W*self.num_anchors, self.num_classes) 191 | # [B, H*W, num_anchor, 4] 192 | reg_pred = pred[:, :, (1 + self.num_classes) * self.num_anchors:].contiguous() 193 | # decode box 194 | reg_pred = reg_pred.view(B, H*W, self.num_anchors, 4) 195 | box_pred = self.decode_boxes(reg_pred) 196 | 197 | # batch size = 1 198 | conf_pred = conf_pred[0] 199 | cls_pred = cls_pred[0] 200 | box_pred = box_pred[0] 201 | 202 | # score 203 | scores = torch.sigmoid(conf_pred) * torch.softmax(cls_pred, dim=-1) 204 | 205 | # normalize bbox 206 | bboxes = torch.clamp(box_pred / self.input_size, 0., 1.) 
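        # Note: `scores` above is sigmoid(objectness) * softmax(class probabilities),
        # and `bboxes` has been clamped to [0, 1] after dividing by the input size,
        # so the caller rescales back to pixel coordinates (as demo.py in this repo does):
        #   scale = np.array([[img_w, img_h, img_w, img_h]])
        #   bboxes *= scale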
207 | 208 | # to cpu 209 | scores = scores.to('cpu').numpy() 210 | bboxes = bboxes.to('cpu').numpy() 211 | 212 | # post-process 213 | bboxes, scores, cls_inds = self.postprocess(bboxes, scores) 214 | 215 | return bboxes, scores, cls_inds 216 | 217 | 218 | def forward(self, x, target=None): 219 | if not self.trainable: 220 | return self.inference(x) 221 | else: 222 | # backbone 223 | feats = self.backbone(x) 224 | 225 | # reorg layer 226 | p5 = self.convsets_1(feats['layer3']) 227 | p4 = self.reorg(self.route_layer(feats['layer2'])) 228 | p5 = torch.cat([p4, p5], dim=1) 229 | 230 | # head 231 | p5 = self.convsets_2(p5) 232 | 233 | # pred 234 | pred = self.pred(p5) 235 | 236 | B, abC, H, W = pred.size() 237 | 238 | # [B, num_anchor * C, H, W] -> [B, H, W, num_anchor * C] -> [B, H*W, num_anchor*C] 239 | pred = pred.permute(0, 2, 3, 1).contiguous().view(B, H*W, abC) 240 | 241 | # [B, H*W*num_anchor, 1] 242 | conf_pred = pred[:, :, :1 * self.num_anchors].contiguous().view(B, H*W*self.num_anchors, 1) 243 | # [B, H*W, num_anchor, num_cls] 244 | cls_pred = pred[:, :, 1 * self.num_anchors : (1 + self.num_classes) * self.num_anchors].contiguous().view(B, H*W*self.num_anchors, self.num_classes) 245 | # [B, H*W, num_anchor, 4] 246 | reg_pred = pred[:, :, (1 + self.num_classes) * self.num_anchors:].contiguous() 247 | reg_pred = reg_pred.view(B, H*W, self.num_anchors, 4) 248 | 249 | # decode bbox 250 | x1y1x2y2_pred = (self.decode_boxes(reg_pred) / self.input_size).view(-1, 4) 251 | x1y1x2y2_gt = target[:, :, 7:].view(-1, 4) 252 | reg_pred = reg_pred.view(B, H*W*self.num_anchors, 4) 253 | 254 | # set conf target 255 | iou_pred = tools.iou_score(x1y1x2y2_pred, x1y1x2y2_gt).view(B, -1, 1) 256 | gt_conf = iou_pred.clone().detach() 257 | 258 | # [obj, cls, txtytwth, x1y1x2y2] -> [conf, obj, cls, txtytwth] 259 | target = torch.cat([gt_conf, target[:, :, :7]], dim=2) 260 | 261 | # loss 262 | ( 263 | conf_loss, 264 | cls_loss, 265 | bbox_loss, 266 | iou_loss 267 | ) = tools.loss(pred_conf=conf_pred, 268 | pred_cls=cls_pred, 269 | pred_txtytwth=reg_pred, 270 | pred_iou=iou_pred, 271 | label=target 272 | ) 273 | 274 | return conf_loss, cls_loss, bbox_loss, iou_loss 275 | -------------------------------------------------------------------------------- /models/yolov2_r50.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from utils.modules import Conv, reorg_layer 5 | from backbone import build_backbone 6 | import numpy as np 7 | import tools 8 | 9 | 10 | class YOLOv2R50(nn.Module): 11 | def __init__(self, device, input_size=None, num_classes=20, trainable=False, conf_thresh=0.001, nms_thresh=0.6, anchor_size=None, hr=False): 12 | super(YOLOv2R50, self).__init__() 13 | self.device = device 14 | self.input_size = input_size 15 | self.num_classes = num_classes 16 | self.trainable = trainable 17 | self.conf_thresh = conf_thresh 18 | self.nms_thresh = nms_thresh 19 | self.anchor_size = torch.tensor(anchor_size) 20 | self.num_anchors = len(anchor_size) 21 | self.stride = 32 22 | self.grid_cell, self.all_anchor_wh = self.create_grid(input_size) 23 | 24 | # backbone 25 | self.backbone = build_backbone(model_name='resnet50', pretrained=trainable) 26 | 27 | # head 28 | self.convsets_1 = nn.Sequential( 29 | Conv(2048, 1024, k=1), 30 | Conv(1024, 1024, k=3, p=1), 31 | Conv(1024, 1024, k=3, p=1) 32 | ) 33 | 34 | # reorg 35 | self.route_layer = Conv(1024, 128, k=1) 36 | self.reorg = reorg_layer(stride=2) 37 | 38 
| # head 39 | self.convsets_2 = Conv(1024+128*4, 1024, k=3, p=1) 40 | 41 | # pred 42 | self.pred = nn.Conv2d(1024, self.num_anchors*(1 + 4 + self.num_classes), 1) 43 | 44 | 45 | if self.trainable: 46 | # init bias 47 | self.init_bias() 48 | 49 | 50 | def init_bias(self): 51 | # init bias 52 | init_prob = 0.01 53 | bias_value = -torch.log(torch.tensor((1. - init_prob) / init_prob)) 54 | nn.init.constant_(self.pred.bias[..., :self.num_anchors], bias_value) 55 | 56 | 57 | def create_grid(self, input_size): 58 | w, h = input_size, input_size 59 | # generate grid cells 60 | ws, hs = w // self.stride, h // self.stride 61 | grid_y, grid_x = torch.meshgrid([torch.arange(hs), torch.arange(ws)]) 62 | grid_xy = torch.stack([grid_x, grid_y], dim=-1).float() 63 | grid_xy = grid_xy.view(1, hs*ws, 1, 2).to(self.device) 64 | 65 | # generate anchor_wh tensor 66 | anchor_wh = self.anchor_size.repeat(hs*ws, 1, 1).unsqueeze(0).to(self.device) 67 | 68 | 69 | return grid_xy, anchor_wh 70 | 71 | 72 | def set_grid(self, input_size): 73 | self.input_size = input_size 74 | self.grid_cell, self.all_anchor_wh = self.create_grid(input_size) 75 | 76 | 77 | def decode_xywh(self, txtytwth_pred): 78 | """ 79 | Input: \n 80 | txtytwth_pred : [B, H*W, anchor_n, 4] \n 81 | Output: \n 82 | xywh_pred : [B, H*W*anchor_n, 4] \n 83 | """ 84 | B, HW, ab_n, _ = txtytwth_pred.size() 85 | # b_x = sigmoid(tx) + gride_x 86 | # b_y = sigmoid(ty) + gride_y 87 | xy_pred = torch.sigmoid(txtytwth_pred[:, :, :, :2]) + self.grid_cell 88 | # b_w = anchor_w * exp(tw) 89 | # b_h = anchor_h * exp(th) 90 | wh_pred = torch.exp(txtytwth_pred[:, :, :, 2:]) * self.all_anchor_wh 91 | # [H*W, anchor_n, 4] -> [H*W*anchor_n, 4] 92 | xywh_pred = torch.cat([xy_pred, wh_pred], -1).view(B, -1, 4) * self.stride 93 | 94 | return xywh_pred 95 | 96 | 97 | def decode_boxes(self, txtytwth_pred): 98 | """ 99 | Input: \n 100 | txtytwth_pred : [B, H*W, anchor_n, 4] \n 101 | Output: \n 102 | x1y1x2y2_pred : [B, H*W*anchor_n, 4] \n 103 | """ 104 | # txtytwth -> cxcywh 105 | xywh_pred = self.decode_xywh(txtytwth_pred) 106 | 107 | # cxcywh -> x1y1x2y2 108 | x1y1x2y2_pred = torch.zeros_like(xywh_pred) 109 | x1y1_pred = xywh_pred[..., :2] - xywh_pred[..., 2:] * 0.5 110 | x2y2_pred = xywh_pred[..., :2] + xywh_pred[..., 2:] * 0.5 111 | x1y1x2y2_pred = torch.cat([x1y1_pred, x2y2_pred], dim=-1) 112 | 113 | return x1y1x2y2_pred 114 | 115 | 116 | def nms(self, dets, scores): 117 | """"Pure Python NMS baseline.""" 118 | x1 = dets[:, 0] #xmin 119 | y1 = dets[:, 1] #ymin 120 | x2 = dets[:, 2] #xmax 121 | y2 = dets[:, 3] #ymax 122 | 123 | areas = (x2 - x1) * (y2 - y1) 124 | order = scores.argsort()[::-1] 125 | 126 | keep = [] 127 | while order.size > 0: 128 | i = order[0] 129 | keep.append(i) 130 | xx1 = np.maximum(x1[i], x1[order[1:]]) 131 | yy1 = np.maximum(y1[i], y1[order[1:]]) 132 | xx2 = np.minimum(x2[i], x2[order[1:]]) 133 | yy2 = np.minimum(y2[i], y2[order[1:]]) 134 | 135 | w = np.maximum(1e-10, xx2 - xx1) 136 | h = np.maximum(1e-10, yy2 - yy1) 137 | inter = w * h 138 | 139 | # Cross Area / (bbox + particular area - Cross Area) 140 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 141 | #reserve all the boundingbox whose ovr less than thresh 142 | inds = np.where(ovr <= self.nms_thresh)[0] 143 | order = order[inds + 1] 144 | 145 | return keep 146 | 147 | 148 | def postprocess(self, bboxes, scores): 149 | """ 150 | bboxes: (HxW, 4), bsize = 1 151 | scores: (HxW, num_classes), bsize = 1 152 | """ 153 | 154 | cls_inds = np.argmax(scores, axis=1) 155 | scores = 
scores[(np.arange(scores.shape[0]), cls_inds)] 156 | 157 | # threshold 158 | keep = np.where(scores >= self.conf_thresh) 159 | bboxes = bboxes[keep] 160 | scores = scores[keep] 161 | cls_inds = cls_inds[keep] 162 | 163 | # NMS 164 | keep = np.zeros(len(bboxes), dtype=np.int) 165 | for i in range(self.num_classes): 166 | inds = np.where(cls_inds == i)[0] 167 | if len(inds) == 0: 168 | continue 169 | c_bboxes = bboxes[inds] 170 | c_scores = scores[inds] 171 | c_keep = self.nms(c_bboxes, c_scores) 172 | keep[inds[c_keep]] = 1 173 | 174 | keep = np.where(keep > 0) 175 | bboxes = bboxes[keep] 176 | scores = scores[keep] 177 | cls_inds = cls_inds[keep] 178 | 179 | return bboxes, scores, cls_inds 180 | 181 | 182 | @ torch.no_grad() 183 | def inference(self, x): 184 | # backbone 185 | feats = self.backbone(x) 186 | 187 | # reorg layer 188 | p5 = self.convsets_1(feats['layer3']) 189 | p4 = self.reorg(self.route_layer(feats['layer2'])) 190 | p5 = torch.cat([p4, p5], dim=1) 191 | 192 | # head 193 | p5 = self.convsets_2(p5) 194 | 195 | # pred 196 | pred = self.pred(p5) 197 | 198 | B, abC, H, W = pred.size() 199 | 200 | # [B, num_anchor * C, H, W] -> [B, H, W, num_anchor * C] -> [B, H*W, num_anchor*C] 201 | pred = pred.permute(0, 2, 3, 1).contiguous().view(B, H*W, abC) 202 | 203 | # [B, H*W*num_anchor, 1] 204 | conf_pred = pred[:, :, :1 * self.num_anchors].contiguous().view(B, H*W*self.num_anchors, 1) 205 | # [B, H*W, num_anchor, num_cls] 206 | cls_pred = pred[:, :, 1 * self.num_anchors : (1 + self.num_classes) * self.num_anchors].contiguous().view(B, H*W*self.num_anchors, self.num_classes) 207 | # [B, H*W, num_anchor, 4] 208 | reg_pred = pred[:, :, (1 + self.num_classes) * self.num_anchors:].contiguous() 209 | # decode box 210 | reg_pred = reg_pred.view(B, H*W, self.num_anchors, 4) 211 | box_pred = self.decode_boxes(reg_pred) 212 | 213 | # batch size = 1 214 | conf_pred = conf_pred[0] 215 | cls_pred = cls_pred[0] 216 | box_pred = box_pred[0] 217 | 218 | # score 219 | scores = torch.sigmoid(conf_pred) * torch.softmax(cls_pred, dim=-1) 220 | 221 | # normalize bbox 222 | bboxes = torch.clamp(box_pred / self.input_size, 0., 1.) 
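        # Note: self.postprocess below keeps the best class per box, applies the
        # confidence threshold, then runs class-wise NMS. It allocates its keep mask
        # with the deprecated alias `np.int` (removed in NumPy 1.24); the built-in
        # `int` is a drop-in replacement there:
        #   keep = np.zeros(len(bboxes), dtype=int)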
223 | 224 | # to cpu 225 | scores = scores.to('cpu').numpy() 226 | bboxes = bboxes.to('cpu').numpy() 227 | 228 | # post-process 229 | bboxes, scores, cls_inds = self.postprocess(bboxes, scores) 230 | 231 | return bboxes, scores, cls_inds 232 | 233 | 234 | def forward(self, x, target=None): 235 | if not self.trainable: 236 | return self.inference(x) 237 | else: 238 | # backbone 239 | feats = self.backbone(x) 240 | 241 | # reorg layer 242 | p5 = self.convsets_1(feats['layer3']) 243 | p4 = self.reorg(self.route_layer(feats['layer2'])) 244 | p5 = torch.cat([p4, p5], dim=1) 245 | 246 | # head 247 | p5 = self.convsets_2(p5) 248 | 249 | # pred 250 | pred = self.pred(p5) 251 | 252 | B, abC, H, W = pred.size() 253 | 254 | # [B, num_anchor * C, H, W] -> [B, H, W, num_anchor * C] -> [B, H*W, num_anchor*C] 255 | pred = pred.permute(0, 2, 3, 1).contiguous().view(B, H*W, abC) 256 | 257 | # [B, H*W*num_anchor, 1] 258 | conf_pred = pred[:, :, :1 * self.num_anchors].contiguous().view(B, H*W*self.num_anchors, 1) 259 | # [B, H*W, num_anchor, num_cls] 260 | cls_pred = pred[:, :, 1 * self.num_anchors : (1 + self.num_classes) * self.num_anchors].contiguous().view(B, H*W*self.num_anchors, self.num_classes) 261 | # [B, H*W, num_anchor, 4] 262 | reg_pred = pred[:, :, (1 + self.num_classes) * self.num_anchors:].contiguous() 263 | reg_pred = reg_pred.view(B, H*W, self.num_anchors, 4) 264 | 265 | # decode bbox 266 | x1y1x2y2_pred = (self.decode_boxes(reg_pred) / self.input_size).view(-1, 4) 267 | x1y1x2y2_gt = target[:, :, 7:].view(-1, 4) 268 | reg_pred = reg_pred.view(B, H*W*self.num_anchors, 4) 269 | 270 | # set conf target 271 | iou_pred = tools.iou_score(x1y1x2y2_pred, x1y1x2y2_gt).view(B, -1, 1) 272 | gt_conf = iou_pred.clone().detach() 273 | 274 | # [obj, cls, txtytwth, x1y1x2y2] -> [conf, obj, cls, txtytwth] 275 | target = torch.cat([gt_conf, target[:, :, :7]], dim=2) 276 | 277 | # loss 278 | ( 279 | conf_loss, 280 | cls_loss, 281 | bbox_loss, 282 | iou_loss 283 | ) = tools.loss(pred_conf=conf_pred, 284 | pred_cls=cls_pred, 285 | pred_txtytwth=reg_pred, 286 | pred_iou=iou_pred, 287 | label=target 288 | ) 289 | 290 | return conf_loss, cls_loss, bbox_loss, iou_loss 291 | -------------------------------------------------------------------------------- /models/yolov3.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from utils.modules import Conv 5 | from backbone import build_backbone 6 | import numpy as np 7 | import tools 8 | 9 | 10 | class YOLOv3(nn.Module): 11 | def __init__(self, 12 | device, 13 | input_size=None, 14 | num_classes=20, 15 | trainable=False, 16 | conf_thresh=0.001, 17 | nms_thresh=0.50, 18 | anchor_size=None): 19 | super(YOLOv3, self).__init__() 20 | self.device = device 21 | self.input_size = input_size 22 | self.num_classes = num_classes 23 | self.trainable = trainable 24 | self.conf_thresh = conf_thresh 25 | self.nms_thresh = nms_thresh 26 | self.topk = 3000 27 | self.stride = [8, 16, 32] 28 | self.anchor_size = torch.tensor(anchor_size).view(3, len(anchor_size) // 3, 2) 29 | self.num_anchors = self.anchor_size.size(1) 30 | 31 | self.grid_cell, self.stride_tensor, self.all_anchors_wh = self.create_grid(input_size) 32 | 33 | # backbone 34 | self.backbone = build_backbone(model_name='darknet53', pretrained=trainable) 35 | 36 | # s = 32 37 | self.conv_set_3 = nn.Sequential( 38 | Conv(1024, 512, k=1), 39 | Conv(512, 1024, k=3, p=1), 40 | Conv(1024, 512, k=1), 41 | Conv(512, 
1024, k=3, p=1), 42 | Conv(1024, 512, k=1) 43 | ) 44 | self.conv_1x1_3 = Conv(512, 256, k=1) 45 | self.extra_conv_3 = Conv(512, 1024, k=3, p=1) 46 | self.pred_3 = nn.Conv2d(1024, self.num_anchors*(1 + 4 + self.num_classes), kernel_size=1) 47 | 48 | # s = 16 49 | self.conv_set_2 = nn.Sequential( 50 | Conv(768, 256, k=1), 51 | Conv(256, 512, k=3, p=1), 52 | Conv(512, 256, k=1), 53 | Conv(256, 512, k=3, p=1), 54 | Conv(512, 256, k=1) 55 | ) 56 | self.conv_1x1_2 = Conv(256, 128, k=1) 57 | self.extra_conv_2 = Conv(256, 512, k=3, p=1) 58 | self.pred_2 = nn.Conv2d(512, self.num_anchors*(1 + 4 + self.num_classes), kernel_size=1) 59 | 60 | # s = 8 61 | self.conv_set_1 = nn.Sequential( 62 | Conv(384, 128, k=1), 63 | Conv(128, 256, k=3, p=1), 64 | Conv(256, 128, k=1), 65 | Conv(128, 256, k=3, p=1), 66 | Conv(256, 128, k=1) 67 | ) 68 | self.extra_conv_1 = Conv(128, 256, k=3, p=1) 69 | self.pred_1 = nn.Conv2d(256, self.num_anchors*(1 + 4 + self.num_classes), kernel_size=1) 70 | 71 | self.init_yolo() 72 | 73 | 74 | def init_yolo(self): 75 | # Init head 76 | init_prob = 0.01 77 | bias_value = -torch.log(torch.tensor((1. - init_prob) / init_prob)) 78 | # init obj&cls pred 79 | for pred in [self.pred_1, self.pred_2, self.pred_3]: 80 | nn.init.constant_(pred.bias[..., :self.num_anchors], bias_value) 81 | nn.init.constant_(pred.bias[..., self.num_anchors : (1 + self.num_classes) * self.num_anchors], bias_value) 82 | 83 | 84 | def create_grid(self, input_size): 85 | total_grid_xy = [] 86 | total_stride = [] 87 | total_anchor_wh = [] 88 | w, h = input_size, input_size 89 | for ind, s in enumerate(self.stride): 90 | # generate grid cells 91 | ws, hs = w // s, h // s 92 | grid_y, grid_x = torch.meshgrid([torch.arange(hs), torch.arange(ws)]) 93 | grid_xy = torch.stack([grid_x, grid_y], dim=-1).float() 94 | grid_xy = grid_xy.view(1, hs*ws, 1, 2) 95 | 96 | # generate stride tensor 97 | stride_tensor = torch.ones([1, hs*ws, self.num_anchors, 2]) * s 98 | 99 | # generate anchor_wh tensor 100 | anchor_wh = self.anchor_size[ind].repeat(hs*ws, 1, 1) 101 | 102 | total_grid_xy.append(grid_xy) 103 | total_stride.append(stride_tensor) 104 | total_anchor_wh.append(anchor_wh) 105 | 106 | total_grid_xy = torch.cat(total_grid_xy, dim=1).to(self.device) 107 | total_stride = torch.cat(total_stride, dim=1).to(self.device) 108 | total_anchor_wh = torch.cat(total_anchor_wh, dim=0).to(self.device).unsqueeze(0) 109 | 110 | return total_grid_xy, total_stride, total_anchor_wh 111 | 112 | 113 | def set_grid(self, input_size): 114 | self.input_size = input_size 115 | self.grid_cell, self.stride_tensor, self.all_anchors_wh = self.create_grid(input_size) 116 | 117 | 118 | def decode_xywh(self, txtytwth_pred): 119 | """ 120 | Input: 121 | txtytwth_pred : [B, H*W, anchor_n, 4] containing [tx, ty, tw, th] 122 | Output: 123 | xywh_pred : [B, H*W*anchor_n, 4] containing [x, y, w, h] 124 | """ 125 | # b_x = sigmoid(tx) + gride_x, b_y = sigmoid(ty) + gride_y 126 | B, HW, ab_n, _ = txtytwth_pred.size() 127 | c_xy_pred = (torch.sigmoid(txtytwth_pred[..., :2]) + self.grid_cell) * self.stride_tensor 128 | # b_w = anchor_w * exp(tw), b_h = anchor_h * exp(th) 129 | b_wh_pred = torch.exp(txtytwth_pred[..., 2:]) * self.all_anchors_wh 130 | # [B, H*W, anchor_n, 4] -> [B, H*W*anchor_n, 4] 131 | xywh_pred = torch.cat([c_xy_pred, b_wh_pred], -1).view(B, HW*ab_n, 4) 132 | 133 | return xywh_pred 134 | 135 | 136 | def decode_boxes(self, txtytwth_pred): 137 | """ 138 | Input: \n 139 | txtytwth_pred : [B, H*W, anchor_n, 4] \n 140 | Output: \n 141 | x1y1x2y2_pred : 
[B, H*W*anchor_n, 4] \n 142 | """ 143 | # txtytwth -> cxcywh 144 | xywh_pred = self.decode_xywh(txtytwth_pred) 145 | 146 | # cxcywh -> x1y1x2y2 147 | x1y1x2y2_pred = torch.zeros_like(xywh_pred) 148 | x1y1_pred = xywh_pred[..., :2] - xywh_pred[..., 2:] * 0.5 149 | x2y2_pred = xywh_pred[..., :2] + xywh_pred[..., 2:] * 0.5 150 | x1y1x2y2_pred = torch.cat([x1y1_pred, x2y2_pred], dim=-1) 151 | 152 | return x1y1x2y2_pred 153 | 154 | 155 | def nms(self, dets, scores): 156 | """"Pure Python NMS baseline.""" 157 | x1 = dets[:, 0] #xmin 158 | y1 = dets[:, 1] #ymin 159 | x2 = dets[:, 2] #xmax 160 | y2 = dets[:, 3] #ymax 161 | 162 | areas = (x2 - x1) * (y2 - y1) 163 | order = scores.argsort()[::-1] 164 | 165 | keep = [] 166 | while order.size > 0: 167 | i = order[0] 168 | keep.append(i) 169 | xx1 = np.maximum(x1[i], x1[order[1:]]) 170 | yy1 = np.maximum(y1[i], y1[order[1:]]) 171 | xx2 = np.minimum(x2[i], x2[order[1:]]) 172 | yy2 = np.minimum(y2[i], y2[order[1:]]) 173 | 174 | w = np.maximum(1e-10, xx2 - xx1) 175 | h = np.maximum(1e-10, yy2 - yy1) 176 | inter = w * h 177 | 178 | # Cross Area / (bbox + particular area - Cross Area) 179 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 180 | #reserve all the boundingbox whose ovr less than thresh 181 | inds = np.where(ovr <= self.nms_thresh)[0] 182 | order = order[inds + 1] 183 | 184 | return keep 185 | 186 | 187 | def postprocess(self, bboxes, scores): 188 | """ 189 | bboxes: (HxW, 4), bsize = 1 190 | scores: (HxW, num_classes), bsize = 1 191 | """ 192 | 193 | cls_inds = np.argmax(scores, axis=1) 194 | scores = scores[(np.arange(scores.shape[0]), cls_inds)] 195 | 196 | # threshold 197 | keep = np.where(scores >= self.conf_thresh) 198 | bboxes = bboxes[keep] 199 | scores = scores[keep] 200 | cls_inds = cls_inds[keep] 201 | 202 | # NMS 203 | keep = np.zeros(len(bboxes), dtype=np.int) 204 | for i in range(self.num_classes): 205 | inds = np.where(cls_inds == i)[0] 206 | if len(inds) == 0: 207 | continue 208 | c_bboxes = bboxes[inds] 209 | c_scores = scores[inds] 210 | c_keep = self.nms(c_bboxes, c_scores) 211 | keep[inds[c_keep]] = 1 212 | 213 | keep = np.where(keep > 0) 214 | bboxes = bboxes[keep] 215 | scores = scores[keep] 216 | cls_inds = cls_inds[keep] 217 | 218 | # topk 219 | scores_sorted, scores_sorted_inds = np.sort(scores), np.argsort(scores) 220 | topk_scores, topk_scores_inds = scores_sorted[:self.topk], scores_sorted_inds[:self.topk] 221 | topk_bboxes = bboxes[topk_scores_inds] 222 | topk_cls_inds = cls_inds[topk_scores_inds] 223 | 224 | return topk_bboxes, topk_scores, topk_cls_inds 225 | 226 | 227 | @torch.no_grad() 228 | def inference(self, x): 229 | B = x.size(0) 230 | # backbone 231 | feats = self.backbone(x) 232 | c3, c4, c5 = feats['layer1'], feats['layer2'], feats['layer3'] 233 | 234 | # FPN 235 | p5 = self.conv_set_3(c5) 236 | p5_up = F.interpolate(self.conv_1x1_3(p5), scale_factor=2.0, mode='bilinear', align_corners=True) 237 | 238 | p4 = torch.cat([c4, p5_up], 1) 239 | p4 = self.conv_set_2(p4) 240 | p4_up = F.interpolate(self.conv_1x1_2(p4), scale_factor=2.0, mode='bilinear', align_corners=True) 241 | 242 | p3 = torch.cat([c3, p4_up], 1) 243 | p3 = self.conv_set_1(p3) 244 | 245 | # head 246 | # s = 32 247 | p5 = self.extra_conv_3(p5) 248 | pred_3 = self.pred_3(p5) 249 | 250 | # s = 16 251 | p4 = self.extra_conv_2(p4) 252 | pred_2 = self.pred_2(p4) 253 | 254 | # s = 8 255 | p3 = self.extra_conv_1(p3) 256 | pred_1 = self.pred_1(p3) 257 | 258 | preds = [pred_1, pred_2, pred_3] 259 | total_conf_pred = [] 260 | total_cls_pred = [] 261 
| total_reg_pred = [] 262 | for pred in preds: 263 | C = pred.size(1) 264 | 265 | # [B, anchor_n * C, H, W] -> [B, H, W, anchor_n * C] -> [B, H*W, anchor_n*C] 266 | pred = pred.permute(0, 2, 3, 1).contiguous().view(B, -1, C) 267 | 268 | # [B, H*W*anchor_n, 1] 269 | conf_pred = pred[:, :, :1 * self.num_anchors].contiguous().view(B, -1, 1) 270 | # [B, H*W*anchor_n, num_cls] 271 | cls_pred = pred[:, :, 1 * self.num_anchors : (1 + self.num_classes) * self.num_anchors].contiguous().view(B, -1, self.num_classes) 272 | # [B, H*W*anchor_n, 4] 273 | reg_pred = pred[:, :, (1 + self.num_classes) * self.num_anchors:].contiguous() 274 | 275 | total_conf_pred.append(conf_pred) 276 | total_cls_pred.append(cls_pred) 277 | total_reg_pred.append(reg_pred) 278 | 279 | conf_pred = torch.cat(total_conf_pred, dim=1) 280 | cls_pred = torch.cat(total_cls_pred, dim=1) 281 | reg_pred = torch.cat(total_reg_pred, dim=1) 282 | # decode bbox 283 | reg_pred = reg_pred.view(B, -1, self.num_anchors, 4) 284 | box_pred = self.decode_boxes(reg_pred) 285 | 286 | # batch size = 1 287 | conf_pred = conf_pred[0] 288 | cls_pred = cls_pred[0] 289 | box_pred = box_pred[0] 290 | 291 | # score 292 | scores = torch.sigmoid(conf_pred) * torch.softmax(cls_pred, dim=-1) 293 | 294 | # normalize bbox 295 | bboxes = torch.clamp(box_pred / self.input_size, 0., 1.) 296 | 297 | # to cpu 298 | scores = scores.to('cpu').numpy() 299 | bboxes = bboxes.to('cpu').numpy() 300 | 301 | # post-process 302 | bboxes, scores, cls_inds = self.postprocess(bboxes, scores) 303 | 304 | return bboxes, scores, cls_inds 305 | 306 | 307 | def forward(self, x, target=None): 308 | if not self.trainable: 309 | return self.inference(x) 310 | else: 311 | # backbone 312 | B = x.size(0) 313 | # backbone 314 | feats = self.backbone(x) 315 | c3, c4, c5 = feats['layer1'], feats['layer2'], feats['layer3'] 316 | 317 | # FPN 318 | p5 = self.conv_set_3(c5) 319 | p5_up = F.interpolate(self.conv_1x1_3(p5), scale_factor=2.0, mode='bilinear', align_corners=True) 320 | 321 | p4 = torch.cat([c4, p5_up], 1) 322 | p4 = self.conv_set_2(p4) 323 | p4_up = F.interpolate(self.conv_1x1_2(p4), scale_factor=2.0, mode='bilinear', align_corners=True) 324 | 325 | p3 = torch.cat([c3, p4_up], 1) 326 | p3 = self.conv_set_1(p3) 327 | 328 | # head 329 | # s = 32 330 | p5 = self.extra_conv_3(p5) 331 | pred_3 = self.pred_3(p5) 332 | 333 | # s = 16 334 | p4 = self.extra_conv_2(p4) 335 | pred_2 = self.pred_2(p4) 336 | 337 | # s = 8 338 | p3 = self.extra_conv_1(p3) 339 | pred_1 = self.pred_1(p3) 340 | 341 | preds = [pred_1, pred_2, pred_3] 342 | total_conf_pred = [] 343 | total_cls_pred = [] 344 | total_reg_pred = [] 345 | for pred in preds: 346 | C = pred.size(1) 347 | 348 | # [B, anchor_n * C, H, W] -> [B, H, W, anchor_n * C] -> [B, H*W, anchor_n*C] 349 | pred = pred.permute(0, 2, 3, 1).contiguous().view(B, -1, C) 350 | 351 | # [B, H*W*anchor_n, 1] 352 | conf_pred = pred[:, :, :1 * self.num_anchors].contiguous().view(B, -1, 1) 353 | # [B, H*W*anchor_n, num_cls] 354 | cls_pred = pred[:, :, 1 * self.num_anchors : (1 + self.num_classes) * self.num_anchors].contiguous().view(B, -1, self.num_classes) 355 | # [B, H*W*anchor_n, 4] 356 | reg_pred = pred[:, :, (1 + self.num_classes) * self.num_anchors:].contiguous() 357 | 358 | total_conf_pred.append(conf_pred) 359 | total_cls_pred.append(cls_pred) 360 | total_reg_pred.append(reg_pred) 361 | 362 | conf_pred = torch.cat(total_conf_pred, dim=1) 363 | cls_pred = torch.cat(total_cls_pred, dim=1) 364 | reg_pred = torch.cat(total_reg_pred, dim=1) 365 | 366 | # decode 
bbox 367 | reg_pred = reg_pred.view(B, -1, self.num_anchors, 4) 368 | x1y1x2y2_pred = (self.decode_boxes(reg_pred) / self.input_size).view(-1, 4) 369 | reg_pred = reg_pred.view(B, -1, 4) 370 | x1y1x2y2_gt = target[:, :, 7:].view(-1, 4) 371 | 372 | # set conf target 373 | iou_pred = tools.iou_score(x1y1x2y2_pred, x1y1x2y2_gt).view(B, -1, 1) 374 | gt_conf = iou_pred.clone().detach() 375 | 376 | # [obj, cls, txtytwth, scale_weight, x1y1x2y2] -> [conf, obj, cls, txtytwth, scale_weight] 377 | target = torch.cat([gt_conf, target[:, :, :7]], dim=2) 378 | 379 | # loss 380 | ( 381 | conf_loss, 382 | cls_loss, 383 | bbox_loss, 384 | iou_loss 385 | ) = tools.loss(pred_conf=conf_pred, 386 | pred_cls=cls_pred, 387 | pred_txtytwth=reg_pred, 388 | pred_iou=iou_pred, 389 | label=target 390 | ) 391 | 392 | return conf_loss, cls_loss, bbox_loss, iou_loss 393 | -------------------------------------------------------------------------------- /models/yolov3_spp.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | 6 | from utils.modules import Conv, SPP 7 | from backbone import build_backbone 8 | import tools 9 | 10 | 11 | # YOLOv3 SPP 12 | class YOLOv3Spp(nn.Module): 13 | def __init__(self, 14 | device, 15 | input_size=None, 16 | num_classes=20, 17 | trainable=False, 18 | conf_thresh=0.001, 19 | nms_thresh=0.50, 20 | anchor_size=None): 21 | super(YOLOv3Spp, self).__init__() 22 | self.device = device 23 | self.input_size = input_size 24 | self.num_classes = num_classes 25 | self.trainable = trainable 26 | self.conf_thresh = conf_thresh 27 | self.nms_thresh = nms_thresh 28 | self.stride = [8, 16, 32] 29 | self.anchor_size = torch.tensor(anchor_size).view(3, len(anchor_size) // 3, 2) 30 | self.num_anchors = self.anchor_size.size(1) 31 | 32 | self.grid_cell, self.stride_tensor, self.all_anchors_wh = self.create_grid(input_size) 33 | 34 | # backbone 35 | self.backbone = build_backbone(model_name='darknet53', pretrained=trainable) 36 | 37 | # s = 32 38 | self.conv_set_3 = nn.Sequential( 39 | SPP(), 40 | Conv(1024*4, 512, k=1), 41 | Conv(512, 1024, k=3, p=1), 42 | Conv(1024, 512, k=1), 43 | Conv(512, 1024, k=3, p=1), 44 | Conv(1024, 512, k=1) 45 | ) 46 | self.conv_1x1_3 = Conv(512, 256, k=1) 47 | self.extra_conv_3 = Conv(512, 1024, k=3, p=1) 48 | self.pred_3 = nn.Conv2d(1024, self.num_anchors*(1 + 4 + self.num_classes), kernel_size=1) 49 | 50 | # s = 16 51 | self.conv_set_2 = nn.Sequential( 52 | Conv(768, 256, k=1), 53 | Conv(256, 512, k=3, p=1), 54 | Conv(512, 256, k=1), 55 | Conv(256, 512, k=3, p=1), 56 | Conv(512, 256, k=1) 57 | ) 58 | self.conv_1x1_2 = Conv(256, 128, k=1) 59 | self.extra_conv_2 = Conv(256, 512, k=3, p=1) 60 | self.pred_2 = nn.Conv2d(512, self.num_anchors*(1 + 4 + self.num_classes), kernel_size=1) 61 | 62 | # s = 8 63 | self.conv_set_1 = nn.Sequential( 64 | Conv(384, 128, k=1), 65 | Conv(128, 256, k=3, p=1), 66 | Conv(256, 128, k=1), 67 | Conv(128, 256, k=3, p=1), 68 | Conv(256, 128, k=1) 69 | ) 70 | self.extra_conv_1 = Conv(128, 256, k=3, p=1) 71 | self.pred_1 = nn.Conv2d(256, self.num_anchors*(1 + 4 + self.num_classes), kernel_size=1) 72 | 73 | 74 | self.init_yolo() 75 | 76 | 77 | def init_yolo(self): 78 | # Init head 79 | init_prob = 0.01 80 | bias_value = -torch.log(torch.tensor((1. 
- init_prob) / init_prob)) 81 | # init obj&cls pred 82 | for pred in [self.pred_1, self.pred_2, self.pred_3]: 83 | nn.init.constant_(pred.bias[..., :self.num_anchors], bias_value) 84 | nn.init.constant_(pred.bias[..., self.num_anchors : (1 + self.num_classes) * self.num_anchors], bias_value) 85 | 86 | 87 | def create_grid(self, input_size): 88 | total_grid_xy = [] 89 | total_stride = [] 90 | total_anchor_wh = [] 91 | w, h = input_size, input_size 92 | for ind, s in enumerate(self.stride): 93 | # generate grid cells 94 | ws, hs = w // s, h // s 95 | grid_y, grid_x = torch.meshgrid([torch.arange(hs), torch.arange(ws)]) 96 | grid_xy = torch.stack([grid_x, grid_y], dim=-1).float() 97 | grid_xy = grid_xy.view(1, hs*ws, 1, 2) 98 | 99 | # generate stride tensor 100 | stride_tensor = torch.ones([1, hs*ws, self.num_anchors, 2]) * s 101 | 102 | # generate anchor_wh tensor 103 | anchor_wh = self.anchor_size[ind].repeat(hs*ws, 1, 1) 104 | 105 | total_grid_xy.append(grid_xy) 106 | total_stride.append(stride_tensor) 107 | total_anchor_wh.append(anchor_wh) 108 | 109 | total_grid_xy = torch.cat(total_grid_xy, dim=1).to(self.device) 110 | total_stride = torch.cat(total_stride, dim=1).to(self.device) 111 | total_anchor_wh = torch.cat(total_anchor_wh, dim=0).to(self.device).unsqueeze(0) 112 | 113 | return total_grid_xy, total_stride, total_anchor_wh 114 | 115 | 116 | def set_grid(self, input_size): 117 | self.input_size = input_size 118 | self.grid_cell, self.stride_tensor, self.all_anchors_wh = self.create_grid(input_size) 119 | 120 | 121 | def decode_xywh(self, txtytwth_pred): 122 | """ 123 | Input: 124 | txtytwth_pred : [B, H*W, anchor_n, 4] containing [tx, ty, tw, th] 125 | Output: 126 | xywh_pred : [B, H*W*anchor_n, 4] containing [x, y, w, h] 127 | """ 128 | # b_x = sigmoid(tx) + gride_x, b_y = sigmoid(ty) + gride_y 129 | B, HW, ab_n, _ = txtytwth_pred.size() 130 | c_xy_pred = (torch.sigmoid(txtytwth_pred[:, :, :, :2]) + self.grid_cell) * self.stride_tensor 131 | # b_w = anchor_w * exp(tw), b_h = anchor_h * exp(th) 132 | b_wh_pred = torch.exp(txtytwth_pred[:, :, :, 2:]) * self.all_anchors_wh 133 | # [B, H*W, anchor_n, 4] -> [B, H*W*anchor_n, 4] 134 | xywh_pred = torch.cat([c_xy_pred, b_wh_pred], -1).view(B, HW*ab_n, 4) 135 | 136 | return xywh_pred 137 | 138 | 139 | def decode_boxes(self, txtytwth_pred): 140 | """ 141 | Input: \n 142 | txtytwth_pred : [B, H*W, anchor_n, 4] \n 143 | Output: \n 144 | x1y1x2y2_pred : [B, H*W*anchor_n, 4] \n 145 | """ 146 | # txtytwth -> cxcywh 147 | xywh_pred = self.decode_xywh(txtytwth_pred) 148 | 149 | # cxcywh -> x1y1x2y2 150 | x1y1x2y2_pred = torch.zeros_like(xywh_pred) 151 | x1y1_pred = xywh_pred[..., :2] - xywh_pred[..., 2:] * 0.5 152 | x2y2_pred = xywh_pred[..., :2] + xywh_pred[..., 2:] * 0.5 153 | x1y1x2y2_pred = torch.cat([x1y1_pred, x2y2_pred], dim=-1) 154 | 155 | return x1y1x2y2_pred 156 | 157 | 158 | def nms(self, dets, scores): 159 | """"Pure Python NMS baseline.""" 160 | x1 = dets[:, 0] #xmin 161 | y1 = dets[:, 1] #ymin 162 | x2 = dets[:, 2] #xmax 163 | y2 = dets[:, 3] #ymax 164 | 165 | areas = (x2 - x1) * (y2 - y1) 166 | order = scores.argsort()[::-1] 167 | 168 | keep = [] 169 | while order.size > 0: 170 | i = order[0] 171 | keep.append(i) 172 | xx1 = np.maximum(x1[i], x1[order[1:]]) 173 | yy1 = np.maximum(y1[i], y1[order[1:]]) 174 | xx2 = np.minimum(x2[i], x2[order[1:]]) 175 | yy2 = np.minimum(y2[i], y2[order[1:]]) 176 | 177 | w = np.maximum(1e-10, xx2 - xx1) 178 | h = np.maximum(1e-10, yy2 - yy1) 179 | inter = w * h 180 | 181 | # Cross Area / (bbox + 
particular area - Cross Area) 182 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 183 | #reserve all the boundingbox whose ovr less than thresh 184 | inds = np.where(ovr <= self.nms_thresh)[0] 185 | order = order[inds + 1] 186 | 187 | return keep 188 | 189 | 190 | def postprocess(self, bboxes, scores): 191 | """ 192 | bboxes: (HxW, 4), bsize = 1 193 | scores: (HxW, num_classes), bsize = 1 194 | """ 195 | 196 | cls_inds = np.argmax(scores, axis=1) 197 | scores = scores[(np.arange(scores.shape[0]), cls_inds)] 198 | 199 | # threshold 200 | keep = np.where(scores >= self.conf_thresh) 201 | bboxes = bboxes[keep] 202 | scores = scores[keep] 203 | cls_inds = cls_inds[keep] 204 | 205 | # NMS 206 | keep = np.zeros(len(bboxes), dtype=np.int) 207 | for i in range(self.num_classes): 208 | inds = np.where(cls_inds == i)[0] 209 | if len(inds) == 0: 210 | continue 211 | c_bboxes = bboxes[inds] 212 | c_scores = scores[inds] 213 | c_keep = self.nms(c_bboxes, c_scores) 214 | keep[inds[c_keep]] = 1 215 | 216 | keep = np.where(keep > 0) 217 | bboxes = bboxes[keep] 218 | scores = scores[keep] 219 | cls_inds = cls_inds[keep] 220 | 221 | return bboxes, scores, cls_inds 222 | 223 | 224 | @torch.no_grad() 225 | def inference(self, x): 226 | B = x.size(0) 227 | # backbone 228 | feats = self.backbone(x) 229 | c3, c4, c5 = feats['layer1'], feats['layer2'], feats['layer3'] 230 | 231 | # FPN 232 | p5 = self.conv_set_3(c5) 233 | p5_up = F.interpolate(self.conv_1x1_3(p5), scale_factor=2.0, mode='bilinear', align_corners=True) 234 | 235 | p4 = torch.cat([c4, p5_up], 1) 236 | p4 = self.conv_set_2(p4) 237 | p4_up = F.interpolate(self.conv_1x1_2(p4), scale_factor=2.0, mode='bilinear', align_corners=True) 238 | 239 | p3 = torch.cat([c3, p4_up], 1) 240 | p3 = self.conv_set_1(p3) 241 | 242 | # head 243 | # s = 32 244 | p5 = self.extra_conv_3(p5) 245 | pred_3 = self.pred_3(p5) 246 | 247 | # s = 16 248 | p4 = self.extra_conv_2(p4) 249 | pred_2 = self.pred_2(p4) 250 | 251 | # s = 8 252 | p3 = self.extra_conv_1(p3) 253 | pred_1 = self.pred_1(p3) 254 | 255 | preds = [pred_1, pred_2, pred_3] 256 | total_conf_pred = [] 257 | total_cls_pred = [] 258 | total_reg_pred = [] 259 | for pred in preds: 260 | C = pred.size(1) 261 | 262 | # [B, anchor_n * C, H, W] -> [B, H, W, anchor_n * C] -> [B, H*W, anchor_n*C] 263 | pred = pred.permute(0, 2, 3, 1).contiguous().view(B, -1, C) 264 | 265 | # [B, H*W*anchor_n, 1] 266 | conf_pred = pred[:, :, :1 * self.num_anchors].contiguous().view(B, -1, 1) 267 | # [B, H*W*anchor_n, num_cls] 268 | cls_pred = pred[:, :, 1 * self.num_anchors : (1 + self.num_classes) * self.num_anchors].contiguous().view(B, -1, self.num_classes) 269 | # [B, H*W*anchor_n, 4] 270 | reg_pred = pred[:, :, (1 + self.num_classes) * self.num_anchors:].contiguous() 271 | 272 | total_conf_pred.append(conf_pred) 273 | total_cls_pred.append(cls_pred) 274 | total_reg_pred.append(reg_pred) 275 | 276 | conf_pred = torch.cat(total_conf_pred, dim=1) 277 | cls_pred = torch.cat(total_cls_pred, dim=1) 278 | reg_pred = torch.cat(total_reg_pred, dim=1) 279 | # decode bbox 280 | reg_pred = reg_pred.view(B, -1, self.num_anchors, 4) 281 | box_pred = self.decode_boxes(reg_pred) 282 | 283 | # batch size = 1 284 | conf_pred = conf_pred[0] 285 | cls_pred = cls_pred[0] 286 | box_pred = box_pred[0] 287 | 288 | # score 289 | scores = torch.sigmoid(conf_pred) * torch.softmax(cls_pred, dim=-1) 290 | 291 | # normalize bbox 292 | bboxes = torch.clamp(box_pred / self.input_size, 0., 1.) 
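        # Annotation (explanatory comment added for this write-up; not part of the
        # original yolov3_spp.py): at this point `scores` holds the class-conditional
        # confidence per anchor, score[c] = sigmoid(objectness) * softmax(cls_logits)[c],
        # and `bboxes` are x1y1x2y2 coordinates clamped to [0, 1] relative to the square
        # network input. Callers recover pixel coordinates on the original image by
        # multiplying with its size, e.g. in test.py: bboxes *= np.array([[w, h, w, h]]).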
293 | 294 | # to cpu 295 | scores = scores.to('cpu').numpy() 296 | bboxes = bboxes.to('cpu').numpy() 297 | 298 | # post-process 299 | bboxes, scores, cls_inds = self.postprocess(bboxes, scores) 300 | 301 | return bboxes, scores, cls_inds 302 | 303 | 304 | def forward(self, x, target=None): 305 | if not self.trainable: 306 | return self.inference(x) 307 | else: 308 | # backbone 309 | B = x.size(0) 310 | # backbone 311 | feats = self.backbone(x) 312 | c3, c4, c5 = feats['layer1'], feats['layer2'], feats['layer3'] 313 | 314 | # FPN 315 | p5 = self.conv_set_3(c5) 316 | p5_up = F.interpolate(self.conv_1x1_3(p5), scale_factor=2.0, mode='bilinear', align_corners=True) 317 | 318 | p4 = torch.cat([c4, p5_up], 1) 319 | p4 = self.conv_set_2(p4) 320 | p4_up = F.interpolate(self.conv_1x1_2(p4), scale_factor=2.0, mode='bilinear', align_corners=True) 321 | 322 | p3 = torch.cat([c3, p4_up], 1) 323 | p3 = self.conv_set_1(p3) 324 | 325 | # head 326 | # s = 32 327 | p5 = self.extra_conv_3(p5) 328 | pred_3 = self.pred_3(p5) 329 | 330 | # s = 16 331 | p4 = self.extra_conv_2(p4) 332 | pred_2 = self.pred_2(p4) 333 | 334 | # s = 8 335 | p3 = self.extra_conv_1(p3) 336 | pred_1 = self.pred_1(p3) 337 | 338 | preds = [pred_1, pred_2, pred_3] 339 | total_conf_pred = [] 340 | total_cls_pred = [] 341 | total_reg_pred = [] 342 | for pred in preds: 343 | C = pred.size(1) 344 | 345 | # [B, anchor_n * C, H, W] -> [B, H, W, anchor_n * C] -> [B, H*W, anchor_n*C] 346 | pred = pred.permute(0, 2, 3, 1).contiguous().view(B, -1, C) 347 | 348 | # [B, H*W*anchor_n, 1] 349 | conf_pred = pred[:, :, :1 * self.num_anchors].contiguous().view(B, -1, 1) 350 | # [B, H*W*anchor_n, num_cls] 351 | cls_pred = pred[:, :, 1 * self.num_anchors : (1 + self.num_classes) * self.num_anchors].contiguous().view(B, -1, self.num_classes) 352 | # [B, H*W*anchor_n, 4] 353 | reg_pred = pred[:, :, (1 + self.num_classes) * self.num_anchors:].contiguous() 354 | 355 | total_conf_pred.append(conf_pred) 356 | total_cls_pred.append(cls_pred) 357 | total_reg_pred.append(reg_pred) 358 | 359 | conf_pred = torch.cat(total_conf_pred, dim=1) 360 | cls_pred = torch.cat(total_cls_pred, dim=1) 361 | reg_pred = torch.cat(total_reg_pred, dim=1) 362 | 363 | # decode bbox 364 | reg_pred = reg_pred.view(B, -1, self.num_anchors, 4) 365 | x1y1x2y2_pred = (self.decode_boxes(reg_pred) / self.input_size).view(-1, 4) 366 | reg_pred = reg_pred.view(B, -1, 4) 367 | x1y1x2y2_gt = target[:, :, 7:].view(-1, 4) 368 | 369 | # set conf target 370 | iou_pred = tools.iou_score(x1y1x2y2_pred, x1y1x2y2_gt).view(B, -1, 1) 371 | gt_conf = iou_pred.clone().detach() 372 | 373 | # [obj, cls, txtytwth, scale_weight, x1y1x2y2] -> [conf, obj, cls, txtytwth, scale_weight] 374 | target = torch.cat([gt_conf, target[:, :, :7]], dim=2) 375 | 376 | # loss 377 | ( 378 | conf_loss, 379 | cls_loss, 380 | bbox_loss, 381 | iou_loss 382 | ) = tools.loss(pred_conf=conf_pred, 383 | pred_cls=cls_pred, 384 | pred_txtytwth=reg_pred, 385 | pred_iou=iou_pred, 386 | label=target 387 | ) 388 | 389 | return conf_loss, cls_loss, bbox_loss, iou_loss 390 | -------------------------------------------------------------------------------- /models/yolov3_tiny.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | 6 | from utils.modules import Conv 7 | from backbone import build_backbone 8 | import tools 9 | 10 | 11 | # YOLOv3 Tiny 12 | class YOLOv3tiny(nn.Module): 13 | def __init__(self, device, 
input_size=None, num_classes=20, trainable=False, conf_thresh=0.01, nms_thresh=0.50, anchor_size=None, hr=False): 14 | super(YOLOv3tiny, self).__init__() 15 | self.device = device 16 | self.input_size = input_size 17 | self.num_classes = num_classes 18 | self.trainable = trainable 19 | self.conf_thresh = conf_thresh 20 | self.nms_thresh = nms_thresh 21 | self.stride = [16, 32] 22 | self.anchor_size = torch.tensor(anchor_size).view(2, len(anchor_size) // 2, 2) 23 | self.num_anchors = self.anchor_size.size(1) 24 | 25 | self.grid_cell, self.stride_tensor, self.all_anchors_wh = self.create_grid(input_size) 26 | 27 | # backbone 28 | self.backbone = build_backbone(model_name='darknet_tiny', pretrained=trainable) 29 | 30 | # s = 32 31 | self.conv_set_2 = Conv(1024, 256, k=3, p=1) 32 | 33 | self.conv_1x1_2 = Conv(256, 128, k=1) 34 | 35 | self.extra_conv_2 = Conv(256, 512, k=3, p=1) 36 | self.pred_2 = nn.Conv2d(512, self.num_anchors*(1 + 4 + self.num_classes), kernel_size=1) 37 | 38 | # s = 16 39 | self.conv_set_1 = Conv(384, 256, k=3, p=1) 40 | self.pred_1 = nn.Conv2d(256, self.num_anchors*(1 + 4 + self.num_classes), kernel_size=1) 41 | 42 | 43 | self.init_yolo() 44 | 45 | 46 | def init_yolo(self): 47 | # Init head 48 | init_prob = 0.01 49 | bias_value = -torch.log(torch.tensor((1. - init_prob) / init_prob)) 50 | # init obj&cls pred 51 | for pred in [self.pred_1, self.pred_2, self.pred_3]: 52 | nn.init.constant_(pred.bias[..., :self.num_anchors], bias_value) 53 | nn.init.constant_(pred.bias[..., self.num_anchors : (1 + self.num_classes) * self.num_anchors], bias_value) 54 | 55 | 56 | def create_grid(self, input_size): 57 | total_grid_xy = [] 58 | total_stride = [] 59 | total_anchor_wh = [] 60 | w, h = input_size, input_size 61 | for ind, s in enumerate(self.stride): 62 | # generate grid cells 63 | ws, hs = w // s, h // s 64 | grid_y, grid_x = torch.meshgrid([torch.arange(hs), torch.arange(ws)]) 65 | grid_xy = torch.stack([grid_x, grid_y], dim=-1).float() 66 | grid_xy = grid_xy.view(1, hs*ws, 1, 2) 67 | 68 | # generate stride tensor 69 | stride_tensor = torch.ones([1, hs*ws, self.num_anchors, 2]) * s 70 | 71 | # generate anchor_wh tensor 72 | anchor_wh = self.anchor_size[ind].repeat(hs*ws, 1, 1) 73 | 74 | total_grid_xy.append(grid_xy) 75 | total_stride.append(stride_tensor) 76 | total_anchor_wh.append(anchor_wh) 77 | 78 | total_grid_xy = torch.cat(total_grid_xy, dim=1).to(self.device) 79 | total_stride = torch.cat(total_stride, dim=1).to(self.device) 80 | total_anchor_wh = torch.cat(total_anchor_wh, dim=0).to(self.device).unsqueeze(0) 81 | 82 | return total_grid_xy, total_stride, total_anchor_wh 83 | 84 | 85 | def set_grid(self, input_size): 86 | self.input_size = input_size 87 | self.grid_cell, self.stride_tensor, self.all_anchors_wh = self.create_grid(input_size) 88 | 89 | 90 | def decode_xywh(self, txtytwth_pred): 91 | """ 92 | Input: 93 | txtytwth_pred : [B, H*W, anchor_n, 4] containing [tx, ty, tw, th] 94 | Output: 95 | xywh_pred : [B, H*W*anchor_n, 4] containing [x, y, w, h] 96 | """ 97 | # b_x = sigmoid(tx) + gride_x, b_y = sigmoid(ty) + gride_y 98 | B, HW, ab_n, _ = txtytwth_pred.size() 99 | c_xy_pred = (torch.sigmoid(txtytwth_pred[:, :, :, :2]) + self.grid_cell) * self.stride_tensor 100 | # b_w = anchor_w * exp(tw), b_h = anchor_h * exp(th) 101 | b_wh_pred = torch.exp(txtytwth_pred[:, :, :, 2:]) * self.all_anchors_wh 102 | # [B, H*W, anchor_n, 4] -> [B, H*W*anchor_n, 4] 103 | xywh_pred = torch.cat([c_xy_pred, b_wh_pred], -1).view(B, HW*ab_n, 4) 104 | 105 | return xywh_pred 106 | 107 | 108 
| def decode_boxes(self, txtytwth_pred): 109 | """ 110 | Input: \n 111 | txtytwth_pred : [B, H*W, anchor_n, 4] \n 112 | Output: \n 113 | x1y1x2y2_pred : [B, H*W*anchor_n, 4] \n 114 | """ 115 | # txtytwth -> cxcywh 116 | xywh_pred = self.decode_xywh(txtytwth_pred) 117 | 118 | # cxcywh -> x1y1x2y2 119 | x1y1x2y2_pred = torch.zeros_like(xywh_pred) 120 | x1y1_pred = xywh_pred[..., :2] - xywh_pred[..., 2:] * 0.5 121 | x2y2_pred = xywh_pred[..., :2] + xywh_pred[..., 2:] * 0.5 122 | x1y1x2y2_pred = torch.cat([x1y1_pred, x2y2_pred], dim=-1) 123 | 124 | return x1y1x2y2_pred 125 | 126 | 127 | def nms(self, dets, scores): 128 | """"Pure Python NMS baseline.""" 129 | x1 = dets[:, 0] #xmin 130 | y1 = dets[:, 1] #ymin 131 | x2 = dets[:, 2] #xmax 132 | y2 = dets[:, 3] #ymax 133 | 134 | areas = (x2 - x1) * (y2 - y1) 135 | order = scores.argsort()[::-1] 136 | 137 | keep = [] 138 | while order.size > 0: 139 | i = order[0] 140 | keep.append(i) 141 | xx1 = np.maximum(x1[i], x1[order[1:]]) 142 | yy1 = np.maximum(y1[i], y1[order[1:]]) 143 | xx2 = np.minimum(x2[i], x2[order[1:]]) 144 | yy2 = np.minimum(y2[i], y2[order[1:]]) 145 | 146 | w = np.maximum(1e-10, xx2 - xx1) 147 | h = np.maximum(1e-10, yy2 - yy1) 148 | inter = w * h 149 | 150 | # Cross Area / (bbox + particular area - Cross Area) 151 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 152 | #reserve all the boundingbox whose ovr less than thresh 153 | inds = np.where(ovr <= self.nms_thresh)[0] 154 | order = order[inds + 1] 155 | 156 | return keep 157 | 158 | 159 | def postprocess(self, bboxes, scores): 160 | """ 161 | bboxes: (HxW, 4), bsize = 1 162 | scores: (HxW, num_classes), bsize = 1 163 | """ 164 | 165 | cls_inds = np.argmax(scores, axis=1) 166 | scores = scores[(np.arange(scores.shape[0]), cls_inds)] 167 | 168 | # threshold 169 | keep = np.where(scores >= self.conf_thresh) 170 | bboxes = bboxes[keep] 171 | scores = scores[keep] 172 | cls_inds = cls_inds[keep] 173 | 174 | # NMS 175 | keep = np.zeros(len(bboxes), dtype=np.int) 176 | for i in range(self.num_classes): 177 | inds = np.where(cls_inds == i)[0] 178 | if len(inds) == 0: 179 | continue 180 | c_bboxes = bboxes[inds] 181 | c_scores = scores[inds] 182 | c_keep = self.nms(c_bboxes, c_scores) 183 | keep[inds[c_keep]] = 1 184 | 185 | keep = np.where(keep > 0) 186 | bboxes = bboxes[keep] 187 | scores = scores[keep] 188 | cls_inds = cls_inds[keep] 189 | 190 | return bboxes, scores, cls_inds 191 | 192 | 193 | @torch.no_grad() 194 | def inference(self, x): 195 | B = x.size(0) 196 | # backbone 197 | feats = self.backbone(x) 198 | c4, c5 = feats['layer2'], feats['layer3'] 199 | 200 | # FPN 201 | p5 = self.conv_set_2(c5) 202 | p5_up = F.interpolate(self.conv_1x1_2(p5), scale_factor=2.0, mode='bilinear', align_corners=True) 203 | 204 | p4 = torch.cat([c4, p5_up], dim=1) 205 | p4 = self.conv_set_1(p4) 206 | 207 | # head 208 | # s = 32 209 | p5 = self.extra_conv_2(p5) 210 | pred_2 = self.pred_2(p5) 211 | 212 | # s = 16 213 | pred_1 = self.pred_1(p4) 214 | 215 | 216 | preds = [pred_1, pred_2] 217 | total_conf_pred = [] 218 | total_cls_pred = [] 219 | total_reg_pred = [] 220 | for pred in preds: 221 | C = pred.size(1) 222 | 223 | # [B, anchor_n * C, H, W] -> [B, H, W, anchor_n * C] -> [B, H*W, anchor_n*C] 224 | pred = pred.permute(0, 2, 3, 1).contiguous().view(B, -1, C) 225 | 226 | # Divide prediction to obj_pred, xywh_pred and cls_pred 227 | # [B, H*W*anchor_n, 1] 228 | conf_pred = pred[:, :, :1 * self.num_anchors].contiguous().view(B, -1, 1) 229 | # [B, H*W*anchor_n, num_cls] 230 | cls_pred = pred[:, 
:, 1 * self.num_anchors : (1 + self.num_classes) * self.num_anchors].contiguous().view(B, -1, self.num_classes) 231 | # [B, H*W*anchor_n, 4] 232 | reg_pred = pred[:, :, (1 + self.num_classes) * self.num_anchors:].contiguous() 233 | 234 | total_conf_pred.append(conf_pred) 235 | total_cls_pred.append(cls_pred) 236 | total_reg_pred.append(reg_pred) 237 | 238 | conf_pred = torch.cat(total_conf_pred, dim=1) 239 | cls_pred = torch.cat(total_cls_pred, dim=1) 240 | reg_pred = torch.cat(total_reg_pred, dim=1) 241 | # decode bbox 242 | reg_pred = reg_pred.view(B, -1, self.num_anchors, 4) 243 | box_pred = self.decode_boxes(reg_pred) 244 | 245 | # batch size = 1 246 | conf_pred = conf_pred[0] 247 | cls_pred = cls_pred[0] 248 | box_pred = box_pred[0] 249 | 250 | # score 251 | scores = torch.sigmoid(conf_pred) * torch.softmax(cls_pred, dim=-1) 252 | 253 | # normalize bbox 254 | bboxes = torch.clamp(box_pred / self.input_size, 0., 1.) 255 | 256 | # to cpu 257 | scores = scores.to('cpu').numpy() 258 | bboxes = bboxes.to('cpu').numpy() 259 | 260 | # post-process 261 | bboxes, scores, cls_inds = self.postprocess(bboxes, scores) 262 | 263 | return bboxes, scores, cls_inds 264 | 265 | 266 | def forward(self, x, target=None): 267 | if not self.trainable: 268 | return self.inference(x) 269 | else: 270 | # backbone 271 | B = x.size(0) 272 | # backbone 273 | feats = self.backbone(x) 274 | c4, c5 = feats['layer2'], feats['layer3'] 275 | 276 | # FPN 277 | p5 = self.conv_set_2(c5) 278 | p5_up = F.interpolate(self.conv_1x1_2(p5), scale_factor=2.0, mode='bilinear', align_corners=True) 279 | 280 | p4 = torch.cat([c4, p5_up], dim=1) 281 | p4 = self.conv_set_1(p4) 282 | 283 | # head 284 | # s = 32 285 | p5 = self.extra_conv_2(p5) 286 | pred_2 = self.pred_2(p5) 287 | 288 | # s = 16 289 | pred_1 = self.pred_1(p4) 290 | 291 | preds = [pred_1, pred_2] 292 | total_conf_pred = [] 293 | total_cls_pred = [] 294 | total_reg_pred = [] 295 | for pred in preds: 296 | C = pred.size(1) 297 | 298 | # [B, anchor_n * C, H, W] -> [B, H, W, anchor_n * C] -> [B, H*W, anchor_n*C] 299 | pred = pred.permute(0, 2, 3, 1).contiguous().view(B, -1, C) 300 | 301 | # Divide prediction to obj_pred, xywh_pred and cls_pred 302 | # [B, H*W*anchor_n, 1] 303 | conf_pred = pred[:, :, :1 * self.num_anchors].contiguous().view(B, -1, 1) 304 | # [B, H*W*anchor_n, num_cls] 305 | cls_pred = pred[:, :, 1 * self.num_anchors : (1 + self.num_classes) * self.num_anchors].contiguous().view(B, -1, self.num_classes) 306 | # [B, H*W*anchor_n, 4] 307 | reg_pred = pred[:, :, (1 + self.num_classes) * self.num_anchors:].contiguous() 308 | 309 | total_conf_pred.append(conf_pred) 310 | total_cls_pred.append(cls_pred) 311 | total_reg_pred.append(reg_pred) 312 | 313 | conf_pred = torch.cat(total_conf_pred, dim=1) 314 | cls_pred = torch.cat(total_cls_pred, dim=1) 315 | reg_pred = torch.cat(total_reg_pred, dim=1) 316 | 317 | # decode bbox 318 | reg_pred = reg_pred.view(B, -1, self.num_anchors, 4) 319 | x1y1x2y2_pred = (self.decode_boxes(reg_pred) / self.input_size).view(-1, 4) 320 | reg_pred = reg_pred.view(B, -1, 4) 321 | x1y1x2y2_gt = target[:, :, 7:].view(-1, 4) 322 | 323 | # set conf target 324 | iou_pred = tools.iou_score(x1y1x2y2_pred, x1y1x2y2_gt).view(B, -1, 1) 325 | gt_conf = iou_pred.clone().detach() 326 | 327 | # [obj, cls, txtytwth, scale_weight, x1y1x2y2] -> [conf, obj, cls, txtytwth, scale_weight] 328 | target = torch.cat([gt_conf, target[:, :, :7]], dim=2) 329 | 330 | # loss 331 | ( 332 | conf_loss, 333 | cls_loss, 334 | bbox_loss, 335 | iou_loss 336 | ) = 
tools.loss(pred_conf=conf_pred, 337 | pred_cls=cls_pred, 338 | pred_txtytwth=reg_pred, 339 | pred_iou=iou_pred, 340 | label=target 341 | ) 342 | 343 | return conf_loss, cls_loss, bbox_loss, iou_loss 344 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import torch 4 | import torch.backends.cudnn as cudnn 5 | from data.voc0712 import VOC_CLASSES, VOCDetection 6 | from data.coco2017 import COCODataset, coco_class_index, coco_class_labels 7 | from data import config, BaseTransform 8 | import numpy as np 9 | import cv2 10 | import time 11 | 12 | 13 | parser = argparse.ArgumentParser(description='YOLO Detection') 14 | # basic 15 | parser.add_argument('-size', '--input_size', default=416, type=int, 16 | help='input_size') 17 | parser.add_argument('--cuda', action='store_true', default=False, 18 | help='use cuda.') 19 | # model 20 | parser.add_argument('-v', '--version', default='yolo_v2', 21 | help='yolov2_d19, yolov2_r50, yolov2_slim, yolov3, yolov3_spp, yolov3_tiny') 22 | parser.add_argument('--trained_model', default='weight/', 23 | type=str, help='Trained state_dict file path to open') 24 | parser.add_argument('--conf_thresh', default=0.1, type=float, 25 | help='Confidence threshold') 26 | parser.add_argument('--nms_thresh', default=0.50, type=float, 27 | help='NMS threshold') 28 | # dataset 29 | parser.add_argument('-root', '--data_root', default='/mnt/share/ssd2/dataset', 30 | help='dataset root') 31 | parser.add_argument('-d', '--dataset', default='voc', 32 | help='voc or coco') 33 | # visualize 34 | parser.add_argument('-vs', '--visual_threshold', default=0.25, type=float, 35 | help='Final confidence threshold') 36 | parser.add_argument('--show', action='store_true', default=False, 37 | help='show the visulization results.') 38 | 39 | 40 | args = parser.parse_args() 41 | 42 | 43 | def plot_bbox_labels(img, bbox, label=None, cls_color=None, text_scale=0.4): 44 | x1, y1, x2, y2 = bbox 45 | x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2) 46 | t_size = cv2.getTextSize(label, 0, fontScale=1, thickness=2)[0] 47 | # plot bbox 48 | cv2.rectangle(img, (x1, y1), (x2, y2), cls_color, 2) 49 | 50 | if label is not None: 51 | # plot title bbox 52 | cv2.rectangle(img, (x1, y1-t_size[1]), (int(x1 + t_size[0] * text_scale), y1), cls_color, -1) 53 | # put the test on the title bbox 54 | cv2.putText(img, label, (int(x1), int(y1 - 5)), 0, text_scale, (0, 0, 0), 1, lineType=cv2.LINE_AA) 55 | 56 | return img 57 | 58 | 59 | def visualize(img, 60 | bboxes, 61 | scores, 62 | cls_inds, 63 | vis_thresh, 64 | class_colors, 65 | class_names, 66 | class_indexs=None, 67 | dataset_name='voc'): 68 | ts = 0.4 69 | for i, bbox in enumerate(bboxes): 70 | if scores[i] > vis_thresh: 71 | cls_id = int(cls_inds[i]) 72 | if dataset_name == 'coco': 73 | cls_color = class_colors[cls_id] 74 | cls_id = class_indexs[cls_id] 75 | else: 76 | cls_color = class_colors[cls_id] 77 | 78 | if len(class_names) > 1: 79 | mess = '%s: %.2f' % (class_names[cls_id], scores[i]) 80 | else: 81 | cls_color = [255, 0, 0] 82 | mess = None 83 | img = plot_bbox_labels(img, bbox, mess, cls_color, text_scale=ts) 84 | 85 | return img 86 | 87 | 88 | def test(net, 89 | device, 90 | dataset, 91 | transform, 92 | vis_thresh, 93 | class_colors=None, 94 | class_names=None, 95 | class_indexs=None, 96 | dataset_name='voc'): 97 | 98 | num_images = len(dataset) 99 | save_path = 
os.path.join('det_results/', args.dataset, args.version) 100 | os.makedirs(save_path, exist_ok=True) 101 | 102 | for index in range(num_images): 103 | print('Testing image {:d}/{:d}....'.format(index+1, num_images)) 104 | image, _ = dataset.pull_image(index) 105 | h, w, _ = image.shape 106 | scale = np.array([[w, h, w, h]]) 107 | 108 | # to tensor 109 | x = torch.from_numpy(transform(image)[0][:, :, (2, 1, 0)]).permute(2, 0, 1) 110 | x = x.unsqueeze(0).to(device) 111 | 112 | t0 = time.time() 113 | # forward 114 | bboxes, scores, cls_inds = net(x) 115 | print("detection time used ", time.time() - t0, "s") 116 | 117 | # rescale 118 | bboxes *= scale 119 | 120 | # vis detection 121 | img_processed = visualize( 122 | img=image, 123 | bboxes=bboxes, 124 | scores=scores, 125 | cls_inds=cls_inds, 126 | vis_thresh=vis_thresh, 127 | class_colors=class_colors, 128 | class_names=class_names, 129 | class_indexs=class_indexs, 130 | dataset_name=dataset_name 131 | ) 132 | if args.show: 133 | cv2.imshow('detection', img_processed) 134 | cv2.waitKey(0) 135 | # save result 136 | cv2.imwrite(os.path.join(save_path, str(index).zfill(6) +'.jpg'), img_processed) 137 | 138 | 139 | if __name__ == '__main__': 140 | # cuda 141 | if args.cuda: 142 | print('use cuda') 143 | cudnn.benchmark = True 144 | device = torch.device("cuda") 145 | else: 146 | device = torch.device("cpu") 147 | 148 | # input size 149 | input_size = args.input_size 150 | 151 | # dataset 152 | if args.dataset == 'voc': 153 | print('test on voc ...') 154 | data_dir = os.path.join(args.data_root, 'VOCdevkit') 155 | class_names = VOC_CLASSES 156 | class_indexs = None 157 | num_classes = 20 158 | dataset = VOCDetection(root=data_dir, 159 | image_sets=[('2007', 'test')]) 160 | 161 | elif args.dataset == 'coco': 162 | print('test on coco-val ...') 163 | data_dir = os.path.join(args.data_root, 'COCO') 164 | class_names = coco_class_labels 165 | class_indexs = coco_class_index 166 | num_classes = 80 167 | dataset = COCODataset( 168 | data_dir=data_dir, 169 | json_file='instances_val2017.json', 170 | name='val2017') 171 | 172 | class_colors = [(np.random.randint(255), 173 | np.random.randint(255), 174 | np.random.randint(255)) for _ in range(num_classes)] 175 | 176 | # model 177 | model_name = args.version 178 | print('Model: ', model_name) 179 | 180 | # load model and config file 181 | if model_name == 'yolov2_d19': 182 | from models.yolov2_d19 import YOLOv2D19 as yolo_net 183 | cfg = config.yolov2_d19_cfg 184 | 185 | elif model_name == 'yolov2_r50': 186 | from models.yolov2_r50 import YOLOv2R50 as yolo_net 187 | cfg = config.yolov2_r50_cfg 188 | 189 | elif model_name == 'yolov3': 190 | from models.yolov3 import YOLOv3 as yolo_net 191 | cfg = config.yolov3_d53_cfg 192 | 193 | elif model_name == 'yolov3_spp': 194 | from models.yolov3_spp import YOLOv3Spp as yolo_net 195 | cfg = config.yolov3_d53_cfg 196 | 197 | elif model_name == 'yolov3_tiny': 198 | from models.yolov3_tiny import YOLOv3tiny as yolo_net 199 | cfg = config.yolov3_tiny_cfg 200 | else: 201 | print('Unknown model name...') 202 | exit(0) 203 | 204 | # build model 205 | anchor_size = cfg['anchor_size_voc'] if args.dataset == 'voc' else cfg['anchor_size_coco'] 206 | net = yolo_net(device=device, 207 | input_size=input_size, 208 | num_classes=num_classes, 209 | trainable=False, 210 | conf_thresh=args.conf_thresh, 211 | nms_thresh=args.nms_thresh, 212 | anchor_size=anchor_size) 213 | 214 | # load weight 215 | net.load_state_dict(torch.load(args.trained_model, map_location=device)) 216 | 
net.to(device).eval() 217 | print('Finished loading model!') 218 | 219 | # evaluation 220 | test(net=net, 221 | device=device, 222 | dataset=dataset, 223 | transform=BaseTransform(input_size), 224 | vis_thresh=args.visual_threshold, 225 | class_colors=class_colors, 226 | class_names=class_names, 227 | class_indexs=class_indexs, 228 | dataset_name=args.dataset 229 | ) 230 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yjh0410/yolov2-yolov3_PyTorch/49dbf4bbcaba6bcf3f81de3ce0a85ec592f618ee/utils/__init__.py -------------------------------------------------------------------------------- /utils/augmentations.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | from numpy import random 4 | 5 | 6 | def intersect(box_a, box_b): 7 | max_xy = np.minimum(box_a[:, 2:], box_b[2:]) 8 | min_xy = np.maximum(box_a[:, :2], box_b[:2]) 9 | inter = np.clip((max_xy - min_xy), a_min=0, a_max=np.inf) 10 | return inter[:, 0] * inter[:, 1] 11 | 12 | 13 | def jaccard_numpy(box_a, box_b): 14 | """Compute the jaccard overlap of two sets of boxes. The jaccard overlap 15 | is simply the intersection over union of two boxes. 16 | E.g.: 17 | A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B) 18 | Args: 19 | box_a: Multiple bounding boxes, Shape: [num_boxes,4] 20 | box_b: Single bounding box, Shape: [4] 21 | Return: 22 | jaccard overlap: Shape: [box_a.shape[0], box_a.shape[1]] 23 | """ 24 | inter = intersect(box_a, box_b) 25 | area_a = ((box_a[:, 2]-box_a[:, 0]) * 26 | (box_a[:, 3]-box_a[:, 1])) # [A,B] 27 | area_b = ((box_b[2]-box_b[0]) * 28 | (box_b[3]-box_b[1])) # [A,B] 29 | union = area_a + area_b - inter 30 | return inter / union # [A,B] 31 | 32 | 33 | class Compose(object): 34 | """Composes several augmentations together. 35 | Args: 36 | transforms (List[Transform]): list of transforms to compose. 37 | Example: 38 | >>> augmentations.Compose([ 39 | >>> transforms.CenterCrop(10), 40 | >>> transforms.ToTensor(), 41 | >>> ]) 42 | """ 43 | 44 | def __init__(self, transforms): 45 | self.transforms = transforms 46 | 47 | def __call__(self, img, boxes=None, labels=None): 48 | for t in self.transforms: 49 | img, boxes, labels = t(img, boxes, labels) 50 | return img, boxes, labels 51 | 52 | 53 | class ConvertFromInts(object): 54 | def __call__(self, image, boxes=None, labels=None): 55 | return image.astype(np.float32), boxes, labels 56 | 57 | 58 | class Normalize(object): 59 | def __init__(self, mean=None, std=None): 60 | self.mean = np.array(mean, dtype=np.float32) 61 | self.std = np.array(std, dtype=np.float32) 62 | 63 | def __call__(self, image, boxes=None, labels=None): 64 | image = image.astype(np.float32) 65 | image /= 255. 
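        # Annotation (explanatory comment added for this write-up; not part of the
        # original augmentations.py): pixels are first scaled from [0, 255] to [0, 1];
        # the channel-wise standardization below then uses the mean/std supplied by
        # SSDAugmentation / ColorAugmentation, i.e. the ImageNet statistics written in
        # BGR order, (0.406, 0.456, 0.485) and (0.225, 0.224, 0.229), since images are
        # loaded with cv2 and therefore arrive as BGR.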
66 | image -= self.mean 67 | image /= self.std 68 | 69 | return image, boxes, labels 70 | 71 | 72 | class ToAbsoluteCoords(object): 73 | def __call__(self, image, boxes=None, labels=None): 74 | height, width, channels = image.shape 75 | boxes[:, 0] *= width 76 | boxes[:, 2] *= width 77 | boxes[:, 1] *= height 78 | boxes[:, 3] *= height 79 | 80 | return image, boxes, labels 81 | 82 | 83 | class ToPercentCoords(object): 84 | def __call__(self, image, boxes=None, labels=None): 85 | height, width, channels = image.shape 86 | boxes[:, 0] /= width 87 | boxes[:, 2] /= width 88 | boxes[:, 1] /= height 89 | boxes[:, 3] /= height 90 | 91 | return image, boxes, labels 92 | 93 | 94 | class Resize(object): 95 | def __init__(self, size=416): 96 | self.size = size 97 | 98 | def __call__(self, image, boxes=None, labels=None): 99 | image = cv2.resize(image, (self.size, self.size)) 100 | return image, boxes, labels 101 | 102 | 103 | class RandomSaturation(object): 104 | def __init__(self, lower=0.5, upper=1.5): 105 | self.lower = lower 106 | self.upper = upper 107 | assert self.upper >= self.lower, "contrast upper must be >= lower." 108 | assert self.lower >= 0, "contrast lower must be non-negative." 109 | 110 | def __call__(self, image, boxes=None, labels=None): 111 | if random.randint(2): 112 | image[:, :, 1] *= random.uniform(self.lower, self.upper) 113 | 114 | return image, boxes, labels 115 | 116 | 117 | class RandomHue(object): 118 | def __init__(self, delta=18.0): 119 | assert delta >= 0.0 and delta <= 360.0 120 | self.delta = delta 121 | 122 | def __call__(self, image, boxes=None, labels=None): 123 | if random.randint(2): 124 | image[:, :, 0] += random.uniform(-self.delta, self.delta) 125 | image[:, :, 0][image[:, :, 0] > 360.0] -= 360.0 126 | image[:, :, 0][image[:, :, 0] < 0.0] += 360.0 127 | return image, boxes, labels 128 | 129 | 130 | class RandomLightingNoise(object): 131 | def __init__(self): 132 | self.perms = ((0, 1, 2), (0, 2, 1), 133 | (1, 0, 2), (1, 2, 0), 134 | (2, 0, 1), (2, 1, 0)) 135 | 136 | def __call__(self, image, boxes=None, labels=None): 137 | if random.randint(2): 138 | swap = self.perms[random.randint(len(self.perms))] 139 | shuffle = SwapChannels(swap) # shuffle channels 140 | image = shuffle(image) 141 | return image, boxes, labels 142 | 143 | 144 | class ConvertColor(object): 145 | def __init__(self, current='BGR', transform='HSV'): 146 | self.transform = transform 147 | self.current = current 148 | 149 | def __call__(self, image, boxes=None, labels=None): 150 | if self.current == 'BGR' and self.transform == 'HSV': 151 | image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV) 152 | elif self.current == 'HSV' and self.transform == 'BGR': 153 | image = cv2.cvtColor(image, cv2.COLOR_HSV2BGR) 154 | else: 155 | raise NotImplementedError 156 | return image, boxes, labels 157 | 158 | 159 | class RandomContrast(object): 160 | def __init__(self, lower=0.5, upper=1.5): 161 | self.lower = lower 162 | self.upper = upper 163 | assert self.upper >= self.lower, "contrast upper must be >= lower." 164 | assert self.lower >= 0, "contrast lower must be non-negative." 
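    # Annotation (explanatory comment added for this write-up; not part of the
    # original augmentations.py): with probability 0.5 the whole image is multiplied
    # by a single factor alpha ~ U(lower, upper), darkening it when alpha < 1 and
    # brightening it when alpha > 1. PhotometricDistort applies this jitter either
    # before the BGR->HSV conversion or after converting back to BGR, but never both
    # in the same call.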
165 | 166 | # expects float image 167 | def __call__(self, image, boxes=None, labels=None): 168 | if random.randint(2): 169 | alpha = random.uniform(self.lower, self.upper) 170 | image *= alpha 171 | return image, boxes, labels 172 | 173 | 174 | class RandomBrightness(object): 175 | def __init__(self, delta=32): 176 | assert delta >= 0.0 177 | assert delta <= 255.0 178 | self.delta = delta 179 | 180 | def __call__(self, image, boxes=None, labels=None): 181 | if random.randint(2): 182 | delta = random.uniform(-self.delta, self.delta) 183 | image += delta 184 | return image, boxes, labels 185 | 186 | 187 | class RandomSampleCrop(object): 188 | """Crop 189 | Arguments: 190 | img (Image): the image being input during training 191 | boxes (Tensor): the original bounding boxes in pt form 192 | labels (Tensor): the class labels for each bbox 193 | mode (float tuple): the min and max jaccard overlaps 194 | Return: 195 | (img, boxes, classes) 196 | img (Image): the cropped image 197 | boxes (Tensor): the adjusted bounding boxes in pt form 198 | labels (Tensor): the class labels for each bbox 199 | """ 200 | def __init__(self): 201 | self.sample_options = ( 202 | # using entire original input image 203 | None, 204 | # sample a patch s.t. MIN jaccard w/ obj in .1,.3,.4,.7,.9 205 | (0.1, None), 206 | (0.3, None), 207 | (0.7, None), 208 | (0.9, None), 209 | # randomly sample a patch 210 | (None, None), 211 | ) 212 | 213 | def __call__(self, image, boxes=None, labels=None): 214 | height, width, _ = image.shape 215 | while True: 216 | # randomly choose a mode 217 | sample_id = np.random.randint(len(self.sample_options)) 218 | mode = self.sample_options[sample_id] 219 | if mode is None: 220 | return image, boxes, labels 221 | 222 | min_iou, max_iou = mode 223 | if min_iou is None: 224 | min_iou = float('-inf') 225 | if max_iou is None: 226 | max_iou = float('inf') 227 | 228 | # max trails (50) 229 | for _ in range(50): 230 | current_image = image 231 | 232 | w = random.uniform(0.3 * width, width) 233 | h = random.uniform(0.3 * height, height) 234 | 235 | # aspect ratio constraint b/t .5 & 2 236 | if h / w < 0.5 or h / w > 2: 237 | continue 238 | 239 | left = random.uniform(width - w) 240 | top = random.uniform(height - h) 241 | 242 | # convert to integer rect x1,y1,x2,y2 243 | rect = np.array([int(left), int(top), int(left+w), int(top+h)]) 244 | 245 | # calculate IoU (jaccard overlap) b/t the cropped and gt boxes 246 | overlap = jaccard_numpy(boxes, rect) 247 | 248 | # is min and max overlap constraint satisfied? if not try again 249 | if overlap.min() < min_iou and max_iou < overlap.max(): 250 | continue 251 | 252 | # cut the crop from the image 253 | current_image = current_image[rect[1]:rect[3], rect[0]:rect[2], 254 | :] 255 | 256 | # keep overlap with gt box IF center in sampled patch 257 | centers = (boxes[:, :2] + boxes[:, 2:]) / 2.0 258 | 259 | # mask in all gt boxes that above and to the left of centers 260 | m1 = (rect[0] < centers[:, 0]) * (rect[1] < centers[:, 1]) 261 | 262 | # mask in all gt boxes that under and to the right of centers 263 | m2 = (rect[2] > centers[:, 0]) * (rect[3] > centers[:, 1]) 264 | 265 | # mask in that both m1 and m2 are true 266 | mask = m1 * m2 267 | 268 | # have any valid boxes? 
try again if not 269 | if not mask.any(): 270 | continue 271 | 272 | # take only matching gt boxes 273 | current_boxes = boxes[mask, :].copy() 274 | 275 | # take only matching gt labels 276 | current_labels = labels[mask] 277 | 278 | # should we use the box left and top corner or the crop's 279 | current_boxes[:, :2] = np.maximum(current_boxes[:, :2], 280 | rect[:2]) 281 | # adjust to crop (by substracting crop's left,top) 282 | current_boxes[:, :2] -= rect[:2] 283 | 284 | current_boxes[:, 2:] = np.minimum(current_boxes[:, 2:], 285 | rect[2:]) 286 | # adjust to crop (by substracting crop's left,top) 287 | current_boxes[:, 2:] -= rect[:2] 288 | 289 | return current_image, current_boxes, current_labels 290 | 291 | 292 | class RandomMirror(object): 293 | def __call__(self, image, boxes, classes): 294 | _, width, _ = image.shape 295 | if random.randint(2): 296 | image = image[:, ::-1] 297 | boxes = boxes.copy() 298 | boxes[:, 0::2] = width - boxes[:, 2::-2] 299 | return image, boxes, classes 300 | 301 | 302 | class SwapChannels(object): 303 | """Transforms a tensorized image by swapping the channels in the order 304 | specified in the swap tuple. 305 | Args: 306 | swaps (int triple): final order of channels 307 | eg: (2, 1, 0) 308 | """ 309 | 310 | def __init__(self, swaps): 311 | self.swaps = swaps 312 | 313 | def __call__(self, image): 314 | """ 315 | Args: 316 | image (Tensor): image tensor to be transformed 317 | Return: 318 | a tensor with channels swapped according to swap 319 | """ 320 | # if torch.is_tensor(image): 321 | # image = image.data.cpu().numpy() 322 | # else: 323 | # image = np.array(image) 324 | image = image[:, :, self.swaps] 325 | return image 326 | 327 | 328 | class PhotometricDistort(object): 329 | def __init__(self): 330 | self.pd = [ 331 | RandomContrast(), 332 | ConvertColor(transform='HSV'), 333 | RandomSaturation(), 334 | RandomHue(), 335 | ConvertColor(current='HSV', transform='BGR'), 336 | RandomContrast() 337 | ] 338 | self.rand_brightness = RandomBrightness() 339 | # self.rand_light_noise = RandomLightingNoise() 340 | 341 | def __call__(self, image, boxes, labels): 342 | im = image.copy() 343 | im, boxes, labels = self.rand_brightness(im, boxes, labels) 344 | if random.randint(2): 345 | distort = Compose(self.pd[:-1]) 346 | else: 347 | distort = Compose(self.pd[1:]) 348 | im, boxes, labels = distort(im, boxes, labels) 349 | return im, boxes, labels 350 | # return self.rand_light_noise(im, boxes, labels) 351 | 352 | 353 | class SSDAugmentation(object): 354 | def __init__(self, size=416, mean=(0.406, 0.456, 0.485), std=(0.225, 0.224, 0.229)): 355 | self.mean = mean 356 | self.size = size 357 | self.std = std 358 | self.augment = Compose([ 359 | ConvertFromInts(), 360 | ToAbsoluteCoords(), 361 | PhotometricDistort(), 362 | RandomSampleCrop(), 363 | RandomMirror(), 364 | ToPercentCoords(), 365 | Resize(self.size), 366 | Normalize(self.mean, self.std) 367 | ]) 368 | 369 | def __call__(self, img, boxes, labels): 370 | return self.augment(img, boxes, labels) 371 | 372 | 373 | class ColorAugmentation(object): 374 | def __init__(self, size=416, mean=(0.406, 0.456, 0.485), std=(0.225, 0.224, 0.229)): 375 | self.mean = mean 376 | self.size = size 377 | self.std = std 378 | self.augment = Compose([ 379 | ConvertFromInts(), 380 | ToAbsoluteCoords(), 381 | PhotometricDistort(), 382 | RandomMirror(), 383 | ToPercentCoords(), 384 | Resize(self.size), 385 | Normalize(self.mean, self.std) 386 | ]) 387 | 388 | def __call__(self, img, boxes, labels): 389 | return self.augment(img, 
boxes, labels) 390 | -------------------------------------------------------------------------------- /utils/cocoapi_evaluator.py: -------------------------------------------------------------------------------- 1 | import json 2 | import tempfile 3 | 4 | from pycocotools.cocoeval import COCOeval 5 | from torch.autograd import Variable 6 | 7 | from data.coco2017 import * 8 | from data import * 9 | 10 | 11 | class COCOAPIEvaluator(): 12 | """ 13 | COCO AP Evaluation class. 14 | All the data in the val2017 dataset are processed \ 15 | and evaluated by COCO API. 16 | """ 17 | def __init__(self, data_dir, img_size, device, testset=False, transform=None): 18 | """ 19 | Args: 20 | data_dir (str): dataset root directory 21 | img_size (int): image size after preprocess. images are resized \ 22 | to squares whose shape is (img_size, img_size). 23 | confthre (float): 24 | confidence threshold ranging from 0 to 1, \ 25 | which is defined in the config file. 26 | nmsthre (float): 27 | IoU threshold of non-max supression ranging from 0 to 1. 28 | """ 29 | self.testset = testset 30 | if self.testset: 31 | json_file='image_info_test-dev2017.json' 32 | name = 'test2017' 33 | else: 34 | json_file='instances_val2017.json' 35 | name='val2017' 36 | 37 | self.dataset = COCODataset(data_dir=data_dir, 38 | json_file=json_file, 39 | name=name) 40 | self.img_size = img_size 41 | self.transform = transform 42 | self.device = device 43 | 44 | self.map = 0. 45 | self.ap50_95 = 0. 46 | self.ap50 = 0. 47 | 48 | def evaluate(self, model): 49 | """ 50 | COCO average precision (AP) Evaluation. Iterate inference on the test dataset 51 | and the results are evaluated by COCO API. 52 | Args: 53 | model : model object 54 | Returns: 55 | ap50_95 (float) : calculated COCO AP for IoU=50:95 56 | ap50 (float) : calculated COCO AP for IoU=50 57 | """ 58 | model.eval() 59 | ids = [] 60 | data_dict = [] 61 | num_images = len(self.dataset) 62 | print('total number of images: %d' % (num_images)) 63 | 64 | # start testing 65 | for index in range(num_images): # all the data in val2017 66 | if index % 500 == 0: 67 | print('[Eval: %d / %d]'%(index, num_images)) 68 | 69 | img, id_ = self.dataset.pull_image(index) # load a batch 70 | if self.transform is not None: 71 | x = torch.from_numpy(self.transform(img)[0][:, :, (2, 1, 0)]).permute(2, 0, 1) 72 | x = x.unsqueeze(0).to(self.device) 73 | scale = np.array([[img.shape[1], img.shape[0], 74 | img.shape[1], img.shape[0]]]) 75 | 76 | id_ = int(id_) 77 | ids.append(id_) 78 | with torch.no_grad(): 79 | outputs = model(x) 80 | bboxes, scores, cls_inds = outputs 81 | bboxes *= scale 82 | for i, box in enumerate(bboxes): 83 | x1 = float(box[0]) 84 | y1 = float(box[1]) 85 | x2 = float(box[2]) 86 | y2 = float(box[3]) 87 | label = self.dataset.class_ids[int(cls_inds[i])] 88 | 89 | bbox = [x1, y1, x2 - x1, y2 - y1] 90 | score = float(scores[i]) # object score * class score 91 | A = {"image_id": id_, "category_id": label, "bbox": bbox, 92 | "score": score} # COCO json format 93 | data_dict.append(A) 94 | 95 | annType = ['segm', 'bbox', 'keypoints'] 96 | 97 | # Evaluate the Dt (detection) json comparing with the ground truth 98 | if len(data_dict) > 0: 99 | print('evaluating ......') 100 | cocoGt = self.dataset.coco 101 | # For test 102 | if self.testset: 103 | json.dump(data_dict, open('yolov2_2017.json', 'w')) 104 | cocoDt = cocoGt.loadRes('yolov2_2017.json') 105 | print('inference on test-dev is done !!') 106 | return -1, -1 107 | # For val 108 | else: 109 | _, tmp = tempfile.mkstemp() 110 | 
json.dump(data_dict, open(tmp, 'w')) 111 | cocoDt = cocoGt.loadRes(tmp) 112 | cocoEval = COCOeval(self.dataset.coco, cocoDt, annType[1]) 113 | cocoEval.params.imgIds = ids 114 | cocoEval.evaluate() 115 | cocoEval.accumulate() 116 | cocoEval.summarize() 117 | 118 | ap50_95, ap50 = cocoEval.stats[0], cocoEval.stats[1] 119 | print('ap50_95 : ', ap50_95) 120 | print('ap50 : ', ap50) 121 | self.map = ap50_95 122 | self.ap50_95 = ap50_95 123 | self.ap50 = ap50 124 | 125 | return ap50, ap50_95 126 | else: 127 | return 0, 0 128 | 129 | -------------------------------------------------------------------------------- /utils/com_paras_flops.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from thop import profile 3 | 4 | 5 | def FLOPs_and_Params(model, size, device): 6 | x = torch.randn(1, 3, size, size).to(device) 7 | model.trainable = False 8 | model.eval() 9 | 10 | flops, params = profile(model, inputs=(x, )) 11 | print('FLOPs : ', flops / 1e9, ' B') 12 | print('Params : ', params / 1e6, ' M') 13 | 14 | model.trainable = True 15 | model.train() 16 | 17 | 18 | if __name__ == "__main__": 19 | pass 20 | -------------------------------------------------------------------------------- /utils/distributed_utils.py: -------------------------------------------------------------------------------- 1 | # from github: https://github.com/ruinmessi/ASFF/blob/master/utils/distributed_util.py 2 | 3 | import torch 4 | import torch.distributed as dist 5 | import os 6 | import subprocess 7 | import pickle 8 | 9 | 10 | def all_gather(data): 11 | """ 12 | Run all_gather on arbitrary picklable data (not necessarily tensors) 13 | Args: 14 | data: any picklable object 15 | Returns: 16 | list[data]: list of data gathered from each rank 17 | """ 18 | world_size = get_world_size() 19 | if world_size == 1: 20 | return [data] 21 | 22 | # serialized to a Tensor 23 | buffer = pickle.dumps(data) 24 | storage = torch.ByteStorage.from_buffer(buffer) 25 | tensor = torch.ByteTensor(storage).to("cuda") 26 | 27 | # obtain Tensor size of each rank 28 | local_size = torch.tensor([tensor.numel()], device="cuda") 29 | size_list = [torch.tensor([0], device="cuda") for _ in range(world_size)] 30 | dist.all_gather(size_list, local_size) 31 | size_list = [int(size.item()) for size in size_list] 32 | max_size = max(size_list) 33 | 34 | # receiving Tensor from all ranks 35 | # we pad the tensor because torch all_gather does not support 36 | # gathering tensors of different shapes 37 | tensor_list = [] 38 | for _ in size_list: 39 | tensor_list.append(torch.empty((max_size,), dtype=torch.uint8, device="cuda")) 40 | if local_size != max_size: 41 | padding = torch.empty(size=(max_size - local_size,), dtype=torch.uint8, device="cuda") 42 | tensor = torch.cat((tensor, padding), dim=0) 43 | dist.all_gather(tensor_list, tensor) 44 | 45 | data_list = [] 46 | for size, tensor in zip(size_list, tensor_list): 47 | buffer = tensor.cpu().numpy().tobytes()[:size] 48 | data_list.append(pickle.loads(buffer)) 49 | 50 | return data_list 51 | 52 | 53 | def reduce_dict(input_dict, average=True): 54 | """ 55 | Args: 56 | input_dict (dict): all the values will be reduced 57 | average (bool): whether to do average or sum 58 | Reduce the values in the dictionary from all processes so that all processes 59 | have the averaged results. Returns a dict with the same fields as 60 | input_dict, after reduction. 
61 | """ 62 | world_size = get_world_size() 63 | if world_size < 2: 64 | return input_dict 65 | with torch.no_grad(): 66 | names = [] 67 | values = [] 68 | # sort the keys so that they are consistent across processes 69 | for k in sorted(input_dict.keys()): 70 | names.append(k) 71 | values.append(input_dict[k]) 72 | values = torch.stack(values, dim=0) 73 | dist.all_reduce(values) 74 | if average: 75 | values /= world_size 76 | reduced_dict = {k: v for k, v in zip(names, values)} 77 | return reduced_dict 78 | 79 | 80 | def get_sha(): 81 | cwd = os.path.dirname(os.path.abspath(__file__)) 82 | 83 | def _run(command): 84 | return subprocess.check_output(command, cwd=cwd).decode('ascii').strip() 85 | sha = 'N/A' 86 | diff = "clean" 87 | branch = 'N/A' 88 | try: 89 | sha = _run(['git', 'rev-parse', 'HEAD']) 90 | subprocess.check_output(['git', 'diff'], cwd=cwd) 91 | diff = _run(['git', 'diff-index', 'HEAD']) 92 | diff = "has uncommited changes" if diff else "clean" 93 | branch = _run(['git', 'rev-parse', '--abbrev-ref', 'HEAD']) 94 | except Exception: 95 | pass 96 | message = f"sha: {sha}, status: {diff}, branch: {branch}" 97 | return message 98 | 99 | 100 | def setup_for_distributed(is_master): 101 | """ 102 | This function disables printing when not in master process 103 | """ 104 | import builtins as __builtin__ 105 | builtin_print = __builtin__.print 106 | 107 | def print(*args, **kwargs): 108 | force = kwargs.pop('force', False) 109 | if is_master or force: 110 | builtin_print(*args, **kwargs) 111 | 112 | __builtin__.print = print 113 | 114 | 115 | def is_dist_avail_and_initialized(): 116 | if not dist.is_available(): 117 | return False 118 | if not dist.is_initialized(): 119 | return False 120 | return True 121 | 122 | 123 | def get_world_size(): 124 | if not is_dist_avail_and_initialized(): 125 | return 1 126 | return dist.get_world_size() 127 | 128 | 129 | def get_rank(): 130 | if not is_dist_avail_and_initialized(): 131 | return 0 132 | return dist.get_rank() 133 | 134 | 135 | def is_main_process(): 136 | return get_rank() == 0 137 | 138 | 139 | def save_on_master(*args, **kwargs): 140 | if is_main_process(): 141 | torch.save(*args, **kwargs) 142 | 143 | 144 | def init_distributed_mode(args): 145 | if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ: 146 | args.rank = int(os.environ["RANK"]) 147 | args.world_size = int(os.environ['WORLD_SIZE']) 148 | args.gpu = int(os.environ['LOCAL_RANK']) 149 | elif 'SLURM_PROCID' in os.environ: 150 | args.rank = int(os.environ['SLURM_PROCID']) 151 | args.gpu = args.rank % torch.cuda.device_count() 152 | else: 153 | print('Not using distributed mode') 154 | args.distributed = False 155 | return 156 | 157 | args.distributed = True 158 | 159 | torch.cuda.set_device(args.gpu) 160 | args.dist_backend = 'nccl' 161 | print('| distributed init (rank {}): {}'.format( 162 | args.rank, args.dist_url), flush=True) 163 | torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url, 164 | world_size=args.world_size, rank=args.rank) 165 | torch.distributed.barrier() 166 | setup_for_distributed(args.rank == 0) 167 | -------------------------------------------------------------------------------- /utils/kmeans_anchor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | import argparse 4 | import os 5 | import sys 6 | sys.path.append('..') 7 | 8 | from data.voc0712 import VOCDetection 9 | from data.coco2017 import COCODataset 10 | 11 | 12 | def parse_args(): 13 
| parser = argparse.ArgumentParser(description='kmeans for anchor box') 14 | 15 | parser.add_argument('-root', '--data_root', default='/mnt/share/ssd2/dataset', 16 | help='dataset root') 17 | parser.add_argument('-d', '--dataset', default='coco', 18 | help='coco, voc.') 19 | parser.add_argument('-na', '--num_anchorbox', default=9, type=int, 20 | help='number of anchor box.') 21 | parser.add_argument('-size', '--input_size', default=416, type=int, 22 | help='input size.') 23 | parser.add_argument('--scale', action='store_true', default=False, 24 | help='divide the sizes of anchor boxes by 32 .') 25 | return parser.parse_args() 26 | 27 | args = parse_args() 28 | 29 | 30 | class Box(): 31 | def __init__(self, x, y, w, h): 32 | self.x = x 33 | self.y = y 34 | self.w = w 35 | self.h = h 36 | 37 | 38 | def iou(box1, box2): 39 | x1, y1, w1, h1 = box1.x, box1.y, box1.w, box1.h 40 | x2, y2, w2, h2 = box2.x, box2.y, box2.w, box2.h 41 | 42 | S_1 = w1 * h1 43 | S_2 = w2 * h2 44 | 45 | xmin_1, ymin_1 = x1 - w1 / 2, y1 - h1 / 2 46 | xmax_1, ymax_1 = x1 + w1 / 2, y1 + h1 / 2 47 | xmin_2, ymin_2 = x2 - w2 / 2, y2 - h2 / 2 48 | xmax_2, ymax_2 = x2 + w2 / 2, y2 + h2 / 2 49 | 50 | I_w = min(xmax_1, xmax_2) - max(xmin_1, xmin_2) 51 | I_h = min(ymax_1, ymax_2) - max(ymin_1, ymin_2) 52 | if I_w < 0 or I_h < 0: 53 | return 0 54 | I = I_w * I_h 55 | 56 | IoU = I / (S_1 + S_2 - I) 57 | 58 | return IoU 59 | 60 | 61 | def init_centroids(boxes, n_anchors): 62 | """ 63 | We use kmeans++ to initialize centroids. 64 | """ 65 | centroids = [] 66 | boxes_num = len(boxes) 67 | 68 | centroid_index = int(np.random.choice(boxes_num, 1)[0]) 69 | centroids.append(boxes[centroid_index]) 70 | print(centroids[0].w,centroids[0].h) 71 | 72 | for centroid_index in range(0, n_anchors-1): 73 | sum_distance = 0 74 | distance_thresh = 0 75 | distance_list = [] 76 | cur_sum = 0 77 | 78 | for box in boxes: 79 | min_distance = 1 80 | for centroid_i, centroid in enumerate(centroids): 81 | distance = (1 - iou(box, centroid)) 82 | if distance < min_distance: 83 | min_distance = distance 84 | sum_distance += min_distance 85 | distance_list.append(min_distance) 86 | 87 | distance_thresh = sum_distance * np.random.random() 88 | 89 | for i in range(0, boxes_num): 90 | cur_sum += distance_list[i] 91 | if cur_sum > distance_thresh: 92 | centroids.append(boxes[i]) 93 | print(boxes[i].w, boxes[i].h) 94 | break 95 | return centroids 96 | 97 | 98 | def do_kmeans(n_anchors, boxes, centroids): 99 | loss = 0 100 | groups = [] 101 | new_centroids = [] 102 | # for box in centroids: 103 | # print('box: ', box.x, box.y, box.w, box.h) 104 | # exit() 105 | for i in range(n_anchors): 106 | groups.append([]) 107 | new_centroids.append(Box(0, 0, 0, 0)) 108 | 109 | for box in boxes: 110 | min_distance = 1 111 | group_index = 0 112 | for centroid_index, centroid in enumerate(centroids): 113 | distance = (1 - iou(box, centroid)) 114 | if distance < min_distance: 115 | min_distance = distance 116 | group_index = centroid_index 117 | groups[group_index].append(box) 118 | loss += min_distance 119 | new_centroids[group_index].w += box.w 120 | new_centroids[group_index].h += box.h 121 | 122 | for i in range(n_anchors): 123 | new_centroids[i].w /= max(len(groups[i]), 1) 124 | new_centroids[i].h /= max(len(groups[i]), 1) 125 | 126 | return new_centroids, groups, loss# / len(boxes) 127 | 128 | 129 | def anchor_box_kmeans(total_gt_boxes, n_anchors, loss_convergence, iters, plus=True): 130 | """ 131 | This function will use k-means to get appropriate anchor boxes for train 
dataset. 132 | Input: 133 | total_gt_boxes: 134 | n_anchor : int -> the number of anchor boxes. 135 | loss_convergence : float -> threshold of iterating convergence. 136 | iters: int -> the number of iterations for training kmeans. 137 | Output: anchor_boxes : list -> [[w1, h1], [w2, h2], ..., [wn, hn]]. 138 | """ 139 | boxes = total_gt_boxes 140 | centroids = [] 141 | if plus: 142 | centroids = init_centroids(boxes, n_anchors) 143 | else: 144 | total_indexs = range(len(boxes)) 145 | sample_indexs = random.sample(total_indexs, n_anchors) 146 | for i in sample_indexs: 147 | centroids.append(boxes[i]) 148 | 149 | # iterate k-means 150 | centroids, groups, old_loss = do_kmeans(n_anchors, boxes, centroids) 151 | iterations = 1 152 | while(True): 153 | centroids, groups, loss = do_kmeans(n_anchors, boxes, centroids) 154 | iterations += 1 155 | print("Loss = %f" % loss) 156 | if abs(old_loss - loss) < loss_convergence or iterations > iters: 157 | break 158 | old_loss = loss 159 | 160 | for centroid in centroids: 161 | print(centroid.w, centroid.h) 162 | 163 | print("k-means result : ") 164 | for centroid in centroids: 165 | if args.scale: 166 | print("w, h: ", round(centroid.w / 32., 2), round(centroid.h / 32., 2), 167 | "area: ", round(centroid.w / 32., 2) * round(centroid.h / 32., 2)) 168 | else: 169 | print("w, h: ", round(centroid.w, 2), round(centroid.h, 2), 170 | "area: ", round(centroid.w, 2) * round(centroid.h, 2)) 171 | 172 | return centroids 173 | 174 | 175 | if __name__ == "__main__": 176 | 177 | n_anchors = args.num_anchorbox 178 | img_size = args.img_size 179 | dataset = args.dataset 180 | 181 | loss_convergence = 1e-6 182 | iters_n = 1000 183 | 184 | dataset_voc = VOCDetection(data_dir=os.path.join(args.root, 'VOCdevkit'), 185 | img_size=img_size) 186 | 187 | dataset_coco = COCODataset(data_dir=os.path.join(args.root, 'COCO'), 188 | img_size=img_size) 189 | 190 | boxes = [] 191 | print("The dataset size: ", len(dataset)) 192 | print("Loading the dataset ...") 193 | # VOC 194 | for i in range(len(dataset_voc)): 195 | if i % 5000 == 0: 196 | print('Loading voc data [%d / %d]' % (i+1, len(dataset_voc))) 197 | 198 | # For VOC 199 | img, _ = dataset_voc.pull_image(i) 200 | w, h = img.shape[1], img.shape[0] 201 | _, annotation = dataset_voc.pull_anno(i) 202 | 203 | # prepare bbox datas 204 | for box_and_label in annotation: 205 | box = box_and_label[:-1] 206 | xmin, ymin, xmax, ymax = box 207 | bw = (xmax - xmin) / w * img_size 208 | bh = (ymax - ymin) / h * img_size 209 | # check bbox 210 | if bw < 1.0 or bh < 1.0: 211 | continue 212 | boxes.append(Box(0, 0, bw, bh)) 213 | 214 | # COCO 215 | for i in range(len(dataset_coco)): 216 | if i % 5000 == 0: 217 | print('Loading coco datat [%d / %d]' % (i+1, len(dataset_coco))) 218 | 219 | # For COCO 220 | img, _ = dataset_coco.pull_image(i) 221 | w, h = img.shape[1], img.shape[0] 222 | annotation = dataset_coco.pull_anno(i) 223 | 224 | # prepare bbox datas 225 | for box_and_label in annotation: 226 | box = box_and_label[:-1] 227 | xmin, ymin, xmax, ymax = box 228 | bw = (xmax - xmin) / w * img_size 229 | bh = (ymax - ymin) / h * img_size 230 | # check bbox 231 | if bw < 1.0 or bh < 1.0: 232 | continue 233 | boxes.append(Box(0, 0, bw, bh)) 234 | 235 | print("Number of all bboxes: ", len(boxes)) 236 | print("Start k-means !") 237 | centroids = anchor_box_kmeans(boxes, n_anchors, loss_convergence, iters_n, plus=True) 238 | -------------------------------------------------------------------------------- /utils/modules.py: 
-------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | from copy import deepcopy 5 | 6 | 7 | class Conv(nn.Module): 8 | def __init__(self, in_ch, out_ch, k=1, p=0, s=1, d=1, g=1, act=True): 9 | super(Conv, self).__init__() 10 | if act: 11 | self.convs = nn.Sequential( 12 | nn.Conv2d(in_ch, out_ch, k, stride=s, padding=p, dilation=d, groups=g), 13 | nn.BatchNorm2d(out_ch), 14 | nn.LeakyReLU(0.1, inplace=True) 15 | ) 16 | else: 17 | self.convs = nn.Sequential( 18 | nn.Conv2d(in_ch, out_ch, k, stride=s, padding=p, dilation=d, groups=g), 19 | nn.BatchNorm2d(out_ch) 20 | ) 21 | 22 | def forward(self, x): 23 | return self.convs(x) 24 | 25 | 26 | class UpSample(nn.Module): 27 | def __init__(self, size=None, scale_factor=None, mode='nearest', align_corner=None): 28 | super(UpSample, self).__init__() 29 | self.size = size 30 | self.scale_factor = scale_factor 31 | self.mode = mode 32 | self.align_corner = align_corner 33 | 34 | def forward(self, x): 35 | return torch.nn.functional.interpolate(x, size=self.size, scale_factor=self.scale_factor, 36 | mode=self.mode, align_corners=self.align_corner) 37 | 38 | 39 | class reorg_layer(nn.Module): 40 | def __init__(self, stride): 41 | super(reorg_layer, self).__init__() 42 | self.stride = stride 43 | 44 | def forward(self, x): 45 | batch_size, channels, height, width = x.size() 46 | _height, _width = height // self.stride, width // self.stride 47 | 48 | x = x.view(batch_size, channels, _height, self.stride, _width, self.stride).transpose(3, 4).contiguous() 49 | x = x.view(batch_size, channels, _height * _width, self.stride * self.stride).transpose(2, 3).contiguous() 50 | x = x.view(batch_size, channels, self.stride * self.stride, _height, _width).transpose(1, 2).contiguous() 51 | x = x.view(batch_size, -1, _height, _width) 52 | 53 | return x 54 | 55 | 56 | class SPP(nn.Module): 57 | """ 58 | Spatial Pyramid Pooling 59 | """ 60 | def __init__(self): 61 | super(SPP, self).__init__() 62 | 63 | def forward(self, x): 64 | x_1 = torch.nn.functional.max_pool2d(x, 5, stride=1, padding=2) 65 | x_2 = torch.nn.functional.max_pool2d(x, 9, stride=1, padding=4) 66 | x_3 = torch.nn.functional.max_pool2d(x, 13, stride=1, padding=6) 67 | x = torch.cat([x, x_1, x_2, x_3], dim=1) 68 | 69 | return x 70 | 71 | 72 | class ModelEMA(object): 73 | def __init__(self, model, decay=0.9999, updates=0): 74 | # create EMA 75 | self.ema = deepcopy(model).eval() 76 | self.updates = updates 77 | self.decay = lambda x: decay * (1 - math.exp(-x / 2000.)) 78 | for p in self.ema.parameters(): 79 | p.requires_grad_(False) 80 | 81 | def update(self, model): 82 | # Update EMA parameters 83 | with torch.no_grad(): 84 | self.updates += 1 85 | d = self.decay(self.updates) 86 | 87 | msd = model.state_dict() 88 | for k, v in self.ema.state_dict().items(): 89 | if v.dtype.is_floating_point: 90 | v *= d 91 | v += (1. 
- d) * msd[k].detach() 92 | -------------------------------------------------------------------------------- /utils/vocapi_evaluator.py: -------------------------------------------------------------------------------- 1 | """Adapted from: 2 | @longcw faster_rcnn_pytorch: https://github.com/longcw/faster_rcnn_pytorch 3 | @rbgirshick py-faster-rcnn https://github.com/rbgirshick/py-faster-rcnn 4 | Licensed under The MIT License [see LICENSE for details] 5 | """ 6 | 7 | from torch.autograd import Variable 8 | from data.voc0712 import VOCDetection, VOC_CLASSES 9 | import sys 10 | import os 11 | import time 12 | import numpy as np 13 | import pickle 14 | import xml.etree.ElementTree as ET 15 | 16 | 17 | class VOCAPIEvaluator(): 18 | """ VOC AP Evaluation class """ 19 | def __init__(self, data_root, img_size, device, transform, set_type='test', year='2007', display=False): 20 | self.data_root = data_root 21 | self.img_size = img_size 22 | self.device = device 23 | self.transform = transform 24 | self.labelmap = VOC_CLASSES 25 | self.set_type = set_type 26 | self.year = year 27 | self.display = display 28 | 29 | # path 30 | self.devkit_path = data_root + 'VOC' + year 31 | self.annopath = os.path.join(data_root, 'VOC2007', 'Annotations', '%s.xml') 32 | self.imgpath = os.path.join(data_root, 'VOC2007', 'JPEGImages', '%s.jpg') 33 | self.imgsetpath = os.path.join(data_root, 'VOC2007', 'ImageSets', 'Main', set_type+'.txt') 34 | self.output_dir = self.get_output_dir('voc_eval/', self.set_type) 35 | 36 | # dataset 37 | self.dataset = VOCDetection(data_dir=data_root, 38 | image_sets=[('2007', set_type)], 39 | transform=transform 40 | ) 41 | 42 | def evaluate(self, net): 43 | net.eval() 44 | num_images = len(self.dataset) 45 | # all detections are collected into: 46 | # all_boxes[cls][image] = N x 5 array of detections in 47 | # (x1, y1, x2, y2, score) 48 | self.all_boxes = [[[] for _ in range(num_images)] 49 | for _ in range(len(self.labelmap))] 50 | 51 | # timers 52 | det_file = os.path.join(self.output_dir, 'detections.pkl') 53 | 54 | for i in range(num_images): 55 | im, gt, h, w = self.dataset.pull_item(i) 56 | 57 | x = Variable(im.unsqueeze(0)).to(self.device) 58 | t0 = time.time() 59 | # forward 60 | bboxes, scores, cls_inds = net(x) 61 | detect_time = time.time() - t0 62 | scale = np.array([[w, h, w, h]]) 63 | bboxes *= scale 64 | 65 | for j in range(len(self.labelmap)): 66 | inds = np.where(cls_inds == j)[0] 67 | if len(inds) == 0: 68 | self.all_boxes[j][i] = np.empty([0, 5], dtype=np.float32) 69 | continue 70 | c_bboxes = bboxes[inds] 71 | c_scores = scores[inds] 72 | c_dets = np.hstack((c_bboxes, 73 | c_scores[:, np.newaxis])).astype(np.float32, 74 | copy=False) 75 | self.all_boxes[j][i] = c_dets 76 | 77 | if i % 500 == 0: 78 | print('im_detect: {:d}/{:d} {:.3f}s'.format(i + 1, num_images, detect_time)) 79 | 80 | with open(det_file, 'wb') as f: 81 | pickle.dump(self.all_boxes, f, pickle.HIGHEST_PROTOCOL) 82 | 83 | print('Evaluating detections') 84 | self.evaluate_detections(self.all_boxes) 85 | 86 | print('Mean AP: ', self.map) 87 | 88 | 89 | def parse_rec(self, filename): 90 | """ Parse a PASCAL VOC xml file """ 91 | tree = ET.parse(filename) 92 | objects = [] 93 | for obj in tree.findall('object'): 94 | obj_struct = {} 95 | obj_struct['name'] = obj.find('name').text 96 | obj_struct['pose'] = obj.find('pose').text 97 | obj_struct['truncated'] = int(obj.find('truncated').text) 98 | obj_struct['difficult'] = int(obj.find('difficult').text) 99 | bbox = obj.find('bndbox') 100 | obj_struct['bbox'] = 
[int(bbox.find('xmin').text), 101 | int(bbox.find('ymin').text), 102 | int(bbox.find('xmax').text), 103 | int(bbox.find('ymax').text)] 104 | objects.append(obj_struct) 105 | 106 | return objects 107 | 108 | 109 | def get_output_dir(self, name, phase): 110 | """Return the directory where experimental artifacts are placed. 111 | If the directory does not exist, it is created. 112 | A canonical path is built using the name from an imdb and a network 113 | (if not None). 114 | """ 115 | filedir = os.path.join(name, phase) 116 | if not os.path.exists(filedir): 117 | os.makedirs(filedir) 118 | return filedir 119 | 120 | 121 | def get_voc_results_file_template(self, cls): 122 | # VOCdevkit/VOC2007/results/det_test_aeroplane.txt 123 | filename = 'det_' + self.set_type + '_%s.txt' % (cls) 124 | filedir = os.path.join(self.devkit_path, 'results') 125 | if not os.path.exists(filedir): 126 | os.makedirs(filedir) 127 | path = os.path.join(filedir, filename) 128 | return path 129 | 130 | 131 | def write_voc_results_file(self, all_boxes): 132 | for cls_ind, cls in enumerate(self.labelmap): 133 | if self.display: 134 | print('Writing {:s} VOC results file'.format(cls)) 135 | filename = self.get_voc_results_file_template(cls) 136 | with open(filename, 'wt') as f: 137 | for im_ind, index in enumerate(self.dataset.ids): 138 | dets = all_boxes[cls_ind][im_ind] 139 | if dets == []: 140 | continue 141 | # the VOCdevkit expects 1-based indices 142 | for k in range(dets.shape[0]): 143 | f.write('{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}\n'. 144 | format(index[1], dets[k, -1], 145 | dets[k, 0] + 1, dets[k, 1] + 1, 146 | dets[k, 2] + 1, dets[k, 3] + 1)) 147 | 148 | 149 | def do_python_eval(self, use_07=True): 150 | cachedir = os.path.join(self.devkit_path, 'annotations_cache') 151 | aps = [] 152 | # The PASCAL VOC metric changed in 2010 153 | use_07_metric = use_07 154 | print('VOC07 metric? ' + ('Yes' if use_07_metric else 'No')) 155 | if not os.path.isdir(self.output_dir): 156 | os.mkdir(self.output_dir) 157 | for i, cls in enumerate(self.labelmap): 158 | filename = self.get_voc_results_file_template(cls) 159 | rec, prec, ap = self.voc_eval(detpath=filename, 160 | classname=cls, 161 | cachedir=cachedir, 162 | ovthresh=0.5, 163 | use_07_metric=use_07_metric 164 | ) 165 | aps += [ap] 166 | print('AP for {} = {:.4f}'.format(cls, ap)) 167 | with open(os.path.join(self.output_dir, cls + '_pr.pkl'), 'wb') as f: 168 | pickle.dump({'rec': rec, 'prec': prec, 'ap': ap}, f) 169 | if self.display: 170 | self.map = np.mean(aps) 171 | print('Mean AP = {:.4f}'.format(np.mean(aps))) 172 | print('~~~~~~~~') 173 | print('Results:') 174 | for ap in aps: 175 | print('{:.3f}'.format(ap)) 176 | print('{:.3f}'.format(np.mean(aps))) 177 | print('~~~~~~~~') 178 | print('') 179 | print('--------------------------------------------------------------') 180 | print('Results computed with the **unofficial** Python eval code.') 181 | print('Results should be very close to the official MATLAB eval code.') 182 | print('--------------------------------------------------------------') 183 | else: 184 | self.map = np.mean(aps) 185 | print('Mean AP = {:.4f}'.format(np.mean(aps))) 186 | 187 | 188 | def voc_ap(self, rec, prec, use_07_metric=True): 189 | """ ap = voc_ap(rec, prec, [use_07_metric]) 190 | Compute VOC AP given precision and recall. 191 | If use_07_metric is true, uses the 192 | VOC 07 11 point method (default:True). 193 | """ 194 | if use_07_metric: 195 | # 11 point metric 196 | ap = 0. 
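            # 11-point interpolation: for each recall threshold t in
            # {0.0, 0.1, ..., 1.0}, take the best precision among all
            # operating points with recall >= t, then average the 11 values
            # (the pre-2010 PASCAL VOC convention implemented below).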
197 | for t in np.arange(0., 1.1, 0.1): 198 | if np.sum(rec >= t) == 0: 199 | p = 0 200 | else: 201 | p = np.max(prec[rec >= t]) 202 | ap = ap + p / 11. 203 | else: 204 | # correct AP calculation 205 | # first append sentinel values at the end 206 | mrec = np.concatenate(([0.], rec, [1.])) 207 | mpre = np.concatenate(([0.], prec, [0.])) 208 | 209 | # compute the precision envelope 210 | for i in range(mpre.size - 1, 0, -1): 211 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 212 | 213 | # to calculate area under PR curve, look for points 214 | # where X axis (recall) changes value 215 | i = np.where(mrec[1:] != mrec[:-1])[0] 216 | 217 | # and sum (\Delta recall) * prec 218 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 219 | return ap 220 | 221 | 222 | def voc_eval(self, detpath, classname, cachedir, ovthresh=0.5, use_07_metric=True): 223 | if not os.path.isdir(cachedir): 224 | os.mkdir(cachedir) 225 | cachefile = os.path.join(cachedir, 'annots.pkl') 226 | # read list of images 227 | with open(self.imgsetpath, 'r') as f: 228 | lines = f.readlines() 229 | imagenames = [x.strip() for x in lines] 230 | if not os.path.isfile(cachefile): 231 | # load annots 232 | recs = {} 233 | for i, imagename in enumerate(imagenames): 234 | recs[imagename] = self.parse_rec(self.annopath % (imagename)) 235 | if i % 100 == 0 and self.display: 236 | print('Reading annotation for {:d}/{:d}'.format( 237 | i + 1, len(imagenames))) 238 | # save 239 | if self.display: 240 | print('Saving cached annotations to {:s}'.format(cachefile)) 241 | with open(cachefile, 'wb') as f: 242 | pickle.dump(recs, f) 243 | else: 244 | # load 245 | with open(cachefile, 'rb') as f: 246 | recs = pickle.load(f) 247 | 248 | # extract gt objects for this class 249 | class_recs = {} 250 | npos = 0 251 | for imagename in imagenames: 252 | R = [obj for obj in recs[imagename] if obj['name'] == classname] 253 | bbox = np.array([x['bbox'] for x in R]) 254 | difficult = np.array([x['difficult'] for x in R]).astype(np.bool) 255 | det = [False] * len(R) 256 | npos = npos + sum(~difficult) 257 | class_recs[imagename] = {'bbox': bbox, 258 | 'difficult': difficult, 259 | 'det': det} 260 | 261 | # read dets 262 | detfile = detpath.format(classname) 263 | with open(detfile, 'r') as f: 264 | lines = f.readlines() 265 | if any(lines) == 1: 266 | 267 | splitlines = [x.strip().split(' ') for x in lines] 268 | image_ids = [x[0] for x in splitlines] 269 | confidence = np.array([float(x[1]) for x in splitlines]) 270 | BB = np.array([[float(z) for z in x[2:]] for x in splitlines]) 271 | 272 | # sort by confidence 273 | sorted_ind = np.argsort(-confidence) 274 | sorted_scores = np.sort(-confidence) 275 | BB = BB[sorted_ind, :] 276 | image_ids = [image_ids[x] for x in sorted_ind] 277 | 278 | # go down dets and mark TPs and FPs 279 | nd = len(image_ids) 280 | tp = np.zeros(nd) 281 | fp = np.zeros(nd) 282 | for d in range(nd): 283 | R = class_recs[image_ids[d]] 284 | bb = BB[d, :].astype(float) 285 | ovmax = -np.inf 286 | BBGT = R['bbox'].astype(float) 287 | if BBGT.size > 0: 288 | # compute overlaps 289 | # intersection 290 | ixmin = np.maximum(BBGT[:, 0], bb[0]) 291 | iymin = np.maximum(BBGT[:, 1], bb[1]) 292 | ixmax = np.minimum(BBGT[:, 2], bb[2]) 293 | iymax = np.minimum(BBGT[:, 3], bb[3]) 294 | iw = np.maximum(ixmax - ixmin, 0.) 295 | ih = np.maximum(iymax - iymin, 0.) 
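                    # Clamping the intersection width/height at zero means
                    # non-overlapping ground-truth boxes contribute no area;
                    # the union below is area(det) + area(gt) - intersection,
                    # so overlaps holds one IoU per ground-truth box.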
296 | inters = iw * ih 297 | uni = ((bb[2] - bb[0]) * (bb[3] - bb[1]) + 298 | (BBGT[:, 2] - BBGT[:, 0]) * 299 | (BBGT[:, 3] - BBGT[:, 1]) - inters) 300 | overlaps = inters / uni 301 | ovmax = np.max(overlaps) 302 | jmax = np.argmax(overlaps) 303 | 304 | if ovmax > ovthresh: 305 | if not R['difficult'][jmax]: 306 | if not R['det'][jmax]: 307 | tp[d] = 1. 308 | R['det'][jmax] = 1 309 | else: 310 | fp[d] = 1. 311 | else: 312 | fp[d] = 1. 313 | 314 | # compute precision recall 315 | fp = np.cumsum(fp) 316 | tp = np.cumsum(tp) 317 | rec = tp / float(npos) 318 | # avoid divide by zero in case the first detection matches a difficult 319 | # ground truth 320 | prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) 321 | ap = self.voc_ap(rec, prec, use_07_metric) 322 | else: 323 | rec = -1. 324 | prec = -1. 325 | ap = -1. 326 | 327 | return rec, prec, ap 328 | 329 | 330 | def evaluate_detections(self, box_list): 331 | self.write_voc_results_file(box_list) 332 | self.do_python_eval() 333 | 334 | 335 | if __name__ == '__main__': 336 | pass -------------------------------------------------------------------------------- /weights/README.md: -------------------------------------------------------------------------------- 1 | # yolo-v2-v3 and tiny model 2 | Hi, guys ! 3 | 4 | For researchers in China, you can download them from BaiduYunDisk. 5 | There are 5 models including yolo-v2, yolo-v3, yolo_v3_spp, slim-yolo-v2 and tiny-yolo-v3. 6 | 7 | The link is as following: 8 | 9 | link: https://pan.baidu.com/s/1rnmM8HGFzE2NTv6AkljJdg 10 | 11 | password: 5c8h 12 | 13 | 14 | 15 | I will upload all models to googledrive. --------------------------------------------------------------------------------
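As a closing usage sketch: the helpers shown above are typically wired together as below. The import paths and class names come from the files in this dump (utils/modules.py, utils/vocapi_evaluator.py), but the model constructor, transform, dataloader, optimizer and data paths are placeholders, so treat this as an assumption-laden outline rather than the project's actual train.py.

import torch
from utils.modules import ModelEMA, reorg_layer, SPP
from utils.vocapi_evaluator import VOCAPIEvaluator

# reorg_layer trades spatial resolution for channels: (B, C, H, W) ->
# (B, C*s*s, H/s, W/s); SPP concatenates three max-pool scales with the
# input, multiplying the channel count by four.
x = torch.randn(1, 64, 26, 26)
print(reorg_layer(stride=2)(x).shape)   # torch.Size([1, 256, 13, 13])
print(SPP()(x).shape)                   # torch.Size([1, 256, 26, 26])

# Hypothetical training loop showing where ModelEMA and the VOC evaluator
# plug in; build_model, transform, dataloader, optimizer and max_epoch are
# placeholders, not names defined in this repository.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = build_model().to(device)        # placeholder constructor
ema = ModelEMA(model, decay=0.9999)

evaluator = VOCAPIEvaluator(data_root='data/VOCdevkit/',   # illustrative path
                            img_size=416,
                            device=device,
                            transform=transform,           # e.g. resize + normalize
                            set_type='test')

for epoch in range(max_epoch):
    for images, targets in dataloader:
        loss = model(images.to(device), targets)   # placeholder forward signature
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        ema.update(model)     # keep the shadow weights in sync after every step

    # evaluate the smoothed EMA weights rather than the raw ones
    evaluator.evaluate(ema.ema)
    print('epoch %d, mAP: %.4f' % (epoch, evaluator.map))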