├── .gitignore ├── License.txt ├── README.md ├── Use_yolov4_to_train_your_own_data.md ├── cfg.py ├── cfg ├── yolov3-tiny.cfg ├── yolov3.cfg ├── yolov4-custom.cfg └── yolov4.cfg ├── data ├── coco.names ├── dog.jpg ├── giraffe.jpg ├── prediction.jpg └── voc.names ├── dataset.py ├── demo.py ├── demo_onnx.py ├── demo_tensorflow.py ├── evaluate_on_coco.py ├── models.py ├── requirements.txt ├── tool ├── __init__.py ├── camera.py ├── coco_annotation.py ├── config.py ├── darknet2onnx.py ├── darknet2pytorch.py ├── onnx2tensorflow.py ├── region_loss.py ├── utils.py └── yolo_layer.py └── train.py /.gitignore: -------------------------------------------------------------------------------- 1 | ttest 2 | *.weights 3 | *.pth 4 | *.onnx 5 | 6 | __pycache__ 7 | .idea 8 | .vscode 9 | runs 10 | log 11 | 12 | predictions.jpg 13 | predictions_onnx.jpg 14 | -------------------------------------------------------------------------------- /License.txt: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Pytorch-YOLOv4 2 | 3 | ![](https://img.shields.io/static/v1?label=python&message=3.6|3.7&color=blue) 4 | ![](https://img.shields.io/static/v1?label=pytorch&message=1.4&color=) 5 | [![](https://img.shields.io/static/v1?label=license&message=Apache2&color=green)](./License.txt) 6 | 7 | A minimal PyTorch implementation of YOLOv4. 
8 | - Paper YOLOv4: https://arxiv.org/abs/2004.10934
9 | - Source code: https://github.com/AlexeyAB/darknet
10 | - More details: http://pjreddie.com/darknet/yolo/
11 | 
12 | 
13 | - [x] Inference
14 | - [x] Train
15 | - [x] Mosaic
16 | 
17 | ```
18 | ├── README.md
19 | ├── dataset.py dataset
20 | ├── demo.py demo to run pytorch --> tool/darknet2pytorch
21 | ├── darknet2onnx.py tool to convert into onnx --> tool/darknet2pytorch
22 | ├── demo_onnx.py demo to run the converted onnx model
23 | ├── models.py model for pytorch
24 | ├── train.py train models.py
25 | ├── cfg.py cfg.py for train
26 | ├── cfg cfg --> darknet2pytorch
27 | ├── data
28 | ├── weight --> darknet2pytorch
29 | ├── tool
30 | │   ├── camera.py a demo camera
31 | │   ├── coco_annotation.py coco dataset generator
32 | │   ├── config.py
33 | │   ├── darknet2pytorch.py
34 | │   ├── region_loss.py
35 | │   ├── utils.py
36 | │   └── yolo_layer.py
37 | ```
38 | 
39 | ![image](https://user-gold-cdn.xitu.io/2020/4/26/171b5a6c8b3bd513?w=768&h=576&f=jpeg&s=78882)
40 | 
41 | # 0.Weight
42 | 
43 | ## 0.1 darknet
44 | - baidu(https://pan.baidu.com/s/1dAGEW8cm-dqK14TbhhVetA Extraction code:dm5b)
45 | - google(https://drive.google.com/open?id=1cewMfusmPjYWbrnuJRuKhPMwRe_b9PaT)
46 | 
47 | ## 0.2 pytorch
48 | You can use darknet2pytorch to convert it yourself, or download my converted model.
49 | 
50 | - baidu
51 |   - yolov4.pth(https://pan.baidu.com/s/1ZroDvoGScDgtE1ja_QqJVw Extraction code:xrq9)
52 |   - yolov4.conv.137.pth(https://pan.baidu.com/s/1ovBie4YyVQQoUrC3AY0joA Extraction code:kcel)
53 | - google
54 |   - yolov4.pth(https://drive.google.com/open?id=1wv_LiFeCRYwtpkqREPeI13-gPELBDwuJ)
55 |   - yolov4.conv.137.pth(https://drive.google.com/open?id=1fcbR0bWzYfIEdLJPzOsn4R5mlvR6IQyA)
56 | 
57 | # 1.Train
58 | 
59 | [use yolov4 to train your own data](Use_yolov4_to_train_your_own_data.md)
60 | 
61 | 1. Download weights
62 | 2. Transform data
63 | 
64 | For the COCO dataset, you can use tool/coco_annotation.py.
65 | ```
66 | # train.txt
67 | image_path1 x1,y1,x2,y2,id x1,y1,x2,y2,id x1,y1,x2,y2,id ...
68 | image_path2 x1,y1,x2,y2,id x1,y1,x2,y2,id x1,y1,x2,y2,id ...
69 | ...
70 | ...
71 | ```
72 | 3. Train
73 | 
74 | You can set parameters in cfg.py.
75 | ```
76 | python train.py -g [GPU_ID] -dir [Dataset directory] ...
77 | ```
78 | 
79 | # 2.Inference
80 | - Download the model weights: https://drive.google.com/open?id=1cewMfusmPjYWbrnuJRuKhPMwRe_b9PaT
81 | ```
82 | python demo.py
83 | ```
84 | 
85 | # 3.Darknet2ONNX
86 | 
87 | - **Install onnxruntime**
88 | 
89 | ```sh
90 | pip install onnxruntime
91 | ```
92 | 
93 | - **Run the python script to generate the onnx model and run the demo**
94 | 
95 | ```sh
96 | python demo_onnx.py
97 | ```
98 | 
99 | This script will generate two ONNX models:
100 | 
101 | - one for running the demo (batch_size=1)
102 | - the other is the one you want to generate (batch_size=batchSize)
103 | 
104 | 
105 | # 4.ONNX2Tensorflow
106 | 
107 | - **First: convert the darknet model to ONNX (see section 3 above)**
108 | 
109 | - **Then install onnx-tensorflow (requires tensorflow >= 2.0)**
110 | 
111 | 1: Thanks to github: https://github.com/onnx/onnx-tensorflow
112 | 
113 | 2: Run git clone https://github.com/onnx/onnx-tensorflow.git && cd onnx-tensorflow
114 | Run pip install -e .
115 | 
116 | Note: Errors will occur when using "pip install onnx-tf" (at least for me), so installing from source is recommended.
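If you want to script this step yourself, the onnx-tensorflow backend can load the exported ONNX file and write out a TensorFlow SavedModel. The sketch below only illustrates that API; the ONNX file name is a placeholder (use whatever demo_onnx.py actually wrote), and it is not the repository's own tool/onnx2tensorflow.py or demo_tensorflow.py, which cover this conversion.

```python
# Minimal sketch: ONNX -> TensorFlow SavedModel via onnx-tensorflow.
# "yolov4_batch1.onnx" is a placeholder name for the file exported by demo_onnx.py.
import onnx
from onnx_tf.backend import prepare  # provided by the onnx-tensorflow package

onnx_model = onnx.load("yolov4_batch1.onnx")  # load the exported ONNX graph
tf_rep = prepare(onnx_model)                  # build the TensorFlow representation
tf_rep.export_graph("yolov4_saved_model")     # write a SavedModel directory (TF >= 2.0)
```

The exported directory can then be reloaded with `tf.saved_model.load("yolov4_saved_model")` for inference in TensorFlow.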
117 | 
118 | Reference:
119 | - https://github.com/eriklindernoren/PyTorch-YOLOv3
120 | - https://github.com/marvis/pytorch-caffe-darknet-convert
121 | - https://github.com/marvis/pytorch-yolo3
122 | 
123 | ```
124 | @article{yolov4,
125 |   title={YOLOv4: Optimal Speed and Accuracy of Object Detection},
126 |   author={Alexey Bochkovskiy and Chien-Yao Wang and Hong-Yuan Mark Liao},
127 |   journal = {arXiv},
128 |   year={2020}
129 | }
130 | ```
--------------------------------------------------------------------------------
/Use_yolov4_to_train_your_own_data.md:
--------------------------------------------------------------------------------
1 | The release of YOLOv4 attracted a lot of attention, but since darknet is written in C, reading the code is rather inconvenient, so I wrote a PyTorch version over a weekend (riding the wave of hype a bit). Although pytorch-YOLOv4 had been finished for a while, for various reasons it was never properly validated (mainly laziness). Many people raised issues that helped fix a lot of bugs, and others contributed new features; thanks to everyone for the help. The most frequent request recently has been how to train on your own data, and since yesterday was the weekend again, I finally took care of this long-postponed task. I did not want to use a lot of data, so I made a simple dataset myself.
2 | 
3 | # 1. Code preparation
4 | 
5 | Clone the code from GitHub:
6 | ```
7 | git clone https://github.com/Tianxiaomo/pytorch-YOLOv4.git
8 | ```
9 | # 2. Data preparation
10 | 
11 | Prepare a train.txt that lists each image together with its boxes, in the following format (a small sketch of writing this file is shown at the end of this section):
12 | 
13 | ```
14 | image_path1 x1,y1,x2,y2,id x1,y1,x2,y2,id x1,y1,x2,y2,id ...
15 | image_path2 x1,y1,x2,y2,id x1,y1,x2,y2,id x1,y1,x2,y2,id ...
16 | ...
17 | ```
18 | - image_path : image file name
19 | - x1,y1 : coordinates of the top-left corner
20 | - x2,y2 : coordinates of the bottom-right corner
21 | - id : object class
22 | 
23 | The data I used is a small dataset I made myself for detecting various coins (just three kinds: 1 yuan, 5 jiao, and 1 jiao). Why not build the dataset from something else? I simply had nothing else at hand; the coins felt like a good fit and are also relatively simple compared to other objects.
24 | 
25 | ![UTOOLS1590383513325.png](https://user-gold-cdn.xitu.io/2020/5/25/1724a3e953909b1b?w=1649&h=791&f=png&s=1290382)
26 | 
27 | Only a handful of images were prepared in total.
28 | 
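To make the expected layout concrete, here is a minimal sketch of writing such a train.txt. The `annotations` dictionary is made-up example data; only the output line format matters. cfg.py sets `Cfg.train_label = 'data/train.txt'` by default, so writing the file there lets train.py find it.

```python
# Minimal sketch: write train.txt in the "image_path x1,y1,x2,y2,id ..." format.
# The annotations below are made-up examples; each box is (x1, y1, x2, y2, class_id)
# in pixel coordinates.
annotations = {
    "coin_001.jpg": [(120, 80, 260, 220, 0), (300, 150, 410, 260, 1)],
    "coin_002.jpg": [(50, 60, 180, 190, 2)],
}

with open("data/train.txt", "w") as f:
    for image_path, boxes in annotations.items():
        box_str = " ".join(",".join(str(v) for v in box) for box in boxes)
        f.write(f"{image_path} {box_str}\n")
```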
29 | # 3. Parameter settings
30 | 
31 | When I started training I simply used the original parameters, with the batch size set to 64. After a few epochs I realized something was wrong: my whole dataset has only a bit more than twenty images. I then changed the update strategy so that the schedule is driven by the total number of steps rather than by the steps within each epoch. The loss looked like it was training, so I went to bed and planned to check the next day how training went (who knows what else I changed).
32 | 
33 | When I opened the computer today, the loss had converged to around 2e+4 and refused to go lower. Something was clearly off, so I killed the run. I then set the batch size directly to 4, and training proceeded normally.
34 | 
35 | ```
36 | Cfg.batch = 4
37 | Cfg.subdivisions = 1
38 | ```
39 | 
40 | # 4. Start training
41 | 
42 | ```
43 | python train.py -l 0.001 -g 4 -pretrained ./yolov4.conv.137.pth -classes 3 -dir /home/OCR/coins
44 | 
45 | -l learning rate
46 | -g GPU id
47 | -pretrained pretrained backbone weights, converted from the darknet yolov4.conv.137 provided by AlexeyAB
48 | -classes number of classes
49 | -dir directory containing the images
50 | ```
51 | 
52 | 
53 | Check the loss curve:
54 | ```
55 | tensorboard --logdir log --host 192.168.212.75 --port 6008
56 | ```
57 | ![UTOOLS1590386319240.png](https://user-gold-cdn.xitu.io/2020/5/25/1724a696148d13f3?w=1357&h=795&f=png&s=151465)
58 | 
59 | # 5. Validation
60 | 
61 | ```
62 | python models.py 3 weight/Yolov4_epoch166_coins.pth data/coin2.jpg data/coins.names
63 | 
64 | python models.py num_classes weightfile imagepath namefile
65 | ```
66 | coins.names
67 | ```
68 | 1yuan
69 | 5jiao
70 | 1jiao
71 | 
72 | ```
73 | 
74 | ![UTOOLS1590386705468.png](https://user-gold-cdn.xitu.io/2020/5/25/1724a6f46e826bb8?w=774&h=1377&f=png&s=1191048)
75 | 
76 | The results are passable (the training data only contains three types of coins).
77 | 
78 | # Appendix
79 | 
80 | - coins dataset (link: https://pan.baidu.com/s/1y701NRKSdpj6UKDIH-GpqA
81 | Extraction code: j09s)
82 | - yolov4.conv.137.pth (link: https://pan.baidu.com/s/1ovBie4YyVQQoUrC3AY0joA Extraction code: kcel)
83 | 
--------------------------------------------------------------------------------
/cfg.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | '''
3 | @Time : 2020/05/06 21:05
4 | @Author : Tianxiaomo
5 | @File : Cfg.py
6 | @Notice :
7 | @Modification :
8 | @Author :
9 | @Time :
10 | @Detail :
11 | 
12 | '''
13 | from easydict import EasyDict
14 | 
15 | Cfg = EasyDict()
16 | Cfg.batch = 64
17 | Cfg.subdivisions = 16
18 | Cfg.width = 608
19 | Cfg.height = 608
20 | Cfg.channels = 3
21 | Cfg.momentum = 0.949
22 | Cfg.decay = 0.0005
23 | Cfg.angle = 0
24 | Cfg.saturation = 1.5
25 | Cfg.exposure = 1.5
26 | Cfg.hue = .1
27 | 
28 | Cfg.learning_rate = 0.00261
29 | Cfg.burn_in = 1000
30 | Cfg.max_batches = 500500
31 | Cfg.steps = [400000, 450000]
32 | Cfg.policy = Cfg.steps
33 | Cfg.scales = .1, .1
34 | 
35 | Cfg.cutmix = 0
36 | Cfg.mosaic = 1
37 | 
38 | Cfg.letter_box = 0
39 | Cfg.jitter = 0.2
40 | Cfg.classes = 80
41 | Cfg.track = 0
42 | Cfg.w = Cfg.width
43 | Cfg.h = Cfg.height
44 | Cfg.flip = 1
45 | Cfg.blur = 0
46 | Cfg.gaussian = 0
47 | Cfg.boxes = 60 # box num
48 | Cfg.TRAIN_EPOCHS = 300
49 | Cfg.train_label = 'data/train.txt'
50 | Cfg.val_label = 'data/val.txt'
51 | Cfg.TRAIN_OPTIMIZER = 'adam'
52 | '''
53 | image_path1 x1,y1,x2,y2,id x1,y1,x2,y2,id x1,y1,x2,y2,id ...
54 | image_path2 x1,y1,x2,y2,id x1,y1,x2,y2,id x1,y1,x2,y2,id ...
55 | ...
56 | ''' 57 | 58 | if Cfg.mosaic and Cfg.cutmix: 59 | Cfg.mixup = 4 60 | elif Cfg.cutmix: 61 | Cfg.mixup = 2 62 | elif Cfg.mosaic: 63 | Cfg.mixup = 3 64 | 65 | Cfg.checkpoints = 'checkpoints' 66 | Cfg.TRAIN_TENSORBOARD_DIR = 'log' -------------------------------------------------------------------------------- /cfg/yolov3-tiny.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | batch=1 4 | subdivisions=1 5 | # Training 6 | # batch=64 7 | # subdivisions=2 8 | width=416 9 | height=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | # 0 26 | [convolutional] 27 | batch_normalize=1 28 | filters=16 29 | size=3 30 | stride=1 31 | pad=1 32 | activation=leaky 33 | 34 | # 1 35 | [maxpool] 36 | size=2 37 | stride=2 38 | 39 | # 2 40 | [convolutional] 41 | batch_normalize=1 42 | filters=32 43 | size=3 44 | stride=1 45 | pad=1 46 | activation=leaky 47 | 48 | # 3 49 | [maxpool] 50 | size=2 51 | stride=2 52 | 53 | # 4 54 | [convolutional] 55 | batch_normalize=1 56 | filters=64 57 | size=3 58 | stride=1 59 | pad=1 60 | activation=leaky 61 | 62 | # 5 63 | [maxpool] 64 | size=2 65 | stride=2 66 | 67 | # 6 68 | [convolutional] 69 | batch_normalize=1 70 | filters=128 71 | size=3 72 | stride=1 73 | pad=1 74 | activation=leaky 75 | 76 | # 7 77 | [maxpool] 78 | size=2 79 | stride=2 80 | 81 | # 8 82 | [convolutional] 83 | batch_normalize=1 84 | filters=256 85 | size=3 86 | stride=1 87 | pad=1 88 | activation=leaky 89 | 90 | # 9 91 | [maxpool] 92 | size=2 93 | stride=2 94 | 95 | # 10 96 | [convolutional] 97 | batch_normalize=1 98 | filters=512 99 | size=3 100 | stride=1 101 | pad=1 102 | activation=leaky 103 | 104 | # 11 105 | [maxpool] 106 | size=2 107 | stride=1 108 | 109 | # 12 110 | [convolutional] 111 | batch_normalize=1 112 | filters=1024 113 | size=3 114 | stride=1 115 | pad=1 116 | activation=leaky 117 | 118 | ########### 119 | 120 | # 13 121 | [convolutional] 122 | batch_normalize=1 123 | filters=256 124 | size=1 125 | stride=1 126 | pad=1 127 | activation=leaky 128 | 129 | # 14 130 | [convolutional] 131 | batch_normalize=1 132 | filters=512 133 | size=3 134 | stride=1 135 | pad=1 136 | activation=leaky 137 | 138 | # 15 139 | [convolutional] 140 | size=1 141 | stride=1 142 | pad=1 143 | filters=255 144 | activation=linear 145 | 146 | 147 | 148 | # 16 149 | [yolo] 150 | mask = 3,4,5 151 | anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 152 | classes=80 153 | num=6 154 | jitter=.3 155 | ignore_thresh = .7 156 | truth_thresh = 1 157 | random=1 158 | 159 | # 17 160 | [route] 161 | layers = -4 162 | 163 | # 18 164 | [convolutional] 165 | batch_normalize=1 166 | filters=128 167 | size=1 168 | stride=1 169 | pad=1 170 | activation=leaky 171 | 172 | # 19 173 | [upsample] 174 | stride=2 175 | 176 | # 20 177 | [route] 178 | layers = -1, 8 179 | 180 | # 21 181 | [convolutional] 182 | batch_normalize=1 183 | filters=256 184 | size=3 185 | stride=1 186 | pad=1 187 | activation=leaky 188 | 189 | # 22 190 | [convolutional] 191 | size=1 192 | stride=1 193 | pad=1 194 | filters=255 195 | activation=linear 196 | 197 | # 23 198 | [yolo] 199 | mask = 1,2,3 200 | anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 201 | classes=80 202 | num=6 203 | jitter=.3 204 | ignore_thresh = .7 205 | truth_thresh = 1 206 | random=1 207 | 
-------------------------------------------------------------------------------- /cfg/yolov3.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | batch=1 4 | subdivisions=1 5 | # Training 6 | # batch=64 7 | # subdivisions=16 8 | width=416 9 | height=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 | activation=leaky 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=32 46 | size=1 47 | stride=1 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=leaky 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [shortcut] 110 | from=-3 111 | activation=linear 112 | 113 | # Downsample 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=256 118 | size=3 119 | stride=2 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=128 126 | size=1 127 | stride=1 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 149 | activation=leaky 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | activation=leaky 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 188 | pad=1 189 | activation=leaky 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 
| batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | [convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | stride=1 229 | pad=1 230 | activation=leaky 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=leaky 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | activation=leaky 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | stride=1 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [shortcut] 332 | from=-3 333 | activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=leaky 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=leaky 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | size=3 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=leaky 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [shortcut] 394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | activation=leaky 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [shortcut] 415 | from=-3 416 | activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | 
activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | [convolutional] 576 | batch_normalize=1 577 | size=3 578 | stride=1 579 | pad=1 580 | filters=1024 581 | activation=leaky 582 | 583 | [convolutional] 584 | batch_normalize=1 585 | filters=512 586 | size=1 587 | stride=1 588 | pad=1 589 | activation=leaky 590 | 591 | [convolutional] 592 | batch_normalize=1 593 | size=3 594 | stride=1 595 | pad=1 596 | filters=1024 597 | activation=leaky 598 | 599 | [convolutional] 600 | size=1 601 | stride=1 602 | pad=1 603 | filters=255 604 | activation=linear 605 | 606 | 607 | [yolo] 608 | mask = 6,7,8 609 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 610 | classes=80 611 | num=9 612 | jitter=.3 613 | ignore_thresh = .5 614 | truth_thresh = 1 615 | random=1 616 | 617 | 618 | [route] 619 | layers = -4 620 | 621 | [convolutional] 622 | batch_normalize=1 623 | filters=256 624 | size=1 625 | stride=1 626 | pad=1 627 | activation=leaky 628 | 629 | [upsample] 630 | stride=2 631 | 632 | [route] 633 | layers = -1, 61 634 | 635 | 636 | 637 | [convolutional] 638 | batch_normalize=1 639 | filters=256 640 | size=1 641 | stride=1 642 | pad=1 643 | activation=leaky 644 | 645 | [convolutional] 646 | batch_normalize=1 647 | size=3 648 | stride=1 649 | pad=1 
650 | filters=512 651 | activation=leaky 652 | 653 | [convolutional] 654 | batch_normalize=1 655 | filters=256 656 | size=1 657 | stride=1 658 | pad=1 659 | activation=leaky 660 | 661 | [convolutional] 662 | batch_normalize=1 663 | size=3 664 | stride=1 665 | pad=1 666 | filters=512 667 | activation=leaky 668 | 669 | [convolutional] 670 | batch_normalize=1 671 | filters=256 672 | size=1 673 | stride=1 674 | pad=1 675 | activation=leaky 676 | 677 | [convolutional] 678 | batch_normalize=1 679 | size=3 680 | stride=1 681 | pad=1 682 | filters=512 683 | activation=leaky 684 | 685 | [convolutional] 686 | size=1 687 | stride=1 688 | pad=1 689 | filters=255 690 | activation=linear 691 | 692 | 693 | [yolo] 694 | mask = 3,4,5 695 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 696 | classes=80 697 | num=9 698 | jitter=.3 699 | ignore_thresh = .5 700 | truth_thresh = 1 701 | random=1 702 | 703 | 704 | 705 | [route] 706 | layers = -4 707 | 708 | [convolutional] 709 | batch_normalize=1 710 | filters=128 711 | size=1 712 | stride=1 713 | pad=1 714 | activation=leaky 715 | 716 | [upsample] 717 | stride=2 718 | 719 | [route] 720 | layers = -1, 36 721 | 722 | 723 | 724 | [convolutional] 725 | batch_normalize=1 726 | filters=128 727 | size=1 728 | stride=1 729 | pad=1 730 | activation=leaky 731 | 732 | [convolutional] 733 | batch_normalize=1 734 | size=3 735 | stride=1 736 | pad=1 737 | filters=256 738 | activation=leaky 739 | 740 | [convolutional] 741 | batch_normalize=1 742 | filters=128 743 | size=1 744 | stride=1 745 | pad=1 746 | activation=leaky 747 | 748 | [convolutional] 749 | batch_normalize=1 750 | size=3 751 | stride=1 752 | pad=1 753 | filters=256 754 | activation=leaky 755 | 756 | [convolutional] 757 | batch_normalize=1 758 | filters=128 759 | size=1 760 | stride=1 761 | pad=1 762 | activation=leaky 763 | 764 | [convolutional] 765 | batch_normalize=1 766 | size=3 767 | stride=1 768 | pad=1 769 | filters=256 770 | activation=leaky 771 | 772 | [convolutional] 773 | size=1 774 | stride=1 775 | pad=1 776 | filters=255 777 | activation=linear 778 | 779 | 780 | [yolo] 781 | mask = 0,1,2 782 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 783 | classes=80 784 | num=9 785 | jitter=.3 786 | ignore_thresh = .5 787 | truth_thresh = 1 788 | random=1 789 | 790 | -------------------------------------------------------------------------------- /cfg/yolov4-custom.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | #batch=1 4 | #subdivisions=1 5 | # Training 6 | batch=64 7 | subdivisions=16 8 | width=608 9 | height=608 10 | channels=3 11 | momentum=0.949 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500500 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | #cutmix=1 26 | mosaic=1 27 | 28 | #:104x104 54:52x52 85:26x26 104:13x13 for 416 29 | 30 | [convolutional] 31 | batch_normalize=1 32 | filters=32 33 | size=3 34 | stride=1 35 | pad=1 36 | activation=mish 37 | 38 | # Downsample 39 | 40 | [convolutional] 41 | batch_normalize=1 42 | filters=64 43 | size=3 44 | stride=2 45 | pad=1 46 | activation=mish 47 | 48 | [convolutional] 49 | batch_normalize=1 50 | filters=64 51 | size=1 52 | stride=1 53 | pad=1 54 | activation=mish 55 | 56 | [route] 57 | layers = -2 58 | 59 | [convolutional] 60 | batch_normalize=1 61 | filters=64 62 | size=1 63 | stride=1 64 | pad=1 65 | 
activation=mish 66 | 67 | [convolutional] 68 | batch_normalize=1 69 | filters=32 70 | size=1 71 | stride=1 72 | pad=1 73 | activation=mish 74 | 75 | [convolutional] 76 | batch_normalize=1 77 | filters=64 78 | size=3 79 | stride=1 80 | pad=1 81 | activation=mish 82 | 83 | [shortcut] 84 | from=-3 85 | activation=linear 86 | 87 | [convolutional] 88 | batch_normalize=1 89 | filters=64 90 | size=1 91 | stride=1 92 | pad=1 93 | activation=mish 94 | 95 | [route] 96 | layers = -1,-7 97 | 98 | [convolutional] 99 | batch_normalize=1 100 | filters=64 101 | size=1 102 | stride=1 103 | pad=1 104 | activation=mish 105 | 106 | # Downsample 107 | 108 | [convolutional] 109 | batch_normalize=1 110 | filters=128 111 | size=3 112 | stride=2 113 | pad=1 114 | activation=mish 115 | 116 | [convolutional] 117 | batch_normalize=1 118 | filters=64 119 | size=1 120 | stride=1 121 | pad=1 122 | activation=mish 123 | 124 | [route] 125 | layers = -2 126 | 127 | [convolutional] 128 | batch_normalize=1 129 | filters=64 130 | size=1 131 | stride=1 132 | pad=1 133 | activation=mish 134 | 135 | [convolutional] 136 | batch_normalize=1 137 | filters=64 138 | size=1 139 | stride=1 140 | pad=1 141 | activation=mish 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=64 146 | size=3 147 | stride=1 148 | pad=1 149 | activation=mish 150 | 151 | [shortcut] 152 | from=-3 153 | activation=linear 154 | 155 | [convolutional] 156 | batch_normalize=1 157 | filters=64 158 | size=1 159 | stride=1 160 | pad=1 161 | activation=mish 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=64 166 | size=3 167 | stride=1 168 | pad=1 169 | activation=mish 170 | 171 | [shortcut] 172 | from=-3 173 | activation=linear 174 | 175 | [convolutional] 176 | batch_normalize=1 177 | filters=64 178 | size=1 179 | stride=1 180 | pad=1 181 | activation=mish 182 | 183 | [route] 184 | layers = -1,-10 185 | 186 | [convolutional] 187 | batch_normalize=1 188 | filters=128 189 | size=1 190 | stride=1 191 | pad=1 192 | activation=mish 193 | 194 | # Downsample 195 | 196 | [convolutional] 197 | batch_normalize=1 198 | filters=256 199 | size=3 200 | stride=2 201 | pad=1 202 | activation=mish 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=mish 211 | 212 | [route] 213 | layers = -2 214 | 215 | [convolutional] 216 | batch_normalize=1 217 | filters=128 218 | size=1 219 | stride=1 220 | pad=1 221 | activation=mish 222 | 223 | [convolutional] 224 | batch_normalize=1 225 | filters=128 226 | size=1 227 | stride=1 228 | pad=1 229 | activation=mish 230 | 231 | [convolutional] 232 | batch_normalize=1 233 | filters=128 234 | size=3 235 | stride=1 236 | pad=1 237 | activation=mish 238 | 239 | [shortcut] 240 | from=-3 241 | activation=linear 242 | 243 | [convolutional] 244 | batch_normalize=1 245 | filters=128 246 | size=1 247 | stride=1 248 | pad=1 249 | activation=mish 250 | 251 | [convolutional] 252 | batch_normalize=1 253 | filters=128 254 | size=3 255 | stride=1 256 | pad=1 257 | activation=mish 258 | 259 | [shortcut] 260 | from=-3 261 | activation=linear 262 | 263 | [convolutional] 264 | batch_normalize=1 265 | filters=128 266 | size=1 267 | stride=1 268 | pad=1 269 | activation=mish 270 | 271 | [convolutional] 272 | batch_normalize=1 273 | filters=128 274 | size=3 275 | stride=1 276 | pad=1 277 | activation=mish 278 | 279 | [shortcut] 280 | from=-3 281 | activation=linear 282 | 283 | [convolutional] 284 | batch_normalize=1 285 | filters=128 286 | size=1 287 | stride=1 288 | 
pad=1 289 | activation=mish 290 | 291 | [convolutional] 292 | batch_normalize=1 293 | filters=128 294 | size=3 295 | stride=1 296 | pad=1 297 | activation=mish 298 | 299 | [shortcut] 300 | from=-3 301 | activation=linear 302 | 303 | 304 | [convolutional] 305 | batch_normalize=1 306 | filters=128 307 | size=1 308 | stride=1 309 | pad=1 310 | activation=mish 311 | 312 | [convolutional] 313 | batch_normalize=1 314 | filters=128 315 | size=3 316 | stride=1 317 | pad=1 318 | activation=mish 319 | 320 | [shortcut] 321 | from=-3 322 | activation=linear 323 | 324 | [convolutional] 325 | batch_normalize=1 326 | filters=128 327 | size=1 328 | stride=1 329 | pad=1 330 | activation=mish 331 | 332 | [convolutional] 333 | batch_normalize=1 334 | filters=128 335 | size=3 336 | stride=1 337 | pad=1 338 | activation=mish 339 | 340 | [shortcut] 341 | from=-3 342 | activation=linear 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=128 347 | size=1 348 | stride=1 349 | pad=1 350 | activation=mish 351 | 352 | [convolutional] 353 | batch_normalize=1 354 | filters=128 355 | size=3 356 | stride=1 357 | pad=1 358 | activation=mish 359 | 360 | [shortcut] 361 | from=-3 362 | activation=linear 363 | 364 | [convolutional] 365 | batch_normalize=1 366 | filters=128 367 | size=1 368 | stride=1 369 | pad=1 370 | activation=mish 371 | 372 | [convolutional] 373 | batch_normalize=1 374 | filters=128 375 | size=3 376 | stride=1 377 | pad=1 378 | activation=mish 379 | 380 | [shortcut] 381 | from=-3 382 | activation=linear 383 | 384 | [convolutional] 385 | batch_normalize=1 386 | filters=128 387 | size=1 388 | stride=1 389 | pad=1 390 | activation=mish 391 | 392 | [route] 393 | layers = -1,-28 394 | 395 | [convolutional] 396 | batch_normalize=1 397 | filters=256 398 | size=1 399 | stride=1 400 | pad=1 401 | activation=mish 402 | 403 | # Downsample 404 | 405 | [convolutional] 406 | batch_normalize=1 407 | filters=512 408 | size=3 409 | stride=2 410 | pad=1 411 | activation=mish 412 | 413 | [convolutional] 414 | batch_normalize=1 415 | filters=256 416 | size=1 417 | stride=1 418 | pad=1 419 | activation=mish 420 | 421 | [route] 422 | layers = -2 423 | 424 | [convolutional] 425 | batch_normalize=1 426 | filters=256 427 | size=1 428 | stride=1 429 | pad=1 430 | activation=mish 431 | 432 | [convolutional] 433 | batch_normalize=1 434 | filters=256 435 | size=1 436 | stride=1 437 | pad=1 438 | activation=mish 439 | 440 | [convolutional] 441 | batch_normalize=1 442 | filters=256 443 | size=3 444 | stride=1 445 | pad=1 446 | activation=mish 447 | 448 | [shortcut] 449 | from=-3 450 | activation=linear 451 | 452 | 453 | [convolutional] 454 | batch_normalize=1 455 | filters=256 456 | size=1 457 | stride=1 458 | pad=1 459 | activation=mish 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=256 464 | size=3 465 | stride=1 466 | pad=1 467 | activation=mish 468 | 469 | [shortcut] 470 | from=-3 471 | activation=linear 472 | 473 | 474 | [convolutional] 475 | batch_normalize=1 476 | filters=256 477 | size=1 478 | stride=1 479 | pad=1 480 | activation=mish 481 | 482 | [convolutional] 483 | batch_normalize=1 484 | filters=256 485 | size=3 486 | stride=1 487 | pad=1 488 | activation=mish 489 | 490 | [shortcut] 491 | from=-3 492 | activation=linear 493 | 494 | 495 | [convolutional] 496 | batch_normalize=1 497 | filters=256 498 | size=1 499 | stride=1 500 | pad=1 501 | activation=mish 502 | 503 | [convolutional] 504 | batch_normalize=1 505 | filters=256 506 | size=3 507 | stride=1 508 | pad=1 509 | activation=mish 510 | 
511 | [shortcut] 512 | from=-3 513 | activation=linear 514 | 515 | 516 | [convolutional] 517 | batch_normalize=1 518 | filters=256 519 | size=1 520 | stride=1 521 | pad=1 522 | activation=mish 523 | 524 | [convolutional] 525 | batch_normalize=1 526 | filters=256 527 | size=3 528 | stride=1 529 | pad=1 530 | activation=mish 531 | 532 | [shortcut] 533 | from=-3 534 | activation=linear 535 | 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=256 540 | size=1 541 | stride=1 542 | pad=1 543 | activation=mish 544 | 545 | [convolutional] 546 | batch_normalize=1 547 | filters=256 548 | size=3 549 | stride=1 550 | pad=1 551 | activation=mish 552 | 553 | [shortcut] 554 | from=-3 555 | activation=linear 556 | 557 | 558 | [convolutional] 559 | batch_normalize=1 560 | filters=256 561 | size=1 562 | stride=1 563 | pad=1 564 | activation=mish 565 | 566 | [convolutional] 567 | batch_normalize=1 568 | filters=256 569 | size=3 570 | stride=1 571 | pad=1 572 | activation=mish 573 | 574 | [shortcut] 575 | from=-3 576 | activation=linear 577 | 578 | [convolutional] 579 | batch_normalize=1 580 | filters=256 581 | size=1 582 | stride=1 583 | pad=1 584 | activation=mish 585 | 586 | [convolutional] 587 | batch_normalize=1 588 | filters=256 589 | size=3 590 | stride=1 591 | pad=1 592 | activation=mish 593 | 594 | [shortcut] 595 | from=-3 596 | activation=linear 597 | 598 | [convolutional] 599 | batch_normalize=1 600 | filters=256 601 | size=1 602 | stride=1 603 | pad=1 604 | activation=mish 605 | 606 | [route] 607 | layers = -1,-28 608 | 609 | [convolutional] 610 | batch_normalize=1 611 | filters=512 612 | size=1 613 | stride=1 614 | pad=1 615 | activation=mish 616 | 617 | # Downsample 618 | 619 | [convolutional] 620 | batch_normalize=1 621 | filters=1024 622 | size=3 623 | stride=2 624 | pad=1 625 | activation=mish 626 | 627 | [convolutional] 628 | batch_normalize=1 629 | filters=512 630 | size=1 631 | stride=1 632 | pad=1 633 | activation=mish 634 | 635 | [route] 636 | layers = -2 637 | 638 | [convolutional] 639 | batch_normalize=1 640 | filters=512 641 | size=1 642 | stride=1 643 | pad=1 644 | activation=mish 645 | 646 | [convolutional] 647 | batch_normalize=1 648 | filters=512 649 | size=1 650 | stride=1 651 | pad=1 652 | activation=mish 653 | 654 | [convolutional] 655 | batch_normalize=1 656 | filters=512 657 | size=3 658 | stride=1 659 | pad=1 660 | activation=mish 661 | 662 | [shortcut] 663 | from=-3 664 | activation=linear 665 | 666 | [convolutional] 667 | batch_normalize=1 668 | filters=512 669 | size=1 670 | stride=1 671 | pad=1 672 | activation=mish 673 | 674 | [convolutional] 675 | batch_normalize=1 676 | filters=512 677 | size=3 678 | stride=1 679 | pad=1 680 | activation=mish 681 | 682 | [shortcut] 683 | from=-3 684 | activation=linear 685 | 686 | [convolutional] 687 | batch_normalize=1 688 | filters=512 689 | size=1 690 | stride=1 691 | pad=1 692 | activation=mish 693 | 694 | [convolutional] 695 | batch_normalize=1 696 | filters=512 697 | size=3 698 | stride=1 699 | pad=1 700 | activation=mish 701 | 702 | [shortcut] 703 | from=-3 704 | activation=linear 705 | 706 | [convolutional] 707 | batch_normalize=1 708 | filters=512 709 | size=1 710 | stride=1 711 | pad=1 712 | activation=mish 713 | 714 | [convolutional] 715 | batch_normalize=1 716 | filters=512 717 | size=3 718 | stride=1 719 | pad=1 720 | activation=mish 721 | 722 | [shortcut] 723 | from=-3 724 | activation=linear 725 | 726 | [convolutional] 727 | batch_normalize=1 728 | filters=512 729 | size=1 730 | stride=1 731 | pad=1 732 | 
activation=mish 733 | 734 | [route] 735 | layers = -1,-16 736 | 737 | [convolutional] 738 | batch_normalize=1 739 | filters=1024 740 | size=1 741 | stride=1 742 | pad=1 743 | activation=mish 744 | 745 | ########################## 746 | 747 | [convolutional] 748 | batch_normalize=1 749 | filters=512 750 | size=1 751 | stride=1 752 | pad=1 753 | activation=leaky 754 | 755 | [convolutional] 756 | batch_normalize=1 757 | size=3 758 | stride=1 759 | pad=1 760 | filters=1024 761 | activation=leaky 762 | 763 | [convolutional] 764 | batch_normalize=1 765 | filters=512 766 | size=1 767 | stride=1 768 | pad=1 769 | activation=leaky 770 | 771 | ### SPP ### 772 | [maxpool] 773 | stride=1 774 | size=5 775 | 776 | [route] 777 | layers=-2 778 | 779 | [maxpool] 780 | stride=1 781 | size=9 782 | 783 | [route] 784 | layers=-4 785 | 786 | [maxpool] 787 | stride=1 788 | size=13 789 | 790 | [route] 791 | layers=-1,-3,-5,-6 792 | ### End SPP ### 793 | 794 | [convolutional] 795 | batch_normalize=1 796 | filters=512 797 | size=1 798 | stride=1 799 | pad=1 800 | activation=leaky 801 | 802 | [convolutional] 803 | batch_normalize=1 804 | size=3 805 | stride=1 806 | pad=1 807 | filters=1024 808 | activation=leaky 809 | 810 | [convolutional] 811 | batch_normalize=1 812 | filters=512 813 | size=1 814 | stride=1 815 | pad=1 816 | activation=leaky 817 | 818 | [convolutional] 819 | batch_normalize=1 820 | filters=256 821 | size=1 822 | stride=1 823 | pad=1 824 | activation=leaky 825 | 826 | [upsample] 827 | stride=2 828 | 829 | [route] 830 | layers = 85 831 | 832 | [convolutional] 833 | batch_normalize=1 834 | filters=256 835 | size=1 836 | stride=1 837 | pad=1 838 | activation=leaky 839 | 840 | [route] 841 | layers = -1, -3 842 | 843 | [convolutional] 844 | batch_normalize=1 845 | filters=256 846 | size=1 847 | stride=1 848 | pad=1 849 | activation=leaky 850 | 851 | [convolutional] 852 | batch_normalize=1 853 | size=3 854 | stride=1 855 | pad=1 856 | filters=512 857 | activation=leaky 858 | 859 | [convolutional] 860 | batch_normalize=1 861 | filters=256 862 | size=1 863 | stride=1 864 | pad=1 865 | activation=leaky 866 | 867 | [convolutional] 868 | batch_normalize=1 869 | size=3 870 | stride=1 871 | pad=1 872 | filters=512 873 | activation=leaky 874 | 875 | [convolutional] 876 | batch_normalize=1 877 | filters=256 878 | size=1 879 | stride=1 880 | pad=1 881 | activation=leaky 882 | 883 | [convolutional] 884 | batch_normalize=1 885 | filters=128 886 | size=1 887 | stride=1 888 | pad=1 889 | activation=leaky 890 | 891 | [upsample] 892 | stride=2 893 | 894 | [route] 895 | layers = 54 896 | 897 | [convolutional] 898 | batch_normalize=1 899 | filters=128 900 | size=1 901 | stride=1 902 | pad=1 903 | activation=leaky 904 | 905 | [route] 906 | layers = -1, -3 907 | 908 | [convolutional] 909 | batch_normalize=1 910 | filters=128 911 | size=1 912 | stride=1 913 | pad=1 914 | activation=leaky 915 | 916 | [convolutional] 917 | batch_normalize=1 918 | size=3 919 | stride=1 920 | pad=1 921 | filters=256 922 | activation=leaky 923 | 924 | [convolutional] 925 | batch_normalize=1 926 | filters=128 927 | size=1 928 | stride=1 929 | pad=1 930 | activation=leaky 931 | 932 | [convolutional] 933 | batch_normalize=1 934 | size=3 935 | stride=1 936 | pad=1 937 | filters=256 938 | activation=leaky 939 | 940 | [convolutional] 941 | batch_normalize=1 942 | filters=128 943 | size=1 944 | stride=1 945 | pad=1 946 | activation=leaky 947 | 948 | ########################## 949 | 950 | [convolutional] 951 | batch_normalize=1 952 | size=3 953 | stride=1 
954 | pad=1 955 | filters=256 956 | activation=leaky 957 | 958 | [convolutional] 959 | size=1 960 | stride=1 961 | pad=1 962 | filters=255 963 | activation=linear 964 | 965 | 966 | [yolo] 967 | mask = 0,1,2 968 | anchors = 12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401 969 | classes=80 970 | num=9 971 | jitter=.3 972 | ignore_thresh = .7 973 | truth_thresh = 1 974 | scale_x_y = 1.2 975 | iou_thresh=0.213 976 | cls_normalizer=1.0 977 | iou_normalizer=0.07 978 | iou_loss=ciou 979 | nms_kind=greedynms 980 | beta_nms=0.6 981 | 982 | 983 | [route] 984 | layers = -4 985 | 986 | [convolutional] 987 | batch_normalize=1 988 | size=3 989 | stride=2 990 | pad=1 991 | filters=256 992 | activation=leaky 993 | 994 | [route] 995 | layers = -1, -16 996 | 997 | [convolutional] 998 | batch_normalize=1 999 | filters=256 1000 | size=1 1001 | stride=1 1002 | pad=1 1003 | activation=leaky 1004 | 1005 | [convolutional] 1006 | batch_normalize=1 1007 | size=3 1008 | stride=1 1009 | pad=1 1010 | filters=512 1011 | activation=leaky 1012 | 1013 | [convolutional] 1014 | batch_normalize=1 1015 | filters=256 1016 | size=1 1017 | stride=1 1018 | pad=1 1019 | activation=leaky 1020 | 1021 | [convolutional] 1022 | batch_normalize=1 1023 | size=3 1024 | stride=1 1025 | pad=1 1026 | filters=512 1027 | activation=leaky 1028 | 1029 | [convolutional] 1030 | batch_normalize=1 1031 | filters=256 1032 | size=1 1033 | stride=1 1034 | pad=1 1035 | activation=leaky 1036 | 1037 | [convolutional] 1038 | batch_normalize=1 1039 | size=3 1040 | stride=1 1041 | pad=1 1042 | filters=512 1043 | activation=leaky 1044 | 1045 | [convolutional] 1046 | size=1 1047 | stride=1 1048 | pad=1 1049 | filters=255 1050 | activation=linear 1051 | 1052 | 1053 | [yolo] 1054 | mask = 3,4,5 1055 | anchors = 12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401 1056 | classes=80 1057 | num=9 1058 | jitter=.3 1059 | ignore_thresh = .7 1060 | truth_thresh = 1 1061 | scale_x_y = 1.1 1062 | iou_thresh=0.213 1063 | cls_normalizer=1.0 1064 | iou_normalizer=0.07 1065 | iou_loss=ciou 1066 | nms_kind=greedynms 1067 | beta_nms=0.6 1068 | 1069 | 1070 | [route] 1071 | layers = -4 1072 | 1073 | [convolutional] 1074 | batch_normalize=1 1075 | size=3 1076 | stride=2 1077 | pad=1 1078 | filters=512 1079 | activation=leaky 1080 | 1081 | [route] 1082 | layers = -1, -37 1083 | 1084 | [convolutional] 1085 | batch_normalize=1 1086 | filters=512 1087 | size=1 1088 | stride=1 1089 | pad=1 1090 | activation=leaky 1091 | 1092 | [convolutional] 1093 | batch_normalize=1 1094 | size=3 1095 | stride=1 1096 | pad=1 1097 | filters=1024 1098 | activation=leaky 1099 | 1100 | [convolutional] 1101 | batch_normalize=1 1102 | filters=512 1103 | size=1 1104 | stride=1 1105 | pad=1 1106 | activation=leaky 1107 | 1108 | [convolutional] 1109 | batch_normalize=1 1110 | size=3 1111 | stride=1 1112 | pad=1 1113 | filters=1024 1114 | activation=leaky 1115 | 1116 | [convolutional] 1117 | batch_normalize=1 1118 | filters=512 1119 | size=1 1120 | stride=1 1121 | pad=1 1122 | activation=leaky 1123 | 1124 | [convolutional] 1125 | batch_normalize=1 1126 | size=3 1127 | stride=1 1128 | pad=1 1129 | filters=1024 1130 | activation=leaky 1131 | 1132 | [convolutional] 1133 | size=1 1134 | stride=1 1135 | pad=1 1136 | filters=255 1137 | activation=linear 1138 | 1139 | 1140 | [yolo] 1141 | mask = 6,7,8 1142 | anchors = 12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401 1143 | classes=80 1144 | num=9 1145 | jitter=.3 1146 | ignore_thresh 
= .7 1147 | truth_thresh = 1 1148 | random=1 1149 | scale_x_y = 1.05 1150 | iou_thresh=0.213 1151 | cls_normalizer=1.0 1152 | iou_normalizer=0.07 1153 | iou_loss=ciou 1154 | nms_kind=greedynms 1155 | beta_nms=0.6 1156 | 1157 | -------------------------------------------------------------------------------- /cfg/yolov4.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | batch=64 3 | subdivisions=8 4 | # Training 5 | #width=512 6 | #height=512 7 | width=608 8 | height=608 9 | channels=3 10 | momentum=0.949 11 | decay=0.0005 12 | angle=0 13 | saturation = 1.5 14 | exposure = 1.5 15 | hue=.1 16 | 17 | learning_rate=0.0013 18 | burn_in=1000 19 | max_batches = 500500 20 | policy=steps 21 | steps=400000,450000 22 | scales=.1,.1 23 | 24 | #cutmix=1 25 | mosaic=1 26 | 27 | #:104x104 54:52x52 85:26x26 104:13x13 for 416 28 | 29 | [convolutional] 30 | batch_normalize=1 31 | filters=32 32 | size=3 33 | stride=1 34 | pad=1 35 | activation=mish 36 | 37 | # Downsample 38 | 39 | [convolutional] 40 | batch_normalize=1 41 | filters=64 42 | size=3 43 | stride=2 44 | pad=1 45 | activation=mish 46 | 47 | [convolutional] 48 | batch_normalize=1 49 | filters=64 50 | size=1 51 | stride=1 52 | pad=1 53 | activation=mish 54 | 55 | [route] 56 | layers = -2 57 | 58 | [convolutional] 59 | batch_normalize=1 60 | filters=64 61 | size=1 62 | stride=1 63 | pad=1 64 | activation=mish 65 | 66 | [convolutional] 67 | batch_normalize=1 68 | filters=32 69 | size=1 70 | stride=1 71 | pad=1 72 | activation=mish 73 | 74 | [convolutional] 75 | batch_normalize=1 76 | filters=64 77 | size=3 78 | stride=1 79 | pad=1 80 | activation=mish 81 | 82 | [shortcut] 83 | from=-3 84 | activation=linear 85 | 86 | [convolutional] 87 | batch_normalize=1 88 | filters=64 89 | size=1 90 | stride=1 91 | pad=1 92 | activation=mish 93 | 94 | [route] 95 | layers = -1,-7 96 | 97 | [convolutional] 98 | batch_normalize=1 99 | filters=64 100 | size=1 101 | stride=1 102 | pad=1 103 | activation=mish 104 | 105 | # Downsample 106 | 107 | [convolutional] 108 | batch_normalize=1 109 | filters=128 110 | size=3 111 | stride=2 112 | pad=1 113 | activation=mish 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=64 118 | size=1 119 | stride=1 120 | pad=1 121 | activation=mish 122 | 123 | [route] 124 | layers = -2 125 | 126 | [convolutional] 127 | batch_normalize=1 128 | filters=64 129 | size=1 130 | stride=1 131 | pad=1 132 | activation=mish 133 | 134 | [convolutional] 135 | batch_normalize=1 136 | filters=64 137 | size=1 138 | stride=1 139 | pad=1 140 | activation=mish 141 | 142 | [convolutional] 143 | batch_normalize=1 144 | filters=64 145 | size=3 146 | stride=1 147 | pad=1 148 | activation=mish 149 | 150 | [shortcut] 151 | from=-3 152 | activation=linear 153 | 154 | [convolutional] 155 | batch_normalize=1 156 | filters=64 157 | size=1 158 | stride=1 159 | pad=1 160 | activation=mish 161 | 162 | [convolutional] 163 | batch_normalize=1 164 | filters=64 165 | size=3 166 | stride=1 167 | pad=1 168 | activation=mish 169 | 170 | [shortcut] 171 | from=-3 172 | activation=linear 173 | 174 | [convolutional] 175 | batch_normalize=1 176 | filters=64 177 | size=1 178 | stride=1 179 | pad=1 180 | activation=mish 181 | 182 | [route] 183 | layers = -1,-10 184 | 185 | [convolutional] 186 | batch_normalize=1 187 | filters=128 188 | size=1 189 | stride=1 190 | pad=1 191 | activation=mish 192 | 193 | # Downsample 194 | 195 | [convolutional] 196 | batch_normalize=1 197 | filters=256 198 | size=3 199 | stride=2 200 | pad=1 
201 | activation=mish 202 | 203 | [convolutional] 204 | batch_normalize=1 205 | filters=128 206 | size=1 207 | stride=1 208 | pad=1 209 | activation=mish 210 | 211 | [route] 212 | layers = -2 213 | 214 | [convolutional] 215 | batch_normalize=1 216 | filters=128 217 | size=1 218 | stride=1 219 | pad=1 220 | activation=mish 221 | 222 | [convolutional] 223 | batch_normalize=1 224 | filters=128 225 | size=1 226 | stride=1 227 | pad=1 228 | activation=mish 229 | 230 | [convolutional] 231 | batch_normalize=1 232 | filters=128 233 | size=3 234 | stride=1 235 | pad=1 236 | activation=mish 237 | 238 | [shortcut] 239 | from=-3 240 | activation=linear 241 | 242 | [convolutional] 243 | batch_normalize=1 244 | filters=128 245 | size=1 246 | stride=1 247 | pad=1 248 | activation=mish 249 | 250 | [convolutional] 251 | batch_normalize=1 252 | filters=128 253 | size=3 254 | stride=1 255 | pad=1 256 | activation=mish 257 | 258 | [shortcut] 259 | from=-3 260 | activation=linear 261 | 262 | [convolutional] 263 | batch_normalize=1 264 | filters=128 265 | size=1 266 | stride=1 267 | pad=1 268 | activation=mish 269 | 270 | [convolutional] 271 | batch_normalize=1 272 | filters=128 273 | size=3 274 | stride=1 275 | pad=1 276 | activation=mish 277 | 278 | [shortcut] 279 | from=-3 280 | activation=linear 281 | 282 | [convolutional] 283 | batch_normalize=1 284 | filters=128 285 | size=1 286 | stride=1 287 | pad=1 288 | activation=mish 289 | 290 | [convolutional] 291 | batch_normalize=1 292 | filters=128 293 | size=3 294 | stride=1 295 | pad=1 296 | activation=mish 297 | 298 | [shortcut] 299 | from=-3 300 | activation=linear 301 | 302 | 303 | [convolutional] 304 | batch_normalize=1 305 | filters=128 306 | size=1 307 | stride=1 308 | pad=1 309 | activation=mish 310 | 311 | [convolutional] 312 | batch_normalize=1 313 | filters=128 314 | size=3 315 | stride=1 316 | pad=1 317 | activation=mish 318 | 319 | [shortcut] 320 | from=-3 321 | activation=linear 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=128 326 | size=1 327 | stride=1 328 | pad=1 329 | activation=mish 330 | 331 | [convolutional] 332 | batch_normalize=1 333 | filters=128 334 | size=3 335 | stride=1 336 | pad=1 337 | activation=mish 338 | 339 | [shortcut] 340 | from=-3 341 | activation=linear 342 | 343 | [convolutional] 344 | batch_normalize=1 345 | filters=128 346 | size=1 347 | stride=1 348 | pad=1 349 | activation=mish 350 | 351 | [convolutional] 352 | batch_normalize=1 353 | filters=128 354 | size=3 355 | stride=1 356 | pad=1 357 | activation=mish 358 | 359 | [shortcut] 360 | from=-3 361 | activation=linear 362 | 363 | [convolutional] 364 | batch_normalize=1 365 | filters=128 366 | size=1 367 | stride=1 368 | pad=1 369 | activation=mish 370 | 371 | [convolutional] 372 | batch_normalize=1 373 | filters=128 374 | size=3 375 | stride=1 376 | pad=1 377 | activation=mish 378 | 379 | [shortcut] 380 | from=-3 381 | activation=linear 382 | 383 | [convolutional] 384 | batch_normalize=1 385 | filters=128 386 | size=1 387 | stride=1 388 | pad=1 389 | activation=mish 390 | 391 | [route] 392 | layers = -1,-28 393 | 394 | [convolutional] 395 | batch_normalize=1 396 | filters=256 397 | size=1 398 | stride=1 399 | pad=1 400 | activation=mish 401 | 402 | # Downsample 403 | 404 | [convolutional] 405 | batch_normalize=1 406 | filters=512 407 | size=3 408 | stride=2 409 | pad=1 410 | activation=mish 411 | 412 | [convolutional] 413 | batch_normalize=1 414 | filters=256 415 | size=1 416 | stride=1 417 | pad=1 418 | activation=mish 419 | 420 | [route] 421 | 
layers = -2 422 | 423 | [convolutional] 424 | batch_normalize=1 425 | filters=256 426 | size=1 427 | stride=1 428 | pad=1 429 | activation=mish 430 | 431 | [convolutional] 432 | batch_normalize=1 433 | filters=256 434 | size=1 435 | stride=1 436 | pad=1 437 | activation=mish 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=3 443 | stride=1 444 | pad=1 445 | activation=mish 446 | 447 | [shortcut] 448 | from=-3 449 | activation=linear 450 | 451 | 452 | [convolutional] 453 | batch_normalize=1 454 | filters=256 455 | size=1 456 | stride=1 457 | pad=1 458 | activation=mish 459 | 460 | [convolutional] 461 | batch_normalize=1 462 | filters=256 463 | size=3 464 | stride=1 465 | pad=1 466 | activation=mish 467 | 468 | [shortcut] 469 | from=-3 470 | activation=linear 471 | 472 | 473 | [convolutional] 474 | batch_normalize=1 475 | filters=256 476 | size=1 477 | stride=1 478 | pad=1 479 | activation=mish 480 | 481 | [convolutional] 482 | batch_normalize=1 483 | filters=256 484 | size=3 485 | stride=1 486 | pad=1 487 | activation=mish 488 | 489 | [shortcut] 490 | from=-3 491 | activation=linear 492 | 493 | 494 | [convolutional] 495 | batch_normalize=1 496 | filters=256 497 | size=1 498 | stride=1 499 | pad=1 500 | activation=mish 501 | 502 | [convolutional] 503 | batch_normalize=1 504 | filters=256 505 | size=3 506 | stride=1 507 | pad=1 508 | activation=mish 509 | 510 | [shortcut] 511 | from=-3 512 | activation=linear 513 | 514 | 515 | [convolutional] 516 | batch_normalize=1 517 | filters=256 518 | size=1 519 | stride=1 520 | pad=1 521 | activation=mish 522 | 523 | [convolutional] 524 | batch_normalize=1 525 | filters=256 526 | size=3 527 | stride=1 528 | pad=1 529 | activation=mish 530 | 531 | [shortcut] 532 | from=-3 533 | activation=linear 534 | 535 | 536 | [convolutional] 537 | batch_normalize=1 538 | filters=256 539 | size=1 540 | stride=1 541 | pad=1 542 | activation=mish 543 | 544 | [convolutional] 545 | batch_normalize=1 546 | filters=256 547 | size=3 548 | stride=1 549 | pad=1 550 | activation=mish 551 | 552 | [shortcut] 553 | from=-3 554 | activation=linear 555 | 556 | 557 | [convolutional] 558 | batch_normalize=1 559 | filters=256 560 | size=1 561 | stride=1 562 | pad=1 563 | activation=mish 564 | 565 | [convolutional] 566 | batch_normalize=1 567 | filters=256 568 | size=3 569 | stride=1 570 | pad=1 571 | activation=mish 572 | 573 | [shortcut] 574 | from=-3 575 | activation=linear 576 | 577 | [convolutional] 578 | batch_normalize=1 579 | filters=256 580 | size=1 581 | stride=1 582 | pad=1 583 | activation=mish 584 | 585 | [convolutional] 586 | batch_normalize=1 587 | filters=256 588 | size=3 589 | stride=1 590 | pad=1 591 | activation=mish 592 | 593 | [shortcut] 594 | from=-3 595 | activation=linear 596 | 597 | [convolutional] 598 | batch_normalize=1 599 | filters=256 600 | size=1 601 | stride=1 602 | pad=1 603 | activation=mish 604 | 605 | [route] 606 | layers = -1,-28 607 | 608 | [convolutional] 609 | batch_normalize=1 610 | filters=512 611 | size=1 612 | stride=1 613 | pad=1 614 | activation=mish 615 | 616 | # Downsample 617 | 618 | [convolutional] 619 | batch_normalize=1 620 | filters=1024 621 | size=3 622 | stride=2 623 | pad=1 624 | activation=mish 625 | 626 | [convolutional] 627 | batch_normalize=1 628 | filters=512 629 | size=1 630 | stride=1 631 | pad=1 632 | activation=mish 633 | 634 | [route] 635 | layers = -2 636 | 637 | [convolutional] 638 | batch_normalize=1 639 | filters=512 640 | size=1 641 | stride=1 642 | pad=1 643 | activation=mish 644 | 
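# Note on this stage of the backbone: "[route] layers = -2" above returns to the stride-2
# 1024-filter convolution, opening the second branch of the CSP block; after the four residual
# blocks that follow ([convolutional] 1x1 + 3x3 + [shortcut]), "[route] layers = -1,-16"
# concatenates the two 512-filter branches and a final 1x1 convolution restores 1024 filters
# before the SPP neck.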
645 | [convolutional] 646 | batch_normalize=1 647 | filters=512 648 | size=1 649 | stride=1 650 | pad=1 651 | activation=mish 652 | 653 | [convolutional] 654 | batch_normalize=1 655 | filters=512 656 | size=3 657 | stride=1 658 | pad=1 659 | activation=mish 660 | 661 | [shortcut] 662 | from=-3 663 | activation=linear 664 | 665 | [convolutional] 666 | batch_normalize=1 667 | filters=512 668 | size=1 669 | stride=1 670 | pad=1 671 | activation=mish 672 | 673 | [convolutional] 674 | batch_normalize=1 675 | filters=512 676 | size=3 677 | stride=1 678 | pad=1 679 | activation=mish 680 | 681 | [shortcut] 682 | from=-3 683 | activation=linear 684 | 685 | [convolutional] 686 | batch_normalize=1 687 | filters=512 688 | size=1 689 | stride=1 690 | pad=1 691 | activation=mish 692 | 693 | [convolutional] 694 | batch_normalize=1 695 | filters=512 696 | size=3 697 | stride=1 698 | pad=1 699 | activation=mish 700 | 701 | [shortcut] 702 | from=-3 703 | activation=linear 704 | 705 | [convolutional] 706 | batch_normalize=1 707 | filters=512 708 | size=1 709 | stride=1 710 | pad=1 711 | activation=mish 712 | 713 | [convolutional] 714 | batch_normalize=1 715 | filters=512 716 | size=3 717 | stride=1 718 | pad=1 719 | activation=mish 720 | 721 | [shortcut] 722 | from=-3 723 | activation=linear 724 | 725 | [convolutional] 726 | batch_normalize=1 727 | filters=512 728 | size=1 729 | stride=1 730 | pad=1 731 | activation=mish 732 | 733 | [route] 734 | layers = -1,-16 735 | 736 | [convolutional] 737 | batch_normalize=1 738 | filters=1024 739 | size=1 740 | stride=1 741 | pad=1 742 | activation=mish 743 | 744 | ########################## 745 | 746 | [convolutional] 747 | batch_normalize=1 748 | filters=512 749 | size=1 750 | stride=1 751 | pad=1 752 | activation=leaky 753 | 754 | [convolutional] 755 | batch_normalize=1 756 | size=3 757 | stride=1 758 | pad=1 759 | filters=1024 760 | activation=leaky 761 | 762 | [convolutional] 763 | batch_normalize=1 764 | filters=512 765 | size=1 766 | stride=1 767 | pad=1 768 | activation=leaky 769 | 770 | ### SPP ### 771 | [maxpool] 772 | stride=1 773 | size=5 774 | 775 | [route] 776 | layers=-2 777 | 778 | [maxpool] 779 | stride=1 780 | size=9 781 | 782 | [route] 783 | layers=-4 784 | 785 | [maxpool] 786 | stride=1 787 | size=13 788 | 789 | [route] 790 | layers=-1,-3,-5,-6 791 | ### End SPP ### 792 | 793 | [convolutional] 794 | batch_normalize=1 795 | filters=512 796 | size=1 797 | stride=1 798 | pad=1 799 | activation=leaky 800 | 801 | [convolutional] 802 | batch_normalize=1 803 | size=3 804 | stride=1 805 | pad=1 806 | filters=1024 807 | activation=leaky 808 | 809 | [convolutional] 810 | batch_normalize=1 811 | filters=512 812 | size=1 813 | stride=1 814 | pad=1 815 | activation=leaky 816 | 817 | [convolutional] 818 | batch_normalize=1 819 | filters=256 820 | size=1 821 | stride=1 822 | pad=1 823 | activation=leaky 824 | 825 | [upsample] 826 | stride=2 827 | 828 | [route] 829 | layers = 85 830 | 831 | [convolutional] 832 | batch_normalize=1 833 | filters=256 834 | size=1 835 | stride=1 836 | pad=1 837 | activation=leaky 838 | 839 | [route] 840 | layers = -1, -3 841 | 842 | [convolutional] 843 | batch_normalize=1 844 | filters=256 845 | size=1 846 | stride=1 847 | pad=1 848 | activation=leaky 849 | 850 | [convolutional] 851 | batch_normalize=1 852 | size=3 853 | stride=1 854 | pad=1 855 | filters=512 856 | activation=leaky 857 | 858 | [convolutional] 859 | batch_normalize=1 860 | filters=256 861 | size=1 862 | stride=1 863 | pad=1 864 | activation=leaky 865 | 866 | 
[convolutional] 867 | batch_normalize=1 868 | size=3 869 | stride=1 870 | pad=1 871 | filters=512 872 | activation=leaky 873 | 874 | [convolutional] 875 | batch_normalize=1 876 | filters=256 877 | size=1 878 | stride=1 879 | pad=1 880 | activation=leaky 881 | 882 | [convolutional] 883 | batch_normalize=1 884 | filters=128 885 | size=1 886 | stride=1 887 | pad=1 888 | activation=leaky 889 | 890 | [upsample] 891 | stride=2 892 | 893 | [route] 894 | layers = 54 895 | 896 | [convolutional] 897 | batch_normalize=1 898 | filters=128 899 | size=1 900 | stride=1 901 | pad=1 902 | activation=leaky 903 | 904 | [route] 905 | layers = -1, -3 906 | 907 | [convolutional] 908 | batch_normalize=1 909 | filters=128 910 | size=1 911 | stride=1 912 | pad=1 913 | activation=leaky 914 | 915 | [convolutional] 916 | batch_normalize=1 917 | size=3 918 | stride=1 919 | pad=1 920 | filters=256 921 | activation=leaky 922 | 923 | [convolutional] 924 | batch_normalize=1 925 | filters=128 926 | size=1 927 | stride=1 928 | pad=1 929 | activation=leaky 930 | 931 | [convolutional] 932 | batch_normalize=1 933 | size=3 934 | stride=1 935 | pad=1 936 | filters=256 937 | activation=leaky 938 | 939 | [convolutional] 940 | batch_normalize=1 941 | filters=128 942 | size=1 943 | stride=1 944 | pad=1 945 | activation=leaky 946 | 947 | ########################## 948 | 949 | [convolutional] 950 | batch_normalize=1 951 | size=3 952 | stride=1 953 | pad=1 954 | filters=256 955 | activation=leaky 956 | 957 | [convolutional] 958 | size=1 959 | stride=1 960 | pad=1 961 | filters=255 962 | activation=linear 963 | 964 | 965 | [yolo] 966 | mask = 0,1,2 967 | anchors = 12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401 968 | classes=80 969 | num=9 970 | jitter=.3 971 | ignore_thresh = .7 972 | truth_thresh = 1 973 | scale_x_y = 1.2 974 | iou_thresh=0.213 975 | cls_normalizer=1.0 976 | iou_normalizer=0.07 977 | iou_loss=ciou 978 | nms_kind=greedynms 979 | beta_nms=0.6 980 | max_delta=5 981 | 982 | 983 | [route] 984 | layers = -4 985 | 986 | [convolutional] 987 | batch_normalize=1 988 | size=3 989 | stride=2 990 | pad=1 991 | filters=256 992 | activation=leaky 993 | 994 | [route] 995 | layers = -1, -16 996 | 997 | [convolutional] 998 | batch_normalize=1 999 | filters=256 1000 | size=1 1001 | stride=1 1002 | pad=1 1003 | activation=leaky 1004 | 1005 | [convolutional] 1006 | batch_normalize=1 1007 | size=3 1008 | stride=1 1009 | pad=1 1010 | filters=512 1011 | activation=leaky 1012 | 1013 | [convolutional] 1014 | batch_normalize=1 1015 | filters=256 1016 | size=1 1017 | stride=1 1018 | pad=1 1019 | activation=leaky 1020 | 1021 | [convolutional] 1022 | batch_normalize=1 1023 | size=3 1024 | stride=1 1025 | pad=1 1026 | filters=512 1027 | activation=leaky 1028 | 1029 | [convolutional] 1030 | batch_normalize=1 1031 | filters=256 1032 | size=1 1033 | stride=1 1034 | pad=1 1035 | activation=leaky 1036 | 1037 | [convolutional] 1038 | batch_normalize=1 1039 | size=3 1040 | stride=1 1041 | pad=1 1042 | filters=512 1043 | activation=leaky 1044 | 1045 | [convolutional] 1046 | size=1 1047 | stride=1 1048 | pad=1 1049 | filters=255 1050 | activation=linear 1051 | 1052 | 1053 | [yolo] 1054 | mask = 3,4,5 1055 | anchors = 12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401 1056 | classes=80 1057 | num=9 1058 | jitter=.3 1059 | ignore_thresh = .7 1060 | truth_thresh = 1 1061 | scale_x_y = 1.1 1062 | iou_thresh=0.213 1063 | cls_normalizer=1.0 1064 | iou_normalizer=0.07 1065 | iou_loss=ciou 1066 | 
nms_kind=greedynms 1067 | beta_nms=0.6 1068 | max_delta=5 1069 | 1070 | 1071 | [route] 1072 | layers = -4 1073 | 1074 | [convolutional] 1075 | batch_normalize=1 1076 | size=3 1077 | stride=2 1078 | pad=1 1079 | filters=512 1080 | activation=leaky 1081 | 1082 | [route] 1083 | layers = -1, -37 1084 | 1085 | [convolutional] 1086 | batch_normalize=1 1087 | filters=512 1088 | size=1 1089 | stride=1 1090 | pad=1 1091 | activation=leaky 1092 | 1093 | [convolutional] 1094 | batch_normalize=1 1095 | size=3 1096 | stride=1 1097 | pad=1 1098 | filters=1024 1099 | activation=leaky 1100 | 1101 | [convolutional] 1102 | batch_normalize=1 1103 | filters=512 1104 | size=1 1105 | stride=1 1106 | pad=1 1107 | activation=leaky 1108 | 1109 | [convolutional] 1110 | batch_normalize=1 1111 | size=3 1112 | stride=1 1113 | pad=1 1114 | filters=1024 1115 | activation=leaky 1116 | 1117 | [convolutional] 1118 | batch_normalize=1 1119 | filters=512 1120 | size=1 1121 | stride=1 1122 | pad=1 1123 | activation=leaky 1124 | 1125 | [convolutional] 1126 | batch_normalize=1 1127 | size=3 1128 | stride=1 1129 | pad=1 1130 | filters=1024 1131 | activation=leaky 1132 | 1133 | [convolutional] 1134 | size=1 1135 | stride=1 1136 | pad=1 1137 | filters=255 1138 | activation=linear 1139 | 1140 | 1141 | [yolo] 1142 | mask = 6,7,8 1143 | anchors = 12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401 1144 | classes=80 1145 | num=9 1146 | jitter=.3 1147 | ignore_thresh = .7 1148 | truth_thresh = 1 1149 | random=1 1150 | scale_x_y = 1.05 1151 | iou_thresh=0.213 1152 | cls_normalizer=1.0 1153 | iou_normalizer=0.07 1154 | iou_loss=ciou 1155 | nms_kind=greedynms 1156 | beta_nms=0.6 1157 | max_delta=5 1158 | -------------------------------------------------------------------------------- /data/coco.names: -------------------------------------------------------------------------------- 1 | person 2 | bicycle 3 | car 4 | motorbike 5 | aeroplane 6 | bus 7 | train 8 | truck 9 | boat 10 | traffic light 11 | fire hydrant 12 | stop sign 13 | parking meter 14 | bench 15 | bird 16 | cat 17 | dog 18 | horse 19 | sheep 20 | cow 21 | elephant 22 | bear 23 | zebra 24 | giraffe 25 | backpack 26 | umbrella 27 | handbag 28 | tie 29 | suitcase 30 | frisbee 31 | skis 32 | snowboard 33 | sports ball 34 | kite 35 | baseball bat 36 | baseball glove 37 | skateboard 38 | surfboard 39 | tennis racket 40 | bottle 41 | wine glass 42 | cup 43 | fork 44 | knife 45 | spoon 46 | bowl 47 | banana 48 | apple 49 | sandwich 50 | orange 51 | broccoli 52 | carrot 53 | hot dog 54 | pizza 55 | donut 56 | cake 57 | chair 58 | sofa 59 | pottedplant 60 | bed 61 | diningtable 62 | toilet 63 | tvmonitor 64 | laptop 65 | mouse 66 | remote 67 | keyboard 68 | cell phone 69 | microwave 70 | oven 71 | toaster 72 | sink 73 | refrigerator 74 | book 75 | clock 76 | vase 77 | scissors 78 | teddy bear 79 | hair drier 80 | toothbrush 81 | -------------------------------------------------------------------------------- /data/dog.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/roboflow/pytorch-YOLOv4/b6189b304b9a60a15bc0a9c3627ec4973fe0e2b5/data/dog.jpg -------------------------------------------------------------------------------- /data/giraffe.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/roboflow/pytorch-YOLOv4/b6189b304b9a60a15bc0a9c3627ec4973fe0e2b5/data/giraffe.jpg 
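The three [yolo] sections in cfg/yolov4.cfg (and cfg/yolov4-custom.cfg) share a single list of nine anchors; mask = 0,1,2 / 3,4,5 / 6,7,8 selects the three anchors used at each detection scale, and the linear-activation convolution directly before each [yolo] section has filters = 3 x (classes + 5) = 255 for the 80 classes listed in data/coco.names. The sketch below is minimal and illustrative, not the repository's own loader (tool/darknet2pytorch.py, used by demo.py, builds the actual network from these cfg files); parse_darknet_cfg and its dict-per-section output are assumptions made for this example.

def parse_darknet_cfg(path):
    """Split a Darknet-style cfg (e.g. cfg/yolov4.cfg) into a list of {'type': ..., key: value} dicts."""
    sections = []
    with open(path, 'r') as f:
        for raw in f:
            line = raw.split('#', 1)[0].strip()          # drop comments and blank lines
            if not line:
                continue
            if line.startswith('[') and line.endswith(']'):
                sections.append({'type': line[1:-1]})    # start a new section, e.g. 'yolo'
            elif '=' in line and sections:
                key, value = (part.strip() for part in line.split('=', 1))
                sections[-1][key] = value
    return sections

if __name__ == '__main__':
    blocks = parse_darknet_cfg('cfg/yolov4.cfg')
    for block in blocks:
        if block['type'] != 'yolo':
            continue
        mask = block['mask'].split(',')                  # e.g. ['0', '1', '2']
        classes = int(block['classes'])                  # 80 for COCO
        # each of the len(mask) anchors predicts (x, y, w, h, objectness) plus one score per class
        print(block['mask'], '-> filters before this head =', len(mask) * (classes + 5))  # 3 * 85 = 255

For a custom dataset this is the invariant to keep: change classes in every [yolo] section and set filters in the convolution immediately above it to 3 * (classes + 5).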
-------------------------------------------------------------------------------- /data/prediction.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/roboflow/pytorch-YOLOv4/b6189b304b9a60a15bc0a9c3627ec4973fe0e2b5/data/prediction.jpg -------------------------------------------------------------------------------- /data/voc.names: -------------------------------------------------------------------------------- 1 | aeroplane 2 | bicycle 3 | bird 4 | boat 5 | bottle 6 | bus 7 | car 8 | cat 9 | chair 10 | cow 11 | diningtable 12 | dog 13 | horse 14 | motorbike 15 | person 16 | pottedplant 17 | sheep 18 | sofa 19 | train 20 | tvmonitor 21 | -------------------------------------------------------------------------------- /dataset.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ''' 3 | @Time : 2020/05/06 21:09 4 | @Author : Tianxiaomo 5 | @File : dataset.py 6 | @Noice : 7 | @Modificattion : 8 | @Author : 9 | @Time : 10 | @Detail : 11 | 12 | ''' 13 | from torch.utils.data.dataset import Dataset 14 | 15 | import random 16 | import cv2 17 | import sys 18 | import numpy as np 19 | import os 20 | import matplotlib.pyplot as plt 21 | 22 | 23 | def rand_uniform_strong(min, max): 24 | if min > max: 25 | swap = min 26 | min = max 27 | max = swap 28 | return random.random() * (max - min) + min 29 | 30 | 31 | def rand_scale(s): 32 | scale = rand_uniform_strong(1, s) 33 | if random.randint(0, 1) % 2: 34 | return scale 35 | return 1. / scale 36 | 37 | 38 | def rand_precalc_random(min, max, random_part): 39 | if max < min: 40 | swap = min 41 | min = max 42 | max = swap 43 | return (random_part * (max - min)) + min 44 | 45 | 46 | def fill_truth_detection(bboxes, num_boxes, classes, flip, dx, dy, sx, sy, net_w, net_h): 47 | if bboxes.shape[0] == 0: 48 | return bboxes, 10000 49 | np.random.shuffle(bboxes) 50 | bboxes[:, 0] -= dx 51 | bboxes[:, 2] -= dx 52 | bboxes[:, 1] -= dy 53 | bboxes[:, 3] -= dy 54 | 55 | bboxes[:, 0] = np.clip(bboxes[:, 0], 0, sx) 56 | bboxes[:, 2] = np.clip(bboxes[:, 2], 0, sx) 57 | 58 | bboxes[:, 1] = np.clip(bboxes[:, 1], 0, sy) 59 | bboxes[:, 3] = np.clip(bboxes[:, 3], 0, sy) 60 | 61 | out_box = list(np.where(((bboxes[:, 1] == sy) & (bboxes[:, 3] == sy)) | 62 | ((bboxes[:, 0] == sx) & (bboxes[:, 2] == sx)) | 63 | ((bboxes[:, 1] == 0) & (bboxes[:, 3] == 0)) | 64 | ((bboxes[:, 0] == 0) & (bboxes[:, 2] == 0)))[0]) 65 | list_box = list(range(bboxes.shape[0])) 66 | for i in out_box: 67 | list_box.remove(i) 68 | bboxes = bboxes[list_box] 69 | 70 | if bboxes.shape[0] == 0: 71 | return bboxes, 10000 72 | 73 | bboxes = bboxes[np.where((bboxes[:, 4] < classes) & (bboxes[:, 4] >= 0))[0]] 74 | 75 | if bboxes.shape[0] > num_boxes: 76 | bboxes = bboxes[:num_boxes] 77 | 78 | min_w_h = np.array([bboxes[:, 2] - bboxes[:, 0], bboxes[:, 3] - bboxes[:, 1]]).min() 79 | 80 | bboxes[:, 0] *= (net_w / sx) 81 | bboxes[:, 2] *= (net_w / sx) 82 | bboxes[:, 1] *= (net_h / sy) 83 | bboxes[:, 3] *= (net_h / sy) 84 | 85 | if flip: 86 | temp = net_w - bboxes[:, 0] 87 | bboxes[:, 0] = net_w - bboxes[:, 2] 88 | bboxes[:, 2] = temp 89 | 90 | return bboxes, min_w_h 91 | 92 | 93 | def rect_intersection(a, b): 94 | minx = max(a[0], b[0]) 95 | miny = max(a[1], b[1]) 96 | 97 | maxx = min(a[2], b[2]) 98 | maxy = min(a[3], b[3]) 99 | return [minx, miny, maxx, maxy] 100 | 101 | 102 | def image_data_augmentation(mat, w, h, pleft, ptop, swidth, sheight, flip, dhue, dsat, dexp, gaussian_noise, 
blur, 103 | truth): 104 | try: 105 | img = mat 106 | oh, ow, _ = img.shape 107 | pleft, ptop, swidth, sheight = int(pleft), int(ptop), int(swidth), int(sheight) 108 | # crop 109 | src_rect = [pleft, ptop, swidth + pleft, sheight + ptop] # x1,y1,x2,y2 110 | img_rect = [0, 0, ow, oh] 111 | new_src_rect = rect_intersection(src_rect, img_rect) # 交集 112 | 113 | dst_rect = [max(0, -pleft), max(0, -ptop), max(0, -pleft) + new_src_rect[2] - new_src_rect[0], 114 | max(0, -ptop) + new_src_rect[3] - new_src_rect[1]] 115 | # cv2.Mat sized 116 | 117 | if (src_rect[0] == 0 and src_rect[1] == 0 and src_rect[2] == img.shape[0] and src_rect[3] == img.shape[1]): 118 | sized = cv2.resize(img, (w, h), cv2.INTER_LINEAR) 119 | else: 120 | cropped = np.zeros([sheight, swidth, 3]) 121 | cropped[:, :, ] = np.mean(img, axis=(0, 1)) 122 | 123 | cropped[dst_rect[1]:dst_rect[3], dst_rect[0]:dst_rect[2]] = \ 124 | img[new_src_rect[1]:new_src_rect[3], new_src_rect[0]:new_src_rect[2]] 125 | 126 | # resize 127 | sized = cv2.resize(cropped, (w, h), cv2.INTER_LINEAR) 128 | 129 | # flip 130 | if flip: 131 | # cv2.Mat cropped 132 | sized = cv2.flip(sized, 1) # 0 - x-axis, 1 - y-axis, -1 - both axes (x & y) 133 | 134 | # HSV augmentation 135 | # cv2.COLOR_BGR2HSV, cv2.COLOR_RGB2HSV, cv2.COLOR_HSV2BGR, cv2.COLOR_HSV2RGB 136 | if dsat != 1 or dexp != 1 or dhue != 0: 137 | if img.shape[2] >= 3: 138 | hsv_src = cv2.cvtColor(sized.astype(np.float32), cv2.COLOR_RGB2HSV) # RGB to HSV 139 | hsv = cv2.split(hsv_src) 140 | hsv[1] *= dsat 141 | hsv[2] *= dexp 142 | hsv[0] += 179 * dhue 143 | hsv_src = cv2.merge(hsv) 144 | sized = np.clip(cv2.cvtColor(hsv_src, cv2.COLOR_HSV2RGB), 0, 255) # HSV to RGB (the same as previous) 145 | else: 146 | sized *= dexp 147 | 148 | if blur: 149 | if blur == 1: 150 | dst = cv2.GaussianBlur(sized, (17, 17), 0) 151 | # cv2.bilateralFilter(sized, dst, 17, 75, 75) 152 | else: 153 | ksize = (blur / 2) * 2 + 1 154 | dst = cv2.GaussianBlur(sized, (ksize, ksize), 0) 155 | 156 | if blur == 1: 157 | img_rect = [0, 0, sized.cols, sized.rows] 158 | for b in truth: 159 | left = (b.x - b.w / 2.) * sized.shape[1] 160 | width = b.w * sized.shape[1] 161 | top = (b.y - b.h / 2.) 
* sized.shape[0] 162 | height = b.h * sized.shape[0] 163 | roi(left, top, width, height) 164 | roi = roi & img_rect 165 | dst[roi[0]:roi[0] + roi[2], roi[1]:roi[1] + roi[3]] = sized[roi[0]:roi[0] + roi[2], 166 | roi[1]:roi[1] + roi[3]] 167 | 168 | sized = dst 169 | 170 | if gaussian_noise: 171 | noise = np.array(sized.shape) 172 | gaussian_noise = min(gaussian_noise, 127) 173 | gaussian_noise = max(gaussian_noise, 0) 174 | cv2.randn(noise, 0, gaussian_noise) # mean and variance 175 | sized = sized + noise 176 | except: 177 | print("OpenCV can't augment image: " + str(w) + " x " + str(h)) 178 | sized = mat 179 | 180 | return sized 181 | 182 | 183 | def filter_truth(bboxes, dx, dy, sx, sy, xd, yd): 184 | bboxes[:, 0] -= dx 185 | bboxes[:, 2] -= dx 186 | bboxes[:, 1] -= dy 187 | bboxes[:, 3] -= dy 188 | 189 | bboxes[:, 0] = np.clip(bboxes[:, 0], 0, sx) 190 | bboxes[:, 2] = np.clip(bboxes[:, 2], 0, sx) 191 | 192 | bboxes[:, 1] = np.clip(bboxes[:, 1], 0, sy) 193 | bboxes[:, 3] = np.clip(bboxes[:, 3], 0, sy) 194 | 195 | out_box = list(np.where(((bboxes[:, 1] == sy) & (bboxes[:, 3] == sy)) | 196 | ((bboxes[:, 0] == sx) & (bboxes[:, 2] == sx)) | 197 | ((bboxes[:, 1] == 0) & (bboxes[:, 3] == 0)) | 198 | ((bboxes[:, 0] == 0) & (bboxes[:, 2] == 0)))[0]) 199 | list_box = list(range(bboxes.shape[0])) 200 | for i in out_box: 201 | list_box.remove(i) 202 | bboxes = bboxes[list_box] 203 | 204 | bboxes[:, 0] += xd 205 | bboxes[:, 2] += xd 206 | bboxes[:, 1] += yd 207 | bboxes[:, 3] += yd 208 | 209 | return bboxes 210 | 211 | 212 | def blend_truth_mosaic(out_img, img, bboxes, w, h, cut_x, cut_y, i_mixup, 213 | left_shift, right_shift, top_shift, bot_shift): 214 | left_shift = min(left_shift, w - cut_x) 215 | top_shift = min(top_shift, h - cut_y) 216 | right_shift = min(right_shift, cut_x) 217 | bot_shift = min(bot_shift, cut_y) 218 | 219 | if i_mixup == 0: 220 | bboxes = filter_truth(bboxes, left_shift, top_shift, cut_x, cut_y, 0, 0) 221 | out_img[:cut_y, :cut_x] = img[top_shift:top_shift + cut_y, left_shift:left_shift + cut_x] 222 | if i_mixup == 1: 223 | bboxes = filter_truth(bboxes, cut_x - right_shift, top_shift, w - cut_x, cut_y, cut_x, 0) 224 | out_img[:cut_y, cut_x:] = img[top_shift:top_shift + cut_y, cut_x - right_shift:w - right_shift] 225 | if i_mixup == 2: 226 | bboxes = filter_truth(bboxes, left_shift, cut_y - bot_shift, cut_x, h - cut_y, 0, cut_y) 227 | out_img[cut_y:, :cut_x] = img[cut_y - bot_shift:h - bot_shift, left_shift:left_shift + cut_x] 228 | if i_mixup == 3: 229 | bboxes = filter_truth(bboxes, cut_x - right_shift, cut_y - bot_shift, w - cut_x, h - cut_y, cut_x, cut_y) 230 | out_img[cut_y:, cut_x:] = img[cut_y - bot_shift:h - bot_shift, cut_x - right_shift:w - right_shift] 231 | 232 | return out_img, bboxes 233 | 234 | 235 | def draw_box(img, bboxes): 236 | for b in bboxes: 237 | img = cv2.rectangle(img, (b[0], b[1]), (b[2], b[3]), (0, 255, 0), 2) 238 | return img 239 | 240 | 241 | class Yolo_dataset(Dataset): 242 | def __init__(self, lable_path, cfg): 243 | super(Yolo_dataset, self).__init__() 244 | if cfg.mixup == 2: 245 | print("cutmix=1 - isn't supported for Detector") 246 | raise 247 | elif cfg.mixup == 2 and cfg.letter_box: 248 | print("Combination: letter_box=1 & mosaic=1 - isn't supported, use only 1 of these parameters") 249 | raise 250 | 251 | self.cfg = cfg 252 | 253 | truth = {} 254 | f = open(lable_path, 'r', encoding='utf-8') 255 | for line in f.readlines(): 256 | data = line.split(" ") 257 | truth[data[0]] = [] 258 | for i in data[1:]: 259 | truth[data[0]].append([int(j) 
for j in i.split(',')]) 260 | 261 | self.truth = truth 262 | 263 | def __len__(self): 264 | return len(self.truth.keys()) 265 | 266 | def __getitem__(self, index): 267 | img_path = list(self.truth.keys())[index] 268 | bboxes = np.array(self.truth.get(img_path), dtype=np.float) 269 | img_path = os.path.join(self.cfg.dataset_dir, img_path) 270 | use_mixup = self.cfg.mixup 271 | if random.randint(0, 1): 272 | use_mixup = 0 273 | 274 | if use_mixup == 3: 275 | min_offset = 0.2 276 | cut_x = random.randint(int(self.cfg.w * min_offset), int(self.cfg.w * (1 - min_offset))) 277 | cut_y = random.randint(int(self.cfg.h * min_offset), int(self.cfg.h * (1 - min_offset))) 278 | 279 | r1, r2, r3, r4, r_scale = 0, 0, 0, 0, 0 280 | dhue, dsat, dexp, flip, blur = 0, 0, 0, 0, 0 281 | gaussian_noise = 0 282 | 283 | out_img = np.zeros([self.cfg.h, self.cfg.w, 3]) 284 | out_bboxes = [] 285 | 286 | for i in range(use_mixup + 1): 287 | if i != 0: 288 | img_path = random.choice(list(self.truth.keys())) 289 | bboxes = np.array(self.truth.get(img_path), dtype=np.float) 290 | img_path = os.path.join(self.cfg.dataset_dir, img_path) 291 | img = cv2.imread(img_path) 292 | if img is None: 293 | continue 294 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 295 | oh, ow, oc = img.shape 296 | dh, dw, dc = np.array(np.array([oh, ow, oc]) * self.cfg.jitter, dtype=np.int) 297 | 298 | dhue = rand_uniform_strong(-self.cfg.hue, self.cfg.hue) 299 | dsat = rand_scale(self.cfg.saturation) 300 | dexp = rand_scale(self.cfg.exposure) 301 | 302 | pleft = random.randint(-dw, dw) 303 | pright = random.randint(-dw, dw) 304 | ptop = random.randint(-dh, dh) 305 | pbot = random.randint(-dh, dh) 306 | 307 | flip = random.randint(0, 1) if self.cfg.flip else 0 308 | 309 | if (self.cfg.blur): 310 | tmp_blur = random.randint(0, 2) # 0 - disable, 1 - blur background, 2 - blur the whole image 311 | if tmp_blur == 0: 312 | blur = 0 313 | elif tmp_blur == 1: 314 | blur = 1 315 | else: 316 | blur = self.cfg.blur 317 | 318 | if self.cfg.gaussian and random.randint(0, 1): 319 | gaussian_noise = self.cfg.gaussian 320 | else: 321 | gaussian_noise = 0 322 | 323 | if self.cfg.letter_box: 324 | img_ar = ow / oh 325 | net_ar = self.cfg.w / self.cfg.h 326 | result_ar = img_ar / net_ar 327 | # print(" ow = %d, oh = %d, w = %d, h = %d, img_ar = %f, net_ar = %f, result_ar = %f \n", ow, oh, w, h, img_ar, net_ar, result_ar); 328 | if result_ar > 1: # sheight - should be increased 329 | oh_tmp = ow / net_ar 330 | delta_h = (oh_tmp - oh) / 2 331 | ptop = ptop - delta_h 332 | pbot = pbot - delta_h 333 | # print(" result_ar = %f, oh_tmp = %f, delta_h = %d, ptop = %f, pbot = %f \n", result_ar, oh_tmp, delta_h, ptop, pbot); 334 | else: # swidth - should be increased 335 | ow_tmp = oh * net_ar 336 | delta_w = (ow_tmp - ow) / 2 337 | pleft = pleft - delta_w 338 | pright = pright - delta_w 339 | # printf(" result_ar = %f, ow_tmp = %f, delta_w = %d, pleft = %f, pright = %f \n", result_ar, ow_tmp, delta_w, pleft, pright); 340 | 341 | swidth = ow - pleft - pright 342 | sheight = oh - ptop - pbot 343 | 344 | truth, min_w_h = fill_truth_detection(bboxes, self.cfg.boxes, self.cfg.classes, flip, pleft, ptop, swidth, 345 | sheight, self.cfg.w, self.cfg.h) 346 | if (min_w_h / 8) < blur and blur > 1: # disable blur if one of the objects is too small 347 | blur = min_w_h / 8 348 | 349 | ai = image_data_augmentation(img, self.cfg.w, self.cfg.h, pleft, ptop, swidth, sheight, flip, 350 | dhue, dsat, dexp, gaussian_noise, blur, truth) 351 | 352 | if use_mixup == 0: 353 | out_img = ai 354 | 
out_bboxes = truth 355 | if use_mixup == 1: 356 | if i == 0: 357 | old_img = ai.copy() 358 | old_truth = truth.copy() 359 | elif i == 1: 360 | out_img = cv2.addWeighted(ai, 0.5, old_img, 0.5) 361 | out_bboxes = np.concatenate([old_truth, truth], axis=0) 362 | elif use_mixup == 3: 363 | if flip: 364 | tmp = pleft 365 | pleft = pright 366 | pright = tmp 367 | 368 | left_shift = int(min(cut_x, max(0, (-int(pleft) * self.cfg.w / swidth)))) 369 | top_shift = int(min(cut_y, max(0, (-int(ptop) * self.cfg.h / sheight)))) 370 | 371 | right_shift = int(min((self.cfg.w - cut_x), max(0, (-int(pright) * self.cfg.w / swidth)))) 372 | bot_shift = int(min(self.cfg.h - cut_y, max(0, (-int(pbot) * self.cfg.h / sheight)))) 373 | 374 | out_img, out_bbox = blend_truth_mosaic(out_img, ai, truth.copy(), self.cfg.w, self.cfg.h, cut_x, 375 | cut_y, i, left_shift, right_shift, top_shift, bot_shift) 376 | out_bboxes.append(out_bbox) 377 | # print(img_path) 378 | if use_mixup == 3: 379 | out_bboxes = np.concatenate(out_bboxes, axis=0) 380 | out_bboxes1 = np.zeros([self.cfg.boxes, 5]) 381 | try: 382 | out_bboxes1[:min(out_bboxes.shape[0], self.cfg.boxes)] = out_bboxes[:min(out_bboxes.shape[0], self.cfg.boxes)] 383 | except AttributeError: 384 | out_bboxes = np.array(out_bboxes.astype(object), dtype=np.float32) 385 | out_bboxes1[:min(out_bboxes.shape[0], self.cfg.boxes)] = out_bboxes[:min(out_bboxes.shape[0], self.cfg.boxes)] 386 | return out_img, out_bboxes1 387 | 388 | 389 | if __name__ == "__main__": 390 | from cfg import Cfg 391 | 392 | random.seed(2020) 393 | np.random.seed(2020) 394 | Cfg.dataset_dir = '/mnt/e/Dataset' 395 | dataset = Yolo_dataset(Cfg.train_label, Cfg) 396 | for i in range(100): 397 | out_img, out_bboxes = dataset.__getitem__(i) 398 | a = draw_box(out_img.copy(), out_bboxes.astype(np.int32)) 399 | plt.imshow(a.astype(np.int32)) 400 | plt.show() 401 | -------------------------------------------------------------------------------- /demo.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ''' 3 | @Time : 20/04/25 15:49 4 | @Author : huguanghao 5 | @File : demo.py 6 | @Noice : 7 | @Modificattion : 8 | @Author : 9 | @Time : 10 | @Detail : 11 | ''' 12 | 13 | # import sys 14 | # import time 15 | # from PIL import Image, ImageDraw 16 | # from models.tiny_yolo import TinyYoloNet 17 | from tool.utils import * 18 | from tool.darknet2pytorch import Darknet 19 | import argparse 20 | 21 | """hyper parameters""" 22 | use_cuda = True 23 | num_classes = 80 24 | if num_classes == 20: 25 | namesfile = 'data/voc.names' 26 | elif num_classes == 80: 27 | namesfile = 'data/coco.names' 28 | else: 29 | namesfile = 'data/x.names' 30 | 31 | 32 | def detect(cfgfile, weightfile, imgfile): 33 | m = Darknet(cfgfile) 34 | 35 | m.print_network() 36 | m.load_weights(weightfile) 37 | print('Loading weights from %s... Done!' % (weightfile)) 38 | 39 | if use_cuda: 40 | m.cuda() 41 | 42 | img = Image.open(imgfile).convert('RGB') 43 | sized = img.resize((m.width, m.height)) 44 | 45 | for i in range(2): 46 | start = time.time() 47 | boxes = do_detect(m, sized, 0.5, num_classes, 0.4, use_cuda) 48 | finish = time.time() 49 | if i == 1: 50 | print('%s: Predicted in %f seconds.' 
% (imgfile, (finish - start))) 51 | 52 | class_names = load_class_names(namesfile) 53 | plot_boxes(img, boxes, 'predictions.jpg', class_names) 54 | 55 | 56 | def detect_imges(cfgfile, weightfile, imgfile_list=['data/dog.jpg', 'data/giraffe.jpg']): 57 | m = Darknet(cfgfile) 58 | 59 | m.print_network() 60 | m.load_weights(weightfile) 61 | print('Loading weights from %s... Done!' % (weightfile)) 62 | 63 | if use_cuda: 64 | m.cuda() 65 | 66 | imges = [] 67 | imges_list = [] 68 | for imgfile in imgfile_list: 69 | img = Image.open(imgfile).convert('RGB') 70 | imges_list.append(img) 71 | sized = img.resize((m.width, m.height)) 72 | imges.append(np.expand_dims(np.array(sized), axis=0)) 73 | 74 | images = np.concatenate(imges, 0) 75 | for i in range(2): 76 | start = time.time() 77 | boxes = do_detect(m, images, 0.5, num_classes, 0.4, use_cuda) 78 | finish = time.time() 79 | if i == 1: 80 | print('%s: Predicted in %f seconds.' % (imgfile, (finish - start))) 81 | 82 | class_names = load_class_names(namesfile) 83 | for i,(img,box) in enumerate(zip(imges_list,boxes)): 84 | plot_boxes(img, box, 'predictions{}.jpg'.format(i), class_names) 85 | 86 | 87 | def detect_cv2(cfgfile, weightfile, imgfile): 88 | import cv2 89 | m = Darknet(cfgfile) 90 | 91 | m.print_network() 92 | m.load_weights(weightfile) 93 | print('Loading weights from %s... Done!' % (weightfile)) 94 | 95 | if use_cuda: 96 | m.cuda() 97 | 98 | img = cv2.imread(imgfile) 99 | sized = cv2.resize(img, (m.width, m.height)) 100 | sized = cv2.cvtColor(sized, cv2.COLOR_BGR2RGB) 101 | 102 | for i in range(2): 103 | start = time.time() 104 | boxes = do_detect(m, sized, 0.5, m.num_classes, 0.4, use_cuda) 105 | finish = time.time() 106 | if i == 1: 107 | print('%s: Predicted in %f seconds.' % (imgfile, (finish - start))) 108 | 109 | class_names = load_class_names(namesfile) 110 | plot_boxes_cv2(img, boxes, savename='predictions.jpg', class_names=class_names) 111 | 112 | 113 | def detect_cv2_camera(cfgfile, weightfile): 114 | import cv2 115 | m = Darknet(cfgfile) 116 | 117 | m.print_network() 118 | m.load_weights(weightfile) 119 | print('Loading weights from %s... Done!' % (weightfile)) 120 | 121 | if use_cuda: 122 | m.cuda() 123 | 124 | cap = cv2.VideoCapture(0) 125 | # cap = cv2.VideoCapture("./test.mp4") 126 | cap.set(3, 1280) 127 | cap.set(4, 720) 128 | print("Starting the YOLO loop...") 129 | 130 | while True: 131 | ret, img = cap.read() 132 | sized = cv2.resize(img, (m.width, m.height)) 133 | sized = cv2.cvtColor(sized, cv2.COLOR_BGR2RGB) 134 | 135 | start = time.time() 136 | boxes = do_detect(m, sized, 0.5, num_classes, 0.4, use_cuda) 137 | finish = time.time() 138 | print('Predicted in %f seconds.' % (finish - start)) 139 | 140 | class_names = load_class_names(namesfile) 141 | result_img = plot_boxes_cv2(img, boxes, savename=None, class_names=class_names) 142 | 143 | cv2.imshow('Yolo demo', result_img) 144 | cv2.waitKey(1) 145 | 146 | cap.release() 147 | 148 | 149 | def detect_skimage(cfgfile, weightfile, imgfile): 150 | from skimage import io 151 | from skimage.transform import resize 152 | m = Darknet(cfgfile) 153 | 154 | m.print_network() 155 | m.load_weights(weightfile) 156 | print('Loading weights from %s... Done!' 
% (weightfile)) 157 | 158 | if use_cuda: 159 | m.cuda() 160 | 161 | img = io.imread(imgfile) 162 | sized = resize(img, (m.width, m.height)) * 255 163 | 164 | for i in range(2): 165 | start = time.time() 166 | boxes = do_detect(m, sized, 0.5, m.num_classes, 0.4, use_cuda) 167 | finish = time.time() 168 | if i == 1: 169 | print('%s: Predicted in %f seconds.' % (imgfile, (finish - start))) 170 | 171 | class_names = load_class_names(namesfile) 172 | plot_boxes_cv2(img, boxes, savename='predictions.jpg', class_names=class_names) 173 | 174 | 175 | def get_args(): 176 | parser = argparse.ArgumentParser('Test your image or video by trained model.') 177 | parser.add_argument('-cfgfile', type=str, default='./cfg/yolov4.cfg', 178 | help='path of cfg file', dest='cfgfile') 179 | parser.add_argument('-weightfile', type=str, 180 | default='./checkpoints/Yolov4_epoch1.pth', 181 | help='path of trained model.', dest='weightfile') 182 | parser.add_argument('-imgfile', type=str, 183 | default='./data/mscoco2017/train2017/190109_180343_00154162.jpg', 184 | help='path of your image file.', dest='imgfile') 185 | args = parser.parse_args() 186 | 187 | return args 188 | 189 | 190 | if __name__ == '__main__': 191 | args = get_args() 192 | if args.imgfile: 193 | detect(args.cfgfile, args.weightfile, args.imgfile) 194 | # detect_imges(args.cfgfile, args.weightfile) 195 | # detect_cv2(args.cfgfile, args.weightfile, args.imgfile) 196 | # detect_skimage(args.cfgfile, args.weightfile, args.imgfile) 197 | else: 198 | detect_cv2_camera(args.cfgfile, args.weightfile) 199 | -------------------------------------------------------------------------------- /demo_onnx.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import onnx 3 | import os 4 | import argparse 5 | import numpy as np 6 | import cv2 7 | import onnxruntime 8 | from tool.utils import * 9 | from tool.darknet2onnx import * 10 | 11 | 12 | def main(cfg_file, weight_file, image_path, batch_size): 13 | 14 | # Transform to onnx as specified batch size 15 | fransform_to_onnx(cfg_file, weight_file, batch_size) 16 | # Transform to onnx for demo 17 | onnx_path_demo = fransform_to_onnx(cfg_file, weight_file, 1) 18 | 19 | session = onnxruntime.InferenceSession(onnx_path_demo) 20 | # session = onnx.load(onnx_path) 21 | print("The model expects input shape: ", session.get_inputs()[0].shape) 22 | 23 | image_src = cv2.imread(image_path) 24 | detect(session, image_src) 25 | 26 | 27 | 28 | def detect(session, image_src): 29 | IN_IMAGE_H = session.get_inputs()[0].shape[2] 30 | IN_IMAGE_W = session.get_inputs()[0].shape[3] 31 | 32 | # Input 33 | resized = cv2.resize(image_src, (IN_IMAGE_W, IN_IMAGE_H), interpolation=cv2.INTER_LINEAR) 34 | img_in = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB) 35 | img_in = np.transpose(img_in, (2, 0, 1)).astype(np.float32) 36 | img_in = np.expand_dims(img_in, axis=0) 37 | img_in /= 255.0 38 | print("Shape of the network input: ", img_in.shape) 39 | 40 | # Compute 41 | input_name = session.get_inputs()[0].name 42 | # output, output_exist = session.run(['decoder.output_conv', 'lane_exist.linear2'], {"input.1": image_np}) 43 | 44 | # print(img_in) 45 | 46 | outputs = session.run(None, {input_name: img_in}) 47 | 48 | print(outputs[0].shape) 49 | print(outputs[1].shape) 50 | print(outputs[2].shape) 51 | 52 | # print(outputs[2]) 53 | 54 | num_classes = 80 55 | boxes = post_processing(img_in, 0.5, num_classes, 0.4, outputs) 56 | 57 | if num_classes == 20: 58 | namesfile = 'data/voc.names' 59 | elif 
num_classes == 80: 60 | namesfile = 'data/coco.names' 61 | else: 62 | namesfile = 'data/names' 63 | 64 | class_names = load_class_names(namesfile) 65 | plot_boxes_cv2(image_src, boxes, savename='predictions_onnx.jpg', class_names=class_names) 66 | 67 | 68 | 69 | if __name__ == '__main__': 70 | print("Converting to onnx and running demo ...") 71 | if len(sys.argv) == 5: 72 | cfg_file = sys.argv[1] 73 | weight_file = sys.argv[2] 74 | image_path = sys.argv[3] 75 | batch_size = int(sys.argv[4]) 76 | main(cfg_file, weight_file, image_path, batch_size) 77 | else: 78 | print('Please run this way:\n') 79 | print(' python demo_onnx.py ') 80 | -------------------------------------------------------------------------------- /demo_tensorflow.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import numpy as np 3 | import tensorflow as tf 4 | from tensorflow.python.platform import gfile 5 | 6 | import cv2 7 | from tool.utils import post_processing, load_class_names, plot_boxes_cv2 8 | 9 | 10 | def demo_tensorflow(tfpb_file="./weight/yolov4.pb", image_path=None, print_sensor_name=False): 11 | graph_name = 'yolov4' 12 | tf.compat.v1.disable_eager_execution() 13 | with tf.compat.v1.Session() as persisted_sess: 14 | print("loading graph...") 15 | with gfile.FastGFile(tfpb_file, 'rb') as f: 16 | graph_def = tf.compat.v1.GraphDef() 17 | graph_def.ParseFromString(f.read()) 18 | 19 | persisted_sess.graph.as_default() 20 | tf.import_graph_def(graph_def, name=graph_name) 21 | 22 | # print all sensor_name 23 | if print_sensor_name: 24 | tensor_name_list = [tensor.name for tensor in tf.compat.v1.get_default_graph().as_graph_def().node] 25 | for tensor_name in tensor_name_list: 26 | print(tensor_name) 27 | 28 | inp = persisted_sess.graph.get_tensor_by_name(graph_name + '/' + 'input:0') 29 | print(inp.shape) 30 | out1 = persisted_sess.graph.get_tensor_by_name(graph_name + '/' + 'output_1:0') 31 | out2 = persisted_sess.graph.get_tensor_by_name(graph_name + '/' + 'output_2:0') 32 | out3 = persisted_sess.graph.get_tensor_by_name(graph_name + '/' + 'output_3:0') 33 | print(out1.shape, out2.shape, out3.shape) 34 | 35 | # image_src = np.random.rand(1, 3, 608, 608).astype(np.float32) # input image 36 | # Input 37 | image_src = cv2.imread(image_path) 38 | resized = cv2.resize(image_src, (inp.shape[2], inp.shape[3]), interpolation=cv2.INTER_LINEAR) 39 | img_in = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB) 40 | img_in = np.transpose(img_in, (2, 0, 1)).astype(np.float32) 41 | img_in = np.expand_dims(img_in, axis=0) 42 | img_in /= 255.0 43 | print("Shape of the network input: ", img_in.shape) 44 | 45 | feed_dict = {inp: img_in} 46 | 47 | outputs = persisted_sess.run([out1, out2, out3], feed_dict) 48 | print(outputs[0].shape) 49 | print(outputs[1].shape) 50 | print(outputs[2].shape) 51 | 52 | boxes = post_processing(img_in, 0.4, outputs) 53 | 54 | num_classes = 80 55 | if num_classes == 20: 56 | namesfile = 'data/voc.names' 57 | elif num_classes == 80: 58 | namesfile = 'data/coco.names' 59 | else: 60 | namesfile = 'data/names' 61 | 62 | class_names = load_class_names(namesfile) 63 | result = plot_boxes_cv2(image_src, boxes, savename=None, class_names=class_names) 64 | cv2.imshow("tensorflow predicted", result) 65 | cv2.waitKey() 66 | 67 | 68 | if __name__ == '__main__': 69 | if len(sys.argv) == 1: 70 | sys.argv.append('weight/yolov4.pb') 71 | sys.argv.append('data/dog.jpg') 72 | if len(sys.argv) == 3: 73 | tfpbfile = sys.argv[1] 74 | image_path = sys.argv[2] 75 | 
demo_tensorflow(tfpbfile, image_path) 76 | else: 77 | print('Please execute this script this way:\n') 78 | print(' python demo_tensorflow.py ') 79 | -------------------------------------------------------------------------------- /evaluate_on_coco.py: -------------------------------------------------------------------------------- 1 | """ 2 | A script to evaluate the model's performance using pre-trained weights using COCO API. 3 | Example usage: python evaluate_on_coco.py -dir D:\cocoDataset\val2017\val2017 -gta D:\cocoDataset\annotatio 4 | ns_trainval2017\annotations\instances_val2017.json -c cfg/yolov4-smaller-input.cfg -g 0 5 | Explanation: set where your images can be found using -dir, then use -gta to point to the ground truth annotations file 6 | and finally -c to point to the config file you want to use to load the network using. 7 | """ 8 | 9 | import argparse 10 | import datetime 11 | import json 12 | import logging 13 | import os 14 | import sys 15 | import time 16 | 17 | import numpy as np 18 | import torch 19 | from PIL import Image 20 | from easydict import EasyDict as edict 21 | from pycocotools.coco import COCO 22 | from pycocotools.cocoeval import COCOeval 23 | 24 | from cfg import Cfg 25 | from tool.darknet2pytorch import Darknet 26 | from tool.utils import do_detect 27 | 28 | 29 | def convert_cat_id(single_annotation): 30 | cat = single_annotation['category_id'] 31 | if cat >= 1 and cat <= 11: 32 | cat = cat + 1 33 | elif cat >= 13 and cat <= 25: 34 | cat = cat + 2 35 | elif cat >= 27 and cat <= 28: 36 | cat = cat + 3 37 | elif cat >= 31 and cat <= 44: 38 | cat = cat + 5 39 | elif cat >= 46 and cat <= 65: 40 | cat = cat + 6 41 | elif cat == 67: 42 | cat = cat + 7 43 | elif cat == 70: 44 | cat = cat + 9 45 | elif cat >= 72 and cat <= 82: 46 | cat = cat + 10 47 | elif cat >= 84 and cat <= 90: 48 | cat = cat + 11 49 | single_annotation['category_id'] = cat 50 | return single_annotation 51 | 52 | 53 | def myconverter(obj): 54 | if isinstance(obj, np.integer): 55 | return int(obj) 56 | elif isinstance(obj, np.floating): 57 | return float(obj) 58 | elif isinstance(obj, np.ndarray): 59 | return obj.tolist() 60 | elif isinstance(obj, datetime.datetime): 61 | return obj.__str__() 62 | else: 63 | return obj 64 | 65 | 66 | def evaluate_on_coco(cfg, resFile): 67 | annType = "bbox" # specify type here 68 | with open(resFile, 'r') as f: 69 | unsorted_annotations = json.load(f) 70 | sorted_annotations = list(sorted(unsorted_annotations, key=lambda single_annotation: single_annotation["image_id"])) 71 | sorted_annotations = list(map(convert_cat_id, sorted_annotations)) 72 | 73 | with open('temp.json', 'w') as f: 74 | json.dump(sorted_annotations, f) 75 | 76 | cocoGt = COCO(cfg.gt_annotations_path) 77 | cocoDt = cocoGt.loadRes('temp.json') 78 | imgIds = sorted(cocoGt.getImgIds()) 79 | cocoEval = COCOeval(cocoGt, cocoDt, annType) 80 | cocoEval.params.imgIds = imgIds 81 | cocoEval.evaluate() 82 | cocoEval.accumulate() 83 | cocoEval.summarize() 84 | 85 | 86 | def test(model, annotations, cfg): 87 | if not annotations["images"]: 88 | print("Annotations do not have 'images' key") 89 | return 90 | images = annotations["images"] 91 | images = images[:10] 92 | resFile = 'data/coco_val_outputs.json' 93 | 94 | if torch.cuda.is_available(): 95 | use_cuda = 1 96 | else: 97 | use_cuda = 0 98 | 99 | # do one forward pass first to circumvent cold start 100 | throwaway_image = Image.open('data/dog.jpg').convert('RGB').resize((model.width, model.height)) 101 | do_detect(model, throwaway_image, 0.5, 0.4, 
use_cuda) 102 | boxes_json = [] 103 | 104 | for i, image_annotation in enumerate(images): 105 | logging.info("currently on image: {}/{}".format(i + 1, len(images))) 106 | image_file_name = image_annotation["file_name"] 107 | image_id = image_annotation["id"] 108 | image_height = image_annotation["height"] 109 | image_width = image_annotation["width"] 110 | 111 | # open and resize each image first 112 | img = Image.open(os.path.join(cfg.dataset_dir, image_file_name)).convert('RGB') 113 | sized = img.resize((model.width, model.height)) 114 | 115 | if use_cuda: 116 | model.cuda() 117 | 118 | start = time.time() 119 | boxes = do_detect(model, sized, 0.5, 0.4, use_cuda) 120 | finish = time.time() 121 | if type(boxes) == list: 122 | for box in boxes: 123 | box_json = {} 124 | category_id = box[-1] 125 | score = box[-2] 126 | bbox_normalized = box[:4] 127 | box_json["category_id"] = int(category_id) 128 | box_json["image_id"] = int(image_id) 129 | bbox = [] 130 | for i, bbox_coord in enumerate(bbox_normalized): 131 | modified_bbox_coord = float(bbox_coord) 132 | if i % 2: 133 | modified_bbox_coord *= image_height 134 | else: 135 | modified_bbox_coord *= image_width 136 | modified_bbox_coord = round(modified_bbox_coord, 2) 137 | bbox.append(modified_bbox_coord) 138 | box_json["bbox_normalized"] = list(map(lambda x: round(float(x), 2), bbox_normalized)) 139 | box_json["bbox"] = bbox 140 | box_json["score"] = round(float(score), 2) 141 | box_json["timing"] = float(finish - start) 142 | boxes_json.append(box_json) 143 | # print("see box_json: ", box_json) 144 | with open(resFile, 'w') as outfile: 145 | json.dump(boxes_json, outfile, default=myconverter) 146 | else: 147 | print("warning: output from model after postprocessing is not a list, ignoring") 148 | return 149 | 150 | # namesfile = 'data/coco.names' 151 | # class_names = load_class_names(namesfile) 152 | # plot_boxes(img, boxes, 'data/outcome/predictions_{}.jpg'.format(image_id), class_names) 153 | 154 | with open(resFile, 'w') as outfile: 155 | json.dump(boxes_json, outfile, default=myconverter) 156 | 157 | evaluate_on_coco(cfg, resFile) 158 | 159 | 160 | def get_args(**kwargs): 161 | cfg = kwargs 162 | parser = argparse.ArgumentParser(description='Test model on test dataset', 163 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 164 | parser.add_argument('-f', '--load', dest='load', type=str, default=None, 165 | help='Load model from a .pth file') 166 | parser.add_argument('-g', '--gpu', metavar='G', type=str, default='-1', 167 | help='GPU', dest='gpu') 168 | parser.add_argument('-dir', '--data-dir', type=str, default=None, 169 | help='dataset dir', dest='dataset_dir') 170 | parser.add_argument('-gta', '--ground_truth_annotations', type=str, default='instances_val2017.json', 171 | help='ground truth annotations file', dest='gt_annotations_path') 172 | parser.add_argument('-w', '--weights_file', type=str, default='weights/yolov4.weights', 173 | help='weights file to load', dest='weights_file') 174 | parser.add_argument('-c', '--model_config', type=str, default='cfg/yolov4.cfg', 175 | help='model config file to load', dest='model_config') 176 | args = vars(parser.parse_args()) 177 | 178 | for k in args.keys(): 179 | cfg[k] = args.get(k) 180 | return edict(cfg) 181 | 182 | 183 | def init_logger(log_file=None, log_dir=None, log_level=logging.INFO, mode='w', stdout=True): 184 | """ 185 | log_dir: 日志文件的文件夹路径 186 | mode: 'a', append; 'w', 覆盖原文件写入. 
187 | """ 188 | import datetime 189 | def get_date_str(): 190 | now = datetime.datetime.now() 191 | return now.strftime('%Y-%m-%d_%H-%M-%S') 192 | 193 | fmt = '%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s: %(message)s' 194 | if log_dir is None: 195 | log_dir = '~/temp/log/' 196 | if log_file is None: 197 | log_file = 'log_' + get_date_str() + '.txt' 198 | if not os.path.exists(log_dir): 199 | os.makedirs(log_dir) 200 | log_file = os.path.join(log_dir, log_file) 201 | # 此处不能使用logging输出 202 | print('log file path:' + log_file) 203 | 204 | logging.basicConfig(level=logging.DEBUG, 205 | format=fmt, 206 | filename=log_file, 207 | filemode=mode) 208 | 209 | if stdout: 210 | console = logging.StreamHandler(stream=sys.stdout) 211 | console.setLevel(log_level) 212 | formatter = logging.Formatter(fmt) 213 | console.setFormatter(formatter) 214 | logging.getLogger('').addHandler(console) 215 | 216 | return logging 217 | 218 | 219 | if __name__ == "__main__": 220 | logging = init_logger(log_dir='log') 221 | cfg = get_args(**Cfg) 222 | os.environ["CUDA_VISIBLE_DEVICES"] = cfg.gpu 223 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 224 | logging.info(f'Using device {device}') 225 | 226 | model = Darknet(cfg.model_config) 227 | 228 | model.print_network() 229 | model.load_weights(cfg.weights_file) 230 | model.eval() # set model away from training 231 | 232 | if torch.cuda.device_count() > 1: 233 | model = torch.nn.DataParallel(model) 234 | 235 | model.to(device=device) 236 | 237 | annotations_file_path = cfg.gt_annotations_path 238 | with open(annotations_file_path) as annotations_file: 239 | try: 240 | annotations = json.load(annotations_file) 241 | except: 242 | print("annotations file not a json") 243 | exit() 244 | test(model=model, 245 | annotations=annotations, 246 | cfg=cfg, ) 247 | -------------------------------------------------------------------------------- /models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class Mish(torch.nn.Module): 7 | def __init__(self): 8 | super().__init__() 9 | 10 | def forward(self, x): 11 | x = x * (torch.tanh(torch.nn.functional.softplus(x))) 12 | return x 13 | 14 | 15 | class Upsample(nn.Module): 16 | def __init__(self): 17 | super(Upsample, self).__init__() 18 | 19 | def forward(self, x, target_size): 20 | assert (x.data.dim() == 4) 21 | _, _, H, W = target_size 22 | return F.interpolate(x, size=(H, W), mode='nearest') 23 | 24 | 25 | class Conv_Bn_Activation(nn.Module): 26 | def __init__(self, in_channels, out_channels, kernel_size, stride, activation, bn=True, bias=False): 27 | super().__init__() 28 | pad = (kernel_size - 1) // 2 29 | 30 | self.conv = nn.ModuleList() 31 | if bias: 32 | self.conv.append(nn.Conv2d(in_channels, out_channels, kernel_size, stride, pad)) 33 | else: 34 | self.conv.append(nn.Conv2d(in_channels, out_channels, kernel_size, stride, pad, bias=False)) 35 | if bn: 36 | self.conv.append(nn.BatchNorm2d(out_channels)) 37 | if activation == "mish": 38 | self.conv.append(Mish()) 39 | elif activation == "relu": 40 | self.conv.append(nn.ReLU(inplace=True)) 41 | elif activation == "leaky": 42 | self.conv.append(nn.LeakyReLU(0.1, inplace=True)) 43 | elif activation == "linear": 44 | pass 45 | else: 46 | print("activate error !!! 
{} {} {}".format(sys._getframe().f_code.co_filename, 47 | sys._getframe().f_code.co_name, sys._getframe().f_lineno)) 48 | 49 | def forward(self, x): 50 | for l in self.conv: 51 | x = l(x) 52 | return x 53 | 54 | 55 | class ResBlock(nn.Module): 56 | """ 57 | Sequential residual blocks each of which consists of \ 58 | two convolution layers. 59 | Args: 60 | ch (int): number of input and output channels. 61 | nblocks (int): number of residual blocks. 62 | shortcut (bool): if True, residual tensor addition is enabled. 63 | """ 64 | 65 | def __init__(self, ch, nblocks=1, shortcut=True): 66 | super().__init__() 67 | self.shortcut = shortcut 68 | self.module_list = nn.ModuleList() 69 | for i in range(nblocks): 70 | resblock_one = nn.ModuleList() 71 | resblock_one.append(Conv_Bn_Activation(ch, ch, 1, 1, 'mish')) 72 | resblock_one.append(Conv_Bn_Activation(ch, ch, 3, 1, 'mish')) 73 | self.module_list.append(resblock_one) 74 | 75 | def forward(self, x): 76 | for module in self.module_list: 77 | h = x 78 | for res in module: 79 | h = res(h) 80 | x = x + h if self.shortcut else h 81 | return x 82 | 83 | 84 | class DownSample1(nn.Module): 85 | def __init__(self): 86 | super().__init__() 87 | self.conv1 = Conv_Bn_Activation(3, 32, 3, 1, 'mish') 88 | 89 | self.conv2 = Conv_Bn_Activation(32, 64, 3, 2, 'mish') 90 | self.conv3 = Conv_Bn_Activation(64, 64, 1, 1, 'mish') 91 | # [route] 92 | # layers = -2 93 | self.conv4 = Conv_Bn_Activation(64, 64, 1, 1, 'mish') 94 | 95 | self.conv5 = Conv_Bn_Activation(64, 32, 1, 1, 'mish') 96 | self.conv6 = Conv_Bn_Activation(32, 64, 3, 1, 'mish') 97 | # [shortcut] 98 | # from=-3 99 | # activation = linear 100 | 101 | self.conv7 = Conv_Bn_Activation(64, 64, 1, 1, 'mish') 102 | # [route] 103 | # layers = -1, -7 104 | self.conv8 = Conv_Bn_Activation(128, 64, 1, 1, 'mish') 105 | 106 | def forward(self, input): 107 | x1 = self.conv1(input) 108 | x2 = self.conv2(x1) 109 | x3 = self.conv3(x2) 110 | # route -2 111 | x4 = self.conv4(x2) 112 | x5 = self.conv5(x4) 113 | x6 = self.conv6(x5) 114 | # shortcut -3 115 | x6 = x6 + x4 116 | 117 | x7 = self.conv7(x6) 118 | # [route] 119 | # layers = -1, -7 120 | x7 = torch.cat([x7, x3], dim=1) 121 | x8 = self.conv8(x7) 122 | return x8 123 | 124 | 125 | class DownSample2(nn.Module): 126 | def __init__(self): 127 | super().__init__() 128 | self.conv1 = Conv_Bn_Activation(64, 128, 3, 2, 'mish') 129 | self.conv2 = Conv_Bn_Activation(128, 64, 1, 1, 'mish') 130 | # r -2 131 | self.conv3 = Conv_Bn_Activation(128, 64, 1, 1, 'mish') 132 | 133 | self.resblock = ResBlock(ch=64, nblocks=2) 134 | 135 | # s -3 136 | self.conv4 = Conv_Bn_Activation(64, 64, 1, 1, 'mish') 137 | # r -1 -10 138 | self.conv5 = Conv_Bn_Activation(128, 128, 1, 1, 'mish') 139 | 140 | def forward(self, input): 141 | x1 = self.conv1(input) 142 | x2 = self.conv2(x1) 143 | x3 = self.conv3(x1) 144 | 145 | r = self.resblock(x3) 146 | x4 = self.conv4(r) 147 | 148 | x4 = torch.cat([x4, x2], dim=1) 149 | x5 = self.conv5(x4) 150 | return x5 151 | 152 | 153 | class DownSample3(nn.Module): 154 | def __init__(self): 155 | super().__init__() 156 | self.conv1 = Conv_Bn_Activation(128, 256, 3, 2, 'mish') 157 | self.conv2 = Conv_Bn_Activation(256, 128, 1, 1, 'mish') 158 | self.conv3 = Conv_Bn_Activation(256, 128, 1, 1, 'mish') 159 | 160 | self.resblock = ResBlock(ch=128, nblocks=8) 161 | self.conv4 = Conv_Bn_Activation(128, 128, 1, 1, 'mish') 162 | self.conv5 = Conv_Bn_Activation(256, 256, 1, 1, 'mish') 163 | 164 | def forward(self, input): 165 | x1 = self.conv1(input) 166 | x2 = self.conv2(x1) 
167 | x3 = self.conv3(x1) 168 | 169 | r = self.resblock(x3) 170 | x4 = self.conv4(r) 171 | 172 | x4 = torch.cat([x4, x2], dim=1) 173 | x5 = self.conv5(x4) 174 | return x5 175 | 176 | 177 | class DownSample4(nn.Module): 178 | def __init__(self): 179 | super().__init__() 180 | self.conv1 = Conv_Bn_Activation(256, 512, 3, 2, 'mish') 181 | self.conv2 = Conv_Bn_Activation(512, 256, 1, 1, 'mish') 182 | self.conv3 = Conv_Bn_Activation(512, 256, 1, 1, 'mish') 183 | 184 | self.resblock = ResBlock(ch=256, nblocks=8) 185 | self.conv4 = Conv_Bn_Activation(256, 256, 1, 1, 'mish') 186 | self.conv5 = Conv_Bn_Activation(512, 512, 1, 1, 'mish') 187 | 188 | def forward(self, input): 189 | x1 = self.conv1(input) 190 | x2 = self.conv2(x1) 191 | x3 = self.conv3(x1) 192 | 193 | r = self.resblock(x3) 194 | x4 = self.conv4(r) 195 | 196 | x4 = torch.cat([x4, x2], dim=1) 197 | x5 = self.conv5(x4) 198 | return x5 199 | 200 | 201 | class DownSample5(nn.Module): 202 | def __init__(self): 203 | super().__init__() 204 | self.conv1 = Conv_Bn_Activation(512, 1024, 3, 2, 'mish') 205 | self.conv2 = Conv_Bn_Activation(1024, 512, 1, 1, 'mish') 206 | self.conv3 = Conv_Bn_Activation(1024, 512, 1, 1, 'mish') 207 | 208 | self.resblock = ResBlock(ch=512, nblocks=4) 209 | self.conv4 = Conv_Bn_Activation(512, 512, 1, 1, 'mish') 210 | self.conv5 = Conv_Bn_Activation(1024, 1024, 1, 1, 'mish') 211 | 212 | def forward(self, input): 213 | x1 = self.conv1(input) 214 | x2 = self.conv2(x1) 215 | x3 = self.conv3(x1) 216 | 217 | r = self.resblock(x3) 218 | x4 = self.conv4(r) 219 | 220 | x4 = torch.cat([x4, x2], dim=1) 221 | x5 = self.conv5(x4) 222 | return x5 223 | 224 | 225 | class Neck(nn.Module): 226 | def __init__(self): 227 | super().__init__() 228 | self.conv1 = Conv_Bn_Activation(1024, 512, 1, 1, 'leaky') 229 | self.conv2 = Conv_Bn_Activation(512, 1024, 3, 1, 'leaky') 230 | self.conv3 = Conv_Bn_Activation(1024, 512, 1, 1, 'leaky') 231 | # SPP 232 | self.maxpool1 = nn.MaxPool2d(kernel_size=5, stride=1, padding=5 // 2) 233 | self.maxpool2 = nn.MaxPool2d(kernel_size=9, stride=1, padding=9 // 2) 234 | self.maxpool3 = nn.MaxPool2d(kernel_size=13, stride=1, padding=13 // 2) 235 | 236 | # R -1 -3 -5 -6 237 | # SPP 238 | self.conv4 = Conv_Bn_Activation(2048, 512, 1, 1, 'leaky') 239 | self.conv5 = Conv_Bn_Activation(512, 1024, 3, 1, 'leaky') 240 | self.conv6 = Conv_Bn_Activation(1024, 512, 1, 1, 'leaky') 241 | self.conv7 = Conv_Bn_Activation(512, 256, 1, 1, 'leaky') 242 | # UP 243 | self.upsample1 = Upsample() 244 | # R 85 245 | self.conv8 = Conv_Bn_Activation(512, 256, 1, 1, 'leaky') 246 | # R -1 -3 247 | self.conv9 = Conv_Bn_Activation(512, 256, 1, 1, 'leaky') 248 | self.conv10 = Conv_Bn_Activation(256, 512, 3, 1, 'leaky') 249 | self.conv11 = Conv_Bn_Activation(512, 256, 1, 1, 'leaky') 250 | self.conv12 = Conv_Bn_Activation(256, 512, 3, 1, 'leaky') 251 | self.conv13 = Conv_Bn_Activation(512, 256, 1, 1, 'leaky') 252 | self.conv14 = Conv_Bn_Activation(256, 128, 1, 1, 'leaky') 253 | # UP 254 | self.upsample2 = Upsample() 255 | # R 54 256 | self.conv15 = Conv_Bn_Activation(256, 128, 1, 1, 'leaky') 257 | # R -1 -3 258 | self.conv16 = Conv_Bn_Activation(256, 128, 1, 1, 'leaky') 259 | self.conv17 = Conv_Bn_Activation(128, 256, 3, 1, 'leaky') 260 | self.conv18 = Conv_Bn_Activation(256, 128, 1, 1, 'leaky') 261 | self.conv19 = Conv_Bn_Activation(128, 256, 3, 1, 'leaky') 262 | self.conv20 = Conv_Bn_Activation(256, 128, 1, 1, 'leaky') 263 | 264 | def forward(self, input, downsample4, downsample3): 265 | x1 = self.conv1(input) 266 | x2 = self.conv2(x1) 
267 | x3 = self.conv3(x2) 268 | # SPP 269 | m1 = self.maxpool1(x3) 270 | m2 = self.maxpool2(x3) 271 | m3 = self.maxpool3(x3) 272 | spp = torch.cat([m3, m2, m1, x3], dim=1) 273 | # SPP end 274 | x4 = self.conv4(spp) 275 | x5 = self.conv5(x4) 276 | x6 = self.conv6(x5) 277 | x7 = self.conv7(x6) 278 | # UP 279 | up = self.upsample1(x7, downsample4.size()) 280 | # R 85 281 | x8 = self.conv8(downsample4) 282 | # R -1 -3 283 | x8 = torch.cat([x8, up], dim=1) 284 | 285 | x9 = self.conv9(x8) 286 | x10 = self.conv10(x9) 287 | x11 = self.conv11(x10) 288 | x12 = self.conv12(x11) 289 | x13 = self.conv13(x12) 290 | x14 = self.conv14(x13) 291 | 292 | # UP 293 | up = self.upsample2(x14, downsample3.size()) 294 | # R 54 295 | x15 = self.conv15(downsample3) 296 | # R -1 -3 297 | x15 = torch.cat([x15, up], dim=1) 298 | 299 | x16 = self.conv16(x15) 300 | x17 = self.conv17(x16) 301 | x18 = self.conv18(x17) 302 | x19 = self.conv19(x18) 303 | x20 = self.conv20(x19) 304 | return x20, x13, x6 305 | 306 | 307 | class Yolov4Head(nn.Module): 308 | def __init__(self, output_ch): 309 | super().__init__() 310 | self.conv1 = Conv_Bn_Activation(128, 256, 3, 1, 'leaky') 311 | self.conv2 = Conv_Bn_Activation(256, output_ch, 1, 1, 'linear', bn=False, bias=True) 312 | # self.yolo1 = YoloLayer(anchor_mask=[0, 1, 2], num_classes=80, 313 | # anchors=[12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401], 314 | # num_anchors=9, stride=8) 315 | 316 | # R -4 317 | self.conv3 = Conv_Bn_Activation(128, 256, 3, 2, 'leaky') 318 | 319 | # R -1 -16 320 | self.conv4 = Conv_Bn_Activation(512, 256, 1, 1, 'leaky') 321 | self.conv5 = Conv_Bn_Activation(256, 512, 3, 1, 'leaky') 322 | self.conv6 = Conv_Bn_Activation(512, 256, 1, 1, 'leaky') 323 | self.conv7 = Conv_Bn_Activation(256, 512, 3, 1, 'leaky') 324 | self.conv8 = Conv_Bn_Activation(512, 256, 1, 1, 'leaky') 325 | self.conv9 = Conv_Bn_Activation(256, 512, 3, 1, 'leaky') 326 | self.conv10 = Conv_Bn_Activation(512, output_ch, 1, 1, 'linear', bn=False, bias=True) 327 | # self.yolo2 = YoloLayer(anchor_mask=[3, 4, 5], num_classes=80, 328 | # anchors=[12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401], 329 | # num_anchors=9, stride=16) 330 | 331 | # R -4 332 | self.conv11 = Conv_Bn_Activation(256, 512, 3, 2, 'leaky') 333 | 334 | # R -1 -37 335 | self.conv12 = Conv_Bn_Activation(1024, 512, 1, 1, 'leaky') 336 | self.conv13 = Conv_Bn_Activation(512, 1024, 3, 1, 'leaky') 337 | self.conv14 = Conv_Bn_Activation(1024, 512, 1, 1, 'leaky') 338 | self.conv15 = Conv_Bn_Activation(512, 1024, 3, 1, 'leaky') 339 | self.conv16 = Conv_Bn_Activation(1024, 512, 1, 1, 'leaky') 340 | self.conv17 = Conv_Bn_Activation(512, 1024, 3, 1, 'leaky') 341 | self.conv18 = Conv_Bn_Activation(1024, output_ch, 1, 1, 'linear', bn=False, bias=True) 342 | # self.yolo3 = YoloLayer(anchor_mask=[6, 7, 8], num_classes=80, 343 | # anchors=[12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401], 344 | # num_anchors=9, stride=32) 345 | 346 | def forward(self, input1, input2, input3): 347 | x1 = self.conv1(input1) 348 | x2 = self.conv2(x1) 349 | # y1 = self.yolo1(x2) 350 | 351 | x3 = self.conv3(input1) 352 | # R -1 -16 353 | x3 = torch.cat([x3, input2], dim=1) 354 | x4 = self.conv4(x3) 355 | x5 = self.conv5(x4) 356 | x6 = self.conv6(x5) 357 | x7 = self.conv7(x6) 358 | x8 = self.conv8(x7) 359 | x9 = self.conv9(x8) 360 | x10 = self.conv10(x9) 361 | # y2 = self.yolo2(x10) 362 | 363 | # R -4 364 | x11 = self.conv11(x8) 365 | # R -1 -37 366 | x11 = torch.cat([x11, 
input3], dim=1) 367 | 368 | x12 = self.conv12(x11) 369 | x13 = self.conv13(x12) 370 | x14 = self.conv14(x13) 371 | x15 = self.conv15(x14) 372 | x16 = self.conv16(x15) 373 | x17 = self.conv17(x16) 374 | x18 = self.conv18(x17) 375 | return [x2, x10, x18] 376 | # y3 = self.yolo3(x18) 377 | # return [y1, y2, y3] 378 | # return y3 379 | 380 | 381 | class Yolov4(nn.Module): 382 | def __init__(self, yolov4conv137weight=None, n_classes=80): 383 | super().__init__() 384 | 385 | output_ch = (4 + 1 + n_classes) * 3 386 | 387 | # backbone 388 | self.down1 = DownSample1() 389 | self.down2 = DownSample2() 390 | self.down3 = DownSample3() 391 | self.down4 = DownSample4() 392 | self.down5 = DownSample5() 393 | # neck 394 | self.neck = Neck() 395 | # yolov4conv137 396 | if yolov4conv137weight: 397 | _model = nn.Sequential(self.down1, self.down2, self.down3, self.down4, self.down5, self.neck) 398 | pretrained_dict = torch.load(yolov4conv137weight) 399 | 400 | model_dict = _model.state_dict() 401 | # 1. filter out unnecessary keys 402 | pretrained_dict = {k1: v for (k, v), k1 in zip(pretrained_dict.items(), model_dict)} 403 | # 2. overwrite entries in the existing state dict 404 | model_dict.update(pretrained_dict) 405 | _model.load_state_dict(model_dict) 406 | # head 407 | self.head = Yolov4Head(output_ch) 408 | 409 | def forward(self, input): 410 | d1 = self.down1(input) 411 | d2 = self.down2(d1) 412 | d3 = self.down3(d2) 413 | d4 = self.down4(d3) 414 | d5 = self.down5(d4) 415 | 416 | x20, x13, x6 = self.neck(d5, d4, d3) 417 | 418 | output = self.head(x20, x13, x6) 419 | return output 420 | 421 | 422 | if __name__ == "__main__": 423 | import sys 424 | from PIL import Image 425 | 426 | namesfile = None 427 | if len(sys.argv) == 4: 428 | n_classes = int(sys.argv[1]) 429 | weightfile = sys.argv[2] 430 | imgfile = sys.argv[3] 431 | elif len(sys.argv) == 5: 432 | n_classes = int(sys.argv[1]) 433 | weightfile = sys.argv[2] 434 | imgfile = sys.argv[3] 435 | namesfile = sys.argv[4] 436 | else: 437 | print('Usage: ') 438 | print(' python models.py num_classes weightfile imgfile namefile') 439 | 440 | model = Yolov4(n_classes=n_classes) 441 | 442 | pretrained_dict = torch.load(weightfile, map_location=torch.device('cuda')) 443 | model.load_state_dict(pretrained_dict) 444 | 445 | if namesfile == None: 446 | if n_classes == 20: 447 | namesfile = 'data/voc.names' 448 | elif n_classes == 80: 449 | namesfile = 'data/coco.names' 450 | else: 451 | print("please give namefile") 452 | 453 | use_cuda = 1 454 | if use_cuda: 455 | model.cuda() 456 | 457 | img = Image.open(imgfile).convert('RGB') 458 | sized = img.resize((608, 608)) 459 | from tool.utils import * 460 | 461 | boxes = do_detect(model, sized, 0.5, n_classes,0.4, use_cuda) 462 | 463 | class_names = load_class_names(namesfile) 464 | plot_boxes(img, boxes, 'predictions.jpg', class_names) 465 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.22.0 2 | torch==1.4.0 3 | scikit_image==0.16.2 4 | matplotlib==2.2.3 5 | tqdm==4.43.0 6 | easydict==1.9 7 | Pillow==9.0.1 8 | tensorboardX 9 | 10 | -------------------------------------------------------------------------------- /tool/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/roboflow/pytorch-YOLOv4/b6189b304b9a60a15bc0a9c3627ec4973fe0e2b5/tool/__init__.py 
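A quick way to sanity-check the Yolov4 module defined above is a dummy forward pass. The short sketch below is illustrative only (it assumes PyTorch is installed and that it is run from the repository root so models.py is importable); it verifies the three raw head outputs, whose channel count is (4 + 1 + n_classes) * 3 = 255 for the 80 COCO classes and whose grids correspond to strides 8, 16 and 32 at the 608x608 resolution used by models.py.

# Illustrative smoke test for models.Yolov4 (not part of the repository).
import torch
from models import Yolov4

model = Yolov4(n_classes=80).eval()      # output_ch = (4 + 1 + 80) * 3 = 255 per head
x = torch.randn(1, 3, 608, 608)          # same input size models.py uses for inference
with torch.no_grad():
    y1, y2, y3 = model(x)                # raw head outputs [x2, x10, x18]
print(y1.shape)                          # torch.Size([1, 255, 76, 76])  (stride 8)
print(y2.shape)                          # torch.Size([1, 255, 38, 38])  (stride 16)
print(y3.shape)                          # torch.Size([1, 255, 19, 19])  (stride 32)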
-------------------------------------------------------------------------------- /tool/camera.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ''' 3 | @Time : 2020/04/26 15:48 4 | @Author : Tianxiaomo 5 | @File : camera.py 6 | @Noice : 7 | @Modificattion : 8 | @Author : 9 | @Time : 10 | @Detail : 11 | 12 | ''' 13 | from __future__ import division 14 | import cv2 15 | from tool.darknet2pytorch import Darknet 16 | import argparse 17 | from tool.utils import * 18 | 19 | 20 | def arg_parse(): 21 | """ 22 | Parse arguements to the detect module 23 | 24 | """ 25 | 26 | parser = argparse.ArgumentParser(description='YOLO v3 Cam Demo') 27 | parser.add_argument("--confidence", dest="confidence", help="Object Confidence to filter predictions", default=0.25) 28 | parser.add_argument("--nms_thresh", dest="nms_thresh", help="NMS Threshhold", default=0.4) 29 | parser.add_argument("--reso", dest='reso', help= 30 | "Input resolution of the network. Increase to increase accuracy. Decrease to increase speed", 31 | default="160", type=str) 32 | return parser.parse_args() 33 | 34 | 35 | if __name__ == '__main__': 36 | cfgfile = "cfg/yolov4.cfg" 37 | weightsfile = "weight/yolov4.weights" 38 | 39 | args = arg_parse() 40 | confidence = float(args.confidence) 41 | nms_thesh = float(args.nms_thresh) 42 | CUDA = torch.cuda.is_available() 43 | num_classes = 80 44 | bbox_attrs = 5 + num_classes 45 | class_names = load_class_names("data/coco.names") 46 | 47 | model = Darknet(cfgfile) 48 | model.load_weights(weightsfile) 49 | 50 | if CUDA: 51 | model.cuda() 52 | 53 | model.eval() 54 | cap = cv2.VideoCapture(0) 55 | 56 | assert cap.isOpened(), 'Cannot capture source' 57 | 58 | frames = 0 59 | start = time.time() 60 | while cap.isOpened(): 61 | ret, frame = cap.read() 62 | if ret: 63 | sized = cv2.resize(frame, (model.width, model.height)) 64 | sized = cv2.cvtColor(sized, cv2.COLOR_BGR2RGB) 65 | boxes = do_detect(model, sized, 0.5, 0.4, CUDA) 66 | 67 | orig_im = plot_boxes_cv2(frame, boxes, class_names=class_names) 68 | 69 | cv2.imshow("frame", orig_im) 70 | key = cv2.waitKey(1) 71 | if key & 0xFF == ord('q'): 72 | break 73 | frames += 1 74 | print("FPS of the video is {:5.2f}".format(frames / (time.time() - start))) 75 | else: 76 | break 77 | -------------------------------------------------------------------------------- /tool/coco_annotation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ''' 3 | @Time : 2020/05/08 11:45 4 | @Author : Tianxiaomo 5 | @File : coco_annotatin.py 6 | @Noice : 7 | @Modificattion : 8 | @Author : 9 | @Time : 10 | @Detail : 11 | 12 | ''' 13 | import json 14 | from collections import defaultdict 15 | from tqdm import tqdm 16 | import os 17 | 18 | """hyper parameters""" 19 | json_file_path = 'E:/Dataset/coco2017/annotations_trainval2017/annotations/instances_val2017.json' 20 | images_dir_path = 'mscoco2017/train2017/' 21 | output_path = '../data/val.txt' 22 | 23 | """load json file""" 24 | name_box_id = defaultdict(list) 25 | id_name = dict() 26 | with open(json_file_path, encoding='utf-8') as f: 27 | data = json.load(f) 28 | 29 | """generate labels""" 30 | images = data['images'] 31 | annotations = data['annotations'] 32 | for ant in tqdm(annotations): 33 | id = ant['image_id'] 34 | name = os.path.join(images_dir_path, images[id]['file_name']) 35 | cat = ant['category_id'] 36 | 37 | if cat >= 1 and cat <= 11: 38 | cat = cat - 1 39 | elif cat >= 13 and cat <= 
25: 40 | cat = cat - 2 41 | elif cat >= 27 and cat <= 28: 42 | cat = cat - 3 43 | elif cat >= 31 and cat <= 44: 44 | cat = cat - 5 45 | elif cat >= 46 and cat <= 65: 46 | cat = cat - 6 47 | elif cat == 67: 48 | cat = cat - 7 49 | elif cat == 70: 50 | cat = cat - 9 51 | elif cat >= 72 and cat <= 82: 52 | cat = cat - 10 53 | elif cat >= 84 and cat <= 90: 54 | cat = cat - 11 55 | 56 | name_box_id[name].append([ant['bbox'], cat]) 57 | 58 | """write to txt""" 59 | with open(output_path, 'w') as f: 60 | for key in tqdm(name_box_id.keys()): 61 | f.write(key) 62 | box_infos = name_box_id[key] 63 | for info in box_infos: 64 | x_min = int(info[0][0]) 65 | y_min = int(info[0][1]) 66 | x_max = x_min + int(info[0][2]) 67 | y_max = y_min + int(info[0][3]) 68 | 69 | box_info = " %d,%d,%d,%d,%d" % ( 70 | x_min, y_min, x_max, y_max, int(info[1])) 71 | f.write(box_info) 72 | f.write('\n') 73 | -------------------------------------------------------------------------------- /tool/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from tool.utils import convert2cpu 3 | 4 | 5 | def parse_cfg(cfgfile): 6 | blocks = [] 7 | fp = open(cfgfile, 'r') 8 | block = None 9 | line = fp.readline() 10 | while line != '': 11 | line = line.rstrip() 12 | if line == '' or line[0] == '#': 13 | line = fp.readline() 14 | continue 15 | elif line[0] == '[': 16 | if block: 17 | blocks.append(block) 18 | block = dict() 19 | block['type'] = line.lstrip('[').rstrip(']') 20 | # set default value 21 | if block['type'] == 'convolutional': 22 | block['batch_normalize'] = 0 23 | else: 24 | key, value = line.split('=') 25 | key = key.strip() 26 | if key == 'type': 27 | key = '_type' 28 | value = value.strip() 29 | block[key] = value 30 | line = fp.readline() 31 | 32 | if block: 33 | blocks.append(block) 34 | fp.close() 35 | return blocks 36 | 37 | 38 | def print_cfg(blocks): 39 | print('layer filters size input output'); 40 | prev_width = 416 41 | prev_height = 416 42 | prev_filters = 3 43 | out_filters = [] 44 | out_widths = [] 45 | out_heights = [] 46 | ind = -2 47 | for block in blocks: 48 | ind = ind + 1 49 | if block['type'] == 'net': 50 | prev_width = int(block['width']) 51 | prev_height = int(block['height']) 52 | continue 53 | elif block['type'] == 'convolutional': 54 | filters = int(block['filters']) 55 | kernel_size = int(block['size']) 56 | stride = int(block['stride']) 57 | is_pad = int(block['pad']) 58 | pad = (kernel_size - 1) // 2 if is_pad else 0 59 | width = (prev_width + 2 * pad - kernel_size) // stride + 1 60 | height = (prev_height + 2 * pad - kernel_size) // stride + 1 61 | print('%5d %-6s %4d %d x %d / %d %3d x %3d x%4d -> %3d x %3d x%4d' % ( 62 | ind, 'conv', filters, kernel_size, kernel_size, stride, prev_width, prev_height, prev_filters, width, 63 | height, filters)) 64 | prev_width = width 65 | prev_height = height 66 | prev_filters = filters 67 | out_widths.append(prev_width) 68 | out_heights.append(prev_height) 69 | out_filters.append(prev_filters) 70 | elif block['type'] == 'maxpool': 71 | pool_size = int(block['size']) 72 | stride = int(block['stride']) 73 | width = prev_width // stride 74 | height = prev_height // stride 75 | print('%5d %-6s %d x %d / %d %3d x %3d x%4d -> %3d x %3d x%4d' % ( 76 | ind, 'max', pool_size, pool_size, stride, prev_width, prev_height, prev_filters, width, height, 77 | filters)) 78 | prev_width = width 79 | prev_height = height 80 | prev_filters = filters 81 | out_widths.append(prev_width) 82 | 
out_heights.append(prev_height) 83 | out_filters.append(prev_filters) 84 | elif block['type'] == 'avgpool': 85 | width = 1 86 | height = 1 87 | print('%5d %-6s %3d x %3d x%4d -> %3d' % ( 88 | ind, 'avg', prev_width, prev_height, prev_filters, prev_filters)) 89 | prev_width = width 90 | prev_height = height 91 | prev_filters = filters 92 | out_widths.append(prev_width) 93 | out_heights.append(prev_height) 94 | out_filters.append(prev_filters) 95 | elif block['type'] == 'softmax': 96 | print('%5d %-6s -> %3d' % (ind, 'softmax', prev_filters)) 97 | out_widths.append(prev_width) 98 | out_heights.append(prev_height) 99 | out_filters.append(prev_filters) 100 | elif block['type'] == 'cost': 101 | print('%5d %-6s -> %3d' % (ind, 'cost', prev_filters)) 102 | out_widths.append(prev_width) 103 | out_heights.append(prev_height) 104 | out_filters.append(prev_filters) 105 | elif block['type'] == 'reorg': 106 | stride = int(block['stride']) 107 | filters = stride * stride * prev_filters 108 | width = prev_width // stride 109 | height = prev_height // stride 110 | print('%5d %-6s / %d %3d x %3d x%4d -> %3d x %3d x%4d' % ( 111 | ind, 'reorg', stride, prev_width, prev_height, prev_filters, width, height, filters)) 112 | prev_width = width 113 | prev_height = height 114 | prev_filters = filters 115 | out_widths.append(prev_width) 116 | out_heights.append(prev_height) 117 | out_filters.append(prev_filters) 118 | elif block['type'] == 'upsample': 119 | stride = int(block['stride']) 120 | filters = prev_filters 121 | width = prev_width * stride 122 | height = prev_height * stride 123 | print('%5d %-6s * %d %3d x %3d x%4d -> %3d x %3d x%4d' % ( 124 | ind, 'upsample', stride, prev_width, prev_height, prev_filters, width, height, filters)) 125 | prev_width = width 126 | prev_height = height 127 | prev_filters = filters 128 | out_widths.append(prev_width) 129 | out_heights.append(prev_height) 130 | out_filters.append(prev_filters) 131 | elif block['type'] == 'route': 132 | layers = block['layers'].split(',') 133 | layers = [int(i) if int(i) > 0 else int(i) + ind for i in layers] 134 | if len(layers) == 1: 135 | print('%5d %-6s %d' % (ind, 'route', layers[0])) 136 | prev_width = out_widths[layers[0]] 137 | prev_height = out_heights[layers[0]] 138 | prev_filters = out_filters[layers[0]] 139 | elif len(layers) == 2: 140 | print('%5d %-6s %d %d' % (ind, 'route', layers[0], layers[1])) 141 | prev_width = out_widths[layers[0]] 142 | prev_height = out_heights[layers[0]] 143 | assert (prev_width == out_widths[layers[1]]) 144 | assert (prev_height == out_heights[layers[1]]) 145 | prev_filters = out_filters[layers[0]] + out_filters[layers[1]] 146 | elif len(layers) == 4: 147 | print('%5d %-6s %d %d %d %d' % (ind, 'route', layers[0], layers[1], layers[2], layers[3])) 148 | prev_width = out_widths[layers[0]] 149 | prev_height = out_heights[layers[0]] 150 | assert (prev_width == out_widths[layers[1]] == out_widths[layers[2]] == out_widths[layers[3]]) 151 | assert (prev_height == out_heights[layers[1]] == out_heights[layers[2]] == out_heights[layers[3]]) 152 | prev_filters = out_filters[layers[0]] + out_filters[layers[1]] + out_filters[layers[2]] + out_filters[ 153 | layers[3]] 154 | else: 155 | print("route error !!! 
{} {} {}".format(sys._getframe().f_code.co_filename, 156 | sys._getframe().f_code.co_name, sys._getframe().f_lineno)) 157 | 158 | out_widths.append(prev_width) 159 | out_heights.append(prev_height) 160 | out_filters.append(prev_filters) 161 | elif block['type'] in ['region', 'yolo']: 162 | print('%5d %-6s' % (ind, 'detection')) 163 | out_widths.append(prev_width) 164 | out_heights.append(prev_height) 165 | out_filters.append(prev_filters) 166 | elif block['type'] == 'shortcut': 167 | from_id = int(block['from']) 168 | from_id = from_id if from_id > 0 else from_id + ind 169 | print('%5d %-6s %d' % (ind, 'shortcut', from_id)) 170 | prev_width = out_widths[from_id] 171 | prev_height = out_heights[from_id] 172 | prev_filters = out_filters[from_id] 173 | out_widths.append(prev_width) 174 | out_heights.append(prev_height) 175 | out_filters.append(prev_filters) 176 | elif block['type'] == 'connected': 177 | filters = int(block['output']) 178 | print('%5d %-6s %d -> %3d' % (ind, 'connected', prev_filters, filters)) 179 | prev_filters = filters 180 | out_widths.append(1) 181 | out_heights.append(1) 182 | out_filters.append(prev_filters) 183 | else: 184 | print('unknown type %s' % (block['type'])) 185 | 186 | 187 | def load_conv(buf, start, conv_model): 188 | num_w = conv_model.weight.numel() 189 | num_b = conv_model.bias.numel() 190 | conv_model.bias.data.copy_(torch.from_numpy(buf[start:start + num_b])); 191 | start = start + num_b 192 | conv_model.weight.data.copy_(torch.from_numpy(buf[start:start + num_w]).reshape(conv_model.weight.data.shape)); 193 | start = start + num_w 194 | return start 195 | 196 | 197 | def save_conv(fp, conv_model): 198 | if conv_model.bias.is_cuda: 199 | convert2cpu(conv_model.bias.data).numpy().tofile(fp) 200 | convert2cpu(conv_model.weight.data).numpy().tofile(fp) 201 | else: 202 | conv_model.bias.data.numpy().tofile(fp) 203 | conv_model.weight.data.numpy().tofile(fp) 204 | 205 | 206 | def load_conv_bn(buf, start, conv_model, bn_model): 207 | num_w = conv_model.weight.numel() 208 | num_b = bn_model.bias.numel() 209 | bn_model.bias.data.copy_(torch.from_numpy(buf[start:start + num_b])); 210 | start = start + num_b 211 | bn_model.weight.data.copy_(torch.from_numpy(buf[start:start + num_b])); 212 | start = start + num_b 213 | bn_model.running_mean.copy_(torch.from_numpy(buf[start:start + num_b])); 214 | start = start + num_b 215 | bn_model.running_var.copy_(torch.from_numpy(buf[start:start + num_b])); 216 | start = start + num_b 217 | conv_model.weight.data.copy_(torch.from_numpy(buf[start:start + num_w]).reshape(conv_model.weight.data.shape)); 218 | start = start + num_w 219 | return start 220 | 221 | 222 | def save_conv_bn(fp, conv_model, bn_model): 223 | if bn_model.bias.is_cuda: 224 | convert2cpu(bn_model.bias.data).numpy().tofile(fp) 225 | convert2cpu(bn_model.weight.data).numpy().tofile(fp) 226 | convert2cpu(bn_model.running_mean).numpy().tofile(fp) 227 | convert2cpu(bn_model.running_var).numpy().tofile(fp) 228 | convert2cpu(conv_model.weight.data).numpy().tofile(fp) 229 | else: 230 | bn_model.bias.data.numpy().tofile(fp) 231 | bn_model.weight.data.numpy().tofile(fp) 232 | bn_model.running_mean.numpy().tofile(fp) 233 | bn_model.running_var.numpy().tofile(fp) 234 | conv_model.weight.data.numpy().tofile(fp) 235 | 236 | 237 | def load_fc(buf, start, fc_model): 238 | num_w = fc_model.weight.numel() 239 | num_b = fc_model.bias.numel() 240 | fc_model.bias.data.copy_(torch.from_numpy(buf[start:start + num_b])); 241 | start = start + num_b 242 | 
fc_model.weight.data.copy_(torch.from_numpy(buf[start:start + num_w])); 243 | start = start + num_w 244 | return start 245 | 246 | 247 | def save_fc(fp, fc_model): 248 | fc_model.bias.data.numpy().tofile(fp) 249 | fc_model.weight.data.numpy().tofile(fp) 250 | 251 | 252 | if __name__ == '__main__': 253 | import sys 254 | 255 | blocks = parse_cfg('cfg/yolo.cfg') 256 | if len(sys.argv) == 2: 257 | blocks = parse_cfg(sys.argv[1]) 258 | print_cfg(blocks) 259 | -------------------------------------------------------------------------------- /tool/darknet2onnx.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import torch 3 | from tool.darknet2pytorch import Darknet 4 | 5 | 6 | def fransform_to_onnx(cfgfile, weightfile, batch_size=1): 7 | model = Darknet(cfgfile) 8 | 9 | model.print_network() 10 | model.load_weights(weightfile) 11 | print('Loading weights from %s... Done!' % (weightfile)) 12 | 13 | # model.cuda() 14 | 15 | x = torch.randn((batch_size, 3, model.height, model.width), requires_grad=True) # .cuda() 16 | 17 | onnx_file_name = "yolov4_{}_3_{}_{}.onnx".format(batch_size, model.height, model.width) 18 | 19 | # Export the model 20 | print('Export the onnx model ...') 21 | torch.onnx.export(model, 22 | x, 23 | onnx_file_name, 24 | export_params=True, 25 | opset_version=11, 26 | do_constant_folding=True, 27 | input_names=['input'], output_names=['output_1', 'output_2', 'output_3'], 28 | dynamic_axes=None) 29 | 30 | print('Onnx model exporting done') 31 | return onnx_file_name 32 | 33 | 34 | if __name__ == '__main__': 35 | if len(sys.argv) == 3: 36 | cfgfile = sys.argv[1] 37 | weightfile = sys.argv[2] 38 | fransform_to_onnx(cfgfile, weightfile) 39 | elif len(sys.argv) == 4: 40 | cfgfile = sys.argv[1] 41 | weightfile = sys.argv[2] 42 | batch_size = int(sys.argv[3]) 43 | fransform_to_onnx(cfgfile, weightfile, batch_size) 44 | else: 45 | print('Please execute this script this way:\n') 46 | print(' python darknet2onnx.py ') 47 | print('or') 48 | print(' python darknet2onnx.py ') 49 | -------------------------------------------------------------------------------- /tool/darknet2pytorch.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | import numpy as np 4 | from tool.region_loss import RegionLoss 5 | from tool.yolo_layer import YoloLayer 6 | from tool.config import * 7 | 8 | 9 | class Mish(torch.nn.Module): 10 | def __init__(self): 11 | super().__init__() 12 | 13 | def forward(self, x): 14 | x = x * (torch.tanh(torch.nn.functional.softplus(x))) 15 | return x 16 | 17 | 18 | class MaxPoolDark(nn.Module): 19 | def __init__(self, size=2, stride=1): 20 | super(MaxPoolDark, self).__init__() 21 | self.size = size 22 | self.stride = stride 23 | 24 | def forward(self, x): 25 | ''' 26 | darknet output_size = (input_size + p - k) / s +1 27 | p : padding = k - 1 28 | k : size 29 | s : stride 30 | torch output_size = (input_size + 2*p -k) / s +1 31 | p : padding = k//2 32 | ''' 33 | p = self.size // 2 34 | if ((x.shape[2] - 1) // self.stride) != ((x.shape[2] + 2 * p - self.size) // self.stride): 35 | padding1 = (self.size - 1) // 2 36 | padding2 = padding1 + 1 37 | else: 38 | padding1 = (self.size - 1) // 2 39 | padding2 = padding1 40 | if ((x.shape[3] - 1) // self.stride) != ((x.shape[3] + 2 * p - self.size) // self.stride): 41 | padding3 = (self.size - 1) // 2 42 | padding4 = padding3 + 1 43 | else: 44 | padding3 = (self.size - 1) // 2 45 | padding4 = 
padding3 46 | x = F.max_pool2d(F.pad(x, (padding3, padding4, padding1, padding2), mode='replicate'), 47 | self.size, stride=self.stride) 48 | return x 49 | 50 | 51 | class Upsample(nn.Module): 52 | def __init__(self, stride=2): 53 | super(Upsample, self).__init__() 54 | self.stride = stride 55 | 56 | def forward(self, x): 57 | stride = self.stride 58 | assert (x.data.dim() == 4) 59 | B = x.data.size(0) 60 | C = x.data.size(1) 61 | H = x.data.size(2) 62 | W = x.data.size(3) 63 | ws = stride 64 | hs = stride 65 | x = x.view(B, C, H, 1, W, 1).expand(B, C, H, stride, W, stride).contiguous().view(B, C, H * stride, W * stride) 66 | return x 67 | 68 | 69 | class Reorg(nn.Module): 70 | def __init__(self, stride=2): 71 | super(Reorg, self).__init__() 72 | self.stride = stride 73 | 74 | def forward(self, x): 75 | stride = self.stride 76 | assert (x.data.dim() == 4) 77 | B = x.data.size(0) 78 | C = x.data.size(1) 79 | H = x.data.size(2) 80 | W = x.data.size(3) 81 | assert (H % stride == 0) 82 | assert (W % stride == 0) 83 | ws = stride 84 | hs = stride 85 | x = x.view(B, C, H / hs, hs, W / ws, ws).transpose(3, 4).contiguous() 86 | x = x.view(B, C, H / hs * W / ws, hs * ws).transpose(2, 3).contiguous() 87 | x = x.view(B, C, hs * ws, H / hs, W / ws).transpose(1, 2).contiguous() 88 | x = x.view(B, hs * ws * C, H / hs, W / ws) 89 | return x 90 | 91 | 92 | class GlobalAvgPool2d(nn.Module): 93 | def __init__(self): 94 | super(GlobalAvgPool2d, self).__init__() 95 | 96 | def forward(self, x): 97 | N = x.data.size(0) 98 | C = x.data.size(1) 99 | H = x.data.size(2) 100 | W = x.data.size(3) 101 | x = F.avg_pool2d(x, (H, W)) 102 | x = x.view(N, C) 103 | return x 104 | 105 | 106 | # for route and shortcut 107 | class EmptyModule(nn.Module): 108 | def __init__(self): 109 | super(EmptyModule, self).__init__() 110 | 111 | def forward(self, x): 112 | return x 113 | 114 | 115 | # support route shortcut and reorg 116 | class Darknet(nn.Module): 117 | def __init__(self, cfgfile): 118 | super(Darknet, self).__init__() 119 | self.blocks = parse_cfg(cfgfile) 120 | self.models = self.create_network(self.blocks) # merge conv, bn,leaky 121 | self.loss = self.models[len(self.models) - 1] 122 | 123 | self.width = int(self.blocks[0]['width']) 124 | self.height = int(self.blocks[0]['height']) 125 | 126 | if self.blocks[(len(self.blocks) - 1)]['type'] == 'region': 127 | self.anchors = self.loss.anchors 128 | self.num_anchors = self.loss.num_anchors 129 | self.anchor_step = self.loss.anchor_step 130 | self.num_classes = self.loss.num_classes 131 | 132 | self.header = torch.IntTensor([0, 0, 0, 0]) 133 | self.seen = 0 134 | 135 | def forward(self, x): 136 | ind = -2 137 | self.loss = None 138 | outputs = dict() 139 | out_boxes = [] 140 | for block in self.blocks: 141 | ind = ind + 1 142 | # if ind > 0: 143 | # return x 144 | 145 | if block['type'] == 'net': 146 | continue 147 | elif block['type'] in ['convolutional', 'maxpool', 'reorg', 'upsample', 'avgpool', 'softmax', 'connected']: 148 | x = self.models[ind](x) 149 | outputs[ind] = x 150 | elif block['type'] == 'route': 151 | layers = block['layers'].split(',') 152 | layers = [int(i) if int(i) > 0 else int(i) + ind for i in layers] 153 | if len(layers) == 1: 154 | x = outputs[layers[0]] 155 | outputs[ind] = x 156 | elif len(layers) == 2: 157 | x1 = outputs[layers[0]] 158 | x2 = outputs[layers[1]] 159 | x = torch.cat((x1, x2), 1) 160 | outputs[ind] = x 161 | elif len(layers) == 4: 162 | x1 = outputs[layers[0]] 163 | x2 = outputs[layers[1]] 164 | x3 = outputs[layers[2]] 165 | x4 = 
outputs[layers[3]] 166 | x = torch.cat((x1, x2, x3, x4), 1) 167 | outputs[ind] = x 168 | else: 169 | print("rounte number > 2 ,is {}".format(len(layers))) 170 | 171 | elif block['type'] == 'shortcut': 172 | from_layer = int(block['from']) 173 | activation = block['activation'] 174 | from_layer = from_layer if from_layer > 0 else from_layer + ind 175 | x1 = outputs[from_layer] 176 | x2 = outputs[ind - 1] 177 | x = x1 + x2 178 | if activation == 'leaky': 179 | x = F.leaky_relu(x, 0.1, inplace=True) 180 | elif activation == 'relu': 181 | x = F.relu(x, inplace=True) 182 | outputs[ind] = x 183 | elif block['type'] == 'region': 184 | continue 185 | if self.loss: 186 | self.loss = self.loss + self.models[ind](x) 187 | else: 188 | self.loss = self.models[ind](x) 189 | outputs[ind] = None 190 | elif block['type'] == 'yolo': 191 | if self.training: 192 | pass 193 | else: 194 | boxes = self.models[ind](x) 195 | out_boxes.append(boxes) 196 | elif block['type'] == 'cost': 197 | continue 198 | else: 199 | print('unknown type %s' % (block['type'])) 200 | if self.training: 201 | return loss 202 | else: 203 | return out_boxes 204 | 205 | def print_network(self): 206 | print_cfg(self.blocks) 207 | 208 | def create_network(self, blocks): 209 | models = nn.ModuleList() 210 | 211 | prev_filters = 3 212 | out_filters = [] 213 | prev_stride = 1 214 | out_strides = [] 215 | conv_id = 0 216 | for block in blocks: 217 | if block['type'] == 'net': 218 | prev_filters = int(block['channels']) 219 | continue 220 | elif block['type'] == 'convolutional': 221 | conv_id = conv_id + 1 222 | batch_normalize = int(block['batch_normalize']) 223 | filters = int(block['filters']) 224 | kernel_size = int(block['size']) 225 | stride = int(block['stride']) 226 | is_pad = int(block['pad']) 227 | pad = (kernel_size - 1) // 2 if is_pad else 0 228 | activation = block['activation'] 229 | model = nn.Sequential() 230 | if batch_normalize: 231 | model.add_module('conv{0}'.format(conv_id), 232 | nn.Conv2d(prev_filters, filters, kernel_size, stride, pad, bias=False)) 233 | model.add_module('bn{0}'.format(conv_id), nn.BatchNorm2d(filters)) 234 | # model.add_module('bn{0}'.format(conv_id), BN2d(filters)) 235 | else: 236 | model.add_module('conv{0}'.format(conv_id), 237 | nn.Conv2d(prev_filters, filters, kernel_size, stride, pad)) 238 | if activation == 'leaky': 239 | model.add_module('leaky{0}'.format(conv_id), nn.LeakyReLU(0.1, inplace=True)) 240 | elif activation == 'relu': 241 | model.add_module('relu{0}'.format(conv_id), nn.ReLU(inplace=True)) 242 | elif activation == 'mish': 243 | model.add_module('mish{0}'.format(conv_id), Mish()) 244 | else: 245 | print("convalution havn't activate {}".format(activation)) 246 | 247 | prev_filters = filters 248 | out_filters.append(prev_filters) 249 | prev_stride = stride * prev_stride 250 | out_strides.append(prev_stride) 251 | models.append(model) 252 | elif block['type'] == 'maxpool': 253 | pool_size = int(block['size']) 254 | stride = int(block['stride']) 255 | if stride == 1 and pool_size % 2: 256 | # You can use Maxpooldark instead, here is convenient to convert onnx. 
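                    # With stride == 1 and an odd pool_size, padding = pool_size // 2 keeps the
                    # spatial size unchanged, so plain nn.MaxPool2d reproduces darknet's "same"
                    # pooling and exports to ONNX more cleanly; MaxPoolDark above covers the
                    # remaining cases with asymmetric replicate padding.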
257 | model = nn.MaxPool2d(kernel_size=pool_size, stride=stride, padding=pool_size // 2) 258 | else: 259 | model = MaxPoolDark(pool_size, stride) 260 | out_filters.append(prev_filters) 261 | prev_stride = stride * prev_stride 262 | out_strides.append(prev_stride) 263 | models.append(model) 264 | elif block['type'] == 'avgpool': 265 | model = GlobalAvgPool2d() 266 | out_filters.append(prev_filters) 267 | models.append(model) 268 | elif block['type'] == 'softmax': 269 | model = nn.Softmax() 270 | out_strides.append(prev_stride) 271 | out_filters.append(prev_filters) 272 | models.append(model) 273 | elif block['type'] == 'cost': 274 | if block['_type'] == 'sse': 275 | model = nn.MSELoss(size_average=True) 276 | elif block['_type'] == 'L1': 277 | model = nn.L1Loss(size_average=True) 278 | elif block['_type'] == 'smooth': 279 | model = nn.SmoothL1Loss(size_average=True) 280 | out_filters.append(1) 281 | out_strides.append(prev_stride) 282 | models.append(model) 283 | elif block['type'] == 'reorg': 284 | stride = int(block['stride']) 285 | prev_filters = stride * stride * prev_filters 286 | out_filters.append(prev_filters) 287 | prev_stride = prev_stride * stride 288 | out_strides.append(prev_stride) 289 | models.append(Reorg(stride)) 290 | elif block['type'] == 'upsample': 291 | stride = int(block['stride']) 292 | out_filters.append(prev_filters) 293 | prev_stride = prev_stride // stride 294 | out_strides.append(prev_stride) 295 | # models.append(nn.Upsample(scale_factor=stride, mode='nearest')) 296 | models.append(Upsample(stride)) 297 | elif block['type'] == 'route': 298 | layers = block['layers'].split(',') 299 | ind = len(models) 300 | layers = [int(i) if int(i) > 0 else int(i) + ind for i in layers] 301 | if len(layers) == 1: 302 | prev_filters = out_filters[layers[0]] 303 | prev_stride = out_strides[layers[0]] 304 | elif len(layers) == 2: 305 | assert (layers[0] == ind - 1) 306 | prev_filters = out_filters[layers[0]] + out_filters[layers[1]] 307 | prev_stride = out_strides[layers[0]] 308 | elif len(layers) == 4: 309 | assert (layers[0] == ind - 1) 310 | prev_filters = out_filters[layers[0]] + out_filters[layers[1]] + out_filters[layers[2]] + \ 311 | out_filters[layers[3]] 312 | prev_stride = out_strides[layers[0]] 313 | else: 314 | print("route error!!!") 315 | 316 | out_filters.append(prev_filters) 317 | out_strides.append(prev_stride) 318 | models.append(EmptyModule()) 319 | elif block['type'] == 'shortcut': 320 | ind = len(models) 321 | prev_filters = out_filters[ind - 1] 322 | out_filters.append(prev_filters) 323 | prev_stride = out_strides[ind - 1] 324 | out_strides.append(prev_stride) 325 | models.append(EmptyModule()) 326 | elif block['type'] == 'connected': 327 | filters = int(block['output']) 328 | if block['activation'] == 'linear': 329 | model = nn.Linear(prev_filters, filters) 330 | elif block['activation'] == 'leaky': 331 | model = nn.Sequential( 332 | nn.Linear(prev_filters, filters), 333 | nn.LeakyReLU(0.1, inplace=True)) 334 | elif block['activation'] == 'relu': 335 | model = nn.Sequential( 336 | nn.Linear(prev_filters, filters), 337 | nn.ReLU(inplace=True)) 338 | prev_filters = filters 339 | out_filters.append(prev_filters) 340 | out_strides.append(prev_stride) 341 | models.append(model) 342 | elif block['type'] == 'region': 343 | loss = RegionLoss() 344 | anchors = block['anchors'].split(',') 345 | loss.anchors = [float(i) for i in anchors] 346 | loss.num_classes = int(block['classes']) 347 | loss.num_anchors = int(block['num']) 348 | loss.anchor_step = len(loss.anchors) 
// loss.num_anchors 349 | loss.object_scale = float(block['object_scale']) 350 | loss.noobject_scale = float(block['noobject_scale']) 351 | loss.class_scale = float(block['class_scale']) 352 | loss.coord_scale = float(block['coord_scale']) 353 | out_filters.append(prev_filters) 354 | out_strides.append(prev_stride) 355 | models.append(loss) 356 | elif block['type'] == 'yolo': 357 | yolo_layer = YoloLayer() 358 | anchors = block['anchors'].split(',') 359 | anchor_mask = block['mask'].split(',') 360 | yolo_layer.anchor_mask = [int(i) for i in anchor_mask] 361 | yolo_layer.anchors = [float(i) for i in anchors] 362 | yolo_layer.num_classes = int(block['classes']) 363 | yolo_layer.num_anchors = int(block['num']) 364 | yolo_layer.anchor_step = len(yolo_layer.anchors) // yolo_layer.num_anchors 365 | yolo_layer.stride = prev_stride 366 | # yolo_layer.object_scale = float(block['object_scale']) 367 | # yolo_layer.noobject_scale = float(block['noobject_scale']) 368 | # yolo_layer.class_scale = float(block['class_scale']) 369 | # yolo_layer.coord_scale = float(block['coord_scale']) 370 | out_filters.append(prev_filters) 371 | out_strides.append(prev_stride) 372 | models.append(yolo_layer) 373 | else: 374 | print('unknown type %s' % (block['type'])) 375 | 376 | return models 377 | 378 | def load_weights(self, weightfile): 379 | fp = open(weightfile, 'rb') 380 | header = np.fromfile(fp, count=5, dtype=np.int32) 381 | self.header = torch.from_numpy(header) 382 | self.seen = self.header[3] 383 | buf = np.fromfile(fp, dtype=np.float32) 384 | fp.close() 385 | 386 | start = 0 387 | ind = -2 388 | for block in self.blocks: 389 | if start >= buf.size: 390 | break 391 | ind = ind + 1 392 | if block['type'] == 'net': 393 | continue 394 | elif block['type'] == 'convolutional': 395 | model = self.models[ind] 396 | batch_normalize = int(block['batch_normalize']) 397 | if batch_normalize: 398 | start = load_conv_bn(buf, start, model[0], model[1]) 399 | else: 400 | start = load_conv(buf, start, model[0]) 401 | elif block['type'] == 'connected': 402 | model = self.models[ind] 403 | if block['activation'] != 'linear': 404 | start = load_fc(buf, start, model[0]) 405 | else: 406 | start = load_fc(buf, start, model) 407 | elif block['type'] == 'maxpool': 408 | pass 409 | elif block['type'] == 'reorg': 410 | pass 411 | elif block['type'] == 'upsample': 412 | pass 413 | elif block['type'] == 'route': 414 | pass 415 | elif block['type'] == 'shortcut': 416 | pass 417 | elif block['type'] == 'region': 418 | pass 419 | elif block['type'] == 'yolo': 420 | pass 421 | elif block['type'] == 'avgpool': 422 | pass 423 | elif block['type'] == 'softmax': 424 | pass 425 | elif block['type'] == 'cost': 426 | pass 427 | else: 428 | print('unknown type %s' % (block['type'])) 429 | 430 | # def save_weights(self, outfile, cutoff=0): 431 | # if cutoff <= 0: 432 | # cutoff = len(self.blocks) - 1 433 | # 434 | # fp = open(outfile, 'wb') 435 | # self.header[3] = self.seen 436 | # header = self.header 437 | # header.numpy().tofile(fp) 438 | # 439 | # ind = -1 440 | # for blockId in range(1, cutoff + 1): 441 | # ind = ind + 1 442 | # block = self.blocks[blockId] 443 | # if block['type'] == 'convolutional': 444 | # model = self.models[ind] 445 | # batch_normalize = int(block['batch_normalize']) 446 | # if batch_normalize: 447 | # save_conv_bn(fp, model[0], model[1]) 448 | # else: 449 | # save_conv(fp, model[0]) 450 | # elif block['type'] == 'connected': 451 | # model = self.models[ind] 452 | # if block['activation'] != 'linear': 453 | # 
save_fc(fc, model) 454 | # else: 455 | # save_fc(fc, model[0]) 456 | # elif block['type'] == 'maxpool': 457 | # pass 458 | # elif block['type'] == 'reorg': 459 | # pass 460 | # elif block['type'] == 'upsample': 461 | # pass 462 | # elif block['type'] == 'route': 463 | # pass 464 | # elif block['type'] == 'shortcut': 465 | # pass 466 | # elif block['type'] == 'region': 467 | # pass 468 | # elif block['type'] == 'yolo': 469 | # pass 470 | # elif block['type'] == 'avgpool': 471 | # pass 472 | # elif block['type'] == 'softmax': 473 | # pass 474 | # elif block['type'] == 'cost': 475 | # pass 476 | # else: 477 | # print('unknown type %s' % (block['type'])) 478 | # fp.close() 479 | -------------------------------------------------------------------------------- /tool/onnx2tensorflow.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import onnx 3 | from onnx_tf.backend import prepare 4 | 5 | 6 | # tensorflow >=2.0 7 | # 1: Thanks:github:https://github.com/onnx/onnx-tensorflow 8 | # 2: Run git clone https://github.com/onnx/onnx-tensorflow.git && cd onnx-tensorflow 9 | # Run pip install -e . 10 | # Note: 11 | # Errors will occur when using "pip install onnx-tf", at least for me, 12 | # it is recommended to use source code installation 13 | def transform_to_tensorflow(onnx_input_path, pb_output_path): 14 | onnx_model = onnx.load(onnx_input_path) # load onnx model 15 | tf_exp = prepare(onnx_model) # prepare tf representation 16 | tf_exp.export_graph(pb_output_path) # export the model 17 | 18 | 19 | if __name__ == '__main__': 20 | if len(sys.argv) == 1: 21 | sys.argv.append('../weight/yolov4_1_3_608_608.onnx') # use:darknet2onnx.py 22 | sys.argv.append('../weight/yolov4.pb') # use:onnx2tensorflow.py 23 | if len(sys.argv) == 3: 24 | onnxfile = sys.argv[1] 25 | tfpb_outfile = sys.argv[2] 26 | transform_to_tensorflow(onnxfile, tfpb_outfile) 27 | else: 28 | print('Please execute this script this way:\n') 29 | print(' python onnx2tensorflow.py ') 30 | -------------------------------------------------------------------------------- /tool/region_loss.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | from tool.utils import * 4 | 5 | 6 | def build_targets(pred_boxes, target, anchors, num_anchors, num_classes, nH, nW, noobject_scale, object_scale, 7 | sil_thresh, seen): 8 | nB = target.size(0) 9 | nA = num_anchors 10 | nC = num_classes 11 | anchor_step = len(anchors) / num_anchors 12 | conf_mask = torch.ones(nB, nA, nH, nW) * noobject_scale 13 | coord_mask = torch.zeros(nB, nA, nH, nW) 14 | cls_mask = torch.zeros(nB, nA, nH, nW) 15 | tx = torch.zeros(nB, nA, nH, nW) 16 | ty = torch.zeros(nB, nA, nH, nW) 17 | tw = torch.zeros(nB, nA, nH, nW) 18 | th = torch.zeros(nB, nA, nH, nW) 19 | tconf = torch.zeros(nB, nA, nH, nW) 20 | tcls = torch.zeros(nB, nA, nH, nW) 21 | 22 | nAnchors = nA * nH * nW 23 | nPixels = nH * nW 24 | for b in range(nB): 25 | cur_pred_boxes = pred_boxes[b * nAnchors:(b + 1) * nAnchors].t() 26 | cur_ious = torch.zeros(nAnchors) 27 | for t in range(50): 28 | if target[b][t * 5 + 1] == 0: 29 | break 30 | gx = target[b][t * 5 + 1] * nW 31 | gy = target[b][t * 5 + 2] * nH 32 | gw = target[b][t * 5 + 3] * nW 33 | gh = target[b][t * 5 + 4] * nH 34 | cur_gt_boxes = torch.FloatTensor([gx, gy, gw, gh]).repeat(nAnchors, 1).t() 35 | cur_ious = torch.max(cur_ious, bbox_ious(cur_pred_boxes, cur_gt_boxes, x1y1x2y2=False)) 36 | conf_mask[b][cur_ious > 
sil_thresh] = 0 37 | if seen < 12800: 38 | if anchor_step == 4: 39 | tx = torch.FloatTensor(anchors).view(nA, anchor_step).index_select(1, torch.LongTensor([2])).view(1, nA, 1, 40 | 1).repeat( 41 | nB, 1, nH, nW) 42 | ty = torch.FloatTensor(anchors).view(num_anchors, anchor_step).index_select(1, torch.LongTensor([2])).view( 43 | 1, nA, 1, 1).repeat(nB, 1, nH, nW) 44 | else: 45 | tx.fill_(0.5) 46 | ty.fill_(0.5) 47 | tw.zero_() 48 | th.zero_() 49 | coord_mask.fill_(1) 50 | 51 | nGT = 0 52 | nCorrect = 0 53 | for b in range(nB): 54 | for t in range(50): 55 | if target[b][t * 5 + 1] == 0: 56 | break 57 | nGT = nGT + 1 58 | best_iou = 0.0 59 | best_n = -1 60 | min_dist = 10000 61 | gx = target[b][t * 5 + 1] * nW 62 | gy = target[b][t * 5 + 2] * nH 63 | gi = int(gx) 64 | gj = int(gy) 65 | gw = target[b][t * 5 + 3] * nW 66 | gh = target[b][t * 5 + 4] * nH 67 | gt_box = [0, 0, gw, gh] 68 | for n in range(nA): 69 | aw = anchors[anchor_step * n] 70 | ah = anchors[anchor_step * n + 1] 71 | anchor_box = [0, 0, aw, ah] 72 | iou = bbox_iou(anchor_box, gt_box, x1y1x2y2=False) 73 | if anchor_step == 4: 74 | ax = anchors[anchor_step * n + 2] 75 | ay = anchors[anchor_step * n + 3] 76 | dist = pow(((gi + ax) - gx), 2) + pow(((gj + ay) - gy), 2) 77 | if iou > best_iou: 78 | best_iou = iou 79 | best_n = n 80 | elif anchor_step == 4 and iou == best_iou and dist < min_dist: 81 | best_iou = iou 82 | best_n = n 83 | min_dist = dist 84 | 85 | gt_box = [gx, gy, gw, gh] 86 | pred_box = pred_boxes[b * nAnchors + best_n * nPixels + gj * nW + gi] 87 | 88 | coord_mask[b][best_n][gj][gi] = 1 89 | cls_mask[b][best_n][gj][gi] = 1 90 | conf_mask[b][best_n][gj][gi] = object_scale 91 | tx[b][best_n][gj][gi] = target[b][t * 5 + 1] * nW - gi 92 | ty[b][best_n][gj][gi] = target[b][t * 5 + 2] * nH - gj 93 | tw[b][best_n][gj][gi] = math.log(gw / anchors[anchor_step * best_n]) 94 | th[b][best_n][gj][gi] = math.log(gh / anchors[anchor_step * best_n + 1]) 95 | iou = bbox_iou(gt_box, pred_box, x1y1x2y2=False) # best_iou 96 | tconf[b][best_n][gj][gi] = iou 97 | tcls[b][best_n][gj][gi] = target[b][t * 5] 98 | if iou > 0.5: 99 | nCorrect = nCorrect + 1 100 | 101 | return nGT, nCorrect, coord_mask, conf_mask, cls_mask, tx, ty, tw, th, tconf, tcls 102 | 103 | 104 | class RegionLoss(nn.Module): 105 | def __init__(self, num_classes=0, anchors=[], num_anchors=1): 106 | super(RegionLoss, self).__init__() 107 | self.num_classes = num_classes 108 | self.anchors = anchors 109 | self.num_anchors = num_anchors 110 | self.anchor_step = len(anchors) / num_anchors 111 | self.coord_scale = 1 112 | self.noobject_scale = 1 113 | self.object_scale = 5 114 | self.class_scale = 1 115 | self.thresh = 0.6 116 | self.seen = 0 117 | 118 | def forward(self, output, target): 119 | # output : BxAs*(4+1+num_classes)*H*W 120 | t0 = time.time() 121 | nB = output.data.size(0) 122 | nA = self.num_anchors 123 | nC = self.num_classes 124 | nH = output.data.size(2) 125 | nW = output.data.size(3) 126 | 127 | output = output.view(nB, nA, (5 + nC), nH, nW) 128 | x = F.sigmoid(output.index_select(2, Variable(torch.cuda.LongTensor([0]))).view(nB, nA, nH, nW)) 129 | y = F.sigmoid(output.index_select(2, Variable(torch.cuda.LongTensor([1]))).view(nB, nA, nH, nW)) 130 | w = output.index_select(2, Variable(torch.cuda.LongTensor([2]))).view(nB, nA, nH, nW) 131 | h = output.index_select(2, Variable(torch.cuda.LongTensor([3]))).view(nB, nA, nH, nW) 132 | conf = F.sigmoid(output.index_select(2, Variable(torch.cuda.LongTensor([4]))).view(nB, nA, nH, nW)) 133 | cls = 
output.index_select(2, Variable(torch.linspace(5, 5 + nC - 1, nC).long().cuda())) 134 | cls = cls.view(nB * nA, nC, nH * nW).transpose(1, 2).contiguous().view(nB * nA * nH * nW, nC) 135 | t1 = time.time() 136 | 137 | pred_boxes = torch.cuda.FloatTensor(4, nB * nA * nH * nW) 138 | grid_x = torch.linspace(0, nW - 1, nW).repeat(nH, 1).repeat(nB * nA, 1, 1).view(nB * nA * nH * nW).cuda() 139 | grid_y = torch.linspace(0, nH - 1, nH).repeat(nW, 1).t().repeat(nB * nA, 1, 1).view(nB * nA * nH * nW).cuda() 140 | anchor_w = torch.Tensor(self.anchors).view(nA, self.anchor_step).index_select(1, torch.LongTensor([0])).cuda() 141 | anchor_h = torch.Tensor(self.anchors).view(nA, self.anchor_step).index_select(1, torch.LongTensor([1])).cuda() 142 | anchor_w = anchor_w.repeat(nB, 1).repeat(1, 1, nH * nW).view(nB * nA * nH * nW) 143 | anchor_h = anchor_h.repeat(nB, 1).repeat(1, 1, nH * nW).view(nB * nA * nH * nW) 144 | pred_boxes[0] = x.data + grid_x 145 | pred_boxes[1] = y.data + grid_y 146 | pred_boxes[2] = torch.exp(w.data) * anchor_w 147 | pred_boxes[3] = torch.exp(h.data) * anchor_h 148 | pred_boxes = convert2cpu(pred_boxes.transpose(0, 1).contiguous().view(-1, 4)) 149 | t2 = time.time() 150 | 151 | nGT, nCorrect, coord_mask, conf_mask, cls_mask, tx, ty, tw, th, tconf, tcls = build_targets(pred_boxes, 152 | target.data, 153 | self.anchors, nA, 154 | nC, \ 155 | nH, nW, 156 | self.noobject_scale, 157 | self.object_scale, 158 | self.thresh, 159 | self.seen) 160 | cls_mask = (cls_mask == 1) 161 | nProposals = int((conf > 0.25).sum().data[0]) 162 | 163 | tx = Variable(tx.cuda()) 164 | ty = Variable(ty.cuda()) 165 | tw = Variable(tw.cuda()) 166 | th = Variable(th.cuda()) 167 | tconf = Variable(tconf.cuda()) 168 | tcls = Variable(tcls.view(-1)[cls_mask].long().cuda()) 169 | 170 | coord_mask = Variable(coord_mask.cuda()) 171 | conf_mask = Variable(conf_mask.cuda().sqrt()) 172 | cls_mask = Variable(cls_mask.view(-1, 1).repeat(1, nC).cuda()) 173 | cls = cls[cls_mask].view(-1, nC) 174 | 175 | t3 = time.time() 176 | 177 | loss_x = self.coord_scale * nn.MSELoss(size_average=False)(x * coord_mask, tx * coord_mask) / 2.0 178 | loss_y = self.coord_scale * nn.MSELoss(size_average=False)(y * coord_mask, ty * coord_mask) / 2.0 179 | loss_w = self.coord_scale * nn.MSELoss(size_average=False)(w * coord_mask, tw * coord_mask) / 2.0 180 | loss_h = self.coord_scale * nn.MSELoss(size_average=False)(h * coord_mask, th * coord_mask) / 2.0 181 | loss_conf = nn.MSELoss(size_average=False)(conf * conf_mask, tconf * conf_mask) / 2.0 182 | loss_cls = self.class_scale * nn.CrossEntropyLoss(size_average=False)(cls, tcls) 183 | loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls 184 | t4 = time.time() 185 | if False: 186 | print('-----------------------------------') 187 | print(' activation : %f' % (t1 - t0)) 188 | print(' create pred_boxes : %f' % (t2 - t1)) 189 | print(' build targets : %f' % (t3 - t2)) 190 | print(' create loss : %f' % (t4 - t3)) 191 | print(' total : %f' % (t4 - t0)) 192 | print('%d: nGT %d, recall %d, proposals %d, loss: x %f, y %f, w %f, h %f, conf %f, cls %f, total %f' % ( 193 | self.seen, nGT, nCorrect, nProposals, loss_x.data[0], loss_y.data[0], loss_w.data[0], loss_h.data[0], 194 | loss_conf.data[0], loss_cls.data[0], loss.data[0])) 195 | return loss 196 | -------------------------------------------------------------------------------- /tool/utils.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import time 4 | import math 5 | 
import torch 6 | import numpy as np 7 | from PIL import Image, ImageDraw, ImageFont 8 | from torch.autograd import Variable 9 | 10 | import itertools 11 | import struct # get_image_size 12 | import imghdr # get_image_size 13 | 14 | 15 | def sigmoid(x): 16 | return 1.0 / (np.exp(-x) + 1.) 17 | 18 | 19 | def softmax(x): 20 | x = np.exp(x - np.expand_dims(np.max(x, axis=1), axis=1)) 21 | x = x / np.expand_dims(x.sum(axis=1), axis=1) 22 | return x 23 | 24 | 25 | def bbox_iou(box1, box2, x1y1x2y2=True): 26 | if x1y1x2y2: 27 | mx = min(box1[0], box2[0]) 28 | Mx = max(box1[2], box2[2]) 29 | my = min(box1[1], box2[1]) 30 | My = max(box1[3], box2[3]) 31 | w1 = box1[2] - box1[0] 32 | h1 = box1[3] - box1[1] 33 | w2 = box2[2] - box2[0] 34 | h2 = box2[3] - box2[1] 35 | else: 36 | mx = min(box1[0] - box1[2] / 2.0, box2[0] - box2[2] / 2.0) 37 | Mx = max(box1[0] + box1[2] / 2.0, box2[0] + box2[2] / 2.0) 38 | my = min(box1[1] - box1[3] / 2.0, box2[1] - box2[3] / 2.0) 39 | My = max(box1[1] + box1[3] / 2.0, box2[1] + box2[3] / 2.0) 40 | w1 = box1[2] 41 | h1 = box1[3] 42 | w2 = box2[2] 43 | h2 = box2[3] 44 | uw = Mx - mx 45 | uh = My - my 46 | cw = w1 + w2 - uw 47 | ch = h1 + h2 - uh 48 | carea = 0 49 | if cw <= 0 or ch <= 0: 50 | return 0.0 51 | 52 | area1 = w1 * h1 53 | area2 = w2 * h2 54 | carea = cw * ch 55 | uarea = area1 + area2 - carea 56 | return carea / uarea 57 | 58 | 59 | def bbox_ious(boxes1, boxes2, x1y1x2y2=True): 60 | if x1y1x2y2: 61 | mx = torch.min(boxes1[0], boxes2[0]) 62 | Mx = torch.max(boxes1[2], boxes2[2]) 63 | my = torch.min(boxes1[1], boxes2[1]) 64 | My = torch.max(boxes1[3], boxes2[3]) 65 | w1 = boxes1[2] - boxes1[0] 66 | h1 = boxes1[3] - boxes1[1] 67 | w2 = boxes2[2] - boxes2[0] 68 | h2 = boxes2[3] - boxes2[1] 69 | else: 70 | mx = torch.min(boxes1[0] - boxes1[2] / 2.0, boxes2[0] - boxes2[2] / 2.0) 71 | Mx = torch.max(boxes1[0] + boxes1[2] / 2.0, boxes2[0] + boxes2[2] / 2.0) 72 | my = torch.min(boxes1[1] - boxes1[3] / 2.0, boxes2[1] - boxes2[3] / 2.0) 73 | My = torch.max(boxes1[1] + boxes1[3] / 2.0, boxes2[1] + boxes2[3] / 2.0) 74 | w1 = boxes1[2] 75 | h1 = boxes1[3] 76 | w2 = boxes2[2] 77 | h2 = boxes2[3] 78 | uw = Mx - mx 79 | uh = My - my 80 | cw = w1 + w2 - uw 81 | ch = h1 + h2 - uh 82 | mask = ((cw <= 0) + (ch <= 0) > 0) 83 | area1 = w1 * h1 84 | area2 = w2 * h2 85 | carea = cw * ch 86 | carea[mask] = 0 87 | uarea = area1 + area2 - carea 88 | return carea / uarea 89 | 90 | 91 | def nms(boxes, nms_thresh): 92 | if len(boxes) == 0: 93 | return boxes 94 | 95 | det_confs = torch.zeros(len(boxes)) 96 | for i in range(len(boxes)): 97 | det_confs[i] = 1 - boxes[i][4] 98 | 99 | _, sortIds = torch.sort(det_confs) 100 | out_boxes = [] 101 | for i in range(len(boxes)): 102 | box_i = boxes[sortIds[i]] 103 | if box_i[4] > 0: 104 | out_boxes.append(box_i) 105 | for j in range(i + 1, len(boxes)): 106 | box_j = boxes[sortIds[j]] 107 | if bbox_iou(box_i, box_j, x1y1x2y2=False) > nms_thresh: 108 | # print(box_i, box_j, bbox_iou(box_i, box_j, x1y1x2y2=False)) 109 | box_j[4] = 0 110 | return out_boxes 111 | 112 | 113 | def convert2cpu(gpu_matrix): 114 | return torch.FloatTensor(gpu_matrix.size()).copy_(gpu_matrix) 115 | 116 | 117 | def convert2cpu_long(gpu_matrix): 118 | return torch.LongTensor(gpu_matrix.size()).copy_(gpu_matrix) 119 | 120 | 121 | def get_region_boxes_in_model(output, conf_thresh, num_classes, anchors, num_anchors, only_objectness=1, 122 | validation=False): 123 | anchor_step = len(anchors) // num_anchors 124 | if output.dim() == 3: 125 | output = output.unsqueeze(0) 126 | batch = 
output.size(0) 127 | assert (output.size(1) == (5 + num_classes) * num_anchors) 128 | h = output.size(2) 129 | w = output.size(3) 130 | 131 | t0 = time.time() 132 | all_boxes = [] 133 | output = output.view(batch * num_anchors, 5 + num_classes, h * w).transpose(0, 1).contiguous().view(5 + num_classes, 134 | batch * num_anchors * h * w) 135 | 136 | grid_x = torch.linspace(0, w - 1, w).repeat(h, 1).repeat(batch * num_anchors, 1, 1).view( 137 | batch * num_anchors * h * w).type_as(output) # cuda() 138 | grid_y = torch.linspace(0, h - 1, h).repeat(w, 1).t().repeat(batch * num_anchors, 1, 1).view( 139 | batch * num_anchors * h * w).type_as(output) # cuda() 140 | xs = torch.sigmoid(output[0]) + grid_x 141 | ys = torch.sigmoid(output[1]) + grid_y 142 | 143 | anchor_w = torch.Tensor(anchors).view(num_anchors, anchor_step).index_select(1, torch.LongTensor([0])) 144 | anchor_h = torch.Tensor(anchors).view(num_anchors, anchor_step).index_select(1, torch.LongTensor([1])) 145 | anchor_w = anchor_w.repeat(batch, 1).repeat(1, 1, h * w).view(batch * num_anchors * h * w).type_as(output) # cuda() 146 | anchor_h = anchor_h.repeat(batch, 1).repeat(1, 1, h * w).view(batch * num_anchors * h * w).type_as(output) # cuda() 147 | ws = torch.exp(output[2]) * anchor_w 148 | hs = torch.exp(output[3]) * anchor_h 149 | 150 | det_confs = torch.sigmoid(output[4]) 151 | 152 | cls_confs = torch.nn.Softmax()(Variable(output[5:5 + num_classes].transpose(0, 1))).data 153 | cls_max_confs, cls_max_ids = torch.max(cls_confs, 1) 154 | cls_max_confs = cls_max_confs.view(-1) 155 | cls_max_ids = cls_max_ids.view(-1) 156 | t1 = time.time() 157 | 158 | sz_hw = h * w 159 | sz_hwa = sz_hw * num_anchors 160 | det_confs = convert2cpu(det_confs) 161 | cls_max_confs = convert2cpu(cls_max_confs) 162 | cls_max_ids = convert2cpu_long(cls_max_ids) 163 | xs = convert2cpu(xs) 164 | ys = convert2cpu(ys) 165 | ws = convert2cpu(ws) 166 | hs = convert2cpu(hs) 167 | if validation: 168 | cls_confs = convert2cpu(cls_confs.view(-1, num_classes)) 169 | t2 = time.time() 170 | for b in range(batch): 171 | boxes = [] 172 | for cy in range(h): 173 | for cx in range(w): 174 | for i in range(num_anchors): 175 | ind = b * sz_hwa + i * sz_hw + cy * w + cx 176 | det_conf = det_confs[ind] 177 | if only_objectness: 178 | conf = det_confs[ind] 179 | else: 180 | conf = det_confs[ind] * cls_max_confs[ind] 181 | 182 | if conf > conf_thresh: 183 | bcx = xs[ind] 184 | bcy = ys[ind] 185 | bw = ws[ind] 186 | bh = hs[ind] 187 | cls_max_conf = cls_max_confs[ind] 188 | cls_max_id = cls_max_ids[ind] 189 | box = [bcx / w, bcy / h, bw / w, bh / h, det_conf, cls_max_conf, cls_max_id] 190 | if (not only_objectness) and validation: 191 | for c in range(num_classes): 192 | tmp_conf = cls_confs[ind][c] 193 | if c != cls_max_id and det_confs[ind] * tmp_conf > conf_thresh: 194 | box.append(tmp_conf) 195 | box.append(c) 196 | boxes.append(box) 197 | all_boxes.append(boxes) 198 | t3 = time.time() 199 | if False: 200 | print('---------------------------------') 201 | print('matrix computation : %f' % (t1 - t0)) 202 | print(' gpu to cpu : %f' % (t2 - t1)) 203 | print(' boxes filter : %f' % (t3 - t2)) 204 | print('---------------------------------') 205 | return all_boxes 206 | 207 | 208 | def get_region_boxes_out_model(output, conf_thresh, num_classes, anchors, num_anchors, only_objectness=1, 209 | validation=False): 210 | anchor_step = len(anchors) // num_anchors 211 | if len(output.shape) == 3: 212 | output = np.expand_dims(output, axis=0) 213 | batch = output.shape[0] 214 | assert 
(output.shape[1] == (5 + num_classes) * num_anchors) 215 | h = output.shape[2] 216 | w = output.shape[3] 217 | 218 | t0 = time.time() 219 | all_boxes = [] 220 | output = output.reshape(batch * num_anchors, 5 + num_classes, h * w).transpose((1, 0, 2)).reshape( 221 | 5 + num_classes, 222 | batch * num_anchors * h * w) 223 | 224 | grid_x = np.expand_dims(np.expand_dims(np.linspace(0, w - 1, w), axis=0).repeat(h, 0), axis=0).repeat( 225 | batch * num_anchors, axis=0).reshape( 226 | batch * num_anchors * h * w) 227 | grid_y = np.expand_dims(np.expand_dims(np.linspace(0, h - 1, h), axis=0).repeat(w, 0).T, axis=0).repeat( 228 | batch * num_anchors, axis=0).reshape( 229 | batch * num_anchors * h * w) 230 | 231 | xs = sigmoid(output[0]) + grid_x 232 | ys = sigmoid(output[1]) + grid_y 233 | 234 | anchor_w = np.array(anchors).reshape((num_anchors, anchor_step))[:, 0] 235 | anchor_h = np.array(anchors).reshape((num_anchors, anchor_step))[:, 1] 236 | anchor_w = np.expand_dims(np.expand_dims(anchor_w, axis=1).repeat(batch, 1), axis=2) \ 237 | .repeat(h * w, axis=2).transpose(1, 0, 2).reshape(batch * num_anchors * h * w) 238 | anchor_h = np.expand_dims(np.expand_dims(anchor_h, axis=1).repeat(batch, 1), axis=2) \ 239 | .repeat(h * w, axis=2).transpose(1, 0, 2).reshape(batch * num_anchors * h * w) 240 | ws = np.exp(output[2]) * anchor_w 241 | hs = np.exp(output[3]) * anchor_h 242 | 243 | det_confs = sigmoid(output[4]) 244 | 245 | cls_confs = softmax(output[5:5 + num_classes].transpose(1, 0)) 246 | cls_max_confs = np.max(cls_confs, 1) 247 | cls_max_ids = np.argmax(cls_confs, 1) 248 | t1 = time.time() 249 | 250 | sz_hw = h * w 251 | sz_hwa = sz_hw * num_anchors 252 | t2 = time.time() 253 | for b in range(batch): 254 | boxes = [] 255 | for cy in range(h): 256 | for cx in range(w): 257 | for i in range(num_anchors): 258 | ind = b * sz_hwa + i * sz_hw + cy * w + cx 259 | det_conf = det_confs[ind] 260 | if only_objectness: 261 | conf = det_confs[ind] 262 | else: 263 | conf = det_confs[ind] * cls_max_confs[ind] 264 | 265 | if conf > conf_thresh: 266 | bcx = xs[ind] 267 | bcy = ys[ind] 268 | bw = ws[ind] 269 | bh = hs[ind] 270 | cls_max_conf = cls_max_confs[ind] 271 | cls_max_id = cls_max_ids[ind] 272 | box = [bcx / w, bcy / h, bw / w, bh / h, det_conf, cls_max_conf, cls_max_id] 273 | if (not only_objectness) and validation: 274 | for c in range(num_classes): 275 | tmp_conf = cls_confs[ind][c] 276 | if c != cls_max_id and det_confs[ind] * tmp_conf > conf_thresh: 277 | box.append(tmp_conf) 278 | box.append(c) 279 | boxes.append(box) 280 | all_boxes.append(boxes) 281 | t3 = time.time() 282 | if False: 283 | print('---------------------------------') 284 | print('matrix computation : %f' % (t1 - t0)) 285 | print(' gpu to cpu : %f' % (t2 - t1)) 286 | print(' boxes filter : %f' % (t3 - t2)) 287 | print('---------------------------------') 288 | return all_boxes 289 | 290 | 291 | def plot_boxes_cv2(img, boxes, savename=None, class_names=None, color=None): 292 | import cv2 293 | colors = torch.FloatTensor([[1, 0, 1], [0, 0, 1], [0, 1, 1], [0, 1, 0], [1, 1, 0], [1, 0, 0]]); 294 | 295 | def get_color(c, x, max_val): 296 | ratio = float(x) / max_val * 5 297 | i = int(math.floor(ratio)) 298 | j = int(math.ceil(ratio)) 299 | ratio = ratio - i 300 | r = (1 - ratio) * colors[i][c] + ratio * colors[j][c] 301 | return int(r * 255) 302 | 303 | width = img.shape[1] 304 | height = img.shape[0] 305 | for i in range(len(boxes)): 306 | box = boxes[i] 307 | x1 = int((box[0] - box[2] / 2.0) * width) 308 | y1 = int((box[1] - box[3] / 2.0) 
* height) 309 | x2 = int((box[0] + box[2] / 2.0) * width) 310 | y2 = int((box[1] + box[3] / 2.0) * height) 311 | 312 | if color: 313 | rgb = color 314 | else: 315 | rgb = (255, 0, 0) 316 | if len(box) >= 7 and class_names: 317 | cls_conf = box[5] 318 | cls_id = box[6] 319 | print('%s: %f' % (class_names[cls_id], cls_conf)) 320 | classes = len(class_names) 321 | offset = cls_id * 123457 % classes 322 | red = get_color(2, offset, classes) 323 | green = get_color(1, offset, classes) 324 | blue = get_color(0, offset, classes) 325 | if color is None: 326 | rgb = (red, green, blue) 327 | img = cv2.putText(img, class_names[cls_id], (x1, y1), cv2.FONT_HERSHEY_SIMPLEX, 1.2, rgb, 1) 328 | img = cv2.rectangle(img, (x1, y1), (x2, y2), rgb, 1) 329 | if savename: 330 | print("save plot results to %s" % savename) 331 | cv2.imwrite(savename, img) 332 | return img 333 | 334 | 335 | def plot_boxes(img, boxes, savename=None, class_names=None): 336 | colors = torch.FloatTensor([[1, 0, 1], [0, 0, 1], [0, 1, 1], [0, 1, 0], [1, 1, 0], [1, 0, 0]]) 337 | 338 | def get_color(c, x, max_val): 339 | ratio = float(x) / max_val * 5 340 | i = int(math.floor(ratio)) 341 | j = int(math.ceil(ratio)) 342 | ratio = ratio - i 343 | r = (1 - ratio) * colors[i][c] + ratio * colors[j][c] 344 | return int(r * 255) 345 | 346 | width = img.width 347 | height = img.height 348 | draw = ImageDraw.Draw(img) 349 | for i in range(len(boxes)): 350 | box = boxes[i] 351 | x1 = (box[0] - box[2] / 2.0) * width 352 | y1 = (box[1] - box[3] / 2.0) * height 353 | x2 = (box[0] + box[2] / 2.0) * width 354 | y2 = (box[1] + box[3] / 2.0) * height 355 | 356 | rgb = (255, 0, 0) 357 | if len(box) >= 7 and class_names: 358 | cls_conf = box[5] 359 | cls_id = box[6] 360 | print('%s: %f' % (class_names[cls_id], cls_conf)) 361 | classes = len(class_names) 362 | offset = cls_id * 123457 % classes 363 | red = get_color(2, offset, classes) 364 | green = get_color(1, offset, classes) 365 | blue = get_color(0, offset, classes) 366 | rgb = (red, green, blue) 367 | draw.text((x1, y1), class_names[cls_id], fill=rgb) 368 | draw.rectangle([x1, y1, x2, y2], outline=rgb) 369 | if savename: 370 | print("save plot results to %s" % savename) 371 | img.save(savename) 372 | return img 373 | 374 | 375 | def read_truths(lab_path): 376 | if not os.path.exists(lab_path): 377 | return np.array([]) 378 | if os.path.getsize(lab_path): 379 | truths = np.loadtxt(lab_path) 380 | truths = truths.reshape(truths.size // 5, 5)  # integer division to avoid the single truth problem 381 | return truths 382 | else: 383 | return np.array([]) 384 | 385 | 386 | def load_class_names(namesfile): 387 | class_names = [] 388 | with open(namesfile, 'r') as fp: 389 | lines = fp.readlines() 390 | for line in lines: 391 | line = line.rstrip() 392 | class_names.append(line) 393 | return class_names 394 | 395 | 396 | def do_detect(model, img, conf_thresh, n_classes, nms_thresh, use_cuda=1): 397 | model.eval() 398 | t0 = time.time() 399 | 400 | if isinstance(img, Image.Image): 401 | width = img.width 402 | height = img.height 403 | img = torch.ByteTensor(torch.ByteStorage.from_buffer(img.tobytes())) 404 | img = img.view(height, width, 3).transpose(0, 1).transpose(0, 2).contiguous() 405 | img = img.view(1, 3, height, width) 406 | img = img.float().div(255.0) 407 | elif type(img) == np.ndarray and len(img.shape) == 3: # cv2 image 408 | img = torch.from_numpy(img.transpose(2, 0, 1)).float().div(255.0).unsqueeze(0) 409 | elif type(img) == np.ndarray and len(img.shape) == 4: 410 | img = torch.from_numpy(img.transpose(0, 3, 1,
2)).float().div(255.0) 411 | else: 412 | print("unknown image type") 413 | exit(-1) 414 | 415 | t1 = time.time() 416 | 417 | if use_cuda: 418 | img = img.cuda() 419 | img = torch.autograd.Variable(img) 420 | t2 = time.time() 421 | 422 | list_features = model(img) 423 | 424 | list_features_numpy = [] 425 | for feature in list_features: 426 | list_features_numpy.append(feature.data.cpu().numpy()) 427 | 428 | return post_processing(img, conf_thresh, n_classes, nms_thresh, list_features_numpy) 429 | 430 | 431 | def post_processing(img, conf_thresh, n_classes, nms_thresh, list_features_numpy): 432 | anchors = [12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401]  # default YOLOv4 COCO anchors as (w, h) pairs 433 | num_anchors = 9 434 | anchor_masks = [[0, 1, 2], [3, 4, 5], [6, 7, 8]] 435 | strides = [8, 16, 32] 436 | anchor_step = len(anchors) // num_anchors 437 | boxes = [] 438 | for i in range(3): 439 | masked_anchors = [] 440 | for m in anchor_masks[i]: 441 | masked_anchors += anchors[m * anchor_step:(m + 1) * anchor_step] 442 | masked_anchors = [anchor / strides[i] for anchor in masked_anchors] 443 | boxes.append(get_region_boxes_out_model(list_features_numpy[i], conf_thresh, n_classes, masked_anchors, 444 | len(anchor_masks[i]))) 445 | # boxes.append(get_region_boxes(list_boxes[i], 0.6, 80, masked_anchors, len(anchor_masks[i]))) 446 | if img.shape[0] > 1: 447 | bboxs_for_imgs = [ 448 | boxes[0][index] + boxes[1][index] + boxes[2][index] 449 | for index in range(img.shape[0])] 450 | # run NMS separately on each image's detections 451 | t3 = time.time() 452 | boxes = [nms(bboxs, nms_thresh) for bboxs in bboxs_for_imgs] 453 | else: 454 | boxes = boxes[0][0] + boxes[1][0] + boxes[2][0] 455 | t3 = time.time() 456 | boxes = nms(boxes, nms_thresh) 457 | t4 = time.time() 458 | 459 | if False: 460 | print('-----------------------------------') 461 | print(' image to tensor : %f' % (t1 - t0)) 462 | print(' tensor to cuda : %f' % (t2 - t1)) 463 | print(' predict : %f' % (t3 - t2)) 464 | print(' nms : %f' % (t4 - t3)) 465 | print(' total : %f' % (t4 - t0)) 466 | print('-----------------------------------') 467 | return boxes 468 | -------------------------------------------------------------------------------- /tool/yolo_layer.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | from tool.utils import * 4 | 5 | 6 | def build_targets(pred_boxes, target, anchors, num_anchors, num_classes, nH, nW, noobject_scale, object_scale, 7 | sil_thresh, seen): 8 | nB = target.size(0) 9 | nA = num_anchors 10 | nC = num_classes 11 | anchor_step = len(anchors) // num_anchors 12 | conf_mask = torch.ones(nB, nA, nH, nW) * noobject_scale 13 | coord_mask = torch.zeros(nB, nA, nH, nW) 14 | cls_mask = torch.zeros(nB, nA, nH, nW) 15 | tx = torch.zeros(nB, nA, nH, nW) 16 | ty = torch.zeros(nB, nA, nH, nW) 17 | tw = torch.zeros(nB, nA, nH, nW) 18 | th = torch.zeros(nB, nA, nH, nW) 19 | tconf = torch.zeros(nB, nA, nH, nW) 20 | tcls = torch.zeros(nB, nA, nH, nW) 21 | 22 | nAnchors = nA * nH * nW 23 | nPixels = nH * nW 24 | for b in range(nB): 25 | cur_pred_boxes = pred_boxes[b * nAnchors:(b + 1) * nAnchors].t() 26 | cur_ious = torch.zeros(nAnchors) 27 | for t in range(50): 28 | if target[b][t * 5 + 1] == 0: 29 | break 30 | gx = target[b][t * 5 + 1] * nW 31 | gy = target[b][t * 5 + 2] * nH 32 | gw = target[b][t * 5 + 3] * nW 33 | gh = target[b][t * 5 + 4] * nH 34 | cur_gt_boxes = torch.FloatTensor([gx, gy, gw, gh]).repeat(nAnchors, 1).t() 35 | cur_ious = torch.max(cur_ious,
bbox_ious(cur_pred_boxes, cur_gt_boxes, x1y1x2y2=False)) 36 | conf_mask[b][cur_ious > sil_thresh] = 0 37 | if seen < 12800: 38 | if anchor_step == 4: 39 | tx = torch.FloatTensor(anchors).view(nA, anchor_step).index_select(1, torch.LongTensor([2])).view(1, nA, 1, 40 | 1).repeat( 41 | nB, 1, nH, nW) 42 | ty = torch.FloatTensor(anchors).view(num_anchors, anchor_step).index_select(1, torch.LongTensor([2])).view( 43 | 1, nA, 1, 1).repeat(nB, 1, nH, nW) 44 | else: 45 | tx.fill_(0.5) 46 | ty.fill_(0.5) 47 | tw.zero_() 48 | th.zero_() 49 | coord_mask.fill_(1) 50 | 51 | nGT = 0 52 | nCorrect = 0 53 | for b in range(nB): 54 | for t in range(50): 55 | if target[b][t * 5 + 1] == 0: 56 | break 57 | nGT = nGT + 1 58 | best_iou = 0.0 59 | best_n = -1 60 | min_dist = 10000 61 | gx = target[b][t * 5 + 1] * nW 62 | gy = target[b][t * 5 + 2] * nH 63 | gi = int(gx) 64 | gj = int(gy) 65 | gw = target[b][t * 5 + 3] * nW 66 | gh = target[b][t * 5 + 4] * nH 67 | gt_box = [0, 0, gw, gh] 68 | for n in range(nA): 69 | aw = anchors[anchor_step * n] 70 | ah = anchors[anchor_step * n + 1] 71 | anchor_box = [0, 0, aw, ah] 72 | iou = bbox_iou(anchor_box, gt_box, x1y1x2y2=False) 73 | if anchor_step == 4: 74 | ax = anchors[anchor_step * n + 2] 75 | ay = anchors[anchor_step * n + 3] 76 | dist = pow(((gi + ax) - gx), 2) + pow(((gj + ay) - gy), 2) 77 | if iou > best_iou: 78 | best_iou = iou 79 | best_n = n 80 | elif anchor_step == 4 and iou == best_iou and dist < min_dist: 81 | best_iou = iou 82 | best_n = n 83 | min_dist = dist 84 | 85 | gt_box = [gx, gy, gw, gh] 86 | pred_box = pred_boxes[b * nAnchors + best_n * nPixels + gj * nW + gi] 87 | 88 | coord_mask[b][best_n][gj][gi] = 1 89 | cls_mask[b][best_n][gj][gi] = 1 90 | conf_mask[b][best_n][gj][gi] = object_scale 91 | tx[b][best_n][gj][gi] = target[b][t * 5 + 1] * nW - gi 92 | ty[b][best_n][gj][gi] = target[b][t * 5 + 2] * nH - gj 93 | tw[b][best_n][gj][gi] = math.log(gw / anchors[anchor_step * best_n]) 94 | th[b][best_n][gj][gi] = math.log(gh / anchors[anchor_step * best_n + 1]) 95 | iou = bbox_iou(gt_box, pred_box, x1y1x2y2=False) # best_iou 96 | tconf[b][best_n][gj][gi] = iou 97 | tcls[b][best_n][gj][gi] = target[b][t * 5] 98 | if iou > 0.5: 99 | nCorrect = nCorrect + 1 100 | 101 | return nGT, nCorrect, coord_mask, conf_mask, cls_mask, tx, ty, tw, th, tconf, tcls 102 | 103 | 104 | class YoloLayer(nn.Module): 105 | ''' Yolo layer 106 | model_out: during inference, selects whether post-processing is done inside or outside the model. 107 | True: outside (the raw feature map is returned and decoded by the caller). 108 | ''' 109 | def __init__(self, anchor_mask=[], num_classes=0, anchors=[], num_anchors=1, stride=32, model_out=True): 110 | super(YoloLayer, self).__init__() 111 | self.anchor_mask = anchor_mask 112 | self.num_classes = num_classes 113 | self.anchors = anchors 114 | self.num_anchors = num_anchors 115 | self.anchor_step = len(anchors) // num_anchors 116 | self.coord_scale = 1 117 | self.noobject_scale = 1 118 | self.object_scale = 5 119 | self.class_scale = 1 120 | self.thresh = 0.6 121 | self.stride = stride 122 | self.seen = 0 123 | 124 | self.model_out = model_out 125 | 126 | def forward(self, output, target=None): 127 | if self.training: 128 | # output : BxAs*(4+1+num_classes)*H*W 129 | t0 = time.time() 130 | nB = output.data.size(0) 131 | nA = self.num_anchors 132 | nC = self.num_classes 133 | nH = output.data.size(2) 134 | nW = output.data.size(3) 135 | 136 | output = output.view(nB, nA, (5 + nC), nH, nW) 137 | x = F.sigmoid(output.index_select(2, Variable(torch.cuda.LongTensor([0]))).view(nB, nA, nH, nW)) 138 | y =
F.sigmoid(output.index_select(2, Variable(torch.cuda.LongTensor([1]))).view(nB, nA, nH, nW)) 139 | w = output.index_select(2, Variable(torch.cuda.LongTensor([2]))).view(nB, nA, nH, nW) 140 | h = output.index_select(2, Variable(torch.cuda.LongTensor([3]))).view(nB, nA, nH, nW) 141 | conf = F.sigmoid(output.index_select(2, Variable(torch.cuda.LongTensor([4]))).view(nB, nA, nH, nW)) 142 | cls = output.index_select(2, Variable(torch.linspace(5, 5 + nC - 1, nC).long().cuda())) 143 | cls = cls.view(nB * nA, nC, nH * nW).transpose(1, 2).contiguous().view(nB * nA * nH * nW, nC) 144 | t1 = time.time() 145 | 146 | pred_boxes = torch.cuda.FloatTensor(4, nB * nA * nH * nW) 147 | grid_x = torch.linspace(0, nW - 1, nW).repeat(nH, 1).repeat(nB * nA, 1, 1).view(nB * nA * nH * nW).cuda() 148 | grid_y = torch.linspace(0, nH - 1, nH).repeat(nW, 1).t().repeat(nB * nA, 1, 1).view( 149 | nB * nA * nH * nW).cuda() 150 | anchor_w = torch.Tensor(self.anchors).view(nA, self.anchor_step).index_select(1, 151 | torch.LongTensor([0])).cuda() 152 | anchor_h = torch.Tensor(self.anchors).view(nA, self.anchor_step).index_select(1, 153 | torch.LongTensor([1])).cuda() 154 | anchor_w = anchor_w.repeat(nB, 1).repeat(1, 1, nH * nW).view(nB * nA * nH * nW) 155 | anchor_h = anchor_h.repeat(nB, 1).repeat(1, 1, nH * nW).view(nB * nA * nH * nW) 156 | pred_boxes[0] = x.data + grid_x 157 | pred_boxes[1] = y.data + grid_y 158 | pred_boxes[2] = torch.exp(w.data) * anchor_w 159 | pred_boxes[3] = torch.exp(h.data) * anchor_h 160 | pred_boxes = convert2cpu(pred_boxes.transpose(0, 1).contiguous().view(-1, 4)) 161 | t2 = time.time() 162 | 163 | nGT, nCorrect, coord_mask, conf_mask, cls_mask, tx, ty, tw, th, tconf, tcls = build_targets(pred_boxes, 164 | target.data, 165 | self.anchors, 166 | nA, nC, \ 167 | nH, nW, 168 | self.noobject_scale, 169 | self.object_scale, 170 | self.thresh, 171 | self.seen) 172 | cls_mask = (cls_mask == 1) 173 | nProposals = int((conf > 0.25).sum().item()) 174 | 175 | tx = Variable(tx.cuda()) 176 | ty = Variable(ty.cuda()) 177 | tw = Variable(tw.cuda()) 178 | th = Variable(th.cuda()) 179 | tconf = Variable(tconf.cuda()) 180 | tcls = Variable(tcls[cls_mask].long().cuda())  # boolean mask has the same shape as tcls 181 | 182 | coord_mask = Variable(coord_mask.cuda()) 183 | conf_mask = Variable(conf_mask.cuda().sqrt()) 184 | cls_mask = Variable(cls_mask.view(-1, 1).repeat(1, nC).cuda()) 185 | cls = cls[cls_mask].view(-1, nC) 186 | 187 | t3 = time.time() 188 | 189 | loss_x = self.coord_scale * nn.MSELoss(size_average=False)(x * coord_mask, tx * coord_mask) / 2.0 190 | loss_y = self.coord_scale * nn.MSELoss(size_average=False)(y * coord_mask, ty * coord_mask) / 2.0 191 | loss_w = self.coord_scale * nn.MSELoss(size_average=False)(w * coord_mask, tw * coord_mask) / 2.0 192 | loss_h = self.coord_scale * nn.MSELoss(size_average=False)(h * coord_mask, th * coord_mask) / 2.0 193 | loss_conf = nn.MSELoss(size_average=False)(conf * conf_mask, tconf * conf_mask) / 2.0 194 | loss_cls = self.class_scale * nn.CrossEntropyLoss(size_average=False)(cls, tcls) 195 | loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls 196 | t4 = time.time() 197 | if False: 198 | print('-----------------------------------') 199 | print(' activation : %f' % (t1 - t0)) 200 | print(' create pred_boxes : %f' % (t2 - t1)) 201 | print(' build targets : %f' % (t3 - t2)) 202 | print(' create loss : %f' % (t4 - t3)) 203 | print(' total : %f' % (t4 - t0)) 204 | print('%d: nGT %d, recall %d, proposals %d, loss: x %f, y %f, w %f, h %f, conf %f, cls %f, total %f' % ( 205 | self.seen,
nGT, nCorrect, nProposals, loss_x.item(), loss_y.item(), loss_w.item(), loss_h.item(), 206 | loss_conf.item(), loss_cls.item(), loss.item())) 207 | return loss 208 | else: 209 | if self.model_out: 210 | return output 211 | else: 212 | masked_anchors = [] 213 | for m in self.anchor_mask: 214 | masked_anchors += self.anchors[m * self.anchor_step:(m + 1) * self.anchor_step] 215 | masked_anchors = [anchor / self.stride for anchor in masked_anchors] 216 | boxes = get_region_boxes_in_model(output.data, self.thresh, self.num_classes, masked_anchors, len(self.anchor_mask)) 217 | return boxes 218 | --------------------------------------------------------------------------------
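
The utilities in tool/utils.py together cover the out-of-model inference path: do_detect normalizes the input image, runs the network, and hands the three feature maps to post_processing, which decodes them with the hard-coded COCO anchors and applies nms. The sketch below is one way to wire load_class_names, do_detect, and plot_boxes_cv2 together; the Darknet wrapper (and its width/height attributes), the cfg/weights paths, and the thresholds are assumptions for illustration rather than code shown above.

import cv2
from tool.darknet2pytorch import Darknet  # assumed wrapper from this repository
from tool.utils import load_class_names, do_detect, plot_boxes_cv2

# Hypothetical paths; point these at your own cfg, weights and names files.
model = Darknet('cfg/yolov4.cfg')
model.load_weights('yolov4.weights')
model.cuda()

class_names = load_class_names('data/coco.names')

img = cv2.imread('data/dog.jpg')  # BGR, original resolution
sized = cv2.cvtColor(cv2.resize(img, (model.width, model.height)), cv2.COLOR_BGR2RGB)

# do_detect returns boxes as [bcx, bcy, bw, bh, det_conf, cls_conf, cls_id]
# with coordinates normalized to [0, 1]; plot_boxes_cv2 rescales them to pixels.
boxes = do_detect(model, sized, conf_thresh=0.4, n_classes=80, nms_thresh=0.6, use_cuda=1)
plot_boxes_cv2(img, boxes, savename='predictions.jpg', class_names=class_names)

Note that post_processing hard-codes the COCO anchor set and strides, so this sketch assumes an 80-class, COCO-style model; a custom model would need those constants adjusted.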