├── .gitignore ├── License.txt ├── README.md ├── Use_yolov4_to_train_your_own_data.md ├── cfg.py ├── cfg ├── yolov3-tiny.cfg ├── yolov3.cfg ├── yolov4-custom.cfg └── yolov4.cfg ├── data ├── coco.names ├── dog.jpg ├── giraffe.jpg ├── prediction.jpg └── voc.names ├── dataset.py ├── demo.py ├── demo_onnx.py ├── demo_tensorflow.py ├── evaluate_on_coco.py ├── models.py ├── requirements.txt ├── tool ├── __init__.py ├── camera.py ├── coco_annotation.py ├── config.py ├── darknet2onnx.py ├── darknet2pytorch.py ├── onnx2tensorflow.py ├── region_loss.py ├── utils.py └── yolo_layer.py └── train.py /.gitignore: -------------------------------------------------------------------------------- 1 | ttest 2 | *.weights 3 | *.pth 4 | *.onnx 5 | 6 | __pycache__ 7 | .idea 8 | .vscode 9 | runs 10 | log 11 | 12 | predictions.jpg 13 | predictions_onnx.jpg 14 | -------------------------------------------------------------------------------- /License.txt: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Pytorch-YOLOv4 2 | 3 | ![](https://img.shields.io/static/v1?label=python&message=3.6|3.7&color=blue) 4 | ![](https://img.shields.io/static/v1?label=pytorch&message=1.4&color=) 5 | [![](https://img.shields.io/static/v1?label=license&message=Apache2&color=green)](./License.txt) 6 | 7 | A minimal PyTorch implementation of YOLOv4. 
8 | - Paper YOLOv4: https://arxiv.org/abs/2004.10934
9 | - Source code: https://github.com/AlexeyAB/darknet
10 | - More details: http://pjreddie.com/darknet/yolo/
11 | 
12 | 
13 | - [x] Inference
14 | - [x] Train
15 | - [x] Mosaic
16 | 
17 | ```
18 | ├── README.md
19 | ├── dataset.py dataset
20 | ├── demo.py demo to run pytorch --> tool/darknet2pytorch
21 | ├── darknet2onnx.py tool to convert into onnx --> tool/darknet2pytorch
22 | ├── demo_onnx.py demo to run the converted onnx model
23 | ├── models.py model for pytorch
24 | ├── train.py train models.py
25 | ├── cfg.py cfg.py for train
26 | ├── cfg cfg --> darknet2pytorch
27 | ├── data
28 | ├── weight --> darknet2pytorch
29 | ├── tool
30 | │   ├── camera.py a demo camera
31 | │   ├── coco_annotation.py coco dataset generator
32 | │   ├── config.py
33 | │   ├── darknet2pytorch.py
34 | │   ├── region_loss.py
35 | │   ├── utils.py
36 | │   └── yolo_layer.py
37 | ```
38 | 
39 | ![image](https://user-gold-cdn.xitu.io/2020/4/26/171b5a6c8b3bd513?w=768&h=576&f=jpeg&s=78882)
40 | 
41 | # 0.Weight
42 | 
43 | ## 0.1 darknet
44 | - baidu(https://pan.baidu.com/s/1dAGEW8cm-dqK14TbhhVetA Extraction code:dm5b)
45 | - google(https://drive.google.com/open?id=1cewMfusmPjYWbrnuJRuKhPMwRe_b9PaT)
46 | 
47 | ## 0.2 pytorch
48 | You can use darknet2pytorch to convert it yourself, or download my converted model.
49 | 
50 | - baidu
51 |   - yolov4.pth(https://pan.baidu.com/s/1ZroDvoGScDgtE1ja_QqJVw Extraction code:xrq9)
52 |   - yolov4.conv.137.pth(https://pan.baidu.com/s/1ovBie4YyVQQoUrC3AY0joA Extraction code:kcel)
53 | - google
54 |   - yolov4.pth(https://drive.google.com/open?id=1wv_LiFeCRYwtpkqREPeI13-gPELBDwuJ)
55 |   - yolov4.conv.137.pth(https://drive.google.com/open?id=1fcbR0bWzYfIEdLJPzOsn4R5mlvR6IQyA)
56 | 
57 | # 1.Train
58 | 
59 | [use yolov4 to train your own data](Use_yolov4_to_train_your_own_data.md)
60 | 
61 | 1. Download weights
62 | 2. Transform data
63 | 
64 | For the COCO dataset, you can use tool/coco_annotation.py.
65 | ```
66 | # train.txt
67 | image_path1 x1,y1,x2,y2,id x1,y1,x2,y2,id x1,y1,x2,y2,id ...
68 | image_path2 x1,y1,x2,y2,id x1,y1,x2,y2,id x1,y1,x2,y2,id ...
69 | ...
70 | ...
71 | ```
72 | 3. Train
73 | 
74 | You can set parameters in cfg.py.
75 | ```
76 | python train.py -g [GPU_ID] -dir [Dataset directory] ...
77 | ```
78 | 
79 | # 2.Inference
80 | - Download the model weights: https://drive.google.com/open?id=1cewMfusmPjYWbrnuJRuKhPMwRe_b9PaT
81 | ```
82 | python demo.py
83 | ```
84 | 
85 | # 3.Darknet2ONNX
86 | 
87 | - **Install onnxruntime**
88 | 
89 | ```sh
90 | pip install onnxruntime
91 | ```
92 | 
93 | - **Run the python script to generate the onnx model and run the demo**
94 | 
95 | ```sh
96 | python demo_onnx.py
97 | ```
98 | 
99 | This script will generate two ONNX models:
100 | 
101 | - one for running the demo (batch_size=1)
102 | - the other is the one you want to generate (batch_size=batchSize)
103 | 
104 | 
105 | # 4.ONNX2Tensorflow
106 | 
107 | - **First: convert the darknet model to ONNX (see section 3 above)**
108 | 
109 | - **Then install onnx-tensorflow (requires tensorflow >= 2.0)**
110 | 
111 | 1: Thanks to github: https://github.com/onnx/onnx-tensorflow
112 | 
113 | 2: Run git clone https://github.com/onnx/onnx-tensorflow.git && cd onnx-tensorflow
114 | Run pip install -e .
115 | 
116 | Note: Errors will occur when using "pip install onnx-tf" (at least for me), so installing from source is recommended.
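If you want to script this step yourself, the onnx-tensorflow backend can load the exported ONNX file and write out a TensorFlow SavedModel. The sketch below only illustrates that API; the ONNX file name is a placeholder (use whatever demo_onnx.py actually wrote), and it is not the repository's own tool/onnx2tensorflow.py or demo_tensorflow.py, which cover this conversion.

```python
# Minimal sketch: ONNX -> TensorFlow SavedModel via onnx-tensorflow.
# "yolov4_batch1.onnx" is a placeholder name for the file exported by demo_onnx.py.
import onnx
from onnx_tf.backend import prepare  # provided by the onnx-tensorflow package

onnx_model = onnx.load("yolov4_batch1.onnx")  # load the exported ONNX graph
tf_rep = prepare(onnx_model)                  # build the TensorFlow representation
tf_rep.export_graph("yolov4_saved_model")     # write a SavedModel directory (TF >= 2.0)
```

The exported directory can then be reloaded with `tf.saved_model.load("yolov4_saved_model")` for inference in TensorFlow.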
117 | 
118 | Reference:
119 | - https://github.com/eriklindernoren/PyTorch-YOLOv3
120 | - https://github.com/marvis/pytorch-caffe-darknet-convert
121 | - https://github.com/marvis/pytorch-yolo3
122 | 
123 | ```
124 | @article{yolov4,
125 |   title={YOLOv4: Optimal Speed and Accuracy of Object Detection},
126 |   author={Alexey Bochkovskiy and Chien-Yao Wang and Hong-Yuan Mark Liao},
127 |   journal = {arXiv},
128 |   year={2020}
129 | }
130 | ```
--------------------------------------------------------------------------------
/Use_yolov4_to_train_your_own_data.md:
--------------------------------------------------------------------------------
1 | The release of YOLOv4 attracted a lot of attention, but since darknet is written in C, reading the code is rather inconvenient, so I wrote a PyTorch version over a weekend (riding the wave of hype a bit). Although pytorch-YOLOv4 had been finished for a while, for various reasons it was never properly validated (mainly laziness). Many people raised issues that helped fix a lot of bugs, and others contributed new features; thanks to everyone for the help. The most frequent request recently has been how to train on your own data, and since yesterday was the weekend again, I finally took care of this long-postponed task. I did not want to use a lot of data, so I made a simple dataset myself.
2 | 
3 | # 1. Code preparation
4 | 
5 | Clone the code from GitHub:
6 | ```
7 | git clone https://github.com/Tianxiaomo/pytorch-YOLOv4.git
8 | ```
9 | # 2. Data preparation
10 | 
11 | Prepare a train.txt that lists each image together with its boxes, in the following format (a small sketch of writing this file is shown at the end of this section):
12 | 
13 | ```
14 | image_path1 x1,y1,x2,y2,id x1,y1,x2,y2,id x1,y1,x2,y2,id ...
15 | image_path2 x1,y1,x2,y2,id x1,y1,x2,y2,id x1,y1,x2,y2,id ...
16 | ...
17 | ```
18 | - image_path : image file name
19 | - x1,y1 : coordinates of the top-left corner
20 | - x2,y2 : coordinates of the bottom-right corner
21 | - id : object class
22 | 
23 | The data I used is a small dataset I made myself for detecting various coins (just three kinds: 1 yuan, 5 jiao, and 1 jiao). Why not build the dataset from something else? I simply had nothing else at hand; the coins felt like a good fit and are also relatively simple compared to other objects.
24 | 
25 | ![UTOOLS1590383513325.png](https://user-gold-cdn.xitu.io/2020/5/25/1724a3e953909b1b?w=1649&h=791&f=png&s=1290382)
26 | 
27 | Only a handful of images were prepared in total.
28 | 
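To make the expected layout concrete, here is a minimal sketch of writing such a train.txt. The `annotations` dictionary is made-up example data; only the output line format matters. cfg.py sets `Cfg.train_label = 'data/train.txt'` by default, so writing the file there lets train.py find it.

```python
# Minimal sketch: write train.txt in the "image_path x1,y1,x2,y2,id ..." format.
# The annotations below are made-up examples; each box is (x1, y1, x2, y2, class_id)
# in pixel coordinates.
annotations = {
    "coin_001.jpg": [(120, 80, 260, 220, 0), (300, 150, 410, 260, 1)],
    "coin_002.jpg": [(50, 60, 180, 190, 2)],
}

with open("data/train.txt", "w") as f:
    for image_path, boxes in annotations.items():
        box_str = " ".join(",".join(str(v) for v in box) for box in boxes)
        f.write(f"{image_path} {box_str}\n")
```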
29 | # 3. Parameter settings
30 | 
31 | When I started training I simply used the original parameters, with the batch size set to 64. After a few epochs I realized something was wrong: my whole dataset has only a bit more than twenty images. I then changed the update strategy so that the schedule is driven by the total number of steps rather than by the steps within each epoch. The loss looked like it was training, so I went to bed and planned to check the next day how training went (who knows what else I changed).
32 | 
33 | When I opened the computer today, the loss had converged to around 2e+4 and refused to go lower. Something was clearly off, so I killed the run. I then set the batch size directly to 4, and training proceeded normally.
34 | 
35 | ```
36 | Cfg.batch = 4
37 | Cfg.subdivisions = 1
38 | ```
39 | 
40 | # 4. Start training
41 | 
42 | ```
43 | python train.py -l 0.001 -g 4 -pretrained ./yolov4.conv.137.pth -classes 3 -dir /home/OCR/coins
44 | 
45 | -l learning rate
46 | -g GPU id
47 | -pretrained pretrained backbone weights, converted from the darknet yolov4.conv.137 provided by AlexeyAB
48 | -classes number of classes
49 | -dir directory containing the images
50 | ```
51 | 
52 | 
53 | Check the loss curve:
54 | ```
55 | tensorboard --logdir log --host 192.168.212.75 --port 6008
56 | ```
57 | ![UTOOLS1590386319240.png](https://user-gold-cdn.xitu.io/2020/5/25/1724a696148d13f3?w=1357&h=795&f=png&s=151465)
58 | 
59 | # 5. Validation
60 | 
61 | ```
62 | python models.py 3 weight/Yolov4_epoch166_coins.pth data/coin2.jpg data/coins.names
63 | 
64 | python models.py num_classes weightfile imagepath namefile
65 | ```
66 | coins.names
67 | ```
68 | 1yuan
69 | 5jiao
70 | 1jiao
71 | 
72 | ```
73 | 
74 | ![UTOOLS1590386705468.png](https://user-gold-cdn.xitu.io/2020/5/25/1724a6f46e826bb8?w=774&h=1377&f=png&s=1191048)
75 | 
76 | The results are passable (the training data only contains three types of coins).
77 | 
78 | # Appendix
79 | 
80 | - coins dataset (link: https://pan.baidu.com/s/1y701NRKSdpj6UKDIH-GpqA
81 | Extraction code: j09s)
82 | - yolov4.conv.137.pth (link: https://pan.baidu.com/s/1ovBie4YyVQQoUrC3AY0joA Extraction code: kcel)
83 | 
--------------------------------------------------------------------------------
/cfg.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | '''
3 | @Time : 2020/05/06 21:05
4 | @Author : Tianxiaomo
5 | @File : Cfg.py
6 | @Notice :
7 | @Modification :
8 | @Author :
9 | @Time :
10 | @Detail :
11 | 
12 | '''
13 | from easydict import EasyDict
14 | 
15 | Cfg = EasyDict()
16 | Cfg.batch = 64
17 | Cfg.subdivisions = 16
18 | Cfg.width = 608
19 | Cfg.height = 608
20 | Cfg.channels = 3
21 | Cfg.momentum = 0.949
22 | Cfg.decay = 0.0005
23 | Cfg.angle = 0
24 | Cfg.saturation = 1.5
25 | Cfg.exposure = 1.5
26 | Cfg.hue = .1
27 | 
28 | Cfg.learning_rate = 0.00261
29 | Cfg.burn_in = 1000
30 | Cfg.max_batches = 500500
31 | Cfg.steps = [400000, 450000]
32 | Cfg.policy = Cfg.steps
33 | Cfg.scales = .1, .1
34 | 
35 | Cfg.cutmix = 0
36 | Cfg.mosaic = 1
37 | 
38 | Cfg.letter_box = 0
39 | Cfg.jitter = 0.2
40 | Cfg.classes = 80
41 | Cfg.track = 0
42 | Cfg.w = Cfg.width
43 | Cfg.h = Cfg.height
44 | Cfg.flip = 1
45 | Cfg.blur = 0
46 | Cfg.gaussian = 0
47 | Cfg.boxes = 60 # box num
48 | Cfg.TRAIN_EPOCHS = 300
49 | Cfg.train_label = 'data/train.txt'
50 | Cfg.val_label = 'data/val.txt'
51 | Cfg.TRAIN_OPTIMIZER = 'adam'
52 | '''
53 | image_path1 x1,y1,x2,y2,id x1,y1,x2,y2,id x1,y1,x2,y2,id ...
54 | image_path2 x1,y1,x2,y2,id x1,y1,x2,y2,id x1,y1,x2,y2,id ...
55 | ...
56 | ''' 57 | 58 | if Cfg.mosaic and Cfg.cutmix: 59 | Cfg.mixup = 4 60 | elif Cfg.cutmix: 61 | Cfg.mixup = 2 62 | elif Cfg.mosaic: 63 | Cfg.mixup = 3 64 | 65 | Cfg.checkpoints = 'checkpoints' 66 | Cfg.TRAIN_TENSORBOARD_DIR = 'log' -------------------------------------------------------------------------------- /cfg/yolov3-tiny.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | batch=1 4 | subdivisions=1 5 | # Training 6 | # batch=64 7 | # subdivisions=2 8 | width=416 9 | height=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | # 0 26 | [convolutional] 27 | batch_normalize=1 28 | filters=16 29 | size=3 30 | stride=1 31 | pad=1 32 | activation=leaky 33 | 34 | # 1 35 | [maxpool] 36 | size=2 37 | stride=2 38 | 39 | # 2 40 | [convolutional] 41 | batch_normalize=1 42 | filters=32 43 | size=3 44 | stride=1 45 | pad=1 46 | activation=leaky 47 | 48 | # 3 49 | [maxpool] 50 | size=2 51 | stride=2 52 | 53 | # 4 54 | [convolutional] 55 | batch_normalize=1 56 | filters=64 57 | size=3 58 | stride=1 59 | pad=1 60 | activation=leaky 61 | 62 | # 5 63 | [maxpool] 64 | size=2 65 | stride=2 66 | 67 | # 6 68 | [convolutional] 69 | batch_normalize=1 70 | filters=128 71 | size=3 72 | stride=1 73 | pad=1 74 | activation=leaky 75 | 76 | # 7 77 | [maxpool] 78 | size=2 79 | stride=2 80 | 81 | # 8 82 | [convolutional] 83 | batch_normalize=1 84 | filters=256 85 | size=3 86 | stride=1 87 | pad=1 88 | activation=leaky 89 | 90 | # 9 91 | [maxpool] 92 | size=2 93 | stride=2 94 | 95 | # 10 96 | [convolutional] 97 | batch_normalize=1 98 | filters=512 99 | size=3 100 | stride=1 101 | pad=1 102 | activation=leaky 103 | 104 | # 11 105 | [maxpool] 106 | size=2 107 | stride=1 108 | 109 | # 12 110 | [convolutional] 111 | batch_normalize=1 112 | filters=1024 113 | size=3 114 | stride=1 115 | pad=1 116 | activation=leaky 117 | 118 | ########### 119 | 120 | # 13 121 | [convolutional] 122 | batch_normalize=1 123 | filters=256 124 | size=1 125 | stride=1 126 | pad=1 127 | activation=leaky 128 | 129 | # 14 130 | [convolutional] 131 | batch_normalize=1 132 | filters=512 133 | size=3 134 | stride=1 135 | pad=1 136 | activation=leaky 137 | 138 | # 15 139 | [convolutional] 140 | size=1 141 | stride=1 142 | pad=1 143 | filters=255 144 | activation=linear 145 | 146 | 147 | 148 | # 16 149 | [yolo] 150 | mask = 3,4,5 151 | anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 152 | classes=80 153 | num=6 154 | jitter=.3 155 | ignore_thresh = .7 156 | truth_thresh = 1 157 | random=1 158 | 159 | # 17 160 | [route] 161 | layers = -4 162 | 163 | # 18 164 | [convolutional] 165 | batch_normalize=1 166 | filters=128 167 | size=1 168 | stride=1 169 | pad=1 170 | activation=leaky 171 | 172 | # 19 173 | [upsample] 174 | stride=2 175 | 176 | # 20 177 | [route] 178 | layers = -1, 8 179 | 180 | # 21 181 | [convolutional] 182 | batch_normalize=1 183 | filters=256 184 | size=3 185 | stride=1 186 | pad=1 187 | activation=leaky 188 | 189 | # 22 190 | [convolutional] 191 | size=1 192 | stride=1 193 | pad=1 194 | filters=255 195 | activation=linear 196 | 197 | # 23 198 | [yolo] 199 | mask = 1,2,3 200 | anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 201 | classes=80 202 | num=6 203 | jitter=.3 204 | ignore_thresh = .7 205 | truth_thresh = 1 206 | random=1 207 | 
-------------------------------------------------------------------------------- /cfg/yolov3.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | batch=1 4 | subdivisions=1 5 | # Training 6 | # batch=64 7 | # subdivisions=16 8 | width=416 9 | height=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 | activation=leaky 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=32 46 | size=1 47 | stride=1 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=leaky 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [shortcut] 110 | from=-3 111 | activation=linear 112 | 113 | # Downsample 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=256 118 | size=3 119 | stride=2 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=128 126 | size=1 127 | stride=1 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 149 | activation=leaky 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | activation=leaky 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 188 | pad=1 189 | activation=leaky 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 
| batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | [convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | stride=1 229 | pad=1 230 | activation=leaky 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=leaky 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | activation=leaky 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | stride=1 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [shortcut] 332 | from=-3 333 | activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=leaky 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=leaky 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | size=3 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=leaky 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [shortcut] 394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | activation=leaky 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [shortcut] 415 | from=-3 416 | activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | 
activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | [convolutional] 576 | batch_normalize=1 577 | size=3 578 | stride=1 579 | pad=1 580 | filters=1024 581 | activation=leaky 582 | 583 | [convolutional] 584 | batch_normalize=1 585 | filters=512 586 | size=1 587 | stride=1 588 | pad=1 589 | activation=leaky 590 | 591 | [convolutional] 592 | batch_normalize=1 593 | size=3 594 | stride=1 595 | pad=1 596 | filters=1024 597 | activation=leaky 598 | 599 | [convolutional] 600 | size=1 601 | stride=1 602 | pad=1 603 | filters=255 604 | activation=linear 605 | 606 | 607 | [yolo] 608 | mask = 6,7,8 609 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 610 | classes=80 611 | num=9 612 | jitter=.3 613 | ignore_thresh = .5 614 | truth_thresh = 1 615 | random=1 616 | 617 | 618 | [route] 619 | layers = -4 620 | 621 | [convolutional] 622 | batch_normalize=1 623 | filters=256 624 | size=1 625 | stride=1 626 | pad=1 627 | activation=leaky 628 | 629 | [upsample] 630 | stride=2 631 | 632 | [route] 633 | layers = -1, 61 634 | 635 | 636 | 637 | [convolutional] 638 | batch_normalize=1 639 | filters=256 640 | size=1 641 | stride=1 642 | pad=1 643 | activation=leaky 644 | 645 | [convolutional] 646 | batch_normalize=1 647 | size=3 648 | stride=1 649 | pad=1 
650 | filters=512 651 | activation=leaky 652 | 653 | [convolutional] 654 | batch_normalize=1 655 | filters=256 656 | size=1 657 | stride=1 658 | pad=1 659 | activation=leaky 660 | 661 | [convolutional] 662 | batch_normalize=1 663 | size=3 664 | stride=1 665 | pad=1 666 | filters=512 667 | activation=leaky 668 | 669 | [convolutional] 670 | batch_normalize=1 671 | filters=256 672 | size=1 673 | stride=1 674 | pad=1 675 | activation=leaky 676 | 677 | [convolutional] 678 | batch_normalize=1 679 | size=3 680 | stride=1 681 | pad=1 682 | filters=512 683 | activation=leaky 684 | 685 | [convolutional] 686 | size=1 687 | stride=1 688 | pad=1 689 | filters=255 690 | activation=linear 691 | 692 | 693 | [yolo] 694 | mask = 3,4,5 695 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 696 | classes=80 697 | num=9 698 | jitter=.3 699 | ignore_thresh = .5 700 | truth_thresh = 1 701 | random=1 702 | 703 | 704 | 705 | [route] 706 | layers = -4 707 | 708 | [convolutional] 709 | batch_normalize=1 710 | filters=128 711 | size=1 712 | stride=1 713 | pad=1 714 | activation=leaky 715 | 716 | [upsample] 717 | stride=2 718 | 719 | [route] 720 | layers = -1, 36 721 | 722 | 723 | 724 | [convolutional] 725 | batch_normalize=1 726 | filters=128 727 | size=1 728 | stride=1 729 | pad=1 730 | activation=leaky 731 | 732 | [convolutional] 733 | batch_normalize=1 734 | size=3 735 | stride=1 736 | pad=1 737 | filters=256 738 | activation=leaky 739 | 740 | [convolutional] 741 | batch_normalize=1 742 | filters=128 743 | size=1 744 | stride=1 745 | pad=1 746 | activation=leaky 747 | 748 | [convolutional] 749 | batch_normalize=1 750 | size=3 751 | stride=1 752 | pad=1 753 | filters=256 754 | activation=leaky 755 | 756 | [convolutional] 757 | batch_normalize=1 758 | filters=128 759 | size=1 760 | stride=1 761 | pad=1 762 | activation=leaky 763 | 764 | [convolutional] 765 | batch_normalize=1 766 | size=3 767 | stride=1 768 | pad=1 769 | filters=256 770 | activation=leaky 771 | 772 | [convolutional] 773 | size=1 774 | stride=1 775 | pad=1 776 | filters=255 777 | activation=linear 778 | 779 | 780 | [yolo] 781 | mask = 0,1,2 782 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 783 | classes=80 784 | num=9 785 | jitter=.3 786 | ignore_thresh = .5 787 | truth_thresh = 1 788 | random=1 789 | 790 | -------------------------------------------------------------------------------- /cfg/yolov4-custom.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | #batch=1 4 | #subdivisions=1 5 | # Training 6 | batch=64 7 | subdivisions=16 8 | width=608 9 | height=608 10 | channels=3 11 | momentum=0.949 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500500 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | #cutmix=1 26 | mosaic=1 27 | 28 | #:104x104 54:52x52 85:26x26 104:13x13 for 416 29 | 30 | [convolutional] 31 | batch_normalize=1 32 | filters=32 33 | size=3 34 | stride=1 35 | pad=1 36 | activation=mish 37 | 38 | # Downsample 39 | 40 | [convolutional] 41 | batch_normalize=1 42 | filters=64 43 | size=3 44 | stride=2 45 | pad=1 46 | activation=mish 47 | 48 | [convolutional] 49 | batch_normalize=1 50 | filters=64 51 | size=1 52 | stride=1 53 | pad=1 54 | activation=mish 55 | 56 | [route] 57 | layers = -2 58 | 59 | [convolutional] 60 | batch_normalize=1 61 | filters=64 62 | size=1 63 | stride=1 64 | pad=1 65 | 
activation=mish 66 | 67 | [convolutional] 68 | batch_normalize=1 69 | filters=32 70 | size=1 71 | stride=1 72 | pad=1 73 | activation=mish 74 | 75 | [convolutional] 76 | batch_normalize=1 77 | filters=64 78 | size=3 79 | stride=1 80 | pad=1 81 | activation=mish 82 | 83 | [shortcut] 84 | from=-3 85 | activation=linear 86 | 87 | [convolutional] 88 | batch_normalize=1 89 | filters=64 90 | size=1 91 | stride=1 92 | pad=1 93 | activation=mish 94 | 95 | [route] 96 | layers = -1,-7 97 | 98 | [convolutional] 99 | batch_normalize=1 100 | filters=64 101 | size=1 102 | stride=1 103 | pad=1 104 | activation=mish 105 | 106 | # Downsample 107 | 108 | [convolutional] 109 | batch_normalize=1 110 | filters=128 111 | size=3 112 | stride=2 113 | pad=1 114 | activation=mish 115 | 116 | [convolutional] 117 | batch_normalize=1 118 | filters=64 119 | size=1 120 | stride=1 121 | pad=1 122 | activation=mish 123 | 124 | [route] 125 | layers = -2 126 | 127 | [convolutional] 128 | batch_normalize=1 129 | filters=64 130 | size=1 131 | stride=1 132 | pad=1 133 | activation=mish 134 | 135 | [convolutional] 136 | batch_normalize=1 137 | filters=64 138 | size=1 139 | stride=1 140 | pad=1 141 | activation=mish 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=64 146 | size=3 147 | stride=1 148 | pad=1 149 | activation=mish 150 | 151 | [shortcut] 152 | from=-3 153 | activation=linear 154 | 155 | [convolutional] 156 | batch_normalize=1 157 | filters=64 158 | size=1 159 | stride=1 160 | pad=1 161 | activation=mish 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=64 166 | size=3 167 | stride=1 168 | pad=1 169 | activation=mish 170 | 171 | [shortcut] 172 | from=-3 173 | activation=linear 174 | 175 | [convolutional] 176 | batch_normalize=1 177 | filters=64 178 | size=1 179 | stride=1 180 | pad=1 181 | activation=mish 182 | 183 | [route] 184 | layers = -1,-10 185 | 186 | [convolutional] 187 | batch_normalize=1 188 | filters=128 189 | size=1 190 | stride=1 191 | pad=1 192 | activation=mish 193 | 194 | # Downsample 195 | 196 | [convolutional] 197 | batch_normalize=1 198 | filters=256 199 | size=3 200 | stride=2 201 | pad=1 202 | activation=mish 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=mish 211 | 212 | [route] 213 | layers = -2 214 | 215 | [convolutional] 216 | batch_normalize=1 217 | filters=128 218 | size=1 219 | stride=1 220 | pad=1 221 | activation=mish 222 | 223 | [convolutional] 224 | batch_normalize=1 225 | filters=128 226 | size=1 227 | stride=1 228 | pad=1 229 | activation=mish 230 | 231 | [convolutional] 232 | batch_normalize=1 233 | filters=128 234 | size=3 235 | stride=1 236 | pad=1 237 | activation=mish 238 | 239 | [shortcut] 240 | from=-3 241 | activation=linear 242 | 243 | [convolutional] 244 | batch_normalize=1 245 | filters=128 246 | size=1 247 | stride=1 248 | pad=1 249 | activation=mish 250 | 251 | [convolutional] 252 | batch_normalize=1 253 | filters=128 254 | size=3 255 | stride=1 256 | pad=1 257 | activation=mish 258 | 259 | [shortcut] 260 | from=-3 261 | activation=linear 262 | 263 | [convolutional] 264 | batch_normalize=1 265 | filters=128 266 | size=1 267 | stride=1 268 | pad=1 269 | activation=mish 270 | 271 | [convolutional] 272 | batch_normalize=1 273 | filters=128 274 | size=3 275 | stride=1 276 | pad=1 277 | activation=mish 278 | 279 | [shortcut] 280 | from=-3 281 | activation=linear 282 | 283 | [convolutional] 284 | batch_normalize=1 285 | filters=128 286 | size=1 287 | stride=1 288 | 
pad=1 289 | activation=mish 290 | 291 | [convolutional] 292 | batch_normalize=1 293 | filters=128 294 | size=3 295 | stride=1 296 | pad=1 297 | activation=mish 298 | 299 | [shortcut] 300 | from=-3 301 | activation=linear 302 | 303 | 304 | [convolutional] 305 | batch_normalize=1 306 | filters=128 307 | size=1 308 | stride=1 309 | pad=1 310 | activation=mish 311 | 312 | [convolutional] 313 | batch_normalize=1 314 | filters=128 315 | size=3 316 | stride=1 317 | pad=1 318 | activation=mish 319 | 320 | [shortcut] 321 | from=-3 322 | activation=linear 323 | 324 | [convolutional] 325 | batch_normalize=1 326 | filters=128 327 | size=1 328 | stride=1 329 | pad=1 330 | activation=mish 331 | 332 | [convolutional] 333 | batch_normalize=1 334 | filters=128 335 | size=3 336 | stride=1 337 | pad=1 338 | activation=mish 339 | 340 | [shortcut] 341 | from=-3 342 | activation=linear 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=128 347 | size=1 348 | stride=1 349 | pad=1 350 | activation=mish 351 | 352 | [convolutional] 353 | batch_normalize=1 354 | filters=128 355 | size=3 356 | stride=1 357 | pad=1 358 | activation=mish 359 | 360 | [shortcut] 361 | from=-3 362 | activation=linear 363 | 364 | [convolutional] 365 | batch_normalize=1 366 | filters=128 367 | size=1 368 | stride=1 369 | pad=1 370 | activation=mish 371 | 372 | [convolutional] 373 | batch_normalize=1 374 | filters=128 375 | size=3 376 | stride=1 377 | pad=1 378 | activation=mish 379 | 380 | [shortcut] 381 | from=-3 382 | activation=linear 383 | 384 | [convolutional] 385 | batch_normalize=1 386 | filters=128 387 | size=1 388 | stride=1 389 | pad=1 390 | activation=mish 391 | 392 | [route] 393 | layers = -1,-28 394 | 395 | [convolutional] 396 | batch_normalize=1 397 | filters=256 398 | size=1 399 | stride=1 400 | pad=1 401 | activation=mish 402 | 403 | # Downsample 404 | 405 | [convolutional] 406 | batch_normalize=1 407 | filters=512 408 | size=3 409 | stride=2 410 | pad=1 411 | activation=mish 412 | 413 | [convolutional] 414 | batch_normalize=1 415 | filters=256 416 | size=1 417 | stride=1 418 | pad=1 419 | activation=mish 420 | 421 | [route] 422 | layers = -2 423 | 424 | [convolutional] 425 | batch_normalize=1 426 | filters=256 427 | size=1 428 | stride=1 429 | pad=1 430 | activation=mish 431 | 432 | [convolutional] 433 | batch_normalize=1 434 | filters=256 435 | size=1 436 | stride=1 437 | pad=1 438 | activation=mish 439 | 440 | [convolutional] 441 | batch_normalize=1 442 | filters=256 443 | size=3 444 | stride=1 445 | pad=1 446 | activation=mish 447 | 448 | [shortcut] 449 | from=-3 450 | activation=linear 451 | 452 | 453 | [convolutional] 454 | batch_normalize=1 455 | filters=256 456 | size=1 457 | stride=1 458 | pad=1 459 | activation=mish 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=256 464 | size=3 465 | stride=1 466 | pad=1 467 | activation=mish 468 | 469 | [shortcut] 470 | from=-3 471 | activation=linear 472 | 473 | 474 | [convolutional] 475 | batch_normalize=1 476 | filters=256 477 | size=1 478 | stride=1 479 | pad=1 480 | activation=mish 481 | 482 | [convolutional] 483 | batch_normalize=1 484 | filters=256 485 | size=3 486 | stride=1 487 | pad=1 488 | activation=mish 489 | 490 | [shortcut] 491 | from=-3 492 | activation=linear 493 | 494 | 495 | [convolutional] 496 | batch_normalize=1 497 | filters=256 498 | size=1 499 | stride=1 500 | pad=1 501 | activation=mish 502 | 503 | [convolutional] 504 | batch_normalize=1 505 | filters=256 506 | size=3 507 | stride=1 508 | pad=1 509 | activation=mish 510 | 
511 | [shortcut] 512 | from=-3 513 | activation=linear 514 | 515 | 516 | [convolutional] 517 | batch_normalize=1 518 | filters=256 519 | size=1 520 | stride=1 521 | pad=1 522 | activation=mish 523 | 524 | [convolutional] 525 | batch_normalize=1 526 | filters=256 527 | size=3 528 | stride=1 529 | pad=1 530 | activation=mish 531 | 532 | [shortcut] 533 | from=-3 534 | activation=linear 535 | 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=256 540 | size=1 541 | stride=1 542 | pad=1 543 | activation=mish 544 | 545 | [convolutional] 546 | batch_normalize=1 547 | filters=256 548 | size=3 549 | stride=1 550 | pad=1 551 | activation=mish 552 | 553 | [shortcut] 554 | from=-3 555 | activation=linear 556 | 557 | 558 | [convolutional] 559 | batch_normalize=1 560 | filters=256 561 | size=1 562 | stride=1 563 | pad=1 564 | activation=mish 565 | 566 | [convolutional] 567 | batch_normalize=1 568 | filters=256 569 | size=3 570 | stride=1 571 | pad=1 572 | activation=mish 573 | 574 | [shortcut] 575 | from=-3 576 | activation=linear 577 | 578 | [convolutional] 579 | batch_normalize=1 580 | filters=256 581 | size=1 582 | stride=1 583 | pad=1 584 | activation=mish 585 | 586 | [convolutional] 587 | batch_normalize=1 588 | filters=256 589 | size=3 590 | stride=1 591 | pad=1 592 | activation=mish 593 | 594 | [shortcut] 595 | from=-3 596 | activation=linear 597 | 598 | [convolutional] 599 | batch_normalize=1 600 | filters=256 601 | size=1 602 | stride=1 603 | pad=1 604 | activation=mish 605 | 606 | [route] 607 | layers = -1,-28 608 | 609 | [convolutional] 610 | batch_normalize=1 611 | filters=512 612 | size=1 613 | stride=1 614 | pad=1 615 | activation=mish 616 | 617 | # Downsample 618 | 619 | [convolutional] 620 | batch_normalize=1 621 | filters=1024 622 | size=3 623 | stride=2 624 | pad=1 625 | activation=mish 626 | 627 | [convolutional] 628 | batch_normalize=1 629 | filters=512 630 | size=1 631 | stride=1 632 | pad=1 633 | activation=mish 634 | 635 | [route] 636 | layers = -2 637 | 638 | [convolutional] 639 | batch_normalize=1 640 | filters=512 641 | size=1 642 | stride=1 643 | pad=1 644 | activation=mish 645 | 646 | [convolutional] 647 | batch_normalize=1 648 | filters=512 649 | size=1 650 | stride=1 651 | pad=1 652 | activation=mish 653 | 654 | [convolutional] 655 | batch_normalize=1 656 | filters=512 657 | size=3 658 | stride=1 659 | pad=1 660 | activation=mish 661 | 662 | [shortcut] 663 | from=-3 664 | activation=linear 665 | 666 | [convolutional] 667 | batch_normalize=1 668 | filters=512 669 | size=1 670 | stride=1 671 | pad=1 672 | activation=mish 673 | 674 | [convolutional] 675 | batch_normalize=1 676 | filters=512 677 | size=3 678 | stride=1 679 | pad=1 680 | activation=mish 681 | 682 | [shortcut] 683 | from=-3 684 | activation=linear 685 | 686 | [convolutional] 687 | batch_normalize=1 688 | filters=512 689 | size=1 690 | stride=1 691 | pad=1 692 | activation=mish 693 | 694 | [convolutional] 695 | batch_normalize=1 696 | filters=512 697 | size=3 698 | stride=1 699 | pad=1 700 | activation=mish 701 | 702 | [shortcut] 703 | from=-3 704 | activation=linear 705 | 706 | [convolutional] 707 | batch_normalize=1 708 | filters=512 709 | size=1 710 | stride=1 711 | pad=1 712 | activation=mish 713 | 714 | [convolutional] 715 | batch_normalize=1 716 | filters=512 717 | size=3 718 | stride=1 719 | pad=1 720 | activation=mish 721 | 722 | [shortcut] 723 | from=-3 724 | activation=linear 725 | 726 | [convolutional] 727 | batch_normalize=1 728 | filters=512 729 | size=1 730 | stride=1 731 | pad=1 732 | 
activation=mish 733 | 734 | [route] 735 | layers = -1,-16 736 | 737 | [convolutional] 738 | batch_normalize=1 739 | filters=1024 740 | size=1 741 | stride=1 742 | pad=1 743 | activation=mish 744 | 745 | ########################## 746 | 747 | [convolutional] 748 | batch_normalize=1 749 | filters=512 750 | size=1 751 | stride=1 752 | pad=1 753 | activation=leaky 754 | 755 | [convolutional] 756 | batch_normalize=1 757 | size=3 758 | stride=1 759 | pad=1 760 | filters=1024 761 | activation=leaky 762 | 763 | [convolutional] 764 | batch_normalize=1 765 | filters=512 766 | size=1 767 | stride=1 768 | pad=1 769 | activation=leaky 770 | 771 | ### SPP ### 772 | [maxpool] 773 | stride=1 774 | size=5 775 | 776 | [route] 777 | layers=-2 778 | 779 | [maxpool] 780 | stride=1 781 | size=9 782 | 783 | [route] 784 | layers=-4 785 | 786 | [maxpool] 787 | stride=1 788 | size=13 789 | 790 | [route] 791 | layers=-1,-3,-5,-6 792 | ### End SPP ### 793 | 794 | [convolutional] 795 | batch_normalize=1 796 | filters=512 797 | size=1 798 | stride=1 799 | pad=1 800 | activation=leaky 801 | 802 | [convolutional] 803 | batch_normalize=1 804 | size=3 805 | stride=1 806 | pad=1 807 | filters=1024 808 | activation=leaky 809 | 810 | [convolutional] 811 | batch_normalize=1 812 | filters=512 813 | size=1 814 | stride=1 815 | pad=1 816 | activation=leaky 817 | 818 | [convolutional] 819 | batch_normalize=1 820 | filters=256 821 | size=1 822 | stride=1 823 | pad=1 824 | activation=leaky 825 | 826 | [upsample] 827 | stride=2 828 | 829 | [route] 830 | layers = 85 831 | 832 | [convolutional] 833 | batch_normalize=1 834 | filters=256 835 | size=1 836 | stride=1 837 | pad=1 838 | activation=leaky 839 | 840 | [route] 841 | layers = -1, -3 842 | 843 | [convolutional] 844 | batch_normalize=1 845 | filters=256 846 | size=1 847 | stride=1 848 | pad=1 849 | activation=leaky 850 | 851 | [convolutional] 852 | batch_normalize=1 853 | size=3 854 | stride=1 855 | pad=1 856 | filters=512 857 | activation=leaky 858 | 859 | [convolutional] 860 | batch_normalize=1 861 | filters=256 862 | size=1 863 | stride=1 864 | pad=1 865 | activation=leaky 866 | 867 | [convolutional] 868 | batch_normalize=1 869 | size=3 870 | stride=1 871 | pad=1 872 | filters=512 873 | activation=leaky 874 | 875 | [convolutional] 876 | batch_normalize=1 877 | filters=256 878 | size=1 879 | stride=1 880 | pad=1 881 | activation=leaky 882 | 883 | [convolutional] 884 | batch_normalize=1 885 | filters=128 886 | size=1 887 | stride=1 888 | pad=1 889 | activation=leaky 890 | 891 | [upsample] 892 | stride=2 893 | 894 | [route] 895 | layers = 54 896 | 897 | [convolutional] 898 | batch_normalize=1 899 | filters=128 900 | size=1 901 | stride=1 902 | pad=1 903 | activation=leaky 904 | 905 | [route] 906 | layers = -1, -3 907 | 908 | [convolutional] 909 | batch_normalize=1 910 | filters=128 911 | size=1 912 | stride=1 913 | pad=1 914 | activation=leaky 915 | 916 | [convolutional] 917 | batch_normalize=1 918 | size=3 919 | stride=1 920 | pad=1 921 | filters=256 922 | activation=leaky 923 | 924 | [convolutional] 925 | batch_normalize=1 926 | filters=128 927 | size=1 928 | stride=1 929 | pad=1 930 | activation=leaky 931 | 932 | [convolutional] 933 | batch_normalize=1 934 | size=3 935 | stride=1 936 | pad=1 937 | filters=256 938 | activation=leaky 939 | 940 | [convolutional] 941 | batch_normalize=1 942 | filters=128 943 | size=1 944 | stride=1 945 | pad=1 946 | activation=leaky 947 | 948 | ########################## 949 | 950 | [convolutional] 951 | batch_normalize=1 952 | size=3 953 | stride=1 
954 | pad=1 955 | filters=256 956 | activation=leaky 957 | 958 | [convolutional] 959 | size=1 960 | stride=1 961 | pad=1 962 | filters=255 963 | activation=linear 964 | 965 | 966 | [yolo] 967 | mask = 0,1,2 968 | anchors = 12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401 969 | classes=80 970 | num=9 971 | jitter=.3 972 | ignore_thresh = .7 973 | truth_thresh = 1 974 | scale_x_y = 1.2 975 | iou_thresh=0.213 976 | cls_normalizer=1.0 977 | iou_normalizer=0.07 978 | iou_loss=ciou 979 | nms_kind=greedynms 980 | beta_nms=0.6 981 | 982 | 983 | [route] 984 | layers = -4 985 | 986 | [convolutional] 987 | batch_normalize=1 988 | size=3 989 | stride=2 990 | pad=1 991 | filters=256 992 | activation=leaky 993 | 994 | [route] 995 | layers = -1, -16 996 | 997 | [convolutional] 998 | batch_normalize=1 999 | filters=256 1000 | size=1 1001 | stride=1 1002 | pad=1 1003 | activation=leaky 1004 | 1005 | [convolutional] 1006 | batch_normalize=1 1007 | size=3 1008 | stride=1 1009 | pad=1 1010 | filters=512 1011 | activation=leaky 1012 | 1013 | [convolutional] 1014 | batch_normalize=1 1015 | filters=256 1016 | size=1 1017 | stride=1 1018 | pad=1 1019 | activation=leaky 1020 | 1021 | [convolutional] 1022 | batch_normalize=1 1023 | size=3 1024 | stride=1 1025 | pad=1 1026 | filters=512 1027 | activation=leaky 1028 | 1029 | [convolutional] 1030 | batch_normalize=1 1031 | filters=256 1032 | size=1 1033 | stride=1 1034 | pad=1 1035 | activation=leaky 1036 | 1037 | [convolutional] 1038 | batch_normalize=1 1039 | size=3 1040 | stride=1 1041 | pad=1 1042 | filters=512 1043 | activation=leaky 1044 | 1045 | [convolutional] 1046 | size=1 1047 | stride=1 1048 | pad=1 1049 | filters=255 1050 | activation=linear 1051 | 1052 | 1053 | [yolo] 1054 | mask = 3,4,5 1055 | anchors = 12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401 1056 | classes=80 1057 | num=9 1058 | jitter=.3 1059 | ignore_thresh = .7 1060 | truth_thresh = 1 1061 | scale_x_y = 1.1 1062 | iou_thresh=0.213 1063 | cls_normalizer=1.0 1064 | iou_normalizer=0.07 1065 | iou_loss=ciou 1066 | nms_kind=greedynms 1067 | beta_nms=0.6 1068 | 1069 | 1070 | [route] 1071 | layers = -4 1072 | 1073 | [convolutional] 1074 | batch_normalize=1 1075 | size=3 1076 | stride=2 1077 | pad=1 1078 | filters=512 1079 | activation=leaky 1080 | 1081 | [route] 1082 | layers = -1, -37 1083 | 1084 | [convolutional] 1085 | batch_normalize=1 1086 | filters=512 1087 | size=1 1088 | stride=1 1089 | pad=1 1090 | activation=leaky 1091 | 1092 | [convolutional] 1093 | batch_normalize=1 1094 | size=3 1095 | stride=1 1096 | pad=1 1097 | filters=1024 1098 | activation=leaky 1099 | 1100 | [convolutional] 1101 | batch_normalize=1 1102 | filters=512 1103 | size=1 1104 | stride=1 1105 | pad=1 1106 | activation=leaky 1107 | 1108 | [convolutional] 1109 | batch_normalize=1 1110 | size=3 1111 | stride=1 1112 | pad=1 1113 | filters=1024 1114 | activation=leaky 1115 | 1116 | [convolutional] 1117 | batch_normalize=1 1118 | filters=512 1119 | size=1 1120 | stride=1 1121 | pad=1 1122 | activation=leaky 1123 | 1124 | [convolutional] 1125 | batch_normalize=1 1126 | size=3 1127 | stride=1 1128 | pad=1 1129 | filters=1024 1130 | activation=leaky 1131 | 1132 | [convolutional] 1133 | size=1 1134 | stride=1 1135 | pad=1 1136 | filters=255 1137 | activation=linear 1138 | 1139 | 1140 | [yolo] 1141 | mask = 6,7,8 1142 | anchors = 12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401 1143 | classes=80 1144 | num=9 1145 | jitter=.3 1146 | ignore_thresh 
= .7 1147 | truth_thresh = 1 1148 | random=1 1149 | scale_x_y = 1.05 1150 | iou_thresh=0.213 1151 | cls_normalizer=1.0 1152 | iou_normalizer=0.07 1153 | iou_loss=ciou 1154 | nms_kind=greedynms 1155 | beta_nms=0.6 1156 | 1157 | -------------------------------------------------------------------------------- /cfg/yolov4.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | batch=64 3 | subdivisions=8 4 | # Training 5 | #width=512 6 | #height=512 7 | width=608 8 | height=608 9 | channels=3 10 | momentum=0.949 11 | decay=0.0005 12 | angle=0 13 | saturation = 1.5 14 | exposure = 1.5 15 | hue=.1 16 | 17 | learning_rate=0.0013 18 | burn_in=1000 19 | max_batches = 500500 20 | policy=steps 21 | steps=400000,450000 22 | scales=.1,.1 23 | 24 | #cutmix=1 25 | mosaic=1 26 | 27 | #:104x104 54:52x52 85:26x26 104:13x13 for 416 28 | 29 | [convolutional] 30 | batch_normalize=1 31 | filters=32 32 | size=3 33 | stride=1 34 | pad=1 35 | activation=mish 36 | 37 | # Downsample 38 | 39 | [convolutional] 40 | batch_normalize=1 41 | filters=64 42 | size=3 43 | stride=2 44 | pad=1 45 | activation=mish 46 | 47 | [convolutional] 48 | batch_normalize=1 49 | filters=64 50 | size=1 51 | stride=1 52 | pad=1 53 | activation=mish 54 | 55 | [route] 56 | layers = -2 57 | 58 | [convolutional] 59 | batch_normalize=1 60 | filters=64 61 | size=1 62 | stride=1 63 | pad=1 64 | activation=mish 65 | 66 | [convolutional] 67 | batch_normalize=1 68 | filters=32 69 | size=1 70 | stride=1 71 | pad=1 72 | activation=mish 73 | 74 | [convolutional] 75 | batch_normalize=1 76 | filters=64 77 | size=3 78 | stride=1 79 | pad=1 80 | activation=mish 81 | 82 | [shortcut] 83 | from=-3 84 | activation=linear 85 | 86 | [convolutional] 87 | batch_normalize=1 88 | filters=64 89 | size=1 90 | stride=1 91 | pad=1 92 | activation=mish 93 | 94 | [route] 95 | layers = -1,-7 96 | 97 | [convolutional] 98 | batch_normalize=1 99 | filters=64 100 | size=1 101 | stride=1 102 | pad=1 103 | activation=mish 104 | 105 | # Downsample 106 | 107 | [convolutional] 108 | batch_normalize=1 109 | filters=128 110 | size=3 111 | stride=2 112 | pad=1 113 | activation=mish 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=64 118 | size=1 119 | stride=1 120 | pad=1 121 | activation=mish 122 | 123 | [route] 124 | layers = -2 125 | 126 | [convolutional] 127 | batch_normalize=1 128 | filters=64 129 | size=1 130 | stride=1 131 | pad=1 132 | activation=mish 133 | 134 | [convolutional] 135 | batch_normalize=1 136 | filters=64 137 | size=1 138 | stride=1 139 | pad=1 140 | activation=mish 141 | 142 | [convolutional] 143 | batch_normalize=1 144 | filters=64 145 | size=3 146 | stride=1 147 | pad=1 148 | activation=mish 149 | 150 | [shortcut] 151 | from=-3 152 | activation=linear 153 | 154 | [convolutional] 155 | batch_normalize=1 156 | filters=64 157 | size=1 158 | stride=1 159 | pad=1 160 | activation=mish 161 | 162 | [convolutional] 163 | batch_normalize=1 164 | filters=64 165 | size=3 166 | stride=1 167 | pad=1 168 | activation=mish 169 | 170 | [shortcut] 171 | from=-3 172 | activation=linear 173 | 174 | [convolutional] 175 | batch_normalize=1 176 | filters=64 177 | size=1 178 | stride=1 179 | pad=1 180 | activation=mish 181 | 182 | [route] 183 | layers = -1,-10 184 | 185 | [convolutional] 186 | batch_normalize=1 187 | filters=128 188 | size=1 189 | stride=1 190 | pad=1 191 | activation=mish 192 | 193 | # Downsample 194 | 195 | [convolutional] 196 | batch_normalize=1 197 | filters=256 198 | size=3 199 | stride=2 200 | pad=1 
201 | activation=mish 202 | 203 | [convolutional] 204 | batch_normalize=1 205 | filters=128 206 | size=1 207 | stride=1 208 | pad=1 209 | activation=mish 210 | 211 | [route] 212 | layers = -2 213 | 214 | [convolutional] 215 | batch_normalize=1 216 | filters=128 217 | size=1 218 | stride=1 219 | pad=1 220 | activation=mish 221 | 222 | [convolutional] 223 | batch_normalize=1 224 | filters=128 225 | size=1 226 | stride=1 227 | pad=1 228 | activation=mish 229 | 230 | [convolutional] 231 | batch_normalize=1 232 | filters=128 233 | size=3 234 | stride=1 235 | pad=1 236 | activation=mish 237 | 238 | [shortcut] 239 | from=-3 240 | activation=linear 241 | 242 | [convolutional] 243 | batch_normalize=1 244 | filters=128 245 | size=1 246 | stride=1 247 | pad=1 248 | activation=mish 249 | 250 | [convolutional] 251 | batch_normalize=1 252 | filters=128 253 | size=3 254 | stride=1 255 | pad=1 256 | activation=mish 257 | 258 | [shortcut] 259 | from=-3 260 | activation=linear 261 | 262 | [convolutional] 263 | batch_normalize=1 264 | filters=128 265 | size=1 266 | stride=1 267 | pad=1 268 | activation=mish 269 | 270 | [convolutional] 271 | batch_normalize=1 272 | filters=128 273 | size=3 274 | stride=1 275 | pad=1 276 | activation=mish 277 | 278 | [shortcut] 279 | from=-3 280 | activation=linear 281 | 282 | [convolutional] 283 | batch_normalize=1 284 | filters=128 285 | size=1 286 | stride=1 287 | pad=1 288 | activation=mish 289 | 290 | [convolutional] 291 | batch_normalize=1 292 | filters=128 293 | size=3 294 | stride=1 295 | pad=1 296 | activation=mish 297 | 298 | [shortcut] 299 | from=-3 300 | activation=linear 301 | 302 | 303 | [convolutional] 304 | batch_normalize=1 305 | filters=128 306 | size=1 307 | stride=1 308 | pad=1 309 | activation=mish 310 | 311 | [convolutional] 312 | batch_normalize=1 313 | filters=128 314 | size=3 315 | stride=1 316 | pad=1 317 | activation=mish 318 | 319 | [shortcut] 320 | from=-3 321 | activation=linear 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=128 326 | size=1 327 | stride=1 328 | pad=1 329 | activation=mish 330 | 331 | [convolutional] 332 | batch_normalize=1 333 | filters=128 334 | size=3 335 | stride=1 336 | pad=1 337 | activation=mish 338 | 339 | [shortcut] 340 | from=-3 341 | activation=linear 342 | 343 | [convolutional] 344 | batch_normalize=1 345 | filters=128 346 | size=1 347 | stride=1 348 | pad=1 349 | activation=mish 350 | 351 | [convolutional] 352 | batch_normalize=1 353 | filters=128 354 | size=3 355 | stride=1 356 | pad=1 357 | activation=mish 358 | 359 | [shortcut] 360 | from=-3 361 | activation=linear 362 | 363 | [convolutional] 364 | batch_normalize=1 365 | filters=128 366 | size=1 367 | stride=1 368 | pad=1 369 | activation=mish 370 | 371 | [convolutional] 372 | batch_normalize=1 373 | filters=128 374 | size=3 375 | stride=1 376 | pad=1 377 | activation=mish 378 | 379 | [shortcut] 380 | from=-3 381 | activation=linear 382 | 383 | [convolutional] 384 | batch_normalize=1 385 | filters=128 386 | size=1 387 | stride=1 388 | pad=1 389 | activation=mish 390 | 391 | [route] 392 | layers = -1,-28 393 | 394 | [convolutional] 395 | batch_normalize=1 396 | filters=256 397 | size=1 398 | stride=1 399 | pad=1 400 | activation=mish 401 | 402 | # Downsample 403 | 404 | [convolutional] 405 | batch_normalize=1 406 | filters=512 407 | size=3 408 | stride=2 409 | pad=1 410 | activation=mish 411 | 412 | [convolutional] 413 | batch_normalize=1 414 | filters=256 415 | size=1 416 | stride=1 417 | pad=1 418 | activation=mish 419 | 420 | [route] 421 | 
layers = -2 422 | 423 | [convolutional] 424 | batch_normalize=1 425 | filters=256 426 | size=1 427 | stride=1 428 | pad=1 429 | activation=mish 430 | 431 | [convolutional] 432 | batch_normalize=1 433 | filters=256 434 | size=1 435 | stride=1 436 | pad=1 437 | activation=mish 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=3 443 | stride=1 444 | pad=1 445 | activation=mish 446 | 447 | [shortcut] 448 | from=-3 449 | activation=linear 450 | 451 | 452 | [convolutional] 453 | batch_normalize=1 454 | filters=256 455 | size=1 456 | stride=1 457 | pad=1 458 | activation=mish 459 | 460 | [convolutional] 461 | batch_normalize=1 462 | filters=256 463 | size=3 464 | stride=1 465 | pad=1 466 | activation=mish 467 | 468 | [shortcut] 469 | from=-3 470 | activation=linear 471 | 472 | 473 | [convolutional] 474 | batch_normalize=1 475 | filters=256 476 | size=1 477 | stride=1 478 | pad=1 479 | activation=mish 480 | 481 | [convolutional] 482 | batch_normalize=1 483 | filters=256 484 | size=3 485 | stride=1 486 | pad=1 487 | activation=mish 488 | 489 | [shortcut] 490 | from=-3 491 | activation=linear 492 | 493 | 494 | [convolutional] 495 | batch_normalize=1 496 | filters=256 497 | size=1 498 | stride=1 499 | pad=1 500 | activation=mish 501 | 502 | [convolutional] 503 | batch_normalize=1 504 | filters=256 505 | size=3 506 | stride=1 507 | pad=1 508 | activation=mish 509 | 510 | [shortcut] 511 | from=-3 512 | activation=linear 513 | 514 | 515 | [convolutional] 516 | batch_normalize=1 517 | filters=256 518 | size=1 519 | stride=1 520 | pad=1 521 | activation=mish 522 | 523 | [convolutional] 524 | batch_normalize=1 525 | filters=256 526 | size=3 527 | stride=1 528 | pad=1 529 | activation=mish 530 | 531 | [shortcut] 532 | from=-3 533 | activation=linear 534 | 535 | 536 | [convolutional] 537 | batch_normalize=1 538 | filters=256 539 | size=1 540 | stride=1 541 | pad=1 542 | activation=mish 543 | 544 | [convolutional] 545 | batch_normalize=1 546 | filters=256 547 | size=3 548 | stride=1 549 | pad=1 550 | activation=mish 551 | 552 | [shortcut] 553 | from=-3 554 | activation=linear 555 | 556 | 557 | [convolutional] 558 | batch_normalize=1 559 | filters=256 560 | size=1 561 | stride=1 562 | pad=1 563 | activation=mish 564 | 565 | [convolutional] 566 | batch_normalize=1 567 | filters=256 568 | size=3 569 | stride=1 570 | pad=1 571 | activation=mish 572 | 573 | [shortcut] 574 | from=-3 575 | activation=linear 576 | 577 | [convolutional] 578 | batch_normalize=1 579 | filters=256 580 | size=1 581 | stride=1 582 | pad=1 583 | activation=mish 584 | 585 | [convolutional] 586 | batch_normalize=1 587 | filters=256 588 | size=3 589 | stride=1 590 | pad=1 591 | activation=mish 592 | 593 | [shortcut] 594 | from=-3 595 | activation=linear 596 | 597 | [convolutional] 598 | batch_normalize=1 599 | filters=256 600 | size=1 601 | stride=1 602 | pad=1 603 | activation=mish 604 | 605 | [route] 606 | layers = -1,-28 607 | 608 | [convolutional] 609 | batch_normalize=1 610 | filters=512 611 | size=1 612 | stride=1 613 | pad=1 614 | activation=mish 615 | 616 | # Downsample 617 | 618 | [convolutional] 619 | batch_normalize=1 620 | filters=1024 621 | size=3 622 | stride=2 623 | pad=1 624 | activation=mish 625 | 626 | [convolutional] 627 | batch_normalize=1 628 | filters=512 629 | size=1 630 | stride=1 631 | pad=1 632 | activation=mish 633 | 634 | [route] 635 | layers = -2 636 | 637 | [convolutional] 638 | batch_normalize=1 639 | filters=512 640 | size=1 641 | stride=1 642 | pad=1 643 | activation=mish 644 | 
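# Note on this stage of the backbone: "[route] layers = -2" above returns to the stride-2
# 1024-filter convolution, opening the second branch of the CSP block; after the four residual
# blocks that follow ([convolutional] 1x1 + 3x3 + [shortcut]), "[route] layers = -1,-16"
# concatenates the two 512-filter branches and a final 1x1 convolution restores 1024 filters
# before the SPP neck.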
645 | [convolutional] 646 | batch_normalize=1 647 | filters=512 648 | size=1 649 | stride=1 650 | pad=1 651 | activation=mish 652 | 653 | [convolutional] 654 | batch_normalize=1 655 | filters=512 656 | size=3 657 | stride=1 658 | pad=1 659 | activation=mish 660 | 661 | [shortcut] 662 | from=-3 663 | activation=linear 664 | 665 | [convolutional] 666 | batch_normalize=1 667 | filters=512 668 | size=1 669 | stride=1 670 | pad=1 671 | activation=mish 672 | 673 | [convolutional] 674 | batch_normalize=1 675 | filters=512 676 | size=3 677 | stride=1 678 | pad=1 679 | activation=mish 680 | 681 | [shortcut] 682 | from=-3 683 | activation=linear 684 | 685 | [convolutional] 686 | batch_normalize=1 687 | filters=512 688 | size=1 689 | stride=1 690 | pad=1 691 | activation=mish 692 | 693 | [convolutional] 694 | batch_normalize=1 695 | filters=512 696 | size=3 697 | stride=1 698 | pad=1 699 | activation=mish 700 | 701 | [shortcut] 702 | from=-3 703 | activation=linear 704 | 705 | [convolutional] 706 | batch_normalize=1 707 | filters=512 708 | size=1 709 | stride=1 710 | pad=1 711 | activation=mish 712 | 713 | [convolutional] 714 | batch_normalize=1 715 | filters=512 716 | size=3 717 | stride=1 718 | pad=1 719 | activation=mish 720 | 721 | [shortcut] 722 | from=-3 723 | activation=linear 724 | 725 | [convolutional] 726 | batch_normalize=1 727 | filters=512 728 | size=1 729 | stride=1 730 | pad=1 731 | activation=mish 732 | 733 | [route] 734 | layers = -1,-16 735 | 736 | [convolutional] 737 | batch_normalize=1 738 | filters=1024 739 | size=1 740 | stride=1 741 | pad=1 742 | activation=mish 743 | 744 | ########################## 745 | 746 | [convolutional] 747 | batch_normalize=1 748 | filters=512 749 | size=1 750 | stride=1 751 | pad=1 752 | activation=leaky 753 | 754 | [convolutional] 755 | batch_normalize=1 756 | size=3 757 | stride=1 758 | pad=1 759 | filters=1024 760 | activation=leaky 761 | 762 | [convolutional] 763 | batch_normalize=1 764 | filters=512 765 | size=1 766 | stride=1 767 | pad=1 768 | activation=leaky 769 | 770 | ### SPP ### 771 | [maxpool] 772 | stride=1 773 | size=5 774 | 775 | [route] 776 | layers=-2 777 | 778 | [maxpool] 779 | stride=1 780 | size=9 781 | 782 | [route] 783 | layers=-4 784 | 785 | [maxpool] 786 | stride=1 787 | size=13 788 | 789 | [route] 790 | layers=-1,-3,-5,-6 791 | ### End SPP ### 792 | 793 | [convolutional] 794 | batch_normalize=1 795 | filters=512 796 | size=1 797 | stride=1 798 | pad=1 799 | activation=leaky 800 | 801 | [convolutional] 802 | batch_normalize=1 803 | size=3 804 | stride=1 805 | pad=1 806 | filters=1024 807 | activation=leaky 808 | 809 | [convolutional] 810 | batch_normalize=1 811 | filters=512 812 | size=1 813 | stride=1 814 | pad=1 815 | activation=leaky 816 | 817 | [convolutional] 818 | batch_normalize=1 819 | filters=256 820 | size=1 821 | stride=1 822 | pad=1 823 | activation=leaky 824 | 825 | [upsample] 826 | stride=2 827 | 828 | [route] 829 | layers = 85 830 | 831 | [convolutional] 832 | batch_normalize=1 833 | filters=256 834 | size=1 835 | stride=1 836 | pad=1 837 | activation=leaky 838 | 839 | [route] 840 | layers = -1, -3 841 | 842 | [convolutional] 843 | batch_normalize=1 844 | filters=256 845 | size=1 846 | stride=1 847 | pad=1 848 | activation=leaky 849 | 850 | [convolutional] 851 | batch_normalize=1 852 | size=3 853 | stride=1 854 | pad=1 855 | filters=512 856 | activation=leaky 857 | 858 | [convolutional] 859 | batch_normalize=1 860 | filters=256 861 | size=1 862 | stride=1 863 | pad=1 864 | activation=leaky 865 | 866 | 
[convolutional] 867 | batch_normalize=1 868 | size=3 869 | stride=1 870 | pad=1 871 | filters=512 872 | activation=leaky 873 | 874 | [convolutional] 875 | batch_normalize=1 876 | filters=256 877 | size=1 878 | stride=1 879 | pad=1 880 | activation=leaky 881 | 882 | [convolutional] 883 | batch_normalize=1 884 | filters=128 885 | size=1 886 | stride=1 887 | pad=1 888 | activation=leaky 889 | 890 | [upsample] 891 | stride=2 892 | 893 | [route] 894 | layers = 54 895 | 896 | [convolutional] 897 | batch_normalize=1 898 | filters=128 899 | size=1 900 | stride=1 901 | pad=1 902 | activation=leaky 903 | 904 | [route] 905 | layers = -1, -3 906 | 907 | [convolutional] 908 | batch_normalize=1 909 | filters=128 910 | size=1 911 | stride=1 912 | pad=1 913 | activation=leaky 914 | 915 | [convolutional] 916 | batch_normalize=1 917 | size=3 918 | stride=1 919 | pad=1 920 | filters=256 921 | activation=leaky 922 | 923 | [convolutional] 924 | batch_normalize=1 925 | filters=128 926 | size=1 927 | stride=1 928 | pad=1 929 | activation=leaky 930 | 931 | [convolutional] 932 | batch_normalize=1 933 | size=3 934 | stride=1 935 | pad=1 936 | filters=256 937 | activation=leaky 938 | 939 | [convolutional] 940 | batch_normalize=1 941 | filters=128 942 | size=1 943 | stride=1 944 | pad=1 945 | activation=leaky 946 | 947 | ########################## 948 | 949 | [convolutional] 950 | batch_normalize=1 951 | size=3 952 | stride=1 953 | pad=1 954 | filters=256 955 | activation=leaky 956 | 957 | [convolutional] 958 | size=1 959 | stride=1 960 | pad=1 961 | filters=255 962 | activation=linear 963 | 964 | 965 | [yolo] 966 | mask = 0,1,2 967 | anchors = 12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401 968 | classes=80 969 | num=9 970 | jitter=.3 971 | ignore_thresh = .7 972 | truth_thresh = 1 973 | scale_x_y = 1.2 974 | iou_thresh=0.213 975 | cls_normalizer=1.0 976 | iou_normalizer=0.07 977 | iou_loss=ciou 978 | nms_kind=greedynms 979 | beta_nms=0.6 980 | max_delta=5 981 | 982 | 983 | [route] 984 | layers = -4 985 | 986 | [convolutional] 987 | batch_normalize=1 988 | size=3 989 | stride=2 990 | pad=1 991 | filters=256 992 | activation=leaky 993 | 994 | [route] 995 | layers = -1, -16 996 | 997 | [convolutional] 998 | batch_normalize=1 999 | filters=256 1000 | size=1 1001 | stride=1 1002 | pad=1 1003 | activation=leaky 1004 | 1005 | [convolutional] 1006 | batch_normalize=1 1007 | size=3 1008 | stride=1 1009 | pad=1 1010 | filters=512 1011 | activation=leaky 1012 | 1013 | [convolutional] 1014 | batch_normalize=1 1015 | filters=256 1016 | size=1 1017 | stride=1 1018 | pad=1 1019 | activation=leaky 1020 | 1021 | [convolutional] 1022 | batch_normalize=1 1023 | size=3 1024 | stride=1 1025 | pad=1 1026 | filters=512 1027 | activation=leaky 1028 | 1029 | [convolutional] 1030 | batch_normalize=1 1031 | filters=256 1032 | size=1 1033 | stride=1 1034 | pad=1 1035 | activation=leaky 1036 | 1037 | [convolutional] 1038 | batch_normalize=1 1039 | size=3 1040 | stride=1 1041 | pad=1 1042 | filters=512 1043 | activation=leaky 1044 | 1045 | [convolutional] 1046 | size=1 1047 | stride=1 1048 | pad=1 1049 | filters=255 1050 | activation=linear 1051 | 1052 | 1053 | [yolo] 1054 | mask = 3,4,5 1055 | anchors = 12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401 1056 | classes=80 1057 | num=9 1058 | jitter=.3 1059 | ignore_thresh = .7 1060 | truth_thresh = 1 1061 | scale_x_y = 1.1 1062 | iou_thresh=0.213 1063 | cls_normalizer=1.0 1064 | iou_normalizer=0.07 1065 | iou_loss=ciou 1066 | 
nms_kind=greedynms 1067 | beta_nms=0.6 1068 | max_delta=5 1069 | 1070 | 1071 | [route] 1072 | layers = -4 1073 | 1074 | [convolutional] 1075 | batch_normalize=1 1076 | size=3 1077 | stride=2 1078 | pad=1 1079 | filters=512 1080 | activation=leaky 1081 | 1082 | [route] 1083 | layers = -1, -37 1084 | 1085 | [convolutional] 1086 | batch_normalize=1 1087 | filters=512 1088 | size=1 1089 | stride=1 1090 | pad=1 1091 | activation=leaky 1092 | 1093 | [convolutional] 1094 | batch_normalize=1 1095 | size=3 1096 | stride=1 1097 | pad=1 1098 | filters=1024 1099 | activation=leaky 1100 | 1101 | [convolutional] 1102 | batch_normalize=1 1103 | filters=512 1104 | size=1 1105 | stride=1 1106 | pad=1 1107 | activation=leaky 1108 | 1109 | [convolutional] 1110 | batch_normalize=1 1111 | size=3 1112 | stride=1 1113 | pad=1 1114 | filters=1024 1115 | activation=leaky 1116 | 1117 | [convolutional] 1118 | batch_normalize=1 1119 | filters=512 1120 | size=1 1121 | stride=1 1122 | pad=1 1123 | activation=leaky 1124 | 1125 | [convolutional] 1126 | batch_normalize=1 1127 | size=3 1128 | stride=1 1129 | pad=1 1130 | filters=1024 1131 | activation=leaky 1132 | 1133 | [convolutional] 1134 | size=1 1135 | stride=1 1136 | pad=1 1137 | filters=255 1138 | activation=linear 1139 | 1140 | 1141 | [yolo] 1142 | mask = 6,7,8 1143 | anchors = 12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401 1144 | classes=80 1145 | num=9 1146 | jitter=.3 1147 | ignore_thresh = .7 1148 | truth_thresh = 1 1149 | random=1 1150 | scale_x_y = 1.05 1151 | iou_thresh=0.213 1152 | cls_normalizer=1.0 1153 | iou_normalizer=0.07 1154 | iou_loss=ciou 1155 | nms_kind=greedynms 1156 | beta_nms=0.6 1157 | max_delta=5 1158 | -------------------------------------------------------------------------------- /data/coco.names: -------------------------------------------------------------------------------- 1 | person 2 | bicycle 3 | car 4 | motorbike 5 | aeroplane 6 | bus 7 | train 8 | truck 9 | boat 10 | traffic light 11 | fire hydrant 12 | stop sign 13 | parking meter 14 | bench 15 | bird 16 | cat 17 | dog 18 | horse 19 | sheep 20 | cow 21 | elephant 22 | bear 23 | zebra 24 | giraffe 25 | backpack 26 | umbrella 27 | handbag 28 | tie 29 | suitcase 30 | frisbee 31 | skis 32 | snowboard 33 | sports ball 34 | kite 35 | baseball bat 36 | baseball glove 37 | skateboard 38 | surfboard 39 | tennis racket 40 | bottle 41 | wine glass 42 | cup 43 | fork 44 | knife 45 | spoon 46 | bowl 47 | banana 48 | apple 49 | sandwich 50 | orange 51 | broccoli 52 | carrot 53 | hot dog 54 | pizza 55 | donut 56 | cake 57 | chair 58 | sofa 59 | pottedplant 60 | bed 61 | diningtable 62 | toilet 63 | tvmonitor 64 | laptop 65 | mouse 66 | remote 67 | keyboard 68 | cell phone 69 | microwave 70 | oven 71 | toaster 72 | sink 73 | refrigerator 74 | book 75 | clock 76 | vase 77 | scissors 78 | teddy bear 79 | hair drier 80 | toothbrush 81 | -------------------------------------------------------------------------------- /data/dog.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/roboflow/pytorch-YOLOv4/b6189b304b9a60a15bc0a9c3627ec4973fe0e2b5/data/dog.jpg -------------------------------------------------------------------------------- /data/giraffe.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/roboflow/pytorch-YOLOv4/b6189b304b9a60a15bc0a9c3627ec4973fe0e2b5/data/giraffe.jpg 
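The three [yolo] sections in cfg/yolov4.cfg (and cfg/yolov4-custom.cfg) share a single list of nine anchors; mask = 0,1,2 / 3,4,5 / 6,7,8 selects the three anchors used at each detection scale, and the linear-activation convolution directly before each [yolo] section has filters = 3 x (classes + 5) = 255 for the 80 classes listed in data/coco.names. The sketch below is minimal and illustrative, not the repository's own loader (tool/darknet2pytorch.py, used by demo.py, builds the actual network from these cfg files); parse_darknet_cfg and its dict-per-section output are assumptions made for this example.

def parse_darknet_cfg(path):
    """Split a Darknet-style cfg (e.g. cfg/yolov4.cfg) into a list of {'type': ..., key: value} dicts."""
    sections = []
    with open(path, 'r') as f:
        for raw in f:
            line = raw.split('#', 1)[0].strip()          # drop comments and blank lines
            if not line:
                continue
            if line.startswith('[') and line.endswith(']'):
                sections.append({'type': line[1:-1]})    # start a new section, e.g. 'yolo'
            elif '=' in line and sections:
                key, value = (part.strip() for part in line.split('=', 1))
                sections[-1][key] = value
    return sections

if __name__ == '__main__':
    blocks = parse_darknet_cfg('cfg/yolov4.cfg')
    for block in blocks:
        if block['type'] != 'yolo':
            continue
        mask = block['mask'].split(',')                  # e.g. ['0', '1', '2']
        classes = int(block['classes'])                  # 80 for COCO
        # each of the len(mask) anchors predicts (x, y, w, h, objectness) plus one score per class
        print(block['mask'], '-> filters before this head =', len(mask) * (classes + 5))  # 3 * 85 = 255

For a custom dataset this is the invariant to keep: change classes in every [yolo] section and set filters in the convolution immediately above it to 3 * (classes + 5).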
-------------------------------------------------------------------------------- /data/prediction.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/roboflow/pytorch-YOLOv4/b6189b304b9a60a15bc0a9c3627ec4973fe0e2b5/data/prediction.jpg -------------------------------------------------------------------------------- /data/voc.names: -------------------------------------------------------------------------------- 1 | aeroplane 2 | bicycle 3 | bird 4 | boat 5 | bottle 6 | bus 7 | car 8 | cat 9 | chair 10 | cow 11 | diningtable 12 | dog 13 | horse 14 | motorbike 15 | person 16 | pottedplant 17 | sheep 18 | sofa 19 | train 20 | tvmonitor 21 | -------------------------------------------------------------------------------- /dataset.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ''' 3 | @Time : 2020/05/06 21:09 4 | @Author : Tianxiaomo 5 | @File : dataset.py 6 | @Noice : 7 | @Modificattion : 8 | @Author : 9 | @Time : 10 | @Detail : 11 | 12 | ''' 13 | from torch.utils.data.dataset import Dataset 14 | 15 | import random 16 | import cv2 17 | import sys 18 | import numpy as np 19 | import os 20 | import matplotlib.pyplot as plt 21 | 22 | 23 | def rand_uniform_strong(min, max): 24 | if min > max: 25 | swap = min 26 | min = max 27 | max = swap 28 | return random.random() * (max - min) + min 29 | 30 | 31 | def rand_scale(s): 32 | scale = rand_uniform_strong(1, s) 33 | if random.randint(0, 1) % 2: 34 | return scale 35 | return 1. / scale 36 | 37 | 38 | def rand_precalc_random(min, max, random_part): 39 | if max < min: 40 | swap = min 41 | min = max 42 | max = swap 43 | return (random_part * (max - min)) + min 44 | 45 | 46 | def fill_truth_detection(bboxes, num_boxes, classes, flip, dx, dy, sx, sy, net_w, net_h): 47 | if bboxes.shape[0] == 0: 48 | return bboxes, 10000 49 | np.random.shuffle(bboxes) 50 | bboxes[:, 0] -= dx 51 | bboxes[:, 2] -= dx 52 | bboxes[:, 1] -= dy 53 | bboxes[:, 3] -= dy 54 | 55 | bboxes[:, 0] = np.clip(bboxes[:, 0], 0, sx) 56 | bboxes[:, 2] = np.clip(bboxes[:, 2], 0, sx) 57 | 58 | bboxes[:, 1] = np.clip(bboxes[:, 1], 0, sy) 59 | bboxes[:, 3] = np.clip(bboxes[:, 3], 0, sy) 60 | 61 | out_box = list(np.where(((bboxes[:, 1] == sy) & (bboxes[:, 3] == sy)) | 62 | ((bboxes[:, 0] == sx) & (bboxes[:, 2] == sx)) | 63 | ((bboxes[:, 1] == 0) & (bboxes[:, 3] == 0)) | 64 | ((bboxes[:, 0] == 0) & (bboxes[:, 2] == 0)))[0]) 65 | list_box = list(range(bboxes.shape[0])) 66 | for i in out_box: 67 | list_box.remove(i) 68 | bboxes = bboxes[list_box] 69 | 70 | if bboxes.shape[0] == 0: 71 | return bboxes, 10000 72 | 73 | bboxes = bboxes[np.where((bboxes[:, 4] < classes) & (bboxes[:, 4] >= 0))[0]] 74 | 75 | if bboxes.shape[0] > num_boxes: 76 | bboxes = bboxes[:num_boxes] 77 | 78 | min_w_h = np.array([bboxes[:, 2] - bboxes[:, 0], bboxes[:, 3] - bboxes[:, 1]]).min() 79 | 80 | bboxes[:, 0] *= (net_w / sx) 81 | bboxes[:, 2] *= (net_w / sx) 82 | bboxes[:, 1] *= (net_h / sy) 83 | bboxes[:, 3] *= (net_h / sy) 84 | 85 | if flip: 86 | temp = net_w - bboxes[:, 0] 87 | bboxes[:, 0] = net_w - bboxes[:, 2] 88 | bboxes[:, 2] = temp 89 | 90 | return bboxes, min_w_h 91 | 92 | 93 | def rect_intersection(a, b): 94 | minx = max(a[0], b[0]) 95 | miny = max(a[1], b[1]) 96 | 97 | maxx = min(a[2], b[2]) 98 | maxy = min(a[3], b[3]) 99 | return [minx, miny, maxx, maxy] 100 | 101 | 102 | def image_data_augmentation(mat, w, h, pleft, ptop, swidth, sheight, flip, dhue, dsat, dexp, gaussian_noise, 
blur, 103 | truth): 104 | try: 105 | img = mat 106 | oh, ow, _ = img.shape 107 | pleft, ptop, swidth, sheight = int(pleft), int(ptop), int(swidth), int(sheight) 108 | # crop 109 | src_rect = [pleft, ptop, swidth + pleft, sheight + ptop] # x1,y1,x2,y2 110 | img_rect = [0, 0, ow, oh] 111 | new_src_rect = rect_intersection(src_rect, img_rect) # 交集 112 | 113 | dst_rect = [max(0, -pleft), max(0, -ptop), max(0, -pleft) + new_src_rect[2] - new_src_rect[0], 114 | max(0, -ptop) + new_src_rect[3] - new_src_rect[1]] 115 | # cv2.Mat sized 116 | 117 | if (src_rect[0] == 0 and src_rect[1] == 0 and src_rect[2] == img.shape[0] and src_rect[3] == img.shape[1]): 118 | sized = cv2.resize(img, (w, h), cv2.INTER_LINEAR) 119 | else: 120 | cropped = np.zeros([sheight, swidth, 3]) 121 | cropped[:, :, ] = np.mean(img, axis=(0, 1)) 122 | 123 | cropped[dst_rect[1]:dst_rect[3], dst_rect[0]:dst_rect[2]] = \ 124 | img[new_src_rect[1]:new_src_rect[3], new_src_rect[0]:new_src_rect[2]] 125 | 126 | # resize 127 | sized = cv2.resize(cropped, (w, h), cv2.INTER_LINEAR) 128 | 129 | # flip 130 | if flip: 131 | # cv2.Mat cropped 132 | sized = cv2.flip(sized, 1) # 0 - x-axis, 1 - y-axis, -1 - both axes (x & y) 133 | 134 | # HSV augmentation 135 | # cv2.COLOR_BGR2HSV, cv2.COLOR_RGB2HSV, cv2.COLOR_HSV2BGR, cv2.COLOR_HSV2RGB 136 | if dsat != 1 or dexp != 1 or dhue != 0: 137 | if img.shape[2] >= 3: 138 | hsv_src = cv2.cvtColor(sized.astype(np.float32), cv2.COLOR_RGB2HSV) # RGB to HSV 139 | hsv = cv2.split(hsv_src) 140 | hsv[1] *= dsat 141 | hsv[2] *= dexp 142 | hsv[0] += 179 * dhue 143 | hsv_src = cv2.merge(hsv) 144 | sized = np.clip(cv2.cvtColor(hsv_src, cv2.COLOR_HSV2RGB), 0, 255) # HSV to RGB (the same as previous) 145 | else: 146 | sized *= dexp 147 | 148 | if blur: 149 | if blur == 1: 150 | dst = cv2.GaussianBlur(sized, (17, 17), 0) 151 | # cv2.bilateralFilter(sized, dst, 17, 75, 75) 152 | else: 153 | ksize = (blur / 2) * 2 + 1 154 | dst = cv2.GaussianBlur(sized, (ksize, ksize), 0) 155 | 156 | if blur == 1: 157 | img_rect = [0, 0, sized.cols, sized.rows] 158 | for b in truth: 159 | left = (b.x - b.w / 2.) * sized.shape[1] 160 | width = b.w * sized.shape[1] 161 | top = (b.y - b.h / 2.) 
* sized.shape[0] 162 | height = b.h * sized.shape[0] 163 | roi(left, top, width, height) 164 | roi = roi & img_rect 165 | dst[roi[0]:roi[0] + roi[2], roi[1]:roi[1] + roi[3]] = sized[roi[0]:roi[0] + roi[2], 166 | roi[1]:roi[1] + roi[3]] 167 | 168 | sized = dst 169 | 170 | if gaussian_noise: 171 | noise = np.array(sized.shape) 172 | gaussian_noise = min(gaussian_noise, 127) 173 | gaussian_noise = max(gaussian_noise, 0) 174 | cv2.randn(noise, 0, gaussian_noise) # mean and variance 175 | sized = sized + noise 176 | except: 177 | print("OpenCV can't augment image: " + str(w) + " x " + str(h)) 178 | sized = mat 179 | 180 | return sized 181 | 182 | 183 | def filter_truth(bboxes, dx, dy, sx, sy, xd, yd): 184 | bboxes[:, 0] -= dx 185 | bboxes[:, 2] -= dx 186 | bboxes[:, 1] -= dy 187 | bboxes[:, 3] -= dy 188 | 189 | bboxes[:, 0] = np.clip(bboxes[:, 0], 0, sx) 190 | bboxes[:, 2] = np.clip(bboxes[:, 2], 0, sx) 191 | 192 | bboxes[:, 1] = np.clip(bboxes[:, 1], 0, sy) 193 | bboxes[:, 3] = np.clip(bboxes[:, 3], 0, sy) 194 | 195 | out_box = list(np.where(((bboxes[:, 1] == sy) & (bboxes[:, 3] == sy)) | 196 | ((bboxes[:, 0] == sx) & (bboxes[:, 2] == sx)) | 197 | ((bboxes[:, 1] == 0) & (bboxes[:, 3] == 0)) | 198 | ((bboxes[:, 0] == 0) & (bboxes[:, 2] == 0)))[0]) 199 | list_box = list(range(bboxes.shape[0])) 200 | for i in out_box: 201 | list_box.remove(i) 202 | bboxes = bboxes[list_box] 203 | 204 | bboxes[:, 0] += xd 205 | bboxes[:, 2] += xd 206 | bboxes[:, 1] += yd 207 | bboxes[:, 3] += yd 208 | 209 | return bboxes 210 | 211 | 212 | def blend_truth_mosaic(out_img, img, bboxes, w, h, cut_x, cut_y, i_mixup, 213 | left_shift, right_shift, top_shift, bot_shift): 214 | left_shift = min(left_shift, w - cut_x) 215 | top_shift = min(top_shift, h - cut_y) 216 | right_shift = min(right_shift, cut_x) 217 | bot_shift = min(bot_shift, cut_y) 218 | 219 | if i_mixup == 0: 220 | bboxes = filter_truth(bboxes, left_shift, top_shift, cut_x, cut_y, 0, 0) 221 | out_img[:cut_y, :cut_x] = img[top_shift:top_shift + cut_y, left_shift:left_shift + cut_x] 222 | if i_mixup == 1: 223 | bboxes = filter_truth(bboxes, cut_x - right_shift, top_shift, w - cut_x, cut_y, cut_x, 0) 224 | out_img[:cut_y, cut_x:] = img[top_shift:top_shift + cut_y, cut_x - right_shift:w - right_shift] 225 | if i_mixup == 2: 226 | bboxes = filter_truth(bboxes, left_shift, cut_y - bot_shift, cut_x, h - cut_y, 0, cut_y) 227 | out_img[cut_y:, :cut_x] = img[cut_y - bot_shift:h - bot_shift, left_shift:left_shift + cut_x] 228 | if i_mixup == 3: 229 | bboxes = filter_truth(bboxes, cut_x - right_shift, cut_y - bot_shift, w - cut_x, h - cut_y, cut_x, cut_y) 230 | out_img[cut_y:, cut_x:] = img[cut_y - bot_shift:h - bot_shift, cut_x - right_shift:w - right_shift] 231 | 232 | return out_img, bboxes 233 | 234 | 235 | def draw_box(img, bboxes): 236 | for b in bboxes: 237 | img = cv2.rectangle(img, (b[0], b[1]), (b[2], b[3]), (0, 255, 0), 2) 238 | return img 239 | 240 | 241 | class Yolo_dataset(Dataset): 242 | def __init__(self, lable_path, cfg): 243 | super(Yolo_dataset, self).__init__() 244 | if cfg.mixup == 2: 245 | print("cutmix=1 - isn't supported for Detector") 246 | raise 247 | elif cfg.mixup == 2 and cfg.letter_box: 248 | print("Combination: letter_box=1 & mosaic=1 - isn't supported, use only 1 of these parameters") 249 | raise 250 | 251 | self.cfg = cfg 252 | 253 | truth = {} 254 | f = open(lable_path, 'r', encoding='utf-8') 255 | for line in f.readlines(): 256 | data = line.split(" ") 257 | truth[data[0]] = [] 258 | for i in data[1:]: 259 | truth[data[0]].append([int(j) 
for j in i.split(',')]) 260 | 261 | self.truth = truth 262 | 263 | def __len__(self): 264 | return len(self.truth.keys()) 265 | 266 | def __getitem__(self, index): 267 | img_path = list(self.truth.keys())[index] 268 | bboxes = np.array(self.truth.get(img_path), dtype=np.float) 269 | img_path = os.path.join(self.cfg.dataset_dir, img_path) 270 | use_mixup = self.cfg.mixup 271 | if random.randint(0, 1): 272 | use_mixup = 0 273 | 274 | if use_mixup == 3: 275 | min_offset = 0.2 276 | cut_x = random.randint(int(self.cfg.w * min_offset), int(self.cfg.w * (1 - min_offset))) 277 | cut_y = random.randint(int(self.cfg.h * min_offset), int(self.cfg.h * (1 - min_offset))) 278 | 279 | r1, r2, r3, r4, r_scale = 0, 0, 0, 0, 0 280 | dhue, dsat, dexp, flip, blur = 0, 0, 0, 0, 0 281 | gaussian_noise = 0 282 | 283 | out_img = np.zeros([self.cfg.h, self.cfg.w, 3]) 284 | out_bboxes = [] 285 | 286 | for i in range(use_mixup + 1): 287 | if i != 0: 288 | img_path = random.choice(list(self.truth.keys())) 289 | bboxes = np.array(self.truth.get(img_path), dtype=np.float) 290 | img_path = os.path.join(self.cfg.dataset_dir, img_path) 291 | img = cv2.imread(img_path) 292 | if img is None: 293 | continue 294 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 295 | oh, ow, oc = img.shape 296 | dh, dw, dc = np.array(np.array([oh, ow, oc]) * self.cfg.jitter, dtype=np.int) 297 | 298 | dhue = rand_uniform_strong(-self.cfg.hue, self.cfg.hue) 299 | dsat = rand_scale(self.cfg.saturation) 300 | dexp = rand_scale(self.cfg.exposure) 301 | 302 | pleft = random.randint(-dw, dw) 303 | pright = random.randint(-dw, dw) 304 | ptop = random.randint(-dh, dh) 305 | pbot = random.randint(-dh, dh) 306 | 307 | flip = random.randint(0, 1) if self.cfg.flip else 0 308 | 309 | if (self.cfg.blur): 310 | tmp_blur = random.randint(0, 2) # 0 - disable, 1 - blur background, 2 - blur the whole image 311 | if tmp_blur == 0: 312 | blur = 0 313 | elif tmp_blur == 1: 314 | blur = 1 315 | else: 316 | blur = self.cfg.blur 317 | 318 | if self.cfg.gaussian and random.randint(0, 1): 319 | gaussian_noise = self.cfg.gaussian 320 | else: 321 | gaussian_noise = 0 322 | 323 | if self.cfg.letter_box: 324 | img_ar = ow / oh 325 | net_ar = self.cfg.w / self.cfg.h 326 | result_ar = img_ar / net_ar 327 | # print(" ow = %d, oh = %d, w = %d, h = %d, img_ar = %f, net_ar = %f, result_ar = %f \n", ow, oh, w, h, img_ar, net_ar, result_ar); 328 | if result_ar > 1: # sheight - should be increased 329 | oh_tmp = ow / net_ar 330 | delta_h = (oh_tmp - oh) / 2 331 | ptop = ptop - delta_h 332 | pbot = pbot - delta_h 333 | # print(" result_ar = %f, oh_tmp = %f, delta_h = %d, ptop = %f, pbot = %f \n", result_ar, oh_tmp, delta_h, ptop, pbot); 334 | else: # swidth - should be increased 335 | ow_tmp = oh * net_ar 336 | delta_w = (ow_tmp - ow) / 2 337 | pleft = pleft - delta_w 338 | pright = pright - delta_w 339 | # printf(" result_ar = %f, ow_tmp = %f, delta_w = %d, pleft = %f, pright = %f \n", result_ar, ow_tmp, delta_w, pleft, pright); 340 | 341 | swidth = ow - pleft - pright 342 | sheight = oh - ptop - pbot 343 | 344 | truth, min_w_h = fill_truth_detection(bboxes, self.cfg.boxes, self.cfg.classes, flip, pleft, ptop, swidth, 345 | sheight, self.cfg.w, self.cfg.h) 346 | if (min_w_h / 8) < blur and blur > 1: # disable blur if one of the objects is too small 347 | blur = min_w_h / 8 348 | 349 | ai = image_data_augmentation(img, self.cfg.w, self.cfg.h, pleft, ptop, swidth, sheight, flip, 350 | dhue, dsat, dexp, gaussian_noise, blur, truth) 351 | 352 | if use_mixup == 0: 353 | out_img = ai 354 | 
out_bboxes = truth 355 | if use_mixup == 1: 356 | if i == 0: 357 | old_img = ai.copy() 358 | old_truth = truth.copy() 359 | elif i == 1: 360 | out_img = cv2.addWeighted(ai, 0.5, old_img, 0.5) 361 | out_bboxes = np.concatenate([old_truth, truth], axis=0) 362 | elif use_mixup == 3: 363 | if flip: 364 | tmp = pleft 365 | pleft = pright 366 | pright = tmp 367 | 368 | left_shift = int(min(cut_x, max(0, (-int(pleft) * self.cfg.w / swidth)))) 369 | top_shift = int(min(cut_y, max(0, (-int(ptop) * self.cfg.h / sheight)))) 370 | 371 | right_shift = int(min((self.cfg.w - cut_x), max(0, (-int(pright) * self.cfg.w / swidth)))) 372 | bot_shift = int(min(self.cfg.h - cut_y, max(0, (-int(pbot) * self.cfg.h / sheight)))) 373 | 374 | out_img, out_bbox = blend_truth_mosaic(out_img, ai, truth.copy(), self.cfg.w, self.cfg.h, cut_x, 375 | cut_y, i, left_shift, right_shift, top_shift, bot_shift) 376 | out_bboxes.append(out_bbox) 377 | # print(img_path) 378 | if use_mixup == 3: 379 | out_bboxes = np.concatenate(out_bboxes, axis=0) 380 | out_bboxes1 = np.zeros([self.cfg.boxes, 5]) 381 | try: 382 | out_bboxes1[:min(out_bboxes.shape[0], self.cfg.boxes)] = out_bboxes[:min(out_bboxes.shape[0], self.cfg.boxes)] 383 | except AttributeError: 384 | out_bboxes = np.array(out_bboxes.astype(object), dtype=np.float32) 385 | out_bboxes1[:min(out_bboxes.shape[0], self.cfg.boxes)] = out_bboxes[:min(out_bboxes.shape[0], self.cfg.boxes)] 386 | return out_img, out_bboxes1 387 | 388 | 389 | if __name__ == "__main__": 390 | from cfg import Cfg 391 | 392 | random.seed(2020) 393 | np.random.seed(2020) 394 | Cfg.dataset_dir = '/mnt/e/Dataset' 395 | dataset = Yolo_dataset(Cfg.train_label, Cfg) 396 | for i in range(100): 397 | out_img, out_bboxes = dataset.__getitem__(i) 398 | a = draw_box(out_img.copy(), out_bboxes.astype(np.int32)) 399 | plt.imshow(a.astype(np.int32)) 400 | plt.show() 401 | -------------------------------------------------------------------------------- /demo.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ''' 3 | @Time : 20/04/25 15:49 4 | @Author : huguanghao 5 | @File : demo.py 6 | @Noice : 7 | @Modificattion : 8 | @Author : 9 | @Time : 10 | @Detail : 11 | ''' 12 | 13 | # import sys 14 | # import time 15 | # from PIL import Image, ImageDraw 16 | # from models.tiny_yolo import TinyYoloNet 17 | from tool.utils import * 18 | from tool.darknet2pytorch import Darknet 19 | import argparse 20 | 21 | """hyper parameters""" 22 | use_cuda = True 23 | num_classes = 80 24 | if num_classes == 20: 25 | namesfile = 'data/voc.names' 26 | elif num_classes == 80: 27 | namesfile = 'data/coco.names' 28 | else: 29 | namesfile = 'data/x.names' 30 | 31 | 32 | def detect(cfgfile, weightfile, imgfile): 33 | m = Darknet(cfgfile) 34 | 35 | m.print_network() 36 | m.load_weights(weightfile) 37 | print('Loading weights from %s... Done!' % (weightfile)) 38 | 39 | if use_cuda: 40 | m.cuda() 41 | 42 | img = Image.open(imgfile).convert('RGB') 43 | sized = img.resize((m.width, m.height)) 44 | 45 | for i in range(2): 46 | start = time.time() 47 | boxes = do_detect(m, sized, 0.5, num_classes, 0.4, use_cuda) 48 | finish = time.time() 49 | if i == 1: 50 | print('%s: Predicted in %f seconds.' 
% (imgfile, (finish - start))) 51 | 52 | class_names = load_class_names(namesfile) 53 | plot_boxes(img, boxes, 'predictions.jpg', class_names) 54 | 55 | 56 | def detect_imges(cfgfile, weightfile, imgfile_list=['data/dog.jpg', 'data/giraffe.jpg']): 57 | m = Darknet(cfgfile) 58 | 59 | m.print_network() 60 | m.load_weights(weightfile) 61 | print('Loading weights from %s... Done!' % (weightfile)) 62 | 63 | if use_cuda: 64 | m.cuda() 65 | 66 | imges = [] 67 | imges_list = [] 68 | for imgfile in imgfile_list: 69 | img = Image.open(imgfile).convert('RGB') 70 | imges_list.append(img) 71 | sized = img.resize((m.width, m.height)) 72 | imges.append(np.expand_dims(np.array(sized), axis=0)) 73 | 74 | images = np.concatenate(imges, 0) 75 | for i in range(2): 76 | start = time.time() 77 | boxes = do_detect(m, images, 0.5, num_classes, 0.4, use_cuda) 78 | finish = time.time() 79 | if i == 1: 80 | print('%s: Predicted in %f seconds.' % (imgfile, (finish - start))) 81 | 82 | class_names = load_class_names(namesfile) 83 | for i,(img,box) in enumerate(zip(imges_list,boxes)): 84 | plot_boxes(img, box, 'predictions{}.jpg'.format(i), class_names) 85 | 86 | 87 | def detect_cv2(cfgfile, weightfile, imgfile): 88 | import cv2 89 | m = Darknet(cfgfile) 90 | 91 | m.print_network() 92 | m.load_weights(weightfile) 93 | print('Loading weights from %s... Done!' % (weightfile)) 94 | 95 | if use_cuda: 96 | m.cuda() 97 | 98 | img = cv2.imread(imgfile) 99 | sized = cv2.resize(img, (m.width, m.height)) 100 | sized = cv2.cvtColor(sized, cv2.COLOR_BGR2RGB) 101 | 102 | for i in range(2): 103 | start = time.time() 104 | boxes = do_detect(m, sized, 0.5, m.num_classes, 0.4, use_cuda) 105 | finish = time.time() 106 | if i == 1: 107 | print('%s: Predicted in %f seconds.' % (imgfile, (finish - start))) 108 | 109 | class_names = load_class_names(namesfile) 110 | plot_boxes_cv2(img, boxes, savename='predictions.jpg', class_names=class_names) 111 | 112 | 113 | def detect_cv2_camera(cfgfile, weightfile): 114 | import cv2 115 | m = Darknet(cfgfile) 116 | 117 | m.print_network() 118 | m.load_weights(weightfile) 119 | print('Loading weights from %s... Done!' % (weightfile)) 120 | 121 | if use_cuda: 122 | m.cuda() 123 | 124 | cap = cv2.VideoCapture(0) 125 | # cap = cv2.VideoCapture("./test.mp4") 126 | cap.set(3, 1280) 127 | cap.set(4, 720) 128 | print("Starting the YOLO loop...") 129 | 130 | while True: 131 | ret, img = cap.read() 132 | sized = cv2.resize(img, (m.width, m.height)) 133 | sized = cv2.cvtColor(sized, cv2.COLOR_BGR2RGB) 134 | 135 | start = time.time() 136 | boxes = do_detect(m, sized, 0.5, num_classes, 0.4, use_cuda) 137 | finish = time.time() 138 | print('Predicted in %f seconds.' % (finish - start)) 139 | 140 | class_names = load_class_names(namesfile) 141 | result_img = plot_boxes_cv2(img, boxes, savename=None, class_names=class_names) 142 | 143 | cv2.imshow('Yolo demo', result_img) 144 | cv2.waitKey(1) 145 | 146 | cap.release() 147 | 148 | 149 | def detect_skimage(cfgfile, weightfile, imgfile): 150 | from skimage import io 151 | from skimage.transform import resize 152 | m = Darknet(cfgfile) 153 | 154 | m.print_network() 155 | m.load_weights(weightfile) 156 | print('Loading weights from %s... Done!' 
% (weightfile)) 157 | 158 | if use_cuda: 159 | m.cuda() 160 | 161 | img = io.imread(imgfile) 162 | sized = resize(img, (m.width, m.height)) * 255 163 | 164 | for i in range(2): 165 | start = time.time() 166 | boxes = do_detect(m, sized, 0.5, m.num_classes, 0.4, use_cuda) 167 | finish = time.time() 168 | if i == 1: 169 | print('%s: Predicted in %f seconds.' % (imgfile, (finish - start))) 170 | 171 | class_names = load_class_names(namesfile) 172 | plot_boxes_cv2(img, boxes, savename='predictions.jpg', class_names=class_names) 173 | 174 | 175 | def get_args(): 176 | parser = argparse.ArgumentParser('Test your image or video by trained model.') 177 | parser.add_argument('-cfgfile', type=str, default='./cfg/yolov4.cfg', 178 | help='path of cfg file', dest='cfgfile') 179 | parser.add_argument('-weightfile', type=str, 180 | default='./checkpoints/Yolov4_epoch1.pth', 181 | help='path of trained model.', dest='weightfile') 182 | parser.add_argument('-imgfile', type=str, 183 | default='./data/mscoco2017/train2017/190109_180343_00154162.jpg', 184 | help='path of your image file.', dest='imgfile') 185 | args = parser.parse_args() 186 | 187 | return args 188 | 189 | 190 | if __name__ == '__main__': 191 | args = get_args() 192 | if args.imgfile: 193 | detect(args.cfgfile, args.weightfile, args.imgfile) 194 | # detect_imges(args.cfgfile, args.weightfile) 195 | # detect_cv2(args.cfgfile, args.weightfile, args.imgfile) 196 | # detect_skimage(args.cfgfile, args.weightfile, args.imgfile) 197 | else: 198 | detect_cv2_camera(args.cfgfile, args.weightfile) 199 | -------------------------------------------------------------------------------- /demo_onnx.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import onnx 3 | import os 4 | import argparse 5 | import numpy as np 6 | import cv2 7 | import onnxruntime 8 | from tool.utils import * 9 | from tool.darknet2onnx import * 10 | 11 | 12 | def main(cfg_file, weight_file, image_path, batch_size): 13 | 14 | # Transform to onnx as specified batch size 15 | fransform_to_onnx(cfg_file, weight_file, batch_size) 16 | # Transform to onnx for demo 17 | onnx_path_demo = fransform_to_onnx(cfg_file, weight_file, 1) 18 | 19 | session = onnxruntime.InferenceSession(onnx_path_demo) 20 | # session = onnx.load(onnx_path) 21 | print("The model expects input shape: ", session.get_inputs()[0].shape) 22 | 23 | image_src = cv2.imread(image_path) 24 | detect(session, image_src) 25 | 26 | 27 | 28 | def detect(session, image_src): 29 | IN_IMAGE_H = session.get_inputs()[0].shape[2] 30 | IN_IMAGE_W = session.get_inputs()[0].shape[3] 31 | 32 | # Input 33 | resized = cv2.resize(image_src, (IN_IMAGE_W, IN_IMAGE_H), interpolation=cv2.INTER_LINEAR) 34 | img_in = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB) 35 | img_in = np.transpose(img_in, (2, 0, 1)).astype(np.float32) 36 | img_in = np.expand_dims(img_in, axis=0) 37 | img_in /= 255.0 38 | print("Shape of the network input: ", img_in.shape) 39 | 40 | # Compute 41 | input_name = session.get_inputs()[0].name 42 | # output, output_exist = session.run(['decoder.output_conv', 'lane_exist.linear2'], {"input.1": image_np}) 43 | 44 | # print(img_in) 45 | 46 | outputs = session.run(None, {input_name: img_in}) 47 | 48 | print(outputs[0].shape) 49 | print(outputs[1].shape) 50 | print(outputs[2].shape) 51 | 52 | # print(outputs[2]) 53 | 54 | num_classes = 80 55 | boxes = post_processing(img_in, 0.5, num_classes, 0.4, outputs) 56 | 57 | if num_classes == 20: 58 | namesfile = 'data/voc.names' 59 | elif 
num_classes == 80: 60 | namesfile = 'data/coco.names' 61 | else: 62 | namesfile = 'data/names' 63 | 64 | class_names = load_class_names(namesfile) 65 | plot_boxes_cv2(image_src, boxes, savename='predictions_onnx.jpg', class_names=class_names) 66 | 67 | 68 | 69 | if __name__ == '__main__': 70 | print("Converting to onnx and running demo ...") 71 | if len(sys.argv) == 5: 72 | cfg_file = sys.argv[1] 73 | weight_file = sys.argv[2] 74 | image_path = sys.argv[3] 75 | batch_size = int(sys.argv[4]) 76 | main(cfg_file, weight_file, image_path, batch_size) 77 | else: 78 | print('Please run this way:\n') 79 | print(' python demo_onnx.py ') 80 | -------------------------------------------------------------------------------- /demo_tensorflow.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import numpy as np 3 | import tensorflow as tf 4 | from tensorflow.python.platform import gfile 5 | 6 | import cv2 7 | from tool.utils import post_processing, load_class_names, plot_boxes_cv2 8 | 9 | 10 | def demo_tensorflow(tfpb_file="./weight/yolov4.pb", image_path=None, print_sensor_name=False): 11 | graph_name = 'yolov4' 12 | tf.compat.v1.disable_eager_execution() 13 | with tf.compat.v1.Session() as persisted_sess: 14 | print("loading graph...") 15 | with gfile.FastGFile(tfpb_file, 'rb') as f: 16 | graph_def = tf.compat.v1.GraphDef() 17 | graph_def.ParseFromString(f.read()) 18 | 19 | persisted_sess.graph.as_default() 20 | tf.import_graph_def(graph_def, name=graph_name) 21 | 22 | # print all sensor_name 23 | if print_sensor_name: 24 | tensor_name_list = [tensor.name for tensor in tf.compat.v1.get_default_graph().as_graph_def().node] 25 | for tensor_name in tensor_name_list: 26 | print(tensor_name) 27 | 28 | inp = persisted_sess.graph.get_tensor_by_name(graph_name + '/' + 'input:0') 29 | print(inp.shape) 30 | out1 = persisted_sess.graph.get_tensor_by_name(graph_name + '/' + 'output_1:0') 31 | out2 = persisted_sess.graph.get_tensor_by_name(graph_name + '/' + 'output_2:0') 32 | out3 = persisted_sess.graph.get_tensor_by_name(graph_name + '/' + 'output_3:0') 33 | print(out1.shape, out2.shape, out3.shape) 34 | 35 | # image_src = np.random.rand(1, 3, 608, 608).astype(np.float32) # input image 36 | # Input 37 | image_src = cv2.imread(image_path) 38 | resized = cv2.resize(image_src, (inp.shape[2], inp.shape[3]), interpolation=cv2.INTER_LINEAR) 39 | img_in = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB) 40 | img_in = np.transpose(img_in, (2, 0, 1)).astype(np.float32) 41 | img_in = np.expand_dims(img_in, axis=0) 42 | img_in /= 255.0 43 | print("Shape of the network input: ", img_in.shape) 44 | 45 | feed_dict = {inp: img_in} 46 | 47 | outputs = persisted_sess.run([out1, out2, out3], feed_dict) 48 | print(outputs[0].shape) 49 | print(outputs[1].shape) 50 | print(outputs[2].shape) 51 | 52 | boxes = post_processing(img_in, 0.4, outputs) 53 | 54 | num_classes = 80 55 | if num_classes == 20: 56 | namesfile = 'data/voc.names' 57 | elif num_classes == 80: 58 | namesfile = 'data/coco.names' 59 | else: 60 | namesfile = 'data/names' 61 | 62 | class_names = load_class_names(namesfile) 63 | result = plot_boxes_cv2(image_src, boxes, savename=None, class_names=class_names) 64 | cv2.imshow("tensorflow predicted", result) 65 | cv2.waitKey() 66 | 67 | 68 | if __name__ == '__main__': 69 | if len(sys.argv) == 1: 70 | sys.argv.append('weight/yolov4.pb') 71 | sys.argv.append('data/dog.jpg') 72 | if len(sys.argv) == 3: 73 | tfpbfile = sys.argv[1] 74 | image_path = sys.argv[2] 75 | 
demo_tensorflow(tfpbfile, image_path) 76 | else: 77 | print('Please execute this script this way:\n') 78 | print(' python demo_tensorflow.py ') 79 | -------------------------------------------------------------------------------- /evaluate_on_coco.py: -------------------------------------------------------------------------------- 1 | """ 2 | A script to evaluate the model's performance using pre-trained weights using COCO API. 3 | Example usage: python evaluate_on_coco.py -dir D:\cocoDataset\val2017\val2017 -gta D:\cocoDataset\annotatio 4 | ns_trainval2017\annotations\instances_val2017.json -c cfg/yolov4-smaller-input.cfg -g 0 5 | Explanation: set where your images can be found using -dir, then use -gta to point to the ground truth annotations file 6 | and finally -c to point to the config file you want to use to load the network using. 7 | """ 8 | 9 | import argparse 10 | import datetime 11 | import json 12 | import logging 13 | import os 14 | import sys 15 | import time 16 | 17 | import numpy as np 18 | import torch 19 | from PIL import Image 20 | from easydict import EasyDict as edict 21 | from pycocotools.coco import COCO 22 | from pycocotools.cocoeval import COCOeval 23 | 24 | from cfg import Cfg 25 | from tool.darknet2pytorch import Darknet 26 | from tool.utils import do_detect 27 | 28 | 29 | def convert_cat_id(single_annotation): 30 | cat = single_annotation['category_id'] 31 | if cat >= 1 and cat <= 11: 32 | cat = cat + 1 33 | elif cat >= 13 and cat <= 25: 34 | cat = cat + 2 35 | elif cat >= 27 and cat <= 28: 36 | cat = cat + 3 37 | elif cat >= 31 and cat <= 44: 38 | cat = cat + 5 39 | elif cat >= 46 and cat <= 65: 40 | cat = cat + 6 41 | elif cat == 67: 42 | cat = cat + 7 43 | elif cat == 70: 44 | cat = cat + 9 45 | elif cat >= 72 and cat <= 82: 46 | cat = cat + 10 47 | elif cat >= 84 and cat <= 90: 48 | cat = cat + 11 49 | single_annotation['category_id'] = cat 50 | return single_annotation 51 | 52 | 53 | def myconverter(obj): 54 | if isinstance(obj, np.integer): 55 | return int(obj) 56 | elif isinstance(obj, np.floating): 57 | return float(obj) 58 | elif isinstance(obj, np.ndarray): 59 | return obj.tolist() 60 | elif isinstance(obj, datetime.datetime): 61 | return obj.__str__() 62 | else: 63 | return obj 64 | 65 | 66 | def evaluate_on_coco(cfg, resFile): 67 | annType = "bbox" # specify type here 68 | with open(resFile, 'r') as f: 69 | unsorted_annotations = json.load(f) 70 | sorted_annotations = list(sorted(unsorted_annotations, key=lambda single_annotation: single_annotation["image_id"])) 71 | sorted_annotations = list(map(convert_cat_id, sorted_annotations)) 72 | 73 | with open('temp.json', 'w') as f: 74 | json.dump(sorted_annotations, f) 75 | 76 | cocoGt = COCO(cfg.gt_annotations_path) 77 | cocoDt = cocoGt.loadRes('temp.json') 78 | imgIds = sorted(cocoGt.getImgIds()) 79 | cocoEval = COCOeval(cocoGt, cocoDt, annType) 80 | cocoEval.params.imgIds = imgIds 81 | cocoEval.evaluate() 82 | cocoEval.accumulate() 83 | cocoEval.summarize() 84 | 85 | 86 | def test(model, annotations, cfg): 87 | if not annotations["images"]: 88 | print("Annotations do not have 'images' key") 89 | return 90 | images = annotations["images"] 91 | images = images[:10] 92 | resFile = 'data/coco_val_outputs.json' 93 | 94 | if torch.cuda.is_available(): 95 | use_cuda = 1 96 | else: 97 | use_cuda = 0 98 | 99 | # do one forward pass first to circumvent cold start 100 | throwaway_image = Image.open('data/dog.jpg').convert('RGB').resize((model.width, model.height)) 101 | do_detect(model, throwaway_image, 0.5, 0.4, 
use_cuda) 102 | boxes_json = [] 103 | 104 | for i, image_annotation in enumerate(images): 105 | logging.info("currently on image: {}/{}".format(i + 1, len(images))) 106 | image_file_name = image_annotation["file_name"] 107 | image_id = image_annotation["id"] 108 | image_height = image_annotation["height"] 109 | image_width = image_annotation["width"] 110 | 111 | # open and resize each image first 112 | img = Image.open(os.path.join(cfg.dataset_dir, image_file_name)).convert('RGB') 113 | sized = img.resize((model.width, model.height)) 114 | 115 | if use_cuda: 116 | model.cuda() 117 | 118 | start = time.time() 119 | boxes = do_detect(model, sized, 0.5, 0.4, use_cuda) 120 | finish = time.time() 121 | if type(boxes) == list: 122 | for box in boxes: 123 | box_json = {} 124 | category_id = box[-1] 125 | score = box[-2] 126 | bbox_normalized = box[:4] 127 | box_json["category_id"] = int(category_id) 128 | box_json["image_id"] = int(image_id) 129 | bbox = [] 130 | for i, bbox_coord in enumerate(bbox_normalized): 131 | modified_bbox_coord = float(bbox_coord) 132 | if i % 2: 133 | modified_bbox_coord *= image_height 134 | else: 135 | modified_bbox_coord *= image_width 136 | modified_bbox_coord = round(modified_bbox_coord, 2) 137 | bbox.append(modified_bbox_coord) 138 | box_json["bbox_normalized"] = list(map(lambda x: round(float(x), 2), bbox_normalized)) 139 | box_json["bbox"] = bbox 140 | box_json["score"] = round(float(score), 2) 141 | box_json["timing"] = float(finish - start) 142 | boxes_json.append(box_json) 143 | # print("see box_json: ", box_json) 144 | with open(resFile, 'w') as outfile: 145 | json.dump(boxes_json, outfile, default=myconverter) 146 | else: 147 | print("warning: output from model after postprocessing is not a list, ignoring") 148 | return 149 | 150 | # namesfile = 'data/coco.names' 151 | # class_names = load_class_names(namesfile) 152 | # plot_boxes(img, boxes, 'data/outcome/predictions_{}.jpg'.format(image_id), class_names) 153 | 154 | with open(resFile, 'w') as outfile: 155 | json.dump(boxes_json, outfile, default=myconverter) 156 | 157 | evaluate_on_coco(cfg, resFile) 158 | 159 | 160 | def get_args(**kwargs): 161 | cfg = kwargs 162 | parser = argparse.ArgumentParser(description='Test model on test dataset', 163 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 164 | parser.add_argument('-f', '--load', dest='load', type=str, default=None, 165 | help='Load model from a .pth file') 166 | parser.add_argument('-g', '--gpu', metavar='G', type=str, default='-1', 167 | help='GPU', dest='gpu') 168 | parser.add_argument('-dir', '--data-dir', type=str, default=None, 169 | help='dataset dir', dest='dataset_dir') 170 | parser.add_argument('-gta', '--ground_truth_annotations', type=str, default='instances_val2017.json', 171 | help='ground truth annotations file', dest='gt_annotations_path') 172 | parser.add_argument('-w', '--weights_file', type=str, default='weights/yolov4.weights', 173 | help='weights file to load', dest='weights_file') 174 | parser.add_argument('-c', '--model_config', type=str, default='cfg/yolov4.cfg', 175 | help='model config file to load', dest='model_config') 176 | args = vars(parser.parse_args()) 177 | 178 | for k in args.keys(): 179 | cfg[k] = args.get(k) 180 | return edict(cfg) 181 | 182 | 183 | def init_logger(log_file=None, log_dir=None, log_level=logging.INFO, mode='w', stdout=True): 184 | """ 185 | log_dir: 日志文件的文件夹路径 186 | mode: 'a', append; 'w', 覆盖原文件写入. 
187 | """ 188 | import datetime 189 | def get_date_str(): 190 | now = datetime.datetime.now() 191 | return now.strftime('%Y-%m-%d_%H-%M-%S') 192 | 193 | fmt = '%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s: %(message)s' 194 | if log_dir is None: 195 | log_dir = '~/temp/log/' 196 | if log_file is None: 197 | log_file = 'log_' + get_date_str() + '.txt' 198 | if not os.path.exists(log_dir): 199 | os.makedirs(log_dir) 200 | log_file = os.path.join(log_dir, log_file) 201 | # 此处不能使用logging输出 202 | print('log file path:' + log_file) 203 | 204 | logging.basicConfig(level=logging.DEBUG, 205 | format=fmt, 206 | filename=log_file, 207 | filemode=mode) 208 | 209 | if stdout: 210 | console = logging.StreamHandler(stream=sys.stdout) 211 | console.setLevel(log_level) 212 | formatter = logging.Formatter(fmt) 213 | console.setFormatter(formatter) 214 | logging.getLogger('').addHandler(console) 215 | 216 | return logging 217 | 218 | 219 | if __name__ == "__main__": 220 | logging = init_logger(log_dir='log') 221 | cfg = get_args(**Cfg) 222 | os.environ["CUDA_VISIBLE_DEVICES"] = cfg.gpu 223 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 224 | logging.info(f'Using device {device}') 225 | 226 | model = Darknet(cfg.model_config) 227 | 228 | model.print_network() 229 | model.load_weights(cfg.weights_file) 230 | model.eval() # set model away from training 231 | 232 | if torch.cuda.device_count() > 1: 233 | model = torch.nn.DataParallel(model) 234 | 235 | model.to(device=device) 236 | 237 | annotations_file_path = cfg.gt_annotations_path 238 | with open(annotations_file_path) as annotations_file: 239 | try: 240 | annotations = json.load(annotations_file) 241 | except: 242 | print("annotations file not a json") 243 | exit() 244 | test(model=model, 245 | annotations=annotations, 246 | cfg=cfg, ) 247 | -------------------------------------------------------------------------------- /models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class Mish(torch.nn.Module): 7 | def __init__(self): 8 | super().__init__() 9 | 10 | def forward(self, x): 11 | x = x * (torch.tanh(torch.nn.functional.softplus(x))) 12 | return x 13 | 14 | 15 | class Upsample(nn.Module): 16 | def __init__(self): 17 | super(Upsample, self).__init__() 18 | 19 | def forward(self, x, target_size): 20 | assert (x.data.dim() == 4) 21 | _, _, H, W = target_size 22 | return F.interpolate(x, size=(H, W), mode='nearest') 23 | 24 | 25 | class Conv_Bn_Activation(nn.Module): 26 | def __init__(self, in_channels, out_channels, kernel_size, stride, activation, bn=True, bias=False): 27 | super().__init__() 28 | pad = (kernel_size - 1) // 2 29 | 30 | self.conv = nn.ModuleList() 31 | if bias: 32 | self.conv.append(nn.Conv2d(in_channels, out_channels, kernel_size, stride, pad)) 33 | else: 34 | self.conv.append(nn.Conv2d(in_channels, out_channels, kernel_size, stride, pad, bias=False)) 35 | if bn: 36 | self.conv.append(nn.BatchNorm2d(out_channels)) 37 | if activation == "mish": 38 | self.conv.append(Mish()) 39 | elif activation == "relu": 40 | self.conv.append(nn.ReLU(inplace=True)) 41 | elif activation == "leaky": 42 | self.conv.append(nn.LeakyReLU(0.1, inplace=True)) 43 | elif activation == "linear": 44 | pass 45 | else: 46 | print("activate error !!! 
{} {} {}".format(sys._getframe().f_code.co_filename, 47 | sys._getframe().f_code.co_name, sys._getframe().f_lineno)) 48 | 49 | def forward(self, x): 50 | for l in self.conv: 51 | x = l(x) 52 | return x 53 | 54 | 55 | class ResBlock(nn.Module): 56 | """ 57 | Sequential residual blocks each of which consists of \ 58 | two convolution layers. 59 | Args: 60 | ch (int): number of input and output channels. 61 | nblocks (int): number of residual blocks. 62 | shortcut (bool): if True, residual tensor addition is enabled. 63 | """ 64 | 65 | def __init__(self, ch, nblocks=1, shortcut=True): 66 | super().__init__() 67 | self.shortcut = shortcut 68 | self.module_list = nn.ModuleList() 69 | for i in range(nblocks): 70 | resblock_one = nn.ModuleList() 71 | resblock_one.append(Conv_Bn_Activation(ch, ch, 1, 1, 'mish')) 72 | resblock_one.append(Conv_Bn_Activation(ch, ch, 3, 1, 'mish')) 73 | self.module_list.append(resblock_one) 74 | 75 | def forward(self, x): 76 | for module in self.module_list: 77 | h = x 78 | for res in module: 79 | h = res(h) 80 | x = x + h if self.shortcut else h 81 | return x 82 | 83 | 84 | class DownSample1(nn.Module): 85 | def __init__(self): 86 | super().__init__() 87 | self.conv1 = Conv_Bn_Activation(3, 32, 3, 1, 'mish') 88 | 89 | self.conv2 = Conv_Bn_Activation(32, 64, 3, 2, 'mish') 90 | self.conv3 = Conv_Bn_Activation(64, 64, 1, 1, 'mish') 91 | # [route] 92 | # layers = -2 93 | self.conv4 = Conv_Bn_Activation(64, 64, 1, 1, 'mish') 94 | 95 | self.conv5 = Conv_Bn_Activation(64, 32, 1, 1, 'mish') 96 | self.conv6 = Conv_Bn_Activation(32, 64, 3, 1, 'mish') 97 | # [shortcut] 98 | # from=-3 99 | # activation = linear 100 | 101 | self.conv7 = Conv_Bn_Activation(64, 64, 1, 1, 'mish') 102 | # [route] 103 | # layers = -1, -7 104 | self.conv8 = Conv_Bn_Activation(128, 64, 1, 1, 'mish') 105 | 106 | def forward(self, input): 107 | x1 = self.conv1(input) 108 | x2 = self.conv2(x1) 109 | x3 = self.conv3(x2) 110 | # route -2 111 | x4 = self.conv4(x2) 112 | x5 = self.conv5(x4) 113 | x6 = self.conv6(x5) 114 | # shortcut -3 115 | x6 = x6 + x4 116 | 117 | x7 = self.conv7(x6) 118 | # [route] 119 | # layers = -1, -7 120 | x7 = torch.cat([x7, x3], dim=1) 121 | x8 = self.conv8(x7) 122 | return x8 123 | 124 | 125 | class DownSample2(nn.Module): 126 | def __init__(self): 127 | super().__init__() 128 | self.conv1 = Conv_Bn_Activation(64, 128, 3, 2, 'mish') 129 | self.conv2 = Conv_Bn_Activation(128, 64, 1, 1, 'mish') 130 | # r -2 131 | self.conv3 = Conv_Bn_Activation(128, 64, 1, 1, 'mish') 132 | 133 | self.resblock = ResBlock(ch=64, nblocks=2) 134 | 135 | # s -3 136 | self.conv4 = Conv_Bn_Activation(64, 64, 1, 1, 'mish') 137 | # r -1 -10 138 | self.conv5 = Conv_Bn_Activation(128, 128, 1, 1, 'mish') 139 | 140 | def forward(self, input): 141 | x1 = self.conv1(input) 142 | x2 = self.conv2(x1) 143 | x3 = self.conv3(x1) 144 | 145 | r = self.resblock(x3) 146 | x4 = self.conv4(r) 147 | 148 | x4 = torch.cat([x4, x2], dim=1) 149 | x5 = self.conv5(x4) 150 | return x5 151 | 152 | 153 | class DownSample3(nn.Module): 154 | def __init__(self): 155 | super().__init__() 156 | self.conv1 = Conv_Bn_Activation(128, 256, 3, 2, 'mish') 157 | self.conv2 = Conv_Bn_Activation(256, 128, 1, 1, 'mish') 158 | self.conv3 = Conv_Bn_Activation(256, 128, 1, 1, 'mish') 159 | 160 | self.resblock = ResBlock(ch=128, nblocks=8) 161 | self.conv4 = Conv_Bn_Activation(128, 128, 1, 1, 'mish') 162 | self.conv5 = Conv_Bn_Activation(256, 256, 1, 1, 'mish') 163 | 164 | def forward(self, input): 165 | x1 = self.conv1(input) 166 | x2 = self.conv2(x1) 
167 | x3 = self.conv3(x1) 168 | 169 | r = self.resblock(x3) 170 | x4 = self.conv4(r) 171 | 172 | x4 = torch.cat([x4, x2], dim=1) 173 | x5 = self.conv5(x4) 174 | return x5 175 | 176 | 177 | class DownSample4(nn.Module): 178 | def __init__(self): 179 | super().__init__() 180 | self.conv1 = Conv_Bn_Activation(256, 512, 3, 2, 'mish') 181 | self.conv2 = Conv_Bn_Activation(512, 256, 1, 1, 'mish') 182 | self.conv3 = Conv_Bn_Activation(512, 256, 1, 1, 'mish') 183 | 184 | self.resblock = ResBlock(ch=256, nblocks=8) 185 | self.conv4 = Conv_Bn_Activation(256, 256, 1, 1, 'mish') 186 | self.conv5 = Conv_Bn_Activation(512, 512, 1, 1, 'mish') 187 | 188 | def forward(self, input): 189 | x1 = self.conv1(input) 190 | x2 = self.conv2(x1) 191 | x3 = self.conv3(x1) 192 | 193 | r = self.resblock(x3) 194 | x4 = self.conv4(r) 195 | 196 | x4 = torch.cat([x4, x2], dim=1) 197 | x5 = self.conv5(x4) 198 | return x5 199 | 200 | 201 | class DownSample5(nn.Module): 202 | def __init__(self): 203 | super().__init__() 204 | self.conv1 = Conv_Bn_Activation(512, 1024, 3, 2, 'mish') 205 | self.conv2 = Conv_Bn_Activation(1024, 512, 1, 1, 'mish') 206 | self.conv3 = Conv_Bn_Activation(1024, 512, 1, 1, 'mish') 207 | 208 | self.resblock = ResBlock(ch=512, nblocks=4) 209 | self.conv4 = Conv_Bn_Activation(512, 512, 1, 1, 'mish') 210 | self.conv5 = Conv_Bn_Activation(1024, 1024, 1, 1, 'mish') 211 | 212 | def forward(self, input): 213 | x1 = self.conv1(input) 214 | x2 = self.conv2(x1) 215 | x3 = self.conv3(x1) 216 | 217 | r = self.resblock(x3) 218 | x4 = self.conv4(r) 219 | 220 | x4 = torch.cat([x4, x2], dim=1) 221 | x5 = self.conv5(x4) 222 | return x5 223 | 224 | 225 | class Neck(nn.Module): 226 | def __init__(self): 227 | super().__init__() 228 | self.conv1 = Conv_Bn_Activation(1024, 512, 1, 1, 'leaky') 229 | self.conv2 = Conv_Bn_Activation(512, 1024, 3, 1, 'leaky') 230 | self.conv3 = Conv_Bn_Activation(1024, 512, 1, 1, 'leaky') 231 | # SPP 232 | self.maxpool1 = nn.MaxPool2d(kernel_size=5, stride=1, padding=5 // 2) 233 | self.maxpool2 = nn.MaxPool2d(kernel_size=9, stride=1, padding=9 // 2) 234 | self.maxpool3 = nn.MaxPool2d(kernel_size=13, stride=1, padding=13 // 2) 235 | 236 | # R -1 -3 -5 -6 237 | # SPP 238 | self.conv4 = Conv_Bn_Activation(2048, 512, 1, 1, 'leaky') 239 | self.conv5 = Conv_Bn_Activation(512, 1024, 3, 1, 'leaky') 240 | self.conv6 = Conv_Bn_Activation(1024, 512, 1, 1, 'leaky') 241 | self.conv7 = Conv_Bn_Activation(512, 256, 1, 1, 'leaky') 242 | # UP 243 | self.upsample1 = Upsample() 244 | # R 85 245 | self.conv8 = Conv_Bn_Activation(512, 256, 1, 1, 'leaky') 246 | # R -1 -3 247 | self.conv9 = Conv_Bn_Activation(512, 256, 1, 1, 'leaky') 248 | self.conv10 = Conv_Bn_Activation(256, 512, 3, 1, 'leaky') 249 | self.conv11 = Conv_Bn_Activation(512, 256, 1, 1, 'leaky') 250 | self.conv12 = Conv_Bn_Activation(256, 512, 3, 1, 'leaky') 251 | self.conv13 = Conv_Bn_Activation(512, 256, 1, 1, 'leaky') 252 | self.conv14 = Conv_Bn_Activation(256, 128, 1, 1, 'leaky') 253 | # UP 254 | self.upsample2 = Upsample() 255 | # R 54 256 | self.conv15 = Conv_Bn_Activation(256, 128, 1, 1, 'leaky') 257 | # R -1 -3 258 | self.conv16 = Conv_Bn_Activation(256, 128, 1, 1, 'leaky') 259 | self.conv17 = Conv_Bn_Activation(128, 256, 3, 1, 'leaky') 260 | self.conv18 = Conv_Bn_Activation(256, 128, 1, 1, 'leaky') 261 | self.conv19 = Conv_Bn_Activation(128, 256, 3, 1, 'leaky') 262 | self.conv20 = Conv_Bn_Activation(256, 128, 1, 1, 'leaky') 263 | 264 | def forward(self, input, downsample4, downsample3): 265 | x1 = self.conv1(input) 266 | x2 = self.conv2(x1) 
267 | x3 = self.conv3(x2) 268 | # SPP 269 | m1 = self.maxpool1(x3) 270 | m2 = self.maxpool2(x3) 271 | m3 = self.maxpool3(x3) 272 | spp = torch.cat([m3, m2, m1, x3], dim=1) 273 | # SPP end 274 | x4 = self.conv4(spp) 275 | x5 = self.conv5(x4) 276 | x6 = self.conv6(x5) 277 | x7 = self.conv7(x6) 278 | # UP 279 | up = self.upsample1(x7, downsample4.size()) 280 | # R 85 281 | x8 = self.conv8(downsample4) 282 | # R -1 -3 283 | x8 = torch.cat([x8, up], dim=1) 284 | 285 | x9 = self.conv9(x8) 286 | x10 = self.conv10(x9) 287 | x11 = self.conv11(x10) 288 | x12 = self.conv12(x11) 289 | x13 = self.conv13(x12) 290 | x14 = self.conv14(x13) 291 | 292 | # UP 293 | up = self.upsample2(x14, downsample3.size()) 294 | # R 54 295 | x15 = self.conv15(downsample3) 296 | # R -1 -3 297 | x15 = torch.cat([x15, up], dim=1) 298 | 299 | x16 = self.conv16(x15) 300 | x17 = self.conv17(x16) 301 | x18 = self.conv18(x17) 302 | x19 = self.conv19(x18) 303 | x20 = self.conv20(x19) 304 | return x20, x13, x6 305 | 306 | 307 | class Yolov4Head(nn.Module): 308 | def __init__(self, output_ch): 309 | super().__init__() 310 | self.conv1 = Conv_Bn_Activation(128, 256, 3, 1, 'leaky') 311 | self.conv2 = Conv_Bn_Activation(256, output_ch, 1, 1, 'linear', bn=False, bias=True) 312 | # self.yolo1 = YoloLayer(anchor_mask=[0, 1, 2], num_classes=80, 313 | # anchors=[12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401], 314 | # num_anchors=9, stride=8) 315 | 316 | # R -4 317 | self.conv3 = Conv_Bn_Activation(128, 256, 3, 2, 'leaky') 318 | 319 | # R -1 -16 320 | self.conv4 = Conv_Bn_Activation(512, 256, 1, 1, 'leaky') 321 | self.conv5 = Conv_Bn_Activation(256, 512, 3, 1, 'leaky') 322 | self.conv6 = Conv_Bn_Activation(512, 256, 1, 1, 'leaky') 323 | self.conv7 = Conv_Bn_Activation(256, 512, 3, 1, 'leaky') 324 | self.conv8 = Conv_Bn_Activation(512, 256, 1, 1, 'leaky') 325 | self.conv9 = Conv_Bn_Activation(256, 512, 3, 1, 'leaky') 326 | self.conv10 = Conv_Bn_Activation(512, output_ch, 1, 1, 'linear', bn=False, bias=True) 327 | # self.yolo2 = YoloLayer(anchor_mask=[3, 4, 5], num_classes=80, 328 | # anchors=[12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401], 329 | # num_anchors=9, stride=16) 330 | 331 | # R -4 332 | self.conv11 = Conv_Bn_Activation(256, 512, 3, 2, 'leaky') 333 | 334 | # R -1 -37 335 | self.conv12 = Conv_Bn_Activation(1024, 512, 1, 1, 'leaky') 336 | self.conv13 = Conv_Bn_Activation(512, 1024, 3, 1, 'leaky') 337 | self.conv14 = Conv_Bn_Activation(1024, 512, 1, 1, 'leaky') 338 | self.conv15 = Conv_Bn_Activation(512, 1024, 3, 1, 'leaky') 339 | self.conv16 = Conv_Bn_Activation(1024, 512, 1, 1, 'leaky') 340 | self.conv17 = Conv_Bn_Activation(512, 1024, 3, 1, 'leaky') 341 | self.conv18 = Conv_Bn_Activation(1024, output_ch, 1, 1, 'linear', bn=False, bias=True) 342 | # self.yolo3 = YoloLayer(anchor_mask=[6, 7, 8], num_classes=80, 343 | # anchors=[12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401], 344 | # num_anchors=9, stride=32) 345 | 346 | def forward(self, input1, input2, input3): 347 | x1 = self.conv1(input1) 348 | x2 = self.conv2(x1) 349 | # y1 = self.yolo1(x2) 350 | 351 | x3 = self.conv3(input1) 352 | # R -1 -16 353 | x3 = torch.cat([x3, input2], dim=1) 354 | x4 = self.conv4(x3) 355 | x5 = self.conv5(x4) 356 | x6 = self.conv6(x5) 357 | x7 = self.conv7(x6) 358 | x8 = self.conv8(x7) 359 | x9 = self.conv9(x8) 360 | x10 = self.conv10(x9) 361 | # y2 = self.yolo2(x10) 362 | 363 | # R -4 364 | x11 = self.conv11(x8) 365 | # R -1 -37 366 | x11 = torch.cat([x11, 
input3], dim=1) 367 | 368 | x12 = self.conv12(x11) 369 | x13 = self.conv13(x12) 370 | x14 = self.conv14(x13) 371 | x15 = self.conv15(x14) 372 | x16 = self.conv16(x15) 373 | x17 = self.conv17(x16) 374 | x18 = self.conv18(x17) 375 | return [x2, x10, x18] 376 | # y3 = self.yolo3(x18) 377 | # return [y1, y2, y3] 378 | # return y3 379 | 380 | 381 | class Yolov4(nn.Module): 382 | def __init__(self, yolov4conv137weight=None, n_classes=80): 383 | super().__init__() 384 | 385 | output_ch = (4 + 1 + n_classes) * 3 386 | 387 | # backbone 388 | self.down1 = DownSample1() 389 | self.down2 = DownSample2() 390 | self.down3 = DownSample3() 391 | self.down4 = DownSample4() 392 | self.down5 = DownSample5() 393 | # neck 394 | self.neck = Neck() 395 | # yolov4conv137 396 | if yolov4conv137weight: 397 | _model = nn.Sequential(self.down1, self.down2, self.down3, self.down4, self.down5, self.neck) 398 | pretrained_dict = torch.load(yolov4conv137weight) 399 | 400 | model_dict = _model.state_dict() 401 | # 1. filter out unnecessary keys 402 | pretrained_dict = {k1: v for (k, v), k1 in zip(pretrained_dict.items(), model_dict)} 403 | # 2. overwrite entries in the existing state dict 404 | model_dict.update(pretrained_dict) 405 | _model.load_state_dict(model_dict) 406 | # head 407 | self.head = Yolov4Head(output_ch) 408 | 409 | def forward(self, input): 410 | d1 = self.down1(input) 411 | d2 = self.down2(d1) 412 | d3 = self.down3(d2) 413 | d4 = self.down4(d3) 414 | d5 = self.down5(d4) 415 | 416 | x20, x13, x6 = self.neck(d5, d4, d3) 417 | 418 | output = self.head(x20, x13, x6) 419 | return output 420 | 421 | 422 | if __name__ == "__main__": 423 | import sys 424 | from PIL import Image 425 | 426 | namesfile = None 427 | if len(sys.argv) == 4: 428 | n_classes = int(sys.argv[1]) 429 | weightfile = sys.argv[2] 430 | imgfile = sys.argv[3] 431 | elif len(sys.argv) == 5: 432 | n_classes = int(sys.argv[1]) 433 | weightfile = sys.argv[2] 434 | imgfile = sys.argv[3] 435 | namesfile = sys.argv[4] 436 | else: 437 | print('Usage: ') 438 | print(' python models.py num_classes weightfile imgfile namefile') 439 | 440 | model = Yolov4(n_classes=n_classes) 441 | 442 | pretrained_dict = torch.load(weightfile, map_location=torch.device('cuda')) 443 | model.load_state_dict(pretrained_dict) 444 | 445 | if namesfile == None: 446 | if n_classes == 20: 447 | namesfile = 'data/voc.names' 448 | elif n_classes == 80: 449 | namesfile = 'data/coco.names' 450 | else: 451 | print("please give namefile") 452 | 453 | use_cuda = 1 454 | if use_cuda: 455 | model.cuda() 456 | 457 | img = Image.open(imgfile).convert('RGB') 458 | sized = img.resize((608, 608)) 459 | from tool.utils import * 460 | 461 | boxes = do_detect(model, sized, 0.5, n_classes,0.4, use_cuda) 462 | 463 | class_names = load_class_names(namesfile) 464 | plot_boxes(img, boxes, 'predictions.jpg', class_names) 465 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.22.0 2 | torch==1.4.0 3 | scikit_image==0.16.2 4 | matplotlib==2.2.3 5 | tqdm==4.43.0 6 | easydict==1.9 7 | Pillow==9.0.1 8 | tensorboardX 9 | 10 | -------------------------------------------------------------------------------- /tool/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/roboflow/pytorch-YOLOv4/b6189b304b9a60a15bc0a9c3627ec4973fe0e2b5/tool/__init__.py 
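A quick way to sanity-check the Yolov4 module defined above is a dummy forward pass. The short sketch below is illustrative only (it assumes PyTorch is installed and that it is run from the repository root so models.py is importable); it verifies the three raw head outputs, whose channel count is (4 + 1 + n_classes) * 3 = 255 for the 80 COCO classes and whose grids correspond to strides 8, 16 and 32 at the 608x608 resolution used by models.py.

# Illustrative smoke test for models.Yolov4 (not part of the repository).
import torch
from models import Yolov4

model = Yolov4(n_classes=80).eval()      # output_ch = (4 + 1 + 80) * 3 = 255 per head
x = torch.randn(1, 3, 608, 608)          # same input size models.py uses for inference
with torch.no_grad():
    y1, y2, y3 = model(x)                # raw head outputs [x2, x10, x18]
print(y1.shape)                          # torch.Size([1, 255, 76, 76])  (stride 8)
print(y2.shape)                          # torch.Size([1, 255, 38, 38])  (stride 16)
print(y3.shape)                          # torch.Size([1, 255, 19, 19])  (stride 32)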
-------------------------------------------------------------------------------- /tool/camera.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ''' 3 | @Time : 2020/04/26 15:48 4 | @Author : Tianxiaomo 5 | @File : camera.py 6 | @Noice : 7 | @Modificattion : 8 | @Author : 9 | @Time : 10 | @Detail : 11 | 12 | ''' 13 | from __future__ import division 14 | import cv2 15 | from tool.darknet2pytorch import Darknet 16 | import argparse 17 | from tool.utils import * 18 | 19 | 20 | def arg_parse(): 21 | """ 22 | Parse arguements to the detect module 23 | 24 | """ 25 | 26 | parser = argparse.ArgumentParser(description='YOLO v3 Cam Demo') 27 | parser.add_argument("--confidence", dest="confidence", help="Object Confidence to filter predictions", default=0.25) 28 | parser.add_argument("--nms_thresh", dest="nms_thresh", help="NMS Threshhold", default=0.4) 29 | parser.add_argument("--reso", dest='reso', help= 30 | "Input resolution of the network. Increase to increase accuracy. Decrease to increase speed", 31 | default="160", type=str) 32 | return parser.parse_args() 33 | 34 | 35 | if __name__ == '__main__': 36 | cfgfile = "cfg/yolov4.cfg" 37 | weightsfile = "weight/yolov4.weights" 38 | 39 | args = arg_parse() 40 | confidence = float(args.confidence) 41 | nms_thesh = float(args.nms_thresh) 42 | CUDA = torch.cuda.is_available() 43 | num_classes = 80 44 | bbox_attrs = 5 + num_classes 45 | class_names = load_class_names("data/coco.names") 46 | 47 | model = Darknet(cfgfile) 48 | model.load_weights(weightsfile) 49 | 50 | if CUDA: 51 | model.cuda() 52 | 53 | model.eval() 54 | cap = cv2.VideoCapture(0) 55 | 56 | assert cap.isOpened(), 'Cannot capture source' 57 | 58 | frames = 0 59 | start = time.time() 60 | while cap.isOpened(): 61 | ret, frame = cap.read() 62 | if ret: 63 | sized = cv2.resize(frame, (model.width, model.height)) 64 | sized = cv2.cvtColor(sized, cv2.COLOR_BGR2RGB) 65 | boxes = do_detect(model, sized, 0.5, 0.4, CUDA) 66 | 67 | orig_im = plot_boxes_cv2(frame, boxes, class_names=class_names) 68 | 69 | cv2.imshow("frame", orig_im) 70 | key = cv2.waitKey(1) 71 | if key & 0xFF == ord('q'): 72 | break 73 | frames += 1 74 | print("FPS of the video is {:5.2f}".format(frames / (time.time() - start))) 75 | else: 76 | break 77 | -------------------------------------------------------------------------------- /tool/coco_annotation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ''' 3 | @Time : 2020/05/08 11:45 4 | @Author : Tianxiaomo 5 | @File : coco_annotatin.py 6 | @Noice : 7 | @Modificattion : 8 | @Author : 9 | @Time : 10 | @Detail : 11 | 12 | ''' 13 | import json 14 | from collections import defaultdict 15 | from tqdm import tqdm 16 | import os 17 | 18 | """hyper parameters""" 19 | json_file_path = 'E:/Dataset/coco2017/annotations_trainval2017/annotations/instances_val2017.json' 20 | images_dir_path = 'mscoco2017/train2017/' 21 | output_path = '../data/val.txt' 22 | 23 | """load json file""" 24 | name_box_id = defaultdict(list) 25 | id_name = dict() 26 | with open(json_file_path, encoding='utf-8') as f: 27 | data = json.load(f) 28 | 29 | """generate labels""" 30 | images = data['images'] 31 | annotations = data['annotations'] 32 | for ant in tqdm(annotations): 33 | id = ant['image_id'] 34 | name = os.path.join(images_dir_path, images[id]['file_name']) 35 | cat = ant['category_id'] 36 | 37 | if cat >= 1 and cat <= 11: 38 | cat = cat - 1 39 | elif cat >= 13 and cat <= 
25: 40 | cat = cat - 2 41 | elif cat >= 27 and cat <= 28: 42 | cat = cat - 3 43 | elif cat >= 31 and cat <= 44: 44 | cat = cat - 5 45 | elif cat >= 46 and cat <= 65: 46 | cat = cat - 6 47 | elif cat == 67: 48 | cat = cat - 7 49 | elif cat == 70: 50 | cat = cat - 9 51 | elif cat >= 72 and cat <= 82: 52 | cat = cat - 10 53 | elif cat >= 84 and cat <= 90: 54 | cat = cat - 11 55 | 56 | name_box_id[name].append([ant['bbox'], cat]) 57 | 58 | """write to txt""" 59 | with open(output_path, 'w') as f: 60 | for key in tqdm(name_box_id.keys()): 61 | f.write(key) 62 | box_infos = name_box_id[key] 63 | for info in box_infos: 64 | x_min = int(info[0][0]) 65 | y_min = int(info[0][1]) 66 | x_max = x_min + int(info[0][2]) 67 | y_max = y_min + int(info[0][3]) 68 | 69 | box_info = " %d,%d,%d,%d,%d" % ( 70 | x_min, y_min, x_max, y_max, int(info[1])) 71 | f.write(box_info) 72 | f.write('\n') 73 | -------------------------------------------------------------------------------- /tool/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from tool.utils import convert2cpu 3 | 4 | 5 | def parse_cfg(cfgfile): 6 | blocks = [] 7 | fp = open(cfgfile, 'r') 8 | block = None 9 | line = fp.readline() 10 | while line != '': 11 | line = line.rstrip() 12 | if line == '' or line[0] == '#': 13 | line = fp.readline() 14 | continue 15 | elif line[0] == '[': 16 | if block: 17 | blocks.append(block) 18 | block = dict() 19 | block['type'] = line.lstrip('[').rstrip(']') 20 | # set default value 21 | if block['type'] == 'convolutional': 22 | block['batch_normalize'] = 0 23 | else: 24 | key, value = line.split('=') 25 | key = key.strip() 26 | if key == 'type': 27 | key = '_type' 28 | value = value.strip() 29 | block[key] = value 30 | line = fp.readline() 31 | 32 | if block: 33 | blocks.append(block) 34 | fp.close() 35 | return blocks 36 | 37 | 38 | def print_cfg(blocks): 39 | print('layer filters size input output'); 40 | prev_width = 416 41 | prev_height = 416 42 | prev_filters = 3 43 | out_filters = [] 44 | out_widths = [] 45 | out_heights = [] 46 | ind = -2 47 | for block in blocks: 48 | ind = ind + 1 49 | if block['type'] == 'net': 50 | prev_width = int(block['width']) 51 | prev_height = int(block['height']) 52 | continue 53 | elif block['type'] == 'convolutional': 54 | filters = int(block['filters']) 55 | kernel_size = int(block['size']) 56 | stride = int(block['stride']) 57 | is_pad = int(block['pad']) 58 | pad = (kernel_size - 1) // 2 if is_pad else 0 59 | width = (prev_width + 2 * pad - kernel_size) // stride + 1 60 | height = (prev_height + 2 * pad - kernel_size) // stride + 1 61 | print('%5d %-6s %4d %d x %d / %d %3d x %3d x%4d -> %3d x %3d x%4d' % ( 62 | ind, 'conv', filters, kernel_size, kernel_size, stride, prev_width, prev_height, prev_filters, width, 63 | height, filters)) 64 | prev_width = width 65 | prev_height = height 66 | prev_filters = filters 67 | out_widths.append(prev_width) 68 | out_heights.append(prev_height) 69 | out_filters.append(prev_filters) 70 | elif block['type'] == 'maxpool': 71 | pool_size = int(block['size']) 72 | stride = int(block['stride']) 73 | width = prev_width // stride 74 | height = prev_height // stride 75 | print('%5d %-6s %d x %d / %d %3d x %3d x%4d -> %3d x %3d x%4d' % ( 76 | ind, 'max', pool_size, pool_size, stride, prev_width, prev_height, prev_filters, width, height, 77 | filters)) 78 | prev_width = width 79 | prev_height = height 80 | prev_filters = filters 81 | out_widths.append(prev_width) 82 | 
out_heights.append(prev_height) 83 | out_filters.append(prev_filters) 84 | elif block['type'] == 'avgpool': 85 | width = 1 86 | height = 1 87 | print('%5d %-6s %3d x %3d x%4d -> %3d' % ( 88 | ind, 'avg', prev_width, prev_height, prev_filters, prev_filters)) 89 | prev_width = width 90 | prev_height = height 91 | prev_filters = filters 92 | out_widths.append(prev_width) 93 | out_heights.append(prev_height) 94 | out_filters.append(prev_filters) 95 | elif block['type'] == 'softmax': 96 | print('%5d %-6s -> %3d' % (ind, 'softmax', prev_filters)) 97 | out_widths.append(prev_width) 98 | out_heights.append(prev_height) 99 | out_filters.append(prev_filters) 100 | elif block['type'] == 'cost': 101 | print('%5d %-6s -> %3d' % (ind, 'cost', prev_filters)) 102 | out_widths.append(prev_width) 103 | out_heights.append(prev_height) 104 | out_filters.append(prev_filters) 105 | elif block['type'] == 'reorg': 106 | stride = int(block['stride']) 107 | filters = stride * stride * prev_filters 108 | width = prev_width // stride 109 | height = prev_height // stride 110 | print('%5d %-6s / %d %3d x %3d x%4d -> %3d x %3d x%4d' % ( 111 | ind, 'reorg', stride, prev_width, prev_height, prev_filters, width, height, filters)) 112 | prev_width = width 113 | prev_height = height 114 | prev_filters = filters 115 | out_widths.append(prev_width) 116 | out_heights.append(prev_height) 117 | out_filters.append(prev_filters) 118 | elif block['type'] == 'upsample': 119 | stride = int(block['stride']) 120 | filters = prev_filters 121 | width = prev_width * stride 122 | height = prev_height * stride 123 | print('%5d %-6s * %d %3d x %3d x%4d -> %3d x %3d x%4d' % ( 124 | ind, 'upsample', stride, prev_width, prev_height, prev_filters, width, height, filters)) 125 | prev_width = width 126 | prev_height = height 127 | prev_filters = filters 128 | out_widths.append(prev_width) 129 | out_heights.append(prev_height) 130 | out_filters.append(prev_filters) 131 | elif block['type'] == 'route': 132 | layers = block['layers'].split(',') 133 | layers = [int(i) if int(i) > 0 else int(i) + ind for i in layers] 134 | if len(layers) == 1: 135 | print('%5d %-6s %d' % (ind, 'route', layers[0])) 136 | prev_width = out_widths[layers[0]] 137 | prev_height = out_heights[layers[0]] 138 | prev_filters = out_filters[layers[0]] 139 | elif len(layers) == 2: 140 | print('%5d %-6s %d %d' % (ind, 'route', layers[0], layers[1])) 141 | prev_width = out_widths[layers[0]] 142 | prev_height = out_heights[layers[0]] 143 | assert (prev_width == out_widths[layers[1]]) 144 | assert (prev_height == out_heights[layers[1]]) 145 | prev_filters = out_filters[layers[0]] + out_filters[layers[1]] 146 | elif len(layers) == 4: 147 | print('%5d %-6s %d %d %d %d' % (ind, 'route', layers[0], layers[1], layers[2], layers[3])) 148 | prev_width = out_widths[layers[0]] 149 | prev_height = out_heights[layers[0]] 150 | assert (prev_width == out_widths[layers[1]] == out_widths[layers[2]] == out_widths[layers[3]]) 151 | assert (prev_height == out_heights[layers[1]] == out_heights[layers[2]] == out_heights[layers[3]]) 152 | prev_filters = out_filters[layers[0]] + out_filters[layers[1]] + out_filters[layers[2]] + out_filters[ 153 | layers[3]] 154 | else: 155 | print("route error !!! 
{} {} {}".format(sys._getframe().f_code.co_filename, 156 | sys._getframe().f_code.co_name, sys._getframe().f_lineno)) 157 | 158 | out_widths.append(prev_width) 159 | out_heights.append(prev_height) 160 | out_filters.append(prev_filters) 161 | elif block['type'] in ['region', 'yolo']: 162 | print('%5d %-6s' % (ind, 'detection')) 163 | out_widths.append(prev_width) 164 | out_heights.append(prev_height) 165 | out_filters.append(prev_filters) 166 | elif block['type'] == 'shortcut': 167 | from_id = int(block['from']) 168 | from_id = from_id if from_id > 0 else from_id + ind 169 | print('%5d %-6s %d' % (ind, 'shortcut', from_id)) 170 | prev_width = out_widths[from_id] 171 | prev_height = out_heights[from_id] 172 | prev_filters = out_filters[from_id] 173 | out_widths.append(prev_width) 174 | out_heights.append(prev_height) 175 | out_filters.append(prev_filters) 176 | elif block['type'] == 'connected': 177 | filters = int(block['output']) 178 | print('%5d %-6s %d -> %3d' % (ind, 'connected', prev_filters, filters)) 179 | prev_filters = filters 180 | out_widths.append(1) 181 | out_heights.append(1) 182 | out_filters.append(prev_filters) 183 | else: 184 | print('unknown type %s' % (block['type'])) 185 | 186 | 187 | def load_conv(buf, start, conv_model): 188 | num_w = conv_model.weight.numel() 189 | num_b = conv_model.bias.numel() 190 | conv_model.bias.data.copy_(torch.from_numpy(buf[start:start + num_b])); 191 | start = start + num_b 192 | conv_model.weight.data.copy_(torch.from_numpy(buf[start:start + num_w]).reshape(conv_model.weight.data.shape)); 193 | start = start + num_w 194 | return start 195 | 196 | 197 | def save_conv(fp, conv_model): 198 | if conv_model.bias.is_cuda: 199 | convert2cpu(conv_model.bias.data).numpy().tofile(fp) 200 | convert2cpu(conv_model.weight.data).numpy().tofile(fp) 201 | else: 202 | conv_model.bias.data.numpy().tofile(fp) 203 | conv_model.weight.data.numpy().tofile(fp) 204 | 205 | 206 | def load_conv_bn(buf, start, conv_model, bn_model): 207 | num_w = conv_model.weight.numel() 208 | num_b = bn_model.bias.numel() 209 | bn_model.bias.data.copy_(torch.from_numpy(buf[start:start + num_b])); 210 | start = start + num_b 211 | bn_model.weight.data.copy_(torch.from_numpy(buf[start:start + num_b])); 212 | start = start + num_b 213 | bn_model.running_mean.copy_(torch.from_numpy(buf[start:start + num_b])); 214 | start = start + num_b 215 | bn_model.running_var.copy_(torch.from_numpy(buf[start:start + num_b])); 216 | start = start + num_b 217 | conv_model.weight.data.copy_(torch.from_numpy(buf[start:start + num_w]).reshape(conv_model.weight.data.shape)); 218 | start = start + num_w 219 | return start 220 | 221 | 222 | def save_conv_bn(fp, conv_model, bn_model): 223 | if bn_model.bias.is_cuda: 224 | convert2cpu(bn_model.bias.data).numpy().tofile(fp) 225 | convert2cpu(bn_model.weight.data).numpy().tofile(fp) 226 | convert2cpu(bn_model.running_mean).numpy().tofile(fp) 227 | convert2cpu(bn_model.running_var).numpy().tofile(fp) 228 | convert2cpu(conv_model.weight.data).numpy().tofile(fp) 229 | else: 230 | bn_model.bias.data.numpy().tofile(fp) 231 | bn_model.weight.data.numpy().tofile(fp) 232 | bn_model.running_mean.numpy().tofile(fp) 233 | bn_model.running_var.numpy().tofile(fp) 234 | conv_model.weight.data.numpy().tofile(fp) 235 | 236 | 237 | def load_fc(buf, start, fc_model): 238 | num_w = fc_model.weight.numel() 239 | num_b = fc_model.bias.numel() 240 | fc_model.bias.data.copy_(torch.from_numpy(buf[start:start + num_b])); 241 | start = start + num_b 242 | 
fc_model.weight.data.copy_(torch.from_numpy(buf[start:start + num_w])); 243 | start = start + num_w 244 | return start 245 | 246 | 247 | def save_fc(fp, fc_model): 248 | fc_model.bias.data.numpy().tofile(fp) 249 | fc_model.weight.data.numpy().tofile(fp) 250 | 251 | 252 | if __name__ == '__main__': 253 | import sys 254 | 255 | blocks = parse_cfg('cfg/yolo.cfg') 256 | if len(sys.argv) == 2: 257 | blocks = parse_cfg(sys.argv[1]) 258 | print_cfg(blocks) 259 | -------------------------------------------------------------------------------- /tool/darknet2onnx.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import torch 3 | from tool.darknet2pytorch import Darknet 4 | 5 | 6 | def fransform_to_onnx(cfgfile, weightfile, batch_size=1): 7 | model = Darknet(cfgfile) 8 | 9 | model.print_network() 10 | model.load_weights(weightfile) 11 | print('Loading weights from %s... Done!' % (weightfile)) 12 | 13 | # model.cuda() 14 | 15 | x = torch.randn((batch_size, 3, model.height, model.width), requires_grad=True) # .cuda() 16 | 17 | onnx_file_name = "yolov4_{}_3_{}_{}.onnx".format(batch_size, model.height, model.width) 18 | 19 | # Export the model 20 | print('Export the onnx model ...') 21 | torch.onnx.export(model, 22 | x, 23 | onnx_file_name, 24 | export_params=True, 25 | opset_version=11, 26 | do_constant_folding=True, 27 | input_names=['input'], output_names=['output_1', 'output_2', 'output_3'], 28 | dynamic_axes=None) 29 | 30 | print('Onnx model exporting done') 31 | return onnx_file_name 32 | 33 | 34 | if __name__ == '__main__': 35 | if len(sys.argv) == 3: 36 | cfgfile = sys.argv[1] 37 | weightfile = sys.argv[2] 38 | fransform_to_onnx(cfgfile, weightfile) 39 | elif len(sys.argv) == 4: 40 | cfgfile = sys.argv[1] 41 | weightfile = sys.argv[2] 42 | batch_size = int(sys.argv[3]) 43 | fransform_to_onnx(cfgfile, weightfile, batch_size) 44 | else: 45 | print('Please execute this script this way:\n') 46 | print(' python darknet2onnx.py ') 47 | print('or') 48 | print(' python darknet2onnx.py ') 49 | -------------------------------------------------------------------------------- /tool/darknet2pytorch.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | import numpy as np 4 | from tool.region_loss import RegionLoss 5 | from tool.yolo_layer import YoloLayer 6 | from tool.config import * 7 | 8 | 9 | class Mish(torch.nn.Module): 10 | def __init__(self): 11 | super().__init__() 12 | 13 | def forward(self, x): 14 | x = x * (torch.tanh(torch.nn.functional.softplus(x))) 15 | return x 16 | 17 | 18 | class MaxPoolDark(nn.Module): 19 | def __init__(self, size=2, stride=1): 20 | super(MaxPoolDark, self).__init__() 21 | self.size = size 22 | self.stride = stride 23 | 24 | def forward(self, x): 25 | ''' 26 | darknet output_size = (input_size + p - k) / s +1 27 | p : padding = k - 1 28 | k : size 29 | s : stride 30 | torch output_size = (input_size + 2*p -k) / s +1 31 | p : padding = k//2 32 | ''' 33 | p = self.size // 2 34 | if ((x.shape[2] - 1) // self.stride) != ((x.shape[2] + 2 * p - self.size) // self.stride): 35 | padding1 = (self.size - 1) // 2 36 | padding2 = padding1 + 1 37 | else: 38 | padding1 = (self.size - 1) // 2 39 | padding2 = padding1 40 | if ((x.shape[3] - 1) // self.stride) != ((x.shape[3] + 2 * p - self.size) // self.stride): 41 | padding3 = (self.size - 1) // 2 42 | padding4 = padding3 + 1 43 | else: 44 | padding3 = (self.size - 1) // 2 45 | padding4 = 
padding3 46 | x = F.max_pool2d(F.pad(x, (padding3, padding4, padding1, padding2), mode='replicate'), 47 | self.size, stride=self.stride) 48 | return x 49 | 50 | 51 | class Upsample(nn.Module): 52 | def __init__(self, stride=2): 53 | super(Upsample, self).__init__() 54 | self.stride = stride 55 | 56 | def forward(self, x): 57 | stride = self.stride 58 | assert (x.data.dim() == 4) 59 | B = x.data.size(0) 60 | C = x.data.size(1) 61 | H = x.data.size(2) 62 | W = x.data.size(3) 63 | ws = stride 64 | hs = stride 65 | x = x.view(B, C, H, 1, W, 1).expand(B, C, H, stride, W, stride).contiguous().view(B, C, H * stride, W * stride) 66 | return x 67 | 68 | 69 | class Reorg(nn.Module): 70 | def __init__(self, stride=2): 71 | super(Reorg, self).__init__() 72 | self.stride = stride 73 | 74 | def forward(self, x): 75 | stride = self.stride 76 | assert (x.data.dim() == 4) 77 | B = x.data.size(0) 78 | C = x.data.size(1) 79 | H = x.data.size(2) 80 | W = x.data.size(3) 81 | assert (H % stride == 0) 82 | assert (W % stride == 0) 83 | ws = stride 84 | hs = stride 85 | x = x.view(B, C, H / hs, hs, W / ws, ws).transpose(3, 4).contiguous() 86 | x = x.view(B, C, H / hs * W / ws, hs * ws).transpose(2, 3).contiguous() 87 | x = x.view(B, C, hs * ws, H / hs, W / ws).transpose(1, 2).contiguous() 88 | x = x.view(B, hs * ws * C, H / hs, W / ws) 89 | return x 90 | 91 | 92 | class GlobalAvgPool2d(nn.Module): 93 | def __init__(self): 94 | super(GlobalAvgPool2d, self).__init__() 95 | 96 | def forward(self, x): 97 | N = x.data.size(0) 98 | C = x.data.size(1) 99 | H = x.data.size(2) 100 | W = x.data.size(3) 101 | x = F.avg_pool2d(x, (H, W)) 102 | x = x.view(N, C) 103 | return x 104 | 105 | 106 | # for route and shortcut 107 | class EmptyModule(nn.Module): 108 | def __init__(self): 109 | super(EmptyModule, self).__init__() 110 | 111 | def forward(self, x): 112 | return x 113 | 114 | 115 | # support route shortcut and reorg 116 | class Darknet(nn.Module): 117 | def __init__(self, cfgfile): 118 | super(Darknet, self).__init__() 119 | self.blocks = parse_cfg(cfgfile) 120 | self.models = self.create_network(self.blocks) # merge conv, bn,leaky 121 | self.loss = self.models[len(self.models) - 1] 122 | 123 | self.width = int(self.blocks[0]['width']) 124 | self.height = int(self.blocks[0]['height']) 125 | 126 | if self.blocks[(len(self.blocks) - 1)]['type'] == 'region': 127 | self.anchors = self.loss.anchors 128 | self.num_anchors = self.loss.num_anchors 129 | self.anchor_step = self.loss.anchor_step 130 | self.num_classes = self.loss.num_classes 131 | 132 | self.header = torch.IntTensor([0, 0, 0, 0]) 133 | self.seen = 0 134 | 135 | def forward(self, x): 136 | ind = -2 137 | self.loss = None 138 | outputs = dict() 139 | out_boxes = [] 140 | for block in self.blocks: 141 | ind = ind + 1 142 | # if ind > 0: 143 | # return x 144 | 145 | if block['type'] == 'net': 146 | continue 147 | elif block['type'] in ['convolutional', 'maxpool', 'reorg', 'upsample', 'avgpool', 'softmax', 'connected']: 148 | x = self.models[ind](x) 149 | outputs[ind] = x 150 | elif block['type'] == 'route': 151 | layers = block['layers'].split(',') 152 | layers = [int(i) if int(i) > 0 else int(i) + ind for i in layers] 153 | if len(layers) == 1: 154 | x = outputs[layers[0]] 155 | outputs[ind] = x 156 | elif len(layers) == 2: 157 | x1 = outputs[layers[0]] 158 | x2 = outputs[layers[1]] 159 | x = torch.cat((x1, x2), 1) 160 | outputs[ind] = x 161 | elif len(layers) == 4: 162 | x1 = outputs[layers[0]] 163 | x2 = outputs[layers[1]] 164 | x3 = outputs[layers[2]] 165 | x4 = 
outputs[layers[3]] 166 | x = torch.cat((x1, x2, x3, x4), 1) 167 | outputs[ind] = x 168 | else: 169 | print("rounte number > 2 ,is {}".format(len(layers))) 170 | 171 | elif block['type'] == 'shortcut': 172 | from_layer = int(block['from']) 173 | activation = block['activation'] 174 | from_layer = from_layer if from_layer > 0 else from_layer + ind 175 | x1 = outputs[from_layer] 176 | x2 = outputs[ind - 1] 177 | x = x1 + x2 178 | if activation == 'leaky': 179 | x = F.leaky_relu(x, 0.1, inplace=True) 180 | elif activation == 'relu': 181 | x = F.relu(x, inplace=True) 182 | outputs[ind] = x 183 | elif block['type'] == 'region': 184 | continue 185 | if self.loss: 186 | self.loss = self.loss + self.models[ind](x) 187 | else: 188 | self.loss = self.models[ind](x) 189 | outputs[ind] = None 190 | elif block['type'] == 'yolo': 191 | if self.training: 192 | pass 193 | else: 194 | boxes = self.models[ind](x) 195 | out_boxes.append(boxes) 196 | elif block['type'] == 'cost': 197 | continue 198 | else: 199 | print('unknown type %s' % (block['type'])) 200 | if self.training: 201 | return loss 202 | else: 203 | return out_boxes 204 | 205 | def print_network(self): 206 | print_cfg(self.blocks) 207 | 208 | def create_network(self, blocks): 209 | models = nn.ModuleList() 210 | 211 | prev_filters = 3 212 | out_filters = [] 213 | prev_stride = 1 214 | out_strides = [] 215 | conv_id = 0 216 | for block in blocks: 217 | if block['type'] == 'net': 218 | prev_filters = int(block['channels']) 219 | continue 220 | elif block['type'] == 'convolutional': 221 | conv_id = conv_id + 1 222 | batch_normalize = int(block['batch_normalize']) 223 | filters = int(block['filters']) 224 | kernel_size = int(block['size']) 225 | stride = int(block['stride']) 226 | is_pad = int(block['pad']) 227 | pad = (kernel_size - 1) // 2 if is_pad else 0 228 | activation = block['activation'] 229 | model = nn.Sequential() 230 | if batch_normalize: 231 | model.add_module('conv{0}'.format(conv_id), 232 | nn.Conv2d(prev_filters, filters, kernel_size, stride, pad, bias=False)) 233 | model.add_module('bn{0}'.format(conv_id), nn.BatchNorm2d(filters)) 234 | # model.add_module('bn{0}'.format(conv_id), BN2d(filters)) 235 | else: 236 | model.add_module('conv{0}'.format(conv_id), 237 | nn.Conv2d(prev_filters, filters, kernel_size, stride, pad)) 238 | if activation == 'leaky': 239 | model.add_module('leaky{0}'.format(conv_id), nn.LeakyReLU(0.1, inplace=True)) 240 | elif activation == 'relu': 241 | model.add_module('relu{0}'.format(conv_id), nn.ReLU(inplace=True)) 242 | elif activation == 'mish': 243 | model.add_module('mish{0}'.format(conv_id), Mish()) 244 | else: 245 | print("convalution havn't activate {}".format(activation)) 246 | 247 | prev_filters = filters 248 | out_filters.append(prev_filters) 249 | prev_stride = stride * prev_stride 250 | out_strides.append(prev_stride) 251 | models.append(model) 252 | elif block['type'] == 'maxpool': 253 | pool_size = int(block['size']) 254 | stride = int(block['stride']) 255 | if stride == 1 and pool_size % 2: 256 | # You can use Maxpooldark instead, here is convenient to convert onnx. 
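                    # With stride == 1 and an odd pool_size, padding = pool_size // 2 keeps the
                    # spatial size unchanged, so plain nn.MaxPool2d reproduces darknet's "same"
                    # pooling and exports to ONNX more cleanly; MaxPoolDark above covers the
                    # remaining cases with asymmetric replicate padding.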
257 | model = nn.MaxPool2d(kernel_size=pool_size, stride=stride, padding=pool_size // 2) 258 | else: 259 | model = MaxPoolDark(pool_size, stride) 260 | out_filters.append(prev_filters) 261 | prev_stride = stride * prev_stride 262 | out_strides.append(prev_stride) 263 | models.append(model) 264 | elif block['type'] == 'avgpool': 265 | model = GlobalAvgPool2d() 266 | out_filters.append(prev_filters) 267 | models.append(model) 268 | elif block['type'] == 'softmax': 269 | model = nn.Softmax() 270 | out_strides.append(prev_stride) 271 | out_filters.append(prev_filters) 272 | models.append(model) 273 | elif block['type'] == 'cost': 274 | if block['_type'] == 'sse': 275 | model = nn.MSELoss(size_average=True) 276 | elif block['_type'] == 'L1': 277 | model = nn.L1Loss(size_average=True) 278 | elif block['_type'] == 'smooth': 279 | model = nn.SmoothL1Loss(size_average=True) 280 | out_filters.append(1) 281 | out_strides.append(prev_stride) 282 | models.append(model) 283 | elif block['type'] == 'reorg': 284 | stride = int(block['stride']) 285 | prev_filters = stride * stride * prev_filters 286 | out_filters.append(prev_filters) 287 | prev_stride = prev_stride * stride 288 | out_strides.append(prev_stride) 289 | models.append(Reorg(stride)) 290 | elif block['type'] == 'upsample': 291 | stride = int(block['stride']) 292 | out_filters.append(prev_filters) 293 | prev_stride = prev_stride // stride 294 | out_strides.append(prev_stride) 295 | # models.append(nn.Upsample(scale_factor=stride, mode='nearest')) 296 | models.append(Upsample(stride)) 297 | elif block['type'] == 'route': 298 | layers = block['layers'].split(',') 299 | ind = len(models) 300 | layers = [int(i) if int(i) > 0 else int(i) + ind for i in layers] 301 | if len(layers) == 1: 302 | prev_filters = out_filters[layers[0]] 303 | prev_stride = out_strides[layers[0]] 304 | elif len(layers) == 2: 305 | assert (layers[0] == ind - 1) 306 | prev_filters = out_filters[layers[0]] + out_filters[layers[1]] 307 | prev_stride = out_strides[layers[0]] 308 | elif len(layers) == 4: 309 | assert (layers[0] == ind - 1) 310 | prev_filters = out_filters[layers[0]] + out_filters[layers[1]] + out_filters[layers[2]] + \ 311 | out_filters[layers[3]] 312 | prev_stride = out_strides[layers[0]] 313 | else: 314 | print("route error!!!") 315 | 316 | out_filters.append(prev_filters) 317 | out_strides.append(prev_stride) 318 | models.append(EmptyModule()) 319 | elif block['type'] == 'shortcut': 320 | ind = len(models) 321 | prev_filters = out_filters[ind - 1] 322 | out_filters.append(prev_filters) 323 | prev_stride = out_strides[ind - 1] 324 | out_strides.append(prev_stride) 325 | models.append(EmptyModule()) 326 | elif block['type'] == 'connected': 327 | filters = int(block['output']) 328 | if block['activation'] == 'linear': 329 | model = nn.Linear(prev_filters, filters) 330 | elif block['activation'] == 'leaky': 331 | model = nn.Sequential( 332 | nn.Linear(prev_filters, filters), 333 | nn.LeakyReLU(0.1, inplace=True)) 334 | elif block['activation'] == 'relu': 335 | model = nn.Sequential( 336 | nn.Linear(prev_filters, filters), 337 | nn.ReLU(inplace=True)) 338 | prev_filters = filters 339 | out_filters.append(prev_filters) 340 | out_strides.append(prev_stride) 341 | models.append(model) 342 | elif block['type'] == 'region': 343 | loss = RegionLoss() 344 | anchors = block['anchors'].split(',') 345 | loss.anchors = [float(i) for i in anchors] 346 | loss.num_classes = int(block['classes']) 347 | loss.num_anchors = int(block['num']) 348 | loss.anchor_step = len(loss.anchors) 
// loss.num_anchors 349 | loss.object_scale = float(block['object_scale']) 350 | loss.noobject_scale = float(block['noobject_scale']) 351 | loss.class_scale = float(block['class_scale']) 352 | loss.coord_scale = float(block['coord_scale']) 353 | out_filters.append(prev_filters) 354 | out_strides.append(prev_stride) 355 | models.append(loss) 356 | elif block['type'] == 'yolo': 357 | yolo_layer = YoloLayer() 358 | anchors = block['anchors'].split(',') 359 | anchor_mask = block['mask'].split(',') 360 | yolo_layer.anchor_mask = [int(i) for i in anchor_mask] 361 | yolo_layer.anchors = [float(i) for i in anchors] 362 | yolo_layer.num_classes = int(block['classes']) 363 | yolo_layer.num_anchors = int(block['num']) 364 | yolo_layer.anchor_step = len(yolo_layer.anchors) // yolo_layer.num_anchors 365 | yolo_layer.stride = prev_stride 366 | # yolo_layer.object_scale = float(block['object_scale']) 367 | # yolo_layer.noobject_scale = float(block['noobject_scale']) 368 | # yolo_layer.class_scale = float(block['class_scale']) 369 | # yolo_layer.coord_scale = float(block['coord_scale']) 370 | out_filters.append(prev_filters) 371 | out_strides.append(prev_stride) 372 | models.append(yolo_layer) 373 | else: 374 | print('unknown type %s' % (block['type'])) 375 | 376 | return models 377 | 378 | def load_weights(self, weightfile): 379 | fp = open(weightfile, 'rb') 380 | header = np.fromfile(fp, count=5, dtype=np.int32) 381 | self.header = torch.from_numpy(header) 382 | self.seen = self.header[3] 383 | buf = np.fromfile(fp, dtype=np.float32) 384 | fp.close() 385 | 386 | start = 0 387 | ind = -2 388 | for block in self.blocks: 389 | if start >= buf.size: 390 | break 391 | ind = ind + 1 392 | if block['type'] == 'net': 393 | continue 394 | elif block['type'] == 'convolutional': 395 | model = self.models[ind] 396 | batch_normalize = int(block['batch_normalize']) 397 | if batch_normalize: 398 | start = load_conv_bn(buf, start, model[0], model[1]) 399 | else: 400 | start = load_conv(buf, start, model[0]) 401 | elif block['type'] == 'connected': 402 | model = self.models[ind] 403 | if block['activation'] != 'linear': 404 | start = load_fc(buf, start, model[0]) 405 | else: 406 | start = load_fc(buf, start, model) 407 | elif block['type'] == 'maxpool': 408 | pass 409 | elif block['type'] == 'reorg': 410 | pass 411 | elif block['type'] == 'upsample': 412 | pass 413 | elif block['type'] == 'route': 414 | pass 415 | elif block['type'] == 'shortcut': 416 | pass 417 | elif block['type'] == 'region': 418 | pass 419 | elif block['type'] == 'yolo': 420 | pass 421 | elif block['type'] == 'avgpool': 422 | pass 423 | elif block['type'] == 'softmax': 424 | pass 425 | elif block['type'] == 'cost': 426 | pass 427 | else: 428 | print('unknown type %s' % (block['type'])) 429 | 430 | # def save_weights(self, outfile, cutoff=0): 431 | # if cutoff <= 0: 432 | # cutoff = len(self.blocks) - 1 433 | # 434 | # fp = open(outfile, 'wb') 435 | # self.header[3] = self.seen 436 | # header = self.header 437 | # header.numpy().tofile(fp) 438 | # 439 | # ind = -1 440 | # for blockId in range(1, cutoff + 1): 441 | # ind = ind + 1 442 | # block = self.blocks[blockId] 443 | # if block['type'] == 'convolutional': 444 | # model = self.models[ind] 445 | # batch_normalize = int(block['batch_normalize']) 446 | # if batch_normalize: 447 | # save_conv_bn(fp, model[0], model[1]) 448 | # else: 449 | # save_conv(fp, model[0]) 450 | # elif block['type'] == 'connected': 451 | # model = self.models[ind] 452 | # if block['activation'] != 'linear': 453 | # 
save_fc(fc, model) 454 | # else: 455 | # save_fc(fc, model[0]) 456 | # elif block['type'] == 'maxpool': 457 | # pass 458 | # elif block['type'] == 'reorg': 459 | # pass 460 | # elif block['type'] == 'upsample': 461 | # pass 462 | # elif block['type'] == 'route': 463 | # pass 464 | # elif block['type'] == 'shortcut': 465 | # pass 466 | # elif block['type'] == 'region': 467 | # pass 468 | # elif block['type'] == 'yolo': 469 | # pass 470 | # elif block['type'] == 'avgpool': 471 | # pass 472 | # elif block['type'] == 'softmax': 473 | # pass 474 | # elif block['type'] == 'cost': 475 | # pass 476 | # else: 477 | # print('unknown type %s' % (block['type'])) 478 | # fp.close() 479 | -------------------------------------------------------------------------------- /tool/onnx2tensorflow.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import onnx 3 | from onnx_tf.backend import prepare 4 | 5 | 6 | # tensorflow >=2.0 7 | # 1: Thanks:github:https://github.com/onnx/onnx-tensorflow 8 | # 2: Run git clone https://github.com/onnx/onnx-tensorflow.git && cd onnx-tensorflow 9 | # Run pip install -e . 10 | # Note: 11 | # Errors will occur when using "pip install onnx-tf", at least for me, 12 | # it is recommended to use source code installation 13 | def transform_to_tensorflow(onnx_input_path, pb_output_path): 14 | onnx_model = onnx.load(onnx_input_path) # load onnx model 15 | tf_exp = prepare(onnx_model) # prepare tf representation 16 | tf_exp.export_graph(pb_output_path) # export the model 17 | 18 | 19 | if __name__ == '__main__': 20 | if len(sys.argv) == 1: 21 | sys.argv.append('../weight/yolov4_1_3_608_608.onnx') # use:darknet2onnx.py 22 | sys.argv.append('../weight/yolov4.pb') # use:onnx2tensorflow.py 23 | if len(sys.argv) == 3: 24 | onnxfile = sys.argv[1] 25 | tfpb_outfile = sys.argv[2] 26 | transform_to_tensorflow(onnxfile, tfpb_outfile) 27 | else: 28 | print('Please execute this script this way:\n') 29 | print(' python onnx2tensorflow.py ') 30 | -------------------------------------------------------------------------------- /tool/region_loss.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | from tool.utils import * 4 | 5 | 6 | def build_targets(pred_boxes, target, anchors, num_anchors, num_classes, nH, nW, noobject_scale, object_scale, 7 | sil_thresh, seen): 8 | nB = target.size(0) 9 | nA = num_anchors 10 | nC = num_classes 11 | anchor_step = len(anchors) / num_anchors 12 | conf_mask = torch.ones(nB, nA, nH, nW) * noobject_scale 13 | coord_mask = torch.zeros(nB, nA, nH, nW) 14 | cls_mask = torch.zeros(nB, nA, nH, nW) 15 | tx = torch.zeros(nB, nA, nH, nW) 16 | ty = torch.zeros(nB, nA, nH, nW) 17 | tw = torch.zeros(nB, nA, nH, nW) 18 | th = torch.zeros(nB, nA, nH, nW) 19 | tconf = torch.zeros(nB, nA, nH, nW) 20 | tcls = torch.zeros(nB, nA, nH, nW) 21 | 22 | nAnchors = nA * nH * nW 23 | nPixels = nH * nW 24 | for b in range(nB): 25 | cur_pred_boxes = pred_boxes[b * nAnchors:(b + 1) * nAnchors].t() 26 | cur_ious = torch.zeros(nAnchors) 27 | for t in range(50): 28 | if target[b][t * 5 + 1] == 0: 29 | break 30 | gx = target[b][t * 5 + 1] * nW 31 | gy = target[b][t * 5 + 2] * nH 32 | gw = target[b][t * 5 + 3] * nW 33 | gh = target[b][t * 5 + 4] * nH 34 | cur_gt_boxes = torch.FloatTensor([gx, gy, gw, gh]).repeat(nAnchors, 1).t() 35 | cur_ious = torch.max(cur_ious, bbox_ious(cur_pred_boxes, cur_gt_boxes, x1y1x2y2=False)) 36 | conf_mask[b][cur_ious > 
sil_thresh] = 0 37 | if seen < 12800: 38 | if anchor_step == 4: 39 | tx = torch.FloatTensor(anchors).view(nA, anchor_step).index_select(1, torch.LongTensor([2])).view(1, nA, 1, 40 | 1).repeat( 41 | nB, 1, nH, nW) 42 | ty = torch.FloatTensor(anchors).view(num_anchors, anchor_step).index_select(1, torch.LongTensor([2])).view( 43 | 1, nA, 1, 1).repeat(nB, 1, nH, nW) 44 | else: 45 | tx.fill_(0.5) 46 | ty.fill_(0.5) 47 | tw.zero_() 48 | th.zero_() 49 | coord_mask.fill_(1) 50 | 51 | nGT = 0 52 | nCorrect = 0 53 | for b in range(nB): 54 | for t in range(50): 55 | if target[b][t * 5 + 1] == 0: 56 | break 57 | nGT = nGT + 1 58 | best_iou = 0.0 59 | best_n = -1 60 | min_dist = 10000 61 | gx = target[b][t * 5 + 1] * nW 62 | gy = target[b][t * 5 + 2] * nH 63 | gi = int(gx) 64 | gj = int(gy) 65 | gw = target[b][t * 5 + 3] * nW 66 | gh = target[b][t * 5 + 4] * nH 67 | gt_box = [0, 0, gw, gh] 68 | for n in range(nA): 69 | aw = anchors[anchor_step * n] 70 | ah = anchors[anchor_step * n + 1] 71 | anchor_box = [0, 0, aw, ah] 72 | iou = bbox_iou(anchor_box, gt_box, x1y1x2y2=False) 73 | if anchor_step == 4: 74 | ax = anchors[anchor_step * n + 2] 75 | ay = anchors[anchor_step * n + 3] 76 | dist = pow(((gi + ax) - gx), 2) + pow(((gj + ay) - gy), 2) 77 | if iou > best_iou: 78 | best_iou = iou 79 | best_n = n 80 | elif anchor_step == 4 and iou == best_iou and dist < min_dist: 81 | best_iou = iou 82 | best_n = n 83 | min_dist = dist 84 | 85 | gt_box = [gx, gy, gw, gh] 86 | pred_box = pred_boxes[b * nAnchors + best_n * nPixels + gj * nW + gi] 87 | 88 | coord_mask[b][best_n][gj][gi] = 1 89 | cls_mask[b][best_n][gj][gi] = 1 90 | conf_mask[b][best_n][gj][gi] = object_scale 91 | tx[b][best_n][gj][gi] = target[b][t * 5 + 1] * nW - gi 92 | ty[b][best_n][gj][gi] = target[b][t * 5 + 2] * nH - gj 93 | tw[b][best_n][gj][gi] = math.log(gw / anchors[anchor_step * best_n]) 94 | th[b][best_n][gj][gi] = math.log(gh / anchors[anchor_step * best_n + 1]) 95 | iou = bbox_iou(gt_box, pred_box, x1y1x2y2=False) # best_iou 96 | tconf[b][best_n][gj][gi] = iou 97 | tcls[b][best_n][gj][gi] = target[b][t * 5] 98 | if iou > 0.5: 99 | nCorrect = nCorrect + 1 100 | 101 | return nGT, nCorrect, coord_mask, conf_mask, cls_mask, tx, ty, tw, th, tconf, tcls 102 | 103 | 104 | class RegionLoss(nn.Module): 105 | def __init__(self, num_classes=0, anchors=[], num_anchors=1): 106 | super(RegionLoss, self).__init__() 107 | self.num_classes = num_classes 108 | self.anchors = anchors 109 | self.num_anchors = num_anchors 110 | self.anchor_step = len(anchors) / num_anchors 111 | self.coord_scale = 1 112 | self.noobject_scale = 1 113 | self.object_scale = 5 114 | self.class_scale = 1 115 | self.thresh = 0.6 116 | self.seen = 0 117 | 118 | def forward(self, output, target): 119 | # output : BxAs*(4+1+num_classes)*H*W 120 | t0 = time.time() 121 | nB = output.data.size(0) 122 | nA = self.num_anchors 123 | nC = self.num_classes 124 | nH = output.data.size(2) 125 | nW = output.data.size(3) 126 | 127 | output = output.view(nB, nA, (5 + nC), nH, nW) 128 | x = F.sigmoid(output.index_select(2, Variable(torch.cuda.LongTensor([0]))).view(nB, nA, nH, nW)) 129 | y = F.sigmoid(output.index_select(2, Variable(torch.cuda.LongTensor([1]))).view(nB, nA, nH, nW)) 130 | w = output.index_select(2, Variable(torch.cuda.LongTensor([2]))).view(nB, nA, nH, nW) 131 | h = output.index_select(2, Variable(torch.cuda.LongTensor([3]))).view(nB, nA, nH, nW) 132 | conf = F.sigmoid(output.index_select(2, Variable(torch.cuda.LongTensor([4]))).view(nB, nA, nH, nW)) 133 | cls = 
output.index_select(2, Variable(torch.linspace(5, 5 + nC - 1, nC).long().cuda())) 134 | cls = cls.view(nB * nA, nC, nH * nW).transpose(1, 2).contiguous().view(nB * nA * nH * nW, nC) 135 | t1 = time.time() 136 | 137 | pred_boxes = torch.cuda.FloatTensor(4, nB * nA * nH * nW) 138 | grid_x = torch.linspace(0, nW - 1, nW).repeat(nH, 1).repeat(nB * nA, 1, 1).view(nB * nA * nH * nW).cuda() 139 | grid_y = torch.linspace(0, nH - 1, nH).repeat(nW, 1).t().repeat(nB * nA, 1, 1).view(nB * nA * nH * nW).cuda() 140 | anchor_w = torch.Tensor(self.anchors).view(nA, self.anchor_step).index_select(1, torch.LongTensor([0])).cuda() 141 | anchor_h = torch.Tensor(self.anchors).view(nA, self.anchor_step).index_select(1, torch.LongTensor([1])).cuda() 142 | anchor_w = anchor_w.repeat(nB, 1).repeat(1, 1, nH * nW).view(nB * nA * nH * nW) 143 | anchor_h = anchor_h.repeat(nB, 1).repeat(1, 1, nH * nW).view(nB * nA * nH * nW) 144 | pred_boxes[0] = x.data + grid_x 145 | pred_boxes[1] = y.data + grid_y 146 | pred_boxes[2] = torch.exp(w.data) * anchor_w 147 | pred_boxes[3] = torch.exp(h.data) * anchor_h 148 | pred_boxes = convert2cpu(pred_boxes.transpose(0, 1).contiguous().view(-1, 4)) 149 | t2 = time.time() 150 | 151 | nGT, nCorrect, coord_mask, conf_mask, cls_mask, tx, ty, tw, th, tconf, tcls = build_targets(pred_boxes, 152 | target.data, 153 | self.anchors, nA, 154 | nC, \ 155 | nH, nW, 156 | self.noobject_scale, 157 | self.object_scale, 158 | self.thresh, 159 | self.seen) 160 | cls_mask = (cls_mask == 1) 161 | nProposals = int((conf > 0.25).sum().data[0]) 162 | 163 | tx = Variable(tx.cuda()) 164 | ty = Variable(ty.cuda()) 165 | tw = Variable(tw.cuda()) 166 | th = Variable(th.cuda()) 167 | tconf = Variable(tconf.cuda()) 168 | tcls = Variable(tcls.view(-1)[cls_mask].long().cuda()) 169 | 170 | coord_mask = Variable(coord_mask.cuda()) 171 | conf_mask = Variable(conf_mask.cuda().sqrt()) 172 | cls_mask = Variable(cls_mask.view(-1, 1).repeat(1, nC).cuda()) 173 | cls = cls[cls_mask].view(-1, nC) 174 | 175 | t3 = time.time() 176 | 177 | loss_x = self.coord_scale * nn.MSELoss(size_average=False)(x * coord_mask, tx * coord_mask) / 2.0 178 | loss_y = self.coord_scale * nn.MSELoss(size_average=False)(y * coord_mask, ty * coord_mask) / 2.0 179 | loss_w = self.coord_scale * nn.MSELoss(size_average=False)(w * coord_mask, tw * coord_mask) / 2.0 180 | loss_h = self.coord_scale * nn.MSELoss(size_average=False)(h * coord_mask, th * coord_mask) / 2.0 181 | loss_conf = nn.MSELoss(size_average=False)(conf * conf_mask, tconf * conf_mask) / 2.0 182 | loss_cls = self.class_scale * nn.CrossEntropyLoss(size_average=False)(cls, tcls) 183 | loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls 184 | t4 = time.time() 185 | if False: 186 | print('-----------------------------------') 187 | print(' activation : %f' % (t1 - t0)) 188 | print(' create pred_boxes : %f' % (t2 - t1)) 189 | print(' build targets : %f' % (t3 - t2)) 190 | print(' create loss : %f' % (t4 - t3)) 191 | print(' total : %f' % (t4 - t0)) 192 | print('%d: nGT %d, recall %d, proposals %d, loss: x %f, y %f, w %f, h %f, conf %f, cls %f, total %f' % ( 193 | self.seen, nGT, nCorrect, nProposals, loss_x.data[0], loss_y.data[0], loss_w.data[0], loss_h.data[0], 194 | loss_conf.data[0], loss_cls.data[0], loss.data[0])) 195 | return loss 196 | -------------------------------------------------------------------------------- /tool/utils.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import time 4 | import math 5 | 
import torch 6 | import numpy as np 7 | from PIL import Image, ImageDraw, ImageFont 8 | from torch.autograd import Variable 9 | 10 | import itertools 11 | import struct # get_image_size 12 | import imghdr # get_image_size 13 | 14 | 15 | def sigmoid(x): 16 | return 1.0 / (np.exp(-x) + 1.) 17 | 18 | 19 | def softmax(x): 20 | x = np.exp(x - np.expand_dims(np.max(x, axis=1), axis=1)) 21 | x = x / np.expand_dims(x.sum(axis=1), axis=1) 22 | return x 23 | 24 | 25 | def bbox_iou(box1, box2, x1y1x2y2=True): 26 | if x1y1x2y2: 27 | mx = min(box1[0], box2[0]) 28 | Mx = max(box1[2], box2[2]) 29 | my = min(box1[1], box2[1]) 30 | My = max(box1[3], box2[3]) 31 | w1 = box1[2] - box1[0] 32 | h1 = box1[3] - box1[1] 33 | w2 = box2[2] - box2[0] 34 | h2 = box2[3] - box2[1] 35 | else: 36 | mx = min(box1[0] - box1[2] / 2.0, box2[0] - box2[2] / 2.0) 37 | Mx = max(box1[0] + box1[2] / 2.0, box2[0] + box2[2] / 2.0) 38 | my = min(box1[1] - box1[3] / 2.0, box2[1] - box2[3] / 2.0) 39 | My = max(box1[1] + box1[3] / 2.0, box2[1] + box2[3] / 2.0) 40 | w1 = box1[2] 41 | h1 = box1[3] 42 | w2 = box2[2] 43 | h2 = box2[3] 44 | uw = Mx - mx 45 | uh = My - my 46 | cw = w1 + w2 - uw 47 | ch = h1 + h2 - uh 48 | carea = 0 49 | if cw <= 0 or ch <= 0: 50 | return 0.0 51 | 52 | area1 = w1 * h1 53 | area2 = w2 * h2 54 | carea = cw * ch 55 | uarea = area1 + area2 - carea 56 | return carea / uarea 57 | 58 | 59 | def bbox_ious(boxes1, boxes2, x1y1x2y2=True): 60 | if x1y1x2y2: 61 | mx = torch.min(boxes1[0], boxes2[0]) 62 | Mx = torch.max(boxes1[2], boxes2[2]) 63 | my = torch.min(boxes1[1], boxes2[1]) 64 | My = torch.max(boxes1[3], boxes2[3]) 65 | w1 = boxes1[2] - boxes1[0] 66 | h1 = boxes1[3] - boxes1[1] 67 | w2 = boxes2[2] - boxes2[0] 68 | h2 = boxes2[3] - boxes2[1] 69 | else: 70 | mx = torch.min(boxes1[0] - boxes1[2] / 2.0, boxes2[0] - boxes2[2] / 2.0) 71 | Mx = torch.max(boxes1[0] + boxes1[2] / 2.0, boxes2[0] + boxes2[2] / 2.0) 72 | my = torch.min(boxes1[1] - boxes1[3] / 2.0, boxes2[1] - boxes2[3] / 2.0) 73 | My = torch.max(boxes1[1] + boxes1[3] / 2.0, boxes2[1] + boxes2[3] / 2.0) 74 | w1 = boxes1[2] 75 | h1 = boxes1[3] 76 | w2 = boxes2[2] 77 | h2 = boxes2[3] 78 | uw = Mx - mx 79 | uh = My - my 80 | cw = w1 + w2 - uw 81 | ch = h1 + h2 - uh 82 | mask = ((cw <= 0) + (ch <= 0) > 0) 83 | area1 = w1 * h1 84 | area2 = w2 * h2 85 | carea = cw * ch 86 | carea[mask] = 0 87 | uarea = area1 + area2 - carea 88 | return carea / uarea 89 | 90 | 91 | def nms(boxes, nms_thresh): 92 | if len(boxes) == 0: 93 | return boxes 94 | 95 | det_confs = torch.zeros(len(boxes)) 96 | for i in range(len(boxes)): 97 | det_confs[i] = 1 - boxes[i][4] 98 | 99 | _, sortIds = torch.sort(det_confs) 100 | out_boxes = [] 101 | for i in range(len(boxes)): 102 | box_i = boxes[sortIds[i]] 103 | if box_i[4] > 0: 104 | out_boxes.append(box_i) 105 | for j in range(i + 1, len(boxes)): 106 | box_j = boxes[sortIds[j]] 107 | if bbox_iou(box_i, box_j, x1y1x2y2=False) > nms_thresh: 108 | # print(box_i, box_j, bbox_iou(box_i, box_j, x1y1x2y2=False)) 109 | box_j[4] = 0 110 | return out_boxes 111 | 112 | 113 | def convert2cpu(gpu_matrix): 114 | return torch.FloatTensor(gpu_matrix.size()).copy_(gpu_matrix) 115 | 116 | 117 | def convert2cpu_long(gpu_matrix): 118 | return torch.LongTensor(gpu_matrix.size()).copy_(gpu_matrix) 119 | 120 | 121 | def get_region_boxes_in_model(output, conf_thresh, num_classes, anchors, num_anchors, only_objectness=1, 122 | validation=False): 123 | anchor_step = len(anchors) // num_anchors 124 | if output.dim() == 3: 125 | output = output.unsqueeze(0) 126 | batch = 
output.size(0) 127 | assert (output.size(1) == (5 + num_classes) * num_anchors) 128 | h = output.size(2) 129 | w = output.size(3) 130 | 131 | t0 = time.time() 132 | all_boxes = [] 133 | output = output.view(batch * num_anchors, 5 + num_classes, h * w).transpose(0, 1).contiguous().view(5 + num_classes, 134 | batch * num_anchors * h * w) 135 | 136 | grid_x = torch.linspace(0, w - 1, w).repeat(h, 1).repeat(batch * num_anchors, 1, 1).view( 137 | batch * num_anchors * h * w).type_as(output) # cuda() 138 | grid_y = torch.linspace(0, h - 1, h).repeat(w, 1).t().repeat(batch * num_anchors, 1, 1).view( 139 | batch * num_anchors * h * w).type_as(output) # cuda() 140 | xs = torch.sigmoid(output[0]) + grid_x 141 | ys = torch.sigmoid(output[1]) + grid_y 142 | 143 | anchor_w = torch.Tensor(anchors).view(num_anchors, anchor_step).index_select(1, torch.LongTensor([0])) 144 | anchor_h = torch.Tensor(anchors).view(num_anchors, anchor_step).index_select(1, torch.LongTensor([1])) 145 | anchor_w = anchor_w.repeat(batch, 1).repeat(1, 1, h * w).view(batch * num_anchors * h * w).type_as(output) # cuda() 146 | anchor_h = anchor_h.repeat(batch, 1).repeat(1, 1, h * w).view(batch * num_anchors * h * w).type_as(output) # cuda() 147 | ws = torch.exp(output[2]) * anchor_w 148 | hs = torch.exp(output[3]) * anchor_h 149 | 150 | det_confs = torch.sigmoid(output[4]) 151 | 152 | cls_confs = torch.nn.Softmax()(Variable(output[5:5 + num_classes].transpose(0, 1))).data 153 | cls_max_confs, cls_max_ids = torch.max(cls_confs, 1) 154 | cls_max_confs = cls_max_confs.view(-1) 155 | cls_max_ids = cls_max_ids.view(-1) 156 | t1 = time.time() 157 | 158 | sz_hw = h * w 159 | sz_hwa = sz_hw * num_anchors 160 | det_confs = convert2cpu(det_confs) 161 | cls_max_confs = convert2cpu(cls_max_confs) 162 | cls_max_ids = convert2cpu_long(cls_max_ids) 163 | xs = convert2cpu(xs) 164 | ys = convert2cpu(ys) 165 | ws = convert2cpu(ws) 166 | hs = convert2cpu(hs) 167 | if validation: 168 | cls_confs = convert2cpu(cls_confs.view(-1, num_classes)) 169 | t2 = time.time() 170 | for b in range(batch): 171 | boxes = [] 172 | for cy in range(h): 173 | for cx in range(w): 174 | for i in range(num_anchors): 175 | ind = b * sz_hwa + i * sz_hw + cy * w + cx 176 | det_conf = det_confs[ind] 177 | if only_objectness: 178 | conf = det_confs[ind] 179 | else: 180 | conf = det_confs[ind] * cls_max_confs[ind] 181 | 182 | if conf > conf_thresh: 183 | bcx = xs[ind] 184 | bcy = ys[ind] 185 | bw = ws[ind] 186 | bh = hs[ind] 187 | cls_max_conf = cls_max_confs[ind] 188 | cls_max_id = cls_max_ids[ind] 189 | box = [bcx / w, bcy / h, bw / w, bh / h, det_conf, cls_max_conf, cls_max_id] 190 | if (not only_objectness) and validation: 191 | for c in range(num_classes): 192 | tmp_conf = cls_confs[ind][c] 193 | if c != cls_max_id and det_confs[ind] * tmp_conf > conf_thresh: 194 | box.append(tmp_conf) 195 | box.append(c) 196 | boxes.append(box) 197 | all_boxes.append(boxes) 198 | t3 = time.time() 199 | if False: 200 | print('---------------------------------') 201 | print('matrix computation : %f' % (t1 - t0)) 202 | print(' gpu to cpu : %f' % (t2 - t1)) 203 | print(' boxes filter : %f' % (t3 - t2)) 204 | print('---------------------------------') 205 | return all_boxes 206 | 207 | 208 | def get_region_boxes_out_model(output, conf_thresh, num_classes, anchors, num_anchors, only_objectness=1, 209 | validation=False): 210 | anchor_step = len(anchors) // num_anchors 211 | if len(output.shape) == 3: 212 | output = np.expand_dims(output, axis=0) 213 | batch = output.shape[0] 214 | assert 
(output.shape[1] == (5 + num_classes) * num_anchors) 215 | h = output.shape[2] 216 | w = output.shape[3] 217 | 218 | t0 = time.time() 219 | all_boxes = [] 220 | output = output.reshape(batch * num_anchors, 5 + num_classes, h * w).transpose((1, 0, 2)).reshape( 221 | 5 + num_classes, 222 | batch * num_anchors * h * w) 223 | 224 | grid_x = np.expand_dims(np.expand_dims(np.linspace(0, w - 1, w), axis=0).repeat(h, 0), axis=0).repeat( 225 | batch * num_anchors, axis=0).reshape( 226 | batch * num_anchors * h * w) 227 | grid_y = np.expand_dims(np.expand_dims(np.linspace(0, h - 1, h), axis=0).repeat(w, 0).T, axis=0).repeat( 228 | batch * num_anchors, axis=0).reshape( 229 | batch * num_anchors * h * w) 230 | 231 | xs = sigmoid(output[0]) + grid_x 232 | ys = sigmoid(output[1]) + grid_y 233 | 234 | anchor_w = np.array(anchors).reshape((num_anchors, anchor_step))[:, 0] 235 | anchor_h = np.array(anchors).reshape((num_anchors, anchor_step))[:, 1] 236 | anchor_w = np.expand_dims(np.expand_dims(anchor_w, axis=1).repeat(batch, 1), axis=2) \ 237 | .repeat(h * w, axis=2).transpose(1, 0, 2).reshape(batch * num_anchors * h * w) 238 | anchor_h = np.expand_dims(np.expand_dims(anchor_h, axis=1).repeat(batch, 1), axis=2) \ 239 | .repeat(h * w, axis=2).transpose(1, 0, 2).reshape(batch * num_anchors * h * w) 240 | ws = np.exp(output[2]) * anchor_w 241 | hs = np.exp(output[3]) * anchor_h 242 | 243 | det_confs = sigmoid(output[4]) 244 | 245 | cls_confs = softmax(output[5:5 + num_classes].transpose(1, 0)) 246 | cls_max_confs = np.max(cls_confs, 1) 247 | cls_max_ids = np.argmax(cls_confs, 1) 248 | t1 = time.time() 249 | 250 | sz_hw = h * w 251 | sz_hwa = sz_hw * num_anchors 252 | t2 = time.time() 253 | for b in range(batch): 254 | boxes = [] 255 | for cy in range(h): 256 | for cx in range(w): 257 | for i in range(num_anchors): 258 | ind = b * sz_hwa + i * sz_hw + cy * w + cx 259 | det_conf = det_confs[ind] 260 | if only_objectness: 261 | conf = det_confs[ind] 262 | else: 263 | conf = det_confs[ind] * cls_max_confs[ind] 264 | 265 | if conf > conf_thresh: 266 | bcx = xs[ind] 267 | bcy = ys[ind] 268 | bw = ws[ind] 269 | bh = hs[ind] 270 | cls_max_conf = cls_max_confs[ind] 271 | cls_max_id = cls_max_ids[ind] 272 | box = [bcx / w, bcy / h, bw / w, bh / h, det_conf, cls_max_conf, cls_max_id] 273 | if (not only_objectness) and validation: 274 | for c in range(num_classes): 275 | tmp_conf = cls_confs[ind][c] 276 | if c != cls_max_id and det_confs[ind] * tmp_conf > conf_thresh: 277 | box.append(tmp_conf) 278 | box.append(c) 279 | boxes.append(box) 280 | all_boxes.append(boxes) 281 | t3 = time.time() 282 | if False: 283 | print('---------------------------------') 284 | print('matrix computation : %f' % (t1 - t0)) 285 | print(' gpu to cpu : %f' % (t2 - t1)) 286 | print(' boxes filter : %f' % (t3 - t2)) 287 | print('---------------------------------') 288 | return all_boxes 289 | 290 | 291 | def plot_boxes_cv2(img, boxes, savename=None, class_names=None, color=None): 292 | import cv2 293 | colors = torch.FloatTensor([[1, 0, 1], [0, 0, 1], [0, 1, 1], [0, 1, 0], [1, 1, 0], [1, 0, 0]]); 294 | 295 | def get_color(c, x, max_val): 296 | ratio = float(x) / max_val * 5 297 | i = int(math.floor(ratio)) 298 | j = int(math.ceil(ratio)) 299 | ratio = ratio - i 300 | r = (1 - ratio) * colors[i][c] + ratio * colors[j][c] 301 | return int(r * 255) 302 | 303 | width = img.shape[1] 304 | height = img.shape[0] 305 | for i in range(len(boxes)): 306 | box = boxes[i] 307 | x1 = int((box[0] - box[2] / 2.0) * width) 308 | y1 = int((box[1] - box[3] / 2.0) 
* height) 309 | x2 = int((box[0] + box[2] / 2.0) * width) 310 | y2 = int((box[1] + box[3] / 2.0) * height) 311 | 312 | if color: 313 | rgb = color 314 | else: 315 | rgb = (255, 0, 0) 316 | if len(box) >= 7 and class_names: 317 | cls_conf = box[5] 318 | cls_id = box[6] 319 | print('%s: %f' % (class_names[cls_id], cls_conf)) 320 | classes = len(class_names) 321 | offset = cls_id * 123457 % classes 322 | red = get_color(2, offset, classes) 323 | green = get_color(1, offset, classes) 324 | blue = get_color(0, offset, classes) 325 | if color is None: 326 | rgb = (red, green, blue) 327 | img = cv2.putText(img, class_names[cls_id], (x1, y1), cv2.FONT_HERSHEY_SIMPLEX, 1.2, rgb, 1) 328 | img = cv2.rectangle(img, (x1, y1), (x2, y2), rgb, 1) 329 | if savename: 330 | print("save plot results to %s" % savename) 331 | cv2.imwrite(savename, img) 332 | return img 333 | 334 | 335 | def plot_boxes(img, boxes, savename=None, class_names=None): 336 | colors = torch.FloatTensor([[1, 0, 1], [0, 0, 1], [0, 1, 1], [0, 1, 0], [1, 1, 0], [1, 0, 0]]) 337 | 338 | def get_color(c, x, max_val): 339 | ratio = float(x) / max_val * 5 340 | i = int(math.floor(ratio)) 341 | j = int(math.ceil(ratio)) 342 | ratio = ratio - i 343 | r = (1 - ratio) * colors[i][c] + ratio * colors[j][c] 344 | return int(r * 255) 345 | 346 | width = img.width 347 | height = img.height 348 | draw = ImageDraw.Draw(img) 349 | for i in range(len(boxes)): 350 | box = boxes[i] 351 | x1 = (box[0] - box[2] / 2.0) * width 352 | y1 = (box[1] - box[3] / 2.0) * height 353 | x2 = (box[0] + box[2] / 2.0) * width 354 | y2 = (box[1] + box[3] / 2.0) * height 355 | 356 | rgb = (255, 0, 0) 357 | if len(box) >= 7 and class_names: 358 | cls_conf = box[5] 359 | cls_id = box[6] 360 | print('%s: %f' % (class_names[cls_id], cls_conf)) 361 | classes = len(class_names) 362 | offset = cls_id * 123457 % classes 363 | red = get_color(2, offset, classes) 364 | green = get_color(1, offset, classes) 365 | blue = get_color(0, offset, classes) 366 | rgb = (red, green, blue) 367 | draw.text((x1, y1), class_names[cls_id], fill=rgb) 368 | draw.rectangle([x1, y1, x2, y2], outline=rgb) 369 | if savename: 370 | print("save plot results to %s" % savename) 371 | img.save(savename) 372 | return img 373 | 374 | 375 | def read_truths(lab_path): 376 | if not os.path.exists(lab_path): 377 | return np.array([]) 378 | if os.path.getsize(lab_path): 379 | truths = np.loadtxt(lab_path) 380 | truths = truths.reshape(truths.size // 5, 5)  # integer division to avoid the single truth problem 381 | return truths 382 | else: 383 | return np.array([]) 384 | 385 | 386 | def load_class_names(namesfile): 387 | class_names = [] 388 | with open(namesfile, 'r') as fp: 389 | lines = fp.readlines() 390 | for line in lines: 391 | line = line.rstrip() 392 | class_names.append(line) 393 | return class_names 394 | 395 | 396 | def do_detect(model, img, conf_thresh, n_classes, nms_thresh, use_cuda=1): 397 | model.eval() 398 | t0 = time.time() 399 | 400 | if isinstance(img, Image.Image): 401 | width = img.width 402 | height = img.height 403 | img = torch.ByteTensor(torch.ByteStorage.from_buffer(img.tobytes())) 404 | img = img.view(height, width, 3).transpose(0, 1).transpose(0, 2).contiguous() 405 | img = img.view(1, 3, height, width) 406 | img = img.float().div(255.0) 407 | elif type(img) == np.ndarray and len(img.shape) == 3: # cv2 image 408 | img = torch.from_numpy(img.transpose(2, 0, 1)).float().div(255.0).unsqueeze(0) 409 | elif type(img) == np.ndarray and len(img.shape) == 4: 410 | img = torch.from_numpy(img.transpose(0, 3, 1,
2)).float().div(255.0) 411 | else: 412 | print("unknown image type") 413 | exit(-1) 414 | 415 | t1 = time.time() 416 | 417 | if use_cuda: 418 | img = img.cuda() 419 | img = torch.autograd.Variable(img) 420 | t2 = time.time() 421 | 422 | list_features = model(img) 423 | 424 | list_features_numpy = [] 425 | for feature in list_features: 426 | list_features_numpy.append(feature.data.cpu().numpy()) 427 | 428 | return post_processing(img, conf_thresh, n_classes, nms_thresh, list_features_numpy) 429 | 430 | 431 | def post_processing(img, conf_thresh, n_classes, nms_thresh, list_features_numpy): 432 | anchors = [12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401]  # default YOLOv4 COCO anchors as (w, h) pairs 433 | num_anchors = 9 434 | anchor_masks = [[0, 1, 2], [3, 4, 5], [6, 7, 8]] 435 | strides = [8, 16, 32] 436 | anchor_step = len(anchors) // num_anchors 437 | boxes = [] 438 | for i in range(3): 439 | masked_anchors = [] 440 | for m in anchor_masks[i]: 441 | masked_anchors += anchors[m * anchor_step:(m + 1) * anchor_step] 442 | masked_anchors = [anchor / strides[i] for anchor in masked_anchors] 443 | boxes.append(get_region_boxes_out_model(list_features_numpy[i], conf_thresh, n_classes, masked_anchors, 444 | len(anchor_masks[i]))) 445 | # boxes.append(get_region_boxes(list_boxes[i], 0.6, 80, masked_anchors, len(anchor_masks[i]))) 446 | if img.shape[0] > 1: 447 | bboxs_for_imgs = [ 448 | boxes[0][index] + boxes[1][index] + boxes[2][index] 449 | for index in range(img.shape[0])] 450 | # run NMS separately on each image's detections 451 | t3 = time.time() 452 | boxes = [nms(bboxs, nms_thresh) for bboxs in bboxs_for_imgs] 453 | else: 454 | boxes = boxes[0][0] + boxes[1][0] + boxes[2][0] 455 | t3 = time.time() 456 | boxes = nms(boxes, nms_thresh) 457 | t4 = time.time() 458 | 459 | if False: 460 | print('-----------------------------------') 461 | print(' image to tensor : %f' % (t1 - t0)) 462 | print(' tensor to cuda : %f' % (t2 - t1)) 463 | print(' predict : %f' % (t3 - t2)) 464 | print(' nms : %f' % (t4 - t3)) 465 | print(' total : %f' % (t4 - t0)) 466 | print('-----------------------------------') 467 | return boxes 468 | -------------------------------------------------------------------------------- /tool/yolo_layer.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | from tool.utils import * 4 | 5 | 6 | def build_targets(pred_boxes, target, anchors, num_anchors, num_classes, nH, nW, noobject_scale, object_scale, 7 | sil_thresh, seen): 8 | nB = target.size(0) 9 | nA = num_anchors 10 | nC = num_classes 11 | anchor_step = len(anchors) // num_anchors 12 | conf_mask = torch.ones(nB, nA, nH, nW) * noobject_scale 13 | coord_mask = torch.zeros(nB, nA, nH, nW) 14 | cls_mask = torch.zeros(nB, nA, nH, nW) 15 | tx = torch.zeros(nB, nA, nH, nW) 16 | ty = torch.zeros(nB, nA, nH, nW) 17 | tw = torch.zeros(nB, nA, nH, nW) 18 | th = torch.zeros(nB, nA, nH, nW) 19 | tconf = torch.zeros(nB, nA, nH, nW) 20 | tcls = torch.zeros(nB, nA, nH, nW) 21 | 22 | nAnchors = nA * nH * nW 23 | nPixels = nH * nW 24 | for b in range(nB): 25 | cur_pred_boxes = pred_boxes[b * nAnchors:(b + 1) * nAnchors].t() 26 | cur_ious = torch.zeros(nAnchors) 27 | for t in range(50): 28 | if target[b][t * 5 + 1] == 0: 29 | break 30 | gx = target[b][t * 5 + 1] * nW 31 | gy = target[b][t * 5 + 2] * nH 32 | gw = target[b][t * 5 + 3] * nW 33 | gh = target[b][t * 5 + 4] * nH 34 | cur_gt_boxes = torch.FloatTensor([gx, gy, gw, gh]).repeat(nAnchors, 1).t() 35 | cur_ious = torch.max(cur_ious,
bbox_ious(cur_pred_boxes, cur_gt_boxes, x1y1x2y2=False)) 36 | conf_mask[b][cur_ious > sil_thresh] = 0 37 | if seen < 12800: 38 | if anchor_step == 4: 39 | tx = torch.FloatTensor(anchors).view(nA, anchor_step).index_select(1, torch.LongTensor([2])).view(1, nA, 1, 40 | 1).repeat( 41 | nB, 1, nH, nW) 42 | ty = torch.FloatTensor(anchors).view(num_anchors, anchor_step).index_select(1, torch.LongTensor([2])).view( 43 | 1, nA, 1, 1).repeat(nB, 1, nH, nW) 44 | else: 45 | tx.fill_(0.5) 46 | ty.fill_(0.5) 47 | tw.zero_() 48 | th.zero_() 49 | coord_mask.fill_(1) 50 | 51 | nGT = 0 52 | nCorrect = 0 53 | for b in range(nB): 54 | for t in range(50): 55 | if target[b][t * 5 + 1] == 0: 56 | break 57 | nGT = nGT + 1 58 | best_iou = 0.0 59 | best_n = -1 60 | min_dist = 10000 61 | gx = target[b][t * 5 + 1] * nW 62 | gy = target[b][t * 5 + 2] * nH 63 | gi = int(gx) 64 | gj = int(gy) 65 | gw = target[b][t * 5 + 3] * nW 66 | gh = target[b][t * 5 + 4] * nH 67 | gt_box = [0, 0, gw, gh] 68 | for n in range(nA): 69 | aw = anchors[anchor_step * n] 70 | ah = anchors[anchor_step * n + 1] 71 | anchor_box = [0, 0, aw, ah] 72 | iou = bbox_iou(anchor_box, gt_box, x1y1x2y2=False) 73 | if anchor_step == 4: 74 | ax = anchors[anchor_step * n + 2] 75 | ay = anchors[anchor_step * n + 3] 76 | dist = pow(((gi + ax) - gx), 2) + pow(((gj + ay) - gy), 2) 77 | if iou > best_iou: 78 | best_iou = iou 79 | best_n = n 80 | elif anchor_step == 4 and iou == best_iou and dist < min_dist: 81 | best_iou = iou 82 | best_n = n 83 | min_dist = dist 84 | 85 | gt_box = [gx, gy, gw, gh] 86 | pred_box = pred_boxes[b * nAnchors + best_n * nPixels + gj * nW + gi] 87 | 88 | coord_mask[b][best_n][gj][gi] = 1 89 | cls_mask[b][best_n][gj][gi] = 1 90 | conf_mask[b][best_n][gj][gi] = object_scale 91 | tx[b][best_n][gj][gi] = target[b][t * 5 + 1] * nW - gi 92 | ty[b][best_n][gj][gi] = target[b][t * 5 + 2] * nH - gj 93 | tw[b][best_n][gj][gi] = math.log(gw / anchors[anchor_step * best_n]) 94 | th[b][best_n][gj][gi] = math.log(gh / anchors[anchor_step * best_n + 1]) 95 | iou = bbox_iou(gt_box, pred_box, x1y1x2y2=False) # best_iou 96 | tconf[b][best_n][gj][gi] = iou 97 | tcls[b][best_n][gj][gi] = target[b][t * 5] 98 | if iou > 0.5: 99 | nCorrect = nCorrect + 1 100 | 101 | return nGT, nCorrect, coord_mask, conf_mask, cls_mask, tx, ty, tw, th, tconf, tcls 102 | 103 | 104 | class YoloLayer(nn.Module): 105 | ''' Yolo layer 106 | model_out: during inference, selects whether post-processing is done inside or outside the model. 107 | True: outside (the raw feature map is returned and decoded by the caller). 108 | ''' 109 | def __init__(self, anchor_mask=[], num_classes=0, anchors=[], num_anchors=1, stride=32, model_out=True): 110 | super(YoloLayer, self).__init__() 111 | self.anchor_mask = anchor_mask 112 | self.num_classes = num_classes 113 | self.anchors = anchors 114 | self.num_anchors = num_anchors 115 | self.anchor_step = len(anchors) // num_anchors 116 | self.coord_scale = 1 117 | self.noobject_scale = 1 118 | self.object_scale = 5 119 | self.class_scale = 1 120 | self.thresh = 0.6 121 | self.stride = stride 122 | self.seen = 0 123 | 124 | self.model_out = model_out 125 | 126 | def forward(self, output, target=None): 127 | if self.training: 128 | # output : BxAs*(4+1+num_classes)*H*W 129 | t0 = time.time() 130 | nB = output.data.size(0) 131 | nA = self.num_anchors 132 | nC = self.num_classes 133 | nH = output.data.size(2) 134 | nW = output.data.size(3) 135 | 136 | output = output.view(nB, nA, (5 + nC), nH, nW) 137 | x = F.sigmoid(output.index_select(2, Variable(torch.cuda.LongTensor([0]))).view(nB, nA, nH, nW)) 138 | y =
F.sigmoid(output.index_select(2, Variable(torch.cuda.LongTensor([1]))).view(nB, nA, nH, nW)) 139 | w = output.index_select(2, Variable(torch.cuda.LongTensor([2]))).view(nB, nA, nH, nW) 140 | h = output.index_select(2, Variable(torch.cuda.LongTensor([3]))).view(nB, nA, nH, nW) 141 | conf = F.sigmoid(output.index_select(2, Variable(torch.cuda.LongTensor([4]))).view(nB, nA, nH, nW)) 142 | cls = output.index_select(2, Variable(torch.linspace(5, 5 + nC - 1, nC).long().cuda())) 143 | cls = cls.view(nB * nA, nC, nH * nW).transpose(1, 2).contiguous().view(nB * nA * nH * nW, nC) 144 | t1 = time.time() 145 | 146 | pred_boxes = torch.cuda.FloatTensor(4, nB * nA * nH * nW) 147 | grid_x = torch.linspace(0, nW - 1, nW).repeat(nH, 1).repeat(nB * nA, 1, 1).view(nB * nA * nH * nW).cuda() 148 | grid_y = torch.linspace(0, nH - 1, nH).repeat(nW, 1).t().repeat(nB * nA, 1, 1).view( 149 | nB * nA * nH * nW).cuda() 150 | anchor_w = torch.Tensor(self.anchors).view(nA, self.anchor_step).index_select(1, 151 | torch.LongTensor([0])).cuda() 152 | anchor_h = torch.Tensor(self.anchors).view(nA, self.anchor_step).index_select(1, 153 | torch.LongTensor([1])).cuda() 154 | anchor_w = anchor_w.repeat(nB, 1).repeat(1, 1, nH * nW).view(nB * nA * nH * nW) 155 | anchor_h = anchor_h.repeat(nB, 1).repeat(1, 1, nH * nW).view(nB * nA * nH * nW) 156 | pred_boxes[0] = x.data + grid_x 157 | pred_boxes[1] = y.data + grid_y 158 | pred_boxes[2] = torch.exp(w.data) * anchor_w 159 | pred_boxes[3] = torch.exp(h.data) * anchor_h 160 | pred_boxes = convert2cpu(pred_boxes.transpose(0, 1).contiguous().view(-1, 4)) 161 | t2 = time.time() 162 | 163 | nGT, nCorrect, coord_mask, conf_mask, cls_mask, tx, ty, tw, th, tconf, tcls = build_targets(pred_boxes, 164 | target.data, 165 | self.anchors, 166 | nA, nC, \ 167 | nH, nW, 168 | self.noobject_scale, 169 | self.object_scale, 170 | self.thresh, 171 | self.seen) 172 | cls_mask = (cls_mask == 1) 173 | nProposals = int((conf > 0.25).sum().item()) 174 | 175 | tx = Variable(tx.cuda()) 176 | ty = Variable(ty.cuda()) 177 | tw = Variable(tw.cuda()) 178 | th = Variable(th.cuda()) 179 | tconf = Variable(tconf.cuda()) 180 | tcls = Variable(tcls[cls_mask].long().cuda())  # boolean mask has the same shape as tcls 181 | 182 | coord_mask = Variable(coord_mask.cuda()) 183 | conf_mask = Variable(conf_mask.cuda().sqrt()) 184 | cls_mask = Variable(cls_mask.view(-1, 1).repeat(1, nC).cuda()) 185 | cls = cls[cls_mask].view(-1, nC) 186 | 187 | t3 = time.time() 188 | 189 | loss_x = self.coord_scale * nn.MSELoss(size_average=False)(x * coord_mask, tx * coord_mask) / 2.0 190 | loss_y = self.coord_scale * nn.MSELoss(size_average=False)(y * coord_mask, ty * coord_mask) / 2.0 191 | loss_w = self.coord_scale * nn.MSELoss(size_average=False)(w * coord_mask, tw * coord_mask) / 2.0 192 | loss_h = self.coord_scale * nn.MSELoss(size_average=False)(h * coord_mask, th * coord_mask) / 2.0 193 | loss_conf = nn.MSELoss(size_average=False)(conf * conf_mask, tconf * conf_mask) / 2.0 194 | loss_cls = self.class_scale * nn.CrossEntropyLoss(size_average=False)(cls, tcls) 195 | loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls 196 | t4 = time.time() 197 | if False: 198 | print('-----------------------------------') 199 | print(' activation : %f' % (t1 - t0)) 200 | print(' create pred_boxes : %f' % (t2 - t1)) 201 | print(' build targets : %f' % (t3 - t2)) 202 | print(' create loss : %f' % (t4 - t3)) 203 | print(' total : %f' % (t4 - t0)) 204 | print('%d: nGT %d, recall %d, proposals %d, loss: x %f, y %f, w %f, h %f, conf %f, cls %f, total %f' % ( 205 | self.seen,
nGT, nCorrect, nProposals, loss_x.item(), loss_y.item(), loss_w.item(), loss_h.item(), 206 | loss_conf.item(), loss_cls.item(), loss.item())) 207 | return loss 208 | else: 209 | if self.model_out: 210 | return output 211 | else: 212 | masked_anchors = [] 213 | for m in self.anchor_mask: 214 | masked_anchors += self.anchors[m * self.anchor_step:(m + 1) * self.anchor_step] 215 | masked_anchors = [anchor / self.stride for anchor in masked_anchors] 216 | boxes = get_region_boxes_in_model(output.data, self.thresh, self.num_classes, masked_anchors, len(self.anchor_mask)) 217 | return boxes 218 | --------------------------------------------------------------------------------
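
The utilities in tool/utils.py together cover the out-of-model inference path: do_detect normalizes the input image, runs the network, and hands the three feature maps to post_processing, which decodes them with the hard-coded COCO anchors and applies nms. The sketch below is one way to wire load_class_names, do_detect, and plot_boxes_cv2 together; the Darknet wrapper (and its width/height attributes), the cfg/weights paths, and the thresholds are assumptions for illustration rather than code shown above.

import cv2
from tool.darknet2pytorch import Darknet  # assumed wrapper from this repository
from tool.utils import load_class_names, do_detect, plot_boxes_cv2

# Hypothetical paths; point these at your own cfg, weights and names files.
model = Darknet('cfg/yolov4.cfg')
model.load_weights('yolov4.weights')
model.cuda()

class_names = load_class_names('data/coco.names')

img = cv2.imread('data/dog.jpg')  # BGR, original resolution
sized = cv2.cvtColor(cv2.resize(img, (model.width, model.height)), cv2.COLOR_BGR2RGB)

# do_detect returns boxes as [bcx, bcy, bw, bh, det_conf, cls_conf, cls_id]
# with coordinates normalized to [0, 1]; plot_boxes_cv2 rescales them to pixels.
boxes = do_detect(model, sized, conf_thresh=0.4, n_classes=80, nms_thresh=0.6, use_cuda=1)
plot_boxes_cv2(img, boxes, savename='predictions.jpg', class_names=class_names)

Note that post_processing hard-codes the COCO anchor set and strides, so this sketch assumes an 80-class, COCO-style model; a custom model would need those constants adjusted.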