├── .dockerignore ├── .github └── ISSUE_TEMPLATE │ ├── --bug-report.md │ └── --feature-request.md ├── .gitignore ├── .slurm.sh.swp ├── Dockerfile ├── README.md ├── cfg ├── yolov3-1cls.cfg ├── yolov3-spp-1cls.cfg ├── yolov3-spp-3cls.cfg ├── yolov3-spp-matrix.cfg ├── yolov3-spp-pan-scale.cfg ├── yolov3-spp.cfg ├── yolov3-spp3.cfg ├── yolov3-tiny-1cls.cfg ├── yolov3-tiny-3cls.cfg ├── yolov3-tiny-mask.cfg ├── yolov3-tiny.cfg ├── yolov3-tiny3-1cls.cfg ├── yolov3-tiny3.cfg ├── yolov3.cfg └── yolov3s.cfg ├── change_name.ipynb ├── detect.py ├── makeMain.ipynb ├── mask_on.wav ├── models.py ├── project ├── datasets.py ├── gcp.sh ├── parse_config.py ├── torch_utils.py └── utils.py ├── record.py ├── requirements.txt ├── slurm.sh ├── test.py ├── train.py ├── voc_label.ipynb └── weights └── download_yolov3_weights.sh /.dockerignore: -------------------------------------------------------------------------------- 1 | # Repo-specific DockerIgnore ------------------------------------------------------------------------------------------- 2 | # .git 3 | .cache 4 | .idea 5 | runs 6 | output 7 | coco 8 | storage.googleapis.com 9 | 10 | data/samples/* 11 | !data/samples/zidane.jpg 12 | !data/samples/bus.jpg 13 | **/results*.txt 14 | *.jpg 15 | 16 | # Neural Network weights ----------------------------------------------------------------------------------------------- 17 | **/*.weights 18 | **/*.pt 19 | **/*.onnx 20 | **/*.mlmodel 21 | **/darknet53.conv.74 22 | **/yolov3-tiny.conv.15 23 | 24 | 25 | # Below Copied From .gitignore ----------------------------------------------------------------------------------------- 26 | # Below Copied From .gitignore ----------------------------------------------------------------------------------------- 27 | 28 | 29 | # GitHub Python GitIgnore ---------------------------------------------------------------------------------------------- 30 | # Byte-compiled / optimized / DLL files 31 | __pycache__/ 32 | *.py[cod] 33 | *$py.class 34 | 35 | # C extensions 36 | *.so 37 | 38 | # Distribution / packaging 39 | .Python 40 | env/ 41 | build/ 42 | develop-eggs/ 43 | dist/ 44 | downloads/ 45 | eggs/ 46 | .eggs/ 47 | lib/ 48 | lib64/ 49 | parts/ 50 | sdist/ 51 | var/ 52 | wheels/ 53 | *.egg-info/ 54 | .installed.cfg 55 | *.egg 56 | 57 | # PyInstaller 58 | # Usually these files are written by a python script from a template 59 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
60 | *.manifest 61 | *.spec 62 | 63 | # Installer logs 64 | pip-log.txt 65 | pip-delete-this-directory.txt 66 | 67 | # Unit test / coverage reports 68 | htmlcov/ 69 | .tox/ 70 | .coverage 71 | .coverage.* 72 | .cache 73 | nosetests.xml 74 | coverage.xml 75 | *.cover 76 | .hypothesis/ 77 | 78 | # Translations 79 | *.mo 80 | *.pot 81 | 82 | # Django stuff: 83 | *.log 84 | local_settings.py 85 | 86 | # Flask stuff: 87 | instance/ 88 | .webassets-cache 89 | 90 | # Scrapy stuff: 91 | .scrapy 92 | 93 | # Sphinx documentation 94 | docs/_build/ 95 | 96 | # PyBuilder 97 | target/ 98 | 99 | # Jupyter Notebook 100 | .ipynb_checkpoints 101 | 102 | # pyenv 103 | .python-version 104 | 105 | # celery beat schedule file 106 | celerybeat-schedule 107 | 108 | # SageMath parsed files 109 | *.sage.py 110 | 111 | # dotenv 112 | .env 113 | 114 | # virtualenv 115 | .venv 116 | venv/ 117 | ENV/ 118 | 119 | # Spyder project settings 120 | .spyderproject 121 | .spyproject 122 | 123 | # Rope project settings 124 | .ropeproject 125 | 126 | # mkdocs documentation 127 | /site 128 | 129 | # mypy 130 | .mypy_cache/ 131 | 132 | 133 | # https://github.com/github/gitignore/blob/master/Global/macOS.gitignore ----------------------------------------------- 134 | 135 | # General 136 | .DS_Store 137 | .AppleDouble 138 | .LSOverride 139 | 140 | # Icon must end with two \r 141 | Icon 142 | Icon? 143 | 144 | # Thumbnails 145 | ._* 146 | 147 | # Files that might appear in the root of a volume 148 | .DocumentRevisions-V100 149 | .fseventsd 150 | .Spotlight-V100 151 | .TemporaryItems 152 | .Trashes 153 | .VolumeIcon.icns 154 | .com.apple.timemachine.donotpresent 155 | 156 | # Directories potentially created on remote AFP share 157 | .AppleDB 158 | .AppleDesktop 159 | Network Trash Folder 160 | Temporary Items 161 | .apdisk 162 | 163 | 164 | # https://github.com/github/gitignore/blob/master/Global/JetBrains.gitignore 165 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm 166 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 167 | 168 | # User-specific stuff: 169 | .idea/* 170 | .idea/**/workspace.xml 171 | .idea/**/tasks.xml 172 | .idea/dictionaries 173 | .html # Bokeh Plots 174 | .pg # TensorFlow Frozen Graphs 175 | .avi # videos 176 | 177 | # Sensitive or high-churn files: 178 | .idea/**/dataSources/ 179 | .idea/**/dataSources.ids 180 | .idea/**/dataSources.local.xml 181 | .idea/**/sqlDataSources.xml 182 | .idea/**/dynamic.xml 183 | .idea/**/uiDesigner.xml 184 | 185 | # Gradle: 186 | .idea/**/gradle.xml 187 | .idea/**/libraries 188 | 189 | # CMake 190 | cmake-build-debug/ 191 | cmake-build-release/ 192 | 193 | # Mongo Explorer plugin: 194 | .idea/**/mongoSettings.xml 195 | 196 | ## File-based project format: 197 | *.iws 198 | 199 | ## Plugin-specific files: 200 | 201 | # IntelliJ 202 | out/ 203 | 204 | # mpeltonen/sbt-idea plugin 205 | .idea_modules/ 206 | 207 | # JIRA plugin 208 | atlassian-ide-plugin.xml 209 | 210 | # Cursive Clojure plugin 211 | .idea/replstate.xml 212 | 213 | # Crashlytics plugin (for Android Studio and IntelliJ) 214 | com_crashlytics_export_strings.xml 215 | crashlytics.properties 216 | crashlytics-build.properties 217 | fabric.properties 218 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/--bug-report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "\U0001F41BBug report" 3 | about: Create a report to help us 
improve 4 | title: '' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | ## 🐛 Bug 11 | A clear and concise description of what the bug is. 12 | 13 | ## To Reproduce 14 | Steps to reproduce the behavior: 15 | 1. 16 | 2. 17 | 3. 18 | 19 | ## Expected behavior 20 | A clear and concise description of what you expected to happen. 21 | 22 | ## Environment 23 | If applicable, add screenshots to help explain your problem. 24 | 25 | **Desktop (please complete the following information):** 26 | - OS: [e.g. iOS] 27 | - Version [e.g. 22] 28 | 29 | **Smartphone (please complete the following information):** 30 | - Device: [e.g. iPhoneXS] 31 | - OS: [e.g. iOS8.1] 32 | - Version [e.g. 22] 33 | 34 | ## Additional context 35 | Add any other context about the problem here. 36 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/--feature-request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "\U0001F680Feature request" 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | ## 🚀 Feature 11 | 12 | 13 | ## Motivation 14 | 15 | 16 | 17 | ## Pitch 18 | 19 | 20 | 21 | ## Alternatives 22 | 23 | 24 | 25 | ## Additional context 26 | 27 | 28 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Repo-specific GitIgnore ---------------------------------------------------------------------------------------------- 2 | *.jpg 3 | *.png 4 | *.bmp 5 | *.tif 6 | *.heic 7 | *.JPG 8 | *.PNG 9 | *.TIF 10 | *.HEIC 11 | *.mp4 12 | *.mov 13 | *.MOV 14 | *.avi 15 | *.data 16 | *.json 17 | 18 | *.cfg 19 | !cfg/yolov3*.cfg 20 | 21 | storage.googleapis.com 22 | runs/* 23 | data/* 24 | !data/samples/zidane.jpg 25 | !data/samples/bus.jpg 26 | !data/coco.names 27 | !data/coco_paper.names 28 | !data/coco.data 29 | !data/coco_*.data 30 | !data/coco_*.txt 31 | !data/trainvalno5k.shapes 32 | !data/*.sh 33 | 34 | pycocotools/* 35 | results*.txt 36 | gcp_test*.sh 37 | 38 | # MATLAB GitIgnore ----------------------------------------------------------------------------------------------------- 39 | *.m~ 40 | *.mat 41 | !targets*.mat 42 | 43 | # Neural Network weights ----------------------------------------------------------------------------------------------- 44 | *.weights 45 | *.pt 46 | *.onnx 47 | *.mlmodel 48 | darknet53.conv.74 49 | yolov3-tiny.conv.15 50 | 51 | # GitHub Python GitIgnore ---------------------------------------------------------------------------------------------- 52 | # Byte-compiled / optimized / DLL files 53 | __pycache__/ 54 | *.py[cod] 55 | *$py.class 56 | 57 | # C extensions 58 | *.so 59 | 60 | # Distribution / packaging 61 | .Python 62 | env/ 63 | build/ 64 | develop-eggs/ 65 | dist/ 66 | downloads/ 67 | eggs/ 68 | .eggs/ 69 | lib/ 70 | lib64/ 71 | parts/ 72 | sdist/ 73 | var/ 74 | wheels/ 75 | *.egg-info/ 76 | .installed.cfg 77 | *.egg 78 | 79 | # PyInstaller 80 | # Usually these files are written by a python script from a template 81 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
82 | *.manifest 83 | *.spec 84 | 85 | # Installer logs 86 | pip-log.txt 87 | pip-delete-this-directory.txt 88 | 89 | # Unit test / coverage reports 90 | htmlcov/ 91 | .tox/ 92 | .coverage 93 | .coverage.* 94 | .cache 95 | nosetests.xml 96 | coverage.xml 97 | *.cover 98 | .hypothesis/ 99 | 100 | # Translations 101 | *.mo 102 | *.pot 103 | 104 | # Django stuff: 105 | *.log 106 | local_settings.py 107 | 108 | # Flask stuff: 109 | instance/ 110 | .webassets-cache 111 | 112 | # Scrapy stuff: 113 | .scrapy 114 | 115 | # Sphinx documentation 116 | docs/_build/ 117 | 118 | # PyBuilder 119 | target/ 120 | 121 | # Jupyter Notebook 122 | .ipynb_checkpoints 123 | 124 | # pyenv 125 | .python-version 126 | 127 | # celery beat schedule file 128 | celerybeat-schedule 129 | 130 | # SageMath parsed files 131 | *.sage.py 132 | 133 | # dotenv 134 | .env 135 | 136 | # virtualenv 137 | .venv 138 | venv/ 139 | ENV/ 140 | 141 | # Spyder project settings 142 | .spyderproject 143 | .spyproject 144 | 145 | # Rope project settings 146 | .ropeproject 147 | 148 | # mkdocs documentation 149 | /site 150 | 151 | # mypy 152 | .mypy_cache/ 153 | 154 | 155 | # https://github.com/github/gitignore/blob/master/Global/macOS.gitignore ----------------------------------------------- 156 | 157 | # General 158 | .DS_Store 159 | .AppleDouble 160 | .LSOverride 161 | 162 | # Icon must end with two \r 163 | Icon 164 | Icon? 165 | 166 | # Thumbnails 167 | ._* 168 | 169 | # Files that might appear in the root of a volume 170 | .DocumentRevisions-V100 171 | .fseventsd 172 | .Spotlight-V100 173 | .TemporaryItems 174 | .Trashes 175 | .VolumeIcon.icns 176 | .com.apple.timemachine.donotpresent 177 | 178 | # Directories potentially created on remote AFP share 179 | .AppleDB 180 | .AppleDesktop 181 | Network Trash Folder 182 | Temporary Items 183 | .apdisk 184 | 185 | 186 | # https://github.com/github/gitignore/blob/master/Global/JetBrains.gitignore 187 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm 188 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 189 | 190 | # User-specific stuff: 191 | .idea/* 192 | .idea/**/workspace.xml 193 | .idea/**/tasks.xml 194 | .idea/dictionaries 195 | .html # Bokeh Plots 196 | .pg # TensorFlow Frozen Graphs 197 | .avi # videos 198 | 199 | # Sensitive or high-churn files: 200 | .idea/**/dataSources/ 201 | .idea/**/dataSources.ids 202 | .idea/**/dataSources.local.xml 203 | .idea/**/sqlDataSources.xml 204 | .idea/**/dynamic.xml 205 | .idea/**/uiDesigner.xml 206 | 207 | # Gradle: 208 | .idea/**/gradle.xml 209 | .idea/**/libraries 210 | 211 | # CMake 212 | cmake-build-debug/ 213 | cmake-build-release/ 214 | 215 | # Mongo Explorer plugin: 216 | .idea/**/mongoSettings.xml 217 | 218 | ## File-based project format: 219 | *.iws 220 | 221 | ## Plugin-specific files: 222 | 223 | # IntelliJ 224 | out/ 225 | 226 | # mpeltonen/sbt-idea plugin 227 | .idea_modules/ 228 | 229 | # JIRA plugin 230 | atlassian-ide-plugin.xml 231 | 232 | # Cursive Clojure plugin 233 | .idea/replstate.xml 234 | 235 | # Crashlytics plugin (for Android Studio and IntelliJ) 236 | com_crashlytics_export_strings.xml 237 | crashlytics.properties 238 | crashlytics-build.properties 239 | fabric.properties 240 | -------------------------------------------------------------------------------- /.slurm.sh.swp: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zhengmingzhang/mask-detection/ff8a57b81ced6bc3fa6c1ae01f3b08cf1cb23e60/.slurm.sh.swp -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Start FROM Nvidia PyTorch image https://ngc.nvidia.com/catalog/containers/nvidia:pytorch 2 | FROM nvcr.io/nvidia/pytorch:20.01-py3 3 | 4 | # Install dependencies (pip or conda) 5 | RUN pip install -U gsutil 6 | # RUN pip install -U -r requirements.txt 7 | # RUN conda update -n base -c defaults conda 8 | # RUN conda install -y -c anaconda future numpy opencv matplotlib tqdm pillow 9 | # RUN conda install -y -c conda-forge scikit-image tensorboard pycocotools 10 | 11 | ## Install OpenCV with Gstreamer support 12 | #WORKDIR /usr/src 13 | #RUN pip uninstall -y opencv-python 14 | #RUN apt-get update 15 | #RUN apt-get install -y gstreamer1.0-tools gstreamer1.0-python3-dbg-plugin-loader libgstreamer1.0-dev libgstreamer-plugins-base1.0-dev 16 | #RUN git clone https://github.com/opencv/opencv.git && cd opencv && git checkout 4.1.1 && mkdir build 17 | #RUN git clone https://github.com/opencv/opencv_contrib.git && cd opencv_contrib && git checkout 4.1.1 18 | #RUN cd opencv/build && cmake ../ \ 19 | # -D OPENCV_EXTRA_MODULES_PATH=../../opencv_contrib/modules \ 20 | # -D BUILD_OPENCV_PYTHON3=ON \ 21 | # -D PYTHON3_EXECUTABLE=/opt/conda/bin/python \ 22 | # -D PYTHON3_INCLUDE_PATH=/opt/conda/include/python3.6m \ 23 | # -D PYTHON3_LIBRARIES=/opt/conda/lib/python3.6/site-packages \ 24 | # -D WITH_GSTREAMER=ON \ 25 | # -D WITH_FFMPEG=OFF \ 26 | # && make && make install && ldconfig 27 | #RUN cd /usr/local/lib/python3.6/site-packages/cv2/python-3.6/ && mv cv2.cpython-36m-x86_64-linux-gnu.so cv2.so 28 | #RUN cd /opt/conda/lib/python3.6/site-packages/ && ln -s /usr/local/lib/python3.6/site-packages/cv2/python-3.6/cv2.so cv2.so 29 | #RUN python3 -c "import cv2; print(cv2.getBuildInformation())" 30 | 31 | # Create working directory 32 | RUN mkdir -p /usr/src/app 33 | WORKDIR /usr/src/app 34 | 35 | # Copy contents 36 | COPY . /usr/src/app 37 | 38 | # Copy weights 39 | #RUN python3 -c "from models import *; \ 40 | #attempt_download('weights/yolov3.pt'); \ 41 | #attempt_download('weights/yolov3-spp.pt')" 42 | 43 | 44 | # --------------------------------------------------- Extras Below --------------------------------------------------- 45 | 46 | # Build and Push 47 | # t=ultralytics/yolov3:v0 && sudo docker build -t $t . 
&& sudo docker push $t 48 | 49 | # Run 50 | # t=ultralytics/yolov3:v0 && sudo docker pull $t && sudo docker run -it $t bash 51 | 52 | # Pull and Run with local directory access 53 | # t=ultralytics/yolov3:v0 && sudo docker pull $t && sudo docker run -it -v "$(pwd)"/coco:/usr/src/coco $t bash 54 | 55 | # Kill all 56 | # sudo docker kill "$(sudo docker ps -q)" 57 | 58 | # Kill all image-based 59 | # sudo docker kill $(sudo docker ps -a -q --filter ancestor=ultralytics/yolov3:v0) 60 | 61 | # Run bash for loop 62 | # sudo docker run --gpus all --ipc=host ultralytics/yolov3:v0 while true; do python3 train.py --evolve; done 63 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # mask-detection 2 | ## Project Overview 3 | In response to the current epidemic, major tech companies have started using technology to support prevention efforts. This project trains YOLOv3 on a face-mask dataset; the resulting model can recognize whether a person is wearing a mask. If someone is detected without a mask, a voice prompt announces "Please wear a mask"; if the person is wearing one, the system stays silent. 4 | ## Dataset 5 | The training dataset comes from the Bilibili uploader HamlinZheng; many thanks for generously sharing it. 6 | ## How to Use This Project 7 | Link: https://pan.baidu.com/s/11z6hmBitSHG4TjilDNFJfQ 8 | Extraction code: 2zvl 9 | 10 | Put best.pt into the weights folder, create a data folder and place mask.data and mask.name in it, and after setting up the project environment, run the following from the command line: 11 | 12 | python detect.py --data-cfg data/mask.data --cfg cfg/yolov3-tiny-mask.cfg --weights weights/best.pt 13 | You will then see the results. 14 | 15 | The voice prompt is one I recorded myself and it does not sound great; if you would like to record your own reminder, simply run record.py and record your own audio. 16 | 17 | The model I used is yolov3-tiny, and the training results are actually not that good; I plan to augment the data and improve the model over the next few days. 18 | ## How to Train on Your Own Dataset 19 | To retrain on your own dataset, you need to convert it to the VOC dataset format. There are many tutorials online; the one I followed is 20 | https://blog.csdn.net/qq_21578849/article/details/84980298 21 | Anyone interested can try training it themselves. 22 | -------------------------------------------------------------------------------- /cfg/yolov3-1cls.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | #batch=1 4 | #subdivisions=1 5 | # Training 6 | batch=16 7 | subdivisions=1 8 | width=416 9 | height=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 | activation=leaky 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=32 46 | size=1 47 | stride=1 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=leaky 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [shortcut] 110 | from=-3 111 | activation=linear 112 | 113 | # 
Downsample 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=256 118 | size=3 119 | stride=2 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=128 126 | size=1 127 | stride=1 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 149 | activation=leaky 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | activation=leaky 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 188 | pad=1 189 | activation=leaky 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | [convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | stride=1 229 | pad=1 230 | activation=leaky 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=leaky 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | activation=leaky 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | stride=1 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [shortcut] 332 | 
from=-3 333 | activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=leaky 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=leaky 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | size=3 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=leaky 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [shortcut] 394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | activation=leaky 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [shortcut] 415 | from=-3 416 | activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 
552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | [convolutional] 576 | batch_normalize=1 577 | size=3 578 | stride=1 579 | pad=1 580 | filters=1024 581 | activation=leaky 582 | 583 | [convolutional] 584 | batch_normalize=1 585 | filters=512 586 | size=1 587 | stride=1 588 | pad=1 589 | activation=leaky 590 | 591 | [convolutional] 592 | batch_normalize=1 593 | size=3 594 | stride=1 595 | pad=1 596 | filters=1024 597 | activation=leaky 598 | 599 | [convolutional] 600 | size=1 601 | stride=1 602 | pad=1 603 | filters=18 604 | activation=linear 605 | 606 | 607 | [yolo] 608 | mask = 6,7,8 609 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 610 | classes=1 611 | num=9 612 | jitter=.3 613 | ignore_thresh = .7 614 | truth_thresh = 1 615 | random=1 616 | 617 | 618 | [route] 619 | layers = -4 620 | 621 | [convolutional] 622 | batch_normalize=1 623 | filters=256 624 | size=1 625 | stride=1 626 | pad=1 627 | activation=leaky 628 | 629 | [upsample] 630 | stride=2 631 | 632 | [route] 633 | layers = -1, 61 634 | 635 | 636 | 637 | [convolutional] 638 | batch_normalize=1 639 | filters=256 640 | size=1 641 | stride=1 642 | pad=1 643 | activation=leaky 644 | 645 | [convolutional] 646 | batch_normalize=1 647 | size=3 648 | stride=1 649 | pad=1 650 | filters=512 651 | activation=leaky 652 | 653 | [convolutional] 654 | batch_normalize=1 655 | filters=256 656 | size=1 657 | stride=1 658 | pad=1 659 | activation=leaky 660 | 661 | [convolutional] 662 | batch_normalize=1 663 | size=3 664 | stride=1 665 | pad=1 666 | filters=512 667 | activation=leaky 668 | 669 | [convolutional] 670 | batch_normalize=1 671 | filters=256 672 | size=1 673 | stride=1 674 | pad=1 675 | activation=leaky 676 | 677 | [convolutional] 678 | batch_normalize=1 679 | size=3 680 | stride=1 681 | pad=1 682 | filters=512 683 | activation=leaky 684 | 685 | [convolutional] 686 | size=1 687 | stride=1 688 | pad=1 689 | filters=18 690 | activation=linear 691 | 692 | 693 | [yolo] 694 | mask = 3,4,5 695 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 696 | classes=1 697 | num=9 698 | jitter=.3 699 | ignore_thresh = .7 700 | truth_thresh = 1 701 | random=1 702 | 703 | 704 | 705 | [route] 706 | layers = -4 707 | 708 | [convolutional] 709 | batch_normalize=1 710 | filters=128 711 | size=1 712 | stride=1 713 | pad=1 714 | activation=leaky 715 | 716 | [upsample] 717 | stride=2 718 | 719 | [route] 720 | layers = -1, 36 721 | 722 | 723 | 724 | [convolutional] 725 | batch_normalize=1 726 | filters=128 727 | size=1 728 | stride=1 729 | pad=1 730 | activation=leaky 731 | 732 | [convolutional] 733 | batch_normalize=1 734 | size=3 735 | stride=1 736 | pad=1 737 | filters=256 738 | activation=leaky 739 | 740 | [convolutional] 741 | batch_normalize=1 742 | filters=128 743 | size=1 744 | stride=1 745 | pad=1 746 | activation=leaky 747 | 748 | [convolutional] 749 | batch_normalize=1 750 | size=3 751 | stride=1 752 | pad=1 753 | filters=256 754 | activation=leaky 755 | 756 | [convolutional] 757 | batch_normalize=1 758 | filters=128 759 | size=1 760 | stride=1 761 | pad=1 762 | activation=leaky 763 | 764 | [convolutional] 765 | batch_normalize=1 766 | size=3 767 | stride=1 768 | 
pad=1 769 | filters=256 770 | activation=leaky 771 | 772 | [convolutional] 773 | size=1 774 | stride=1 775 | pad=1 776 | filters=18 777 | activation=linear 778 | 779 | 780 | [yolo] 781 | mask = 0,1,2 782 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 783 | classes=1 784 | num=9 785 | jitter=.3 786 | ignore_thresh = .7 787 | truth_thresh = 1 788 | random=1 789 | -------------------------------------------------------------------------------- /cfg/yolov3-spp-1cls.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | # batch=1 4 | # subdivisions=1 5 | # Training 6 | batch=64 7 | subdivisions=16 8 | width=608 9 | height=608 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=100 20 | max_batches = 5000 21 | policy=steps 22 | steps=4000,4500 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 | activation=leaky 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=32 46 | size=1 47 | stride=1 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=leaky 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [shortcut] 110 | from=-3 111 | activation=linear 112 | 113 | # Downsample 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=256 118 | size=3 119 | stride=2 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=128 126 | size=1 127 | stride=1 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 149 | activation=leaky 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | activation=leaky 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 
188 | pad=1 189 | activation=leaky 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | [convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | stride=1 229 | pad=1 230 | activation=leaky 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=leaky 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | activation=leaky 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | stride=1 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [shortcut] 332 | from=-3 333 | activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=leaky 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=leaky 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | size=3 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=leaky 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [shortcut] 394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | activation=leaky 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | 
filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [shortcut] 415 | from=-3 416 | activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | ### SPP ### 576 | [maxpool] 577 | stride=1 578 | size=5 579 | 580 | [route] 581 | layers=-2 582 | 583 | [maxpool] 584 | stride=1 585 | size=9 586 | 587 | [route] 588 | layers=-4 589 | 590 | [maxpool] 591 | stride=1 592 | size=13 593 | 594 | [route] 595 | layers=-1,-3,-5,-6 596 | 597 | ### End SPP ### 598 | 599 | [convolutional] 600 | batch_normalize=1 601 | filters=512 602 | size=1 603 | stride=1 604 | pad=1 605 | activation=leaky 606 | 607 | 608 | [convolutional] 609 | batch_normalize=1 610 | size=3 611 | stride=1 612 | pad=1 613 | filters=1024 614 | activation=leaky 615 | 616 | [convolutional] 617 | batch_normalize=1 618 | filters=512 619 | size=1 620 | stride=1 621 | pad=1 622 | activation=leaky 623 | 624 | [convolutional] 625 | batch_normalize=1 626 | size=3 627 | stride=1 628 | pad=1 629 | filters=1024 630 | 
activation=leaky 631 | 632 | [convolutional] 633 | size=1 634 | stride=1 635 | pad=1 636 | filters=18 637 | activation=linear 638 | 639 | 640 | [yolo] 641 | mask = 6,7,8 642 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 643 | classes=1 644 | num=9 645 | jitter=.3 646 | ignore_thresh = .7 647 | truth_thresh = 1 648 | random=1 649 | 650 | 651 | [route] 652 | layers = -4 653 | 654 | [convolutional] 655 | batch_normalize=1 656 | filters=256 657 | size=1 658 | stride=1 659 | pad=1 660 | activation=leaky 661 | 662 | [upsample] 663 | stride=2 664 | 665 | [route] 666 | layers = -1, 61 667 | 668 | 669 | 670 | [convolutional] 671 | batch_normalize=1 672 | filters=256 673 | size=1 674 | stride=1 675 | pad=1 676 | activation=leaky 677 | 678 | [convolutional] 679 | batch_normalize=1 680 | size=3 681 | stride=1 682 | pad=1 683 | filters=512 684 | activation=leaky 685 | 686 | [convolutional] 687 | batch_normalize=1 688 | filters=256 689 | size=1 690 | stride=1 691 | pad=1 692 | activation=leaky 693 | 694 | [convolutional] 695 | batch_normalize=1 696 | size=3 697 | stride=1 698 | pad=1 699 | filters=512 700 | activation=leaky 701 | 702 | [convolutional] 703 | batch_normalize=1 704 | filters=256 705 | size=1 706 | stride=1 707 | pad=1 708 | activation=leaky 709 | 710 | [convolutional] 711 | batch_normalize=1 712 | size=3 713 | stride=1 714 | pad=1 715 | filters=512 716 | activation=leaky 717 | 718 | [convolutional] 719 | size=1 720 | stride=1 721 | pad=1 722 | filters=18 723 | activation=linear 724 | 725 | 726 | [yolo] 727 | mask = 3,4,5 728 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 729 | classes=1 730 | num=9 731 | jitter=.3 732 | ignore_thresh = .7 733 | truth_thresh = 1 734 | random=1 735 | 736 | 737 | 738 | [route] 739 | layers = -4 740 | 741 | [convolutional] 742 | batch_normalize=1 743 | filters=128 744 | size=1 745 | stride=1 746 | pad=1 747 | activation=leaky 748 | 749 | [upsample] 750 | stride=2 751 | 752 | [route] 753 | layers = -1, 36 754 | 755 | 756 | 757 | [convolutional] 758 | batch_normalize=1 759 | filters=128 760 | size=1 761 | stride=1 762 | pad=1 763 | activation=leaky 764 | 765 | [convolutional] 766 | batch_normalize=1 767 | size=3 768 | stride=1 769 | pad=1 770 | filters=256 771 | activation=leaky 772 | 773 | [convolutional] 774 | batch_normalize=1 775 | filters=128 776 | size=1 777 | stride=1 778 | pad=1 779 | activation=leaky 780 | 781 | [convolutional] 782 | batch_normalize=1 783 | size=3 784 | stride=1 785 | pad=1 786 | filters=256 787 | activation=leaky 788 | 789 | [convolutional] 790 | batch_normalize=1 791 | filters=128 792 | size=1 793 | stride=1 794 | pad=1 795 | activation=leaky 796 | 797 | [convolutional] 798 | batch_normalize=1 799 | size=3 800 | stride=1 801 | pad=1 802 | filters=256 803 | activation=leaky 804 | 805 | [convolutional] 806 | size=1 807 | stride=1 808 | pad=1 809 | filters=18 810 | activation=linear 811 | 812 | 813 | [yolo] 814 | mask = 0,1,2 815 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 816 | classes=1 817 | num=9 818 | jitter=.3 819 | ignore_thresh = .7 820 | truth_thresh = 1 821 | random=1 822 | -------------------------------------------------------------------------------- /cfg/yolov3-spp-3cls.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | # batch=1 4 | # subdivisions=1 5 | # Training 6 | batch=64 7 | subdivisions=16 8 | width=608 9 | height=608 10 | channels=3 11 | momentum=0.9 
12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=100 20 | max_batches = 5000 21 | policy=steps 22 | steps=4000,4500 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 | activation=leaky 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=32 46 | size=1 47 | stride=1 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=leaky 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [shortcut] 110 | from=-3 111 | activation=linear 112 | 113 | # Downsample 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=256 118 | size=3 119 | stride=2 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=128 126 | size=1 127 | stride=1 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 149 | activation=leaky 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | activation=leaky 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 188 | pad=1 189 | activation=leaky 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | [convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | stride=1 229 | pad=1 230 | activation=leaky 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | 
filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=leaky 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | activation=leaky 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | stride=1 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [shortcut] 332 | from=-3 333 | activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=leaky 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=leaky 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | size=3 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=leaky 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [shortcut] 394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | activation=leaky 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [shortcut] 415 | from=-3 416 | activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | 
[shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | ### SPP ### 576 | [maxpool] 577 | stride=1 578 | size=5 579 | 580 | [route] 581 | layers=-2 582 | 583 | [maxpool] 584 | stride=1 585 | size=9 586 | 587 | [route] 588 | layers=-4 589 | 590 | [maxpool] 591 | stride=1 592 | size=13 593 | 594 | [route] 595 | layers=-1,-3,-5,-6 596 | 597 | ### End SPP ### 598 | 599 | [convolutional] 600 | batch_normalize=1 601 | filters=512 602 | size=1 603 | stride=1 604 | pad=1 605 | activation=leaky 606 | 607 | 608 | [convolutional] 609 | batch_normalize=1 610 | size=3 611 | stride=1 612 | pad=1 613 | filters=1024 614 | activation=leaky 615 | 616 | [convolutional] 617 | batch_normalize=1 618 | filters=512 619 | size=1 620 | stride=1 621 | pad=1 622 | activation=leaky 623 | 624 | [convolutional] 625 | batch_normalize=1 626 | size=3 627 | stride=1 628 | pad=1 629 | filters=1024 630 | activation=leaky 631 | 632 | [convolutional] 633 | size=1 634 | stride=1 635 | pad=1 636 | filters=24 637 | activation=linear 638 | 639 | 640 | [yolo] 641 | mask = 6,7,8 642 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 643 | classes=3 644 | num=9 645 | jitter=.3 646 | ignore_thresh = .7 647 | truth_thresh = 1 648 | random=1 649 | 650 | 651 | [route] 652 | layers = -4 653 | 654 | [convolutional] 655 | batch_normalize=1 656 | filters=256 657 | size=1 658 | stride=1 659 | pad=1 660 | activation=leaky 661 | 662 | [upsample] 663 | stride=2 664 | 665 | [route] 666 | layers = -1, 61 667 | 668 | 669 | 670 | [convolutional] 671 | batch_normalize=1 672 | filters=256 673 | size=1 674 | stride=1 675 | pad=1 676 | activation=leaky 
677 | 678 | [convolutional] 679 | batch_normalize=1 680 | size=3 681 | stride=1 682 | pad=1 683 | filters=512 684 | activation=leaky 685 | 686 | [convolutional] 687 | batch_normalize=1 688 | filters=256 689 | size=1 690 | stride=1 691 | pad=1 692 | activation=leaky 693 | 694 | [convolutional] 695 | batch_normalize=1 696 | size=3 697 | stride=1 698 | pad=1 699 | filters=512 700 | activation=leaky 701 | 702 | [convolutional] 703 | batch_normalize=1 704 | filters=256 705 | size=1 706 | stride=1 707 | pad=1 708 | activation=leaky 709 | 710 | [convolutional] 711 | batch_normalize=1 712 | size=3 713 | stride=1 714 | pad=1 715 | filters=512 716 | activation=leaky 717 | 718 | [convolutional] 719 | size=1 720 | stride=1 721 | pad=1 722 | filters=24 723 | activation=linear 724 | 725 | 726 | [yolo] 727 | mask = 3,4,5 728 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 729 | classes=3 730 | num=9 731 | jitter=.3 732 | ignore_thresh = .7 733 | truth_thresh = 1 734 | random=1 735 | 736 | 737 | 738 | [route] 739 | layers = -4 740 | 741 | [convolutional] 742 | batch_normalize=1 743 | filters=128 744 | size=1 745 | stride=1 746 | pad=1 747 | activation=leaky 748 | 749 | [upsample] 750 | stride=2 751 | 752 | [route] 753 | layers = -1, 36 754 | 755 | 756 | 757 | [convolutional] 758 | batch_normalize=1 759 | filters=128 760 | size=1 761 | stride=1 762 | pad=1 763 | activation=leaky 764 | 765 | [convolutional] 766 | batch_normalize=1 767 | size=3 768 | stride=1 769 | pad=1 770 | filters=256 771 | activation=leaky 772 | 773 | [convolutional] 774 | batch_normalize=1 775 | filters=128 776 | size=1 777 | stride=1 778 | pad=1 779 | activation=leaky 780 | 781 | [convolutional] 782 | batch_normalize=1 783 | size=3 784 | stride=1 785 | pad=1 786 | filters=256 787 | activation=leaky 788 | 789 | [convolutional] 790 | batch_normalize=1 791 | filters=128 792 | size=1 793 | stride=1 794 | pad=1 795 | activation=leaky 796 | 797 | [convolutional] 798 | batch_normalize=1 799 | size=3 800 | stride=1 801 | pad=1 802 | filters=256 803 | activation=leaky 804 | 805 | [convolutional] 806 | size=1 807 | stride=1 808 | pad=1 809 | filters=24 810 | activation=linear 811 | 812 | 813 | [yolo] 814 | mask = 0,1,2 815 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 816 | classes=3 817 | num=9 818 | jitter=.3 819 | ignore_thresh = .7 820 | truth_thresh = 1 821 | random=1 822 | -------------------------------------------------------------------------------- /cfg/yolov3-spp-pan-scale.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | #batch=1 4 | #subdivisions=1 5 | # Training 6 | batch=64 7 | subdivisions=32 8 | width=544 9 | height=544 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | 19 | learning_rate=0.001 20 | burn_in=1000 21 | max_batches = 10000 22 | 23 | policy=steps 24 | steps=8000,9000 25 | scales=.1,.1 26 | 27 | #policy=sgdr 28 | #sgdr_cycle=1000 29 | #sgdr_mult=2 30 | #steps=4000,6000,8000,9000 31 | #scales=1, 1, 0.1, 0.1 32 | 33 | [convolutional] 34 | batch_normalize=1 35 | filters=32 36 | size=3 37 | stride=1 38 | pad=1 39 | activation=leaky 40 | 41 | # Downsample 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=64 46 | size=3 47 | stride=2 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=32 54 | size=1 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | 
[convolutional] 60 | batch_normalize=1 61 | filters=64 62 | size=3 63 | stride=1 64 | pad=1 65 | activation=leaky 66 | 67 | [shortcut] 68 | from=-3 69 | activation=linear 70 | 71 | # Downsample 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=128 76 | size=3 77 | stride=2 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=64 84 | size=1 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [convolutional] 90 | batch_normalize=1 91 | filters=128 92 | size=3 93 | stride=1 94 | pad=1 95 | activation=leaky 96 | 97 | [shortcut] 98 | from=-3 99 | activation=linear 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=64 104 | size=1 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [convolutional] 110 | batch_normalize=1 111 | filters=128 112 | size=3 113 | stride=1 114 | pad=1 115 | activation=leaky 116 | 117 | [shortcut] 118 | from=-3 119 | activation=linear 120 | 121 | # Downsample 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=256 126 | size=3 127 | stride=2 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=128 134 | size=1 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [convolutional] 140 | batch_normalize=1 141 | filters=256 142 | size=3 143 | stride=1 144 | pad=1 145 | activation=leaky 146 | 147 | [shortcut] 148 | from=-3 149 | activation=linear 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=128 154 | size=1 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [convolutional] 160 | batch_normalize=1 161 | filters=256 162 | size=3 163 | stride=1 164 | pad=1 165 | activation=leaky 166 | 167 | [shortcut] 168 | from=-3 169 | activation=linear 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=128 174 | size=1 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [convolutional] 180 | batch_normalize=1 181 | filters=256 182 | size=3 183 | stride=1 184 | pad=1 185 | activation=leaky 186 | 187 | [shortcut] 188 | from=-3 189 | activation=linear 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=128 194 | size=1 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [convolutional] 200 | batch_normalize=1 201 | filters=256 202 | size=3 203 | stride=1 204 | pad=1 205 | activation=leaky 206 | 207 | [shortcut] 208 | from=-3 209 | activation=linear 210 | 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=128 215 | size=1 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [convolutional] 221 | batch_normalize=1 222 | filters=256 223 | size=3 224 | stride=1 225 | pad=1 226 | activation=leaky 227 | 228 | [shortcut] 229 | from=-3 230 | activation=linear 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=128 235 | size=1 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [convolutional] 241 | batch_normalize=1 242 | filters=256 243 | size=3 244 | stride=1 245 | pad=1 246 | activation=leaky 247 | 248 | [shortcut] 249 | from=-3 250 | activation=linear 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=128 255 | size=1 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [convolutional] 261 | batch_normalize=1 262 | filters=256 263 | size=3 264 | stride=1 265 | pad=1 266 | activation=leaky 267 | 268 | [shortcut] 269 | from=-3 270 | activation=linear 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=128 275 | size=1 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | 
[convolutional] 281 | batch_normalize=1 282 | filters=256 283 | size=3 284 | stride=1 285 | pad=1 286 | activation=leaky 287 | 288 | [shortcut] 289 | from=-3 290 | activation=linear 291 | 292 | # Downsample 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=512 297 | size=3 298 | stride=2 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=256 305 | size=1 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [convolutional] 311 | batch_normalize=1 312 | filters=512 313 | size=3 314 | stride=1 315 | pad=1 316 | activation=leaky 317 | 318 | [shortcut] 319 | from=-3 320 | activation=linear 321 | 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=256 326 | size=1 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [convolutional] 332 | batch_normalize=1 333 | filters=512 334 | size=3 335 | stride=1 336 | pad=1 337 | activation=leaky 338 | 339 | [shortcut] 340 | from=-3 341 | activation=linear 342 | 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=256 347 | size=1 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [convolutional] 353 | batch_normalize=1 354 | filters=512 355 | size=3 356 | stride=1 357 | pad=1 358 | activation=leaky 359 | 360 | [shortcut] 361 | from=-3 362 | activation=linear 363 | 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=256 368 | size=1 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [convolutional] 374 | batch_normalize=1 375 | filters=512 376 | size=3 377 | stride=1 378 | pad=1 379 | activation=leaky 380 | 381 | [shortcut] 382 | from=-3 383 | activation=linear 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=256 388 | size=1 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [convolutional] 394 | batch_normalize=1 395 | filters=512 396 | size=3 397 | stride=1 398 | pad=1 399 | activation=leaky 400 | 401 | [shortcut] 402 | from=-3 403 | activation=linear 404 | 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=256 409 | size=1 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [convolutional] 415 | batch_normalize=1 416 | filters=512 417 | size=3 418 | stride=1 419 | pad=1 420 | activation=leaky 421 | 422 | [shortcut] 423 | from=-3 424 | activation=linear 425 | 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=256 430 | size=1 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [convolutional] 436 | batch_normalize=1 437 | filters=512 438 | size=3 439 | stride=1 440 | pad=1 441 | activation=leaky 442 | 443 | [shortcut] 444 | from=-3 445 | activation=linear 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=256 450 | size=1 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [convolutional] 456 | batch_normalize=1 457 | filters=512 458 | size=3 459 | stride=1 460 | pad=1 461 | activation=leaky 462 | 463 | [shortcut] 464 | from=-3 465 | activation=linear 466 | 467 | # Downsample 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=1024 472 | size=3 473 | stride=2 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=512 480 | size=1 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [convolutional] 486 | batch_normalize=1 487 | filters=1024 488 | size=3 489 | stride=1 490 | pad=1 491 | activation=leaky 492 | 493 | [shortcut] 494 | from=-3 495 | activation=linear 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=512 500 | 
size=1 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [convolutional] 506 | batch_normalize=1 507 | filters=1024 508 | size=3 509 | stride=1 510 | pad=1 511 | activation=leaky 512 | 513 | [shortcut] 514 | from=-3 515 | activation=linear 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=512 520 | size=1 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [convolutional] 526 | batch_normalize=1 527 | filters=1024 528 | size=3 529 | stride=1 530 | pad=1 531 | activation=leaky 532 | 533 | [shortcut] 534 | from=-3 535 | activation=linear 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=512 540 | size=1 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [convolutional] 546 | batch_normalize=1 547 | filters=1024 548 | size=3 549 | stride=1 550 | pad=1 551 | activation=leaky 552 | 553 | [shortcut] 554 | from=-3 555 | activation=linear 556 | 557 | ###################### 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | filters=512 562 | size=1 563 | stride=1 564 | pad=1 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | size=3 570 | stride=1 571 | pad=1 572 | filters=1024 573 | activation=leaky 574 | 575 | [convolutional] 576 | batch_normalize=1 577 | filters=512 578 | size=1 579 | stride=1 580 | pad=1 581 | activation=leaky 582 | 583 | ### SPP ### 584 | [maxpool] 585 | stride=1 586 | size=5 587 | 588 | [route] 589 | layers=-2 590 | 591 | [maxpool] 592 | stride=1 593 | size=9 594 | 595 | [route] 596 | layers=-4 597 | 598 | [maxpool] 599 | stride=1 600 | size=13 601 | 602 | [route] 603 | layers=-1,-3,-5,-6 604 | 605 | ### End SPP ### 606 | 607 | [convolutional] 608 | batch_normalize=1 609 | filters=512 610 | size=1 611 | stride=1 612 | pad=1 613 | activation=leaky 614 | 615 | 616 | [convolutional] 617 | batch_normalize=1 618 | size=3 619 | stride=1 620 | pad=1 621 | filters=1024 622 | activation=leaky 623 | 624 | [convolutional] 625 | batch_normalize=1 626 | filters=512 627 | size=1 628 | stride=1 629 | pad=1 630 | activation=leaky 631 | 632 | 633 | 634 | ########### to [yolo-3] 635 | 636 | 637 | 638 | [route] 639 | layers = -4 640 | 641 | [convolutional] 642 | batch_normalize=1 643 | filters=256 644 | size=1 645 | stride=1 646 | pad=1 647 | activation=leaky 648 | 649 | [upsample] 650 | stride=2 651 | 652 | [route] 653 | layers = -1, 61 654 | 655 | 656 | 657 | [convolutional] 658 | batch_normalize=1 659 | filters=256 660 | size=1 661 | stride=1 662 | pad=1 663 | activation=leaky 664 | 665 | [convolutional] 666 | batch_normalize=1 667 | size=3 668 | stride=1 669 | pad=1 670 | filters=512 671 | activation=leaky 672 | 673 | [convolutional] 674 | batch_normalize=1 675 | filters=256 676 | size=1 677 | stride=1 678 | pad=1 679 | activation=leaky 680 | 681 | [convolutional] 682 | batch_normalize=1 683 | size=3 684 | stride=1 685 | pad=1 686 | filters=512 687 | activation=leaky 688 | 689 | [convolutional] 690 | batch_normalize=1 691 | filters=256 692 | size=1 693 | stride=1 694 | pad=1 695 | activation=leaky 696 | 697 | 698 | ########### to [yolo-2] 699 | 700 | 701 | 702 | 703 | [route] 704 | layers = -4 705 | 706 | [convolutional] 707 | batch_normalize=1 708 | filters=128 709 | size=1 710 | stride=1 711 | pad=1 712 | activation=leaky 713 | 714 | [upsample] 715 | stride=2 716 | 717 | [route] 718 | layers = -1, 36 719 | 720 | 721 | 722 | [convolutional] 723 | batch_normalize=1 724 | filters=128 725 | size=1 726 | stride=1 727 | pad=1 728 | activation=leaky 729 | 730 | [convolutional] 731 | 
batch_normalize=1 732 | size=3 733 | stride=1 734 | pad=1 735 | filters=256 736 | activation=leaky 737 | 738 | [convolutional] 739 | batch_normalize=1 740 | filters=128 741 | size=1 742 | stride=1 743 | pad=1 744 | activation=leaky 745 | 746 | [convolutional] 747 | batch_normalize=1 748 | size=3 749 | stride=1 750 | pad=1 751 | filters=256 752 | activation=leaky 753 | 754 | [convolutional] 755 | batch_normalize=1 756 | filters=128 757 | size=1 758 | stride=1 759 | pad=1 760 | activation=leaky 761 | 762 | 763 | 764 | ########### to [yolo-1] 765 | 766 | 767 | ########### features of different layers 768 | 769 | 770 | [route] 771 | layers=1 772 | 773 | [reorg3d] 774 | stride=2 775 | 776 | [route] 777 | layers=5,-1 778 | 779 | [reorg3d] 780 | stride=2 781 | 782 | [route] 783 | layers=12,-1 784 | 785 | [reorg3d] 786 | stride=2 787 | 788 | [route] 789 | layers=37,-1 790 | 791 | [reorg3d] 792 | stride=2 793 | 794 | [route] 795 | layers=62,-1 796 | 797 | 798 | 799 | ########### [yolo-1] 800 | 801 | [convolutional] 802 | batch_normalize=1 803 | filters=128 804 | size=1 805 | stride=1 806 | pad=1 807 | activation=leaky 808 | 809 | [upsample] 810 | stride=4 811 | 812 | [route] 813 | layers = -1,-12 814 | 815 | 816 | [convolutional] 817 | batch_normalize=1 818 | size=3 819 | stride=1 820 | pad=1 821 | filters=256 822 | activation=leaky 823 | 824 | [convolutional] 825 | size=1 826 | stride=1 827 | pad=1 828 | filters=340 829 | activation=linear 830 | 831 | 832 | [yolo] 833 | mask = 0,1,2,3 834 | anchors = 8,8, 10,13, 16,30, 33,23, 32,32, 30,61, 62,45, 64,64, 59,119, 116,90, 156,198, 373,326 835 | classes=80 836 | num=12 837 | jitter=.3 838 | ignore_thresh = .7 839 | truth_thresh = 1 840 | scale_x_y = 1.05 841 | random=0 842 | 843 | 844 | 845 | 846 | ########### [yolo-2] 847 | 848 | 849 | [route] 850 | layers = -7 851 | 852 | [convolutional] 853 | batch_normalize=1 854 | filters=256 855 | size=1 856 | stride=1 857 | pad=1 858 | activation=leaky 859 | 860 | [upsample] 861 | stride=2 862 | 863 | [route] 864 | layers = -1,-28 865 | 866 | 867 | [convolutional] 868 | batch_normalize=1 869 | size=3 870 | stride=1 871 | pad=1 872 | filters=512 873 | activation=leaky 874 | 875 | [convolutional] 876 | size=1 877 | stride=1 878 | pad=1 879 | filters=340 880 | activation=linear 881 | 882 | 883 | [yolo] 884 | mask = 4,5,6,7 885 | anchors = 8,8, 10,13, 16,30, 33,23, 32,32, 30,61, 62,45, 64,64, 59,119, 116,90, 156,198, 373,326 886 | classes=80 887 | num=12 888 | jitter=.3 889 | ignore_thresh = .7 890 | truth_thresh = 1 891 | scale_x_y = 1.1 892 | random=0 893 | 894 | 895 | 896 | ########### [yolo-3] 897 | 898 | [route] 899 | layers = -14 900 | 901 | [convolutional] 902 | batch_normalize=1 903 | filters=512 904 | size=1 905 | stride=1 906 | pad=1 907 | activation=leaky 908 | 909 | [route] 910 | layers = -1,-43 911 | 912 | [convolutional] 913 | batch_normalize=1 914 | size=3 915 | stride=1 916 | pad=1 917 | filters=1024 918 | activation=leaky 919 | 920 | 921 | [convolutional] 922 | size=1 923 | stride=1 924 | pad=1 925 | filters=340 926 | activation=linear 927 | 928 | 929 | [yolo] 930 | mask = 8,9,10,11 931 | anchors = 8,8, 10,13, 16,30, 33,23, 32,32, 30,61, 62,45, 59,119, 80,80, 116,90, 156,198, 373,326 932 | classes=80 933 | num=12 934 | jitter=.3 935 | ignore_thresh = .7 936 | truth_thresh = 1 937 | scale_x_y = 1.2 938 | random=0 939 | -------------------------------------------------------------------------------- /cfg/yolov3-spp.cfg: -------------------------------------------------------------------------------- 
1 | [net] 2 | # Testing 3 | # batch=1 4 | # subdivisions=1 5 | # Training 6 | batch=64 7 | subdivisions=16 8 | width=608 9 | height=608 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 | activation=leaky 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=32 46 | size=1 47 | stride=1 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=leaky 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [shortcut] 110 | from=-3 111 | activation=linear 112 | 113 | # Downsample 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=256 118 | size=3 119 | stride=2 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=128 126 | size=1 127 | stride=1 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 149 | activation=leaky 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | activation=leaky 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 188 | pad=1 189 | activation=leaky 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | 
[convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | stride=1 229 | pad=1 230 | activation=leaky 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=leaky 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | activation=leaky 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | stride=1 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [shortcut] 332 | from=-3 333 | activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=leaky 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=leaky 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | size=3 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=leaky 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [shortcut] 394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | activation=leaky 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [shortcut] 415 | from=-3 416 | activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | 
pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | ### SPP ### 576 | [maxpool] 577 | stride=1 578 | size=5 579 | 580 | [route] 581 | layers=-2 582 | 583 | [maxpool] 584 | stride=1 585 | size=9 586 | 587 | [route] 588 | layers=-4 589 | 590 | [maxpool] 591 | stride=1 592 | size=13 593 | 594 | [route] 595 | layers=-1,-3,-5,-6 596 | 597 | ### End SPP ### 598 | 599 | [convolutional] 600 | batch_normalize=1 601 | filters=512 602 | size=1 603 | stride=1 604 | pad=1 605 | activation=leaky 606 | 607 | 608 | [convolutional] 609 | batch_normalize=1 610 | size=3 611 | stride=1 612 | pad=1 613 | filters=1024 614 | activation=leaky 615 | 616 | [convolutional] 617 | batch_normalize=1 618 | filters=512 619 | size=1 620 | stride=1 621 | pad=1 622 | activation=leaky 623 | 624 | [convolutional] 625 | batch_normalize=1 626 | size=3 627 | stride=1 628 | pad=1 629 | filters=1024 630 | activation=leaky 631 | 632 | [convolutional] 633 | size=1 634 | stride=1 635 | pad=1 636 | filters=255 637 | activation=linear 638 | 639 | 640 | [yolo] 641 | mask = 6,7,8 642 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 643 | classes=80 644 | num=9 645 | jitter=.3 646 | ignore_thresh = .7 647 | truth_thresh = 1 648 | random=1 649 | 650 | 651 | [route] 652 | layers = -4 653 | 654 | [convolutional] 655 | batch_normalize=1 656 | filters=256 657 | size=1 658 | stride=1 659 | pad=1 660 | activation=leaky 661 | 662 | [upsample] 663 | stride=2 664 | 665 
| [route] 666 | layers = -1, 61 667 | 668 | 669 | 670 | [convolutional] 671 | batch_normalize=1 672 | filters=256 673 | size=1 674 | stride=1 675 | pad=1 676 | activation=leaky 677 | 678 | [convolutional] 679 | batch_normalize=1 680 | size=3 681 | stride=1 682 | pad=1 683 | filters=512 684 | activation=leaky 685 | 686 | [convolutional] 687 | batch_normalize=1 688 | filters=256 689 | size=1 690 | stride=1 691 | pad=1 692 | activation=leaky 693 | 694 | [convolutional] 695 | batch_normalize=1 696 | size=3 697 | stride=1 698 | pad=1 699 | filters=512 700 | activation=leaky 701 | 702 | [convolutional] 703 | batch_normalize=1 704 | filters=256 705 | size=1 706 | stride=1 707 | pad=1 708 | activation=leaky 709 | 710 | [convolutional] 711 | batch_normalize=1 712 | size=3 713 | stride=1 714 | pad=1 715 | filters=512 716 | activation=leaky 717 | 718 | [convolutional] 719 | size=1 720 | stride=1 721 | pad=1 722 | filters=255 723 | activation=linear 724 | 725 | 726 | [yolo] 727 | mask = 3,4,5 728 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 729 | classes=80 730 | num=9 731 | jitter=.3 732 | ignore_thresh = .7 733 | truth_thresh = 1 734 | random=1 735 | 736 | 737 | 738 | [route] 739 | layers = -4 740 | 741 | [convolutional] 742 | batch_normalize=1 743 | filters=128 744 | size=1 745 | stride=1 746 | pad=1 747 | activation=leaky 748 | 749 | [upsample] 750 | stride=2 751 | 752 | [route] 753 | layers = -1, 36 754 | 755 | 756 | 757 | [convolutional] 758 | batch_normalize=1 759 | filters=128 760 | size=1 761 | stride=1 762 | pad=1 763 | activation=leaky 764 | 765 | [convolutional] 766 | batch_normalize=1 767 | size=3 768 | stride=1 769 | pad=1 770 | filters=256 771 | activation=leaky 772 | 773 | [convolutional] 774 | batch_normalize=1 775 | filters=128 776 | size=1 777 | stride=1 778 | pad=1 779 | activation=leaky 780 | 781 | [convolutional] 782 | batch_normalize=1 783 | size=3 784 | stride=1 785 | pad=1 786 | filters=256 787 | activation=leaky 788 | 789 | [convolutional] 790 | batch_normalize=1 791 | filters=128 792 | size=1 793 | stride=1 794 | pad=1 795 | activation=leaky 796 | 797 | [convolutional] 798 | batch_normalize=1 799 | size=3 800 | stride=1 801 | pad=1 802 | filters=256 803 | activation=leaky 804 | 805 | [convolutional] 806 | size=1 807 | stride=1 808 | pad=1 809 | filters=255 810 | activation=linear 811 | 812 | 813 | [yolo] 814 | mask = 0,1,2 815 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 816 | classes=80 817 | num=9 818 | jitter=.3 819 | ignore_thresh = .7 820 | truth_thresh = 1 821 | random=1 822 | -------------------------------------------------------------------------------- /cfg/yolov3-spp3.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | batch=1 4 | subdivisions=1 5 | # Training 6 | # batch=64 7 | # subdivisions=16 8 | width=608 9 | height=608 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 120200 21 | policy=steps 22 | steps=70000,100000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 | activation=leaky 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=32 46 | size=1 47 | stride=1 48 | 
pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=leaky 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [shortcut] 110 | from=-3 111 | activation=linear 112 | 113 | # Downsample 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=256 118 | size=3 119 | stride=2 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=128 126 | size=1 127 | stride=1 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 149 | activation=leaky 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | activation=leaky 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 188 | pad=1 189 | activation=leaky 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | [convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | stride=1 229 | pad=1 230 | activation=leaky 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=leaky 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | stride=1 269 | pad=1 270 | 
activation=leaky 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | activation=leaky 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | stride=1 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [shortcut] 332 | from=-3 333 | activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=leaky 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=leaky 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | size=3 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=leaky 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [shortcut] 394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | activation=leaky 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [shortcut] 415 | from=-3 416 | activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 
491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | ### SPP ### 576 | [maxpool] 577 | stride=1 578 | size=5 579 | 580 | [route] 581 | layers=-2 582 | 583 | [maxpool] 584 | stride=1 585 | size=9 586 | 587 | [route] 588 | layers=-4 589 | 590 | [maxpool] 591 | stride=1 592 | size=13 593 | 594 | [route] 595 | layers=-1,-3,-5,-6 596 | 597 | ### End SPP ### 598 | 599 | [convolutional] 600 | batch_normalize=1 601 | filters=512 602 | size=1 603 | stride=1 604 | pad=1 605 | activation=leaky 606 | 607 | 608 | [convolutional] 609 | batch_normalize=1 610 | size=3 611 | stride=1 612 | pad=1 613 | filters=1024 614 | activation=leaky 615 | 616 | [convolutional] 617 | batch_normalize=1 618 | filters=512 619 | size=1 620 | stride=1 621 | pad=1 622 | activation=leaky 623 | 624 | [convolutional] 625 | batch_normalize=1 626 | size=3 627 | stride=1 628 | pad=1 629 | filters=1024 630 | activation=leaky 631 | 632 | [convolutional] 633 | size=1 634 | stride=1 635 | pad=1 636 | filters=255 637 | activation=linear 638 | 639 | 640 | [yolo] 641 | mask = 6,7,8 642 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 643 | classes=80 644 | num=9 645 | jitter=.3 646 | ignore_thresh = .7 647 | truth_thresh = 1 648 | random=1 649 | 650 | 651 | [route] 652 | layers = -4 653 | 654 | [convolutional] 655 | batch_normalize=1 656 | filters=256 657 | size=1 658 | stride=1 659 | pad=1 660 | activation=leaky 661 | 662 | [upsample] 663 | stride=2 664 | 665 | [route] 666 | layers = -1, 61 667 | 668 | 669 | 670 | [convolutional] 671 | batch_normalize=1 672 | filters=256 673 | size=1 674 | stride=1 675 | pad=1 676 | activation=leaky 677 | 678 | [convolutional] 679 | batch_normalize=1 680 | size=3 681 | stride=1 682 | pad=1 683 | filters=512 684 | activation=leaky 685 | 686 | ### SPP ### 687 | [maxpool] 688 | stride=1 689 | size=5 690 | 691 | [route] 692 | layers=-2 693 | 694 | [maxpool] 695 | stride=1 696 | size=9 697 | 698 | [route] 699 | layers=-4 700 | 701 | [maxpool] 702 | stride=1 703 | size=13 704 | 705 | [route] 706 | layers=-1,-3,-5,-6 707 | 708 | ### End SPP ### 709 | 710 | 711 | [convolutional] 712 | batch_normalize=1 713 | filters=256 714 | size=1 715 | stride=1 716 | pad=1 717 | activation=leaky 
718 | 719 | [convolutional] 720 | batch_normalize=1 721 | size=3 722 | stride=1 723 | pad=1 724 | filters=512 725 | activation=leaky 726 | 727 | [convolutional] 728 | batch_normalize=1 729 | filters=256 730 | size=1 731 | stride=1 732 | pad=1 733 | activation=leaky 734 | 735 | [convolutional] 736 | batch_normalize=1 737 | size=3 738 | stride=1 739 | pad=1 740 | filters=512 741 | activation=leaky 742 | 743 | [convolutional] 744 | size=1 745 | stride=1 746 | pad=1 747 | filters=255 748 | activation=linear 749 | 750 | 751 | [yolo] 752 | mask = 3,4,5 753 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 754 | classes=80 755 | num=9 756 | jitter=.3 757 | ignore_thresh = .7 758 | truth_thresh = 1 759 | random=1 760 | 761 | 762 | 763 | [route] 764 | layers = -4 765 | 766 | [convolutional] 767 | batch_normalize=1 768 | filters=128 769 | size=1 770 | stride=1 771 | pad=1 772 | activation=leaky 773 | 774 | [upsample] 775 | stride=2 776 | 777 | [route] 778 | layers = -1, 36 779 | 780 | 781 | 782 | [convolutional] 783 | batch_normalize=1 784 | filters=128 785 | size=1 786 | stride=1 787 | pad=1 788 | activation=leaky 789 | 790 | [convolutional] 791 | batch_normalize=1 792 | size=3 793 | stride=1 794 | pad=1 795 | filters=256 796 | activation=leaky 797 | 798 | [convolutional] 799 | batch_normalize=1 800 | filters=128 801 | size=1 802 | stride=1 803 | pad=1 804 | activation=leaky 805 | 806 | ### SPP ### 807 | [maxpool] 808 | stride=1 809 | size=5 810 | 811 | [route] 812 | layers=-2 813 | 814 | [maxpool] 815 | stride=1 816 | size=9 817 | 818 | [route] 819 | layers=-4 820 | 821 | [maxpool] 822 | stride=1 823 | size=13 824 | 825 | [route] 826 | layers=-1,-3,-5,-6 827 | 828 | ### End SPP ### 829 | 830 | [convolutional] 831 | batch_normalize=1 832 | size=3 833 | stride=1 834 | pad=1 835 | filters=256 836 | activation=leaky 837 | 838 | [convolutional] 839 | batch_normalize=1 840 | filters=128 841 | size=1 842 | stride=1 843 | pad=1 844 | activation=leaky 845 | 846 | [convolutional] 847 | batch_normalize=1 848 | size=3 849 | stride=1 850 | pad=1 851 | filters=256 852 | activation=leaky 853 | 854 | [convolutional] 855 | size=1 856 | stride=1 857 | pad=1 858 | filters=255 859 | activation=linear 860 | 861 | 862 | [yolo] 863 | mask = 0,1,2 864 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 865 | classes=80 866 | num=9 867 | jitter=.3 868 | ignore_thresh = .7 869 | truth_thresh = 1 870 | random=1 871 | -------------------------------------------------------------------------------- /cfg/yolov3-tiny-1cls.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | batch=1 4 | subdivisions=1 5 | # Training 6 | # batch=64 7 | # subdivisions=2 8 | width=416 9 | height=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=16 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | [maxpool] 34 | size=2 35 | stride=2 36 | 37 | [convolutional] 38 | batch_normalize=1 39 | filters=32 40 | size=3 41 | stride=1 42 | pad=1 43 | activation=leaky 44 | 45 | [maxpool] 46 | size=2 47 | stride=2 48 | 49 | [convolutional] 50 | batch_normalize=1 51 | filters=64 52 | size=3 53 | stride=1 54 | pad=1 55 | activation=leaky 56 | 57 | [maxpool] 58 | 
size=2 59 | stride=2 60 | 61 | [convolutional] 62 | batch_normalize=1 63 | filters=128 64 | size=3 65 | stride=1 66 | pad=1 67 | activation=leaky 68 | 69 | [maxpool] 70 | size=2 71 | stride=2 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=256 76 | size=3 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [maxpool] 82 | size=2 83 | stride=2 84 | 85 | [convolutional] 86 | batch_normalize=1 87 | filters=512 88 | size=3 89 | stride=1 90 | pad=1 91 | activation=leaky 92 | 93 | [maxpool] 94 | size=2 95 | stride=1 96 | 97 | [convolutional] 98 | batch_normalize=1 99 | filters=1024 100 | size=3 101 | stride=1 102 | pad=1 103 | activation=leaky 104 | 105 | ########### 106 | 107 | [convolutional] 108 | batch_normalize=1 109 | filters=256 110 | size=1 111 | stride=1 112 | pad=1 113 | activation=leaky 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=512 118 | size=3 119 | stride=1 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | size=1 125 | stride=1 126 | pad=1 127 | filters=18 128 | activation=linear 129 | 130 | 131 | 132 | [yolo] 133 | mask = 3,4,5 134 | anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 135 | classes=1 136 | num=6 137 | jitter=.3 138 | ignore_thresh = .7 139 | truth_thresh = 1 140 | random=1 141 | 142 | [route] 143 | layers = -4 144 | 145 | [convolutional] 146 | batch_normalize=1 147 | filters=128 148 | size=1 149 | stride=1 150 | pad=1 151 | activation=leaky 152 | 153 | [upsample] 154 | stride=2 155 | 156 | [route] 157 | layers = -1, 8 158 | 159 | [convolutional] 160 | batch_normalize=1 161 | filters=256 162 | size=3 163 | stride=1 164 | pad=1 165 | activation=leaky 166 | 167 | [convolutional] 168 | size=1 169 | stride=1 170 | pad=1 171 | filters=18 172 | activation=linear 173 | 174 | [yolo] 175 | mask = 0,1,2 176 | anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 177 | classes=1 178 | num=6 179 | jitter=.3 180 | ignore_thresh = .7 181 | truth_thresh = 1 182 | random=1 183 | -------------------------------------------------------------------------------- /cfg/yolov3-tiny-3cls.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | batch=1 4 | subdivisions=1 5 | # Training 6 | # batch=64 7 | # subdivisions=2 8 | width=416 9 | height=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=16 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | [maxpool] 34 | size=2 35 | stride=2 36 | 37 | [convolutional] 38 | batch_normalize=1 39 | filters=32 40 | size=3 41 | stride=1 42 | pad=1 43 | activation=leaky 44 | 45 | [maxpool] 46 | size=2 47 | stride=2 48 | 49 | [convolutional] 50 | batch_normalize=1 51 | filters=64 52 | size=3 53 | stride=1 54 | pad=1 55 | activation=leaky 56 | 57 | [maxpool] 58 | size=2 59 | stride=2 60 | 61 | [convolutional] 62 | batch_normalize=1 63 | filters=128 64 | size=3 65 | stride=1 66 | pad=1 67 | activation=leaky 68 | 69 | [maxpool] 70 | size=2 71 | stride=2 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=256 76 | size=3 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [maxpool] 82 | size=2 83 | stride=2 84 | 85 | [convolutional] 86 | batch_normalize=1 87 | filters=512 88 | size=3 89 | stride=1 90 | pad=1 91 | activation=leaky 92 | 93 | 
[maxpool] 94 | size=2 95 | stride=1 96 | 97 | [convolutional] 98 | batch_normalize=1 99 | filters=1024 100 | size=3 101 | stride=1 102 | pad=1 103 | activation=leaky 104 | 105 | ########### 106 | 107 | [convolutional] 108 | batch_normalize=1 109 | filters=256 110 | size=1 111 | stride=1 112 | pad=1 113 | activation=leaky 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=512 118 | size=3 119 | stride=1 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | size=1 125 | stride=1 126 | pad=1 127 | filters=24 128 | activation=linear 129 | 130 | 131 | 132 | [yolo] 133 | mask = 3,4,5 134 | anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 135 | classes=3 136 | num=6 137 | jitter=.3 138 | ignore_thresh = .7 139 | truth_thresh = 1 140 | random=1 141 | 142 | [route] 143 | layers = -4 144 | 145 | [convolutional] 146 | batch_normalize=1 147 | filters=128 148 | size=1 149 | stride=1 150 | pad=1 151 | activation=leaky 152 | 153 | [upsample] 154 | stride=2 155 | 156 | [route] 157 | layers = -1, 8 158 | 159 | [convolutional] 160 | batch_normalize=1 161 | filters=256 162 | size=3 163 | stride=1 164 | pad=1 165 | activation=leaky 166 | 167 | [convolutional] 168 | size=1 169 | stride=1 170 | pad=1 171 | filters=24 172 | activation=linear 173 | 174 | [yolo] 175 | mask = 0,1,2 176 | anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 177 | classes=3 178 | num=6 179 | jitter=.3 180 | ignore_thresh = .7 181 | truth_thresh = 1 182 | random=1 183 | -------------------------------------------------------------------------------- /cfg/yolov3-tiny-mask.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | batch=1 4 | subdivisions=1 5 | # Training 6 | # batch=64 7 | # subdivisions=2 8 | width=608 9 | height=608 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=16 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | [maxpool] 34 | size=2 35 | stride=2 36 | 37 | [convolutional] 38 | batch_normalize=1 39 | filters=32 40 | size=3 41 | stride=1 42 | pad=1 43 | activation=leaky 44 | 45 | [maxpool] 46 | size=2 47 | stride=2 48 | 49 | [convolutional] 50 | batch_normalize=1 51 | filters=64 52 | size=3 53 | stride=1 54 | pad=1 55 | activation=leaky 56 | 57 | [maxpool] 58 | size=2 59 | stride=2 60 | 61 | [convolutional] 62 | batch_normalize=1 63 | filters=128 64 | size=3 65 | stride=1 66 | pad=1 67 | activation=leaky 68 | 69 | [maxpool] 70 | size=2 71 | stride=2 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=256 76 | size=3 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [maxpool] 82 | size=2 83 | stride=2 84 | 85 | [convolutional] 86 | batch_normalize=1 87 | filters=512 88 | size=3 89 | stride=1 90 | pad=1 91 | activation=leaky 92 | 93 | [maxpool] 94 | size=2 95 | stride=1 96 | 97 | [convolutional] 98 | batch_normalize=1 99 | filters=1024 100 | size=3 101 | stride=1 102 | pad=1 103 | activation=leaky 104 | 105 | ########### 106 | 107 | [convolutional] 108 | batch_normalize=1 109 | filters=256 110 | size=1 111 | stride=1 112 | pad=1 113 | activation=leaky 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=512 118 | size=3 119 | stride=1 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | size=1 125 | 
stride=1 126 | pad=1 127 | filters=21 128 | activation=linear 129 | 130 | 131 | 132 | [yolo] 133 | mask = 3,4,5 134 | anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 135 | classes=2 136 | num=6 137 | jitter=.3 138 | ignore_thresh = .7 139 | truth_thresh = 1 140 | random=1 141 | 142 | [route] 143 | layers = -4 144 | 145 | [convolutional] 146 | batch_normalize=1 147 | filters=128 148 | size=1 149 | stride=1 150 | pad=1 151 | activation=leaky 152 | 153 | [upsample] 154 | stride=2 155 | 156 | [route] 157 | layers = -1, 8 158 | 159 | [convolutional] 160 | batch_normalize=1 161 | filters=256 162 | size=3 163 | stride=1 164 | pad=1 165 | activation=leaky 166 | 167 | [convolutional] 168 | size=1 169 | stride=1 170 | pad=1 171 | filters=21 172 | activation=linear 173 | 174 | [yolo] 175 | mask = 1,2,3 176 | anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 177 | classes=2 178 | num=6 179 | jitter=.3 180 | ignore_thresh = .7 181 | truth_thresh = 1 182 | random=1 183 | -------------------------------------------------------------------------------- /cfg/yolov3-tiny.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | batch=1 4 | subdivisions=1 5 | # Training 6 | # batch=64 7 | # subdivisions=2 8 | width=416 9 | height=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=16 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | [maxpool] 34 | size=2 35 | stride=2 36 | 37 | [convolutional] 38 | batch_normalize=1 39 | filters=32 40 | size=3 41 | stride=1 42 | pad=1 43 | activation=leaky 44 | 45 | [maxpool] 46 | size=2 47 | stride=2 48 | 49 | [convolutional] 50 | batch_normalize=1 51 | filters=64 52 | size=3 53 | stride=1 54 | pad=1 55 | activation=leaky 56 | 57 | [maxpool] 58 | size=2 59 | stride=2 60 | 61 | [convolutional] 62 | batch_normalize=1 63 | filters=128 64 | size=3 65 | stride=1 66 | pad=1 67 | activation=leaky 68 | 69 | [maxpool] 70 | size=2 71 | stride=2 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=256 76 | size=3 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [maxpool] 82 | size=2 83 | stride=2 84 | 85 | [convolutional] 86 | batch_normalize=1 87 | filters=512 88 | size=3 89 | stride=1 90 | pad=1 91 | activation=leaky 92 | 93 | [maxpool] 94 | size=2 95 | stride=1 96 | 97 | [convolutional] 98 | batch_normalize=1 99 | filters=1024 100 | size=3 101 | stride=1 102 | pad=1 103 | activation=leaky 104 | 105 | ########### 106 | 107 | [convolutional] 108 | batch_normalize=1 109 | filters=256 110 | size=1 111 | stride=1 112 | pad=1 113 | activation=leaky 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=512 118 | size=3 119 | stride=1 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | size=1 125 | stride=1 126 | pad=1 127 | filters=255 128 | activation=linear 129 | 130 | 131 | 132 | [yolo] 133 | mask = 3,4,5 134 | anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 135 | classes=80 136 | num=6 137 | jitter=.3 138 | ignore_thresh = .7 139 | truth_thresh = 1 140 | random=1 141 | 142 | [route] 143 | layers = -4 144 | 145 | [convolutional] 146 | batch_normalize=1 147 | filters=128 148 | size=1 149 | stride=1 150 | pad=1 151 | activation=leaky 152 | 153 | [upsample] 154 | stride=2 155 | 156 | 
[route] 157 | layers = -1, 8 158 | 159 | [convolutional] 160 | batch_normalize=1 161 | filters=256 162 | size=3 163 | stride=1 164 | pad=1 165 | activation=leaky 166 | 167 | [convolutional] 168 | size=1 169 | stride=1 170 | pad=1 171 | filters=255 172 | activation=linear 173 | 174 | [yolo] 175 | mask = 1,2,3 176 | anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 177 | classes=80 178 | num=6 179 | jitter=.3 180 | ignore_thresh = .7 181 | truth_thresh = 1 182 | random=1 183 | -------------------------------------------------------------------------------- /cfg/yolov3-tiny3-1cls.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | # batch=1 4 | # subdivisions=1 5 | # Training 6 | batch=64 7 | subdivisions=16 8 | width=608 9 | height=608 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 200000 21 | policy=steps 22 | steps=180000,190000 23 | scales=.1,.1 24 | 25 | 26 | [convolutional] 27 | batch_normalize=1 28 | filters=16 29 | size=3 30 | stride=1 31 | pad=1 32 | activation=leaky 33 | 34 | [maxpool] 35 | size=2 36 | stride=2 37 | 38 | [convolutional] 39 | batch_normalize=1 40 | filters=32 41 | size=3 42 | stride=1 43 | pad=1 44 | activation=leaky 45 | 46 | [maxpool] 47 | size=2 48 | stride=2 49 | 50 | [convolutional] 51 | batch_normalize=1 52 | filters=64 53 | size=3 54 | stride=1 55 | pad=1 56 | activation=leaky 57 | 58 | [maxpool] 59 | size=2 60 | stride=2 61 | 62 | [convolutional] 63 | batch_normalize=1 64 | filters=128 65 | size=3 66 | stride=1 67 | pad=1 68 | activation=leaky 69 | 70 | [maxpool] 71 | size=2 72 | stride=2 73 | 74 | [convolutional] 75 | batch_normalize=1 76 | filters=256 77 | size=3 78 | stride=1 79 | pad=1 80 | activation=leaky 81 | 82 | [maxpool] 83 | size=2 84 | stride=2 85 | 86 | [convolutional] 87 | batch_normalize=1 88 | filters=512 89 | size=3 90 | stride=1 91 | pad=1 92 | activation=leaky 93 | 94 | [maxpool] 95 | size=2 96 | stride=1 97 | 98 | [convolutional] 99 | batch_normalize=1 100 | filters=1024 101 | size=3 102 | stride=1 103 | pad=1 104 | activation=leaky 105 | 106 | ########### 107 | 108 | [convolutional] 109 | batch_normalize=1 110 | filters=256 111 | size=1 112 | stride=1 113 | pad=1 114 | activation=leaky 115 | 116 | [convolutional] 117 | batch_normalize=1 118 | filters=512 119 | size=3 120 | stride=1 121 | pad=1 122 | activation=leaky 123 | 124 | [convolutional] 125 | size=1 126 | stride=1 127 | pad=1 128 | filters=18 129 | activation=linear 130 | 131 | 132 | 133 | [yolo] 134 | mask = 6,7,8 135 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 136 | classes=1 137 | num=9 138 | jitter=.3 139 | ignore_thresh = .7 140 | truth_thresh = 1 141 | random=1 142 | 143 | [route] 144 | layers = -4 145 | 146 | [convolutional] 147 | batch_normalize=1 148 | filters=128 149 | size=1 150 | stride=1 151 | pad=1 152 | activation=leaky 153 | 154 | [upsample] 155 | stride=2 156 | 157 | [route] 158 | layers = -1, 8 159 | 160 | [convolutional] 161 | batch_normalize=1 162 | filters=256 163 | size=3 164 | stride=1 165 | pad=1 166 | activation=leaky 167 | 168 | [convolutional] 169 | size=1 170 | stride=1 171 | pad=1 172 | filters=18 173 | activation=linear 174 | 175 | [yolo] 176 | mask = 3,4,5 177 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 178 | classes=1 179 | num=9 180 | jitter=.3 181 | ignore_thresh = .7 182 | 
truth_thresh = 1 183 | random=1 184 | 185 | 186 | 187 | [route] 188 | layers = -3 189 | 190 | [convolutional] 191 | batch_normalize=1 192 | filters=128 193 | size=1 194 | stride=1 195 | pad=1 196 | activation=leaky 197 | 198 | [upsample] 199 | stride=2 200 | 201 | [route] 202 | layers = -1, 6 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=3 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | size=1 214 | stride=1 215 | pad=1 216 | filters=18 217 | activation=linear 218 | 219 | [yolo] 220 | mask = 0,1,2 221 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 222 | classes=1 223 | num=9 224 | jitter=.3 225 | ignore_thresh = .7 226 | truth_thresh = 1 227 | random=1 228 | -------------------------------------------------------------------------------- /cfg/yolov3-tiny3.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | # batch=1 4 | # subdivisions=1 5 | # Training 6 | batch=64 7 | subdivisions=16 8 | width=608 9 | height=608 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 200000 21 | policy=steps 22 | steps=180000,190000 23 | scales=.1,.1 24 | 25 | 26 | [convolutional] 27 | batch_normalize=1 28 | filters=16 29 | size=3 30 | stride=1 31 | pad=1 32 | activation=leaky 33 | 34 | [maxpool] 35 | size=2 36 | stride=2 37 | 38 | [convolutional] 39 | batch_normalize=1 40 | filters=32 41 | size=3 42 | stride=1 43 | pad=1 44 | activation=leaky 45 | 46 | [maxpool] 47 | size=2 48 | stride=2 49 | 50 | [convolutional] 51 | batch_normalize=1 52 | filters=64 53 | size=3 54 | stride=1 55 | pad=1 56 | activation=leaky 57 | 58 | [maxpool] 59 | size=2 60 | stride=2 61 | 62 | [convolutional] 63 | batch_normalize=1 64 | filters=128 65 | size=3 66 | stride=1 67 | pad=1 68 | activation=leaky 69 | 70 | [maxpool] 71 | size=2 72 | stride=2 73 | 74 | [convolutional] 75 | batch_normalize=1 76 | filters=256 77 | size=3 78 | stride=1 79 | pad=1 80 | activation=leaky 81 | 82 | [maxpool] 83 | size=2 84 | stride=2 85 | 86 | [convolutional] 87 | batch_normalize=1 88 | filters=512 89 | size=3 90 | stride=1 91 | pad=1 92 | activation=leaky 93 | 94 | [maxpool] 95 | size=2 96 | stride=1 97 | 98 | [convolutional] 99 | batch_normalize=1 100 | filters=1024 101 | size=3 102 | stride=1 103 | pad=1 104 | activation=leaky 105 | 106 | ########### 107 | 108 | [convolutional] 109 | batch_normalize=1 110 | filters=256 111 | size=1 112 | stride=1 113 | pad=1 114 | activation=leaky 115 | 116 | [convolutional] 117 | batch_normalize=1 118 | filters=512 119 | size=3 120 | stride=1 121 | pad=1 122 | activation=leaky 123 | 124 | [convolutional] 125 | size=1 126 | stride=1 127 | pad=1 128 | filters=255 129 | activation=linear 130 | 131 | 132 | 133 | [yolo] 134 | mask = 6,7,8 135 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 136 | classes=80 137 | num=9 138 | jitter=.3 139 | ignore_thresh = .7 140 | truth_thresh = 1 141 | random=1 142 | 143 | [route] 144 | layers = -4 145 | 146 | [convolutional] 147 | batch_normalize=1 148 | filters=128 149 | size=1 150 | stride=1 151 | pad=1 152 | activation=leaky 153 | 154 | [upsample] 155 | stride=2 156 | 157 | [route] 158 | layers = -1, 8 159 | 160 | [convolutional] 161 | batch_normalize=1 162 | filters=256 163 | size=3 164 | stride=1 165 | pad=1 166 | activation=leaky 167 | 168 | 
[convolutional] 169 | size=1 170 | stride=1 171 | pad=1 172 | filters=255 173 | activation=linear 174 | 175 | [yolo] 176 | mask = 3,4,5 177 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 178 | classes=80 179 | num=9 180 | jitter=.3 181 | ignore_thresh = .7 182 | truth_thresh = 1 183 | random=1 184 | 185 | 186 | 187 | [route] 188 | layers = -3 189 | 190 | [convolutional] 191 | batch_normalize=1 192 | filters=128 193 | size=1 194 | stride=1 195 | pad=1 196 | activation=leaky 197 | 198 | [upsample] 199 | stride=2 200 | 201 | [route] 202 | layers = -1, 6 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=3 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | size=1 214 | stride=1 215 | pad=1 216 | filters=255 217 | activation=linear 218 | 219 | [yolo] 220 | mask = 0,1,2 221 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 222 | classes=80 223 | num=9 224 | jitter=.3 225 | ignore_thresh = .7 226 | truth_thresh = 1 227 | random=1 228 | -------------------------------------------------------------------------------- /cfg/yolov3.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | #batch=1 4 | #subdivisions=1 5 | # Training 6 | batch=16 7 | subdivisions=1 8 | width=416 9 | height=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 | activation=leaky 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=32 46 | size=1 47 | stride=1 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=leaky 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [shortcut] 110 | from=-3 111 | activation=linear 112 | 113 | # Downsample 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=256 118 | size=3 119 | stride=2 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=128 126 | size=1 127 | stride=1 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 
| stride=1 148 | pad=1 149 | activation=leaky 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | activation=leaky 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 188 | pad=1 189 | activation=leaky 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | [convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | stride=1 229 | pad=1 230 | activation=leaky 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=leaky 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | activation=leaky 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | stride=1 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [shortcut] 332 | from=-3 333 | activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=leaky 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=leaky 364 | 365 | [convolutional] 366 | batch_normalize=1 367 
| filters=512 368 | size=3 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=leaky 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [shortcut] 394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | activation=leaky 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [shortcut] 415 | from=-3 416 | activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | [convolutional] 576 | batch_normalize=1 577 | size=3 578 | stride=1 579 | pad=1 580 | filters=1024 581 | activation=leaky 582 | 583 | [convolutional] 584 | batch_normalize=1 585 
| filters=512 586 | size=1 587 | stride=1 588 | pad=1 589 | activation=leaky 590 | 591 | [convolutional] 592 | batch_normalize=1 593 | size=3 594 | stride=1 595 | pad=1 596 | filters=1024 597 | activation=leaky 598 | 599 | [convolutional] 600 | size=1 601 | stride=1 602 | pad=1 603 | filters=255 604 | activation=linear 605 | 606 | 607 | [yolo] 608 | mask = 6,7,8 609 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 610 | classes=80 611 | num=9 612 | jitter=.3 613 | ignore_thresh = .7 614 | truth_thresh = 1 615 | random=1 616 | 617 | 618 | [route] 619 | layers = -4 620 | 621 | [convolutional] 622 | batch_normalize=1 623 | filters=256 624 | size=1 625 | stride=1 626 | pad=1 627 | activation=leaky 628 | 629 | [upsample] 630 | stride=2 631 | 632 | [route] 633 | layers = -1, 61 634 | 635 | 636 | 637 | [convolutional] 638 | batch_normalize=1 639 | filters=256 640 | size=1 641 | stride=1 642 | pad=1 643 | activation=leaky 644 | 645 | [convolutional] 646 | batch_normalize=1 647 | size=3 648 | stride=1 649 | pad=1 650 | filters=512 651 | activation=leaky 652 | 653 | [convolutional] 654 | batch_normalize=1 655 | filters=256 656 | size=1 657 | stride=1 658 | pad=1 659 | activation=leaky 660 | 661 | [convolutional] 662 | batch_normalize=1 663 | size=3 664 | stride=1 665 | pad=1 666 | filters=512 667 | activation=leaky 668 | 669 | [convolutional] 670 | batch_normalize=1 671 | filters=256 672 | size=1 673 | stride=1 674 | pad=1 675 | activation=leaky 676 | 677 | [convolutional] 678 | batch_normalize=1 679 | size=3 680 | stride=1 681 | pad=1 682 | filters=512 683 | activation=leaky 684 | 685 | [convolutional] 686 | size=1 687 | stride=1 688 | pad=1 689 | filters=255 690 | activation=linear 691 | 692 | 693 | [yolo] 694 | mask = 3,4,5 695 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 696 | classes=80 697 | num=9 698 | jitter=.3 699 | ignore_thresh = .7 700 | truth_thresh = 1 701 | random=1 702 | 703 | 704 | 705 | [route] 706 | layers = -4 707 | 708 | [convolutional] 709 | batch_normalize=1 710 | filters=128 711 | size=1 712 | stride=1 713 | pad=1 714 | activation=leaky 715 | 716 | [upsample] 717 | stride=2 718 | 719 | [route] 720 | layers = -1, 36 721 | 722 | 723 | 724 | [convolutional] 725 | batch_normalize=1 726 | filters=128 727 | size=1 728 | stride=1 729 | pad=1 730 | activation=leaky 731 | 732 | [convolutional] 733 | batch_normalize=1 734 | size=3 735 | stride=1 736 | pad=1 737 | filters=256 738 | activation=leaky 739 | 740 | [convolutional] 741 | batch_normalize=1 742 | filters=128 743 | size=1 744 | stride=1 745 | pad=1 746 | activation=leaky 747 | 748 | [convolutional] 749 | batch_normalize=1 750 | size=3 751 | stride=1 752 | pad=1 753 | filters=256 754 | activation=leaky 755 | 756 | [convolutional] 757 | batch_normalize=1 758 | filters=128 759 | size=1 760 | stride=1 761 | pad=1 762 | activation=leaky 763 | 764 | [convolutional] 765 | batch_normalize=1 766 | size=3 767 | stride=1 768 | pad=1 769 | filters=256 770 | activation=leaky 771 | 772 | [convolutional] 773 | size=1 774 | stride=1 775 | pad=1 776 | filters=255 777 | activation=linear 778 | 779 | 780 | [yolo] 781 | mask = 0,1,2 782 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 783 | classes=80 784 | num=9 785 | jitter=.3 786 | ignore_thresh = .7 787 | truth_thresh = 1 788 | random=1 789 | -------------------------------------------------------------------------------- /cfg/yolov3s.cfg: 
-------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | # batch=1 4 | # subdivisions=1 5 | # Training 6 | batch=64 7 | subdivisions=16 8 | width=608 9 | height=608 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=swish 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 | activation=swish 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=32 46 | size=1 47 | stride=1 48 | pad=1 49 | activation=swish 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=swish 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=swish 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=swish 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=swish 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=swish 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=swish 108 | 109 | [shortcut] 110 | from=-3 111 | activation=linear 112 | 113 | # Downsample 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=256 118 | size=3 119 | stride=2 120 | pad=1 121 | activation=swish 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=128 126 | size=1 127 | stride=1 128 | pad=1 129 | activation=swish 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=swish 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 149 | activation=swish 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=swish 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | activation=swish 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=swish 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 188 | pad=1 189 | activation=swish 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=swish 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=swish 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | 
activation=swish 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | [convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | stride=1 229 | pad=1 230 | activation=swish 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=swish 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=swish 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=swish 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=swish 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=swish 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | activation=swish 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | stride=1 299 | pad=1 300 | activation=swish 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=swish 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=swish 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=swish 330 | 331 | [shortcut] 332 | from=-3 333 | activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=swish 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=swish 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=swish 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | size=3 369 | stride=1 370 | pad=1 371 | activation=swish 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=swish 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=swish 392 | 393 | [shortcut] 394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | activation=swish 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | activation=swish 413 | 414 | [shortcut] 415 | from=-3 416 | activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=swish 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=swish 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | 
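# Note: yolov3s.cfg appears to mirror the yolov3-spp layout (same residual backbone plus
# an SPP block before the first [yolo] head) but with activation=swish, i.e. x * sigmoid(x),
# in place of leaky ReLU. Caution: create_modules() in models.py, as written, only attaches
# an activation module when activation=leaky, so these 'swish' entries would be silently
# skipped unless that function is extended.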
[convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | pad=1 445 | activation=swish 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=swish 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=swish 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=swish 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=swish 484 | 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=swish 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=swish 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=swish 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=swish 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=swish 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=swish 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=swish 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | activation=swish 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=swish 574 | 575 | ### SPP ### 576 | [maxpool] 577 | stride=1 578 | size=5 579 | 580 | [route] 581 | layers=-2 582 | 583 | [maxpool] 584 | stride=1 585 | size=9 586 | 587 | [route] 588 | layers=-4 589 | 590 | [maxpool] 591 | stride=1 592 | size=13 593 | 594 | [route] 595 | layers=-1,-3,-5,-6 596 | 597 | ### End SPP ### 598 | 599 | [convolutional] 600 | batch_normalize=1 601 | filters=512 602 | size=1 603 | stride=1 604 | pad=1 605 | activation=swish 606 | 607 | 608 | [convolutional] 609 | batch_normalize=1 610 | size=3 611 | stride=1 612 | pad=1 613 | filters=1024 614 | activation=swish 615 | 616 | [convolutional] 617 | batch_normalize=1 618 | filters=512 619 | size=1 620 | stride=1 621 | pad=1 622 | activation=swish 623 | 624 | [convolutional] 625 | batch_normalize=1 626 | size=3 627 | stride=1 628 | pad=1 629 | filters=1024 630 | activation=swish 631 | 632 | [convolutional] 633 | size=1 634 | stride=1 635 | pad=1 636 | filters=255 637 | activation=linear 638 | 639 | 640 | [yolo] 641 | mask = 6,7,8 642 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 643 | classes=80 644 | num=9 645 | jitter=.3 646 | ignore_thresh = .7 647 | truth_thresh = 1 648 | random=1 649 | 650 | 651 | [route] 652 | layers = -4 653 | 654 | [convolutional] 655 | batch_normalize=1 656 | filters=256 657 | size=1 658 | 
stride=1 659 | pad=1 660 | activation=swish 661 | 662 | [upsample] 663 | stride=2 664 | 665 | [route] 666 | layers = -1, 61 667 | 668 | 669 | 670 | [convolutional] 671 | batch_normalize=1 672 | filters=256 673 | size=1 674 | stride=1 675 | pad=1 676 | activation=swish 677 | 678 | [convolutional] 679 | batch_normalize=1 680 | size=3 681 | stride=1 682 | pad=1 683 | filters=512 684 | activation=swish 685 | 686 | [convolutional] 687 | batch_normalize=1 688 | filters=256 689 | size=1 690 | stride=1 691 | pad=1 692 | activation=swish 693 | 694 | [convolutional] 695 | batch_normalize=1 696 | size=3 697 | stride=1 698 | pad=1 699 | filters=512 700 | activation=swish 701 | 702 | [convolutional] 703 | batch_normalize=1 704 | filters=256 705 | size=1 706 | stride=1 707 | pad=1 708 | activation=swish 709 | 710 | [convolutional] 711 | batch_normalize=1 712 | size=3 713 | stride=1 714 | pad=1 715 | filters=512 716 | activation=swish 717 | 718 | [convolutional] 719 | size=1 720 | stride=1 721 | pad=1 722 | filters=255 723 | activation=linear 724 | 725 | 726 | [yolo] 727 | mask = 3,4,5 728 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 729 | classes=80 730 | num=9 731 | jitter=.3 732 | ignore_thresh = .7 733 | truth_thresh = 1 734 | random=1 735 | 736 | 737 | 738 | [route] 739 | layers = -4 740 | 741 | [convolutional] 742 | batch_normalize=1 743 | filters=128 744 | size=1 745 | stride=1 746 | pad=1 747 | activation=swish 748 | 749 | [upsample] 750 | stride=2 751 | 752 | [route] 753 | layers = -1, 36 754 | 755 | 756 | 757 | [convolutional] 758 | batch_normalize=1 759 | filters=128 760 | size=1 761 | stride=1 762 | pad=1 763 | activation=swish 764 | 765 | [convolutional] 766 | batch_normalize=1 767 | size=3 768 | stride=1 769 | pad=1 770 | filters=256 771 | activation=swish 772 | 773 | [convolutional] 774 | batch_normalize=1 775 | filters=128 776 | size=1 777 | stride=1 778 | pad=1 779 | activation=swish 780 | 781 | [convolutional] 782 | batch_normalize=1 783 | size=3 784 | stride=1 785 | pad=1 786 | filters=256 787 | activation=swish 788 | 789 | [convolutional] 790 | batch_normalize=1 791 | filters=128 792 | size=1 793 | stride=1 794 | pad=1 795 | activation=swish 796 | 797 | [convolutional] 798 | batch_normalize=1 799 | size=3 800 | stride=1 801 | pad=1 802 | filters=256 803 | activation=swish 804 | 805 | [convolutional] 806 | size=1 807 | stride=1 808 | pad=1 809 | filters=255 810 | activation=linear 811 | 812 | 813 | [yolo] 814 | mask = 0,1,2 815 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 816 | classes=80 817 | num=9 818 | jitter=.3 819 | ignore_thresh = .7 820 | truth_thresh = 1 821 | random=1 822 | -------------------------------------------------------------------------------- /detect.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import time 3 | import threading 4 | from sys import platform 5 | from models import * 6 | from project.datasets import * 7 | from project.utils import * 8 | from pydub import AudioSegment 9 | 10 | from pydub.playback import play 11 | 12 | def show(weights, im0): 13 | cv2.imshow(weights, im0) 14 | def playsound(): 15 | song = AudioSegment.from_wav('mask_on.wav') 16 | play(song) 17 | def detect( 18 | cfg, 19 | data_cfg, 20 | weights, 21 | images='data/samples', # input folder 22 | output='output', # output folder 23 | fourcc='mp4v', 24 | img_size=416, 25 | conf_thres=0.5, 26 | nms_thres=0.5, 27 | save_txt=False, 28 | save_images=True, 29 | 
webcam=True 30 | ): 31 | device = torch_utils.select_device() 32 | if os.path.exists(output): 33 | shutil.rmtree(output) # delete output folder 34 | os.makedirs(output) # make new output folder 35 | 36 | # Initialize model 37 | if ONNX_EXPORT: 38 | s = (320, 192) # onnx model image size (height, width) 39 | model = Darknet(cfg, s) 40 | else: 41 | model = Darknet(cfg, img_size) 42 | 43 | # Load weights 44 | if weights.endswith('.pt'): # pytorch format 45 | model.load_state_dict(torch.load(weights, map_location=device)['model']) 46 | else: # darknet format 47 | _ = load_darknet_weights(model, weights) 48 | 49 | # Fuse Conv2d + BatchNorm2d layers 50 | model.fuse() 51 | 52 | # Eval mode 53 | model.to(device).eval() 54 | 55 | if ONNX_EXPORT: 56 | img = torch.zeros((1, 3, s[0], s[1])) 57 | torch.onnx.export(model, img, 'weights/export.onnx', verbose=True) 58 | return 59 | 60 | # Set Dataloader 61 | vid_path, vid_writer = None, None 62 | if webcam: 63 | save_images = False 64 | dataloader = LoadWebcam(img_size=img_size) 65 | else: 66 | dataloader = LoadImages(images, img_size=img_size) 67 | 68 | # Get classes and colors 69 | classes = load_classes(parse_data_cfg(data_cfg)['names']) 70 | colors = [[random.randint(0, 255) for _ in range(3)] for _ in range(len(classes))] 71 | res = "" 72 | times = 0 73 | for i, (path, img, im0, vid_cap) in enumerate(dataloader): 74 | t = time.time() 75 | save_path = str(Path(output) / Path(path).name) 76 | 77 | # Get detections 78 | img = torch.from_numpy(img).unsqueeze(0).to(device) 79 | pred, _ = model(img) 80 | det = non_max_suppression(pred, conf_thres, nms_thres)[0] 81 | if det is not None and len(det) > 0: 82 | # Rescale boxes from 416 to true image size 83 | det[:, :4] = scale_coords(img.shape[2:], det[:, :4], im0.shape).round() 84 | 85 | # Print results to screen 86 | print('%gx%g ' % img.shape[2:], end='') # print image size 87 | for c in det[:, -1].unique(): 88 | n = (det[:, -1] == c).sum() 89 | print('%g %ss' % (n, classes[int(c)]), end=', ') 90 | 91 | # Draw bounding boxes and labels of detections 92 | for *xyxy, conf, cls_conf, cls in det: 93 | if save_txt: # Write to file 94 | with open(save_path + '.txt', 'a') as file: 95 | file.write(('%g ' * 6 + '\n') % (*xyxy, cls, conf)) 96 | 97 | # Add bbox to the image 98 | label = '%s %.2f' % (classes[int(cls)], conf) 99 | plot_one_box(xyxy, im0, label=label, color=colors[int(cls)]) 100 | res = classes[int(cls)] 101 | 102 | print('Done. 
(%.3fs)' % (time.time() - t)) 103 | if webcam: 104 | show(weights, im0) 105 | if res == "no_mask" and (times%50) == 0: 106 | threading.Thread(target=playsound).start() 107 | if save_images: # Save image with detections 108 | if dataloader.mode == 'images': 109 | cv2.imwrite(save_path, im0) 110 | else: 111 | if vid_path != save_path: # new video 112 | vid_path = save_path 113 | if isinstance(vid_writer, cv2.VideoWriter): 114 | vid_writer.release() # release previous video writer 115 | 116 | fps = vid_cap.get(cv2.CAP_PROP_FPS) 117 | width = int(vid_cap.get(cv2.CAP_PROP_FRAME_WIDTH)) 118 | height = int(vid_cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) 119 | vid_writer = cv2.VideoWriter(save_path, cv2.VideoWriter_fourcc(*fourcc), fps, (width, height)) 120 | vid_writer.write(im0) 121 | times += 1 122 | 123 | if save_images: 124 | print('Results saved to %s' % os.getcwd() + os.sep + output) 125 | if platform == 'darwin': # macos 126 | os.system('open ' + output + ' ' + save_path) 127 | 128 | 129 | if __name__ == '__main__': 130 | parser = argparse.ArgumentParser() 131 | parser.add_argument('--cfg', type=str, default='cfg/yolov3-spp.cfg', help='cfg file path') 132 | parser.add_argument('--data-cfg', type=str, default='data/coco.data', help='coco.data file path') 133 | parser.add_argument('--weights', type=str, default='weights/yolov3-spp.weights', help='path to weights file') 134 | parser.add_argument('--images', type=str, default='data/samples', help='path to images') 135 | parser.add_argument('--img-size', type=int, default=416, help='inference size (pixels)') 136 | parser.add_argument('--conf-thres', type=float, default=0.5, help='object confidence threshold') 137 | parser.add_argument('--nms-thres', type=float, default=0.5, help='iou threshold for non-maximum suppression') 138 | parser.add_argument('--fourcc', type=str, default='mp4v', help='specifies the fourcc code for output video encoding (make sure ffmpeg supports specified fourcc codec)') 139 | parser.add_argument('--output', type=str, default='output',help='specifies the output path for images and videos') 140 | opt = parser.parse_args() 141 | print(opt) 142 | 143 | with torch.no_grad(): 144 | detect( 145 | opt.cfg, 146 | opt.data_cfg, 147 | opt.weights, 148 | images=opt.images, 149 | img_size=opt.img_size, 150 | conf_thres=opt.conf_thres, 151 | nms_thres=opt.nms_thres, 152 | fourcc=opt.fourcc, 153 | output=opt.output 154 | ) 155 | -------------------------------------------------------------------------------- /makeMain.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 7, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os \n", 10 | "import random \n", 11 | " \n", 12 | "trainval_percent = 0.8\n", 13 | "train_percent = 0.8 \n", 14 | "xmlfilepath = 'Annotations' \n", 15 | "txtsavepath = 'ImageSets\\Main' \n", 16 | "total_xml = os.listdir(xmlfilepath) \n", 17 | " \n", 18 | "num=len(total_xml) \n", 19 | "list=range(num) \n", 20 | "tv=int(num*trainval_percent) \n", 21 | "tr=int(tv*train_percent) \n", 22 | "trainval= random.sample(list,tv) \n", 23 | "train=random.sample(trainval,tr) \n", 24 | " \n", 25 | "ftrainval = open('ImageSets/Main/trainval.txt', 'w') \n", 26 | "ftest = open('ImageSets/Main/test.txt', 'w') \n", 27 | "ftrain = open('ImageSets/Main/train.txt', 'w') \n", 28 | "fval = open('ImageSets/Main/val.txt', 'w') \n", 29 | " \n", 30 | "for i in list: \n", 31 | " name=total_xml[i][:-4]+'\\n' \n", 32 | " if i in 
trainval: \n", 33 | " ftrainval.write(name) \n", 34 | " if i in train: \n", 35 | " ftrain.write(name) \n", 36 | " else: \n", 37 | " fval.write(name) \n", 38 | " else: \n", 39 | " ftest.write(name) \n", 40 | " \n", 41 | "ftrainval.close() \n", 42 | "ftrain.close() \n", 43 | "fval.close() \n", 44 | "ftest.close()" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [] 53 | } 54 | ], 55 | "metadata": { 56 | "kernelspec": { 57 | "display_name": "Python 3", 58 | "language": "python", 59 | "name": "python3" 60 | }, 61 | "language_info": { 62 | "codemirror_mode": { 63 | "name": "ipython", 64 | "version": 3 65 | }, 66 | "file_extension": ".py", 67 | "mimetype": "text/x-python", 68 | "name": "python", 69 | "nbconvert_exporter": "python", 70 | "pygments_lexer": "ipython3", 71 | "version": "3.7.0" 72 | } 73 | }, 74 | "nbformat": 4, 75 | "nbformat_minor": 2 76 | } 77 | -------------------------------------------------------------------------------- /mask_on.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhengmingzhang/mask-detection/ff8a57b81ced6bc3fa6c1ae01f3b08cf1cb23e60/mask_on.wav -------------------------------------------------------------------------------- /models.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch.nn.functional as F 3 | from project.parse_config import * 4 | from project.utils import * 5 | 6 | ONNX_EXPORT = False 7 | 8 | 9 | def create_modules(module_defs): 10 | """ 11 | Constructs module list of layer blocks from module configuration in module_defs 12 | """ 13 | hyperparams = module_defs.pop(0) 14 | output_filters = [int(hyperparams['channels'])] 15 | module_list = nn.ModuleList() 16 | yolo_layer_count = 0 17 | for i, module_def in enumerate(module_defs): 18 | modules = nn.Sequential() 19 | 20 | if module_def['type'] == 'convolutional': 21 | bn = int(module_def['batch_normalize']) 22 | filters = int(module_def['filters']) 23 | kernel_size = int(module_def['size']) 24 | pad = (kernel_size - 1) // 2 if int(module_def['pad']) else 0 25 | modules.add_module('conv_%d' % i, nn.Conv2d(in_channels=output_filters[-1], 26 | out_channels=filters, 27 | kernel_size=kernel_size, 28 | stride=int(module_def['stride']), 29 | padding=pad, 30 | bias=not bn)) 31 | if bn: 32 | modules.add_module('batch_norm_%d' % i, nn.BatchNorm2d(filters)) 33 | if module_def['activation'] == 'leaky': 34 | modules.add_module('leaky_%d' % i, nn.LeakyReLU(0.1, inplace=True)) 35 | 36 | elif module_def['type'] == 'maxpool': 37 | kernel_size = int(module_def['size']) 38 | stride = int(module_def['stride']) 39 | if kernel_size == 2 and stride == 1: 40 | modules.add_module('_debug_padding_%d' % i, nn.ZeroPad2d((0, 1, 0, 1))) 41 | maxpool = nn.MaxPool2d(kernel_size=kernel_size, stride=stride, padding=int((kernel_size - 1) // 2)) 42 | modules.add_module('maxpool_%d' % i, maxpool) 43 | 44 | elif module_def['type'] == 'upsample': 45 | # upsample = nn.Upsample(scale_factor=int(module_def['stride']), mode='nearest') # WARNING: deprecated 46 | upsample = Upsample(scale_factor=int(module_def['stride'])) 47 | modules.add_module('upsample_%d' % i, upsample) 48 | 49 | elif module_def['type'] == 'route': 50 | layers = [int(x) for x in module_def['layers'].split(',')] 51 | filters = sum([output_filters[i + 1 if i > 0 else i] for i in layers]) 52 | modules.add_module('route_%d' % i, EmptyLayer()) 53 | 54 | elif 
module_def['type'] == 'shortcut': 55 | filters = output_filters[int(module_def['from'])] 56 | modules.add_module('shortcut_%d' % i, EmptyLayer()) 57 | 58 | elif module_def['type'] == 'yolo': 59 | anchor_idxs = [int(x) for x in module_def['mask'].split(',')] 60 | # Extract anchors 61 | anchors = [float(x) for x in module_def['anchors'].split(',')] 62 | anchors = [(anchors[i], anchors[i + 1]) for i in range(0, len(anchors), 2)] 63 | anchors = [anchors[i] for i in anchor_idxs] 64 | nc = int(module_def['classes']) # number of classes 65 | img_size = hyperparams['height'] 66 | # Define detection layer 67 | yolo_layer = YOLOLayer(anchors, nc, img_size, yolo_layer_count, cfg=hyperparams['cfg']) 68 | modules.add_module('yolo_%d' % i, yolo_layer) 69 | yolo_layer_count += 1 70 | 71 | # Register module list and number of output filters 72 | module_list.append(modules) 73 | output_filters.append(filters) 74 | 75 | return hyperparams, module_list 76 | 77 | 78 | class EmptyLayer(nn.Module): 79 | """Placeholder for 'route' and 'shortcut' layers""" 80 | 81 | def __init__(self): 82 | super(EmptyLayer, self).__init__() 83 | 84 | def forward(self, x): 85 | return x 86 | 87 | 88 | class Upsample(nn.Module): 89 | # Custom Upsample layer (nn.Upsample gives deprecated warning message) 90 | 91 | def __init__(self, scale_factor=1, mode='nearest'): 92 | super(Upsample, self).__init__() 93 | self.scale_factor = scale_factor 94 | self.mode = mode 95 | 96 | def forward(self, x): 97 | return F.interpolate(x, scale_factor=self.scale_factor, mode=self.mode) 98 | 99 | 100 | class YOLOLayer(nn.Module): 101 | def __init__(self, anchors, nc, img_size, yolo_layer, cfg): 102 | super(YOLOLayer, self).__init__() 103 | 104 | self.anchors = torch.Tensor(anchors) 105 | self.na = len(anchors) # number of anchors (3) 106 | self.nc = nc # number of classes (80) 107 | self.nx = 0 # initialize number of x gridpoints 108 | self.ny = 0 # initialize number of y gridpoints 109 | 110 | if ONNX_EXPORT: # grids must be computed in __init__ 111 | stride = [32, 16, 8][yolo_layer] # stride of this layer 112 | nx = int(img_size[1] / stride) # number x grid points 113 | ny = int(img_size[0] / stride) # number y grid points 114 | create_grids(self, max(img_size), (nx, ny)) 115 | 116 | def forward(self, p, img_size, var=None): 117 | if ONNX_EXPORT: 118 | bs = 1 # batch size 119 | else: 120 | bs, ny, nx = p.shape[0], p.shape[-2], p.shape[-1] 121 | if (self.nx, self.ny) != (nx, ny): 122 | create_grids(self, img_size, (nx, ny), p.device) 123 | 124 | # p.view(bs, 255, 13, 13) -- > (bs, 3, 13, 13, 85) # (bs, anchors, grid, grid, classes + xywh) 125 | p = p.view(bs, self.na, self.nc + 5, self.ny, self.nx).permute(0, 1, 3, 4, 2).contiguous() # prediction 126 | 127 | if self.training: 128 | return p 129 | 130 | elif ONNX_EXPORT: 131 | # Constants CAN NOT BE BROADCAST, ensure correct shape! 
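# The .repeat() calls below materialize the grid offsets and anchor sizes to the full
# flattened prediction shape instead of relying on implicit broadcasting, which the
# exported ONNX/CoreML graph may not support (see the CoreML note a few lines down).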
132 | ngu = self.ng.repeat((1, self.na * self.nx * self.ny, 1)) 133 | grid_xy = self.grid_xy.repeat((1, self.na, 1, 1, 1)).view((1, -1, 2)) 134 | anchor_wh = self.anchor_wh.repeat((1, 1, self.nx, self.ny, 1)).view((1, -1, 2)) / ngu 135 | 136 | # p = p.view(-1, 5 + self.nc) 137 | # xy = torch.sigmoid(p[..., 0:2]) + grid_xy[0] # x, y 138 | # wh = torch.exp(p[..., 2:4]) * anchor_wh[0] # width, height 139 | # p_conf = torch.sigmoid(p[:, 4:5]) # Conf 140 | # p_cls = F.softmax(p[:, 5:85], 1) * p_conf # SSD-like conf 141 | # return torch.cat((xy / ngu[0], wh, p_conf, p_cls), 1).t() 142 | 143 | p = p.view(1, -1, 5 + self.nc) 144 | xy = torch.sigmoid(p[..., 0:2]) + grid_xy # x, y 145 | wh = torch.exp(p[..., 2:4]) * anchor_wh # width, height 146 | p_conf = torch.sigmoid(p[..., 4:5]) # Conf 147 | p_cls = p[..., 5:5 + self.nc] 148 | # Broadcasting only supported on first dimension in CoreML. See onnx-coreml/_operators.py 149 | # p_cls = F.softmax(p_cls, 2) * p_conf # SSD-like conf 150 | p_cls = torch.exp(p_cls).permute((2, 1, 0)) 151 | p_cls = p_cls / p_cls.sum(0).unsqueeze(0) * p_conf.permute((2, 1, 0)) # F.softmax() equivalent 152 | p_cls = p_cls.permute(2, 1, 0) 153 | return torch.cat((xy / ngu, wh, p_conf, p_cls), 2).squeeze().t() 154 | 155 | else: # inference 156 | io = p.clone() # inference output 157 | io[..., 0:2] = torch.sigmoid(io[..., 0:2]) + self.grid_xy # xy 158 | io[..., 2:4] = torch.exp(io[..., 2:4]) * self.anchor_wh # wh yolo method 159 | # io[..., 2:4] = ((torch.sigmoid(io[..., 2:4]) * 2) ** 3) * self.anchor_wh # wh power method 160 | io[..., 4:] = torch.sigmoid(io[..., 4:]) # p_conf, p_cls 161 | # io[..., 5:] = F.softmax(io[..., 5:], dim=4) # p_cls 162 | io[..., :4] *= self.stride 163 | if self.nc == 1: 164 | io[..., 5] = 1 # single-class model https://github.com/ultralytics/yolov3/issues/235 165 | 166 | # reshape from [1, 3, 13, 13, 85] to [1, 507, 85] 167 | return io.view(bs, -1, 5 + self.nc), p 168 | 169 | 170 | class Darknet(nn.Module): 171 | """YOLOv3 object detection model""" 172 | 173 | def __init__(self, cfg, img_size=(416, 416)): 174 | super(Darknet, self).__init__() 175 | 176 | self.module_defs = parse_model_cfg(cfg) 177 | self.module_defs[0]['cfg'] = cfg 178 | self.module_defs[0]['height'] = img_size 179 | self.hyperparams, self.module_list = create_modules(self.module_defs) 180 | self.yolo_layers = get_yolo_layers(self) 181 | 182 | # Darknet Header https://github.com/AlexeyAB/darknet/issues/2914#issuecomment-496675346 183 | self.version = np.array([0, 2, 5], dtype=np.int32) # (int32) version info: major, minor, revision 184 | self.seen = np.array([0], dtype=np.int64) # (int64) number of images seen during training 185 | 186 | def forward(self, x, var=None): 187 | img_size = max(x.shape[-2:]) 188 | layer_outputs = [] 189 | output = [] 190 | 191 | for i, (module_def, module) in enumerate(zip(self.module_defs, self.module_list)): 192 | mtype = module_def['type'] 193 | if mtype in ['convolutional', 'upsample', 'maxpool']: 194 | x = module(x) 195 | elif mtype == 'route': 196 | layer_i = [int(x) for x in module_def['layers'].split(',')] 197 | if len(layer_i) == 1: 198 | x = layer_outputs[layer_i[0]] 199 | else: 200 | x = torch.cat([layer_outputs[i] for i in layer_i], 1) 201 | elif mtype == 'shortcut': 202 | layer_i = int(module_def['from']) 203 | x = layer_outputs[-1] + layer_outputs[layer_i] 204 | elif mtype == 'yolo': 205 | x = module[0](x, img_size) 206 | output.append(x) 207 | layer_outputs.append(x) 208 | 209 | if self.training: 210 | return output 211 | elif ONNX_EXPORT: 
212 | output = torch.cat(output, 1) # cat 3 layers 85 x (507, 2028, 8112) to 85 x 10647 213 | nc = self.module_list[self.yolo_layers[0]][0].nc # number of classes 214 | return output[5:5 + nc].t(), output[:4].t() # ONNX scores, boxes 215 | else: 216 | io, p = list(zip(*output)) # inference output, training output 217 | return torch.cat(io, 1), p 218 | 219 | def fuse(self): 220 | # Fuse Conv2d + BatchNorm2d layers throughout model 221 | fused_list = nn.ModuleList() 222 | for a in list(self.children())[0]: 223 | for i, b in enumerate(a): 224 | if isinstance(b, nn.modules.batchnorm.BatchNorm2d): 225 | # fuse this bn layer with the previous conv2d layer 226 | conv = a[i - 1] 227 | fused = torch_utils.fuse_conv_and_bn(conv, b) 228 | a = nn.Sequential(fused, *list(a.children())[i + 1:]) 229 | break 230 | fused_list.append(a) 231 | self.module_list = fused_list 232 | # model_info(self) # yolov3-spp reduced from 225 to 152 layers 233 | 234 | 235 | def get_yolo_layers(model): 236 | a = [module_def['type'] == 'yolo' for module_def in model.module_defs] 237 | return [i for i, x in enumerate(a) if x] # [82, 94, 106] for yolov3 238 | 239 | 240 | def create_grids(self, img_size=416, ng=(13, 13), device='cpu'): 241 | nx, ny = ng # x and y grid size 242 | self.img_size = img_size 243 | self.stride = img_size / max(ng) 244 | 245 | # build xy offsets 246 | yv, xv = torch.meshgrid([torch.arange(ny), torch.arange(nx)]) 247 | self.grid_xy = torch.stack((xv, yv), 2).to(device).float().view((1, 1, ny, nx, 2)) 248 | 249 | # build wh gains 250 | self.anchor_vec = self.anchors.to(device) / self.stride 251 | self.anchor_wh = self.anchor_vec.view(1, self.na, 1, 1, 2).to(device) 252 | self.ng = torch.Tensor(ng).to(device) 253 | self.nx = nx 254 | self.ny = ny 255 | 256 | 257 | def load_darknet_weights(self, weights, cutoff=-1): 258 | # Parses and loads the weights stored in 'weights' 259 | # cutoff: save layers between 0 and cutoff (if cutoff = -1 all are saved) 260 | weights_file = weights.split(os.sep)[-1] 261 | 262 | # Try to download weights if not available locally 263 | if not os.path.isfile(weights): 264 | try: 265 | os.system('wget https://pjreddie.com/media/files/' + weights_file + ' -O ' + weights) 266 | except IOError: 267 | print(weights + ' not found.\nTry https://drive.google.com/drive/folders/1uxgUBemJVw9wZsdpboYbzUN4bcRhsuAI') 268 | 269 | # Establish cutoffs 270 | if weights_file == 'darknet53.conv.74': 271 | cutoff = 75 272 | elif weights_file == 'yolov3-tiny.conv.15': 273 | cutoff = 15 274 | 275 | # Read weights file 276 | with open(weights, 'rb') as f: 277 | # Read Header https://github.com/AlexeyAB/darknet/issues/2914#issuecomment-496675346 278 | self.version = np.fromfile(f, dtype=np.int32, count=3) # (int32) version info: major, minor, revision 279 | self.seen = np.fromfile(f, dtype=np.int64, count=1) # (int64) number of images seen during training 280 | 281 | weights = np.fromfile(f, dtype=np.float32) # The rest are weights 282 | 283 | ptr = 0 284 | for i, (module_def, module) in enumerate(zip(self.module_defs[:cutoff], self.module_list[:cutoff])): 285 | if module_def['type'] == 'convolutional': 286 | conv_layer = module[0] 287 | if module_def['batch_normalize']: 288 | # Load BN bias, weights, running mean and running variance 289 | bn_layer = module[1] 290 | num_b = bn_layer.bias.numel() # Number of biases 291 | # Bias 292 | bn_b = torch.from_numpy(weights[ptr:ptr + num_b]).view_as(bn_layer.bias) 293 | bn_layer.bias.data.copy_(bn_b) 294 | ptr += num_b 295 | # Weight 296 | bn_w = 
torch.from_numpy(weights[ptr:ptr + num_b]).view_as(bn_layer.weight) 297 | bn_layer.weight.data.copy_(bn_w) 298 | ptr += num_b 299 | # Running Mean 300 | bn_rm = torch.from_numpy(weights[ptr:ptr + num_b]).view_as(bn_layer.running_mean) 301 | bn_layer.running_mean.data.copy_(bn_rm) 302 | ptr += num_b 303 | # Running Var 304 | bn_rv = torch.from_numpy(weights[ptr:ptr + num_b]).view_as(bn_layer.running_var) 305 | bn_layer.running_var.data.copy_(bn_rv) 306 | ptr += num_b 307 | else: 308 | # Load conv. bias 309 | num_b = conv_layer.bias.numel() 310 | conv_b = torch.from_numpy(weights[ptr:ptr + num_b]).view_as(conv_layer.bias) 311 | conv_layer.bias.data.copy_(conv_b) 312 | ptr += num_b 313 | # Load conv. weights 314 | num_w = conv_layer.weight.numel() 315 | conv_w = torch.from_numpy(weights[ptr:ptr + num_w]).view_as(conv_layer.weight) 316 | conv_layer.weight.data.copy_(conv_w) 317 | ptr += num_w 318 | 319 | return cutoff 320 | 321 | 322 | def save_weights(self, path='model.weights', cutoff=-1): 323 | # Converts a PyTorch model to Darket format (*.pt to *.weights) 324 | # Note: Does not work if model.fuse() is applied 325 | with open(path, 'wb') as f: 326 | # Write Header https://github.com/AlexeyAB/darknet/issues/2914#issuecomment-496675346 327 | self.version.tofile(f) # (int32) version info: major, minor, revision 328 | self.seen.tofile(f) # (int64) number of images seen during training 329 | 330 | # Iterate through layers 331 | for i, (module_def, module) in enumerate(zip(self.module_defs[:cutoff], self.module_list[:cutoff])): 332 | if module_def['type'] == 'convolutional': 333 | conv_layer = module[0] 334 | # If batch norm, load bn first 335 | if module_def['batch_normalize']: 336 | bn_layer = module[1] 337 | bn_layer.bias.data.cpu().numpy().tofile(f) 338 | bn_layer.weight.data.cpu().numpy().tofile(f) 339 | bn_layer.running_mean.data.cpu().numpy().tofile(f) 340 | bn_layer.running_var.data.cpu().numpy().tofile(f) 341 | # Load conv bias 342 | else: 343 | conv_layer.bias.data.cpu().numpy().tofile(f) 344 | # Load conv weights 345 | conv_layer.weight.data.cpu().numpy().tofile(f) 346 | 347 | 348 | def convert(cfg='cfg/yolov3-spp.cfg', weights='weights/yolov3-spp.weights'): 349 | # Converts between PyTorch and Darknet format per extension (i.e. 
*.weights convert to *.pt and vice versa) 350 | # from models import *; convert('cfg/yolov3-spp.cfg', 'weights/yolov3-spp.weights') 351 | 352 | # Initialize model 353 | model = Darknet(cfg) 354 | 355 | # Load weights and save 356 | if weights.endswith('.pt'): # if PyTorch format 357 | model.load_state_dict(torch.load(weights, map_location='cpu')['model']) 358 | save_weights(model, path='converted.weights', cutoff=-1) 359 | print("Success: converted '%s' to 'converted.weights'" % weights) 360 | 361 | elif weights.endswith('.weights'): # darknet format 362 | _ = load_darknet_weights(model, weights) 363 | chkpt = {'epoch': -1, 'best_loss': None, 'model': model.state_dict(), 'optimizer': None} 364 | torch.save(chkpt, 'converted.pt') 365 | print("Success: converted '%s' to 'converted.pt'" % weights) 366 | 367 | else: 368 | print('Error: extension not supported.') 369 | -------------------------------------------------------------------------------- /project/datasets.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import math 3 | import os 4 | import random 5 | import shutil 6 | from pathlib import Path 7 | 8 | import cv2 9 | import numpy as np 10 | import torch 11 | from torch.utils.data import Dataset 12 | from tqdm import tqdm 13 | 14 | from project.utils import xyxy2xywh 15 | 16 | 17 | class LoadImages: # for inference 18 | def __init__(self, path, img_size=416): 19 | self.height = img_size 20 | img_formats = ['.jpg', '.jpeg', '.png', '.tif'] 21 | vid_formats = ['.mov', '.avi', '.mp4'] 22 | 23 | files = [] 24 | if os.path.isdir(path): 25 | files = sorted(glob.glob('%s/*.*' % path)) 26 | elif os.path.isfile(path): 27 | files = [path] 28 | 29 | images = [x for x in files if os.path.splitext(x)[-1].lower() in img_formats] 30 | videos = [x for x in files if os.path.splitext(x)[-1].lower() in vid_formats] 31 | nI, nV = len(images), len(videos) 32 | 33 | self.files = images + videos 34 | self.nF = nI + nV # number of files 35 | self.video_flag = [False] * nI + [True] * nV 36 | self.mode = 'images' 37 | if any(videos): 38 | self.new_video(videos[0]) # new video 39 | else: 40 | self.cap = None 41 | assert self.nF > 0, 'No images or videos found in ' + path 42 | 43 | def __iter__(self): 44 | self.count = 0 45 | return self 46 | 47 | def __next__(self): 48 | if self.count == self.nF: 49 | raise StopIteration 50 | path = self.files[self.count] 51 | 52 | if self.video_flag[self.count]: 53 | # Read video 54 | self.mode = 'video' 55 | ret_val, img0 = self.cap.read() 56 | if not ret_val: 57 | self.count += 1 58 | self.cap.release() 59 | if self.count == self.nF: # last video 60 | raise StopIteration 61 | else: 62 | path = self.files[self.count] 63 | self.new_video(path) 64 | ret_val, img0 = self.cap.read() 65 | 66 | self.frame += 1 67 | print('video %g/%g (%g/%g) %s: ' % (self.count + 1, self.nF, self.frame, self.nframes, path), end='') 68 | 69 | else: 70 | # Read image 71 | self.count += 1 72 | img0 = cv2.imread(path) # BGR 73 | assert img0 is not None, 'File Not Found ' + path 74 | print('image %g/%g %s: ' % (self.count, self.nF, path), end='') 75 | 76 | # Padded resize 77 | img, _, _, _ = letterbox(img0, new_shape=self.height) 78 | 79 | # Normalize RGB 80 | img = img[:, :, ::-1].transpose(2, 0, 1) # BGR to RGB 81 | img = np.ascontiguousarray(img, dtype=np.float32) # uint8 to float32 82 | img /= 255.0 # 0 - 255 to 0.0 - 1.0 83 | 84 | # cv2.imwrite(path + '.letterbox.jpg', 255 * img.transpose((1, 2, 0))[:, :, ::-1]) # save letterbox image 85 | 
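# Yields (path, letterboxed CHW float32 image scaled to [0, 1], original BGR image,
# and the current cv2.VideoCapture handle, which is None when the input list has no videos).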
return path, img, img0, self.cap 86 | 87 | def new_video(self, path): 88 | self.frame = 0 89 | self.cap = cv2.VideoCapture(path) 90 | self.nframes = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT)) 91 | 92 | def __len__(self): 93 | return self.nF # number of files 94 | 95 | 96 | class LoadWebcam: # for inference 97 | def __init__(self, img_size=416): 98 | self.cam = cv2.VideoCapture(0) 99 | self.height = img_size 100 | 101 | def __iter__(self): 102 | self.count = -1 103 | return self 104 | 105 | def __next__(self): 106 | self.count += 1 107 | if cv2.waitKey(1) == 27: # esc to quit 108 | cv2.destroyAllWindows() 109 | raise StopIteration 110 | 111 | # Read image 112 | ret_val, img0 = self.cam.read() 113 | assert ret_val, 'Webcam Error' 114 | img_path = 'webcam_%g.jpg' % self.count 115 | img0 = cv2.flip(img0, 1) # flip left-right 116 | print('webcam %g: ' % self.count, end='') 117 | 118 | # Padded resize 119 | img, _, _, _ = letterbox(img0, new_shape=self.height) 120 | 121 | # Normalize RGB 122 | img = img[:, :, ::-1].transpose(2, 0, 1) # BGR to RGB 123 | img = np.ascontiguousarray(img, dtype=np.float32) # uint8 to float32 124 | img /= 255.0 # 0 - 255 to 0.0 - 1.0 125 | 126 | return img_path, img, img0, None 127 | 128 | def __len__(self): 129 | return 0 130 | 131 | 132 | class LoadImagesAndLabels(Dataset): # for training/testing 133 | def __init__(self, path, img_size=416, batch_size=16, augment=False, rect=True, image_weights=False): 134 | with open(path, 'r') as f: 135 | img_files = f.read().splitlines() 136 | self.img_files = list(filter(lambda x: len(x) > 0, img_files)) 137 | 138 | n = len(self.img_files) 139 | bi = np.floor(np.arange(n) / batch_size).astype(np.int) # batch index 140 | nb = bi[-1] + 1 # number of batches 141 | assert n > 0, 'No images found in %s' % path 142 | 143 | self.n = n 144 | self.batch = bi # batch index of image 145 | self.img_size = img_size 146 | self.augment = augment 147 | self.image_weights = image_weights 148 | self.rect = False if image_weights else rect 149 | self.label_files = [x.replace('images', 'labels'). 150 | replace('.jpeg', '.txt'). 151 | replace('.jpg', '.txt'). 152 | replace('.bmp', '.txt'). 
153 | replace('.png', '.txt') for x in self.img_files] 154 | 155 | # Rectangular Training https://github.com/ultralytics/yolov3/issues/232 156 | if self.rect: 157 | from PIL import Image 158 | 159 | # Read image shapes 160 | sp = 'data' + os.sep + path.replace('.txt', '.shapes').split(os.sep)[-1] # shapefile path 161 | if os.path.exists(sp): # read existing shapefile 162 | with open(sp, 'r') as f: 163 | s = np.array([x.split() for x in f.read().splitlines()], dtype=np.float32) 164 | assert len(s) == n, 'Shapefile out of sync, please delete %s and rerun' % sp 165 | else: # no shapefile, so read shape using PIL and write shapefile for next time (faster) 166 | s = np.array([Image.open(f).size for f in tqdm(self.img_files, desc='Reading image shapes')]) 167 | np.savetxt(sp, s, fmt='%g') 168 | 169 | # Sort by aspect ratio 170 | ar = s[:, 1] / s[:, 0] # aspect ratio 171 | i = ar.argsort() 172 | ar = ar[i] 173 | self.img_files = [self.img_files[i] for i in i] 174 | self.label_files = [self.label_files[i] for i in i] 175 | 176 | # Set training image shapes 177 | shapes = [[1, 1]] * nb 178 | for i in range(nb): 179 | ari = ar[bi == i] 180 | mini, maxi = ari.min(), ari.max() 181 | if maxi < 1: 182 | shapes[i] = [maxi, 1] 183 | elif mini > 1: 184 | shapes[i] = [1, 1 / mini] 185 | 186 | self.batch_shapes = np.ceil(np.array(shapes) * img_size / 32.).astype(np.int) * 32 187 | 188 | # Preload labels (required for weighted CE training) 189 | self.imgs = [None] * n 190 | self.labels = [np.zeros((0, 5))] * n 191 | iter = tqdm(self.label_files, desc='Reading labels') if n > 1000 else self.label_files 192 | for i, file in enumerate(iter): 193 | try: 194 | with open(file, 'r') as f: 195 | l = np.array([x.split() for x in f.read().splitlines()], dtype=np.float32) 196 | if l.shape[0]: 197 | assert l.shape[1] == 5, '> 5 label columns: %s' % file 198 | assert (l >= 0).all(), 'negative labels: %s' % file 199 | assert (l[:, 1:] <= 1).all(), 'non-normalized or out of bounds coordinate labels: %s' % file 200 | self.labels[i] = l 201 | except: 202 | pass # print('Warning: missing labels for %s' % self.img_files[i]) # missing label file 203 | assert len(np.concatenate(self.labels, 0)) > 0, 'No labels found. Incorrect label paths provided.' 
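# Usage sketch (file paths are placeholders; adjust to your dataset layout):
#   dataset = LoadImagesAndLabels('data/train.txt', img_size=416, batch_size=16, augment=True)
#   loader = torch.utils.data.DataLoader(dataset, batch_size=16,
#                                        collate_fn=LoadImagesAndLabels.collate_fn)
#   for imgs, targets, paths, shapes in loader:
#       pass  # imgs: (N, 3, H, W) in [0, 1]; targets: (M, 6) rows of [image_idx, class, x, y, w, h]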
204 | 205 | def __len__(self): 206 | return len(self.img_files) 207 | 208 | # def __iter__(self): 209 | # self.count = -1 210 | # print('ran dataset iter') 211 | # #self.shuffled_vector = np.random.permutation(self.nF) if self.augment else np.arange(self.nF) 212 | # return self 213 | 214 | def __getitem__(self, index): 215 | if self.image_weights: 216 | index = self.indices[index] 217 | 218 | img_path = self.img_files[index] 219 | label_path = self.label_files[index] 220 | 221 | # Load image 222 | img = self.imgs[index] 223 | if img is None: 224 | img = cv2.imread(img_path) # BGR 225 | assert img is not None, 'File Not Found ' + img_path 226 | if self.n < 1001: 227 | self.imgs[index] = img # cache image into memory 228 | 229 | # Augment colorspace 230 | augment_hsv = True 231 | if self.augment and augment_hsv: 232 | # SV augmentation by 50% 233 | fraction = 0.50 # must be < 1.0 234 | img_hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV) # hue, sat, val 235 | S = img_hsv[:, :, 1].astype(np.float32) # saturation 236 | V = img_hsv[:, :, 2].astype(np.float32) # value 237 | 238 | a = (random.random() * 2 - 1) * fraction + 1 239 | b = (random.random() * 2 - 1) * fraction + 1 240 | S *= a 241 | V *= b 242 | 243 | img_hsv[:, :, 1] = S if a < 1 else S.clip(None, 255) 244 | img_hsv[:, :, 2] = V if b < 1 else V.clip(None, 255) 245 | cv2.cvtColor(img_hsv, cv2.COLOR_HSV2BGR, dst=img) 246 | 247 | # Letterbox 248 | h, w, _ = img.shape 249 | if self.rect: 250 | shape = self.batch_shapes[self.batch[index]] 251 | img, ratio, padw, padh = letterbox(img, new_shape=shape, mode='rect') 252 | else: 253 | shape = self.img_size 254 | img, ratio, padw, padh = letterbox(img, new_shape=shape, mode='square') 255 | 256 | # Load labels 257 | labels = [] 258 | if os.path.isfile(label_path): 259 | # with open(label_path, 'r') as f: 260 | # x = np.array([x.split() for x in f.read().splitlines()], dtype=np.float32) 261 | x = self.labels[index] 262 | if x.size > 0: 263 | # Normalized xywh to pixel xyxy format 264 | labels = x.copy() 265 | labels[:, 1] = ratio * w * (x[:, 1] - x[:, 3] / 2) + padw 266 | labels[:, 2] = ratio * h * (x[:, 2] - x[:, 4] / 2) + padh 267 | labels[:, 3] = ratio * w * (x[:, 1] + x[:, 3] / 2) + padw 268 | labels[:, 4] = ratio * h * (x[:, 2] + x[:, 4] / 2) + padh 269 | 270 | # Augment image and labels 271 | if self.augment: 272 | img, labels = random_affine(img, labels, degrees=(-5, 5), translate=(0.10, 0.10), scale=(0.90, 1.10)) 273 | 274 | nL = len(labels) # number of labels 275 | if nL: 276 | # convert xyxy to xywh 277 | labels[:, 1:5] = xyxy2xywh(labels[:, 1:5]) 278 | 279 | # Normalize coordinates 0 - 1 280 | labels[:, [2, 4]] /= img.shape[0] # height 281 | labels[:, [1, 3]] /= img.shape[1] # width 282 | 283 | if self.augment: 284 | # random left-right flip 285 | lr_flip = True 286 | if lr_flip and random.random() > 0.5: 287 | img = np.fliplr(img) 288 | if nL: 289 | labels[:, 1] = 1 - labels[:, 1] 290 | 291 | # random up-down flip 292 | ud_flip = False 293 | if ud_flip and random.random() > 0.5: 294 | img = np.flipud(img) 295 | if nL: 296 | labels[:, 2] = 1 - labels[:, 2] 297 | 298 | labels_out = torch.zeros((nL, 6)) 299 | if nL: 300 | labels_out[:, 1:] = torch.from_numpy(labels) 301 | 302 | # Normalize 303 | img = img[:, :, ::-1].transpose(2, 0, 1) # BGR to RGB, to 3x416x416 304 | img = np.ascontiguousarray(img, dtype=np.float32) # uint8 to float32 305 | img /= 255.0 # 0 - 255 to 0.0 - 1.0 306 | 307 | return torch.from_numpy(img), labels_out, img_path, (h, w) 308 | 309 | @staticmethod 310 | def 
collate_fn(batch): 311 | img, label, path, hw = list(zip(*batch)) # transposed 312 | for i, l in enumerate(label): 313 | l[:, 0] = i # add target image index for build_targets() 314 | return torch.stack(img, 0), torch.cat(label, 0), path, hw 315 | 316 | 317 | def letterbox(img, new_shape=416, color=(127.5, 127.5, 127.5), mode='auto'): 318 | # Resize a rectangular image to a 32 pixel multiple rectangle 319 | # https://github.com/ultralytics/yolov3/issues/232 320 | shape = img.shape[:2] # current shape [height, width] 321 | if isinstance(new_shape, int): 322 | ratio = float(new_shape) / max(shape) 323 | else: 324 | ratio = max(new_shape) / max(shape) # ratio = new / old 325 | new_unpad = (int(round(shape[1] * ratio)), int(round(shape[0] * ratio))) 326 | 327 | # Compute padding https://github.com/ultralytics/yolov3/issues/232 328 | if mode is 'auto': # minimum rectangle 329 | dw = np.mod(new_shape - new_unpad[0], 32) / 2 # width padding 330 | dh = np.mod(new_shape - new_unpad[1], 32) / 2 # height padding 331 | elif mode is 'square': # square 332 | dw = (new_shape - new_unpad[0]) / 2 # width padding 333 | dh = (new_shape - new_unpad[1]) / 2 # height padding 334 | elif mode is 'rect': # square 335 | dw = (new_shape[1] - new_unpad[0]) / 2 # width padding 336 | dh = (new_shape[0] - new_unpad[1]) / 2 # height padding 337 | 338 | top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1)) 339 | left, right = int(round(dw - 0.1)), int(round(dw + 0.1)) 340 | img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR) # resized, no border 341 | img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color) # padded square 342 | return img, ratio, dw, dh 343 | 344 | 345 | def random_affine(img, targets=(), degrees=(-10, 10), translate=(.1, .1), scale=(.9, 1.1), shear=(-2, 2), 346 | borderValue=(127.5, 127.5, 127.5)): 347 | # torchvision.transforms.RandomAffine(degrees=(-10, 10), translate=(.1, .1), scale=(.9, 1.1), shear=(-10, 10)) 348 | # https://medium.com/uruvideo/dataset-augmentation-with-random-homographies-a8f4b44830d4 349 | 350 | if targets is None: 351 | targets = [] 352 | border = 0 # width of added border (optional) 353 | height = img.shape[0] + border * 2 354 | width = img.shape[1] + border * 2 355 | 356 | # Rotation and Scale 357 | R = np.eye(3) 358 | a = random.random() * (degrees[1] - degrees[0]) + degrees[0] 359 | # a += random.choice([-180, -90, 0, 90]) # 90deg rotations added to small rotations 360 | s = random.random() * (scale[1] - scale[0]) + scale[0] 361 | R[:2] = cv2.getRotationMatrix2D(angle=a, center=(img.shape[1] / 2, img.shape[0] / 2), scale=s) 362 | 363 | # Translation 364 | T = np.eye(3) 365 | T[0, 2] = (random.random() * 2 - 1) * translate[0] * img.shape[0] + border # x translation (pixels) 366 | T[1, 2] = (random.random() * 2 - 1) * translate[1] * img.shape[1] + border # y translation (pixels) 367 | 368 | # Shear 369 | S = np.eye(3) 370 | S[0, 1] = math.tan((random.random() * (shear[1] - shear[0]) + shear[0]) * math.pi / 180) # x shear (deg) 371 | S[1, 0] = math.tan((random.random() * (shear[1] - shear[0]) + shear[0]) * math.pi / 180) # y shear (deg) 372 | 373 | M = S @ T @ R # Combined rotation matrix. ORDER IS IMPORTANT HERE!! 
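    # NOTE (editor, illustrative): M acts on homogeneous pixel coordinates, so a point
    # p = [x, y, 1]^T maps to M @ p = S @ (T @ (R @ p)): rotation/scale first, then
    # translation, then shear. Composing in a different order would produce a different
    # warp, hence the warning above. The same M is applied to the label box corners
    # below (xy @ M.T), keeping the boxes aligned with the warped image.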
374 | imw = cv2.warpAffine(img, M[:2], dsize=(width, height), flags=cv2.INTER_LINEAR, 375 | borderValue=borderValue) # BGR order borderValue 376 | 377 | # Return warped points also 378 | if len(targets) > 0: 379 | n = targets.shape[0] 380 | points = targets[:, 1:5].copy() 381 | area0 = (points[:, 2] - points[:, 0]) * (points[:, 3] - points[:, 1]) 382 | 383 | # warp points 384 | xy = np.ones((n * 4, 3)) 385 | xy[:, :2] = points[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape(n * 4, 2) # x1y1, x2y2, x1y2, x2y1 386 | xy = (xy @ M.T)[:, :2].reshape(n, 8) 387 | 388 | # create new boxes 389 | x = xy[:, [0, 2, 4, 6]] 390 | y = xy[:, [1, 3, 5, 7]] 391 | xy = np.concatenate((x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T 392 | 393 | # # apply angle-based reduction of bounding boxes 394 | # radians = a * math.pi / 180 395 | # reduction = max(abs(math.sin(radians)), abs(math.cos(radians))) ** 0.5 396 | # x = (xy[:, 2] + xy[:, 0]) / 2 397 | # y = (xy[:, 3] + xy[:, 1]) / 2 398 | # w = (xy[:, 2] - xy[:, 0]) * reduction 399 | # h = (xy[:, 3] - xy[:, 1]) * reduction 400 | # xy = np.concatenate((x - w / 2, y - h / 2, x + w / 2, y + h / 2)).reshape(4, n).T 401 | 402 | # reject warped points outside of image 403 | xy[:, [0, 2]] = xy[:, [0, 2]].clip(0, width) 404 | xy[:, [1, 3]] = xy[:, [1, 3]].clip(0, height) 405 | w = xy[:, 2] - xy[:, 0] 406 | h = xy[:, 3] - xy[:, 1] 407 | area = w * h 408 | ar = np.maximum(w / (h + 1e-16), h / (w + 1e-16)) 409 | i = (w > 4) & (h > 4) & (area / (area0 + 1e-16) > 0.1) & (ar < 10) 410 | 411 | targets = targets[i] 412 | targets[:, 1:5] = xy[i] 413 | 414 | return imw, targets 415 | 416 | 417 | def convert_images2bmp(): 418 | # cv2.imread() jpg at 230 img/s, *.bmp at 400 img/s 419 | for path in ['../coco/images/val2014/', '../coco/images/train2014/']: 420 | folder = os.sep + Path(path).name 421 | output = path.replace(folder, folder + 'bmp') 422 | if os.path.exists(output): 423 | shutil.rmtree(output) # delete output folder 424 | os.makedirs(output) # make new output folder 425 | 426 | for f in tqdm(glob.glob('%s*.jpg' % path)): 427 | save_name = f.replace('.jpg', '.bmp').replace(folder, folder + 'bmp') 428 | cv2.imwrite(save_name, cv2.imread(f)) 429 | 430 | for label_path in ['../coco/trainvalno5k.txt', '../coco/5k.txt']: 431 | with open(label_path, 'r') as file: 432 | lines = file.read() 433 | lines = lines.replace('2014/', '2014bmp/').replace('.jpg', '.bmp').replace( 434 | '/Users/glennjocher/PycharmProjects/', '../') 435 | with open(label_path.replace('5k', '5k_bmp'), 'w') as file: 436 | file.write(lines) 437 | -------------------------------------------------------------------------------- /project/gcp.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # New VM 4 | rm -rf yolov3 weights coco 5 | git clone https://github.com/ultralytics/yolov3 6 | bash yolov3/weights/download_yolov3_weights.sh && cp -r weights yolov3 7 | bash yolov3/data/get_coco_dataset.sh 8 | git clone https://github.com/cocodataset/cocoapi && cd cocoapi/PythonAPI && make && cd ../.. 
&& cp -r cocoapi/PythonAPI/pycocotools yolov3 9 | sudo shutdown 10 | 11 | # Re-clone 12 | rm -rf yolov3 # Warning: remove existing 13 | git clone https://github.com/ultralytics/yolov3 # master 14 | # git clone -b test --depth 1 https://github.com/ultralytics/yolov3 test # branch 15 | cp -r cocoapi/PythonAPI/pycocotools yolov3 16 | cp -r weights yolov3 && cd yolov3 17 | 18 | # Train 19 | python3 train.py 20 | 21 | # Resume 22 | python3 train.py --resume 23 | 24 | # Detect 25 | python3 detect.py 26 | 27 | # Test 28 | python3 test.py --save-json 29 | 30 | # Git pull 31 | git pull https://github.com/ultralytics/yolov3 # master 32 | git pull https://github.com/ultralytics/yolov3 test # branch 33 | 34 | # Test Darknet training 35 | python3 test.py --weights ../darknet/backup/yolov3.backup 36 | 37 | # Copy latest.pt TO bucket 38 | gsutil cp yolov3/weights/latest1gpu.pt gs://ultralytics 39 | 40 | # Copy latest.pt FROM bucket 41 | gsutil cp gs://ultralytics/latest.pt yolov3/weights/latest.pt 42 | wget https://storage.googleapis.com/ultralytics/yolov3/latest_v1_0.pt -O weights/latest_v1_0.pt 43 | wget https://storage.googleapis.com/ultralytics/yolov3/best_v1_0.pt -O weights/best_v1_0.pt 44 | 45 | # Reproduce tutorials 46 | rm results*.txt # WARNING: removes existing results 47 | python3 train.py --nosave --data data/coco_1img.data && mv results.txt results0r_1img.txt 48 | python3 train.py --nosave --data data/coco_10img.data && mv results.txt results0r_10img.txt 49 | python3 train.py --nosave --data data/coco_100img.data && mv results.txt results0r_100img.txt 50 | #python3 train.py --nosave --data data/coco_100img.data --transfer && mv results.txt results3_100imgTL.txt 51 | python3 -c "from utils import utils; utils.plot_results()" 52 | gsutil cp results*.txt gs://ultralytics 53 | gsutil cp results.png gs://ultralytics 54 | sudo shutdown 55 | 56 | # Reproduce mAP 57 | python3 test.py --save-json --img-size 608 58 | python3 test.py --save-json --img-size 416 59 | python3 test.py --save-json --img-size 320 60 | sudo shutdown 61 | 62 | # Unit tests 63 | python3 detect.py # detect 2 persons, 1 tie 64 | python3 test.py --data data/coco_32img.data # test mAP = 0.8 65 | python3 train.py --data data/coco_32img.data --epochs 5 --nosave # train 5 epochs 66 | python3 train.py --data data/coco_1cls.data --epochs 5 --nosave # train 5 epochs 67 | python3 train.py --data data/coco_1img.data --epochs 5 --nosave # train 5 epochs 68 | 69 | # AlexyAB Darknet 70 | gsutil cp -r gs://sm6/supermarket2 . 
# dataset from bucket 71 | rm -rf darknet && git clone https://github.com/AlexeyAB/darknet && cd darknet && wget -c https://pjreddie.com/media/files/darknet53.conv.74 # sudo apt install libopencv-dev && make 72 | ./darknet detector calc_anchors data/coco_img64.data -num_of_clusters 9 -width 320 -height 320 # kmeans anchor calculation 73 | ./darknet detector train ../supermarket2/supermarket2.data ../yolo_v3_spp_pan_scale.cfg darknet53.conv.74 -map -dont_show # train spp 74 | ./darknet detector train ../yolov3/data/coco.data ../yolov3-spp.cfg darknet53.conv.74 -map -dont_show # train spp coco 75 | 76 | ./darknet detector train ../supermarket2/supermarket2.data ../yolov3-spp-sm2-1cls-scalexy_variable.cfg darknet53.conv.74 -map -dont_show # train spp 77 | gsutil cp -r backup/*5000.weights gs://sm6/weights 78 | sudo shutdown 79 | 80 | 81 | ./darknet detector train ../supermarket2/supermarket2.data ../yolov3-tiny-sm2-1cls.cfg yolov3-tiny.conv.15 -map -dont_show # train tiny 82 | ./darknet detector train ../supermarket2/supermarket2.data cfg/yolov3-spp-sm2-1cls.cfg backup/yolov3-spp-sm2-1cls_last.weights # resume 83 | python3 train.py --data ../supermarket2/supermarket2.data --cfg ../yolov3-spp-sm2-1cls.cfg --epochs 100 --num-workers 8 --img-size 320 --nosave # train ultralytics 84 | python3 test.py --data ../supermarket2/supermarket2.data --weights ../darknet/backup/yolov3-spp-sm2-1cls_5000.weights --cfg cfg/yolov3-spp-sm2-1cls.cfg # test 85 | gsutil cp -r backup/*.weights gs://sm6/weights # weights to bucket 86 | 87 | python3 test.py --data ../supermarket2/supermarket2.data --weights weights/yolov3-spp-sm2-1cls_5000.weights --cfg ../yolov3-spp-sm2-1cls.cfg --img-size 320 --conf-thres 0.2 # test 88 | python3 test.py --data ../supermarket2/supermarket2.data --weights weights/yolov3-spp-sm2-1cls-scalexy_125_5000.weights --cfg ../yolov3-spp-sm2-1cls-scalexy_125.cfg --img-size 320 --conf-thres 0.2 # test 89 | python3 test.py --data ../supermarket2/supermarket2.data --weights weights/yolov3-spp-sm2-1cls-scalexy_150_5000.weights --cfg ../yolov3-spp-sm2-1cls-scalexy_150.cfg --img-size 320 --conf-thres 0.2 # test 90 | python3 test.py --data ../supermarket2/supermarket2.data --weights weights/yolov3-spp-sm2-1cls-scalexy_200_5000.weights --cfg ../yolov3-spp-sm2-1cls-scalexy_200.cfg --img-size 320 --conf-thres 0.2 # test 91 | python3 test.py --data ../supermarket2/supermarket2.data --weights ../darknet/backup/yolov3-spp-sm2-1cls-scalexy_variable_5000.weights --cfg ../yolov3-spp-sm2-1cls-scalexy_variable.cfg --img-size 320 --conf-thres 0.2 # test 92 | 93 | 94 | 95 | 96 | 97 | # Debug/Development 98 | python3 train.py --evolve --data data/coco_1k5k.data --epochs 30 --img-size 320 99 | gsutil cp evolve.txt gs://ultralytics 100 | sudo shutdown 101 | -------------------------------------------------------------------------------- /project/parse_config.py: -------------------------------------------------------------------------------- 1 | def parse_model_cfg(path): 2 | """Parses the yolo-v3 layer configuration file and returns module definitions""" 3 | file = open(path, 'r') 4 | lines = file.read().split('\n') 5 | lines = [x for x in lines if x and not x.startswith('#')] 6 | lines = [x.rstrip().lstrip() for x in lines] # get rid of fringe whitespaces 7 | module_defs = [] 8 | for line in lines: 9 | if line.startswith('['): # This marks the start of a new block 10 | module_defs.append({}) 11 | module_defs[-1]['type'] = line[1:-1].rstrip() 12 | if module_defs[-1]['type'] == 'convolutional': 13 | 
module_defs[-1]['batch_normalize'] = 0 14 | else: 15 | key, value = line.split("=") 16 | value = value.strip() 17 | module_defs[-1][key.rstrip()] = value.strip() 18 | 19 | return module_defs 20 | 21 | 22 | def parse_data_cfg(path): 23 | """Parses the data configuration file""" 24 | options = dict() 25 | options['gpus'] = '0,1,2,3' 26 | options['num_workers'] = '10' 27 | with open(path, 'r') as fp: 28 | lines = fp.readlines() 29 | for line in lines: 30 | line = line.strip() 31 | if line == '' or line.startswith('#'): 32 | continue 33 | key, value = line.split('=') 34 | options[key.strip()] = value.strip() 35 | return options 36 | -------------------------------------------------------------------------------- /project/torch_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def init_seeds(seed=0): 5 | torch.manual_seed(seed) 6 | torch.cuda.manual_seed(seed) 7 | torch.cuda.manual_seed_all(seed) 8 | 9 | 10 | def select_device(force_cpu=False): 11 | cuda = False if force_cpu else torch.cuda.is_available() 12 | device = torch.device('cuda:0' if cuda else 'cpu') 13 | 14 | if not cuda: 15 | print('Using CPU') 16 | if cuda: 17 | c = 1024 ** 2 # bytes to MB 18 | ng = torch.cuda.device_count() 19 | x = [torch.cuda.get_device_properties(i) for i in range(ng)] 20 | print("Using CUDA device0 _CudaDeviceProperties(name='%s', total_memory=%dMB)" % 21 | (x[0].name, x[0].total_memory / c)) 22 | if ng > 0: 23 | # torch.cuda.set_device(0) # OPTIONAL: Set GPU ID 24 | for i in range(1, ng): 25 | print(" device%g _CudaDeviceProperties(name='%s', total_memory=%dMB)" % 26 | (i, x[i].name, x[i].total_memory / c)) 27 | 28 | print('') # skip a line 29 | return device 30 | 31 | 32 | def fuse_conv_and_bn(conv, bn): 33 | # https://tehnokv.com/posts/fusing-batchnorm-and-conv/ 34 | with torch.no_grad(): 35 | # init 36 | fusedconv = torch.nn.Conv2d( 37 | conv.in_channels, 38 | conv.out_channels, 39 | kernel_size=conv.kernel_size, 40 | stride=conv.stride, 41 | padding=conv.padding, 42 | bias=True 43 | ) 44 | 45 | # prepare filters 46 | w_conv = conv.weight.clone().view(conv.out_channels, -1) 47 | w_bn = torch.diag(bn.weight.div(torch.sqrt(bn.eps + bn.running_var))) 48 | fusedconv.weight.copy_(torch.mm(w_bn, w_conv).view(fusedconv.weight.size())) 49 | 50 | # prepare spatial bias 51 | if conv.bias is not None: 52 | b_conv = conv.bias 53 | else: 54 | b_conv = torch.zeros(conv.weight.size(0)) 55 | b_bn = bn.bias - bn.weight.mul(bn.running_mean).div(torch.sqrt(bn.running_var + bn.eps)) 56 | fusedconv.bias.copy_(b_conv + b_bn) 57 | 58 | return fusedconv 59 | -------------------------------------------------------------------------------- /record.py: -------------------------------------------------------------------------------- 1 | import wave 2 | import pyaudio 3 | import matplotlib.pyplot as plt 4 | import time 5 | 6 | CHUNK = 1024 7 | FORMAT = pyaudio.paInt16 8 | CHANNELS = 2 9 | RATE = 44100 10 | RECORD_SECONDS = 2 11 | 12 | 13 | def record(filename='output.wav'): 14 | """官方录音教程 15 | """ 16 | 17 | p = pyaudio.PyAudio() 18 | 19 | stream = p.open(format=FORMAT, 20 | channels=CHANNELS, 21 | rate=RATE, 22 | input=True, 23 | frames_per_buffer=CHUNK) 24 | 25 | print("* recording") 26 | 27 | frames = [] 28 | 29 | for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)): 30 | data = stream.read(CHUNK) 31 | frames.append(data) 32 | 33 | print("* done recording") 34 | 35 | stream.stop_stream() 36 | stream.close() 37 | p.terminate() 38 | 39 | wf = wave.open(filename, 
'wb') 40 | wf.setnchannels(CHANNELS) 41 | wf.setsampwidth(p.get_sample_size(FORMAT)) 42 | wf.setframerate(RATE) 43 | wf.writeframes(b''.join(frames)) 44 | wf.close() 45 | 46 | 47 | def multi_record(num=3): 48 | """implement 多次录音""" 49 | for i in range(1,num+1): 50 | print('第{}次录音准备'.format(i)) 51 | filename = 'record_{}.wav'.format(i) 52 | record(filename) 53 | time.sleep(second) 54 | _ = input('进行下一次录音?') 55 | 56 | 57 | def main(): 58 | record(filename='mask_on.wav') 59 | 60 | if __name__ == '__main__': 61 | main() 62 | 63 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # pip install -U -r requirements.txt 2 | numpy 3 | opencv-python >= 4.1 4 | torch >= 1.4 5 | matplotlib 6 | pycocotools 7 | tqdm 8 | pillow 9 | 10 | # Nvidia Apex (optional) for mixed precision training -------------------------- 11 | # git clone https://github.com/NVIDIA/apex && cd apex && pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" . --user && cd .. && rm -rf apex 12 | 13 | # Tensorboard (optional) pip requirements -------------------------------------- 14 | # tb-nightly 15 | # future 16 | 17 | # Conda commands (in place of pip) --------------------------------------------- 18 | # conda update -yn base -c defaults conda 19 | # conda install -yc anaconda numpy opencv matplotlib tqdm pillow ipython future 20 | # conda install -yc conda-forge scikit-image pycocotools tensorboard 21 | # conda install -yc spyder-ide spyder-line-profiler 22 | # conda install -yc pytorch pytorch torchvision 23 | # conda install -yc conda-forge protobuf numpy && pip install onnx # https://github.com/onnx/onnx#linux-and-macos 24 | -------------------------------------------------------------------------------- /slurm.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | mkdir -p log 3 | now=$(date +"%Y%m%d_%H%M%S") 4 | srun --partition=Data --gres=gpu:4 --ntasks-per-node=1 --job-name=maskdetect python train.py --data data/mask.data --cfg cfg/yolov3-tiny-mask.cfg --epochs 100 2>&1|tee log/train-$now.log 5 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | from torch.utils.data import DataLoader 5 | 6 | from models import * 7 | from project.datasets import * 8 | from project.utils import * 9 | 10 | 11 | def test( 12 | cfg, 13 | data_cfg, 14 | weights=None, 15 | batch_size=16, 16 | img_size=416, 17 | iou_thres=0.5, 18 | conf_thres=0.001, 19 | nms_thres=0.5, 20 | save_json=False, 21 | model=None 22 | ): 23 | if model is None: 24 | device = torch_utils.select_device() 25 | 26 | # Initialize model 27 | model = Darknet(cfg, img_size).to(device) 28 | 29 | # Load weights 30 | if weights.endswith('.pt'): # pytorch format 31 | model.load_state_dict(torch.load(weights, map_location=device)['model']) 32 | else: # darknet format 33 | _ = load_darknet_weights(model, weights) 34 | 35 | if torch.cuda.device_count() > 1: 36 | model = nn.DataParallel(model) 37 | else: 38 | device = next(model.parameters()).device # get model device 39 | 40 | # Configure run 41 | data_cfg = parse_data_cfg(data_cfg) 42 | nc = int(data_cfg['classes']) # number of classes 43 | test_path = data_cfg['valid'] # path to test images 44 | names = load_classes(data_cfg['names']) # class names 
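    # NOTE (editor, assumption): parse_data_cfg() simply returns the key=value pairs of the
    # *.data file as a dict, so the fields used above are expected to look roughly like
    # classes=80, valid=../coco/5k.txt, names=data/coco.names for the default coco.data
    # (exact keys and paths depend on the dataset file actually supplied via --data-cfg).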
45 | 46 | # Dataloader 47 | dataset = LoadImagesAndLabels(test_path, img_size, batch_size) 48 | dataloader = DataLoader(dataset, 49 | batch_size=batch_size, 50 | num_workers=4, 51 | pin_memory=True, 52 | collate_fn=dataset.collate_fn) 53 | 54 | seen = 0 55 | model.eval() 56 | coco91class = coco80_to_coco91_class() 57 | print(('%20s' + '%10s' * 6) % ('Class', 'Images', 'Targets', 'P', 'R', 'mAP', 'F1')) 58 | loss, p, r, f1, mp, mr, map, mf1 = 0., 0., 0., 0., 0., 0., 0., 0. 59 | jdict, stats, ap, ap_class = [], [], [], [] 60 | for batch_i, (imgs, targets, paths, shapes) in enumerate(tqdm(dataloader, desc='Computing mAP')): 61 | targets = targets.to(device) 62 | imgs = imgs.to(device) 63 | _, _, height, width = imgs.shape # batch size, channels, height, width 64 | 65 | # Plot images with bounding boxes 66 | if batch_i == 0 and not os.path.exists('test_batch0.jpg'): 67 | plot_images(imgs=imgs, targets=targets, fname='test_batch0.jpg') 68 | 69 | # Run model 70 | inf_out, train_out = model(imgs) # inference and training outputs 71 | 72 | # Compute loss 73 | if hasattr(model, 'hyp'): # if model has loss hyperparameters 74 | loss_i, _ = compute_loss(train_out, targets, model) 75 | loss += loss_i.item() 76 | 77 | # Run NMS 78 | output = non_max_suppression(inf_out, conf_thres=conf_thres, nms_thres=nms_thres) 79 | 80 | # Statistics per image 81 | for si, pred in enumerate(output): 82 | labels = targets[targets[:, 0] == si, 1:] 83 | nl = len(labels) 84 | tcls = labels[:, 0].tolist() if nl else [] # target class 85 | seen += 1 86 | 87 | if pred is None: 88 | if nl: 89 | stats.append(([], torch.Tensor(), torch.Tensor(), tcls)) 90 | continue 91 | 92 | # Append to text file 93 | # with open('test.txt', 'a') as file: 94 | # [file.write('%11.5g' * 7 % tuple(x) + '\n') for x in pred] 95 | 96 | # Append to pycocotools JSON dictionary 97 | if save_json: 98 | # [{"image_id": 42, "category_id": 18, "bbox": [258.15, 41.29, 348.26, 243.78], "score": 0.236}, ... 
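                # NOTE (editor): pycocotools expects each detection as
                # [x_min, y_min, width, height] in the original image's pixel coordinates,
                # so the code below rescales the letterboxed xyxy predictions back to the
                # original image shape, converts to centre-xywh, then shifts the centre to
                # the top-left corner before writing the JSON entries.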
99 | image_id = int(Path(paths[si]).stem.split('_')[-1]) 100 | box = pred[:, :4].clone() # xyxy 101 | scale_coords(imgs[si].shape[1:], box, shapes[si]) # to original shape 102 | box = xyxy2xywh(box) # xywh 103 | box[:, :2] -= box[:, 2:] / 2 # xy center to top-left corner 104 | for di, d in enumerate(pred): 105 | jdict.append({ 106 | 'image_id': image_id, 107 | 'category_id': coco91class[int(d[6])], 108 | 'bbox': [float3(x) for x in box[di]], 109 | 'score': float(d[4]) 110 | }) 111 | 112 | # Assign all predictions as incorrect 113 | correct = [0] * len(pred) 114 | if nl: 115 | detected = [] 116 | tcls_tensor = labels[:, 0] 117 | 118 | # target boxes 119 | tbox = xywh2xyxy(labels[:, 1:5]) 120 | tbox[:, [0, 2]] *= width 121 | tbox[:, [1, 3]] *= height 122 | 123 | # Search for correct predictions 124 | for i, (*pbox, pconf, pcls_conf, pcls) in enumerate(pred): 125 | 126 | # Break if all targets already located in image 127 | if len(detected) == nl: 128 | break 129 | 130 | # Continue if predicted class not among image classes 131 | if pcls.item() not in tcls: 132 | continue 133 | 134 | # Best iou, index between pred and targets 135 | m = (pcls == tcls_tensor).nonzero().view(-1) 136 | iou, bi = bbox_iou(pbox, tbox[m]).max(0) 137 | 138 | # If iou > threshold and class is correct mark as correct 139 | if iou > iou_thres and m[bi] not in detected: # and pcls == tcls[bi]: 140 | correct[i] = 1 141 | detected.append(m[bi]) 142 | 143 | # Append statistics (correct, conf, pcls, tcls) 144 | stats.append((correct, pred[:, 4].cpu(), pred[:, 6].cpu(), tcls)) 145 | 146 | # Compute statistics 147 | stats = [np.concatenate(x, 0) for x in list(zip(*stats))] # to numpy 148 | nt = np.bincount(stats[3].astype(np.int64), minlength=nc) # number of targets per class 149 | if len(stats): 150 | p, r, ap, f1, ap_class = ap_per_class(*stats) 151 | mp, mr, map, mf1 = p.mean(), r.mean(), ap.mean(), f1.mean() 152 | 153 | # Print results 154 | pf = '%20s' + '%10.3g' * 6 # print format 155 | print(pf % ('all', seen, nt.sum(), mp, mr, map, mf1)) 156 | 157 | # Print results per class 158 | if nc > 1 and len(stats): 159 | for i, c in enumerate(ap_class): 160 | print(pf % (names[c], seen, nt[c], p[i], r[i], ap[i], f1[i])) 161 | 162 | # Save JSON 163 | if save_json and map and len(jdict): 164 | imgIds = [int(Path(x).stem.split('_')[-1]) for x in dataset.img_files] 165 | with open('results.json', 'w') as file: 166 | json.dump(jdict, file) 167 | 168 | from pycocotools.coco import COCO 169 | from pycocotools.cocoeval import COCOeval 170 | 171 | # https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocoEvalDemo.ipynb 172 | cocoGt = COCO('../coco/annotations/instances_val2014.json') # initialize COCO ground truth api 173 | cocoDt = cocoGt.loadRes('results.json') # initialize COCO pred api 174 | 175 | cocoEval = COCOeval(cocoGt, cocoDt, 'bbox') 176 | cocoEval.params.imgIds = imgIds # [:32] # only evaluate these images 177 | cocoEval.evaluate() 178 | cocoEval.accumulate() 179 | cocoEval.summarize() 180 | map = cocoEval.stats[1] # update mAP to pycocotools mAP 181 | 182 | # Return results 183 | maps = np.zeros(nc) + map 184 | for i, c in enumerate(ap_class): 185 | maps[c] = ap[i] 186 | return (mp, mr, map, mf1, loss / len(dataloader)), maps 187 | 188 | 189 | if __name__ == '__main__': 190 | parser = argparse.ArgumentParser(prog='test.py') 191 | parser.add_argument('--batch-size', type=int, default=16, help='size of each image batch') 192 | parser.add_argument('--cfg', type=str, default='cfg/yolov3-spp.cfg', help='cfg file 
path') 193 | parser.add_argument('--data-cfg', type=str, default='data/coco.data', help='coco.data file path') 194 | parser.add_argument('--weights', type=str, default='weights/yolov3-spp.weights', help='path to weights file') 195 | parser.add_argument('--iou-thres', type=float, default=0.5, help='iou threshold required to qualify as detected') 196 | parser.add_argument('--conf-thres', type=float, default=0.001, help='object confidence threshold') 197 | parser.add_argument('--nms-thres', type=float, default=0.5, help='iou threshold for non-maximum suppression') 198 | parser.add_argument('--save-json', action='store_true', help='save a cocoapi-compatible JSON results file') 199 | parser.add_argument('--img-size', type=int, default=416, help='inference size (pixels)') 200 | opt = parser.parse_args() 201 | print(opt) 202 | 203 | with torch.no_grad(): 204 | mAP = test( 205 | opt.cfg, 206 | opt.data_cfg, 207 | opt.weights, 208 | opt.batch_size, 209 | opt.img_size, 210 | opt.iou_thres, 211 | opt.conf_thres, 212 | opt.nms_thres, 213 | opt.save_json 214 | ) 215 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import time 3 | 4 | import torch.distributed as dist 5 | import torch.optim as optim 6 | import torch.optim.lr_scheduler as lr_scheduler 7 | from torch.utils.data import DataLoader 8 | 9 | import test # Import test.py to get mAP after each epoch 10 | from models import * 11 | from project.datasets import * 12 | from project.utils import * 13 | 14 | # Hyperparameters: train.py --evolve --epochs 2 --img-size 320, Metrics: 0.204 0.302 0.175 0.234 (square smart) 15 | hyp = {'xy': 0.1, # xy loss gain (giou is about 0.02) 16 | 'wh': 0.1, # wh loss gain 17 | 'cls': 0.04, # cls loss gain 18 | 'conf': 4.5, # conf loss gain 19 | 'iou_t': 0.5, # iou target-anchor training threshold 20 | 'lr0': 0.001, # initial learning rate 21 | 'lrf': -4., # final learning rate = lr0 * (10 ** lrf) 22 | 'momentum': 0.90, # SGD momentum 23 | 'weight_decay': 0.0005} # optimizer weight decay 24 | 25 | 26 | # Hyperparameters: Original, Metrics: 0.172 0.304 0.156 0.205 (square) 27 | # hyp = {'xy': 0.5, # xy loss gain 28 | # 'wh': 0.0625, # wh loss gain 29 | # 'cls': 0.0625, # cls loss gain 30 | # 'conf': 4, # conf loss gain 31 | # 'iou_t': 0.1, # iou target-anchor training threshold 32 | # 'lr0': 0.001, # initial learning rate 33 | # 'lrf': -5., # final learning rate = lr0 * (10 ** lrf) 34 | # 'momentum': 0.9, # SGD momentum 35 | # 'weight_decay': 0.0005} # optimizer weight decay 36 | 37 | # Hyperparameters: train.py --evolve --epochs 2 --img-size 320, Metrics: 0.225 0.251 0.145 0.218 (rect) 38 | # hyp = {'xy': 0.4499, # xy loss gain 39 | # 'wh': 0.05121, # wh loss gain 40 | # 'cls': 0.04207, # cls loss gain 41 | # 'conf': 2.853, # conf loss gain 42 | # 'iou_t': 0.2487, # iou target-anchor training threshold 43 | # 'lr0': 0.0005301, # initial learning rate 44 | # 'lrf': -5., # final learning rate = lr0 * (10 ** lrf) 45 | # 'momentum': 0.8823, # SGD momentum 46 | # 'weight_decay': 0.0004149} # optimizer weight decay 47 | 48 | # Hyperparameters: train.py --evolve --epochs 2 --img-size 320, Metrics: 0.178 0.313 0.167 0.212 (square) 49 | # hyp = {'xy': 0.4664, # xy loss gain 50 | # 'wh': 0.08437, # wh loss gain 51 | # 'cls': 0.05145, # cls loss gain 52 | # 'conf': 4.244, # conf loss gain 53 | # 'iou_t': 0.09121, # iou target-anchor training threshold 54 | # 'lr0': 0.0004938, # 
initial learning rate 55 | # 'lrf': -5., # final learning rate = lr0 * (10 ** lrf) 56 | # 'momentum': 0.9025, # SGD momentum 57 | # 'weight_decay': 0.0005417} # optimizer weight decay 58 | 59 | def train( 60 | cfg, 61 | data_cfg, 62 | img_size=416, 63 | resume=False, 64 | epochs=100, # 500200 batches at bs 4, 117263 images = 68 epochs 65 | batch_size=16, 66 | accumulate=4, # effective bs = 64 = batch_size * accumulate 67 | freeze_backbone=False, 68 | transfer=False # Transfer learning (train only YOLO layers) 69 | ): 70 | init_seeds() 71 | weights = 'weights' + os.sep 72 | latest = weights + 'latest.pt' 73 | best = weights + 'best.pt' 74 | device = torch_utils.select_device() 75 | torch.backends.cudnn.benchmark = True # possibly unsuitable for multiscale 76 | img_size_test = img_size # image size for testing 77 | 78 | if opt.multi_scale: 79 | img_size_min = round(img_size / 32 / 1.5) 80 | img_size_max = round(img_size / 32 * 1.5) 81 | img_size = img_size_max * 32 # initiate with maximum multi_scale size 82 | 83 | # Configure run 84 | data_dict = parse_data_cfg(data_cfg) 85 | train_path = data_dict['train'] 86 | nc = int(data_dict['classes']) # number of classes 87 | 88 | # Initialize model 89 | model = Darknet(cfg).to(device) 90 | 91 | # Optimizer 92 | optimizer = optim.SGD(model.parameters(), lr=hyp['lr0'], momentum=hyp['momentum'], weight_decay=hyp['weight_decay']) 93 | 94 | cutoff = -1 # backbone reaches to cutoff layer 95 | start_epoch = 0 96 | best_loss = float('inf') 97 | nf = int(model.module_defs[model.yolo_layers[0] - 1]['filters']) # yolo layer size (i.e. 255) 98 | if resume: # Load previously saved model 99 | if transfer: # Transfer learning 100 | chkpt = torch.load(weights + 'yolov3-spp.pt', map_location=device) 101 | model.load_state_dict({k: v for k, v in chkpt['model'].items() if v.numel() > 1 and v.shape[0] != 255}, 102 | strict=False) 103 | for p in model.parameters(): 104 | p.requires_grad = True if p.shape[0] == nf else False 105 | 106 | else: # resume from latest.pt 107 | chkpt = torch.load(latest, map_location=device) # load checkpoint 108 | model.load_state_dict(chkpt['model']) 109 | 110 | start_epoch = chkpt['epoch'] + 1 111 | if chkpt['optimizer'] is not None: 112 | optimizer.load_state_dict(chkpt['optimizer']) 113 | best_loss = chkpt['best_loss'] 114 | del chkpt 115 | 116 | else: # Initialize model with backbone (optional) 117 | if '-tiny' in cfg: 118 | cutoff = load_darknet_weights(model, weights + 'yolov3-tiny.conv.15') 119 | else: 120 | cutoff = load_darknet_weights(model, weights + 'darknet53.conv.74') 121 | 122 | # Scheduler https://github.com/ultralytics/yolov3/issues/238 123 | # lf = lambda x: 1 - x / epochs # linear ramp to zero 124 | # lf = lambda x: 10 ** (hyp['lrf'] * x / epochs) # exp ramp 125 | # lf = lambda x: 1 - 10 ** (hyp['lrf'] * (1 - x / epochs)) # inverse exp ramp 126 | # scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) 127 | scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=[round(opt.epochs * x) for x in (0.8, 0.9)], gamma=0.1) 128 | scheduler.last_epoch = start_epoch - 1 129 | 130 | # # Plot lr schedule 131 | # y = [] 132 | # for _ in range(epochs): 133 | # scheduler.step() 134 | # y.append(optimizer.param_groups[0]['lr']) 135 | # plt.plot(y, label='LambdaLR') 136 | # plt.xlabel('epoch') 137 | # plt.xlabel('LR') 138 | # plt.tight_layout() 139 | # plt.savefig('LR.png', dpi=300) 140 | 141 | # Dataset 142 | dataset = LoadImagesAndLabels(train_path, 143 | img_size, 144 | batch_size, 145 | augment=True, 146 | rect=False) 147 | 
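    # NOTE (editor): the effective batch size is batch_size * accumulate (64 with either the
    # function defaults 16*4 or the argparse defaults 8*8), since gradients from `accumulate`
    # consecutive batches are summed before optimizer.step() is called further below.
    # rect=False here because the DataLoader below shuffles, and rectangular batches rely on
    # images staying grouped by aspect ratio.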
148 | # Initialize distributed training 149 | if torch.cuda.device_count() > 1: 150 | dist.init_process_group(backend=opt.backend, init_method=opt.dist_url, world_size=opt.world_size, rank=opt.rank) 151 | model = torch.nn.parallel.DistributedDataParallel(model) 152 | # sampler = torch.project.data.distributed.DistributedSampler(dataset) 153 | 154 | # Dataloader 155 | dataloader = DataLoader(dataset, 156 | batch_size=batch_size, 157 | num_workers=opt.num_workers, 158 | shuffle=True, # disable rectangular training if True 159 | pin_memory=True, 160 | collate_fn=dataset.collate_fn) 161 | 162 | # Mixed precision training https://github.com/NVIDIA/apex 163 | # install help: https://github.com/NVIDIA/apex/issues/259 164 | mixed_precision = False 165 | if mixed_precision: 166 | from apex import amp 167 | model, optimizer = amp.initialize(model, optimizer, opt_level='O1') 168 | 169 | # Remove old results 170 | for f in glob.glob('*_batch*.jpg') + glob.glob('results.txt'): 171 | os.remove(f) 172 | 173 | # Start training 174 | model.hyp = hyp # attach hyperparameters to model 175 | model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) # attach class weights 176 | model_info(model) 177 | nb = len(dataloader) 178 | maps = np.zeros(nc) # mAP per class 179 | results = (0, 0, 0, 0, 0) # P, R, mAP, F1, test_loss 180 | n_burnin = min(round(nb / 5 + 1), 1000) # burn-in batches 181 | t, t0 = time.time(), time.time() 182 | for epoch in range(start_epoch, epochs): 183 | print(epoch) 184 | model.train() 185 | print(('\n%8s%12s' + '%10s' * 7) % ('Epoch', 'Batch', 'xy', 'wh', 'conf', 'cls', 'total', 'targets', 'time')) 186 | 187 | # Update scheduler 188 | scheduler.step() 189 | 190 | # Freeze backbone at epoch 0, unfreeze at epoch 1 (optional) 191 | if freeze_backbone and epoch < 2: 192 | for name, p in model.named_parameters(): 193 | if int(name.split('.')[1]) < cutoff: # if layer < 75 194 | p.requires_grad = False if epoch == 0 else True 195 | 196 | # # Update image weights (optional) 197 | # w = model.class_weights.cpu().numpy() * (1 - maps) # class weights 198 | # image_weights = labels_to_image_weights(dataset.labels, nc=nc, class_weights=w) 199 | # dataset.indices = random.choices(range(dataset.n), weights=image_weights, k=dataset.n) # random weighted index 200 | 201 | mloss = torch.zeros(5).to(device) # mean losses 202 | for i, (imgs, targets, _, _) in enumerate(dataloader): 203 | imgs = imgs.to(device) 204 | targets = targets.to(device) 205 | 206 | # Multi-Scale training 207 | if opt.multi_scale: 208 | if (i + 1 + nb * epoch) % 10 == 0: #  adjust (67% - 150%) every 10 batches 209 | img_size = random.choice(range(img_size_min, img_size_max + 1)) * 32 210 | print('img_size = %g' % img_size) 211 | scale_factor = img_size / max(imgs.shape[-2:]) 212 | imgs = F.interpolate(imgs, scale_factor=scale_factor, mode='bilinear', align_corners=False) 213 | 214 | # Plot images with bounding boxes 215 | if epoch == 0 and i == 0: 216 | plot_images(imgs=imgs, targets=targets, fname='train_batch%g.jpg' % i) 217 | 218 | # SGD burn-in 219 | if epoch == 0 and i <= n_burnin: 220 | lr = hyp['lr0'] * (i / n_burnin) ** 4 221 | for x in optimizer.param_groups: 222 | x['lr'] = lr 223 | 224 | # Run model 225 | pred = model(imgs) 226 | 227 | # Compute loss 228 | loss, loss_items = compute_loss(pred, targets, model) 229 | if torch.isnan(loss): 230 | print('WARNING: nan loss detected, ending training') 231 | return results 232 | 233 | # Compute gradient 234 | if mixed_precision: 235 | with amp.scale_loss(loss, 
optimizer) as scaled_loss: 236 | scaled_loss.backward() 237 | else: 238 | loss.backward() 239 | 240 | # Accumulate gradient for x batches before optimizing 241 | if (i + 1) % accumulate == 0 or (i + 1) == nb: 242 | optimizer.step() 243 | optimizer.zero_grad() 244 | 245 | # Print batch results 246 | mloss = (mloss * i + loss_items) / (i + 1) # update mean losses 247 | s = ('%8s%12s' + '%10.3g' * 7) % ( 248 | '%g/%g' % (epoch, epochs - 1), 249 | '%g/%g' % (i, nb - 1), *mloss, len(targets), time.time() - t) 250 | t = time.time() 251 | print(s) 252 | 253 | # Calculate mAP (always test final epoch, skip first 5 if opt.nosave) 254 | if not (opt.notest or (opt.nosave and epoch < 10)) or epoch == epochs - 1: 255 | with torch.no_grad(): 256 | results, maps = test.test(cfg, data_cfg, batch_size=batch_size, img_size=img_size_test, model=model, 257 | conf_thres=0.1) 258 | 259 | # Write epoch results 260 | with open('results.txt', 'a') as file: 261 | file.write(s + '%11.3g' * 5 % results + '\n') # P, R, mAP, F1, test_loss 262 | 263 | # Update best loss 264 | test_loss = results[4] 265 | if test_loss < best_loss: 266 | best_loss = test_loss 267 | 268 | # Save training results 269 | save = (not opt.nosave) or (epoch == epochs - 1) 270 | if save: 271 | # Create checkpoint 272 | chkpt = {'epoch': epoch, 273 | 'best_loss': best_loss, 274 | 'model': model.module.state_dict() if type( 275 | model) is nn.parallel.DistributedDataParallel else model.state_dict(), 276 | 'optimizer': optimizer.state_dict()} 277 | 278 | # Save latest checkpoint 279 | torch.save(chkpt, latest) 280 | 281 | # Save best checkpoint 282 | if best_loss == test_loss: 283 | torch.save(chkpt, best) 284 | 285 | # Save backup every 10 epochs (optional) 286 | if epoch > 0 and epoch % 10 == 0: 287 | torch.save(chkpt, weights + 'backup%g.pt' % epoch) 288 | 289 | # Delete checkpoint 290 | del chkpt 291 | 292 | dt = (time.time() - t0) / 3600 293 | print('%g epochs completed in %.3f hours.' 
% (epoch - start_epoch + 1, dt)) 294 | return results 295 | 296 | 297 | def print_mutation(hyp, results): 298 | # Write mutation results 299 | a = '%11s' * len(hyp) % tuple(hyp.keys()) # hyperparam keys 300 | b = '%11.4g' * len(hyp) % tuple(hyp.values()) # hyperparam values 301 | c = '%11.3g' * len(results) % results # results (P, R, mAP, F1, test_loss) 302 | print('\n%s\n%s\nEvolved fitness: %s\n' % (a, b, c)) 303 | with open('evolve.txt', 'a') as f: 304 | f.write(c + b + '\n') 305 | 306 | 307 | if __name__ == '__main__': 308 | parser = argparse.ArgumentParser() 309 | parser.add_argument('--epochs', type=int, default=68, help='number of epochs') 310 | parser.add_argument('--batch-size', type=int, default=8, help='batch size') 311 | parser.add_argument('--accumulate', type=int, default=8, help='number of batches to accumulate before optimizing') 312 | parser.add_argument('--cfg', type=str, default='cfg/yolov3-spp.cfg', help='cfg file path') 313 | parser.add_argument('--data-cfg', type=str, default='data/coco_64img.data', help='coco.data file path') 314 | parser.add_argument('--multi-scale', action='store_true', help='random image sizes per batch 320 - 608') 315 | parser.add_argument('--img-size', type=int, default=416, help='inference size (pixels)') 316 | parser.add_argument('--resume', action='store_true', help='resume training flag') 317 | parser.add_argument('--transfer', action='store_true', help='transfer learning flag') 318 | parser.add_argument('--num-workers', type=int, default=4, help='number of Pytorch DataLoader workers') 319 | parser.add_argument('--dist-url', default='tcp://127.0.0.1:9999', type=str, help='distributed training init method') 320 | parser.add_argument('--rank', default=0, type=int, help='distributed training node rank') 321 | parser.add_argument('--world-size', default=1, type=int, help='number of nodes for distributed training') 322 | parser.add_argument('--backend', default='nccl', type=str, help='distributed backend') 323 | parser.add_argument('--nosave', action='store_true', help='do not save training results') 324 | parser.add_argument('--notest', action='store_true', help='only test final epoch') 325 | parser.add_argument('--evolve', action='store_true', help='run hyperparameter evolution') 326 | parser.add_argument('--var', default=0, type=int, help='debug variable') 327 | opt = parser.parse_args() 328 | print(opt) 329 | 330 | if opt.evolve: 331 | opt.notest = True # save time by only testing final epoch 332 | opt.nosave = True # do not save checkpoints 333 | 334 | # Train 335 | results = train( 336 | opt.cfg, 337 | opt.data_cfg, 338 | img_size=opt.img_size, 339 | resume=opt.resume or opt.transfer, 340 | transfer=opt.transfer, 341 | epochs=opt.epochs, 342 | batch_size=opt.batch_size, 343 | accumulate=opt.accumulate, 344 | ) 345 | 346 | # Evolve hyperparameters (optional) 347 | if opt.evolve: 348 | best_fitness = results[2] # use mAP for fitness 349 | 350 | # Write mutation results 351 | print_mutation(hyp, results) 352 | 353 | gen = 1000 # generations to evolve 354 | for _ in range(gen): 355 | 356 | # Mutate hyperparameters 357 | old_hyp = hyp.copy() 358 | init_seeds(seed=int(time.time())) 359 | s = [.3, .3, .3, .3, .3, .3, .3, .03, .3] # xy, wh, cls, conf, iou_t, lr0, lrf, momentum, weight_decay 360 | for i, k in enumerate(hyp.keys()): 361 | x = (np.random.randn(1) * s[i] + 1) ** 1.1 # plt.hist(x.ravel(), 100) 362 | hyp[k] = hyp[k] * float(x) # vary by about 30% 1sigma 363 | 364 | # Clip to limits 365 | keys = ['lr0', 'iou_t', 'momentum', 
'weight_decay'] 366 | limits = [(1e-4, 1e-2), (0, 0.90), (0.70, 0.99), (0, 0.01)] 367 | for k, v in zip(keys, limits): 368 | hyp[k] = np.clip(hyp[k], v[0], v[1]) 369 | 370 | # Determine mutation fitness 371 | results = train( 372 | opt.cfg, 373 | opt.data_cfg, 374 | img_size=opt.img_size, 375 | resume=opt.resume or opt.transfer, 376 | transfer=opt.transfer, 377 | epochs=opt.epochs, 378 | batch_size=opt.batch_size, 379 | accumulate=opt.accumulate, 380 | ) 381 | mutation_fitness = results[2] 382 | 383 | # Write mutation results 384 | print_mutation(hyp, results) 385 | 386 | # Update hyperparameters if fitness improved 387 | if mutation_fitness > best_fitness: 388 | # Fitness improved! 389 | print('Fitness improved!') 390 | best_fitness = mutation_fitness 391 | else: 392 | hyp = old_hyp.copy() # reset hyp to 393 | 394 | # # Plot results 395 | # import numpy as np 396 | # import matplotlib.pyplot as plt 397 | # a = np.loadtxt('evolve_1000val.txt') 398 | # x = a[:, 2] * a[:, 3] # metric = mAP * F1 399 | # weights = (x - x.min()) ** 2 400 | # fig = plt.figure(figsize=(14, 7)) 401 | # for i in range(len(hyp)): 402 | # y = a[:, i + 5] 403 | # mu = (y * weights).sum() / weights.sum() 404 | # plt.subplot(2, 5, i+1) 405 | # plt.plot(x.max(), mu, 'o') 406 | # plt.plot(x, y, '.') 407 | # print(list(hyp.keys())[i],'%.4g' % mu) 408 | 409 | -------------------------------------------------------------------------------- /voc_label.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "/home/zhangzhengming/下载/yolov3/data\n" 13 | ] 14 | }, 15 | { 16 | "ename": "FileNotFoundError", 17 | "evalue": "[Errno 2] No such file or directory: 'data/ImageSets/train.txt'", 18 | "output_type": "error", 19 | "traceback": [ 20 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 21 | "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", 22 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 51\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexists\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'data/labels/'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 52\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmakedirs\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'data/labels/'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 53\u001b[0;31m \u001b[0mimage_ids\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'data/ImageSets/%s.txt'\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mimage_set\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstrip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 54\u001b[0m \u001b[0mlist_file\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'data/%s.txt'\u001b[0m \u001b[0;34m%\u001b[0m 
\u001b[0;34m(\u001b[0m\u001b[0mimage_set\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'w'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 55\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mimage_id\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mimage_ids\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 23 | "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'data/ImageSets/train.txt'" 24 | ] 25 | } 26 | ], 27 | "source": [ 28 | "import xml.etree.ElementTree as ET\n", 29 | "import pickle\n", 30 | "import os\n", 31 | "from os import listdir, getcwd\n", 32 | "from os.path import join\n", 33 | " \n", 34 | "sets = ['train', 'test','val']\n", 35 | " \n", 36 | "classes = [\"collector\"]\n", 37 | " \n", 38 | " \n", 39 | "def convert(size, box):\n", 40 | " dw = 1. / size[0]\n", 41 | " dh = 1. / size[1]\n", 42 | " x = (box[0] + box[1]) / 2.0\n", 43 | " y = (box[2] + box[3]) / 2.0\n", 44 | " w = box[1] - box[0]\n", 45 | " h = box[3] - box[2]\n", 46 | " x = x * dw\n", 47 | " w = w * dw\n", 48 | " y = y * dh\n", 49 | " h = h * dh\n", 50 | " return (x, y, w, h)\n", 51 | " \n", 52 | " \n", 53 | "def convert_annotation(image_id):\n", 54 | " in_file = open('Annotations/%s.xml' % (image_id))\n", 55 | " out_file = open('data/labels/%s.txt' % (image_id), 'w')\n", 56 | " tree = ET.parse(in_file)\n", 57 | " root = tree.getroot()\n", 58 | " size = root.find('size')\n", 59 | " w = int(size.find('width').text)\n", 60 | " h = int(size.find('height').text)\n", 61 | " \n", 62 | " for obj in root.iter('object'):\n", 63 | " difficult = obj.find('Difficult').text\n", 64 | " cls = obj.find('name').text\n", 65 | " if cls not in classes or int(difficult) == 1:\n", 66 | " continue\n", 67 | " cls_id = classes.index(cls)\n", 68 | " xmlbox = obj.find('bndbox')\n", 69 | " b = (float(xmlbox.find('xmin').text), float(xmlbox.find('xmax').text), float(xmlbox.find('ymin').text),\n", 70 | " float(xmlbox.find('ymax').text))\n", 71 | " bb = convert((w, h), b)\n", 72 | " out_file.write(str(cls_id) + \" \" + \" \".join([str(a) for a in bb]) + '\\n')\n", 73 | " \n", 74 | " \n", 75 | "wd = getcwd()\n", 76 | "print(wd)\n", 77 | "for image_set in sets:\n", 78 | " if not os.path.exists('data/labels/'):\n", 79 | " os.makedirs('data/labels/')\n", 80 | " image_ids = open('data/ImageSets/%s.txt' % (image_set)).read().strip().split()\n", 81 | " list_file = open('data/%s.txt' % (image_set), 'w')\n", 82 | " for image_id in image_ids:\n", 83 | " list_file.write('images/%s.jpg\\n' % (image_id))\n", 84 | " convert_annotation(image_id)\n", 85 | " list_file.close()\n" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [] 94 | } 95 | ], 96 | "metadata": { 97 | "kernelspec": { 98 | "display_name": "Python 3", 99 | "language": "python", 100 | "name": "python3" 101 | }, 102 | "language_info": { 103 | "codemirror_mode": { 104 | "name": "ipython", 105 | "version": 3 106 | }, 107 | "file_extension": ".py", 108 | "mimetype": "text/x-python", 109 | "name": "python", 110 | "nbconvert_exporter": "python", 111 | "pygments_lexer": "ipython3", 112 | "version": "3.7.0" 113 | } 114 | }, 115 | "nbformat": 4, 116 | "nbformat_minor": 2 117 | } 118 | -------------------------------------------------------------------------------- /weights/download_yolov3_weights.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # make '/weights' directory if it does not 
exist and cd into it 4 | # mkdir -p weights && cd weights 5 | 6 | # download darknet weight files ('-c' resumes a partially downloaded file) 7 | # wget -c https://pjreddie.com/media/files/yolov3.weights 8 | # wget -c https://pjreddie.com/media/files/yolov3-tiny.weights 9 | # wget -c https://pjreddie.com/media/files/yolov3-spp.weights 10 | 11 | # yolov3 pytorch weights 12 | # download from Google Drive: https://drive.google.com/drive/folders/1uxgUBemJVw9wZsdpboYbzUN4bcRhsuAI 13 | 14 | # darknet53 weights (first 75 layers only) 15 | # wget -c https://pjreddie.com/media/files/darknet53.conv.74 16 | 17 | # yolov3-tiny weights from darknet (first 16 layers only) 18 | # ./darknet partial cfg/yolov3-tiny.cfg yolov3-tiny.weights yolov3-tiny.conv.15 15 19 | # mv yolov3-tiny.conv.15 ../ 20 | 21 | # new method 22 | python3 -c "from models import *; 23 | attempt_download('weights/yolov3.pt'); 24 | attempt_download('weights/yolov3-spp.pt')" 25 | --------------------------------------------------------------------------------