├── data
│   ├── coco1.txt
│   ├── coco1.data
│   ├── coco16.data
│   ├── coco64.data
│   ├── coco1cls.data
│   ├── coco2014.data
│   ├── coco2017.data
│   ├── coco16.txt
│   ├── coco1cls.txt
│   ├── get_coco2014.sh
│   ├── get_coco2017.sh
│   ├── coco.names
│   ├── coco_paper.names
│   └── coco64.txt
├── CIoU.png
├── requirements.txt
├── utils
│   ├── evolve.sh
│   ├── gcp.sh
│   ├── google_utils.py
│   ├── parse_config.py
│   ├── layers.py
│   ├── torch_utils.py
│   └── adabound.py
├── weights
│   └── download_yolov3_weights.sh
├── .gitignore
├── Dockerfile
├── cfg
│   ├── yolov3-tiny.cfg
│   ├── yolov3-tiny-1cls.cfg
│   ├── yolov3-tiny-3cls.cfg
│   ├── yolov3-tiny3-1cls.cfg
│   ├── yolov3-tiny3.cfg
│   ├── yolov3-1cls.cfg
│   ├── yolov3.cfg
│   ├── yolov3-spp-1cls.cfg
│   ├── yolov3-spp-3cls.cfg
│   ├── yolov3-spp.cfg
│   ├── yolov3-asff.cfg
│   ├── yolov3-spp3.cfg
│   ├── yolov3-spp-pan-scale.cfg
│   └── csresnext50-panet-spp.cfg
├── README.md
├── detect.py
└── test.py
/data/coco1.txt: -------------------------------------------------------------------------------- 1 | ../coco/images/train2017/000000109622.jpg 2 | -------------------------------------------------------------------------------- /CIoU.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zzh-tju/ultralytics-YOLOv3-Cluster-NMS/HEAD/CIoU.png -------------------------------------------------------------------------------- /data/coco1.data: -------------------------------------------------------------------------------- 1 | classes=80 2 | train=data/coco1.txt 3 | valid=data/coco1.txt 4 | names=data/coco.names 5 | -------------------------------------------------------------------------------- /data/coco16.data: -------------------------------------------------------------------------------- 1 | classes=80 2 | train=data/coco16.txt 3 | valid=data/coco16.txt 4 | names=data/coco.names 5 | -------------------------------------------------------------------------------- /data/coco64.data: -------------------------------------------------------------------------------- 1 | classes=80 2 | train=data/coco64.txt 3 | valid=data/coco64.txt 4 | names=data/coco.names 5 | -------------------------------------------------------------------------------- /data/coco1cls.data: -------------------------------------------------------------------------------- 1 | classes=1 2 | train=data/coco1cls.txt 3 | valid=data/coco1cls.txt 4 | names=data/coco.names 5 | -------------------------------------------------------------------------------- /data/coco2014.data: -------------------------------------------------------------------------------- 1 | classes=80 2 | train=../coco/trainvalno5k.txt 3 | valid=../coco/5k.txt 4 | names=data/coco.names 5 | -------------------------------------------------------------------------------- /data/coco2017.data: -------------------------------------------------------------------------------- 1 | classes=80 2 | train=../coco/train2017.txt 3 | valid=../coco/val2017.txt 4 | names=data/coco.names 5 | -------------------------------------------------------------------------------- /data/coco16.txt: -------------------------------------------------------------------------------- 1 | ../coco/images/train2017/000000109622.jpg 2 | ../coco/images/train2017/000000160694.jpg 3 | ../coco/images/train2017/000000308590.jpg 4 | ../coco/images/train2017/000000327573.jpg 5 | ../coco/images/train2017/000000062929.jpg 6 | ../coco/images/train2017/000000512793.jpg 7 | ../coco/images/train2017/000000371735.jpg 8 | ../coco/images/train2017/000000148118.jpg 9 | ../coco/images/train2017/000000309856.jpg 10 | 
../coco/images/train2017/000000141882.jpg 11 | ../coco/images/train2017/000000318783.jpg 12 | ../coco/images/train2017/000000337760.jpg 13 | ../coco/images/train2017/000000298197.jpg 14 | ../coco/images/train2017/000000042421.jpg 15 | ../coco/images/train2017/000000328898.jpg 16 | ../coco/images/train2017/000000458856.jpg 17 | -------------------------------------------------------------------------------- /data/coco1cls.txt: -------------------------------------------------------------------------------- 1 | ../coco/images/train2017/000000000901.jpg 2 | ../coco/images/train2017/000000001464.jpg 3 | ../coco/images/train2017/000000003220.jpg 4 | ../coco/images/train2017/000000003365.jpg 5 | ../coco/images/train2017/000000004772.jpg 6 | ../coco/images/train2017/000000009987.jpg 7 | ../coco/images/train2017/000000010498.jpg 8 | ../coco/images/train2017/000000012455.jpg 9 | ../coco/images/train2017/000000013992.jpg 10 | ../coco/images/train2017/000000014125.jpg 11 | ../coco/images/train2017/000000016314.jpg 12 | ../coco/images/train2017/000000016670.jpg 13 | ../coco/images/train2017/000000018412.jpg 14 | ../coco/images/train2017/000000021212.jpg 15 | ../coco/images/train2017/000000021826.jpg 16 | ../coco/images/train2017/000000030566.jpg 17 | -------------------------------------------------------------------------------- /data/get_coco2014.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Zip coco folder 3 | # zip -r coco.zip coco 4 | # tar -czvf coco.tar.gz coco 5 | 6 | # Download labels from Google Drive, accepting presented query 7 | filename="coco2014labels.zip" 8 | fileid="1s6-CmF5_SElM28r52P1OUrCcuXZN-SFo" 9 | curl -c ./cookie -s -L "https://drive.google.com/uc?export=download&id=${fileid}" > /dev/null 10 | curl -Lb ./cookie "https://drive.google.com/uc?export=download&confirm=`awk '/download/ {print $NF}' ./cookie`&id=${fileid}" -o ${filename} 11 | rm ./cookie 12 | 13 | # Unzip labels 14 | unzip -q ${filename} # for coco.zip 15 | # tar -xzf ${filename} # for coco.tar.gz 16 | rm ${filename} 17 | 18 | # Download and unzip images 19 | cd coco/images 20 | f="train2014.zip" && curl http://images.cocodataset.org/zips/$f -o $f && unzip -q $f && rm $f 21 | f="val2014.zip" && curl http://images.cocodataset.org/zips/$f -o $f && unzip -q $f && rm $f 22 | 23 | # cd out 24 | cd ../.. 25 | -------------------------------------------------------------------------------- /data/get_coco2017.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Zip coco folder 3 | # zip -r coco.zip coco 4 | # tar -czvf coco.tar.gz coco 5 | 6 | # Download labels from Google Drive, accepting presented query 7 | filename="coco2017labels.zip" 8 | fileid="1cXZR_ckHki6nddOmcysCuuJFM--T-Q6L" 9 | curl -c ./cookie -s -L "https://drive.google.com/uc?export=download&id=${fileid}" > /dev/null 10 | curl -Lb ./cookie "https://drive.google.com/uc?export=download&confirm=`awk '/download/ {print $NF}' ./cookie`&id=${fileid}" -o ${filename} 11 | rm ./cookie 12 | 13 | # Unzip labels 14 | unzip -q ${filename} # for coco.zip 15 | # tar -xzf ${filename} # for coco.tar.gz 16 | rm ${filename} 17 | 18 | # Download and unzip images 19 | cd coco/images 20 | f="train2017.zip" && curl http://images.cocodataset.org/zips/$f -o $f && unzip -q $f && rm $f 21 | f="val2017.zip" && curl http://images.cocodataset.org/zips/$f -o $f && unzip -q $f && rm $f 22 | 23 | # cd out 24 | cd ../.. 
25 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # pip install -U -r requirements.txt 2 | numpy 3 | opencv-python >= 4.1 4 | torch >= 1.5 5 | matplotlib 6 | pycocotools 7 | tqdm 8 | pillow 9 | tensorboard >= 1.14 10 | 11 | # Nvidia Apex (optional) for mixed precision training -------------------------- 12 | # git clone https://github.com/NVIDIA/apex && cd apex && pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" . --user && cd .. && rm -rf apex 13 | 14 | # Conda commands (in place of pip) --------------------------------------------- 15 | # conda update -yn base -c defaults conda 16 | # conda install -yc anaconda numpy opencv matplotlib tqdm pillow ipython 17 | # conda install -yc conda-forge scikit-image pycocotools tensorboard 18 | # conda install -yc spyder-ide spyder-line-profiler 19 | # conda install -yc pytorch pytorch torchvision 20 | # conda install -yc conda-forge protobuf numpy && pip install onnx # https://github.com/onnx/onnx#linux-and-macos 21 | -------------------------------------------------------------------------------- /data/coco.names: -------------------------------------------------------------------------------- 1 | person 2 | bicycle 3 | car 4 | motorcycle 5 | airplane 6 | bus 7 | train 8 | truck 9 | boat 10 | traffic light 11 | fire hydrant 12 | stop sign 13 | parking meter 14 | bench 15 | bird 16 | cat 17 | dog 18 | horse 19 | sheep 20 | cow 21 | elephant 22 | bear 23 | zebra 24 | giraffe 25 | backpack 26 | umbrella 27 | handbag 28 | tie 29 | suitcase 30 | frisbee 31 | skis 32 | snowboard 33 | sports ball 34 | kite 35 | baseball bat 36 | baseball glove 37 | skateboard 38 | surfboard 39 | tennis racket 40 | bottle 41 | wine glass 42 | cup 43 | fork 44 | knife 45 | spoon 46 | bowl 47 | banana 48 | apple 49 | sandwich 50 | orange 51 | broccoli 52 | carrot 53 | hot dog 54 | pizza 55 | donut 56 | cake 57 | chair 58 | couch 59 | potted plant 60 | bed 61 | dining table 62 | toilet 63 | tv 64 | laptop 65 | mouse 66 | remote 67 | keyboard 68 | cell phone 69 | microwave 70 | oven 71 | toaster 72 | sink 73 | refrigerator 74 | book 75 | clock 76 | vase 77 | scissors 78 | teddy bear 79 | hair drier 80 | toothbrush 81 | -------------------------------------------------------------------------------- /utils/evolve.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #for i in 0 1 2 3 3 | #do 4 | # t=ultralytics/yolov3:v139 && sudo docker pull $t && sudo nvidia-docker run -d --ipc=host -v "$(pwd)"/coco:/usr/src/coco $t utils/evolve.sh $i 5 | # sleep 30 6 | #done 7 | 8 | while true; do 9 | # python3 train.py --data ../data/sm4/out.data --img-size 320 --epochs 100 --batch 64 --accum 1 --weights yolov3-tiny.conv.15 --multi --bucket ult/wer --evolve --cache --device $1 --cfg yolov3-tiny3-1cls.cfg --single --adam 10 | # python3 train.py --data ../out/data.data --img-size 608 --epochs 10 --batch 8 --accum 8 --weights ultralytics68.pt --multi --bucket ult/athena --evolve --device $1 --cfg yolov3-spp-1cls.cfg 11 | 12 | python3 train.py --data coco2014.data --img-size 512 608 --epochs 27 --batch 8 --accum 8 --evolve --weights '' --bucket ult/coco/sppa_512 --device $1 --cfg yolov3-sppa.cfg --multi 13 | done 14 | 15 | 16 | # coco epoch times --img-size 416 608 --epochs 27 --batch 16 --accum 4 17 | # 36:34 2080ti 18 | # 21:58 V100 19 | # 63:00 T4 
-------------------------------------------------------------------------------- /weights/download_yolov3_weights.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # make '/weights' directory if it does not exist and cd into it 4 | # mkdir -p weights && cd weights 5 | 6 | # copy darknet weight files, continue '-c' if partially downloaded 7 | # wget -c https://pjreddie.com/media/files/yolov3.weights 8 | # wget -c https://pjreddie.com/media/files/yolov3-tiny.weights 9 | # wget -c https://pjreddie.com/media/files/yolov3-spp.weights 10 | 11 | # yolov3 pytorch weights 12 | # download from Google Drive: https://drive.google.com/drive/folders/1uxgUBemJVw9wZsdpboYbzUN4bcRhsuAI 13 | 14 | # darknet53 weights (first 75 layers only) 15 | # wget -c https://pjreddie.com/media/files/darknet53.conv.74 16 | 17 | # yolov3-tiny weights from darknet (first 16 layers only) 18 | # ./darknet partial cfg/yolov3-tiny.cfg yolov3-tiny.weights yolov3-tiny.conv.15 15 19 | # mv yolov3-tiny.conv.15 ../ 20 | 21 | # new method 22 | python3 -c "from models import *; 23 | attempt_download('weights/yolov3.pt'); 24 | attempt_download('weights/yolov3-spp.pt')" 25 | -------------------------------------------------------------------------------- /data/coco_paper.names: -------------------------------------------------------------------------------- 1 | person 2 | bicycle 3 | car 4 | motorcycle 5 | airplane 6 | bus 7 | train 8 | truck 9 | boat 10 | traffic light 11 | fire hydrant 12 | street sign 13 | stop sign 14 | parking meter 15 | bench 16 | bird 17 | cat 18 | dog 19 | horse 20 | sheep 21 | cow 22 | elephant 23 | bear 24 | zebra 25 | giraffe 26 | hat 27 | backpack 28 | umbrella 29 | shoe 30 | eye glasses 31 | handbag 32 | tie 33 | suitcase 34 | frisbee 35 | skis 36 | snowboard 37 | sports ball 38 | kite 39 | baseball bat 40 | baseball glove 41 | skateboard 42 | surfboard 43 | tennis racket 44 | bottle 45 | plate 46 | wine glass 47 | cup 48 | fork 49 | knife 50 | spoon 51 | bowl 52 | banana 53 | apple 54 | sandwich 55 | orange 56 | broccoli 57 | carrot 58 | hot dog 59 | pizza 60 | donut 61 | cake 62 | chair 63 | couch 64 | potted plant 65 | bed 66 | mirror 67 | dining table 68 | window 69 | desk 70 | toilet 71 | door 72 | tv 73 | laptop 74 | mouse 75 | remote 76 | keyboard 77 | cell phone 78 | microwave 79 | oven 80 | toaster 81 | sink 82 | refrigerator 83 | blender 84 | book 85 | clock 86 | vase 87 | scissors 88 | teddy bear 89 | hair drier 90 | toothbrush 91 | hair brush -------------------------------------------------------------------------------- /utils/gcp.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # New VM 4 | rm -rf sample_data yolov3 5 | git clone https://github.com/ultralytics/yolov3 6 | # git clone -b test --depth 1 https://github.com/ultralytics/yolov3 test # branch 7 | # sudo apt-get install zip 8 | #git clone https://github.com/NVIDIA/apex && cd apex && pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" . --user && cd .. 
&& rm -rf apex 9 | sudo conda install -yc conda-forge scikit-image pycocotools 10 | # python3 -c "from yolov3.utils.google_utils import gdrive_download; gdrive_download('193Zp_ye-3qXMonR1nZj3YyxMtQkMy50k','coco2014.zip')" 11 | python3 -c "from yolov3.utils.google_utils import gdrive_download; gdrive_download('1WQT6SOktSe8Uw6r10-2JhbEhMY5DJaph','coco2017.zip')" 12 | python3 -c "from yolov3.utils.google_utils import gdrive_download; gdrive_download('1C3HewOG9akA3y456SZLBJZfNDPkBwAto','knife.zip')" 13 | python3 -c "from yolov3.utils.google_utils import gdrive_download; gdrive_download('13g3LqdpkNE8sPosVJT6KFXlfoMypzRP4','sm4.zip')" 14 | sudo shutdown 15 | 16 | # Mount local SSD 17 | lsblk 18 | sudo mkfs.ext4 -F /dev/nvme0n1 19 | sudo mkdir -p /mnt/disks/nvme0n1 20 | sudo mount /dev/nvme0n1 /mnt/disks/nvme0n1 21 | sudo chmod a+w /mnt/disks/nvme0n1 22 | cp -r coco /mnt/disks/nvme0n1 23 | 24 | # Kill All 25 | t=ultralytics/yolov3:v1 26 | docker kill $(docker ps -a -q --filter ancestor=$t) 27 | 28 | # Evolve coco 29 | sudo -s 30 | t=ultralytics/yolov3:evolve 31 | # docker kill $(docker ps -a -q --filter ancestor=$t) 32 | for i in 0 1 6 7 33 | do 34 | docker pull $t && docker run --gpus all -d --ipc=host -v "$(pwd)"/coco:/usr/src/coco $t bash utils/evolve.sh $i 35 | sleep 30 36 | done 37 | 38 | #COCO training 39 | n=131 && t=ultralytics/coco:v131 && sudo docker pull $t && sudo docker run -it --gpus all --ipc=host -v "$(pwd)"/coco:/usr/src/coco $t python3 train.py --data coco2014.data --img-size 320 640 --epochs 300 --batch 16 --accum 4 --weights '' --device 0 --cfg yolov3-spp.cfg --nosave --bucket ult/coco --name $n && sudo shutdown 40 | n=132 && t=ultralytics/coco:v131 && sudo docker pull $t && sudo docker run -it --gpus all --ipc=host -v "$(pwd)"/coco:/usr/src/coco $t python3 train.py --data coco2014.data --img-size 320 640 --epochs 300 --batch 64 --accum 1 --weights '' --device 0 --cfg yolov3-tiny.cfg --nosave --bucket ult/coco --name $n && sudo shutdown 41 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Start FROM Nvidia PyTorch image https://ngc.nvidia.com/catalog/containers/nvidia:pytorch 2 | FROM nvcr.io/nvidia/pytorch:20.03-py3 3 | 4 | # Install dependencies (pip or conda) 5 | RUN pip install -U gsutil thop 6 | # RUN pip install -U -r requirements.txt 7 | # RUN conda update -n base -c defaults conda 8 | # RUN conda install -y -c anaconda future numpy opencv matplotlib tqdm pillow 9 | # RUN conda install -y -c conda-forge scikit-image tensorboard pycocotools 10 | 11 | ## Install OpenCV with Gstreamer support 12 | #WORKDIR /usr/src 13 | #RUN pip uninstall -y opencv-python 14 | #RUN apt-get update 15 | #RUN apt-get install -y gstreamer1.0-tools gstreamer1.0-python3-dbg-plugin-loader libgstreamer1.0-dev libgstreamer-plugins-base1.0-dev 16 | #RUN git clone https://github.com/opencv/opencv.git && cd opencv && git checkout 4.1.1 && mkdir build 17 | #RUN git clone https://github.com/opencv/opencv_contrib.git && cd opencv_contrib && git checkout 4.1.1 18 | #RUN cd opencv/build && cmake ../ \ 19 | # -D OPENCV_EXTRA_MODULES_PATH=../../opencv_contrib/modules \ 20 | # -D BUILD_OPENCV_PYTHON3=ON \ 21 | # -D PYTHON3_EXECUTABLE=/opt/conda/bin/python \ 22 | # -D PYTHON3_INCLUDE_PATH=/opt/conda/include/python3.6m \ 23 | # -D PYTHON3_LIBRARIES=/opt/conda/lib/python3.6/site-packages \ 24 | # -D WITH_GSTREAMER=ON \ 25 | # -D WITH_FFMPEG=OFF \ 26 | # && make && make install && ldconfig 27 | #RUN cd /usr/local/lib/python3.6/site-packages/cv2/python-3.6/ && mv cv2.cpython-36m-x86_64-linux-gnu.so cv2.so 28 | #RUN cd 
/opt/conda/lib/python3.6/site-packages/ && ln -s /usr/local/lib/python3.6/site-packages/cv2/python-3.6/cv2.so cv2.so 29 | #RUN python3 -c "import cv2; print(cv2.getBuildInformation())" 30 | 31 | # Create working directory 32 | RUN mkdir -p /usr/src/app 33 | WORKDIR /usr/src/app 34 | 35 | # Copy contents 36 | COPY . /usr/src/app 37 | 38 | # Copy weights 39 | #RUN python3 -c "from models import *; \ 40 | #attempt_download('weights/yolov3.pt'); \ 41 | #attempt_download('weights/yolov3-spp.pt')" 42 | 43 | 44 | # --------------------------------------------------- Extras Below --------------------------------------------------- 45 | 46 | # Build and Push 47 | # t=ultralytics/yolov3:v0 && sudo docker build -t $t . && sudo docker push $t 48 | 49 | # Run 50 | # t=ultralytics/yolov3:v0 && sudo docker pull $t && sudo docker run -it --gpus all --ipc=host $t bash 51 | 52 | # Pull and Run with local directory access 53 | # t=ultralytics/yolov3:v0 && sudo docker pull $t && sudo docker run -it --gpus all --ipc=host -v "$(pwd)"/coco:/usr/src/coco $t bash 54 | 55 | # Kill all 56 | # sudo docker kill "$(sudo docker ps -q)" 57 | 58 | # Kill all image-based 59 | # sudo docker kill $(sudo docker ps -a -q --filter ancestor=ultralytics/yolov3:v0) 60 | 61 | # Run bash for loop 62 | # sudo docker run --gpus all --ipc=host ultralytics/yolov3:v0 while true; do python3 train.py --evolve; done 63 | -------------------------------------------------------------------------------- /cfg/yolov3-tiny.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | batch=1 4 | subdivisions=1 5 | # Training 6 | # batch=64 7 | # subdivisions=2 8 | width=416 9 | height=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=16 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | [maxpool] 34 | size=2 35 | stride=2 36 | 37 | [convolutional] 38 | batch_normalize=1 39 | filters=32 40 | size=3 41 | stride=1 42 | pad=1 43 | activation=leaky 44 | 45 | [maxpool] 46 | size=2 47 | stride=2 48 | 49 | [convolutional] 50 | batch_normalize=1 51 | filters=64 52 | size=3 53 | stride=1 54 | pad=1 55 | activation=leaky 56 | 57 | [maxpool] 58 | size=2 59 | stride=2 60 | 61 | [convolutional] 62 | batch_normalize=1 63 | filters=128 64 | size=3 65 | stride=1 66 | pad=1 67 | activation=leaky 68 | 69 | [maxpool] 70 | size=2 71 | stride=2 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=256 76 | size=3 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [maxpool] 82 | size=2 83 | stride=2 84 | 85 | [convolutional] 86 | batch_normalize=1 87 | filters=512 88 | size=3 89 | stride=1 90 | pad=1 91 | activation=leaky 92 | 93 | [maxpool] 94 | size=2 95 | stride=1 96 | 97 | [convolutional] 98 | batch_normalize=1 99 | filters=1024 100 | size=3 101 | stride=1 102 | pad=1 103 | activation=leaky 104 | 105 | ########### 106 | 107 | [convolutional] 108 | batch_normalize=1 109 | filters=256 110 | size=1 111 | stride=1 112 | pad=1 113 | activation=leaky 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=512 118 | size=3 119 | stride=1 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | size=1 125 | stride=1 126 | pad=1 127 | filters=255 128 | activation=linear 129 | 130 | 131 | 132 | [yolo] 133 
| mask = 3,4,5 134 | anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 135 | classes=80 136 | num=6 137 | jitter=.3 138 | ignore_thresh = .7 139 | truth_thresh = 1 140 | random=1 141 | 142 | [route] 143 | layers = -4 144 | 145 | [convolutional] 146 | batch_normalize=1 147 | filters=128 148 | size=1 149 | stride=1 150 | pad=1 151 | activation=leaky 152 | 153 | [upsample] 154 | stride=2 155 | 156 | [route] 157 | layers = -1, 8 158 | 159 | [convolutional] 160 | batch_normalize=1 161 | filters=256 162 | size=3 163 | stride=1 164 | pad=1 165 | activation=leaky 166 | 167 | [convolutional] 168 | size=1 169 | stride=1 170 | pad=1 171 | filters=255 172 | activation=linear 173 | 174 | [yolo] 175 | mask = 1,2,3 176 | anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 177 | classes=80 178 | num=6 179 | jitter=.3 180 | ignore_thresh = .7 181 | truth_thresh = 1 182 | random=1 183 | -------------------------------------------------------------------------------- /data/coco64.txt: -------------------------------------------------------------------------------- 1 | ../coco/images/train2017/000000109622.jpg 2 | ../coco/images/train2017/000000160694.jpg 3 | ../coco/images/train2017/000000308590.jpg 4 | ../coco/images/train2017/000000327573.jpg 5 | ../coco/images/train2017/000000062929.jpg 6 | ../coco/images/train2017/000000512793.jpg 7 | ../coco/images/train2017/000000371735.jpg 8 | ../coco/images/train2017/000000148118.jpg 9 | ../coco/images/train2017/000000309856.jpg 10 | ../coco/images/train2017/000000141882.jpg 11 | ../coco/images/train2017/000000318783.jpg 12 | ../coco/images/train2017/000000337760.jpg 13 | ../coco/images/train2017/000000298197.jpg 14 | ../coco/images/train2017/000000042421.jpg 15 | ../coco/images/train2017/000000328898.jpg 16 | ../coco/images/train2017/000000458856.jpg 17 | ../coco/images/train2017/000000073824.jpg 18 | ../coco/images/train2017/000000252846.jpg 19 | ../coco/images/train2017/000000459590.jpg 20 | ../coco/images/train2017/000000273650.jpg 21 | ../coco/images/train2017/000000331311.jpg 22 | ../coco/images/train2017/000000156326.jpg 23 | ../coco/images/train2017/000000262985.jpg 24 | ../coco/images/train2017/000000253580.jpg 25 | ../coco/images/train2017/000000447976.jpg 26 | ../coco/images/train2017/000000378077.jpg 27 | ../coco/images/train2017/000000259913.jpg 28 | ../coco/images/train2017/000000424553.jpg 29 | ../coco/images/train2017/000000000612.jpg 30 | ../coco/images/train2017/000000267625.jpg 31 | ../coco/images/train2017/000000566012.jpg 32 | ../coco/images/train2017/000000196664.jpg 33 | ../coco/images/train2017/000000363331.jpg 34 | ../coco/images/train2017/000000057992.jpg 35 | ../coco/images/train2017/000000520047.jpg 36 | ../coco/images/train2017/000000453903.jpg 37 | ../coco/images/train2017/000000162083.jpg 38 | ../coco/images/train2017/000000268516.jpg 39 | ../coco/images/train2017/000000277436.jpg 40 | ../coco/images/train2017/000000189744.jpg 41 | ../coco/images/train2017/000000041128.jpg 42 | ../coco/images/train2017/000000527728.jpg 43 | ../coco/images/train2017/000000465269.jpg 44 | ../coco/images/train2017/000000246833.jpg 45 | ../coco/images/train2017/000000076784.jpg 46 | ../coco/images/train2017/000000323715.jpg 47 | ../coco/images/train2017/000000560463.jpg 48 | ../coco/images/train2017/000000006263.jpg 49 | ../coco/images/train2017/000000094701.jpg 50 | ../coco/images/train2017/000000521359.jpg 51 | ../coco/images/train2017/000000302903.jpg 52 | ../coco/images/train2017/000000047559.jpg 53 | ../coco/images/train2017/000000480583.jpg 54 
| ../coco/images/train2017/000000050025.jpg 55 | ../coco/images/train2017/000000084512.jpg 56 | ../coco/images/train2017/000000508913.jpg 57 | ../coco/images/train2017/000000093708.jpg 58 | ../coco/images/train2017/000000070493.jpg 59 | ../coco/images/train2017/000000539270.jpg 60 | ../coco/images/train2017/000000474402.jpg 61 | ../coco/images/train2017/000000209842.jpg 62 | ../coco/images/train2017/000000028820.jpg 63 | ../coco/images/train2017/000000154257.jpg 64 | ../coco/images/train2017/000000342499.jpg 65 | -------------------------------------------------------------------------------- /cfg/yolov3-tiny-1cls.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | batch=1 4 | subdivisions=1 5 | # Training 6 | # batch=64 7 | # subdivisions=2 8 | width=416 9 | height=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=16 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | [maxpool] 34 | size=2 35 | stride=2 36 | 37 | [convolutional] 38 | batch_normalize=1 39 | filters=32 40 | size=3 41 | stride=1 42 | pad=1 43 | activation=leaky 44 | 45 | [maxpool] 46 | size=2 47 | stride=2 48 | 49 | [convolutional] 50 | batch_normalize=1 51 | filters=64 52 | size=3 53 | stride=1 54 | pad=1 55 | activation=leaky 56 | 57 | [maxpool] 58 | size=2 59 | stride=2 60 | 61 | [convolutional] 62 | batch_normalize=1 63 | filters=128 64 | size=3 65 | stride=1 66 | pad=1 67 | activation=leaky 68 | 69 | [maxpool] 70 | size=2 71 | stride=2 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=256 76 | size=3 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [maxpool] 82 | size=2 83 | stride=2 84 | 85 | [convolutional] 86 | batch_normalize=1 87 | filters=512 88 | size=3 89 | stride=1 90 | pad=1 91 | activation=leaky 92 | 93 | [maxpool] 94 | size=2 95 | stride=1 96 | 97 | [convolutional] 98 | batch_normalize=1 99 | filters=1024 100 | size=3 101 | stride=1 102 | pad=1 103 | activation=leaky 104 | 105 | ########### 106 | 107 | [convolutional] 108 | batch_normalize=1 109 | filters=256 110 | size=1 111 | stride=1 112 | pad=1 113 | activation=leaky 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=512 118 | size=3 119 | stride=1 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | size=1 125 | stride=1 126 | pad=1 127 | filters=18 128 | activation=linear 129 | 130 | 131 | 132 | [yolo] 133 | mask = 3,4,5 134 | anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 135 | classes=1 136 | num=6 137 | jitter=.3 138 | ignore_thresh = .7 139 | truth_thresh = 1 140 | random=1 141 | 142 | [route] 143 | layers = -4 144 | 145 | [convolutional] 146 | batch_normalize=1 147 | filters=128 148 | size=1 149 | stride=1 150 | pad=1 151 | activation=leaky 152 | 153 | [upsample] 154 | stride=2 155 | 156 | [route] 157 | layers = -1, 8 158 | 159 | [convolutional] 160 | batch_normalize=1 161 | filters=256 162 | size=3 163 | stride=1 164 | pad=1 165 | activation=leaky 166 | 167 | [convolutional] 168 | size=1 169 | stride=1 170 | pad=1 171 | filters=18 172 | activation=linear 173 | 174 | [yolo] 175 | mask = 0,1,2 176 | anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 177 | classes=1 178 | num=6 179 | jitter=.3 180 | ignore_thresh = .7 181 | truth_thresh = 1 182 
| random=1 183 | -------------------------------------------------------------------------------- /cfg/yolov3-tiny-3cls.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | batch=1 4 | subdivisions=1 5 | # Training 6 | # batch=64 7 | # subdivisions=2 8 | width=416 9 | height=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=16 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | [maxpool] 34 | size=2 35 | stride=2 36 | 37 | [convolutional] 38 | batch_normalize=1 39 | filters=32 40 | size=3 41 | stride=1 42 | pad=1 43 | activation=leaky 44 | 45 | [maxpool] 46 | size=2 47 | stride=2 48 | 49 | [convolutional] 50 | batch_normalize=1 51 | filters=64 52 | size=3 53 | stride=1 54 | pad=1 55 | activation=leaky 56 | 57 | [maxpool] 58 | size=2 59 | stride=2 60 | 61 | [convolutional] 62 | batch_normalize=1 63 | filters=128 64 | size=3 65 | stride=1 66 | pad=1 67 | activation=leaky 68 | 69 | [maxpool] 70 | size=2 71 | stride=2 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=256 76 | size=3 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [maxpool] 82 | size=2 83 | stride=2 84 | 85 | [convolutional] 86 | batch_normalize=1 87 | filters=512 88 | size=3 89 | stride=1 90 | pad=1 91 | activation=leaky 92 | 93 | [maxpool] 94 | size=2 95 | stride=1 96 | 97 | [convolutional] 98 | batch_normalize=1 99 | filters=1024 100 | size=3 101 | stride=1 102 | pad=1 103 | activation=leaky 104 | 105 | ########### 106 | 107 | [convolutional] 108 | batch_normalize=1 109 | filters=256 110 | size=1 111 | stride=1 112 | pad=1 113 | activation=leaky 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=512 118 | size=3 119 | stride=1 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | size=1 125 | stride=1 126 | pad=1 127 | filters=24 128 | activation=linear 129 | 130 | 131 | 132 | [yolo] 133 | mask = 3,4,5 134 | anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 135 | classes=3 136 | num=6 137 | jitter=.3 138 | ignore_thresh = .7 139 | truth_thresh = 1 140 | random=1 141 | 142 | [route] 143 | layers = -4 144 | 145 | [convolutional] 146 | batch_normalize=1 147 | filters=128 148 | size=1 149 | stride=1 150 | pad=1 151 | activation=leaky 152 | 153 | [upsample] 154 | stride=2 155 | 156 | [route] 157 | layers = -1, 8 158 | 159 | [convolutional] 160 | batch_normalize=1 161 | filters=256 162 | size=3 163 | stride=1 164 | pad=1 165 | activation=leaky 166 | 167 | [convolutional] 168 | size=1 169 | stride=1 170 | pad=1 171 | filters=24 172 | activation=linear 173 | 174 | [yolo] 175 | mask = 0,1,2 176 | anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 177 | classes=3 178 | num=6 179 | jitter=.3 180 | ignore_thresh = .7 181 | truth_thresh = 1 182 | random=1 183 | -------------------------------------------------------------------------------- /utils/google_utils.py: -------------------------------------------------------------------------------- 1 | # This file contains google utils: https://cloud.google.com/storage/docs/reference/libraries 2 | # pip install --upgrade google-cloud-storage 3 | 4 | import os 5 | import time 6 | 7 | 8 | # from google.cloud import storage 9 | 10 | 11 | def 
gdrive_download(id='1HaXkef9z6y5l4vUnCYgdmEAj61c6bfWO', name='coco.zip'): 12 | # https://gist.github.com/tanaikech/f0f2d122e05bf5f971611258c22c110f 13 | # Downloads a file from Google Drive, accepting the download-confirmation query presented for large files 14 | # from utils.google_utils import *; gdrive_download() 15 | t = time.time() 16 | 17 | print('Downloading https://drive.google.com/uc?export=download&id=%s as %s... ' % (id, name), end='') 18 | os.remove(name) if os.path.exists(name) else None # remove existing 19 | os.remove('cookie') if os.path.exists('cookie') else None 20 | 21 | # Attempt file download 22 | os.system("curl -c ./cookie -s -L \"https://drive.google.com/uc?export=download&id=%s\" > /dev/null" % id) 23 | if os.path.exists('cookie'): # large file 24 | s = "curl -Lb ./cookie \"https://drive.google.com/uc?export=download&confirm=`awk '/download/ {print $NF}' ./cookie`&id=%s\" -o %s" % ( 25 | id, name) 26 | else: # small file 27 | s = "curl -s -L -o %s 'https://drive.google.com/uc?export=download&id=%s'" % (name, id) 28 | r = os.system(s) # execute, capture return value 29 | os.remove('cookie') if os.path.exists('cookie') else None 30 | 31 | # Error check 32 | if r != 0: 33 | os.remove(name) if os.path.exists(name) else None # remove partial 34 | print('Download error ') # raise Exception('Download error') 35 | return r 36 | 37 | # Unzip if archive 38 | if name.endswith('.zip'): 39 | print('unzipping... ', end='') 40 | os.system('unzip -q %s' % name) # unzip 41 | os.remove(name) # remove zip to free space 42 | 43 | print('Done (%.1fs)' % (time.time() - t)) 44 | return r 45 | 46 | 47 | def upload_blob(bucket_name, source_file_name, destination_blob_name): 48 | # Uploads a file to a bucket (requires the `google.cloud.storage` import commented out above) 49 | # https://cloud.google.com/storage/docs/uploading-objects#storage-upload-object-python 50 | 51 | storage_client = storage.Client() 52 | bucket = storage_client.get_bucket(bucket_name) 53 | blob = bucket.blob(destination_blob_name) 54 | 55 | blob.upload_from_filename(source_file_name) 56 | 57 | print('File {} uploaded to {}.'.format( 58 | source_file_name, 59 | destination_blob_name)) 60 | 61 | 62 | def download_blob(bucket_name, source_blob_name, destination_file_name): 63 | # Downloads a blob from a bucket 64 | storage_client = storage.Client() 65 | bucket = storage_client.get_bucket(bucket_name) 66 | blob = bucket.blob(source_blob_name) 67 | 68 | blob.download_to_filename(destination_file_name) 69 | 70 | print('Blob {} downloaded to {}.'.format( 71 | source_blob_name, 72 | destination_file_name)) 73 | -------------------------------------------------------------------------------- /utils/parse_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | 5 | 6 | def parse_model_cfg(path): 7 | # Parse the yolo *.cfg file and return module definitions. Path may be 'cfg/yolov3.cfg', 'yolov3.cfg', or 'yolov3' 8 | if not path.endswith('.cfg'): # add .cfg suffix if omitted 9 | path += '.cfg' 10 | if not os.path.exists(path) and os.path.exists('cfg' + os.sep + path): # add cfg/ prefix if omitted 11 | path = 'cfg' + os.sep + path 12 | 13 | with open(path, 'r') as f: 14 | lines = f.read().split('\n') 15 | lines = [x for x in lines if x and not x.startswith('#')] 16 | lines = [x.rstrip().lstrip() for x in lines] # strip leading/trailing whitespace 17 | mdefs = [] # module definitions 18 | for line in lines: 19 | if line.startswith('['): # This marks the start of a new block 20 | mdefs.append({}) 21 | mdefs[-1]['type'] = line[1:-1].rstrip() 22 | if
mdefs[-1]['type'] == 'convolutional': 23 | mdefs[-1]['batch_normalize'] = 0 # pre-populate with zeros (may be overwritten later) 24 | else: 25 | key, val = line.split("=") 26 | key = key.rstrip() 27 | 28 | if key == 'anchors': # return nparray 29 | mdefs[-1][key] = np.array([float(x) for x in val.split(',')]).reshape((-1, 2)) # np anchors 30 | elif (key in ['from', 'layers', 'mask']) or (key == 'size' and ',' in val): # return array 31 | mdefs[-1][key] = [int(x) for x in val.split(',')] 32 | else: 33 | val = val.strip() 34 | if val.isnumeric(): # return int or float 35 | mdefs[-1][key] = int(val) if (int(val) - float(val)) == 0 else float(val) 36 | else: 37 | mdefs[-1][key] = val # return string 38 | 39 | # Check all fields are supported 40 | supported = ['type', 'batch_normalize', 'filters', 'size', 'stride', 'pad', 'activation', 'layers', 'groups', 41 | 'from', 'mask', 'anchors', 'classes', 'num', 'jitter', 'ignore_thresh', 'truth_thresh', 'random', 42 | 'stride_x', 'stride_y', 'weights_type', 'weights_normalization', 'scale_x_y', 'beta_nms', 'nms_kind', 43 | 'iou_loss', 'iou_normalizer', 'cls_normalizer', 'iou_thresh'] 44 | 45 | f = [] # fields 46 | for x in mdefs[1:]: 47 | [f.append(k) for k in x if k not in f] 48 | u = [x for x in f if x not in supported] # unsupported fields 49 | assert not any(u), "Unsupported fields %s in %s. See https://github.com/ultralytics/yolov3/issues/631" % (u, path) 50 | 51 | return mdefs 52 | 53 | 54 | def parse_data_cfg(path): 55 | # Parses the data configuration file 56 | if not os.path.exists(path) and os.path.exists('data' + os.sep + path): # add data/ prefix if omitted 57 | path = 'data' + os.sep + path 58 | 59 | with open(path, 'r') as f: 60 | lines = f.readlines() 61 | 62 | options = dict() 63 | for line in lines: 64 | line = line.strip() 65 | if line == '' or line.startswith('#'): 66 | continue 67 | key, val = line.split('=') 68 | options[key.strip()] = val.strip() 69 | 70 | return options 71 | -------------------------------------------------------------------------------- /cfg/yolov3-tiny3-1cls.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | # batch=1 4 | # subdivisions=1 5 | # Training 6 | batch=64 7 | subdivisions=16 8 | width=608 9 | height=608 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 200000 21 | policy=steps 22 | steps=180000,190000 23 | scales=.1,.1 24 | 25 | 26 | [convolutional] 27 | batch_normalize=1 28 | filters=16 29 | size=3 30 | stride=1 31 | pad=1 32 | activation=leaky 33 | 34 | [maxpool] 35 | size=2 36 | stride=2 37 | 38 | [convolutional] 39 | batch_normalize=1 40 | filters=32 41 | size=3 42 | stride=1 43 | pad=1 44 | activation=leaky 45 | 46 | [maxpool] 47 | size=2 48 | stride=2 49 | 50 | [convolutional] 51 | batch_normalize=1 52 | filters=64 53 | size=3 54 | stride=1 55 | pad=1 56 | activation=leaky 57 | 58 | [maxpool] 59 | size=2 60 | stride=2 61 | 62 | [convolutional] 63 | batch_normalize=1 64 | filters=128 65 | size=3 66 | stride=1 67 | pad=1 68 | activation=leaky 69 | 70 | [maxpool] 71 | size=2 72 | stride=2 73 | 74 | [convolutional] 75 | batch_normalize=1 76 | filters=256 77 | size=3 78 | stride=1 79 | pad=1 80 | activation=leaky 81 | 82 | [maxpool] 83 | size=2 84 | stride=2 85 | 86 | [convolutional] 87 | batch_normalize=1 88 | filters=512 89 | size=3 90 | stride=1 91 | pad=1 92 | activation=leaky 93 | 94 | [maxpool] 95 
| size=2 96 | stride=1 97 | 98 | [convolutional] 99 | batch_normalize=1 100 | filters=1024 101 | size=3 102 | stride=1 103 | pad=1 104 | activation=leaky 105 | 106 | ########### 107 | 108 | [convolutional] 109 | batch_normalize=1 110 | filters=256 111 | size=1 112 | stride=1 113 | pad=1 114 | activation=leaky 115 | 116 | [convolutional] 117 | batch_normalize=1 118 | filters=512 119 | size=3 120 | stride=1 121 | pad=1 122 | activation=leaky 123 | 124 | [convolutional] 125 | size=1 126 | stride=1 127 | pad=1 128 | filters=18 129 | activation=linear 130 | 131 | 132 | 133 | [yolo] 134 | mask = 6,7,8 135 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 136 | classes=1 137 | num=9 138 | jitter=.3 139 | ignore_thresh = .7 140 | truth_thresh = 1 141 | random=1 142 | 143 | [route] 144 | layers = -4 145 | 146 | [convolutional] 147 | batch_normalize=1 148 | filters=128 149 | size=1 150 | stride=1 151 | pad=1 152 | activation=leaky 153 | 154 | [upsample] 155 | stride=2 156 | 157 | [route] 158 | layers = -1, 8 159 | 160 | [convolutional] 161 | batch_normalize=1 162 | filters=256 163 | size=3 164 | stride=1 165 | pad=1 166 | activation=leaky 167 | 168 | [convolutional] 169 | size=1 170 | stride=1 171 | pad=1 172 | filters=18 173 | activation=linear 174 | 175 | [yolo] 176 | mask = 3,4,5 177 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 178 | classes=1 179 | num=9 180 | jitter=.3 181 | ignore_thresh = .7 182 | truth_thresh = 1 183 | random=1 184 | 185 | 186 | 187 | [route] 188 | layers = -3 189 | 190 | [convolutional] 191 | batch_normalize=1 192 | filters=128 193 | size=1 194 | stride=1 195 | pad=1 196 | activation=leaky 197 | 198 | [upsample] 199 | stride=2 200 | 201 | [route] 202 | layers = -1, 6 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=3 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | size=1 214 | stride=1 215 | pad=1 216 | filters=18 217 | activation=linear 218 | 219 | [yolo] 220 | mask = 0,1,2 221 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 222 | classes=1 223 | num=9 224 | jitter=.3 225 | ignore_thresh = .7 226 | truth_thresh = 1 227 | random=1 228 | -------------------------------------------------------------------------------- /cfg/yolov3-tiny3.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | # batch=1 4 | # subdivisions=1 5 | # Training 6 | batch=64 7 | subdivisions=16 8 | width=608 9 | height=608 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 200000 21 | policy=steps 22 | steps=180000,190000 23 | scales=.1,.1 24 | 25 | 26 | [convolutional] 27 | batch_normalize=1 28 | filters=16 29 | size=3 30 | stride=1 31 | pad=1 32 | activation=leaky 33 | 34 | [maxpool] 35 | size=2 36 | stride=2 37 | 38 | [convolutional] 39 | batch_normalize=1 40 | filters=32 41 | size=3 42 | stride=1 43 | pad=1 44 | activation=leaky 45 | 46 | [maxpool] 47 | size=2 48 | stride=2 49 | 50 | [convolutional] 51 | batch_normalize=1 52 | filters=64 53 | size=3 54 | stride=1 55 | pad=1 56 | activation=leaky 57 | 58 | [maxpool] 59 | size=2 60 | stride=2 61 | 62 | [convolutional] 63 | batch_normalize=1 64 | filters=128 65 | size=3 66 | stride=1 67 | pad=1 68 | activation=leaky 69 | 70 | [maxpool] 71 | size=2 72 | stride=2 73 | 74 | [convolutional] 75 | 
batch_normalize=1 76 | filters=256 77 | size=3 78 | stride=1 79 | pad=1 80 | activation=leaky 81 | 82 | [maxpool] 83 | size=2 84 | stride=2 85 | 86 | [convolutional] 87 | batch_normalize=1 88 | filters=512 89 | size=3 90 | stride=1 91 | pad=1 92 | activation=leaky 93 | 94 | [maxpool] 95 | size=2 96 | stride=1 97 | 98 | [convolutional] 99 | batch_normalize=1 100 | filters=1024 101 | size=3 102 | stride=1 103 | pad=1 104 | activation=leaky 105 | 106 | ########### 107 | 108 | [convolutional] 109 | batch_normalize=1 110 | filters=256 111 | size=1 112 | stride=1 113 | pad=1 114 | activation=leaky 115 | 116 | [convolutional] 117 | batch_normalize=1 118 | filters=512 119 | size=3 120 | stride=1 121 | pad=1 122 | activation=leaky 123 | 124 | [convolutional] 125 | size=1 126 | stride=1 127 | pad=1 128 | filters=255 129 | activation=linear 130 | 131 | 132 | 133 | [yolo] 134 | mask = 6,7,8 135 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 136 | classes=80 137 | num=9 138 | jitter=.3 139 | ignore_thresh = .7 140 | truth_thresh = 1 141 | random=1 142 | 143 | [route] 144 | layers = -4 145 | 146 | [convolutional] 147 | batch_normalize=1 148 | filters=128 149 | size=1 150 | stride=1 151 | pad=1 152 | activation=leaky 153 | 154 | [upsample] 155 | stride=2 156 | 157 | [route] 158 | layers = -1, 8 159 | 160 | [convolutional] 161 | batch_normalize=1 162 | filters=256 163 | size=3 164 | stride=1 165 | pad=1 166 | activation=leaky 167 | 168 | [convolutional] 169 | size=1 170 | stride=1 171 | pad=1 172 | filters=255 173 | activation=linear 174 | 175 | [yolo] 176 | mask = 3,4,5 177 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 178 | classes=80 179 | num=9 180 | jitter=.3 181 | ignore_thresh = .7 182 | truth_thresh = 1 183 | random=1 184 | 185 | 186 | 187 | [route] 188 | layers = -3 189 | 190 | [convolutional] 191 | batch_normalize=1 192 | filters=128 193 | size=1 194 | stride=1 195 | pad=1 196 | activation=leaky 197 | 198 | [upsample] 199 | stride=2 200 | 201 | [route] 202 | layers = -1, 6 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=3 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | size=1 214 | stride=1 215 | pad=1 216 | filters=255 217 | activation=linear 218 | 219 | [yolo] 220 | mask = 0,1,2 221 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 222 | classes=80 223 | num=9 224 | jitter=.3 225 | ignore_thresh = .7 226 | truth_thresh = 1 227 | random=1 228 | -------------------------------------------------------------------------------- /utils/layers.py: -------------------------------------------------------------------------------- 1 | import torch.nn.functional as F 2 | 3 | from utils.utils import * 4 | 5 | 6 | def make_divisible(v, divisor): 7 | # Function ensures all layers have a channel number that is divisible by 8 8 | # https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py 9 | return math.ceil(v / divisor) * divisor 10 | 11 | 12 | class Flatten(nn.Module): 13 | # Use after nn.AdaptiveAvgPool2d(1) to remove last 2 dimensions 14 | def forward(self, x): 15 | return x.view(x.size(0), -1) 16 | 17 | 18 | class Concat(nn.Module): 19 | # Concatenate a list of tensors along dimension 20 | def __init__(self, dimension=1): 21 | super(Concat, self).__init__() 22 | self.d = dimension 23 | 24 | def forward(self, x): 25 | return torch.cat(x, self.d) 26 | 27 | 28 | class FeatureConcat(nn.Module): 29 | def 
__init__(self, layers): 30 | super(FeatureConcat, self).__init__() 31 | self.layers = layers # layer indices 32 | self.multiple = len(layers) > 1 # multiple layers flag 33 | 34 | def forward(self, x, outputs): 35 | return torch.cat([outputs[i] for i in self.layers], 1) if self.multiple else outputs[self.layers[0]] 36 | 37 | 38 | class WeightedFeatureFusion(nn.Module): # weighted sum of 2 or more layers https://arxiv.org/abs/1911.09070 39 | def __init__(self, layers, weight=False): 40 | super(WeightedFeatureFusion, self).__init__() 41 | self.layers = layers # layer indices 42 | self.weight = weight # apply weights boolean 43 | self.n = len(layers) + 1 # number of layers 44 | if weight: 45 | self.w = nn.Parameter(torch.zeros(self.n), requires_grad=True) # layer weights 46 | 47 | def forward(self, x, outputs): 48 | # Weights 49 | if self.weight: 50 | w = torch.sigmoid(self.w) * (2 / self.n) # sigmoid weights (0-1) 51 | x = x * w[0] 52 | 53 | # Fusion 54 | nx = x.shape[1] # input channels 55 | for i in range(self.n - 1): 56 | a = outputs[self.layers[i]] * w[i + 1] if self.weight else outputs[self.layers[i]] # feature to add 57 | na = a.shape[1] # feature channels 58 | 59 | # Adjust channels 60 | if nx == na: # same shape 61 | x = x + a 62 | elif nx > na: # slice input 63 | x[:, :na] = x[:, :na] + a # or a = nn.ZeroPad2d((0, 0, 0, 0, 0, dc))(a); x = x + a 64 | else: # slice feature 65 | x = x + a[:, :nx] 66 | 67 | return x 68 | 69 | 70 | class MixConv2d(nn.Module): # MixConv: Mixed Depthwise Convolutional Kernels https://arxiv.org/abs/1907.09595 71 | def __init__(self, in_ch, out_ch, k=(3, 5, 7), stride=1, dilation=1, bias=True, method='equal_params'): 72 | super(MixConv2d, self).__init__() 73 | 74 | groups = len(k) 75 | if method == 'equal_ch': # equal channels per group 76 | i = torch.linspace(0, groups - 1E-6, out_ch).floor() # out_ch indices 77 | ch = [(i == g).sum() for g in range(groups)] 78 | else: # 'equal_params': equal parameter count per group 79 | b = [out_ch] + [0] * groups 80 | a = np.eye(groups + 1, groups, k=-1) 81 | a -= np.roll(a, 1, axis=1) 82 | a *= np.array(k) ** 2 83 | a[0] = 1 84 | ch = np.linalg.lstsq(a, b, rcond=None)[0].round().astype(int) # solve for equal weight indices, ax = b 85 | 86 | self.m = nn.ModuleList([nn.Conv2d(in_channels=in_ch, 87 | out_channels=ch[g], 88 | kernel_size=k[g], 89 | stride=stride, 90 | padding=k[g] // 2, # 'same' pad 91 | dilation=dilation, 92 | bias=bias) for g in range(groups)]) 93 | 94 | def forward(self, x): 95 | return torch.cat([m(x) for m in self.m], 1) 96 | 97 | 98 | # Activation functions below ------------------------------------------------------------------------------------------- 99 | class SwishImplementation(torch.autograd.Function): 100 | @staticmethod 101 | def forward(ctx, x): 102 | ctx.save_for_backward(x) 103 | return x * torch.sigmoid(x) 104 | 105 | @staticmethod 106 | def backward(ctx, grad_output): 107 | x = ctx.saved_tensors[0] 108 | sx = torch.sigmoid(x) # sigmoid(ctx) 109 | return grad_output * (sx * (1 + x * (1 - sx))) 110 | 111 | 112 | class MishImplementation(torch.autograd.Function): 113 | @staticmethod 114 | def forward(ctx, x): 115 | ctx.save_for_backward(x) 116 | return x.mul(torch.tanh(F.softplus(x))) # x * tanh(ln(1 + exp(x))) 117 | 118 | @staticmethod 119 | def backward(ctx, grad_output): 120 | x = ctx.saved_tensors[0] 121 | sx = torch.sigmoid(x) 122 | fx = F.softplus(x).tanh() 123 | return grad_output * (fx + x * sx * (1 - fx * fx)) 124 | 125 | 126 | class MemoryEfficientSwish(nn.Module): 127 | def 
forward(self, x): 128 | return SwishImplementation.apply(x) 129 | 130 | 131 | class MemoryEfficientMish(nn.Module): 132 | def forward(self, x): 133 | return MishImplementation.apply(x) 134 | 135 | 136 | class Swish(nn.Module): 137 | def forward(self, x): 138 | return x * torch.sigmoid(x) 139 | 140 | 141 | class HardSwish(nn.Module): # https://arxiv.org/pdf/1905.02244.pdf 142 | def forward(self, x): 143 | return x * F.hardtanh(x + 3, 0., 6., True) / 6. 144 | 145 | 146 | class Mish(nn.Module): # https://github.com/digantamisra98/Mish 147 | def forward(self, x): 148 | return x * F.softplus(x).tanh() 149 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Ultralytics-YOLOv3-Cluster-NMS 4 | ## Cluster-NMS into YOLOv3 PyTorch 5 | Our paper has been accepted by **IEEE Transactions on Cybernetics (TCYB)**. 6 | 7 | #### This is the code for our papers: 8 | - [Distance-IoU Loss: Faster and Better Learning for Bounding Box Regression](https://arxiv.org/abs/1911.08287) 9 | - [Enhancing Geometric Factors in Model Learning and Inference for Object Detection and Instance Segmentation](http://arxiv.org/abs/2005.03572) 10 | 11 | ``` 12 | @Inproceedings{zheng2020diou, 13 | author = {Zheng, Zhaohui and Wang, Ping and Liu, Wei and Li, Jinze and Ye, Rongguang and Ren, Dongwei}, 14 | title = {Distance-IoU Loss: Faster and Better Learning for Bounding Box Regression}, 15 | booktitle = {The AAAI Conference on Artificial Intelligence (AAAI)}, 16 | year = {2020}, 17 | } 18 | 19 | @Article{zheng2021ciou, 20 | author = {Zheng, Zhaohui and Wang, Ping and Ren, Dongwei and Liu, Wei and Ye, Rongguang and Hu, Qinghua and Zuo, Wangmeng}, 21 | title = {Enhancing Geometric Factors in Model Learning and Inference for Object Detection and Instance Segmentation}, 22 | journal = {IEEE Transactions on Cybernetics}, 23 | year = {2021}, 24 | } 25 | ``` 26 | # Introduction 27 | 28 | In this [paper](http://arxiv.org/abs/2005.03572), we propose Complete-IoU (CIoU) loss and Cluster-NMS for enhancing geometric factors in both bounding box regression and Non-Maximum Suppression (NMS), leading to notable gains in average precision (AP) and average recall (AR) without sacrificing inference efficiency. In particular, we consider three geometric factors, i.e., overlap area, normalized central-point distance and aspect ratio, which are crucial for measuring bounding box regression in object detection and instance segmentation. The three geometric factors are then incorporated into CIoU loss for better distinguishing difficult regression cases. Training deep models with CIoU loss yields consistent AP and AR improvements over the widely adopted Ln-norm loss and IoU-based loss. Furthermore, we propose Cluster-NMS, where NMS during inference is done by implicitly clustering detected boxes, and which usually requires fewer iterations. Cluster-NMS is very efficient due to its pure GPU implementation, and geometric factors can be incorporated to improve both AP and AR. In the experiments, CIoU loss and Cluster-NMS have been applied to state-of-the-art instance segmentation (e.g., YOLACT) and object detection (e.g., YOLOv3, SSD and Faster R-CNN) models. 29 | 30 | ### This repo focuses only on NMS improvement, based on https://github.com/ultralytics/yolov3. 31 | 32 | ### See the `non_max_suppression` function of [utils/utils.py](utils/utils.py) for our Cluster-NMS implementation.
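For orientation, the CIoU term described above can be sketched in a few lines of PyTorch. This is a minimal illustration of the formula from the paper, not the repo's implementation; the helper name `bbox_ciou` and the `(x1, y1, x2, y2)` box format are assumptions of this sketch:

```python
import math
import torch


def bbox_ciou(box1, box2, eps=1e-9):
    # box1, box2: (n, 4) tensors in (x1, y1, x2, y2) format (assumed here)
    w1, h1 = box1[:, 2] - box1[:, 0], box1[:, 3] - box1[:, 1]
    w2, h2 = box2[:, 2] - box2[:, 0], box2[:, 3] - box2[:, 1]

    # Geometric factor 1: overlap area (IoU)
    inter = (torch.min(box1[:, 2], box2[:, 2]) - torch.max(box1[:, 0], box2[:, 0])).clamp(0) * \
            (torch.min(box1[:, 3], box2[:, 3]) - torch.max(box1[:, 1], box2[:, 1])).clamp(0)
    union = w1 * h1 + w2 * h2 - inter + eps
    iou = inter / union

    # Geometric factor 2: normalized central-point distance, i.e. squared
    # center distance over the squared diagonal of the smallest enclosing box
    cw = torch.max(box1[:, 2], box2[:, 2]) - torch.min(box1[:, 0], box2[:, 0])
    ch = torch.max(box1[:, 3], box2[:, 3]) - torch.min(box1[:, 1], box2[:, 1])
    c2 = cw ** 2 + ch ** 2 + eps
    rho2 = ((box1[:, 0] + box1[:, 2] - box2[:, 0] - box2[:, 2]) ** 2 +
            (box1[:, 1] + box1[:, 3] - box2[:, 1] - box2[:, 3]) ** 2) / 4

    # Geometric factor 3: aspect-ratio consistency term v with trade-off alpha
    v = (4 / math.pi ** 2) * (torch.atan(w2 / (h2 + eps)) - torch.atan(w1 / (h1 + eps))) ** 2
    with torch.no_grad():
        alpha = v / (1 - iou + v + eps)
    return iou - rho2 / c2 - alpha * v  # CIoU; the loss is 1 - CIoU
```

Treating the trade-off parameter `alpha` as a constant during backpropagation (the `torch.no_grad()` block) is a common implementation choice, not a requirement of the formula.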
33 | 
34 | This directory contains PyTorch YOLOv3 software developed by Ultralytics LLC, and **is freely available for redistribution under the GPL-3.0 license**. For more information please visit https://www.ultralytics.com.
35 | 
36 | # Description
37 | 
38 | The https://github.com/ultralytics/yolov3 repo contains inference and training code for YOLOv3 in PyTorch. The code works on Linux, macOS and Windows. Training is done on the COCO dataset by default: https://cocodataset.org/#home. **Credit to Joseph Redmon for YOLO:** https://pjreddie.com/darknet/yolo/.
39 | 
40 | # Requirements
41 | 
42 | Python 3.7 or later with all packages from `requirements.txt` installed (`pip install -U -r requirements.txt`), including `torch >= 1.5`. Docker images come with all dependencies preinstalled. Docker requirements are:
43 | - Nvidia Driver >= 440.44
44 | - Docker Engine - CE >= 19.03
45 | 
46 | # mAP
47 | 
48 | |Size |COCO mAP<br>@0.5...0.95 |COCO mAP<br>@0.5
49 | --- | --- | ---
50 | YOLOv3-tiny<br>YOLOv3<br>YOLOv3-SPP<br>**[YOLOv3-SPP-ultralytics](https://drive.google.com/open?id=1UcR-zVoMs7DH5dj3N1bswkiQTA4dmKF4)** |320 |14.0<br>28.7<br>30.5<br>**37.7** |29.1<br>51.8<br>52.3<br>**56.8**
51 | YOLOv3-tiny<br>YOLOv3<br>YOLOv3-SPP<br>**[YOLOv3-SPP-ultralytics](https://drive.google.com/open?id=1UcR-zVoMs7DH5dj3N1bswkiQTA4dmKF4)** |416 |16.0<br>31.2<br>33.9<br>**41.2** |33.0<br>55.4<br>56.9<br>**60.6**
52 | YOLOv3-tiny<br>YOLOv3<br>YOLOv3-SPP<br>**[YOLOv3-SPP-ultralytics](https://drive.google.com/open?id=1UcR-zVoMs7DH5dj3N1bswkiQTA4dmKF4)** |512 |16.6<br>32.7<br>35.6<br>**42.6** |34.9<br>57.7<br>59.5<br>**62.4**
53 | YOLOv3-tiny<br>YOLOv3<br>YOLOv3-SPP<br>**[YOLOv3-SPP-ultralytics](https://drive.google.com/open?id=1UcR-zVoMs7DH5dj3N1bswkiQTA4dmKF4)** |608 |16.6<br>33.1<br>37.0<br>**43.1** |35.4<br>58.2<br>60.7<br>**62.8**
54 | 
55 | - mAP@0.5 is run at `--iou-thr 0.5`, mAP@0.5...0.95 at `--iou-thr 0.7`
56 | - Darknet results: https://arxiv.org/abs/1804.02767
57 | 
58 | ## Cluster-NMS
59 | 
60 | #### Hardware
61 | - 2 GTX 1080 Ti
62 | - Intel(R) Core(TM) i7-6850K CPU @ 3.60GHz
63 | 
64 | Evaluation command: `python3 test.py --cfg yolov3-spp.cfg --weights yolov3-spp-ultralytics.pt`
65 | 
66 | AP is reported on `coco 2014 minival`.
67 | 
68 | | Image Size | Model | NMS | FPS | box AP | box AP75 | box AR100 |
69 | |:----:|:-------------:|:------------------------------------:|:----:|:----:|:----:|:----:|
70 | | 608 | YOLOv3-SPP-ultralytics | Fast NMS | 85.5 | 42.2 | 45.1 | 60.1 |
71 | | 608 | YOLOv3-SPP-ultralytics | Original NMS | 14.6 | 42.6 | 45.8 | 62.5 |
72 | | 608 | YOLOv3-SPP-ultralytics | DIoU-NMS | 7.9 | 42.7 | 46.2 | 63.4 |
73 | | 608 | YOLOv3-SPP-ultralytics | Original NMS Torchvision | **95.2** | 42.6 | 45.8 | 62.5 |
74 | | 608 | YOLOv3-SPP-ultralytics | Cluster-NMS | 82.6 | 42.6 | 45.8 | 62.5 |
75 | | 608 | YOLOv3-SPP-ultralytics | Cluster-DIoU-NMS | 76.9 | 42.7 | 46.2 | 63.4 |
76 | | 608 | YOLOv3-SPP-ultralytics | Weighted-NMS | 11.2 | 42.9 | 46.4 | 62.7 |
77 | | 608 | YOLOv3-SPP-ultralytics | Weighted Cluster-NMS | 68.0 | 42.9 | 46.4 | 62.7 |
78 | | 608 | YOLOv3-SPP-ultralytics | Weighted + Cluster-DIoU-NMS | 64.9 | **43.1** | **46.8** | **63.7** |
79 | | 608 | YOLOv3-SPP-ultralytics | Merge + Torchvision NMS | 88.5 | 42.8 | 46.3 | 63.0 |
80 | | 608 | YOLOv3-SPP-ultralytics | Merge + DIoU + Torchvision NMS | 82.5 | 43.0 | 46.6 | 63.2 |
81 | ## Conclusion
82 | 
83 | - Merge NMS is a simplified version of Weighted-NMS: it uses only the score vector to weight box coordinates, rather than combining score and IoU. (Refer to [CAD](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8265304) for the details of Weighted-NMS.)
84 | 
85 | - We further incorporate DIoU into NMS for YOLOv3, which yields higher AP and AR.
86 | 
87 | - Note that Torchvision NMS is the fastest, owing to its CUDA implementation and engineering accelerations (such as computing only the upper-triangular IoU matrix). However, our Cluster-NMS requires fewer iterations and can be further accelerated by the same engineering tricks. Glenn Jocher's Torchvision NMS + Merge was completed almost at the same time as our paper: first run Torchvision NMS, then convert its output to a vector that multiplies the IoU matrix (see the sketch after this list). Also, for Merge NMS the IoU matrix need not be square (`n*n`); it can be `m*n` to save more time, where `m` is the number of boxes that NMS outputs.
88 | 
89 | - Currently, Torchvision NMS uses IoU as its criterion, not DIoU. If we directly replace IoU with DIoU in Original NMS, it costs much more time due to the sequential operation. Cluster-DIoU-NMS significantly speeds up DIoU-NMS and obtains exactly the same result.
90 | 
91 | - Torchvision NMS is a function available in Torchvision >= 0.3, while our Cluster-NMS can be applied to any project that uses an older version of Torchvision, or to other deep learning frameworks, as long as matrix operations are available. **No extra imports, no compilation, fewer iterations, fully GPU-accelerated and better performance**.
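As a rough sketch of the Merge + Torchvision NMS idea above (assuming `xyxy` boxes; the repository's actual logic lives in `non_max_suppression` of [utils/utils.py](utils/utils.py)):

```python
import torch
from torchvision.ops import box_iou, nms  # Torchvision >= 0.3


def merge_nms(boxes, scores, iou_thr=0.5):
    # Illustrative sketch only: boxes are (n, 4) in xyxy format, scores are (n,)
    keep = nms(boxes, scores, iou_thr)               # m indices kept by standard NMS
    overlap = box_iou(boxes[keep], boxes) > iou_thr  # (m, n) mask: no square n*n matrix needed
    weights = overlap.float() * scores[None]         # the score vector weights the coordinates
    merged = weights.mm(boxes) / weights.sum(1, keepdim=True)  # score-weighted box average
    return merged, scores[keep]
```

Since every kept box overlaps itself with IoU 1, each row of `weights` has at least one nonzero entry, so the normalization is always well defined.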
92 | 93 | # Citation 94 | 95 | [![DOI](https://zenodo.org/badge/146165888.svg)](https://zenodo.org/badge/latestdoi/146165888) 96 | -------------------------------------------------------------------------------- /detect.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from sys import platform 3 | 4 | from models import * # set ONNX_EXPORT in models.py 5 | from utils.datasets import * 6 | from utils.utils import * 7 | 8 | 9 | def detect(save_img=False): 10 | img_size = (320, 192) if ONNX_EXPORT else opt.img_size # (320, 192) or (416, 256) or (608, 352) for (height, width) 11 | out, source, weights, half, view_img, save_txt = opt.output, opt.source, opt.weights, opt.half, opt.view_img, opt.save_txt 12 | webcam = source == '0' or source.startswith('rtsp') or source.startswith('http') or source.endswith('.txt') 13 | 14 | # Initialize 15 | device = torch_utils.select_device(device='cpu' if ONNX_EXPORT else opt.device) 16 | if os.path.exists(out): 17 | shutil.rmtree(out) # delete output folder 18 | os.makedirs(out) # make new output folder 19 | 20 | # Initialize model 21 | model = Darknet(opt.cfg, img_size) 22 | 23 | # Load weights 24 | attempt_download(weights) 25 | if weights.endswith('.pt'): # pytorch format 26 | model.load_state_dict(torch.load(weights, map_location=device)['model']) 27 | else: # darknet format 28 | load_darknet_weights(model, weights) 29 | 30 | # Second-stage classifier 31 | classify = False 32 | if classify: 33 | modelc = torch_utils.load_classifier(name='resnet101', n=2) # initialize 34 | modelc.load_state_dict(torch.load('weights/resnet101.pt', map_location=device)['model']) # load weights 35 | modelc.to(device).eval() 36 | 37 | # Eval mode 38 | model.to(device).eval() 39 | 40 | # Fuse Conv2d + BatchNorm2d layers 41 | # model.fuse() 42 | 43 | # Export mode 44 | if ONNX_EXPORT: 45 | model.fuse() 46 | img = torch.zeros((1, 3) + img_size) # (1, 3, 320, 192) 47 | f = opt.weights.replace(opt.weights.split('.')[-1], 'onnx') # *.onnx filename 48 | torch.onnx.export(model, img, f, verbose=False, opset_version=11, 49 | input_names=['images'], output_names=['classes', 'boxes']) 50 | 51 | # Validate exported model 52 | import onnx 53 | model = onnx.load(f) # Load the ONNX model 54 | onnx.checker.check_model(model) # Check that the IR is well formed 55 | print(onnx.helper.printable_graph(model.graph)) # Print a human readable representation of the graph 56 | return 57 | 58 | # Half precision 59 | half = half and device.type != 'cpu' # half precision only supported on CUDA 60 | if half: 61 | model.half() 62 | 63 | # Set Dataloader 64 | vid_path, vid_writer = None, None 65 | if webcam: 66 | view_img = True 67 | torch.backends.cudnn.benchmark = True # set True to speed up constant image size inference 68 | dataset = LoadStreams(source, img_size=img_size) 69 | else: 70 | save_img = True 71 | dataset = LoadImages(source, img_size=img_size) 72 | 73 | # Get names and colors 74 | names = load_classes(opt.names) 75 | colors = [[random.randint(0, 255) for _ in range(3)] for _ in range(len(names))] 76 | 77 | # Run inference 78 | t0 = time.time() 79 | img = torch.zeros((1, 3, img_size, img_size), device=device) # init img 80 | _ = model(img.half() if half else img.float()) if device.type != 'cpu' else None # run once 81 | for path, img, im0s, vid_cap in dataset: 82 | img = torch.from_numpy(img).to(device) 83 | img = img.half() if half else img.float() # uint8 to fp16/32 84 | img /= 255.0 # 0 - 255 to 0.0 - 1.0 85 | if img.ndimension() 
== 3: 86 | img = img.unsqueeze(0) 87 | 88 | # Inference 89 | t1 = torch_utils.time_synchronized() 90 | pred = model(img, augment=opt.augment)[0] 91 | t2 = torch_utils.time_synchronized() 92 | 93 | # to float 94 | if half: 95 | pred = pred.float() 96 | 97 | # Apply NMS 98 | pred = non_max_suppression(pred, opt.conf_thres, opt.iou_thres, 99 | multi_label=False, classes=opt.classes, agnostic=opt.agnostic_nms) 100 | 101 | # Apply Classifier 102 | if classify: 103 | pred = apply_classifier(pred, modelc, img, im0s) 104 | 105 | # Process detections 106 | for i, det in enumerate(pred): # detections per image 107 | if webcam: # batch_size >= 1 108 | p, s, im0 = path[i], '%g: ' % i, im0s[i] 109 | else: 110 | p, s, im0 = path, '', im0s 111 | 112 | save_path = str(Path(out) / Path(p).name) 113 | s += '%gx%g ' % img.shape[2:] # print string 114 | if det is not None and len(det): 115 | # Rescale boxes from img_size to im0 size 116 | det[:, :4] = scale_coords(img.shape[2:], det[:, :4], im0.shape).round() 117 | 118 | # Print results 119 | for c in det[:, -1].unique(): 120 | n = (det[:, -1] == c).sum() # detections per class 121 | s += '%g %ss, ' % (n, names[int(c)]) # add to string 122 | 123 | # Write results 124 | for *xyxy, conf, cls in det: 125 | if save_txt: # Write to file 126 | with open(save_path + '.txt', 'a') as file: 127 | file.write(('%g ' * 6 + '\n') % (*xyxy, cls, conf)) 128 | 129 | if save_img or view_img: # Add bbox to image 130 | label = '%s %.2f' % (names[int(cls)], conf) 131 | plot_one_box(xyxy, im0, label=label, color=colors[int(cls)]) 132 | 133 | # Print time (inference + NMS) 134 | print('%sDone. (%.3fs)' % (s, t2 - t1)) 135 | 136 | # Stream results 137 | if view_img: 138 | cv2.imshow(p, im0) 139 | if cv2.waitKey(1) == ord('q'): # q to quit 140 | raise StopIteration 141 | 142 | # Save results (image with detections) 143 | if save_img: 144 | if dataset.mode == 'images': 145 | cv2.imwrite(save_path, im0) 146 | else: 147 | if vid_path != save_path: # new video 148 | vid_path = save_path 149 | if isinstance(vid_writer, cv2.VideoWriter): 150 | vid_writer.release() # release previous video writer 151 | 152 | fps = vid_cap.get(cv2.CAP_PROP_FPS) 153 | w = int(vid_cap.get(cv2.CAP_PROP_FRAME_WIDTH)) 154 | h = int(vid_cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) 155 | vid_writer = cv2.VideoWriter(save_path, cv2.VideoWriter_fourcc(*opt.fourcc), fps, (w, h)) 156 | vid_writer.write(im0) 157 | 158 | if save_txt or save_img: 159 | print('Results saved to %s' % os.getcwd() + os.sep + out) 160 | if platform == 'darwin': # MacOS 161 | os.system('open ' + save_path) 162 | 163 | print('Done. 
(%.3fs)' % (time.time() - t0))
164 | 
165 | 
166 | if __name__ == '__main__':
167 |     parser = argparse.ArgumentParser()
168 |     parser.add_argument('--cfg', type=str, default='cfg/yolov3-spp.cfg', help='*.cfg path')
169 |     parser.add_argument('--names', type=str, default='data/coco.names', help='*.names path')
170 |     parser.add_argument('--weights', type=str, default='weights/yolov3-spp-ultralytics.pt', help='weights path')
171 |     parser.add_argument('--source', type=str, default='data/samples', help='source')  # input file/folder, 0 for webcam
172 |     parser.add_argument('--output', type=str, default='output', help='output folder')  # output folder
173 |     parser.add_argument('--img-size', type=int, default=512, help='inference size (pixels)')
174 |     parser.add_argument('--conf-thres', type=float, default=0.3, help='object confidence threshold')
175 |     parser.add_argument('--iou-thres', type=float, default=0.6, help='IOU threshold for NMS')
176 |     parser.add_argument('--fourcc', type=str, default='mp4v', help='output video codec (verify ffmpeg support)')
177 |     parser.add_argument('--half', action='store_true', help='half precision FP16 inference')
178 |     parser.add_argument('--device', default='', help='device id (i.e. 0 or 0,1) or cpu')
179 |     parser.add_argument('--view-img', action='store_true', help='display results')
180 |     parser.add_argument('--save-txt', action='store_true', help='save results to *.txt')
181 |     parser.add_argument('--classes', nargs='+', type=int, help='filter by class')
182 |     parser.add_argument('--agnostic-nms', action='store_true', help='class-agnostic NMS')
183 |     parser.add_argument('--augment', action='store_true', help='augmented inference')
184 |     opt = parser.parse_args()
185 |     print(opt)
186 | 
187 |     with torch.no_grad():
188 |         detect()
189 | 
--------------------------------------------------------------------------------
/utils/torch_utils.py:
--------------------------------------------------------------------------------
1 | import math
2 | import os
3 | import time
4 | from copy import deepcopy
5 | 
6 | import torch
7 | import torch.backends.cudnn as cudnn
8 | import torch.nn as nn
9 | import torch.nn.functional as F
10 | 
11 | 
12 | def init_seeds(seed=0):
13 |     torch.manual_seed(seed)
14 | 
15 |     # Remove randomness (may be slower on Tesla GPUs)  # https://pytorch.org/docs/stable/notes/randomness.html
16 |     if seed == 0:
17 |         cudnn.deterministic = True
18 |         cudnn.benchmark = False
19 | 
20 | 
21 | def select_device(device='', apex=False, batch_size=None):
22 |     # device = 'cpu' or '0' or '0,1,2,3'
23 |     cpu_request = device.lower() == 'cpu'
24 |     if device and not cpu_request:  # if device requested other than 'cpu'
25 |         os.environ['CUDA_VISIBLE_DEVICES'] = device  # set environment variable
26 |         assert torch.cuda.is_available(), 'CUDA unavailable, invalid device %s requested' % device  # check availability
27 | 
28 |     cuda = False if cpu_request else torch.cuda.is_available()
29 |     if cuda:
30 |         c = 1024 ** 2  # bytes to MB
31 |         ng = torch.cuda.device_count()
32 |         if ng > 1 and batch_size:  # check that batch_size is compatible with device_count
33 |             assert batch_size % ng == 0, 'batch-size %g not multiple of GPU count %g' % (batch_size, ng)
34 |         x = [torch.cuda.get_device_properties(i) for i in range(ng)]
35 |         s = 'Using CUDA ' + ('Apex ' if apex else '')  # apex for mixed precision https://github.com/NVIDIA/apex
36 |         for i in range(0, ng):
37 |             if i == 1:
38 |                 s = ' ' * len(s)
39 |             print("%sdevice%g _CudaDeviceProperties(name='%s', total_memory=%dMB)" %
40 |                   (s, i, x[i].name,
x[i].total_memory / c)) 41 | else: 42 | print('Using CPU') 43 | 44 | print('') # skip a line 45 | return torch.device('cuda:0' if cuda else 'cpu') 46 | 47 | 48 | def time_synchronized(): 49 | torch.cuda.synchronize() if torch.cuda.is_available() else None 50 | return time.time() 51 | 52 | 53 | def initialize_weights(model): 54 | for m in model.modules(): 55 | t = type(m) 56 | if t is nn.Conv2d: 57 | pass # nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 58 | elif t is nn.BatchNorm2d: 59 | m.eps = 1e-4 60 | m.momentum = 0.03 61 | elif t in [nn.LeakyReLU, nn.ReLU, nn.ReLU6]: 62 | m.inplace = True 63 | 64 | 65 | def find_modules(model, mclass=nn.Conv2d): 66 | # finds layer indices matching module class 'mclass' 67 | return [i for i, m in enumerate(model.module_list) if isinstance(m, mclass)] 68 | 69 | 70 | def fuse_conv_and_bn(conv, bn): 71 | # https://tehnokv.com/posts/fusing-batchnorm-and-conv/ 72 | with torch.no_grad(): 73 | # init 74 | fusedconv = torch.nn.Conv2d(conv.in_channels, 75 | conv.out_channels, 76 | kernel_size=conv.kernel_size, 77 | stride=conv.stride, 78 | padding=conv.padding, 79 | bias=True) 80 | 81 | # prepare filters 82 | w_conv = conv.weight.clone().view(conv.out_channels, -1) 83 | w_bn = torch.diag(bn.weight.div(torch.sqrt(bn.eps + bn.running_var))) 84 | fusedconv.weight.copy_(torch.mm(w_bn, w_conv).view(fusedconv.weight.size())) 85 | 86 | # prepare spatial bias 87 | if conv.bias is not None: 88 | b_conv = conv.bias 89 | else: 90 | b_conv = torch.zeros(conv.weight.size(0)) 91 | b_bn = bn.bias - bn.weight.mul(bn.running_mean).div(torch.sqrt(bn.running_var + bn.eps)) 92 | fusedconv.bias.copy_(torch.mm(w_bn, b_conv.reshape(-1, 1)).reshape(-1) + b_bn) 93 | 94 | return fusedconv 95 | 96 | 97 | def model_info(model, verbose=False): 98 | # Plots a line-by-line description of a PyTorch model 99 | n_p = sum(x.numel() for x in model.parameters()) # number parameters 100 | n_g = sum(x.numel() for x in model.parameters() if x.requires_grad) # number gradients 101 | if verbose: 102 | print('%5s %40s %9s %12s %20s %10s %10s' % ('layer', 'name', 'gradient', 'parameters', 'shape', 'mu', 'sigma')) 103 | for i, (name, p) in enumerate(model.named_parameters()): 104 | name = name.replace('module_list.', '') 105 | print('%5g %40s %9s %12g %20s %10.3g %10.3g' % 106 | (i, name, p.requires_grad, p.numel(), list(p.shape), p.mean(), p.std())) 107 | 108 | try: # FLOPS 109 | from thop import profile 110 | macs, _ = profile(model, inputs=(torch.zeros(1, 3, 480, 640),), verbose=False) 111 | fs = ', %.1f GFLOPS' % (macs / 1E9 * 2) 112 | except: 113 | fs = '' 114 | 115 | print('Model Summary: %g layers, %g parameters, %g gradients%s' % (len(list(model.parameters())), n_p, n_g, fs)) 116 | 117 | 118 | def load_classifier(name='resnet101', n=2): 119 | # Loads a pretrained model reshaped to n-class output 120 | import pretrainedmodels # https://github.com/Cadene/pretrained-models.pytorch#torchvision 121 | model = pretrainedmodels.__dict__[name](num_classes=1000, pretrained='imagenet') 122 | 123 | # Display model properties 124 | for x in ['model.input_size', 'model.input_space', 'model.input_range', 'model.mean', 'model.std']: 125 | print(x + ' =', eval(x)) 126 | 127 | # Reshape output to n classes 128 | filters = model.last_linear.weight.shape[1] 129 | model.last_linear.bias = torch.nn.Parameter(torch.zeros(n)) 130 | model.last_linear.weight = torch.nn.Parameter(torch.zeros(n, filters)) 131 | model.last_linear.out_features = n 132 | return model 133 | 134 | 135 | def scale_img(img, 
ratio=1.0, same_shape=True):  # img(16,3,256,416), r=ratio
136 |     # scales img(bs,3,y,x) by ratio
137 |     h, w = img.shape[2:]
138 |     s = (int(h * ratio), int(w * ratio))  # new size
139 |     img = F.interpolate(img, size=s, mode='bilinear', align_corners=False)  # resize
140 |     if not same_shape:  # pad/crop img
141 |         gs = 64  # (pixels) grid size
142 |         h, w = [math.ceil(x * ratio / gs) * gs for x in (h, w)]
143 |     return F.pad(img, [0, w - s[1], 0, h - s[0]], value=0.447)  # value = imagenet mean
144 | 
145 | 
146 | class ModelEMA:
147 |     """ Model Exponential Moving Average from https://github.com/rwightman/pytorch-image-models
148 |     Keeps a moving average of everything in the model state_dict (parameters and buffers).
149 |     This is intended to allow functionality like
150 |     https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage
151 |     A smoothed version of the weights is necessary for some training schemes to perform well.
152 |     E.g. Google's hyper-params for training MNASNet, MobileNet-V3, EfficientNet, etc. that use
153 |     RMSprop with a short 2.4-3 epoch decay period and a slow LR decay rate of .96-.99 require EMA
154 |     smoothing of weights to match results. Pay attention to the decay constant you are using
155 |     relative to your update count per epoch.
156 |     To keep EMA from using GPU resources, set device='cpu'. This will save a bit of memory but
157 |     disable validation of the EMA weights. Validation will have to be done manually in a separate
158 |     process, or after training stops converging.
159 |     This class is sensitive to where it is initialized in the sequence of model init,
160 |     GPU assignment and distributed training wrappers.
161 |     I've tested with the sequence in my own train.py for torch.DataParallel, apex.DDP, and single-GPU.
162 |     """
163 | 
164 |     def __init__(self, model, decay=0.9999, device=''):
165 |         # make a copy of the model for accumulating the moving average of weights
166 |         self.ema = deepcopy(model)
167 |         self.ema.eval()
168 |         self.updates = 0  # number of EMA updates
169 |         self.decay = lambda x: decay * (1 - math.exp(-x / 2000))  # decay exponential ramp (to help early epochs)
170 |         self.device = device  # perform ema on a different device from model if set
171 |         if device:
172 |             self.ema.to(device=device)
173 |         for p in self.ema.parameters():
174 |             p.requires_grad_(False)
175 | 
176 |     def update(self, model):
177 |         self.updates += 1
178 |         d = self.decay(self.updates)
179 |         with torch.no_grad():
180 |             if type(model) in (nn.parallel.DataParallel, nn.parallel.DistributedDataParallel):
181 |                 msd, esd = model.module.state_dict(), self.ema.module.state_dict()
182 |             else:
183 |                 msd, esd = model.state_dict(), self.ema.state_dict()
184 | 
185 |             for k, v in esd.items():
186 |                 if v.dtype.is_floating_point:
187 |                     v *= d
188 |                     v += (1. - d) * msd[k].detach()
189 | 
190 |     def update_attr(self, model):
191 |         # Assign attributes (which may change during training)
192 |         for k in model.__dict__.keys():
193 |             if not k.startswith('_'):
194 |                 setattr(self.ema, k, getattr(model, k))
195 | 
--------------------------------------------------------------------------------
/utils/adabound.py:
--------------------------------------------------------------------------------
1 | import math
2 | 
3 | import torch
4 | from torch.optim.optimizer import Optimizer
5 | 
6 | 
7 | class AdaBound(Optimizer):
8 |     """Implements AdaBound algorithm.
9 |     It has been proposed in `Adaptive Gradient Methods with Dynamic Bound of Learning Rate`_.
10 | Arguments: 11 | params (iterable): iterable of parameters to optimize or dicts defining 12 | parameter groups 13 | lr (float, optional): Adam learning rate (default: 1e-3) 14 | betas (Tuple[float, float], optional): coefficients used for computing 15 | running averages of gradient and its square (default: (0.9, 0.999)) 16 | final_lr (float, optional): final (SGD) learning rate (default: 0.1) 17 | gamma (float, optional): convergence speed of the bound functions (default: 1e-3) 18 | eps (float, optional): term added to the denominator to improve 19 | numerical stability (default: 1e-8) 20 | weight_decay (float, optional): weight decay (L2 penalty) (default: 0) 21 | amsbound (boolean, optional): whether to use the AMSBound variant of this algorithm 22 | .. Adaptive Gradient Methods with Dynamic Bound of Learning Rate: 23 | https://openreview.net/forum?id=Bkg3g2R9FX 24 | """ 25 | 26 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), final_lr=0.1, gamma=1e-3, 27 | eps=1e-8, weight_decay=0, amsbound=False): 28 | if not 0.0 <= lr: 29 | raise ValueError("Invalid learning rate: {}".format(lr)) 30 | if not 0.0 <= eps: 31 | raise ValueError("Invalid epsilon value: {}".format(eps)) 32 | if not 0.0 <= betas[0] < 1.0: 33 | raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) 34 | if not 0.0 <= betas[1] < 1.0: 35 | raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) 36 | if not 0.0 <= final_lr: 37 | raise ValueError("Invalid final learning rate: {}".format(final_lr)) 38 | if not 0.0 <= gamma < 1.0: 39 | raise ValueError("Invalid gamma parameter: {}".format(gamma)) 40 | defaults = dict(lr=lr, betas=betas, final_lr=final_lr, gamma=gamma, eps=eps, 41 | weight_decay=weight_decay, amsbound=amsbound) 42 | super(AdaBound, self).__init__(params, defaults) 43 | 44 | self.base_lrs = list(map(lambda group: group['lr'], self.param_groups)) 45 | 46 | def __setstate__(self, state): 47 | super(AdaBound, self).__setstate__(state) 48 | for group in self.param_groups: 49 | group.setdefault('amsbound', False) 50 | 51 | def step(self, closure=None): 52 | """Performs a single optimization step. 53 | Arguments: 54 | closure (callable, optional): A closure that reevaluates the model 55 | and returns the loss. 56 | """ 57 | loss = None 58 | if closure is not None: 59 | loss = closure() 60 | 61 | for group, base_lr in zip(self.param_groups, self.base_lrs): 62 | for p in group['params']: 63 | if p.grad is None: 64 | continue 65 | grad = p.grad.data 66 | if grad.is_sparse: 67 | raise RuntimeError( 68 | 'Adam does not support sparse gradients, please consider SparseAdam instead') 69 | amsbound = group['amsbound'] 70 | 71 | state = self.state[p] 72 | 73 | # State initialization 74 | if len(state) == 0: 75 | state['step'] = 0 76 | # Exponential moving average of gradient values 77 | state['exp_avg'] = torch.zeros_like(p.data) 78 | # Exponential moving average of squared gradient values 79 | state['exp_avg_sq'] = torch.zeros_like(p.data) 80 | if amsbound: 81 | # Maintains max of all exp. moving avg. of sq. grad. 
values 82 | state['max_exp_avg_sq'] = torch.zeros_like(p.data) 83 | 84 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 85 | if amsbound: 86 | max_exp_avg_sq = state['max_exp_avg_sq'] 87 | beta1, beta2 = group['betas'] 88 | 89 | state['step'] += 1 90 | 91 | if group['weight_decay'] != 0: 92 | grad = grad.add(group['weight_decay'], p.data) 93 | 94 | # Decay the first and second moment running average coefficient 95 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 96 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 97 | if amsbound: 98 | # Maintains the maximum of all 2nd moment running avg. till now 99 | torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq) 100 | # Use the max. for normalizing running avg. of gradient 101 | denom = max_exp_avg_sq.sqrt().add_(group['eps']) 102 | else: 103 | denom = exp_avg_sq.sqrt().add_(group['eps']) 104 | 105 | bias_correction1 = 1 - beta1 ** state['step'] 106 | bias_correction2 = 1 - beta2 ** state['step'] 107 | step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1 108 | 109 | # Applies bounds on actual learning rate 110 | # lr_scheduler cannot affect final_lr, this is a workaround to apply lr decay 111 | final_lr = group['final_lr'] * group['lr'] / base_lr 112 | lower_bound = final_lr * (1 - 1 / (group['gamma'] * state['step'] + 1)) 113 | upper_bound = final_lr * (1 + 1 / (group['gamma'] * state['step'])) 114 | step_size = torch.full_like(denom, step_size) 115 | step_size.div_(denom).clamp_(lower_bound, upper_bound).mul_(exp_avg) 116 | 117 | p.data.add_(-step_size) 118 | 119 | return loss 120 | 121 | 122 | class AdaBoundW(Optimizer): 123 | """Implements AdaBound algorithm with Decoupled Weight Decay (arxiv.org/abs/1711.05101) 124 | It has been proposed in `Adaptive Gradient Methods with Dynamic Bound of Learning Rate`_. 125 | Arguments: 126 | params (iterable): iterable of parameters to optimize or dicts defining 127 | parameter groups 128 | lr (float, optional): Adam learning rate (default: 1e-3) 129 | betas (Tuple[float, float], optional): coefficients used for computing 130 | running averages of gradient and its square (default: (0.9, 0.999)) 131 | final_lr (float, optional): final (SGD) learning rate (default: 0.1) 132 | gamma (float, optional): convergence speed of the bound functions (default: 1e-3) 133 | eps (float, optional): term added to the denominator to improve 134 | numerical stability (default: 1e-8) 135 | weight_decay (float, optional): weight decay (L2 penalty) (default: 0) 136 | amsbound (boolean, optional): whether to use the AMSBound variant of this algorithm 137 | .. 
Adaptive Gradient Methods with Dynamic Bound of Learning Rate: 138 | https://openreview.net/forum?id=Bkg3g2R9FX 139 | """ 140 | 141 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), final_lr=0.1, gamma=1e-3, 142 | eps=1e-8, weight_decay=0, amsbound=False): 143 | if not 0.0 <= lr: 144 | raise ValueError("Invalid learning rate: {}".format(lr)) 145 | if not 0.0 <= eps: 146 | raise ValueError("Invalid epsilon value: {}".format(eps)) 147 | if not 0.0 <= betas[0] < 1.0: 148 | raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) 149 | if not 0.0 <= betas[1] < 1.0: 150 | raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) 151 | if not 0.0 <= final_lr: 152 | raise ValueError("Invalid final learning rate: {}".format(final_lr)) 153 | if not 0.0 <= gamma < 1.0: 154 | raise ValueError("Invalid gamma parameter: {}".format(gamma)) 155 | defaults = dict(lr=lr, betas=betas, final_lr=final_lr, gamma=gamma, eps=eps, 156 | weight_decay=weight_decay, amsbound=amsbound) 157 | super(AdaBoundW, self).__init__(params, defaults) 158 | 159 | self.base_lrs = list(map(lambda group: group['lr'], self.param_groups)) 160 | 161 | def __setstate__(self, state): 162 | super(AdaBoundW, self).__setstate__(state) 163 | for group in self.param_groups: 164 | group.setdefault('amsbound', False) 165 | 166 | def step(self, closure=None): 167 | """Performs a single optimization step. 168 | Arguments: 169 | closure (callable, optional): A closure that reevaluates the model 170 | and returns the loss. 171 | """ 172 | loss = None 173 | if closure is not None: 174 | loss = closure() 175 | 176 | for group, base_lr in zip(self.param_groups, self.base_lrs): 177 | for p in group['params']: 178 | if p.grad is None: 179 | continue 180 | grad = p.grad.data 181 | if grad.is_sparse: 182 | raise RuntimeError( 183 | 'Adam does not support sparse gradients, please consider SparseAdam instead') 184 | amsbound = group['amsbound'] 185 | 186 | state = self.state[p] 187 | 188 | # State initialization 189 | if len(state) == 0: 190 | state['step'] = 0 191 | # Exponential moving average of gradient values 192 | state['exp_avg'] = torch.zeros_like(p.data) 193 | # Exponential moving average of squared gradient values 194 | state['exp_avg_sq'] = torch.zeros_like(p.data) 195 | if amsbound: 196 | # Maintains max of all exp. moving avg. of sq. grad. values 197 | state['max_exp_avg_sq'] = torch.zeros_like(p.data) 198 | 199 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 200 | if amsbound: 201 | max_exp_avg_sq = state['max_exp_avg_sq'] 202 | beta1, beta2 = group['betas'] 203 | 204 | state['step'] += 1 205 | 206 | # Decay the first and second moment running average coefficient 207 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 208 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 209 | if amsbound: 210 | # Maintains the maximum of all 2nd moment running avg. till now 211 | torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq) 212 | # Use the max. for normalizing running avg. 
of gradient 213 | denom = max_exp_avg_sq.sqrt().add_(group['eps']) 214 | else: 215 | denom = exp_avg_sq.sqrt().add_(group['eps']) 216 | 217 | bias_correction1 = 1 - beta1 ** state['step'] 218 | bias_correction2 = 1 - beta2 ** state['step'] 219 | step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1 220 | 221 | # Applies bounds on actual learning rate 222 | # lr_scheduler cannot affect final_lr, this is a workaround to apply lr decay 223 | final_lr = group['final_lr'] * group['lr'] / base_lr 224 | lower_bound = final_lr * (1 - 1 / (group['gamma'] * state['step'] + 1)) 225 | upper_bound = final_lr * (1 + 1 / (group['gamma'] * state['step'])) 226 | step_size = torch.full_like(denom, step_size) 227 | step_size.div_(denom).clamp_(lower_bound, upper_bound).mul_(exp_avg) 228 | 229 | if group['weight_decay'] != 0: 230 | decayed_weights = torch.mul(p.data, group['weight_decay']) 231 | p.data.add_(-step_size) 232 | p.data.sub_(decayed_weights) 233 | else: 234 | p.data.add_(-step_size) 235 | 236 | return loss 237 | -------------------------------------------------------------------------------- /cfg/yolov3-1cls.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | #batch=1 4 | #subdivisions=1 5 | # Training 6 | batch=16 7 | subdivisions=1 8 | width=416 9 | height=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 | activation=leaky 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=32 46 | size=1 47 | stride=1 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=leaky 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [shortcut] 110 | from=-3 111 | activation=linear 112 | 113 | # Downsample 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=256 118 | size=3 119 | stride=2 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=128 126 | size=1 127 | stride=1 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 
149 | activation=leaky 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | activation=leaky 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 188 | pad=1 189 | activation=leaky 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | [convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | stride=1 229 | pad=1 230 | activation=leaky 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=leaky 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | activation=leaky 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | stride=1 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [shortcut] 332 | from=-3 333 | activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=leaky 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=leaky 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | 
size=3 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=leaky 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [shortcut] 394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | activation=leaky 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [shortcut] 415 | from=-3 416 | activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | [convolutional] 576 | batch_normalize=1 577 | size=3 578 | stride=1 579 | pad=1 580 | filters=1024 581 | activation=leaky 582 | 583 | [convolutional] 584 | batch_normalize=1 585 | filters=512 586 | 
size=1 587 | stride=1 588 | pad=1 589 | activation=leaky 590 | 591 | [convolutional] 592 | batch_normalize=1 593 | size=3 594 | stride=1 595 | pad=1 596 | filters=1024 597 | activation=leaky 598 | 599 | [convolutional] 600 | size=1 601 | stride=1 602 | pad=1 603 | filters=18 604 | activation=linear 605 | 606 | 607 | [yolo] 608 | mask = 6,7,8 609 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 610 | classes=1 611 | num=9 612 | jitter=.3 613 | ignore_thresh = .7 614 | truth_thresh = 1 615 | random=1 616 | 617 | 618 | [route] 619 | layers = -4 620 | 621 | [convolutional] 622 | batch_normalize=1 623 | filters=256 624 | size=1 625 | stride=1 626 | pad=1 627 | activation=leaky 628 | 629 | [upsample] 630 | stride=2 631 | 632 | [route] 633 | layers = -1, 61 634 | 635 | 636 | 637 | [convolutional] 638 | batch_normalize=1 639 | filters=256 640 | size=1 641 | stride=1 642 | pad=1 643 | activation=leaky 644 | 645 | [convolutional] 646 | batch_normalize=1 647 | size=3 648 | stride=1 649 | pad=1 650 | filters=512 651 | activation=leaky 652 | 653 | [convolutional] 654 | batch_normalize=1 655 | filters=256 656 | size=1 657 | stride=1 658 | pad=1 659 | activation=leaky 660 | 661 | [convolutional] 662 | batch_normalize=1 663 | size=3 664 | stride=1 665 | pad=1 666 | filters=512 667 | activation=leaky 668 | 669 | [convolutional] 670 | batch_normalize=1 671 | filters=256 672 | size=1 673 | stride=1 674 | pad=1 675 | activation=leaky 676 | 677 | [convolutional] 678 | batch_normalize=1 679 | size=3 680 | stride=1 681 | pad=1 682 | filters=512 683 | activation=leaky 684 | 685 | [convolutional] 686 | size=1 687 | stride=1 688 | pad=1 689 | filters=18 690 | activation=linear 691 | 692 | 693 | [yolo] 694 | mask = 3,4,5 695 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 696 | classes=1 697 | num=9 698 | jitter=.3 699 | ignore_thresh = .7 700 | truth_thresh = 1 701 | random=1 702 | 703 | 704 | 705 | [route] 706 | layers = -4 707 | 708 | [convolutional] 709 | batch_normalize=1 710 | filters=128 711 | size=1 712 | stride=1 713 | pad=1 714 | activation=leaky 715 | 716 | [upsample] 717 | stride=2 718 | 719 | [route] 720 | layers = -1, 36 721 | 722 | 723 | 724 | [convolutional] 725 | batch_normalize=1 726 | filters=128 727 | size=1 728 | stride=1 729 | pad=1 730 | activation=leaky 731 | 732 | [convolutional] 733 | batch_normalize=1 734 | size=3 735 | stride=1 736 | pad=1 737 | filters=256 738 | activation=leaky 739 | 740 | [convolutional] 741 | batch_normalize=1 742 | filters=128 743 | size=1 744 | stride=1 745 | pad=1 746 | activation=leaky 747 | 748 | [convolutional] 749 | batch_normalize=1 750 | size=3 751 | stride=1 752 | pad=1 753 | filters=256 754 | activation=leaky 755 | 756 | [convolutional] 757 | batch_normalize=1 758 | filters=128 759 | size=1 760 | stride=1 761 | pad=1 762 | activation=leaky 763 | 764 | [convolutional] 765 | batch_normalize=1 766 | size=3 767 | stride=1 768 | pad=1 769 | filters=256 770 | activation=leaky 771 | 772 | [convolutional] 773 | size=1 774 | stride=1 775 | pad=1 776 | filters=18 777 | activation=linear 778 | 779 | 780 | [yolo] 781 | mask = 0,1,2 782 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 783 | classes=1 784 | num=9 785 | jitter=.3 786 | ignore_thresh = .7 787 | truth_thresh = 1 788 | random=1 789 | -------------------------------------------------------------------------------- /cfg/yolov3.cfg: -------------------------------------------------------------------------------- 1 | 
[net] 2 | # Testing 3 | #batch=1 4 | #subdivisions=1 5 | # Training 6 | batch=16 7 | subdivisions=1 8 | width=416 9 | height=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 | activation=leaky 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=32 46 | size=1 47 | stride=1 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=leaky 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [shortcut] 110 | from=-3 111 | activation=linear 112 | 113 | # Downsample 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=256 118 | size=3 119 | stride=2 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=128 126 | size=1 127 | stride=1 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 149 | activation=leaky 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | activation=leaky 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 188 | pad=1 189 | activation=leaky 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | 
[convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | stride=1 229 | pad=1 230 | activation=leaky 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=leaky 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | activation=leaky 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | stride=1 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [shortcut] 332 | from=-3 333 | activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=leaky 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=leaky 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | size=3 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=leaky 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [shortcut] 394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | activation=leaky 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [shortcut] 415 | from=-3 416 | activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | 
pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | [convolutional] 576 | batch_normalize=1 577 | size=3 578 | stride=1 579 | pad=1 580 | filters=1024 581 | activation=leaky 582 | 583 | [convolutional] 584 | batch_normalize=1 585 | filters=512 586 | size=1 587 | stride=1 588 | pad=1 589 | activation=leaky 590 | 591 | [convolutional] 592 | batch_normalize=1 593 | size=3 594 | stride=1 595 | pad=1 596 | filters=1024 597 | activation=leaky 598 | 599 | [convolutional] 600 | size=1 601 | stride=1 602 | pad=1 603 | filters=255 604 | activation=linear 605 | 606 | 607 | [yolo] 608 | mask = 6,7,8 609 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 610 | classes=80 611 | num=9 612 | jitter=.3 613 | ignore_thresh = .7 614 | truth_thresh = 1 615 | random=1 616 | 617 | 618 | [route] 619 | layers = -4 620 | 621 | [convolutional] 622 | batch_normalize=1 623 | filters=256 624 | size=1 625 | stride=1 626 | pad=1 627 | activation=leaky 628 | 629 | [upsample] 630 | stride=2 631 | 632 | [route] 633 | layers = -1, 61 634 | 635 | 636 | 637 | [convolutional] 638 | batch_normalize=1 639 | filters=256 640 | size=1 641 | stride=1 642 | pad=1 643 | activation=leaky 644 | 645 | [convolutional] 646 | batch_normalize=1 647 | size=3 648 | stride=1 649 | pad=1 650 | filters=512 651 | activation=leaky 652 | 653 | [convolutional] 654 | batch_normalize=1 655 | filters=256 656 | size=1 657 | stride=1 658 | pad=1 659 | activation=leaky 660 | 661 
| [convolutional] 662 | batch_normalize=1 663 | size=3 664 | stride=1 665 | pad=1 666 | filters=512 667 | activation=leaky 668 | 669 | [convolutional] 670 | batch_normalize=1 671 | filters=256 672 | size=1 673 | stride=1 674 | pad=1 675 | activation=leaky 676 | 677 | [convolutional] 678 | batch_normalize=1 679 | size=3 680 | stride=1 681 | pad=1 682 | filters=512 683 | activation=leaky 684 | 685 | [convolutional] 686 | size=1 687 | stride=1 688 | pad=1 689 | filters=255 690 | activation=linear 691 | 692 | 693 | [yolo] 694 | mask = 3,4,5 695 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 696 | classes=80 697 | num=9 698 | jitter=.3 699 | ignore_thresh = .7 700 | truth_thresh = 1 701 | random=1 702 | 703 | 704 | 705 | [route] 706 | layers = -4 707 | 708 | [convolutional] 709 | batch_normalize=1 710 | filters=128 711 | size=1 712 | stride=1 713 | pad=1 714 | activation=leaky 715 | 716 | [upsample] 717 | stride=2 718 | 719 | [route] 720 | layers = -1, 36 721 | 722 | 723 | 724 | [convolutional] 725 | batch_normalize=1 726 | filters=128 727 | size=1 728 | stride=1 729 | pad=1 730 | activation=leaky 731 | 732 | [convolutional] 733 | batch_normalize=1 734 | size=3 735 | stride=1 736 | pad=1 737 | filters=256 738 | activation=leaky 739 | 740 | [convolutional] 741 | batch_normalize=1 742 | filters=128 743 | size=1 744 | stride=1 745 | pad=1 746 | activation=leaky 747 | 748 | [convolutional] 749 | batch_normalize=1 750 | size=3 751 | stride=1 752 | pad=1 753 | filters=256 754 | activation=leaky 755 | 756 | [convolutional] 757 | batch_normalize=1 758 | filters=128 759 | size=1 760 | stride=1 761 | pad=1 762 | activation=leaky 763 | 764 | [convolutional] 765 | batch_normalize=1 766 | size=3 767 | stride=1 768 | pad=1 769 | filters=256 770 | activation=leaky 771 | 772 | [convolutional] 773 | size=1 774 | stride=1 775 | pad=1 776 | filters=255 777 | activation=linear 778 | 779 | 780 | [yolo] 781 | mask = 0,1,2 782 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 783 | classes=80 784 | num=9 785 | jitter=.3 786 | ignore_thresh = .7 787 | truth_thresh = 1 788 | random=1 789 | -------------------------------------------------------------------------------- /cfg/yolov3-spp-1cls.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | # batch=1 4 | # subdivisions=1 5 | # Training 6 | batch=64 7 | subdivisions=16 8 | width=608 9 | height=608 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=100 20 | max_batches = 5000 21 | policy=steps 22 | steps=4000,4500 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 | activation=leaky 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=32 46 | size=1 47 | stride=1 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=leaky 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | 
pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [shortcut] 110 | from=-3 111 | activation=linear 112 | 113 | # Downsample 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=256 118 | size=3 119 | stride=2 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=128 126 | size=1 127 | stride=1 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 149 | activation=leaky 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | activation=leaky 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 188 | pad=1 189 | activation=leaky 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | [convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | stride=1 229 | pad=1 230 | activation=leaky 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=leaky 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | activation=leaky 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | 
stride=1 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [shortcut] 332 | from=-3 333 | activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=leaky 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=leaky 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | size=3 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=leaky 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [shortcut] 394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | activation=leaky 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [shortcut] 415 | from=-3 416 | activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | 
batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | ### SPP ### 576 | [maxpool] 577 | stride=1 578 | size=5 579 | 580 | [route] 581 | layers=-2 582 | 583 | [maxpool] 584 | stride=1 585 | size=9 586 | 587 | [route] 588 | layers=-4 589 | 590 | [maxpool] 591 | stride=1 592 | size=13 593 | 594 | [route] 595 | layers=-1,-3,-5,-6 596 | 597 | ### End SPP ### 598 | 599 | [convolutional] 600 | batch_normalize=1 601 | filters=512 602 | size=1 603 | stride=1 604 | pad=1 605 | activation=leaky 606 | 607 | 608 | [convolutional] 609 | batch_normalize=1 610 | size=3 611 | stride=1 612 | pad=1 613 | filters=1024 614 | activation=leaky 615 | 616 | [convolutional] 617 | batch_normalize=1 618 | filters=512 619 | size=1 620 | stride=1 621 | pad=1 622 | activation=leaky 623 | 624 | [convolutional] 625 | batch_normalize=1 626 | size=3 627 | stride=1 628 | pad=1 629 | filters=1024 630 | activation=leaky 631 | 632 | [convolutional] 633 | size=1 634 | stride=1 635 | pad=1 636 | filters=18 637 | activation=linear 638 | 639 | 640 | [yolo] 641 | mask = 6,7,8 642 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 643 | classes=1 644 | num=9 645 | jitter=.3 646 | ignore_thresh = .7 647 | truth_thresh = 1 648 | random=1 649 | 650 | 651 | [route] 652 | layers = -4 653 | 654 | [convolutional] 655 | batch_normalize=1 656 | filters=256 657 | size=1 658 | stride=1 659 | pad=1 660 | activation=leaky 661 | 662 | [upsample] 663 | stride=2 664 | 665 | [route] 666 | layers = -1, 61 667 | 668 | 669 | 670 | [convolutional] 671 | batch_normalize=1 672 | filters=256 673 | size=1 674 | stride=1 675 | pad=1 676 | activation=leaky 677 | 678 | [convolutional] 679 | batch_normalize=1 680 | size=3 681 | stride=1 682 | pad=1 683 | filters=512 684 | activation=leaky 685 | 686 | [convolutional] 687 | batch_normalize=1 688 | filters=256 689 | size=1 690 | stride=1 691 | pad=1 692 | activation=leaky 693 | 694 | [convolutional] 695 | batch_normalize=1 696 | size=3 697 | stride=1 698 | pad=1 699 | filters=512 700 | activation=leaky 701 | 702 | [convolutional] 703 | batch_normalize=1 704 | filters=256 705 | size=1 706 | stride=1 707 | pad=1 708 | activation=leaky 709 | 710 | [convolutional] 711 | batch_normalize=1 712 | size=3 713 | stride=1 714 | pad=1 715 | filters=512 716 | activation=leaky 717 | 718 | [convolutional] 719 | size=1 720 | stride=1 721 | pad=1 722 | filters=18 723 | activation=linear 724 | 725 | 726 | [yolo] 727 | mask = 3,4,5 728 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 729 | classes=1 730 | num=9 731 | jitter=.3 732 | ignore_thresh = .7 733 | truth_thresh = 1 734 | random=1 735 | 736 | 737 | 738 
| [route] 739 | layers = -4 740 | 741 | [convolutional] 742 | batch_normalize=1 743 | filters=128 744 | size=1 745 | stride=1 746 | pad=1 747 | activation=leaky 748 | 749 | [upsample] 750 | stride=2 751 | 752 | [route] 753 | layers = -1, 36 754 | 755 | 756 | 757 | [convolutional] 758 | batch_normalize=1 759 | filters=128 760 | size=1 761 | stride=1 762 | pad=1 763 | activation=leaky 764 | 765 | [convolutional] 766 | batch_normalize=1 767 | size=3 768 | stride=1 769 | pad=1 770 | filters=256 771 | activation=leaky 772 | 773 | [convolutional] 774 | batch_normalize=1 775 | filters=128 776 | size=1 777 | stride=1 778 | pad=1 779 | activation=leaky 780 | 781 | [convolutional] 782 | batch_normalize=1 783 | size=3 784 | stride=1 785 | pad=1 786 | filters=256 787 | activation=leaky 788 | 789 | [convolutional] 790 | batch_normalize=1 791 | filters=128 792 | size=1 793 | stride=1 794 | pad=1 795 | activation=leaky 796 | 797 | [convolutional] 798 | batch_normalize=1 799 | size=3 800 | stride=1 801 | pad=1 802 | filters=256 803 | activation=leaky 804 | 805 | [convolutional] 806 | size=1 807 | stride=1 808 | pad=1 809 | filters=18 810 | activation=linear 811 | 812 | 813 | [yolo] 814 | mask = 0,1,2 815 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 816 | classes=1 817 | num=9 818 | jitter=.3 819 | ignore_thresh = .7 820 | truth_thresh = 1 821 | random=1 822 | -------------------------------------------------------------------------------- /cfg/yolov3-spp-3cls.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | # batch=1 4 | # subdivisions=1 5 | # Training 6 | batch=64 7 | subdivisions=16 8 | width=608 9 | height=608 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=100 20 | max_batches = 5000 21 | policy=steps 22 | steps=4000,4500 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 | activation=leaky 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=32 46 | size=1 47 | stride=1 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=leaky 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [shortcut] 110 | from=-3 111 | activation=linear 112 | 113 | # Downsample 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=256 118 | size=3 119 | stride=2 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=128 126 | 
size=1 127 | stride=1 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 149 | activation=leaky 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | activation=leaky 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 188 | pad=1 189 | activation=leaky 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | [convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | stride=1 229 | pad=1 230 | activation=leaky 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=leaky 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | activation=leaky 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | stride=1 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [shortcut] 332 | from=-3 333 | activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=leaky 343 | 344 | [convolutional] 345 | batch_normalize=1 
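
# Note: the key=value sections in this file are read by utils/parse_config.py;
# a minimal sketch of such a parser (illustrative only, not the repo's exact
# code) is below. Darknet-style parsers skip '#' lines and blanks, so this
# comment block is harmless here.
#
# def parse_cfg(path):
#     sections = []
#     with open(path) as f:
#         for line in f:
#             line = line.strip()
#             if not line or line.startswith('#'):
#                 continue                                  # skip blanks/comments
#             if line.startswith('['):
#                 sections.append({'type': line[1:-1].strip()})  # new section
#             else:
#                 key, value = line.split('=', 1)
#                 sections[-1][key.strip()] = value.strip()
#     return sections
#
# parse_cfg('cfg/yolov3-spp-3cls.cfg')[0]['type'] == 'net'
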
346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=leaky 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | size=3 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=leaky 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [shortcut] 394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | activation=leaky 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [shortcut] 415 | from=-3 416 | activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | 
activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | ### SPP ### 576 | [maxpool] 577 | stride=1 578 | size=5 579 | 580 | [route] 581 | layers=-2 582 | 583 | [maxpool] 584 | stride=1 585 | size=9 586 | 587 | [route] 588 | layers=-4 589 | 590 | [maxpool] 591 | stride=1 592 | size=13 593 | 594 | [route] 595 | layers=-1,-3,-5,-6 596 | 597 | ### End SPP ### 598 | 599 | [convolutional] 600 | batch_normalize=1 601 | filters=512 602 | size=1 603 | stride=1 604 | pad=1 605 | activation=leaky 606 | 607 | 608 | [convolutional] 609 | batch_normalize=1 610 | size=3 611 | stride=1 612 | pad=1 613 | filters=1024 614 | activation=leaky 615 | 616 | [convolutional] 617 | batch_normalize=1 618 | filters=512 619 | size=1 620 | stride=1 621 | pad=1 622 | activation=leaky 623 | 624 | [convolutional] 625 | batch_normalize=1 626 | size=3 627 | stride=1 628 | pad=1 629 | filters=1024 630 | activation=leaky 631 | 632 | [convolutional] 633 | size=1 634 | stride=1 635 | pad=1 636 | filters=24 637 | activation=linear 638 | 639 | 640 | [yolo] 641 | mask = 6,7,8 642 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 643 | classes=3 644 | num=9 645 | jitter=.3 646 | ignore_thresh = .7 647 | truth_thresh = 1 648 | random=1 649 | 650 | 651 | [route] 652 | layers = -4 653 | 654 | [convolutional] 655 | batch_normalize=1 656 | filters=256 657 | size=1 658 | stride=1 659 | pad=1 660 | activation=leaky 661 | 662 | [upsample] 663 | stride=2 664 | 665 | [route] 666 | layers = -1, 61 667 | 668 | 669 | 670 | [convolutional] 671 | batch_normalize=1 672 | filters=256 673 | size=1 674 | stride=1 675 | pad=1 676 | activation=leaky 677 | 678 | [convolutional] 679 | batch_normalize=1 680 | size=3 681 | stride=1 682 | pad=1 683 | filters=512 684 | activation=leaky 685 | 686 | [convolutional] 687 | batch_normalize=1 688 | filters=256 689 | size=1 690 | stride=1 691 | pad=1 692 | activation=leaky 693 | 694 | [convolutional] 695 | batch_normalize=1 696 | size=3 697 | stride=1 698 | pad=1 699 | filters=512 700 | activation=leaky 701 | 702 | [convolutional] 703 | batch_normalize=1 704 | filters=256 705 | size=1 706 | stride=1 707 | pad=1 708 | activation=leaky 709 | 710 | [convolutional] 711 | batch_normalize=1 712 | size=3 713 | stride=1 714 | pad=1 715 | filters=512 716 | activation=leaky 717 | 718 | [convolutional] 719 | size=1 720 | stride=1 721 | pad=1 722 | filters=24 723 | activation=linear 724 | 725 | 726 | [yolo] 727 | mask = 3,4,5 728 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 729 | classes=3 730 | num=9 731 | jitter=.3 732 | ignore_thresh = .7 733 | truth_thresh = 1 734 | random=1 735 | 736 | 737 | 738 | [route] 739 | layers = -4 740 | 741 | [convolutional] 742 | batch_normalize=1 743 | filters=128 744 | size=1 745 | stride=1 746 | pad=1 747 | activation=leaky 748 | 749 | [upsample] 750 | stride=2 751 | 752 | [route] 753 | layers = -1, 36 754 | 755 | 756 | 757 | [convolutional] 758 | batch_normalize=1 759 | filters=128 760 | size=1 761 | stride=1 762 | pad=1 763 | activation=leaky 764 | 765 | [convolutional] 766 | batch_normalize=1 767 | size=3 768 | stride=1 769 | pad=1 770 | filters=256 771 | activation=leaky 772 | 773 | [convolutional] 774 | batch_normalize=1 775 | filters=128 776 | size=1 777 | stride=1 778 | pad=1 779 | activation=leaky 780 | 781 | [convolutional] 782 | batch_normalize=1 783 | size=3 784 | stride=1 785 | pad=1 786 | filters=256 787 | 
activation=leaky 788 | 789 | [convolutional] 790 | batch_normalize=1 791 | filters=128 792 | size=1 793 | stride=1 794 | pad=1 795 | activation=leaky 796 | 797 | [convolutional] 798 | batch_normalize=1 799 | size=3 800 | stride=1 801 | pad=1 802 | filters=256 803 | activation=leaky 804 | 805 | [convolutional] 806 | size=1 807 | stride=1 808 | pad=1 809 | filters=24 810 | activation=linear 811 | 812 | 813 | [yolo] 814 | mask = 0,1,2 815 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 816 | classes=3 817 | num=9 818 | jitter=.3 819 | ignore_thresh = .7 820 | truth_thresh = 1 821 | random=1 822 | -------------------------------------------------------------------------------- /cfg/yolov3-spp.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | # batch=1 4 | # subdivisions=1 5 | # Training 6 | batch=64 7 | subdivisions=16 8 | width=608 9 | height=608 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 | activation=leaky 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=32 46 | size=1 47 | stride=1 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=leaky 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [shortcut] 110 | from=-3 111 | activation=linear 112 | 113 | # Downsample 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=256 118 | size=3 119 | stride=2 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=128 126 | size=1 127 | stride=1 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 149 | activation=leaky 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | activation=leaky 170 | 171 | [convolutional] 172 | batch_normalize=1 173 
| filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 188 | pad=1 189 | activation=leaky 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | [convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | stride=1 229 | pad=1 230 | activation=leaky 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=leaky 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | activation=leaky 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | stride=1 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [shortcut] 332 | from=-3 333 | activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=leaky 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=leaky 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | size=3 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=leaky 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [shortcut] 
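
# Note: [shortcut] adds the output of an earlier layer element-wise
# (from=-3 means three layers back), while [route] concatenates saved
# outputs along the channel dimension. A hedged PyTorch sketch of the
# usual interpretation (not the repo's models.py):
#
# import torch
# outputs = []                            # per-layer outputs, in order
#
# def shortcut(x, frm=-3):                # [shortcut] from=-3
#     return x + outputs[frm]             # element-wise residual add
#
# def route(idxs):                        # [route] layers=-1,61
#     return torch.cat([outputs[i] for i in idxs], dim=1)
#
# Python's negative indexing matches darknet's relative indices; positive
# values index absolute layer numbers, as in layers=61.
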
394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | activation=leaky 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [shortcut] 415 | from=-3 416 | activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | ### SPP ### 576 | [maxpool] 577 | stride=1 578 | size=5 579 | 580 | [route] 581 | layers=-2 582 | 583 | [maxpool] 584 | stride=1 585 | size=9 586 | 587 | [route] 588 | layers=-4 589 | 590 | [maxpool] 591 | stride=1 592 | size=13 593 | 594 | [route] 595 | layers=-1,-3,-5,-6 596 | 597 | ### End SPP ### 598 | 599 | [convolutional] 600 | batch_normalize=1 601 | filters=512 602 | size=1 603 | stride=1 604 | pad=1 605 | activation=leaky 606 | 607 | 608 | [convolutional] 609 | batch_normalize=1 610 | size=3 611 | stride=1 612 | pad=1 613 | filters=1024 614 | activation=leaky 615 | 616 | [convolutional] 
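
# Note: the ### SPP ### block above pools the same 512-channel map with
# stride-1 max-pools of size 5, 9 and 13 and concatenates them with the
# input ([route] layers=-1,-3,-5,-6), giving 2048 channels that the next
# 1x1 convolution reduces back to 512. A hedged PyTorch sketch:
#
# import torch
# import torch.nn as nn
#
# class SPP(nn.Module):
#     def __init__(self, kernels=(5, 9, 13)):
#         super().__init__()
#         self.pools = nn.ModuleList(
#             nn.MaxPool2d(k, stride=1, padding=k // 2) for k in kernels)
#
#     def forward(self, x):
#         # stride 1 plus same-style padding keeps H and W, so all four
#         # tensors concatenate cleanly along channels
#         return torch.cat([x] + [p(x) for p in self.pools], dim=1)
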
617 | batch_normalize=1 618 | filters=512 619 | size=1 620 | stride=1 621 | pad=1 622 | activation=leaky 623 | 624 | [convolutional] 625 | batch_normalize=1 626 | size=3 627 | stride=1 628 | pad=1 629 | filters=1024 630 | activation=leaky 631 | 632 | [convolutional] 633 | size=1 634 | stride=1 635 | pad=1 636 | filters=255 637 | activation=linear 638 | 639 | 640 | [yolo] 641 | mask = 6,7,8 642 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 643 | classes=80 644 | num=9 645 | jitter=.3 646 | ignore_thresh = .7 647 | truth_thresh = 1 648 | random=1 649 | 650 | 651 | [route] 652 | layers = -4 653 | 654 | [convolutional] 655 | batch_normalize=1 656 | filters=256 657 | size=1 658 | stride=1 659 | pad=1 660 | activation=leaky 661 | 662 | [upsample] 663 | stride=2 664 | 665 | [route] 666 | layers = -1, 61 667 | 668 | 669 | 670 | [convolutional] 671 | batch_normalize=1 672 | filters=256 673 | size=1 674 | stride=1 675 | pad=1 676 | activation=leaky 677 | 678 | [convolutional] 679 | batch_normalize=1 680 | size=3 681 | stride=1 682 | pad=1 683 | filters=512 684 | activation=leaky 685 | 686 | [convolutional] 687 | batch_normalize=1 688 | filters=256 689 | size=1 690 | stride=1 691 | pad=1 692 | activation=leaky 693 | 694 | [convolutional] 695 | batch_normalize=1 696 | size=3 697 | stride=1 698 | pad=1 699 | filters=512 700 | activation=leaky 701 | 702 | [convolutional] 703 | batch_normalize=1 704 | filters=256 705 | size=1 706 | stride=1 707 | pad=1 708 | activation=leaky 709 | 710 | [convolutional] 711 | batch_normalize=1 712 | size=3 713 | stride=1 714 | pad=1 715 | filters=512 716 | activation=leaky 717 | 718 | [convolutional] 719 | size=1 720 | stride=1 721 | pad=1 722 | filters=255 723 | activation=linear 724 | 725 | 726 | [yolo] 727 | mask = 3,4,5 728 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 729 | classes=80 730 | num=9 731 | jitter=.3 732 | ignore_thresh = .7 733 | truth_thresh = 1 734 | random=1 735 | 736 | 737 | 738 | [route] 739 | layers = -4 740 | 741 | [convolutional] 742 | batch_normalize=1 743 | filters=128 744 | size=1 745 | stride=1 746 | pad=1 747 | activation=leaky 748 | 749 | [upsample] 750 | stride=2 751 | 752 | [route] 753 | layers = -1, 36 754 | 755 | 756 | 757 | [convolutional] 758 | batch_normalize=1 759 | filters=128 760 | size=1 761 | stride=1 762 | pad=1 763 | activation=leaky 764 | 765 | [convolutional] 766 | batch_normalize=1 767 | size=3 768 | stride=1 769 | pad=1 770 | filters=256 771 | activation=leaky 772 | 773 | [convolutional] 774 | batch_normalize=1 775 | filters=128 776 | size=1 777 | stride=1 778 | pad=1 779 | activation=leaky 780 | 781 | [convolutional] 782 | batch_normalize=1 783 | size=3 784 | stride=1 785 | pad=1 786 | filters=256 787 | activation=leaky 788 | 789 | [convolutional] 790 | batch_normalize=1 791 | filters=128 792 | size=1 793 | stride=1 794 | pad=1 795 | activation=leaky 796 | 797 | [convolutional] 798 | batch_normalize=1 799 | size=3 800 | stride=1 801 | pad=1 802 | filters=256 803 | activation=leaky 804 | 805 | [convolutional] 806 | size=1 807 | stride=1 808 | pad=1 809 | filters=255 810 | activation=linear 811 | 812 | 813 | [yolo] 814 | mask = 0,1,2 815 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 816 | classes=80 817 | num=9 818 | jitter=.3 819 | ignore_thresh = .7 820 | truth_thresh = 1 821 | random=1 822 | -------------------------------------------------------------------------------- /cfg/yolov3-asff.cfg: 
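
The detection heads in these cfgs all follow the same channel rule: the 1x1 convolution feeding each [yolo] layer has (classes + 5) * 3 filters, where 5 covers x, y, w, h and objectness, and 3 is the number of anchors per scale (len(mask)). A quick illustrative check in Python:

    def yolo_filters(classes, anchors_per_scale=3):
        return (classes + 5) * anchors_per_scale

    assert yolo_filters(80) == 255  # yolov3-spp.cfg above
    assert yolo_filters(3) == 24    # yolov3-spp-3cls.cfg
    assert yolo_filters(1) == 18    # yolov3-spp-1cls.cfg

The ASFF config that follows uses filters=258 instead; reading that as 255 plus three extra channels consumed by the ASFF fusion (the [yolo] layers' from=88,99,110 sources) is an assumption, not something stated in the file.
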
-------------------------------------------------------------------------------- 1 | # Generated by Glenn Jocher (glenn.jocher@ultralytics.com) for https://github.com/ultralytics/yolov3 2 | # def kmean_anchors(path='../coco/train2017.txt', n=12, img_size=(320, 640)): # from utils.utils import *; kmean_anchors() 3 | # Evolving anchors: 100%|██████████| 1000/1000 [41:15<00:00, 2.48s/it] 4 | # 0.20 iou_thr: 0.992 best possible recall, 4.25 anchors > thr 5 | # kmeans anchors (n=12, img_size=(320, 640), IoU=0.005/0.184/0.634-min/mean/best): 6,9, 15,16, 17,35, 37,26, 36,67, 63,42, 57,100, 121,81, 112,169, 241,158, 195,310, 426,359 6 | 7 | [net] 8 | # Testing 9 | # batch=1 10 | # subdivisions=1 11 | # Training 12 | batch=64 13 | subdivisions=16 14 | width=608 15 | height=608 16 | channels=3 17 | momentum=0.9 18 | decay=0.0005 19 | angle=0 20 | saturation = 1.5 21 | exposure = 1.5 22 | hue=.1 23 | 24 | learning_rate=0.001 25 | burn_in=1000 26 | max_batches = 500200 27 | policy=steps 28 | steps=400000,450000 29 | scales=.1,.1 30 | 31 | [convolutional] 32 | batch_normalize=1 33 | filters=32 34 | size=3 35 | stride=1 36 | pad=1 37 | activation=leaky 38 | 39 | # Downsample 40 | 41 | [convolutional] 42 | batch_normalize=1 43 | filters=64 44 | size=3 45 | stride=2 46 | pad=1 47 | activation=leaky 48 | 49 | [convolutional] 50 | batch_normalize=1 51 | filters=32 52 | size=1 53 | stride=1 54 | pad=1 55 | activation=leaky 56 | 57 | [convolutional] 58 | batch_normalize=1 59 | filters=64 60 | size=3 61 | stride=1 62 | pad=1 63 | activation=leaky 64 | 65 | [shortcut] 66 | from=-3 67 | activation=linear 68 | 69 | # Downsample 70 | 71 | [convolutional] 72 | batch_normalize=1 73 | filters=128 74 | size=3 75 | stride=2 76 | pad=1 77 | activation=leaky 78 | 79 | [convolutional] 80 | batch_normalize=1 81 | filters=64 82 | size=1 83 | stride=1 84 | pad=1 85 | activation=leaky 86 | 87 | [convolutional] 88 | batch_normalize=1 89 | filters=128 90 | size=3 91 | stride=1 92 | pad=1 93 | activation=leaky 94 | 95 | [shortcut] 96 | from=-3 97 | activation=linear 98 | 99 | [convolutional] 100 | batch_normalize=1 101 | filters=64 102 | size=1 103 | stride=1 104 | pad=1 105 | activation=leaky 106 | 107 | [convolutional] 108 | batch_normalize=1 109 | filters=128 110 | size=3 111 | stride=1 112 | pad=1 113 | activation=leaky 114 | 115 | [shortcut] 116 | from=-3 117 | activation=linear 118 | 119 | # Downsample 120 | 121 | [convolutional] 122 | batch_normalize=1 123 | filters=256 124 | size=3 125 | stride=2 126 | pad=1 127 | activation=leaky 128 | 129 | [convolutional] 130 | batch_normalize=1 131 | filters=128 132 | size=1 133 | stride=1 134 | pad=1 135 | activation=leaky 136 | 137 | [convolutional] 138 | batch_normalize=1 139 | filters=256 140 | size=3 141 | stride=1 142 | pad=1 143 | activation=leaky 144 | 145 | [shortcut] 146 | from=-3 147 | activation=linear 148 | 149 | [convolutional] 150 | batch_normalize=1 151 | filters=128 152 | size=1 153 | stride=1 154 | pad=1 155 | activation=leaky 156 | 157 | [convolutional] 158 | batch_normalize=1 159 | filters=256 160 | size=3 161 | stride=1 162 | pad=1 163 | activation=leaky 164 | 165 | [shortcut] 166 | from=-3 167 | activation=linear 168 | 169 | [convolutional] 170 | batch_normalize=1 171 | filters=128 172 | size=1 173 | stride=1 174 | pad=1 175 | activation=leaky 176 | 177 | [convolutional] 178 | batch_normalize=1 179 | filters=256 180 | size=3 181 | stride=1 182 | pad=1 183 | activation=leaky 184 | 185 | [shortcut] 186 | from=-3 187 | activation=linear 188 | 189 | [convolutional] 
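
# Note: the header above records a 12-anchor k-means run on COCO labels
# (utils.utils.kmean_anchors, followed by genetic evolution of the result).
# A hedged sketch of just the k-means step, assuming `wh` is an (N, 2)
# NumPy array of label widths/heights in pixels at training resolution:
#
# import numpy as np
# from scipy.cluster.vq import kmeans
#
# def kmeans_anchors(wh, n=12):
#     std = wh.std(0)                     # whiten, cluster, un-whiten
#     centroids, _ = kmeans(wh / std, n, iter=30)
#     anchors = centroids * std
#     return anchors[np.argsort(anchors.prod(1))]   # sort small to large
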
190 | batch_normalize=1 191 | filters=128 192 | size=1 193 | stride=1 194 | pad=1 195 | activation=leaky 196 | 197 | [convolutional] 198 | batch_normalize=1 199 | filters=256 200 | size=3 201 | stride=1 202 | pad=1 203 | activation=leaky 204 | 205 | [shortcut] 206 | from=-3 207 | activation=linear 208 | 209 | [convolutional] 210 | batch_normalize=1 211 | filters=128 212 | size=1 213 | stride=1 214 | pad=1 215 | activation=leaky 216 | 217 | [convolutional] 218 | batch_normalize=1 219 | filters=256 220 | size=3 221 | stride=1 222 | pad=1 223 | activation=leaky 224 | 225 | [shortcut] 226 | from=-3 227 | activation=linear 228 | 229 | [convolutional] 230 | batch_normalize=1 231 | filters=128 232 | size=1 233 | stride=1 234 | pad=1 235 | activation=leaky 236 | 237 | [convolutional] 238 | batch_normalize=1 239 | filters=256 240 | size=3 241 | stride=1 242 | pad=1 243 | activation=leaky 244 | 245 | [shortcut] 246 | from=-3 247 | activation=linear 248 | 249 | [convolutional] 250 | batch_normalize=1 251 | filters=128 252 | size=1 253 | stride=1 254 | pad=1 255 | activation=leaky 256 | 257 | [convolutional] 258 | batch_normalize=1 259 | filters=256 260 | size=3 261 | stride=1 262 | pad=1 263 | activation=leaky 264 | 265 | [shortcut] 266 | from=-3 267 | activation=linear 268 | 269 | [convolutional] 270 | batch_normalize=1 271 | filters=128 272 | size=1 273 | stride=1 274 | pad=1 275 | activation=leaky 276 | 277 | [convolutional] 278 | batch_normalize=1 279 | filters=256 280 | size=3 281 | stride=1 282 | pad=1 283 | activation=leaky 284 | 285 | [shortcut] 286 | from=-3 287 | activation=linear 288 | 289 | # Downsample 290 | 291 | [convolutional] 292 | batch_normalize=1 293 | filters=512 294 | size=3 295 | stride=2 296 | pad=1 297 | activation=leaky 298 | 299 | [convolutional] 300 | batch_normalize=1 301 | filters=256 302 | size=1 303 | stride=1 304 | pad=1 305 | activation=leaky 306 | 307 | [convolutional] 308 | batch_normalize=1 309 | filters=512 310 | size=3 311 | stride=1 312 | pad=1 313 | activation=leaky 314 | 315 | [shortcut] 316 | from=-3 317 | activation=linear 318 | 319 | [convolutional] 320 | batch_normalize=1 321 | filters=256 322 | size=1 323 | stride=1 324 | pad=1 325 | activation=leaky 326 | 327 | [convolutional] 328 | batch_normalize=1 329 | filters=512 330 | size=3 331 | stride=1 332 | pad=1 333 | activation=leaky 334 | 335 | [shortcut] 336 | from=-3 337 | activation=linear 338 | 339 | [convolutional] 340 | batch_normalize=1 341 | filters=256 342 | size=1 343 | stride=1 344 | pad=1 345 | activation=leaky 346 | 347 | [convolutional] 348 | batch_normalize=1 349 | filters=512 350 | size=3 351 | stride=1 352 | pad=1 353 | activation=leaky 354 | 355 | [shortcut] 356 | from=-3 357 | activation=linear 358 | 359 | [convolutional] 360 | batch_normalize=1 361 | filters=256 362 | size=1 363 | stride=1 364 | pad=1 365 | activation=leaky 366 | 367 | [convolutional] 368 | batch_normalize=1 369 | filters=512 370 | size=3 371 | stride=1 372 | pad=1 373 | activation=leaky 374 | 375 | [shortcut] 376 | from=-3 377 | activation=linear 378 | 379 | [convolutional] 380 | batch_normalize=1 381 | filters=256 382 | size=1 383 | stride=1 384 | pad=1 385 | activation=leaky 386 | 387 | [convolutional] 388 | batch_normalize=1 389 | filters=512 390 | size=3 391 | stride=1 392 | pad=1 393 | activation=leaky 394 | 395 | [shortcut] 396 | from=-3 397 | activation=linear 398 | 399 | [convolutional] 400 | batch_normalize=1 401 | filters=256 402 | size=1 403 | stride=1 404 | pad=1 405 | activation=leaky 406 | 407 | 
[convolutional] 408 | batch_normalize=1 409 | filters=512 410 | size=3 411 | stride=1 412 | pad=1 413 | activation=leaky 414 | 415 | [shortcut] 416 | from=-3 417 | activation=linear 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | # SPP -------------------------------------------------------------------------- 576 | [maxpool] 577 | stride=1 578 | size=5 579 | 580 | [route] 581 | layers=-2 582 | 583 | [maxpool] 584 | stride=1 585 | size=9 586 | 587 | [route] 588 | layers=-4 589 | 590 | [maxpool] 591 | stride=1 592 | size=13 593 | 594 | [route] 595 | layers=-1,-3,-5,-6 596 | # SPP -------------------------------------------------------------------------- 597 | 598 | [convolutional] 599 | batch_normalize=1 600 | filters=512 601 | size=1 602 | stride=1 603 | pad=1 604 | activation=leaky 605 | 606 | [convolutional] 607 | batch_normalize=1 608 | size=3 609 | stride=1 610 | pad=1 611 | filters=1024 612 | activation=leaky 613 | 614 | [convolutional] 615 | batch_normalize=1 616 | filters=512 617 | size=1 618 | stride=1 619 | 
pad=1 620 | activation=leaky 621 | 622 | [convolutional] 623 | batch_normalize=1 624 | size=3 625 | stride=1 626 | pad=1 627 | filters=1024 628 | activation=leaky 629 | 630 | [convolutional] 631 | size=1 632 | stride=1 633 | pad=1 634 | filters=258 635 | activation=linear 636 | 637 | # YOLO ------------------------------------------------------------------------- 638 | 639 | [route] 640 | layers = -3 641 | 642 | [convolutional] 643 | batch_normalize=1 644 | filters=256 645 | size=1 646 | stride=1 647 | pad=1 648 | activation=leaky 649 | 650 | [upsample] 651 | stride=2 652 | 653 | [route] 654 | layers = -1, 61 655 | 656 | [convolutional] 657 | batch_normalize=1 658 | filters=256 659 | size=1 660 | stride=1 661 | pad=1 662 | activation=leaky 663 | 664 | [convolutional] 665 | batch_normalize=1 666 | size=3 667 | stride=1 668 | pad=1 669 | filters=512 670 | activation=leaky 671 | 672 | [convolutional] 673 | batch_normalize=1 674 | filters=256 675 | size=1 676 | stride=1 677 | pad=1 678 | activation=leaky 679 | 680 | [convolutional] 681 | batch_normalize=1 682 | size=3 683 | stride=1 684 | pad=1 685 | filters=512 686 | activation=leaky 687 | 688 | [convolutional] 689 | batch_normalize=1 690 | filters=256 691 | size=1 692 | stride=1 693 | pad=1 694 | activation=leaky 695 | 696 | [convolutional] 697 | batch_normalize=1 698 | size=3 699 | stride=1 700 | pad=1 701 | filters=512 702 | activation=leaky 703 | 704 | [convolutional] 705 | size=1 706 | stride=1 707 | pad=1 708 | filters=258 709 | activation=linear 710 | 711 | # YOLO ------------------------------------------------------------------------- 712 | 713 | [route] 714 | layers = -3 715 | 716 | [convolutional] 717 | batch_normalize=1 718 | filters=128 719 | size=1 720 | stride=1 721 | pad=1 722 | activation=leaky 723 | 724 | [upsample] 725 | stride=2 726 | 727 | [route] 728 | layers = -1, 36 729 | 730 | [convolutional] 731 | batch_normalize=1 732 | filters=128 733 | size=1 734 | stride=1 735 | pad=1 736 | activation=leaky 737 | 738 | [convolutional] 739 | batch_normalize=1 740 | size=3 741 | stride=1 742 | pad=1 743 | filters=256 744 | activation=leaky 745 | 746 | [convolutional] 747 | batch_normalize=1 748 | filters=128 749 | size=1 750 | stride=1 751 | pad=1 752 | activation=leaky 753 | 754 | [convolutional] 755 | batch_normalize=1 756 | size=3 757 | stride=1 758 | pad=1 759 | filters=256 760 | activation=leaky 761 | 762 | [convolutional] 763 | batch_normalize=1 764 | filters=128 765 | size=1 766 | stride=1 767 | pad=1 768 | activation=leaky 769 | 770 | [convolutional] 771 | batch_normalize=1 772 | size=3 773 | stride=1 774 | pad=1 775 | filters=256 776 | activation=leaky 777 | 778 | [convolutional] 779 | size=1 780 | stride=1 781 | pad=1 782 | filters=258 783 | activation=linear 784 | 785 | [yolo] 786 | from=88,99,110 787 | mask = 6,7,8 788 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 789 | classes=80 790 | num=9 791 | 792 | [yolo] 793 | from=88,99,110 794 | mask = 3,4,5 795 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 796 | classes=80 797 | num=9 798 | 799 | [yolo] 800 | from=88,99,110 801 | mask = 0,1,2 802 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 803 | classes=80 804 | num=9 -------------------------------------------------------------------------------- /cfg/yolov3-spp3.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | batch=1 4 | subdivisions=1 5 | # Training 6 | # 
batch=64 7 | # subdivisions=16 8 | width=608 9 | height=608 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 120200 21 | policy=steps 22 | steps=70000,100000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 | activation=leaky 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=32 46 | size=1 47 | stride=1 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=leaky 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [shortcut] 110 | from=-3 111 | activation=linear 112 | 113 | # Downsample 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=256 118 | size=3 119 | stride=2 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=128 126 | size=1 127 | stride=1 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 149 | activation=leaky 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | activation=leaky 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 188 | pad=1 189 | activation=leaky 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | [convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | 
stride=1 229 | pad=1 230 | activation=leaky 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=leaky 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | activation=leaky 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | stride=1 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [shortcut] 332 | from=-3 333 | activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=leaky 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=leaky 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | size=3 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=leaky 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [shortcut] 394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | activation=leaky 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [shortcut] 415 | from=-3 416 | activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 
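
# Note: all three [yolo] layers in this file share the same nine anchor
# pairs; each layer's mask selects which three it predicts (6,7,8 at
# stride 32, 3,4,5 at stride 16, 0,1,2 at stride 8). Illustrative check:
#
# ANCHORS = [(10, 13), (16, 30), (33, 23), (30, 61), (62, 45), (59, 119),
#            (116, 90), (156, 198), (373, 326)]
#
# def scale_anchors(mask):
#     return [ANCHORS[i] for i in mask]
#
# assert scale_anchors([6, 7, 8]) == [(116, 90), (156, 198), (373, 326)]
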
449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | ### SPP ### 576 | [maxpool] 577 | stride=1 578 | size=5 579 | 580 | [route] 581 | layers=-2 582 | 583 | [maxpool] 584 | stride=1 585 | size=9 586 | 587 | [route] 588 | layers=-4 589 | 590 | [maxpool] 591 | stride=1 592 | size=13 593 | 594 | [route] 595 | layers=-1,-3,-5,-6 596 | 597 | ### End SPP ### 598 | 599 | [convolutional] 600 | batch_normalize=1 601 | filters=512 602 | size=1 603 | stride=1 604 | pad=1 605 | activation=leaky 606 | 607 | 608 | [convolutional] 609 | batch_normalize=1 610 | size=3 611 | stride=1 612 | pad=1 613 | filters=1024 614 | activation=leaky 615 | 616 | [convolutional] 617 | batch_normalize=1 618 | filters=512 619 | size=1 620 | stride=1 621 | pad=1 622 | activation=leaky 623 | 624 | [convolutional] 625 | batch_normalize=1 626 | size=3 627 | stride=1 628 | pad=1 629 | filters=1024 630 | activation=leaky 631 | 632 | [convolutional] 633 | size=1 634 | stride=1 635 | pad=1 636 | filters=255 637 | activation=linear 638 | 639 | 640 | [yolo] 641 | mask = 6,7,8 642 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 643 | classes=80 644 | num=9 645 | jitter=.3 646 | ignore_thresh = .7 647 | truth_thresh = 1 648 | random=1 649 | 650 | 651 | [route] 652 | layers = -4 653 | 654 | [convolutional] 655 | batch_normalize=1 656 | filters=256 657 | size=1 658 | stride=1 659 | pad=1 660 | activation=leaky 661 | 662 | [upsample] 663 | stride=2 664 | 665 | [route] 666 | layers = -1, 61 667 | 668 | 669 | 670 | [convolutional] 671 | 
batch_normalize=1 672 | filters=256 673 | size=1 674 | stride=1 675 | pad=1 676 | activation=leaky 677 | 678 | [convolutional] 679 | batch_normalize=1 680 | size=3 681 | stride=1 682 | pad=1 683 | filters=512 684 | activation=leaky 685 | 686 | ### SPP ### 687 | [maxpool] 688 | stride=1 689 | size=5 690 | 691 | [route] 692 | layers=-2 693 | 694 | [maxpool] 695 | stride=1 696 | size=9 697 | 698 | [route] 699 | layers=-4 700 | 701 | [maxpool] 702 | stride=1 703 | size=13 704 | 705 | [route] 706 | layers=-1,-3,-5,-6 707 | 708 | ### End SPP ### 709 | 710 | 711 | [convolutional] 712 | batch_normalize=1 713 | filters=256 714 | size=1 715 | stride=1 716 | pad=1 717 | activation=leaky 718 | 719 | [convolutional] 720 | batch_normalize=1 721 | size=3 722 | stride=1 723 | pad=1 724 | filters=512 725 | activation=leaky 726 | 727 | [convolutional] 728 | batch_normalize=1 729 | filters=256 730 | size=1 731 | stride=1 732 | pad=1 733 | activation=leaky 734 | 735 | [convolutional] 736 | batch_normalize=1 737 | size=3 738 | stride=1 739 | pad=1 740 | filters=512 741 | activation=leaky 742 | 743 | [convolutional] 744 | size=1 745 | stride=1 746 | pad=1 747 | filters=255 748 | activation=linear 749 | 750 | 751 | [yolo] 752 | mask = 3,4,5 753 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 754 | classes=80 755 | num=9 756 | jitter=.3 757 | ignore_thresh = .7 758 | truth_thresh = 1 759 | random=1 760 | 761 | 762 | 763 | [route] 764 | layers = -4 765 | 766 | [convolutional] 767 | batch_normalize=1 768 | filters=128 769 | size=1 770 | stride=1 771 | pad=1 772 | activation=leaky 773 | 774 | [upsample] 775 | stride=2 776 | 777 | [route] 778 | layers = -1, 36 779 | 780 | 781 | 782 | [convolutional] 783 | batch_normalize=1 784 | filters=128 785 | size=1 786 | stride=1 787 | pad=1 788 | activation=leaky 789 | 790 | [convolutional] 791 | batch_normalize=1 792 | size=3 793 | stride=1 794 | pad=1 795 | filters=256 796 | activation=leaky 797 | 798 | [convolutional] 799 | batch_normalize=1 800 | filters=128 801 | size=1 802 | stride=1 803 | pad=1 804 | activation=leaky 805 | 806 | ### SPP ### 807 | [maxpool] 808 | stride=1 809 | size=5 810 | 811 | [route] 812 | layers=-2 813 | 814 | [maxpool] 815 | stride=1 816 | size=9 817 | 818 | [route] 819 | layers=-4 820 | 821 | [maxpool] 822 | stride=1 823 | size=13 824 | 825 | [route] 826 | layers=-1,-3,-5,-6 827 | 828 | ### End SPP ### 829 | 830 | [convolutional] 831 | batch_normalize=1 832 | size=3 833 | stride=1 834 | pad=1 835 | filters=256 836 | activation=leaky 837 | 838 | [convolutional] 839 | batch_normalize=1 840 | filters=128 841 | size=1 842 | stride=1 843 | pad=1 844 | activation=leaky 845 | 846 | [convolutional] 847 | batch_normalize=1 848 | size=3 849 | stride=1 850 | pad=1 851 | filters=256 852 | activation=leaky 853 | 854 | [convolutional] 855 | size=1 856 | stride=1 857 | pad=1 858 | filters=255 859 | activation=linear 860 | 861 | 862 | [yolo] 863 | mask = 0,1,2 864 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 865 | classes=80 866 | num=9 867 | jitter=.3 868 | ignore_thresh = .7 869 | truth_thresh = 1 870 | random=1 871 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | from torch.utils.data import DataLoader 5 | 6 | from models import * 7 | from utils.datasets import * 8 | from utils.utils import * 9 | 10 | 11 | def test(cfg, 12 | 
data, 13 | weights=None, 14 | batch_size=16, 15 | img_size=416, 16 | conf_thres=0.001, 17 | iou_thres=0.6, # for nms 18 | save_json=False, 19 | single_cls=False, 20 | augment=False, 21 | model=None, 22 | dataloader=None): 23 | # Initialize/load model and set device 24 | if model is None: 25 | device = torch_utils.select_device(opt.device, batch_size=batch_size) 26 | verbose = opt.task == 'test' 27 | 28 | # Remove previous 29 | for f in glob.glob('test_batch*.png'): 30 | os.remove(f) 31 | 32 | # Initialize model 33 | model = Darknet(cfg, img_size) 34 | 35 | # Load weights 36 | attempt_download(weights) 37 | if weights.endswith('.pt'): # pytorch format 38 | model.load_state_dict(torch.load(weights, map_location=device)['model']) 39 | else: # darknet format 40 | load_darknet_weights(model, weights) 41 | 42 | # Fuse 43 | model.fuse() 44 | model.to(device) 45 | 46 | if device.type != 'cpu' and torch.cuda.device_count() > 1: 47 | model = nn.DataParallel(model) 48 | else: # called by train.py 49 | device = next(model.parameters()).device # get model device 50 | verbose = False 51 | 52 | # Configure run 53 | data = parse_data_cfg(data) 54 | nc = 1 if single_cls else int(data['classes']) # number of classes 55 | path = data['valid'] # path to test images 56 | names = load_classes(data['names']) # class names 57 | iouv = torch.linspace(0.5, 0.95, 10).to(device) # iou vector for mAP@0.5:0.95 58 | iouv = iouv[0].view(1) # comment out this line for mAP@0.5:0.95 59 | niou = iouv.numel() 60 | 61 | # Dataloader 62 | if dataloader is None: 63 | dataset = LoadImagesAndLabels(path, img_size, batch_size, rect=True, single_cls=single_cls) 64 | batch_size = min(batch_size, len(dataset)) 65 | dataloader = DataLoader(dataset, 66 | batch_size=batch_size, 67 | num_workers=min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8]), 68 | pin_memory=True, 69 | collate_fn=dataset.collate_fn) 70 | 71 | seen = 0 72 | model.eval() 73 | _ = model(torch.zeros((1, 3, img_size, img_size), device=device)) if device.type != 'cpu' else None # run once 74 | coco91class = coco80_to_coco91_class() 75 | s = ('%20s' + '%10s' * 6) % ('Class', 'Images', 'Targets', 'P', 'R', 'mAP@0.5', 'F1') 76 | p, r, f1, mp, mr, map, mf1, t0, t1 = 0., 0., 0., 0., 0., 0., 0., 0., 0.
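# Per-image evaluation below: run inference, apply NMS, then match predictions to
# targets class-by-class via IoU. Each image appends (correct, conf, pcls, tcls) to
# `stats`, where `correct` is a (num_predictions x niou) boolean matrix marking which
# predictions hit a target at each IoU threshold in `iouv`.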
77 | loss = torch.zeros(3, device=device) 78 | jdict, stats, ap, ap_class = [], [], [], [] 79 | for batch_i, (imgs, targets, paths, shapes) in enumerate(tqdm(dataloader, desc=s)): 80 | imgs = imgs.to(device).float() / 255.0 # uint8 to float32, 0 - 255 to 0.0 - 1.0 81 | targets = targets.to(device) 82 | nb, _, height, width = imgs.shape # batch size, channels, height, width 83 | whwh = torch.Tensor([width, height, width, height]).to(device) 84 | 85 | # Plot images with bounding boxes 86 | f = 'test_batch%g.png' % batch_i # filename 87 | if batch_i < 1 and not os.path.exists(f): 88 | plot_images(imgs=imgs, targets=targets, paths=paths, fname=f) 89 | 90 | # Disable gradients 91 | with torch.no_grad(): 92 | # Run model 93 | t = torch_utils.time_synchronized() 94 | inf_out, train_out = model(imgs, augment=augment) # inference and training outputs 95 | t0 += torch_utils.time_synchronized() - t 96 | 97 | # Compute loss 98 | if hasattr(model, 'hyp'): # if model has loss hyperparameters 99 | loss += compute_loss(train_out, targets, model)[1][:3] # GIoU, obj, cls 100 | 101 | # Run NMS 102 | t = torch_utils.time_synchronized() 103 | output = non_max_suppression(inf_out, conf_thres=conf_thres, iou_thres=iou_thres) # nms 104 | t1 += torch_utils.time_synchronized() - t 105 | 106 | # Statistics per image 107 | for si, pred in enumerate(output): 108 | labels = targets[targets[:, 0] == si, 1:] 109 | nl = len(labels) 110 | tcls = labels[:, 0].tolist() if nl else [] # target class 111 | seen += 1 112 | 113 | if pred is None: 114 | if nl: 115 | stats.append((torch.zeros(0, niou, dtype=torch.bool), torch.Tensor(), torch.Tensor(), tcls)) 116 | continue 117 | 118 | # Append to text file 119 | # with open('test.txt', 'a') as file: 120 | # [file.write('%11.5g' * 7 % tuple(x) + '\n') for x in pred] 121 | 122 | # Clip boxes to image bounds 123 | clip_coords(pred, (height, width)) 124 | 125 | # Append to pycocotools JSON dictionary 126 | if save_json: 127 | # [{"image_id": 42, "category_id": 18, "bbox": [258.15, 41.29, 348.26, 243.78], "score": 0.236}, ... 
128 | image_id = int(Path(paths[si]).stem.split('_')[-1]) 129 | box = pred[:, :4].clone() # xyxy 130 | scale_coords(imgs[si].shape[1:], box, shapes[si][0], shapes[si][1]) # to original shape 131 | box = xyxy2xywh(box) # xywh 132 | box[:, :2] -= box[:, 2:] / 2 # xy center to top-left corner 133 | for p, b in zip(pred.tolist(), box.tolist()): 134 | jdict.append({'image_id': image_id, 135 | 'category_id': coco91class[int(p[5])], 136 | 'bbox': [round(x, 3) for x in b], 137 | 'score': round(p[4], 5)}) 138 | 139 | # Assign all predictions as incorrect 140 | correct = torch.zeros(pred.shape[0], niou, dtype=torch.bool, device=device) 141 | if nl: 142 | detected = [] # target indices 143 | tcls_tensor = labels[:, 0] 144 | 145 | # target boxes 146 | tbox = xywh2xyxy(labels[:, 1:5]) * whwh 147 | 148 | # Per target class 149 | for cls in torch.unique(tcls_tensor): 150 | ti = (cls == tcls_tensor).nonzero().view(-1) # target indices 151 | pi = (cls == pred[:, 5]).nonzero().view(-1) # prediction indices 152 | 153 | # Search for detections 154 | if pi.shape[0]: 155 | # Prediction to target ious 156 | ious, i = box_iou(pred[pi, :4], tbox[ti]).max(1) # best ious, indices 157 | 158 | # Append detections 159 | for j in (ious > iouv[0]).nonzero(): 160 | d = ti[i[j]] # detected target 161 | if d not in detected: 162 | detected.append(d) 163 | correct[pi[j]] = ious[j] > iouv # iouv is 1xn 164 | if len(detected) == nl: # all targets already located in image 165 | break 166 | 167 | # Append statistics (correct, conf, pcls, tcls) 168 | stats.append((correct.cpu(), pred[:, 4].cpu(), pred[:, 5].cpu(), tcls)) 169 | 170 | # Compute statistics 171 | stats = [np.concatenate(x, 0) for x in zip(*stats)] # to numpy 172 | if len(stats): 173 | p, r, ap, f1, ap_class = ap_per_class(*stats) 174 | if niou > 1: 175 | p, r, ap, f1 = p[:, 0], r[:, 0], ap.mean(1), ap[:, 0] # [P, R, AP@0.5:0.95, AP@0.5] 176 | mp, mr, map, mf1 = p.mean(), r.mean(), ap.mean(), f1.mean() 177 | nt = np.bincount(stats[3].astype(np.int64), minlength=nc) # number of targets per class 178 | else: 179 | nt = torch.zeros(1) 180 | 181 | # Print results 182 | pf = '%20s' + '%10.3g' * 6 # print format 183 | print(pf % ('all', seen, nt.sum(), mp, mr, map, mf1)) 184 | 185 | # Print results per class 186 | if verbose and nc > 1 and len(stats): 187 | for i, c in enumerate(ap_class): 188 | print(pf % (names[c], seen, nt[c], p[i], r[i], ap[i], f1[i])) 189 | 190 | # Print speeds 191 | if verbose or save_json: 192 | t = tuple(x / seen * 1E3 for x in (t0, t1, t0 + t1)) + (img_size, img_size, batch_size) # tuple 193 | print('Speed: %.1f/%.1f/%.1f ms inference/NMS/total per %gx%g image at batch-size %g' % t) 194 | 195 | # Save JSON 196 | if save_json and map and len(jdict): 197 | print('\nCOCO mAP with pycocotools...') 198 | imgIds = [int(Path(x).stem.split('_')[-1]) for x in dataloader.dataset.img_files] 199 | with open('results.json', 'w') as file: 200 | json.dump(jdict, file) 201 | 202 | try: 203 | from pycocotools.coco import COCO 204 | from pycocotools.cocoeval import COCOeval 205 | 206 | # https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocoEvalDemo.ipynb 207 | cocoGt = COCO(glob.glob('/mnt/sda/yolact/data/coco/annotations/instances_val2014.json')[0]) # initialize COCO ground truth api 208 | cocoDt = cocoGt.loadRes('results.json') # initialize COCO pred api 209 | 210 | cocoEval = COCOeval(cocoGt, cocoDt, 'bbox') 211 | cocoEval.params.imgIds = imgIds # [:32] # only evaluate these images 212 | cocoEval.evaluate() 213 | cocoEval.accumulate() 214 | cocoEval.summarize() 215 | # mf1, map = cocoEval.stats[:2] # update to pycocotools results (mAP@0.5:0.95, mAP@0.5) 216 | except Exception as e: 217 | print('WARNING: pycocotools unable to compute official COCO mAP: %s. See requirements.txt.' % e) 218 | 219 | # Return results 220 | maps = np.zeros(nc) + map 221 | for i, c in enumerate(ap_class): 222 | maps[c] = ap[i] 223 | return (mp, mr, map, mf1, *(loss.cpu() / len(dataloader)).tolist()), maps 224 | 225 | 226 | if __name__ == '__main__': 227 | parser = argparse.ArgumentParser(prog='test.py') 228 | parser.add_argument('--cfg', type=str, default='cfg/yolov3-spp.cfg', help='*.cfg path') 229 | parser.add_argument('--data', type=str, default='data/coco2014.data', help='*.data path') 230 | parser.add_argument('--weights', type=str, default='weights/yolov3-spp-ultralytics.pt', help='weights path') 231 | parser.add_argument('--batch-size', type=int, default=4, help='size of each image batch') 232 | parser.add_argument('--img-size', type=int, default=608, help='inference size (pixels)') 233 | parser.add_argument('--conf-thres', type=float, default=0.001, help='object confidence threshold') 234 | parser.add_argument('--iou-thres', type=float, default=0.6, help='IOU threshold for NMS') 235 | parser.add_argument('--save-json', action='store_true', help='save a cocoapi-compatible JSON results file') 236 | parser.add_argument('--task', default='test', help="'test', 'study', 'benchmark'") 237 | parser.add_argument('--device', default='', help='device id (i.e. 0 or 0,1) or cpu')
238 | parser.add_argument('--single-cls', action='store_true', help='train as single-class dataset') 239 | parser.add_argument('--augment', action='store_true', help='augmented inference') 240 | opt = parser.parse_args() 241 | opt.save_json = opt.save_json or any([x in opt.data for x in ['coco.data', 'coco2014.data', 'coco2017.data']]) 242 | print(opt) 243 | 244 | # task = 'test', 'study', 'benchmark' 245 | if opt.task == 'test': # (default) test normally 246 | test(opt.cfg, 247 | opt.data, 248 | opt.weights, 249 | opt.batch_size, 250 | opt.img_size, 251 | opt.conf_thres, 252 | opt.iou_thres, 253 | opt.save_json, 254 | opt.single_cls, 255 | opt.augment) 256 | 257 | elif opt.task == 'benchmark': # mAPs at img-size 320-608 at iou-thres 0.5 and 0.7 258 | y = [] 259 | for i in [320, 416, 512, 608]: # img-size 260 | for j in [0.5, 0.7]: # iou-thres 261 | t = time.time() 262 | r = test(opt.cfg, opt.data, opt.weights, opt.batch_size, i, opt.conf_thres, j, opt.save_json)[0] 263 | y.append(r + (time.time() - t,)) 264 | np.savetxt('benchmark.txt', y, fmt='%10.4g') # y = np.loadtxt('benchmark.txt') 265 | 266 | elif opt.task == 'study': # Parameter study 267 | y = [] 268 | x = np.arange(0.4, 0.9, 0.05) # iou-thres 269 | for i in x: 270 | t = time.time() 271 | r = test(opt.cfg, opt.data, opt.weights, opt.batch_size, opt.img_size, opt.conf_thres, i, opt.save_json)[0] 272 | y.append(r + (time.time() - t,)) 273 | np.savetxt('study.txt', y, fmt='%10.4g') # y = np.loadtxt('study.txt') 274 | 275 | # Plot 276 | fig, ax = plt.subplots(3, 1, figsize=(6, 6)) 277 | y = np.stack(y, 0) 278 | ax[0].plot(x, y[:, 2], marker='.', label='mAP@0.5') 279 | ax[0].set_ylabel('mAP') 280 | ax[1].plot(x, y[:, 3], marker='.', label='mAP@0.5:0.95') 281 | ax[1].set_ylabel('mAP') 282 | ax[2].plot(x, y[:, -1], marker='.', label='time') 283 | ax[2].set_ylabel('time (s)') 284 | for i in range(3): 285 | ax[i].legend() 286 | ax[i].set_xlabel('iou_thr') 287 | fig.tight_layout() 288 | plt.savefig('study.jpg', dpi=200) 289 | -------------------------------------------------------------------------------- /cfg/yolov3-spp-pan-scale.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | #batch=1 4 | #subdivisions=1 5 | # Training 6 | batch=64 7 | subdivisions=32 8 | width=544 9 | height=544 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | 19 | learning_rate=0.001 20 | burn_in=1000 21 | max_batches = 10000 22 | 23 | policy=steps 24 | steps=8000,9000 25 | scales=.1,.1 26 | 27 | #policy=sgdr 28 | #sgdr_cycle=1000 29 | #sgdr_mult=2 30 | #steps=4000,6000,8000,9000 31 | #scales=1, 1, 0.1, 0.1 32 | 33 | [convolutional] 34 | batch_normalize=1 35 | filters=32 36 | size=3 37 | stride=1 38 | pad=1 39 | activation=leaky 40 | 41 | # Downsample 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=64 46 | size=3 47 | stride=2 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=32 54 | size=1 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [convolutional] 60 | batch_normalize=1 61 | filters=64 62 | size=3 63 | stride=1 64 | pad=1 65 | activation=leaky 66 | 67 | [shortcut] 68 | from=-3 69 | activation=linear 70 | 71 | # Downsample 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=128 76 | size=3 77 | stride=2 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=64 84 | size=1 85 | stride=1 86 | pad=1 87 |
activation=leaky 88 | 89 | [convolutional] 90 | batch_normalize=1 91 | filters=128 92 | size=3 93 | stride=1 94 | pad=1 95 | activation=leaky 96 | 97 | [shortcut] 98 | from=-3 99 | activation=linear 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=64 104 | size=1 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [convolutional] 110 | batch_normalize=1 111 | filters=128 112 | size=3 113 | stride=1 114 | pad=1 115 | activation=leaky 116 | 117 | [shortcut] 118 | from=-3 119 | activation=linear 120 | 121 | # Downsample 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=256 126 | size=3 127 | stride=2 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=128 134 | size=1 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [convolutional] 140 | batch_normalize=1 141 | filters=256 142 | size=3 143 | stride=1 144 | pad=1 145 | activation=leaky 146 | 147 | [shortcut] 148 | from=-3 149 | activation=linear 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=128 154 | size=1 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [convolutional] 160 | batch_normalize=1 161 | filters=256 162 | size=3 163 | stride=1 164 | pad=1 165 | activation=leaky 166 | 167 | [shortcut] 168 | from=-3 169 | activation=linear 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=128 174 | size=1 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [convolutional] 180 | batch_normalize=1 181 | filters=256 182 | size=3 183 | stride=1 184 | pad=1 185 | activation=leaky 186 | 187 | [shortcut] 188 | from=-3 189 | activation=linear 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=128 194 | size=1 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [convolutional] 200 | batch_normalize=1 201 | filters=256 202 | size=3 203 | stride=1 204 | pad=1 205 | activation=leaky 206 | 207 | [shortcut] 208 | from=-3 209 | activation=linear 210 | 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=128 215 | size=1 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [convolutional] 221 | batch_normalize=1 222 | filters=256 223 | size=3 224 | stride=1 225 | pad=1 226 | activation=leaky 227 | 228 | [shortcut] 229 | from=-3 230 | activation=linear 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=128 235 | size=1 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [convolutional] 241 | batch_normalize=1 242 | filters=256 243 | size=3 244 | stride=1 245 | pad=1 246 | activation=leaky 247 | 248 | [shortcut] 249 | from=-3 250 | activation=linear 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=128 255 | size=1 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [convolutional] 261 | batch_normalize=1 262 | filters=256 263 | size=3 264 | stride=1 265 | pad=1 266 | activation=leaky 267 | 268 | [shortcut] 269 | from=-3 270 | activation=linear 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=128 275 | size=1 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [convolutional] 281 | batch_normalize=1 282 | filters=256 283 | size=3 284 | stride=1 285 | pad=1 286 | activation=leaky 287 | 288 | [shortcut] 289 | from=-3 290 | activation=linear 291 | 292 | # Downsample 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=512 297 | size=3 298 | stride=2 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=256 305 | size=1 306 | 
stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [convolutional] 311 | batch_normalize=1 312 | filters=512 313 | size=3 314 | stride=1 315 | pad=1 316 | activation=leaky 317 | 318 | [shortcut] 319 | from=-3 320 | activation=linear 321 | 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=256 326 | size=1 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [convolutional] 332 | batch_normalize=1 333 | filters=512 334 | size=3 335 | stride=1 336 | pad=1 337 | activation=leaky 338 | 339 | [shortcut] 340 | from=-3 341 | activation=linear 342 | 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=256 347 | size=1 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [convolutional] 353 | batch_normalize=1 354 | filters=512 355 | size=3 356 | stride=1 357 | pad=1 358 | activation=leaky 359 | 360 | [shortcut] 361 | from=-3 362 | activation=linear 363 | 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=256 368 | size=1 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [convolutional] 374 | batch_normalize=1 375 | filters=512 376 | size=3 377 | stride=1 378 | pad=1 379 | activation=leaky 380 | 381 | [shortcut] 382 | from=-3 383 | activation=linear 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=256 388 | size=1 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [convolutional] 394 | batch_normalize=1 395 | filters=512 396 | size=3 397 | stride=1 398 | pad=1 399 | activation=leaky 400 | 401 | [shortcut] 402 | from=-3 403 | activation=linear 404 | 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=256 409 | size=1 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [convolutional] 415 | batch_normalize=1 416 | filters=512 417 | size=3 418 | stride=1 419 | pad=1 420 | activation=leaky 421 | 422 | [shortcut] 423 | from=-3 424 | activation=linear 425 | 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=256 430 | size=1 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [convolutional] 436 | batch_normalize=1 437 | filters=512 438 | size=3 439 | stride=1 440 | pad=1 441 | activation=leaky 442 | 443 | [shortcut] 444 | from=-3 445 | activation=linear 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=256 450 | size=1 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [convolutional] 456 | batch_normalize=1 457 | filters=512 458 | size=3 459 | stride=1 460 | pad=1 461 | activation=leaky 462 | 463 | [shortcut] 464 | from=-3 465 | activation=linear 466 | 467 | # Downsample 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=1024 472 | size=3 473 | stride=2 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=512 480 | size=1 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [convolutional] 486 | batch_normalize=1 487 | filters=1024 488 | size=3 489 | stride=1 490 | pad=1 491 | activation=leaky 492 | 493 | [shortcut] 494 | from=-3 495 | activation=linear 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=512 500 | size=1 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [convolutional] 506 | batch_normalize=1 507 | filters=1024 508 | size=3 509 | stride=1 510 | pad=1 511 | activation=leaky 512 | 513 | [shortcut] 514 | from=-3 515 | activation=linear 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=512 520 | size=1 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [convolutional] 526 | 
batch_normalize=1 527 | filters=1024 528 | size=3 529 | stride=1 530 | pad=1 531 | activation=leaky 532 | 533 | [shortcut] 534 | from=-3 535 | activation=linear 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=512 540 | size=1 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [convolutional] 546 | batch_normalize=1 547 | filters=1024 548 | size=3 549 | stride=1 550 | pad=1 551 | activation=leaky 552 | 553 | [shortcut] 554 | from=-3 555 | activation=linear 556 | 557 | ###################### 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | filters=512 562 | size=1 563 | stride=1 564 | pad=1 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | size=3 570 | stride=1 571 | pad=1 572 | filters=1024 573 | activation=leaky 574 | 575 | [convolutional] 576 | batch_normalize=1 577 | filters=512 578 | size=1 579 | stride=1 580 | pad=1 581 | activation=leaky 582 | 583 | ### SPP ### 584 | [maxpool] 585 | stride=1 586 | size=5 587 | 588 | [route] 589 | layers=-2 590 | 591 | [maxpool] 592 | stride=1 593 | size=9 594 | 595 | [route] 596 | layers=-4 597 | 598 | [maxpool] 599 | stride=1 600 | size=13 601 | 602 | [route] 603 | layers=-1,-3,-5,-6 604 | 605 | ### End SPP ### 606 | 607 | [convolutional] 608 | batch_normalize=1 609 | filters=512 610 | size=1 611 | stride=1 612 | pad=1 613 | activation=leaky 614 | 615 | 616 | [convolutional] 617 | batch_normalize=1 618 | size=3 619 | stride=1 620 | pad=1 621 | filters=1024 622 | activation=leaky 623 | 624 | [convolutional] 625 | batch_normalize=1 626 | filters=512 627 | size=1 628 | stride=1 629 | pad=1 630 | activation=leaky 631 | 632 | 633 | 634 | ########### to [yolo-3] 635 | 636 | 637 | 638 | [route] 639 | layers = -4 640 | 641 | [convolutional] 642 | batch_normalize=1 643 | filters=256 644 | size=1 645 | stride=1 646 | pad=1 647 | activation=leaky 648 | 649 | [upsample] 650 | stride=2 651 | 652 | [route] 653 | layers = -1, 61 654 | 655 | 656 | 657 | [convolutional] 658 | batch_normalize=1 659 | filters=256 660 | size=1 661 | stride=1 662 | pad=1 663 | activation=leaky 664 | 665 | [convolutional] 666 | batch_normalize=1 667 | size=3 668 | stride=1 669 | pad=1 670 | filters=512 671 | activation=leaky 672 | 673 | [convolutional] 674 | batch_normalize=1 675 | filters=256 676 | size=1 677 | stride=1 678 | pad=1 679 | activation=leaky 680 | 681 | [convolutional] 682 | batch_normalize=1 683 | size=3 684 | stride=1 685 | pad=1 686 | filters=512 687 | activation=leaky 688 | 689 | [convolutional] 690 | batch_normalize=1 691 | filters=256 692 | size=1 693 | stride=1 694 | pad=1 695 | activation=leaky 696 | 697 | 698 | ########### to [yolo-2] 699 | 700 | 701 | 702 | 703 | [route] 704 | layers = -4 705 | 706 | [convolutional] 707 | batch_normalize=1 708 | filters=128 709 | size=1 710 | stride=1 711 | pad=1 712 | activation=leaky 713 | 714 | [upsample] 715 | stride=2 716 | 717 | [route] 718 | layers = -1, 36 719 | 720 | 721 | 722 | [convolutional] 723 | batch_normalize=1 724 | filters=128 725 | size=1 726 | stride=1 727 | pad=1 728 | activation=leaky 729 | 730 | [convolutional] 731 | batch_normalize=1 732 | size=3 733 | stride=1 734 | pad=1 735 | filters=256 736 | activation=leaky 737 | 738 | [convolutional] 739 | batch_normalize=1 740 | filters=128 741 | size=1 742 | stride=1 743 | pad=1 744 | activation=leaky 745 | 746 | [convolutional] 747 | batch_normalize=1 748 | size=3 749 | stride=1 750 | pad=1 751 | filters=256 752 | activation=leaky 753 | 754 | [convolutional] 755 | batch_normalize=1 756 | 
filters=128 757 | size=1 758 | stride=1 759 | pad=1 760 | activation=leaky 761 | 762 | 763 | 764 | ########### to [yolo-1] 765 | 766 | 767 | ########### features of different layers 768 | 769 | 770 | [route] 771 | layers=1 772 | 773 | [reorg3d] 774 | stride=2 775 | 776 | [route] 777 | layers=5,-1 778 | 779 | [reorg3d] 780 | stride=2 781 | 782 | [route] 783 | layers=12,-1 784 | 785 | [reorg3d] 786 | stride=2 787 | 788 | [route] 789 | layers=37,-1 790 | 791 | [reorg3d] 792 | stride=2 793 | 794 | [route] 795 | layers=62,-1 796 | 797 | 798 | 799 | ########### [yolo-1] 800 | 801 | [convolutional] 802 | batch_normalize=1 803 | filters=128 804 | size=1 805 | stride=1 806 | pad=1 807 | activation=leaky 808 | 809 | [upsample] 810 | stride=4 811 | 812 | [route] 813 | layers = -1,-12 814 | 815 | 816 | [convolutional] 817 | batch_normalize=1 818 | size=3 819 | stride=1 820 | pad=1 821 | filters=256 822 | activation=leaky 823 | 824 | [convolutional] 825 | size=1 826 | stride=1 827 | pad=1 828 | filters=340 829 | activation=linear 830 | 831 | 832 | [yolo] 833 | mask = 0,1,2,3 834 | anchors = 8,8, 10,13, 16,30, 33,23, 32,32, 30,61, 62,45, 64,64, 59,119, 116,90, 156,198, 373,326 835 | classes=80 836 | num=12 837 | jitter=.3 838 | ignore_thresh = .7 839 | truth_thresh = 1 840 | scale_x_y = 1.05 841 | random=0 842 | 843 | 844 | 845 | 846 | ########### [yolo-2] 847 | 848 | 849 | [route] 850 | layers = -7 851 | 852 | [convolutional] 853 | batch_normalize=1 854 | filters=256 855 | size=1 856 | stride=1 857 | pad=1 858 | activation=leaky 859 | 860 | [upsample] 861 | stride=2 862 | 863 | [route] 864 | layers = -1,-28 865 | 866 | 867 | [convolutional] 868 | batch_normalize=1 869 | size=3 870 | stride=1 871 | pad=1 872 | filters=512 873 | activation=leaky 874 | 875 | [convolutional] 876 | size=1 877 | stride=1 878 | pad=1 879 | filters=340 880 | activation=linear 881 | 882 | 883 | [yolo] 884 | mask = 4,5,6,7 885 | anchors = 8,8, 10,13, 16,30, 33,23, 32,32, 30,61, 62,45, 64,64, 59,119, 116,90, 156,198, 373,326 886 | classes=80 887 | num=12 888 | jitter=.3 889 | ignore_thresh = .7 890 | truth_thresh = 1 891 | scale_x_y = 1.1 892 | random=0 893 | 894 | 895 | 896 | ########### [yolo-3] 897 | 898 | [route] 899 | layers = -14 900 | 901 | [convolutional] 902 | batch_normalize=1 903 | filters=512 904 | size=1 905 | stride=1 906 | pad=1 907 | activation=leaky 908 | 909 | [route] 910 | layers = -1,-43 911 | 912 | [convolutional] 913 | batch_normalize=1 914 | size=3 915 | stride=1 916 | pad=1 917 | filters=1024 918 | activation=leaky 919 | 920 | 921 | [convolutional] 922 | size=1 923 | stride=1 924 | pad=1 925 | filters=340 926 | activation=linear 927 | 928 | 929 | [yolo] 930 | mask = 8,9,10,11 931 | anchors = 8,8, 10,13, 16,30, 33,23, 32,32, 30,61, 62,45, 59,119, 80,80, 116,90, 156,198, 373,326 932 | classes=80 933 | num=12 934 | jitter=.3 935 | ignore_thresh = .7 936 | truth_thresh = 1 937 | scale_x_y = 1.2 938 | random=0 939 | -------------------------------------------------------------------------------- /cfg/csresnext50-panet-spp.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | #batch=1 4 | #subdivisions=1 5 | # Training 6 | batch=64 7 | subdivisions=16 8 | width=416 9 | height=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500500 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | #19:104x104 
38:52x52 65:26x26 80:13x13 for 416 26 | 27 | [convolutional] 28 | batch_normalize=1 29 | filters=64 30 | size=7 31 | stride=2 32 | pad=1 33 | activation=leaky 34 | 35 | [maxpool] 36 | size=2 37 | stride=2 38 | 39 | [convolutional] 40 | batch_normalize=1 41 | filters=128 42 | size=1 43 | stride=1 44 | pad=1 45 | activation=leaky 46 | 47 | [route] 48 | layers = -2 49 | 50 | [convolutional] 51 | batch_normalize=1 52 | filters=64 53 | size=1 54 | stride=1 55 | pad=1 56 | activation=leaky 57 | 58 | # 1-1 59 | 60 | [convolutional] 61 | batch_normalize=1 62 | filters=128 63 | size=1 64 | stride=1 65 | pad=1 66 | activation=leaky 67 | 68 | [convolutional] 69 | batch_normalize=1 70 | filters=128 71 | size=3 72 | groups=32 73 | stride=1 74 | pad=1 75 | activation=leaky 76 | 77 | [convolutional] 78 | batch_normalize=1 79 | filters=64 80 | size=1 81 | stride=1 82 | pad=1 83 | activation=linear 84 | 85 | [shortcut] 86 | from=-4 87 | activation=leaky 88 | 89 | # 1-2 90 | 91 | [convolutional] 92 | batch_normalize=1 93 | filters=128 94 | size=1 95 | stride=1 96 | pad=1 97 | activation=leaky 98 | 99 | [convolutional] 100 | batch_normalize=1 101 | filters=128 102 | size=3 103 | groups=32 104 | stride=1 105 | pad=1 106 | activation=leaky 107 | 108 | [convolutional] 109 | batch_normalize=1 110 | filters=64 111 | size=1 112 | stride=1 113 | pad=1 114 | activation=linear 115 | 116 | [shortcut] 117 | from=-4 118 | activation=leaky 119 | 120 | # 1-3 121 | 122 | [convolutional] 123 | batch_normalize=1 124 | filters=128 125 | size=1 126 | stride=1 127 | pad=1 128 | activation=leaky 129 | 130 | [convolutional] 131 | batch_normalize=1 132 | filters=128 133 | size=3 134 | groups=32 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [convolutional] 140 | batch_normalize=1 141 | filters=64 142 | size=1 143 | stride=1 144 | pad=1 145 | activation=linear 146 | 147 | [shortcut] 148 | from=-4 149 | activation=leaky 150 | 151 | # 1-T 152 | 153 | [convolutional] 154 | batch_normalize=1 155 | filters=128 156 | size=1 157 | stride=1 158 | pad=1 159 | activation=leaky 160 | 161 | [route] 162 | layers = -1,-16 163 | 164 | [convolutional] 165 | batch_normalize=1 166 | filters=256 167 | size=1 168 | stride=1 169 | pad=1 170 | activation=leaky 171 | 172 | [convolutional] 173 | batch_normalize=1 174 | filters=256 175 | size=3 176 | groups=32 177 | stride=2 178 | pad=1 179 | activation=leaky 180 | 181 | [convolutional] 182 | batch_normalize=1 183 | filters=256 184 | size=1 185 | stride=1 186 | pad=1 187 | activation=linear 188 | 189 | [route] 190 | layers = -2 191 | 192 | [convolutional] 193 | batch_normalize=1 194 | filters=256 195 | size=1 196 | stride=1 197 | pad=1 198 | activation=linear 199 | 200 | # 2-1 201 | 202 | [convolutional] 203 | batch_normalize=1 204 | filters=256 205 | size=1 206 | stride=1 207 | pad=1 208 | activation=leaky 209 | 210 | [convolutional] 211 | batch_normalize=1 212 | filters=256 213 | size=3 214 | groups=32 215 | stride=1 216 | pad=1 217 | activation=leaky 218 | 219 | [convolutional] 220 | batch_normalize=1 221 | filters=256 222 | size=1 223 | stride=1 224 | pad=1 225 | activation=linear 226 | 227 | [shortcut] 228 | from=-4 229 | activation=leaky 230 | 231 | # 2-2 232 | 233 | [convolutional] 234 | batch_normalize=1 235 | filters=256 236 | size=1 237 | stride=1 238 | pad=1 239 | activation=leaky 240 | 241 | [convolutional] 242 | batch_normalize=1 243 | filters=256 244 | size=3 245 | groups=32 246 | stride=1 247 | pad=1 248 | activation=leaky 249 | 250 | [convolutional] 251 | batch_normalize=1 
252 | filters=256 253 | size=1 254 | stride=1 255 | pad=1 256 | activation=linear 257 | 258 | [shortcut] 259 | from=-4 260 | activation=leaky 261 | 262 | # 2-3 263 | 264 | [convolutional] 265 | batch_normalize=1 266 | filters=256 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | groups=32 277 | stride=1 278 | pad=1 279 | activation=leaky 280 | 281 | [convolutional] 282 | batch_normalize=1 283 | filters=256 284 | size=1 285 | stride=1 286 | pad=1 287 | activation=linear 288 | 289 | [shortcut] 290 | from=-4 291 | activation=leaky 292 | 293 | # 2-T 294 | 295 | [convolutional] 296 | batch_normalize=1 297 | filters=256 298 | size=1 299 | stride=1 300 | pad=1 301 | activation=leaky 302 | 303 | [route] 304 | layers = -1,-16 305 | 306 | [convolutional] 307 | batch_normalize=1 308 | filters=512 309 | size=1 310 | stride=1 311 | pad=1 312 | activation=leaky 313 | 314 | [convolutional] 315 | batch_normalize=1 316 | filters=512 317 | size=3 318 | groups=32 319 | stride=2 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=1 327 | stride=1 328 | pad=1 329 | activation=linear 330 | 331 | [route] 332 | layers = -2 333 | 334 | [convolutional] 335 | batch_normalize=1 336 | filters=512 337 | size=1 338 | stride=1 339 | pad=1 340 | activation=linear 341 | 342 | # 3-1 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=1 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [convolutional] 353 | batch_normalize=1 354 | filters=512 355 | size=3 356 | groups=32 357 | stride=1 358 | pad=1 359 | activation=leaky 360 | 361 | [convolutional] 362 | batch_normalize=1 363 | filters=512 364 | size=1 365 | stride=1 366 | pad=1 367 | activation=linear 368 | 369 | [shortcut] 370 | from=-4 371 | activation=leaky 372 | 373 | # 3-2 374 | 375 | [convolutional] 376 | batch_normalize=1 377 | filters=512 378 | size=1 379 | stride=1 380 | pad=1 381 | activation=leaky 382 | 383 | [convolutional] 384 | batch_normalize=1 385 | filters=512 386 | size=3 387 | groups=32 388 | stride=1 389 | pad=1 390 | activation=leaky 391 | 392 | [convolutional] 393 | batch_normalize=1 394 | filters=512 395 | size=1 396 | stride=1 397 | pad=1 398 | activation=linear 399 | 400 | [shortcut] 401 | from=-4 402 | activation=leaky 403 | 404 | # 3-3 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=1 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [convolutional] 415 | batch_normalize=1 416 | filters=512 417 | size=3 418 | groups=32 419 | stride=1 420 | pad=1 421 | activation=leaky 422 | 423 | [convolutional] 424 | batch_normalize=1 425 | filters=512 426 | size=1 427 | stride=1 428 | pad=1 429 | activation=linear 430 | 431 | [shortcut] 432 | from=-4 433 | activation=leaky 434 | 435 | # 3-4 436 | 437 | [convolutional] 438 | batch_normalize=1 439 | filters=512 440 | size=1 441 | stride=1 442 | pad=1 443 | activation=leaky 444 | 445 | [convolutional] 446 | batch_normalize=1 447 | filters=512 448 | size=3 449 | groups=32 450 | stride=1 451 | pad=1 452 | activation=leaky 453 | 454 | [convolutional] 455 | batch_normalize=1 456 | filters=512 457 | size=1 458 | stride=1 459 | pad=1 460 | activation=linear 461 | 462 | [shortcut] 463 | from=-4 464 | activation=leaky 465 | 466 | # 3-5 467 | 468 | [convolutional] 469 | batch_normalize=1 470 | filters=512 471 | size=1 472 | stride=1 473 | pad=1 474 | activation=leaky 475 | 476 
| [convolutional] 477 | batch_normalize=1 478 | filters=512 479 | size=3 480 | groups=32 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [convolutional] 486 | batch_normalize=1 487 | filters=512 488 | size=1 489 | stride=1 490 | pad=1 491 | activation=linear 492 | 493 | [shortcut] 494 | from=-4 495 | activation=leaky 496 | 497 | # 3-T 498 | 499 | [convolutional] 500 | batch_normalize=1 501 | filters=512 502 | size=1 503 | stride=1 504 | pad=1 505 | activation=leaky 506 | 507 | [route] 508 | layers = -1,-24 509 | 510 | [convolutional] 511 | batch_normalize=1 512 | filters=1024 513 | size=1 514 | stride=1 515 | pad=1 516 | activation=leaky 517 | 518 | [convolutional] 519 | batch_normalize=1 520 | filters=1024 521 | size=3 522 | groups=32 523 | stride=2 524 | pad=1 525 | activation=leaky 526 | 527 | [convolutional] 528 | batch_normalize=1 529 | filters=1024 530 | size=1 531 | stride=1 532 | pad=1 533 | activation=leaky 534 | 535 | [route] 536 | layers = -2 537 | 538 | [convolutional] 539 | batch_normalize=1 540 | filters=1024 541 | size=1 542 | stride=1 543 | pad=1 544 | activation=leaky 545 | 546 | # 4-1 547 | 548 | [convolutional] 549 | batch_normalize=1 550 | filters=1024 551 | size=1 552 | stride=1 553 | pad=1 554 | activation=leaky 555 | 556 | [convolutional] 557 | batch_normalize=1 558 | filters=1024 559 | size=3 560 | groups=32 561 | stride=1 562 | pad=1 563 | activation=leaky 564 | 565 | [convolutional] 566 | batch_normalize=1 567 | filters=1024 568 | size=1 569 | stride=1 570 | pad=1 571 | activation=linear 572 | 573 | [shortcut] 574 | from=-4 575 | activation=leaky 576 | 577 | # 4-2 578 | 579 | [convolutional] 580 | batch_normalize=1 581 | filters=1024 582 | size=1 583 | stride=1 584 | pad=1 585 | activation=leaky 586 | 587 | [convolutional] 588 | batch_normalize=1 589 | filters=1024 590 | size=3 591 | groups=32 592 | stride=1 593 | pad=1 594 | activation=leaky 595 | 596 | [convolutional] 597 | batch_normalize=1 598 | filters=1024 599 | size=1 600 | stride=1 601 | pad=1 602 | activation=linear 603 | 604 | [shortcut] 605 | from=-4 606 | activation=leaky 607 | 608 | # 4-T 609 | 610 | [convolutional] 611 | batch_normalize=1 612 | filters=1024 613 | size=1 614 | stride=1 615 | pad=1 616 | activation=leaky 617 | 618 | [route] 619 | layers = -1,-12 620 | 621 | [convolutional] 622 | batch_normalize=1 623 | filters=2048 624 | size=1 625 | stride=1 626 | pad=1 627 | activation=leaky 628 | 629 | ########################## 630 | 631 | [convolutional] 632 | batch_normalize=1 633 | filters=512 634 | size=1 635 | stride=1 636 | pad=1 637 | activation=leaky 638 | 639 | [convolutional] 640 | batch_normalize=1 641 | size=3 642 | stride=1 643 | pad=1 644 | filters=1024 645 | activation=leaky 646 | 647 | [convolutional] 648 | batch_normalize=1 649 | filters=512 650 | size=1 651 | stride=1 652 | pad=1 653 | activation=leaky 654 | 655 | ### SPP ### 656 | [maxpool] 657 | stride=1 658 | size=5 659 | 660 | [route] 661 | layers=-2 662 | 663 | [maxpool] 664 | stride=1 665 | size=9 666 | 667 | [route] 668 | layers=-4 669 | 670 | [maxpool] 671 | stride=1 672 | size=13 673 | 674 | [route] 675 | layers=-1,-3,-5,-6 676 | ### End SPP ### 677 | 678 | [convolutional] 679 | batch_normalize=1 680 | filters=512 681 | size=1 682 | stride=1 683 | pad=1 684 | activation=leaky 685 | 686 | [convolutional] 687 | batch_normalize=1 688 | size=3 689 | stride=1 690 | pad=1 691 | filters=1024 692 | activation=leaky 693 | 694 | [convolutional] 695 | batch_normalize=1 696 | filters=512 697 | size=1 698 | stride=1 699 
| pad=1 700 | activation=leaky 701 | 702 | [convolutional] 703 | batch_normalize=1 704 | filters=256 705 | size=1 706 | stride=1 707 | pad=1 708 | activation=leaky 709 | 710 | [upsample] 711 | stride=2 712 | 713 | [route] 714 | layers = 65 715 | 716 | [convolutional] 717 | batch_normalize=1 718 | filters=256 719 | size=1 720 | stride=1 721 | pad=1 722 | activation=leaky 723 | 724 | [route] 725 | layers = -1, -3 726 | 727 | [convolutional] 728 | batch_normalize=1 729 | filters=256 730 | size=1 731 | stride=1 732 | pad=1 733 | activation=leaky 734 | 735 | [convolutional] 736 | batch_normalize=1 737 | size=3 738 | stride=1 739 | pad=1 740 | filters=512 741 | activation=leaky 742 | 743 | [convolutional] 744 | batch_normalize=1 745 | filters=256 746 | size=1 747 | stride=1 748 | pad=1 749 | activation=leaky 750 | 751 | [convolutional] 752 | batch_normalize=1 753 | size=3 754 | stride=1 755 | pad=1 756 | filters=512 757 | activation=leaky 758 | 759 | [convolutional] 760 | batch_normalize=1 761 | filters=256 762 | size=1 763 | stride=1 764 | pad=1 765 | activation=leaky 766 | 767 | [convolutional] 768 | batch_normalize=1 769 | filters=128 770 | size=1 771 | stride=1 772 | pad=1 773 | activation=leaky 774 | 775 | [upsample] 776 | stride=2 777 | 778 | [route] 779 | layers = 38 780 | 781 | [convolutional] 782 | batch_normalize=1 783 | filters=128 784 | size=1 785 | stride=1 786 | pad=1 787 | activation=leaky 788 | 789 | [route] 790 | layers = -1, -3 791 | 792 | [convolutional] 793 | batch_normalize=1 794 | filters=128 795 | size=1 796 | stride=1 797 | pad=1 798 | activation=leaky 799 | 800 | [convolutional] 801 | batch_normalize=1 802 | size=3 803 | stride=1 804 | pad=1 805 | filters=256 806 | activation=leaky 807 | 808 | [convolutional] 809 | batch_normalize=1 810 | filters=128 811 | size=1 812 | stride=1 813 | pad=1 814 | activation=leaky 815 | 816 | [convolutional] 817 | batch_normalize=1 818 | size=3 819 | stride=1 820 | pad=1 821 | filters=256 822 | activation=leaky 823 | 824 | [convolutional] 825 | batch_normalize=1 826 | filters=128 827 | size=1 828 | stride=1 829 | pad=1 830 | activation=leaky 831 | 832 | ########################## 833 | 834 | [convolutional] 835 | batch_normalize=1 836 | size=3 837 | stride=1 838 | pad=1 839 | filters=256 840 | activation=leaky 841 | 842 | [convolutional] 843 | size=1 844 | stride=1 845 | pad=1 846 | filters=255 847 | activation=linear 848 | 849 | 850 | [yolo] 851 | mask = 0,1,2 852 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 853 | classes=80 854 | num=9 855 | jitter=.3 856 | ignore_thresh = .7 857 | truth_thresh = 1 858 | random=1 859 | 860 | [route] 861 | layers = -4 862 | 863 | [convolutional] 864 | batch_normalize=1 865 | size=3 866 | stride=2 867 | pad=1 868 | filters=256 869 | activation=leaky 870 | 871 | [route] 872 | layers = -1, -16 873 | 874 | [convolutional] 875 | batch_normalize=1 876 | filters=256 877 | size=1 878 | stride=1 879 | pad=1 880 | activation=leaky 881 | 882 | [convolutional] 883 | batch_normalize=1 884 | size=3 885 | stride=1 886 | pad=1 887 | filters=512 888 | activation=leaky 889 | 890 | [convolutional] 891 | batch_normalize=1 892 | filters=256 893 | size=1 894 | stride=1 895 | pad=1 896 | activation=leaky 897 | 898 | [convolutional] 899 | batch_normalize=1 900 | size=3 901 | stride=1 902 | pad=1 903 | filters=512 904 | activation=leaky 905 | 906 | [convolutional] 907 | batch_normalize=1 908 | filters=256 909 | size=1 910 | stride=1 911 | pad=1 912 | activation=leaky 913 | 914 | [convolutional] 
915 | batch_normalize=1 916 | size=3 917 | stride=1 918 | pad=1 919 | filters=512 920 | activation=leaky 921 | 922 | [convolutional] 923 | size=1 924 | stride=1 925 | pad=1 926 | filters=255 927 | activation=linear 928 | 929 | 930 | [yolo] 931 | mask = 3,4,5 932 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 933 | classes=80 934 | num=9 935 | jitter=.3 936 | ignore_thresh = .7 937 | truth_thresh = 1 938 | random=1 939 | 940 | [route] 941 | layers = -4 942 | 943 | [convolutional] 944 | batch_normalize=1 945 | size=3 946 | stride=2 947 | pad=1 948 | filters=512 949 | activation=leaky 950 | 951 | [route] 952 | layers = -1, -37 953 | 954 | [convolutional] 955 | batch_normalize=1 956 | filters=512 957 | size=1 958 | stride=1 959 | pad=1 960 | activation=leaky 961 | 962 | [convolutional] 963 | batch_normalize=1 964 | size=3 965 | stride=1 966 | pad=1 967 | filters=1024 968 | activation=leaky 969 | 970 | [convolutional] 971 | batch_normalize=1 972 | filters=512 973 | size=1 974 | stride=1 975 | pad=1 976 | activation=leaky 977 | 978 | [convolutional] 979 | batch_normalize=1 980 | size=3 981 | stride=1 982 | pad=1 983 | filters=1024 984 | activation=leaky 985 | 986 | [convolutional] 987 | batch_normalize=1 988 | filters=512 989 | size=1 990 | stride=1 991 | pad=1 992 | activation=leaky 993 | 994 | [convolutional] 995 | batch_normalize=1 996 | size=3 997 | stride=1 998 | pad=1 999 | filters=1024 1000 | activation=leaky 1001 | 1002 | [convolutional] 1003 | size=1 1004 | stride=1 1005 | pad=1 1006 | filters=255 1007 | activation=linear 1008 | 1009 | 1010 | [yolo] 1011 | mask = 6,7,8 1012 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 1013 | classes=80 1014 | num=9 1015 | jitter=.3 1016 | ignore_thresh = .7 1017 | truth_thresh = 1 1018 | random=1 1019 | --------------------------------------------------------------------------------
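A note on the recurring ### SPP ### blocks in the cfgs above (the SPP heads, yolov3-spp-pan-scale.cfg and csresnext50-panet-spp.cfg): each block max-pools the same feature map at kernel sizes 5, 9 and 13 with stride 1, then the [route] layers=-1,-3,-5,-6 layer concatenates the three pooled maps with the un-pooled input, quadrupling the channel count (e.g. 512 -> 2048) before the following 1x1 convolution reduces it again. Below is a minimal PyTorch sketch of that block, assuming Darknet's stride-1 [maxpool] pads by size//2 so spatial dimensions are preserved; the SPP module name is ours, not part of this repo.

import torch
import torch.nn as nn


class SPP(nn.Module):
    """Spatial pyramid pooling as defined in the cfgs: three stride-1
    maxpools (5x5, 9x9, 13x13) concatenated with the raw input on channels."""

    def __init__(self, sizes=(5, 9, 13)):
        super().__init__()
        # padding=k // 2 keeps H x W unchanged, matching Darknet's stride-1 [maxpool]
        self.pools = nn.ModuleList([nn.MaxPool2d(k, stride=1, padding=k // 2) for k in sizes])

    def forward(self, x):
        # the cfg's [route] layers=-1,-3,-5,-6 performs the same concat
        # (only the channel ordering differs)
        return torch.cat([m(x) for m in self.pools] + [x], dim=1)


if __name__ == '__main__':
    x = torch.zeros(1, 512, 13, 13)  # e.g. the 512-channel map feeding the first SPP block
    print(SPP()(x).shape)            # torch.Size([1, 2048, 13, 13]) -> 4x channels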