├── data
│   ├── coco1.txt
│   ├── coco1.data
│   ├── coco16.data
│   ├── coco64.data
│   ├── coco1cls.data
│   ├── coco2014.data
│   ├── coco2017.data
│   ├── coco16.txt
│   ├── coco1cls.txt
│   ├── get_coco2014.sh
│   ├── get_coco2017.sh
│   ├── coco.names
│   ├── coco_paper.names
│   └── coco64.txt
├── CIoU.png
├── requirements.txt
├── utils
│   ├── evolve.sh
│   ├── gcp.sh
│   ├── google_utils.py
│   ├── parse_config.py
│   ├── layers.py
│   ├── torch_utils.py
│   └── adabound.py
├── weights
│   └── download_yolov3_weights.sh
├── .gitignore
├── Dockerfile
├── cfg
│   ├── yolov3-tiny.cfg
│   ├── yolov3-tiny-1cls.cfg
│   ├── yolov3-tiny-3cls.cfg
│   ├── yolov3-tiny3-1cls.cfg
│   ├── yolov3-tiny3.cfg
│   ├── yolov3-1cls.cfg
│   ├── yolov3.cfg
│   ├── yolov3-spp-1cls.cfg
│   ├── yolov3-spp-3cls.cfg
│   ├── yolov3-spp.cfg
│   ├── yolov3-asff.cfg
│   ├── yolov3-spp3.cfg
│   ├── yolov3-spp-pan-scale.cfg
│   └── csresnext50-panet-spp.cfg
├── README.md
├── detect.py
└── test.py
/data/coco1.txt: -------------------------------------------------------------------------------- 1 | ../coco/images/train2017/000000109622.jpg 2 | -------------------------------------------------------------------------------- /CIoU.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zzh-tju/ultralytics-YOLOv3-Cluster-NMS/HEAD/CIoU.png -------------------------------------------------------------------------------- /data/coco1.data: -------------------------------------------------------------------------------- 1 | classes=80 2 | train=data/coco1.txt 3 | valid=data/coco1.txt 4 | names=data/coco.names 5 | -------------------------------------------------------------------------------- /data/coco16.data: -------------------------------------------------------------------------------- 1 | classes=80 2 | train=data/coco16.txt 3 | valid=data/coco16.txt 4 | names=data/coco.names 5 | -------------------------------------------------------------------------------- /data/coco64.data: -------------------------------------------------------------------------------- 1 | classes=80 2 | train=data/coco64.txt 3 | valid=data/coco64.txt 4 | names=data/coco.names 5 | -------------------------------------------------------------------------------- /data/coco1cls.data: -------------------------------------------------------------------------------- 1 | classes=1 2 | train=data/coco1cls.txt 3 | valid=data/coco1cls.txt 4 | names=data/coco.names 5 | -------------------------------------------------------------------------------- /data/coco2014.data: -------------------------------------------------------------------------------- 1 | classes=80 2 | train=../coco/trainvalno5k.txt 3 | valid=../coco/5k.txt 4 | names=data/coco.names 5 | -------------------------------------------------------------------------------- /data/coco2017.data: -------------------------------------------------------------------------------- 1 | classes=80 2 | train=../coco/train2017.txt 3 | valid=../coco/val2017.txt 4 | names=data/coco.names 5 | -------------------------------------------------------------------------------- /data/coco16.txt: -------------------------------------------------------------------------------- 1 | ../coco/images/train2017/000000109622.jpg 2 | ../coco/images/train2017/000000160694.jpg 3 | ../coco/images/train2017/000000308590.jpg 4 | ../coco/images/train2017/000000327573.jpg 5 | ../coco/images/train2017/000000062929.jpg 6 | ../coco/images/train2017/000000512793.jpg 7 | ../coco/images/train2017/000000371735.jpg 8 | ../coco/images/train2017/000000148118.jpg 9 | ../coco/images/train2017/000000309856.jpg 10 | 
../coco/images/train2017/000000141882.jpg 11 | ../coco/images/train2017/000000318783.jpg 12 | ../coco/images/train2017/000000337760.jpg 13 | ../coco/images/train2017/000000298197.jpg 14 | ../coco/images/train2017/000000042421.jpg 15 | ../coco/images/train2017/000000328898.jpg 16 | ../coco/images/train2017/000000458856.jpg 17 | -------------------------------------------------------------------------------- /data/coco1cls.txt: -------------------------------------------------------------------------------- 1 | ../coco/images/train2017/000000000901.jpg 2 | ../coco/images/train2017/000000001464.jpg 3 | ../coco/images/train2017/000000003220.jpg 4 | ../coco/images/train2017/000000003365.jpg 5 | ../coco/images/train2017/000000004772.jpg 6 | ../coco/images/train2017/000000009987.jpg 7 | ../coco/images/train2017/000000010498.jpg 8 | ../coco/images/train2017/000000012455.jpg 9 | ../coco/images/train2017/000000013992.jpg 10 | ../coco/images/train2017/000000014125.jpg 11 | ../coco/images/train2017/000000016314.jpg 12 | ../coco/images/train2017/000000016670.jpg 13 | ../coco/images/train2017/000000018412.jpg 14 | ../coco/images/train2017/000000021212.jpg 15 | ../coco/images/train2017/000000021826.jpg 16 | ../coco/images/train2017/000000030566.jpg 17 | -------------------------------------------------------------------------------- /data/get_coco2014.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Zip coco folder 3 | # zip -r coco.zip coco 4 | # tar -czvf coco.tar.gz coco 5 | 6 | # Download labels from Google Drive, accepting presented query 7 | filename="coco2014labels.zip" 8 | fileid="1s6-CmF5_SElM28r52P1OUrCcuXZN-SFo" 9 | curl -c ./cookie -s -L "https://drive.google.com/uc?export=download&id=${fileid}" > /dev/null 10 | curl -Lb ./cookie "https://drive.google.com/uc?export=download&confirm=`awk '/download/ {print $NF}' ./cookie`&id=${fileid}" -o ${filename} 11 | rm ./cookie 12 | 13 | # Unzip labels 14 | unzip -q ${filename} # for coco.zip 15 | # tar -xzf ${filename} # for coco.tar.gz 16 | rm ${filename} 17 | 18 | # Download and unzip images 19 | cd coco/images 20 | f="train2014.zip" && curl http://images.cocodataset.org/zips/$f -o $f && unzip -q $f && rm $f 21 | f="val2014.zip" && curl http://images.cocodataset.org/zips/$f -o $f && unzip -q $f && rm $f 22 | 23 | # cd out 24 | cd ../.. 25 | -------------------------------------------------------------------------------- /data/get_coco2017.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Zip coco folder 3 | # zip -r coco.zip coco 4 | # tar -czvf coco.tar.gz coco 5 | 6 | # Download labels from Google Drive, accepting presented query 7 | filename="coco2017labels.zip" 8 | fileid="1cXZR_ckHki6nddOmcysCuuJFM--T-Q6L" 9 | curl -c ./cookie -s -L "https://drive.google.com/uc?export=download&id=${fileid}" > /dev/null 10 | curl -Lb ./cookie "https://drive.google.com/uc?export=download&confirm=`awk '/download/ {print $NF}' ./cookie`&id=${fileid}" -o ${filename} 11 | rm ./cookie 12 | 13 | # Unzip labels 14 | unzip -q ${filename} # for coco.zip 15 | # tar -xzf ${filename} # for coco.tar.gz 16 | rm ${filename} 17 | 18 | # Download and unzip images 19 | cd coco/images 20 | f="train2017.zip" && curl http://images.cocodataset.org/zips/$f -o $f && unzip -q $f && rm $f 21 | f="val2017.zip" && curl http://images.cocodataset.org/zips/$f -o $f && unzip -q $f && rm $f 22 | 23 | # cd out 24 | cd ../.. 
25 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # pip install -U -r requirements.txt 2 | numpy 3 | opencv-python >= 4.1 4 | torch >= 1.5 5 | matplotlib 6 | pycocotools 7 | tqdm 8 | pillow 9 | tensorboard >= 1.14 10 | 11 | # Nvidia Apex (optional) for mixed precision training -------------------------- 12 | # git clone https://github.com/NVIDIA/apex && cd apex && pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" . --user && cd .. && rm -rf apex 13 | 14 | # Conda commands (in place of pip) --------------------------------------------- 15 | # conda update -yn base -c defaults conda 16 | # conda install -yc anaconda numpy opencv matplotlib tqdm pillow ipython 17 | # conda install -yc conda-forge scikit-image pycocotools tensorboard 18 | # conda install -yc spyder-ide spyder-line-profiler 19 | # conda install -yc pytorch pytorch torchvision 20 | # conda install -yc conda-forge protobuf numpy && pip install onnx # https://github.com/onnx/onnx#linux-and-macos 21 | -------------------------------------------------------------------------------- /data/coco.names: -------------------------------------------------------------------------------- 1 | person 2 | bicycle 3 | car 4 | motorcycle 5 | airplane 6 | bus 7 | train 8 | truck 9 | boat 10 | traffic light 11 | fire hydrant 12 | stop sign 13 | parking meter 14 | bench 15 | bird 16 | cat 17 | dog 18 | horse 19 | sheep 20 | cow 21 | elephant 22 | bear 23 | zebra 24 | giraffe 25 | backpack 26 | umbrella 27 | handbag 28 | tie 29 | suitcase 30 | frisbee 31 | skis 32 | snowboard 33 | sports ball 34 | kite 35 | baseball bat 36 | baseball glove 37 | skateboard 38 | surfboard 39 | tennis racket 40 | bottle 41 | wine glass 42 | cup 43 | fork 44 | knife 45 | spoon 46 | bowl 47 | banana 48 | apple 49 | sandwich 50 | orange 51 | broccoli 52 | carrot 53 | hot dog 54 | pizza 55 | donut 56 | cake 57 | chair 58 | couch 59 | potted plant 60 | bed 61 | dining table 62 | toilet 63 | tv 64 | laptop 65 | mouse 66 | remote 67 | keyboard 68 | cell phone 69 | microwave 70 | oven 71 | toaster 72 | sink 73 | refrigerator 74 | book 75 | clock 76 | vase 77 | scissors 78 | teddy bear 79 | hair drier 80 | toothbrush 81 | -------------------------------------------------------------------------------- /utils/evolve.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #for i in 0 1 2 3 3 | #do 4 | # t=ultralytics/yolov3:v139 && sudo docker pull $t && sudo nvidia-docker run -d --ipc=host -v "$(pwd)"/coco:/usr/src/coco $t utils/evolve.sh $i 5 | # sleep 30 6 | #done 7 | 8 | while true; do 9 | # python3 train.py --data ../data/sm4/out.data --img-size 320 --epochs 100 --batch 64 --accum 1 --weights yolov3-tiny.conv.15 --multi --bucket ult/wer --evolve --cache --device $1 --cfg yolov3-tiny3-1cls.cfg --single --adam 10 | # python3 train.py --data ../out/data.data --img-size 608 --epochs 10 --batch 8 --accum 8 --weights ultralytics68.pt --multi --bucket ult/athena --evolve --device $1 --cfg yolov3-spp-1cls.cfg 11 | 12 | python3 train.py --data coco2014.data --img-size 512 608 --epochs 27 --batch 8 --accum 8 --evolve --weights '' --bucket ult/coco/sppa_512 --device $1 --cfg yolov3-sppa.cfg --multi 13 | done 14 | 15 | 16 | # coco epoch times --img-size 416 608 --epochs 27 --batch 16 --accum 4 17 | # 36:34 2080ti 18 | # 21:58 V100 19 | # 63:00 T4 
-------------------------------------------------------------------------------- /weights/download_yolov3_weights.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # make '/weights' directory if it does not exist and cd into it 4 | # mkdir -p weights && cd weights 5 | 6 | # copy darknet weight files, continue '-c' if partially downloaded 7 | # wget -c https://pjreddie.com/media/files/yolov3.weights 8 | # wget -c https://pjreddie.com/media/files/yolov3-tiny.weights 9 | # wget -c https://pjreddie.com/media/files/yolov3-spp.weights 10 | 11 | # yolov3 pytorch weights 12 | # download from Google Drive: https://drive.google.com/drive/folders/1uxgUBemJVw9wZsdpboYbzUN4bcRhsuAI 13 | 14 | # darknet53 weights (first 75 layers only) 15 | # wget -c https://pjreddie.com/media/files/darknet53.conv.74 16 | 17 | # yolov3-tiny weights from darknet (first 16 layers only) 18 | # ./darknet partial cfg/yolov3-tiny.cfg yolov3-tiny.weights yolov3-tiny.conv.15 15 19 | # mv yolov3-tiny.conv.15 ../ 20 | 21 | # new method 22 | python3 -c "from models import *; 23 | attempt_download('weights/yolov3.pt'); 24 | attempt_download('weights/yolov3-spp.pt')" 25 | -------------------------------------------------------------------------------- /data/coco_paper.names: -------------------------------------------------------------------------------- 1 | person 2 | bicycle 3 | car 4 | motorcycle 5 | airplane 6 | bus 7 | train 8 | truck 9 | boat 10 | traffic light 11 | fire hydrant 12 | street sign 13 | stop sign 14 | parking meter 15 | bench 16 | bird 17 | cat 18 | dog 19 | horse 20 | sheep 21 | cow 22 | elephant 23 | bear 24 | zebra 25 | giraffe 26 | hat 27 | backpack 28 | umbrella 29 | shoe 30 | eye glasses 31 | handbag 32 | tie 33 | suitcase 34 | frisbee 35 | skis 36 | snowboard 37 | sports ball 38 | kite 39 | baseball bat 40 | baseball glove 41 | skateboard 42 | surfboard 43 | tennis racket 44 | bottle 45 | plate 46 | wine glass 47 | cup 48 | fork 49 | knife 50 | spoon 51 | bowl 52 | banana 53 | apple 54 | sandwich 55 | orange 56 | broccoli 57 | carrot 58 | hot dog 59 | pizza 60 | donut 61 | cake 62 | chair 63 | couch 64 | potted plant 65 | bed 66 | mirror 67 | dining table 68 | window 69 | desk 70 | toilet 71 | door 72 | tv 73 | laptop 74 | mouse 75 | remote 76 | keyboard 77 | cell phone 78 | microwave 79 | oven 80 | toaster 81 | sink 82 | refrigerator 83 | blender 84 | book 85 | clock 86 | vase 87 | scissors 88 | teddy bear 89 | hair drier 90 | toothbrush 91 | hair brush -------------------------------------------------------------------------------- /utils/gcp.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # New VM 4 | rm -rf sample_data yolov3 5 | git clone https://github.com/ultralytics/yolov3 6 | # git clone -b test --depth 1 https://github.com/ultralytics/yolov3 test # branch 7 | # sudo apt-get install zip 8 | #git clone https://github.com/NVIDIA/apex && cd apex && pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" . --user && cd .. 
&& rm -rf apex 9 | sudo conda install -yc conda-forge scikit-image pycocotools 10 | # python3 -c "from yolov3.utils.google_utils import gdrive_download; gdrive_download('193Zp_ye-3qXMonR1nZj3YyxMtQkMy50k','coco2014.zip')" 11 | python3 -c "from yolov3.utils.google_utils import gdrive_download; gdrive_download('1WQT6SOktSe8Uw6r10-2JhbEhMY5DJaph','coco2017.zip')" 12 | python3 -c "from yolov3.utils.google_utils import gdrive_download; gdrive_download('1C3HewOG9akA3y456SZLBJZfNDPkBwAto','knife.zip')" 13 | python3 -c "from yolov3.utils.google_utils import gdrive_download; gdrive_download('13g3LqdpkNE8sPosVJT6KFXlfoMypzRP4','sm4.zip')" 14 | sudo shutdown 15 | 16 | # Mount local SSD 17 | lsblk 18 | sudo mkfs.ext4 -F /dev/nvme0n1 19 | sudo mkdir -p /mnt/disks/nvme0n1 20 | sudo mount /dev/nvme0n1 /mnt/disks/nvme0n1 21 | sudo chmod a+w /mnt/disks/nvme0n1 22 | cp -r coco /mnt/disks/nvme0n1 23 | 24 | # Kill All 25 | t=ultralytics/yolov3:v1 26 | docker kill $(docker ps -a -q --filter ancestor=$t) 27 | 28 | # Evolve coco 29 | sudo -s 30 | t=ultralytics/yolov3:evolve 31 | # docker kill $(docker ps -a -q --filter ancestor=$t) 32 | for i in 0 1 6 7 33 | do 34 | docker pull $t && docker run --gpus all -d --ipc=host -v "$(pwd)"/coco:/usr/src/coco $t bash utils/evolve.sh $i 35 | sleep 30 36 | done 37 | 38 | #COCO training 39 | n=131 && t=ultralytics/coco:v131 && sudo docker pull $t && sudo docker run -it --gpus all --ipc=host -v "$(pwd)"/coco:/usr/src/coco $t python3 train.py --data coco2014.data --img-size 320 640 --epochs 300 --batch 16 --accum 4 --weights '' --device 0 --cfg yolov3-spp.cfg --nosave --bucket ult/coco --name $n && sudo shutdown 40 | n=132 && t=ultralytics/coco:v131 && sudo docker pull $t && sudo docker run -it --gpus all --ipc=host -v "$(pwd)"/coco:/usr/src/coco $t python3 train.py --data coco2014.data --img-size 320 640 --epochs 300 --batch 64 --accum 1 --weights '' --device 0 --cfg yolov3-tiny.cfg --nosave --bucket ult/coco --name $n && sudo shutdown 41 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Start FROM Nvidia PyTorch image https://ngc.nvidia.com/catalog/containers/nvidia:pytorch 2 | FROM nvcr.io/nvidia/pytorch:20.03-py3 3 | 4 | # Install dependencies (pip or conda) 5 | RUN pip install -U gsutil thop 6 | # RUN pip install -U -r requirements.txt 7 | # RUN conda update -n base -c defaults conda 8 | # RUN conda install -y -c anaconda future numpy opencv matplotlib tqdm pillow 9 | # RUN conda install -y -c conda-forge scikit-image tensorboard pycocotools 10 | 11 | ## Install OpenCV with Gstreamer support 12 | #WORKDIR /usr/src 13 | #RUN pip uninstall -y opencv-python 14 | #RUN apt-get update 15 | #RUN apt-get install -y gstreamer1.0-tools gstreamer1.0-python3-dbg-plugin-loader libgstreamer1.0-dev libgstreamer-plugins-base1.0-dev 16 | #RUN git clone https://github.com/opencv/opencv.git && cd opencv && git checkout 4.1.1 && mkdir build 17 | #RUN git clone https://github.com/opencv/opencv_contrib.git && cd opencv_contrib && git checkout 4.1.1 18 | #RUN cd opencv/build && cmake ../ \ 19 | # -D OPENCV_EXTRA_MODULES_PATH=../../opencv_contrib/modules \ 20 | # -D BUILD_OPENCV_PYTHON3=ON \ 21 | # -D PYTHON3_EXECUTABLE=/opt/conda/bin/python \ 22 | # -D PYTHON3_INCLUDE_PATH=/opt/conda/include/python3.6m \ 23 | # -D PYTHON3_LIBRARIES=/opt/conda/lib/python3.6/site-packages \ 24 | # -D WITH_GSTREAMER=ON \ 25 | # -D WITH_FFMPEG=OFF \ 26 | # && make && make install && ldconfig 27 | #RUN cd /usr/local/lib/python3.6/site-packages/cv2/python-3.6/ && mv cv2.cpython-36m-x86_64-linux-gnu.so cv2.so 28 | #RUN cd 
/opt/conda/lib/python3.6/site-packages/ && ln -s /usr/local/lib/python3.6/site-packages/cv2/python-3.6/cv2.so cv2.so 29 | #RUN python3 -c "import cv2; print(cv2.getBuildInformation())" 30 | 31 | # Create working directory 32 | RUN mkdir -p /usr/src/app 33 | WORKDIR /usr/src/app 34 | 35 | # Copy contents 36 | COPY . /usr/src/app 37 | 38 | # Copy weights 39 | #RUN python3 -c "from models import *; \ 40 | #attempt_download('weights/yolov3.pt'); \ 41 | #attempt_download('weights/yolov3-spp.pt')" 42 | 43 | 44 | # --------------------------------------------------- Extras Below --------------------------------------------------- 45 | 46 | # Build and Push 47 | # t=ultralytics/yolov3:v0 && sudo docker build -t $t . && sudo docker push $t 48 | 49 | # Run 50 | # t=ultralytics/yolov3:v0 && sudo docker pull $t && sudo docker run -it --gpus all --ipc=host $t bash 51 | 52 | # Pull and Run with local directory access 53 | # t=ultralytics/yolov3:v0 && sudo docker pull $t && sudo docker run -it --gpus all --ipc=host -v "$(pwd)"/coco:/usr/src/coco $t bash 54 | 55 | # Kill all 56 | # sudo docker kill "$(sudo docker ps -q)" 57 | 58 | # Kill all image-based 59 | # sudo docker kill $(sudo docker ps -a -q --filter ancestor=ultralytics/yolov3:v0) 60 | 61 | # Run bash for loop 62 | # sudo docker run --gpus all --ipc=host ultralytics/yolov3:v0 while true; do python3 train.py --evolve; done 63 | -------------------------------------------------------------------------------- /cfg/yolov3-tiny.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | batch=1 4 | subdivisions=1 5 | # Training 6 | # batch=64 7 | # subdivisions=2 8 | width=416 9 | height=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=16 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | [maxpool] 34 | size=2 35 | stride=2 36 | 37 | [convolutional] 38 | batch_normalize=1 39 | filters=32 40 | size=3 41 | stride=1 42 | pad=1 43 | activation=leaky 44 | 45 | [maxpool] 46 | size=2 47 | stride=2 48 | 49 | [convolutional] 50 | batch_normalize=1 51 | filters=64 52 | size=3 53 | stride=1 54 | pad=1 55 | activation=leaky 56 | 57 | [maxpool] 58 | size=2 59 | stride=2 60 | 61 | [convolutional] 62 | batch_normalize=1 63 | filters=128 64 | size=3 65 | stride=1 66 | pad=1 67 | activation=leaky 68 | 69 | [maxpool] 70 | size=2 71 | stride=2 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=256 76 | size=3 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [maxpool] 82 | size=2 83 | stride=2 84 | 85 | [convolutional] 86 | batch_normalize=1 87 | filters=512 88 | size=3 89 | stride=1 90 | pad=1 91 | activation=leaky 92 | 93 | [maxpool] 94 | size=2 95 | stride=1 96 | 97 | [convolutional] 98 | batch_normalize=1 99 | filters=1024 100 | size=3 101 | stride=1 102 | pad=1 103 | activation=leaky 104 | 105 | ########### 106 | 107 | [convolutional] 108 | batch_normalize=1 109 | filters=256 110 | size=1 111 | stride=1 112 | pad=1 113 | activation=leaky 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=512 118 | size=3 119 | stride=1 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | size=1 125 | stride=1 126 | pad=1 127 | filters=255 128 | activation=linear 129 | 130 | 131 | 132 | [yolo] 133 
| mask = 3,4,5 134 | anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 135 | classes=80 136 | num=6 137 | jitter=.3 138 | ignore_thresh = .7 139 | truth_thresh = 1 140 | random=1 141 | 142 | [route] 143 | layers = -4 144 | 145 | [convolutional] 146 | batch_normalize=1 147 | filters=128 148 | size=1 149 | stride=1 150 | pad=1 151 | activation=leaky 152 | 153 | [upsample] 154 | stride=2 155 | 156 | [route] 157 | layers = -1, 8 158 | 159 | [convolutional] 160 | batch_normalize=1 161 | filters=256 162 | size=3 163 | stride=1 164 | pad=1 165 | activation=leaky 166 | 167 | [convolutional] 168 | size=1 169 | stride=1 170 | pad=1 171 | filters=255 172 | activation=linear 173 | 174 | [yolo] 175 | mask = 1,2,3 176 | anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 177 | classes=80 178 | num=6 179 | jitter=.3 180 | ignore_thresh = .7 181 | truth_thresh = 1 182 | random=1 183 | -------------------------------------------------------------------------------- /data/coco64.txt: -------------------------------------------------------------------------------- 1 | ../coco/images/train2017/000000109622.jpg 2 | ../coco/images/train2017/000000160694.jpg 3 | ../coco/images/train2017/000000308590.jpg 4 | ../coco/images/train2017/000000327573.jpg 5 | ../coco/images/train2017/000000062929.jpg 6 | ../coco/images/train2017/000000512793.jpg 7 | ../coco/images/train2017/000000371735.jpg 8 | ../coco/images/train2017/000000148118.jpg 9 | ../coco/images/train2017/000000309856.jpg 10 | ../coco/images/train2017/000000141882.jpg 11 | ../coco/images/train2017/000000318783.jpg 12 | ../coco/images/train2017/000000337760.jpg 13 | ../coco/images/train2017/000000298197.jpg 14 | ../coco/images/train2017/000000042421.jpg 15 | ../coco/images/train2017/000000328898.jpg 16 | ../coco/images/train2017/000000458856.jpg 17 | ../coco/images/train2017/000000073824.jpg 18 | ../coco/images/train2017/000000252846.jpg 19 | ../coco/images/train2017/000000459590.jpg 20 | ../coco/images/train2017/000000273650.jpg 21 | ../coco/images/train2017/000000331311.jpg 22 | ../coco/images/train2017/000000156326.jpg 23 | ../coco/images/train2017/000000262985.jpg 24 | ../coco/images/train2017/000000253580.jpg 25 | ../coco/images/train2017/000000447976.jpg 26 | ../coco/images/train2017/000000378077.jpg 27 | ../coco/images/train2017/000000259913.jpg 28 | ../coco/images/train2017/000000424553.jpg 29 | ../coco/images/train2017/000000000612.jpg 30 | ../coco/images/train2017/000000267625.jpg 31 | ../coco/images/train2017/000000566012.jpg 32 | ../coco/images/train2017/000000196664.jpg 33 | ../coco/images/train2017/000000363331.jpg 34 | ../coco/images/train2017/000000057992.jpg 35 | ../coco/images/train2017/000000520047.jpg 36 | ../coco/images/train2017/000000453903.jpg 37 | ../coco/images/train2017/000000162083.jpg 38 | ../coco/images/train2017/000000268516.jpg 39 | ../coco/images/train2017/000000277436.jpg 40 | ../coco/images/train2017/000000189744.jpg 41 | ../coco/images/train2017/000000041128.jpg 42 | ../coco/images/train2017/000000527728.jpg 43 | ../coco/images/train2017/000000465269.jpg 44 | ../coco/images/train2017/000000246833.jpg 45 | ../coco/images/train2017/000000076784.jpg 46 | ../coco/images/train2017/000000323715.jpg 47 | ../coco/images/train2017/000000560463.jpg 48 | ../coco/images/train2017/000000006263.jpg 49 | ../coco/images/train2017/000000094701.jpg 50 | ../coco/images/train2017/000000521359.jpg 51 | ../coco/images/train2017/000000302903.jpg 52 | ../coco/images/train2017/000000047559.jpg 53 | ../coco/images/train2017/000000480583.jpg 54 
| ../coco/images/train2017/000000050025.jpg 55 | ../coco/images/train2017/000000084512.jpg 56 | ../coco/images/train2017/000000508913.jpg 57 | ../coco/images/train2017/000000093708.jpg 58 | ../coco/images/train2017/000000070493.jpg 59 | ../coco/images/train2017/000000539270.jpg 60 | ../coco/images/train2017/000000474402.jpg 61 | ../coco/images/train2017/000000209842.jpg 62 | ../coco/images/train2017/000000028820.jpg 63 | ../coco/images/train2017/000000154257.jpg 64 | ../coco/images/train2017/000000342499.jpg 65 | -------------------------------------------------------------------------------- /cfg/yolov3-tiny-1cls.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | batch=1 4 | subdivisions=1 5 | # Training 6 | # batch=64 7 | # subdivisions=2 8 | width=416 9 | height=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=16 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | [maxpool] 34 | size=2 35 | stride=2 36 | 37 | [convolutional] 38 | batch_normalize=1 39 | filters=32 40 | size=3 41 | stride=1 42 | pad=1 43 | activation=leaky 44 | 45 | [maxpool] 46 | size=2 47 | stride=2 48 | 49 | [convolutional] 50 | batch_normalize=1 51 | filters=64 52 | size=3 53 | stride=1 54 | pad=1 55 | activation=leaky 56 | 57 | [maxpool] 58 | size=2 59 | stride=2 60 | 61 | [convolutional] 62 | batch_normalize=1 63 | filters=128 64 | size=3 65 | stride=1 66 | pad=1 67 | activation=leaky 68 | 69 | [maxpool] 70 | size=2 71 | stride=2 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=256 76 | size=3 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [maxpool] 82 | size=2 83 | stride=2 84 | 85 | [convolutional] 86 | batch_normalize=1 87 | filters=512 88 | size=3 89 | stride=1 90 | pad=1 91 | activation=leaky 92 | 93 | [maxpool] 94 | size=2 95 | stride=1 96 | 97 | [convolutional] 98 | batch_normalize=1 99 | filters=1024 100 | size=3 101 | stride=1 102 | pad=1 103 | activation=leaky 104 | 105 | ########### 106 | 107 | [convolutional] 108 | batch_normalize=1 109 | filters=256 110 | size=1 111 | stride=1 112 | pad=1 113 | activation=leaky 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=512 118 | size=3 119 | stride=1 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | size=1 125 | stride=1 126 | pad=1 127 | filters=18 128 | activation=linear 129 | 130 | 131 | 132 | [yolo] 133 | mask = 3,4,5 134 | anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 135 | classes=1 136 | num=6 137 | jitter=.3 138 | ignore_thresh = .7 139 | truth_thresh = 1 140 | random=1 141 | 142 | [route] 143 | layers = -4 144 | 145 | [convolutional] 146 | batch_normalize=1 147 | filters=128 148 | size=1 149 | stride=1 150 | pad=1 151 | activation=leaky 152 | 153 | [upsample] 154 | stride=2 155 | 156 | [route] 157 | layers = -1, 8 158 | 159 | [convolutional] 160 | batch_normalize=1 161 | filters=256 162 | size=3 163 | stride=1 164 | pad=1 165 | activation=leaky 166 | 167 | [convolutional] 168 | size=1 169 | stride=1 170 | pad=1 171 | filters=18 172 | activation=linear 173 | 174 | [yolo] 175 | mask = 0,1,2 176 | anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 177 | classes=1 178 | num=6 179 | jitter=.3 180 | ignore_thresh = .7 181 | truth_thresh = 1 182 
| random=1 183 | -------------------------------------------------------------------------------- /cfg/yolov3-tiny-3cls.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | batch=1 4 | subdivisions=1 5 | # Training 6 | # batch=64 7 | # subdivisions=2 8 | width=416 9 | height=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=16 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | [maxpool] 34 | size=2 35 | stride=2 36 | 37 | [convolutional] 38 | batch_normalize=1 39 | filters=32 40 | size=3 41 | stride=1 42 | pad=1 43 | activation=leaky 44 | 45 | [maxpool] 46 | size=2 47 | stride=2 48 | 49 | [convolutional] 50 | batch_normalize=1 51 | filters=64 52 | size=3 53 | stride=1 54 | pad=1 55 | activation=leaky 56 | 57 | [maxpool] 58 | size=2 59 | stride=2 60 | 61 | [convolutional] 62 | batch_normalize=1 63 | filters=128 64 | size=3 65 | stride=1 66 | pad=1 67 | activation=leaky 68 | 69 | [maxpool] 70 | size=2 71 | stride=2 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=256 76 | size=3 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [maxpool] 82 | size=2 83 | stride=2 84 | 85 | [convolutional] 86 | batch_normalize=1 87 | filters=512 88 | size=3 89 | stride=1 90 | pad=1 91 | activation=leaky 92 | 93 | [maxpool] 94 | size=2 95 | stride=1 96 | 97 | [convolutional] 98 | batch_normalize=1 99 | filters=1024 100 | size=3 101 | stride=1 102 | pad=1 103 | activation=leaky 104 | 105 | ########### 106 | 107 | [convolutional] 108 | batch_normalize=1 109 | filters=256 110 | size=1 111 | stride=1 112 | pad=1 113 | activation=leaky 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=512 118 | size=3 119 | stride=1 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | size=1 125 | stride=1 126 | pad=1 127 | filters=24 128 | activation=linear 129 | 130 | 131 | 132 | [yolo] 133 | mask = 3,4,5 134 | anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 135 | classes=3 136 | num=6 137 | jitter=.3 138 | ignore_thresh = .7 139 | truth_thresh = 1 140 | random=1 141 | 142 | [route] 143 | layers = -4 144 | 145 | [convolutional] 146 | batch_normalize=1 147 | filters=128 148 | size=1 149 | stride=1 150 | pad=1 151 | activation=leaky 152 | 153 | [upsample] 154 | stride=2 155 | 156 | [route] 157 | layers = -1, 8 158 | 159 | [convolutional] 160 | batch_normalize=1 161 | filters=256 162 | size=3 163 | stride=1 164 | pad=1 165 | activation=leaky 166 | 167 | [convolutional] 168 | size=1 169 | stride=1 170 | pad=1 171 | filters=24 172 | activation=linear 173 | 174 | [yolo] 175 | mask = 0,1,2 176 | anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 177 | classes=3 178 | num=6 179 | jitter=.3 180 | ignore_thresh = .7 181 | truth_thresh = 1 182 | random=1 183 | -------------------------------------------------------------------------------- /utils/google_utils.py: -------------------------------------------------------------------------------- 1 | # This file contains google utils: https://cloud.google.com/storage/docs/reference/libraries 2 | # pip install --upgrade google-cloud-storage 3 | 4 | import os 5 | import time 6 | 7 | 8 | # from google.cloud import storage 9 | 10 | 11 | def 
gdrive_download(id='1HaXkef9z6y5l4vUnCYgdmEAj61c6bfWO', name='coco.zip'): 12 | # https://gist.github.com/tanaikech/f0f2d122e05bf5f971611258c22c110f 13 | # Downloads a file from Google Drive, accepting the download-confirmation query presented for large files 14 | # from utils.google_utils import *; gdrive_download() 15 | t = time.time() 16 | 17 | print('Downloading https://drive.google.com/uc?export=download&id=%s as %s... ' % (id, name), end='') 18 | os.remove(name) if os.path.exists(name) else None # remove existing 19 | os.remove('cookie') if os.path.exists('cookie') else None 20 | 21 | # Attempt file download 22 | os.system("curl -c ./cookie -s -L \"https://drive.google.com/uc?export=download&id=%s\" > /dev/null" % id) 23 | if os.path.exists('cookie'): # large file 24 | s = "curl -Lb ./cookie \"https://drive.google.com/uc?export=download&confirm=`awk '/download/ {print $NF}' ./cookie`&id=%s\" -o %s" % ( 25 | id, name) 26 | else: # small file 27 | s = "curl -s -L -o %s 'https://drive.google.com/uc?export=download&id=%s'" % (name, id) 28 | r = os.system(s) # execute, capture return value 29 | os.remove('cookie') if os.path.exists('cookie') else None 30 | 31 | # Error check 32 | if r != 0: 33 | os.remove(name) if os.path.exists(name) else None # remove partial 34 | print('Download error ') # raise Exception('Download error') 35 | return r 36 | 37 | # Unzip if archive 38 | if name.endswith('.zip'): 39 | print('unzipping... ', end='') 40 | os.system('unzip -q %s' % name) # unzip 41 | os.remove(name) # remove zip to free space 42 | 43 | print('Done (%.1fs)' % (time.time() - t)) 44 | return r 45 | 46 | 47 | def upload_blob(bucket_name, source_file_name, destination_blob_name): 48 | # Uploads a file to a bucket (requires the `google.cloud.storage` import commented out above) 49 | # https://cloud.google.com/storage/docs/uploading-objects#storage-upload-object-python 50 | 51 | storage_client = storage.Client() 52 | bucket = storage_client.get_bucket(bucket_name) 53 | blob = bucket.blob(destination_blob_name) 54 | 55 | blob.upload_from_filename(source_file_name) 56 | 57 | print('File {} uploaded to {}.'.format( 58 | source_file_name, 59 | destination_blob_name)) 60 | 61 | 62 | def download_blob(bucket_name, source_blob_name, destination_file_name): 63 | # Downloads a blob from a bucket 64 | storage_client = storage.Client() 65 | bucket = storage_client.get_bucket(bucket_name) 66 | blob = bucket.blob(source_blob_name) 67 | 68 | blob.download_to_filename(destination_file_name) 69 | 70 | print('Blob {} downloaded to {}.'.format( 71 | source_blob_name, 72 | destination_file_name)) 73 | -------------------------------------------------------------------------------- /utils/parse_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | 5 | 6 | def parse_model_cfg(path): 7 | # Parse the yolo *.cfg file and return module definitions. Path may be 'cfg/yolov3.cfg', 'yolov3.cfg', or 'yolov3' 8 | if not path.endswith('.cfg'): # add .cfg suffix if omitted 9 | path += '.cfg' 10 | if not os.path.exists(path) and os.path.exists('cfg' + os.sep + path): # add cfg/ prefix if omitted 11 | path = 'cfg' + os.sep + path 12 | 13 | with open(path, 'r') as f: 14 | lines = f.read().split('\n') 15 | lines = [x for x in lines if x and not x.startswith('#')] 16 | lines = [x.rstrip().lstrip() for x in lines] # strip leading/trailing whitespace 17 | mdefs = [] # module definitions 18 | for line in lines: 19 | if line.startswith('['): # This marks the start of a new block 20 | mdefs.append({}) 21 | mdefs[-1]['type'] = line[1:-1].rstrip() 22 | if
mdefs[-1]['type'] == 'convolutional': 23 | mdefs[-1]['batch_normalize'] = 0 # pre-populate with zeros (may be overwritten later) 24 | else: 25 | key, val = line.split("=") 26 | key = key.rstrip() 27 | 28 | if key == 'anchors': # return nparray 29 | mdefs[-1][key] = np.array([float(x) for x in val.split(',')]).reshape((-1, 2)) # np anchors 30 | elif (key in ['from', 'layers', 'mask']) or (key == 'size' and ',' in val): # return array 31 | mdefs[-1][key] = [int(x) for x in val.split(',')] 32 | else: 33 | val = val.strip() 34 | if val.isnumeric(): # return int or float 35 | mdefs[-1][key] = int(val) if (int(val) - float(val)) == 0 else float(val) 36 | else: 37 | mdefs[-1][key] = val # return string 38 | 39 | # Check all fields are supported 40 | supported = ['type', 'batch_normalize', 'filters', 'size', 'stride', 'pad', 'activation', 'layers', 'groups', 41 | 'from', 'mask', 'anchors', 'classes', 'num', 'jitter', 'ignore_thresh', 'truth_thresh', 'random', 42 | 'stride_x', 'stride_y', 'weights_type', 'weights_normalization', 'scale_x_y', 'beta_nms', 'nms_kind', 43 | 'iou_loss', 'iou_normalizer', 'cls_normalizer', 'iou_thresh'] 44 | 45 | f = [] # fields 46 | for x in mdefs[1:]: 47 | [f.append(k) for k in x if k not in f] 48 | u = [x for x in f if x not in supported] # unsupported fields 49 | assert not any(u), "Unsupported fields %s in %s. See https://github.com/ultralytics/yolov3/issues/631" % (u, path) 50 | 51 | return mdefs 52 | 53 | 54 | def parse_data_cfg(path): 55 | # Parses the data configuration file 56 | if not os.path.exists(path) and os.path.exists('data' + os.sep + path): # add data/ prefix if omitted 57 | path = 'data' + os.sep + path 58 | 59 | with open(path, 'r') as f: 60 | lines = f.readlines() 61 | 62 | options = dict() 63 | for line in lines: 64 | line = line.strip() 65 | if line == '' or line.startswith('#'): 66 | continue 67 | key, val = line.split('=') 68 | options[key.strip()] = val.strip() 69 | 70 | return options 71 | -------------------------------------------------------------------------------- /cfg/yolov3-tiny3-1cls.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | # batch=1 4 | # subdivisions=1 5 | # Training 6 | batch=64 7 | subdivisions=16 8 | width=608 9 | height=608 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 200000 21 | policy=steps 22 | steps=180000,190000 23 | scales=.1,.1 24 | 25 | 26 | [convolutional] 27 | batch_normalize=1 28 | filters=16 29 | size=3 30 | stride=1 31 | pad=1 32 | activation=leaky 33 | 34 | [maxpool] 35 | size=2 36 | stride=2 37 | 38 | [convolutional] 39 | batch_normalize=1 40 | filters=32 41 | size=3 42 | stride=1 43 | pad=1 44 | activation=leaky 45 | 46 | [maxpool] 47 | size=2 48 | stride=2 49 | 50 | [convolutional] 51 | batch_normalize=1 52 | filters=64 53 | size=3 54 | stride=1 55 | pad=1 56 | activation=leaky 57 | 58 | [maxpool] 59 | size=2 60 | stride=2 61 | 62 | [convolutional] 63 | batch_normalize=1 64 | filters=128 65 | size=3 66 | stride=1 67 | pad=1 68 | activation=leaky 69 | 70 | [maxpool] 71 | size=2 72 | stride=2 73 | 74 | [convolutional] 75 | batch_normalize=1 76 | filters=256 77 | size=3 78 | stride=1 79 | pad=1 80 | activation=leaky 81 | 82 | [maxpool] 83 | size=2 84 | stride=2 85 | 86 | [convolutional] 87 | batch_normalize=1 88 | filters=512 89 | size=3 90 | stride=1 91 | pad=1 92 | activation=leaky 93 | 94 | [maxpool] 95 
| size=2 96 | stride=1 97 | 98 | [convolutional] 99 | batch_normalize=1 100 | filters=1024 101 | size=3 102 | stride=1 103 | pad=1 104 | activation=leaky 105 | 106 | ########### 107 | 108 | [convolutional] 109 | batch_normalize=1 110 | filters=256 111 | size=1 112 | stride=1 113 | pad=1 114 | activation=leaky 115 | 116 | [convolutional] 117 | batch_normalize=1 118 | filters=512 119 | size=3 120 | stride=1 121 | pad=1 122 | activation=leaky 123 | 124 | [convolutional] 125 | size=1 126 | stride=1 127 | pad=1 128 | filters=18 129 | activation=linear 130 | 131 | 132 | 133 | [yolo] 134 | mask = 6,7,8 135 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 136 | classes=1 137 | num=9 138 | jitter=.3 139 | ignore_thresh = .7 140 | truth_thresh = 1 141 | random=1 142 | 143 | [route] 144 | layers = -4 145 | 146 | [convolutional] 147 | batch_normalize=1 148 | filters=128 149 | size=1 150 | stride=1 151 | pad=1 152 | activation=leaky 153 | 154 | [upsample] 155 | stride=2 156 | 157 | [route] 158 | layers = -1, 8 159 | 160 | [convolutional] 161 | batch_normalize=1 162 | filters=256 163 | size=3 164 | stride=1 165 | pad=1 166 | activation=leaky 167 | 168 | [convolutional] 169 | size=1 170 | stride=1 171 | pad=1 172 | filters=18 173 | activation=linear 174 | 175 | [yolo] 176 | mask = 3,4,5 177 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 178 | classes=1 179 | num=9 180 | jitter=.3 181 | ignore_thresh = .7 182 | truth_thresh = 1 183 | random=1 184 | 185 | 186 | 187 | [route] 188 | layers = -3 189 | 190 | [convolutional] 191 | batch_normalize=1 192 | filters=128 193 | size=1 194 | stride=1 195 | pad=1 196 | activation=leaky 197 | 198 | [upsample] 199 | stride=2 200 | 201 | [route] 202 | layers = -1, 6 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=3 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | size=1 214 | stride=1 215 | pad=1 216 | filters=18 217 | activation=linear 218 | 219 | [yolo] 220 | mask = 0,1,2 221 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 222 | classes=1 223 | num=9 224 | jitter=.3 225 | ignore_thresh = .7 226 | truth_thresh = 1 227 | random=1 228 | -------------------------------------------------------------------------------- /cfg/yolov3-tiny3.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | # batch=1 4 | # subdivisions=1 5 | # Training 6 | batch=64 7 | subdivisions=16 8 | width=608 9 | height=608 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 200000 21 | policy=steps 22 | steps=180000,190000 23 | scales=.1,.1 24 | 25 | 26 | [convolutional] 27 | batch_normalize=1 28 | filters=16 29 | size=3 30 | stride=1 31 | pad=1 32 | activation=leaky 33 | 34 | [maxpool] 35 | size=2 36 | stride=2 37 | 38 | [convolutional] 39 | batch_normalize=1 40 | filters=32 41 | size=3 42 | stride=1 43 | pad=1 44 | activation=leaky 45 | 46 | [maxpool] 47 | size=2 48 | stride=2 49 | 50 | [convolutional] 51 | batch_normalize=1 52 | filters=64 53 | size=3 54 | stride=1 55 | pad=1 56 | activation=leaky 57 | 58 | [maxpool] 59 | size=2 60 | stride=2 61 | 62 | [convolutional] 63 | batch_normalize=1 64 | filters=128 65 | size=3 66 | stride=1 67 | pad=1 68 | activation=leaky 69 | 70 | [maxpool] 71 | size=2 72 | stride=2 73 | 74 | [convolutional] 75 | 
batch_normalize=1 76 | filters=256 77 | size=3 78 | stride=1 79 | pad=1 80 | activation=leaky 81 | 82 | [maxpool] 83 | size=2 84 | stride=2 85 | 86 | [convolutional] 87 | batch_normalize=1 88 | filters=512 89 | size=3 90 | stride=1 91 | pad=1 92 | activation=leaky 93 | 94 | [maxpool] 95 | size=2 96 | stride=1 97 | 98 | [convolutional] 99 | batch_normalize=1 100 | filters=1024 101 | size=3 102 | stride=1 103 | pad=1 104 | activation=leaky 105 | 106 | ########### 107 | 108 | [convolutional] 109 | batch_normalize=1 110 | filters=256 111 | size=1 112 | stride=1 113 | pad=1 114 | activation=leaky 115 | 116 | [convolutional] 117 | batch_normalize=1 118 | filters=512 119 | size=3 120 | stride=1 121 | pad=1 122 | activation=leaky 123 | 124 | [convolutional] 125 | size=1 126 | stride=1 127 | pad=1 128 | filters=255 129 | activation=linear 130 | 131 | 132 | 133 | [yolo] 134 | mask = 6,7,8 135 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 136 | classes=80 137 | num=9 138 | jitter=.3 139 | ignore_thresh = .7 140 | truth_thresh = 1 141 | random=1 142 | 143 | [route] 144 | layers = -4 145 | 146 | [convolutional] 147 | batch_normalize=1 148 | filters=128 149 | size=1 150 | stride=1 151 | pad=1 152 | activation=leaky 153 | 154 | [upsample] 155 | stride=2 156 | 157 | [route] 158 | layers = -1, 8 159 | 160 | [convolutional] 161 | batch_normalize=1 162 | filters=256 163 | size=3 164 | stride=1 165 | pad=1 166 | activation=leaky 167 | 168 | [convolutional] 169 | size=1 170 | stride=1 171 | pad=1 172 | filters=255 173 | activation=linear 174 | 175 | [yolo] 176 | mask = 3,4,5 177 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 178 | classes=80 179 | num=9 180 | jitter=.3 181 | ignore_thresh = .7 182 | truth_thresh = 1 183 | random=1 184 | 185 | 186 | 187 | [route] 188 | layers = -3 189 | 190 | [convolutional] 191 | batch_normalize=1 192 | filters=128 193 | size=1 194 | stride=1 195 | pad=1 196 | activation=leaky 197 | 198 | [upsample] 199 | stride=2 200 | 201 | [route] 202 | layers = -1, 6 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=3 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | size=1 214 | stride=1 215 | pad=1 216 | filters=255 217 | activation=linear 218 | 219 | [yolo] 220 | mask = 0,1,2 221 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 222 | classes=80 223 | num=9 224 | jitter=.3 225 | ignore_thresh = .7 226 | truth_thresh = 1 227 | random=1 228 | -------------------------------------------------------------------------------- /utils/layers.py: -------------------------------------------------------------------------------- 1 | import torch.nn.functional as F 2 | 3 | from utils.utils import * 4 | 5 | 6 | def make_divisible(v, divisor): 7 | # Function ensures all layers have a channel number that is divisible by 8 8 | # https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py 9 | return math.ceil(v / divisor) * divisor 10 | 11 | 12 | class Flatten(nn.Module): 13 | # Use after nn.AdaptiveAvgPool2d(1) to remove last 2 dimensions 14 | def forward(self, x): 15 | return x.view(x.size(0), -1) 16 | 17 | 18 | class Concat(nn.Module): 19 | # Concatenate a list of tensors along dimension 20 | def __init__(self, dimension=1): 21 | super(Concat, self).__init__() 22 | self.d = dimension 23 | 24 | def forward(self, x): 25 | return torch.cat(x, self.d) 26 | 27 | 28 | class FeatureConcat(nn.Module): 29 | def 
__init__(self, layers): 30 | super(FeatureConcat, self).__init__() 31 | self.layers = layers # layer indices 32 | self.multiple = len(layers) > 1 # multiple layers flag 33 | 34 | def forward(self, x, outputs): 35 | return torch.cat([outputs[i] for i in self.layers], 1) if self.multiple else outputs[self.layers[0]] 36 | 37 | 38 | class WeightedFeatureFusion(nn.Module): # weighted sum of 2 or more layers https://arxiv.org/abs/1911.09070 39 | def __init__(self, layers, weight=False): 40 | super(WeightedFeatureFusion, self).__init__() 41 | self.layers = layers # layer indices 42 | self.weight = weight # apply weights boolean 43 | self.n = len(layers) + 1 # number of layers 44 | if weight: 45 | self.w = nn.Parameter(torch.zeros(self.n), requires_grad=True) # layer weights 46 | 47 | def forward(self, x, outputs): 48 | # Weights 49 | if self.weight: 50 | w = torch.sigmoid(self.w) * (2 / self.n) # sigmoid weights (0-1) 51 | x = x * w[0] 52 | 53 | # Fusion 54 | nx = x.shape[1] # input channels 55 | for i in range(self.n - 1): 56 | a = outputs[self.layers[i]] * w[i + 1] if self.weight else outputs[self.layers[i]] # feature to add 57 | na = a.shape[1] # feature channels 58 | 59 | # Adjust channels 60 | if nx == na: # same shape 61 | x = x + a 62 | elif nx > na: # slice input 63 | x[:, :na] = x[:, :na] + a # or a = nn.ZeroPad2d((0, 0, 0, 0, 0, dc))(a); x = x + a 64 | else: # slice feature 65 | x = x + a[:, :nx] 66 | 67 | return x 68 | 69 | 70 | class MixConv2d(nn.Module): # MixConv: Mixed Depthwise Convolutional Kernels https://arxiv.org/abs/1907.09595 71 | def __init__(self, in_ch, out_ch, k=(3, 5, 7), stride=1, dilation=1, bias=True, method='equal_params'): 72 | super(MixConv2d, self).__init__() 73 | 74 | groups = len(k) 75 | if method == 'equal_ch': # equal channels per group 76 | i = torch.linspace(0, groups - 1E-6, out_ch).floor() # out_ch indices 77 | ch = [(i == g).sum() for g in range(groups)] 78 | else: # 'equal_params': equal parameter count per group 79 | b = [out_ch] + [0] * groups 80 | a = np.eye(groups + 1, groups, k=-1) 81 | a -= np.roll(a, 1, axis=1) 82 | a *= np.array(k) ** 2 83 | a[0] = 1 84 | ch = np.linalg.lstsq(a, b, rcond=None)[0].round().astype(int) # solve for equal weight indices, ax = b 85 | 86 | self.m = nn.ModuleList([nn.Conv2d(in_channels=in_ch, 87 | out_channels=ch[g], 88 | kernel_size=k[g], 89 | stride=stride, 90 | padding=k[g] // 2, # 'same' pad 91 | dilation=dilation, 92 | bias=bias) for g in range(groups)]) 93 | 94 | def forward(self, x): 95 | return torch.cat([m(x) for m in self.m], 1) 96 | 97 | 98 | # Activation functions below ------------------------------------------------------------------------------------------- 99 | class SwishImplementation(torch.autograd.Function): 100 | @staticmethod 101 | def forward(ctx, x): 102 | ctx.save_for_backward(x) 103 | return x * torch.sigmoid(x) 104 | 105 | @staticmethod 106 | def backward(ctx, grad_output): 107 | x = ctx.saved_tensors[0] 108 | sx = torch.sigmoid(x) # sigmoid(ctx) 109 | return grad_output * (sx * (1 + x * (1 - sx))) 110 | 111 | 112 | class MishImplementation(torch.autograd.Function): 113 | @staticmethod 114 | def forward(ctx, x): 115 | ctx.save_for_backward(x) 116 | return x.mul(torch.tanh(F.softplus(x))) # x * tanh(ln(1 + exp(x))) 117 | 118 | @staticmethod 119 | def backward(ctx, grad_output): 120 | x = ctx.saved_tensors[0] 121 | sx = torch.sigmoid(x) 122 | fx = F.softplus(x).tanh() 123 | return grad_output * (fx + x * sx * (1 - fx * fx)) 124 | 125 | 126 | class MemoryEfficientSwish(nn.Module): 127 | def 
forward(self, x): 128 | return SwishImplementation.apply(x) 129 | 130 | 131 | class MemoryEfficientMish(nn.Module): 132 | def forward(self, x): 133 | return MishImplementation.apply(x) 134 | 135 | 136 | class Swish(nn.Module): 137 | def forward(self, x): 138 | return x * torch.sigmoid(x) 139 | 140 | 141 | class HardSwish(nn.Module): # https://arxiv.org/pdf/1905.02244.pdf 142 | def forward(self, x): 143 | return x * F.hardtanh(x + 3, 0., 6., True) / 6. 144 | 145 | 146 | class Mish(nn.Module): # https://github.com/digantamisra98/Mish 147 | def forward(self, x): 148 | return x * F.softplus(x).tanh() 149 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Ultralytics-YOLOv3-Cluster-NMS 4 | ## Cluster-NMS into YOLOv3 PyTorch 5 | Our paper has been accepted by **IEEE Transactions on Cybernetics (TCYB)**. 6 | 7 | #### This is the code for our papers: 8 | - [Distance-IoU Loss: Faster and Better Learning for Bounding Box Regression](https://arxiv.org/abs/1911.08287) 9 | - [Enhancing Geometric Factors in Model Learning and Inference for Object Detection and Instance Segmentation](http://arxiv.org/abs/2005.03572) 10 | 11 | ``` 12 | @Inproceedings{zheng2020diou, 13 | author = {Zheng, Zhaohui and Wang, Ping and Liu, Wei and Li, Jinze and Ye, Rongguang and Ren, Dongwei}, 14 | title = {Distance-IoU Loss: Faster and Better Learning for Bounding Box Regression}, 15 | booktitle = {The AAAI Conference on Artificial Intelligence (AAAI)}, 16 | year = {2020}, 17 | } 18 | 19 | @Article{zheng2021ciou, 20 | author = {Zheng, Zhaohui and Wang, Ping and Ren, Dongwei and Liu, Wei and Ye, Rongguang and Hu, Qinghua and Zuo, Wangmeng}, 21 | title = {Enhancing Geometric Factors in Model Learning and Inference for Object Detection and Instance Segmentation}, 22 | journal = {IEEE Transactions on Cybernetics}, 23 | year = {2021}, 24 | } 25 | ``` 26 | # Introduction 27 | 28 | In this [paper](http://arxiv.org/abs/2005.03572), we propose Complete-IoU (CIoU) loss and Cluster-NMS for enhancing geometric factors in both bounding box regression and Non-Maximum Suppression (NMS), leading to notable gains in average precision (AP) and average recall (AR) without sacrificing inference efficiency. In particular, we consider three geometric factors, i.e., overlap area, normalized central-point distance and aspect ratio, which are crucial for measuring bounding box regression in object detection and instance segmentation. The three geometric factors are then incorporated into CIoU loss for better distinguishing difficult regression cases. Training deep models with CIoU loss yields consistent AP and AR improvements over the widely adopted Ln-norm loss and IoU-based loss. Furthermore, we propose Cluster-NMS, where NMS during inference is done by implicitly clustering detected boxes, and which usually requires fewer iterations. Cluster-NMS is very efficient due to its pure GPU implementation, and geometric factors can be incorporated to improve both AP and AR. In the experiments, CIoU loss and Cluster-NMS have been applied to state-of-the-art instance segmentation (e.g., YOLACT) and object detection (e.g., YOLOv3, SSD and Faster R-CNN) models. 29 | 30 | ### This repo focuses only on NMS improvement, based on https://github.com/ultralytics/yolov3. 31 | 32 | ### See the `non_max_suppression` function of [utils/utils.py](utils/utils.py) for our Cluster-NMS implementation.
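For orientation, the CIoU term described above can be sketched in a few lines of PyTorch. This is a minimal illustration of the formula from the paper, not the repo's implementation; the helper name `bbox_ciou` and the `(x1, y1, x2, y2)` box format are assumptions of this sketch:

```python
import math
import torch


def bbox_ciou(box1, box2, eps=1e-9):
    # box1, box2: (n, 4) tensors in (x1, y1, x2, y2) format (assumed here)
    w1, h1 = box1[:, 2] - box1[:, 0], box1[:, 3] - box1[:, 1]
    w2, h2 = box2[:, 2] - box2[:, 0], box2[:, 3] - box2[:, 1]

    # Geometric factor 1: overlap area (IoU)
    inter = (torch.min(box1[:, 2], box2[:, 2]) - torch.max(box1[:, 0], box2[:, 0])).clamp(0) * \
            (torch.min(box1[:, 3], box2[:, 3]) - torch.max(box1[:, 1], box2[:, 1])).clamp(0)
    union = w1 * h1 + w2 * h2 - inter + eps
    iou = inter / union

    # Geometric factor 2: normalized central-point distance, i.e. squared
    # center distance over the squared diagonal of the smallest enclosing box
    cw = torch.max(box1[:, 2], box2[:, 2]) - torch.min(box1[:, 0], box2[:, 0])
    ch = torch.max(box1[:, 3], box2[:, 3]) - torch.min(box1[:, 1], box2[:, 1])
    c2 = cw ** 2 + ch ** 2 + eps
    rho2 = ((box1[:, 0] + box1[:, 2] - box2[:, 0] - box2[:, 2]) ** 2 +
            (box1[:, 1] + box1[:, 3] - box2[:, 1] - box2[:, 3]) ** 2) / 4

    # Geometric factor 3: aspect-ratio consistency term v with trade-off alpha
    v = (4 / math.pi ** 2) * (torch.atan(w2 / (h2 + eps)) - torch.atan(w1 / (h1 + eps))) ** 2
    with torch.no_grad():
        alpha = v / (1 - iou + v + eps)
    return iou - rho2 / c2 - alpha * v  # CIoU; the loss is 1 - CIoU
```

Treating the trade-off parameter `alpha` as a constant during backpropagation (the `torch.no_grad()` block) is a common implementation choice, not a requirement of the formula.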
33 | 
34 | This directory contains PyTorch YOLOv3 software developed by Ultralytics LLC, and **is freely available for redistribution under the GPL-3.0 license**. For more information please visit https://www.ultralytics.com.
35 | 
36 | # Description
37 | 
38 | The https://github.com/ultralytics/yolov3 repo contains inference and training code for YOLOv3 in PyTorch. The code works on Linux, macOS and Windows. Training is done on the COCO dataset by default: https://cocodataset.org/#home. **Credit to Joseph Redmon for YOLO:** https://pjreddie.com/darknet/yolo/.
39 | 
40 | # Requirements
41 | 
42 | Python 3.7 or later with all packages from `requirements.txt` installed (`pip install -U -r requirements.txt`), including `torch >= 1.5`. Docker images come with all dependencies preinstalled. Docker requirements are:
43 | - Nvidia Driver >= 440.44
44 | - Docker Engine - CE >= 19.03
45 | 
46 | # mAP
47 | 
48 | |Size |COCO mAP<br>@0.5...0.95 |COCO mAP<br>@0.5
49 | --- | --- | ---
50 | YOLOv3-tiny<br>YOLOv3<br>YOLOv3-SPP<br>**[YOLOv3-SPP-ultralytics](https://drive.google.com/open?id=1UcR-zVoMs7DH5dj3N1bswkiQTA4dmKF4)** |320 |14.0<br>28.7<br>30.5<br>**37.7** |29.1<br>51.8<br>52.3<br>**56.8**
51 | YOLOv3-tiny<br>YOLOv3<br>YOLOv3-SPP<br>**[YOLOv3-SPP-ultralytics](https://drive.google.com/open?id=1UcR-zVoMs7DH5dj3N1bswkiQTA4dmKF4)** |416 |16.0<br>31.2<br>33.9<br>**41.2** |33.0<br>55.4<br>56.9<br>**60.6**
52 | YOLOv3-tiny<br>YOLOv3<br>YOLOv3-SPP<br>**[YOLOv3-SPP-ultralytics](https://drive.google.com/open?id=1UcR-zVoMs7DH5dj3N1bswkiQTA4dmKF4)** |512 |16.6<br>32.7<br>35.6<br>**42.6** |34.9<br>57.7<br>59.5<br>**62.4**
53 | YOLOv3-tiny<br>YOLOv3<br>YOLOv3-SPP<br>**[YOLOv3-SPP-ultralytics](https://drive.google.com/open?id=1UcR-zVoMs7DH5dj3N1bswkiQTA4dmKF4)** |608 |16.6<br>33.1<br>37.0<br>**43.1** |35.4<br>58.2<br>60.7<br>**62.8**
54 | 
55 | - mAP@0.5 is run at `--iou-thr 0.5`, mAP@0.5...0.95 at `--iou-thr 0.7`
56 | - Darknet results: https://arxiv.org/abs/1804.02767
57 | 
58 | ## Cluster-NMS
59 | 
60 | #### Hardware
61 | - 2 GTX 1080 Ti
62 | - Intel(R) Core(TM) i7-6850K CPU @ 3.60GHz
63 | 
64 | Evaluation command: `python3 test.py --cfg yolov3-spp.cfg --weights yolov3-spp-ultralytics.pt`
65 | 
66 | AP is reported on `coco 2014 minival`.
67 | 
68 | | Image Size | Model | NMS | FPS | box AP | box AP75 | box AR100 |
69 | |:----:|:-------------:|:------------------------------------:|:----:|:----:|:----:|:----:|
70 | | 608 | YOLOv3-SPP-ultralytics | Fast NMS | 85.5 | 42.2 | 45.1 | 60.1 |
71 | | 608 | YOLOv3-SPP-ultralytics | Original NMS | 14.6 | 42.6 | 45.8 | 62.5 |
72 | | 608 | YOLOv3-SPP-ultralytics | DIoU-NMS | 7.9 | 42.7 | 46.2 | 63.4 |
73 | | 608 | YOLOv3-SPP-ultralytics | Original NMS Torchvision | **95.2** | 42.6 | 45.8 | 62.5 |
74 | | 608 | YOLOv3-SPP-ultralytics | Cluster-NMS | 82.6 | 42.6 | 45.8 | 62.5 |
75 | | 608 | YOLOv3-SPP-ultralytics | Cluster-DIoU-NMS | 76.9 | 42.7 | 46.2 | 63.4 |
76 | | 608 | YOLOv3-SPP-ultralytics | Weighted-NMS | 11.2 | 42.9 | 46.4 | 62.7 |
77 | | 608 | YOLOv3-SPP-ultralytics | Weighted Cluster-NMS | 68.0 | 42.9 | 46.4 | 62.7 |
78 | | 608 | YOLOv3-SPP-ultralytics | Weighted + Cluster-DIoU-NMS | 64.9 | **43.1** | **46.8** | **63.7** |
79 | | 608 | YOLOv3-SPP-ultralytics | Merge + Torchvision NMS | 88.5 | 42.8 | 46.3 | 63.0 |
80 | | 608 | YOLOv3-SPP-ultralytics | Merge + DIoU + Torchvision NMS | 82.5 | 43.0 | 46.6 | 63.2 |
81 | ## Conclusion
82 | 
83 | - Merge NMS is a simplified version of Weighted-NMS: it uses only the score vector to weight box coordinates, rather than combining score and IoU. (Refer to [CAD](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8265304) for the details of Weighted-NMS.)
84 | 
85 | - We further incorporate DIoU into NMS for YOLOv3, which yields higher AP and AR.
86 | 
87 | - Note that Torchvision NMS is the fastest, owing to its CUDA implementation and engineering accelerations (such as computing only the upper-triangular IoU matrix). However, our Cluster-NMS requires fewer iterations and can be further accelerated by the same engineering tricks. Glenn Jocher's Torchvision NMS + Merge was completed almost at the same time as our paper: first run Torchvision NMS, then convert its output to a vector that multiplies the IoU matrix (see the sketch after this list). Also, for Merge NMS the IoU matrix need not be square (`n*n`); it can be `m*n` to save more time, where `m` is the number of boxes that NMS outputs.
88 | 
89 | - Currently, Torchvision NMS uses IoU as its criterion, not DIoU. If we directly replace IoU with DIoU in Original NMS, it costs much more time due to the sequential operation. Cluster-DIoU-NMS significantly speeds up DIoU-NMS and obtains exactly the same result.
90 | 
91 | - Torchvision NMS is a function available in Torchvision >= 0.3, while our Cluster-NMS can be applied to any project that uses an older version of Torchvision, or to other deep learning frameworks, as long as matrix operations are available. **No extra imports, no compilation, fewer iterations, fully GPU-accelerated and better performance**.
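As a rough sketch of the Merge + Torchvision NMS idea above (assuming `xyxy` boxes; the repository's actual logic lives in `non_max_suppression` of [utils/utils.py](utils/utils.py)):

```python
import torch
from torchvision.ops import box_iou, nms  # Torchvision >= 0.3


def merge_nms(boxes, scores, iou_thr=0.5):
    # Illustrative sketch only: boxes are (n, 4) in xyxy format, scores are (n,)
    keep = nms(boxes, scores, iou_thr)               # m indices kept by standard NMS
    overlap = box_iou(boxes[keep], boxes) > iou_thr  # (m, n) mask: no square n*n matrix needed
    weights = overlap.float() * scores[None]         # the score vector weights the coordinates
    merged = weights.mm(boxes) / weights.sum(1, keepdim=True)  # score-weighted box average
    return merged, scores[keep]
```

Since every kept box overlaps itself with IoU 1, each row of `weights` has at least one nonzero entry, so the normalization is always well defined.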
92 | 93 | # Citation 94 | 95 | [![DOI](https://zenodo.org/badge/146165888.svg)](https://zenodo.org/badge/latestdoi/146165888) 96 | -------------------------------------------------------------------------------- /detect.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from sys import platform 3 | 4 | from models import * # set ONNX_EXPORT in models.py 5 | from utils.datasets import * 6 | from utils.utils import * 7 | 8 | 9 | def detect(save_img=False): 10 | img_size = (320, 192) if ONNX_EXPORT else opt.img_size # (320, 192) or (416, 256) or (608, 352) for (height, width) 11 | out, source, weights, half, view_img, save_txt = opt.output, opt.source, opt.weights, opt.half, opt.view_img, opt.save_txt 12 | webcam = source == '0' or source.startswith('rtsp') or source.startswith('http') or source.endswith('.txt') 13 | 14 | # Initialize 15 | device = torch_utils.select_device(device='cpu' if ONNX_EXPORT else opt.device) 16 | if os.path.exists(out): 17 | shutil.rmtree(out) # delete output folder 18 | os.makedirs(out) # make new output folder 19 | 20 | # Initialize model 21 | model = Darknet(opt.cfg, img_size) 22 | 23 | # Load weights 24 | attempt_download(weights) 25 | if weights.endswith('.pt'): # pytorch format 26 | model.load_state_dict(torch.load(weights, map_location=device)['model']) 27 | else: # darknet format 28 | load_darknet_weights(model, weights) 29 | 30 | # Second-stage classifier 31 | classify = False 32 | if classify: 33 | modelc = torch_utils.load_classifier(name='resnet101', n=2) # initialize 34 | modelc.load_state_dict(torch.load('weights/resnet101.pt', map_location=device)['model']) # load weights 35 | modelc.to(device).eval() 36 | 37 | # Eval mode 38 | model.to(device).eval() 39 | 40 | # Fuse Conv2d + BatchNorm2d layers 41 | # model.fuse() 42 | 43 | # Export mode 44 | if ONNX_EXPORT: 45 | model.fuse() 46 | img = torch.zeros((1, 3) + img_size) # (1, 3, 320, 192) 47 | f = opt.weights.replace(opt.weights.split('.')[-1], 'onnx') # *.onnx filename 48 | torch.onnx.export(model, img, f, verbose=False, opset_version=11, 49 | input_names=['images'], output_names=['classes', 'boxes']) 50 | 51 | # Validate exported model 52 | import onnx 53 | model = onnx.load(f) # Load the ONNX model 54 | onnx.checker.check_model(model) # Check that the IR is well formed 55 | print(onnx.helper.printable_graph(model.graph)) # Print a human readable representation of the graph 56 | return 57 | 58 | # Half precision 59 | half = half and device.type != 'cpu' # half precision only supported on CUDA 60 | if half: 61 | model.half() 62 | 63 | # Set Dataloader 64 | vid_path, vid_writer = None, None 65 | if webcam: 66 | view_img = True 67 | torch.backends.cudnn.benchmark = True # set True to speed up constant image size inference 68 | dataset = LoadStreams(source, img_size=img_size) 69 | else: 70 | save_img = True 71 | dataset = LoadImages(source, img_size=img_size) 72 | 73 | # Get names and colors 74 | names = load_classes(opt.names) 75 | colors = [[random.randint(0, 255) for _ in range(3)] for _ in range(len(names))] 76 | 77 | # Run inference 78 | t0 = time.time() 79 | img = torch.zeros((1, 3, img_size, img_size), device=device) # init img 80 | _ = model(img.half() if half else img.float()) if device.type != 'cpu' else None # run once 81 | for path, img, im0s, vid_cap in dataset: 82 | img = torch.from_numpy(img).to(device) 83 | img = img.half() if half else img.float() # uint8 to fp16/32 84 | img /= 255.0 # 0 - 255 to 0.0 - 1.0 85 | if img.ndimension() 
== 3: 86 | img = img.unsqueeze(0) 87 | 88 | # Inference 89 | t1 = torch_utils.time_synchronized() 90 | pred = model(img, augment=opt.augment)[0] 91 | t2 = torch_utils.time_synchronized() 92 | 93 | # to float 94 | if half: 95 | pred = pred.float() 96 | 97 | # Apply NMS 98 | pred = non_max_suppression(pred, opt.conf_thres, opt.iou_thres, 99 | multi_label=False, classes=opt.classes, agnostic=opt.agnostic_nms) 100 | 101 | # Apply Classifier 102 | if classify: 103 | pred = apply_classifier(pred, modelc, img, im0s) 104 | 105 | # Process detections 106 | for i, det in enumerate(pred): # detections per image 107 | if webcam: # batch_size >= 1 108 | p, s, im0 = path[i], '%g: ' % i, im0s[i] 109 | else: 110 | p, s, im0 = path, '', im0s 111 | 112 | save_path = str(Path(out) / Path(p).name) 113 | s += '%gx%g ' % img.shape[2:] # print string 114 | if det is not None and len(det): 115 | # Rescale boxes from img_size to im0 size 116 | det[:, :4] = scale_coords(img.shape[2:], det[:, :4], im0.shape).round() 117 | 118 | # Print results 119 | for c in det[:, -1].unique(): 120 | n = (det[:, -1] == c).sum() # detections per class 121 | s += '%g %ss, ' % (n, names[int(c)]) # add to string 122 | 123 | # Write results 124 | for *xyxy, conf, cls in det: 125 | if save_txt: # Write to file 126 | with open(save_path + '.txt', 'a') as file: 127 | file.write(('%g ' * 6 + '\n') % (*xyxy, cls, conf)) 128 | 129 | if save_img or view_img: # Add bbox to image 130 | label = '%s %.2f' % (names[int(cls)], conf) 131 | plot_one_box(xyxy, im0, label=label, color=colors[int(cls)]) 132 | 133 | # Print time (inference + NMS) 134 | print('%sDone. (%.3fs)' % (s, t2 - t1)) 135 | 136 | # Stream results 137 | if view_img: 138 | cv2.imshow(p, im0) 139 | if cv2.waitKey(1) == ord('q'): # q to quit 140 | raise StopIteration 141 | 142 | # Save results (image with detections) 143 | if save_img: 144 | if dataset.mode == 'images': 145 | cv2.imwrite(save_path, im0) 146 | else: 147 | if vid_path != save_path: # new video 148 | vid_path = save_path 149 | if isinstance(vid_writer, cv2.VideoWriter): 150 | vid_writer.release() # release previous video writer 151 | 152 | fps = vid_cap.get(cv2.CAP_PROP_FPS) 153 | w = int(vid_cap.get(cv2.CAP_PROP_FRAME_WIDTH)) 154 | h = int(vid_cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) 155 | vid_writer = cv2.VideoWriter(save_path, cv2.VideoWriter_fourcc(*opt.fourcc), fps, (w, h)) 156 | vid_writer.write(im0) 157 | 158 | if save_txt or save_img: 159 | print('Results saved to %s' % os.getcwd() + os.sep + out) 160 | if platform == 'darwin': # MacOS 161 | os.system('open ' + save_path) 162 | 163 | print('Done. 
(%.3fs)' % (time.time() - t0))
164 | 
165 | 
166 | if __name__ == '__main__':
167 |     parser = argparse.ArgumentParser()
168 |     parser.add_argument('--cfg', type=str, default='cfg/yolov3-spp.cfg', help='*.cfg path')
169 |     parser.add_argument('--names', type=str, default='data/coco.names', help='*.names path')
170 |     parser.add_argument('--weights', type=str, default='weights/yolov3-spp-ultralytics.pt', help='weights path')
171 |     parser.add_argument('--source', type=str, default='data/samples', help='source')  # input file/folder, 0 for webcam
172 |     parser.add_argument('--output', type=str, default='output', help='output folder')  # output folder
173 |     parser.add_argument('--img-size', type=int, default=512, help='inference size (pixels)')
174 |     parser.add_argument('--conf-thres', type=float, default=0.3, help='object confidence threshold')
175 |     parser.add_argument('--iou-thres', type=float, default=0.6, help='IOU threshold for NMS')
176 |     parser.add_argument('--fourcc', type=str, default='mp4v', help='output video codec (verify ffmpeg support)')
177 |     parser.add_argument('--half', action='store_true', help='half precision FP16 inference')
178 |     parser.add_argument('--device', default='', help='device id (i.e. 0 or 0,1) or cpu')
179 |     parser.add_argument('--view-img', action='store_true', help='display results')
180 |     parser.add_argument('--save-txt', action='store_true', help='save results to *.txt')
181 |     parser.add_argument('--classes', nargs='+', type=int, help='filter by class')
182 |     parser.add_argument('--agnostic-nms', action='store_true', help='class-agnostic NMS')
183 |     parser.add_argument('--augment', action='store_true', help='augmented inference')
184 |     opt = parser.parse_args()
185 |     print(opt)
186 | 
187 |     with torch.no_grad():
188 |         detect()
189 | 
--------------------------------------------------------------------------------
/utils/torch_utils.py:
--------------------------------------------------------------------------------
1 | import math
2 | import os
3 | import time
4 | from copy import deepcopy
5 | 
6 | import torch
7 | import torch.backends.cudnn as cudnn
8 | import torch.nn as nn
9 | import torch.nn.functional as F
10 | 
11 | 
12 | def init_seeds(seed=0):
13 |     torch.manual_seed(seed)
14 | 
15 |     # Remove randomness (may be slower on Tesla GPUs)  # https://pytorch.org/docs/stable/notes/randomness.html
16 |     if seed == 0:
17 |         cudnn.deterministic = True
18 |         cudnn.benchmark = False
19 | 
20 | 
21 | def select_device(device='', apex=False, batch_size=None):
22 |     # device = 'cpu' or '0' or '0,1,2,3'
23 |     cpu_request = device.lower() == 'cpu'
24 |     if device and not cpu_request:  # if device requested other than 'cpu'
25 |         os.environ['CUDA_VISIBLE_DEVICES'] = device  # set environment variable
26 |         assert torch.cuda.is_available(), 'CUDA unavailable, invalid device %s requested' % device  # check availability
27 | 
28 |     cuda = False if cpu_request else torch.cuda.is_available()
29 |     if cuda:
30 |         c = 1024 ** 2  # bytes to MB
31 |         ng = torch.cuda.device_count()
32 |         if ng > 1 and batch_size:  # check that batch_size is compatible with device_count
33 |             assert batch_size % ng == 0, 'batch-size %g not multiple of GPU count %g' % (batch_size, ng)
34 |         x = [torch.cuda.get_device_properties(i) for i in range(ng)]
35 |         s = 'Using CUDA ' + ('Apex ' if apex else '')  # apex for mixed precision https://github.com/NVIDIA/apex
36 |         for i in range(0, ng):
37 |             if i == 1:
38 |                 s = ' ' * len(s)
39 |             print("%sdevice%g _CudaDeviceProperties(name='%s', total_memory=%dMB)" %
40 |                   (s, i, x[i].name,
x[i].total_memory / c)) 41 | else: 42 | print('Using CPU') 43 | 44 | print('') # skip a line 45 | return torch.device('cuda:0' if cuda else 'cpu') 46 | 47 | 48 | def time_synchronized(): 49 | torch.cuda.synchronize() if torch.cuda.is_available() else None 50 | return time.time() 51 | 52 | 53 | def initialize_weights(model): 54 | for m in model.modules(): 55 | t = type(m) 56 | if t is nn.Conv2d: 57 | pass # nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 58 | elif t is nn.BatchNorm2d: 59 | m.eps = 1e-4 60 | m.momentum = 0.03 61 | elif t in [nn.LeakyReLU, nn.ReLU, nn.ReLU6]: 62 | m.inplace = True 63 | 64 | 65 | def find_modules(model, mclass=nn.Conv2d): 66 | # finds layer indices matching module class 'mclass' 67 | return [i for i, m in enumerate(model.module_list) if isinstance(m, mclass)] 68 | 69 | 70 | def fuse_conv_and_bn(conv, bn): 71 | # https://tehnokv.com/posts/fusing-batchnorm-and-conv/ 72 | with torch.no_grad(): 73 | # init 74 | fusedconv = torch.nn.Conv2d(conv.in_channels, 75 | conv.out_channels, 76 | kernel_size=conv.kernel_size, 77 | stride=conv.stride, 78 | padding=conv.padding, 79 | bias=True) 80 | 81 | # prepare filters 82 | w_conv = conv.weight.clone().view(conv.out_channels, -1) 83 | w_bn = torch.diag(bn.weight.div(torch.sqrt(bn.eps + bn.running_var))) 84 | fusedconv.weight.copy_(torch.mm(w_bn, w_conv).view(fusedconv.weight.size())) 85 | 86 | # prepare spatial bias 87 | if conv.bias is not None: 88 | b_conv = conv.bias 89 | else: 90 | b_conv = torch.zeros(conv.weight.size(0)) 91 | b_bn = bn.bias - bn.weight.mul(bn.running_mean).div(torch.sqrt(bn.running_var + bn.eps)) 92 | fusedconv.bias.copy_(torch.mm(w_bn, b_conv.reshape(-1, 1)).reshape(-1) + b_bn) 93 | 94 | return fusedconv 95 | 96 | 97 | def model_info(model, verbose=False): 98 | # Plots a line-by-line description of a PyTorch model 99 | n_p = sum(x.numel() for x in model.parameters()) # number parameters 100 | n_g = sum(x.numel() for x in model.parameters() if x.requires_grad) # number gradients 101 | if verbose: 102 | print('%5s %40s %9s %12s %20s %10s %10s' % ('layer', 'name', 'gradient', 'parameters', 'shape', 'mu', 'sigma')) 103 | for i, (name, p) in enumerate(model.named_parameters()): 104 | name = name.replace('module_list.', '') 105 | print('%5g %40s %9s %12g %20s %10.3g %10.3g' % 106 | (i, name, p.requires_grad, p.numel(), list(p.shape), p.mean(), p.std())) 107 | 108 | try: # FLOPS 109 | from thop import profile 110 | macs, _ = profile(model, inputs=(torch.zeros(1, 3, 480, 640),), verbose=False) 111 | fs = ', %.1f GFLOPS' % (macs / 1E9 * 2) 112 | except: 113 | fs = '' 114 | 115 | print('Model Summary: %g layers, %g parameters, %g gradients%s' % (len(list(model.parameters())), n_p, n_g, fs)) 116 | 117 | 118 | def load_classifier(name='resnet101', n=2): 119 | # Loads a pretrained model reshaped to n-class output 120 | import pretrainedmodels # https://github.com/Cadene/pretrained-models.pytorch#torchvision 121 | model = pretrainedmodels.__dict__[name](num_classes=1000, pretrained='imagenet') 122 | 123 | # Display model properties 124 | for x in ['model.input_size', 'model.input_space', 'model.input_range', 'model.mean', 'model.std']: 125 | print(x + ' =', eval(x)) 126 | 127 | # Reshape output to n classes 128 | filters = model.last_linear.weight.shape[1] 129 | model.last_linear.bias = torch.nn.Parameter(torch.zeros(n)) 130 | model.last_linear.weight = torch.nn.Parameter(torch.zeros(n, filters)) 131 | model.last_linear.out_features = n 132 | return model 133 | 134 | 135 | def scale_img(img, 
ratio=1.0, same_shape=True):  # img(16,3,256,416), r=ratio
136 |     # scales img(bs,3,y,x) by ratio
137 |     h, w = img.shape[2:]
138 |     s = (int(h * ratio), int(w * ratio))  # new size
139 |     img = F.interpolate(img, size=s, mode='bilinear', align_corners=False)  # resize
140 |     if not same_shape:  # pad/crop img
141 |         gs = 64  # (pixels) grid size
142 |         h, w = [math.ceil(x * ratio / gs) * gs for x in (h, w)]
143 |     return F.pad(img, [0, w - s[1], 0, h - s[0]], value=0.447)  # value = imagenet mean
144 | 
145 | 
146 | class ModelEMA:
147 |     """ Model Exponential Moving Average from https://github.com/rwightman/pytorch-image-models
148 |     Keeps a moving average of everything in the model state_dict (parameters and buffers).
149 |     This is intended to allow functionality like
150 |     https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage
151 |     A smoothed version of the weights is necessary for some training schemes to perform well.
152 |     E.g. Google's hyper-params for training MNASNet, MobileNet-V3, EfficientNet, etc. that use
153 |     RMSprop with a short 2.4-3 epoch decay period and a slow LR decay rate of .96-.99 require EMA
154 |     smoothing of weights to match results. Pay attention to the decay constant you are using
155 |     relative to your update count per epoch.
156 |     To keep EMA from using GPU resources, set device='cpu'. This will save a bit of memory but
157 |     disable validation of the EMA weights. Validation will have to be done manually in a separate
158 |     process, or after training stops converging.
159 |     This class is sensitive to where it is initialized in the sequence of model init,
160 |     GPU assignment and distributed training wrappers.
161 |     I've tested with the sequence in my own train.py for torch.DataParallel, apex.DDP, and single-GPU.
162 |     """
163 | 
164 |     def __init__(self, model, decay=0.9999, device=''):
165 |         # make a copy of the model for accumulating the moving average of weights
166 |         self.ema = deepcopy(model)
167 |         self.ema.eval()
168 |         self.updates = 0  # number of EMA updates
169 |         self.decay = lambda x: decay * (1 - math.exp(-x / 2000))  # decay exponential ramp (to help early epochs)
170 |         self.device = device  # perform ema on a different device from model if set
171 |         if device:
172 |             self.ema.to(device=device)
173 |         for p in self.ema.parameters():
174 |             p.requires_grad_(False)
175 | 
176 |     def update(self, model):
177 |         self.updates += 1
178 |         d = self.decay(self.updates)
179 |         with torch.no_grad():
180 |             if type(model) in (nn.parallel.DataParallel, nn.parallel.DistributedDataParallel):
181 |                 msd, esd = model.module.state_dict(), self.ema.module.state_dict()
182 |             else:
183 |                 msd, esd = model.state_dict(), self.ema.state_dict()
184 | 
185 |             for k, v in esd.items():
186 |                 if v.dtype.is_floating_point:
187 |                     v *= d
188 |                     v += (1. - d) * msd[k].detach()
189 | 
190 |     def update_attr(self, model):
191 |         # Assign attributes (which may change during training)
192 |         for k in model.__dict__.keys():
193 |             if not k.startswith('_'):
194 |                 setattr(self.ema, k, getattr(model, k))
195 | 
--------------------------------------------------------------------------------
/utils/adabound.py:
--------------------------------------------------------------------------------
1 | import math
2 | 
3 | import torch
4 | from torch.optim.optimizer import Optimizer
5 | 
6 | 
7 | class AdaBound(Optimizer):
8 |     """Implements AdaBound algorithm.
9 |     It has been proposed in `Adaptive Gradient Methods with Dynamic Bound of Learning Rate`_.
10 | Arguments: 11 | params (iterable): iterable of parameters to optimize or dicts defining 12 | parameter groups 13 | lr (float, optional): Adam learning rate (default: 1e-3) 14 | betas (Tuple[float, float], optional): coefficients used for computing 15 | running averages of gradient and its square (default: (0.9, 0.999)) 16 | final_lr (float, optional): final (SGD) learning rate (default: 0.1) 17 | gamma (float, optional): convergence speed of the bound functions (default: 1e-3) 18 | eps (float, optional): term added to the denominator to improve 19 | numerical stability (default: 1e-8) 20 | weight_decay (float, optional): weight decay (L2 penalty) (default: 0) 21 | amsbound (boolean, optional): whether to use the AMSBound variant of this algorithm 22 | .. Adaptive Gradient Methods with Dynamic Bound of Learning Rate: 23 | https://openreview.net/forum?id=Bkg3g2R9FX 24 | """ 25 | 26 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), final_lr=0.1, gamma=1e-3, 27 | eps=1e-8, weight_decay=0, amsbound=False): 28 | if not 0.0 <= lr: 29 | raise ValueError("Invalid learning rate: {}".format(lr)) 30 | if not 0.0 <= eps: 31 | raise ValueError("Invalid epsilon value: {}".format(eps)) 32 | if not 0.0 <= betas[0] < 1.0: 33 | raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) 34 | if not 0.0 <= betas[1] < 1.0: 35 | raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) 36 | if not 0.0 <= final_lr: 37 | raise ValueError("Invalid final learning rate: {}".format(final_lr)) 38 | if not 0.0 <= gamma < 1.0: 39 | raise ValueError("Invalid gamma parameter: {}".format(gamma)) 40 | defaults = dict(lr=lr, betas=betas, final_lr=final_lr, gamma=gamma, eps=eps, 41 | weight_decay=weight_decay, amsbound=amsbound) 42 | super(AdaBound, self).__init__(params, defaults) 43 | 44 | self.base_lrs = list(map(lambda group: group['lr'], self.param_groups)) 45 | 46 | def __setstate__(self, state): 47 | super(AdaBound, self).__setstate__(state) 48 | for group in self.param_groups: 49 | group.setdefault('amsbound', False) 50 | 51 | def step(self, closure=None): 52 | """Performs a single optimization step. 53 | Arguments: 54 | closure (callable, optional): A closure that reevaluates the model 55 | and returns the loss. 56 | """ 57 | loss = None 58 | if closure is not None: 59 | loss = closure() 60 | 61 | for group, base_lr in zip(self.param_groups, self.base_lrs): 62 | for p in group['params']: 63 | if p.grad is None: 64 | continue 65 | grad = p.grad.data 66 | if grad.is_sparse: 67 | raise RuntimeError( 68 | 'Adam does not support sparse gradients, please consider SparseAdam instead') 69 | amsbound = group['amsbound'] 70 | 71 | state = self.state[p] 72 | 73 | # State initialization 74 | if len(state) == 0: 75 | state['step'] = 0 76 | # Exponential moving average of gradient values 77 | state['exp_avg'] = torch.zeros_like(p.data) 78 | # Exponential moving average of squared gradient values 79 | state['exp_avg_sq'] = torch.zeros_like(p.data) 80 | if amsbound: 81 | # Maintains max of all exp. moving avg. of sq. grad. 
values 82 | state['max_exp_avg_sq'] = torch.zeros_like(p.data) 83 | 84 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 85 | if amsbound: 86 | max_exp_avg_sq = state['max_exp_avg_sq'] 87 | beta1, beta2 = group['betas'] 88 | 89 | state['step'] += 1 90 | 91 | if group['weight_decay'] != 0: 92 | grad = grad.add(group['weight_decay'], p.data) 93 | 94 | # Decay the first and second moment running average coefficient 95 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 96 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 97 | if amsbound: 98 | # Maintains the maximum of all 2nd moment running avg. till now 99 | torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq) 100 | # Use the max. for normalizing running avg. of gradient 101 | denom = max_exp_avg_sq.sqrt().add_(group['eps']) 102 | else: 103 | denom = exp_avg_sq.sqrt().add_(group['eps']) 104 | 105 | bias_correction1 = 1 - beta1 ** state['step'] 106 | bias_correction2 = 1 - beta2 ** state['step'] 107 | step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1 108 | 109 | # Applies bounds on actual learning rate 110 | # lr_scheduler cannot affect final_lr, this is a workaround to apply lr decay 111 | final_lr = group['final_lr'] * group['lr'] / base_lr 112 | lower_bound = final_lr * (1 - 1 / (group['gamma'] * state['step'] + 1)) 113 | upper_bound = final_lr * (1 + 1 / (group['gamma'] * state['step'])) 114 | step_size = torch.full_like(denom, step_size) 115 | step_size.div_(denom).clamp_(lower_bound, upper_bound).mul_(exp_avg) 116 | 117 | p.data.add_(-step_size) 118 | 119 | return loss 120 | 121 | 122 | class AdaBoundW(Optimizer): 123 | """Implements AdaBound algorithm with Decoupled Weight Decay (arxiv.org/abs/1711.05101) 124 | It has been proposed in `Adaptive Gradient Methods with Dynamic Bound of Learning Rate`_. 125 | Arguments: 126 | params (iterable): iterable of parameters to optimize or dicts defining 127 | parameter groups 128 | lr (float, optional): Adam learning rate (default: 1e-3) 129 | betas (Tuple[float, float], optional): coefficients used for computing 130 | running averages of gradient and its square (default: (0.9, 0.999)) 131 | final_lr (float, optional): final (SGD) learning rate (default: 0.1) 132 | gamma (float, optional): convergence speed of the bound functions (default: 1e-3) 133 | eps (float, optional): term added to the denominator to improve 134 | numerical stability (default: 1e-8) 135 | weight_decay (float, optional): weight decay (L2 penalty) (default: 0) 136 | amsbound (boolean, optional): whether to use the AMSBound variant of this algorithm 137 | .. 
Adaptive Gradient Methods with Dynamic Bound of Learning Rate: 138 | https://openreview.net/forum?id=Bkg3g2R9FX 139 | """ 140 | 141 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), final_lr=0.1, gamma=1e-3, 142 | eps=1e-8, weight_decay=0, amsbound=False): 143 | if not 0.0 <= lr: 144 | raise ValueError("Invalid learning rate: {}".format(lr)) 145 | if not 0.0 <= eps: 146 | raise ValueError("Invalid epsilon value: {}".format(eps)) 147 | if not 0.0 <= betas[0] < 1.0: 148 | raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) 149 | if not 0.0 <= betas[1] < 1.0: 150 | raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) 151 | if not 0.0 <= final_lr: 152 | raise ValueError("Invalid final learning rate: {}".format(final_lr)) 153 | if not 0.0 <= gamma < 1.0: 154 | raise ValueError("Invalid gamma parameter: {}".format(gamma)) 155 | defaults = dict(lr=lr, betas=betas, final_lr=final_lr, gamma=gamma, eps=eps, 156 | weight_decay=weight_decay, amsbound=amsbound) 157 | super(AdaBoundW, self).__init__(params, defaults) 158 | 159 | self.base_lrs = list(map(lambda group: group['lr'], self.param_groups)) 160 | 161 | def __setstate__(self, state): 162 | super(AdaBoundW, self).__setstate__(state) 163 | for group in self.param_groups: 164 | group.setdefault('amsbound', False) 165 | 166 | def step(self, closure=None): 167 | """Performs a single optimization step. 168 | Arguments: 169 | closure (callable, optional): A closure that reevaluates the model 170 | and returns the loss. 171 | """ 172 | loss = None 173 | if closure is not None: 174 | loss = closure() 175 | 176 | for group, base_lr in zip(self.param_groups, self.base_lrs): 177 | for p in group['params']: 178 | if p.grad is None: 179 | continue 180 | grad = p.grad.data 181 | if grad.is_sparse: 182 | raise RuntimeError( 183 | 'Adam does not support sparse gradients, please consider SparseAdam instead') 184 | amsbound = group['amsbound'] 185 | 186 | state = self.state[p] 187 | 188 | # State initialization 189 | if len(state) == 0: 190 | state['step'] = 0 191 | # Exponential moving average of gradient values 192 | state['exp_avg'] = torch.zeros_like(p.data) 193 | # Exponential moving average of squared gradient values 194 | state['exp_avg_sq'] = torch.zeros_like(p.data) 195 | if amsbound: 196 | # Maintains max of all exp. moving avg. of sq. grad. values 197 | state['max_exp_avg_sq'] = torch.zeros_like(p.data) 198 | 199 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 200 | if amsbound: 201 | max_exp_avg_sq = state['max_exp_avg_sq'] 202 | beta1, beta2 = group['betas'] 203 | 204 | state['step'] += 1 205 | 206 | # Decay the first and second moment running average coefficient 207 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 208 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 209 | if amsbound: 210 | # Maintains the maximum of all 2nd moment running avg. till now 211 | torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq) 212 | # Use the max. for normalizing running avg. 
of gradient 213 | denom = max_exp_avg_sq.sqrt().add_(group['eps']) 214 | else: 215 | denom = exp_avg_sq.sqrt().add_(group['eps']) 216 | 217 | bias_correction1 = 1 - beta1 ** state['step'] 218 | bias_correction2 = 1 - beta2 ** state['step'] 219 | step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1 220 | 221 | # Applies bounds on actual learning rate 222 | # lr_scheduler cannot affect final_lr, this is a workaround to apply lr decay 223 | final_lr = group['final_lr'] * group['lr'] / base_lr 224 | lower_bound = final_lr * (1 - 1 / (group['gamma'] * state['step'] + 1)) 225 | upper_bound = final_lr * (1 + 1 / (group['gamma'] * state['step'])) 226 | step_size = torch.full_like(denom, step_size) 227 | step_size.div_(denom).clamp_(lower_bound, upper_bound).mul_(exp_avg) 228 | 229 | if group['weight_decay'] != 0: 230 | decayed_weights = torch.mul(p.data, group['weight_decay']) 231 | p.data.add_(-step_size) 232 | p.data.sub_(decayed_weights) 233 | else: 234 | p.data.add_(-step_size) 235 | 236 | return loss 237 | -------------------------------------------------------------------------------- /cfg/yolov3-1cls.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | #batch=1 4 | #subdivisions=1 5 | # Training 6 | batch=16 7 | subdivisions=1 8 | width=416 9 | height=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 | activation=leaky 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=32 46 | size=1 47 | stride=1 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=leaky 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [shortcut] 110 | from=-3 111 | activation=linear 112 | 113 | # Downsample 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=256 118 | size=3 119 | stride=2 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=128 126 | size=1 127 | stride=1 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 
149 | activation=leaky 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | activation=leaky 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 188 | pad=1 189 | activation=leaky 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | [convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | stride=1 229 | pad=1 230 | activation=leaky 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=leaky 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | activation=leaky 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | stride=1 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [shortcut] 332 | from=-3 333 | activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=leaky 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=leaky 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | 
size=3 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=leaky 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [shortcut] 394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | activation=leaky 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [shortcut] 415 | from=-3 416 | activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | [convolutional] 576 | batch_normalize=1 577 | size=3 578 | stride=1 579 | pad=1 580 | filters=1024 581 | activation=leaky 582 | 583 | [convolutional] 584 | batch_normalize=1 585 | filters=512 586 | 
size=1 587 | stride=1 588 | pad=1 589 | activation=leaky 590 | 591 | [convolutional] 592 | batch_normalize=1 593 | size=3 594 | stride=1 595 | pad=1 596 | filters=1024 597 | activation=leaky 598 | 599 | [convolutional] 600 | size=1 601 | stride=1 602 | pad=1 603 | filters=18 604 | activation=linear 605 | 606 | 607 | [yolo] 608 | mask = 6,7,8 609 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 610 | classes=1 611 | num=9 612 | jitter=.3 613 | ignore_thresh = .7 614 | truth_thresh = 1 615 | random=1 616 | 617 | 618 | [route] 619 | layers = -4 620 | 621 | [convolutional] 622 | batch_normalize=1 623 | filters=256 624 | size=1 625 | stride=1 626 | pad=1 627 | activation=leaky 628 | 629 | [upsample] 630 | stride=2 631 | 632 | [route] 633 | layers = -1, 61 634 | 635 | 636 | 637 | [convolutional] 638 | batch_normalize=1 639 | filters=256 640 | size=1 641 | stride=1 642 | pad=1 643 | activation=leaky 644 | 645 | [convolutional] 646 | batch_normalize=1 647 | size=3 648 | stride=1 649 | pad=1 650 | filters=512 651 | activation=leaky 652 | 653 | [convolutional] 654 | batch_normalize=1 655 | filters=256 656 | size=1 657 | stride=1 658 | pad=1 659 | activation=leaky 660 | 661 | [convolutional] 662 | batch_normalize=1 663 | size=3 664 | stride=1 665 | pad=1 666 | filters=512 667 | activation=leaky 668 | 669 | [convolutional] 670 | batch_normalize=1 671 | filters=256 672 | size=1 673 | stride=1 674 | pad=1 675 | activation=leaky 676 | 677 | [convolutional] 678 | batch_normalize=1 679 | size=3 680 | stride=1 681 | pad=1 682 | filters=512 683 | activation=leaky 684 | 685 | [convolutional] 686 | size=1 687 | stride=1 688 | pad=1 689 | filters=18 690 | activation=linear 691 | 692 | 693 | [yolo] 694 | mask = 3,4,5 695 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 696 | classes=1 697 | num=9 698 | jitter=.3 699 | ignore_thresh = .7 700 | truth_thresh = 1 701 | random=1 702 | 703 | 704 | 705 | [route] 706 | layers = -4 707 | 708 | [convolutional] 709 | batch_normalize=1 710 | filters=128 711 | size=1 712 | stride=1 713 | pad=1 714 | activation=leaky 715 | 716 | [upsample] 717 | stride=2 718 | 719 | [route] 720 | layers = -1, 36 721 | 722 | 723 | 724 | [convolutional] 725 | batch_normalize=1 726 | filters=128 727 | size=1 728 | stride=1 729 | pad=1 730 | activation=leaky 731 | 732 | [convolutional] 733 | batch_normalize=1 734 | size=3 735 | stride=1 736 | pad=1 737 | filters=256 738 | activation=leaky 739 | 740 | [convolutional] 741 | batch_normalize=1 742 | filters=128 743 | size=1 744 | stride=1 745 | pad=1 746 | activation=leaky 747 | 748 | [convolutional] 749 | batch_normalize=1 750 | size=3 751 | stride=1 752 | pad=1 753 | filters=256 754 | activation=leaky 755 | 756 | [convolutional] 757 | batch_normalize=1 758 | filters=128 759 | size=1 760 | stride=1 761 | pad=1 762 | activation=leaky 763 | 764 | [convolutional] 765 | batch_normalize=1 766 | size=3 767 | stride=1 768 | pad=1 769 | filters=256 770 | activation=leaky 771 | 772 | [convolutional] 773 | size=1 774 | stride=1 775 | pad=1 776 | filters=18 777 | activation=linear 778 | 779 | 780 | [yolo] 781 | mask = 0,1,2 782 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 783 | classes=1 784 | num=9 785 | jitter=.3 786 | ignore_thresh = .7 787 | truth_thresh = 1 788 | random=1 789 | -------------------------------------------------------------------------------- /cfg/yolov3.cfg: -------------------------------------------------------------------------------- 1 | 
[net] 2 | # Testing 3 | #batch=1 4 | #subdivisions=1 5 | # Training 6 | batch=16 7 | subdivisions=1 8 | width=416 9 | height=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 | activation=leaky 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=32 46 | size=1 47 | stride=1 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=leaky 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [shortcut] 110 | from=-3 111 | activation=linear 112 | 113 | # Downsample 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=256 118 | size=3 119 | stride=2 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=128 126 | size=1 127 | stride=1 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 149 | activation=leaky 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | activation=leaky 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 188 | pad=1 189 | activation=leaky 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | 
[convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | stride=1 229 | pad=1 230 | activation=leaky 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=leaky 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | activation=leaky 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | stride=1 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [shortcut] 332 | from=-3 333 | activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=leaky 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=leaky 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | size=3 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=leaky 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [shortcut] 394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | activation=leaky 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [shortcut] 415 | from=-3 416 | activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | 
pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | [convolutional] 576 | batch_normalize=1 577 | size=3 578 | stride=1 579 | pad=1 580 | filters=1024 581 | activation=leaky 582 | 583 | [convolutional] 584 | batch_normalize=1 585 | filters=512 586 | size=1 587 | stride=1 588 | pad=1 589 | activation=leaky 590 | 591 | [convolutional] 592 | batch_normalize=1 593 | size=3 594 | stride=1 595 | pad=1 596 | filters=1024 597 | activation=leaky 598 | 599 | [convolutional] 600 | size=1 601 | stride=1 602 | pad=1 603 | filters=255 604 | activation=linear 605 | 606 | 607 | [yolo] 608 | mask = 6,7,8 609 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 610 | classes=80 611 | num=9 612 | jitter=.3 613 | ignore_thresh = .7 614 | truth_thresh = 1 615 | random=1 616 | 617 | 618 | [route] 619 | layers = -4 620 | 621 | [convolutional] 622 | batch_normalize=1 623 | filters=256 624 | size=1 625 | stride=1 626 | pad=1 627 | activation=leaky 628 | 629 | [upsample] 630 | stride=2 631 | 632 | [route] 633 | layers = -1, 61 634 | 635 | 636 | 637 | [convolutional] 638 | batch_normalize=1 639 | filters=256 640 | size=1 641 | stride=1 642 | pad=1 643 | activation=leaky 644 | 645 | [convolutional] 646 | batch_normalize=1 647 | size=3 648 | stride=1 649 | pad=1 650 | filters=512 651 | activation=leaky 652 | 653 | [convolutional] 654 | batch_normalize=1 655 | filters=256 656 | size=1 657 | stride=1 658 | pad=1 659 | activation=leaky 660 | 661 
| [convolutional] 662 | batch_normalize=1 663 | size=3 664 | stride=1 665 | pad=1 666 | filters=512 667 | activation=leaky 668 | 669 | [convolutional] 670 | batch_normalize=1 671 | filters=256 672 | size=1 673 | stride=1 674 | pad=1 675 | activation=leaky 676 | 677 | [convolutional] 678 | batch_normalize=1 679 | size=3 680 | stride=1 681 | pad=1 682 | filters=512 683 | activation=leaky 684 | 685 | [convolutional] 686 | size=1 687 | stride=1 688 | pad=1 689 | filters=255 690 | activation=linear 691 | 692 | 693 | [yolo] 694 | mask = 3,4,5 695 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 696 | classes=80 697 | num=9 698 | jitter=.3 699 | ignore_thresh = .7 700 | truth_thresh = 1 701 | random=1 702 | 703 | 704 | 705 | [route] 706 | layers = -4 707 | 708 | [convolutional] 709 | batch_normalize=1 710 | filters=128 711 | size=1 712 | stride=1 713 | pad=1 714 | activation=leaky 715 | 716 | [upsample] 717 | stride=2 718 | 719 | [route] 720 | layers = -1, 36 721 | 722 | 723 | 724 | [convolutional] 725 | batch_normalize=1 726 | filters=128 727 | size=1 728 | stride=1 729 | pad=1 730 | activation=leaky 731 | 732 | [convolutional] 733 | batch_normalize=1 734 | size=3 735 | stride=1 736 | pad=1 737 | filters=256 738 | activation=leaky 739 | 740 | [convolutional] 741 | batch_normalize=1 742 | filters=128 743 | size=1 744 | stride=1 745 | pad=1 746 | activation=leaky 747 | 748 | [convolutional] 749 | batch_normalize=1 750 | size=3 751 | stride=1 752 | pad=1 753 | filters=256 754 | activation=leaky 755 | 756 | [convolutional] 757 | batch_normalize=1 758 | filters=128 759 | size=1 760 | stride=1 761 | pad=1 762 | activation=leaky 763 | 764 | [convolutional] 765 | batch_normalize=1 766 | size=3 767 | stride=1 768 | pad=1 769 | filters=256 770 | activation=leaky 771 | 772 | [convolutional] 773 | size=1 774 | stride=1 775 | pad=1 776 | filters=255 777 | activation=linear 778 | 779 | 780 | [yolo] 781 | mask = 0,1,2 782 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 783 | classes=80 784 | num=9 785 | jitter=.3 786 | ignore_thresh = .7 787 | truth_thresh = 1 788 | random=1 789 | -------------------------------------------------------------------------------- /cfg/yolov3-spp-1cls.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | # batch=1 4 | # subdivisions=1 5 | # Training 6 | batch=64 7 | subdivisions=16 8 | width=608 9 | height=608 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=100 20 | max_batches = 5000 21 | policy=steps 22 | steps=4000,4500 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 | activation=leaky 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=32 46 | size=1 47 | stride=1 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=leaky 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | 
pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [shortcut] 110 | from=-3 111 | activation=linear 112 | 113 | # Downsample 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=256 118 | size=3 119 | stride=2 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=128 126 | size=1 127 | stride=1 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 149 | activation=leaky 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | activation=leaky 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 188 | pad=1 189 | activation=leaky 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | [convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | stride=1 229 | pad=1 230 | activation=leaky 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=leaky 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | activation=leaky 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | 
stride=1 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [shortcut] 332 | from=-3 333 | activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=leaky 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=leaky 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | size=3 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=leaky 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [shortcut] 394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | activation=leaky 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [shortcut] 415 | from=-3 416 | activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | 
batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | ### SPP ### 576 | [maxpool] 577 | stride=1 578 | size=5 579 | 580 | [route] 581 | layers=-2 582 | 583 | [maxpool] 584 | stride=1 585 | size=9 586 | 587 | [route] 588 | layers=-4 589 | 590 | [maxpool] 591 | stride=1 592 | size=13 593 | 594 | [route] 595 | layers=-1,-3,-5,-6 596 | 597 | ### End SPP ### 598 | 599 | [convolutional] 600 | batch_normalize=1 601 | filters=512 602 | size=1 603 | stride=1 604 | pad=1 605 | activation=leaky 606 | 607 | 608 | [convolutional] 609 | batch_normalize=1 610 | size=3 611 | stride=1 612 | pad=1 613 | filters=1024 614 | activation=leaky 615 | 616 | [convolutional] 617 | batch_normalize=1 618 | filters=512 619 | size=1 620 | stride=1 621 | pad=1 622 | activation=leaky 623 | 624 | [convolutional] 625 | batch_normalize=1 626 | size=3 627 | stride=1 628 | pad=1 629 | filters=1024 630 | activation=leaky 631 | 632 | [convolutional] 633 | size=1 634 | stride=1 635 | pad=1 636 | filters=18 637 | activation=linear 638 | 639 | 640 | [yolo] 641 | mask = 6,7,8 642 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 643 | classes=1 644 | num=9 645 | jitter=.3 646 | ignore_thresh = .7 647 | truth_thresh = 1 648 | random=1 649 | 650 | 651 | [route] 652 | layers = -4 653 | 654 | [convolutional] 655 | batch_normalize=1 656 | filters=256 657 | size=1 658 | stride=1 659 | pad=1 660 | activation=leaky 661 | 662 | [upsample] 663 | stride=2 664 | 665 | [route] 666 | layers = -1, 61 667 | 668 | 669 | 670 | [convolutional] 671 | batch_normalize=1 672 | filters=256 673 | size=1 674 | stride=1 675 | pad=1 676 | activation=leaky 677 | 678 | [convolutional] 679 | batch_normalize=1 680 | size=3 681 | stride=1 682 | pad=1 683 | filters=512 684 | activation=leaky 685 | 686 | [convolutional] 687 | batch_normalize=1 688 | filters=256 689 | size=1 690 | stride=1 691 | pad=1 692 | activation=leaky 693 | 694 | [convolutional] 695 | batch_normalize=1 696 | size=3 697 | stride=1 698 | pad=1 699 | filters=512 700 | activation=leaky 701 | 702 | [convolutional] 703 | batch_normalize=1 704 | filters=256 705 | size=1 706 | stride=1 707 | pad=1 708 | activation=leaky 709 | 710 | [convolutional] 711 | batch_normalize=1 712 | size=3 713 | stride=1 714 | pad=1 715 | filters=512 716 | activation=leaky 717 | 718 | [convolutional] 719 | size=1 720 | stride=1 721 | pad=1 722 | filters=18 723 | activation=linear 724 | 725 | 726 | [yolo] 727 | mask = 3,4,5 728 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 729 | classes=1 730 | num=9 731 | jitter=.3 732 | ignore_thresh = .7 733 | truth_thresh = 1 734 | random=1 735 | 736 | 737 | 738 
| [route] 739 | layers = -4 740 | 741 | [convolutional] 742 | batch_normalize=1 743 | filters=128 744 | size=1 745 | stride=1 746 | pad=1 747 | activation=leaky 748 | 749 | [upsample] 750 | stride=2 751 | 752 | [route] 753 | layers = -1, 36 754 | 755 | 756 | 757 | [convolutional] 758 | batch_normalize=1 759 | filters=128 760 | size=1 761 | stride=1 762 | pad=1 763 | activation=leaky 764 | 765 | [convolutional] 766 | batch_normalize=1 767 | size=3 768 | stride=1 769 | pad=1 770 | filters=256 771 | activation=leaky 772 | 773 | [convolutional] 774 | batch_normalize=1 775 | filters=128 776 | size=1 777 | stride=1 778 | pad=1 779 | activation=leaky 780 | 781 | [convolutional] 782 | batch_normalize=1 783 | size=3 784 | stride=1 785 | pad=1 786 | filters=256 787 | activation=leaky 788 | 789 | [convolutional] 790 | batch_normalize=1 791 | filters=128 792 | size=1 793 | stride=1 794 | pad=1 795 | activation=leaky 796 | 797 | [convolutional] 798 | batch_normalize=1 799 | size=3 800 | stride=1 801 | pad=1 802 | filters=256 803 | activation=leaky 804 | 805 | [convolutional] 806 | size=1 807 | stride=1 808 | pad=1 809 | filters=18 810 | activation=linear 811 | 812 | 813 | [yolo] 814 | mask = 0,1,2 815 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 816 | classes=1 817 | num=9 818 | jitter=.3 819 | ignore_thresh = .7 820 | truth_thresh = 1 821 | random=1 822 | -------------------------------------------------------------------------------- /cfg/yolov3-spp-3cls.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | # batch=1 4 | # subdivisions=1 5 | # Training 6 | batch=64 7 | subdivisions=16 8 | width=608 9 | height=608 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=100 20 | max_batches = 5000 21 | policy=steps 22 | steps=4000,4500 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 | activation=leaky 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=32 46 | size=1 47 | stride=1 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=leaky 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [shortcut] 110 | from=-3 111 | activation=linear 112 | 113 | # Downsample 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=256 118 | size=3 119 | stride=2 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=128 126 | 
size=1 127 | stride=1 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 149 | activation=leaky 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | activation=leaky 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 188 | pad=1 189 | activation=leaky 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | [convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | stride=1 229 | pad=1 230 | activation=leaky 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=leaky 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | activation=leaky 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | stride=1 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [shortcut] 332 | from=-3 333 | activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=leaky 343 | 344 | [convolutional] 345 | batch_normalize=1 
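
# Note: the key=value sections in this file are read by utils/parse_config.py;
# a minimal sketch of such a parser (illustrative only, not the repo's exact
# code) is below. Darknet-style parsers skip '#' lines and blanks, so this
# comment block is harmless here.
#
# def parse_cfg(path):
#     sections = []
#     with open(path) as f:
#         for line in f:
#             line = line.strip()
#             if not line or line.startswith('#'):
#                 continue                                  # skip blanks/comments
#             if line.startswith('['):
#                 sections.append({'type': line[1:-1].strip()})  # new section
#             else:
#                 key, value = line.split('=', 1)
#                 sections[-1][key.strip()] = value.strip()
#     return sections
#
# parse_cfg('cfg/yolov3-spp-3cls.cfg')[0]['type'] == 'net'
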
346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=leaky 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | size=3 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=leaky 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [shortcut] 394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | activation=leaky 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [shortcut] 415 | from=-3 416 | activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | 
activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | ### SPP ### 576 | [maxpool] 577 | stride=1 578 | size=5 579 | 580 | [route] 581 | layers=-2 582 | 583 | [maxpool] 584 | stride=1 585 | size=9 586 | 587 | [route] 588 | layers=-4 589 | 590 | [maxpool] 591 | stride=1 592 | size=13 593 | 594 | [route] 595 | layers=-1,-3,-5,-6 596 | 597 | ### End SPP ### 598 | 599 | [convolutional] 600 | batch_normalize=1 601 | filters=512 602 | size=1 603 | stride=1 604 | pad=1 605 | activation=leaky 606 | 607 | 608 | [convolutional] 609 | batch_normalize=1 610 | size=3 611 | stride=1 612 | pad=1 613 | filters=1024 614 | activation=leaky 615 | 616 | [convolutional] 617 | batch_normalize=1 618 | filters=512 619 | size=1 620 | stride=1 621 | pad=1 622 | activation=leaky 623 | 624 | [convolutional] 625 | batch_normalize=1 626 | size=3 627 | stride=1 628 | pad=1 629 | filters=1024 630 | activation=leaky 631 | 632 | [convolutional] 633 | size=1 634 | stride=1 635 | pad=1 636 | filters=24 637 | activation=linear 638 | 639 | 640 | [yolo] 641 | mask = 6,7,8 642 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 643 | classes=3 644 | num=9 645 | jitter=.3 646 | ignore_thresh = .7 647 | truth_thresh = 1 648 | random=1 649 | 650 | 651 | [route] 652 | layers = -4 653 | 654 | [convolutional] 655 | batch_normalize=1 656 | filters=256 657 | size=1 658 | stride=1 659 | pad=1 660 | activation=leaky 661 | 662 | [upsample] 663 | stride=2 664 | 665 | [route] 666 | layers = -1, 61 667 | 668 | 669 | 670 | [convolutional] 671 | batch_normalize=1 672 | filters=256 673 | size=1 674 | stride=1 675 | pad=1 676 | activation=leaky 677 | 678 | [convolutional] 679 | batch_normalize=1 680 | size=3 681 | stride=1 682 | pad=1 683 | filters=512 684 | activation=leaky 685 | 686 | [convolutional] 687 | batch_normalize=1 688 | filters=256 689 | size=1 690 | stride=1 691 | pad=1 692 | activation=leaky 693 | 694 | [convolutional] 695 | batch_normalize=1 696 | size=3 697 | stride=1 698 | pad=1 699 | filters=512 700 | activation=leaky 701 | 702 | [convolutional] 703 | batch_normalize=1 704 | filters=256 705 | size=1 706 | stride=1 707 | pad=1 708 | activation=leaky 709 | 710 | [convolutional] 711 | batch_normalize=1 712 | size=3 713 | stride=1 714 | pad=1 715 | filters=512 716 | activation=leaky 717 | 718 | [convolutional] 719 | size=1 720 | stride=1 721 | pad=1 722 | filters=24 723 | activation=linear 724 | 725 | 726 | [yolo] 727 | mask = 3,4,5 728 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 729 | classes=3 730 | num=9 731 | jitter=.3 732 | ignore_thresh = .7 733 | truth_thresh = 1 734 | random=1 735 | 736 | 737 | 738 | [route] 739 | layers = -4 740 | 741 | [convolutional] 742 | batch_normalize=1 743 | filters=128 744 | size=1 745 | stride=1 746 | pad=1 747 | activation=leaky 748 | 749 | [upsample] 750 | stride=2 751 | 752 | [route] 753 | layers = -1, 36 754 | 755 | 756 | 757 | [convolutional] 758 | batch_normalize=1 759 | filters=128 760 | size=1 761 | stride=1 762 | pad=1 763 | activation=leaky 764 | 765 | [convolutional] 766 | batch_normalize=1 767 | size=3 768 | stride=1 769 | pad=1 770 | filters=256 771 | activation=leaky 772 | 773 | [convolutional] 774 | batch_normalize=1 775 | filters=128 776 | size=1 777 | stride=1 778 | pad=1 779 | activation=leaky 780 | 781 | [convolutional] 782 | batch_normalize=1 783 | size=3 784 | stride=1 785 | pad=1 786 | filters=256 787 | 
activation=leaky 788 | 789 | [convolutional] 790 | batch_normalize=1 791 | filters=128 792 | size=1 793 | stride=1 794 | pad=1 795 | activation=leaky 796 | 797 | [convolutional] 798 | batch_normalize=1 799 | size=3 800 | stride=1 801 | pad=1 802 | filters=256 803 | activation=leaky 804 | 805 | [convolutional] 806 | size=1 807 | stride=1 808 | pad=1 809 | filters=24 810 | activation=linear 811 | 812 | 813 | [yolo] 814 | mask = 0,1,2 815 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 816 | classes=3 817 | num=9 818 | jitter=.3 819 | ignore_thresh = .7 820 | truth_thresh = 1 821 | random=1 822 | -------------------------------------------------------------------------------- /cfg/yolov3-spp.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | # batch=1 4 | # subdivisions=1 5 | # Training 6 | batch=64 7 | subdivisions=16 8 | width=608 9 | height=608 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 | activation=leaky 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=32 46 | size=1 47 | stride=1 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=leaky 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [shortcut] 110 | from=-3 111 | activation=linear 112 | 113 | # Downsample 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=256 118 | size=3 119 | stride=2 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=128 126 | size=1 127 | stride=1 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 149 | activation=leaky 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | activation=leaky 170 | 171 | [convolutional] 172 | batch_normalize=1 173 
| filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 188 | pad=1 189 | activation=leaky 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | [convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | stride=1 229 | pad=1 230 | activation=leaky 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=leaky 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | activation=leaky 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | stride=1 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [shortcut] 332 | from=-3 333 | activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=leaky 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=leaky 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | size=3 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=leaky 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [shortcut] 
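
# Note: [shortcut] adds the output of an earlier layer element-wise
# (from=-3 means three layers back), while [route] concatenates saved
# outputs along the channel dimension. A hedged PyTorch sketch of the
# usual interpretation (not the repo's models.py):
#
# import torch
# outputs = []                            # per-layer outputs, in order
#
# def shortcut(x, frm=-3):                # [shortcut] from=-3
#     return x + outputs[frm]             # element-wise residual add
#
# def route(idxs):                        # [route] layers=-1,61
#     return torch.cat([outputs[i] for i in idxs], dim=1)
#
# Python's negative indexing matches darknet's relative indices; positive
# values index absolute layer numbers, as in layers=61.
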
394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | activation=leaky 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [shortcut] 415 | from=-3 416 | activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | ### SPP ### 576 | [maxpool] 577 | stride=1 578 | size=5 579 | 580 | [route] 581 | layers=-2 582 | 583 | [maxpool] 584 | stride=1 585 | size=9 586 | 587 | [route] 588 | layers=-4 589 | 590 | [maxpool] 591 | stride=1 592 | size=13 593 | 594 | [route] 595 | layers=-1,-3,-5,-6 596 | 597 | ### End SPP ### 598 | 599 | [convolutional] 600 | batch_normalize=1 601 | filters=512 602 | size=1 603 | stride=1 604 | pad=1 605 | activation=leaky 606 | 607 | 608 | [convolutional] 609 | batch_normalize=1 610 | size=3 611 | stride=1 612 | pad=1 613 | filters=1024 614 | activation=leaky 615 | 616 | [convolutional] 
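
# Note: the ### SPP ### block above pools the same 512-channel map with
# stride-1 max-pools of size 5, 9 and 13 and concatenates them with the
# input ([route] layers=-1,-3,-5,-6), giving 2048 channels that the next
# 1x1 convolution reduces back to 512. A hedged PyTorch sketch:
#
# import torch
# import torch.nn as nn
#
# class SPP(nn.Module):
#     def __init__(self, kernels=(5, 9, 13)):
#         super().__init__()
#         self.pools = nn.ModuleList(
#             nn.MaxPool2d(k, stride=1, padding=k // 2) for k in kernels)
#
#     def forward(self, x):
#         # stride 1 plus same-style padding keeps H and W, so all four
#         # tensors concatenate cleanly along channels
#         return torch.cat([x] + [p(x) for p in self.pools], dim=1)
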
617 | batch_normalize=1 618 | filters=512 619 | size=1 620 | stride=1 621 | pad=1 622 | activation=leaky 623 | 624 | [convolutional] 625 | batch_normalize=1 626 | size=3 627 | stride=1 628 | pad=1 629 | filters=1024 630 | activation=leaky 631 | 632 | [convolutional] 633 | size=1 634 | stride=1 635 | pad=1 636 | filters=255 637 | activation=linear 638 | 639 | 640 | [yolo] 641 | mask = 6,7,8 642 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 643 | classes=80 644 | num=9 645 | jitter=.3 646 | ignore_thresh = .7 647 | truth_thresh = 1 648 | random=1 649 | 650 | 651 | [route] 652 | layers = -4 653 | 654 | [convolutional] 655 | batch_normalize=1 656 | filters=256 657 | size=1 658 | stride=1 659 | pad=1 660 | activation=leaky 661 | 662 | [upsample] 663 | stride=2 664 | 665 | [route] 666 | layers = -1, 61 667 | 668 | 669 | 670 | [convolutional] 671 | batch_normalize=1 672 | filters=256 673 | size=1 674 | stride=1 675 | pad=1 676 | activation=leaky 677 | 678 | [convolutional] 679 | batch_normalize=1 680 | size=3 681 | stride=1 682 | pad=1 683 | filters=512 684 | activation=leaky 685 | 686 | [convolutional] 687 | batch_normalize=1 688 | filters=256 689 | size=1 690 | stride=1 691 | pad=1 692 | activation=leaky 693 | 694 | [convolutional] 695 | batch_normalize=1 696 | size=3 697 | stride=1 698 | pad=1 699 | filters=512 700 | activation=leaky 701 | 702 | [convolutional] 703 | batch_normalize=1 704 | filters=256 705 | size=1 706 | stride=1 707 | pad=1 708 | activation=leaky 709 | 710 | [convolutional] 711 | batch_normalize=1 712 | size=3 713 | stride=1 714 | pad=1 715 | filters=512 716 | activation=leaky 717 | 718 | [convolutional] 719 | size=1 720 | stride=1 721 | pad=1 722 | filters=255 723 | activation=linear 724 | 725 | 726 | [yolo] 727 | mask = 3,4,5 728 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 729 | classes=80 730 | num=9 731 | jitter=.3 732 | ignore_thresh = .7 733 | truth_thresh = 1 734 | random=1 735 | 736 | 737 | 738 | [route] 739 | layers = -4 740 | 741 | [convolutional] 742 | batch_normalize=1 743 | filters=128 744 | size=1 745 | stride=1 746 | pad=1 747 | activation=leaky 748 | 749 | [upsample] 750 | stride=2 751 | 752 | [route] 753 | layers = -1, 36 754 | 755 | 756 | 757 | [convolutional] 758 | batch_normalize=1 759 | filters=128 760 | size=1 761 | stride=1 762 | pad=1 763 | activation=leaky 764 | 765 | [convolutional] 766 | batch_normalize=1 767 | size=3 768 | stride=1 769 | pad=1 770 | filters=256 771 | activation=leaky 772 | 773 | [convolutional] 774 | batch_normalize=1 775 | filters=128 776 | size=1 777 | stride=1 778 | pad=1 779 | activation=leaky 780 | 781 | [convolutional] 782 | batch_normalize=1 783 | size=3 784 | stride=1 785 | pad=1 786 | filters=256 787 | activation=leaky 788 | 789 | [convolutional] 790 | batch_normalize=1 791 | filters=128 792 | size=1 793 | stride=1 794 | pad=1 795 | activation=leaky 796 | 797 | [convolutional] 798 | batch_normalize=1 799 | size=3 800 | stride=1 801 | pad=1 802 | filters=256 803 | activation=leaky 804 | 805 | [convolutional] 806 | size=1 807 | stride=1 808 | pad=1 809 | filters=255 810 | activation=linear 811 | 812 | 813 | [yolo] 814 | mask = 0,1,2 815 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 816 | classes=80 817 | num=9 818 | jitter=.3 819 | ignore_thresh = .7 820 | truth_thresh = 1 821 | random=1 822 | -------------------------------------------------------------------------------- /cfg/yolov3-asff.cfg: 
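
The detection heads in these cfgs all follow the same channel rule: the 1x1 convolution feeding each [yolo] layer has (classes + 5) * 3 filters, where 5 covers x, y, w, h and objectness, and 3 is the number of anchors per scale (len(mask)). A quick illustrative check in Python:

    def yolo_filters(classes, anchors_per_scale=3):
        return (classes + 5) * anchors_per_scale

    assert yolo_filters(80) == 255  # yolov3-spp.cfg above
    assert yolo_filters(3) == 24    # yolov3-spp-3cls.cfg
    assert yolo_filters(1) == 18    # yolov3-spp-1cls.cfg

The ASFF config that follows uses filters=258 instead; reading that as 255 plus three extra channels consumed by the ASFF fusion (the [yolo] layers' from=88,99,110 sources) is an assumption, not something stated in the file.
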
-------------------------------------------------------------------------------- 1 | # Generated by Glenn Jocher (glenn.jocher@ultralytics.com) for https://github.com/ultralytics/yolov3 2 | # def kmean_anchors(path='../coco/train2017.txt', n=12, img_size=(320, 640)): # from utils.utils import *; kmean_anchors() 3 | # Evolving anchors: 100%|██████████| 1000/1000 [41:15<00:00, 2.48s/it] 4 | # 0.20 iou_thr: 0.992 best possible recall, 4.25 anchors > thr 5 | # kmeans anchors (n=12, img_size=(320, 640), IoU=0.005/0.184/0.634-min/mean/best): 6,9, 15,16, 17,35, 37,26, 36,67, 63,42, 57,100, 121,81, 112,169, 241,158, 195,310, 426,359 6 | 7 | [net] 8 | # Testing 9 | # batch=1 10 | # subdivisions=1 11 | # Training 12 | batch=64 13 | subdivisions=16 14 | width=608 15 | height=608 16 | channels=3 17 | momentum=0.9 18 | decay=0.0005 19 | angle=0 20 | saturation = 1.5 21 | exposure = 1.5 22 | hue=.1 23 | 24 | learning_rate=0.001 25 | burn_in=1000 26 | max_batches = 500200 27 | policy=steps 28 | steps=400000,450000 29 | scales=.1,.1 30 | 31 | [convolutional] 32 | batch_normalize=1 33 | filters=32 34 | size=3 35 | stride=1 36 | pad=1 37 | activation=leaky 38 | 39 | # Downsample 40 | 41 | [convolutional] 42 | batch_normalize=1 43 | filters=64 44 | size=3 45 | stride=2 46 | pad=1 47 | activation=leaky 48 | 49 | [convolutional] 50 | batch_normalize=1 51 | filters=32 52 | size=1 53 | stride=1 54 | pad=1 55 | activation=leaky 56 | 57 | [convolutional] 58 | batch_normalize=1 59 | filters=64 60 | size=3 61 | stride=1 62 | pad=1 63 | activation=leaky 64 | 65 | [shortcut] 66 | from=-3 67 | activation=linear 68 | 69 | # Downsample 70 | 71 | [convolutional] 72 | batch_normalize=1 73 | filters=128 74 | size=3 75 | stride=2 76 | pad=1 77 | activation=leaky 78 | 79 | [convolutional] 80 | batch_normalize=1 81 | filters=64 82 | size=1 83 | stride=1 84 | pad=1 85 | activation=leaky 86 | 87 | [convolutional] 88 | batch_normalize=1 89 | filters=128 90 | size=3 91 | stride=1 92 | pad=1 93 | activation=leaky 94 | 95 | [shortcut] 96 | from=-3 97 | activation=linear 98 | 99 | [convolutional] 100 | batch_normalize=1 101 | filters=64 102 | size=1 103 | stride=1 104 | pad=1 105 | activation=leaky 106 | 107 | [convolutional] 108 | batch_normalize=1 109 | filters=128 110 | size=3 111 | stride=1 112 | pad=1 113 | activation=leaky 114 | 115 | [shortcut] 116 | from=-3 117 | activation=linear 118 | 119 | # Downsample 120 | 121 | [convolutional] 122 | batch_normalize=1 123 | filters=256 124 | size=3 125 | stride=2 126 | pad=1 127 | activation=leaky 128 | 129 | [convolutional] 130 | batch_normalize=1 131 | filters=128 132 | size=1 133 | stride=1 134 | pad=1 135 | activation=leaky 136 | 137 | [convolutional] 138 | batch_normalize=1 139 | filters=256 140 | size=3 141 | stride=1 142 | pad=1 143 | activation=leaky 144 | 145 | [shortcut] 146 | from=-3 147 | activation=linear 148 | 149 | [convolutional] 150 | batch_normalize=1 151 | filters=128 152 | size=1 153 | stride=1 154 | pad=1 155 | activation=leaky 156 | 157 | [convolutional] 158 | batch_normalize=1 159 | filters=256 160 | size=3 161 | stride=1 162 | pad=1 163 | activation=leaky 164 | 165 | [shortcut] 166 | from=-3 167 | activation=linear 168 | 169 | [convolutional] 170 | batch_normalize=1 171 | filters=128 172 | size=1 173 | stride=1 174 | pad=1 175 | activation=leaky 176 | 177 | [convolutional] 178 | batch_normalize=1 179 | filters=256 180 | size=3 181 | stride=1 182 | pad=1 183 | activation=leaky 184 | 185 | [shortcut] 186 | from=-3 187 | activation=linear 188 | 189 | [convolutional] 
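
# Note: the header above records a 12-anchor k-means run on COCO labels
# (utils.utils.kmean_anchors, followed by genetic evolution of the result).
# A hedged sketch of just the k-means step, assuming `wh` is an (N, 2)
# NumPy array of label widths/heights in pixels at training resolution:
#
# import numpy as np
# from scipy.cluster.vq import kmeans
#
# def kmeans_anchors(wh, n=12):
#     std = wh.std(0)                     # whiten, cluster, un-whiten
#     centroids, _ = kmeans(wh / std, n, iter=30)
#     anchors = centroids * std
#     return anchors[np.argsort(anchors.prod(1))]   # sort small to large
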
190 | batch_normalize=1 191 | filters=128 192 | size=1 193 | stride=1 194 | pad=1 195 | activation=leaky 196 | 197 | [convolutional] 198 | batch_normalize=1 199 | filters=256 200 | size=3 201 | stride=1 202 | pad=1 203 | activation=leaky 204 | 205 | [shortcut] 206 | from=-3 207 | activation=linear 208 | 209 | [convolutional] 210 | batch_normalize=1 211 | filters=128 212 | size=1 213 | stride=1 214 | pad=1 215 | activation=leaky 216 | 217 | [convolutional] 218 | batch_normalize=1 219 | filters=256 220 | size=3 221 | stride=1 222 | pad=1 223 | activation=leaky 224 | 225 | [shortcut] 226 | from=-3 227 | activation=linear 228 | 229 | [convolutional] 230 | batch_normalize=1 231 | filters=128 232 | size=1 233 | stride=1 234 | pad=1 235 | activation=leaky 236 | 237 | [convolutional] 238 | batch_normalize=1 239 | filters=256 240 | size=3 241 | stride=1 242 | pad=1 243 | activation=leaky 244 | 245 | [shortcut] 246 | from=-3 247 | activation=linear 248 | 249 | [convolutional] 250 | batch_normalize=1 251 | filters=128 252 | size=1 253 | stride=1 254 | pad=1 255 | activation=leaky 256 | 257 | [convolutional] 258 | batch_normalize=1 259 | filters=256 260 | size=3 261 | stride=1 262 | pad=1 263 | activation=leaky 264 | 265 | [shortcut] 266 | from=-3 267 | activation=linear 268 | 269 | [convolutional] 270 | batch_normalize=1 271 | filters=128 272 | size=1 273 | stride=1 274 | pad=1 275 | activation=leaky 276 | 277 | [convolutional] 278 | batch_normalize=1 279 | filters=256 280 | size=3 281 | stride=1 282 | pad=1 283 | activation=leaky 284 | 285 | [shortcut] 286 | from=-3 287 | activation=linear 288 | 289 | # Downsample 290 | 291 | [convolutional] 292 | batch_normalize=1 293 | filters=512 294 | size=3 295 | stride=2 296 | pad=1 297 | activation=leaky 298 | 299 | [convolutional] 300 | batch_normalize=1 301 | filters=256 302 | size=1 303 | stride=1 304 | pad=1 305 | activation=leaky 306 | 307 | [convolutional] 308 | batch_normalize=1 309 | filters=512 310 | size=3 311 | stride=1 312 | pad=1 313 | activation=leaky 314 | 315 | [shortcut] 316 | from=-3 317 | activation=linear 318 | 319 | [convolutional] 320 | batch_normalize=1 321 | filters=256 322 | size=1 323 | stride=1 324 | pad=1 325 | activation=leaky 326 | 327 | [convolutional] 328 | batch_normalize=1 329 | filters=512 330 | size=3 331 | stride=1 332 | pad=1 333 | activation=leaky 334 | 335 | [shortcut] 336 | from=-3 337 | activation=linear 338 | 339 | [convolutional] 340 | batch_normalize=1 341 | filters=256 342 | size=1 343 | stride=1 344 | pad=1 345 | activation=leaky 346 | 347 | [convolutional] 348 | batch_normalize=1 349 | filters=512 350 | size=3 351 | stride=1 352 | pad=1 353 | activation=leaky 354 | 355 | [shortcut] 356 | from=-3 357 | activation=linear 358 | 359 | [convolutional] 360 | batch_normalize=1 361 | filters=256 362 | size=1 363 | stride=1 364 | pad=1 365 | activation=leaky 366 | 367 | [convolutional] 368 | batch_normalize=1 369 | filters=512 370 | size=3 371 | stride=1 372 | pad=1 373 | activation=leaky 374 | 375 | [shortcut] 376 | from=-3 377 | activation=linear 378 | 379 | [convolutional] 380 | batch_normalize=1 381 | filters=256 382 | size=1 383 | stride=1 384 | pad=1 385 | activation=leaky 386 | 387 | [convolutional] 388 | batch_normalize=1 389 | filters=512 390 | size=3 391 | stride=1 392 | pad=1 393 | activation=leaky 394 | 395 | [shortcut] 396 | from=-3 397 | activation=linear 398 | 399 | [convolutional] 400 | batch_normalize=1 401 | filters=256 402 | size=1 403 | stride=1 404 | pad=1 405 | activation=leaky 406 | 407 | 
[convolutional] 408 | batch_normalize=1 409 | filters=512 410 | size=3 411 | stride=1 412 | pad=1 413 | activation=leaky 414 | 415 | [shortcut] 416 | from=-3 417 | activation=linear 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | # SPP -------------------------------------------------------------------------- 576 | [maxpool] 577 | stride=1 578 | size=5 579 | 580 | [route] 581 | layers=-2 582 | 583 | [maxpool] 584 | stride=1 585 | size=9 586 | 587 | [route] 588 | layers=-4 589 | 590 | [maxpool] 591 | stride=1 592 | size=13 593 | 594 | [route] 595 | layers=-1,-3,-5,-6 596 | # SPP -------------------------------------------------------------------------- 597 | 598 | [convolutional] 599 | batch_normalize=1 600 | filters=512 601 | size=1 602 | stride=1 603 | pad=1 604 | activation=leaky 605 | 606 | [convolutional] 607 | batch_normalize=1 608 | size=3 609 | stride=1 610 | pad=1 611 | filters=1024 612 | activation=leaky 613 | 614 | [convolutional] 615 | batch_normalize=1 616 | filters=512 617 | size=1 618 | stride=1 619 | 
pad=1 620 | activation=leaky 621 | 622 | [convolutional] 623 | batch_normalize=1 624 | size=3 625 | stride=1 626 | pad=1 627 | filters=1024 628 | activation=leaky 629 | 630 | [convolutional] 631 | size=1 632 | stride=1 633 | pad=1 634 | filters=258 635 | activation=linear 636 | 637 | # YOLO ------------------------------------------------------------------------- 638 | 639 | [route] 640 | layers = -3 641 | 642 | [convolutional] 643 | batch_normalize=1 644 | filters=256 645 | size=1 646 | stride=1 647 | pad=1 648 | activation=leaky 649 | 650 | [upsample] 651 | stride=2 652 | 653 | [route] 654 | layers = -1, 61 655 | 656 | [convolutional] 657 | batch_normalize=1 658 | filters=256 659 | size=1 660 | stride=1 661 | pad=1 662 | activation=leaky 663 | 664 | [convolutional] 665 | batch_normalize=1 666 | size=3 667 | stride=1 668 | pad=1 669 | filters=512 670 | activation=leaky 671 | 672 | [convolutional] 673 | batch_normalize=1 674 | filters=256 675 | size=1 676 | stride=1 677 | pad=1 678 | activation=leaky 679 | 680 | [convolutional] 681 | batch_normalize=1 682 | size=3 683 | stride=1 684 | pad=1 685 | filters=512 686 | activation=leaky 687 | 688 | [convolutional] 689 | batch_normalize=1 690 | filters=256 691 | size=1 692 | stride=1 693 | pad=1 694 | activation=leaky 695 | 696 | [convolutional] 697 | batch_normalize=1 698 | size=3 699 | stride=1 700 | pad=1 701 | filters=512 702 | activation=leaky 703 | 704 | [convolutional] 705 | size=1 706 | stride=1 707 | pad=1 708 | filters=258 709 | activation=linear 710 | 711 | # YOLO ------------------------------------------------------------------------- 712 | 713 | [route] 714 | layers = -3 715 | 716 | [convolutional] 717 | batch_normalize=1 718 | filters=128 719 | size=1 720 | stride=1 721 | pad=1 722 | activation=leaky 723 | 724 | [upsample] 725 | stride=2 726 | 727 | [route] 728 | layers = -1, 36 729 | 730 | [convolutional] 731 | batch_normalize=1 732 | filters=128 733 | size=1 734 | stride=1 735 | pad=1 736 | activation=leaky 737 | 738 | [convolutional] 739 | batch_normalize=1 740 | size=3 741 | stride=1 742 | pad=1 743 | filters=256 744 | activation=leaky 745 | 746 | [convolutional] 747 | batch_normalize=1 748 | filters=128 749 | size=1 750 | stride=1 751 | pad=1 752 | activation=leaky 753 | 754 | [convolutional] 755 | batch_normalize=1 756 | size=3 757 | stride=1 758 | pad=1 759 | filters=256 760 | activation=leaky 761 | 762 | [convolutional] 763 | batch_normalize=1 764 | filters=128 765 | size=1 766 | stride=1 767 | pad=1 768 | activation=leaky 769 | 770 | [convolutional] 771 | batch_normalize=1 772 | size=3 773 | stride=1 774 | pad=1 775 | filters=256 776 | activation=leaky 777 | 778 | [convolutional] 779 | size=1 780 | stride=1 781 | pad=1 782 | filters=258 783 | activation=linear 784 | 785 | [yolo] 786 | from=88,99,110 787 | mask = 6,7,8 788 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 789 | classes=80 790 | num=9 791 | 792 | [yolo] 793 | from=88,99,110 794 | mask = 3,4,5 795 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 796 | classes=80 797 | num=9 798 | 799 | [yolo] 800 | from=88,99,110 801 | mask = 0,1,2 802 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 803 | classes=80 804 | num=9 -------------------------------------------------------------------------------- /cfg/yolov3-spp3.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | batch=1 4 | subdivisions=1 5 | # Training 6 | # 
batch=64 7 | # subdivisions=16 8 | width=608 9 | height=608 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 120200 21 | policy=steps 22 | steps=70000,100000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 | activation=leaky 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=32 46 | size=1 47 | stride=1 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=leaky 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [shortcut] 110 | from=-3 111 | activation=linear 112 | 113 | # Downsample 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=256 118 | size=3 119 | stride=2 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=128 126 | size=1 127 | stride=1 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 149 | activation=leaky 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | activation=leaky 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 188 | pad=1 189 | activation=leaky 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | [convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | 
stride=1 229 | pad=1 230 | activation=leaky 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=leaky 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | activation=leaky 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | stride=1 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [shortcut] 332 | from=-3 333 | activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=leaky 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=leaky 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | size=3 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=leaky 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [shortcut] 394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | activation=leaky 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [shortcut] 415 | from=-3 416 | activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 
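
# Note: all three [yolo] layers in this file share the same nine anchor
# pairs; each layer's mask selects which three it predicts (6,7,8 at
# stride 32, 3,4,5 at stride 16, 0,1,2 at stride 8). Illustrative check:
#
# ANCHORS = [(10, 13), (16, 30), (33, 23), (30, 61), (62, 45), (59, 119),
#            (116, 90), (156, 198), (373, 326)]
#
# def scale_anchors(mask):
#     return [ANCHORS[i] for i in mask]
#
# assert scale_anchors([6, 7, 8]) == [(116, 90), (156, 198), (373, 326)]
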
449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | ### SPP ### 576 | [maxpool] 577 | stride=1 578 | size=5 579 | 580 | [route] 581 | layers=-2 582 | 583 | [maxpool] 584 | stride=1 585 | size=9 586 | 587 | [route] 588 | layers=-4 589 | 590 | [maxpool] 591 | stride=1 592 | size=13 593 | 594 | [route] 595 | layers=-1,-3,-5,-6 596 | 597 | ### End SPP ### 598 | 599 | [convolutional] 600 | batch_normalize=1 601 | filters=512 602 | size=1 603 | stride=1 604 | pad=1 605 | activation=leaky 606 | 607 | 608 | [convolutional] 609 | batch_normalize=1 610 | size=3 611 | stride=1 612 | pad=1 613 | filters=1024 614 | activation=leaky 615 | 616 | [convolutional] 617 | batch_normalize=1 618 | filters=512 619 | size=1 620 | stride=1 621 | pad=1 622 | activation=leaky 623 | 624 | [convolutional] 625 | batch_normalize=1 626 | size=3 627 | stride=1 628 | pad=1 629 | filters=1024 630 | activation=leaky 631 | 632 | [convolutional] 633 | size=1 634 | stride=1 635 | pad=1 636 | filters=255 637 | activation=linear 638 | 639 | 640 | [yolo] 641 | mask = 6,7,8 642 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 643 | classes=80 644 | num=9 645 | jitter=.3 646 | ignore_thresh = .7 647 | truth_thresh = 1 648 | random=1 649 | 650 | 651 | [route] 652 | layers = -4 653 | 654 | [convolutional] 655 | batch_normalize=1 656 | filters=256 657 | size=1 658 | stride=1 659 | pad=1 660 | activation=leaky 661 | 662 | [upsample] 663 | stride=2 664 | 665 | [route] 666 | layers = -1, 61 667 | 668 | 669 | 670 | [convolutional] 671 | 
batch_normalize=1 672 | filters=256 673 | size=1 674 | stride=1 675 | pad=1 676 | activation=leaky 677 | 678 | [convolutional] 679 | batch_normalize=1 680 | size=3 681 | stride=1 682 | pad=1 683 | filters=512 684 | activation=leaky 685 | 686 | ### SPP ### 687 | [maxpool] 688 | stride=1 689 | size=5 690 | 691 | [route] 692 | layers=-2 693 | 694 | [maxpool] 695 | stride=1 696 | size=9 697 | 698 | [route] 699 | layers=-4 700 | 701 | [maxpool] 702 | stride=1 703 | size=13 704 | 705 | [route] 706 | layers=-1,-3,-5,-6 707 | 708 | ### End SPP ### 709 | 710 | 711 | [convolutional] 712 | batch_normalize=1 713 | filters=256 714 | size=1 715 | stride=1 716 | pad=1 717 | activation=leaky 718 | 719 | [convolutional] 720 | batch_normalize=1 721 | size=3 722 | stride=1 723 | pad=1 724 | filters=512 725 | activation=leaky 726 | 727 | [convolutional] 728 | batch_normalize=1 729 | filters=256 730 | size=1 731 | stride=1 732 | pad=1 733 | activation=leaky 734 | 735 | [convolutional] 736 | batch_normalize=1 737 | size=3 738 | stride=1 739 | pad=1 740 | filters=512 741 | activation=leaky 742 | 743 | [convolutional] 744 | size=1 745 | stride=1 746 | pad=1 747 | filters=255 748 | activation=linear 749 | 750 | 751 | [yolo] 752 | mask = 3,4,5 753 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 754 | classes=80 755 | num=9 756 | jitter=.3 757 | ignore_thresh = .7 758 | truth_thresh = 1 759 | random=1 760 | 761 | 762 | 763 | [route] 764 | layers = -4 765 | 766 | [convolutional] 767 | batch_normalize=1 768 | filters=128 769 | size=1 770 | stride=1 771 | pad=1 772 | activation=leaky 773 | 774 | [upsample] 775 | stride=2 776 | 777 | [route] 778 | layers = -1, 36 779 | 780 | 781 | 782 | [convolutional] 783 | batch_normalize=1 784 | filters=128 785 | size=1 786 | stride=1 787 | pad=1 788 | activation=leaky 789 | 790 | [convolutional] 791 | batch_normalize=1 792 | size=3 793 | stride=1 794 | pad=1 795 | filters=256 796 | activation=leaky 797 | 798 | [convolutional] 799 | batch_normalize=1 800 | filters=128 801 | size=1 802 | stride=1 803 | pad=1 804 | activation=leaky 805 | 806 | ### SPP ### 807 | [maxpool] 808 | stride=1 809 | size=5 810 | 811 | [route] 812 | layers=-2 813 | 814 | [maxpool] 815 | stride=1 816 | size=9 817 | 818 | [route] 819 | layers=-4 820 | 821 | [maxpool] 822 | stride=1 823 | size=13 824 | 825 | [route] 826 | layers=-1,-3,-5,-6 827 | 828 | ### End SPP ### 829 | 830 | [convolutional] 831 | batch_normalize=1 832 | size=3 833 | stride=1 834 | pad=1 835 | filters=256 836 | activation=leaky 837 | 838 | [convolutional] 839 | batch_normalize=1 840 | filters=128 841 | size=1 842 | stride=1 843 | pad=1 844 | activation=leaky 845 | 846 | [convolutional] 847 | batch_normalize=1 848 | size=3 849 | stride=1 850 | pad=1 851 | filters=256 852 | activation=leaky 853 | 854 | [convolutional] 855 | size=1 856 | stride=1 857 | pad=1 858 | filters=255 859 | activation=linear 860 | 861 | 862 | [yolo] 863 | mask = 0,1,2 864 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 865 | classes=80 866 | num=9 867 | jitter=.3 868 | ignore_thresh = .7 869 | truth_thresh = 1 870 | random=1 871 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | from torch.utils.data import DataLoader 5 | 6 | from models import * 7 | from utils.datasets import * 8 | from utils.utils import * 9 | 10 | 11 | def test(cfg, 12 | 
data, 13 | weights=None, 14 | batch_size=16, 15 | img_size=416, 16 | conf_thres=0.001, 17 | iou_thres=0.6, # for nms 18 | save_json=False, 19 | single_cls=False, 20 | augment=False, 21 | model=None, 22 | dataloader=None): 23 | # Initialize/load model and set device 24 | if model is None: 25 | device = torch_utils.select_device(opt.device, batch_size=batch_size) 26 | verbose = opt.task == 'test' 27 | 28 | # Remove previous 29 | for f in glob.glob('test_batch*.png'): 30 | os.remove(f) 31 | 32 | # Initialize model 33 | model = Darknet(cfg, img_size) 34 | 35 | # Load weights 36 | attempt_download(weights) 37 | if weights.endswith('.pt'): # pytorch format 38 | model.load_state_dict(torch.load(weights, map_location=device)['model']) 39 | else: # darknet format 40 | load_darknet_weights(model, weights) 41 | 42 | # Fuse 43 | model.fuse() 44 | model.to(device) 45 | 46 | if device.type != 'cpu' and torch.cuda.device_count() > 1: 47 | model = nn.DataParallel(model) 48 | else: # called by train.py 49 | device = next(model.parameters()).device # get model device 50 | verbose = False 51 | 52 | # Configure run 53 | data = parse_data_cfg(data) 54 | nc = 1 if single_cls else int(data['classes']) # number of classes 55 | path = data['valid'] # path to test images 56 | names = load_classes(data['names']) # class names 57 | iouv = torch.linspace(0.5, 0.95, 10).to(device) # iou vector for mAP@0.5:0.95 58 | iouv = iouv[0].view(1) # comment out this line for mAP@0.5:0.95 59 | niou = iouv.numel() 60 | 61 | # Dataloader 62 | if dataloader is None: 63 | dataset = LoadImagesAndLabels(path, img_size, batch_size, rect=True, single_cls=single_cls) 64 | batch_size = min(batch_size, len(dataset)) 65 | dataloader = DataLoader(dataset, 66 | batch_size=batch_size, 67 | num_workers=min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8]), 68 | pin_memory=True, 69 | collate_fn=dataset.collate_fn) 70 | 71 | seen = 0 72 | model.eval() 73 | _ = model(torch.zeros((1, 3, img_size, img_size), device=device)) if device.type != 'cpu' else None # run once 74 | coco91class = coco80_to_coco91_class() 75 | s = ('%20s' + '%10s' * 6) % ('Class', 'Images', 'Targets', 'P', 'R', 'mAP@0.5', 'F1') 76 | p, r, f1, mp, mr, map, mf1, t0, t1 = 0., 0., 0., 0., 0., 0., 0., 0., 0.
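# Per-image evaluation below: run inference, apply NMS, then match predictions to
# targets class-by-class via IoU. Each image appends (correct, conf, pcls, tcls) to
# `stats`, where `correct` is a (num_predictions x niou) boolean matrix marking which
# predictions hit a target at each IoU threshold in `iouv`.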
77 | loss = torch.zeros(3, device=device) 78 | jdict, stats, ap, ap_class = [], [], [], [] 79 | for batch_i, (imgs, targets, paths, shapes) in enumerate(tqdm(dataloader, desc=s)): 80 | imgs = imgs.to(device).float() / 255.0 # uint8 to float32, 0 - 255 to 0.0 - 1.0 81 | targets = targets.to(device) 82 | nb, _, height, width = imgs.shape # batch size, channels, height, width 83 | whwh = torch.Tensor([width, height, width, height]).to(device) 84 | 85 | # Plot images with bounding boxes 86 | f = 'test_batch%g.png' % batch_i # filename 87 | if batch_i < 1 and not os.path.exists(f): 88 | plot_images(imgs=imgs, targets=targets, paths=paths, fname=f) 89 | 90 | # Disable gradients 91 | with torch.no_grad(): 92 | # Run model 93 | t = torch_utils.time_synchronized() 94 | inf_out, train_out = model(imgs, augment=augment) # inference and training outputs 95 | t0 += torch_utils.time_synchronized() - t 96 | 97 | # Compute loss 98 | if hasattr(model, 'hyp'): # if model has loss hyperparameters 99 | loss += compute_loss(train_out, targets, model)[1][:3] # GIoU, obj, cls 100 | 101 | # Run NMS 102 | t = torch_utils.time_synchronized() 103 | output = non_max_suppression(inf_out, conf_thres=conf_thres, iou_thres=iou_thres) # nms 104 | t1 += torch_utils.time_synchronized() - t 105 | 106 | # Statistics per image 107 | for si, pred in enumerate(output): 108 | labels = targets[targets[:, 0] == si, 1:] 109 | nl = len(labels) 110 | tcls = labels[:, 0].tolist() if nl else [] # target class 111 | seen += 1 112 | 113 | if pred is None: 114 | if nl: 115 | stats.append((torch.zeros(0, niou, dtype=torch.bool), torch.Tensor(), torch.Tensor(), tcls)) 116 | continue 117 | 118 | # Append to text file 119 | # with open('test.txt', 'a') as file: 120 | # [file.write('%11.5g' * 7 % tuple(x) + '\n') for x in pred] 121 | 122 | # Clip boxes to image bounds 123 | clip_coords(pred, (height, width)) 124 | 125 | # Append to pycocotools JSON dictionary 126 | if save_json: 127 | # [{"image_id": 42, "category_id": 18, "bbox": [258.15, 41.29, 348.26, 243.78], "score": 0.236}, ... 
128 | image_id = int(Path(paths[si]).stem.split('_')[-1]) 129 | box = pred[:, :4].clone() # xyxy 130 | scale_coords(imgs[si].shape[1:], box, shapes[si][0], shapes[si][1]) # to original shape 131 | box = xyxy2xywh(box) # xywh 132 | box[:, :2] -= box[:, 2:] / 2 # xy center to top-left corner 133 | for p, b in zip(pred.tolist(), box.tolist()): 134 | jdict.append({'image_id': image_id, 135 | 'category_id': coco91class[int(p[5])], 136 | 'bbox': [round(x, 3) for x in b], 137 | 'score': round(p[4], 5)}) 138 | 139 | # Assign all predictions as incorrect 140 | correct = torch.zeros(pred.shape[0], niou, dtype=torch.bool, device=device) 141 | if nl: 142 | detected = [] # target indices 143 | tcls_tensor = labels[:, 0] 144 | 145 | # target boxes 146 | tbox = xywh2xyxy(labels[:, 1:5]) * whwh 147 | 148 | # Per target class 149 | for cls in torch.unique(tcls_tensor): 150 | ti = (cls == tcls_tensor).nonzero().view(-1) # target indices 151 | pi = (cls == pred[:, 5]).nonzero().view(-1) # prediction indices 152 | 153 | # Search for detections 154 | if pi.shape[0]: 155 | # Prediction to target ious 156 | ious, i = box_iou(pred[pi, :4], tbox[ti]).max(1) # best ious, indices 157 | 158 | # Append detections 159 | for j in (ious > iouv[0]).nonzero(): 160 | d = ti[i[j]] # detected target 161 | if d not in detected: 162 | detected.append(d) 163 | correct[pi[j]] = ious[j] > iouv # iouv is 1xn 164 | if len(detected) == nl: # all targets already located in image 165 | break 166 | 167 | # Append statistics (correct, conf, pcls, tcls) 168 | stats.append((correct.cpu(), pred[:, 4].cpu(), pred[:, 5].cpu(), tcls)) 169 | 170 | # Compute statistics 171 | stats = [np.concatenate(x, 0) for x in zip(*stats)] # to numpy 172 | if len(stats): 173 | p, r, ap, f1, ap_class = ap_per_class(*stats) 174 | if niou > 1: 175 | p, r, ap, f1 = p[:, 0], r[:, 0], ap.mean(1), ap[:, 0] # [P, R, AP@0.5:0.95, AP@0.5] 176 | mp, mr, map, mf1 = p.mean(), r.mean(), ap.mean(), f1.mean() 177 | nt = np.bincount(stats[3].astype(np.int64), minlength=nc) # number of targets per class 178 | else: 179 | nt = torch.zeros(1) 180 | 181 | # Print results 182 | pf = '%20s' + '%10.3g' * 6 # print format 183 | print(pf % ('all', seen, nt.sum(), mp, mr, map, mf1)) 184 | 185 | # Print results per class 186 | if verbose and nc > 1 and len(stats): 187 | for i, c in enumerate(ap_class): 188 | print(pf % (names[c], seen, nt[c], p[i], r[i], ap[i], f1[i])) 189 | 190 | # Print speeds 191 | if verbose or save_json: 192 | t = tuple(x / seen * 1E3 for x in (t0, t1, t0 + t1)) + (img_size, img_size, batch_size) # tuple 193 | print('Speed: %.1f/%.1f/%.1f ms inference/NMS/total per %gx%g image at batch-size %g' % t) 194 | 195 | # Save JSON 196 | if save_json and map and len(jdict): 197 | print('\nCOCO mAP with pycocotools...') 198 | imgIds = [int(Path(x).stem.split('_')[-1]) for x in dataloader.dataset.img_files] 199 | with open('results.json', 'w') as file: 200 | json.dump(jdict, file) 201 | 202 | try: 203 | from pycocotools.coco import COCO 204 | from pycocotools.cocoeval import COCOeval 205 | 206 | # https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocoEvalDemo.ipynb 207 | cocoGt = COCO(glob.glob('/mnt/sda/yolact/data/coco/annotations/instances_val2014.json')[0]) # initialize COCO ground truth api 208 | cocoDt = cocoGt.loadRes('results.json') # initialize COCO pred api 209 | 210 | cocoEval = COCOeval(cocoGt, cocoDt, 'bbox') 211 | cocoEval.params.imgIds = imgIds # [:32] # only evaluate these images 212 | cocoEval.evaluate() 213 | cocoEval.accumulate() 214 | cocoEval.summarize() 215 | # mf1, map = cocoEval.stats[:2] # update to pycocotools results (mAP@0.5:0.95, mAP@0.5) 216 | except Exception as e: 217 | print('WARNING: pycocotools unable to compute official COCO mAP: %s. See requirements.txt.' % e) 218 | 219 | # Return results 220 | maps = np.zeros(nc) + map 221 | for i, c in enumerate(ap_class): 222 | maps[c] = ap[i] 223 | return (mp, mr, map, mf1, *(loss.cpu() / len(dataloader)).tolist()), maps 224 | 225 | 226 | if __name__ == '__main__': 227 | parser = argparse.ArgumentParser(prog='test.py') 228 | parser.add_argument('--cfg', type=str, default='cfg/yolov3-spp.cfg', help='*.cfg path') 229 | parser.add_argument('--data', type=str, default='data/coco2014.data', help='*.data path') 230 | parser.add_argument('--weights', type=str, default='weights/yolov3-spp-ultralytics.pt', help='weights path') 231 | parser.add_argument('--batch-size', type=int, default=4, help='size of each image batch') 232 | parser.add_argument('--img-size', type=int, default=608, help='inference size (pixels)') 233 | parser.add_argument('--conf-thres', type=float, default=0.001, help='object confidence threshold') 234 | parser.add_argument('--iou-thres', type=float, default=0.6, help='IOU threshold for NMS') 235 | parser.add_argument('--save-json', action='store_true', help='save a cocoapi-compatible JSON results file') 236 | parser.add_argument('--task', default='test', help="'test', 'study', 'benchmark'") 237 | parser.add_argument('--device', default='', help='device id (i.e. 0 or 0,1) or cpu')
238 | parser.add_argument('--single-cls', action='store_true', help='train as single-class dataset') 239 | parser.add_argument('--augment', action='store_true', help='augmented inference') 240 | opt = parser.parse_args() 241 | opt.save_json = opt.save_json or any([x in opt.data for x in ['coco.data', 'coco2014.data', 'coco2017.data']]) 242 | print(opt) 243 | 244 | # task = 'test', 'study', 'benchmark' 245 | if opt.task == 'test': # (default) test normally 246 | test(opt.cfg, 247 | opt.data, 248 | opt.weights, 249 | opt.batch_size, 250 | opt.img_size, 251 | opt.conf_thres, 252 | opt.iou_thres, 253 | opt.save_json, 254 | opt.single_cls, 255 | opt.augment) 256 | 257 | elif opt.task == 'benchmark': # mAPs at img-size 320-608 at iou-thres 0.5 and 0.7 258 | y = [] 259 | for i in [320, 416, 512, 608]: # img-size 260 | for j in [0.5, 0.7]: # iou-thres 261 | t = time.time() 262 | r = test(opt.cfg, opt.data, opt.weights, opt.batch_size, i, opt.conf_thres, j, opt.save_json)[0] 263 | y.append(r + (time.time() - t,)) 264 | np.savetxt('benchmark.txt', y, fmt='%10.4g') # y = np.loadtxt('benchmark.txt') 265 | 266 | elif opt.task == 'study': # Parameter study 267 | y = [] 268 | x = np.arange(0.4, 0.9, 0.05) # iou-thres 269 | for i in x: 270 | t = time.time() 271 | r = test(opt.cfg, opt.data, opt.weights, opt.batch_size, opt.img_size, opt.conf_thres, i, opt.save_json)[0] 272 | y.append(r + (time.time() - t,)) 273 | np.savetxt('study.txt', y, fmt='%10.4g') # y = np.loadtxt('study.txt') 274 | 275 | # Plot 276 | fig, ax = plt.subplots(3, 1, figsize=(6, 6)) 277 | y = np.stack(y, 0) 278 | ax[0].plot(x, y[:, 2], marker='.', label='mAP@0.5') 279 | ax[0].set_ylabel('mAP') 280 | ax[1].plot(x, y[:, 3], marker='.', label='mAP@0.5:0.95') 281 | ax[1].set_ylabel('mAP') 282 | ax[2].plot(x, y[:, -1], marker='.', label='time') 283 | ax[2].set_ylabel('time (s)') 284 | for i in range(3): 285 | ax[i].legend() 286 | ax[i].set_xlabel('iou_thr') 287 | fig.tight_layout() 288 | plt.savefig('study.jpg', dpi=200) 289 | -------------------------------------------------------------------------------- /cfg/yolov3-spp-pan-scale.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | #batch=1 4 | #subdivisions=1 5 | # Training 6 | batch=64 7 | subdivisions=32 8 | width=544 9 | height=544 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | 19 | learning_rate=0.001 20 | burn_in=1000 21 | max_batches = 10000 22 | 23 | policy=steps 24 | steps=8000,9000 25 | scales=.1,.1 26 | 27 | #policy=sgdr 28 | #sgdr_cycle=1000 29 | #sgdr_mult=2 30 | #steps=4000,6000,8000,9000 31 | #scales=1, 1, 0.1, 0.1 32 | 33 | [convolutional] 34 | batch_normalize=1 35 | filters=32 36 | size=3 37 | stride=1 38 | pad=1 39 | activation=leaky 40 | 41 | # Downsample 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=64 46 | size=3 47 | stride=2 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=32 54 | size=1 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [convolutional] 60 | batch_normalize=1 61 | filters=64 62 | size=3 63 | stride=1 64 | pad=1 65 | activation=leaky 66 | 67 | [shortcut] 68 | from=-3 69 | activation=linear 70 | 71 | # Downsample 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=128 76 | size=3 77 | stride=2 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=64 84 | size=1 85 | stride=1 86 | pad=1 87 |
activation=leaky 88 | 89 | [convolutional] 90 | batch_normalize=1 91 | filters=128 92 | size=3 93 | stride=1 94 | pad=1 95 | activation=leaky 96 | 97 | [shortcut] 98 | from=-3 99 | activation=linear 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=64 104 | size=1 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [convolutional] 110 | batch_normalize=1 111 | filters=128 112 | size=3 113 | stride=1 114 | pad=1 115 | activation=leaky 116 | 117 | [shortcut] 118 | from=-3 119 | activation=linear 120 | 121 | # Downsample 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=256 126 | size=3 127 | stride=2 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=128 134 | size=1 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [convolutional] 140 | batch_normalize=1 141 | filters=256 142 | size=3 143 | stride=1 144 | pad=1 145 | activation=leaky 146 | 147 | [shortcut] 148 | from=-3 149 | activation=linear 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=128 154 | size=1 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [convolutional] 160 | batch_normalize=1 161 | filters=256 162 | size=3 163 | stride=1 164 | pad=1 165 | activation=leaky 166 | 167 | [shortcut] 168 | from=-3 169 | activation=linear 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=128 174 | size=1 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [convolutional] 180 | batch_normalize=1 181 | filters=256 182 | size=3 183 | stride=1 184 | pad=1 185 | activation=leaky 186 | 187 | [shortcut] 188 | from=-3 189 | activation=linear 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=128 194 | size=1 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [convolutional] 200 | batch_normalize=1 201 | filters=256 202 | size=3 203 | stride=1 204 | pad=1 205 | activation=leaky 206 | 207 | [shortcut] 208 | from=-3 209 | activation=linear 210 | 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=128 215 | size=1 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [convolutional] 221 | batch_normalize=1 222 | filters=256 223 | size=3 224 | stride=1 225 | pad=1 226 | activation=leaky 227 | 228 | [shortcut] 229 | from=-3 230 | activation=linear 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=128 235 | size=1 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [convolutional] 241 | batch_normalize=1 242 | filters=256 243 | size=3 244 | stride=1 245 | pad=1 246 | activation=leaky 247 | 248 | [shortcut] 249 | from=-3 250 | activation=linear 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=128 255 | size=1 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [convolutional] 261 | batch_normalize=1 262 | filters=256 263 | size=3 264 | stride=1 265 | pad=1 266 | activation=leaky 267 | 268 | [shortcut] 269 | from=-3 270 | activation=linear 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=128 275 | size=1 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [convolutional] 281 | batch_normalize=1 282 | filters=256 283 | size=3 284 | stride=1 285 | pad=1 286 | activation=leaky 287 | 288 | [shortcut] 289 | from=-3 290 | activation=linear 291 | 292 | # Downsample 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=512 297 | size=3 298 | stride=2 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=256 305 | size=1 306 | 
stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [convolutional] 311 | batch_normalize=1 312 | filters=512 313 | size=3 314 | stride=1 315 | pad=1 316 | activation=leaky 317 | 318 | [shortcut] 319 | from=-3 320 | activation=linear 321 | 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=256 326 | size=1 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [convolutional] 332 | batch_normalize=1 333 | filters=512 334 | size=3 335 | stride=1 336 | pad=1 337 | activation=leaky 338 | 339 | [shortcut] 340 | from=-3 341 | activation=linear 342 | 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=256 347 | size=1 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [convolutional] 353 | batch_normalize=1 354 | filters=512 355 | size=3 356 | stride=1 357 | pad=1 358 | activation=leaky 359 | 360 | [shortcut] 361 | from=-3 362 | activation=linear 363 | 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=256 368 | size=1 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [convolutional] 374 | batch_normalize=1 375 | filters=512 376 | size=3 377 | stride=1 378 | pad=1 379 | activation=leaky 380 | 381 | [shortcut] 382 | from=-3 383 | activation=linear 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=256 388 | size=1 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [convolutional] 394 | batch_normalize=1 395 | filters=512 396 | size=3 397 | stride=1 398 | pad=1 399 | activation=leaky 400 | 401 | [shortcut] 402 | from=-3 403 | activation=linear 404 | 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=256 409 | size=1 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [convolutional] 415 | batch_normalize=1 416 | filters=512 417 | size=3 418 | stride=1 419 | pad=1 420 | activation=leaky 421 | 422 | [shortcut] 423 | from=-3 424 | activation=linear 425 | 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=256 430 | size=1 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [convolutional] 436 | batch_normalize=1 437 | filters=512 438 | size=3 439 | stride=1 440 | pad=1 441 | activation=leaky 442 | 443 | [shortcut] 444 | from=-3 445 | activation=linear 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=256 450 | size=1 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [convolutional] 456 | batch_normalize=1 457 | filters=512 458 | size=3 459 | stride=1 460 | pad=1 461 | activation=leaky 462 | 463 | [shortcut] 464 | from=-3 465 | activation=linear 466 | 467 | # Downsample 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=1024 472 | size=3 473 | stride=2 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=512 480 | size=1 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [convolutional] 486 | batch_normalize=1 487 | filters=1024 488 | size=3 489 | stride=1 490 | pad=1 491 | activation=leaky 492 | 493 | [shortcut] 494 | from=-3 495 | activation=linear 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=512 500 | size=1 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [convolutional] 506 | batch_normalize=1 507 | filters=1024 508 | size=3 509 | stride=1 510 | pad=1 511 | activation=leaky 512 | 513 | [shortcut] 514 | from=-3 515 | activation=linear 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=512 520 | size=1 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [convolutional] 526 | 
batch_normalize=1 527 | filters=1024 528 | size=3 529 | stride=1 530 | pad=1 531 | activation=leaky 532 | 533 | [shortcut] 534 | from=-3 535 | activation=linear 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=512 540 | size=1 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [convolutional] 546 | batch_normalize=1 547 | filters=1024 548 | size=3 549 | stride=1 550 | pad=1 551 | activation=leaky 552 | 553 | [shortcut] 554 | from=-3 555 | activation=linear 556 | 557 | ###################### 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | filters=512 562 | size=1 563 | stride=1 564 | pad=1 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | size=3 570 | stride=1 571 | pad=1 572 | filters=1024 573 | activation=leaky 574 | 575 | [convolutional] 576 | batch_normalize=1 577 | filters=512 578 | size=1 579 | stride=1 580 | pad=1 581 | activation=leaky 582 | 583 | ### SPP ### 584 | [maxpool] 585 | stride=1 586 | size=5 587 | 588 | [route] 589 | layers=-2 590 | 591 | [maxpool] 592 | stride=1 593 | size=9 594 | 595 | [route] 596 | layers=-4 597 | 598 | [maxpool] 599 | stride=1 600 | size=13 601 | 602 | [route] 603 | layers=-1,-3,-5,-6 604 | 605 | ### End SPP ### 606 | 607 | [convolutional] 608 | batch_normalize=1 609 | filters=512 610 | size=1 611 | stride=1 612 | pad=1 613 | activation=leaky 614 | 615 | 616 | [convolutional] 617 | batch_normalize=1 618 | size=3 619 | stride=1 620 | pad=1 621 | filters=1024 622 | activation=leaky 623 | 624 | [convolutional] 625 | batch_normalize=1 626 | filters=512 627 | size=1 628 | stride=1 629 | pad=1 630 | activation=leaky 631 | 632 | 633 | 634 | ########### to [yolo-3] 635 | 636 | 637 | 638 | [route] 639 | layers = -4 640 | 641 | [convolutional] 642 | batch_normalize=1 643 | filters=256 644 | size=1 645 | stride=1 646 | pad=1 647 | activation=leaky 648 | 649 | [upsample] 650 | stride=2 651 | 652 | [route] 653 | layers = -1, 61 654 | 655 | 656 | 657 | [convolutional] 658 | batch_normalize=1 659 | filters=256 660 | size=1 661 | stride=1 662 | pad=1 663 | activation=leaky 664 | 665 | [convolutional] 666 | batch_normalize=1 667 | size=3 668 | stride=1 669 | pad=1 670 | filters=512 671 | activation=leaky 672 | 673 | [convolutional] 674 | batch_normalize=1 675 | filters=256 676 | size=1 677 | stride=1 678 | pad=1 679 | activation=leaky 680 | 681 | [convolutional] 682 | batch_normalize=1 683 | size=3 684 | stride=1 685 | pad=1 686 | filters=512 687 | activation=leaky 688 | 689 | [convolutional] 690 | batch_normalize=1 691 | filters=256 692 | size=1 693 | stride=1 694 | pad=1 695 | activation=leaky 696 | 697 | 698 | ########### to [yolo-2] 699 | 700 | 701 | 702 | 703 | [route] 704 | layers = -4 705 | 706 | [convolutional] 707 | batch_normalize=1 708 | filters=128 709 | size=1 710 | stride=1 711 | pad=1 712 | activation=leaky 713 | 714 | [upsample] 715 | stride=2 716 | 717 | [route] 718 | layers = -1, 36 719 | 720 | 721 | 722 | [convolutional] 723 | batch_normalize=1 724 | filters=128 725 | size=1 726 | stride=1 727 | pad=1 728 | activation=leaky 729 | 730 | [convolutional] 731 | batch_normalize=1 732 | size=3 733 | stride=1 734 | pad=1 735 | filters=256 736 | activation=leaky 737 | 738 | [convolutional] 739 | batch_normalize=1 740 | filters=128 741 | size=1 742 | stride=1 743 | pad=1 744 | activation=leaky 745 | 746 | [convolutional] 747 | batch_normalize=1 748 | size=3 749 | stride=1 750 | pad=1 751 | filters=256 752 | activation=leaky 753 | 754 | [convolutional] 755 | batch_normalize=1 756 | 
filters=128 757 | size=1 758 | stride=1 759 | pad=1 760 | activation=leaky 761 | 762 | 763 | 764 | ########### to [yolo-1] 765 | 766 | 767 | ########### features of different layers 768 | 769 | 770 | [route] 771 | layers=1 772 | 773 | [reorg3d] 774 | stride=2 775 | 776 | [route] 777 | layers=5,-1 778 | 779 | [reorg3d] 780 | stride=2 781 | 782 | [route] 783 | layers=12,-1 784 | 785 | [reorg3d] 786 | stride=2 787 | 788 | [route] 789 | layers=37,-1 790 | 791 | [reorg3d] 792 | stride=2 793 | 794 | [route] 795 | layers=62,-1 796 | 797 | 798 | 799 | ########### [yolo-1] 800 | 801 | [convolutional] 802 | batch_normalize=1 803 | filters=128 804 | size=1 805 | stride=1 806 | pad=1 807 | activation=leaky 808 | 809 | [upsample] 810 | stride=4 811 | 812 | [route] 813 | layers = -1,-12 814 | 815 | 816 | [convolutional] 817 | batch_normalize=1 818 | size=3 819 | stride=1 820 | pad=1 821 | filters=256 822 | activation=leaky 823 | 824 | [convolutional] 825 | size=1 826 | stride=1 827 | pad=1 828 | filters=340 829 | activation=linear 830 | 831 | 832 | [yolo] 833 | mask = 0,1,2,3 834 | anchors = 8,8, 10,13, 16,30, 33,23, 32,32, 30,61, 62,45, 64,64, 59,119, 116,90, 156,198, 373,326 835 | classes=80 836 | num=12 837 | jitter=.3 838 | ignore_thresh = .7 839 | truth_thresh = 1 840 | scale_x_y = 1.05 841 | random=0 842 | 843 | 844 | 845 | 846 | ########### [yolo-2] 847 | 848 | 849 | [route] 850 | layers = -7 851 | 852 | [convolutional] 853 | batch_normalize=1 854 | filters=256 855 | size=1 856 | stride=1 857 | pad=1 858 | activation=leaky 859 | 860 | [upsample] 861 | stride=2 862 | 863 | [route] 864 | layers = -1,-28 865 | 866 | 867 | [convolutional] 868 | batch_normalize=1 869 | size=3 870 | stride=1 871 | pad=1 872 | filters=512 873 | activation=leaky 874 | 875 | [convolutional] 876 | size=1 877 | stride=1 878 | pad=1 879 | filters=340 880 | activation=linear 881 | 882 | 883 | [yolo] 884 | mask = 4,5,6,7 885 | anchors = 8,8, 10,13, 16,30, 33,23, 32,32, 30,61, 62,45, 64,64, 59,119, 116,90, 156,198, 373,326 886 | classes=80 887 | num=12 888 | jitter=.3 889 | ignore_thresh = .7 890 | truth_thresh = 1 891 | scale_x_y = 1.1 892 | random=0 893 | 894 | 895 | 896 | ########### [yolo-3] 897 | 898 | [route] 899 | layers = -14 900 | 901 | [convolutional] 902 | batch_normalize=1 903 | filters=512 904 | size=1 905 | stride=1 906 | pad=1 907 | activation=leaky 908 | 909 | [route] 910 | layers = -1,-43 911 | 912 | [convolutional] 913 | batch_normalize=1 914 | size=3 915 | stride=1 916 | pad=1 917 | filters=1024 918 | activation=leaky 919 | 920 | 921 | [convolutional] 922 | size=1 923 | stride=1 924 | pad=1 925 | filters=340 926 | activation=linear 927 | 928 | 929 | [yolo] 930 | mask = 8,9,10,11 931 | anchors = 8,8, 10,13, 16,30, 33,23, 32,32, 30,61, 62,45, 59,119, 80,80, 116,90, 156,198, 373,326 932 | classes=80 933 | num=12 934 | jitter=.3 935 | ignore_thresh = .7 936 | truth_thresh = 1 937 | scale_x_y = 1.2 938 | random=0 939 | -------------------------------------------------------------------------------- /cfg/csresnext50-panet-spp.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | #batch=1 4 | #subdivisions=1 5 | # Training 6 | batch=64 7 | subdivisions=16 8 | width=416 9 | height=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500500 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | #19:104x104 
38:52x52 65:26x26 80:13x13 for 416 26 | 27 | [convolutional] 28 | batch_normalize=1 29 | filters=64 30 | size=7 31 | stride=2 32 | pad=1 33 | activation=leaky 34 | 35 | [maxpool] 36 | size=2 37 | stride=2 38 | 39 | [convolutional] 40 | batch_normalize=1 41 | filters=128 42 | size=1 43 | stride=1 44 | pad=1 45 | activation=leaky 46 | 47 | [route] 48 | layers = -2 49 | 50 | [convolutional] 51 | batch_normalize=1 52 | filters=64 53 | size=1 54 | stride=1 55 | pad=1 56 | activation=leaky 57 | 58 | # 1-1 59 | 60 | [convolutional] 61 | batch_normalize=1 62 | filters=128 63 | size=1 64 | stride=1 65 | pad=1 66 | activation=leaky 67 | 68 | [convolutional] 69 | batch_normalize=1 70 | filters=128 71 | size=3 72 | groups=32 73 | stride=1 74 | pad=1 75 | activation=leaky 76 | 77 | [convolutional] 78 | batch_normalize=1 79 | filters=64 80 | size=1 81 | stride=1 82 | pad=1 83 | activation=linear 84 | 85 | [shortcut] 86 | from=-4 87 | activation=leaky 88 | 89 | # 1-2 90 | 91 | [convolutional] 92 | batch_normalize=1 93 | filters=128 94 | size=1 95 | stride=1 96 | pad=1 97 | activation=leaky 98 | 99 | [convolutional] 100 | batch_normalize=1 101 | filters=128 102 | size=3 103 | groups=32 104 | stride=1 105 | pad=1 106 | activation=leaky 107 | 108 | [convolutional] 109 | batch_normalize=1 110 | filters=64 111 | size=1 112 | stride=1 113 | pad=1 114 | activation=linear 115 | 116 | [shortcut] 117 | from=-4 118 | activation=leaky 119 | 120 | # 1-3 121 | 122 | [convolutional] 123 | batch_normalize=1 124 | filters=128 125 | size=1 126 | stride=1 127 | pad=1 128 | activation=leaky 129 | 130 | [convolutional] 131 | batch_normalize=1 132 | filters=128 133 | size=3 134 | groups=32 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [convolutional] 140 | batch_normalize=1 141 | filters=64 142 | size=1 143 | stride=1 144 | pad=1 145 | activation=linear 146 | 147 | [shortcut] 148 | from=-4 149 | activation=leaky 150 | 151 | # 1-T 152 | 153 | [convolutional] 154 | batch_normalize=1 155 | filters=128 156 | size=1 157 | stride=1 158 | pad=1 159 | activation=leaky 160 | 161 | [route] 162 | layers = -1,-16 163 | 164 | [convolutional] 165 | batch_normalize=1 166 | filters=256 167 | size=1 168 | stride=1 169 | pad=1 170 | activation=leaky 171 | 172 | [convolutional] 173 | batch_normalize=1 174 | filters=256 175 | size=3 176 | groups=32 177 | stride=2 178 | pad=1 179 | activation=leaky 180 | 181 | [convolutional] 182 | batch_normalize=1 183 | filters=256 184 | size=1 185 | stride=1 186 | pad=1 187 | activation=linear 188 | 189 | [route] 190 | layers = -2 191 | 192 | [convolutional] 193 | batch_normalize=1 194 | filters=256 195 | size=1 196 | stride=1 197 | pad=1 198 | activation=linear 199 | 200 | # 2-1 201 | 202 | [convolutional] 203 | batch_normalize=1 204 | filters=256 205 | size=1 206 | stride=1 207 | pad=1 208 | activation=leaky 209 | 210 | [convolutional] 211 | batch_normalize=1 212 | filters=256 213 | size=3 214 | groups=32 215 | stride=1 216 | pad=1 217 | activation=leaky 218 | 219 | [convolutional] 220 | batch_normalize=1 221 | filters=256 222 | size=1 223 | stride=1 224 | pad=1 225 | activation=linear 226 | 227 | [shortcut] 228 | from=-4 229 | activation=leaky 230 | 231 | # 2-2 232 | 233 | [convolutional] 234 | batch_normalize=1 235 | filters=256 236 | size=1 237 | stride=1 238 | pad=1 239 | activation=leaky 240 | 241 | [convolutional] 242 | batch_normalize=1 243 | filters=256 244 | size=3 245 | groups=32 246 | stride=1 247 | pad=1 248 | activation=leaky 249 | 250 | [convolutional] 251 | batch_normalize=1 
252 | filters=256 253 | size=1 254 | stride=1 255 | pad=1 256 | activation=linear 257 | 258 | [shortcut] 259 | from=-4 260 | activation=leaky 261 | 262 | # 2-3 263 | 264 | [convolutional] 265 | batch_normalize=1 266 | filters=256 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | groups=32 277 | stride=1 278 | pad=1 279 | activation=leaky 280 | 281 | [convolutional] 282 | batch_normalize=1 283 | filters=256 284 | size=1 285 | stride=1 286 | pad=1 287 | activation=linear 288 | 289 | [shortcut] 290 | from=-4 291 | activation=leaky 292 | 293 | # 2-T 294 | 295 | [convolutional] 296 | batch_normalize=1 297 | filters=256 298 | size=1 299 | stride=1 300 | pad=1 301 | activation=leaky 302 | 303 | [route] 304 | layers = -1,-16 305 | 306 | [convolutional] 307 | batch_normalize=1 308 | filters=512 309 | size=1 310 | stride=1 311 | pad=1 312 | activation=leaky 313 | 314 | [convolutional] 315 | batch_normalize=1 316 | filters=512 317 | size=3 318 | groups=32 319 | stride=2 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=1 327 | stride=1 328 | pad=1 329 | activation=linear 330 | 331 | [route] 332 | layers = -2 333 | 334 | [convolutional] 335 | batch_normalize=1 336 | filters=512 337 | size=1 338 | stride=1 339 | pad=1 340 | activation=linear 341 | 342 | # 3-1 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=1 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [convolutional] 353 | batch_normalize=1 354 | filters=512 355 | size=3 356 | groups=32 357 | stride=1 358 | pad=1 359 | activation=leaky 360 | 361 | [convolutional] 362 | batch_normalize=1 363 | filters=512 364 | size=1 365 | stride=1 366 | pad=1 367 | activation=linear 368 | 369 | [shortcut] 370 | from=-4 371 | activation=leaky 372 | 373 | # 3-2 374 | 375 | [convolutional] 376 | batch_normalize=1 377 | filters=512 378 | size=1 379 | stride=1 380 | pad=1 381 | activation=leaky 382 | 383 | [convolutional] 384 | batch_normalize=1 385 | filters=512 386 | size=3 387 | groups=32 388 | stride=1 389 | pad=1 390 | activation=leaky 391 | 392 | [convolutional] 393 | batch_normalize=1 394 | filters=512 395 | size=1 396 | stride=1 397 | pad=1 398 | activation=linear 399 | 400 | [shortcut] 401 | from=-4 402 | activation=leaky 403 | 404 | # 3-3 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=1 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [convolutional] 415 | batch_normalize=1 416 | filters=512 417 | size=3 418 | groups=32 419 | stride=1 420 | pad=1 421 | activation=leaky 422 | 423 | [convolutional] 424 | batch_normalize=1 425 | filters=512 426 | size=1 427 | stride=1 428 | pad=1 429 | activation=linear 430 | 431 | [shortcut] 432 | from=-4 433 | activation=leaky 434 | 435 | # 3-4 436 | 437 | [convolutional] 438 | batch_normalize=1 439 | filters=512 440 | size=1 441 | stride=1 442 | pad=1 443 | activation=leaky 444 | 445 | [convolutional] 446 | batch_normalize=1 447 | filters=512 448 | size=3 449 | groups=32 450 | stride=1 451 | pad=1 452 | activation=leaky 453 | 454 | [convolutional] 455 | batch_normalize=1 456 | filters=512 457 | size=1 458 | stride=1 459 | pad=1 460 | activation=linear 461 | 462 | [shortcut] 463 | from=-4 464 | activation=leaky 465 | 466 | # 3-5 467 | 468 | [convolutional] 469 | batch_normalize=1 470 | filters=512 471 | size=1 472 | stride=1 473 | pad=1 474 | activation=leaky 475 | 476 
| [convolutional] 477 | batch_normalize=1 478 | filters=512 479 | size=3 480 | groups=32 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [convolutional] 486 | batch_normalize=1 487 | filters=512 488 | size=1 489 | stride=1 490 | pad=1 491 | activation=linear 492 | 493 | [shortcut] 494 | from=-4 495 | activation=leaky 496 | 497 | # 3-T 498 | 499 | [convolutional] 500 | batch_normalize=1 501 | filters=512 502 | size=1 503 | stride=1 504 | pad=1 505 | activation=leaky 506 | 507 | [route] 508 | layers = -1,-24 509 | 510 | [convolutional] 511 | batch_normalize=1 512 | filters=1024 513 | size=1 514 | stride=1 515 | pad=1 516 | activation=leaky 517 | 518 | [convolutional] 519 | batch_normalize=1 520 | filters=1024 521 | size=3 522 | groups=32 523 | stride=2 524 | pad=1 525 | activation=leaky 526 | 527 | [convolutional] 528 | batch_normalize=1 529 | filters=1024 530 | size=1 531 | stride=1 532 | pad=1 533 | activation=leaky 534 | 535 | [route] 536 | layers = -2 537 | 538 | [convolutional] 539 | batch_normalize=1 540 | filters=1024 541 | size=1 542 | stride=1 543 | pad=1 544 | activation=leaky 545 | 546 | # 4-1 547 | 548 | [convolutional] 549 | batch_normalize=1 550 | filters=1024 551 | size=1 552 | stride=1 553 | pad=1 554 | activation=leaky 555 | 556 | [convolutional] 557 | batch_normalize=1 558 | filters=1024 559 | size=3 560 | groups=32 561 | stride=1 562 | pad=1 563 | activation=leaky 564 | 565 | [convolutional] 566 | batch_normalize=1 567 | filters=1024 568 | size=1 569 | stride=1 570 | pad=1 571 | activation=linear 572 | 573 | [shortcut] 574 | from=-4 575 | activation=leaky 576 | 577 | # 4-2 578 | 579 | [convolutional] 580 | batch_normalize=1 581 | filters=1024 582 | size=1 583 | stride=1 584 | pad=1 585 | activation=leaky 586 | 587 | [convolutional] 588 | batch_normalize=1 589 | filters=1024 590 | size=3 591 | groups=32 592 | stride=1 593 | pad=1 594 | activation=leaky 595 | 596 | [convolutional] 597 | batch_normalize=1 598 | filters=1024 599 | size=1 600 | stride=1 601 | pad=1 602 | activation=linear 603 | 604 | [shortcut] 605 | from=-4 606 | activation=leaky 607 | 608 | # 4-T 609 | 610 | [convolutional] 611 | batch_normalize=1 612 | filters=1024 613 | size=1 614 | stride=1 615 | pad=1 616 | activation=leaky 617 | 618 | [route] 619 | layers = -1,-12 620 | 621 | [convolutional] 622 | batch_normalize=1 623 | filters=2048 624 | size=1 625 | stride=1 626 | pad=1 627 | activation=leaky 628 | 629 | ########################## 630 | 631 | [convolutional] 632 | batch_normalize=1 633 | filters=512 634 | size=1 635 | stride=1 636 | pad=1 637 | activation=leaky 638 | 639 | [convolutional] 640 | batch_normalize=1 641 | size=3 642 | stride=1 643 | pad=1 644 | filters=1024 645 | activation=leaky 646 | 647 | [convolutional] 648 | batch_normalize=1 649 | filters=512 650 | size=1 651 | stride=1 652 | pad=1 653 | activation=leaky 654 | 655 | ### SPP ### 656 | [maxpool] 657 | stride=1 658 | size=5 659 | 660 | [route] 661 | layers=-2 662 | 663 | [maxpool] 664 | stride=1 665 | size=9 666 | 667 | [route] 668 | layers=-4 669 | 670 | [maxpool] 671 | stride=1 672 | size=13 673 | 674 | [route] 675 | layers=-1,-3,-5,-6 676 | ### End SPP ### 677 | 678 | [convolutional] 679 | batch_normalize=1 680 | filters=512 681 | size=1 682 | stride=1 683 | pad=1 684 | activation=leaky 685 | 686 | [convolutional] 687 | batch_normalize=1 688 | size=3 689 | stride=1 690 | pad=1 691 | filters=1024 692 | activation=leaky 693 | 694 | [convolutional] 695 | batch_normalize=1 696 | filters=512 697 | size=1 698 | stride=1 699 
| pad=1 700 | activation=leaky 701 | 702 | [convolutional] 703 | batch_normalize=1 704 | filters=256 705 | size=1 706 | stride=1 707 | pad=1 708 | activation=leaky 709 | 710 | [upsample] 711 | stride=2 712 | 713 | [route] 714 | layers = 65 715 | 716 | [convolutional] 717 | batch_normalize=1 718 | filters=256 719 | size=1 720 | stride=1 721 | pad=1 722 | activation=leaky 723 | 724 | [route] 725 | layers = -1, -3 726 | 727 | [convolutional] 728 | batch_normalize=1 729 | filters=256 730 | size=1 731 | stride=1 732 | pad=1 733 | activation=leaky 734 | 735 | [convolutional] 736 | batch_normalize=1 737 | size=3 738 | stride=1 739 | pad=1 740 | filters=512 741 | activation=leaky 742 | 743 | [convolutional] 744 | batch_normalize=1 745 | filters=256 746 | size=1 747 | stride=1 748 | pad=1 749 | activation=leaky 750 | 751 | [convolutional] 752 | batch_normalize=1 753 | size=3 754 | stride=1 755 | pad=1 756 | filters=512 757 | activation=leaky 758 | 759 | [convolutional] 760 | batch_normalize=1 761 | filters=256 762 | size=1 763 | stride=1 764 | pad=1 765 | activation=leaky 766 | 767 | [convolutional] 768 | batch_normalize=1 769 | filters=128 770 | size=1 771 | stride=1 772 | pad=1 773 | activation=leaky 774 | 775 | [upsample] 776 | stride=2 777 | 778 | [route] 779 | layers = 38 780 | 781 | [convolutional] 782 | batch_normalize=1 783 | filters=128 784 | size=1 785 | stride=1 786 | pad=1 787 | activation=leaky 788 | 789 | [route] 790 | layers = -1, -3 791 | 792 | [convolutional] 793 | batch_normalize=1 794 | filters=128 795 | size=1 796 | stride=1 797 | pad=1 798 | activation=leaky 799 | 800 | [convolutional] 801 | batch_normalize=1 802 | size=3 803 | stride=1 804 | pad=1 805 | filters=256 806 | activation=leaky 807 | 808 | [convolutional] 809 | batch_normalize=1 810 | filters=128 811 | size=1 812 | stride=1 813 | pad=1 814 | activation=leaky 815 | 816 | [convolutional] 817 | batch_normalize=1 818 | size=3 819 | stride=1 820 | pad=1 821 | filters=256 822 | activation=leaky 823 | 824 | [convolutional] 825 | batch_normalize=1 826 | filters=128 827 | size=1 828 | stride=1 829 | pad=1 830 | activation=leaky 831 | 832 | ########################## 833 | 834 | [convolutional] 835 | batch_normalize=1 836 | size=3 837 | stride=1 838 | pad=1 839 | filters=256 840 | activation=leaky 841 | 842 | [convolutional] 843 | size=1 844 | stride=1 845 | pad=1 846 | filters=255 847 | activation=linear 848 | 849 | 850 | [yolo] 851 | mask = 0,1,2 852 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 853 | classes=80 854 | num=9 855 | jitter=.3 856 | ignore_thresh = .7 857 | truth_thresh = 1 858 | random=1 859 | 860 | [route] 861 | layers = -4 862 | 863 | [convolutional] 864 | batch_normalize=1 865 | size=3 866 | stride=2 867 | pad=1 868 | filters=256 869 | activation=leaky 870 | 871 | [route] 872 | layers = -1, -16 873 | 874 | [convolutional] 875 | batch_normalize=1 876 | filters=256 877 | size=1 878 | stride=1 879 | pad=1 880 | activation=leaky 881 | 882 | [convolutional] 883 | batch_normalize=1 884 | size=3 885 | stride=1 886 | pad=1 887 | filters=512 888 | activation=leaky 889 | 890 | [convolutional] 891 | batch_normalize=1 892 | filters=256 893 | size=1 894 | stride=1 895 | pad=1 896 | activation=leaky 897 | 898 | [convolutional] 899 | batch_normalize=1 900 | size=3 901 | stride=1 902 | pad=1 903 | filters=512 904 | activation=leaky 905 | 906 | [convolutional] 907 | batch_normalize=1 908 | filters=256 909 | size=1 910 | stride=1 911 | pad=1 912 | activation=leaky 913 | 914 | [convolutional] 
915 | batch_normalize=1 916 | size=3 917 | stride=1 918 | pad=1 919 | filters=512 920 | activation=leaky 921 | 922 | [convolutional] 923 | size=1 924 | stride=1 925 | pad=1 926 | filters=255 927 | activation=linear 928 | 929 | 930 | [yolo] 931 | mask = 3,4,5 932 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 933 | classes=80 934 | num=9 935 | jitter=.3 936 | ignore_thresh = .7 937 | truth_thresh = 1 938 | random=1 939 | 940 | [route] 941 | layers = -4 942 | 943 | [convolutional] 944 | batch_normalize=1 945 | size=3 946 | stride=2 947 | pad=1 948 | filters=512 949 | activation=leaky 950 | 951 | [route] 952 | layers = -1, -37 953 | 954 | [convolutional] 955 | batch_normalize=1 956 | filters=512 957 | size=1 958 | stride=1 959 | pad=1 960 | activation=leaky 961 | 962 | [convolutional] 963 | batch_normalize=1 964 | size=3 965 | stride=1 966 | pad=1 967 | filters=1024 968 | activation=leaky 969 | 970 | [convolutional] 971 | batch_normalize=1 972 | filters=512 973 | size=1 974 | stride=1 975 | pad=1 976 | activation=leaky 977 | 978 | [convolutional] 979 | batch_normalize=1 980 | size=3 981 | stride=1 982 | pad=1 983 | filters=1024 984 | activation=leaky 985 | 986 | [convolutional] 987 | batch_normalize=1 988 | filters=512 989 | size=1 990 | stride=1 991 | pad=1 992 | activation=leaky 993 | 994 | [convolutional] 995 | batch_normalize=1 996 | size=3 997 | stride=1 998 | pad=1 999 | filters=1024 1000 | activation=leaky 1001 | 1002 | [convolutional] 1003 | size=1 1004 | stride=1 1005 | pad=1 1006 | filters=255 1007 | activation=linear 1008 | 1009 | 1010 | [yolo] 1011 | mask = 6,7,8 1012 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 1013 | classes=80 1014 | num=9 1015 | jitter=.3 1016 | ignore_thresh = .7 1017 | truth_thresh = 1 1018 | random=1 1019 | --------------------------------------------------------------------------------
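A note on the recurring ### SPP ### blocks in the cfgs above (the SPP heads, yolov3-spp-pan-scale.cfg and csresnext50-panet-spp.cfg): each block max-pools the same feature map at kernel sizes 5, 9 and 13 with stride 1, then the [route] layers=-1,-3,-5,-6 layer concatenates the three pooled maps with the un-pooled input, quadrupling the channel count (e.g. 512 -> 2048) before the following 1x1 convolution reduces it again. Below is a minimal PyTorch sketch of that block, assuming Darknet's stride-1 [maxpool] pads by size//2 so spatial dimensions are preserved; the SPP module name is ours, not part of this repo.

import torch
import torch.nn as nn


class SPP(nn.Module):
    """Spatial pyramid pooling as defined in the cfgs: three stride-1
    maxpools (5x5, 9x9, 13x13) concatenated with the raw input on channels."""

    def __init__(self, sizes=(5, 9, 13)):
        super().__init__()
        # padding=k // 2 keeps H x W unchanged, matching Darknet's stride-1 [maxpool]
        self.pools = nn.ModuleList([nn.MaxPool2d(k, stride=1, padding=k // 2) for k in sizes])

    def forward(self, x):
        # the cfg's [route] layers=-1,-3,-5,-6 performs the same concat
        # (only the channel ordering differs)
        return torch.cat([m(x) for m in self.pools] + [x], dim=1)


if __name__ == '__main__':
    x = torch.zeros(1, 512, 13, 13)  # e.g. the 512-channel map feeding the first SPP block
    print(SPP()(x).shape)            # torch.Size([1, 2048, 13, 13]) -> 4x channels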