├── data
│   ├── coco1.txt
│   ├── coco1.data
│   ├── coco16.data
│   ├── coco64.data
│   ├── coco1cls.data
│   ├── coco2014.data
│   ├── coco2017.data
│   ├── coco16.txt
│   ├── coco1cls.txt
│   ├── get_coco2014.sh
│   ├── get_coco2017.sh
│   ├── coco.names
│   ├── coco_paper.names
│   └── coco64.txt
├── CIoU.png
├── requirements.txt
├── utils
│   ├── evolve.sh
│   ├── gcp.sh
│   ├── google_utils.py
│   ├── parse_config.py
│   ├── layers.py
│   ├── torch_utils.py
│   └── adabound.py
├── weights
│   └── download_yolov3_weights.sh
├── .gitignore
├── Dockerfile
├── cfg
│   ├── yolov3-tiny.cfg
│   ├── yolov3-tiny-1cls.cfg
│   ├── yolov3-tiny-3cls.cfg
│   ├── yolov3-tiny3-1cls.cfg
│   ├── yolov3-tiny3.cfg
│   ├── yolov3-1cls.cfg
│   ├── yolov3.cfg
│   ├── yolov3-spp-1cls.cfg
│   ├── yolov3-spp-3cls.cfg
│   ├── yolov3-spp.cfg
│   ├── yolov3-asff.cfg
│   ├── yolov3-spp3.cfg
│   ├── yolov3-spp-pan-scale.cfg
│   └── csresnext50-panet-spp.cfg
├── README.md
├── detect.py
└── test.py
/data/coco1.txt:
--------------------------------------------------------------------------------
1 | ../coco/images/train2017/000000109622.jpg
2 |
--------------------------------------------------------------------------------
/CIoU.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zzh-tju/ultralytics-YOLOv3-Cluster-NMS/HEAD/CIoU.png
--------------------------------------------------------------------------------
/data/coco1.data:
--------------------------------------------------------------------------------
1 | classes=80
2 | train=data/coco1.txt
3 | valid=data/coco1.txt
4 | names=data/coco.names
5 |
--------------------------------------------------------------------------------
/data/coco16.data:
--------------------------------------------------------------------------------
1 | classes=80
2 | train=data/coco16.txt
3 | valid=data/coco16.txt
4 | names=data/coco.names
5 |
--------------------------------------------------------------------------------
/data/coco64.data:
--------------------------------------------------------------------------------
1 | classes=80
2 | train=data/coco64.txt
3 | valid=data/coco64.txt
4 | names=data/coco.names
5 |
--------------------------------------------------------------------------------
/data/coco1cls.data:
--------------------------------------------------------------------------------
1 | classes=1
2 | train=data/coco1cls.txt
3 | valid=data/coco1cls.txt
4 | names=data/coco.names
5 |
--------------------------------------------------------------------------------
/data/coco2014.data:
--------------------------------------------------------------------------------
1 | classes=80
2 | train=../coco/trainvalno5k.txt
3 | valid=../coco/5k.txt
4 | names=data/coco.names
5 |
--------------------------------------------------------------------------------
/data/coco2017.data:
--------------------------------------------------------------------------------
1 | classes=80
2 | train=../coco/train2017.txt
3 | valid=../coco/val2017.txt
4 | names=data/coco.names
5 |
--------------------------------------------------------------------------------
/data/coco16.txt:
--------------------------------------------------------------------------------
1 | ../coco/images/train2017/000000109622.jpg
2 | ../coco/images/train2017/000000160694.jpg
3 | ../coco/images/train2017/000000308590.jpg
4 | ../coco/images/train2017/000000327573.jpg
5 | ../coco/images/train2017/000000062929.jpg
6 | ../coco/images/train2017/000000512793.jpg
7 | ../coco/images/train2017/000000371735.jpg
8 | ../coco/images/train2017/000000148118.jpg
9 | ../coco/images/train2017/000000309856.jpg
10 | ../coco/images/train2017/000000141882.jpg
11 | ../coco/images/train2017/000000318783.jpg
12 | ../coco/images/train2017/000000337760.jpg
13 | ../coco/images/train2017/000000298197.jpg
14 | ../coco/images/train2017/000000042421.jpg
15 | ../coco/images/train2017/000000328898.jpg
16 | ../coco/images/train2017/000000458856.jpg
17 |
--------------------------------------------------------------------------------
/data/coco1cls.txt:
--------------------------------------------------------------------------------
1 | ../coco/images/train2017/000000000901.jpg
2 | ../coco/images/train2017/000000001464.jpg
3 | ../coco/images/train2017/000000003220.jpg
4 | ../coco/images/train2017/000000003365.jpg
5 | ../coco/images/train2017/000000004772.jpg
6 | ../coco/images/train2017/000000009987.jpg
7 | ../coco/images/train2017/000000010498.jpg
8 | ../coco/images/train2017/000000012455.jpg
9 | ../coco/images/train2017/000000013992.jpg
10 | ../coco/images/train2017/000000014125.jpg
11 | ../coco/images/train2017/000000016314.jpg
12 | ../coco/images/train2017/000000016670.jpg
13 | ../coco/images/train2017/000000018412.jpg
14 | ../coco/images/train2017/000000021212.jpg
15 | ../coco/images/train2017/000000021826.jpg
16 | ../coco/images/train2017/000000030566.jpg
17 |
--------------------------------------------------------------------------------
/data/get_coco2014.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Zip coco folder
3 | # zip -r coco.zip coco
4 | # tar -czvf coco.tar.gz coco
5 |
6 | # Download labels from Google Drive, accepting presented query
7 | filename="coco2014labels.zip"
8 | fileid="1s6-CmF5_SElM28r52P1OUrCcuXZN-SFo"
9 | curl -c ./cookie -s -L "https://drive.google.com/uc?export=download&id=${fileid}" > /dev/null
10 | curl -Lb ./cookie "https://drive.google.com/uc?export=download&confirm=`awk '/download/ {print $NF}' ./cookie`&id=${fileid}" -o ${filename}
11 | rm ./cookie
12 |
13 | # Unzip labels
14 | unzip -q ${filename} # for coco.zip
15 | # tar -xzf ${filename} # for coco.tar.gz
16 | rm ${filename}
17 |
18 | # Download and unzip images
19 | cd coco/images
20 | f="train2014.zip" && curl http://images.cocodataset.org/zips/$f -o $f && unzip -q $f && rm $f
21 | f="val2014.zip" && curl http://images.cocodataset.org/zips/$f -o $f && unzip -q $f && rm $f
22 |
23 | # cd out
24 | cd ../..
25 |
--------------------------------------------------------------------------------
/data/get_coco2017.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Zip coco folder
3 | # zip -r coco.zip coco
4 | # tar -czvf coco.tar.gz coco
5 |
6 | # Download labels from Google Drive, accepting presented query
7 | filename="coco2017labels.zip"
8 | fileid="1cXZR_ckHki6nddOmcysCuuJFM--T-Q6L"
9 | curl -c ./cookie -s -L "https://drive.google.com/uc?export=download&id=${fileid}" > /dev/null
10 | curl -Lb ./cookie "https://drive.google.com/uc?export=download&confirm=`awk '/download/ {print $NF}' ./cookie`&id=${fileid}" -o ${filename}
11 | rm ./cookie
12 |
13 | # Unzip labels
14 | unzip -q ${filename} # for coco.zip
15 | # tar -xzf ${filename} # for coco.tar.gz
16 | rm ${filename}
17 |
18 | # Download and unzip images
19 | cd coco/images
20 | f="train2017.zip" && curl http://images.cocodataset.org/zips/$f -o $f && unzip -q $f && rm $f
21 | f="val2017.zip" && curl http://images.cocodataset.org/zips/$f -o $f && unzip -q $f && rm $f
22 |
23 | # cd out
24 | cd ../..
25 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | # pip install -U -r requirements.txt
2 | numpy
3 | opencv-python >= 4.1
4 | torch >= 1.5
5 | matplotlib
6 | pycocotools
7 | tqdm
8 | pillow
9 | tensorboard >= 1.14
10 |
11 | # Nvidia Apex (optional) for mixed precision training --------------------------
12 | # git clone https://github.com/NVIDIA/apex && cd apex && pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" . --user && cd .. && rm -rf apex
13 |
14 | # Conda commands (in place of pip) ---------------------------------------------
15 | # conda update -yn base -c defaults conda
16 | # conda install -yc anaconda numpy opencv matplotlib tqdm pillow ipython
17 | # conda install -yc conda-forge scikit-image pycocotools tensorboard
18 | # conda install -yc spyder-ide spyder-line-profiler
19 | # conda install -yc pytorch pytorch torchvision
20 | # conda install -yc conda-forge protobuf numpy && pip install onnx # https://github.com/onnx/onnx#linux-and-macos
21 |
--------------------------------------------------------------------------------
/data/coco.names:
--------------------------------------------------------------------------------
1 | person
2 | bicycle
3 | car
4 | motorcycle
5 | airplane
6 | bus
7 | train
8 | truck
9 | boat
10 | traffic light
11 | fire hydrant
12 | stop sign
13 | parking meter
14 | bench
15 | bird
16 | cat
17 | dog
18 | horse
19 | sheep
20 | cow
21 | elephant
22 | bear
23 | zebra
24 | giraffe
25 | backpack
26 | umbrella
27 | handbag
28 | tie
29 | suitcase
30 | frisbee
31 | skis
32 | snowboard
33 | sports ball
34 | kite
35 | baseball bat
36 | baseball glove
37 | skateboard
38 | surfboard
39 | tennis racket
40 | bottle
41 | wine glass
42 | cup
43 | fork
44 | knife
45 | spoon
46 | bowl
47 | banana
48 | apple
49 | sandwich
50 | orange
51 | broccoli
52 | carrot
53 | hot dog
54 | pizza
55 | donut
56 | cake
57 | chair
58 | couch
59 | potted plant
60 | bed
61 | dining table
62 | toilet
63 | tv
64 | laptop
65 | mouse
66 | remote
67 | keyboard
68 | cell phone
69 | microwave
70 | oven
71 | toaster
72 | sink
73 | refrigerator
74 | book
75 | clock
76 | vase
77 | scissors
78 | teddy bear
79 | hair drier
80 | toothbrush
81 |
--------------------------------------------------------------------------------
/utils/evolve.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #for i in 0 1 2 3
3 | #do
4 | # t=ultralytics/yolov3:v139 && sudo docker pull $t && sudo nvidia-docker run -d --ipc=host -v "$(pwd)"/coco:/usr/src/coco $t utils/evolve.sh $i
5 | # sleep 30
6 | #done
7 |
8 | while true; do
9 | # python3 train.py --data ../data/sm4/out.data --img-size 320 --epochs 100 --batch 64 --accum 1 --weights yolov3-tiny.conv.15 --multi --bucket ult/wer --evolve --cache --device $1 --cfg yolov3-tiny3-1cls.cfg --single --adam
10 | # python3 train.py --data ../out/data.data --img-size 608 --epochs 10 --batch 8 --accum 8 --weights ultralytics68.pt --multi --bucket ult/athena --evolve --device $1 --cfg yolov3-spp-1cls.cfg
11 |
12 | python3 train.py --data coco2014.data --img-size 512 608 --epochs 27 --batch 8 --accum 8 --evolve --weights '' --bucket ult/coco/sppa_512 --device $1 --cfg yolov3-sppa.cfg --multi
13 | done
14 |
15 |
16 | # coco epoch times --img-size 416 608 --epochs 27 --batch 16 --accum 4
17 | # 36:34 2080ti
18 | # 21:58 V100
19 | # 63:00 T4
--------------------------------------------------------------------------------
/weights/download_yolov3_weights.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # make '/weights' directory if it does not exist and cd into it
4 | # mkdir -p weights && cd weights
5 |
6 | # copy darknet weight files, continue '-c' if partially downloaded
7 | # wget -c https://pjreddie.com/media/files/yolov3.weights
8 | # wget -c https://pjreddie.com/media/files/yolov3-tiny.weights
9 | # wget -c https://pjreddie.com/media/files/yolov3-spp.weights
10 |
11 | # yolov3 pytorch weights
12 | # download from Google Drive: https://drive.google.com/drive/folders/1uxgUBemJVw9wZsdpboYbzUN4bcRhsuAI
13 |
14 | # darknet53 weights (first 75 layers only)
15 | # wget -c https://pjreddie.com/media/files/darknet53.conv.74
16 |
17 | # yolov3-tiny weights from darknet (first 16 layers only)
18 | # ./darknet partial cfg/yolov3-tiny.cfg yolov3-tiny.weights yolov3-tiny.conv.15 15
19 | # mv yolov3-tiny.conv.15 ../
20 |
21 | # new method
22 | python3 -c "from models import *;
23 | attempt_download('weights/yolov3.pt');
24 | attempt_download('weights/yolov3-spp.pt')"
25 |
--------------------------------------------------------------------------------
/data/coco_paper.names:
--------------------------------------------------------------------------------
1 | person
2 | bicycle
3 | car
4 | motorcycle
5 | airplane
6 | bus
7 | train
8 | truck
9 | boat
10 | traffic light
11 | fire hydrant
12 | street sign
13 | stop sign
14 | parking meter
15 | bench
16 | bird
17 | cat
18 | dog
19 | horse
20 | sheep
21 | cow
22 | elephant
23 | bear
24 | zebra
25 | giraffe
26 | hat
27 | backpack
28 | umbrella
29 | shoe
30 | eye glasses
31 | handbag
32 | tie
33 | suitcase
34 | frisbee
35 | skis
36 | snowboard
37 | sports ball
38 | kite
39 | baseball bat
40 | baseball glove
41 | skateboard
42 | surfboard
43 | tennis racket
44 | bottle
45 | plate
46 | wine glass
47 | cup
48 | fork
49 | knife
50 | spoon
51 | bowl
52 | banana
53 | apple
54 | sandwich
55 | orange
56 | broccoli
57 | carrot
58 | hot dog
59 | pizza
60 | donut
61 | cake
62 | chair
63 | couch
64 | potted plant
65 | bed
66 | mirror
67 | dining table
68 | window
69 | desk
70 | toilet
71 | door
72 | tv
73 | laptop
74 | mouse
75 | remote
76 | keyboard
77 | cell phone
78 | microwave
79 | oven
80 | toaster
81 | sink
82 | refrigerator
83 | blender
84 | book
85 | clock
86 | vase
87 | scissors
88 | teddy bear
89 | hair drier
90 | toothbrush
91 | hair brush
--------------------------------------------------------------------------------
/utils/gcp.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | # New VM
4 | rm -rf sample_data yolov3
5 | git clone https://github.com/ultralytics/yolov3
6 | # git clone -b test --depth 1 https://github.com/ultralytics/yolov3 test # branch
7 | # sudo apt-get install zip
8 | #git clone https://github.com/NVIDIA/apex && cd apex && pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" . --user && cd .. && rm -rf apex
9 | sudo conda install -yc conda-forge scikit-image pycocotools
10 | # python3 -c "from yolov3.utils.google_utils import gdrive_download; gdrive_download('193Zp_ye-3qXMonR1nZj3YyxMtQkMy50k','coco2014.zip')"
11 | python3 -c "from yolov3.utils.google_utils import gdrive_download; gdrive_download('1WQT6SOktSe8Uw6r10-2JhbEhMY5DJaph','coco2017.zip')"
12 | python3 -c "from yolov3.utils.google_utils import gdrive_download; gdrive_download('1C3HewOG9akA3y456SZLBJZfNDPkBwAto','knife.zip')"
13 | python3 -c "from yolov3.utils.google_utils import gdrive_download; gdrive_download('13g3LqdpkNE8sPosVJT6KFXlfoMypzRP4','sm4.zip')"
14 | sudo shutdown
15 |
16 | # Mount local SSD
17 | lsblk
18 | sudo mkfs.ext4 -F /dev/nvme0n1
19 | sudo mkdir -p /mnt/disks/nvme0n1
20 | sudo mount /dev/nvme0n1 /mnt/disks/nvme0n1
21 | sudo chmod a+w /mnt/disks/nvme0n1
22 | cp -r coco /mnt/disks/nvme0n1
23 |
24 | # Kill All
25 | t=ultralytics/yolov3:v1
26 | docker kill $(docker ps -a -q --filter ancestor=$t)
27 |
28 | # Evolve coco
29 | sudo -s
30 | t=ultralytics/yolov3:evolve
31 | # docker kill $(docker ps -a -q --filter ancestor=$t)
32 | for i in 0 1 6 7
33 | do
34 | docker pull $t && docker run --gpus all -d --ipc=host -v "$(pwd)"/coco:/usr/src/coco $t bash utils/evolve.sh $i
35 | sleep 30
36 | done
37 |
38 | #COCO training
39 | n=131 && t=ultralytics/coco:v131 && sudo docker pull $t && sudo docker run -it --gpus all --ipc=host -v "$(pwd)"/coco:/usr/src/coco $t python3 train.py --data coco2014.data --img-size 320 640 --epochs 300 --batch 16 --accum 4 --weights '' --device 0 --cfg yolov3-spp.cfg --nosave --bucket ult/coco --name $n && sudo shutdown
40 | n=132 && t=ultralytics/coco:v131 && sudo docker pull $t && sudo docker run -it --gpus all --ipc=host -v "$(pwd)"/coco:/usr/src/coco $t python3 train.py --data coco2014.data --img-size 320 640 --epochs 300 --batch 64 --accum 1 --weights '' --device 0 --cfg yolov3-tiny.cfg --nosave --bucket ult/coco --name $n && sudo shutdown
41 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | # Start FROM Nvidia PyTorch image https://ngc.nvidia.com/catalog/containers/nvidia:pytorch
2 | FROM nvcr.io/nvidia/pytorch:20.03-py3
3 |
4 | # Install dependencies (pip or conda)
5 | RUN pip install -U gsutil thop
6 | # RUN pip install -U -r requirements.txt
7 | # RUN conda update -n base -c defaults conda
8 | # RUN conda install -y -c anaconda future numpy opencv matplotlib tqdm pillow
9 | # RUN conda install -y -c conda-forge scikit-image tensorboard pycocotools
10 |
11 | ## Install OpenCV with Gstreamer support
12 | #WORKDIR /usr/src
13 | #RUN pip uninstall -y opencv-python
14 | #RUN apt-get update
15 | #RUN apt-get install -y gstreamer1.0-tools gstreamer1.0-python3-dbg-plugin-loader libgstreamer1.0-dev libgstreamer-plugins-base1.0-dev
16 | #RUN git clone https://github.com/opencv/opencv.git && cd opencv && git checkout 4.1.1 && mkdir build
17 | #RUN git clone https://github.com/opencv/opencv_contrib.git && cd opencv_contrib && git checkout 4.1.1
18 | #RUN cd opencv/build && cmake ../ \
19 | # -D OPENCV_EXTRA_MODULES_PATH=../../opencv_contrib/modules \
20 | # -D BUILD_OPENCV_PYTHON3=ON \
21 | # -D PYTHON3_EXECUTABLE=/opt/conda/bin/python \
22 | # -D PYTHON3_INCLUDE_PATH=/opt/conda/include/python3.6m \
23 | # -D PYTHON3_LIBRARIES=/opt/conda/lib/python3.6/site-packages \
24 | # -D WITH_GSTREAMER=ON \
25 | # -D WITH_FFMPEG=OFF \
26 | # && make && make install && ldconfig
27 | #RUN cd /usr/local/lib/python3.6/site-packages/cv2/python-3.6/ && mv cv2.cpython-36m-x86_64-linux-gnu.so cv2.so
28 | #RUN cd /opt/conda/lib/python3.6/site-packages/ && ln -s /usr/local/lib/python3.6/site-packages/cv2/python-3.6/cv2.so cv2.so
29 | #RUN python3 -c "import cv2; print(cv2.getBuildInformation())"
30 |
31 | # Create working directory
32 | RUN mkdir -p /usr/src/app
33 | WORKDIR /usr/src/app
34 |
35 | # Copy contents
36 | COPY . /usr/src/app
37 |
38 | # Copy weights
39 | #RUN python3 -c "from models import *; \
40 | #attempt_download('weights/yolov3.pt'); \
41 | #attempt_download('weights/yolov3-spp.pt')"
42 |
43 |
44 | # --------------------------------------------------- Extras Below ---------------------------------------------------
45 |
46 | # Build and Push
47 | # t=ultralytics/yolov3:v0 && sudo docker build -t $t . && sudo docker push $t
48 |
49 | # Run
50 | # t=ultralytics/yolov3:v0 && sudo docker pull $t && sudo docker run -it --gpus all --ipc=host $t bash
51 |
52 | # Pull and Run with local directory access
53 | # t=ultralytics/yolov3:v0 && sudo docker pull $t && sudo docker run -it --gpus all --ipc=host -v "$(pwd)"/coco:/usr/src/coco $t bash
54 |
55 | # Kill all
56 | # sudo docker kill "$(sudo docker ps -q)"
57 |
58 | # Kill all image-based
59 | # sudo docker kill $(sudo docker ps -a -q --filter ancestor=ultralytics/yolov3:v0)
60 |
61 | # Run bash for loop
62 | # sudo docker run --gpus all --ipc=host ultralytics/yolov3:v0 while true; do python3 train.py --evolve; done
63 |
--------------------------------------------------------------------------------
/cfg/yolov3-tiny.cfg:
--------------------------------------------------------------------------------
1 | [net]
2 | # Testing
3 | batch=1
4 | subdivisions=1
5 | # Training
6 | # batch=64
7 | # subdivisions=2
8 | width=416
9 | height=416
10 | channels=3
11 | momentum=0.9
12 | decay=0.0005
13 | angle=0
14 | saturation = 1.5
15 | exposure = 1.5
16 | hue=.1
17 |
18 | learning_rate=0.001
19 | burn_in=1000
20 | max_batches = 500200
21 | policy=steps
22 | steps=400000,450000
23 | scales=.1,.1
24 |
25 | [convolutional]
26 | batch_normalize=1
27 | filters=16
28 | size=3
29 | stride=1
30 | pad=1
31 | activation=leaky
32 |
33 | [maxpool]
34 | size=2
35 | stride=2
36 |
37 | [convolutional]
38 | batch_normalize=1
39 | filters=32
40 | size=3
41 | stride=1
42 | pad=1
43 | activation=leaky
44 |
45 | [maxpool]
46 | size=2
47 | stride=2
48 |
49 | [convolutional]
50 | batch_normalize=1
51 | filters=64
52 | size=3
53 | stride=1
54 | pad=1
55 | activation=leaky
56 |
57 | [maxpool]
58 | size=2
59 | stride=2
60 |
61 | [convolutional]
62 | batch_normalize=1
63 | filters=128
64 | size=3
65 | stride=1
66 | pad=1
67 | activation=leaky
68 |
69 | [maxpool]
70 | size=2
71 | stride=2
72 |
73 | [convolutional]
74 | batch_normalize=1
75 | filters=256
76 | size=3
77 | stride=1
78 | pad=1
79 | activation=leaky
80 |
81 | [maxpool]
82 | size=2
83 | stride=2
84 |
85 | [convolutional]
86 | batch_normalize=1
87 | filters=512
88 | size=3
89 | stride=1
90 | pad=1
91 | activation=leaky
92 |
93 | [maxpool]
94 | size=2
95 | stride=1
96 |
97 | [convolutional]
98 | batch_normalize=1
99 | filters=1024
100 | size=3
101 | stride=1
102 | pad=1
103 | activation=leaky
104 |
105 | ###########
106 |
107 | [convolutional]
108 | batch_normalize=1
109 | filters=256
110 | size=1
111 | stride=1
112 | pad=1
113 | activation=leaky
114 |
115 | [convolutional]
116 | batch_normalize=1
117 | filters=512
118 | size=3
119 | stride=1
120 | pad=1
121 | activation=leaky
122 |
123 | [convolutional]
124 | size=1
125 | stride=1
126 | pad=1
127 | filters=255
128 | activation=linear
129 |
130 |
131 |
132 | [yolo]
133 | mask = 3,4,5
134 | anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319
135 | classes=80
136 | num=6
137 | jitter=.3
138 | ignore_thresh = .7
139 | truth_thresh = 1
140 | random=1
141 |
142 | [route]
143 | layers = -4
144 |
145 | [convolutional]
146 | batch_normalize=1
147 | filters=128
148 | size=1
149 | stride=1
150 | pad=1
151 | activation=leaky
152 |
153 | [upsample]
154 | stride=2
155 |
156 | [route]
157 | layers = -1, 8
158 |
159 | [convolutional]
160 | batch_normalize=1
161 | filters=256
162 | size=3
163 | stride=1
164 | pad=1
165 | activation=leaky
166 |
167 | [convolutional]
168 | size=1
169 | stride=1
170 | pad=1
171 | filters=255
172 | activation=linear
173 |
174 | [yolo]
175 | mask = 1,2,3
176 | anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319
177 | classes=80
178 | num=6
179 | jitter=.3
180 | ignore_thresh = .7
181 | truth_thresh = 1
182 | random=1
183 |
--------------------------------------------------------------------------------
/data/coco64.txt:
--------------------------------------------------------------------------------
1 | ../coco/images/train2017/000000109622.jpg
2 | ../coco/images/train2017/000000160694.jpg
3 | ../coco/images/train2017/000000308590.jpg
4 | ../coco/images/train2017/000000327573.jpg
5 | ../coco/images/train2017/000000062929.jpg
6 | ../coco/images/train2017/000000512793.jpg
7 | ../coco/images/train2017/000000371735.jpg
8 | ../coco/images/train2017/000000148118.jpg
9 | ../coco/images/train2017/000000309856.jpg
10 | ../coco/images/train2017/000000141882.jpg
11 | ../coco/images/train2017/000000318783.jpg
12 | ../coco/images/train2017/000000337760.jpg
13 | ../coco/images/train2017/000000298197.jpg
14 | ../coco/images/train2017/000000042421.jpg
15 | ../coco/images/train2017/000000328898.jpg
16 | ../coco/images/train2017/000000458856.jpg
17 | ../coco/images/train2017/000000073824.jpg
18 | ../coco/images/train2017/000000252846.jpg
19 | ../coco/images/train2017/000000459590.jpg
20 | ../coco/images/train2017/000000273650.jpg
21 | ../coco/images/train2017/000000331311.jpg
22 | ../coco/images/train2017/000000156326.jpg
23 | ../coco/images/train2017/000000262985.jpg
24 | ../coco/images/train2017/000000253580.jpg
25 | ../coco/images/train2017/000000447976.jpg
26 | ../coco/images/train2017/000000378077.jpg
27 | ../coco/images/train2017/000000259913.jpg
28 | ../coco/images/train2017/000000424553.jpg
29 | ../coco/images/train2017/000000000612.jpg
30 | ../coco/images/train2017/000000267625.jpg
31 | ../coco/images/train2017/000000566012.jpg
32 | ../coco/images/train2017/000000196664.jpg
33 | ../coco/images/train2017/000000363331.jpg
34 | ../coco/images/train2017/000000057992.jpg
35 | ../coco/images/train2017/000000520047.jpg
36 | ../coco/images/train2017/000000453903.jpg
37 | ../coco/images/train2017/000000162083.jpg
38 | ../coco/images/train2017/000000268516.jpg
39 | ../coco/images/train2017/000000277436.jpg
40 | ../coco/images/train2017/000000189744.jpg
41 | ../coco/images/train2017/000000041128.jpg
42 | ../coco/images/train2017/000000527728.jpg
43 | ../coco/images/train2017/000000465269.jpg
44 | ../coco/images/train2017/000000246833.jpg
45 | ../coco/images/train2017/000000076784.jpg
46 | ../coco/images/train2017/000000323715.jpg
47 | ../coco/images/train2017/000000560463.jpg
48 | ../coco/images/train2017/000000006263.jpg
49 | ../coco/images/train2017/000000094701.jpg
50 | ../coco/images/train2017/000000521359.jpg
51 | ../coco/images/train2017/000000302903.jpg
52 | ../coco/images/train2017/000000047559.jpg
53 | ../coco/images/train2017/000000480583.jpg
54 | ../coco/images/train2017/000000050025.jpg
55 | ../coco/images/train2017/000000084512.jpg
56 | ../coco/images/train2017/000000508913.jpg
57 | ../coco/images/train2017/000000093708.jpg
58 | ../coco/images/train2017/000000070493.jpg
59 | ../coco/images/train2017/000000539270.jpg
60 | ../coco/images/train2017/000000474402.jpg
61 | ../coco/images/train2017/000000209842.jpg
62 | ../coco/images/train2017/000000028820.jpg
63 | ../coco/images/train2017/000000154257.jpg
64 | ../coco/images/train2017/000000342499.jpg
65 |
--------------------------------------------------------------------------------
/cfg/yolov3-tiny-1cls.cfg:
--------------------------------------------------------------------------------
1 | [net]
2 | # Testing
3 | batch=1
4 | subdivisions=1
5 | # Training
6 | # batch=64
7 | # subdivisions=2
8 | width=416
9 | height=416
10 | channels=3
11 | momentum=0.9
12 | decay=0.0005
13 | angle=0
14 | saturation = 1.5
15 | exposure = 1.5
16 | hue=.1
17 |
18 | learning_rate=0.001
19 | burn_in=1000
20 | max_batches = 500200
21 | policy=steps
22 | steps=400000,450000
23 | scales=.1,.1
24 |
25 | [convolutional]
26 | batch_normalize=1
27 | filters=16
28 | size=3
29 | stride=1
30 | pad=1
31 | activation=leaky
32 |
33 | [maxpool]
34 | size=2
35 | stride=2
36 |
37 | [convolutional]
38 | batch_normalize=1
39 | filters=32
40 | size=3
41 | stride=1
42 | pad=1
43 | activation=leaky
44 |
45 | [maxpool]
46 | size=2
47 | stride=2
48 |
49 | [convolutional]
50 | batch_normalize=1
51 | filters=64
52 | size=3
53 | stride=1
54 | pad=1
55 | activation=leaky
56 |
57 | [maxpool]
58 | size=2
59 | stride=2
60 |
61 | [convolutional]
62 | batch_normalize=1
63 | filters=128
64 | size=3
65 | stride=1
66 | pad=1
67 | activation=leaky
68 |
69 | [maxpool]
70 | size=2
71 | stride=2
72 |
73 | [convolutional]
74 | batch_normalize=1
75 | filters=256
76 | size=3
77 | stride=1
78 | pad=1
79 | activation=leaky
80 |
81 | [maxpool]
82 | size=2
83 | stride=2
84 |
85 | [convolutional]
86 | batch_normalize=1
87 | filters=512
88 | size=3
89 | stride=1
90 | pad=1
91 | activation=leaky
92 |
93 | [maxpool]
94 | size=2
95 | stride=1
96 |
97 | [convolutional]
98 | batch_normalize=1
99 | filters=1024
100 | size=3
101 | stride=1
102 | pad=1
103 | activation=leaky
104 |
105 | ###########
106 |
107 | [convolutional]
108 | batch_normalize=1
109 | filters=256
110 | size=1
111 | stride=1
112 | pad=1
113 | activation=leaky
114 |
115 | [convolutional]
116 | batch_normalize=1
117 | filters=512
118 | size=3
119 | stride=1
120 | pad=1
121 | activation=leaky
122 |
123 | [convolutional]
124 | size=1
125 | stride=1
126 | pad=1
127 | filters=18
128 | activation=linear
129 |
130 |
131 |
132 | [yolo]
133 | mask = 3,4,5
134 | anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319
135 | classes=1
136 | num=6
137 | jitter=.3
138 | ignore_thresh = .7
139 | truth_thresh = 1
140 | random=1
141 |
142 | [route]
143 | layers = -4
144 |
145 | [convolutional]
146 | batch_normalize=1
147 | filters=128
148 | size=1
149 | stride=1
150 | pad=1
151 | activation=leaky
152 |
153 | [upsample]
154 | stride=2
155 |
156 | [route]
157 | layers = -1, 8
158 |
159 | [convolutional]
160 | batch_normalize=1
161 | filters=256
162 | size=3
163 | stride=1
164 | pad=1
165 | activation=leaky
166 |
167 | [convolutional]
168 | size=1
169 | stride=1
170 | pad=1
171 | filters=18
172 | activation=linear
173 |
174 | [yolo]
175 | mask = 0,1,2
176 | anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319
177 | classes=1
178 | num=6
179 | jitter=.3
180 | ignore_thresh = .7
181 | truth_thresh = 1
182 | random=1
183 |
--------------------------------------------------------------------------------
/cfg/yolov3-tiny-3cls.cfg:
--------------------------------------------------------------------------------
1 | [net]
2 | # Testing
3 | batch=1
4 | subdivisions=1
5 | # Training
6 | # batch=64
7 | # subdivisions=2
8 | width=416
9 | height=416
10 | channels=3
11 | momentum=0.9
12 | decay=0.0005
13 | angle=0
14 | saturation = 1.5
15 | exposure = 1.5
16 | hue=.1
17 |
18 | learning_rate=0.001
19 | burn_in=1000
20 | max_batches = 500200
21 | policy=steps
22 | steps=400000,450000
23 | scales=.1,.1
24 |
25 | [convolutional]
26 | batch_normalize=1
27 | filters=16
28 | size=3
29 | stride=1
30 | pad=1
31 | activation=leaky
32 |
33 | [maxpool]
34 | size=2
35 | stride=2
36 |
37 | [convolutional]
38 | batch_normalize=1
39 | filters=32
40 | size=3
41 | stride=1
42 | pad=1
43 | activation=leaky
44 |
45 | [maxpool]
46 | size=2
47 | stride=2
48 |
49 | [convolutional]
50 | batch_normalize=1
51 | filters=64
52 | size=3
53 | stride=1
54 | pad=1
55 | activation=leaky
56 |
57 | [maxpool]
58 | size=2
59 | stride=2
60 |
61 | [convolutional]
62 | batch_normalize=1
63 | filters=128
64 | size=3
65 | stride=1
66 | pad=1
67 | activation=leaky
68 |
69 | [maxpool]
70 | size=2
71 | stride=2
72 |
73 | [convolutional]
74 | batch_normalize=1
75 | filters=256
76 | size=3
77 | stride=1
78 | pad=1
79 | activation=leaky
80 |
81 | [maxpool]
82 | size=2
83 | stride=2
84 |
85 | [convolutional]
86 | batch_normalize=1
87 | filters=512
88 | size=3
89 | stride=1
90 | pad=1
91 | activation=leaky
92 |
93 | [maxpool]
94 | size=2
95 | stride=1
96 |
97 | [convolutional]
98 | batch_normalize=1
99 | filters=1024
100 | size=3
101 | stride=1
102 | pad=1
103 | activation=leaky
104 |
105 | ###########
106 |
107 | [convolutional]
108 | batch_normalize=1
109 | filters=256
110 | size=1
111 | stride=1
112 | pad=1
113 | activation=leaky
114 |
115 | [convolutional]
116 | batch_normalize=1
117 | filters=512
118 | size=3
119 | stride=1
120 | pad=1
121 | activation=leaky
122 |
123 | [convolutional]
124 | size=1
125 | stride=1
126 | pad=1
127 | filters=24
128 | activation=linear
129 |
130 |
131 |
132 | [yolo]
133 | mask = 3,4,5
134 | anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319
135 | classes=3
136 | num=6
137 | jitter=.3
138 | ignore_thresh = .7
139 | truth_thresh = 1
140 | random=1
141 |
142 | [route]
143 | layers = -4
144 |
145 | [convolutional]
146 | batch_normalize=1
147 | filters=128
148 | size=1
149 | stride=1
150 | pad=1
151 | activation=leaky
152 |
153 | [upsample]
154 | stride=2
155 |
156 | [route]
157 | layers = -1, 8
158 |
159 | [convolutional]
160 | batch_normalize=1
161 | filters=256
162 | size=3
163 | stride=1
164 | pad=1
165 | activation=leaky
166 |
167 | [convolutional]
168 | size=1
169 | stride=1
170 | pad=1
171 | filters=24
172 | activation=linear
173 |
174 | [yolo]
175 | mask = 0,1,2
176 | anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319
177 | classes=3
178 | num=6
179 | jitter=.3
180 | ignore_thresh = .7
181 | truth_thresh = 1
182 | random=1
183 |
--------------------------------------------------------------------------------
/utils/google_utils.py:
--------------------------------------------------------------------------------
1 | # This file contains google utils: https://cloud.google.com/storage/docs/reference/libraries
2 | # pip install --upgrade google-cloud-storage
3 |
4 | import os
5 | import time
6 |
7 |
8 | # from google.cloud import storage
9 |
10 |
11 | def gdrive_download(id='1HaXkef9z6y5l4vUnCYgdmEAj61c6bfWO', name='coco.zip'):
12 | # https://gist.github.com/tanaikech/f0f2d122e05bf5f971611258c22c110f
13 | # Downloads a file from Google Drive, accepting presented query
14 | # from utils.google_utils import *; gdrive_download()
15 | t = time.time()
16 |
17 | print('Downloading https://drive.google.com/uc?export=download&id=%s as %s... ' % (id, name), end='')
18 | os.remove(name) if os.path.exists(name) else None # remove existing
19 | os.remove('cookie') if os.path.exists('cookie') else None
20 |
21 | # Attempt file download
22 | os.system("curl -c ./cookie -s -L \"https://drive.google.com/uc?export=download&id=%s\" > /dev/null" % id)
23 | if os.path.exists('cookie'): # large file
24 | s = "curl -Lb ./cookie \"https://drive.google.com/uc?export=download&confirm=`awk '/download/ {print $NF}' ./cookie`&id=%s\" -o %s" % (
25 | id, name)
26 | else: # small file
27 | s = "curl -s -L -o %s 'https://drive.google.com/uc?export=download&id=%s'" % (name, id)
28 | r = os.system(s) # execute, capture return values
29 | os.remove('cookie') if os.path.exists('cookie') else None
30 |
31 | # Error check
32 | if r != 0:
33 | os.remove(name) if os.path.exists(name) else None # remove partial
34 | print('Download error ') # raise Exception('Download error')
35 | return r
36 |
37 | # Unzip if archive
38 | if name.endswith('.zip'):
39 | print('unzipping... ', end='')
40 | os.system('unzip -q %s' % name) # unzip
41 | os.remove(name) # remove zip to free space
42 |
43 | print('Done (%.1fs)' % (time.time() - t))
44 | return r
45 |
46 |
47 | def upload_blob(bucket_name, source_file_name, destination_blob_name):
48 | # Uploads a file to a bucket
49 | # https://cloud.google.com/storage/docs/uploading-objects#storage-upload-object-python
50 |
51 | storage_client = storage.Client()
52 | bucket = storage_client.get_bucket(bucket_name)
53 | blob = bucket.blob(destination_blob_name)
54 |
55 | blob.upload_from_filename(source_file_name)
56 |
57 | print('File {} uploaded to {}.'.format(
58 | source_file_name,
59 | destination_blob_name))
60 |
61 |
62 | def download_blob(bucket_name, source_blob_name, destination_file_name):
63 |     # Downloads a blob from a bucket
64 | storage_client = storage.Client()
65 | bucket = storage_client.get_bucket(bucket_name)
66 | blob = bucket.blob(source_blob_name)
67 |
68 | blob.download_to_filename(destination_file_name)
69 |
70 | print('Blob {} downloaded to {}.'.format(
71 | source_blob_name,
72 | destination_file_name))
73 |
--------------------------------------------------------------------------------
/utils/parse_config.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import numpy as np
4 |
5 |
6 | def parse_model_cfg(path):
7 |     # Parses the yolo *.cfg file and returns module definitions; path may be 'cfg/yolov3.cfg', 'yolov3.cfg', or 'yolov3'
8 | if not path.endswith('.cfg'): # add .cfg suffix if omitted
9 | path += '.cfg'
10 | if not os.path.exists(path) and os.path.exists('cfg' + os.sep + path): # add cfg/ prefix if omitted
11 | path = 'cfg' + os.sep + path
12 |
13 | with open(path, 'r') as f:
14 | lines = f.read().split('\n')
15 | lines = [x for x in lines if x and not x.startswith('#')]
16 | lines = [x.rstrip().lstrip() for x in lines] # get rid of fringe whitespaces
17 | mdefs = [] # module definitions
18 | for line in lines:
19 | if line.startswith('['): # This marks the start of a new block
20 | mdefs.append({})
21 | mdefs[-1]['type'] = line[1:-1].rstrip()
22 | if mdefs[-1]['type'] == 'convolutional':
23 | mdefs[-1]['batch_normalize'] = 0 # pre-populate with zeros (may be overwritten later)
24 | else:
25 | key, val = line.split("=")
26 | key = key.rstrip()
27 |
28 | if key == 'anchors': # return nparray
29 | mdefs[-1][key] = np.array([float(x) for x in val.split(',')]).reshape((-1, 2)) # np anchors
30 | elif (key in ['from', 'layers', 'mask']) or (key == 'size' and ',' in val): # return array
31 | mdefs[-1][key] = [int(x) for x in val.split(',')]
32 | else:
33 | val = val.strip()
34 | if val.isnumeric(): # return int or float
35 | mdefs[-1][key] = int(val) if (int(val) - float(val)) == 0 else float(val)
36 | else:
37 | mdefs[-1][key] = val # return string
38 |
39 | # Check all fields are supported
40 | supported = ['type', 'batch_normalize', 'filters', 'size', 'stride', 'pad', 'activation', 'layers', 'groups',
41 | 'from', 'mask', 'anchors', 'classes', 'num', 'jitter', 'ignore_thresh', 'truth_thresh', 'random',
42 | 'stride_x', 'stride_y', 'weights_type', 'weights_normalization', 'scale_x_y', 'beta_nms', 'nms_kind',
43 | 'iou_loss', 'iou_normalizer', 'cls_normalizer', 'iou_thresh']
44 |
45 | f = [] # fields
46 | for x in mdefs[1:]:
47 | [f.append(k) for k in x if k not in f]
48 | u = [x for x in f if x not in supported] # unsupported fields
49 | assert not any(u), "Unsupported fields %s in %s. See https://github.com/ultralytics/yolov3/issues/631" % (u, path)
50 |
51 | return mdefs
52 |
53 |
54 | def parse_data_cfg(path):
55 | # Parses the data configuration file
56 | if not os.path.exists(path) and os.path.exists('data' + os.sep + path): # add data/ prefix if omitted
57 | path = 'data' + os.sep + path
58 |
59 | with open(path, 'r') as f:
60 | lines = f.readlines()
61 |
62 | options = dict()
63 | for line in lines:
64 | line = line.strip()
65 | if line == '' or line.startswith('#'):
66 | continue
67 | key, val = line.split('=')
68 | options[key.strip()] = val.strip()
69 |
70 | return options
71 |
--------------------------------------------------------------------------------
/cfg/yolov3-tiny3-1cls.cfg:
--------------------------------------------------------------------------------
1 | [net]
2 | # Testing
3 | # batch=1
4 | # subdivisions=1
5 | # Training
6 | batch=64
7 | subdivisions=16
8 | width=608
9 | height=608
10 | channels=3
11 | momentum=0.9
12 | decay=0.0005
13 | angle=0
14 | saturation = 1.5
15 | exposure = 1.5
16 | hue=.1
17 |
18 | learning_rate=0.001
19 | burn_in=1000
20 | max_batches = 200000
21 | policy=steps
22 | steps=180000,190000
23 | scales=.1,.1
24 |
25 |
26 | [convolutional]
27 | batch_normalize=1
28 | filters=16
29 | size=3
30 | stride=1
31 | pad=1
32 | activation=leaky
33 |
34 | [maxpool]
35 | size=2
36 | stride=2
37 |
38 | [convolutional]
39 | batch_normalize=1
40 | filters=32
41 | size=3
42 | stride=1
43 | pad=1
44 | activation=leaky
45 |
46 | [maxpool]
47 | size=2
48 | stride=2
49 |
50 | [convolutional]
51 | batch_normalize=1
52 | filters=64
53 | size=3
54 | stride=1
55 | pad=1
56 | activation=leaky
57 |
58 | [maxpool]
59 | size=2
60 | stride=2
61 |
62 | [convolutional]
63 | batch_normalize=1
64 | filters=128
65 | size=3
66 | stride=1
67 | pad=1
68 | activation=leaky
69 |
70 | [maxpool]
71 | size=2
72 | stride=2
73 |
74 | [convolutional]
75 | batch_normalize=1
76 | filters=256
77 | size=3
78 | stride=1
79 | pad=1
80 | activation=leaky
81 |
82 | [maxpool]
83 | size=2
84 | stride=2
85 |
86 | [convolutional]
87 | batch_normalize=1
88 | filters=512
89 | size=3
90 | stride=1
91 | pad=1
92 | activation=leaky
93 |
94 | [maxpool]
95 | size=2
96 | stride=1
97 |
98 | [convolutional]
99 | batch_normalize=1
100 | filters=1024
101 | size=3
102 | stride=1
103 | pad=1
104 | activation=leaky
105 |
106 | ###########
107 |
108 | [convolutional]
109 | batch_normalize=1
110 | filters=256
111 | size=1
112 | stride=1
113 | pad=1
114 | activation=leaky
115 |
116 | [convolutional]
117 | batch_normalize=1
118 | filters=512
119 | size=3
120 | stride=1
121 | pad=1
122 | activation=leaky
123 |
124 | [convolutional]
125 | size=1
126 | stride=1
127 | pad=1
128 | filters=18
129 | activation=linear
130 |
131 |
132 |
133 | [yolo]
134 | mask = 6,7,8
135 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
136 | classes=1
137 | num=9
138 | jitter=.3
139 | ignore_thresh = .7
140 | truth_thresh = 1
141 | random=1
142 |
143 | [route]
144 | layers = -4
145 |
146 | [convolutional]
147 | batch_normalize=1
148 | filters=128
149 | size=1
150 | stride=1
151 | pad=1
152 | activation=leaky
153 |
154 | [upsample]
155 | stride=2
156 |
157 | [route]
158 | layers = -1, 8
159 |
160 | [convolutional]
161 | batch_normalize=1
162 | filters=256
163 | size=3
164 | stride=1
165 | pad=1
166 | activation=leaky
167 |
168 | [convolutional]
169 | size=1
170 | stride=1
171 | pad=1
172 | filters=18
173 | activation=linear
174 |
175 | [yolo]
176 | mask = 3,4,5
177 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
178 | classes=1
179 | num=9
180 | jitter=.3
181 | ignore_thresh = .7
182 | truth_thresh = 1
183 | random=1
184 |
185 |
186 |
187 | [route]
188 | layers = -3
189 |
190 | [convolutional]
191 | batch_normalize=1
192 | filters=128
193 | size=1
194 | stride=1
195 | pad=1
196 | activation=leaky
197 |
198 | [upsample]
199 | stride=2
200 |
201 | [route]
202 | layers = -1, 6
203 |
204 | [convolutional]
205 | batch_normalize=1
206 | filters=128
207 | size=3
208 | stride=1
209 | pad=1
210 | activation=leaky
211 |
212 | [convolutional]
213 | size=1
214 | stride=1
215 | pad=1
216 | filters=18
217 | activation=linear
218 |
219 | [yolo]
220 | mask = 0,1,2
221 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
222 | classes=1
223 | num=9
224 | jitter=.3
225 | ignore_thresh = .7
226 | truth_thresh = 1
227 | random=1
228 |
--------------------------------------------------------------------------------
/cfg/yolov3-tiny3.cfg:
--------------------------------------------------------------------------------
1 | [net]
2 | # Testing
3 | # batch=1
4 | # subdivisions=1
5 | # Training
6 | batch=64
7 | subdivisions=16
8 | width=608
9 | height=608
10 | channels=3
11 | momentum=0.9
12 | decay=0.0005
13 | angle=0
14 | saturation = 1.5
15 | exposure = 1.5
16 | hue=.1
17 |
18 | learning_rate=0.001
19 | burn_in=1000
20 | max_batches = 200000
21 | policy=steps
22 | steps=180000,190000
23 | scales=.1,.1
24 |
25 |
26 | [convolutional]
27 | batch_normalize=1
28 | filters=16
29 | size=3
30 | stride=1
31 | pad=1
32 | activation=leaky
33 |
34 | [maxpool]
35 | size=2
36 | stride=2
37 |
38 | [convolutional]
39 | batch_normalize=1
40 | filters=32
41 | size=3
42 | stride=1
43 | pad=1
44 | activation=leaky
45 |
46 | [maxpool]
47 | size=2
48 | stride=2
49 |
50 | [convolutional]
51 | batch_normalize=1
52 | filters=64
53 | size=3
54 | stride=1
55 | pad=1
56 | activation=leaky
57 |
58 | [maxpool]
59 | size=2
60 | stride=2
61 |
62 | [convolutional]
63 | batch_normalize=1
64 | filters=128
65 | size=3
66 | stride=1
67 | pad=1
68 | activation=leaky
69 |
70 | [maxpool]
71 | size=2
72 | stride=2
73 |
74 | [convolutional]
75 | batch_normalize=1
76 | filters=256
77 | size=3
78 | stride=1
79 | pad=1
80 | activation=leaky
81 |
82 | [maxpool]
83 | size=2
84 | stride=2
85 |
86 | [convolutional]
87 | batch_normalize=1
88 | filters=512
89 | size=3
90 | stride=1
91 | pad=1
92 | activation=leaky
93 |
94 | [maxpool]
95 | size=2
96 | stride=1
97 |
98 | [convolutional]
99 | batch_normalize=1
100 | filters=1024
101 | size=3
102 | stride=1
103 | pad=1
104 | activation=leaky
105 |
106 | ###########
107 |
108 | [convolutional]
109 | batch_normalize=1
110 | filters=256
111 | size=1
112 | stride=1
113 | pad=1
114 | activation=leaky
115 |
116 | [convolutional]
117 | batch_normalize=1
118 | filters=512
119 | size=3
120 | stride=1
121 | pad=1
122 | activation=leaky
123 |
124 | [convolutional]
125 | size=1
126 | stride=1
127 | pad=1
128 | filters=255
129 | activation=linear
130 |
131 |
132 |
133 | [yolo]
134 | mask = 6,7,8
135 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
136 | classes=80
137 | num=9
138 | jitter=.3
139 | ignore_thresh = .7
140 | truth_thresh = 1
141 | random=1
142 |
143 | [route]
144 | layers = -4
145 |
146 | [convolutional]
147 | batch_normalize=1
148 | filters=128
149 | size=1
150 | stride=1
151 | pad=1
152 | activation=leaky
153 |
154 | [upsample]
155 | stride=2
156 |
157 | [route]
158 | layers = -1, 8
159 |
160 | [convolutional]
161 | batch_normalize=1
162 | filters=256
163 | size=3
164 | stride=1
165 | pad=1
166 | activation=leaky
167 |
168 | [convolutional]
169 | size=1
170 | stride=1
171 | pad=1
172 | filters=255
173 | activation=linear
174 |
175 | [yolo]
176 | mask = 3,4,5
177 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
178 | classes=80
179 | num=9
180 | jitter=.3
181 | ignore_thresh = .7
182 | truth_thresh = 1
183 | random=1
184 |
185 |
186 |
187 | [route]
188 | layers = -3
189 |
190 | [convolutional]
191 | batch_normalize=1
192 | filters=128
193 | size=1
194 | stride=1
195 | pad=1
196 | activation=leaky
197 |
198 | [upsample]
199 | stride=2
200 |
201 | [route]
202 | layers = -1, 6
203 |
204 | [convolutional]
205 | batch_normalize=1
206 | filters=128
207 | size=3
208 | stride=1
209 | pad=1
210 | activation=leaky
211 |
212 | [convolutional]
213 | size=1
214 | stride=1
215 | pad=1
216 | filters=255
217 | activation=linear
218 |
219 | [yolo]
220 | mask = 0,1,2
221 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
222 | classes=80
223 | num=9
224 | jitter=.3
225 | ignore_thresh = .7
226 | truth_thresh = 1
227 | random=1
228 |
--------------------------------------------------------------------------------
/utils/layers.py:
--------------------------------------------------------------------------------
1 | import torch.nn.functional as F
2 |
3 | from utils.utils import *
4 |
5 |
6 | def make_divisible(v, divisor):
7 |     # Function ensures all layers have a channel number that is divisible by 'divisor'
8 | # https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
9 | return math.ceil(v / divisor) * divisor
10 |
11 |
12 | class Flatten(nn.Module):
13 | # Use after nn.AdaptiveAvgPool2d(1) to remove last 2 dimensions
14 | def forward(self, x):
15 | return x.view(x.size(0), -1)
16 |
17 |
18 | class Concat(nn.Module):
19 | # Concatenate a list of tensors along dimension
20 | def __init__(self, dimension=1):
21 | super(Concat, self).__init__()
22 | self.d = dimension
23 |
24 | def forward(self, x):
25 | return torch.cat(x, self.d)
26 |
27 |
28 | class FeatureConcat(nn.Module):
29 | def __init__(self, layers):
30 | super(FeatureConcat, self).__init__()
31 | self.layers = layers # layer indices
32 | self.multiple = len(layers) > 1 # multiple layers flag
33 |
34 | def forward(self, x, outputs):
35 | return torch.cat([outputs[i] for i in self.layers], 1) if self.multiple else outputs[self.layers[0]]
36 |
37 |
38 | class WeightedFeatureFusion(nn.Module): # weighted sum of 2 or more layers https://arxiv.org/abs/1911.09070
39 | def __init__(self, layers, weight=False):
40 | super(WeightedFeatureFusion, self).__init__()
41 | self.layers = layers # layer indices
42 | self.weight = weight # apply weights boolean
43 | self.n = len(layers) + 1 # number of layers
44 | if weight:
45 | self.w = nn.Parameter(torch.zeros(self.n), requires_grad=True) # layer weights
46 |
47 | def forward(self, x, outputs):
48 | # Weights
49 | if self.weight:
50 | w = torch.sigmoid(self.w) * (2 / self.n) # sigmoid weights (0-1)
51 | x = x * w[0]
52 |
53 | # Fusion
54 | nx = x.shape[1] # input channels
55 | for i in range(self.n - 1):
56 | a = outputs[self.layers[i]] * w[i + 1] if self.weight else outputs[self.layers[i]] # feature to add
57 | na = a.shape[1] # feature channels
58 |
59 | # Adjust channels
60 | if nx == na: # same shape
61 | x = x + a
62 | elif nx > na: # slice input
63 | x[:, :na] = x[:, :na] + a # or a = nn.ZeroPad2d((0, 0, 0, 0, 0, dc))(a); x = x + a
64 | else: # slice feature
65 | x = x + a[:, :nx]
66 |
67 | return x
68 |
69 |
70 | class MixConv2d(nn.Module): # MixConv: Mixed Depthwise Convolutional Kernels https://arxiv.org/abs/1907.09595
71 | def __init__(self, in_ch, out_ch, k=(3, 5, 7), stride=1, dilation=1, bias=True, method='equal_params'):
72 | super(MixConv2d, self).__init__()
73 |
74 | groups = len(k)
75 | if method == 'equal_ch': # equal channels per group
76 | i = torch.linspace(0, groups - 1E-6, out_ch).floor() # out_ch indices
77 | ch = [(i == g).sum() for g in range(groups)]
78 | else: # 'equal_params': equal parameter count per group
79 | b = [out_ch] + [0] * groups
80 | a = np.eye(groups + 1, groups, k=-1)
81 | a -= np.roll(a, 1, axis=1)
82 | a *= np.array(k) ** 2
83 | a[0] = 1
84 | ch = np.linalg.lstsq(a, b, rcond=None)[0].round().astype(int) # solve for equal weight indices, ax = b
85 |
86 | self.m = nn.ModuleList([nn.Conv2d(in_channels=in_ch,
87 | out_channels=ch[g],
88 | kernel_size=k[g],
89 | stride=stride,
90 | padding=k[g] // 2, # 'same' pad
91 | dilation=dilation,
92 | bias=bias) for g in range(groups)])
93 |
94 | def forward(self, x):
95 | return torch.cat([m(x) for m in self.m], 1)
96 |
97 |
98 | # Activation functions below -------------------------------------------------------------------------------------------
99 | class SwishImplementation(torch.autograd.Function):
100 | @staticmethod
101 | def forward(ctx, x):
102 | ctx.save_for_backward(x)
103 | return x * torch.sigmoid(x)
104 |
105 | @staticmethod
106 | def backward(ctx, grad_output):
107 | x = ctx.saved_tensors[0]
108 | sx = torch.sigmoid(x) # sigmoid(ctx)
109 | return grad_output * (sx * (1 + x * (1 - sx)))
110 |
111 |
112 | class MishImplementation(torch.autograd.Function):
113 | @staticmethod
114 | def forward(ctx, x):
115 | ctx.save_for_backward(x)
116 | return x.mul(torch.tanh(F.softplus(x))) # x * tanh(ln(1 + exp(x)))
117 |
118 | @staticmethod
119 | def backward(ctx, grad_output):
120 | x = ctx.saved_tensors[0]
121 | sx = torch.sigmoid(x)
122 | fx = F.softplus(x).tanh()
123 | return grad_output * (fx + x * sx * (1 - fx * fx))
124 |
125 |
126 | class MemoryEfficientSwish(nn.Module):
127 | def forward(self, x):
128 | return SwishImplementation.apply(x)
129 |
130 |
131 | class MemoryEfficientMish(nn.Module):
132 | def forward(self, x):
133 | return MishImplementation.apply(x)
134 |
135 |
136 | class Swish(nn.Module):
137 | def forward(self, x):
138 | return x * torch.sigmoid(x)
139 |
140 |
141 | class HardSwish(nn.Module): # https://arxiv.org/pdf/1905.02244.pdf
142 | def forward(self, x):
143 | return x * F.hardtanh(x + 3, 0., 6., True) / 6.
144 |
145 |
146 | class Mish(nn.Module): # https://github.com/digantamisra98/Mish
147 | def forward(self, x):
148 | return x * F.softplus(x).tanh()
149 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # Ultralytics-YOLOv3-Cluster-NMS
4 | ## Cluster-NMS into YOLOv3 Pytorch
5 | Our paper has been accepted by **IEEE Transactions on Cybernetics (TCYB)**.
6 |
7 | #### This is the code for our papers:
8 | - [Distance-IoU Loss: Faster and Better Learning for Bounding Box Regression](https://arxiv.org/abs/1911.08287)
9 | - [Enhancing Geometric Factors into Model Learning and Inference for Object Detection and Instance Segmentation](http://arxiv.org/abs/2005.03572)
10 |
11 | ```
12 | @Inproceedings{zheng2020diou,
13 | author = {Zheng, Zhaohui and Wang, Ping and Liu, Wei and Li, Jinze and Ye, Rongguang and Ren, Dongwei},
14 | title = {Distance-IoU Loss: Faster and Better Learning for Bounding Box Regression},
15 | booktitle = {The AAAI Conference on Artificial Intelligence (AAAI)},
16 | year = {2020},
17 | }
18 |
19 | @Article{zheng2021ciou,
20 | author = {Zheng, Zhaohui and Wang, Ping and Ren, Dongwei and Liu, Wei and Ye, Rongguang and Hu, Qinghua and Zuo, Wangmeng},
21 | title = {Enhancing Geometric Factors in Model Learning and Inference for Object Detection and Instance Segmentation},
22 |   journal = {IEEE Transactions on Cybernetics},
23 | year = {2021},
24 | }
25 | ```
26 | # Introduction
27 |
28 | In this [paper](http://arxiv.org/abs/2005.03572), we propose Complete-IoU (CIoU) loss and Cluster-NMS for enhancing geometric factors in both bounding box regression and Non-Maximum Suppression (NMS), leading to notable gains in average precision (AP) and average recall (AR) without sacrificing inference efficiency. In particular, we consider three geometric factors, i.e., overlap area, normalized central point distance and aspect ratio, which are crucial for measuring bounding box regression in object detection and instance segmentation. The three geometric factors are then incorporated into CIoU loss for better distinguishing difficult regression cases. Training deep models with CIoU loss results in consistent AP and AR improvements compared to the widely adopted Ln-norm loss and IoU-based loss. Furthermore, we propose Cluster-NMS, where NMS during inference is done by implicitly clustering detected boxes and usually requires fewer iterations. Cluster-NMS is very efficient due to its pure GPU implementation, and geometric factors can be incorporated to improve both AP and AR. In the experiments, CIoU loss and Cluster-NMS have been applied to state-of-the-art instance segmentation (e.g., YOLACT) and object detection (e.g., YOLO v3, SSD and Faster R-CNN) models.
29 |
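A minimal sketch of the CIoU term described above, assuming boxes in `(x1, y1, x2, y2)` format. This is an illustration of the formula only, not the training code used in this repo:

```python
import math
import torch


def ciou(box1, box2, eps=1e-7):
    # Illustrative CIoU for paired boxes of shape (N, 4) in (x1, y1, x2, y2) format.
    # CIoU = IoU - rho^2(b, b_gt)/c^2 - alpha * v; the CIoU loss is 1 - CIoU.
    b1_x1, b1_y1, b1_x2, b1_y2 = box1.unbind(-1)
    b2_x1, b2_y1, b2_x2, b2_y2 = box2.unbind(-1)

    # Overlap area (IoU)
    inter = (torch.min(b1_x2, b2_x2) - torch.max(b1_x1, b2_x1)).clamp(0) * \
            (torch.min(b1_y2, b2_y2) - torch.max(b1_y1, b2_y1)).clamp(0)
    w1, h1 = b1_x2 - b1_x1, b1_y2 - b1_y1
    w2, h2 = b2_x2 - b2_x1, b2_y2 - b2_y1
    iou = inter / (w1 * h1 + w2 * h2 - inter + eps)

    # Normalized central point distance: rho^2 over the squared diagonal c^2 of the enclosing box
    cw = torch.max(b1_x2, b2_x2) - torch.min(b1_x1, b2_x1)
    ch = torch.max(b1_y2, b2_y2) - torch.min(b1_y1, b2_y1)
    c2 = cw ** 2 + ch ** 2 + eps
    rho2 = ((b1_x1 + b1_x2 - b2_x1 - b2_x2) ** 2 + (b1_y1 + b1_y2 - b2_y1 - b2_y2) ** 2) / 4

    # Aspect-ratio consistency term v and its trade-off weight alpha
    v = (4 / math.pi ** 2) * (torch.atan(w2 / (h2 + eps)) - torch.atan(w1 / (h1 + eps))) ** 2
    with torch.no_grad():
        alpha = v / (1 - iou + v + eps)
    return iou - rho2 / c2 - alpha * v  # use (1 - ciou(pred, target)).mean() as the loss
```
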
30 | ### This repo focuses only on the NMS improvements, built on top of https://github.com/ultralytics/yolov3.
31 |
32 | ### See `non_max_suppression` function of [utils/utils.py](utils/utils.py) for our Cluster-NMS implementation.
33 |
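For orientation, here is a minimal sketch of plain Cluster-NMS using only matrix operations, following the description in the paper. The `non_max_suppression` function referenced above is the authoritative implementation; the threshold value below is illustrative:

```python
import torch


def pairwise_iou(boxes, eps=1e-7):
    # (N, 4) boxes in (x1, y1, x2, y2) -> (N, N) IoU matrix
    area = (boxes[:, 2] - boxes[:, 0]).clamp(0) * (boxes[:, 3] - boxes[:, 1]).clamp(0)
    lt = torch.max(boxes[:, None, :2], boxes[None, :, :2])
    rb = torch.min(boxes[:, None, 2:], boxes[None, :, 2:])
    wh = (rb - lt).clamp(0)
    inter = wh[..., 0] * wh[..., 1]
    return inter / (area[:, None] + area[None, :] - inter + eps)


def cluster_nms(boxes, scores, iou_thr=0.5, max_iter=200):
    # boxes: (N, 4), scores: (N,); returns indices of retained boxes
    order = scores.argsort(descending=True)
    x = pairwise_iou(boxes[order]).triu(diagonal=1)  # upper-triangular IoU matrix X
    keep = torch.ones(len(order), dtype=torch.bool, device=boxes.device)
    for _ in range(max_iter):
        c = x * keep.float().unsqueeze(1)          # C^t = diag(b^{t-1}) X
        new_keep = c.max(dim=0).values <= iou_thr  # b^t from the column maxima of C^t
        if bool((new_keep == keep).all()):         # b is a fixed point: done
            break
        keep = new_keep
    return order[keep]
```

The first iteration is exactly Fast NMS; iterating to the fixed point gives the same result as sequential (Original) NMS while staying fully vectorized on the GPU.
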
34 | This directory contains PyTorch YOLOv3 software developed by Ultralytics LLC, and **is freely available for redistribution under the GPL-3.0 license**. For more information please visit https://www.ultralytics.com.
35 |
36 | # Description
37 |
38 | The https://github.com/ultralytics/yolov3 repo contains inference and training code for YOLOv3 in PyTorch. The code works on Linux, MacOS and Windows. Training is done on the COCO dataset by default: https://cocodataset.org/#home. **Credit to Joseph Redmon for YOLO:** https://pjreddie.com/darknet/yolo/.
39 |
40 | # Requirements
41 |
42 | Python 3.7 or later with all `pip install -U -r requirements.txt` packages including `torch >= 1.5`. Docker images come with all dependencies preinstalled. Docker requirements are:
43 | - Nvidia Driver >= 440.44
44 | - Docker Engine - CE >= 19.03
45 |
46 | # mAP
47 |
48 | | Model | Size | COCO mAP@0.5...0.95 | COCO mAP@0.5 |
49 | | --- | --- | --- | --- |
50 | | YOLOv3-tiny<br>YOLOv3<br>YOLOv3-SPP<br>**[YOLOv3-SPP-ultralytics](https://drive.google.com/open?id=1UcR-zVoMs7DH5dj3N1bswkiQTA4dmKF4)** | 320 | 14.0<br>28.7<br>30.5<br>**37.7** | 29.1<br>51.8<br>52.3<br>**56.8** |
51 | | YOLOv3-tiny<br>YOLOv3<br>YOLOv3-SPP<br>**[YOLOv3-SPP-ultralytics](https://drive.google.com/open?id=1UcR-zVoMs7DH5dj3N1bswkiQTA4dmKF4)** | 416 | 16.0<br>31.2<br>33.9<br>**41.2** | 33.0<br>55.4<br>56.9<br>**60.6** |
52 | | YOLOv3-tiny<br>YOLOv3<br>YOLOv3-SPP<br>**[YOLOv3-SPP-ultralytics](https://drive.google.com/open?id=1UcR-zVoMs7DH5dj3N1bswkiQTA4dmKF4)** | 512 | 16.6<br>32.7<br>35.6<br>**42.6** | 34.9<br>57.7<br>59.5<br>**62.4** |
53 | | YOLOv3-tiny<br>YOLOv3<br>YOLOv3-SPP<br>**[YOLOv3-SPP-ultralytics](https://drive.google.com/open?id=1UcR-zVoMs7DH5dj3N1bswkiQTA4dmKF4)** | 608 | 16.6<br>33.1<br>37.0<br>**43.1** | 35.4<br>58.2<br>60.7<br>**62.8** |
54 |
55 | - mAP@0.5 run at `--iou-thr 0.5`, mAP@0.5...0.95 run at `--iou-thr 0.7`
56 | - Darknet results: https://arxiv.org/abs/1804.02767
57 |
58 | ## Cluster-NMS
59 |
60 | #### Hardware
61 | - 2 GTX 1080 Ti
62 | - Intel(R) Core(TM) i7-6850K CPU @ 3.60GHz
63 |
64 | Evaluation command: `python3 test.py --cfg yolov3-spp.cfg --weights yolov3-spp-ultralytics.pt`
65 |
66 | AP is reported on `coco 2014 minival`.
67 |
68 | | Image Size | Model | NMS | FPS | box AP | box AP75 | box AR100 |
69 | |:----:|:-------------:|:------------------------------------:|:----:|:----:|:----:|:----:|
70 | | 608 | YOLOv3-SPP-ultralytics | Fast NMS | 85.5 | 42.2 | 45.1 | 60.1 |
71 | | 608 | YOLOv3-SPP-ultralytics | Original NMS | 14.6 | 42.6 | 45.8 | 62.5 |
72 | | 608 | YOLOv3-SPP-ultralytics | DIoU-NMS | 7.9 | 42.7 | 46.2 | 63.4 |
73 | | 608 | YOLOv3-SPP-ultralytics | Original NMS Torchvision | **95.2** | 42.6 | 45.8 | 62.5 |
74 | | 608 | YOLOv3-SPP-ultralytics | Cluster-NMS | 82.6 | 42.6 | 45.8 | 62.5 |
75 | | 608 | YOLOv3-SPP-ultralytics | Cluster-DIoU-NMS | 76.9 | 42.7 | 46.2 | 63.4 |
76 | | 608 | YOLOv3-SPP-ultralytics | Weighted-NMS | 11.2 | 42.9 | 46.4 | 62.7 |
77 | | 608 | YOLOv3-SPP-ultralytics | Weighted Cluster-NMS | 68.0 | 42.9 | 46.4 | 62.7 |
78 | | 608 | YOLOv3-SPP-ultralytics | Weighted + Cluster-DIoU-NMS | 64.9 | **43.1** | **46.8** | **63.7** |
79 | | 608 | YOLOv3-SPP-ultralytics | Merge + Torchvision NMS | 88.5 | 42.8 | 46.3 | 63.0 |
80 | | 608 | YOLOv3-SPP-ultralytics | Merge + DIoU + Torchvision NMS | 82.5 | 43.0 | 46.6 | 63.2 |
81 | ## Conclusion
82 |
83 | - Merge NMS is a simplified version of Weighted-NMS. It uses only the score vector to weight box coordinates, rather than combining score and IoU. (Refer to [CAD](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8265304) for the details of Weighted-NMS.)
84 |
85 | - We further incorporate DIoU into NMS for YOLOv3, which yields higher AP and AR.
86 |
87 | - Note that Torchvision NMS is the fastest, owing to its CUDA implementation and engineering accelerations (such as computing only the upper-triangular IoU matrix). However, our Cluster-NMS requires fewer iterations and can also be accelerated further with the same engineering tricks. Glenn Jocher's Torchvision NMS + Merge was completed at almost the same time as the work of our paper: first run Torchvision NMS, then convert its output into a vector that multiplies the IoU matrix (see the sketch after this list). Also, for Merge NMS the IoU matrix need not be square (`n*n`); it can be `m*n` to save more time, where `m` is the number of boxes that NMS outputs.
88 |
89 | - Currently, Torchvision NMS uses IoU as its criterion, not DIoU. If we directly replace IoU with DIoU in Original NMS, it costs much more time because of the sequential operation. Cluster-DIoU-NMS significantly speeds up DIoU-NMS while producing exactly the same result.
90 |
91 | - Torchvision NMS is a function in Torchvision>=0.3, while our Cluster-NMS can be applied to any project that uses an older version of Torchvision, or to other deep learning frameworks, as long as matrix operations are available. **No extra imports, no compilation, fewer iterations, fully GPU-accelerated and better performance**.
92 |
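To make the `m*n` Merge step above concrete, here is a minimal sketch under the same assumptions as before (`xyxy` boxes, `torchvision.ops` helpers standing in for the repo's own utilities); `merge_nms_torchvision` and its single shared `iou_thres` are illustrative only, not the repo's `non_max_suppression`.

```python
import torch
from torchvision.ops import nms, box_iou  # Torchvision >= 0.3


def merge_nms_torchvision(boxes, scores, iou_thres=0.5):
    # Hypothetical standalone sketch. boxes: (n, 4) in xyxy format, scores: (n,).
    keep = nms(boxes, scores, iou_thres)                # indices of the m surviving boxes
    iou = box_iou(boxes[keep], boxes)                   # m x n IoU matrix (no need for n x n)
    weights = (iou > iou_thres).float() * scores[None]  # cluster membership weighted by score only
    merged = weights @ boxes / weights.sum(1, keepdim=True)  # score-weighted coordinate average
    return merged, scores[keep]
```

For Cluster-DIoU-NMS, the clustering loop stays the same and only the suppression criterion changes, from IoU to DIoU = IoU − ρ²/c² (center-distance penalty normalized by the enclosing-box diagonal), which is why it matches sequential DIoU-NMS exactly at a fraction of the cost.
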
93 | # Citation
94 |
95 | [](https://zenodo.org/badge/latestdoi/146165888)
96 |
--------------------------------------------------------------------------------
/detect.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from sys import platform
3 |
4 | from models import * # set ONNX_EXPORT in models.py
5 | from utils.datasets import *
6 | from utils.utils import *
7 |
8 |
9 | def detect(save_img=False):
10 | img_size = (320, 192) if ONNX_EXPORT else opt.img_size # (320, 192) or (416, 256) or (608, 352) for (height, width)
11 | out, source, weights, half, view_img, save_txt = opt.output, opt.source, opt.weights, opt.half, opt.view_img, opt.save_txt
12 | webcam = source == '0' or source.startswith('rtsp') or source.startswith('http') or source.endswith('.txt')
13 |
14 | # Initialize
15 | device = torch_utils.select_device(device='cpu' if ONNX_EXPORT else opt.device)
16 | if os.path.exists(out):
17 | shutil.rmtree(out) # delete output folder
18 | os.makedirs(out) # make new output folder
19 |
20 | # Initialize model
21 | model = Darknet(opt.cfg, img_size)
22 |
23 | # Load weights
24 | attempt_download(weights)
25 | if weights.endswith('.pt'): # pytorch format
26 | model.load_state_dict(torch.load(weights, map_location=device)['model'])
27 | else: # darknet format
28 | load_darknet_weights(model, weights)
29 |
30 | # Second-stage classifier
31 | classify = False
32 | if classify:
33 | modelc = torch_utils.load_classifier(name='resnet101', n=2) # initialize
34 | modelc.load_state_dict(torch.load('weights/resnet101.pt', map_location=device)['model']) # load weights
35 | modelc.to(device).eval()
36 |
37 | # Eval mode
38 | model.to(device).eval()
39 |
40 | # Fuse Conv2d + BatchNorm2d layers
41 | # model.fuse()
42 |
43 | # Export mode
44 | if ONNX_EXPORT:
45 | model.fuse()
46 | img = torch.zeros((1, 3) + img_size) # (1, 3, 320, 192)
47 | f = opt.weights.replace(opt.weights.split('.')[-1], 'onnx') # *.onnx filename
48 | torch.onnx.export(model, img, f, verbose=False, opset_version=11,
49 | input_names=['images'], output_names=['classes', 'boxes'])
50 |
51 | # Validate exported model
52 | import onnx
53 | model = onnx.load(f) # Load the ONNX model
54 | onnx.checker.check_model(model) # Check that the IR is well formed
55 | print(onnx.helper.printable_graph(model.graph)) # Print a human readable representation of the graph
56 | return
57 |
58 | # Half precision
59 | half = half and device.type != 'cpu' # half precision only supported on CUDA
60 | if half:
61 | model.half()
62 |
63 | # Set Dataloader
64 | vid_path, vid_writer = None, None
65 | if webcam:
66 | view_img = True
67 | torch.backends.cudnn.benchmark = True # set True to speed up constant image size inference
68 | dataset = LoadStreams(source, img_size=img_size)
69 | else:
70 | save_img = True
71 | dataset = LoadImages(source, img_size=img_size)
72 |
73 | # Get names and colors
74 | names = load_classes(opt.names)
75 | colors = [[random.randint(0, 255) for _ in range(3)] for _ in range(len(names))]
76 |
77 | # Run inference
78 | t0 = time.time()
79 | img = torch.zeros((1, 3, img_size, img_size), device=device) # init img
80 | _ = model(img.half() if half else img.float()) if device.type != 'cpu' else None # run once
81 | for path, img, im0s, vid_cap in dataset:
82 | img = torch.from_numpy(img).to(device)
83 | img = img.half() if half else img.float() # uint8 to fp16/32
84 | img /= 255.0 # 0 - 255 to 0.0 - 1.0
85 | if img.ndimension() == 3:
86 | img = img.unsqueeze(0)
87 |
88 | # Inference
89 | t1 = torch_utils.time_synchronized()
90 | pred = model(img, augment=opt.augment)[0]
91 | t2 = torch_utils.time_synchronized()
92 |
93 | # to float
94 | if half:
95 | pred = pred.float()
96 |
97 | # Apply NMS
98 | pred = non_max_suppression(pred, opt.conf_thres, opt.iou_thres,
99 | multi_label=False, classes=opt.classes, agnostic=opt.agnostic_nms)
100 |
101 | # Apply Classifier
102 | if classify:
103 | pred = apply_classifier(pred, modelc, img, im0s)
104 |
105 | # Process detections
106 | for i, det in enumerate(pred): # detections per image
107 | if webcam: # batch_size >= 1
108 | p, s, im0 = path[i], '%g: ' % i, im0s[i]
109 | else:
110 | p, s, im0 = path, '', im0s
111 |
112 | save_path = str(Path(out) / Path(p).name)
113 | s += '%gx%g ' % img.shape[2:] # print string
114 | if det is not None and len(det):
115 | # Rescale boxes from img_size to im0 size
116 | det[:, :4] = scale_coords(img.shape[2:], det[:, :4], im0.shape).round()
117 |
118 | # Print results
119 | for c in det[:, -1].unique():
120 | n = (det[:, -1] == c).sum() # detections per class
121 | s += '%g %ss, ' % (n, names[int(c)]) # add to string
122 |
123 | # Write results
124 | for *xyxy, conf, cls in det:
125 | if save_txt: # Write to file
126 | with open(save_path + '.txt', 'a') as file:
127 | file.write(('%g ' * 6 + '\n') % (*xyxy, cls, conf))
128 |
129 | if save_img or view_img: # Add bbox to image
130 | label = '%s %.2f' % (names[int(cls)], conf)
131 | plot_one_box(xyxy, im0, label=label, color=colors[int(cls)])
132 |
133 | # Print time (inference + NMS)
134 | print('%sDone. (%.3fs)' % (s, t2 - t1))
135 |
136 | # Stream results
137 | if view_img:
138 | cv2.imshow(p, im0)
139 | if cv2.waitKey(1) == ord('q'): # q to quit
140 | raise StopIteration
141 |
142 | # Save results (image with detections)
143 | if save_img:
144 | if dataset.mode == 'images':
145 | cv2.imwrite(save_path, im0)
146 | else:
147 | if vid_path != save_path: # new video
148 | vid_path = save_path
149 | if isinstance(vid_writer, cv2.VideoWriter):
150 | vid_writer.release() # release previous video writer
151 |
152 | fps = vid_cap.get(cv2.CAP_PROP_FPS)
153 | w = int(vid_cap.get(cv2.CAP_PROP_FRAME_WIDTH))
154 | h = int(vid_cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
155 | vid_writer = cv2.VideoWriter(save_path, cv2.VideoWriter_fourcc(*opt.fourcc), fps, (w, h))
156 | vid_writer.write(im0)
157 |
158 | if save_txt or save_img:
159 | print('Results saved to %s' % os.getcwd() + os.sep + out)
160 | if platform == 'darwin': # MacOS
161 | os.system('open ' + save_path)
162 |
163 | print('Done. (%.3fs)' % (time.time() - t0))
164 |
165 |
166 | if __name__ == '__main__':
167 | parser = argparse.ArgumentParser()
168 | parser.add_argument('--cfg', type=str, default='cfg/yolov3-spp.cfg', help='*.cfg path')
169 | parser.add_argument('--names', type=str, default='data/coco.names', help='*.names path')
170 | parser.add_argument('--weights', type=str, default='weights/yolov3-spp-ultralytics.pt', help='weights path')
171 | parser.add_argument('--source', type=str, default='data/samples', help='source') # input file/folder, 0 for webcam
172 | parser.add_argument('--output', type=str, default='output', help='output folder') # output folder
173 | parser.add_argument('--img-size', type=int, default=512, help='inference size (pixels)')
174 | parser.add_argument('--conf-thres', type=float, default=0.3, help='object confidence threshold')
175 | parser.add_argument('--iou-thres', type=float, default=0.6, help='IOU threshold for NMS')
176 | parser.add_argument('--fourcc', type=str, default='mp4v', help='output video codec (verify ffmpeg support)')
177 | parser.add_argument('--half', action='store_true', help='half precision FP16 inference')
178 | parser.add_argument('--device', default='', help='device id (i.e. 0 or 0,1) or cpu')
179 | parser.add_argument('--view-img', action='store_true', help='display results')
180 | parser.add_argument('--save-txt', action='store_true', help='save results to *.txt')
181 | parser.add_argument('--classes', nargs='+', type=int, help='filter by class')
182 | parser.add_argument('--agnostic-nms', action='store_true', help='class-agnostic NMS')
183 | parser.add_argument('--augment', action='store_true', help='augmented inference')
184 | opt = parser.parse_args()
185 | print(opt)
186 |
187 | with torch.no_grad():
188 | detect()
189 |
--------------------------------------------------------------------------------
/utils/torch_utils.py:
--------------------------------------------------------------------------------
1 | import math
2 | import os
3 | import time
4 | from copy import deepcopy
5 |
6 | import torch
7 | import torch.backends.cudnn as cudnn
8 | import torch.nn as nn
9 | import torch.nn.functional as F
10 |
11 |
12 | def init_seeds(seed=0):
13 | torch.manual_seed(seed)
14 |
15 | # Remove randomness (may be slower on Tesla GPUs) # https://pytorch.org/docs/stable/notes/randomness.html
16 | if seed == 0:
17 | cudnn.deterministic = True
18 | cudnn.benchmark = False
19 |
20 |
21 | def select_device(device='', apex=False, batch_size=None):
22 | # device = 'cpu' or '0' or '0,1,2,3'
23 | cpu_request = device.lower() == 'cpu'
24 | if device and not cpu_request: # if device requested other than 'cpu'
25 | os.environ['CUDA_VISIBLE_DEVICES'] = device # set environment variable
26 | assert torch.cuda.is_available(), 'CUDA unavailable, invalid device %s requested' % device # check availablity
27 |
28 | cuda = False if cpu_request else torch.cuda.is_available()
29 | if cuda:
30 | c = 1024 ** 2 # bytes to MB
31 | ng = torch.cuda.device_count()
32 | if ng > 1 and batch_size: # check that batch_size is compatible with device_count
33 | assert batch_size % ng == 0, 'batch-size %g not multiple of GPU count %g' % (batch_size, ng)
34 | x = [torch.cuda.get_device_properties(i) for i in range(ng)]
35 | s = 'Using CUDA ' + ('Apex ' if apex else '') # apex for mixed precision https://github.com/NVIDIA/apex
36 | for i in range(0, ng):
37 | if i == 1:
38 | s = ' ' * len(s)
39 | print("%sdevice%g _CudaDeviceProperties(name='%s', total_memory=%dMB)" %
40 | (s, i, x[i].name, x[i].total_memory / c))
41 | else:
42 | print('Using CPU')
43 |
44 | print('') # skip a line
45 | return torch.device('cuda:0' if cuda else 'cpu')
46 |
47 |
48 | def time_synchronized():
49 | torch.cuda.synchronize() if torch.cuda.is_available() else None
50 | return time.time()
51 |
52 |
53 | def initialize_weights(model):
54 | for m in model.modules():
55 | t = type(m)
56 | if t is nn.Conv2d:
57 | pass # nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
58 | elif t is nn.BatchNorm2d:
59 | m.eps = 1e-4
60 | m.momentum = 0.03
61 | elif t in [nn.LeakyReLU, nn.ReLU, nn.ReLU6]:
62 | m.inplace = True
63 |
64 |
65 | def find_modules(model, mclass=nn.Conv2d):
66 | # finds layer indices matching module class 'mclass'
67 | return [i for i, m in enumerate(model.module_list) if isinstance(m, mclass)]
68 |
69 |
70 | def fuse_conv_and_bn(conv, bn):
71 | # https://tehnokv.com/posts/fusing-batchnorm-and-conv/
72 | with torch.no_grad():
73 | # init
74 | fusedconv = torch.nn.Conv2d(conv.in_channels,
75 | conv.out_channels,
76 | kernel_size=conv.kernel_size,
77 | stride=conv.stride,
78 | padding=conv.padding,
79 | bias=True)
80 |
81 | # prepare filters
82 | w_conv = conv.weight.clone().view(conv.out_channels, -1)
83 | w_bn = torch.diag(bn.weight.div(torch.sqrt(bn.eps + bn.running_var)))
84 | fusedconv.weight.copy_(torch.mm(w_bn, w_conv).view(fusedconv.weight.size()))
85 |
86 | # prepare spatial bias
87 | if conv.bias is not None:
88 | b_conv = conv.bias
89 | else:
90 | b_conv = torch.zeros(conv.weight.size(0))
91 | b_bn = bn.bias - bn.weight.mul(bn.running_mean).div(torch.sqrt(bn.running_var + bn.eps))
92 | fusedconv.bias.copy_(torch.mm(w_bn, b_conv.reshape(-1, 1)).reshape(-1) + b_bn)
93 |
94 | return fusedconv
95 |
96 |
97 | def model_info(model, verbose=False):
98 | # Plots a line-by-line description of a PyTorch model
99 | n_p = sum(x.numel() for x in model.parameters()) # number parameters
100 | n_g = sum(x.numel() for x in model.parameters() if x.requires_grad) # number gradients
101 | if verbose:
102 | print('%5s %40s %9s %12s %20s %10s %10s' % ('layer', 'name', 'gradient', 'parameters', 'shape', 'mu', 'sigma'))
103 | for i, (name, p) in enumerate(model.named_parameters()):
104 | name = name.replace('module_list.', '')
105 | print('%5g %40s %9s %12g %20s %10.3g %10.3g' %
106 | (i, name, p.requires_grad, p.numel(), list(p.shape), p.mean(), p.std()))
107 |
108 | try: # FLOPS
109 | from thop import profile
110 | macs, _ = profile(model, inputs=(torch.zeros(1, 3, 480, 640),), verbose=False)
111 | fs = ', %.1f GFLOPS' % (macs / 1E9 * 2)
112 | except:
113 | fs = ''
114 |
115 | print('Model Summary: %g layers, %g parameters, %g gradients%s' % (len(list(model.parameters())), n_p, n_g, fs))
116 |
117 |
118 | def load_classifier(name='resnet101', n=2):
119 | # Loads a pretrained model reshaped to n-class output
120 | import pretrainedmodels # https://github.com/Cadene/pretrained-models.pytorch#torchvision
121 | model = pretrainedmodels.__dict__[name](num_classes=1000, pretrained='imagenet')
122 |
123 | # Display model properties
124 | for x in ['model.input_size', 'model.input_space', 'model.input_range', 'model.mean', 'model.std']:
125 | print(x + ' =', eval(x))
126 |
127 | # Reshape output to n classes
128 | filters = model.last_linear.weight.shape[1]
129 | model.last_linear.bias = torch.nn.Parameter(torch.zeros(n))
130 | model.last_linear.weight = torch.nn.Parameter(torch.zeros(n, filters))
131 | model.last_linear.out_features = n
132 | return model
133 |
134 |
135 | def scale_img(img, ratio=1.0, same_shape=True): # img(16,3,256,416), r=ratio
136 | # scales img(bs,3,y,x) by ratio
137 | h, w = img.shape[2:]
138 | s = (int(h * ratio), int(w * ratio)) # new size
139 | img = F.interpolate(img, size=s, mode='bilinear', align_corners=False) # resize
140 | if not same_shape: # pad/crop img
141 | gs = 64 # (pixels) grid size
142 | h, w = [math.ceil(x * ratio / gs) * gs for x in (h, w)]
143 | return F.pad(img, [0, w - s[1], 0, h - s[0]], value=0.447) # value = imagenet mean
144 |
145 |
146 | class ModelEMA:
147 | """ Model Exponential Moving Average from https://github.com/rwightman/pytorch-image-models
148 | Keep a moving average of everything in the model state_dict (parameters and buffers).
149 | This is intended to allow functionality like
150 | https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage
151 | A smoothed version of the weights is necessary for some training schemes to perform well.
152 | E.g. Google's hyper-params for training MNASNet, MobileNet-V3, EfficientNet, etc that use
153 | RMSprop with a short 2.4-3 epoch decay period and slow LR decay rate of .96-.99 requires EMA
154 | smoothing of weights to match results. Pay attention to the decay constant you are using
155 | relative to your update count per epoch.
156 | To keep EMA from using GPU resources, set device='cpu'. This will save a bit of memory but
157 | disable validation of the EMA weights. Validation will have to be done manually in a separate
158 | process, or after the training stops converging.
159 | This class is sensitive where it is initialized in the sequence of model init,
160 | GPU assignment and distributed training wrappers.
161 | I've tested with the sequence in my own train.py for torch.DataParallel, apex.DDP, and single-GPU.
162 | """
163 |
164 | def __init__(self, model, decay=0.9999, device=''):
165 | # make a copy of the model for accumulating moving average of weights
166 | self.ema = deepcopy(model)
167 | self.ema.eval()
168 | self.updates = 0 # number of EMA updates
169 | self.decay = lambda x: decay * (1 - math.exp(-x / 2000)) # decay exponential ramp (to help early epochs)
170 | self.device = device # perform ema on different device from model if set
171 | if device:
172 | self.ema.to(device=device)
173 | for p in self.ema.parameters():
174 | p.requires_grad_(False)
175 |
176 | def update(self, model):
177 | self.updates += 1
178 | d = self.decay(self.updates)
179 | with torch.no_grad():
180 | if type(model) in (nn.parallel.DataParallel, nn.parallel.DistributedDataParallel):
181 | msd, esd = model.module.state_dict(), self.ema.module.state_dict()
182 | else:
183 | msd, esd = model.state_dict(), self.ema.state_dict()
184 |
185 | for k, v in esd.items():
186 | if v.dtype.is_floating_point:
187 | v *= d
188 | v += (1. - d) * msd[k].detach()
189 |
190 | def update_attr(self, model):
191 | # Assign attributes (which may change during training)
192 | for k in model.__dict__.keys():
193 | if not k.startswith('_'):
194 | setattr(self.ema, k, getattr(model, k))
195 |
--------------------------------------------------------------------------------
/utils/adabound.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 | import torch
4 | from torch.optim.optimizer import Optimizer
5 |
6 |
7 | class AdaBound(Optimizer):
8 | """Implements AdaBound algorithm.
9 | It has been proposed in `Adaptive Gradient Methods with Dynamic Bound of Learning Rate`_.
10 | Arguments:
11 | params (iterable): iterable of parameters to optimize or dicts defining
12 | parameter groups
13 | lr (float, optional): Adam learning rate (default: 1e-3)
14 | betas (Tuple[float, float], optional): coefficients used for computing
15 | running averages of gradient and its square (default: (0.9, 0.999))
16 | final_lr (float, optional): final (SGD) learning rate (default: 0.1)
17 | gamma (float, optional): convergence speed of the bound functions (default: 1e-3)
18 | eps (float, optional): term added to the denominator to improve
19 | numerical stability (default: 1e-8)
20 | weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
21 | amsbound (boolean, optional): whether to use the AMSBound variant of this algorithm
22 | .. Adaptive Gradient Methods with Dynamic Bound of Learning Rate:
23 | https://openreview.net/forum?id=Bkg3g2R9FX
24 | """
25 |
26 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), final_lr=0.1, gamma=1e-3,
27 | eps=1e-8, weight_decay=0, amsbound=False):
28 | if not 0.0 <= lr:
29 | raise ValueError("Invalid learning rate: {}".format(lr))
30 | if not 0.0 <= eps:
31 | raise ValueError("Invalid epsilon value: {}".format(eps))
32 | if not 0.0 <= betas[0] < 1.0:
33 | raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
34 | if not 0.0 <= betas[1] < 1.0:
35 | raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
36 | if not 0.0 <= final_lr:
37 | raise ValueError("Invalid final learning rate: {}".format(final_lr))
38 | if not 0.0 <= gamma < 1.0:
39 | raise ValueError("Invalid gamma parameter: {}".format(gamma))
40 | defaults = dict(lr=lr, betas=betas, final_lr=final_lr, gamma=gamma, eps=eps,
41 | weight_decay=weight_decay, amsbound=amsbound)
42 | super(AdaBound, self).__init__(params, defaults)
43 |
44 | self.base_lrs = list(map(lambda group: group['lr'], self.param_groups))
45 |
46 | def __setstate__(self, state):
47 | super(AdaBound, self).__setstate__(state)
48 | for group in self.param_groups:
49 | group.setdefault('amsbound', False)
50 |
51 | def step(self, closure=None):
52 | """Performs a single optimization step.
53 | Arguments:
54 | closure (callable, optional): A closure that reevaluates the model
55 | and returns the loss.
56 | """
57 | loss = None
58 | if closure is not None:
59 | loss = closure()
60 |
61 | for group, base_lr in zip(self.param_groups, self.base_lrs):
62 | for p in group['params']:
63 | if p.grad is None:
64 | continue
65 | grad = p.grad.data
66 | if grad.is_sparse:
67 | raise RuntimeError(
68 | 'Adam does not support sparse gradients, please consider SparseAdam instead')
69 | amsbound = group['amsbound']
70 |
71 | state = self.state[p]
72 |
73 | # State initialization
74 | if len(state) == 0:
75 | state['step'] = 0
76 | # Exponential moving average of gradient values
77 | state['exp_avg'] = torch.zeros_like(p.data)
78 | # Exponential moving average of squared gradient values
79 | state['exp_avg_sq'] = torch.zeros_like(p.data)
80 | if amsbound:
81 | # Maintains max of all exp. moving avg. of sq. grad. values
82 | state['max_exp_avg_sq'] = torch.zeros_like(p.data)
83 |
84 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
85 | if amsbound:
86 | max_exp_avg_sq = state['max_exp_avg_sq']
87 | beta1, beta2 = group['betas']
88 |
89 | state['step'] += 1
90 |
91 | if group['weight_decay'] != 0:
92 | grad = grad.add(group['weight_decay'], p.data)
93 |
94 | # Decay the first and second moment running average coefficient
95 | exp_avg.mul_(beta1).add_(1 - beta1, grad)
96 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
97 | if amsbound:
98 | # Maintains the maximum of all 2nd moment running avg. till now
99 | torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
100 | # Use the max. for normalizing running avg. of gradient
101 | denom = max_exp_avg_sq.sqrt().add_(group['eps'])
102 | else:
103 | denom = exp_avg_sq.sqrt().add_(group['eps'])
104 |
105 | bias_correction1 = 1 - beta1 ** state['step']
106 | bias_correction2 = 1 - beta2 ** state['step']
107 | step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1
108 |
109 | # Applies bounds on actual learning rate
110 | # lr_scheduler cannot affect final_lr, this is a workaround to apply lr decay
111 | final_lr = group['final_lr'] * group['lr'] / base_lr
112 | lower_bound = final_lr * (1 - 1 / (group['gamma'] * state['step'] + 1))
113 | upper_bound = final_lr * (1 + 1 / (group['gamma'] * state['step']))
114 | step_size = torch.full_like(denom, step_size)
115 | step_size.div_(denom).clamp_(lower_bound, upper_bound).mul_(exp_avg)
116 |
117 | p.data.add_(-step_size)
118 |
119 | return loss
120 |
121 |
122 | class AdaBoundW(Optimizer):
123 | """Implements AdaBound algorithm with Decoupled Weight Decay (arxiv.org/abs/1711.05101)
124 | It has been proposed in `Adaptive Gradient Methods with Dynamic Bound of Learning Rate`_.
125 | Arguments:
126 | params (iterable): iterable of parameters to optimize or dicts defining
127 | parameter groups
128 | lr (float, optional): Adam learning rate (default: 1e-3)
129 | betas (Tuple[float, float], optional): coefficients used for computing
130 | running averages of gradient and its square (default: (0.9, 0.999))
131 | final_lr (float, optional): final (SGD) learning rate (default: 0.1)
132 | gamma (float, optional): convergence speed of the bound functions (default: 1e-3)
133 | eps (float, optional): term added to the denominator to improve
134 | numerical stability (default: 1e-8)
135 | weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
136 | amsbound (boolean, optional): whether to use the AMSBound variant of this algorithm
137 | .. Adaptive Gradient Methods with Dynamic Bound of Learning Rate:
138 | https://openreview.net/forum?id=Bkg3g2R9FX
139 | """
140 |
141 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), final_lr=0.1, gamma=1e-3,
142 | eps=1e-8, weight_decay=0, amsbound=False):
143 | if not 0.0 <= lr:
144 | raise ValueError("Invalid learning rate: {}".format(lr))
145 | if not 0.0 <= eps:
146 | raise ValueError("Invalid epsilon value: {}".format(eps))
147 | if not 0.0 <= betas[0] < 1.0:
148 | raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
149 | if not 0.0 <= betas[1] < 1.0:
150 | raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
151 | if not 0.0 <= final_lr:
152 | raise ValueError("Invalid final learning rate: {}".format(final_lr))
153 | if not 0.0 <= gamma < 1.0:
154 | raise ValueError("Invalid gamma parameter: {}".format(gamma))
155 | defaults = dict(lr=lr, betas=betas, final_lr=final_lr, gamma=gamma, eps=eps,
156 | weight_decay=weight_decay, amsbound=amsbound)
157 | super(AdaBoundW, self).__init__(params, defaults)
158 |
159 | self.base_lrs = list(map(lambda group: group['lr'], self.param_groups))
160 |
161 | def __setstate__(self, state):
162 | super(AdaBoundW, self).__setstate__(state)
163 | for group in self.param_groups:
164 | group.setdefault('amsbound', False)
165 |
166 | def step(self, closure=None):
167 | """Performs a single optimization step.
168 | Arguments:
169 | closure (callable, optional): A closure that reevaluates the model
170 | and returns the loss.
171 | """
172 | loss = None
173 | if closure is not None:
174 | loss = closure()
175 |
176 | for group, base_lr in zip(self.param_groups, self.base_lrs):
177 | for p in group['params']:
178 | if p.grad is None:
179 | continue
180 | grad = p.grad.data
181 | if grad.is_sparse:
182 | raise RuntimeError(
183 | 'Adam does not support sparse gradients, please consider SparseAdam instead')
184 | amsbound = group['amsbound']
185 |
186 | state = self.state[p]
187 |
188 | # State initialization
189 | if len(state) == 0:
190 | state['step'] = 0
191 | # Exponential moving average of gradient values
192 | state['exp_avg'] = torch.zeros_like(p.data)
193 | # Exponential moving average of squared gradient values
194 | state['exp_avg_sq'] = torch.zeros_like(p.data)
195 | if amsbound:
196 | # Maintains max of all exp. moving avg. of sq. grad. values
197 | state['max_exp_avg_sq'] = torch.zeros_like(p.data)
198 |
199 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
200 | if amsbound:
201 | max_exp_avg_sq = state['max_exp_avg_sq']
202 | beta1, beta2 = group['betas']
203 |
204 | state['step'] += 1
205 |
206 | # Decay the first and second moment running average coefficient
207 | exp_avg.mul_(beta1).add_(1 - beta1, grad)
208 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
209 | if amsbound:
210 | # Maintains the maximum of all 2nd moment running avg. till now
211 | torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
212 | # Use the max. for normalizing running avg. of gradient
213 | denom = max_exp_avg_sq.sqrt().add_(group['eps'])
214 | else:
215 | denom = exp_avg_sq.sqrt().add_(group['eps'])
216 |
217 | bias_correction1 = 1 - beta1 ** state['step']
218 | bias_correction2 = 1 - beta2 ** state['step']
219 | step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1
220 |
221 | # Applies bounds on actual learning rate
222 | # lr_scheduler cannot affect final_lr, this is a workaround to apply lr decay
223 | final_lr = group['final_lr'] * group['lr'] / base_lr
224 | lower_bound = final_lr * (1 - 1 / (group['gamma'] * state['step'] + 1))
225 | upper_bound = final_lr * (1 + 1 / (group['gamma'] * state['step']))
226 | step_size = torch.full_like(denom, step_size)
227 | step_size.div_(denom).clamp_(lower_bound, upper_bound).mul_(exp_avg)
228 |
229 | if group['weight_decay'] != 0:
230 | decayed_weights = torch.mul(p.data, group['weight_decay'])
231 | p.data.add_(-step_size)
232 | p.data.sub_(decayed_weights)
233 | else:
234 | p.data.add_(-step_size)
235 |
236 | return loss
237 |
--------------------------------------------------------------------------------
/cfg/yolov3-1cls.cfg:
--------------------------------------------------------------------------------
1 | [net]
2 | # Testing
3 | #batch=1
4 | #subdivisions=1
5 | # Training
6 | batch=16
7 | subdivisions=1
8 | width=416
9 | height=416
10 | channels=3
11 | momentum=0.9
12 | decay=0.0005
13 | angle=0
14 | saturation = 1.5
15 | exposure = 1.5
16 | hue=.1
17 |
18 | learning_rate=0.001
19 | burn_in=1000
20 | max_batches = 500200
21 | policy=steps
22 | steps=400000,450000
23 | scales=.1,.1
24 |
25 | [convolutional]
26 | batch_normalize=1
27 | filters=32
28 | size=3
29 | stride=1
30 | pad=1
31 | activation=leaky
32 |
33 | # Downsample
34 |
35 | [convolutional]
36 | batch_normalize=1
37 | filters=64
38 | size=3
39 | stride=2
40 | pad=1
41 | activation=leaky
42 |
43 | [convolutional]
44 | batch_normalize=1
45 | filters=32
46 | size=1
47 | stride=1
48 | pad=1
49 | activation=leaky
50 |
51 | [convolutional]
52 | batch_normalize=1
53 | filters=64
54 | size=3
55 | stride=1
56 | pad=1
57 | activation=leaky
58 |
59 | [shortcut]
60 | from=-3
61 | activation=linear
62 |
63 | # Downsample
64 |
65 | [convolutional]
66 | batch_normalize=1
67 | filters=128
68 | size=3
69 | stride=2
70 | pad=1
71 | activation=leaky
72 |
73 | [convolutional]
74 | batch_normalize=1
75 | filters=64
76 | size=1
77 | stride=1
78 | pad=1
79 | activation=leaky
80 |
81 | [convolutional]
82 | batch_normalize=1
83 | filters=128
84 | size=3
85 | stride=1
86 | pad=1
87 | activation=leaky
88 |
89 | [shortcut]
90 | from=-3
91 | activation=linear
92 |
93 | [convolutional]
94 | batch_normalize=1
95 | filters=64
96 | size=1
97 | stride=1
98 | pad=1
99 | activation=leaky
100 |
101 | [convolutional]
102 | batch_normalize=1
103 | filters=128
104 | size=3
105 | stride=1
106 | pad=1
107 | activation=leaky
108 |
109 | [shortcut]
110 | from=-3
111 | activation=linear
112 |
113 | # Downsample
114 |
115 | [convolutional]
116 | batch_normalize=1
117 | filters=256
118 | size=3
119 | stride=2
120 | pad=1
121 | activation=leaky
122 |
123 | [convolutional]
124 | batch_normalize=1
125 | filters=128
126 | size=1
127 | stride=1
128 | pad=1
129 | activation=leaky
130 |
131 | [convolutional]
132 | batch_normalize=1
133 | filters=256
134 | size=3
135 | stride=1
136 | pad=1
137 | activation=leaky
138 |
139 | [shortcut]
140 | from=-3
141 | activation=linear
142 |
143 | [convolutional]
144 | batch_normalize=1
145 | filters=128
146 | size=1
147 | stride=1
148 | pad=1
149 | activation=leaky
150 |
151 | [convolutional]
152 | batch_normalize=1
153 | filters=256
154 | size=3
155 | stride=1
156 | pad=1
157 | activation=leaky
158 |
159 | [shortcut]
160 | from=-3
161 | activation=linear
162 |
163 | [convolutional]
164 | batch_normalize=1
165 | filters=128
166 | size=1
167 | stride=1
168 | pad=1
169 | activation=leaky
170 |
171 | [convolutional]
172 | batch_normalize=1
173 | filters=256
174 | size=3
175 | stride=1
176 | pad=1
177 | activation=leaky
178 |
179 | [shortcut]
180 | from=-3
181 | activation=linear
182 |
183 | [convolutional]
184 | batch_normalize=1
185 | filters=128
186 | size=1
187 | stride=1
188 | pad=1
189 | activation=leaky
190 |
191 | [convolutional]
192 | batch_normalize=1
193 | filters=256
194 | size=3
195 | stride=1
196 | pad=1
197 | activation=leaky
198 |
199 | [shortcut]
200 | from=-3
201 | activation=linear
202 |
203 |
204 | [convolutional]
205 | batch_normalize=1
206 | filters=128
207 | size=1
208 | stride=1
209 | pad=1
210 | activation=leaky
211 |
212 | [convolutional]
213 | batch_normalize=1
214 | filters=256
215 | size=3
216 | stride=1
217 | pad=1
218 | activation=leaky
219 |
220 | [shortcut]
221 | from=-3
222 | activation=linear
223 |
224 | [convolutional]
225 | batch_normalize=1
226 | filters=128
227 | size=1
228 | stride=1
229 | pad=1
230 | activation=leaky
231 |
232 | [convolutional]
233 | batch_normalize=1
234 | filters=256
235 | size=3
236 | stride=1
237 | pad=1
238 | activation=leaky
239 |
240 | [shortcut]
241 | from=-3
242 | activation=linear
243 |
244 | [convolutional]
245 | batch_normalize=1
246 | filters=128
247 | size=1
248 | stride=1
249 | pad=1
250 | activation=leaky
251 |
252 | [convolutional]
253 | batch_normalize=1
254 | filters=256
255 | size=3
256 | stride=1
257 | pad=1
258 | activation=leaky
259 |
260 | [shortcut]
261 | from=-3
262 | activation=linear
263 |
264 | [convolutional]
265 | batch_normalize=1
266 | filters=128
267 | size=1
268 | stride=1
269 | pad=1
270 | activation=leaky
271 |
272 | [convolutional]
273 | batch_normalize=1
274 | filters=256
275 | size=3
276 | stride=1
277 | pad=1
278 | activation=leaky
279 |
280 | [shortcut]
281 | from=-3
282 | activation=linear
283 |
284 | # Downsample
285 |
286 | [convolutional]
287 | batch_normalize=1
288 | filters=512
289 | size=3
290 | stride=2
291 | pad=1
292 | activation=leaky
293 |
294 | [convolutional]
295 | batch_normalize=1
296 | filters=256
297 | size=1
298 | stride=1
299 | pad=1
300 | activation=leaky
301 |
302 | [convolutional]
303 | batch_normalize=1
304 | filters=512
305 | size=3
306 | stride=1
307 | pad=1
308 | activation=leaky
309 |
310 | [shortcut]
311 | from=-3
312 | activation=linear
313 |
314 |
315 | [convolutional]
316 | batch_normalize=1
317 | filters=256
318 | size=1
319 | stride=1
320 | pad=1
321 | activation=leaky
322 |
323 | [convolutional]
324 | batch_normalize=1
325 | filters=512
326 | size=3
327 | stride=1
328 | pad=1
329 | activation=leaky
330 |
331 | [shortcut]
332 | from=-3
333 | activation=linear
334 |
335 |
336 | [convolutional]
337 | batch_normalize=1
338 | filters=256
339 | size=1
340 | stride=1
341 | pad=1
342 | activation=leaky
343 |
344 | [convolutional]
345 | batch_normalize=1
346 | filters=512
347 | size=3
348 | stride=1
349 | pad=1
350 | activation=leaky
351 |
352 | [shortcut]
353 | from=-3
354 | activation=linear
355 |
356 |
357 | [convolutional]
358 | batch_normalize=1
359 | filters=256
360 | size=1
361 | stride=1
362 | pad=1
363 | activation=leaky
364 |
365 | [convolutional]
366 | batch_normalize=1
367 | filters=512
368 | size=3
369 | stride=1
370 | pad=1
371 | activation=leaky
372 |
373 | [shortcut]
374 | from=-3
375 | activation=linear
376 |
377 | [convolutional]
378 | batch_normalize=1
379 | filters=256
380 | size=1
381 | stride=1
382 | pad=1
383 | activation=leaky
384 |
385 | [convolutional]
386 | batch_normalize=1
387 | filters=512
388 | size=3
389 | stride=1
390 | pad=1
391 | activation=leaky
392 |
393 | [shortcut]
394 | from=-3
395 | activation=linear
396 |
397 |
398 | [convolutional]
399 | batch_normalize=1
400 | filters=256
401 | size=1
402 | stride=1
403 | pad=1
404 | activation=leaky
405 |
406 | [convolutional]
407 | batch_normalize=1
408 | filters=512
409 | size=3
410 | stride=1
411 | pad=1
412 | activation=leaky
413 |
414 | [shortcut]
415 | from=-3
416 | activation=linear
417 |
418 |
419 | [convolutional]
420 | batch_normalize=1
421 | filters=256
422 | size=1
423 | stride=1
424 | pad=1
425 | activation=leaky
426 |
427 | [convolutional]
428 | batch_normalize=1
429 | filters=512
430 | size=3
431 | stride=1
432 | pad=1
433 | activation=leaky
434 |
435 | [shortcut]
436 | from=-3
437 | activation=linear
438 |
439 | [convolutional]
440 | batch_normalize=1
441 | filters=256
442 | size=1
443 | stride=1
444 | pad=1
445 | activation=leaky
446 |
447 | [convolutional]
448 | batch_normalize=1
449 | filters=512
450 | size=3
451 | stride=1
452 | pad=1
453 | activation=leaky
454 |
455 | [shortcut]
456 | from=-3
457 | activation=linear
458 |
459 | # Downsample
460 |
461 | [convolutional]
462 | batch_normalize=1
463 | filters=1024
464 | size=3
465 | stride=2
466 | pad=1
467 | activation=leaky
468 |
469 | [convolutional]
470 | batch_normalize=1
471 | filters=512
472 | size=1
473 | stride=1
474 | pad=1
475 | activation=leaky
476 |
477 | [convolutional]
478 | batch_normalize=1
479 | filters=1024
480 | size=3
481 | stride=1
482 | pad=1
483 | activation=leaky
484 |
485 | [shortcut]
486 | from=-3
487 | activation=linear
488 |
489 | [convolutional]
490 | batch_normalize=1
491 | filters=512
492 | size=1
493 | stride=1
494 | pad=1
495 | activation=leaky
496 |
497 | [convolutional]
498 | batch_normalize=1
499 | filters=1024
500 | size=3
501 | stride=1
502 | pad=1
503 | activation=leaky
504 |
505 | [shortcut]
506 | from=-3
507 | activation=linear
508 |
509 | [convolutional]
510 | batch_normalize=1
511 | filters=512
512 | size=1
513 | stride=1
514 | pad=1
515 | activation=leaky
516 |
517 | [convolutional]
518 | batch_normalize=1
519 | filters=1024
520 | size=3
521 | stride=1
522 | pad=1
523 | activation=leaky
524 |
525 | [shortcut]
526 | from=-3
527 | activation=linear
528 |
529 | [convolutional]
530 | batch_normalize=1
531 | filters=512
532 | size=1
533 | stride=1
534 | pad=1
535 | activation=leaky
536 |
537 | [convolutional]
538 | batch_normalize=1
539 | filters=1024
540 | size=3
541 | stride=1
542 | pad=1
543 | activation=leaky
544 |
545 | [shortcut]
546 | from=-3
547 | activation=linear
548 |
549 | ######################
550 |
551 | [convolutional]
552 | batch_normalize=1
553 | filters=512
554 | size=1
555 | stride=1
556 | pad=1
557 | activation=leaky
558 |
559 | [convolutional]
560 | batch_normalize=1
561 | size=3
562 | stride=1
563 | pad=1
564 | filters=1024
565 | activation=leaky
566 |
567 | [convolutional]
568 | batch_normalize=1
569 | filters=512
570 | size=1
571 | stride=1
572 | pad=1
573 | activation=leaky
574 |
575 | [convolutional]
576 | batch_normalize=1
577 | size=3
578 | stride=1
579 | pad=1
580 | filters=1024
581 | activation=leaky
582 |
583 | [convolutional]
584 | batch_normalize=1
585 | filters=512
586 | size=1
587 | stride=1
588 | pad=1
589 | activation=leaky
590 |
591 | [convolutional]
592 | batch_normalize=1
593 | size=3
594 | stride=1
595 | pad=1
596 | filters=1024
597 | activation=leaky
598 |
599 | [convolutional]
600 | size=1
601 | stride=1
602 | pad=1
603 | filters=18
604 | activation=linear
605 |
606 |
607 | [yolo]
608 | mask = 6,7,8
609 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
610 | classes=1
611 | num=9
612 | jitter=.3
613 | ignore_thresh = .7
614 | truth_thresh = 1
615 | random=1
616 |
617 |
618 | [route]
619 | layers = -4
620 |
621 | [convolutional]
622 | batch_normalize=1
623 | filters=256
624 | size=1
625 | stride=1
626 | pad=1
627 | activation=leaky
628 |
629 | [upsample]
630 | stride=2
631 |
632 | [route]
633 | layers = -1, 61
634 |
635 |
636 |
637 | [convolutional]
638 | batch_normalize=1
639 | filters=256
640 | size=1
641 | stride=1
642 | pad=1
643 | activation=leaky
644 |
645 | [convolutional]
646 | batch_normalize=1
647 | size=3
648 | stride=1
649 | pad=1
650 | filters=512
651 | activation=leaky
652 |
653 | [convolutional]
654 | batch_normalize=1
655 | filters=256
656 | size=1
657 | stride=1
658 | pad=1
659 | activation=leaky
660 |
661 | [convolutional]
662 | batch_normalize=1
663 | size=3
664 | stride=1
665 | pad=1
666 | filters=512
667 | activation=leaky
668 |
669 | [convolutional]
670 | batch_normalize=1
671 | filters=256
672 | size=1
673 | stride=1
674 | pad=1
675 | activation=leaky
676 |
677 | [convolutional]
678 | batch_normalize=1
679 | size=3
680 | stride=1
681 | pad=1
682 | filters=512
683 | activation=leaky
684 |
685 | [convolutional]
686 | size=1
687 | stride=1
688 | pad=1
689 | filters=18
690 | activation=linear
691 |
692 |
693 | [yolo]
694 | mask = 3,4,5
695 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
696 | classes=1
697 | num=9
698 | jitter=.3
699 | ignore_thresh = .7
700 | truth_thresh = 1
701 | random=1
702 |
703 |
704 |
705 | [route]
706 | layers = -4
707 |
708 | [convolutional]
709 | batch_normalize=1
710 | filters=128
711 | size=1
712 | stride=1
713 | pad=1
714 | activation=leaky
715 |
716 | [upsample]
717 | stride=2
718 |
719 | [route]
720 | layers = -1, 36
721 |
722 |
723 |
724 | [convolutional]
725 | batch_normalize=1
726 | filters=128
727 | size=1
728 | stride=1
729 | pad=1
730 | activation=leaky
731 |
732 | [convolutional]
733 | batch_normalize=1
734 | size=3
735 | stride=1
736 | pad=1
737 | filters=256
738 | activation=leaky
739 |
740 | [convolutional]
741 | batch_normalize=1
742 | filters=128
743 | size=1
744 | stride=1
745 | pad=1
746 | activation=leaky
747 |
748 | [convolutional]
749 | batch_normalize=1
750 | size=3
751 | stride=1
752 | pad=1
753 | filters=256
754 | activation=leaky
755 |
756 | [convolutional]
757 | batch_normalize=1
758 | filters=128
759 | size=1
760 | stride=1
761 | pad=1
762 | activation=leaky
763 |
764 | [convolutional]
765 | batch_normalize=1
766 | size=3
767 | stride=1
768 | pad=1
769 | filters=256
770 | activation=leaky
771 |
772 | [convolutional]
773 | size=1
774 | stride=1
775 | pad=1
776 | filters=18
777 | activation=linear
778 |
779 |
780 | [yolo]
781 | mask = 0,1,2
782 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
783 | classes=1
784 | num=9
785 | jitter=.3
786 | ignore_thresh = .7
787 | truth_thresh = 1
788 | random=1
789 |
--------------------------------------------------------------------------------
/cfg/yolov3.cfg:
--------------------------------------------------------------------------------
1 | [net]
2 | # Testing
3 | #batch=1
4 | #subdivisions=1
5 | # Training
6 | batch=16
7 | subdivisions=1
8 | width=416
9 | height=416
10 | channels=3
11 | momentum=0.9
12 | decay=0.0005
13 | angle=0
14 | saturation = 1.5
15 | exposure = 1.5
16 | hue=.1
17 |
18 | learning_rate=0.001
19 | burn_in=1000
20 | max_batches = 500200
21 | policy=steps
22 | steps=400000,450000
23 | scales=.1,.1
24 |
25 | [convolutional]
26 | batch_normalize=1
27 | filters=32
28 | size=3
29 | stride=1
30 | pad=1
31 | activation=leaky
32 |
33 | # Downsample
34 |
35 | [convolutional]
36 | batch_normalize=1
37 | filters=64
38 | size=3
39 | stride=2
40 | pad=1
41 | activation=leaky
42 |
43 | [convolutional]
44 | batch_normalize=1
45 | filters=32
46 | size=1
47 | stride=1
48 | pad=1
49 | activation=leaky
50 |
51 | [convolutional]
52 | batch_normalize=1
53 | filters=64
54 | size=3
55 | stride=1
56 | pad=1
57 | activation=leaky
58 |
59 | [shortcut]
60 | from=-3
61 | activation=linear
62 |
63 | # Downsample
64 |
65 | [convolutional]
66 | batch_normalize=1
67 | filters=128
68 | size=3
69 | stride=2
70 | pad=1
71 | activation=leaky
72 |
73 | [convolutional]
74 | batch_normalize=1
75 | filters=64
76 | size=1
77 | stride=1
78 | pad=1
79 | activation=leaky
80 |
81 | [convolutional]
82 | batch_normalize=1
83 | filters=128
84 | size=3
85 | stride=1
86 | pad=1
87 | activation=leaky
88 |
89 | [shortcut]
90 | from=-3
91 | activation=linear
92 |
93 | [convolutional]
94 | batch_normalize=1
95 | filters=64
96 | size=1
97 | stride=1
98 | pad=1
99 | activation=leaky
100 |
101 | [convolutional]
102 | batch_normalize=1
103 | filters=128
104 | size=3
105 | stride=1
106 | pad=1
107 | activation=leaky
108 |
109 | [shortcut]
110 | from=-3
111 | activation=linear
112 |
113 | # Downsample
114 |
115 | [convolutional]
116 | batch_normalize=1
117 | filters=256
118 | size=3
119 | stride=2
120 | pad=1
121 | activation=leaky
122 |
123 | [convolutional]
124 | batch_normalize=1
125 | filters=128
126 | size=1
127 | stride=1
128 | pad=1
129 | activation=leaky
130 |
131 | [convolutional]
132 | batch_normalize=1
133 | filters=256
134 | size=3
135 | stride=1
136 | pad=1
137 | activation=leaky
138 |
139 | [shortcut]
140 | from=-3
141 | activation=linear
142 |
143 | [convolutional]
144 | batch_normalize=1
145 | filters=128
146 | size=1
147 | stride=1
148 | pad=1
149 | activation=leaky
150 |
151 | [convolutional]
152 | batch_normalize=1
153 | filters=256
154 | size=3
155 | stride=1
156 | pad=1
157 | activation=leaky
158 |
159 | [shortcut]
160 | from=-3
161 | activation=linear
162 |
163 | [convolutional]
164 | batch_normalize=1
165 | filters=128
166 | size=1
167 | stride=1
168 | pad=1
169 | activation=leaky
170 |
171 | [convolutional]
172 | batch_normalize=1
173 | filters=256
174 | size=3
175 | stride=1
176 | pad=1
177 | activation=leaky
178 |
179 | [shortcut]
180 | from=-3
181 | activation=linear
182 |
183 | [convolutional]
184 | batch_normalize=1
185 | filters=128
186 | size=1
187 | stride=1
188 | pad=1
189 | activation=leaky
190 |
191 | [convolutional]
192 | batch_normalize=1
193 | filters=256
194 | size=3
195 | stride=1
196 | pad=1
197 | activation=leaky
198 |
199 | [shortcut]
200 | from=-3
201 | activation=linear
202 |
203 |
204 | [convolutional]
205 | batch_normalize=1
206 | filters=128
207 | size=1
208 | stride=1
209 | pad=1
210 | activation=leaky
211 |
212 | [convolutional]
213 | batch_normalize=1
214 | filters=256
215 | size=3
216 | stride=1
217 | pad=1
218 | activation=leaky
219 |
220 | [shortcut]
221 | from=-3
222 | activation=linear
223 |
224 | [convolutional]
225 | batch_normalize=1
226 | filters=128
227 | size=1
228 | stride=1
229 | pad=1
230 | activation=leaky
231 |
232 | [convolutional]
233 | batch_normalize=1
234 | filters=256
235 | size=3
236 | stride=1
237 | pad=1
238 | activation=leaky
239 |
240 | [shortcut]
241 | from=-3
242 | activation=linear
243 |
244 | [convolutional]
245 | batch_normalize=1
246 | filters=128
247 | size=1
248 | stride=1
249 | pad=1
250 | activation=leaky
251 |
252 | [convolutional]
253 | batch_normalize=1
254 | filters=256
255 | size=3
256 | stride=1
257 | pad=1
258 | activation=leaky
259 |
260 | [shortcut]
261 | from=-3
262 | activation=linear
263 |
264 | [convolutional]
265 | batch_normalize=1
266 | filters=128
267 | size=1
268 | stride=1
269 | pad=1
270 | activation=leaky
271 |
272 | [convolutional]
273 | batch_normalize=1
274 | filters=256
275 | size=3
276 | stride=1
277 | pad=1
278 | activation=leaky
279 |
280 | [shortcut]
281 | from=-3
282 | activation=linear
283 |
284 | # Downsample
285 |
286 | [convolutional]
287 | batch_normalize=1
288 | filters=512
289 | size=3
290 | stride=2
291 | pad=1
292 | activation=leaky
293 |
294 | [convolutional]
295 | batch_normalize=1
296 | filters=256
297 | size=1
298 | stride=1
299 | pad=1
300 | activation=leaky
301 |
302 | [convolutional]
303 | batch_normalize=1
304 | filters=512
305 | size=3
306 | stride=1
307 | pad=1
308 | activation=leaky
309 |
310 | [shortcut]
311 | from=-3
312 | activation=linear
313 |
314 |
315 | [convolutional]
316 | batch_normalize=1
317 | filters=256
318 | size=1
319 | stride=1
320 | pad=1
321 | activation=leaky
322 |
323 | [convolutional]
324 | batch_normalize=1
325 | filters=512
326 | size=3
327 | stride=1
328 | pad=1
329 | activation=leaky
330 |
331 | [shortcut]
332 | from=-3
333 | activation=linear
334 |
335 |
336 | [convolutional]
337 | batch_normalize=1
338 | filters=256
339 | size=1
340 | stride=1
341 | pad=1
342 | activation=leaky
343 |
344 | [convolutional]
345 | batch_normalize=1
346 | filters=512
347 | size=3
348 | stride=1
349 | pad=1
350 | activation=leaky
351 |
352 | [shortcut]
353 | from=-3
354 | activation=linear
355 |
356 |
357 | [convolutional]
358 | batch_normalize=1
359 | filters=256
360 | size=1
361 | stride=1
362 | pad=1
363 | activation=leaky
364 |
365 | [convolutional]
366 | batch_normalize=1
367 | filters=512
368 | size=3
369 | stride=1
370 | pad=1
371 | activation=leaky
372 |
373 | [shortcut]
374 | from=-3
375 | activation=linear
376 |
377 | [convolutional]
378 | batch_normalize=1
379 | filters=256
380 | size=1
381 | stride=1
382 | pad=1
383 | activation=leaky
384 |
385 | [convolutional]
386 | batch_normalize=1
387 | filters=512
388 | size=3
389 | stride=1
390 | pad=1
391 | activation=leaky
392 |
393 | [shortcut]
394 | from=-3
395 | activation=linear
396 |
397 |
398 | [convolutional]
399 | batch_normalize=1
400 | filters=256
401 | size=1
402 | stride=1
403 | pad=1
404 | activation=leaky
405 |
406 | [convolutional]
407 | batch_normalize=1
408 | filters=512
409 | size=3
410 | stride=1
411 | pad=1
412 | activation=leaky
413 |
414 | [shortcut]
415 | from=-3
416 | activation=linear
417 |
418 |
419 | [convolutional]
420 | batch_normalize=1
421 | filters=256
422 | size=1
423 | stride=1
424 | pad=1
425 | activation=leaky
426 |
427 | [convolutional]
428 | batch_normalize=1
429 | filters=512
430 | size=3
431 | stride=1
432 | pad=1
433 | activation=leaky
434 |
435 | [shortcut]
436 | from=-3
437 | activation=linear
438 |
439 | [convolutional]
440 | batch_normalize=1
441 | filters=256
442 | size=1
443 | stride=1
444 | pad=1
445 | activation=leaky
446 |
447 | [convolutional]
448 | batch_normalize=1
449 | filters=512
450 | size=3
451 | stride=1
452 | pad=1
453 | activation=leaky
454 |
455 | [shortcut]
456 | from=-3
457 | activation=linear
458 |
459 | # Downsample
460 |
461 | [convolutional]
462 | batch_normalize=1
463 | filters=1024
464 | size=3
465 | stride=2
466 | pad=1
467 | activation=leaky
468 |
469 | [convolutional]
470 | batch_normalize=1
471 | filters=512
472 | size=1
473 | stride=1
474 | pad=1
475 | activation=leaky
476 |
477 | [convolutional]
478 | batch_normalize=1
479 | filters=1024
480 | size=3
481 | stride=1
482 | pad=1
483 | activation=leaky
484 |
485 | [shortcut]
486 | from=-3
487 | activation=linear
488 |
489 | [convolutional]
490 | batch_normalize=1
491 | filters=512
492 | size=1
493 | stride=1
494 | pad=1
495 | activation=leaky
496 |
497 | [convolutional]
498 | batch_normalize=1
499 | filters=1024
500 | size=3
501 | stride=1
502 | pad=1
503 | activation=leaky
504 |
505 | [shortcut]
506 | from=-3
507 | activation=linear
508 |
509 | [convolutional]
510 | batch_normalize=1
511 | filters=512
512 | size=1
513 | stride=1
514 | pad=1
515 | activation=leaky
516 |
517 | [convolutional]
518 | batch_normalize=1
519 | filters=1024
520 | size=3
521 | stride=1
522 | pad=1
523 | activation=leaky
524 |
525 | [shortcut]
526 | from=-3
527 | activation=linear
528 |
529 | [convolutional]
530 | batch_normalize=1
531 | filters=512
532 | size=1
533 | stride=1
534 | pad=1
535 | activation=leaky
536 |
537 | [convolutional]
538 | batch_normalize=1
539 | filters=1024
540 | size=3
541 | stride=1
542 | pad=1
543 | activation=leaky
544 |
545 | [shortcut]
546 | from=-3
547 | activation=linear
548 |
549 | ######################
550 |
551 | [convolutional]
552 | batch_normalize=1
553 | filters=512
554 | size=1
555 | stride=1
556 | pad=1
557 | activation=leaky
558 |
559 | [convolutional]
560 | batch_normalize=1
561 | size=3
562 | stride=1
563 | pad=1
564 | filters=1024
565 | activation=leaky
566 |
567 | [convolutional]
568 | batch_normalize=1
569 | filters=512
570 | size=1
571 | stride=1
572 | pad=1
573 | activation=leaky
574 |
575 | [convolutional]
576 | batch_normalize=1
577 | size=3
578 | stride=1
579 | pad=1
580 | filters=1024
581 | activation=leaky
582 |
583 | [convolutional]
584 | batch_normalize=1
585 | filters=512
586 | size=1
587 | stride=1
588 | pad=1
589 | activation=leaky
590 |
591 | [convolutional]
592 | batch_normalize=1
593 | size=3
594 | stride=1
595 | pad=1
596 | filters=1024
597 | activation=leaky
598 |
599 | [convolutional]
600 | size=1
601 | stride=1
602 | pad=1
603 | filters=255
604 | activation=linear
605 |
606 |
607 | [yolo]
608 | mask = 6,7,8
609 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
610 | classes=80
611 | num=9
612 | jitter=.3
613 | ignore_thresh = .7
614 | truth_thresh = 1
615 | random=1
616 |
617 |
618 | [route]
619 | layers = -4
620 |
621 | [convolutional]
622 | batch_normalize=1
623 | filters=256
624 | size=1
625 | stride=1
626 | pad=1
627 | activation=leaky
628 |
629 | [upsample]
630 | stride=2
631 |
632 | [route]
633 | layers = -1, 61
634 |
635 |
636 |
637 | [convolutional]
638 | batch_normalize=1
639 | filters=256
640 | size=1
641 | stride=1
642 | pad=1
643 | activation=leaky
644 |
645 | [convolutional]
646 | batch_normalize=1
647 | size=3
648 | stride=1
649 | pad=1
650 | filters=512
651 | activation=leaky
652 |
653 | [convolutional]
654 | batch_normalize=1
655 | filters=256
656 | size=1
657 | stride=1
658 | pad=1
659 | activation=leaky
660 |
661 | [convolutional]
662 | batch_normalize=1
663 | size=3
664 | stride=1
665 | pad=1
666 | filters=512
667 | activation=leaky
668 |
669 | [convolutional]
670 | batch_normalize=1
671 | filters=256
672 | size=1
673 | stride=1
674 | pad=1
675 | activation=leaky
676 |
677 | [convolutional]
678 | batch_normalize=1
679 | size=3
680 | stride=1
681 | pad=1
682 | filters=512
683 | activation=leaky
684 |
685 | [convolutional]
686 | size=1
687 | stride=1
688 | pad=1
689 | filters=255
690 | activation=linear
691 |
692 |
693 | [yolo]
694 | mask = 3,4,5
695 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
696 | classes=80
697 | num=9
698 | jitter=.3
699 | ignore_thresh = .7
700 | truth_thresh = 1
701 | random=1
702 |
703 |
704 |
705 | [route]
706 | layers = -4
707 |
708 | [convolutional]
709 | batch_normalize=1
710 | filters=128
711 | size=1
712 | stride=1
713 | pad=1
714 | activation=leaky
715 |
716 | [upsample]
717 | stride=2
718 |
719 | [route]
720 | layers = -1, 36
721 |
722 |
723 |
724 | [convolutional]
725 | batch_normalize=1
726 | filters=128
727 | size=1
728 | stride=1
729 | pad=1
730 | activation=leaky
731 |
732 | [convolutional]
733 | batch_normalize=1
734 | size=3
735 | stride=1
736 | pad=1
737 | filters=256
738 | activation=leaky
739 |
740 | [convolutional]
741 | batch_normalize=1
742 | filters=128
743 | size=1
744 | stride=1
745 | pad=1
746 | activation=leaky
747 |
748 | [convolutional]
749 | batch_normalize=1
750 | size=3
751 | stride=1
752 | pad=1
753 | filters=256
754 | activation=leaky
755 |
756 | [convolutional]
757 | batch_normalize=1
758 | filters=128
759 | size=1
760 | stride=1
761 | pad=1
762 | activation=leaky
763 |
764 | [convolutional]
765 | batch_normalize=1
766 | size=3
767 | stride=1
768 | pad=1
769 | filters=256
770 | activation=leaky
771 |
772 | [convolutional]
773 | size=1
774 | stride=1
775 | pad=1
776 | filters=255
777 | activation=linear
778 |
779 |
780 | [yolo]
781 | mask = 0,1,2
782 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
783 | classes=80
784 | num=9
785 | jitter=.3
786 | ignore_thresh = .7
787 | truth_thresh = 1
788 | random=1
789 |
--------------------------------------------------------------------------------
/cfg/yolov3-spp-1cls.cfg:
--------------------------------------------------------------------------------
1 | [net]
2 | # Testing
3 | # batch=1
4 | # subdivisions=1
5 | # Training
6 | batch=64
7 | subdivisions=16
8 | width=608
9 | height=608
10 | channels=3
11 | momentum=0.9
12 | decay=0.0005
13 | angle=0
14 | saturation = 1.5
15 | exposure = 1.5
16 | hue=.1
17 |
18 | learning_rate=0.001
19 | burn_in=100
20 | max_batches = 5000
21 | policy=steps
22 | steps=4000,4500
23 | scales=.1,.1
24 |
25 | [convolutional]
26 | batch_normalize=1
27 | filters=32
28 | size=3
29 | stride=1
30 | pad=1
31 | activation=leaky
32 |
33 | # Downsample
34 |
35 | [convolutional]
36 | batch_normalize=1
37 | filters=64
38 | size=3
39 | stride=2
40 | pad=1
41 | activation=leaky
42 |
43 | [convolutional]
44 | batch_normalize=1
45 | filters=32
46 | size=1
47 | stride=1
48 | pad=1
49 | activation=leaky
50 |
51 | [convolutional]
52 | batch_normalize=1
53 | filters=64
54 | size=3
55 | stride=1
56 | pad=1
57 | activation=leaky
58 |
59 | [shortcut]
60 | from=-3
61 | activation=linear
62 |
63 | # Downsample
64 |
65 | [convolutional]
66 | batch_normalize=1
67 | filters=128
68 | size=3
69 | stride=2
70 | pad=1
71 | activation=leaky
72 |
73 | [convolutional]
74 | batch_normalize=1
75 | filters=64
76 | size=1
77 | stride=1
78 | pad=1
79 | activation=leaky
80 |
81 | [convolutional]
82 | batch_normalize=1
83 | filters=128
84 | size=3
85 | stride=1
86 | pad=1
87 | activation=leaky
88 |
89 | [shortcut]
90 | from=-3
91 | activation=linear
92 |
93 | [convolutional]
94 | batch_normalize=1
95 | filters=64
96 | size=1
97 | stride=1
98 | pad=1
99 | activation=leaky
100 |
101 | [convolutional]
102 | batch_normalize=1
103 | filters=128
104 | size=3
105 | stride=1
106 | pad=1
107 | activation=leaky
108 |
109 | [shortcut]
110 | from=-3
111 | activation=linear
112 |
113 | # Downsample
114 |
115 | [convolutional]
116 | batch_normalize=1
117 | filters=256
118 | size=3
119 | stride=2
120 | pad=1
121 | activation=leaky
122 |
123 | [convolutional]
124 | batch_normalize=1
125 | filters=128
126 | size=1
127 | stride=1
128 | pad=1
129 | activation=leaky
130 |
131 | [convolutional]
132 | batch_normalize=1
133 | filters=256
134 | size=3
135 | stride=1
136 | pad=1
137 | activation=leaky
138 |
139 | [shortcut]
140 | from=-3
141 | activation=linear
142 |
143 | [convolutional]
144 | batch_normalize=1
145 | filters=128
146 | size=1
147 | stride=1
148 | pad=1
149 | activation=leaky
150 |
151 | [convolutional]
152 | batch_normalize=1
153 | filters=256
154 | size=3
155 | stride=1
156 | pad=1
157 | activation=leaky
158 |
159 | [shortcut]
160 | from=-3
161 | activation=linear
162 |
163 | [convolutional]
164 | batch_normalize=1
165 | filters=128
166 | size=1
167 | stride=1
168 | pad=1
169 | activation=leaky
170 |
171 | [convolutional]
172 | batch_normalize=1
173 | filters=256
174 | size=3
175 | stride=1
176 | pad=1
177 | activation=leaky
178 |
179 | [shortcut]
180 | from=-3
181 | activation=linear
182 |
183 | [convolutional]
184 | batch_normalize=1
185 | filters=128
186 | size=1
187 | stride=1
188 | pad=1
189 | activation=leaky
190 |
191 | [convolutional]
192 | batch_normalize=1
193 | filters=256
194 | size=3
195 | stride=1
196 | pad=1
197 | activation=leaky
198 |
199 | [shortcut]
200 | from=-3
201 | activation=linear
202 |
203 |
204 | [convolutional]
205 | batch_normalize=1
206 | filters=128
207 | size=1
208 | stride=1
209 | pad=1
210 | activation=leaky
211 |
212 | [convolutional]
213 | batch_normalize=1
214 | filters=256
215 | size=3
216 | stride=1
217 | pad=1
218 | activation=leaky
219 |
220 | [shortcut]
221 | from=-3
222 | activation=linear
223 |
224 | [convolutional]
225 | batch_normalize=1
226 | filters=128
227 | size=1
228 | stride=1
229 | pad=1
230 | activation=leaky
231 |
232 | [convolutional]
233 | batch_normalize=1
234 | filters=256
235 | size=3
236 | stride=1
237 | pad=1
238 | activation=leaky
239 |
240 | [shortcut]
241 | from=-3
242 | activation=linear
243 |
244 | [convolutional]
245 | batch_normalize=1
246 | filters=128
247 | size=1
248 | stride=1
249 | pad=1
250 | activation=leaky
251 |
252 | [convolutional]
253 | batch_normalize=1
254 | filters=256
255 | size=3
256 | stride=1
257 | pad=1
258 | activation=leaky
259 |
260 | [shortcut]
261 | from=-3
262 | activation=linear
263 |
264 | [convolutional]
265 | batch_normalize=1
266 | filters=128
267 | size=1
268 | stride=1
269 | pad=1
270 | activation=leaky
271 |
272 | [convolutional]
273 | batch_normalize=1
274 | filters=256
275 | size=3
276 | stride=1
277 | pad=1
278 | activation=leaky
279 |
280 | [shortcut]
281 | from=-3
282 | activation=linear
283 |
284 | # Downsample
285 |
286 | [convolutional]
287 | batch_normalize=1
288 | filters=512
289 | size=3
290 | stride=2
291 | pad=1
292 | activation=leaky
293 |
294 | [convolutional]
295 | batch_normalize=1
296 | filters=256
297 | size=1
298 | stride=1
299 | pad=1
300 | activation=leaky
301 |
302 | [convolutional]
303 | batch_normalize=1
304 | filters=512
305 | size=3
306 | stride=1
307 | pad=1
308 | activation=leaky
309 |
310 | [shortcut]
311 | from=-3
312 | activation=linear
313 |
314 |
315 | [convolutional]
316 | batch_normalize=1
317 | filters=256
318 | size=1
319 | stride=1
320 | pad=1
321 | activation=leaky
322 |
323 | [convolutional]
324 | batch_normalize=1
325 | filters=512
326 | size=3
327 | stride=1
328 | pad=1
329 | activation=leaky
330 |
331 | [shortcut]
332 | from=-3
333 | activation=linear
334 |
335 |
336 | [convolutional]
337 | batch_normalize=1
338 | filters=256
339 | size=1
340 | stride=1
341 | pad=1
342 | activation=leaky
343 |
344 | [convolutional]
345 | batch_normalize=1
346 | filters=512
347 | size=3
348 | stride=1
349 | pad=1
350 | activation=leaky
351 |
352 | [shortcut]
353 | from=-3
354 | activation=linear
355 |
356 |
357 | [convolutional]
358 | batch_normalize=1
359 | filters=256
360 | size=1
361 | stride=1
362 | pad=1
363 | activation=leaky
364 |
365 | [convolutional]
366 | batch_normalize=1
367 | filters=512
368 | size=3
369 | stride=1
370 | pad=1
371 | activation=leaky
372 |
373 | [shortcut]
374 | from=-3
375 | activation=linear
376 |
377 | [convolutional]
378 | batch_normalize=1
379 | filters=256
380 | size=1
381 | stride=1
382 | pad=1
383 | activation=leaky
384 |
385 | [convolutional]
386 | batch_normalize=1
387 | filters=512
388 | size=3
389 | stride=1
390 | pad=1
391 | activation=leaky
392 |
393 | [shortcut]
394 | from=-3
395 | activation=linear
396 |
397 |
398 | [convolutional]
399 | batch_normalize=1
400 | filters=256
401 | size=1
402 | stride=1
403 | pad=1
404 | activation=leaky
405 |
406 | [convolutional]
407 | batch_normalize=1
408 | filters=512
409 | size=3
410 | stride=1
411 | pad=1
412 | activation=leaky
413 |
414 | [shortcut]
415 | from=-3
416 | activation=linear
417 |
418 |
419 | [convolutional]
420 | batch_normalize=1
421 | filters=256
422 | size=1
423 | stride=1
424 | pad=1
425 | activation=leaky
426 |
427 | [convolutional]
428 | batch_normalize=1
429 | filters=512
430 | size=3
431 | stride=1
432 | pad=1
433 | activation=leaky
434 |
435 | [shortcut]
436 | from=-3
437 | activation=linear
438 |
439 | [convolutional]
440 | batch_normalize=1
441 | filters=256
442 | size=1
443 | stride=1
444 | pad=1
445 | activation=leaky
446 |
447 | [convolutional]
448 | batch_normalize=1
449 | filters=512
450 | size=3
451 | stride=1
452 | pad=1
453 | activation=leaky
454 |
455 | [shortcut]
456 | from=-3
457 | activation=linear
458 |
459 | # Downsample
460 |
461 | [convolutional]
462 | batch_normalize=1
463 | filters=1024
464 | size=3
465 | stride=2
466 | pad=1
467 | activation=leaky
468 |
469 | [convolutional]
470 | batch_normalize=1
471 | filters=512
472 | size=1
473 | stride=1
474 | pad=1
475 | activation=leaky
476 |
477 | [convolutional]
478 | batch_normalize=1
479 | filters=1024
480 | size=3
481 | stride=1
482 | pad=1
483 | activation=leaky
484 |
485 | [shortcut]
486 | from=-3
487 | activation=linear
488 |
489 | [convolutional]
490 | batch_normalize=1
491 | filters=512
492 | size=1
493 | stride=1
494 | pad=1
495 | activation=leaky
496 |
497 | [convolutional]
498 | batch_normalize=1
499 | filters=1024
500 | size=3
501 | stride=1
502 | pad=1
503 | activation=leaky
504 |
505 | [shortcut]
506 | from=-3
507 | activation=linear
508 |
509 | [convolutional]
510 | batch_normalize=1
511 | filters=512
512 | size=1
513 | stride=1
514 | pad=1
515 | activation=leaky
516 |
517 | [convolutional]
518 | batch_normalize=1
519 | filters=1024
520 | size=3
521 | stride=1
522 | pad=1
523 | activation=leaky
524 |
525 | [shortcut]
526 | from=-3
527 | activation=linear
528 |
529 | [convolutional]
530 | batch_normalize=1
531 | filters=512
532 | size=1
533 | stride=1
534 | pad=1
535 | activation=leaky
536 |
537 | [convolutional]
538 | batch_normalize=1
539 | filters=1024
540 | size=3
541 | stride=1
542 | pad=1
543 | activation=leaky
544 |
545 | [shortcut]
546 | from=-3
547 | activation=linear
548 |
549 | ######################
550 |
551 | [convolutional]
552 | batch_normalize=1
553 | filters=512
554 | size=1
555 | stride=1
556 | pad=1
557 | activation=leaky
558 |
559 | [convolutional]
560 | batch_normalize=1
561 | size=3
562 | stride=1
563 | pad=1
564 | filters=1024
565 | activation=leaky
566 |
567 | [convolutional]
568 | batch_normalize=1
569 | filters=512
570 | size=1
571 | stride=1
572 | pad=1
573 | activation=leaky
574 |
575 | ### SPP ###
576 | [maxpool]
577 | stride=1
578 | size=5
579 |
580 | [route]
581 | layers=-2
582 |
583 | [maxpool]
584 | stride=1
585 | size=9
586 |
587 | [route]
588 | layers=-4
589 |
590 | [maxpool]
591 | stride=1
592 | size=13
593 |
594 | [route]
595 | layers=-1,-3,-5,-6
596 |
597 | ### End SPP ###
598 |
599 | [convolutional]
600 | batch_normalize=1
601 | filters=512
602 | size=1
603 | stride=1
604 | pad=1
605 | activation=leaky
606 |
607 |
608 | [convolutional]
609 | batch_normalize=1
610 | size=3
611 | stride=1
612 | pad=1
613 | filters=1024
614 | activation=leaky
615 |
616 | [convolutional]
617 | batch_normalize=1
618 | filters=512
619 | size=1
620 | stride=1
621 | pad=1
622 | activation=leaky
623 |
624 | [convolutional]
625 | batch_normalize=1
626 | size=3
627 | stride=1
628 | pad=1
629 | filters=1024
630 | activation=leaky
631 |
632 | [convolutional]
633 | size=1
634 | stride=1
635 | pad=1
636 | filters=18
637 | activation=linear
638 |
639 |
640 | [yolo]
641 | mask = 6,7,8
642 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
643 | classes=1
644 | num=9
645 | jitter=.3
646 | ignore_thresh = .7
647 | truth_thresh = 1
648 | random=1
649 |
650 |
651 | [route]
652 | layers = -4
653 |
654 | [convolutional]
655 | batch_normalize=1
656 | filters=256
657 | size=1
658 | stride=1
659 | pad=1
660 | activation=leaky
661 |
662 | [upsample]
663 | stride=2
664 |
665 | [route]
666 | layers = -1, 61
667 |
668 |
669 |
670 | [convolutional]
671 | batch_normalize=1
672 | filters=256
673 | size=1
674 | stride=1
675 | pad=1
676 | activation=leaky
677 |
678 | [convolutional]
679 | batch_normalize=1
680 | size=3
681 | stride=1
682 | pad=1
683 | filters=512
684 | activation=leaky
685 |
686 | [convolutional]
687 | batch_normalize=1
688 | filters=256
689 | size=1
690 | stride=1
691 | pad=1
692 | activation=leaky
693 |
694 | [convolutional]
695 | batch_normalize=1
696 | size=3
697 | stride=1
698 | pad=1
699 | filters=512
700 | activation=leaky
701 |
702 | [convolutional]
703 | batch_normalize=1
704 | filters=256
705 | size=1
706 | stride=1
707 | pad=1
708 | activation=leaky
709 |
710 | [convolutional]
711 | batch_normalize=1
712 | size=3
713 | stride=1
714 | pad=1
715 | filters=512
716 | activation=leaky
717 |
718 | [convolutional]
719 | size=1
720 | stride=1
721 | pad=1
722 | filters=18
723 | activation=linear
724 |
725 |
726 | [yolo]
727 | mask = 3,4,5
728 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
729 | classes=1
730 | num=9
731 | jitter=.3
732 | ignore_thresh = .7
733 | truth_thresh = 1
734 | random=1
735 |
736 |
737 |
738 | [route]
739 | layers = -4
740 |
741 | [convolutional]
742 | batch_normalize=1
743 | filters=128
744 | size=1
745 | stride=1
746 | pad=1
747 | activation=leaky
748 |
749 | [upsample]
750 | stride=2
751 |
752 | [route]
753 | layers = -1, 36
754 |
755 |
756 |
757 | [convolutional]
758 | batch_normalize=1
759 | filters=128
760 | size=1
761 | stride=1
762 | pad=1
763 | activation=leaky
764 |
765 | [convolutional]
766 | batch_normalize=1
767 | size=3
768 | stride=1
769 | pad=1
770 | filters=256
771 | activation=leaky
772 |
773 | [convolutional]
774 | batch_normalize=1
775 | filters=128
776 | size=1
777 | stride=1
778 | pad=1
779 | activation=leaky
780 |
781 | [convolutional]
782 | batch_normalize=1
783 | size=3
784 | stride=1
785 | pad=1
786 | filters=256
787 | activation=leaky
788 |
789 | [convolutional]
790 | batch_normalize=1
791 | filters=128
792 | size=1
793 | stride=1
794 | pad=1
795 | activation=leaky
796 |
797 | [convolutional]
798 | batch_normalize=1
799 | size=3
800 | stride=1
801 | pad=1
802 | filters=256
803 | activation=leaky
804 |
805 | [convolutional]
806 | size=1
807 | stride=1
808 | pad=1
809 | filters=18
810 | activation=linear
811 |
812 |
813 | [yolo]
814 | mask = 0,1,2
815 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
816 | classes=1
817 | num=9
818 | jitter=.3
819 | ignore_thresh = .7
820 | truth_thresh = 1
821 | random=1
822 |
--------------------------------------------------------------------------------
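The "### SPP ###" block in the cfg above applies three stride-1 max pools (5x5, 9x9, 13x13) to the same 512-channel feature map and routes them together with the original input, giving 4 * 512 = 2048 channels that the following 1x1 convolution reduces back to 512. A minimal PyTorch sketch of that structure, assuming the module name and the concatenation order are illustrative (the cfg's route lists the pooled outputs before the input):

import torch
import torch.nn as nn

class SPP(nn.Module):
    # Spatial pyramid pooling as expressed by the cfg: stride-1 max pools with
    # "same" padding over one input, concatenated with the input itself.
    def __init__(self, kernels=(5, 9, 13)):
        super().__init__()
        self.pools = nn.ModuleList(
            nn.MaxPool2d(kernel_size=k, stride=1, padding=k // 2) for k in kernels)

    def forward(self, x):
        return torch.cat([x] + [p(x) for p in self.pools], dim=1)
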
/cfg/yolov3-spp-3cls.cfg:
--------------------------------------------------------------------------------
1 | [net]
2 | # Testing
3 | # batch=1
4 | # subdivisions=1
5 | # Training
6 | batch=64
7 | subdivisions=16
8 | width=608
9 | height=608
10 | channels=3
11 | momentum=0.9
12 | decay=0.0005
13 | angle=0
14 | saturation = 1.5
15 | exposure = 1.5
16 | hue=.1
17 |
18 | learning_rate=0.001
19 | burn_in=100
20 | max_batches = 5000
21 | policy=steps
22 | steps=4000,4500
23 | scales=.1,.1
24 |
25 | [convolutional]
26 | batch_normalize=1
27 | filters=32
28 | size=3
29 | stride=1
30 | pad=1
31 | activation=leaky
32 |
33 | # Downsample
34 |
35 | [convolutional]
36 | batch_normalize=1
37 | filters=64
38 | size=3
39 | stride=2
40 | pad=1
41 | activation=leaky
42 |
43 | [convolutional]
44 | batch_normalize=1
45 | filters=32
46 | size=1
47 | stride=1
48 | pad=1
49 | activation=leaky
50 |
51 | [convolutional]
52 | batch_normalize=1
53 | filters=64
54 | size=3
55 | stride=1
56 | pad=1
57 | activation=leaky
58 |
59 | [shortcut]
60 | from=-3
61 | activation=linear
62 |
63 | # Downsample
64 |
65 | [convolutional]
66 | batch_normalize=1
67 | filters=128
68 | size=3
69 | stride=2
70 | pad=1
71 | activation=leaky
72 |
73 | [convolutional]
74 | batch_normalize=1
75 | filters=64
76 | size=1
77 | stride=1
78 | pad=1
79 | activation=leaky
80 |
81 | [convolutional]
82 | batch_normalize=1
83 | filters=128
84 | size=3
85 | stride=1
86 | pad=1
87 | activation=leaky
88 |
89 | [shortcut]
90 | from=-3
91 | activation=linear
92 |
93 | [convolutional]
94 | batch_normalize=1
95 | filters=64
96 | size=1
97 | stride=1
98 | pad=1
99 | activation=leaky
100 |
101 | [convolutional]
102 | batch_normalize=1
103 | filters=128
104 | size=3
105 | stride=1
106 | pad=1
107 | activation=leaky
108 |
109 | [shortcut]
110 | from=-3
111 | activation=linear
112 |
113 | # Downsample
114 |
115 | [convolutional]
116 | batch_normalize=1
117 | filters=256
118 | size=3
119 | stride=2
120 | pad=1
121 | activation=leaky
122 |
123 | [convolutional]
124 | batch_normalize=1
125 | filters=128
126 | size=1
127 | stride=1
128 | pad=1
129 | activation=leaky
130 |
131 | [convolutional]
132 | batch_normalize=1
133 | filters=256
134 | size=3
135 | stride=1
136 | pad=1
137 | activation=leaky
138 |
139 | [shortcut]
140 | from=-3
141 | activation=linear
142 |
143 | [convolutional]
144 | batch_normalize=1
145 | filters=128
146 | size=1
147 | stride=1
148 | pad=1
149 | activation=leaky
150 |
151 | [convolutional]
152 | batch_normalize=1
153 | filters=256
154 | size=3
155 | stride=1
156 | pad=1
157 | activation=leaky
158 |
159 | [shortcut]
160 | from=-3
161 | activation=linear
162 |
163 | [convolutional]
164 | batch_normalize=1
165 | filters=128
166 | size=1
167 | stride=1
168 | pad=1
169 | activation=leaky
170 |
171 | [convolutional]
172 | batch_normalize=1
173 | filters=256
174 | size=3
175 | stride=1
176 | pad=1
177 | activation=leaky
178 |
179 | [shortcut]
180 | from=-3
181 | activation=linear
182 |
183 | [convolutional]
184 | batch_normalize=1
185 | filters=128
186 | size=1
187 | stride=1
188 | pad=1
189 | activation=leaky
190 |
191 | [convolutional]
192 | batch_normalize=1
193 | filters=256
194 | size=3
195 | stride=1
196 | pad=1
197 | activation=leaky
198 |
199 | [shortcut]
200 | from=-3
201 | activation=linear
202 |
203 |
204 | [convolutional]
205 | batch_normalize=1
206 | filters=128
207 | size=1
208 | stride=1
209 | pad=1
210 | activation=leaky
211 |
212 | [convolutional]
213 | batch_normalize=1
214 | filters=256
215 | size=3
216 | stride=1
217 | pad=1
218 | activation=leaky
219 |
220 | [shortcut]
221 | from=-3
222 | activation=linear
223 |
224 | [convolutional]
225 | batch_normalize=1
226 | filters=128
227 | size=1
228 | stride=1
229 | pad=1
230 | activation=leaky
231 |
232 | [convolutional]
233 | batch_normalize=1
234 | filters=256
235 | size=3
236 | stride=1
237 | pad=1
238 | activation=leaky
239 |
240 | [shortcut]
241 | from=-3
242 | activation=linear
243 |
244 | [convolutional]
245 | batch_normalize=1
246 | filters=128
247 | size=1
248 | stride=1
249 | pad=1
250 | activation=leaky
251 |
252 | [convolutional]
253 | batch_normalize=1
254 | filters=256
255 | size=3
256 | stride=1
257 | pad=1
258 | activation=leaky
259 |
260 | [shortcut]
261 | from=-3
262 | activation=linear
263 |
264 | [convolutional]
265 | batch_normalize=1
266 | filters=128
267 | size=1
268 | stride=1
269 | pad=1
270 | activation=leaky
271 |
272 | [convolutional]
273 | batch_normalize=1
274 | filters=256
275 | size=3
276 | stride=1
277 | pad=1
278 | activation=leaky
279 |
280 | [shortcut]
281 | from=-3
282 | activation=linear
283 |
284 | # Downsample
285 |
286 | [convolutional]
287 | batch_normalize=1
288 | filters=512
289 | size=3
290 | stride=2
291 | pad=1
292 | activation=leaky
293 |
294 | [convolutional]
295 | batch_normalize=1
296 | filters=256
297 | size=1
298 | stride=1
299 | pad=1
300 | activation=leaky
301 |
302 | [convolutional]
303 | batch_normalize=1
304 | filters=512
305 | size=3
306 | stride=1
307 | pad=1
308 | activation=leaky
309 |
310 | [shortcut]
311 | from=-3
312 | activation=linear
313 |
314 |
315 | [convolutional]
316 | batch_normalize=1
317 | filters=256
318 | size=1
319 | stride=1
320 | pad=1
321 | activation=leaky
322 |
323 | [convolutional]
324 | batch_normalize=1
325 | filters=512
326 | size=3
327 | stride=1
328 | pad=1
329 | activation=leaky
330 |
331 | [shortcut]
332 | from=-3
333 | activation=linear
334 |
335 |
336 | [convolutional]
337 | batch_normalize=1
338 | filters=256
339 | size=1
340 | stride=1
341 | pad=1
342 | activation=leaky
343 |
344 | [convolutional]
345 | batch_normalize=1
346 | filters=512
347 | size=3
348 | stride=1
349 | pad=1
350 | activation=leaky
351 |
352 | [shortcut]
353 | from=-3
354 | activation=linear
355 |
356 |
357 | [convolutional]
358 | batch_normalize=1
359 | filters=256
360 | size=1
361 | stride=1
362 | pad=1
363 | activation=leaky
364 |
365 | [convolutional]
366 | batch_normalize=1
367 | filters=512
368 | size=3
369 | stride=1
370 | pad=1
371 | activation=leaky
372 |
373 | [shortcut]
374 | from=-3
375 | activation=linear
376 |
377 | [convolutional]
378 | batch_normalize=1
379 | filters=256
380 | size=1
381 | stride=1
382 | pad=1
383 | activation=leaky
384 |
385 | [convolutional]
386 | batch_normalize=1
387 | filters=512
388 | size=3
389 | stride=1
390 | pad=1
391 | activation=leaky
392 |
393 | [shortcut]
394 | from=-3
395 | activation=linear
396 |
397 |
398 | [convolutional]
399 | batch_normalize=1
400 | filters=256
401 | size=1
402 | stride=1
403 | pad=1
404 | activation=leaky
405 |
406 | [convolutional]
407 | batch_normalize=1
408 | filters=512
409 | size=3
410 | stride=1
411 | pad=1
412 | activation=leaky
413 |
414 | [shortcut]
415 | from=-3
416 | activation=linear
417 |
418 |
419 | [convolutional]
420 | batch_normalize=1
421 | filters=256
422 | size=1
423 | stride=1
424 | pad=1
425 | activation=leaky
426 |
427 | [convolutional]
428 | batch_normalize=1
429 | filters=512
430 | size=3
431 | stride=1
432 | pad=1
433 | activation=leaky
434 |
435 | [shortcut]
436 | from=-3
437 | activation=linear
438 |
439 | [convolutional]
440 | batch_normalize=1
441 | filters=256
442 | size=1
443 | stride=1
444 | pad=1
445 | activation=leaky
446 |
447 | [convolutional]
448 | batch_normalize=1
449 | filters=512
450 | size=3
451 | stride=1
452 | pad=1
453 | activation=leaky
454 |
455 | [shortcut]
456 | from=-3
457 | activation=linear
458 |
459 | # Downsample
460 |
461 | [convolutional]
462 | batch_normalize=1
463 | filters=1024
464 | size=3
465 | stride=2
466 | pad=1
467 | activation=leaky
468 |
469 | [convolutional]
470 | batch_normalize=1
471 | filters=512
472 | size=1
473 | stride=1
474 | pad=1
475 | activation=leaky
476 |
477 | [convolutional]
478 | batch_normalize=1
479 | filters=1024
480 | size=3
481 | stride=1
482 | pad=1
483 | activation=leaky
484 |
485 | [shortcut]
486 | from=-3
487 | activation=linear
488 |
489 | [convolutional]
490 | batch_normalize=1
491 | filters=512
492 | size=1
493 | stride=1
494 | pad=1
495 | activation=leaky
496 |
497 | [convolutional]
498 | batch_normalize=1
499 | filters=1024
500 | size=3
501 | stride=1
502 | pad=1
503 | activation=leaky
504 |
505 | [shortcut]
506 | from=-3
507 | activation=linear
508 |
509 | [convolutional]
510 | batch_normalize=1
511 | filters=512
512 | size=1
513 | stride=1
514 | pad=1
515 | activation=leaky
516 |
517 | [convolutional]
518 | batch_normalize=1
519 | filters=1024
520 | size=3
521 | stride=1
522 | pad=1
523 | activation=leaky
524 |
525 | [shortcut]
526 | from=-3
527 | activation=linear
528 |
529 | [convolutional]
530 | batch_normalize=1
531 | filters=512
532 | size=1
533 | stride=1
534 | pad=1
535 | activation=leaky
536 |
537 | [convolutional]
538 | batch_normalize=1
539 | filters=1024
540 | size=3
541 | stride=1
542 | pad=1
543 | activation=leaky
544 |
545 | [shortcut]
546 | from=-3
547 | activation=linear
548 |
549 | ######################
550 |
551 | [convolutional]
552 | batch_normalize=1
553 | filters=512
554 | size=1
555 | stride=1
556 | pad=1
557 | activation=leaky
558 |
559 | [convolutional]
560 | batch_normalize=1
561 | size=3
562 | stride=1
563 | pad=1
564 | filters=1024
565 | activation=leaky
566 |
567 | [convolutional]
568 | batch_normalize=1
569 | filters=512
570 | size=1
571 | stride=1
572 | pad=1
573 | activation=leaky
574 |
575 | ### SPP ###
576 | [maxpool]
577 | stride=1
578 | size=5
579 |
580 | [route]
581 | layers=-2
582 |
583 | [maxpool]
584 | stride=1
585 | size=9
586 |
587 | [route]
588 | layers=-4
589 |
590 | [maxpool]
591 | stride=1
592 | size=13
593 |
594 | [route]
595 | layers=-1,-3,-5,-6
596 |
597 | ### End SPP ###
598 |
599 | [convolutional]
600 | batch_normalize=1
601 | filters=512
602 | size=1
603 | stride=1
604 | pad=1
605 | activation=leaky
606 |
607 |
608 | [convolutional]
609 | batch_normalize=1
610 | size=3
611 | stride=1
612 | pad=1
613 | filters=1024
614 | activation=leaky
615 |
616 | [convolutional]
617 | batch_normalize=1
618 | filters=512
619 | size=1
620 | stride=1
621 | pad=1
622 | activation=leaky
623 |
624 | [convolutional]
625 | batch_normalize=1
626 | size=3
627 | stride=1
628 | pad=1
629 | filters=1024
630 | activation=leaky
631 |
632 | [convolutional]
633 | size=1
634 | stride=1
635 | pad=1
636 | filters=24
637 | activation=linear
638 |
639 |
640 | [yolo]
641 | mask = 6,7,8
642 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
643 | classes=3
644 | num=9
645 | jitter=.3
646 | ignore_thresh = .7
647 | truth_thresh = 1
648 | random=1
649 |
650 |
651 | [route]
652 | layers = -4
653 |
654 | [convolutional]
655 | batch_normalize=1
656 | filters=256
657 | size=1
658 | stride=1
659 | pad=1
660 | activation=leaky
661 |
662 | [upsample]
663 | stride=2
664 |
665 | [route]
666 | layers = -1, 61
667 |
668 |
669 |
670 | [convolutional]
671 | batch_normalize=1
672 | filters=256
673 | size=1
674 | stride=1
675 | pad=1
676 | activation=leaky
677 |
678 | [convolutional]
679 | batch_normalize=1
680 | size=3
681 | stride=1
682 | pad=1
683 | filters=512
684 | activation=leaky
685 |
686 | [convolutional]
687 | batch_normalize=1
688 | filters=256
689 | size=1
690 | stride=1
691 | pad=1
692 | activation=leaky
693 |
694 | [convolutional]
695 | batch_normalize=1
696 | size=3
697 | stride=1
698 | pad=1
699 | filters=512
700 | activation=leaky
701 |
702 | [convolutional]
703 | batch_normalize=1
704 | filters=256
705 | size=1
706 | stride=1
707 | pad=1
708 | activation=leaky
709 |
710 | [convolutional]
711 | batch_normalize=1
712 | size=3
713 | stride=1
714 | pad=1
715 | filters=512
716 | activation=leaky
717 |
718 | [convolutional]
719 | size=1
720 | stride=1
721 | pad=1
722 | filters=24
723 | activation=linear
724 |
725 |
726 | [yolo]
727 | mask = 3,4,5
728 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
729 | classes=3
730 | num=9
731 | jitter=.3
732 | ignore_thresh = .7
733 | truth_thresh = 1
734 | random=1
735 |
736 |
737 |
738 | [route]
739 | layers = -4
740 |
741 | [convolutional]
742 | batch_normalize=1
743 | filters=128
744 | size=1
745 | stride=1
746 | pad=1
747 | activation=leaky
748 |
749 | [upsample]
750 | stride=2
751 |
752 | [route]
753 | layers = -1, 36
754 |
755 |
756 |
757 | [convolutional]
758 | batch_normalize=1
759 | filters=128
760 | size=1
761 | stride=1
762 | pad=1
763 | activation=leaky
764 |
765 | [convolutional]
766 | batch_normalize=1
767 | size=3
768 | stride=1
769 | pad=1
770 | filters=256
771 | activation=leaky
772 |
773 | [convolutional]
774 | batch_normalize=1
775 | filters=128
776 | size=1
777 | stride=1
778 | pad=1
779 | activation=leaky
780 |
781 | [convolutional]
782 | batch_normalize=1
783 | size=3
784 | stride=1
785 | pad=1
786 | filters=256
787 | activation=leaky
788 |
789 | [convolutional]
790 | batch_normalize=1
791 | filters=128
792 | size=1
793 | stride=1
794 | pad=1
795 | activation=leaky
796 |
797 | [convolutional]
798 | batch_normalize=1
799 | size=3
800 | stride=1
801 | pad=1
802 | filters=256
803 | activation=leaky
804 |
805 | [convolutional]
806 | size=1
807 | stride=1
808 | pad=1
809 | filters=24
810 | activation=linear
811 |
812 |
813 | [yolo]
814 | mask = 0,1,2
815 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
816 | classes=3
817 | num=9
818 | jitter=.3
819 | ignore_thresh = .7
820 | truth_thresh = 1
821 | random=1
822 |
--------------------------------------------------------------------------------
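Schedule note: the 1-class and 3-class variants above use a shortened fine-tuning schedule (burn_in=100, max_batches=5000, steps=4000,4500), whereas the full yolov3-spp.cfg that follows keeps the original long schedule (burn_in=1000, max_batches=500200, steps=400000,450000). A small sketch of the "steps" learning-rate policy these [net] sections describe; darknet's own burn-in ramp follows a power curve, which is approximated linearly here:

def darknet_lr(batch, base_lr=0.001, burn_in=100, steps=(4000, 4500), scales=(0.1, 0.1)):
    # Warm-up over the first `burn_in` batches (linear approximation of darknet's ramp).
    if batch < burn_in:
        return base_lr * batch / burn_in
    lr = base_lr
    for step, scale in zip(steps, scales):
        if batch >= step:
            lr *= scale  # steps policy: multiply by the matching scale once each step is passed
    return lr

# With the fine-tune schedule above: 1e-3 until batch 4000, 1e-4 until 4500, then 1e-5.
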
/cfg/yolov3-spp.cfg:
--------------------------------------------------------------------------------
1 | [net]
2 | # Testing
3 | # batch=1
4 | # subdivisions=1
5 | # Training
6 | batch=64
7 | subdivisions=16
8 | width=608
9 | height=608
10 | channels=3
11 | momentum=0.9
12 | decay=0.0005
13 | angle=0
14 | saturation = 1.5
15 | exposure = 1.5
16 | hue=.1
17 |
18 | learning_rate=0.001
19 | burn_in=1000
20 | max_batches = 500200
21 | policy=steps
22 | steps=400000,450000
23 | scales=.1,.1
24 |
25 | [convolutional]
26 | batch_normalize=1
27 | filters=32
28 | size=3
29 | stride=1
30 | pad=1
31 | activation=leaky
32 |
33 | # Downsample
34 |
35 | [convolutional]
36 | batch_normalize=1
37 | filters=64
38 | size=3
39 | stride=2
40 | pad=1
41 | activation=leaky
42 |
43 | [convolutional]
44 | batch_normalize=1
45 | filters=32
46 | size=1
47 | stride=1
48 | pad=1
49 | activation=leaky
50 |
51 | [convolutional]
52 | batch_normalize=1
53 | filters=64
54 | size=3
55 | stride=1
56 | pad=1
57 | activation=leaky
58 |
59 | [shortcut]
60 | from=-3
61 | activation=linear
62 |
63 | # Downsample
64 |
65 | [convolutional]
66 | batch_normalize=1
67 | filters=128
68 | size=3
69 | stride=2
70 | pad=1
71 | activation=leaky
72 |
73 | [convolutional]
74 | batch_normalize=1
75 | filters=64
76 | size=1
77 | stride=1
78 | pad=1
79 | activation=leaky
80 |
81 | [convolutional]
82 | batch_normalize=1
83 | filters=128
84 | size=3
85 | stride=1
86 | pad=1
87 | activation=leaky
88 |
89 | [shortcut]
90 | from=-3
91 | activation=linear
92 |
93 | [convolutional]
94 | batch_normalize=1
95 | filters=64
96 | size=1
97 | stride=1
98 | pad=1
99 | activation=leaky
100 |
101 | [convolutional]
102 | batch_normalize=1
103 | filters=128
104 | size=3
105 | stride=1
106 | pad=1
107 | activation=leaky
108 |
109 | [shortcut]
110 | from=-3
111 | activation=linear
112 |
113 | # Downsample
114 |
115 | [convolutional]
116 | batch_normalize=1
117 | filters=256
118 | size=3
119 | stride=2
120 | pad=1
121 | activation=leaky
122 |
123 | [convolutional]
124 | batch_normalize=1
125 | filters=128
126 | size=1
127 | stride=1
128 | pad=1
129 | activation=leaky
130 |
131 | [convolutional]
132 | batch_normalize=1
133 | filters=256
134 | size=3
135 | stride=1
136 | pad=1
137 | activation=leaky
138 |
139 | [shortcut]
140 | from=-3
141 | activation=linear
142 |
143 | [convolutional]
144 | batch_normalize=1
145 | filters=128
146 | size=1
147 | stride=1
148 | pad=1
149 | activation=leaky
150 |
151 | [convolutional]
152 | batch_normalize=1
153 | filters=256
154 | size=3
155 | stride=1
156 | pad=1
157 | activation=leaky
158 |
159 | [shortcut]
160 | from=-3
161 | activation=linear
162 |
163 | [convolutional]
164 | batch_normalize=1
165 | filters=128
166 | size=1
167 | stride=1
168 | pad=1
169 | activation=leaky
170 |
171 | [convolutional]
172 | batch_normalize=1
173 | filters=256
174 | size=3
175 | stride=1
176 | pad=1
177 | activation=leaky
178 |
179 | [shortcut]
180 | from=-3
181 | activation=linear
182 |
183 | [convolutional]
184 | batch_normalize=1
185 | filters=128
186 | size=1
187 | stride=1
188 | pad=1
189 | activation=leaky
190 |
191 | [convolutional]
192 | batch_normalize=1
193 | filters=256
194 | size=3
195 | stride=1
196 | pad=1
197 | activation=leaky
198 |
199 | [shortcut]
200 | from=-3
201 | activation=linear
202 |
203 |
204 | [convolutional]
205 | batch_normalize=1
206 | filters=128
207 | size=1
208 | stride=1
209 | pad=1
210 | activation=leaky
211 |
212 | [convolutional]
213 | batch_normalize=1
214 | filters=256
215 | size=3
216 | stride=1
217 | pad=1
218 | activation=leaky
219 |
220 | [shortcut]
221 | from=-3
222 | activation=linear
223 |
224 | [convolutional]
225 | batch_normalize=1
226 | filters=128
227 | size=1
228 | stride=1
229 | pad=1
230 | activation=leaky
231 |
232 | [convolutional]
233 | batch_normalize=1
234 | filters=256
235 | size=3
236 | stride=1
237 | pad=1
238 | activation=leaky
239 |
240 | [shortcut]
241 | from=-3
242 | activation=linear
243 |
244 | [convolutional]
245 | batch_normalize=1
246 | filters=128
247 | size=1
248 | stride=1
249 | pad=1
250 | activation=leaky
251 |
252 | [convolutional]
253 | batch_normalize=1
254 | filters=256
255 | size=3
256 | stride=1
257 | pad=1
258 | activation=leaky
259 |
260 | [shortcut]
261 | from=-3
262 | activation=linear
263 |
264 | [convolutional]
265 | batch_normalize=1
266 | filters=128
267 | size=1
268 | stride=1
269 | pad=1
270 | activation=leaky
271 |
272 | [convolutional]
273 | batch_normalize=1
274 | filters=256
275 | size=3
276 | stride=1
277 | pad=1
278 | activation=leaky
279 |
280 | [shortcut]
281 | from=-3
282 | activation=linear
283 |
284 | # Downsample
285 |
286 | [convolutional]
287 | batch_normalize=1
288 | filters=512
289 | size=3
290 | stride=2
291 | pad=1
292 | activation=leaky
293 |
294 | [convolutional]
295 | batch_normalize=1
296 | filters=256
297 | size=1
298 | stride=1
299 | pad=1
300 | activation=leaky
301 |
302 | [convolutional]
303 | batch_normalize=1
304 | filters=512
305 | size=3
306 | stride=1
307 | pad=1
308 | activation=leaky
309 |
310 | [shortcut]
311 | from=-3
312 | activation=linear
313 |
314 |
315 | [convolutional]
316 | batch_normalize=1
317 | filters=256
318 | size=1
319 | stride=1
320 | pad=1
321 | activation=leaky
322 |
323 | [convolutional]
324 | batch_normalize=1
325 | filters=512
326 | size=3
327 | stride=1
328 | pad=1
329 | activation=leaky
330 |
331 | [shortcut]
332 | from=-3
333 | activation=linear
334 |
335 |
336 | [convolutional]
337 | batch_normalize=1
338 | filters=256
339 | size=1
340 | stride=1
341 | pad=1
342 | activation=leaky
343 |
344 | [convolutional]
345 | batch_normalize=1
346 | filters=512
347 | size=3
348 | stride=1
349 | pad=1
350 | activation=leaky
351 |
352 | [shortcut]
353 | from=-3
354 | activation=linear
355 |
356 |
357 | [convolutional]
358 | batch_normalize=1
359 | filters=256
360 | size=1
361 | stride=1
362 | pad=1
363 | activation=leaky
364 |
365 | [convolutional]
366 | batch_normalize=1
367 | filters=512
368 | size=3
369 | stride=1
370 | pad=1
371 | activation=leaky
372 |
373 | [shortcut]
374 | from=-3
375 | activation=linear
376 |
377 | [convolutional]
378 | batch_normalize=1
379 | filters=256
380 | size=1
381 | stride=1
382 | pad=1
383 | activation=leaky
384 |
385 | [convolutional]
386 | batch_normalize=1
387 | filters=512
388 | size=3
389 | stride=1
390 | pad=1
391 | activation=leaky
392 |
393 | [shortcut]
394 | from=-3
395 | activation=linear
396 |
397 |
398 | [convolutional]
399 | batch_normalize=1
400 | filters=256
401 | size=1
402 | stride=1
403 | pad=1
404 | activation=leaky
405 |
406 | [convolutional]
407 | batch_normalize=1
408 | filters=512
409 | size=3
410 | stride=1
411 | pad=1
412 | activation=leaky
413 |
414 | [shortcut]
415 | from=-3
416 | activation=linear
417 |
418 |
419 | [convolutional]
420 | batch_normalize=1
421 | filters=256
422 | size=1
423 | stride=1
424 | pad=1
425 | activation=leaky
426 |
427 | [convolutional]
428 | batch_normalize=1
429 | filters=512
430 | size=3
431 | stride=1
432 | pad=1
433 | activation=leaky
434 |
435 | [shortcut]
436 | from=-3
437 | activation=linear
438 |
439 | [convolutional]
440 | batch_normalize=1
441 | filters=256
442 | size=1
443 | stride=1
444 | pad=1
445 | activation=leaky
446 |
447 | [convolutional]
448 | batch_normalize=1
449 | filters=512
450 | size=3
451 | stride=1
452 | pad=1
453 | activation=leaky
454 |
455 | [shortcut]
456 | from=-3
457 | activation=linear
458 |
459 | # Downsample
460 |
461 | [convolutional]
462 | batch_normalize=1
463 | filters=1024
464 | size=3
465 | stride=2
466 | pad=1
467 | activation=leaky
468 |
469 | [convolutional]
470 | batch_normalize=1
471 | filters=512
472 | size=1
473 | stride=1
474 | pad=1
475 | activation=leaky
476 |
477 | [convolutional]
478 | batch_normalize=1
479 | filters=1024
480 | size=3
481 | stride=1
482 | pad=1
483 | activation=leaky
484 |
485 | [shortcut]
486 | from=-3
487 | activation=linear
488 |
489 | [convolutional]
490 | batch_normalize=1
491 | filters=512
492 | size=1
493 | stride=1
494 | pad=1
495 | activation=leaky
496 |
497 | [convolutional]
498 | batch_normalize=1
499 | filters=1024
500 | size=3
501 | stride=1
502 | pad=1
503 | activation=leaky
504 |
505 | [shortcut]
506 | from=-3
507 | activation=linear
508 |
509 | [convolutional]
510 | batch_normalize=1
511 | filters=512
512 | size=1
513 | stride=1
514 | pad=1
515 | activation=leaky
516 |
517 | [convolutional]
518 | batch_normalize=1
519 | filters=1024
520 | size=3
521 | stride=1
522 | pad=1
523 | activation=leaky
524 |
525 | [shortcut]
526 | from=-3
527 | activation=linear
528 |
529 | [convolutional]
530 | batch_normalize=1
531 | filters=512
532 | size=1
533 | stride=1
534 | pad=1
535 | activation=leaky
536 |
537 | [convolutional]
538 | batch_normalize=1
539 | filters=1024
540 | size=3
541 | stride=1
542 | pad=1
543 | activation=leaky
544 |
545 | [shortcut]
546 | from=-3
547 | activation=linear
548 |
549 | ######################
550 |
551 | [convolutional]
552 | batch_normalize=1
553 | filters=512
554 | size=1
555 | stride=1
556 | pad=1
557 | activation=leaky
558 |
559 | [convolutional]
560 | batch_normalize=1
561 | size=3
562 | stride=1
563 | pad=1
564 | filters=1024
565 | activation=leaky
566 |
567 | [convolutional]
568 | batch_normalize=1
569 | filters=512
570 | size=1
571 | stride=1
572 | pad=1
573 | activation=leaky
574 |
575 | ### SPP ###
576 | [maxpool]
577 | stride=1
578 | size=5
579 |
580 | [route]
581 | layers=-2
582 |
583 | [maxpool]
584 | stride=1
585 | size=9
586 |
587 | [route]
588 | layers=-4
589 |
590 | [maxpool]
591 | stride=1
592 | size=13
593 |
594 | [route]
595 | layers=-1,-3,-5,-6
596 |
597 | ### End SPP ###
598 |
599 | [convolutional]
600 | batch_normalize=1
601 | filters=512
602 | size=1
603 | stride=1
604 | pad=1
605 | activation=leaky
606 |
607 |
608 | [convolutional]
609 | batch_normalize=1
610 | size=3
611 | stride=1
612 | pad=1
613 | filters=1024
614 | activation=leaky
615 |
616 | [convolutional]
617 | batch_normalize=1
618 | filters=512
619 | size=1
620 | stride=1
621 | pad=1
622 | activation=leaky
623 |
624 | [convolutional]
625 | batch_normalize=1
626 | size=3
627 | stride=1
628 | pad=1
629 | filters=1024
630 | activation=leaky
631 |
632 | [convolutional]
633 | size=1
634 | stride=1
635 | pad=1
636 | filters=255
637 | activation=linear
638 |
639 |
640 | [yolo]
641 | mask = 6,7,8
642 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
643 | classes=80
644 | num=9
645 | jitter=.3
646 | ignore_thresh = .7
647 | truth_thresh = 1
648 | random=1
649 |
650 |
651 | [route]
652 | layers = -4
653 |
654 | [convolutional]
655 | batch_normalize=1
656 | filters=256
657 | size=1
658 | stride=1
659 | pad=1
660 | activation=leaky
661 |
662 | [upsample]
663 | stride=2
664 |
665 | [route]
666 | layers = -1, 61
667 |
668 |
669 |
670 | [convolutional]
671 | batch_normalize=1
672 | filters=256
673 | size=1
674 | stride=1
675 | pad=1
676 | activation=leaky
677 |
678 | [convolutional]
679 | batch_normalize=1
680 | size=3
681 | stride=1
682 | pad=1
683 | filters=512
684 | activation=leaky
685 |
686 | [convolutional]
687 | batch_normalize=1
688 | filters=256
689 | size=1
690 | stride=1
691 | pad=1
692 | activation=leaky
693 |
694 | [convolutional]
695 | batch_normalize=1
696 | size=3
697 | stride=1
698 | pad=1
699 | filters=512
700 | activation=leaky
701 |
702 | [convolutional]
703 | batch_normalize=1
704 | filters=256
705 | size=1
706 | stride=1
707 | pad=1
708 | activation=leaky
709 |
710 | [convolutional]
711 | batch_normalize=1
712 | size=3
713 | stride=1
714 | pad=1
715 | filters=512
716 | activation=leaky
717 |
718 | [convolutional]
719 | size=1
720 | stride=1
721 | pad=1
722 | filters=255
723 | activation=linear
724 |
725 |
726 | [yolo]
727 | mask = 3,4,5
728 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
729 | classes=80
730 | num=9
731 | jitter=.3
732 | ignore_thresh = .7
733 | truth_thresh = 1
734 | random=1
735 |
736 |
737 |
738 | [route]
739 | layers = -4
740 |
741 | [convolutional]
742 | batch_normalize=1
743 | filters=128
744 | size=1
745 | stride=1
746 | pad=1
747 | activation=leaky
748 |
749 | [upsample]
750 | stride=2
751 |
752 | [route]
753 | layers = -1, 36
754 |
755 |
756 |
757 | [convolutional]
758 | batch_normalize=1
759 | filters=128
760 | size=1
761 | stride=1
762 | pad=1
763 | activation=leaky
764 |
765 | [convolutional]
766 | batch_normalize=1
767 | size=3
768 | stride=1
769 | pad=1
770 | filters=256
771 | activation=leaky
772 |
773 | [convolutional]
774 | batch_normalize=1
775 | filters=128
776 | size=1
777 | stride=1
778 | pad=1
779 | activation=leaky
780 |
781 | [convolutional]
782 | batch_normalize=1
783 | size=3
784 | stride=1
785 | pad=1
786 | filters=256
787 | activation=leaky
788 |
789 | [convolutional]
790 | batch_normalize=1
791 | filters=128
792 | size=1
793 | stride=1
794 | pad=1
795 | activation=leaky
796 |
797 | [convolutional]
798 | batch_normalize=1
799 | size=3
800 | stride=1
801 | pad=1
802 | filters=256
803 | activation=leaky
804 |
805 | [convolutional]
806 | size=1
807 | stride=1
808 | pad=1
809 | filters=255
810 | activation=linear
811 |
812 |
813 | [yolo]
814 | mask = 0,1,2
815 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
816 | classes=80
817 | num=9
818 | jitter=.3
819 | ignore_thresh = .7
820 | truth_thresh = 1
821 | random=1
822 |
--------------------------------------------------------------------------------
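Route-index note for the cfgs above: in a [route] layer, negative entries are offsets relative to the current layer and non-negative entries are absolute (zero-based) layer indices, so "layers = -1, 61" concatenates the previous layer's output with backbone layer 61, while a single value such as "layers = -4" simply re-exposes an earlier output without concatenation. A tiny sketch of that resolution rule (function name and the example module index are hypothetical):

def resolve_route(layers_value, module_index):
    # Darknet route semantics: negative entries are relative to the current
    # module, non-negative entries are absolute module indices.
    return [module_index + i if i < 0 else i
            for i in (int(v) for v in str(layers_value).split(','))]

print(resolve_route('-1, 61', 87))  # hypothetical route at module 87 -> [86, 61]
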
/cfg/yolov3-asff.cfg:
--------------------------------------------------------------------------------
1 | # Generated by Glenn Jocher (glenn.jocher@ultralytics.com) for https://github.com/ultralytics/yolov3
2 | # def kmean_anchors(path='../coco/train2017.txt', n=12, img_size=(320, 640)): # from utils.utils import *; kmean_anchors()
3 | # Evolving anchors: 100%|██████████| 1000/1000 [41:15<00:00, 2.48s/it]
4 | # 0.20 iou_thr: 0.992 best possible recall, 4.25 anchors > thr
5 | # kmeans anchors (n=12, img_size=(320, 640), IoU=0.005/0.184/0.634-min/mean/best): 6,9, 15,16, 17,35, 37,26, 36,67, 63,42, 57,100, 121,81, 112,169, 241,158, 195,310, 426,359
6 |
7 | [net]
8 | # Testing
9 | # batch=1
10 | # subdivisions=1
11 | # Training
12 | batch=64
13 | subdivisions=16
14 | width=608
15 | height=608
16 | channels=3
17 | momentum=0.9
18 | decay=0.0005
19 | angle=0
20 | saturation = 1.5
21 | exposure = 1.5
22 | hue=.1
23 |
24 | learning_rate=0.001
25 | burn_in=1000
26 | max_batches = 500200
27 | policy=steps
28 | steps=400000,450000
29 | scales=.1,.1
30 |
31 | [convolutional]
32 | batch_normalize=1
33 | filters=32
34 | size=3
35 | stride=1
36 | pad=1
37 | activation=leaky
38 |
39 | # Downsample
40 |
41 | [convolutional]
42 | batch_normalize=1
43 | filters=64
44 | size=3
45 | stride=2
46 | pad=1
47 | activation=leaky
48 |
49 | [convolutional]
50 | batch_normalize=1
51 | filters=32
52 | size=1
53 | stride=1
54 | pad=1
55 | activation=leaky
56 |
57 | [convolutional]
58 | batch_normalize=1
59 | filters=64
60 | size=3
61 | stride=1
62 | pad=1
63 | activation=leaky
64 |
65 | [shortcut]
66 | from=-3
67 | activation=linear
68 |
69 | # Downsample
70 |
71 | [convolutional]
72 | batch_normalize=1
73 | filters=128
74 | size=3
75 | stride=2
76 | pad=1
77 | activation=leaky
78 |
79 | [convolutional]
80 | batch_normalize=1
81 | filters=64
82 | size=1
83 | stride=1
84 | pad=1
85 | activation=leaky
86 |
87 | [convolutional]
88 | batch_normalize=1
89 | filters=128
90 | size=3
91 | stride=1
92 | pad=1
93 | activation=leaky
94 |
95 | [shortcut]
96 | from=-3
97 | activation=linear
98 |
99 | [convolutional]
100 | batch_normalize=1
101 | filters=64
102 | size=1
103 | stride=1
104 | pad=1
105 | activation=leaky
106 |
107 | [convolutional]
108 | batch_normalize=1
109 | filters=128
110 | size=3
111 | stride=1
112 | pad=1
113 | activation=leaky
114 |
115 | [shortcut]
116 | from=-3
117 | activation=linear
118 |
119 | # Downsample
120 |
121 | [convolutional]
122 | batch_normalize=1
123 | filters=256
124 | size=3
125 | stride=2
126 | pad=1
127 | activation=leaky
128 |
129 | [convolutional]
130 | batch_normalize=1
131 | filters=128
132 | size=1
133 | stride=1
134 | pad=1
135 | activation=leaky
136 |
137 | [convolutional]
138 | batch_normalize=1
139 | filters=256
140 | size=3
141 | stride=1
142 | pad=1
143 | activation=leaky
144 |
145 | [shortcut]
146 | from=-3
147 | activation=linear
148 |
149 | [convolutional]
150 | batch_normalize=1
151 | filters=128
152 | size=1
153 | stride=1
154 | pad=1
155 | activation=leaky
156 |
157 | [convolutional]
158 | batch_normalize=1
159 | filters=256
160 | size=3
161 | stride=1
162 | pad=1
163 | activation=leaky
164 |
165 | [shortcut]
166 | from=-3
167 | activation=linear
168 |
169 | [convolutional]
170 | batch_normalize=1
171 | filters=128
172 | size=1
173 | stride=1
174 | pad=1
175 | activation=leaky
176 |
177 | [convolutional]
178 | batch_normalize=1
179 | filters=256
180 | size=3
181 | stride=1
182 | pad=1
183 | activation=leaky
184 |
185 | [shortcut]
186 | from=-3
187 | activation=linear
188 |
189 | [convolutional]
190 | batch_normalize=1
191 | filters=128
192 | size=1
193 | stride=1
194 | pad=1
195 | activation=leaky
196 |
197 | [convolutional]
198 | batch_normalize=1
199 | filters=256
200 | size=3
201 | stride=1
202 | pad=1
203 | activation=leaky
204 |
205 | [shortcut]
206 | from=-3
207 | activation=linear
208 |
209 | [convolutional]
210 | batch_normalize=1
211 | filters=128
212 | size=1
213 | stride=1
214 | pad=1
215 | activation=leaky
216 |
217 | [convolutional]
218 | batch_normalize=1
219 | filters=256
220 | size=3
221 | stride=1
222 | pad=1
223 | activation=leaky
224 |
225 | [shortcut]
226 | from=-3
227 | activation=linear
228 |
229 | [convolutional]
230 | batch_normalize=1
231 | filters=128
232 | size=1
233 | stride=1
234 | pad=1
235 | activation=leaky
236 |
237 | [convolutional]
238 | batch_normalize=1
239 | filters=256
240 | size=3
241 | stride=1
242 | pad=1
243 | activation=leaky
244 |
245 | [shortcut]
246 | from=-3
247 | activation=linear
248 |
249 | [convolutional]
250 | batch_normalize=1
251 | filters=128
252 | size=1
253 | stride=1
254 | pad=1
255 | activation=leaky
256 |
257 | [convolutional]
258 | batch_normalize=1
259 | filters=256
260 | size=3
261 | stride=1
262 | pad=1
263 | activation=leaky
264 |
265 | [shortcut]
266 | from=-3
267 | activation=linear
268 |
269 | [convolutional]
270 | batch_normalize=1
271 | filters=128
272 | size=1
273 | stride=1
274 | pad=1
275 | activation=leaky
276 |
277 | [convolutional]
278 | batch_normalize=1
279 | filters=256
280 | size=3
281 | stride=1
282 | pad=1
283 | activation=leaky
284 |
285 | [shortcut]
286 | from=-3
287 | activation=linear
288 |
289 | # Downsample
290 |
291 | [convolutional]
292 | batch_normalize=1
293 | filters=512
294 | size=3
295 | stride=2
296 | pad=1
297 | activation=leaky
298 |
299 | [convolutional]
300 | batch_normalize=1
301 | filters=256
302 | size=1
303 | stride=1
304 | pad=1
305 | activation=leaky
306 |
307 | [convolutional]
308 | batch_normalize=1
309 | filters=512
310 | size=3
311 | stride=1
312 | pad=1
313 | activation=leaky
314 |
315 | [shortcut]
316 | from=-3
317 | activation=linear
318 |
319 | [convolutional]
320 | batch_normalize=1
321 | filters=256
322 | size=1
323 | stride=1
324 | pad=1
325 | activation=leaky
326 |
327 | [convolutional]
328 | batch_normalize=1
329 | filters=512
330 | size=3
331 | stride=1
332 | pad=1
333 | activation=leaky
334 |
335 | [shortcut]
336 | from=-3
337 | activation=linear
338 |
339 | [convolutional]
340 | batch_normalize=1
341 | filters=256
342 | size=1
343 | stride=1
344 | pad=1
345 | activation=leaky
346 |
347 | [convolutional]
348 | batch_normalize=1
349 | filters=512
350 | size=3
351 | stride=1
352 | pad=1
353 | activation=leaky
354 |
355 | [shortcut]
356 | from=-3
357 | activation=linear
358 |
359 | [convolutional]
360 | batch_normalize=1
361 | filters=256
362 | size=1
363 | stride=1
364 | pad=1
365 | activation=leaky
366 |
367 | [convolutional]
368 | batch_normalize=1
369 | filters=512
370 | size=3
371 | stride=1
372 | pad=1
373 | activation=leaky
374 |
375 | [shortcut]
376 | from=-3
377 | activation=linear
378 |
379 | [convolutional]
380 | batch_normalize=1
381 | filters=256
382 | size=1
383 | stride=1
384 | pad=1
385 | activation=leaky
386 |
387 | [convolutional]
388 | batch_normalize=1
389 | filters=512
390 | size=3
391 | stride=1
392 | pad=1
393 | activation=leaky
394 |
395 | [shortcut]
396 | from=-3
397 | activation=linear
398 |
399 | [convolutional]
400 | batch_normalize=1
401 | filters=256
402 | size=1
403 | stride=1
404 | pad=1
405 | activation=leaky
406 |
407 | [convolutional]
408 | batch_normalize=1
409 | filters=512
410 | size=3
411 | stride=1
412 | pad=1
413 | activation=leaky
414 |
415 | [shortcut]
416 | from=-3
417 | activation=linear
418 |
419 | [convolutional]
420 | batch_normalize=1
421 | filters=256
422 | size=1
423 | stride=1
424 | pad=1
425 | activation=leaky
426 |
427 | [convolutional]
428 | batch_normalize=1
429 | filters=512
430 | size=3
431 | stride=1
432 | pad=1
433 | activation=leaky
434 |
435 | [shortcut]
436 | from=-3
437 | activation=linear
438 |
439 | [convolutional]
440 | batch_normalize=1
441 | filters=256
442 | size=1
443 | stride=1
444 | pad=1
445 | activation=leaky
446 |
447 | [convolutional]
448 | batch_normalize=1
449 | filters=512
450 | size=3
451 | stride=1
452 | pad=1
453 | activation=leaky
454 |
455 | [shortcut]
456 | from=-3
457 | activation=linear
458 |
459 | # Downsample
460 |
461 | [convolutional]
462 | batch_normalize=1
463 | filters=1024
464 | size=3
465 | stride=2
466 | pad=1
467 | activation=leaky
468 |
469 | [convolutional]
470 | batch_normalize=1
471 | filters=512
472 | size=1
473 | stride=1
474 | pad=1
475 | activation=leaky
476 |
477 | [convolutional]
478 | batch_normalize=1
479 | filters=1024
480 | size=3
481 | stride=1
482 | pad=1
483 | activation=leaky
484 |
485 | [shortcut]
486 | from=-3
487 | activation=linear
488 |
489 | [convolutional]
490 | batch_normalize=1
491 | filters=512
492 | size=1
493 | stride=1
494 | pad=1
495 | activation=leaky
496 |
497 | [convolutional]
498 | batch_normalize=1
499 | filters=1024
500 | size=3
501 | stride=1
502 | pad=1
503 | activation=leaky
504 |
505 | [shortcut]
506 | from=-3
507 | activation=linear
508 |
509 | [convolutional]
510 | batch_normalize=1
511 | filters=512
512 | size=1
513 | stride=1
514 | pad=1
515 | activation=leaky
516 |
517 | [convolutional]
518 | batch_normalize=1
519 | filters=1024
520 | size=3
521 | stride=1
522 | pad=1
523 | activation=leaky
524 |
525 | [shortcut]
526 | from=-3
527 | activation=linear
528 |
529 | [convolutional]
530 | batch_normalize=1
531 | filters=512
532 | size=1
533 | stride=1
534 | pad=1
535 | activation=leaky
536 |
537 | [convolutional]
538 | batch_normalize=1
539 | filters=1024
540 | size=3
541 | stride=1
542 | pad=1
543 | activation=leaky
544 |
545 | [shortcut]
546 | from=-3
547 | activation=linear
548 |
549 | ######################
550 |
551 | [convolutional]
552 | batch_normalize=1
553 | filters=512
554 | size=1
555 | stride=1
556 | pad=1
557 | activation=leaky
558 |
559 | [convolutional]
560 | batch_normalize=1
561 | size=3
562 | stride=1
563 | pad=1
564 | filters=1024
565 | activation=leaky
566 |
567 | [convolutional]
568 | batch_normalize=1
569 | filters=512
570 | size=1
571 | stride=1
572 | pad=1
573 | activation=leaky
574 |
575 | # SPP --------------------------------------------------------------------------
576 | [maxpool]
577 | stride=1
578 | size=5
579 |
580 | [route]
581 | layers=-2
582 |
583 | [maxpool]
584 | stride=1
585 | size=9
586 |
587 | [route]
588 | layers=-4
589 |
590 | [maxpool]
591 | stride=1
592 | size=13
593 |
594 | [route]
595 | layers=-1,-3,-5,-6
596 | # SPP --------------------------------------------------------------------------
597 |
598 | [convolutional]
599 | batch_normalize=1
600 | filters=512
601 | size=1
602 | stride=1
603 | pad=1
604 | activation=leaky
605 |
606 | [convolutional]
607 | batch_normalize=1
608 | size=3
609 | stride=1
610 | pad=1
611 | filters=1024
612 | activation=leaky
613 |
614 | [convolutional]
615 | batch_normalize=1
616 | filters=512
617 | size=1
618 | stride=1
619 | pad=1
620 | activation=leaky
621 |
622 | [convolutional]
623 | batch_normalize=1
624 | size=3
625 | stride=1
626 | pad=1
627 | filters=1024
628 | activation=leaky
629 |
630 | [convolutional]
631 | size=1
632 | stride=1
633 | pad=1
634 | filters=258
635 | activation=linear
636 |
637 | # YOLO -------------------------------------------------------------------------
638 |
639 | [route]
640 | layers = -3
641 |
642 | [convolutional]
643 | batch_normalize=1
644 | filters=256
645 | size=1
646 | stride=1
647 | pad=1
648 | activation=leaky
649 |
650 | [upsample]
651 | stride=2
652 |
653 | [route]
654 | layers = -1, 61
655 |
656 | [convolutional]
657 | batch_normalize=1
658 | filters=256
659 | size=1
660 | stride=1
661 | pad=1
662 | activation=leaky
663 |
664 | [convolutional]
665 | batch_normalize=1
666 | size=3
667 | stride=1
668 | pad=1
669 | filters=512
670 | activation=leaky
671 |
672 | [convolutional]
673 | batch_normalize=1
674 | filters=256
675 | size=1
676 | stride=1
677 | pad=1
678 | activation=leaky
679 |
680 | [convolutional]
681 | batch_normalize=1
682 | size=3
683 | stride=1
684 | pad=1
685 | filters=512
686 | activation=leaky
687 |
688 | [convolutional]
689 | batch_normalize=1
690 | filters=256
691 | size=1
692 | stride=1
693 | pad=1
694 | activation=leaky
695 |
696 | [convolutional]
697 | batch_normalize=1
698 | size=3
699 | stride=1
700 | pad=1
701 | filters=512
702 | activation=leaky
703 |
704 | [convolutional]
705 | size=1
706 | stride=1
707 | pad=1
708 | filters=258
709 | activation=linear
710 |
711 | # YOLO -------------------------------------------------------------------------
712 |
713 | [route]
714 | layers = -3
715 |
716 | [convolutional]
717 | batch_normalize=1
718 | filters=128
719 | size=1
720 | stride=1
721 | pad=1
722 | activation=leaky
723 |
724 | [upsample]
725 | stride=2
726 |
727 | [route]
728 | layers = -1, 36
729 |
730 | [convolutional]
731 | batch_normalize=1
732 | filters=128
733 | size=1
734 | stride=1
735 | pad=1
736 | activation=leaky
737 |
738 | [convolutional]
739 | batch_normalize=1
740 | size=3
741 | stride=1
742 | pad=1
743 | filters=256
744 | activation=leaky
745 |
746 | [convolutional]
747 | batch_normalize=1
748 | filters=128
749 | size=1
750 | stride=1
751 | pad=1
752 | activation=leaky
753 |
754 | [convolutional]
755 | batch_normalize=1
756 | size=3
757 | stride=1
758 | pad=1
759 | filters=256
760 | activation=leaky
761 |
762 | [convolutional]
763 | batch_normalize=1
764 | filters=128
765 | size=1
766 | stride=1
767 | pad=1
768 | activation=leaky
769 |
770 | [convolutional]
771 | batch_normalize=1
772 | size=3
773 | stride=1
774 | pad=1
775 | filters=256
776 | activation=leaky
777 |
778 | [convolutional]
779 | size=1
780 | stride=1
781 | pad=1
782 | filters=258
783 | activation=linear
784 |
785 | [yolo]
786 | from=88,99,110
787 | mask = 6,7,8
788 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
789 | classes=80
790 | num=9
791 |
792 | [yolo]
793 | from=88,99,110
794 | mask = 3,4,5
795 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
796 | classes=80
797 | num=9
798 |
799 | [yolo]
800 | from=88,99,110
801 | mask = 0,1,2
802 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
803 | classes=80
804 | num=9
--------------------------------------------------------------------------------
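The header comment at the top of yolov3-asff.cfg above records a kmean_anchors() run (n=12 anchors, image sizes 320-640), even though its [yolo] layers still list the standard nine COCO anchors. As a rough illustration only, anchor generation of this kind clusters the (width, height) pairs of the training labels; the repo's own routine additionally refines the clusters with an IoU-based genetic evolution step, which is omitted in this simplified numpy sketch, and all names here are illustrative:

import numpy as np

def kmeans_wh(wh, n=9, iters=30, seed=0):
    # wh: (N, 2) array of label box widths/heights in pixels.
    wh = np.asarray(wh, dtype=float)
    rng = np.random.default_rng(seed)
    centers = wh[rng.choice(len(wh), n, replace=False)].copy()
    for _ in range(iters):
        d = ((wh[:, None, :] - centers[None, :, :]) ** 2).sum(-1)  # squared distances
        labels = d.argmin(1)
        for k in range(n):
            if np.any(labels == k):
                centers[k] = wh[labels == k].mean(0)
    return centers[np.argsort(centers.prod(1))]  # sorted by box area, small to large
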
/cfg/yolov3-spp3.cfg:
--------------------------------------------------------------------------------
1 | [net]
2 | # Testing
3 | batch=1
4 | subdivisions=1
5 | # Training
6 | # batch=64
7 | # subdivisions=16
8 | width=608
9 | height=608
10 | channels=3
11 | momentum=0.9
12 | decay=0.0005
13 | angle=0
14 | saturation = 1.5
15 | exposure = 1.5
16 | hue=.1
17 |
18 | learning_rate=0.001
19 | burn_in=1000
20 | max_batches = 120200
21 | policy=steps
22 | steps=70000,100000
23 | scales=.1,.1
24 |
25 | [convolutional]
26 | batch_normalize=1
27 | filters=32
28 | size=3
29 | stride=1
30 | pad=1
31 | activation=leaky
32 |
33 | # Downsample
34 |
35 | [convolutional]
36 | batch_normalize=1
37 | filters=64
38 | size=3
39 | stride=2
40 | pad=1
41 | activation=leaky
42 |
43 | [convolutional]
44 | batch_normalize=1
45 | filters=32
46 | size=1
47 | stride=1
48 | pad=1
49 | activation=leaky
50 |
51 | [convolutional]
52 | batch_normalize=1
53 | filters=64
54 | size=3
55 | stride=1
56 | pad=1
57 | activation=leaky
58 |
59 | [shortcut]
60 | from=-3
61 | activation=linear
62 |
63 | # Downsample
64 |
65 | [convolutional]
66 | batch_normalize=1
67 | filters=128
68 | size=3
69 | stride=2
70 | pad=1
71 | activation=leaky
72 |
73 | [convolutional]
74 | batch_normalize=1
75 | filters=64
76 | size=1
77 | stride=1
78 | pad=1
79 | activation=leaky
80 |
81 | [convolutional]
82 | batch_normalize=1
83 | filters=128
84 | size=3
85 | stride=1
86 | pad=1
87 | activation=leaky
88 |
89 | [shortcut]
90 | from=-3
91 | activation=linear
92 |
93 | [convolutional]
94 | batch_normalize=1
95 | filters=64
96 | size=1
97 | stride=1
98 | pad=1
99 | activation=leaky
100 |
101 | [convolutional]
102 | batch_normalize=1
103 | filters=128
104 | size=3
105 | stride=1
106 | pad=1
107 | activation=leaky
108 |
109 | [shortcut]
110 | from=-3
111 | activation=linear
112 |
113 | # Downsample
114 |
115 | [convolutional]
116 | batch_normalize=1
117 | filters=256
118 | size=3
119 | stride=2
120 | pad=1
121 | activation=leaky
122 |
123 | [convolutional]
124 | batch_normalize=1
125 | filters=128
126 | size=1
127 | stride=1
128 | pad=1
129 | activation=leaky
130 |
131 | [convolutional]
132 | batch_normalize=1
133 | filters=256
134 | size=3
135 | stride=1
136 | pad=1
137 | activation=leaky
138 |
139 | [shortcut]
140 | from=-3
141 | activation=linear
142 |
143 | [convolutional]
144 | batch_normalize=1
145 | filters=128
146 | size=1
147 | stride=1
148 | pad=1
149 | activation=leaky
150 |
151 | [convolutional]
152 | batch_normalize=1
153 | filters=256
154 | size=3
155 | stride=1
156 | pad=1
157 | activation=leaky
158 |
159 | [shortcut]
160 | from=-3
161 | activation=linear
162 |
163 | [convolutional]
164 | batch_normalize=1
165 | filters=128
166 | size=1
167 | stride=1
168 | pad=1
169 | activation=leaky
170 |
171 | [convolutional]
172 | batch_normalize=1
173 | filters=256
174 | size=3
175 | stride=1
176 | pad=1
177 | activation=leaky
178 |
179 | [shortcut]
180 | from=-3
181 | activation=linear
182 |
183 | [convolutional]
184 | batch_normalize=1
185 | filters=128
186 | size=1
187 | stride=1
188 | pad=1
189 | activation=leaky
190 |
191 | [convolutional]
192 | batch_normalize=1
193 | filters=256
194 | size=3
195 | stride=1
196 | pad=1
197 | activation=leaky
198 |
199 | [shortcut]
200 | from=-3
201 | activation=linear
202 |
203 |
204 | [convolutional]
205 | batch_normalize=1
206 | filters=128
207 | size=1
208 | stride=1
209 | pad=1
210 | activation=leaky
211 |
212 | [convolutional]
213 | batch_normalize=1
214 | filters=256
215 | size=3
216 | stride=1
217 | pad=1
218 | activation=leaky
219 |
220 | [shortcut]
221 | from=-3
222 | activation=linear
223 |
224 | [convolutional]
225 | batch_normalize=1
226 | filters=128
227 | size=1
228 | stride=1
229 | pad=1
230 | activation=leaky
231 |
232 | [convolutional]
233 | batch_normalize=1
234 | filters=256
235 | size=3
236 | stride=1
237 | pad=1
238 | activation=leaky
239 |
240 | [shortcut]
241 | from=-3
242 | activation=linear
243 |
244 | [convolutional]
245 | batch_normalize=1
246 | filters=128
247 | size=1
248 | stride=1
249 | pad=1
250 | activation=leaky
251 |
252 | [convolutional]
253 | batch_normalize=1
254 | filters=256
255 | size=3
256 | stride=1
257 | pad=1
258 | activation=leaky
259 |
260 | [shortcut]
261 | from=-3
262 | activation=linear
263 |
264 | [convolutional]
265 | batch_normalize=1
266 | filters=128
267 | size=1
268 | stride=1
269 | pad=1
270 | activation=leaky
271 |
272 | [convolutional]
273 | batch_normalize=1
274 | filters=256
275 | size=3
276 | stride=1
277 | pad=1
278 | activation=leaky
279 |
280 | [shortcut]
281 | from=-3
282 | activation=linear
283 |
284 | # Downsample
285 |
286 | [convolutional]
287 | batch_normalize=1
288 | filters=512
289 | size=3
290 | stride=2
291 | pad=1
292 | activation=leaky
293 |
294 | [convolutional]
295 | batch_normalize=1
296 | filters=256
297 | size=1
298 | stride=1
299 | pad=1
300 | activation=leaky
301 |
302 | [convolutional]
303 | batch_normalize=1
304 | filters=512
305 | size=3
306 | stride=1
307 | pad=1
308 | activation=leaky
309 |
310 | [shortcut]
311 | from=-3
312 | activation=linear
313 |
314 |
315 | [convolutional]
316 | batch_normalize=1
317 | filters=256
318 | size=1
319 | stride=1
320 | pad=1
321 | activation=leaky
322 |
323 | [convolutional]
324 | batch_normalize=1
325 | filters=512
326 | size=3
327 | stride=1
328 | pad=1
329 | activation=leaky
330 |
331 | [shortcut]
332 | from=-3
333 | activation=linear
334 |
335 |
336 | [convolutional]
337 | batch_normalize=1
338 | filters=256
339 | size=1
340 | stride=1
341 | pad=1
342 | activation=leaky
343 |
344 | [convolutional]
345 | batch_normalize=1
346 | filters=512
347 | size=3
348 | stride=1
349 | pad=1
350 | activation=leaky
351 |
352 | [shortcut]
353 | from=-3
354 | activation=linear
355 |
356 |
357 | [convolutional]
358 | batch_normalize=1
359 | filters=256
360 | size=1
361 | stride=1
362 | pad=1
363 | activation=leaky
364 |
365 | [convolutional]
366 | batch_normalize=1
367 | filters=512
368 | size=3
369 | stride=1
370 | pad=1
371 | activation=leaky
372 |
373 | [shortcut]
374 | from=-3
375 | activation=linear
376 |
377 | [convolutional]
378 | batch_normalize=1
379 | filters=256
380 | size=1
381 | stride=1
382 | pad=1
383 | activation=leaky
384 |
385 | [convolutional]
386 | batch_normalize=1
387 | filters=512
388 | size=3
389 | stride=1
390 | pad=1
391 | activation=leaky
392 |
393 | [shortcut]
394 | from=-3
395 | activation=linear
396 |
397 |
398 | [convolutional]
399 | batch_normalize=1
400 | filters=256
401 | size=1
402 | stride=1
403 | pad=1
404 | activation=leaky
405 |
406 | [convolutional]
407 | batch_normalize=1
408 | filters=512
409 | size=3
410 | stride=1
411 | pad=1
412 | activation=leaky
413 |
414 | [shortcut]
415 | from=-3
416 | activation=linear
417 |
418 |
419 | [convolutional]
420 | batch_normalize=1
421 | filters=256
422 | size=1
423 | stride=1
424 | pad=1
425 | activation=leaky
426 |
427 | [convolutional]
428 | batch_normalize=1
429 | filters=512
430 | size=3
431 | stride=1
432 | pad=1
433 | activation=leaky
434 |
435 | [shortcut]
436 | from=-3
437 | activation=linear
438 |
439 | [convolutional]
440 | batch_normalize=1
441 | filters=256
442 | size=1
443 | stride=1
444 | pad=1
445 | activation=leaky
446 |
447 | [convolutional]
448 | batch_normalize=1
449 | filters=512
450 | size=3
451 | stride=1
452 | pad=1
453 | activation=leaky
454 |
455 | [shortcut]
456 | from=-3
457 | activation=linear
458 |
459 | # Downsample
460 |
461 | [convolutional]
462 | batch_normalize=1
463 | filters=1024
464 | size=3
465 | stride=2
466 | pad=1
467 | activation=leaky
468 |
469 | [convolutional]
470 | batch_normalize=1
471 | filters=512
472 | size=1
473 | stride=1
474 | pad=1
475 | activation=leaky
476 |
477 | [convolutional]
478 | batch_normalize=1
479 | filters=1024
480 | size=3
481 | stride=1
482 | pad=1
483 | activation=leaky
484 |
485 | [shortcut]
486 | from=-3
487 | activation=linear
488 |
489 | [convolutional]
490 | batch_normalize=1
491 | filters=512
492 | size=1
493 | stride=1
494 | pad=1
495 | activation=leaky
496 |
497 | [convolutional]
498 | batch_normalize=1
499 | filters=1024
500 | size=3
501 | stride=1
502 | pad=1
503 | activation=leaky
504 |
505 | [shortcut]
506 | from=-3
507 | activation=linear
508 |
509 | [convolutional]
510 | batch_normalize=1
511 | filters=512
512 | size=1
513 | stride=1
514 | pad=1
515 | activation=leaky
516 |
517 | [convolutional]
518 | batch_normalize=1
519 | filters=1024
520 | size=3
521 | stride=1
522 | pad=1
523 | activation=leaky
524 |
525 | [shortcut]
526 | from=-3
527 | activation=linear
528 |
529 | [convolutional]
530 | batch_normalize=1
531 | filters=512
532 | size=1
533 | stride=1
534 | pad=1
535 | activation=leaky
536 |
537 | [convolutional]
538 | batch_normalize=1
539 | filters=1024
540 | size=3
541 | stride=1
542 | pad=1
543 | activation=leaky
544 |
545 | [shortcut]
546 | from=-3
547 | activation=linear
548 |
549 | ######################
550 |
551 | [convolutional]
552 | batch_normalize=1
553 | filters=512
554 | size=1
555 | stride=1
556 | pad=1
557 | activation=leaky
558 |
559 | [convolutional]
560 | batch_normalize=1
561 | size=3
562 | stride=1
563 | pad=1
564 | filters=1024
565 | activation=leaky
566 |
567 | [convolutional]
568 | batch_normalize=1
569 | filters=512
570 | size=1
571 | stride=1
572 | pad=1
573 | activation=leaky
574 |
575 | ### SPP ###
576 | [maxpool]
577 | stride=1
578 | size=5
579 |
580 | [route]
581 | layers=-2
582 |
583 | [maxpool]
584 | stride=1
585 | size=9
586 |
587 | [route]
588 | layers=-4
589 |
590 | [maxpool]
591 | stride=1
592 | size=13
593 |
594 | [route]
595 | layers=-1,-3,-5,-6
596 |
597 | ### End SPP ###
598 |
599 | [convolutional]
600 | batch_normalize=1
601 | filters=512
602 | size=1
603 | stride=1
604 | pad=1
605 | activation=leaky
606 |
607 |
608 | [convolutional]
609 | batch_normalize=1
610 | size=3
611 | stride=1
612 | pad=1
613 | filters=1024
614 | activation=leaky
615 |
616 | [convolutional]
617 | batch_normalize=1
618 | filters=512
619 | size=1
620 | stride=1
621 | pad=1
622 | activation=leaky
623 |
624 | [convolutional]
625 | batch_normalize=1
626 | size=3
627 | stride=1
628 | pad=1
629 | filters=1024
630 | activation=leaky
631 |
632 | [convolutional]
633 | size=1
634 | stride=1
635 | pad=1
636 | filters=255
637 | activation=linear
638 |
639 |
640 | [yolo]
641 | mask = 6,7,8
642 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
643 | classes=80
644 | num=9
645 | jitter=.3
646 | ignore_thresh = .7
647 | truth_thresh = 1
648 | random=1
649 |
650 |
651 | [route]
652 | layers = -4
653 |
654 | [convolutional]
655 | batch_normalize=1
656 | filters=256
657 | size=1
658 | stride=1
659 | pad=1
660 | activation=leaky
661 |
662 | [upsample]
663 | stride=2
664 |
665 | [route]
666 | layers = -1, 61
667 |
668 |
669 |
670 | [convolutional]
671 | batch_normalize=1
672 | filters=256
673 | size=1
674 | stride=1
675 | pad=1
676 | activation=leaky
677 |
678 | [convolutional]
679 | batch_normalize=1
680 | size=3
681 | stride=1
682 | pad=1
683 | filters=512
684 | activation=leaky
685 |
686 | ### SPP ###
687 | [maxpool]
688 | stride=1
689 | size=5
690 |
691 | [route]
692 | layers=-2
693 |
694 | [maxpool]
695 | stride=1
696 | size=9
697 |
698 | [route]
699 | layers=-4
700 |
701 | [maxpool]
702 | stride=1
703 | size=13
704 |
705 | [route]
706 | layers=-1,-3,-5,-6
707 |
708 | ### End SPP ###
709 |
710 |
711 | [convolutional]
712 | batch_normalize=1
713 | filters=256
714 | size=1
715 | stride=1
716 | pad=1
717 | activation=leaky
718 |
719 | [convolutional]
720 | batch_normalize=1
721 | size=3
722 | stride=1
723 | pad=1
724 | filters=512
725 | activation=leaky
726 |
727 | [convolutional]
728 | batch_normalize=1
729 | filters=256
730 | size=1
731 | stride=1
732 | pad=1
733 | activation=leaky
734 |
735 | [convolutional]
736 | batch_normalize=1
737 | size=3
738 | stride=1
739 | pad=1
740 | filters=512
741 | activation=leaky
742 |
743 | [convolutional]
744 | size=1
745 | stride=1
746 | pad=1
747 | filters=255
748 | activation=linear
749 |
750 |
751 | [yolo]
752 | mask = 3,4,5
753 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
754 | classes=80
755 | num=9
756 | jitter=.3
757 | ignore_thresh = .7
758 | truth_thresh = 1
759 | random=1
760 |
761 |
762 |
763 | [route]
764 | layers = -4
765 |
766 | [convolutional]
767 | batch_normalize=1
768 | filters=128
769 | size=1
770 | stride=1
771 | pad=1
772 | activation=leaky
773 |
774 | [upsample]
775 | stride=2
776 |
777 | [route]
778 | layers = -1, 36
779 |
780 |
781 |
782 | [convolutional]
783 | batch_normalize=1
784 | filters=128
785 | size=1
786 | stride=1
787 | pad=1
788 | activation=leaky
789 |
790 | [convolutional]
791 | batch_normalize=1
792 | size=3
793 | stride=1
794 | pad=1
795 | filters=256
796 | activation=leaky
797 |
798 | [convolutional]
799 | batch_normalize=1
800 | filters=128
801 | size=1
802 | stride=1
803 | pad=1
804 | activation=leaky
805 |
806 | ### SPP ###
807 | [maxpool]
808 | stride=1
809 | size=5
810 |
811 | [route]
812 | layers=-2
813 |
814 | [maxpool]
815 | stride=1
816 | size=9
817 |
818 | [route]
819 | layers=-4
820 |
821 | [maxpool]
822 | stride=1
823 | size=13
824 |
825 | [route]
826 | layers=-1,-3,-5,-6
827 |
828 | ### End SPP ###
829 |
830 | [convolutional]
831 | batch_normalize=1
832 | size=3
833 | stride=1
834 | pad=1
835 | filters=256
836 | activation=leaky
837 |
838 | [convolutional]
839 | batch_normalize=1
840 | filters=128
841 | size=1
842 | stride=1
843 | pad=1
844 | activation=leaky
845 |
846 | [convolutional]
847 | batch_normalize=1
848 | size=3
849 | stride=1
850 | pad=1
851 | filters=256
852 | activation=leaky
853 |
854 | [convolutional]
855 | size=1
856 | stride=1
857 | pad=1
858 | filters=255
859 | activation=linear
860 |
861 |
862 | [yolo]
863 | mask = 0,1,2
864 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
865 | classes=80
866 | num=9
867 | jitter=.3
868 | ignore_thresh = .7
869 | truth_thresh = 1
870 | random=1
871 |
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 |
4 | from torch.utils.data import DataLoader
5 |
6 | from models import *
7 | from utils.datasets import *
8 | from utils.utils import *
9 |
10 |
11 | def test(cfg,
12 | data,
13 | weights=None,
14 | batch_size=16,
15 | img_size=416,
16 | conf_thres=0.001,
17 | iou_thres=0.6, # for nms
18 | save_json=False,
19 | single_cls=False,
20 | augment=False,
21 | model=None,
22 | dataloader=None):
23 | # Initialize/load model and set device
24 | if model is None:
25 | device = torch_utils.select_device(opt.device, batch_size=batch_size)
26 | verbose = opt.task == 'test'
27 |
28 | # Remove previous
29 | for f in glob.glob('test_batch*.png'):
30 | os.remove(f)
31 |
32 | # Initialize model
33 | model = Darknet(cfg, img_size)
34 |
35 | # Load weights
36 | attempt_download(weights)
37 | if weights.endswith('.pt'): # pytorch format
38 | model.load_state_dict(torch.load(weights, map_location=device)['model'])
39 | else: # darknet format
40 | load_darknet_weights(model, weights)
41 |
42 | # Fuse
43 | model.fuse()
44 | model.to(device)
45 |
46 | if device.type != 'cpu' and torch.cuda.device_count() > 1:
47 | model = nn.DataParallel(model)
48 | else: # called by train.py
49 | device = next(model.parameters()).device # get model device
50 | verbose = False
51 |
52 | # Configure run
53 | data = parse_data_cfg(data)
54 | nc = 1 if single_cls else int(data['classes']) # number of classes
55 | path = data['valid'] # path to test images
56 | names = load_classes(data['names']) # class names
57 | iouv = torch.linspace(0.5, 0.95, 10).to(device) # iou vector for mAP@0.5:0.95
58 |     iouv = iouv[0].view(1)  # for mAP@0.5; comment out this line for mAP@0.5:0.95
59 | niou = iouv.numel()
60 |
61 | # Dataloader
62 | if dataloader is None:
63 | dataset = LoadImagesAndLabels(path, img_size, batch_size, rect=True, single_cls=opt.single_cls)
64 | batch_size = min(batch_size, len(dataset))
65 | dataloader = DataLoader(dataset,
66 | batch_size=batch_size,
67 | num_workers=min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8]),
68 | pin_memory=True,
69 | collate_fn=dataset.collate_fn)
70 |
71 | seen = 0
72 | model.eval()
73 | _ = model(torch.zeros((1, 3, img_size, img_size), device=device)) if device.type != 'cpu' else None # run once
74 | coco91class = coco80_to_coco91_class()
75 | s = ('%20s' + '%10s' * 6) % ('Class', 'Images', 'Targets', 'P', 'R', 'mAP@0.5', 'F1')
76 | p, r, f1, mp, mr, map, mf1, t0, t1 = 0., 0., 0., 0., 0., 0., 0., 0., 0.
77 | loss = torch.zeros(3, device=device)
78 | jdict, stats, ap, ap_class = [], [], [], []
79 | for batch_i, (imgs, targets, paths, shapes) in enumerate(tqdm(dataloader, desc=s)):
80 | imgs = imgs.to(device).float() / 255.0 # uint8 to float32, 0 - 255 to 0.0 - 1.0
81 | targets = targets.to(device)
82 | nb, _, height, width = imgs.shape # batch size, channels, height, width
83 | whwh = torch.Tensor([width, height, width, height]).to(device)
84 |
85 | # Plot images with bounding boxes
86 | f = 'test_batch%g.png' % batch_i # filename
87 | if batch_i < 1 and not os.path.exists(f):
88 | plot_images(imgs=imgs, targets=targets, paths=paths, fname=f)
89 |
90 | # Disable gradients
91 | with torch.no_grad():
92 | # Run model
93 | t = torch_utils.time_synchronized()
94 | inf_out, train_out = model(imgs, augment=augment) # inference and training outputs
95 | t0 += torch_utils.time_synchronized() - t
96 |
97 | # Compute loss
98 | if hasattr(model, 'hyp'): # if model has loss hyperparameters
99 | loss += compute_loss(train_out, targets, model)[1][:3] # GIoU, obj, cls
100 |
101 | # Run NMS
102 | t = torch_utils.time_synchronized()
103 | output = non_max_suppression(inf_out, conf_thres=conf_thres, iou_thres=iou_thres) # nms
104 | t1 += torch_utils.time_synchronized() - t
105 |
106 | # Statistics per image
107 | for si, pred in enumerate(output):
108 | labels = targets[targets[:, 0] == si, 1:]
109 | nl = len(labels)
110 | tcls = labels[:, 0].tolist() if nl else [] # target class
111 | seen += 1
112 |
113 | if pred is None:
114 | if nl:
115 | stats.append((torch.zeros(0, niou, dtype=torch.bool), torch.Tensor(), torch.Tensor(), tcls))
116 | continue
117 |
118 | # Append to text file
119 | # with open('test.txt', 'a') as file:
120 | # [file.write('%11.5g' * 7 % tuple(x) + '\n') for x in pred]
121 |
122 | # Clip boxes to image bounds
123 | clip_coords(pred, (height, width))
124 |
125 | # Append to pycocotools JSON dictionary
126 | if save_json:
127 | # [{"image_id": 42, "category_id": 18, "bbox": [258.15, 41.29, 348.26, 243.78], "score": 0.236}, ...
128 | image_id = int(Path(paths[si]).stem.split('_')[-1])
129 | box = pred[:, :4].clone() # xyxy
130 | scale_coords(imgs[si].shape[1:], box, shapes[si][0], shapes[si][1]) # to original shape
131 | box = xyxy2xywh(box) # xywh
132 | box[:, :2] -= box[:, 2:] / 2 # xy center to top-left corner
133 | for p, b in zip(pred.tolist(), box.tolist()):
134 | jdict.append({'image_id': image_id,
135 | 'category_id': coco91class[int(p[5])],
136 | 'bbox': [round(x, 3) for x in b],
137 | 'score': round(p[4], 5)})
138 |
139 | # Assign all predictions as incorrect
140 | correct = torch.zeros(pred.shape[0], niou, dtype=torch.bool, device=device)
141 | if nl:
142 | detected = [] # target indices
143 | tcls_tensor = labels[:, 0]
144 |
145 | # target boxes
146 | tbox = xywh2xyxy(labels[:, 1:5]) * whwh
147 |
148 | # Per target class
149 | for cls in torch.unique(tcls_tensor):
150 |                     ti = (cls == tcls_tensor).nonzero().view(-1)  # target indices
151 |                     pi = (cls == pred[:, 5]).nonzero().view(-1)  # prediction indices
152 |
153 | # Search for detections
154 | if pi.shape[0]:
155 | # Prediction to target ious
156 | ious, i = box_iou(pred[pi, :4], tbox[ti]).max(1) # best ious, indices
157 |
158 | # Append detections
159 | for j in (ious > iouv[0]).nonzero():
160 | d = ti[i[j]] # detected target
161 | if d not in detected:
162 | detected.append(d)
163 | correct[pi[j]] = ious[j] > iouv # iou_thres is 1xn
164 | if len(detected) == nl: # all targets already located in image
165 | break
166 |
167 | # Append statistics (correct, conf, pcls, tcls)
168 | stats.append((correct.cpu(), pred[:, 4].cpu(), pred[:, 5].cpu(), tcls))
169 |
170 | # Compute statistics
171 | stats = [np.concatenate(x, 0) for x in zip(*stats)] # to numpy
172 | if len(stats):
173 | p, r, ap, f1, ap_class = ap_per_class(*stats)
174 | if niou > 1:
175 | p, r, ap, f1 = p[:, 0], r[:, 0], ap.mean(1), ap[:, 0] # [P, R, AP@0.5:0.95, AP@0.5]
176 | mp, mr, map, mf1 = p.mean(), r.mean(), ap.mean(), f1.mean()
177 | nt = np.bincount(stats[3].astype(np.int64), minlength=nc) # number of targets per class
178 | else:
179 | nt = torch.zeros(1)
180 |
181 | # Print results
182 | pf = '%20s' + '%10.3g' * 6 # print format
183 | print(pf % ('all', seen, nt.sum(), mp, mr, map, mf1))
184 |
185 | # Print results per class
186 | if verbose and nc > 1 and len(stats):
187 | for i, c in enumerate(ap_class):
188 | print(pf % (names[c], seen, nt[c], p[i], r[i], ap[i], f1[i]))
189 |
190 | # Print speeds
191 | if verbose or save_json:
192 | t = tuple(x / seen * 1E3 for x in (t0, t1, t0 + t1)) + (img_size, img_size, batch_size) # tuple
193 | print('Speed: %.1f/%.1f/%.1f ms inference/NMS/total per %gx%g image at batch-size %g' % t)
194 |
195 | # Save JSON
196 | if save_json and map and len(jdict):
197 | print('\nCOCO mAP with pycocotools...')
198 | imgIds = [int(Path(x).stem.split('_')[-1]) for x in dataloader.dataset.img_files]
199 | with open('results.json', 'w') as file:
200 | json.dump(jdict, file)
201 |
202 |         try:
203 |             from pycocotools.coco import COCO
204 |             from pycocotools.cocoeval import COCOeval
205 |
206 |             # https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocoEvalDemo.ipynb
207 |             cocoGt = COCO(glob.glob('/mnt/sda/yolact/data/coco/annotations/instances_val2014.json')[0])  # initialize COCO ground truth api
208 |             cocoDt = cocoGt.loadRes('results.json')  # initialize COCO pred api
209 |
210 |             cocoEval = COCOeval(cocoGt, cocoDt, 'bbox')
211 |             cocoEval.params.imgIds = imgIds  # [:32] # only evaluate these images
212 |             cocoEval.evaluate()
213 |             cocoEval.accumulate()
214 |             cocoEval.summarize()
215 |             # mf1, map = cocoEval.stats[:2]  # update to pycocotools results (mAP@0.5:0.95, mAP@0.5)
216 |         except:
217 |             print('WARNING: missing pycocotools package or COCO annotations, cannot compute official COCO mAP. See requirements.txt.')
218 |
219 | # Return results
220 | maps = np.zeros(nc) + map
221 | for i, c in enumerate(ap_class):
222 | maps[c] = ap[i]
223 | return (mp, mr, map, mf1, *(loss.cpu() / len(dataloader)).tolist()), maps
224 |
225 |
226 | if __name__ == '__main__':
227 | parser = argparse.ArgumentParser(prog='test.py')
228 | parser.add_argument('--cfg', type=str, default='cfg/yolov3-spp.cfg', help='*.cfg path')
229 | parser.add_argument('--data', type=str, default='data/coco2014.data', help='*.data path')
230 | parser.add_argument('--weights', type=str, default='weights/yolov3-spp-ultralytics.pt', help='weights path')
231 | parser.add_argument('--batch-size', type=int, default=4, help='size of each image batch')
232 | parser.add_argument('--img-size', type=int, default=608, help='inference size (pixels)')
233 | parser.add_argument('--conf-thres', type=float, default=0.001, help='object confidence threshold')
234 | parser.add_argument('--iou-thres', type=float, default=0.6, help='IOU threshold for NMS')
235 | parser.add_argument('--save-json', action='store_true', help='save a cocoapi-compatible JSON results file')
236 | parser.add_argument('--task', default='test', help="'test', 'study', 'benchmark'")
237 | parser.add_argument('--device', default='', help='device id (i.e. 0 or 0,1) or cpu')
238 | parser.add_argument('--single-cls', action='store_true', help='train as single-class dataset')
239 | parser.add_argument('--augment', action='store_true', help='augmented inference')
240 | opt = parser.parse_args()
241 | opt.save_json = opt.save_json or any([x in opt.data for x in ['coco.data', 'coco2014.data', 'coco2017.data']])
242 | print(opt)
243 |
244 | # task = 'test', 'study', 'benchmark'
245 | if opt.task == 'test': # (default) test normally
246 | test(opt.cfg,
247 | opt.data,
248 | opt.weights,
249 | opt.batch_size,
250 | opt.img_size,
251 | opt.conf_thres,
252 | opt.iou_thres,
253 | opt.save_json,
254 | opt.single_cls,
255 | opt.augment)
256 |
257 |     elif opt.task == 'benchmark':  # mAPs at 320-608 at iou-thres 0.5 and 0.7
258 | y = []
259 | for i in [320, 416, 512, 608]: # img-size
260 | for j in [0.5, 0.7]: # iou-thres
261 | t = time.time()
262 | r = test(opt.cfg, opt.data, opt.weights, opt.batch_size, i, opt.conf_thres, j, opt.save_json)[0]
263 | y.append(r + (time.time() - t,))
264 |         np.savetxt('benchmark.txt', y, fmt='%10.4g')  # y = np.loadtxt('benchmark.txt')
265 |
266 | elif opt.task == 'study': # Parameter study
267 | y = []
268 | x = np.arange(0.4, 0.9, 0.05) # iou-thres
269 | for i in x:
270 | t = time.time()
271 | r = test(opt.cfg, opt.data, opt.weights, opt.batch_size, opt.img_size, opt.conf_thres, i, opt.save_json)[0]
272 | y.append(r + (time.time() - t,))
273 | np.savetxt('study.txt', y, fmt='%10.4g') # y = np.loadtxt('study.txt')
274 |
275 | # Plot
276 | fig, ax = plt.subplots(3, 1, figsize=(6, 6))
277 | y = np.stack(y, 0)
278 | ax[0].plot(x, y[:, 2], marker='.', label='mAP@0.5')
279 | ax[0].set_ylabel('mAP')
280 | ax[1].plot(x, y[:, 3], marker='.', label='mAP@0.5:0.95')
281 | ax[1].set_ylabel('mAP')
282 | ax[2].plot(x, y[:, -1], marker='.', label='time')
283 | ax[2].set_ylabel('time (s)')
284 | for i in range(3):
285 | ax[i].legend()
286 | ax[i].set_xlabel('iou_thr')
287 | fig.tight_layout()
288 | plt.savefig('study.jpg', dpi=200)
289 |
--------------------------------------------------------------------------------
/cfg/yolov3-spp-pan-scale.cfg:
--------------------------------------------------------------------------------
1 | [net]
2 | # Testing
3 | #batch=1
4 | #subdivisions=1
5 | # Training
6 | batch=64
7 | subdivisions=32
8 | width=544
9 | height=544
10 | channels=3
11 | momentum=0.9
12 | decay=0.0005
13 | angle=0
14 | saturation = 1.5
15 | exposure = 1.5
16 | hue=.1
17 |
18 |
19 | learning_rate=0.001
20 | burn_in=1000
21 | max_batches = 10000
22 |
23 | policy=steps
24 | steps=8000,9000
25 | scales=.1,.1
26 |
27 | #policy=sgdr
28 | #sgdr_cycle=1000
29 | #sgdr_mult=2
30 | #steps=4000,6000,8000,9000
31 | #scales=1, 1, 0.1, 0.1
32 |
33 | [convolutional]
34 | batch_normalize=1
35 | filters=32
36 | size=3
37 | stride=1
38 | pad=1
39 | activation=leaky
40 |
41 | # Downsample
42 |
43 | [convolutional]
44 | batch_normalize=1
45 | filters=64
46 | size=3
47 | stride=2
48 | pad=1
49 | activation=leaky
50 |
51 | [convolutional]
52 | batch_normalize=1
53 | filters=32
54 | size=1
55 | stride=1
56 | pad=1
57 | activation=leaky
58 |
59 | [convolutional]
60 | batch_normalize=1
61 | filters=64
62 | size=3
63 | stride=1
64 | pad=1
65 | activation=leaky
66 |
67 | [shortcut]
68 | from=-3
69 | activation=linear
70 |
71 | # Downsample
72 |
73 | [convolutional]
74 | batch_normalize=1
75 | filters=128
76 | size=3
77 | stride=2
78 | pad=1
79 | activation=leaky
80 |
81 | [convolutional]
82 | batch_normalize=1
83 | filters=64
84 | size=1
85 | stride=1
86 | pad=1
87 | activation=leaky
88 |
89 | [convolutional]
90 | batch_normalize=1
91 | filters=128
92 | size=3
93 | stride=1
94 | pad=1
95 | activation=leaky
96 |
97 | [shortcut]
98 | from=-3
99 | activation=linear
100 |
101 | [convolutional]
102 | batch_normalize=1
103 | filters=64
104 | size=1
105 | stride=1
106 | pad=1
107 | activation=leaky
108 |
109 | [convolutional]
110 | batch_normalize=1
111 | filters=128
112 | size=3
113 | stride=1
114 | pad=1
115 | activation=leaky
116 |
117 | [shortcut]
118 | from=-3
119 | activation=linear
120 |
121 | # Downsample
122 |
123 | [convolutional]
124 | batch_normalize=1
125 | filters=256
126 | size=3
127 | stride=2
128 | pad=1
129 | activation=leaky
130 |
131 | [convolutional]
132 | batch_normalize=1
133 | filters=128
134 | size=1
135 | stride=1
136 | pad=1
137 | activation=leaky
138 |
139 | [convolutional]
140 | batch_normalize=1
141 | filters=256
142 | size=3
143 | stride=1
144 | pad=1
145 | activation=leaky
146 |
147 | [shortcut]
148 | from=-3
149 | activation=linear
150 |
151 | [convolutional]
152 | batch_normalize=1
153 | filters=128
154 | size=1
155 | stride=1
156 | pad=1
157 | activation=leaky
158 |
159 | [convolutional]
160 | batch_normalize=1
161 | filters=256
162 | size=3
163 | stride=1
164 | pad=1
165 | activation=leaky
166 |
167 | [shortcut]
168 | from=-3
169 | activation=linear
170 |
171 | [convolutional]
172 | batch_normalize=1
173 | filters=128
174 | size=1
175 | stride=1
176 | pad=1
177 | activation=leaky
178 |
179 | [convolutional]
180 | batch_normalize=1
181 | filters=256
182 | size=3
183 | stride=1
184 | pad=1
185 | activation=leaky
186 |
187 | [shortcut]
188 | from=-3
189 | activation=linear
190 |
191 | [convolutional]
192 | batch_normalize=1
193 | filters=128
194 | size=1
195 | stride=1
196 | pad=1
197 | activation=leaky
198 |
199 | [convolutional]
200 | batch_normalize=1
201 | filters=256
202 | size=3
203 | stride=1
204 | pad=1
205 | activation=leaky
206 |
207 | [shortcut]
208 | from=-3
209 | activation=linear
210 |
211 |
212 | [convolutional]
213 | batch_normalize=1
214 | filters=128
215 | size=1
216 | stride=1
217 | pad=1
218 | activation=leaky
219 |
220 | [convolutional]
221 | batch_normalize=1
222 | filters=256
223 | size=3
224 | stride=1
225 | pad=1
226 | activation=leaky
227 |
228 | [shortcut]
229 | from=-3
230 | activation=linear
231 |
232 | [convolutional]
233 | batch_normalize=1
234 | filters=128
235 | size=1
236 | stride=1
237 | pad=1
238 | activation=leaky
239 |
240 | [convolutional]
241 | batch_normalize=1
242 | filters=256
243 | size=3
244 | stride=1
245 | pad=1
246 | activation=leaky
247 |
248 | [shortcut]
249 | from=-3
250 | activation=linear
251 |
252 | [convolutional]
253 | batch_normalize=1
254 | filters=128
255 | size=1
256 | stride=1
257 | pad=1
258 | activation=leaky
259 |
260 | [convolutional]
261 | batch_normalize=1
262 | filters=256
263 | size=3
264 | stride=1
265 | pad=1
266 | activation=leaky
267 |
268 | [shortcut]
269 | from=-3
270 | activation=linear
271 |
272 | [convolutional]
273 | batch_normalize=1
274 | filters=128
275 | size=1
276 | stride=1
277 | pad=1
278 | activation=leaky
279 |
280 | [convolutional]
281 | batch_normalize=1
282 | filters=256
283 | size=3
284 | stride=1
285 | pad=1
286 | activation=leaky
287 |
288 | [shortcut]
289 | from=-3
290 | activation=linear
291 |
292 | # Downsample
293 |
294 | [convolutional]
295 | batch_normalize=1
296 | filters=512
297 | size=3
298 | stride=2
299 | pad=1
300 | activation=leaky
301 |
302 | [convolutional]
303 | batch_normalize=1
304 | filters=256
305 | size=1
306 | stride=1
307 | pad=1
308 | activation=leaky
309 |
310 | [convolutional]
311 | batch_normalize=1
312 | filters=512
313 | size=3
314 | stride=1
315 | pad=1
316 | activation=leaky
317 |
318 | [shortcut]
319 | from=-3
320 | activation=linear
321 |
322 |
323 | [convolutional]
324 | batch_normalize=1
325 | filters=256
326 | size=1
327 | stride=1
328 | pad=1
329 | activation=leaky
330 |
331 | [convolutional]
332 | batch_normalize=1
333 | filters=512
334 | size=3
335 | stride=1
336 | pad=1
337 | activation=leaky
338 |
339 | [shortcut]
340 | from=-3
341 | activation=linear
342 |
343 |
344 | [convolutional]
345 | batch_normalize=1
346 | filters=256
347 | size=1
348 | stride=1
349 | pad=1
350 | activation=leaky
351 |
352 | [convolutional]
353 | batch_normalize=1
354 | filters=512
355 | size=3
356 | stride=1
357 | pad=1
358 | activation=leaky
359 |
360 | [shortcut]
361 | from=-3
362 | activation=linear
363 |
364 |
365 | [convolutional]
366 | batch_normalize=1
367 | filters=256
368 | size=1
369 | stride=1
370 | pad=1
371 | activation=leaky
372 |
373 | [convolutional]
374 | batch_normalize=1
375 | filters=512
376 | size=3
377 | stride=1
378 | pad=1
379 | activation=leaky
380 |
381 | [shortcut]
382 | from=-3
383 | activation=linear
384 |
385 | [convolutional]
386 | batch_normalize=1
387 | filters=256
388 | size=1
389 | stride=1
390 | pad=1
391 | activation=leaky
392 |
393 | [convolutional]
394 | batch_normalize=1
395 | filters=512
396 | size=3
397 | stride=1
398 | pad=1
399 | activation=leaky
400 |
401 | [shortcut]
402 | from=-3
403 | activation=linear
404 |
405 |
406 | [convolutional]
407 | batch_normalize=1
408 | filters=256
409 | size=1
410 | stride=1
411 | pad=1
412 | activation=leaky
413 |
414 | [convolutional]
415 | batch_normalize=1
416 | filters=512
417 | size=3
418 | stride=1
419 | pad=1
420 | activation=leaky
421 |
422 | [shortcut]
423 | from=-3
424 | activation=linear
425 |
426 |
427 | [convolutional]
428 | batch_normalize=1
429 | filters=256
430 | size=1
431 | stride=1
432 | pad=1
433 | activation=leaky
434 |
435 | [convolutional]
436 | batch_normalize=1
437 | filters=512
438 | size=3
439 | stride=1
440 | pad=1
441 | activation=leaky
442 |
443 | [shortcut]
444 | from=-3
445 | activation=linear
446 |
447 | [convolutional]
448 | batch_normalize=1
449 | filters=256
450 | size=1
451 | stride=1
452 | pad=1
453 | activation=leaky
454 |
455 | [convolutional]
456 | batch_normalize=1
457 | filters=512
458 | size=3
459 | stride=1
460 | pad=1
461 | activation=leaky
462 |
463 | [shortcut]
464 | from=-3
465 | activation=linear
466 |
467 | # Downsample
468 |
469 | [convolutional]
470 | batch_normalize=1
471 | filters=1024
472 | size=3
473 | stride=2
474 | pad=1
475 | activation=leaky
476 |
477 | [convolutional]
478 | batch_normalize=1
479 | filters=512
480 | size=1
481 | stride=1
482 | pad=1
483 | activation=leaky
484 |
485 | [convolutional]
486 | batch_normalize=1
487 | filters=1024
488 | size=3
489 | stride=1
490 | pad=1
491 | activation=leaky
492 |
493 | [shortcut]
494 | from=-3
495 | activation=linear
496 |
497 | [convolutional]
498 | batch_normalize=1
499 | filters=512
500 | size=1
501 | stride=1
502 | pad=1
503 | activation=leaky
504 |
505 | [convolutional]
506 | batch_normalize=1
507 | filters=1024
508 | size=3
509 | stride=1
510 | pad=1
511 | activation=leaky
512 |
513 | [shortcut]
514 | from=-3
515 | activation=linear
516 |
517 | [convolutional]
518 | batch_normalize=1
519 | filters=512
520 | size=1
521 | stride=1
522 | pad=1
523 | activation=leaky
524 |
525 | [convolutional]
526 | batch_normalize=1
527 | filters=1024
528 | size=3
529 | stride=1
530 | pad=1
531 | activation=leaky
532 |
533 | [shortcut]
534 | from=-3
535 | activation=linear
536 |
537 | [convolutional]
538 | batch_normalize=1
539 | filters=512
540 | size=1
541 | stride=1
542 | pad=1
543 | activation=leaky
544 |
545 | [convolutional]
546 | batch_normalize=1
547 | filters=1024
548 | size=3
549 | stride=1
550 | pad=1
551 | activation=leaky
552 |
553 | [shortcut]
554 | from=-3
555 | activation=linear
556 |
557 | ######################
558 |
559 | [convolutional]
560 | batch_normalize=1
561 | filters=512
562 | size=1
563 | stride=1
564 | pad=1
565 | activation=leaky
566 |
567 | [convolutional]
568 | batch_normalize=1
569 | size=3
570 | stride=1
571 | pad=1
572 | filters=1024
573 | activation=leaky
574 |
575 | [convolutional]
576 | batch_normalize=1
577 | filters=512
578 | size=1
579 | stride=1
580 | pad=1
581 | activation=leaky
582 |
583 | ### SPP ###
584 | [maxpool]
585 | stride=1
586 | size=5
587 |
588 | [route]
589 | layers=-2
590 |
591 | [maxpool]
592 | stride=1
593 | size=9
594 |
595 | [route]
596 | layers=-4
597 |
598 | [maxpool]
599 | stride=1
600 | size=13
601 |
602 | [route]
603 | layers=-1,-3,-5,-6
604 |
605 | ### End SPP ###
606 |
607 | [convolutional]
608 | batch_normalize=1
609 | filters=512
610 | size=1
611 | stride=1
612 | pad=1
613 | activation=leaky
614 |
615 |
616 | [convolutional]
617 | batch_normalize=1
618 | size=3
619 | stride=1
620 | pad=1
621 | filters=1024
622 | activation=leaky
623 |
624 | [convolutional]
625 | batch_normalize=1
626 | filters=512
627 | size=1
628 | stride=1
629 | pad=1
630 | activation=leaky
631 |
632 |
633 |
634 | ########### to [yolo-3]
635 |
636 |
637 |
638 | [route]
639 | layers = -4
640 |
641 | [convolutional]
642 | batch_normalize=1
643 | filters=256
644 | size=1
645 | stride=1
646 | pad=1
647 | activation=leaky
648 |
649 | [upsample]
650 | stride=2
651 |
652 | [route]
653 | layers = -1, 61
654 |
655 |
656 |
657 | [convolutional]
658 | batch_normalize=1
659 | filters=256
660 | size=1
661 | stride=1
662 | pad=1
663 | activation=leaky
664 |
665 | [convolutional]
666 | batch_normalize=1
667 | size=3
668 | stride=1
669 | pad=1
670 | filters=512
671 | activation=leaky
672 |
673 | [convolutional]
674 | batch_normalize=1
675 | filters=256
676 | size=1
677 | stride=1
678 | pad=1
679 | activation=leaky
680 |
681 | [convolutional]
682 | batch_normalize=1
683 | size=3
684 | stride=1
685 | pad=1
686 | filters=512
687 | activation=leaky
688 |
689 | [convolutional]
690 | batch_normalize=1
691 | filters=256
692 | size=1
693 | stride=1
694 | pad=1
695 | activation=leaky
696 |
697 |
698 | ########### to [yolo-2]
699 |
700 |
701 |
702 |
703 | [route]
704 | layers = -4
705 |
706 | [convolutional]
707 | batch_normalize=1
708 | filters=128
709 | size=1
710 | stride=1
711 | pad=1
712 | activation=leaky
713 |
714 | [upsample]
715 | stride=2
716 |
717 | [route]
718 | layers = -1, 36
719 |
720 |
721 |
722 | [convolutional]
723 | batch_normalize=1
724 | filters=128
725 | size=1
726 | stride=1
727 | pad=1
728 | activation=leaky
729 |
730 | [convolutional]
731 | batch_normalize=1
732 | size=3
733 | stride=1
734 | pad=1
735 | filters=256
736 | activation=leaky
737 |
738 | [convolutional]
739 | batch_normalize=1
740 | filters=128
741 | size=1
742 | stride=1
743 | pad=1
744 | activation=leaky
745 |
746 | [convolutional]
747 | batch_normalize=1
748 | size=3
749 | stride=1
750 | pad=1
751 | filters=256
752 | activation=leaky
753 |
754 | [convolutional]
755 | batch_normalize=1
756 | filters=128
757 | size=1
758 | stride=1
759 | pad=1
760 | activation=leaky
761 |
762 |
763 |
764 | ########### to [yolo-1]
765 |
766 |
767 | ########### features of different layers
768 |
769 |
770 | [route]
771 | layers=1
772 |
773 | [reorg3d]
774 | stride=2
775 |
776 | [route]
777 | layers=5,-1
778 |
779 | [reorg3d]
780 | stride=2
781 |
782 | [route]
783 | layers=12,-1
784 |
785 | [reorg3d]
786 | stride=2
787 |
788 | [route]
789 | layers=37,-1
790 |
791 | [reorg3d]
792 | stride=2
793 |
794 | [route]
795 | layers=62,-1
796 |
797 |
798 |
799 | ########### [yolo-1]
800 |
801 | [convolutional]
802 | batch_normalize=1
803 | filters=128
804 | size=1
805 | stride=1
806 | pad=1
807 | activation=leaky
808 |
809 | [upsample]
810 | stride=4
811 |
812 | [route]
813 | layers = -1,-12
814 |
815 |
816 | [convolutional]
817 | batch_normalize=1
818 | size=3
819 | stride=1
820 | pad=1
821 | filters=256
822 | activation=leaky
823 |
824 | [convolutional]
825 | size=1
826 | stride=1
827 | pad=1
828 | filters=340
829 | activation=linear
830 |
831 |
832 | [yolo]
833 | mask = 0,1,2,3
834 | anchors = 8,8, 10,13, 16,30, 33,23, 32,32, 30,61, 62,45, 64,64, 59,119, 116,90, 156,198, 373,326
835 | classes=80
836 | num=12
837 | jitter=.3
838 | ignore_thresh = .7
839 | truth_thresh = 1
840 | scale_x_y = 1.05
841 | random=0
842 |
843 |
844 |
845 |
846 | ########### [yolo-2]
847 |
848 |
849 | [route]
850 | layers = -7
851 |
852 | [convolutional]
853 | batch_normalize=1
854 | filters=256
855 | size=1
856 | stride=1
857 | pad=1
858 | activation=leaky
859 |
860 | [upsample]
861 | stride=2
862 |
863 | [route]
864 | layers = -1,-28
865 |
866 |
867 | [convolutional]
868 | batch_normalize=1
869 | size=3
870 | stride=1
871 | pad=1
872 | filters=512
873 | activation=leaky
874 |
875 | [convolutional]
876 | size=1
877 | stride=1
878 | pad=1
879 | filters=340
880 | activation=linear
881 |
882 |
883 | [yolo]
884 | mask = 4,5,6,7
885 | anchors = 8,8, 10,13, 16,30, 33,23, 32,32, 30,61, 62,45, 64,64, 59,119, 116,90, 156,198, 373,326
886 | classes=80
887 | num=12
888 | jitter=.3
889 | ignore_thresh = .7
890 | truth_thresh = 1
891 | scale_x_y = 1.1
892 | random=0
893 |
894 |
895 |
896 | ########### [yolo-3]
897 |
898 | [route]
899 | layers = -14
900 |
901 | [convolutional]
902 | batch_normalize=1
903 | filters=512
904 | size=1
905 | stride=1
906 | pad=1
907 | activation=leaky
908 |
909 | [route]
910 | layers = -1,-43
911 |
912 | [convolutional]
913 | batch_normalize=1
914 | size=3
915 | stride=1
916 | pad=1
917 | filters=1024
918 | activation=leaky
919 |
920 |
921 | [convolutional]
922 | size=1
923 | stride=1
924 | pad=1
925 | filters=340
926 | activation=linear
927 |
928 |
929 | [yolo]
930 | mask = 8,9,10,11
931 | anchors = 8,8, 10,13, 16,30, 33,23, 32,32, 30,61, 62,45, 59,119, 80,80, 116,90, 156,198, 373,326
932 | classes=80
933 | num=12
934 | jitter=.3
935 | ignore_thresh = .7
936 | truth_thresh = 1
937 | scale_x_y = 1.2
938 | random=0
939 |
--------------------------------------------------------------------------------
/cfg/csresnext50-panet-spp.cfg:
--------------------------------------------------------------------------------
1 | [net]
2 | # Testing
3 | #batch=1
4 | #subdivisions=1
5 | # Training
6 | batch=64
7 | subdivisions=16
8 | width=416
9 | height=416
10 | channels=3
11 | momentum=0.9
12 | decay=0.0005
13 | angle=0
14 | saturation = 1.5
15 | exposure = 1.5
16 | hue=.1
17 |
18 | learning_rate=0.001
19 | burn_in=1000
20 | max_batches = 500500
21 | policy=steps
22 | steps=400000,450000
23 | scales=.1,.1
24 |
25 | #19:104x104 38:52x52 65:26x26 80:13x13 for 416
26 |
27 | [convolutional]
28 | batch_normalize=1
29 | filters=64
30 | size=7
31 | stride=2
32 | pad=1
33 | activation=leaky
34 |
35 | [maxpool]
36 | size=2
37 | stride=2
38 |
39 | [convolutional]
40 | batch_normalize=1
41 | filters=128
42 | size=1
43 | stride=1
44 | pad=1
45 | activation=leaky
46 |
47 | [route]
48 | layers = -2
49 |
50 | [convolutional]
51 | batch_normalize=1
52 | filters=64
53 | size=1
54 | stride=1
55 | pad=1
56 | activation=leaky
57 |
58 | # 1-1
59 |
60 | [convolutional]
61 | batch_normalize=1
62 | filters=128
63 | size=1
64 | stride=1
65 | pad=1
66 | activation=leaky
67 |
68 | [convolutional]
69 | batch_normalize=1
70 | filters=128
71 | size=3
72 | groups=32
73 | stride=1
74 | pad=1
75 | activation=leaky
76 |
77 | [convolutional]
78 | batch_normalize=1
79 | filters=64
80 | size=1
81 | stride=1
82 | pad=1
83 | activation=linear
84 |
85 | [shortcut]
86 | from=-4
87 | activation=leaky
88 |
89 | # 1-2
90 |
91 | [convolutional]
92 | batch_normalize=1
93 | filters=128
94 | size=1
95 | stride=1
96 | pad=1
97 | activation=leaky
98 |
99 | [convolutional]
100 | batch_normalize=1
101 | filters=128
102 | size=3
103 | groups=32
104 | stride=1
105 | pad=1
106 | activation=leaky
107 |
108 | [convolutional]
109 | batch_normalize=1
110 | filters=64
111 | size=1
112 | stride=1
113 | pad=1
114 | activation=linear
115 |
116 | [shortcut]
117 | from=-4
118 | activation=leaky
119 |
120 | # 1-3
121 |
122 | [convolutional]
123 | batch_normalize=1
124 | filters=128
125 | size=1
126 | stride=1
127 | pad=1
128 | activation=leaky
129 |
130 | [convolutional]
131 | batch_normalize=1
132 | filters=128
133 | size=3
134 | groups=32
135 | stride=1
136 | pad=1
137 | activation=leaky
138 |
139 | [convolutional]
140 | batch_normalize=1
141 | filters=64
142 | size=1
143 | stride=1
144 | pad=1
145 | activation=linear
146 |
147 | [shortcut]
148 | from=-4
149 | activation=leaky
150 |
151 | # 1-T
152 |
153 | [convolutional]
154 | batch_normalize=1
155 | filters=128
156 | size=1
157 | stride=1
158 | pad=1
159 | activation=leaky
160 |
161 | [route]
162 | layers = -1,-16
163 |
164 | [convolutional]
165 | batch_normalize=1
166 | filters=256
167 | size=1
168 | stride=1
169 | pad=1
170 | activation=leaky
171 |
172 | [convolutional]
173 | batch_normalize=1
174 | filters=256
175 | size=3
176 | groups=32
177 | stride=2
178 | pad=1
179 | activation=leaky
180 |
181 | [convolutional]
182 | batch_normalize=1
183 | filters=256
184 | size=1
185 | stride=1
186 | pad=1
187 | activation=linear
188 |
189 | [route]
190 | layers = -2
191 |
192 | [convolutional]
193 | batch_normalize=1
194 | filters=256
195 | size=1
196 | stride=1
197 | pad=1
198 | activation=linear
199 |
200 | # 2-1
201 |
202 | [convolutional]
203 | batch_normalize=1
204 | filters=256
205 | size=1
206 | stride=1
207 | pad=1
208 | activation=leaky
209 |
210 | [convolutional]
211 | batch_normalize=1
212 | filters=256
213 | size=3
214 | groups=32
215 | stride=1
216 | pad=1
217 | activation=leaky
218 |
219 | [convolutional]
220 | batch_normalize=1
221 | filters=256
222 | size=1
223 | stride=1
224 | pad=1
225 | activation=linear
226 |
227 | [shortcut]
228 | from=-4
229 | activation=leaky
230 |
231 | # 2-2
232 |
233 | [convolutional]
234 | batch_normalize=1
235 | filters=256
236 | size=1
237 | stride=1
238 | pad=1
239 | activation=leaky
240 |
241 | [convolutional]
242 | batch_normalize=1
243 | filters=256
244 | size=3
245 | groups=32
246 | stride=1
247 | pad=1
248 | activation=leaky
249 |
250 | [convolutional]
251 | batch_normalize=1
252 | filters=256
253 | size=1
254 | stride=1
255 | pad=1
256 | activation=linear
257 |
258 | [shortcut]
259 | from=-4
260 | activation=leaky
261 |
262 | # 2-3
263 |
264 | [convolutional]
265 | batch_normalize=1
266 | filters=256
267 | size=1
268 | stride=1
269 | pad=1
270 | activation=leaky
271 |
272 | [convolutional]
273 | batch_normalize=1
274 | filters=256
275 | size=3
276 | groups=32
277 | stride=1
278 | pad=1
279 | activation=leaky
280 |
281 | [convolutional]
282 | batch_normalize=1
283 | filters=256
284 | size=1
285 | stride=1
286 | pad=1
287 | activation=linear
288 |
289 | [shortcut]
290 | from=-4
291 | activation=leaky
292 |
293 | # 2-T
294 |
295 | [convolutional]
296 | batch_normalize=1
297 | filters=256
298 | size=1
299 | stride=1
300 | pad=1
301 | activation=leaky
302 |
303 | [route]
304 | layers = -1,-16
305 |
306 | [convolutional]
307 | batch_normalize=1
308 | filters=512
309 | size=1
310 | stride=1
311 | pad=1
312 | activation=leaky
313 |
314 | [convolutional]
315 | batch_normalize=1
316 | filters=512
317 | size=3
318 | groups=32
319 | stride=2
320 | pad=1
321 | activation=leaky
322 |
323 | [convolutional]
324 | batch_normalize=1
325 | filters=512
326 | size=1
327 | stride=1
328 | pad=1
329 | activation=linear
330 |
331 | [route]
332 | layers = -2
333 |
334 | [convolutional]
335 | batch_normalize=1
336 | filters=512
337 | size=1
338 | stride=1
339 | pad=1
340 | activation=linear
341 |
342 | # 3-1
343 |
344 | [convolutional]
345 | batch_normalize=1
346 | filters=512
347 | size=1
348 | stride=1
349 | pad=1
350 | activation=leaky
351 |
352 | [convolutional]
353 | batch_normalize=1
354 | filters=512
355 | size=3
356 | groups=32
357 | stride=1
358 | pad=1
359 | activation=leaky
360 |
361 | [convolutional]
362 | batch_normalize=1
363 | filters=512
364 | size=1
365 | stride=1
366 | pad=1
367 | activation=linear
368 |
369 | [shortcut]
370 | from=-4
371 | activation=leaky
372 |
373 | # 3-2
374 |
375 | [convolutional]
376 | batch_normalize=1
377 | filters=512
378 | size=1
379 | stride=1
380 | pad=1
381 | activation=leaky
382 |
383 | [convolutional]
384 | batch_normalize=1
385 | filters=512
386 | size=3
387 | groups=32
388 | stride=1
389 | pad=1
390 | activation=leaky
391 |
392 | [convolutional]
393 | batch_normalize=1
394 | filters=512
395 | size=1
396 | stride=1
397 | pad=1
398 | activation=linear
399 |
400 | [shortcut]
401 | from=-4
402 | activation=leaky
403 |
404 | # 3-3
405 |
406 | [convolutional]
407 | batch_normalize=1
408 | filters=512
409 | size=1
410 | stride=1
411 | pad=1
412 | activation=leaky
413 |
414 | [convolutional]
415 | batch_normalize=1
416 | filters=512
417 | size=3
418 | groups=32
419 | stride=1
420 | pad=1
421 | activation=leaky
422 |
423 | [convolutional]
424 | batch_normalize=1
425 | filters=512
426 | size=1
427 | stride=1
428 | pad=1
429 | activation=linear
430 |
431 | [shortcut]
432 | from=-4
433 | activation=leaky
434 |
435 | # 3-4
436 |
437 | [convolutional]
438 | batch_normalize=1
439 | filters=512
440 | size=1
441 | stride=1
442 | pad=1
443 | activation=leaky
444 |
445 | [convolutional]
446 | batch_normalize=1
447 | filters=512
448 | size=3
449 | groups=32
450 | stride=1
451 | pad=1
452 | activation=leaky
453 |
454 | [convolutional]
455 | batch_normalize=1
456 | filters=512
457 | size=1
458 | stride=1
459 | pad=1
460 | activation=linear
461 |
462 | [shortcut]
463 | from=-4
464 | activation=leaky
465 |
466 | # 3-5
467 |
468 | [convolutional]
469 | batch_normalize=1
470 | filters=512
471 | size=1
472 | stride=1
473 | pad=1
474 | activation=leaky
475 |
476 | [convolutional]
477 | batch_normalize=1
478 | filters=512
479 | size=3
480 | groups=32
481 | stride=1
482 | pad=1
483 | activation=leaky
484 |
485 | [convolutional]
486 | batch_normalize=1
487 | filters=512
488 | size=1
489 | stride=1
490 | pad=1
491 | activation=linear
492 |
493 | [shortcut]
494 | from=-4
495 | activation=leaky
496 |
497 | # 3-T
498 |
499 | [convolutional]
500 | batch_normalize=1
501 | filters=512
502 | size=1
503 | stride=1
504 | pad=1
505 | activation=leaky
506 |
507 | [route]
508 | layers = -1,-24
509 |
510 | [convolutional]
511 | batch_normalize=1
512 | filters=1024
513 | size=1
514 | stride=1
515 | pad=1
516 | activation=leaky
517 |
518 | [convolutional]
519 | batch_normalize=1
520 | filters=1024
521 | size=3
522 | groups=32
523 | stride=2
524 | pad=1
525 | activation=leaky
526 |
527 | [convolutional]
528 | batch_normalize=1
529 | filters=1024
530 | size=1
531 | stride=1
532 | pad=1
533 | activation=leaky
534 |
535 | [route]
536 | layers = -2
537 |
538 | [convolutional]
539 | batch_normalize=1
540 | filters=1024
541 | size=1
542 | stride=1
543 | pad=1
544 | activation=leaky
545 |
546 | # 4-1
547 |
548 | [convolutional]
549 | batch_normalize=1
550 | filters=1024
551 | size=1
552 | stride=1
553 | pad=1
554 | activation=leaky
555 |
556 | [convolutional]
557 | batch_normalize=1
558 | filters=1024
559 | size=3
560 | groups=32
561 | stride=1
562 | pad=1
563 | activation=leaky
564 |
565 | [convolutional]
566 | batch_normalize=1
567 | filters=1024
568 | size=1
569 | stride=1
570 | pad=1
571 | activation=linear
572 |
573 | [shortcut]
574 | from=-4
575 | activation=leaky
576 |
577 | # 4-2
578 |
579 | [convolutional]
580 | batch_normalize=1
581 | filters=1024
582 | size=1
583 | stride=1
584 | pad=1
585 | activation=leaky
586 |
587 | [convolutional]
588 | batch_normalize=1
589 | filters=1024
590 | size=3
591 | groups=32
592 | stride=1
593 | pad=1
594 | activation=leaky
595 |
596 | [convolutional]
597 | batch_normalize=1
598 | filters=1024
599 | size=1
600 | stride=1
601 | pad=1
602 | activation=linear
603 |
604 | [shortcut]
605 | from=-4
606 | activation=leaky
607 |
608 | # 4-T
609 |
610 | [convolutional]
611 | batch_normalize=1
612 | filters=1024
613 | size=1
614 | stride=1
615 | pad=1
616 | activation=leaky
617 |
618 | [route]
619 | layers = -1,-12
620 |
621 | [convolutional]
622 | batch_normalize=1
623 | filters=2048
624 | size=1
625 | stride=1
626 | pad=1
627 | activation=leaky
628 |
629 | ##########################
630 |
631 | [convolutional]
632 | batch_normalize=1
633 | filters=512
634 | size=1
635 | stride=1
636 | pad=1
637 | activation=leaky
638 |
639 | [convolutional]
640 | batch_normalize=1
641 | size=3
642 | stride=1
643 | pad=1
644 | filters=1024
645 | activation=leaky
646 |
647 | [convolutional]
648 | batch_normalize=1
649 | filters=512
650 | size=1
651 | stride=1
652 | pad=1
653 | activation=leaky
654 |
655 | ### SPP ###
656 | [maxpool]
657 | stride=1
658 | size=5
659 |
660 | [route]
661 | layers=-2
662 |
663 | [maxpool]
664 | stride=1
665 | size=9
666 |
667 | [route]
668 | layers=-4
669 |
670 | [maxpool]
671 | stride=1
672 | size=13
673 |
674 | [route]
675 | layers=-1,-3,-5,-6
676 | ### End SPP ###
677 |
678 | [convolutional]
679 | batch_normalize=1
680 | filters=512
681 | size=1
682 | stride=1
683 | pad=1
684 | activation=leaky
685 |
686 | [convolutional]
687 | batch_normalize=1
688 | size=3
689 | stride=1
690 | pad=1
691 | filters=1024
692 | activation=leaky
693 |
694 | [convolutional]
695 | batch_normalize=1
696 | filters=512
697 | size=1
698 | stride=1
699 | pad=1
700 | activation=leaky
701 |
702 | [convolutional]
703 | batch_normalize=1
704 | filters=256
705 | size=1
706 | stride=1
707 | pad=1
708 | activation=leaky
709 |
710 | [upsample]
711 | stride=2
712 |
713 | [route]
714 | layers = 65
715 |
716 | [convolutional]
717 | batch_normalize=1
718 | filters=256
719 | size=1
720 | stride=1
721 | pad=1
722 | activation=leaky
723 |
724 | [route]
725 | layers = -1, -3
726 |
727 | [convolutional]
728 | batch_normalize=1
729 | filters=256
730 | size=1
731 | stride=1
732 | pad=1
733 | activation=leaky
734 |
735 | [convolutional]
736 | batch_normalize=1
737 | size=3
738 | stride=1
739 | pad=1
740 | filters=512
741 | activation=leaky
742 |
743 | [convolutional]
744 | batch_normalize=1
745 | filters=256
746 | size=1
747 | stride=1
748 | pad=1
749 | activation=leaky
750 |
751 | [convolutional]
752 | batch_normalize=1
753 | size=3
754 | stride=1
755 | pad=1
756 | filters=512
757 | activation=leaky
758 |
759 | [convolutional]
760 | batch_normalize=1
761 | filters=256
762 | size=1
763 | stride=1
764 | pad=1
765 | activation=leaky
766 |
767 | [convolutional]
768 | batch_normalize=1
769 | filters=128
770 | size=1
771 | stride=1
772 | pad=1
773 | activation=leaky
774 |
775 | [upsample]
776 | stride=2
777 |
778 | [route]
779 | layers = 38
780 |
781 | [convolutional]
782 | batch_normalize=1
783 | filters=128
784 | size=1
785 | stride=1
786 | pad=1
787 | activation=leaky
788 |
789 | [route]
790 | layers = -1, -3
791 |
792 | [convolutional]
793 | batch_normalize=1
794 | filters=128
795 | size=1
796 | stride=1
797 | pad=1
798 | activation=leaky
799 |
800 | [convolutional]
801 | batch_normalize=1
802 | size=3
803 | stride=1
804 | pad=1
805 | filters=256
806 | activation=leaky
807 |
808 | [convolutional]
809 | batch_normalize=1
810 | filters=128
811 | size=1
812 | stride=1
813 | pad=1
814 | activation=leaky
815 |
816 | [convolutional]
817 | batch_normalize=1
818 | size=3
819 | stride=1
820 | pad=1
821 | filters=256
822 | activation=leaky
823 |
824 | [convolutional]
825 | batch_normalize=1
826 | filters=128
827 | size=1
828 | stride=1
829 | pad=1
830 | activation=leaky
831 |
832 | ##########################
833 |
834 | [convolutional]
835 | batch_normalize=1
836 | size=3
837 | stride=1
838 | pad=1
839 | filters=256
840 | activation=leaky
841 |
842 | [convolutional]
843 | size=1
844 | stride=1
845 | pad=1
846 | filters=255
847 | activation=linear
848 |
849 |
850 | [yolo]
851 | mask = 0,1,2
852 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
853 | classes=80
854 | num=9
855 | jitter=.3
856 | ignore_thresh = .7
857 | truth_thresh = 1
858 | random=1
859 |
860 | [route]
861 | layers = -4
862 |
863 | [convolutional]
864 | batch_normalize=1
865 | size=3
866 | stride=2
867 | pad=1
868 | filters=256
869 | activation=leaky
870 |
871 | [route]
872 | layers = -1, -16
873 |
874 | [convolutional]
875 | batch_normalize=1
876 | filters=256
877 | size=1
878 | stride=1
879 | pad=1
880 | activation=leaky
881 |
882 | [convolutional]
883 | batch_normalize=1
884 | size=3
885 | stride=1
886 | pad=1
887 | filters=512
888 | activation=leaky
889 |
890 | [convolutional]
891 | batch_normalize=1
892 | filters=256
893 | size=1
894 | stride=1
895 | pad=1
896 | activation=leaky
897 |
898 | [convolutional]
899 | batch_normalize=1
900 | size=3
901 | stride=1
902 | pad=1
903 | filters=512
904 | activation=leaky
905 |
906 | [convolutional]
907 | batch_normalize=1
908 | filters=256
909 | size=1
910 | stride=1
911 | pad=1
912 | activation=leaky
913 |
914 | [convolutional]
915 | batch_normalize=1
916 | size=3
917 | stride=1
918 | pad=1
919 | filters=512
920 | activation=leaky
921 |
922 | [convolutional]
923 | size=1
924 | stride=1
925 | pad=1
926 | filters=255
927 | activation=linear
928 |
929 |
930 | [yolo]
931 | mask = 3,4,5
932 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
933 | classes=80
934 | num=9
935 | jitter=.3
936 | ignore_thresh = .7
937 | truth_thresh = 1
938 | random=1
939 |
940 | [route]
941 | layers = -4
942 |
943 | [convolutional]
944 | batch_normalize=1
945 | size=3
946 | stride=2
947 | pad=1
948 | filters=512
949 | activation=leaky
950 |
951 | [route]
952 | layers = -1, -37
953 |
954 | [convolutional]
955 | batch_normalize=1
956 | filters=512
957 | size=1
958 | stride=1
959 | pad=1
960 | activation=leaky
961 |
962 | [convolutional]
963 | batch_normalize=1
964 | size=3
965 | stride=1
966 | pad=1
967 | filters=1024
968 | activation=leaky
969 |
970 | [convolutional]
971 | batch_normalize=1
972 | filters=512
973 | size=1
974 | stride=1
975 | pad=1
976 | activation=leaky
977 |
978 | [convolutional]
979 | batch_normalize=1
980 | size=3
981 | stride=1
982 | pad=1
983 | filters=1024
984 | activation=leaky
985 |
986 | [convolutional]
987 | batch_normalize=1
988 | filters=512
989 | size=1
990 | stride=1
991 | pad=1
992 | activation=leaky
993 |
994 | [convolutional]
995 | batch_normalize=1
996 | size=3
997 | stride=1
998 | pad=1
999 | filters=1024
1000 | activation=leaky
1001 |
1002 | [convolutional]
1003 | size=1
1004 | stride=1
1005 | pad=1
1006 | filters=255
1007 | activation=linear
1008 |
1009 |
1010 | [yolo]
1011 | mask = 6,7,8
1012 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
1013 | classes=80
1014 | num=9
1015 | jitter=.3
1016 | ignore_thresh = .7
1017 | truth_thresh = 1
1018 | random=1
1019 |
--------------------------------------------------------------------------------