├── utils ├── init.py ├── parse_config.py ├── torch_utils.py ├── google_utils.py ├── gcp.sh └── adabound.py ├── plot_results.py ├── data ├── coco_1img.txt ├── samples │ ├── bus.jpg │ └── zidane.jpg ├── coco.data ├── coco_1k5k.data ├── coco_16img.data ├── coco_1cls.data ├── coco_1img.data ├── coco_32img.data ├── coco_64img.data ├── coco_500val.data ├── coco_1000img.data ├── coco_1000val.data ├── coco_1cls.txt ├── get_coco_dataset_gdrive.sh ├── coco_16img.txt ├── coco.names ├── coco_paper.names ├── get_coco_dataset.sh ├── coco_32img.txt └── coco_64img.txt ├── convert_pt_weights.py ├── requirements.txt ├── .github └── ISSUE_TEMPLATE │ ├── feature_request.md │ └── bug_report.md ├── weights └── download_yolov3_weights.sh ├── cfg ├── yolov3-tiny.cfg ├── yolov3-1cls.cfg ├── yolov3.cfg ├── yolov3s-3a320.cfg ├── yolov3-spp-1cls.cfg ├── yolov3-spp.cfg ├── yolov3s-9a320.cfg ├── yolov3s-18a320.cfg ├── yolov3s-30a320.cfg └── yolov3-spp-pan-scale.cfg ├── .gitignore ├── detect.py ├── test.py ├── README.md └── prune.py /utils/init.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /plot_results.py: -------------------------------------------------------------------------------- 1 | from utils import utils 2 | utils.plot_results() 3 | -------------------------------------------------------------------------------- /data/coco_1img.txt: -------------------------------------------------------------------------------- 1 | ../coco/images/val2014/COCO_val2014_000000581886.jpg 2 | -------------------------------------------------------------------------------- /data/samples/bus.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/erikguo/yolov3/HEAD/data/samples/bus.jpg -------------------------------------------------------------------------------- /data/samples/zidane.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/erikguo/yolov3/HEAD/data/samples/zidane.jpg -------------------------------------------------------------------------------- /convert_pt_weights.py: -------------------------------------------------------------------------------- 1 | from models import *; 2 | import sys 3 | 4 | print(sys.argv) 5 | convert(sys.argv[1], sys.argv[2]) 6 | -------------------------------------------------------------------------------- /data/coco.data: -------------------------------------------------------------------------------- 1 | classes=80 2 | train=../coco/trainvalno5k.txt 3 | valid=../coco/5k.txt 4 | names=data/coco.names 5 | backup=backup/ 6 | eval=coco 7 | -------------------------------------------------------------------------------- /data/coco_1k5k.data: -------------------------------------------------------------------------------- 1 | classes=80 2 | train=./data/coco_1000img.txt 3 | valid=./data/5k.txt 4 | names=data/coco.names 5 | backup=backup/ 6 | eval=coco 7 | -------------------------------------------------------------------------------- /data/coco_16img.data: -------------------------------------------------------------------------------- 1 | classes=80 2 | train=./data/coco_16img.txt 3 | valid=./data/coco_16img.txt 4 | names=data/coco.names 5 | backup=backup/ 6 | eval=coco 7 | -------------------------------------------------------------------------------- /data/coco_1cls.data: 
-------------------------------------------------------------------------------- 1 | classes=1 2 | train=./data/coco_1cls.txt 3 | valid=./data/coco_1cls.txt 4 | names=data/coco.names 5 | backup=backup/ 6 | eval=coco 7 | -------------------------------------------------------------------------------- /data/coco_1img.data: -------------------------------------------------------------------------------- 1 | classes=80 2 | train=./data/coco_1img.txt 3 | valid=./data/coco_1img.txt 4 | names=data/coco.names 5 | backup=backup/ 6 | eval=coco 7 | -------------------------------------------------------------------------------- /data/coco_32img.data: -------------------------------------------------------------------------------- 1 | classes=80 2 | train=./data/coco_32img.txt 3 | valid=./data/coco_32img.txt 4 | names=data/coco.names 5 | backup=backup/ 6 | eval=coco 7 | -------------------------------------------------------------------------------- /data/coco_64img.data: -------------------------------------------------------------------------------- 1 | classes=80 2 | train=./data/coco_64img.txt 3 | valid=./data/coco_64img.txt 4 | names=data/coco.names 5 | backup=backup/ 6 | eval=coco 7 | -------------------------------------------------------------------------------- /data/coco_500val.data: -------------------------------------------------------------------------------- 1 | classes=80 2 | train=./data/coco_500img.txt 3 | valid=./data/coco_500val.txt 4 | names=data/coco.names 5 | backup=backup/ 6 | eval=coco 7 | -------------------------------------------------------------------------------- /data/coco_1000img.data: -------------------------------------------------------------------------------- 1 | classes=80 2 | train=./data/coco_1000img.txt 3 | valid=./data/coco_1000img.txt 4 | names=data/coco.names 5 | backup=backup/ 6 | eval=coco 7 | -------------------------------------------------------------------------------- /data/coco_1000val.data: -------------------------------------------------------------------------------- 1 | classes=80 2 | train=./data/coco_1000img.txt 3 | valid=./data/coco_1000val.txt 4 | names=data/coco.names 5 | backup=backup/ 6 | eval=coco 7 | -------------------------------------------------------------------------------- /data/coco_1cls.txt: -------------------------------------------------------------------------------- 1 | ../coco/images/val2014/COCO_val2014_000000013992.jpg 2 | ../coco/images/val2014/COCO_val2014_000000047226.jpg 3 | ../coco/images/val2014/COCO_val2014_000000050324.jpg 4 | ../coco/images/val2014/COCO_val2014_000000121497.jpg 5 | ../coco/images/val2014/COCO_val2014_000000001464.jpg 6 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # pip3 install -U -r requirements.txt 2 | numpy 3 | opencv-python 4 | torch >= 1.1.0 5 | matplotlib 6 | pycocotools 7 | tqdm 8 | tb-nightly 9 | future 10 | Pillow 11 | 12 | # Equivalent conda commands ---------------------------------------------------- 13 | # conda update -n base -c defaults conda 14 | # conda install -y -c anaconda future numpy opencv matplotlib tqdm pillow 15 | # conda install -y -c conda-forge scikit-image tensorboard pycocotools 16 | # conda install -y -c spyder-ide spyder-line-profiler 17 | # conda install pytorch torchvision -c pytorch 18 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: 
-------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /data/get_coco_dataset_gdrive.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # https://stackoverflow.com/questions/48133080/how-to-download-a-google-drive-url-via-curl-or-wget/48133859 3 | 4 | # Zip coco folder 5 | # zip -r coco.zip coco 6 | # tar -czvf coco.tar.gz coco 7 | 8 | # Set fileid and filename 9 | filename="coco.zip" 10 | fileid="1HaXkef9z6y5l4vUnCYgdmEAj61c6bfWO" # coco.zip 11 | 12 | # Download from Google Drive, accepting presented query 13 | curl -c ./cookie -s -L "https://drive.google.com/uc?export=download&id=${fileid}" > /dev/null 14 | curl -Lb ./cookie "https://drive.google.com/uc?export=download&confirm=`awk '/download/ {print $NF}' ./cookie`&id=${fileid}" -o ${filename} 15 | rm ./cookie 16 | 17 | # Unzip 18 | unzip -q ${filename} # for coco.zip 19 | # tar -xzf ${filename} # for coco.tar.gz 20 | -------------------------------------------------------------------------------- /weights/download_yolov3_weights.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # make '/weights' directory if it does not exist and cd into it 4 | mkdir -p weights && cd weights 5 | 6 | # copy darknet weight files, continue '-c' if partially downloaded 7 | wget -c https://pjreddie.com/media/files/yolov3.weights 8 | wget -c https://pjreddie.com/media/files/yolov3-tiny.weights 9 | wget -c https://pjreddie.com/media/files/yolov3-spp.weights 10 | 11 | # yolov3 pytorch weights 12 | # download from Google Drive: https://drive.google.com/drive/folders/1uxgUBemJVw9wZsdpboYbzUN4bcRhsuAI 13 | 14 | # darknet53 weights (first 75 layers only) 15 | wget -c https://pjreddie.com/media/files/darknet53.conv.74 16 | 17 | # yolov3-tiny weights from darknet (first 16 layers only) 18 | # ./darknet partial cfg/yolov3-tiny.cfg yolov3-tiny.weights yolov3-tiny.conv.15 15 19 | # mv yolov3-tiny.conv.15 ../ 20 | 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 
22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - Version [e.g. 22] 29 | 30 | **Smartphone (please complete the following information):** 31 | - Device: [e.g. iPhoneXS] 32 | - OS: [e.g. iOS8.1] 33 | - Version [e.g. 22] 34 | 35 | **Additional context** 36 | Add any other context about the problem here. 37 | -------------------------------------------------------------------------------- /data/coco_16img.txt: -------------------------------------------------------------------------------- 1 | ../coco/images/train2014/COCO_train2014_000000000009.jpg 2 | ../coco/images/train2014/COCO_train2014_000000000025.jpg 3 | ../coco/images/train2014/COCO_train2014_000000000030.jpg 4 | ../coco/images/train2014/COCO_train2014_000000000034.jpg 5 | ../coco/images/train2014/COCO_train2014_000000000036.jpg 6 | ../coco/images/train2014/COCO_train2014_000000000049.jpg 7 | ../coco/images/train2014/COCO_train2014_000000000061.jpg 8 | ../coco/images/train2014/COCO_train2014_000000000064.jpg 9 | ../coco/images/train2014/COCO_train2014_000000000071.jpg 10 | ../coco/images/train2014/COCO_train2014_000000000072.jpg 11 | ../coco/images/train2014/COCO_train2014_000000000077.jpg 12 | ../coco/images/train2014/COCO_train2014_000000000078.jpg 13 | ../coco/images/train2014/COCO_train2014_000000000081.jpg 14 | ../coco/images/train2014/COCO_train2014_000000000086.jpg 15 | ../coco/images/train2014/COCO_train2014_000000000089.jpg 16 | ../coco/images/train2014/COCO_train2014_000000000092.jpg 17 | -------------------------------------------------------------------------------- /data/coco.names: -------------------------------------------------------------------------------- 1 | person 2 | bicycle 3 | car 4 | motorcycle 5 | airplane 6 | bus 7 | train 8 | truck 9 | boat 10 | traffic light 11 | fire hydrant 12 | stop sign 13 | parking meter 14 | bench 15 | bird 16 | cat 17 | dog 18 | horse 19 | sheep 20 | cow 21 | elephant 22 | bear 23 | zebra 24 | giraffe 25 | backpack 26 | umbrella 27 | handbag 28 | tie 29 | suitcase 30 | frisbee 31 | skis 32 | snowboard 33 | sports ball 34 | kite 35 | baseball bat 36 | baseball glove 37 | skateboard 38 | surfboard 39 | tennis racket 40 | bottle 41 | wine glass 42 | cup 43 | fork 44 | knife 45 | spoon 46 | bowl 47 | banana 48 | apple 49 | sandwich 50 | orange 51 | broccoli 52 | carrot 53 | hot dog 54 | pizza 55 | donut 56 | cake 57 | chair 58 | couch 59 | potted plant 60 | bed 61 | dining table 62 | toilet 63 | tv 64 | laptop 65 | mouse 66 | remote 67 | keyboard 68 | cell phone 69 | microwave 70 | oven 71 | toaster 72 | sink 73 | refrigerator 74 | book 75 | clock 76 | vase 77 | scissors 78 | teddy bear 79 | hair drier 80 | toothbrush 81 | -------------------------------------------------------------------------------- /data/coco_paper.names: -------------------------------------------------------------------------------- 1 | person 2 | bicycle 3 | car 4 | motorcycle 5 | airplane 6 | bus 7 | train 8 | truck 9 | boat 10 | traffic light 11 | fire hydrant 12 | street sign 13 | stop sign 14 | parking meter 15 | bench 16 | bird 17 | cat 18 | dog 19 | horse 20 | sheep 21 | cow 22 | elephant 23 | bear 24 | zebra 25 | giraffe 26 | hat 27 | backpack 28 | umbrella 29 | shoe 30 | eye glasses 31 | handbag 32 | tie 33 | suitcase 34 | frisbee 35 | skis 36 | snowboard 37 | sports ball 38 | kite 39 | baseball bat 40 | baseball glove 41 | skateboard 42 | surfboard 43 | tennis 
racket 44 | bottle 45 | plate 46 | wine glass 47 | cup 48 | fork 49 | knife 50 | spoon 51 | bowl 52 | banana 53 | apple 54 | sandwich 55 | orange 56 | broccoli 57 | carrot 58 | hot dog 59 | pizza 60 | donut 61 | cake 62 | chair 63 | couch 64 | potted plant 65 | bed 66 | mirror 67 | dining table 68 | window 69 | desk 70 | toilet 71 | door 72 | tv 73 | laptop 74 | mouse 75 | remote 76 | keyboard 77 | cell phone 78 | microwave 79 | oven 80 | toaster 81 | sink 82 | refrigerator 83 | blender 84 | book 85 | clock 86 | vase 87 | scissors 88 | teddy bear 89 | hair drier 90 | toothbrush 91 | hair brush -------------------------------------------------------------------------------- /utils/parse_config.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def parse_model_cfg(path): 5 | # Parses the yolo-v3 layer configuration file and returns module definitions 6 | file = open(path, 'r') 7 | lines = file.read().split('\n') 8 | lines = [x for x in lines if x and not x.startswith('#')] 9 | lines = [x.rstrip().lstrip() for x in lines] # get rid of fringe whitespaces 10 | mdefs = [] # module definitions 11 | for line in lines: 12 | if line.startswith('['): # This marks the start of a new block 13 | mdefs.append({}) 14 | mdefs[-1]['type'] = line[1:-1].rstrip() 15 | if mdefs[-1]['type'] == 'convolutional': 16 | mdefs[-1]['batch_normalize'] = 0 # pre-populate with zeros (may be overwritten later) 17 | else: 18 | key, val = line.split("=") 19 | key = key.rstrip() 20 | 21 | if 'anchors' in key: 22 | mdefs[-1][key] = np.array([float(x) for x in val.split(',')]).reshape((-1, 2)) # np anchors 23 | else: 24 | mdefs[-1][key] = val.strip() 25 | 26 | return mdefs 27 | 28 | 29 | def parse_data_cfg(path): 30 | # Parses the data configuration file 31 | options = dict() 32 | with open(path, 'r') as fp: 33 | lines = fp.readlines() 34 | 35 | for line in lines: 36 | line = line.strip() 37 | if line == '' or line.startswith('#'): 38 | continue 39 | key, val = line.split('=') 40 | options[key.strip()] = val.strip() 41 | 42 | return options 43 | -------------------------------------------------------------------------------- /data/get_coco_dataset.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # CREDIT: https://github.com/pjreddie/darknet/tree/master/scripts/get_coco_dataset.sh 3 | 4 | # Clone COCO API 5 | git clone https://github.com/pdollar/coco && cd coco 6 | 7 | # Download Images 8 | mkdir images && cd images 9 | wget -c https://pjreddie.com/media/files/train2014.zip 10 | wget -c https://pjreddie.com/media/files/val2014.zip 11 | 12 | # Unzip 13 | unzip -q train2014.zip 14 | unzip -q val2014.zip 15 | 16 | # (optional) Delete zip files 17 | rm -rf *.zip 18 | 19 | cd .. 
20 | 21 | # Download COCO Metadata 22 | wget -c https://pjreddie.com/media/files/instances_train-val2014.zip 23 | wget -c https://pjreddie.com/media/files/coco/5k.part 24 | wget -c https://pjreddie.com/media/files/coco/trainvalno5k.part 25 | wget -c https://pjreddie.com/media/files/coco/labels.tgz 26 | tar xzf labels.tgz 27 | unzip -q instances_train-val2014.zip 28 | 29 | # Set Up Image Lists 30 | paste <(awk "{print \"$PWD\"}" <5k.part) 5k.part | tr -d '\t' > 5k.txt 31 | paste <(awk "{print \"$PWD\"}" trainvalno5k.txt 32 | 33 | # get xview training data 34 | # wget -O train_images.tgz 'https://d307kc0mrhucc3.cloudfront.net/train_images.tgz?Expires=1530124049&Signature=JrQoxipmsETvb7eQHCfDFUO-QEHJGAayUv0i-ParmS-1hn7hl9D~bzGuHWG82imEbZSLUARTtm0wOJ7EmYMGmG5PtLKz9H5qi6DjoSUuFc13NQ-~6yUhE~NfPaTnehUdUMCa3On2wl1h1ZtRG~0Jq1P-AJbpe~oQxbyBrs1KccaMa7FK4F4oMM6sMnNgoXx8-3O77kYw~uOpTMFmTaQdHln6EztW0Lx17i57kK3ogbSUpXgaUTqjHCRA1dWIl7PY1ngQnLslkLhZqmKcaL-BvWf0ZGjHxCDQBpnUjIlvMu5NasegkwD9Jjc0ClgTxsttSkmbapVqaVC8peR0pO619Q__&Key-Pair-Id=APKAIKGDJB5C3XUL2DXQ' 35 | # tar -xvzf train_images.tgz 36 | # sudo rm -rf train_images/._* 37 | # lastly convert each .tif to a .bmp for faster loading in cv2 38 | 39 | # ./coco/images/train2014/COCO_train2014_000000167126.jpg # corrupted image 40 | -------------------------------------------------------------------------------- /data/coco_32img.txt: -------------------------------------------------------------------------------- 1 | ../coco/images/train2014/COCO_train2014_000000000009.jpg 2 | ../coco/images/train2014/COCO_train2014_000000000025.jpg 3 | ../coco/images/train2014/COCO_train2014_000000000030.jpg 4 | ../coco/images/train2014/COCO_train2014_000000000034.jpg 5 | ../coco/images/train2014/COCO_train2014_000000000036.jpg 6 | ../coco/images/train2014/COCO_train2014_000000000049.jpg 7 | ../coco/images/train2014/COCO_train2014_000000000061.jpg 8 | ../coco/images/train2014/COCO_train2014_000000000064.jpg 9 | ../coco/images/train2014/COCO_train2014_000000000071.jpg 10 | ../coco/images/train2014/COCO_train2014_000000000072.jpg 11 | ../coco/images/train2014/COCO_train2014_000000000077.jpg 12 | ../coco/images/train2014/COCO_train2014_000000000078.jpg 13 | ../coco/images/train2014/COCO_train2014_000000000081.jpg 14 | ../coco/images/train2014/COCO_train2014_000000000086.jpg 15 | ../coco/images/train2014/COCO_train2014_000000000089.jpg 16 | ../coco/images/train2014/COCO_train2014_000000000092.jpg 17 | ../coco/images/train2014/COCO_train2014_000000000094.jpg 18 | ../coco/images/train2014/COCO_train2014_000000000109.jpg 19 | ../coco/images/train2014/COCO_train2014_000000000110.jpg 20 | ../coco/images/train2014/COCO_train2014_000000000113.jpg 21 | ../coco/images/train2014/COCO_train2014_000000000127.jpg 22 | ../coco/images/train2014/COCO_train2014_000000000138.jpg 23 | ../coco/images/train2014/COCO_train2014_000000000142.jpg 24 | ../coco/images/train2014/COCO_train2014_000000000144.jpg 25 | ../coco/images/train2014/COCO_train2014_000000000149.jpg 26 | ../coco/images/train2014/COCO_train2014_000000000151.jpg 27 | ../coco/images/train2014/COCO_train2014_000000000154.jpg 28 | ../coco/images/train2014/COCO_train2014_000000000165.jpg 29 | ../coco/images/train2014/COCO_train2014_000000000194.jpg 30 | ../coco/images/train2014/COCO_train2014_000000000201.jpg 31 | ../coco/images/train2014/COCO_train2014_000000000247.jpg 32 | ../coco/images/train2014/COCO_train2014_000000000260.jpg 33 | -------------------------------------------------------------------------------- /utils/torch_utils.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def init_seeds(seed=0): 5 | torch.cuda.empty_cache() 6 | torch.manual_seed(seed) 7 | torch.cuda.manual_seed(seed) 8 | torch.cuda.manual_seed_all(seed) 9 | # torch.backends.cudnn.deterministic = True # https://pytorch.org/docs/stable/notes/randomness.html 10 | 11 | 12 | def select_device(force_cpu=False, apex=False): 13 | # apex if mixed precision training https://github.com/NVIDIA/apex 14 | cuda = False if force_cpu else torch.cuda.is_available() 15 | device = torch.device('cuda:0' if cuda else 'cpu') 16 | 17 | if not cuda: 18 | print('Using CPU') 19 | if cuda: 20 | torch.backends.cudnn.benchmark = True # set False for reproducible results 21 | c = 1024 ** 2 # bytes to MB 22 | ng = torch.cuda.device_count() 23 | x = [torch.cuda.get_device_properties(i) for i in range(ng)] 24 | cuda_str = 'Using CUDA ' + ('Apex ' if apex else '') 25 | for i in range(0, ng): 26 | if i == 1: 27 | # torch.cuda.set_device(0) # OPTIONAL: Set GPU ID 28 | cuda_str = ' ' * len(cuda_str) 29 | print("%sdevice%g _CudaDeviceProperties(name='%s', total_memory=%dMB)" % 30 | (cuda_str, i, x[i].name, x[i].total_memory / c)) 31 | 32 | print('') # skip a line 33 | return device 34 | 35 | 36 | def fuse_conv_and_bn(conv, bn): 37 | # https://tehnokv.com/posts/fusing-batchnorm-and-conv/ 38 | with torch.no_grad(): 39 | # init 40 | fusedconv = torch.nn.Conv2d(conv.in_channels, 41 | conv.out_channels, 42 | kernel_size=conv.kernel_size, 43 | stride=conv.stride, 44 | padding=conv.padding, 45 | bias=True) 46 | 47 | # prepare filters 48 | w_conv = conv.weight.clone().view(conv.out_channels, -1) 49 | w_bn = torch.diag(bn.weight.div(torch.sqrt(bn.eps + bn.running_var))) 50 | fusedconv.weight.copy_(torch.mm(w_bn, w_conv).view(fusedconv.weight.size())) 51 | 52 | # prepare spatial bias 53 | if conv.bias is not None: 54 | b_conv = conv.bias 55 | else: 56 | b_conv = torch.zeros(conv.weight.size(0)) 57 | b_bn = bn.bias - bn.weight.mul(bn.running_mean).div(torch.sqrt(bn.running_var + bn.eps)) 58 | fusedconv.bias.copy_(b_conv + b_bn) 59 | 60 | return fusedconv 61 | -------------------------------------------------------------------------------- /utils/google_utils.py: -------------------------------------------------------------------------------- 1 | # This file contains google utils: https://cloud.google.com/storage/docs/reference/libraries 2 | # pip install --upgrade google-cloud-storage 3 | 4 | import os 5 | import time 6 | 7 | 8 | # from google.cloud import storage 9 | 10 | 11 | def gdrive_download(id='1HaXkef9z6y5l4vUnCYgdmEAj61c6bfWO', name='coco.zip'): 12 | # https://gist.github.com/tanaikech/f0f2d122e05bf5f971611258c22c110f 13 | # Downloads a file from Google Drive, accepting presented query 14 | # from utils.google_utils import *; gdrive_download() 15 | t = time.time() 16 | 17 | print('Downloading https://drive.google.com/uc?export=download&id=%s as %s... 
' % (id, name), end='') 18 | if os.path.exists(name): # remove existing 19 | os.remove(name) 20 | 21 | # Attempt small file download 22 | s = 'curl -f -L -o %s https://drive.google.com/uc?export=download&id=%s' % (name, id) 23 | os.system(s) 24 | 25 | # Attempt large file download 26 | if not os.path.exists(name): # file size > 40MB 27 | s = ["curl -c ./cookie -s -L \"https://drive.google.com/uc?export=download&id=%s\" > /dev/null" % id, 28 | "curl -Lb ./cookie \"https://drive.google.com/uc?export=download&confirm=`awk '/download/ {print $NF}' ./cookie`&id=%s\" -o %s" % ( 29 | id, name), 30 | 'rm ./cookie'] 31 | [os.system(x) for x in s] # run commands 32 | 33 | # Unzip if archive 34 | if name.endswith('.zip'): 35 | print('unzipping... ', end='') 36 | os.system('unzip -q %s' % name) # unzip 37 | os.remove(name) # remove zip to free space 38 | 39 | print('Done (%.1fs)' % (time.time() - t)) 40 | 41 | 42 | def upload_blob(bucket_name, source_file_name, destination_blob_name): 43 | # Uploads a file to a bucket 44 | # https://cloud.google.com/storage/docs/uploading-objects#storage-upload-object-python 45 | 46 | storage_client = storage.Client() 47 | bucket = storage_client.get_bucket(bucket_name) 48 | blob = bucket.blob(destination_blob_name) 49 | 50 | blob.upload_from_filename(source_file_name) 51 | 52 | print('File {} uploaded to {}.'.format( 53 | source_file_name, 54 | destination_blob_name)) 55 | 56 | 57 | def download_blob(bucket_name, source_blob_name, destination_file_name): 58 | # Uploads a blob from a bucket 59 | storage_client = storage.Client() 60 | bucket = storage_client.get_bucket(bucket_name) 61 | blob = bucket.blob(source_blob_name) 62 | 63 | blob.download_to_filename(destination_file_name) 64 | 65 | print('Blob {} downloaded to {}.'.format( 66 | source_blob_name, 67 | destination_file_name)) 68 | -------------------------------------------------------------------------------- /cfg/yolov3-tiny.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | batch=1 4 | subdivisions=1 5 | # Training 6 | # batch=64 7 | # subdivisions=2 8 | width=416 9 | height=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=16 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | [maxpool] 34 | size=2 35 | stride=2 36 | 37 | [convolutional] 38 | batch_normalize=1 39 | filters=32 40 | size=3 41 | stride=1 42 | pad=1 43 | activation=leaky 44 | 45 | [maxpool] 46 | size=2 47 | stride=2 48 | 49 | [convolutional] 50 | batch_normalize=1 51 | filters=64 52 | size=3 53 | stride=1 54 | pad=1 55 | activation=leaky 56 | 57 | [maxpool] 58 | size=2 59 | stride=2 60 | 61 | [convolutional] 62 | batch_normalize=1 63 | filters=128 64 | size=3 65 | stride=1 66 | pad=1 67 | activation=leaky 68 | 69 | [maxpool] 70 | size=2 71 | stride=2 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=256 76 | size=3 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [maxpool] 82 | size=2 83 | stride=2 84 | 85 | [convolutional] 86 | batch_normalize=1 87 | filters=512 88 | size=3 89 | stride=1 90 | pad=1 91 | activation=leaky 92 | 93 | [maxpool] 94 | size=2 95 | stride=1 96 | 97 | [convolutional] 98 | batch_normalize=1 99 | filters=1024 100 | size=3 101 | stride=1 102 | pad=1 103 | 
activation=leaky 104 | 105 | ########### 106 | 107 | [convolutional] 108 | batch_normalize=1 109 | filters=256 110 | size=1 111 | stride=1 112 | pad=1 113 | activation=leaky 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=512 118 | size=3 119 | stride=1 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | size=1 125 | stride=1 126 | pad=1 127 | filters=255 128 | activation=linear 129 | 130 | 131 | 132 | [yolo] 133 | mask = 3,4,5 134 | anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 135 | classes=80 136 | num=6 137 | jitter=.3 138 | ignore_thresh = .7 139 | truth_thresh = 1 140 | random=1 141 | 142 | [route] 143 | layers = -4 144 | 145 | [convolutional] 146 | batch_normalize=1 147 | filters=128 148 | size=1 149 | stride=1 150 | pad=1 151 | activation=leaky 152 | 153 | [upsample] 154 | stride=2 155 | 156 | [route] 157 | layers = -1, 8 158 | 159 | [convolutional] 160 | batch_normalize=1 161 | filters=256 162 | size=3 163 | stride=1 164 | pad=1 165 | activation=leaky 166 | 167 | [convolutional] 168 | size=1 169 | stride=1 170 | pad=1 171 | filters=255 172 | activation=linear 173 | 174 | [yolo] 175 | mask = 1,2,3 176 | anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 177 | classes=80 178 | num=6 179 | jitter=.3 180 | ignore_thresh = .7 181 | truth_thresh = 1 182 | random=1 183 | -------------------------------------------------------------------------------- /data/coco_64img.txt: -------------------------------------------------------------------------------- 1 | ../coco/images/train2014/COCO_train2014_000000000009.jpg 2 | ../coco/images/train2014/COCO_train2014_000000000025.jpg 3 | ../coco/images/train2014/COCO_train2014_000000000030.jpg 4 | ../coco/images/train2014/COCO_train2014_000000000034.jpg 5 | ../coco/images/train2014/COCO_train2014_000000000036.jpg 6 | ../coco/images/train2014/COCO_train2014_000000000049.jpg 7 | ../coco/images/train2014/COCO_train2014_000000000061.jpg 8 | ../coco/images/train2014/COCO_train2014_000000000064.jpg 9 | ../coco/images/train2014/COCO_train2014_000000000071.jpg 10 | ../coco/images/train2014/COCO_train2014_000000000072.jpg 11 | ../coco/images/train2014/COCO_train2014_000000000077.jpg 12 | ../coco/images/train2014/COCO_train2014_000000000078.jpg 13 | ../coco/images/train2014/COCO_train2014_000000000081.jpg 14 | ../coco/images/train2014/COCO_train2014_000000000086.jpg 15 | ../coco/images/train2014/COCO_train2014_000000000089.jpg 16 | ../coco/images/train2014/COCO_train2014_000000000092.jpg 17 | ../coco/images/train2014/COCO_train2014_000000000094.jpg 18 | ../coco/images/train2014/COCO_train2014_000000000109.jpg 19 | ../coco/images/train2014/COCO_train2014_000000000110.jpg 20 | ../coco/images/train2014/COCO_train2014_000000000113.jpg 21 | ../coco/images/train2014/COCO_train2014_000000000127.jpg 22 | ../coco/images/train2014/COCO_train2014_000000000138.jpg 23 | ../coco/images/train2014/COCO_train2014_000000000142.jpg 24 | ../coco/images/train2014/COCO_train2014_000000000144.jpg 25 | ../coco/images/train2014/COCO_train2014_000000000149.jpg 26 | ../coco/images/train2014/COCO_train2014_000000000151.jpg 27 | ../coco/images/train2014/COCO_train2014_000000000154.jpg 28 | ../coco/images/train2014/COCO_train2014_000000000165.jpg 29 | ../coco/images/train2014/COCO_train2014_000000000194.jpg 30 | ../coco/images/train2014/COCO_train2014_000000000201.jpg 31 | ../coco/images/train2014/COCO_train2014_000000000247.jpg 32 | ../coco/images/train2014/COCO_train2014_000000000260.jpg 33 | 
../coco/images/train2014/COCO_train2014_000000000263.jpg 34 | ../coco/images/train2014/COCO_train2014_000000000307.jpg 35 | ../coco/images/train2014/COCO_train2014_000000000308.jpg 36 | ../coco/images/train2014/COCO_train2014_000000000309.jpg 37 | ../coco/images/train2014/COCO_train2014_000000000312.jpg 38 | ../coco/images/train2014/COCO_train2014_000000000315.jpg 39 | ../coco/images/train2014/COCO_train2014_000000000321.jpg 40 | ../coco/images/train2014/COCO_train2014_000000000322.jpg 41 | ../coco/images/train2014/COCO_train2014_000000000326.jpg 42 | ../coco/images/train2014/COCO_train2014_000000000332.jpg 43 | ../coco/images/train2014/COCO_train2014_000000000349.jpg 44 | ../coco/images/train2014/COCO_train2014_000000000368.jpg 45 | ../coco/images/train2014/COCO_train2014_000000000370.jpg 46 | ../coco/images/train2014/COCO_train2014_000000000382.jpg 47 | ../coco/images/train2014/COCO_train2014_000000000384.jpg 48 | ../coco/images/train2014/COCO_train2014_000000000389.jpg 49 | ../coco/images/train2014/COCO_train2014_000000000394.jpg 50 | ../coco/images/train2014/COCO_train2014_000000000404.jpg 51 | ../coco/images/train2014/COCO_train2014_000000000419.jpg 52 | ../coco/images/train2014/COCO_train2014_000000000431.jpg 53 | ../coco/images/train2014/COCO_train2014_000000000436.jpg 54 | ../coco/images/train2014/COCO_train2014_000000000438.jpg 55 | ../coco/images/train2014/COCO_train2014_000000000443.jpg 56 | ../coco/images/train2014/COCO_train2014_000000000446.jpg 57 | ../coco/images/train2014/COCO_train2014_000000000450.jpg 58 | ../coco/images/train2014/COCO_train2014_000000000471.jpg 59 | ../coco/images/train2014/COCO_train2014_000000000490.jpg 60 | ../coco/images/train2014/COCO_train2014_000000000491.jpg 61 | ../coco/images/train2014/COCO_train2014_000000000510.jpg 62 | ../coco/images/train2014/COCO_train2014_000000000514.jpg 63 | ../coco/images/train2014/COCO_train2014_000000000529.jpg 64 | ../coco/images/train2014/COCO_train2014_000000000531.jpg 65 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Repo-specific GitIgnore ---------------------------------------------------------------------------------------------- 2 | *.jpg 3 | *.png 4 | *.bmp 5 | *.tif 6 | *.heic 7 | *.JPG 8 | *.PNG 9 | *.TIF 10 | *.HEIC 11 | *.mp4 12 | *.mov 13 | *.MOV 14 | *.avi 15 | *.data 16 | *.json 17 | 18 | *.cfg 19 | !cfg/yolov3*.cfg 20 | 21 | runs/* 22 | data/* 23 | !data/samples/zidane.jpg 24 | !data/samples/bus.jpg 25 | !data/coco.names 26 | !data/coco_paper.names 27 | !data/coco.data 28 | !data/coco_*.data 29 | !data/coco_*.txt 30 | !data/coco_*.txt 31 | !data/trainvalno5k.shapes 32 | !data/5k.shapes 33 | !data/5k.txt 34 | !data/*.sh 35 | 36 | pycocotools/* 37 | results*.txt 38 | gcp_test*.sh 39 | 40 | # MATLAB GitIgnore ----------------------------------------------------------------------------------------------------- 41 | *.m~ 42 | *.mat 43 | !targets*.mat 44 | 45 | # Neural Network weights ----------------------------------------------------------------------------------------------- 46 | *.weights 47 | *.pt 48 | *.onnx 49 | *.mlmodel 50 | darknet53.conv.74 51 | yolov3-tiny.conv.15 52 | 53 | # GitHub Python GitIgnore ---------------------------------------------------------------------------------------------- 54 | # Byte-compiled / optimized / DLL files 55 | __pycache__/ 56 | *.py[cod] 57 | *$py.class 58 | 59 | # C extensions 60 | *.so 61 | 62 | # Distribution / packaging 63 
| .Python 64 | env/ 65 | build/ 66 | develop-eggs/ 67 | dist/ 68 | downloads/ 69 | eggs/ 70 | .eggs/ 71 | lib/ 72 | lib64/ 73 | parts/ 74 | sdist/ 75 | var/ 76 | wheels/ 77 | *.egg-info/ 78 | .installed.cfg 79 | *.egg 80 | 81 | # PyInstaller 82 | # Usually these files are written by a python script from a template 83 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 84 | *.manifest 85 | *.spec 86 | 87 | # Installer logs 88 | pip-log.txt 89 | pip-delete-this-directory.txt 90 | 91 | # Unit test / coverage reports 92 | htmlcov/ 93 | .tox/ 94 | .coverage 95 | .coverage.* 96 | .cache 97 | nosetests.xml 98 | coverage.xml 99 | *.cover 100 | .hypothesis/ 101 | 102 | # Translations 103 | *.mo 104 | *.pot 105 | 106 | # Django stuff: 107 | *.log 108 | local_settings.py 109 | 110 | # Flask stuff: 111 | instance/ 112 | .webassets-cache 113 | 114 | # Scrapy stuff: 115 | .scrapy 116 | 117 | # Sphinx documentation 118 | docs/_build/ 119 | 120 | # PyBuilder 121 | target/ 122 | 123 | # Jupyter Notebook 124 | .ipynb_checkpoints 125 | 126 | # pyenv 127 | .python-version 128 | 129 | # celery beat schedule file 130 | celerybeat-schedule 131 | 132 | # SageMath parsed files 133 | *.sage.py 134 | 135 | # dotenv 136 | .env 137 | 138 | # virtualenv 139 | .venv 140 | venv/ 141 | ENV/ 142 | 143 | # Spyder project settings 144 | .spyderproject 145 | .spyproject 146 | 147 | # Rope project settings 148 | .ropeproject 149 | 150 | # mkdocs documentation 151 | /site 152 | 153 | # mypy 154 | .mypy_cache/ 155 | 156 | 157 | # https://github.com/github/gitignore/blob/master/Global/macOS.gitignore ----------------------------------------------- 158 | 159 | # General 160 | .DS_Store 161 | .AppleDouble 162 | .LSOverride 163 | 164 | # Icon must end with two \r 165 | Icon 166 | Icon? 
167 | 168 | # Thumbnails 169 | ._* 170 | 171 | # Files that might appear in the root of a volume 172 | .DocumentRevisions-V100 173 | .fseventsd 174 | .Spotlight-V100 175 | .TemporaryItems 176 | .Trashes 177 | .VolumeIcon.icns 178 | .com.apple.timemachine.donotpresent 179 | 180 | # Directories potentially created on remote AFP share 181 | .AppleDB 182 | .AppleDesktop 183 | Network Trash Folder 184 | Temporary Items 185 | .apdisk 186 | 187 | 188 | # https://github.com/github/gitignore/blob/master/Global/JetBrains.gitignore 189 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm 190 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 191 | 192 | # User-specific stuff: 193 | .idea/* 194 | .idea/**/workspace.xml 195 | .idea/**/tasks.xml 196 | .idea/dictionaries 197 | .html # Bokeh Plots 198 | .pg # TensorFlow Frozen Graphs 199 | .avi # videos 200 | 201 | # Sensitive or high-churn files: 202 | .idea/**/dataSources/ 203 | .idea/**/dataSources.ids 204 | .idea/**/dataSources.local.xml 205 | .idea/**/sqlDataSources.xml 206 | .idea/**/dynamic.xml 207 | .idea/**/uiDesigner.xml 208 | 209 | # Gradle: 210 | .idea/**/gradle.xml 211 | .idea/**/libraries 212 | 213 | # CMake 214 | cmake-build-debug/ 215 | cmake-build-release/ 216 | 217 | # Mongo Explorer plugin: 218 | .idea/**/mongoSettings.xml 219 | 220 | ## File-based project format: 221 | *.iws 222 | 223 | ## Plugin-specific files: 224 | 225 | # IntelliJ 226 | out/ 227 | 228 | # mpeltonen/sbt-idea plugin 229 | .idea_modules/ 230 | 231 | # JIRA plugin 232 | atlassian-ide-plugin.xml 233 | 234 | # Cursive Clojure plugin 235 | .idea/replstate.xml 236 | 237 | # Crashlytics plugin (for Android Studio and IntelliJ) 238 | com_crashlytics_export_strings.xml 239 | crashlytics.properties 240 | crashlytics-build.properties 241 | fabric.properties 242 | -------------------------------------------------------------------------------- /detect.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import time 3 | from sys import platform 4 | 5 | from models import * 6 | from utils.datasets import * 7 | from utils.utils import * 8 | 9 | 10 | def detect(cfg, 11 | data, 12 | weights, 13 | images='data/samples', # input folder 14 | output='output', # output folder 15 | fourcc='mp4v', # video codec 16 | img_size=416, 17 | conf_thres=0.5, 18 | nms_thres=0.5, 19 | save_txt=False, 20 | save_images=True): 21 | # Initialize 22 | device = torch_utils.select_device(force_cpu=ONNX_EXPORT) 23 | torch.backends.cudnn.benchmark = False # set False for reproducible results 24 | if os.path.exists(output): 25 | shutil.rmtree(output) # delete output folder 26 | os.makedirs(output) # make new output folder 27 | 28 | # Initialize model 29 | if ONNX_EXPORT: 30 | s = (320, 192) # (320, 192) or (416, 256) or (608, 352) onnx model image size (height, width) 31 | model = Darknet(cfg, s) 32 | else: 33 | model = Darknet(cfg, img_size) 34 | 35 | # Load weights 36 | if weights.endswith('.pt'): # pytorch format 37 | model.load_state_dict(torch.load(weights, map_location=device)['model']) 38 | else: # darknet format 39 | _ = load_darknet_weights(model, weights) 40 | 41 | # Fuse Conv2d + BatchNorm2d layers 42 | # model.fuse() 43 | 44 | # Eval mode 45 | model.to(device).eval() 46 | 47 | # Export mode 48 | if ONNX_EXPORT: 49 | img = torch.zeros((1, 3, s[0], s[1])) 50 | torch.onnx.export(model, img, 'weights/export.onnx', verbose=True) 51 | return 52 | 53 | # Half 
precision 54 | opt.half = opt.half and device.type != 'cpu' # half precision only supported on CUDA 55 | if opt.half: 56 | model.half() 57 | 58 | # Set Dataloader 59 | vid_path, vid_writer = None, None 60 | if opt.webcam: 61 | save_images = False 62 | dataloader = LoadWebcam(img_size=img_size, half=opt.half) 63 | else: 64 | dataloader = LoadImages(images, img_size=img_size, half=opt.half) 65 | 66 | # Get classes and colors 67 | classes = load_classes(parse_data_cfg(data)['names']) 68 | colors = [[random.randint(0, 255) for _ in range(3)] for _ in range(len(classes))] 69 | 70 | # Run inference 71 | t0 = time.time() 72 | for i, (path, img, im0, vid_cap) in enumerate(dataloader): 73 | t = time.time() 74 | save_path = str(Path(output) / Path(path).name) 75 | 76 | # Get detections 77 | img = torch.from_numpy(img).unsqueeze(0).to(device) 78 | pred, _ = model(img) 79 | det = non_max_suppression(pred.float(), conf_thres, nms_thres)[0] 80 | 81 | if det is not None and len(det) > 0: 82 | # Rescale boxes from 416 to true image size 83 | det[:, :4] = scale_coords(img.shape[2:], det[:, :4], im0.shape).round() 84 | 85 | # Print results to screen 86 | print('%gx%g ' % img.shape[2:], end='') # print image size 87 | for c in det[:, -1].unique(): 88 | n = (det[:, -1] == c).sum() 89 | print('%g %ss' % (n, classes[int(c)]), end=', ') 90 | 91 | # Draw bounding boxes and labels of detections 92 | for *xyxy, conf, cls_conf, cls in det: 93 | if save_txt: # Write to file 94 | with open(save_path + '.txt', 'a') as file: 95 | file.write(('%g ' * 6 + '\n') % (*xyxy, cls, conf)) 96 | 97 | # Add bbox to the image 98 | label = '%s %.2f' % (classes[int(cls)], conf) 99 | plot_one_box(xyxy, im0, label=label, color=colors[int(cls)]) 100 | 101 | print('Done. (%.3fs)' % (time.time() - t)) 102 | 103 | if opt.webcam: # Show live webcam 104 | cv2.imshow(weights, im0) 105 | 106 | if save_images: # Save image with detections 107 | if dataloader.mode == 'images': 108 | cv2.imwrite(save_path, im0) 109 | else: 110 | if vid_path != save_path: # new video 111 | vid_path = save_path 112 | if isinstance(vid_writer, cv2.VideoWriter): 113 | vid_writer.release() # release previous video writer 114 | 115 | fps = vid_cap.get(cv2.CAP_PROP_FPS) 116 | width = int(vid_cap.get(cv2.CAP_PROP_FRAME_WIDTH)) 117 | height = int(vid_cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) 118 | vid_writer = cv2.VideoWriter(save_path, cv2.VideoWriter_fourcc(*fourcc), fps, (width, height)) 119 | vid_writer.write(im0) 120 | 121 | if save_images: 122 | print('Results saved to %s' % os.getcwd() + os.sep + output) 123 | if platform == 'darwin': # macos 124 | os.system('open ' + output + ' ' + save_path) 125 | 126 | print('Done. 
(%.3fs)' % (time.time() - t0)) 127 | 128 | 129 | if __name__ == '__main__': 130 | parser = argparse.ArgumentParser() 131 | parser.add_argument('--cfg', type=str, default='cfg/yolov3-spp.cfg', help='cfg file path') 132 | parser.add_argument('--data', type=str, default='data/coco.data', help='coco.data file path') 133 | parser.add_argument('--weights', type=str, default='weights/yolov3-spp.weights', help='path to weights file') 134 | parser.add_argument('--images', type=str, default='data/samples', help='path to images') 135 | parser.add_argument('--img-size', type=int, default=416, help='inference size (pixels)') 136 | parser.add_argument('--conf-thres', type=float, default=0.3, help='object confidence threshold') 137 | parser.add_argument('--nms-thres', type=float, default=0.5, help='iou threshold for non-maximum suppression') 138 | parser.add_argument('--fourcc', type=str, default='mp4v', help='fourcc output video codec (verify ffmpeg support)') 139 | parser.add_argument('--output', type=str, default='output', help='specifies the output path for images and videos') 140 | parser.add_argument('--half', action='store_true', help='half precision FP16 inference') 141 | parser.add_argument('--webcam', action='store_true', help='use webcam') 142 | opt = parser.parse_args() 143 | print(opt) 144 | 145 | with torch.no_grad(): 146 | detect(opt.cfg, 147 | opt.data, 148 | opt.weights, 149 | images=opt.images, 150 | img_size=opt.img_size, 151 | conf_thres=opt.conf_thres, 152 | nms_thres=opt.nms_thres, 153 | fourcc=opt.fourcc, 154 | output=opt.output) 155 | -------------------------------------------------------------------------------- /utils/gcp.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # New VM 4 | rm -rf sample_data yolov3 darknet apex coco cocoapi knife knifec 5 | git clone https://github.com/ultralytics/yolov3 6 | git clone https://github.com/AlexeyAB/darknet && cd darknet && make GPU=1 CUDNN=1 CUDNN_HALF=1 OPENCV=1 && wget -c https://pjreddie.com/media/files/darknet53.conv.74 && cd .. 7 | git clone https://github.com/NVIDIA/apex && cd apex && pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" . --user && cd .. && rm -rf apex 8 | #git clone https://github.com/cocodataset/cocoapi && cd cocoapi/PythonAPI && make && cd ../.. 
&& cp -r cocoapi/PythonAPI/pycocotools yolov3 9 | sudo conda install -y -c conda-forge scikit-image tensorboard pycocotools 10 | python3 -c " 11 | from yolov3.utils.google_utils import gdrive_download 12 | gdrive_download('1HaXkef9z6y5l4vUnCYgdmEAj61c6bfWO','coco.zip') 13 | gdrive_download('1GrFcTIIsKzOafZltUOS75RSahPrj2KyT','knife.zip') 14 | gdrive_download('19sLJEGHlIAIFHcEftq4aLCw_tkWZmhD1','knifec.zip')" 15 | sudo shutdown 16 | 17 | # Re-clone 18 | rm -rf yolov3 # Warning: remove existing 19 | git clone https://github.com/ultralytics/yolov3 && cd yolov3 # master 20 | # git clone -b test --depth 1 https://github.com/ultralytics/yolov3 test # branch 21 | 22 | # Train 23 | python3 train.py 24 | 25 | # Resume 26 | python3 train.py --resume 27 | 28 | # Detect 29 | python3 detect.py 30 | 31 | # Test 32 | python3 test.py --save-json 33 | 34 | # Evolve 35 | for i in {0..500} 36 | do 37 | python3 train.py --data data/coco.data --img-size 320 --epochs 1 --batch-size 64 --accumulate 1 --evolve --bucket yolov4 38 | done 39 | 40 | # Git pull 41 | git pull https://github.com/ultralytics/yolov3 # master 42 | git pull https://github.com/ultralytics/yolov3 test # branch 43 | 44 | # Test Darknet training 45 | python3 test.py --weights ../darknet/backup/yolov3.backup 46 | 47 | # Copy last.pt TO bucket 48 | gsutil cp yolov3/weights/last1gpu.pt gs://ultralytics 49 | 50 | # Copy last.pt FROM bucket 51 | gsutil cp gs://ultralytics/last.pt yolov3/weights/last.pt 52 | wget https://storage.googleapis.com/ultralytics/yolov3/last_v1_0.pt -O weights/last_v1_0.pt 53 | wget https://storage.googleapis.com/ultralytics/yolov3/best_v1_0.pt -O weights/best_v1_0.pt 54 | 55 | # Reproduce tutorials 56 | rm results*.txt # WARNING: removes existing results 57 | python3 train.py --nosave --data data/coco_1img.data && mv results.txt results0r_1img.txt 58 | python3 train.py --nosave --data data/coco_10img.data && mv results.txt results0r_10img.txt 59 | python3 train.py --nosave --data data/coco_100img.data && mv results.txt results0r_100img.txt 60 | # python3 train.py --nosave --data data/coco_100img.data --transfer && mv results.txt results3_100imgTL.txt 61 | python3 -c "from utils import utils; utils.plot_results()" 62 | # gsutil cp results*.txt gs://ultralytics 63 | gsutil cp results.png gs://ultralytics 64 | sudo shutdown 65 | 66 | # Reproduce mAP 67 | python3 test.py --save-json --img-size 608 68 | python3 test.py --save-json --img-size 416 69 | python3 test.py --save-json --img-size 320 70 | sudo shutdown 71 | 72 | # Benchmark script 73 | git clone https://github.com/ultralytics/yolov3 # clone our repo 74 | git clone https://github.com/NVIDIA/apex && cd apex && pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" . --user && cd .. && rm -rf apex # install nvidia apex 75 | python3 -c "from yolov3.utils.google_utils import gdrive_download; gdrive_download('1HaXkef9z6y5l4vUnCYgdmEAj61c6bfWO','coco.zip')" # download coco dataset (20GB) 76 | cd yolov3 && clear && python3 train.py --epochs 1 # run benchmark (~30 min) 77 | 78 | # Unit tests 79 | python3 detect.py # detect 2 persons, 1 tie 80 | python3 test.py --data data/coco_32img.data # test mAP = 0.8 81 | python3 train.py --data data/coco_32img.data --epochs 5 --nosave # train 5 epochs 82 | python3 train.py --data data/coco_1cls.data --epochs 5 --nosave # train 5 epochs 83 | python3 train.py --data data/coco_1img.data --epochs 5 --nosave # train 5 epochs 84 | 85 | # AlexyAB Darknet 86 | gsutil cp -r gs://sm6/supermarket2 . 
# dataset from bucket 87 | rm -rf darknet && git clone https://github.com/AlexeyAB/darknet && cd darknet && wget -c https://pjreddie.com/media/files/darknet53.conv.74 # sudo apt install libopencv-dev && make 88 | ./darknet detector calc_anchors data/coco_img64.data -num_of_clusters 9 -width 320 -height 320 # kmeans anchor calculation 89 | ./darknet detector train ../supermarket2/supermarket2.data ../yolo_v3_spp_pan_scale.cfg darknet53.conv.74 -map -dont_show # train spp 90 | ./darknet detector train ../yolov3/data/coco.data ../yolov3-spp.cfg darknet53.conv.74 -map -dont_show # train spp coco 91 | 92 | ./darknet detector train data/coco.data ../yolov3-spp.cfg darknet53.conv.74 -map -dont_show # train spp 93 | gsutil cp -r backup/*5000.weights gs://sm6/weights 94 | sudo shutdown 95 | 96 | 97 | ./darknet detector train ../supermarket2/supermarket2.data ../yolov3-tiny-sm2-1cls.cfg yolov3-tiny.conv.15 -map -dont_show # train tiny 98 | ./darknet detector train ../supermarket2/supermarket2.data cfg/yolov3-spp-sm2-1cls.cfg backup/yolov3-spp-sm2-1cls_last.weights # resume 99 | python3 train.py --data ../supermarket2/supermarket2.data --cfg ../yolov3-spp-sm2-1cls.cfg --epochs 100 --num-workers 8 --img-size 320 --nosave # train ultralytics 100 | python3 test.py --data ../supermarket2/supermarket2.data --weights ../darknet/backup/yolov3-spp-sm2-1cls_5000.weights --cfg cfg/yolov3-spp-sm2-1cls.cfg # test 101 | gsutil cp -r backup/*.weights gs://sm6/weights # weights to bucket 102 | 103 | python3 test.py --data ../supermarket2/supermarket2.data --weights weights/yolov3-spp-sm2-1cls_5000.weights --cfg ../yolov3-spp-sm2-1cls.cfg --img-size 320 --conf-thres 0.2 # test 104 | python3 test.py --data ../supermarket2/supermarket2.data --weights weights/yolov3-spp-sm2-1cls-scalexy_125_5000.weights --cfg ../yolov3-spp-sm2-1cls-scalexy_125.cfg --img-size 320 --conf-thres 0.2 # test 105 | python3 test.py --data ../supermarket2/supermarket2.data --weights weights/yolov3-spp-sm2-1cls-scalexy_150_5000.weights --cfg ../yolov3-spp-sm2-1cls-scalexy_150.cfg --img-size 320 --conf-thres 0.2 # test 106 | python3 test.py --data ../supermarket2/supermarket2.data --weights weights/yolov3-spp-sm2-1cls-scalexy_200_5000.weights --cfg ../yolov3-spp-sm2-1cls-scalexy_200.cfg --img-size 320 --conf-thres 0.2 # test 107 | python3 test.py --data ../supermarket2/supermarket2.data --weights ../darknet/backup/yolov3-spp-sm2-1cls-scalexy_variable_5000.weights --cfg ../yolov3-spp-sm2-1cls-scalexy_variable.cfg --img-size 320 --conf-thres 0.2 # test 108 | 109 | python3 train.py --img-size 320 --epochs 27 --batch-size 64 --accumulate 1 --nosave --notest && python3 test.py --weights weights/last.pt --img-size 320 --save-json && sudo shutdown 110 | 111 | # Debug/Development 112 | python3 train.py --data data/coco.data --img-size 320 --single-scale --batch-size 64 --accumulate 1 --epochs 1 --evolve --giou 113 | python3 test.py --weights weights/last.pt --cfg cfg/yolov3-spp.cfg --img-size 320 114 | 115 | gsutil cp evolve.txt gs://ultralytics 116 | sudo shutdown 117 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | from torch.utils.data import DataLoader 5 | 6 | from models import * 7 | from utils.datasets import * 8 | from utils.utils import * 9 | 10 | 11 | def test(cfg, 12 | data, 13 | weights=None, 14 | batch_size=16, 15 | img_size=416, 16 | iou_thres=0.5, 17 | conf_thres=0.001, 18 | 
nms_thres=0.5, 19 | save_json=False, 20 | model=None): 21 | # Initialize/load model and set device 22 | if model is None: 23 | device = torch_utils.select_device() 24 | verbose = True 25 | 26 | # Initialize model 27 | model = Darknet(cfg, img_size).to(device) 28 | 29 | # Load weights 30 | if weights.endswith('.pt'): # pytorch format 31 | model.load_state_dict(torch.load(weights, map_location=device)['model']) 32 | else: # darknet format 33 | _ = load_darknet_weights(model, weights) 34 | 35 | if torch.cuda.device_count() > 1: 36 | model = nn.DataParallel(model) 37 | else: 38 | device = next(model.parameters()).device # get model device 39 | verbose = False 40 | 41 | # Configure run 42 | data = parse_data_cfg(data) 43 | nc = int(data['classes']) # number of classes 44 | test_path = data['valid'] # path to test images 45 | names = load_classes(data['names']) # class names 46 | 47 | # Dataloader 48 | dataset = LoadImagesAndLabels(test_path, img_size, batch_size) 49 | dataloader = DataLoader(dataset, 50 | batch_size=batch_size, 51 | num_workers=min(os.cpu_count(), batch_size), 52 | pin_memory=True, 53 | collate_fn=dataset.collate_fn) 54 | 55 | seen = 0 56 | model.eval() 57 | coco91class = coco80_to_coco91_class() 58 | s = ('%20s' + '%10s' * 6) % ('Class', 'Images', 'Targets', 'P', 'R', 'mAP', 'F1') 59 | p, r, f1, mp, mr, map, mf1 = 0., 0., 0., 0., 0., 0., 0. 60 | loss = torch.zeros(3) 61 | jdict, stats, ap, ap_class = [], [], [], [] 62 | for batch_i, (imgs, targets, paths, shapes) in enumerate(tqdm(dataloader, desc=s)): 63 | targets = targets.to(device) 64 | imgs = imgs.to(device) 65 | _, _, height, width = imgs.shape # batch size, channels, height, width 66 | 67 | # Plot images with bounding boxes 68 | if batch_i == 0 and not os.path.exists('test_batch0.jpg'): 69 | plot_images(imgs=imgs, targets=targets, paths=paths, fname='test_batch0.jpg') 70 | 71 | # Run model 72 | inf_out, train_out = model(imgs) # inference and training outputs 73 | 74 | # Compute loss 75 | if hasattr(model, 'hyp'): # if model has loss hyperparameters 76 | loss += compute_loss(train_out, targets, model)[1][:3].cpu() # GIoU, obj, cls 77 | 78 | # Run NMS 79 | output = non_max_suppression(inf_out, conf_thres=conf_thres, nms_thres=nms_thres) 80 | 81 | # Statistics per image 82 | for si, pred in enumerate(output): 83 | labels = targets[targets[:, 0] == si, 1:] 84 | nl = len(labels) 85 | tcls = labels[:, 0].tolist() if nl else [] # target class 86 | seen += 1 87 | 88 | if pred is None: 89 | if nl: 90 | stats.append(([], torch.Tensor(), torch.Tensor(), tcls)) 91 | continue 92 | 93 | # Append to text file 94 | # with open('test.txt', 'a') as file: 95 | # [file.write('%11.5g' * 7 % tuple(x) + '\n') for x in pred] 96 | 97 | # Append to pycocotools JSON dictionary 98 | if save_json: 99 | # [{"image_id": 42, "category_id": 18, "bbox": [258.15, 41.29, 348.26, 243.78], "score": 0.236}, ... 
100 | image_id = int(Path(paths[si]).stem.split('_')[-1]) 101 | box = pred[:, :4].clone() # xyxy 102 | scale_coords(imgs[si].shape[1:], box, shapes[si]) # to original shape 103 | box = xyxy2xywh(box) # xywh 104 | box[:, :2] -= box[:, 2:] / 2 # xy center to top-left corner 105 | for di, d in enumerate(pred): 106 | jdict.append({'image_id': image_id, 107 | 'category_id': coco91class[int(d[6])], 108 | 'bbox': [floatn(x, 3) for x in box[di]], 109 | 'score': floatn(d[4], 5)}) 110 | 111 | # Clip boxes to image bounds 112 | clip_coords(pred, (height, width)) 113 | 114 | # Assign all predictions as incorrect 115 | correct = [0] * len(pred) 116 | if nl: 117 | detected = [] 118 | tcls_tensor = labels[:, 0] 119 | 120 | # target boxes 121 | tbox = xywh2xyxy(labels[:, 1:5]) 122 | tbox[:, [0, 2]] *= width 123 | tbox[:, [1, 3]] *= height 124 | 125 | # Search for correct predictions 126 | for i, (*pbox, pconf, pcls_conf, pcls) in enumerate(pred): 127 | 128 | # Break if all targets already located in image 129 | if len(detected) == nl: 130 | break 131 | 132 | # Continue if predicted class not among image classes 133 | if pcls.item() not in tcls: 134 | continue 135 | 136 | # Best iou, index between pred and targets 137 | m = (pcls == tcls_tensor).nonzero().view(-1) 138 | iou, bi = bbox_iou(pbox, tbox[m]).max(0) 139 | 140 | # If iou > threshold and class is correct mark as correct 141 | if iou > iou_thres and m[bi] not in detected: # and pcls == tcls[bi]: 142 | correct[i] = 1 143 | detected.append(m[bi]) 144 | 145 | # Append statistics (correct, conf, pcls, tcls) 146 | stats.append((correct, pred[:, 4].cpu(), pred[:, 6].cpu(), tcls)) 147 | 148 | # Compute statistics 149 | stats = [np.concatenate(x, 0) for x in list(zip(*stats))] # to numpy 150 | if len(stats): 151 | p, r, ap, f1, ap_class = ap_per_class(*stats) 152 | mp, mr, map, mf1 = p.mean(), r.mean(), ap.mean(), f1.mean() 153 | nt = np.bincount(stats[3].astype(np.int64), minlength=nc) # number of targets per class 154 | else: 155 | nt = torch.zeros(1) 156 | 157 | # Print results 158 | pf = '%20s' + '%10.3g' * 6 # print format 159 | print(pf % ('all', seen, nt.sum(), mp, mr, map, mf1)) 160 | 161 | # Print results per class 162 | if verbose and nc > 1 and len(stats): 163 | for i, c in enumerate(ap_class): 164 | print(pf % (names[c], seen, nt[c], p[i], r[i], ap[i], f1[i])) 165 | 166 | # Save JSON 167 | if save_json and map and len(jdict): 168 | imgIds = [int(Path(x).stem.split('_')[-1]) for x in dataset.img_files] 169 | with open('results.json', 'w') as file: 170 | json.dump(jdict, file) 171 | 172 | from pycocotools.coco import COCO 173 | from pycocotools.cocoeval import COCOeval 174 | 175 | # https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocoEvalDemo.ipynb 176 | cocoGt = COCO('../coco/annotations/instances_val2014.json') # initialize COCO ground truth api 177 | cocoDt = cocoGt.loadRes('results.json') # initialize COCO pred api 178 | 179 | cocoEval = COCOeval(cocoGt, cocoDt, 'bbox') 180 | cocoEval.params.imgIds = imgIds # [:32] # only evaluate these images 181 | cocoEval.evaluate() 182 | cocoEval.accumulate() 183 | cocoEval.summarize() 184 | map = cocoEval.stats[1] # update mAP to pycocotools mAP 185 | 186 | # Return results 187 | maps = np.zeros(nc) + map 188 | for i, c in enumerate(ap_class): 189 | maps[c] = ap[i] 190 | return (mp, mr, map, mf1, *(loss / len(dataloader)).tolist()), maps 191 | 192 | 193 | if __name__ == '__main__': 194 | parser = argparse.ArgumentParser(prog='test.py') 195 | parser.add_argument('--batch-size', type=int, 
default=16, help='size of each image batch') 196 | parser.add_argument('--cfg', type=str, default='cfg/yolov3-spp.cfg', help='cfg file path') 197 | parser.add_argument('--data', type=str, default='data/coco.data', help='coco.data file path') 198 | parser.add_argument('--weights', type=str, default='weights/yolov3-spp.weights', help='path to weights file') 199 | parser.add_argument('--iou-thres', type=float, default=0.5, help='iou threshold required to qualify as detected') 200 | parser.add_argument('--conf-thres', type=float, default=0.001, help='object confidence threshold') 201 | parser.add_argument('--nms-thres', type=float, default=0.5, help='iou threshold for non-maximum suppression') 202 | parser.add_argument('--save-json', action='store_true', help='save a cocoapi-compatible JSON results file') 203 | parser.add_argument('--img-size', type=int, default=416, help='inference size (pixels)') 204 | opt = parser.parse_args() 205 | print(opt) 206 | 207 | with torch.no_grad(): 208 | test(opt.cfg, 209 | opt.data, 210 | opt.weights, 211 | opt.batch_size, 212 | opt.img_size, 213 | opt.iou_thres, 214 | opt.conf_thres, 215 | opt.nms_thres, 216 | opt.save_json) 217 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | I have combined SLIMYOLO into this repo. 2 | 3 | You can use train.py to train with sparsity regularization via the argument "--sparsity 0.0001", and then use prune.py to prune the resulting model.
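A minimal sketch of that two-step workflow is shown below. It passes only the `--sparsity` argument mentioned above and assumes the default options of both scripts; anything beyond that is an assumption, so check each script's argparse options before relying on it.

```bash
# 1) Sparsity training: the --sparsity value scales the sparsity penalty
#    applied during training so that unimportant channels can be identified.
python3 train.py --sparsity 0.0001

# 2) Channel pruning on the sparsity-trained model
#    (see prune.py for its available options).
python3 prune.py
```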
25 | 26 | # Introduction 27 | 28 | This directory contains PyTorch YOLOv3 software developed by Ultralytics LLC, and **is freely available for redistribution under the GPL-3.0 license**. For more information please visit https://www.ultralytics.com. 29 | 30 | # Description 31 | 32 | The https://github.com/ultralytics/yolov3 repo contains inference and training code for YOLOv3 in PyTorch. The code works on Linux, MacOS and Windows. Training is done on the COCO dataset by default: https://cocodataset.org/#home. **Credit to Joseph Redmon for YOLO:** https://pjreddie.com/darknet/yolo/. 33 | 34 | # Requirements 35 | 36 | Python 3.7 or later with the following `pip3 install -U -r requirements.txt` packages: 37 | 38 | - `numpy` 39 | - `torch >= 1.1.0` 40 | - `opencv-python` 41 | - `tqdm` 42 | 43 | # Tutorials 44 | 45 | * [GCP Quickstart](https://github.com/ultralytics/yolov3/wiki/GCP-Quickstart) 46 | * [Transfer Learning](https://github.com/ultralytics/yolov3/wiki/Example:-Transfer-Learning) 47 | * [Train Single Image](https://github.com/ultralytics/yolov3/wiki/Example:-Train-Single-Image) 48 | * [Train Single Class](https://github.com/ultralytics/yolov3/wiki/Example:-Train-Single-Class) 49 | * [Train Custom Data](https://github.com/ultralytics/yolov3/wiki/Train-Custom-Data) 50 | 51 | # Jupyter Notebook 52 | 53 | Our Jupyter [notebook](https://colab.research.google.com/github/ultralytics/yolov3/blob/master/examples.ipynb) provides quick training, inference and testing examples. 54 | 55 | # Training 56 | 57 | **Start Training:** `python3 train.py` to begin training after downloading COCO data with `data/get_coco_dataset.sh`. Each epoch trains on 117,263 images from the train and validate COCO sets, and tests on 5000 images from the COCO validate set. 58 | 59 | **Resume Training:** `python3 train.py --resume` to resume training from `weights/last.pt`. 60 | 61 | **Plot Training:** `from utils import utils; utils.plot_results()` plots training results from `coco_16img.data`, `coco_64img.data`, 2 example datasets available in the `data/` folder, which train and test on the first 16 and 64 images of the COCO2014-trainval dataset. 62 | ![image](https://user-images.githubusercontent.com/26833433/63258271-fe9d5300-c27b-11e9-9a15-95038daf4438.png) 63 | 64 | ## Image Augmentation 65 | 66 | `datasets.py` applies random OpenCV-powered (https://opencv.org/) augmentation to the input images in accordance with the following specifications. Augmentation is applied **only** during training, not during inference. Bounding boxes are automatically tracked and updated with the images. 416 x 416 examples pictured below. 
67 | 68 | Augmentation | Description 69 | --- | --- 70 | Translation | +/- 10% (vertical and horizontal) 71 | Rotation | +/- 5 degrees 72 | Shear | +/- 2 degrees (vertical and horizontal) 73 | Scale | +/- 10% 74 | Reflection | 50% probability (horizontal-only) 75 | H**S**V Saturation | +/- 50% 76 | HS**V** Intensity | +/- 50% 77 | 78 | 79 | 80 | ## Speed 81 | 82 | https://cloud.google.com/deep-learning-vm/ 83 | **Machine type:** n1-standard-8 (8 vCPUs, 30 GB memory) 84 | **CPU platform:** Intel Skylake 85 | **GPUs:** K80 ($0.20/hr), T4 ($0.35/hr), V100 ($0.83/hr) CUDA with [Nvidia Apex](https://github.com/NVIDIA/apex) FP16/32 86 | **HDD:** 100 GB SSD 87 | **Dataset:** COCO train 2014 (117,263 images) 88 | 89 | GPUs | `batch_size` | images/sec | epoch time | epoch cost 90 | --- |---| --- | --- | --- 91 | K80 | 64 (32x2) | 11 | 175 min | $0.58 92 | T4 | 64 (32x2) | 40 | 49 min | $0.29 93 | T4 x2 | 64 (64x1) | 61 | 32 min | $0.36 94 | V100 | 64 (32x2) | 115 | 17 min | $0.24 95 | V100 x2 | 64 (64x1) | 150 | 13 min | $0.36 96 | 2080Ti | 64 (32x2) | 69 | 28 min | - 97 | 98 | 99 | # Inference 100 | 101 | `detect.py` runs inference on all images **and videos** in the `data/samples` folder: 102 | 103 | **YOLOv3:** `python3 detect.py --cfg cfg/yolov3.cfg --weights weights/yolov3.weights` 104 | 105 | 106 | **YOLOv3-tiny:** `python3 detect.py --cfg cfg/yolov3-tiny.cfg --weights weights/yolov3-tiny.weights` 107 | 108 | 109 | **YOLOv3-SPP:** `python3 detect.py --cfg cfg/yolov3-spp.cfg --weights weights/yolov3-spp.weights` 110 | 111 | 112 | ## Webcam 113 | 114 | `python3 detect.py --webcam` shows a live webcam feed. 115 | 116 | # Pretrained Weights 117 | 118 | - Darknet `*.weights` format: https://pjreddie.com/media/files/yolov3.weights 119 | - PyTorch `*.pt` format: https://drive.google.com/drive/folders/1uxgUBemJVw9wZsdpboYbzUN4bcRhsuAI 120 | 121 | ## Darknet Conversion 122 | 123 | ```bash 124 | git clone https://github.com/ultralytics/yolov3 && cd yolov3 125 | 126 | # convert darknet cfg/weights to pytorch model 127 | python3 -c "from models import *; convert('cfg/yolov3-spp.cfg', 'weights/yolov3-spp.weights')" 128 | Success: converted 'weights/yolov3-spp.weights' to 'converted.pt' 129 | 130 | # convert cfg/pytorch model to darknet weights 131 | python3 -c "from models import *; convert('cfg/yolov3-spp.cfg', 'weights/yolov3-spp.pt')" 132 | Success: converted 'weights/yolov3-spp.pt' to 'converted.weights' 133 | ``` 134 | 135 | # mAP 136 | 137 | - `test.py --weights weights/yolov3.weights` tests official YOLOv3 weights. 138 | - `test.py --weights weights/last.pt` tests most recent checkpoint. 139 | - `test.py --weights weights/best.pt` tests best checkpoint. 140 | - Compare to darknet published results https://arxiv.org/abs/1804.02767. 141 | 142 | [ultralytics/yolov3](https://github.com/ultralytics/yolov3) mAP@0.5 ([darknet](https://arxiv.org/abs/1804.02767)-reported mAP@0.5) 143 | 144 | | 320 | 416 | 608 145 | --- | --- | --- | --- 146 | `YOLOv3` | 51.8 (51.5) | 55.4 (55.3) | 58.2 (57.9) 147 | `YOLOv3-SPP` | 52.4 | 56.5 | 60.7 (60.6) 148 | `YOLOv3-tiny` | 29.0 | 32.9 (33.1) | 35.5 149 | 150 | ``` bash 151 | # install pycocotools 152 | git clone https://github.com/cocodataset/cocoapi && cd cocoapi/PythonAPI && make && cd ../.. 
&& cp -r cocoapi/PythonAPI/pycocotools yolov3 153 | cd yolov3 154 | 155 | python3 test.py --save-json --img-size 608 156 | Namespace(batch_size=16, cfg='cfg/yolov3-spp.cfg', conf_thres=0.001, data='data/coco.data', img_size=608, iou_thres=0.5, nms_thres=0.5, save_json=True, weights='weights/yolov3-spp.weights') 157 | Using CUDA device0 _CudaDeviceProperties(name='Tesla T4', total_memory=15079MB) 158 | Class Images Targets P R mAP F1: 100% 313/313 [07:40<00:00, 2.34s/it] 159 | all 5e+03 3.58e+04 0.117 0.788 0.595 0.199 160 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.367 <--- 161 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.607 <--- 162 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.387 163 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.208 164 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.392 165 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.487 166 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.297 167 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.465 168 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.495 169 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.332 170 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.518 171 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.621 172 | 173 | python3 test.py --save-json --img-size 416 174 | Namespace(batch_size=16, cfg='cfg/yolov3-spp.cfg', conf_thres=0.001, data='data/coco.data', img_size=416, iou_thres=0.5, nms_thres=0.5, save_json=True, weights='weights/yolov3-spp.weights') 175 | Using CUDA device0 _CudaDeviceProperties(name='Tesla T4', total_memory=15079MB) 176 | Class Images Targets P R mAP F1: 100% 313/313 [07:01<00:00, 1.41s/it] 177 | all 5e+03 3.58e+04 0.105 0.746 0.554 0.18 178 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.336 <--- 179 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.565 <--- 180 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.350 181 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.151 182 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.361 183 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.494 184 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.281 185 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.433 186 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.459 187 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.256 188 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.495 189 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.622 190 | ``` 191 | 192 | # Citation 193 | 194 | [![DOI](https://zenodo.org/badge/146165888.svg)](https://zenodo.org/badge/latestdoi/146165888) 195 | 196 | # Contact 197 | 198 | Issues should be raised directly in the repository. For additional questions or comments please email Glenn Jocher at glenn.jocher@ultralytics.com or visit us at https://contact.ultralytics.com. 
199 | -------------------------------------------------------------------------------- /utils/adabound.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | from torch.optim import Optimizer 5 | 6 | 7 | class AdaBound(Optimizer): 8 | """Implements AdaBound algorithm. 9 | It has been proposed in `Adaptive Gradient Methods with Dynamic Bound of Learning Rate`_. 10 | Arguments: 11 | params (iterable): iterable of parameters to optimize or dicts defining 12 | parameter groups 13 | lr (float, optional): Adam learning rate (default: 1e-3) 14 | betas (Tuple[float, float], optional): coefficients used for computing 15 | running averages of gradient and its square (default: (0.9, 0.999)) 16 | final_lr (float, optional): final (SGD) learning rate (default: 0.1) 17 | gamma (float, optional): convergence speed of the bound functions (default: 1e-3) 18 | eps (float, optional): term added to the denominator to improve 19 | numerical stability (default: 1e-8) 20 | weight_decay (float, optional): weight decay (L2 penalty) (default: 0) 21 | amsbound (boolean, optional): whether to use the AMSBound variant of this algorithm 22 | .. Adaptive Gradient Methods with Dynamic Bound of Learning Rate: 23 | https://openreview.net/forum?id=Bkg3g2R9FX 24 | """ 25 | 26 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), final_lr=0.1, gamma=1e-3, 27 | eps=1e-8, weight_decay=0, amsbound=False): 28 | if not 0.0 <= lr: 29 | raise ValueError("Invalid learning rate: {}".format(lr)) 30 | if not 0.0 <= eps: 31 | raise ValueError("Invalid epsilon value: {}".format(eps)) 32 | if not 0.0 <= betas[0] < 1.0: 33 | raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) 34 | if not 0.0 <= betas[1] < 1.0: 35 | raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) 36 | if not 0.0 <= final_lr: 37 | raise ValueError("Invalid final learning rate: {}".format(final_lr)) 38 | if not 0.0 <= gamma < 1.0: 39 | raise ValueError("Invalid gamma parameter: {}".format(gamma)) 40 | defaults = dict(lr=lr, betas=betas, final_lr=final_lr, gamma=gamma, eps=eps, 41 | weight_decay=weight_decay, amsbound=amsbound) 42 | super(AdaBound, self).__init__(params, defaults) 43 | 44 | self.base_lrs = list(map(lambda group: group['lr'], self.param_groups)) 45 | 46 | def __setstate__(self, state): 47 | super(AdaBound, self).__setstate__(state) 48 | for group in self.param_groups: 49 | group.setdefault('amsbound', False) 50 | 51 | def step(self, closure=None): 52 | """Performs a single optimization step. 53 | Arguments: 54 | closure (callable, optional): A closure that reevaluates the model 55 | and returns the loss. 56 | """ 57 | loss = None 58 | if closure is not None: 59 | loss = closure() 60 | 61 | for group, base_lr in zip(self.param_groups, self.base_lrs): 62 | for p in group['params']: 63 | if p.grad is None: 64 | continue 65 | grad = p.grad.data 66 | if grad.is_sparse: 67 | raise RuntimeError( 68 | 'Adam does not support sparse gradients, please consider SparseAdam instead') 69 | amsbound = group['amsbound'] 70 | 71 | state = self.state[p] 72 | 73 | # State initialization 74 | if len(state) == 0: 75 | state['step'] = 0 76 | # Exponential moving average of gradient values 77 | state['exp_avg'] = torch.zeros_like(p.data) 78 | # Exponential moving average of squared gradient values 79 | state['exp_avg_sq'] = torch.zeros_like(p.data) 80 | if amsbound: 81 | # Maintains max of all exp. moving avg. of sq. grad. 
values 82 | state['max_exp_avg_sq'] = torch.zeros_like(p.data) 83 | 84 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 85 | if amsbound: 86 | max_exp_avg_sq = state['max_exp_avg_sq'] 87 | beta1, beta2 = group['betas'] 88 | 89 | state['step'] += 1 90 | 91 | if group['weight_decay'] != 0: 92 | grad = grad.add(group['weight_decay'], p.data) 93 | 94 | # Decay the first and second moment running average coefficient 95 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 96 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 97 | if amsbound: 98 | # Maintains the maximum of all 2nd moment running avg. till now 99 | torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq) 100 | # Use the max. for normalizing running avg. of gradient 101 | denom = max_exp_avg_sq.sqrt().add_(group['eps']) 102 | else: 103 | denom = exp_avg_sq.sqrt().add_(group['eps']) 104 | 105 | bias_correction1 = 1 - beta1 ** state['step'] 106 | bias_correction2 = 1 - beta2 ** state['step'] 107 | step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1 108 | 109 | # Applies bounds on actual learning rate 110 | # lr_scheduler cannot affect final_lr, this is a workaround to apply lr decay 111 | final_lr = group['final_lr'] * group['lr'] / base_lr 112 | lower_bound = final_lr * (1 - 1 / (group['gamma'] * state['step'] + 1)) 113 | upper_bound = final_lr * (1 + 1 / (group['gamma'] * state['step'])) 114 | step_size = torch.full_like(denom, step_size) 115 | step_size.div_(denom).clamp_(lower_bound, upper_bound).mul_(exp_avg) 116 | 117 | p.data.add_(-step_size) 118 | 119 | return loss 120 | 121 | 122 | class AdaBoundW(Optimizer): 123 | """Implements AdaBound algorithm with Decoupled Weight Decay (arxiv.org/abs/1711.05101) 124 | It has been proposed in `Adaptive Gradient Methods with Dynamic Bound of Learning Rate`_. 125 | Arguments: 126 | params (iterable): iterable of parameters to optimize or dicts defining 127 | parameter groups 128 | lr (float, optional): Adam learning rate (default: 1e-3) 129 | betas (Tuple[float, float], optional): coefficients used for computing 130 | running averages of gradient and its square (default: (0.9, 0.999)) 131 | final_lr (float, optional): final (SGD) learning rate (default: 0.1) 132 | gamma (float, optional): convergence speed of the bound functions (default: 1e-3) 133 | eps (float, optional): term added to the denominator to improve 134 | numerical stability (default: 1e-8) 135 | weight_decay (float, optional): weight decay (L2 penalty) (default: 0) 136 | amsbound (boolean, optional): whether to use the AMSBound variant of this algorithm 137 | .. 
Adaptive Gradient Methods with Dynamic Bound of Learning Rate: 138 | https://openreview.net/forum?id=Bkg3g2R9FX 139 | """ 140 | 141 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), final_lr=0.1, gamma=1e-3, 142 | eps=1e-8, weight_decay=0, amsbound=False): 143 | if not 0.0 <= lr: 144 | raise ValueError("Invalid learning rate: {}".format(lr)) 145 | if not 0.0 <= eps: 146 | raise ValueError("Invalid epsilon value: {}".format(eps)) 147 | if not 0.0 <= betas[0] < 1.0: 148 | raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) 149 | if not 0.0 <= betas[1] < 1.0: 150 | raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) 151 | if not 0.0 <= final_lr: 152 | raise ValueError("Invalid final learning rate: {}".format(final_lr)) 153 | if not 0.0 <= gamma < 1.0: 154 | raise ValueError("Invalid gamma parameter: {}".format(gamma)) 155 | defaults = dict(lr=lr, betas=betas, final_lr=final_lr, gamma=gamma, eps=eps, 156 | weight_decay=weight_decay, amsbound=amsbound) 157 | super(AdaBoundW, self).__init__(params, defaults) 158 | 159 | self.base_lrs = list(map(lambda group: group['lr'], self.param_groups)) 160 | 161 | def __setstate__(self, state): 162 | super(AdaBoundW, self).__setstate__(state) 163 | for group in self.param_groups: 164 | group.setdefault('amsbound', False) 165 | 166 | def step(self, closure=None): 167 | """Performs a single optimization step. 168 | Arguments: 169 | closure (callable, optional): A closure that reevaluates the model 170 | and returns the loss. 171 | """ 172 | loss = None 173 | if closure is not None: 174 | loss = closure() 175 | 176 | for group, base_lr in zip(self.param_groups, self.base_lrs): 177 | for p in group['params']: 178 | if p.grad is None: 179 | continue 180 | grad = p.grad.data 181 | if grad.is_sparse: 182 | raise RuntimeError( 183 | 'Adam does not support sparse gradients, please consider SparseAdam instead') 184 | amsbound = group['amsbound'] 185 | 186 | state = self.state[p] 187 | 188 | # State initialization 189 | if len(state) == 0: 190 | state['step'] = 0 191 | # Exponential moving average of gradient values 192 | state['exp_avg'] = torch.zeros_like(p.data) 193 | # Exponential moving average of squared gradient values 194 | state['exp_avg_sq'] = torch.zeros_like(p.data) 195 | if amsbound: 196 | # Maintains max of all exp. moving avg. of sq. grad. values 197 | state['max_exp_avg_sq'] = torch.zeros_like(p.data) 198 | 199 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 200 | if amsbound: 201 | max_exp_avg_sq = state['max_exp_avg_sq'] 202 | beta1, beta2 = group['betas'] 203 | 204 | state['step'] += 1 205 | 206 | # Decay the first and second moment running average coefficient 207 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 208 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 209 | if amsbound: 210 | # Maintains the maximum of all 2nd moment running avg. till now 211 | torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq) 212 | # Use the max. for normalizing running avg. 
of gradient 213 | denom = max_exp_avg_sq.sqrt().add_(group['eps']) 214 | else: 215 | denom = exp_avg_sq.sqrt().add_(group['eps']) 216 | 217 | bias_correction1 = 1 - beta1 ** state['step'] 218 | bias_correction2 = 1 - beta2 ** state['step'] 219 | step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1 220 | 221 | # Applies bounds on actual learning rate 222 | # lr_scheduler cannot affect final_lr, this is a workaround to apply lr decay 223 | final_lr = group['final_lr'] * group['lr'] / base_lr 224 | lower_bound = final_lr * (1 - 1 / (group['gamma'] * state['step'] + 1)) 225 | upper_bound = final_lr * (1 + 1 / (group['gamma'] * state['step'])) 226 | step_size = torch.full_like(denom, step_size) 227 | step_size.div_(denom).clamp_(lower_bound, upper_bound).mul_(exp_avg) 228 | 229 | if group['weight_decay'] != 0: 230 | decayed_weights = torch.mul(p.data, group['weight_decay']) 231 | p.data.add_(-step_size) 232 | p.data.sub_(decayed_weights) 233 | else: 234 | p.data.add_(-step_size) 235 | 236 | return loss 237 | -------------------------------------------------------------------------------- /cfg/yolov3-1cls.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | #batch=1 4 | #subdivisions=1 5 | # Training 6 | batch=16 7 | subdivisions=1 8 | width=416 9 | height=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 | activation=leaky 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=32 46 | size=1 47 | stride=1 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=leaky 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [shortcut] 110 | from=-3 111 | activation=linear 112 | 113 | # Downsample 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=256 118 | size=3 119 | stride=2 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=128 126 | size=1 127 | stride=1 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 
149 | activation=leaky 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | activation=leaky 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 188 | pad=1 189 | activation=leaky 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | [convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | stride=1 229 | pad=1 230 | activation=leaky 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=leaky 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | activation=leaky 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | stride=1 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [shortcut] 332 | from=-3 333 | activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=leaky 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=leaky 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | 
size=3 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=leaky 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [shortcut] 394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | activation=leaky 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [shortcut] 415 | from=-3 416 | activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | [convolutional] 576 | batch_normalize=1 577 | size=3 578 | stride=1 579 | pad=1 580 | filters=1024 581 | activation=leaky 582 | 583 | [convolutional] 584 | batch_normalize=1 585 | filters=512 586 | 
size=1 587 | stride=1 588 | pad=1 589 | activation=leaky 590 | 591 | [convolutional] 592 | batch_normalize=1 593 | size=3 594 | stride=1 595 | pad=1 596 | filters=1024 597 | activation=leaky 598 | 599 | [convolutional] 600 | size=1 601 | stride=1 602 | pad=1 603 | filters=18 604 | activation=linear 605 | 606 | 607 | [yolo] 608 | mask = 6,7,8 609 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 610 | classes=1 611 | num=9 612 | jitter=.3 613 | ignore_thresh = .7 614 | truth_thresh = 1 615 | random=1 616 | 617 | 618 | [route] 619 | layers = -4 620 | 621 | [convolutional] 622 | batch_normalize=1 623 | filters=256 624 | size=1 625 | stride=1 626 | pad=1 627 | activation=leaky 628 | 629 | [upsample] 630 | stride=2 631 | 632 | [route] 633 | layers = -1, 61 634 | 635 | 636 | 637 | [convolutional] 638 | batch_normalize=1 639 | filters=256 640 | size=1 641 | stride=1 642 | pad=1 643 | activation=leaky 644 | 645 | [convolutional] 646 | batch_normalize=1 647 | size=3 648 | stride=1 649 | pad=1 650 | filters=512 651 | activation=leaky 652 | 653 | [convolutional] 654 | batch_normalize=1 655 | filters=256 656 | size=1 657 | stride=1 658 | pad=1 659 | activation=leaky 660 | 661 | [convolutional] 662 | batch_normalize=1 663 | size=3 664 | stride=1 665 | pad=1 666 | filters=512 667 | activation=leaky 668 | 669 | [convolutional] 670 | batch_normalize=1 671 | filters=256 672 | size=1 673 | stride=1 674 | pad=1 675 | activation=leaky 676 | 677 | [convolutional] 678 | batch_normalize=1 679 | size=3 680 | stride=1 681 | pad=1 682 | filters=512 683 | activation=leaky 684 | 685 | [convolutional] 686 | size=1 687 | stride=1 688 | pad=1 689 | filters=18 690 | activation=linear 691 | 692 | 693 | [yolo] 694 | mask = 3,4,5 695 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 696 | classes=1 697 | num=9 698 | jitter=.3 699 | ignore_thresh = .7 700 | truth_thresh = 1 701 | random=1 702 | 703 | 704 | 705 | [route] 706 | layers = -4 707 | 708 | [convolutional] 709 | batch_normalize=1 710 | filters=128 711 | size=1 712 | stride=1 713 | pad=1 714 | activation=leaky 715 | 716 | [upsample] 717 | stride=2 718 | 719 | [route] 720 | layers = -1, 36 721 | 722 | 723 | 724 | [convolutional] 725 | batch_normalize=1 726 | filters=128 727 | size=1 728 | stride=1 729 | pad=1 730 | activation=leaky 731 | 732 | [convolutional] 733 | batch_normalize=1 734 | size=3 735 | stride=1 736 | pad=1 737 | filters=256 738 | activation=leaky 739 | 740 | [convolutional] 741 | batch_normalize=1 742 | filters=128 743 | size=1 744 | stride=1 745 | pad=1 746 | activation=leaky 747 | 748 | [convolutional] 749 | batch_normalize=1 750 | size=3 751 | stride=1 752 | pad=1 753 | filters=256 754 | activation=leaky 755 | 756 | [convolutional] 757 | batch_normalize=1 758 | filters=128 759 | size=1 760 | stride=1 761 | pad=1 762 | activation=leaky 763 | 764 | [convolutional] 765 | batch_normalize=1 766 | size=3 767 | stride=1 768 | pad=1 769 | filters=256 770 | activation=leaky 771 | 772 | [convolutional] 773 | size=1 774 | stride=1 775 | pad=1 776 | filters=18 777 | activation=linear 778 | 779 | 780 | [yolo] 781 | mask = 0,1,2 782 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 783 | classes=1 784 | num=9 785 | jitter=.3 786 | ignore_thresh = .7 787 | truth_thresh = 1 788 | random=1 789 | -------------------------------------------------------------------------------- /cfg/yolov3.cfg: -------------------------------------------------------------------------------- 1 | 
[net] 2 | # Testing 3 | #batch=1 4 | #subdivisions=1 5 | # Training 6 | batch=16 7 | subdivisions=1 8 | width=416 9 | height=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 | activation=leaky 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=32 46 | size=1 47 | stride=1 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=leaky 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [shortcut] 110 | from=-3 111 | activation=linear 112 | 113 | # Downsample 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=256 118 | size=3 119 | stride=2 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=128 126 | size=1 127 | stride=1 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 149 | activation=leaky 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | activation=leaky 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 188 | pad=1 189 | activation=leaky 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | 
[convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | stride=1 229 | pad=1 230 | activation=leaky 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=leaky 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | activation=leaky 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | stride=1 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [shortcut] 332 | from=-3 333 | activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=leaky 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=leaky 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | size=3 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=leaky 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [shortcut] 394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | activation=leaky 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [shortcut] 415 | from=-3 416 | activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | 
pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | [convolutional] 576 | batch_normalize=1 577 | size=3 578 | stride=1 579 | pad=1 580 | filters=1024 581 | activation=leaky 582 | 583 | [convolutional] 584 | batch_normalize=1 585 | filters=512 586 | size=1 587 | stride=1 588 | pad=1 589 | activation=leaky 590 | 591 | [convolutional] 592 | batch_normalize=1 593 | size=3 594 | stride=1 595 | pad=1 596 | filters=1024 597 | activation=leaky 598 | 599 | [convolutional] 600 | size=1 601 | stride=1 602 | pad=1 603 | filters=255 604 | activation=linear 605 | 606 | 607 | [yolo] 608 | mask = 6,7,8 609 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 610 | classes=80 611 | num=9 612 | jitter=.3 613 | ignore_thresh = .7 614 | truth_thresh = 1 615 | random=1 616 | 617 | 618 | [route] 619 | layers = -4 620 | 621 | [convolutional] 622 | batch_normalize=1 623 | filters=256 624 | size=1 625 | stride=1 626 | pad=1 627 | activation=leaky 628 | 629 | [upsample] 630 | stride=2 631 | 632 | [route] 633 | layers = -1, 61 634 | 635 | 636 | 637 | [convolutional] 638 | batch_normalize=1 639 | filters=256 640 | size=1 641 | stride=1 642 | pad=1 643 | activation=leaky 644 | 645 | [convolutional] 646 | batch_normalize=1 647 | size=3 648 | stride=1 649 | pad=1 650 | filters=512 651 | activation=leaky 652 | 653 | [convolutional] 654 | batch_normalize=1 655 | filters=256 656 | size=1 657 | stride=1 658 | pad=1 659 | activation=leaky 660 | 661 
| [convolutional] 662 | batch_normalize=1 663 | size=3 664 | stride=1 665 | pad=1 666 | filters=512 667 | activation=leaky 668 | 669 | [convolutional] 670 | batch_normalize=1 671 | filters=256 672 | size=1 673 | stride=1 674 | pad=1 675 | activation=leaky 676 | 677 | [convolutional] 678 | batch_normalize=1 679 | size=3 680 | stride=1 681 | pad=1 682 | filters=512 683 | activation=leaky 684 | 685 | [convolutional] 686 | size=1 687 | stride=1 688 | pad=1 689 | filters=255 690 | activation=linear 691 | 692 | 693 | [yolo] 694 | mask = 3,4,5 695 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 696 | classes=80 697 | num=9 698 | jitter=.3 699 | ignore_thresh = .7 700 | truth_thresh = 1 701 | random=1 702 | 703 | 704 | 705 | [route] 706 | layers = -4 707 | 708 | [convolutional] 709 | batch_normalize=1 710 | filters=128 711 | size=1 712 | stride=1 713 | pad=1 714 | activation=leaky 715 | 716 | [upsample] 717 | stride=2 718 | 719 | [route] 720 | layers = -1, 36 721 | 722 | 723 | 724 | [convolutional] 725 | batch_normalize=1 726 | filters=128 727 | size=1 728 | stride=1 729 | pad=1 730 | activation=leaky 731 | 732 | [convolutional] 733 | batch_normalize=1 734 | size=3 735 | stride=1 736 | pad=1 737 | filters=256 738 | activation=leaky 739 | 740 | [convolutional] 741 | batch_normalize=1 742 | filters=128 743 | size=1 744 | stride=1 745 | pad=1 746 | activation=leaky 747 | 748 | [convolutional] 749 | batch_normalize=1 750 | size=3 751 | stride=1 752 | pad=1 753 | filters=256 754 | activation=leaky 755 | 756 | [convolutional] 757 | batch_normalize=1 758 | filters=128 759 | size=1 760 | stride=1 761 | pad=1 762 | activation=leaky 763 | 764 | [convolutional] 765 | batch_normalize=1 766 | size=3 767 | stride=1 768 | pad=1 769 | filters=256 770 | activation=leaky 771 | 772 | [convolutional] 773 | size=1 774 | stride=1 775 | pad=1 776 | filters=255 777 | activation=linear 778 | 779 | 780 | [yolo] 781 | mask = 0,1,2 782 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 783 | classes=80 784 | num=9 785 | jitter=.3 786 | ignore_thresh = .7 787 | truth_thresh = 1 788 | random=1 789 | -------------------------------------------------------------------------------- /cfg/yolov3s-3a320.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | # batch=1 4 | # subdivisions=1 5 | # Training 6 | batch=64 7 | subdivisions=16 8 | width=608 9 | height=608 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 | activation=leaky 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=32 46 | size=1 47 | stride=1 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=leaky 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 
| pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [shortcut] 110 | from=-3 111 | activation=linear 112 | 113 | # Downsample 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=256 118 | size=3 119 | stride=2 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=128 126 | size=1 127 | stride=1 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 149 | activation=leaky 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | activation=leaky 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 188 | pad=1 189 | activation=leaky 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | [convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | stride=1 229 | pad=1 230 | activation=leaky 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=leaky 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | activation=leaky 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 
| stride=1 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [shortcut] 332 | from=-3 333 | activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=leaky 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=leaky 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | size=3 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=leaky 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [shortcut] 394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | activation=leaky 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [shortcut] 415 | from=-3 416 | activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | 
batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | ### SPP ### 576 | [maxpool] 577 | stride=1 578 | size=5 579 | 580 | [route] 581 | layers=-2 582 | 583 | [maxpool] 584 | stride=1 585 | size=9 586 | 587 | [route] 588 | layers=-4 589 | 590 | [maxpool] 591 | stride=1 592 | size=13 593 | 594 | [route] 595 | layers=-1,-3,-5,-6 596 | 597 | ### End SPP ### 598 | 599 | [convolutional] 600 | batch_normalize=1 601 | filters=512 602 | size=1 603 | stride=1 604 | pad=1 605 | activation=leaky 606 | 607 | 608 | [convolutional] 609 | batch_normalize=1 610 | size=3 611 | stride=1 612 | pad=1 613 | filters=1024 614 | activation=leaky 615 | 616 | [convolutional] 617 | batch_normalize=1 618 | filters=512 619 | size=1 620 | stride=1 621 | pad=1 622 | activation=leaky 623 | 624 | [convolutional] 625 | batch_normalize=1 626 | size=3 627 | stride=1 628 | pad=1 629 | filters=1024 630 | activation=leaky 631 | 632 | [convolutional] 633 | size=1 634 | stride=1 635 | pad=1 636 | filters=85 637 | activation=linear 638 | 639 | 640 | [yolo] 641 | mask = 2 642 | anchors = 16,30, 62,45, 156,198 643 | classes=80 644 | num=3 645 | jitter=.3 646 | ignore_thresh = .7 647 | truth_thresh = 1 648 | random=1 649 | 650 | 651 | [route] 652 | layers = -4 653 | 654 | [convolutional] 655 | batch_normalize=1 656 | filters=256 657 | size=1 658 | stride=1 659 | pad=1 660 | activation=leaky 661 | 662 | [upsample] 663 | stride=2 664 | 665 | [route] 666 | layers = -1, 61 667 | 668 | 669 | 670 | [convolutional] 671 | batch_normalize=1 672 | filters=256 673 | size=1 674 | stride=1 675 | pad=1 676 | activation=leaky 677 | 678 | [convolutional] 679 | batch_normalize=1 680 | size=3 681 | stride=1 682 | pad=1 683 | filters=512 684 | activation=leaky 685 | 686 | [convolutional] 687 | batch_normalize=1 688 | filters=256 689 | size=1 690 | stride=1 691 | pad=1 692 | activation=leaky 693 | 694 | [convolutional] 695 | batch_normalize=1 696 | size=3 697 | stride=1 698 | pad=1 699 | filters=512 700 | activation=leaky 701 | 702 | [convolutional] 703 | batch_normalize=1 704 | filters=256 705 | size=1 706 | stride=1 707 | pad=1 708 | activation=leaky 709 | 710 | [convolutional] 711 | batch_normalize=1 712 | size=3 713 | stride=1 714 | pad=1 715 | filters=512 716 | activation=leaky 717 | 718 | [convolutional] 719 | size=1 720 | stride=1 721 | pad=1 722 | filters=85 723 | activation=linear 724 | 725 | 726 | [yolo] 727 | mask = 1 728 | anchors = 16,30, 62,45, 156,198 729 | classes=80 730 | num=3 731 | jitter=.3 732 | ignore_thresh = .7 733 | truth_thresh = 1 734 | random=1 735 | 736 | 737 | 738 | [route] 739 | layers = -4 740 | 741 | [convolutional] 742 | batch_normalize=1 743 | filters=128 
744 | size=1 745 | stride=1 746 | pad=1 747 | activation=leaky 748 | 749 | [upsample] 750 | stride=2 751 | 752 | [route] 753 | layers = -1, 36 754 | 755 | 756 | 757 | [convolutional] 758 | batch_normalize=1 759 | filters=128 760 | size=1 761 | stride=1 762 | pad=1 763 | activation=leaky 764 | 765 | [convolutional] 766 | batch_normalize=1 767 | size=3 768 | stride=1 769 | pad=1 770 | filters=256 771 | activation=leaky 772 | 773 | [convolutional] 774 | batch_normalize=1 775 | filters=128 776 | size=1 777 | stride=1 778 | pad=1 779 | activation=leaky 780 | 781 | [convolutional] 782 | batch_normalize=1 783 | size=3 784 | stride=1 785 | pad=1 786 | filters=256 787 | activation=leaky 788 | 789 | [convolutional] 790 | batch_normalize=1 791 | filters=128 792 | size=1 793 | stride=1 794 | pad=1 795 | activation=leaky 796 | 797 | [convolutional] 798 | batch_normalize=1 799 | size=3 800 | stride=1 801 | pad=1 802 | filters=256 803 | activation=leaky 804 | 805 | [convolutional] 806 | size=1 807 | stride=1 808 | pad=1 809 | filters=85 810 | activation=linear 811 | 812 | 813 | [yolo] 814 | mask = 0 815 | anchors = 16,30, 62,45, 156,198 816 | classes=80 817 | num=3 818 | jitter=.3 819 | ignore_thresh = .7 820 | truth_thresh = 1 821 | random=1 822 | -------------------------------------------------------------------------------- /cfg/yolov3-spp-1cls.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | # batch=1 4 | # subdivisions=1 5 | # Training 6 | batch=64 7 | subdivisions=16 8 | width=608 9 | height=608 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=100 20 | max_batches = 5000 21 | policy=steps 22 | steps=4000,4500 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 | activation=leaky 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=32 46 | size=1 47 | stride=1 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=leaky 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [shortcut] 110 | from=-3 111 | activation=linear 112 | 113 | # Downsample 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=256 118 | size=3 119 | stride=2 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=128 126 | size=1 127 | stride=1 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | 
stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 149 | activation=leaky 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | activation=leaky 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 188 | pad=1 189 | activation=leaky 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | [convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | stride=1 229 | pad=1 230 | activation=leaky 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=leaky 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | activation=leaky 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | stride=1 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [shortcut] 332 | from=-3 333 | activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=leaky 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 
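The [convolutional] / [convolutional] / [shortcut] triplets repeated above are Darknet residual units: a 1x1 bottleneck, a 3x3 expansion back to the original width, then a shortcut from three layers back (from=-3) with a linear activation, i.e. a plain element-wise addition. A minimal PyTorch sketch of one such unit, assuming leaky-ReLU slope 0.1 and batch norm as declared in the cfg; the class and helper names are illustrative, not this repository's modules:

import torch.nn as nn

def conv_bn_leaky(in_ch, out_ch, k):
    # one [convolutional] block with batch_normalize=1, pad=1 (same padding), activation=leaky
    return nn.Sequential(
        nn.Conv2d(in_ch, out_ch, kernel_size=k, stride=1, padding=k // 2, bias=False),
        nn.BatchNorm2d(out_ch),
        nn.LeakyReLU(0.1, inplace=True),
    )

class ResidualUnit(nn.Module):
    # e.g. a 256-channel input, filters=128 (1x1) then filters=256 (3x3),
    # closed by [shortcut] from=-3 with activation=linear
    def __init__(self, channels, hidden):
        super().__init__()
        self.reduce = conv_bn_leaky(channels, hidden, k=1)
        self.expand = conv_bn_leaky(hidden, channels, k=3)

    def forward(self, x):
        return x + self.expand(self.reduce(x))   # linear shortcut: plain addition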
| 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=leaky 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | size=3 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=leaky 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [shortcut] 394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | activation=leaky 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [shortcut] 415 | from=-3 416 | activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 
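The ### SPP ### block that follows pools the 512-channel feature map with 5x5, 9x9 and 13x13 max-pools at stride 1 and then routes the three pooled maps back together with the un-pooled input (layers=-1,-3,-5,-6), so the next 1x1 convolution sees four times the channels. A minimal PyTorch sketch of that block, assuming same-padding pooling; the module name is illustrative:

import torch
import torch.nn as nn

class SPPBlock(nn.Module):
    # [maxpool] size=5 / 9 / 13 with stride=1, concatenated with the un-pooled input
    def __init__(self, kernel_sizes=(5, 9, 13)):
        super().__init__()
        self.pools = nn.ModuleList(
            nn.MaxPool2d(kernel_size=k, stride=1, padding=k // 2) for k in kernel_sizes
        )

    def forward(self, x):
        pooled = [pool(x) for pool in self.pools]       # 5x5, 9x9, 13x13 outputs
        # [route] layers=-1,-3,-5,-6 stacks: 13x13 pool, 9x9 pool, 5x5 pool, input
        return torch.cat(pooled[::-1] + [x], dim=1)

# For a 512-channel input the concatenated output has 4 * 512 = 2048 channels,
# which the following [convolutional] with filters=512, size=1 reduces back down.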
574 | 575 | ### SPP ### 576 | [maxpool] 577 | stride=1 578 | size=5 579 | 580 | [route] 581 | layers=-2 582 | 583 | [maxpool] 584 | stride=1 585 | size=9 586 | 587 | [route] 588 | layers=-4 589 | 590 | [maxpool] 591 | stride=1 592 | size=13 593 | 594 | [route] 595 | layers=-1,-3,-5,-6 596 | 597 | ### End SPP ### 598 | 599 | [convolutional] 600 | batch_normalize=1 601 | filters=512 602 | size=1 603 | stride=1 604 | pad=1 605 | activation=leaky 606 | 607 | 608 | [convolutional] 609 | batch_normalize=1 610 | size=3 611 | stride=1 612 | pad=1 613 | filters=1024 614 | activation=leaky 615 | 616 | [convolutional] 617 | batch_normalize=1 618 | filters=512 619 | size=1 620 | stride=1 621 | pad=1 622 | activation=leaky 623 | 624 | [convolutional] 625 | batch_normalize=1 626 | size=3 627 | stride=1 628 | pad=1 629 | filters=1024 630 | activation=leaky 631 | 632 | [convolutional] 633 | size=1 634 | stride=1 635 | pad=1 636 | filters=18 637 | activation=linear 638 | 639 | 640 | [yolo] 641 | mask = 6,7,8 642 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 643 | classes=1 644 | num=9 645 | jitter=.3 646 | ignore_thresh = .7 647 | truth_thresh = 1 648 | random=1 649 | 650 | 651 | [route] 652 | layers = -4 653 | 654 | [convolutional] 655 | batch_normalize=1 656 | filters=256 657 | size=1 658 | stride=1 659 | pad=1 660 | activation=leaky 661 | 662 | [upsample] 663 | stride=2 664 | 665 | [route] 666 | layers = -1, 61 667 | 668 | 669 | 670 | [convolutional] 671 | batch_normalize=1 672 | filters=256 673 | size=1 674 | stride=1 675 | pad=1 676 | activation=leaky 677 | 678 | [convolutional] 679 | batch_normalize=1 680 | size=3 681 | stride=1 682 | pad=1 683 | filters=512 684 | activation=leaky 685 | 686 | [convolutional] 687 | batch_normalize=1 688 | filters=256 689 | size=1 690 | stride=1 691 | pad=1 692 | activation=leaky 693 | 694 | [convolutional] 695 | batch_normalize=1 696 | size=3 697 | stride=1 698 | pad=1 699 | filters=512 700 | activation=leaky 701 | 702 | [convolutional] 703 | batch_normalize=1 704 | filters=256 705 | size=1 706 | stride=1 707 | pad=1 708 | activation=leaky 709 | 710 | [convolutional] 711 | batch_normalize=1 712 | size=3 713 | stride=1 714 | pad=1 715 | filters=512 716 | activation=leaky 717 | 718 | [convolutional] 719 | size=1 720 | stride=1 721 | pad=1 722 | filters=18 723 | activation=linear 724 | 725 | 726 | [yolo] 727 | mask = 3,4,5 728 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 729 | classes=1 730 | num=9 731 | jitter=.3 732 | ignore_thresh = .7 733 | truth_thresh = 1 734 | random=1 735 | 736 | 737 | 738 | [route] 739 | layers = -4 740 | 741 | [convolutional] 742 | batch_normalize=1 743 | filters=128 744 | size=1 745 | stride=1 746 | pad=1 747 | activation=leaky 748 | 749 | [upsample] 750 | stride=2 751 | 752 | [route] 753 | layers = -1, 36 754 | 755 | 756 | 757 | [convolutional] 758 | batch_normalize=1 759 | filters=128 760 | size=1 761 | stride=1 762 | pad=1 763 | activation=leaky 764 | 765 | [convolutional] 766 | batch_normalize=1 767 | size=3 768 | stride=1 769 | pad=1 770 | filters=256 771 | activation=leaky 772 | 773 | [convolutional] 774 | batch_normalize=1 775 | filters=128 776 | size=1 777 | stride=1 778 | pad=1 779 | activation=leaky 780 | 781 | [convolutional] 782 | batch_normalize=1 783 | size=3 784 | stride=1 785 | pad=1 786 | filters=256 787 | activation=leaky 788 | 789 | [convolutional] 790 | batch_normalize=1 791 | filters=128 792 | size=1 793 | stride=1 794 | pad=1 795 | activation=leaky 796 | 
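The 1x1 detection convolution just below (filters=18) and its counterparts in the other cfgs in this directory all obey the same rule: the layer feeding a [yolo] block needs anchors_per_head * (classes + 5) output filters, covering tx, ty, tw, th, an objectness score and one score per class for every anchor listed in that head's mask. A short, hedged sanity check in plain Python (the function name is illustrative):

def yolo_head_filters(num_classes, anchors_per_head):
    # each anchor predicts 4 box offsets + objectness + one confidence per class
    return anchors_per_head * (num_classes + 5)

assert yolo_head_filters(1, 3) == 18      # yolov3-spp-1cls.cfg heads (classes=1, 3 anchors each)
assert yolo_head_filters(80, 1) == 85     # the 3-anchor cfg above: one anchor per head
assert yolo_head_filters(80, 3) == 255    # yolov3-spp.cfg, yolov3s-9a320.cfg
assert yolo_head_filters(80, 6) == 510    # yolov3s-18a320.cfg
assert yolo_head_filters(80, 10) == 850   # yolov3s-30a320.cfg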
797 | [convolutional] 798 | batch_normalize=1 799 | size=3 800 | stride=1 801 | pad=1 802 | filters=256 803 | activation=leaky 804 | 805 | [convolutional] 806 | size=1 807 | stride=1 808 | pad=1 809 | filters=18 810 | activation=linear 811 | 812 | 813 | [yolo] 814 | mask = 0,1,2 815 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 816 | classes=1 817 | num=9 818 | jitter=.3 819 | ignore_thresh = .7 820 | truth_thresh = 1 821 | random=1 822 | -------------------------------------------------------------------------------- /cfg/yolov3-spp.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | # batch=1 4 | # subdivisions=1 5 | # Training 6 | batch=64 7 | subdivisions=16 8 | width=608 9 | height=608 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 | activation=leaky 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=32 46 | size=1 47 | stride=1 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=leaky 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [shortcut] 110 | from=-3 111 | activation=linear 112 | 113 | # Downsample 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=256 118 | size=3 119 | stride=2 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=128 126 | size=1 127 | stride=1 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 149 | activation=leaky 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | activation=leaky 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | 
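All of these cfg files use the same flat Darknet syntax: bracketed section headers such as [net], [convolutional] or [yolo], followed by key=value lines, with # starting a comment. A minimal, hedged parser sketch (plain Python, not this repository's loader) that turns such a file into a list of dicts:

def parse_darknet_cfg(path):
    """Parse a Darknet .cfg file into a list of {'type': ..., key: value} dicts."""
    sections = []
    with open(path) as f:
        for raw in f:
            line = raw.split('#', 1)[0].strip()        # drop comments and whitespace
            if not line:
                continue
            if line.startswith('[') and line.endswith(']'):
                sections.append({'type': line[1:-1].strip()})
            else:
                key, _, value = line.partition('=')
                sections[-1][key.strip()] = value.strip()
    return sections

# e.g. sections[0]['type'] == 'net', sections[0]['width'] == '608'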
[convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 188 | pad=1 189 | activation=leaky 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | [convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | stride=1 229 | pad=1 230 | activation=leaky 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=leaky 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | activation=leaky 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | stride=1 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [shortcut] 332 | from=-3 333 | activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=leaky 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=leaky 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | size=3 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=leaky 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [shortcut] 394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | 
pad=1 404 | activation=leaky 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [shortcut] 415 | from=-3 416 | activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | ### SPP ### 576 | [maxpool] 577 | stride=1 578 | size=5 579 | 580 | [route] 581 | layers=-2 582 | 583 | [maxpool] 584 | stride=1 585 | size=9 586 | 587 | [route] 588 | layers=-4 589 | 590 | [maxpool] 591 | stride=1 592 | size=13 593 | 594 | [route] 595 | layers=-1,-3,-5,-6 596 | 597 | ### End SPP ### 598 | 599 | [convolutional] 600 | batch_normalize=1 601 | filters=512 602 | size=1 603 | stride=1 604 | pad=1 605 | activation=leaky 606 | 607 | 608 | [convolutional] 609 | batch_normalize=1 610 | size=3 611 | stride=1 612 | pad=1 613 | filters=1024 614 | activation=leaky 615 | 616 | [convolutional] 617 | batch_normalize=1 618 | filters=512 619 | size=1 620 | stride=1 621 | pad=1 622 | activation=leaky 623 | 624 | [convolutional] 625 | 
batch_normalize=1 626 | size=3 627 | stride=1 628 | pad=1 629 | filters=1024 630 | activation=leaky 631 | 632 | [convolutional] 633 | size=1 634 | stride=1 635 | pad=1 636 | filters=255 637 | activation=linear 638 | 639 | 640 | [yolo] 641 | mask = 6,7,8 642 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 643 | classes=80 644 | num=9 645 | jitter=.3 646 | ignore_thresh = .7 647 | truth_thresh = 1 648 | random=1 649 | 650 | 651 | [route] 652 | layers = -4 653 | 654 | [convolutional] 655 | batch_normalize=1 656 | filters=256 657 | size=1 658 | stride=1 659 | pad=1 660 | activation=leaky 661 | 662 | [upsample] 663 | stride=2 664 | 665 | [route] 666 | layers = -1, 61 667 | 668 | 669 | 670 | [convolutional] 671 | batch_normalize=1 672 | filters=256 673 | size=1 674 | stride=1 675 | pad=1 676 | activation=leaky 677 | 678 | [convolutional] 679 | batch_normalize=1 680 | size=3 681 | stride=1 682 | pad=1 683 | filters=512 684 | activation=leaky 685 | 686 | [convolutional] 687 | batch_normalize=1 688 | filters=256 689 | size=1 690 | stride=1 691 | pad=1 692 | activation=leaky 693 | 694 | [convolutional] 695 | batch_normalize=1 696 | size=3 697 | stride=1 698 | pad=1 699 | filters=512 700 | activation=leaky 701 | 702 | [convolutional] 703 | batch_normalize=1 704 | filters=256 705 | size=1 706 | stride=1 707 | pad=1 708 | activation=leaky 709 | 710 | [convolutional] 711 | batch_normalize=1 712 | size=3 713 | stride=1 714 | pad=1 715 | filters=512 716 | activation=leaky 717 | 718 | [convolutional] 719 | size=1 720 | stride=1 721 | pad=1 722 | filters=255 723 | activation=linear 724 | 725 | 726 | [yolo] 727 | mask = 3,4,5 728 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 729 | classes=80 730 | num=9 731 | jitter=.3 732 | ignore_thresh = .7 733 | truth_thresh = 1 734 | random=1 735 | 736 | 737 | 738 | [route] 739 | layers = -4 740 | 741 | [convolutional] 742 | batch_normalize=1 743 | filters=128 744 | size=1 745 | stride=1 746 | pad=1 747 | activation=leaky 748 | 749 | [upsample] 750 | stride=2 751 | 752 | [route] 753 | layers = -1, 36 754 | 755 | 756 | 757 | [convolutional] 758 | batch_normalize=1 759 | filters=128 760 | size=1 761 | stride=1 762 | pad=1 763 | activation=leaky 764 | 765 | [convolutional] 766 | batch_normalize=1 767 | size=3 768 | stride=1 769 | pad=1 770 | filters=256 771 | activation=leaky 772 | 773 | [convolutional] 774 | batch_normalize=1 775 | filters=128 776 | size=1 777 | stride=1 778 | pad=1 779 | activation=leaky 780 | 781 | [convolutional] 782 | batch_normalize=1 783 | size=3 784 | stride=1 785 | pad=1 786 | filters=256 787 | activation=leaky 788 | 789 | [convolutional] 790 | batch_normalize=1 791 | filters=128 792 | size=1 793 | stride=1 794 | pad=1 795 | activation=leaky 796 | 797 | [convolutional] 798 | batch_normalize=1 799 | size=3 800 | stride=1 801 | pad=1 802 | filters=256 803 | activation=leaky 804 | 805 | [convolutional] 806 | size=1 807 | stride=1 808 | pad=1 809 | filters=255 810 | activation=linear 811 | 812 | 813 | [yolo] 814 | mask = 0,1,2 815 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 816 | classes=80 817 | num=9 818 | jitter=.3 819 | ignore_thresh = .7 820 | truth_thresh = 1 821 | random=1 822 | -------------------------------------------------------------------------------- /cfg/yolov3s-9a320.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | # batch=1 4 | # subdivisions=1 5 | # Training 6 | 
batch=64 7 | subdivisions=16 8 | width=608 9 | height=608 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 | activation=leaky 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=32 46 | size=1 47 | stride=1 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=leaky 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [shortcut] 110 | from=-3 111 | activation=linear 112 | 113 | # Downsample 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=256 118 | size=3 119 | stride=2 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=128 126 | size=1 127 | stride=1 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 149 | activation=leaky 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | activation=leaky 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 188 | pad=1 189 | activation=leaky 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | [convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | 
stride=1 229 | pad=1 230 | activation=leaky 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=leaky 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | activation=leaky 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | stride=1 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [shortcut] 332 | from=-3 333 | activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=leaky 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=leaky 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | size=3 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=leaky 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [shortcut] 394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | activation=leaky 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [shortcut] 415 | from=-3 416 | activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 
449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | ### SPP ### 576 | [maxpool] 577 | stride=1 578 | size=5 579 | 580 | [route] 581 | layers=-2 582 | 583 | [maxpool] 584 | stride=1 585 | size=9 586 | 587 | [route] 588 | layers=-4 589 | 590 | [maxpool] 591 | stride=1 592 | size=13 593 | 594 | [route] 595 | layers=-1,-3,-5,-6 596 | 597 | ### End SPP ### 598 | 599 | [convolutional] 600 | batch_normalize=1 601 | filters=512 602 | size=1 603 | stride=1 604 | pad=1 605 | activation=leaky 606 | 607 | 608 | [convolutional] 609 | batch_normalize=1 610 | size=3 611 | stride=1 612 | pad=1 613 | filters=1024 614 | activation=leaky 615 | 616 | [convolutional] 617 | batch_normalize=1 618 | filters=512 619 | size=1 620 | stride=1 621 | pad=1 622 | activation=leaky 623 | 624 | [convolutional] 625 | batch_normalize=1 626 | size=3 627 | stride=1 628 | pad=1 629 | filters=1024 630 | activation=leaky 631 | 632 | [convolutional] 633 | size=1 634 | stride=1 635 | pad=1 636 | filters=255 637 | activation=linear 638 | 639 | 640 | [yolo] 641 | mask = 6,7,8 642 | anchors = 9,11, 25,27, 33,63, 71,43, 62,120, 135,86, 123,199, 257,100, 264,223 643 | classes=80 644 | num=9 645 | jitter=.3 646 | ignore_thresh = .7 647 | truth_thresh = 1 648 | random=1 649 | 650 | 651 | [route] 652 | layers = -4 653 | 654 | [convolutional] 655 | batch_normalize=1 656 | filters=256 657 | size=1 658 | stride=1 659 | pad=1 660 | activation=leaky 661 | 662 | [upsample] 663 | stride=2 664 | 665 | [route] 666 | layers = -1, 61 667 | 668 | 669 | 670 | [convolutional] 671 | 
batch_normalize=1 672 | filters=256 673 | size=1 674 | stride=1 675 | pad=1 676 | activation=leaky 677 | 678 | [convolutional] 679 | batch_normalize=1 680 | size=3 681 | stride=1 682 | pad=1 683 | filters=512 684 | activation=leaky 685 | 686 | [convolutional] 687 | batch_normalize=1 688 | filters=256 689 | size=1 690 | stride=1 691 | pad=1 692 | activation=leaky 693 | 694 | [convolutional] 695 | batch_normalize=1 696 | size=3 697 | stride=1 698 | pad=1 699 | filters=512 700 | activation=leaky 701 | 702 | [convolutional] 703 | batch_normalize=1 704 | filters=256 705 | size=1 706 | stride=1 707 | pad=1 708 | activation=leaky 709 | 710 | [convolutional] 711 | batch_normalize=1 712 | size=3 713 | stride=1 714 | pad=1 715 | filters=512 716 | activation=leaky 717 | 718 | [convolutional] 719 | size=1 720 | stride=1 721 | pad=1 722 | filters=255 723 | activation=linear 724 | 725 | 726 | [yolo] 727 | mask = 3,4,5 728 | anchors = 9,11, 25,27, 33,63, 71,43, 62,120, 135,86, 123,199, 257,100, 264,223 729 | classes=80 730 | num=9 731 | jitter=.3 732 | ignore_thresh = .7 733 | truth_thresh = 1 734 | random=1 735 | 736 | 737 | 738 | [route] 739 | layers = -4 740 | 741 | [convolutional] 742 | batch_normalize=1 743 | filters=128 744 | size=1 745 | stride=1 746 | pad=1 747 | activation=leaky 748 | 749 | [upsample] 750 | stride=2 751 | 752 | [route] 753 | layers = -1, 36 754 | 755 | 756 | 757 | [convolutional] 758 | batch_normalize=1 759 | filters=128 760 | size=1 761 | stride=1 762 | pad=1 763 | activation=leaky 764 | 765 | [convolutional] 766 | batch_normalize=1 767 | size=3 768 | stride=1 769 | pad=1 770 | filters=256 771 | activation=leaky 772 | 773 | [convolutional] 774 | batch_normalize=1 775 | filters=128 776 | size=1 777 | stride=1 778 | pad=1 779 | activation=leaky 780 | 781 | [convolutional] 782 | batch_normalize=1 783 | size=3 784 | stride=1 785 | pad=1 786 | filters=256 787 | activation=leaky 788 | 789 | [convolutional] 790 | batch_normalize=1 791 | filters=128 792 | size=1 793 | stride=1 794 | pad=1 795 | activation=leaky 796 | 797 | [convolutional] 798 | batch_normalize=1 799 | size=3 800 | stride=1 801 | pad=1 802 | filters=256 803 | activation=leaky 804 | 805 | [convolutional] 806 | size=1 807 | stride=1 808 | pad=1 809 | filters=255 810 | activation=linear 811 | 812 | 813 | [yolo] 814 | mask = 0,1,2 815 | anchors = 9,11, 25,27, 33,63, 71,43, 62,120, 135,86, 123,199, 257,100, 264,223 816 | classes=80 817 | num=9 818 | jitter=.3 819 | ignore_thresh = .7 820 | truth_thresh = 1 821 | random=1 822 | -------------------------------------------------------------------------------- /cfg/yolov3s-18a320.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | # batch=1 4 | # subdivisions=1 5 | # Training 6 | batch=64 7 | subdivisions=16 8 | width=608 9 | height=608 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 | activation=leaky 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=32 46 | size=1 47 | stride=1 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | 
batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=leaky 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [shortcut] 110 | from=-3 111 | activation=linear 112 | 113 | # Downsample 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=256 118 | size=3 119 | stride=2 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=128 126 | size=1 127 | stride=1 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 149 | activation=leaky 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | activation=leaky 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 188 | pad=1 189 | activation=leaky 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | [convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | stride=1 229 | pad=1 230 | activation=leaky 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=leaky 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | 
batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | activation=leaky 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | stride=1 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [shortcut] 332 | from=-3 333 | activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=leaky 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=leaky 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | size=3 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=leaky 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [shortcut] 394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | activation=leaky 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [shortcut] 415 | from=-3 416 | activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | 
pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | ### SPP ### 576 | [maxpool] 577 | stride=1 578 | size=5 579 | 580 | [route] 581 | layers=-2 582 | 583 | [maxpool] 584 | stride=1 585 | size=9 586 | 587 | [route] 588 | layers=-4 589 | 590 | [maxpool] 591 | stride=1 592 | size=13 593 | 594 | [route] 595 | layers=-1,-3,-5,-6 596 | 597 | ### End SPP ### 598 | 599 | [convolutional] 600 | batch_normalize=1 601 | filters=512 602 | size=1 603 | stride=1 604 | pad=1 605 | activation=leaky 606 | 607 | 608 | [convolutional] 609 | batch_normalize=1 610 | size=3 611 | stride=1 612 | pad=1 613 | filters=1024 614 | activation=leaky 615 | 616 | [convolutional] 617 | batch_normalize=1 618 | filters=512 619 | size=1 620 | stride=1 621 | pad=1 622 | activation=leaky 623 | 624 | [convolutional] 625 | batch_normalize=1 626 | size=3 627 | stride=1 628 | pad=1 629 | filters=1024 630 | activation=leaky 631 | 632 | [convolutional] 633 | size=1 634 | stride=1 635 | pad=1 636 | filters=510 637 | activation=linear 638 | 639 | 640 | [yolo] 641 | mask = 12,13,14,15,16,17 642 | anchors = 7,8, 11,20, 27,15, 20,36, 50,29, 28,60, 61,61, 99,39, 43,99, 98,91, 66,148, 180,68, 139,135, 104,210, 285,92, 205,173, 186,274, 302,212 643 | classes=80 644 | num=18 645 | jitter=.3 646 | ignore_thresh = .7 647 | truth_thresh = 1 648 | random=1 649 | 650 | 651 | [route] 652 | layers = -4 653 | 654 | [convolutional] 655 | batch_normalize=1 656 | filters=256 657 | size=1 658 | stride=1 659 | pad=1 660 | activation=leaky 661 | 662 | [upsample] 663 | stride=2 664 | 665 | [route] 666 | layers = -1, 61 667 | 668 | 669 | 670 | [convolutional] 671 | batch_normalize=1 672 | filters=256 673 | size=1 674 | stride=1 675 | pad=1 676 | activation=leaky 677 | 678 | [convolutional] 679 | batch_normalize=1 680 | size=3 681 | stride=1 682 | pad=1 683 | filters=512 684 | activation=leaky 685 | 686 | [convolutional] 687 | batch_normalize=1 688 | filters=256 689 | size=1 690 | stride=1 691 | pad=1 692 | activation=leaky 693 | 694 | [convolutional] 695 | batch_normalize=1 696 | size=3 697 | stride=1 698 | pad=1 699 | filters=512 700 | activation=leaky 701 | 702 | [convolutional] 703 | batch_normalize=1 704 | filters=256 705 | size=1 706 | stride=1 707 | pad=1 708 | activation=leaky 709 | 710 | 
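Each [yolo] layer in this 18-anchor variant picks its own slice of the shared anchors list through mask: the coarsest head above (mask = 12,13,14,15,16,17) gets the six largest boxes, while the two finer heads that follow take indices 6-11 and 0-5. A short snippet showing how the mask indices map onto (width, height) anchor pairs; the string parsing is illustrative:

anchors = "7,8, 11,20, 27,15, 20,36, 50,29, 28,60, 61,61, 99,39, 43,99, " \
          "98,91, 66,148, 180,68, 139,135, 104,210, 285,92, 205,173, 186,274, 302,212"
mask = [12, 13, 14, 15, 16, 17]                    # first [yolo] head of yolov3s-18a320.cfg

values = [int(v) for v in anchors.replace(' ', '').split(',')]
pairs = list(zip(values[0::2], values[1::2]))      # 18 (w, h) anchor boxes
head_anchors = [pairs[i] for i in mask]
print(head_anchors)   # [(139, 135), (104, 210), (285, 92), (205, 173), (186, 274), (302, 212)]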
[convolutional] 711 | batch_normalize=1 712 | size=3 713 | stride=1 714 | pad=1 715 | filters=512 716 | activation=leaky 717 | 718 | [convolutional] 719 | size=1 720 | stride=1 721 | pad=1 722 | filters=510 723 | activation=linear 724 | 725 | 726 | [yolo] 727 | mask = 6,7,8,9,10,11 728 | anchors = 7,8, 11,20, 27,15, 20,36, 50,29, 28,60, 61,61, 99,39, 43,99, 98,91, 66,148, 180,68, 139,135, 104,210, 285,92, 205,173, 186,274, 302,212 729 | classes=80 730 | num=18 731 | jitter=.3 732 | ignore_thresh = .7 733 | truth_thresh = 1 734 | random=1 735 | 736 | 737 | 738 | [route] 739 | layers = -4 740 | 741 | [convolutional] 742 | batch_normalize=1 743 | filters=128 744 | size=1 745 | stride=1 746 | pad=1 747 | activation=leaky 748 | 749 | [upsample] 750 | stride=2 751 | 752 | [route] 753 | layers = -1, 36 754 | 755 | 756 | 757 | [convolutional] 758 | batch_normalize=1 759 | filters=128 760 | size=1 761 | stride=1 762 | pad=1 763 | activation=leaky 764 | 765 | [convolutional] 766 | batch_normalize=1 767 | size=3 768 | stride=1 769 | pad=1 770 | filters=256 771 | activation=leaky 772 | 773 | [convolutional] 774 | batch_normalize=1 775 | filters=128 776 | size=1 777 | stride=1 778 | pad=1 779 | activation=leaky 780 | 781 | [convolutional] 782 | batch_normalize=1 783 | size=3 784 | stride=1 785 | pad=1 786 | filters=256 787 | activation=leaky 788 | 789 | [convolutional] 790 | batch_normalize=1 791 | filters=128 792 | size=1 793 | stride=1 794 | pad=1 795 | activation=leaky 796 | 797 | [convolutional] 798 | batch_normalize=1 799 | size=3 800 | stride=1 801 | pad=1 802 | filters=256 803 | activation=leaky 804 | 805 | [convolutional] 806 | size=1 807 | stride=1 808 | pad=1 809 | filters=510 810 | activation=linear 811 | 812 | 813 | [yolo] 814 | mask = 0,1,2,3,4,5 815 | anchors = 7,8, 11,20, 27,15, 20,36, 50,29, 28,60, 61,61, 99,39, 43,99, 98,91, 66,148, 180,68, 139,135, 104,210, 285,92, 205,173, 186,274, 302,212 816 | classes=80 817 | num=18 818 | jitter=.3 819 | ignore_thresh = .7 820 | truth_thresh = 1 821 | random=1 822 | -------------------------------------------------------------------------------- /cfg/yolov3s-30a320.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | # batch=1 4 | # subdivisions=1 5 | # Training 6 | batch=64 7 | subdivisions=16 8 | width=608 9 | height=608 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 | activation=leaky 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=32 46 | size=1 47 | stride=1 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=leaky 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | 
stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [shortcut] 110 | from=-3 111 | activation=linear 112 | 113 | # Downsample 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=256 118 | size=3 119 | stride=2 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=128 126 | size=1 127 | stride=1 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 149 | activation=leaky 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | activation=leaky 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 188 | pad=1 189 | activation=leaky 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | [convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | stride=1 229 | pad=1 230 | activation=leaky 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=leaky 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | activation=leaky 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | stride=1 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | 
filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [shortcut] 332 | from=-3 333 | activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=leaky 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=leaky 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | size=3 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=leaky 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [shortcut] 394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | activation=leaky 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [shortcut] 415 | from=-3 416 | activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | 
[shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | ### SPP ### 576 | [maxpool] 577 | stride=1 578 | size=5 579 | 580 | [route] 581 | layers=-2 582 | 583 | [maxpool] 584 | stride=1 585 | size=9 586 | 587 | [route] 588 | layers=-4 589 | 590 | [maxpool] 591 | stride=1 592 | size=13 593 | 594 | [route] 595 | layers=-1,-3,-5,-6 596 | 597 | ### End SPP ### 598 | 599 | [convolutional] 600 | batch_normalize=1 601 | filters=512 602 | size=1 603 | stride=1 604 | pad=1 605 | activation=leaky 606 | 607 | 608 | [convolutional] 609 | batch_normalize=1 610 | size=3 611 | stride=1 612 | pad=1 613 | filters=1024 614 | activation=leaky 615 | 616 | [convolutional] 617 | batch_normalize=1 618 | filters=512 619 | size=1 620 | stride=1 621 | pad=1 622 | activation=leaky 623 | 624 | [convolutional] 625 | batch_normalize=1 626 | size=3 627 | stride=1 628 | pad=1 629 | filters=1024 630 | activation=leaky 631 | 632 | [convolutional] 633 | size=1 634 | stride=1 635 | pad=1 636 | filters=850 637 | activation=linear 638 | 639 | 640 | [yolo] 641 | mask = 20,21,22,23,24,25,26,27,28,29 642 | anchors = 6,7, 9,18, 17,10, 21,22, 14,33, 36,15, 22,51, 34,34, 59,24, 32,74, 51,49, 90,38, 41,105, 67,72, 144,48, 54,148, 106,79, 81,109, 211,63, 107,147, 81,200, 149,112, 297,73, 152,187, 214,135, 121,264, 220,206, 299,153, 211,291, 309,230 643 | classes=80 644 | num=30 645 | jitter=.3 646 | ignore_thresh = .7 647 | truth_thresh = 1 648 | random=1 649 | 650 | 651 | [route] 652 | layers = -4 653 | 654 | [convolutional] 655 | batch_normalize=1 656 | filters=256 657 | size=1 658 | stride=1 659 | pad=1 660 | activation=leaky 661 | 662 | [upsample] 663 | stride=2 664 | 665 | [route] 666 | layers = -1, 61 667 | 668 | 669 | 670 | [convolutional] 671 | batch_normalize=1 672 | filters=256 673 | size=1 674 | stride=1 675 | pad=1 676 | activation=leaky 677 | 678 | [convolutional] 679 | batch_normalize=1 680 | size=3 681 | stride=1 682 | pad=1 683 | filters=512 684 | activation=leaky 685 | 686 | [convolutional] 687 | batch_normalize=1 688 | filters=256 689 | size=1 690 | stride=1 691 | pad=1 692 | activation=leaky 693 | 694 | [convolutional] 695 | batch_normalize=1 696 | size=3 697 | stride=1 698 | pad=1 699 | filters=512 700 | activation=leaky 701 | 702 | [convolutional] 703 | batch_normalize=1 704 | filters=256 705 | size=1 706 | stride=1 707 | pad=1 708 | activation=leaky 709 | 710 | [convolutional] 711 | batch_normalize=1 712 | size=3 713 | stride=1 714 | pad=1 715 | filters=512 716 | activation=leaky 717 | 718 | [convolutional] 719 | size=1 720 | stride=1 721 | pad=1 722 | filters=850 723 | activation=linear 724 | 725 | 726 | [yolo] 727 | mask = 10,11,12,13,14,15,16,17,18,19 728 | anchors = 6,7, 9,18, 17,10, 21,22, 14,33, 36,15, 22,51, 34,34, 59,24, 32,74, 51,49, 90,38, 41,105, 
67,72, 144,48, 54,148, 106,79, 81,109, 211,63, 107,147, 81,200, 149,112, 297,73, 152,187, 214,135, 121,264, 220,206, 299,153, 211,291, 309,230 729 | classes=80 730 | num=30 731 | jitter=.3 732 | ignore_thresh = .7 733 | truth_thresh = 1 734 | random=1 735 | 736 | 737 | 738 | [route] 739 | layers = -4 740 | 741 | [convolutional] 742 | batch_normalize=1 743 | filters=128 744 | size=1 745 | stride=1 746 | pad=1 747 | activation=leaky 748 | 749 | [upsample] 750 | stride=2 751 | 752 | [route] 753 | layers = -1, 36 754 | 755 | 756 | 757 | [convolutional] 758 | batch_normalize=1 759 | filters=128 760 | size=1 761 | stride=1 762 | pad=1 763 | activation=leaky 764 | 765 | [convolutional] 766 | batch_normalize=1 767 | size=3 768 | stride=1 769 | pad=1 770 | filters=256 771 | activation=leaky 772 | 773 | [convolutional] 774 | batch_normalize=1 775 | filters=128 776 | size=1 777 | stride=1 778 | pad=1 779 | activation=leaky 780 | 781 | [convolutional] 782 | batch_normalize=1 783 | size=3 784 | stride=1 785 | pad=1 786 | filters=256 787 | activation=leaky 788 | 789 | [convolutional] 790 | batch_normalize=1 791 | filters=128 792 | size=1 793 | stride=1 794 | pad=1 795 | activation=leaky 796 | 797 | [convolutional] 798 | batch_normalize=1 799 | size=3 800 | stride=1 801 | pad=1 802 | filters=256 803 | activation=leaky 804 | 805 | [convolutional] 806 | size=1 807 | stride=1 808 | pad=1 809 | filters=850 810 | activation=linear 811 | 812 | 813 | [yolo] 814 | mask = 0,1,2,3,4,5,6,7,8,9 815 | anchors = 6,7, 9,18, 17,10, 21,22, 14,33, 36,15, 22,51, 34,34, 59,24, 32,74, 51,49, 90,38, 41,105, 67,72, 144,48, 54,148, 106,79, 81,109, 211,63, 107,147, 81,200, 149,112, 297,73, 152,187, 214,135, 121,264, 220,206, 299,153, 211,291, 309,230 816 | classes=80 817 | num=30 818 | jitter=.3 819 | ignore_thresh = .7 820 | truth_thresh = 1 821 | random=1 822 | -------------------------------------------------------------------------------- /cfg/yolov3-spp-pan-scale.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | #batch=1 4 | #subdivisions=1 5 | # Training 6 | batch=64 7 | subdivisions=32 8 | width=544 9 | height=544 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | 19 | learning_rate=0.001 20 | burn_in=1000 21 | max_batches = 10000 22 | 23 | policy=steps 24 | steps=8000,9000 25 | scales=.1,.1 26 | 27 | #policy=sgdr 28 | #sgdr_cycle=1000 29 | #sgdr_mult=2 30 | #steps=4000,6000,8000,9000 31 | #scales=1, 1, 0.1, 0.1 32 | 33 | [convolutional] 34 | batch_normalize=1 35 | filters=32 36 | size=3 37 | stride=1 38 | pad=1 39 | activation=leaky 40 | 41 | # Downsample 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=64 46 | size=3 47 | stride=2 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=32 54 | size=1 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [convolutional] 60 | batch_normalize=1 61 | filters=64 62 | size=3 63 | stride=1 64 | pad=1 65 | activation=leaky 66 | 67 | [shortcut] 68 | from=-3 69 | activation=linear 70 | 71 | # Downsample 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=128 76 | size=3 77 | stride=2 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=64 84 | size=1 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [convolutional] 90 | batch_normalize=1 91 | filters=128 92 | size=3 93 | stride=1 94 | pad=1 95 | activation=leaky 96 | 97 | 
[shortcut] 98 | from=-3 99 | activation=linear 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=64 104 | size=1 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [convolutional] 110 | batch_normalize=1 111 | filters=128 112 | size=3 113 | stride=1 114 | pad=1 115 | activation=leaky 116 | 117 | [shortcut] 118 | from=-3 119 | activation=linear 120 | 121 | # Downsample 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=256 126 | size=3 127 | stride=2 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=128 134 | size=1 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [convolutional] 140 | batch_normalize=1 141 | filters=256 142 | size=3 143 | stride=1 144 | pad=1 145 | activation=leaky 146 | 147 | [shortcut] 148 | from=-3 149 | activation=linear 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=128 154 | size=1 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [convolutional] 160 | batch_normalize=1 161 | filters=256 162 | size=3 163 | stride=1 164 | pad=1 165 | activation=leaky 166 | 167 | [shortcut] 168 | from=-3 169 | activation=linear 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=128 174 | size=1 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [convolutional] 180 | batch_normalize=1 181 | filters=256 182 | size=3 183 | stride=1 184 | pad=1 185 | activation=leaky 186 | 187 | [shortcut] 188 | from=-3 189 | activation=linear 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=128 194 | size=1 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [convolutional] 200 | batch_normalize=1 201 | filters=256 202 | size=3 203 | stride=1 204 | pad=1 205 | activation=leaky 206 | 207 | [shortcut] 208 | from=-3 209 | activation=linear 210 | 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=128 215 | size=1 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [convolutional] 221 | batch_normalize=1 222 | filters=256 223 | size=3 224 | stride=1 225 | pad=1 226 | activation=leaky 227 | 228 | [shortcut] 229 | from=-3 230 | activation=linear 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=128 235 | size=1 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [convolutional] 241 | batch_normalize=1 242 | filters=256 243 | size=3 244 | stride=1 245 | pad=1 246 | activation=leaky 247 | 248 | [shortcut] 249 | from=-3 250 | activation=linear 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=128 255 | size=1 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [convolutional] 261 | batch_normalize=1 262 | filters=256 263 | size=3 264 | stride=1 265 | pad=1 266 | activation=leaky 267 | 268 | [shortcut] 269 | from=-3 270 | activation=linear 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=128 275 | size=1 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [convolutional] 281 | batch_normalize=1 282 | filters=256 283 | size=3 284 | stride=1 285 | pad=1 286 | activation=leaky 287 | 288 | [shortcut] 289 | from=-3 290 | activation=linear 291 | 292 | # Downsample 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=512 297 | size=3 298 | stride=2 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=256 305 | size=1 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [convolutional] 311 | batch_normalize=1 312 | filters=512 313 | size=3 314 | stride=1 315 | pad=1 
316 | activation=leaky 317 | 318 | [shortcut] 319 | from=-3 320 | activation=linear 321 | 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=256 326 | size=1 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [convolutional] 332 | batch_normalize=1 333 | filters=512 334 | size=3 335 | stride=1 336 | pad=1 337 | activation=leaky 338 | 339 | [shortcut] 340 | from=-3 341 | activation=linear 342 | 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=256 347 | size=1 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [convolutional] 353 | batch_normalize=1 354 | filters=512 355 | size=3 356 | stride=1 357 | pad=1 358 | activation=leaky 359 | 360 | [shortcut] 361 | from=-3 362 | activation=linear 363 | 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=256 368 | size=1 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [convolutional] 374 | batch_normalize=1 375 | filters=512 376 | size=3 377 | stride=1 378 | pad=1 379 | activation=leaky 380 | 381 | [shortcut] 382 | from=-3 383 | activation=linear 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=256 388 | size=1 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [convolutional] 394 | batch_normalize=1 395 | filters=512 396 | size=3 397 | stride=1 398 | pad=1 399 | activation=leaky 400 | 401 | [shortcut] 402 | from=-3 403 | activation=linear 404 | 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=256 409 | size=1 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [convolutional] 415 | batch_normalize=1 416 | filters=512 417 | size=3 418 | stride=1 419 | pad=1 420 | activation=leaky 421 | 422 | [shortcut] 423 | from=-3 424 | activation=linear 425 | 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=256 430 | size=1 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [convolutional] 436 | batch_normalize=1 437 | filters=512 438 | size=3 439 | stride=1 440 | pad=1 441 | activation=leaky 442 | 443 | [shortcut] 444 | from=-3 445 | activation=linear 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=256 450 | size=1 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [convolutional] 456 | batch_normalize=1 457 | filters=512 458 | size=3 459 | stride=1 460 | pad=1 461 | activation=leaky 462 | 463 | [shortcut] 464 | from=-3 465 | activation=linear 466 | 467 | # Downsample 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=1024 472 | size=3 473 | stride=2 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=512 480 | size=1 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [convolutional] 486 | batch_normalize=1 487 | filters=1024 488 | size=3 489 | stride=1 490 | pad=1 491 | activation=leaky 492 | 493 | [shortcut] 494 | from=-3 495 | activation=linear 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=512 500 | size=1 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [convolutional] 506 | batch_normalize=1 507 | filters=1024 508 | size=3 509 | stride=1 510 | pad=1 511 | activation=leaky 512 | 513 | [shortcut] 514 | from=-3 515 | activation=linear 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=512 520 | size=1 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [convolutional] 526 | batch_normalize=1 527 | filters=1024 528 | size=3 529 | stride=1 530 | pad=1 531 | activation=leaky 532 | 533 | [shortcut] 534 | from=-3 535 | activation=linear 536 | 
537 | [convolutional] 538 | batch_normalize=1 539 | filters=512 540 | size=1 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [convolutional] 546 | batch_normalize=1 547 | filters=1024 548 | size=3 549 | stride=1 550 | pad=1 551 | activation=leaky 552 | 553 | [shortcut] 554 | from=-3 555 | activation=linear 556 | 557 | ###################### 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | filters=512 562 | size=1 563 | stride=1 564 | pad=1 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | size=3 570 | stride=1 571 | pad=1 572 | filters=1024 573 | activation=leaky 574 | 575 | [convolutional] 576 | batch_normalize=1 577 | filters=512 578 | size=1 579 | stride=1 580 | pad=1 581 | activation=leaky 582 | 583 | ### SPP ### 584 | [maxpool] 585 | stride=1 586 | size=5 587 | 588 | [route] 589 | layers=-2 590 | 591 | [maxpool] 592 | stride=1 593 | size=9 594 | 595 | [route] 596 | layers=-4 597 | 598 | [maxpool] 599 | stride=1 600 | size=13 601 | 602 | [route] 603 | layers=-1,-3,-5,-6 604 | 605 | ### End SPP ### 606 | 607 | [convolutional] 608 | batch_normalize=1 609 | filters=512 610 | size=1 611 | stride=1 612 | pad=1 613 | activation=leaky 614 | 615 | 616 | [convolutional] 617 | batch_normalize=1 618 | size=3 619 | stride=1 620 | pad=1 621 | filters=1024 622 | activation=leaky 623 | 624 | [convolutional] 625 | batch_normalize=1 626 | filters=512 627 | size=1 628 | stride=1 629 | pad=1 630 | activation=leaky 631 | 632 | 633 | 634 | ########### to [yolo-3] 635 | 636 | 637 | 638 | [route] 639 | layers = -4 640 | 641 | [convolutional] 642 | batch_normalize=1 643 | filters=256 644 | size=1 645 | stride=1 646 | pad=1 647 | activation=leaky 648 | 649 | [upsample] 650 | stride=2 651 | 652 | [route] 653 | layers = -1, 61 654 | 655 | 656 | 657 | [convolutional] 658 | batch_normalize=1 659 | filters=256 660 | size=1 661 | stride=1 662 | pad=1 663 | activation=leaky 664 | 665 | [convolutional] 666 | batch_normalize=1 667 | size=3 668 | stride=1 669 | pad=1 670 | filters=512 671 | activation=leaky 672 | 673 | [convolutional] 674 | batch_normalize=1 675 | filters=256 676 | size=1 677 | stride=1 678 | pad=1 679 | activation=leaky 680 | 681 | [convolutional] 682 | batch_normalize=1 683 | size=3 684 | stride=1 685 | pad=1 686 | filters=512 687 | activation=leaky 688 | 689 | [convolutional] 690 | batch_normalize=1 691 | filters=256 692 | size=1 693 | stride=1 694 | pad=1 695 | activation=leaky 696 | 697 | 698 | ########### to [yolo-2] 699 | 700 | 701 | 702 | 703 | [route] 704 | layers = -4 705 | 706 | [convolutional] 707 | batch_normalize=1 708 | filters=128 709 | size=1 710 | stride=1 711 | pad=1 712 | activation=leaky 713 | 714 | [upsample] 715 | stride=2 716 | 717 | [route] 718 | layers = -1, 36 719 | 720 | 721 | 722 | [convolutional] 723 | batch_normalize=1 724 | filters=128 725 | size=1 726 | stride=1 727 | pad=1 728 | activation=leaky 729 | 730 | [convolutional] 731 | batch_normalize=1 732 | size=3 733 | stride=1 734 | pad=1 735 | filters=256 736 | activation=leaky 737 | 738 | [convolutional] 739 | batch_normalize=1 740 | filters=128 741 | size=1 742 | stride=1 743 | pad=1 744 | activation=leaky 745 | 746 | [convolutional] 747 | batch_normalize=1 748 | size=3 749 | stride=1 750 | pad=1 751 | filters=256 752 | activation=leaky 753 | 754 | [convolutional] 755 | batch_normalize=1 756 | filters=128 757 | size=1 758 | stride=1 759 | pad=1 760 | activation=leaky 761 | 762 | 763 | 764 | ########### to [yolo-1] 765 | 766 | 767 | ########### features of different 
layers 768 | 769 | 770 | [route] 771 | layers=1 772 | 773 | [reorg3d] 774 | stride=2 775 | 776 | [route] 777 | layers=5,-1 778 | 779 | [reorg3d] 780 | stride=2 781 | 782 | [route] 783 | layers=12,-1 784 | 785 | [reorg3d] 786 | stride=2 787 | 788 | [route] 789 | layers=37,-1 790 | 791 | [reorg3d] 792 | stride=2 793 | 794 | [route] 795 | layers=62,-1 796 | 797 | 798 | 799 | ########### [yolo-1] 800 | 801 | [convolutional] 802 | batch_normalize=1 803 | filters=128 804 | size=1 805 | stride=1 806 | pad=1 807 | activation=leaky 808 | 809 | [upsample] 810 | stride=4 811 | 812 | [route] 813 | layers = -1,-12 814 | 815 | 816 | [convolutional] 817 | batch_normalize=1 818 | size=3 819 | stride=1 820 | pad=1 821 | filters=256 822 | activation=leaky 823 | 824 | [convolutional] 825 | size=1 826 | stride=1 827 | pad=1 828 | filters=340 829 | activation=linear 830 | 831 | 832 | [yolo] 833 | mask = 0,1,2,3 834 | anchors = 8,8, 10,13, 16,30, 33,23, 32,32, 30,61, 62,45, 64,64, 59,119, 116,90, 156,198, 373,326 835 | classes=80 836 | num=12 837 | jitter=.3 838 | ignore_thresh = .7 839 | truth_thresh = 1 840 | scale_x_y = 1.05 841 | random=0 842 | 843 | 844 | 845 | 846 | ########### [yolo-2] 847 | 848 | 849 | [route] 850 | layers = -7 851 | 852 | [convolutional] 853 | batch_normalize=1 854 | filters=256 855 | size=1 856 | stride=1 857 | pad=1 858 | activation=leaky 859 | 860 | [upsample] 861 | stride=2 862 | 863 | [route] 864 | layers = -1,-28 865 | 866 | 867 | [convolutional] 868 | batch_normalize=1 869 | size=3 870 | stride=1 871 | pad=1 872 | filters=512 873 | activation=leaky 874 | 875 | [convolutional] 876 | size=1 877 | stride=1 878 | pad=1 879 | filters=340 880 | activation=linear 881 | 882 | 883 | [yolo] 884 | mask = 4,5,6,7 885 | anchors = 8,8, 10,13, 16,30, 33,23, 32,32, 30,61, 62,45, 64,64, 59,119, 116,90, 156,198, 373,326 886 | classes=80 887 | num=12 888 | jitter=.3 889 | ignore_thresh = .7 890 | truth_thresh = 1 891 | scale_x_y = 1.1 892 | random=0 893 | 894 | 895 | 896 | ########### [yolo-3] 897 | 898 | [route] 899 | layers = -14 900 | 901 | [convolutional] 902 | batch_normalize=1 903 | filters=512 904 | size=1 905 | stride=1 906 | pad=1 907 | activation=leaky 908 | 909 | [route] 910 | layers = -1,-43 911 | 912 | [convolutional] 913 | batch_normalize=1 914 | size=3 915 | stride=1 916 | pad=1 917 | filters=1024 918 | activation=leaky 919 | 920 | 921 | [convolutional] 922 | size=1 923 | stride=1 924 | pad=1 925 | filters=340 926 | activation=linear 927 | 928 | 929 | [yolo] 930 | mask = 8,9,10,11 931 | anchors = 8,8, 10,13, 16,30, 33,23, 32,32, 30,61, 62,45, 59,119, 80,80, 116,90, 156,198, 373,326 932 | classes=80 933 | num=12 934 | jitter=.3 935 | ignore_thresh = .7 936 | truth_thresh = 1 937 | scale_x_y = 1.2 938 | random=0 939 | -------------------------------------------------------------------------------- /prune.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | """ 4 | Pengyi Zhang 5 | 201906 6 | """ 7 | import cv2 8 | 9 | import argparse 10 | import json 11 | import os 12 | 13 | import numpy 14 | 15 | import torch 16 | import torch.nn as nn 17 | 18 | from torch.utils.data import DataLoader 19 | 20 | from models import * 21 | from utils.datasets import * 22 | from utils.utils import * 23 | from utils.parse_config import * 24 | 25 | """ Slim Principle 26 | (1) Use global threshold to control pruning ratio 27 | (2) Use local threshold to keep at least 10% unpruned 28 | """ 29 | 30 | def route_conv(layer_index, module_defs): 31 | """ 
find the convolutional layers connected by route layer 32 | """ 33 | module_def = module_defs[layer_index] 34 | mtype = module_def['type'] 35 | 36 | before_conv_id = [] 37 | if mtype in ['convolutional', 'shortcut', 'upsample', 'maxpool', 'reorg3d']: 38 | if module_defs[layer_index-1]['type'] == 'convolutional': 39 | return [layer_index-1] 40 | before_conv_id += route_conv(layer_index-1, module_defs) 41 | 42 | elif mtype == "route": 43 | layer_is = [int(x)+layer_index if int(x) < 0 else int(x) for x in module_defs[layer_index]['layers'].split(',')] 44 | for layer_i in layer_is: 45 | if module_defs[layer_i]['type'] == 'convolutional': 46 | before_conv_id += [layer_i] 47 | else: 48 | before_conv_id += route_conv(layer_i, module_defs) 49 | 50 | return before_conv_id 51 | 52 | 53 | def write_model_cfg(old_path, new_path, new_module_defs): 54 | """Parses the yolo-v3 layer configuration file and returns module definitions""" 55 | lines = [] 56 | with open(old_path, 'r') as fp: 57 | old_lines = fp.readlines() 58 | for _line in old_lines: 59 | if "[convolutional]" in _line: 60 | break 61 | lines.append(_line) 62 | 63 | for i, module_def in enumerate(new_module_defs): 64 | 65 | mtype = module_def['type'] 66 | lines.append("[{}]\n".format(mtype)) 67 | print("layer:", i, mtype) 68 | if mtype == "convolutional": 69 | bn = 0 70 | filters = module_def['filters'] 71 | bn = int(module_def['batch_normalize']) 72 | if bn: 73 | lines.append("batch_normalize={}\n".format(bn)) 74 | filters = torch.sum(module_def['mask']).cpu().numpy().astype('int') 75 | lines.append("filters={}\n".format(filters)) 76 | lines.append("size={}\n".format(module_def['size'])) 77 | lines.append("stride={}\n".format(module_def['stride'])) 78 | lines.append("pad={}\n".format(module_def['pad'])) 79 | lines.append("activation={}\n\n".format(module_def['activation'])) 80 | elif mtype == "shortcut": 81 | lines.append("from={}\n".format(module_def['from'])) 82 | lines.append("activation={}\n\n".format(module_def['activation'])) 83 | elif mtype == 'route': 84 | lines.append("layers={}\n\n".format(module_def['layers'])) 85 | elif mtype == 'reorg3d': 86 | lines.append("stride={}\n\n".format(module_def['stride'])) 87 | 88 | elif mtype == 'upsample': 89 | lines.append("stride={}\n\n".format(module_def['stride'])) 90 | elif mtype == 'maxpool': 91 | lines.append("stride={}\n".format(module_def['stride'])) 92 | lines.append("size={}\n\n".format(module_def['size'])) 93 | elif mtype == 'yolo': 94 | lines.append("mask = {}\n".format(module_def['mask'])) 95 | an_rows, an_cols = module_def['anchors'].shape 96 | tmp_str = '' 97 | for an_i in range(an_rows): 98 | tmp_str += '%d,%d, ' % (module_def['anchors'][an_i, 0], module_def['anchors'][an_i, 1]) 99 | print('===============', tmp_str[0:-2]) 100 | lines.append("anchors = {}\n".format(tmp_str[0:-2])) 101 | lines.append("classes = {}\n".format(module_def['classes'])) 102 | lines.append("num = {}\n".format(module_def['num'])) 103 | lines.append("jitter = {}\n".format(module_def['jitter'])) 104 | lines.append("ignore_thresh = {}\n".format(module_def['ignore_thresh'])) 105 | lines.append("truth_thresh = {}\n".format(module_def['truth_thresh'])) 106 | lines.append("random = {}\n\n".format(module_def['random'])) 107 | 108 | with open(new_path, "w") as f: 109 | f.writelines(lines) 110 | 111 | 112 | 113 | def test( 114 | cfg, 115 | weights=None, 116 | img_size=406, 117 | save=None, 118 | overall_ratio=0.5, 119 | perlayer_ratio=0.1 120 | ): 121 | 122 | """prune yolov3 and generate cfg, weights 123 | """ 124 | if 
save != None: 125 | if not os.path.exists(save): 126 | os.makedirs(save) 127 | device = torch_utils.select_device() 128 | # Initialize model 129 | model = Darknet(cfg, img_size).to(device) 130 | 131 | # Load weights 132 | if weights.endswith('.pt'): # pytorch format 133 | _state_dict = torch.load(weights, map_location=device)['model'] 134 | model.load_state_dict(_state_dict) 135 | else: # darknet format 136 | _ = load_darknet_weights(model, weights) 137 | 138 | ## output a new cfg file 139 | total = 0 140 | for m in model.modules(): 141 | if isinstance(m, nn.BatchNorm2d): 142 | total += m.weight.data.shape[0] # channels numbers 143 | 144 | bn = torch.zeros(total) 145 | index = 0 146 | 147 | for m in model.modules(): 148 | if isinstance(m, nn.BatchNorm2d): 149 | size = m.weight.data.shape[0] 150 | bn[index:(index+size)] = m.weight.data.abs().clone() 151 | index += size 152 | 153 | sorted_bn, sorted_index = torch.sort(bn) 154 | thresh_index = int(total*overall_ratio) 155 | thresh = sorted_bn[thresh_index].cuda() 156 | 157 | print("--"*30) 158 | print() 159 | #print(list(model.modules())) 160 | # 161 | proned_module_defs = model.module_defs 162 | for i, (module_def, module) in enumerate(zip(model.module_defs, model.module_list)): 163 | print("layer:", i) 164 | mtype = module_def['type'] 165 | if mtype == 'convolutional': 166 | bn = int(module_def['batch_normalize']) 167 | if bn: 168 | m = getattr(module, 'BatchNorm2d') # batch_norm layer 169 | weight_copy = m.weight.data.abs().clone() 170 | channels = weight_copy.shape[0] # 171 | min_channel_num = int(channels * perlayer_ratio) if int(channels * perlayer_ratio) > 0 else 1 172 | mask = weight_copy.gt(thresh).float().cuda() 173 | 174 | if int(torch.sum(mask)) < min_channel_num: 175 | _, sorted_index_weights = torch.sort(weight_copy,descending=True) 176 | mask[sorted_index_weights[:min_channel_num]]=1. 177 | 178 | proned_module_defs[i]['mask'] = mask.clone() 179 | 180 | print('layer index: {:d} \t total channel: {:d} \t remaining channel: {:d}'. 
181 | format(i, mask.shape[0], int(torch.sum(mask)))) 182 | 183 | print("layer:", mtype) 184 | 185 | elif mtype in ['upsample', 'maxpool', 'reorg3d']: 186 | print("layer:", mtype) 187 | 188 | elif mtype == 'route': 189 | print("layer:", mtype) 190 | # 191 | 192 | elif mtype == 'shortcut': 193 | layer_i = int(module_def['from'])+i 194 | print("from layer ", layer_i) 195 | print("layer:", mtype) 196 | proned_module_defs[i]['is_access'] = False 197 | 198 | 199 | elif mtype == 'yolo': 200 | print("layer:", mtype) 201 | 202 | 203 | layer_number = len(proned_module_defs) 204 | for i in range(layer_number-1, -1, -1): 205 | mtype = proned_module_defs[i]['type'] 206 | if mtype == 'shortcut': 207 | if proned_module_defs[i]['is_access']: 208 | continue 209 | 210 | Merge_masks = [] 211 | layer_i = i 212 | while mtype == 'shortcut': 213 | proned_module_defs[layer_i]['is_access'] = True 214 | 215 | if proned_module_defs[layer_i-1]['type'] == 'convolutional': 216 | bn = int(proned_module_defs[layer_i-1]['batch_normalize']) 217 | if bn: 218 | Merge_masks.append(proned_module_defs[layer_i-1]["mask"].unsqueeze(0)) 219 | 220 | layer_i = int(proned_module_defs[layer_i]['from'])+layer_i 221 | mtype = proned_module_defs[layer_i]['type'] 222 | 223 | if mtype == 'convolutional': 224 | bn = int(proned_module_defs[layer_i]['batch_normalize']) 225 | if bn: 226 | Merge_masks.append(proned_module_defs[layer_i]["mask"].unsqueeze(0)) 227 | 228 | 229 | if len(Merge_masks) > 1: 230 | Merge_masks = torch.cat(Merge_masks, 0) 231 | merge_mask = (torch.sum(Merge_masks, dim=0) > 0).float().cuda() 232 | else: 233 | merge_mask = Merge_masks[0].float().cuda() 234 | 235 | layer_i = i 236 | mtype = 'shortcut' 237 | while mtype == 'shortcut': 238 | 239 | if proned_module_defs[layer_i-1]['type'] == 'convolutional': 240 | bn = int(proned_module_defs[layer_i-1]['batch_normalize']) 241 | if bn: 242 | proned_module_defs[layer_i-1]["mask"] = merge_mask 243 | 244 | layer_i = int(proned_module_defs[layer_i]['from'])+layer_i 245 | mtype = proned_module_defs[layer_i]['type'] 246 | 247 | if mtype == 'convolutional': 248 | bn = int(proned_module_defs[layer_i]['batch_normalize']) 249 | if bn: 250 | proned_module_defs[layer_i]["mask"] = merge_mask 251 | 252 | 253 | 254 | for i, (module_def, module) in enumerate(zip(model.module_defs, model.module_list)): 255 | print("layer:", i) 256 | mtype = module_def['type'] 257 | if mtype == 'convolutional': 258 | bn = int(module_def['batch_normalize']) 259 | if bn: 260 | 261 | layer_i_1 = i - 1 262 | proned_module_defs[i]['mask_before'] = None 263 | 264 | mask_before = [] 265 | conv_indexs = [] 266 | if i > 0: 267 | conv_indexs = route_conv(i, proned_module_defs) 268 | for conv_index in conv_indexs: 269 | mask_before += proned_module_defs[conv_index]["mask"].clone().cpu().numpy().tolist() 270 | proned_module_defs[i]['mask_before'] = torch.tensor(mask_before).float().cuda() 271 | 272 | 273 | 274 | 275 | output_cfg_path = os.path.join(save, "prune.cfg") 276 | write_model_cfg(cfg, output_cfg_path, proned_module_defs) 277 | 278 | pruned_model = Darknet(output_cfg_path, img_size).to(device) 279 | print(list(pruned_model.modules())) 280 | for i, (module_def, old_module, new_module) in enumerate(zip(proned_module_defs, model.module_list, pruned_model.module_list)): 281 | mtype = module_def['type'] 282 | print("layer: ",i, mtype) 283 | if mtype == 'convolutional': # 284 | bn = int(module_def['batch_normalize']) 285 | if bn: 286 | new_norm = getattr(new_module, 'BatchNorm2d') # batch_norm layer 287 | old_norm = 
getattr(old_module, 'BatchNorm2d') # batch_norm layer 288 | 289 | new_conv = getattr(new_module, 'Conv2d') # conv layer 290 | old_conv = getattr(old_module, 'Conv2d') # conv layer 291 | 292 | 293 | idx1 = np.squeeze(np.argwhere(np.asarray(module_def['mask'].cpu().numpy()))) 294 | if i > 0: 295 | idx2 = np.squeeze(np.argwhere(np.asarray(module_def['mask_before'].cpu().numpy()))) 296 | new_conv.weight.data = old_conv.weight.data[idx1.tolist()][:, idx2.tolist(), :, :].clone() 297 | 298 | print("idx1: ", len(idx1), ", idx2: ", len(idx2)) 299 | else: 300 | new_conv.weight.data = old_conv.weight.data[idx1.tolist()].clone() 301 | 302 | new_norm.weight.data = old_norm.weight.data[idx1.tolist()].clone() 303 | new_norm.bias.data = old_norm.bias.data[idx1.tolist()].clone() 304 | new_norm.running_mean = old_norm.running_mean[idx1.tolist()].clone() 305 | new_norm.running_var = old_norm.running_var[idx1.tolist()].clone() 306 | 307 | 308 | print('layer index: ', i, 'idx1: ', idx1) 309 | else: 310 | 311 | new_conv = getattr(new_module, 'Conv2d') # batch_norm layer 312 | old_conv = getattr(old_module, 'Conv2d') # batch_norm layer 313 | idx2 = np.squeeze(np.argwhere(np.asarray(proned_module_defs[i-1]['mask'].cpu().numpy()))) 314 | new_conv.weight.data = old_conv.weight.data[:,idx2.tolist(),:,:].clone() 315 | new_conv.bias.data = old_conv.bias.data.clone() 316 | print('layer index: ', i, "entire copy") 317 | 318 | print('--'*30) 319 | print('prune done!') 320 | print('pruned ratio %.3f'%overall_ratio) 321 | prune_weights_path = os.path.join(save, "prune.pt") 322 | _pruned_state_dict = pruned_model.state_dict() 323 | torch.save(_pruned_state_dict, prune_weights_path) 324 | 325 | print("Done!") 326 | 327 | 328 | 329 | # test 330 | pruned_model.eval() 331 | img_path = "test.jpg" 332 | 333 | org_img = cv2.imread(img_path) # BGR 334 | # img, ratiow, ratioh, padw, padh = letterbox(org_img, new_shape=[img_size,img_size], mode='rect') 335 | img = org_img 336 | 337 | # Normalize 338 | img = img[:, :, ::-1].transpose(2, 0, 1) # BGR to RGB, to 3x416x416 339 | img = np.ascontiguousarray(img, dtype=np.float32) # uint8 to float32 340 | img /= 255.0 # 0 - 255 to 0.0 - 1.0 341 | 342 | imgs = torch.from_numpy(img).unsqueeze(0).to(device) 343 | _, _, height, width = imgs.shape # batch size, channels, height, width 344 | 345 | # Run model 346 | inf_out, train_out = pruned_model(imgs) # inference and training outputs 347 | # Run NMS 348 | output = non_max_suppression(inf_out, conf_thres=0.005, nms_thres=0.5) 349 | # Statistics per image 350 | for si, pred in enumerate(output): 351 | if pred is None: 352 | continue 353 | if True: 354 | box = pred[:, :4].clone() # xyxy 355 | scale_coords(imgs[si].shape[1:], box, org_img.shape[:2]) # to original shape 356 | for di, d in enumerate(pred): 357 | category_id = int(d[6]) 358 | left, top, right, bot = [float(x) for x in box[di]] 359 | confidence = float(d[4]) 360 | 361 | cv2.rectangle(org_img, (int(left), int(top)), (int(right), int(bot)), 362 | (255, 0, 0), 2) 363 | cv2.putText(org_img, str(category_id) + ":" + str('%.1f' % (float(confidence) * 100)) + "%", (int(left), int(top) - 8), 364 | cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 1) 365 | cv2.imshow("result", org_img) 366 | cv2.waitKey(-1) 367 | cv2.imwrite('result_{}'.format(img_path), org_img) 368 | 369 | 370 | # convert pt to weights: 371 | prune_c_weights_path = os.path.join(save, "prune.weights") 372 | save_weights(pruned_model, prune_c_weights_path) 373 | 374 | 375 | if __name__ == '__main__': 376 | 
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 377 | os.environ["CUDA_VISIBLE_DEVICES"] = "0" 378 | parser = argparse.ArgumentParser(description='PyTorch Slimming Yolov3 prune') 379 | parser.add_argument('--cfg', type=str, default='VisDrone2019/yolov3-spp3.cfg', help='cfg file path') 380 | parser.add_argument('--weights', type=str, default='yolov3-spp3_final.weights', help='path to weights file') 381 | parser.add_argument('--img_size', type=int, default=608, help='inference size (pixels)') 382 | parser.add_argument('--save', default='prune', type=str, metavar='PATH', help='path to save pruned model (default: none)') 383 | parser.add_argument('--overall_ratio', type=float, default=0.5, help='scale sparse rate (default: 0.5)') 384 | parser.add_argument('--perlayer_ratio', type=float, default=0.1, help='minimal scale sparse rate (default: 0.1) to prevent disconnect') 385 | 386 | opt = parser.parse_args() 387 | opt.save += "_{}_{}".format(opt.overall_ratio, opt.perlayer_ratio) 388 | 389 | print(opt) 390 | 391 | with torch.no_grad(): 392 | test( 393 | opt.cfg, 394 | opt.weights, 395 | opt.img_size, 396 | opt.save, 397 | opt.overall_ratio, 398 | opt.perlayer_ratio, 399 | ) 400 | --------------------------------------------------------------------------------